diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd
new file mode 100644
index 0000000000000000000000000000000000000000..16076301f9cb482707677b562e8c7b01820ca14d
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd
@@ -0,0 +1,263 @@
+What:		/sys/bus/dsa/devices/dsa<m>/version
+Date:		Apr 15, 2020
+KernelVersion:	5.8.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The hardware version number.
+
+What:		/sys/bus/dsa/devices/dsa<m>/cdev_major
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The major number that the character device driver assigned to
+		this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/errors
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The error information for this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/max_batch_size
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The largest number of work descriptors in a batch.
+
+What:		/sys/bus/dsa/devices/dsa<m>/max_work_queues_size
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The maximum work queue size supported by this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/max_engines
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The maximum number of engines supported by this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/max_groups
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The maximum number of groups that can be created under this
+		device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/max_read_buffers
+Date:		Dec 10, 2021
+KernelVersion:	5.17.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The total number of read buffers supported by this device.
+		The read buffers represent resources within the DSA
+		implementation, and these resources are allocated by engines to
+		support operations. See DSA spec v1.2 9.2.4 Total Read Buffers.
+
+What:		/sys/bus/dsa/devices/dsa<m>/max_transfer_size
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The number of bytes to be read from the source address to
+		perform the operation. The maximum transfer size is dependent on
+		the workqueue the descriptor was submitted to.
+
+What:		/sys/bus/dsa/devices/dsa<m>/max_work_queues
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The maximum work queue number that this device supports.
+
+What:		/sys/bus/dsa/devices/dsa<m>/numa_node
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The numa node number for this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/op_cap
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The operation capability bit mask specifies the operation types
+		supported by this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/pasid_enabled
+Date:		Oct 27, 2020
+KernelVersion:	5.11.0
+Contact:	dmaengine@vger.kernel.org
+Description:	To indicate if PASID (process address space identifier) is
+		enabled or not for this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/state
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The state information of this device. It can be either enabled
+		or disabled.
+
+What:		/sys/bus/dsa/devices/dsa<m>/group<m>.<n>
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The assigned group under this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/engine<m>.<n>
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The assigned engine under this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/wq<m>.<n>
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The assigned work queue under this device.
+
+What:		/sys/bus/dsa/devices/dsa<m>/configurable
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	To indicate if this device is configurable or not.
+
+What:		/sys/bus/dsa/devices/dsa<m>/read_buffer_limit
+Date:		Dec 10, 2021
+KernelVersion:	5.17.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The maximum number of read buffers that may be in use at
+		one time by operations that access low bandwidth memory in the
+		device. See DSA spec v1.2 9.2.8 GENCFG on Global Read Buffer Limit.
+
+What:		/sys/bus/dsa/devices/dsa<m>/cmd_status
+Date:		Aug 28, 2020
+KernelVersion:	5.10.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The last executed device administrative command's status/error.
+		The last configuration error is also reported through this
+		attribute. Writing to it will clear the status.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/block_on_fault
+Date:		Oct 27, 2020
+KernelVersion:	5.11.0
+Contact:	dmaengine@vger.kernel.org
+Description:	To indicate whether block on fault is allowed for the work
+		queue to support on demand paging.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/group_id
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The group id that this work queue belongs to.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/size
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The work queue size for this work queue.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/type
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The type of this work queue. It can be "kernel" type for work
+		queue usages in the kernel space or "user" type for work queue
+		usages by applications in user space.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/cdev_minor
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The minor number assigned to this work queue by the character
+		device driver.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/mode
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The work queue mode type for this work queue.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/priority
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The priority value of this work queue. It is a value relative to
+		other work queues in the same group to control quality of service
+		for dispatching work from multiple workqueues in the same group.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/state
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The current state of the work queue.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/threshold
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The number of entries in this work queue that may be filled
+		via a limited portal.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/max_transfer_size
+Date:		Aug 28, 2020
+KernelVersion:	5.10.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The max transfer size for this workqueue. Cannot exceed device
+		max transfer size. Configurable parameter.
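+
+		As an illustration (a sketch only: the wq instance "wq0.0" is
+		hypothetical, the value written must not exceed the device's
+		max_transfer_size, and configuring typically requires the wq
+		to be disabled), the attribute could be set from C::
+
+		  #include <stdio.h>
+
+		  /* Write a new max transfer size for a (disabled) wq. */
+		  int set_wq_max_transfer_size(unsigned long long size)
+		  {
+			FILE *f = fopen("/sys/bus/dsa/devices/wq0.0/max_transfer_size", "w");
+			int ret;
+
+			if (!f)
+				return -1;
+			ret = fprintf(f, "%llu\n", size) < 0 ? -1 : 0;
+			fclose(f);
+			return ret;
+		  }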
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/max_batch_size
+Date:		Aug 28, 2020
+KernelVersion:	5.10.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The max batch size for this workqueue. Cannot exceed device
+		max batch size. Configurable parameter.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/ats_disable
+Date:		Nov 13, 2020
+KernelVersion:	5.11.0
+Contact:	dmaengine@vger.kernel.org
+Description:	Indicate whether ATS disable is turned on for the workqueue.
+		0 indicates ATS is on, and 1 indicates ATS is off for the workqueue.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/occupancy
+Date:		May 25, 2021
+KernelVersion:	5.14.0
+Contact:	dmaengine@vger.kernel.org
+Description:	Show the current number of entries in this WQ if the WQ
+		Occupancy Support bit in WQ capabilities is 1.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/enqcmds_retries
+Date:		Oct 29, 2021
+KernelVersion:	5.17.0
+Contact:	dmaengine@vger.kernel.org
+Description:	Indicate the number of retries for an enqcmds submission on a
+		shared wq. The maximum value that can be set is capped at 64.
+
+What:		/sys/bus/dsa/devices/wq<m>.<n>/driver_name
+Date:		Jan 21, 2022
+KernelVersion:	5.18.0
+Contact:	dmaengine@vger.kernel.org
+Description:	Name of driver to be bound to the wq.
+
+What:		/sys/bus/dsa/devices/engine<m>.<n>/group_id
+Date:		Oct 25, 2019
+KernelVersion:	5.6.0
+Contact:	dmaengine@vger.kernel.org
+Description:	The group that this engine belongs to.
+
+What:		/sys/bus/dsa/devices/group<m>.<n>/use_read_buffer_limit
+Date:		Dec 10, 2021
+KernelVersion:	5.17.0
+Contact:	dmaengine@vger.kernel.org
+Description:	Enable the use of global read buffer limit for the group. See DSA
+		spec v1.2 9.2.18 GRPCFG Use Global Read Buffer Limit.
+
+What:		/sys/bus/dsa/devices/group<m>.<n>/read_buffers_allowed
+Date:		Dec 10, 2021
+KernelVersion:	5.17.0
+Contact:	dmaengine@vger.kernel.org
+Description:	Indicates max number of read buffers that may be in use at one time
+		by all engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read
+		Buffers Allowed.
+
+What:		/sys/bus/dsa/devices/group<m>.<n>/read_buffers_reserved
+Date:		Dec 10, 2021
+KernelVersion:	5.17.0
+Contact:	dmaengine@vger.kernel.org
+Description:	Indicates the number of Read Buffers reserved for the use of
+		engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read Buffers
+		Reserved.
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa
new file mode 100644
index 0000000000000000000000000000000000000000..3c7d132281b03a88346ea4538cacac7f870db783
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa
@@ -0,0 +1,30 @@
+What:		/sys/bus/event_source/devices/dsa*/format
+Date:		April 2021
+KernelVersion:	5.13
+Contact:	Tom Zanussi
+Description:	Read-only. Attribute group to describe the magic bits
+		that go into perf_event_attr.config or
+		perf_event_attr.config1 for the IDXD DSA pmu. (See also
+		ABI/testing/sysfs-bus-event_source-devices-format).
+
+		Each attribute in this group defines a bit range in
+		perf_event_attr.config or perf_event_attr.config1.
+		All supported attributes are listed below (See the
+		IDXD DSA Spec for possible attribute values)::
+
+		  event_category = "config:0-3"    - event category
+		  event          = "config:4-31"   - event ID
+
+		  filter_wq      = "config1:0-31"  - workqueue filter
+		  filter_tc      = "config1:32-39" - traffic class filter
+		  filter_pgsz    = "config1:40-43" - page size filter
+		  filter_sz      = "config1:44-51" - transfer size filter
+		  filter_eng     = "config1:52-59" - engine filter
+
+What:		/sys/bus/event_source/devices/dsa*/cpumask
+Date:		April 2021
+KernelVersion:	5.13
+Contact:	Tom Zanussi
+Description:	Read-only. This file always returns the cpu to which the
+		IDXD DSA pmu is bound for access to all dsa pmu
+		performance monitoring events.
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-uncore b/Documentation/ABI/testing/sysfs-bus-event_source-devices-uncore
new file mode 100644
index 0000000000000000000000000000000000000000..b56e8f019fd4a5b31dc8bcc0610f9bade2cbcb86
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-uncore
@@ -0,0 +1,13 @@
+What:		/sys/bus/event_source/devices/uncore_*/alias
+Date:		June 2021
+KernelVersion:	5.15
+Contact:	Linux kernel mailing list
+Description:	Read-only. An attribute to describe the alias name of
+		the uncore PMU if an alias exists on some platforms.
+		The 'perf(1)' tool should treat both names the same.
+		They both can be used to access the uncore PMU.
+
+		Example:
+
+		$ cat /sys/devices/uncore_cha_2/alias
+		uncore_type_0_2
diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt
new file mode 100644
index 0000000000000000000000000000000000000000..ed4c886a21b1ee1640d21f11e5347fa06a5b95b6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-intel_pmt
@@ -0,0 +1,119 @@
+What:		/sys/class/intel_pmt/
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box
+Description:
+		The intel_pmt/ class directory contains information for
+		devices that expose hardware telemetry using Intel Platform
+		Monitoring Technology (PMT).
+
+What:		/sys/class/intel_pmt/telem<x>
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box
+Description:
+		The telem<x> directory contains files describing an instance of
+		a PMT telemetry device that exposes hardware telemetry. Each
+		telem<x> directory has an associated telem file. This file
+		may be opened and mapped or read to access the telemetry space
+		of the device. The register layout of the telemetry space is
+		determined from an XML file that matches the PCI device id and
+		GUID for the device.
+
+What:		/sys/class/intel_pmt/telem<x>/telem
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box
+Description:
+		(RO) The telemetry data for this telemetry device. This file
+		may be mapped or read to obtain the data.
+
+What:		/sys/class/intel_pmt/telem<x>/guid
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box
+Description:
+		(RO) The GUID for this telemetry device. The GUID identifies
+		the version of the XML file for the parent device that is to
+		be used to get the register layout.
+
+What:		/sys/class/intel_pmt/telem<x>/size
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box
+Description:
+		(RO) The size of telemetry region in bytes that corresponds to
+		the mapping size for the telem file.
+
+What:		/sys/class/intel_pmt/telem<x>/offset
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box
+Description:
+		(RO) The offset of telemetry region in bytes that corresponds to
+		the mapping for the telem file.
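+
+		As an illustration (a sketch only: the instance "telem1" is
+		hypothetical and error handling is abbreviated), the telemetry
+		space could be mapped read-only, with the length taken from
+		the size attribute above::
+
+		  #include <fcntl.h>
+		  #include <sys/mman.h>
+		  #include <unistd.h>
+
+		  /* Map the telemetry space of a PMT telem device. */
+		  void *map_telem(size_t size)
+		  {
+			int fd = open("/sys/class/intel_pmt/telem1/telem",
+				      O_RDONLY);
+			void *p;
+
+			if (fd < 0)
+				return NULL;
+			p = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
+			close(fd);	/* the mapping stays valid */
+			return p == MAP_FAILED ? NULL : p;
+		  }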
+
+What:		/sys/class/intel_pmt/crashlog<x>
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck
+Description:
+		The crashlog<x> directory contains files for configuring an
+		instance of a PMT crashlog device that can perform crash data
+		recording. Each crashlog<x> device has an associated crashlog
+		file. This file can be opened and mapped or read to access the
+		resulting crashlog buffer. The register layout for the buffer
+		can be determined from an XML file of specified GUID for the
+		parent device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/crashlog
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	David Box
+Description:
+		(RO) The crashlog buffer for this crashlog device. This file
+		may be mapped or read to obtain the data.
+
+What:		/sys/class/intel_pmt/crashlog<x>/guid
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck
+Description:
+		(RO) The GUID for this crashlog device. The GUID identifies the
+		version of the XML file for the parent device that should be
+		used to determine the register layout.
+
+What:		/sys/class/intel_pmt/crashlog<x>/size
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck
+Description:
+		(RO) The length of the result buffer in bytes that corresponds
+		to the size for the crashlog buffer.
+
+What:		/sys/class/intel_pmt/crashlog<x>/offset
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck
+Description:
+		(RO) The offset of the buffer in bytes that corresponds
+		to the mapping for the crashlog device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/enable
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck
+Description:
+		(RW) Boolean value controlling if the crashlog functionality
+		is enabled for the crashlog device.
+
+What:		/sys/class/intel_pmt/crashlog<x>/trigger
+Date:		October 2020
+KernelVersion:	5.10
+Contact:	Alexander Duyck
+Description:
+		(RW) Boolean value controlling the triggering of the crashlog
+		device node. When read it reports whether the crashlog has
+		been triggered. When written to it can be used to either clear
+		the current trigger by writing false, or to trigger a new
+		event if the trigger is not currently set.
diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping
new file mode 100644
index 0000000000000000000000000000000000000000..490ccfd67f125a005cc411bda7496018938691e3
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-mapping
@@ -0,0 +1,33 @@
+What:		/sys/devices/uncore_iio_x/dieX
+Date:		February 2020
+Contact:	Roman Sudarikov
+Description:
+		Each IIO stack (PCIe root port) has its own IIO PMON block, so
+		each dieX file (where X is die number) holds "Segment:Root Bus"
+		for PCIe root port, which can be monitored by that IIO PMON
+		block.
+		For example, on a 4-die Xeon platform with up to 6 IIO stacks per
+		die and, therefore, 6 IIO PMON blocks per die, the mapping of
+		IIO PMON block 0 is exposed as follows:
+
+		$ ls /sys/devices/uncore_iio_0/die*
+		-r--r--r-- /sys/devices/uncore_iio_0/die0
+		-r--r--r-- /sys/devices/uncore_iio_0/die1
+		-r--r--r-- /sys/devices/uncore_iio_0/die2
+		-r--r--r-- /sys/devices/uncore_iio_0/die3
+
+		$ tail /sys/devices/uncore_iio_0/die*
+		==> /sys/devices/uncore_iio_0/die0 <==
+		0000:00
+		==> /sys/devices/uncore_iio_0/die1 <==
+		0000:40
+		==> /sys/devices/uncore_iio_0/die2 <==
+		0000:80
+		==> /sys/devices/uncore_iio_0/die3 <==
+		0000:c0
+
+		Which means:
+		IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000
+		IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
+		IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
+		IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
index 017f5bc3920cefc718865af5e5605ee62d855b3b..b15af6a5bc08145206553a5faa5270ed586773c2 100644
--- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups
+++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
@@ -25,11 +25,41 @@ Description:    /sys/kernel/iommu_groups/reserved_regions list IOVA
 		the base IOVA, the second is the end IOVA and the third
 		field describes the type of the region.
 
-What:		/sys/kernel/iommu_groups/reserved_regions
-Date:		June 2019
-KernelVersion:	v5.3
-Contact:	Eric Auger
-Description:	In case an RMRR is used only by graphics or USB devices
-		it is now exposed as "direct-relaxable" instead of "direct".
-		In device assignment use case, for instance, those RMRR
-		are considered to be relaxable and safe.
+		Since kernel 5.3, in case an RMRR is used only by graphics or
+		USB devices it is now exposed as "direct-relaxable" instead
+		of "direct". In device assignment use case, for instance,
+		those RMRR are considered to be relaxable and safe.
+
+What:		/sys/kernel/iommu_groups/<grp_id>/type
+Date:		November 2020
+KernelVersion:	v5.11
+Contact:	Sai Praneeth Prakhya
+Description:	/sys/kernel/iommu_groups/<grp_id>/type shows the type of default
+		domain in use by the iommu for this group. See include/linux/iommu.h
+		for possible read values. A privileged user could request the kernel
+		to change the group type by writing to this file. Valid write values:
+
+		======== ======================================================
+		DMA      All the DMA transactions from the device in this group
+		         are translated by the iommu.
+		DMA-FQ   As above, but using batched invalidation to lazily
+		         remove translations after use. This may offer reduced
+		         overhead at the cost of reduced memory protection.
+		identity All the DMA transactions from the device in this group
+		         are not translated by the iommu. Maximum performance
+		         but zero protection.
+		auto     Change to the type the device was booted with.
+		======== ======================================================
+
+		The default domain type of a group may be modified only when
+
+		- The group has only one device.
+		- The device in the group is not bound to any device driver.
+		  So, the users must unbind the appropriate driver before
+		  changing the default domain type.
+
+		Unbinding a device driver will take away the driver's control
+		over the device and if done on devices that host root file
+		system could lead to catastrophic effects (the users might
+		need to reboot the machine to get it to normal state).
So, it's + expected that the users understand what they're doing. diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 2d8d2fed731720b130248caccf8badc22b5ffec2..f41620439ef349b0592d177052b7d6fa85fcf152 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -204,6 +204,14 @@ Returns the maximum size of a mapping for the device. The size parameter of the mapping functions like dma_map_single(), dma_map_page() and others should not be larger than the returned value. +:: + + bool + dma_need_sync(struct device *dev, dma_addr_t dma_addr); + +Returns %true if dma_sync_single_for_{device,cpu} calls are required to +transfer memory ownership. Returns %false if those calls can be skipped. + :: unsigned long diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index 8f8d97f65d7375a9fc87f4bad9f4a6c8033e8bba..6ec8de3fde830047412691cd76c3e2629fcfdce9 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -43,14 +43,6 @@ Since it is optional for platforms to implement DMA_ATTR_WRITE_COMBINE, those that do not will simply ignore the attribute and exhibit default behavior. -DMA_ATTR_NON_CONSISTENT ------------------------ - -DMA_ATTR_NON_CONSISTENT lets the platform to choose to return either -consistent or non-consistent memory as it sees fit. By using this API, -you are guaranteeing to the platform that you have all the correct and -necessary sync points for this memory in the driver. - DMA_ATTR_NO_KERNEL_MAPPING -------------------------- diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst index 994cbb660adef4150c1ef5e8b378ee2c3d280c16..43e45f46af371dc2146632b4cf6fa5a242fa3d6a 100644 --- a/Documentation/PCI/msi-howto.rst +++ b/Documentation/PCI/msi-howto.rst @@ -164,6 +164,26 @@ the driver can specify that only MSI or MSI-X is acceptable:: if (nvec < 0) goto out_err; +To request additional MSI-X vectors after the probe phase, the +pci_add_msix_irq_vector() API can be used:: + + irq = pci_add_msix_irq_vectors(struct pci_dev *dev); + if (irq < 0) + goto out_err; + +Each time this API is called, one MSI-X vector gets added to the device. This +API should be called after pci_alloc_irq_vectors has been called by the driver. + +This API returns the device-relative interrupt vector index (0-based) which can +be passed to pci_irq_vector() to retrieve the corresponding Linux IRQ number. + +To free the allocated resources associated with a particular MSI-X vector, the +pci_free_msix_irq_vector() API can be used:: + + void pci_free_msix_irq_vector(struct pci_dev *dev, unsigned int irq) + +Here, 'irq' refers to the Linux IRQ number. + Legacy APIs ----------- diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst index 6864f9a70f5f0b119b91ef6b9b4b7806cd4116d5..0070a28dd4fd20ab8aa7a1167b14a54bfff598c3 100644 --- a/Documentation/PCI/pci.rst +++ b/Documentation/PCI/pci.rst @@ -103,6 +103,7 @@ need pass only as many optional fields as necessary: - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF) - class and classmask fields default to 0 - driver_data defaults to 0UL. + - override_only field defaults to 0. Note that driver_data must match the value used by any of the pci_device_id entries defined in the driver. 
This makes the driver_data field mandatory diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst index 18bdefaafd1a89a42ed70a1740ca44b4bfd86a4f..0b36b9ebfa4b4296ec1a4fc55ac9b443a7bdea3c 100644 --- a/Documentation/PCI/pcieaer-howto.rst +++ b/Documentation/PCI/pcieaer-howto.rst @@ -156,12 +156,6 @@ default reset_link function, but different upstream ports might have different specifications to reset pci express link, so all upstream ports should provide their own reset_link functions. -In struct pcie_port_service_driver, a new pointer, reset_link, is -added. -:: - - pci_ers_result_t (*reset_link) (struct pci_dev *dev); - Section 3.2.2.2 provides more detailed info on when to call reset_link. @@ -212,15 +206,10 @@ error_detected(dev, pci_channel_io_frozen) to all drivers within a hierarchy in question. Then, performing link reset at upstream is necessary. As different kinds of devices might use different approaches to reset link, AER port service driver is required to provide the -function to reset link. Firstly, kernel looks for if the upstream -component has an aer driver. If it has, kernel uses the reset_link -callback of the aer driver. If the upstream component has no aer driver -and the port is downstream port, we will perform a hot reset as the -default by setting the Secondary Bus Reset bit of the Bridge Control -register associated with the downstream port. As for upstream ports, -they should provide their own aer service drivers with reset_link -function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and -reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes +function to reset link via callback parameter of pcie_do_recovery() +function. If reset_link is not NULL, recovery function will use it +to reset the link. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER +and reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes to mmio_enabled. helper functions @@ -243,9 +232,9 @@ messages to root port when an error is detected. :: - int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);` + int pci_aer_clear_nonfatal_status(struct pci_dev *dev);` -pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable +pci_aer_clear_nonfatal_status clears non-fatal errors in the uncorrectable error status register. Frequent Asked Questions diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst index 10bf48bae0b045d54d9d3d5cc9ab4399a7b391f6..9498e09f6c4e2ee84154dbdd1f50e9d680c9a8f5 100644 --- a/Documentation/admin-guide/cgroup-v1/index.rst +++ b/Documentation/admin-guide/cgroup-v1/index.rst @@ -13,6 +13,7 @@ Control Groups version 1 devices freezer-subsystem hugetlb + ioasids memcg_test memory net_cls diff --git a/Documentation/admin-guide/cgroup-v1/ioasids.rst b/Documentation/admin-guide/cgroup-v1/ioasids.rst new file mode 100644 index 0000000000000000000000000000000000000000..b30eb41bf1bed6c85344ec961fcfb592c90b961a --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/ioasids.rst @@ -0,0 +1,110 @@ +======================================== +I/O Address Space ID (IOASID) Controller +======================================== + +Acronyms +-------- +PASID: + Process Address Space ID, defined by PCIe +SVA: + Shared Virtual Address + +Introduction +------------ + +IOASIDs are used to associate DMA requests with virtual address spaces. As +a system-wide limited¹ resource, its constraints are managed by the IOASIDs +cgroup subsystem. The specific use cases are: + +1. 
Some user applications exhaust all the available IOASIDs thus depriving
+   others on the same host.
+
+2. System admins need to provision VMs based on their needs for IOASIDs,
+   e.g. the number of VMs with assigned devices that perform DMA requests
+   with PASID.
+
+The IOASID subsystem consists of three components:
+
+- IOASID core: provides APIs for allocation, pool management,
+  notifications and refcounting. See Documentation/driver-api/ioasid.rst
+  for details.
+- IOASID user: provides the user allocation interface via /dev/ioasid
+- IOASID cgroup controller: manages resource distribution
+
+Resource Distribution Model
+---------------------------
+IOASID allocation is process-based in that IOASIDs are tied to page tables²;
+the threaded model is not supported. The allocation is rejected by the
+cgroup hierarchy once a limit is reached. However, organizational changes
+such as moving processes across cgroups are exempted. Therefore, it is
+possible to have ioasids.current > ioasids.max. It is not possible to do
+further allocation after an organizational change that exceeds the max.
+
+The system capacity of IOASIDs defaults to the PCIe PASID size of 20 bits.
+The IOASID core provides an API to adjust the system capacity based on the
+platform. IOASIDs are used by both user applications (e.g. VMs and userspace
+drivers) and the kernel (e.g. supervisor SVA). However, only user allocation
+is subject to cgroup constraints. The host kernel allocates a pool of IOASIDs
+whose quota is subtracted from the system capacity. The IOASIDs cgroup
+consults the IOASID core for available capacity when a new cgroup limit is
+granted. Upon creation, no IOASID allocation is allowed by the user processes
+within the new cgroup.
+
+Usage
+-----
+The cgroup filesystem has the following IOASIDs controller specific entries:
+::
+
+    ioasids.current
+    ioasids.events
+    ioasids.max
+
+To use the IOASIDs controller, set ioasids.max to the limit of the number
+of IOASIDs that can be allocated. The file ioasids.current shows the current
+number of IOASIDs allocated within the cgroup.
+
+Example
+--------
+1. Mount the cgroup2 FS ::
+
+	$ mount -t cgroup2 none /mnt/cg2/
+
+2. Add the ioasids controller ::
+
+	$ echo '+ioasids' > /mnt/cg2/cgroup.subtree_control
+
+3. Create a hierarchy, set a non-zero limit (default 0) ::
+
+	$ mkdir /mnt/cg2/test1
+	$ echo 5 > /mnt/cg2/test1/ioasids.max
+
+4. Allocating IOASIDs within the limit should succeed ::
+
+	$ echo $$ > /mnt/cg2/test1/cgroup.procs
+	Do IOASID allocation via /dev/ioasid
+	ioasids.current:1
+	ioasids.max:5
+
+5. Attempting to allocate IOASIDs beyond the limit should fail ::
+
+	ioasids.current:5
+	ioasids.max:5
+
+6. Attaching a process that already has IOASIDs allocated to a cgroup can
+result in ioasids.current > ioasids.max. E.g., a process with PID 1234 under
+a cgroup with the IOASIDs controller has one IOASID allocated; moving it to
+the test1 cgroup ::
+
+	$ echo 1234 > /mnt/cg2/test1/cgroup.procs
+	ioasids.current:6
+	ioasids.max:5
+
+Notes
+-----
+¹ When an IOASID is used for PCI Express PASID, the range is limited to the
+PASID size of 20 bits. For a device whose resources can be shared across
+the platform, the IOASID namespace must be system-wide in order to uniquely
+identify a DMA request with PASID inside the device.
+
+² The primary use case is SVA, where CPU page tables are shared with DMA via
+the IOMMU.
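+
+Illustrative allocation flow
+----------------------------
+A C sketch of the allocation step from the example above (illustrative
+only: the exact ioctl arguments are defined by include/uapi/linux/ioasid.h
+and Documentation/userspace-api/ioasid.rst; the range argument is elided
+here)::
+
+	#include <fcntl.h>
+	#include <sys/ioctl.h>
+
+	/*
+	 * Returns the allocated IOASID, or a negative value on error.
+	 * The allocation is charged against this process's cgroup and
+	 * is rejected once ioasids.current reaches ioasids.max. The fd
+	 * is kept open: the handle is per-process and allocations are
+	 * tied to it.
+	 */
+	int alloc_one_ioasid(void)
+	{
+		int fd = open("/dev/ioasid", O_RDWR);
+
+		if (fd < 0)
+			return -1;
+		return ioctl(fd, IOASID_REQUEST_ALLOC /* , &range */);
+	}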
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 47eb94d8d011413a1d94b84123d03dfd1a513392..87de86346d319f5775c28e3c60392dd60243f334 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -290,10 +290,7 @@ amd_iommu= [HW,X86-64] Pass parameters to the AMD IOMMU driver in the system. Possible values are: - fullflush - enable flushing of IO/TLB entries when - they are unmapped. Otherwise they are - flushed before they will be reused, which - is a lot of faster + fullflush - Deprecated, equivalent to iommu.strict=1 off - do not initialize any AMD IOMMU found in the system force_isolation - Force device isolation for all @@ -567,6 +564,12 @@ loops can be debugged more effectively on production systems. + clocksource.max_cswd_read_retries= [KNL] + Number of clocksource_watchdog() retries due to + external delays before the clock will be marked + unstable. Defaults to two retries, that is, + three attempts to read the clock under test. + clearcpuid=BITNUM[,BITNUM...] [X86] Disable CPUID feature X for the kernel. See arch/x86/include/asm/cpufeatures.h for the valid bit @@ -879,12 +882,6 @@ causing system reset or hang due to sending INIT from AP to BSP. - perf_v4_pmi= [X86,INTEL] - Format: - Disable Intel PMU counter freezing feature. - The feature only exists starting from - Arch Perfmon v4 (Skylake and newer). - disable_ddw [PPC/PSERIES] Disable Dynamic DMA Window support. Use this if to workaround buggy firmware. @@ -1172,7 +1169,8 @@ Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map", "nochunk", "noruntime", "debug" } + Format: { "old_map", "nochunk", "noruntime", "debug", + "nosoftreserve" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. @@ -1181,6 +1179,12 @@ firmware implementations. noruntime : disable EFI runtime services support debug: enable misc debug output + nosoftreserve: The EFI_MEMORY_SP (Specific Purpose) + attribute may cause the kernel to reserve the + memory range for a memory mapping driver to + claim. Specify efi=nosoftreserve to disable this + reservation and treat the memory by its base type + (i.e. EFI_CONVENTIONAL_MEMORY / "System RAM"). efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of @@ -1193,15 +1197,21 @@ updating original EFI memory map. Region of memory which aa attribute is added to is from ss to ss+nn. + If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000 is specified, EFI_MEMORY_MORE_RELIABLE(0x10000) attribute is added to range 0x100000000-0x180000000 and 0x10a0000000-0x1120000000. + If efi_fake_mem=8G@9G:0x40000 is specified, the + EFI_MEMORY_SP(0x40000) attribute is added to + range 0x240000000-0x43fffffff. + Using this parameter you can do debugging of EFI memmap - related feature. For example, you can do debugging of + related features. For example, you can do debugging of Address Range Mirroring feature even if your box - doesn't support it. + doesn't support it, or mark specific memory as + "soft reserved". efivar_ssdt= [EFI; X86] Name of an EFI variable that contains an SSDT that is to be dynamically loaded by Linux. If there are @@ -1543,6 +1553,17 @@ In such case C2/C3 won't be used again. idle=nomwait: Disable mwait for CPU C-states + idxd.sva= [HW] + Format: + Allow force disabling of Shared Virtual Memory (SVA) + support for the idxd driver. By default it is set to + true (1). 
+ + idxd.tc_override= [HW] + Format: + Allow override of default traffic class configuration + for the device. By default it is set to false (0). + ieee754= [MIPS] Select IEEE Std 754 conformance mode Format: { strict | legacy | 2008 | relaxed } Default: strict @@ -1717,26 +1738,18 @@ bypassed by not enabling DMAR with this option. In this case, gfx device will use physical address for DMA. - forcedac [x86_64] - With this option iommu will not optimize to look - for io virtual address below 32-bit forcing dual - address cycle on pci bus for cards supporting greater - than 32-bit addressing. The default is to look - for translation below 32-bit and if not available - then look in the higher range. strict [Default Off] - With this option on every unmap_single operation will - result in a hardware IOTLB flush operation as opposed - to batching them for performance. + Deprecated, equivalent to iommu.strict=1. sp_off [Default Off] By default, super page will be supported if Intel IOMMU has the capability. With this option, super page will not be supported. - sm_on [Default Off] - By default, scalable mode will be disabled even if the - hardware advertises that it has support for the scalable - mode translation. With this option set, scalable mode - will be used on hardware which claims to support it. + sm_on + Enable the Intel IOMMU scalable mode if the hardware + advertises that it has support for the scalable mode + translation. + sm_off + Disallow use of the Intel IOMMU scalable mode. tboot_noforce [Default Off] Do not force the Intel IOMMU enabled under tboot. By default, tboot will force Intel IOMMU on, which @@ -1746,11 +1759,6 @@ Note that using this option lowers the security provided by tboot because it makes the system vulnerable to DMA attacks. - nobounce [Default off] - Disable bounce buffer for unstrusted devices such as - the Thunderbolt devices. This will treat the untrusted - devices as the trusted ones, hence might expose security - risks of DMA attacks. intel_idle.max_cstate= [KNL,HW,ACPI,X86] 0 disables intel_idle and fall back on acpi_idle. @@ -1817,6 +1825,14 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. + iommu.forcedac= [ARM64, X86] Control IOVA allocation for PCI devices. + Format: { "0" | "1" } + 0 - Try to allocate a 32-bit DMA address first, before + falling back to the full range if needed. + 1 - Allocate directly from the full usable range, + forcing Dual Address Cycle for PCI cards supporting + greater than 32-bit addressing. + iommu.strict= [ARM64] Configure TLB invalidation behaviour Format: { "0" | "1" } 0 - Lazy mode. @@ -1825,10 +1841,12 @@ throughput at the cost of reduced device isolation. Will fall back to strict mode if not supported by the relevant IOMMU driver. - 1 - Strict mode (default). + 1 - Strict mode. DMA unmap operations invalidate IOMMU hardware TLBs synchronously. - + unset - Use value of CONFIG_IOMMU_DEFAULT_DMA_{LAZY,STRICT}. + Note: on x86, strict mode specified via one of the + legacy driver-specific options takes precedence. iommu.passthrough= [ARM64, X86] Configure DMA to bypass the IOMMU by default. Format: { "0" | "1" } @@ -1836,6 +1854,14 @@ 1 - Bypass the IOMMU for DMA. unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH. + iommu.prq_timeout= + Timeout in seconds to wait for page response + of a pending page request. 
+ Format: + Default: 10 + 0 - no timeout tracking + 1 to 100 - allowed range + io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. @@ -4620,6 +4646,46 @@ qspinlock.numa_spinlock_threshold_ns= [NUMA, PV_OPS] off: Disable mitigation and remove performance impact to RDRAND and RDSEED + split_lock_detect= + [X86] Enable split lock detection or bus lock detection + + When enabled (and if hardware support is present), atomic + instructions that access data across cache line + boundaries will result in an alignment check exception + for split lock detection or a debug exception for + bus lock detection. + + off - not enabled + + warn - the kernel will emit rate-limited warnings + about applications triggering the #AC + exception or the #DB exception. This mode is + the default on CPUs that support split lock + detection or bus lock detection. Default + behavior is by #AC if both features are + enabled in hardware. + + fatal - the kernel will send SIGBUS to applications + that trigger the #AC exception or the #DB + exception. Default behavior is by #AC if + both features are enabled in hardware. + + ratelimit:N - + Set system wide rate limit to N bus locks + per second for bus lock detection. + 0 < N <= 1000. + + N/A for split lock detection. + + + If an #AC exception is hit in the kernel or in + firmware (i.e. not while executing in user mode) + the kernel will oops in either "warn" or "fatal" + mode. + + #DB exception for bus lock is triggered only when + CPL > 0. + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the @@ -4685,6 +4751,15 @@ qspinlock.numa_spinlock_threshold_ns= [NUMA, PV_OPS] stifb= [HW] Format: bpp:[:[:...]] + strict_sas_size= + [X86] + Format: + Enable or disable strict sigaltstack size checks + against the required signal frame size which + depends on the supported FPU features. This can + be used to filter out binaries which have + not yet been made aware of AT_MINSIGSTKSZ. + sunrpc.min_resvport= sunrpc.max_resvport= [NFS,SUNRPC] diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 311cd7cc2b75854f337e54e7a6db2b1bc98cd31d..6a06dc473dd68518aa56144fb402b4d55daf4bc2 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -632,16 +632,16 @@ class priority list and destroyed. If that happens, the priority list mechanism will be used, again, to determine the new effective value for the whole list and that value will become the new real constraint. -In turn, for each CPU there is only one resume latency PM QoS request -associated with the :file:`power/pm_qos_resume_latency_us` file under +In turn, for each CPU there is one resume latency PM QoS request associated with +the :file:`power/pm_qos_resume_latency_us` file under :file:`/sys/devices/system/cpu/cpu/` in ``sysfs`` and writing to it causes this single PM QoS request to be updated regardless of which user space process does that. In other words, this PM QoS request is shared by the entire user space, so access to the file associated with it needs to be arbitrated to avoid confusion. [Arguably, the only legitimate use of this mechanism in practice is to pin a process to the CPU in question and let it use the -``sysfs`` interface to control the resume latency constraint for it.] It -still only is a request, however. 
It is a member of a priority list used to
+``sysfs`` interface to control the resume latency constraint for it.] It is
+still only a request, however. It is an entry in a priority list used to
 determine the effective value to be set as the resume latency constraint for
 the CPU in question every time the list of requests is updated this way or
 another (there may be other requests coming from kernel code in that list).
diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst
index afbf778035f820656fe8cfa3354c0fab71a1342f..89309e1b0e484d3312865f2ea97f6301d1c1cefd 100644
--- a/Documentation/admin-guide/pm/intel_idle.rst
+++ b/Documentation/admin-guide/pm/intel_idle.rst
@@ -60,6 +60,9 @@ of the system. The former are always used if the processor model at hand is
 recognized by ``intel_idle`` and the latter are used if that is required for
 the given processor model (which is the case for all server processor models
 recognized by ``intel_idle``) or if the processor model is not recognized.
+[There is a module parameter that can be used to make the driver use the ACPI
+tables with any processor model recognized by it; see below.]
 
 If the ACPI tables are going to be used for building the list of available
 idle states, ``intel_idle`` first looks for a ``_CST`` object under one of the
 ACPI
@@ -165,7 +168,7 @@ and ``idle=nomwait``. If any of them is present in the kernel command line, the
 ``MWAIT`` instruction is not allowed to be used, so the initialization of
 ``intel_idle`` will fail.
 
-Apart from that there are two module parameters recognized by ``intel_idle``
+Apart from that there are four module parameters recognized by ``intel_idle``
 itself that can be set via the kernel command line (they cannot be updated via
 sysfs, so that is the only way to change their values).
 
@@ -186,9 +189,28 @@ QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states
 even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`).
 Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail.
 
-The ``noacpi`` module parameter (which is recognized by ``intel_idle`` if the
-kernel has been configured with ACPI support), can be set to make the driver
-ignore the system's ACPI tables entirely (it is unset by default).
+The ``no_acpi`` and ``use_acpi`` module parameters (recognized by ``intel_idle``
+if the kernel has been configured with ACPI support) can be set to make the
+driver ignore the system's ACPI tables entirely or use them for all of the
+recognized processor models, respectively (they both are unset by default and
+``use_acpi`` has no effect if ``no_acpi`` is set).
+
+The value of the ``states_off`` module parameter (0 by default) represents a
+list of idle states to be disabled by default in the form of a bitmask.
+
+Namely, the positions of the bits that are set in the ``states_off`` value are
+the indices of idle states to be disabled by default (as reflected by the names
+of the corresponding idle state directories in ``sysfs``, :file:`state0`,
+:file:`state1` ... :file:`state<i>` ..., where ``<i>`` is the index of the given
+idle state; see :ref:`idle-states-representation` in :doc:`cpuidle`).
+
+For example, if ``states_off`` is equal to 3, the driver will disable idle
+states 0 and 1 by default, and if it is equal to 8, idle state 3 will be
+disabled by default and so on (bit positions beyond the maximum idle state
+index are ignored).
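+
+In C terms, a mask that disables, say, idle states 1 and 3 can be composed
+as in the illustrative snippet below; the parameter itself is just the
+resulting integer, passed as ``intel_idle.states_off=10`` on the kernel
+command line::
+
+	/* bit <i> set => idle state <i> disabled by default */
+	unsigned int states_off = (1U << 1) | (1U << 3);	/* == 10 */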
+
+The idle states disabled this way can be enabled (on a per-CPU basis) from user
+space via ``sysfs``.
 
 .. _intel-idle-core-and-package-idle-states:
diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst
index 6b38a39fab24c5c7b901fa4a5ac6d2c93d8f37e0..a534cc7ebd056fdda25ae87389141be6bcd7f55d 100644
--- a/Documentation/core-api/genalloc.rst
+++ b/Documentation/core-api/genalloc.rst
@@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future.
    :functions: gen_pool_for_each_chunk
 
 .. kernel-doc:: lib/genalloc.c
-   :functions: addr_in_gen_pool
+   :functions: gen_pool_has_addr
 
 .. kernel-doc:: lib/genalloc.c
    :functions: gen_pool_avail
diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst
index dfc4486b5743c127fa98d8b6dc0a9e4b9659d562..ceec2374add6188822aacf8a828fa5d9694313a7 100644
--- a/Documentation/driver-api/dmaengine/provider.rst
+++ b/Documentation/driver-api/dmaengine/provider.rst
@@ -162,6 +162,29 @@ Currently, the types available are:
 
   - The device is able to do memory to memory copies
 
+- - DMA_MEMCPY_SG
+
+  - The device supports memory to memory scatter-gather transfers.
+
+  - Even though a plain memcpy can look like a particular case of a
+    scatter-gather transfer, with a single chunk to copy, it's a distinct
+    transaction type in the mem2mem transfer case. This is because some very
+    simple devices might be able to do contiguous single-chunk memory copies,
+    but have no support for more complex SG transfers.
+
+  - No matter what the overall size of the combined chunks for source and
+    destination is, only as many bytes as the smallest of the two will be
+    transmitted. That means the number and size of the scatter-gather buffers in
+    both lists need not be the same, and that the operation functionally is
+    equivalent to a ``strncpy`` where the ``count`` argument equals the smallest
+    total size of the two scatter-gather list buffers.
+
+  - It's usually used for copying pixel data between host memory and
+    memory-mapped GPU device memory, such as found on modern PCI video graphics
+    cards. The most immediate example is the OpenGL API function
+    ``glReadPixels()``, which might require a verbatim copy of a huge framebuffer
+    from local device memory onto host memory.
+
 - DMA_XOR
 
   - The device is able to perform XOR operations on memory areas
@@ -305,7 +328,9 @@ supported.
 
   - tx_submit: A pointer to a function you have to implement,
     that is supposed to push the current transaction descriptor to a
-    pending queue, waiting for issue_pending to be called.
+    pending queue, waiting for issue_pending to be called. Each
+    descriptor is given a cookie to identify it. See the section
+    "Cookie Management" below.
 
   - In this structure the function pointer callback_result can be
     initialized in order for the submitter to be notified that a
@@ -407,6 +432,40 @@ supported.
 
   - May sleep.
 
+Cookie Management
+-----------------
+
+When a transaction is queued for submission via tx_submit(), the provider
+must assign that transaction a cookie (dma_cookie_t) to uniquely identify it.
+The provider is allowed to perform this assignment however it wants, but for
+convenience the following utility functions are available to create
+monotonically increasing cookies:
+
+  .. code-block:: c
+
+    void dma_cookie_init(struct dma_chan *chan);
+
+  Called once at channel creation
+
+  .. code-block:: c
+
+    dma_cookie_t dma_cookie_assign(struct dma_async_tx_descriptor *tx);
+
+  Assign a cookie to the given descriptor
+
+  .. code-block:: c
+
+    void dma_cookie_complete(struct dma_async_tx_descriptor *tx);
+
+  Mark the descriptor as complete and invalidate the cookie
+
+  .. code-block:: c
+
+    enum dma_status dma_cookie_status(struct dma_chan *chan,
+        dma_cookie_t cookie, struct dma_tx_state *state);
+
+  Report the status of the cookie, filling in state if it is not NULL.
+
 Misc notes
 ==========
diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index 25eb7d5b834ba300a9b9ca3ccbc4828213137fda..9f26079cacae35e7aa31ce0c53a0648019bf5707 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -93,20 +93,18 @@ interfaces:
 Registration Interface for a Mediated Bus Driver
 ------------------------------------------------
 
-The registration interface for a mediated bus driver provides the following
+The registration interface for a mediated device driver provides the following
 structure to represent a mediated device's driver::
 
 	/*
 	 * struct mdev_driver [2] - Mediated device's driver
-	 * @name: driver name
 	 * @probe: called when new device created
 	 * @remove: called when device removed
 	 * @driver: device driver structure
 	 */
 	struct mdev_driver {
-		const char *name;
-		int  (*probe)  (struct device *dev);
-		void (*remove) (struct device *dev);
+		int  (*probe)  (struct mdev_device *dev);
+		void (*remove) (struct mdev_device *dev);
 		struct device_driver driver;
 	};
 
@@ -115,8 +113,7 @@ to register and unregister itself with the core driver:
 
 * Register::
 
-    extern int  mdev_register_driver(struct mdev_driver *drv,
-				     struct module *owner);
+    extern int mdev_register_driver(struct mdev_driver *drv);
 
 * Unregister::
 
@@ -139,37 +136,26 @@ The structures in the mdev_parent_ops structure are as follows:
 * dev_attr_groups: attributes of the parent device
 * mdev_attr_groups: attributes of the mediated device
 * supported_config: attributes to define supported configurations
+* device_driver: device driver to bind for mediated device instances
 
-The functions in the mdev_parent_ops structure are as follows:
+The mdev_parent_ops also still has various function pointers. These exist
+for historical reasons only and shall not be used for new drivers.
 
-* create: allocate basic resources in a driver for a mediated device
-* remove: free resources in a driver when a mediated device is destroyed
-
-(Note that mdev-core provides no implicit serialization of create/remove
-callbacks per mdev parent device, per mdev type, or any other categorization.
-Vendor drivers are expected to be fully asynchronous in this respect or
-provide their own internal resource protection.)
- -The callbacks in the mdev_parent_ops structure are as follows: - -* open: open callback of mediated device -* close: close callback of mediated device -* ioctl: ioctl callback of mediated device -* read : read emulation callback -* write: write emulation callback -* mmap: mmap emulation callback - -A driver should use the mdev_parent_ops structure in the function call to -register itself with the mdev core driver:: +When a driver wants to add the GUID creation sysfs to an existing device it has +probe'd to then it should call:: extern int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops); -However, the mdev_parent_ops structure is not required in the function call -that a driver should use to unregister itself with the mdev core driver:: +This will provide the 'mdev_supported_types/XX/create' files which can then be +used to trigger the creation of a mdev_device. The created mdev_device will be +attached to the specified driver. + +When the driver needs to remove itself it calls:: extern void mdev_unregister_device(struct device *dev); +Which will unbind and destroy all the created mdevs and remove the sysfs files. Mediated Device Management Interface Through sysfs ================================================== diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index f1a4d3c3ba0bb140603c23eeb9da35856703969a..d8265ec8cca624117cf1db8d57e1611feabd8f20 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -249,35 +249,43 @@ VFIO bus driver API VFIO bus drivers, such as vfio-pci make use of only a few interfaces into VFIO core. When devices are bound and unbound to the driver, -the driver should call vfio_add_group_dev() and vfio_del_group_dev() -respectively:: - - extern int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, - void *device_data); - - extern void *vfio_del_group_dev(struct device *dev); - -vfio_add_group_dev() indicates to the core to begin tracking the -iommu_group of the specified dev and register the dev as owned by -a VFIO bus driver. The driver provides an ops structure for callbacks +the driver should call vfio_register_group_dev() and +vfio_unregister_group_dev() respectively:: + + void vfio_init_group_dev(struct vfio_device *device, + struct device *dev, + const struct vfio_device_ops *ops); + void vfio_uninit_group_dev(struct vfio_device *device); + int vfio_register_group_dev(struct vfio_device *device); + void vfio_unregister_group_dev(struct vfio_device *device); + +The driver should embed the vfio_device in its own structure and call +vfio_init_group_dev() to pre-configure it before going to registration +and call vfio_uninit_group_dev() after completing the un-registration. +vfio_register_group_dev() indicates to the core to begin tracking the +iommu_group of the specified dev and register the dev as owned by a VFIO bus +driver. Once vfio_register_group_dev() returns it is possible for userspace to +start accessing the driver, thus the driver should ensure it is completely +ready before calling it. 
The driver provides an ops structure for callbacks similar to a file operations structure:: struct vfio_device_ops { - int (*open)(void *device_data); - void (*release)(void *device_data); - ssize_t (*read)(void *device_data, char __user *buf, + int (*open)(struct vfio_device *vdev); + void (*release)(struct vfio_device *vdev); + ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); - ssize_t (*write)(void *device_data, const char __user *buf, + ssize_t (*write)(struct vfio_device *vdev, + const char __user *buf, size_t size, loff_t *ppos); - long (*ioctl)(void *device_data, unsigned int cmd, + long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*mmap)(void *device_data, struct vm_area_struct *vma); + int (*mmap)(struct vfio_device *vdev, + struct vm_area_struct *vma); }; -Each function is passed the device_data that was originally registered -in the vfio_add_group_dev() call above. This allows the bus driver -an easy place to store its opaque, private data. The open/release +Each function is passed the vdev that was originally registered +in the vfio_register_group_dev() call above. This allows the bus driver +to obtain its private data using container_of(). The open/release callbacks are issued when a new file descriptor is created for a device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst index d1702f9e7c39ec55bd934bbf8a4261d76dda33ea..9250d2aa5fc2a878e2f20cc4cddea7aed875f1c0 100644 --- a/Documentation/ioctl/ioctl-number.rst +++ b/Documentation/ioctl/ioctl-number.rst @@ -352,6 +352,7 @@ Code Seq# Include File Comments 0xE5 00-3F linux/fuse.h 0xEC 00-01 drivers/platform/chrome/cros_ec_dev.h ChromeOS EC driver +0xEE 00-1F uapi/linux/pfru.h Platform Firmware Runtime Update and Telemetry 0xF3 00-3F drivers/usb/misc/sisusbvga/sisusb.h sisfb (in development) 0xF4 00-1F video/mbxfb.h mbxfb diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index ad494da40009c10c114f03ff84ef2bc190d8ca08..b7c95a13f1348746864f1bddd3ccbb2c45d4dafd 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -21,6 +21,7 @@ place where this information is gathered. unshare spec_ctrl accelerators/ocxl + ioasid .. only:: subproject and html diff --git a/Documentation/userspace-api/ioasid.rst b/Documentation/userspace-api/ioasid.rst new file mode 100644 index 0000000000000000000000000000000000000000..879d6cbae858478132c6d8d50974ab9bbeff62b5 --- /dev/null +++ b/Documentation/userspace-api/ioasid.rst @@ -0,0 +1,49 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. ioasid: + +===================================== +IOASID Userspace API +===================================== + +The IOASID UAPI is used for userspace IOASID allocation/free requests, +thus IOASID management is centralized in the IOASID core[1] in the kernel. The +primary use case is guest Shared Virtual Address (SVA) today. + +Requests such as allocation/free can be issued by the users and managed +on a per-process basis through the ioasid core. Upon opening ("/dev/ioasid"), +a process obtains a unique handle associated with the process's mm_struct. +This handle is mapped to an FD in the userspace. Only a single open is +allowed per process. + +File descriptors can be transferred across processes by employing fork() or +UNIX domain socket. 
FDs obtained by transfer cannot be used to perform
+IOASID requests. The following behaviors are recommended for the
+applications:
+
+ - forked children close the parent's IOASID FDs immediately, open new
+   /dev/ioasid FDs if IOASID allocation is desired
+
+ - do not share FDs via UNIX domain socket, e.g. via sendmsg
+
+================
+Userspace APIs
+================
+
+/dev/ioasid provides the following ioctls:
+
+*) IOASID_GET_API_VERSION: returns the API version; userspace should
+   first check it against the version embedded in its own headers.
+*) IOASID_GET_INFO: returns the information on the /dev/ioasid.
+   - ioasid_bits: the ioasid bit width supported by this uAPI; userspace
+     should compare the ioasid_bits returned by this ioctl with the ioasid
+     bits it wants, and should fail if the supported width is smaller than
+     the one it wants, since allocation would otherwise fail.
+*) IOASID_REQUEST_ALLOC: returns an IOASID which is allocated in the kernel
+   within the specified ioasid range.
+*) IOASID_REQUEST_FREE: frees an IOASID at userspace's request.
+
+For detailed definition, please see include/uapi/linux/ioasid.h.
+
+.. contents:: :local:
+
+[1] Documentation/driver-api/ioasid.rst
diff --git a/Documentation/userspace-api/iommu.rst b/Documentation/userspace-api/iommu.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ad06bb94aad53daecc46bd3e1f324415238b5e44
--- /dev/null
+++ b/Documentation/userspace-api/iommu.rst
@@ -0,0 +1,212 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. iommu:
+
+=====================================
+IOMMU Userspace API
+=====================================
+
+IOMMU UAPI is used for virtualization cases where communications are
+needed between physical and virtual IOMMU drivers. For baremetal
+usage, the IOMMU is a system device which does not need to communicate
+with userspace directly.
+
+The primary use cases are guest Shared Virtual Address (SVA) and
+guest IO virtual address (IOVA), wherein the vIOMMU implementation
+relies on the physical IOMMU and for this reason requires interactions
+with the host driver.
+
+.. contents:: :local:
+
+Functionalities
+===============
+Communication between user and kernel involves both directions. The
+supported user-kernel APIs are as follows:
+
+1. Bind/Unbind guest PASID (e.g. Intel VT-d)
+2. Bind/Unbind guest PASID table (e.g. ARM SMMU)
+3. Invalidate IOMMU caches upon guest requests
+4. Report errors to the guest and serve page requests
+5. Read iommu_nesting_info from kernel
+
+Requirements
+============
+The IOMMU UAPIs are generic and extensible to meet the following
+requirements:
+
+1. Emulated and para-virtualised vIOMMUs
+2. Multiple vendors (Intel VT-d, ARM SMMU, etc.)
+3. Extensions to the UAPI shall not break existing userspace
+
+Interfaces
+==========
+Although the data structures defined in IOMMU UAPI are self-contained,
+there are no user API functions introduced. Instead, IOMMU UAPI is
+designed to work with existing user driver frameworks such as VFIO.
+
+Extension Rules & Precautions
+-----------------------------
+When IOMMU UAPI gets extended, the data structures can *only* be
+modified in two ways:
+
+1. Adding new fields by re-purposing the padding[] field. No size change.
+2. Adding new union members at the end. May increase the structure sizes.
+
+No new fields can be added *after* the variable sized union, as that
+would break backward compatibility when the offset moves. A new flag must
+be introduced whenever a change affects the structure using either
+method.
The IOMMU driver processes the data based on the flags, which
+ensures backward compatibility.
+
+The version field is reserved only for the unlikely event that the
+UAPI is upgraded in its entirety.
+
+It's *always* the caller's responsibility to indicate the size of the
+structure passed by setting argsz appropriately.
+At the same time, argsz is user-provided data and is therefore not
+trusted. The argsz field allows the user app to indicate how much data
+it is providing; it's still the kernel's responsibility to validate
+whether it's correct and sufficient for the requested operation.
+
+Compatibility Checking
+----------------------
+When an IOMMU UAPI extension results in some structure size increase,
+IOMMU UAPI code shall handle the following cases:
+
+1. User and kernel have an exact size match
+2. An older user with an older kernel header (smaller UAPI size) running on a
+   newer kernel (larger UAPI size)
+3. A newer user with a newer kernel header (larger UAPI size) running
+   on an older kernel.
+4. A malicious/misbehaving user passing an illegal/invalid size that is
+   still within range. The data may contain garbage.
+
+Feature Checking
+----------------
+While launching a guest with vIOMMU, it is strongly advised to check
+the compatibility upfront, as some subsequent errors happening during
+vIOMMU operation, such as cache invalidation failures, cannot be nicely
+escalated to the guest due to IOMMU specifications. This can lead to
+catastrophic failures for the users.
+
+User applications such as QEMU are expected to import kernel UAPI
+headers. Backward compatibility is supported via feature flags.
+For example, an older QEMU (with an older kernel header) can run on a newer
+kernel. A newer QEMU (with a new kernel header) may refuse to initialize
+on an older kernel if new feature flags are not supported by the older
+kernel. Simply recompiling existing code with a newer kernel header should
+not be an issue, since only existing flags are used.
+
+The IOMMU vendor driver should report the features below to IOMMU UAPI
+consumers (e.g. via VFIO). The feature list is passed in struct
+iommu_nesting_info. Future extensions to this structure follow
+the rules defined in section "Extension Rules & Precautions".
+
+1. IOMMU_NESTING_FEAT_SYSWIDE_PASID
+2. IOMMU_NESTING_FEAT_BIND_PGTBL
+3. IOMMU_NESTING_FEAT_BIND_PASID_TABLE
+4. IOMMU_NESTING_FEAT_CACHE_INVLD
+5. IOMMU_NESTING_FEAT_PAGE_REQUEST
+
+Taking VFIO as an example: upon request from VFIO userspace (e.g. QEMU),
+the VFIO kernel code shall query the IOMMU vendor driver for support of
+the above features. The query result can then be reported back to the
+userspace caller. Details can be found in
+Documentation/driver-api/vfio.rst.
+
+
+Data Passing Example with VFIO
+------------------------------
+As the ubiquitous userspace driver framework, VFIO is already IOMMU
+aware and shares many key concepts such as device model, group, and
+protection domain. Other user driver frameworks can also be extended
+to support IOMMU UAPI but that is outside the scope of this document.
+
+In this tight-knit VFIO-IOMMU interface, the ultimate consumer of the
+IOMMU UAPI data is the host IOMMU driver. VFIO facilitates user-kernel
+transport, capability checking, security, and life cycle management of
+the process address space ID (PASID).
+
+The VFIO layer conveys the data structures down to the IOMMU driver,
+following the pattern below::
+
+  struct {
+	__u32 argsz;
+	__u32 flags;
+	__u8  data[];
+  };
+
+Here data[] contains the IOMMU UAPI data structures, as sketched below.
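+As a hedged sketch (the VFIO structure name here is illustrative, not a
+definitive UAPI), a VFIO ioctl argument embedding the IOMMU data could
+look like::
+
+    struct vfio_example_invalidate {
+            __u32   argsz;  /* size of everything, checked by VFIO */
+            __u32   flags;  /* VFIO's own flags */
+            struct iommu_cache_invalidate_info info;  /* IOMMU UAPI data */
+    };
+
+VFIO validates its own argsz and flags, then passes &arg->info down to
+the IOMMU layer, which re-checks info.argsz against the UAPI sizes it
+knows about.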
VFIO has the +freedom to bundle the data as well as parse data size based on its own flags. + +In order to determine the size and feature set of the user data, argsz +and flags (or the equivalent) are also embedded in the IOMMU UAPI data +structures. + +A "__u32 argsz" field is *always* at the beginning of each structure. + +For example: +:: + + struct iommu_cache_invalidate_info { + __u32 argsz; + #define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 + __u32 version; + /* IOMMU paging structure cache */ + #define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */ + #define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */ + #define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */ + #define IOMMU_CACHE_INV_TYPE_NR (3) + __u8 cache; + __u8 granularity; + __u8 padding[6]; + union { + struct iommu_inv_pasid_info pasid_info; + struct iommu_inv_addr_info addr_info; + } granu; + }; + +VFIO is responsible for checking its own argsz and flags. It then +invokes appropriate IOMMU UAPI functions. The user pointers are passed +to the IOMMU layer for further processing. The responsibilities are +divided as follows: + +- Generic IOMMU layer checks argsz range based on UAPI data in the + current kernel version. + +- Generic IOMMU layer checks content of the UAPI data for non-zero + reserved bits in flags, padding fields, and unsupported version. + This is to ensure not breaking userspace in the future when these + fields or flags are used. + +- Vendor IOMMU driver checks argsz based on vendor flags. UAPI data + is consumed based on flags. Vendor driver has access to + unadulterated argsz value in case of vendor specific future + extensions. Currently, it does not perform the copy_from_user() + itself. A __user pointer can be provided in some future scenarios + where there's vendor data outside of the structure definition. + +IOMMU code treats UAPI data in two categories: + +- structure contains vendor data + (Example: iommu_uapi_cache_invalidate()) + +- structure contains only generic data + (Example: iommu_uapi_sva_bind_gpasid()) + + + +Sharing UAPI with in-kernel users +--------------------------------- +For UAPIs that are shared with in-kernel users, a wrapper function is +provided to distinguish the callers. For example, + +Userspace caller :: + + int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, + void __user *udata) + +In-kernel caller :: + + int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, ioasid_t ioasid); diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index fd22224853e581e63ee269436343cba9ec91d8d6..3a2cee9af7f02ee6f32b405117d09ecb8332f284 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -1330,6 +1330,7 @@ Returns: 0 on success, -1 on error struct kvm_xsave { __u32 region[1024]; + __u32 extra[0]; }; This ioctl would copy current vcpu's xsave struct to the userspace. @@ -1337,7 +1338,7 @@ This ioctl would copy current vcpu's xsave struct to the userspace. 4.43 KVM_SET_XSAVE -Capability: KVM_CAP_XSAVE +Capability: KVM_CAP_XSAVE and KVM_CAP_XSAVE2 Architectures: x86 Type: vcpu ioctl Parameters: struct kvm_xsave (in) @@ -1345,9 +1346,18 @@ Returns: 0 on success, -1 on error struct kvm_xsave { __u32 region[1024]; + __u32 extra[0]; }; -This ioctl would copy userspace's xsave struct to the kernel. +This ioctl would copy userspace's xsave struct to the kernel. 
It copies +as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2), +when invoked on the vm file descriptor. The size value returned by +KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096. +Currently, it is only greater than 4096 if a dynamic feature has been +enabled with ``arch_prctl()``, but this may change in the future. + +The offsets of the state save areas in struct kvm_xsave follow the +contents of CPUID leaf 0xD on the host. 4.44 KVM_GET_XCRS @@ -1435,6 +1445,10 @@ userspace capabilities, and with user requirements (for example, the user may wish to constrain cpuid to emulate older hardware, or for feature consistency across a cluster). +Dynamically-enabled feature bits need to be requested with +``arch_prctl()`` before calling this ioctl. Feature bits that have not +been requested are excluded from the result. + Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may expose cpuid features (e.g. MONITOR) which are not supported by kvm in its default configuration. If userspace enables such capabilities, it @@ -2717,6 +2731,7 @@ struct kvm_create_device { Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set) Type: device ioctl, vm ioctl, vcpu ioctl Parameters: struct kvm_device_attr Returns: 0 on success, -1 on error @@ -2745,6 +2760,7 @@ struct kvm_device_attr { Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, KVM_CAP_VCPU_ATTRIBUTES for vcpu device + KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device Type: device ioctl, vm ioctl, vcpu ioctl Parameters: struct kvm_device_attr Returns: 0 on success, -1 on error @@ -4132,6 +4148,33 @@ Valid values for 'action': #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +4.42 KVM_GET_XSAVE2 +------------------ + +:Capability: KVM_CAP_XSAVE2 +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xsave (out) +:Returns: 0 on success, -1 on error + + +:: + + struct kvm_xsave { + __u32 region[1024]; + __u32 extra[0]; + }; + +This ioctl would copy current vcpu's xsave struct to the userspace. It +copies as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) +when invoked on the vm file descriptor. The size value returned by +KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096. +Currently, it is only greater than 4096 if a dynamic feature has been +enabled with ``arch_prctl()``, but this may change in the future. + +The offsets of the state save areas in struct kvm_xsave follow the contents +of CPUID leaf 0xD on the host. + 5. The kvm_run structure ------------------------ @@ -5069,6 +5112,27 @@ it hard or impossible to use it correctly. The availability of KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. +7.32 KVM_CAP_MAX_VCPU_ID +------------------------ + +:Architectures: x86 +:Target: VM +:Parameters: args[0] - maximum APIC ID value set for current VM +:Returns: 0 on success, -EINVAL if args[0] is beyond KVM_MAX_VCPU_IDS + supported in KVM or if it has been set. + +This capability allows userspace to specify maximum possible APIC ID +assigned for current VM session prior to the creation of vCPUs, saving +memory for data structures indexed by the APIC ID. Userspace is able +to calculate the limit to APIC ID values from designated +CPU topology. 
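+A sketch of enabling the capability before any vCPU is created (error
+handling omitted; vm_fd is the file descriptor returned by
+KVM_CREATE_VM, and the APIC ID limit of 8 is just an example)::
+
+    struct kvm_enable_cap cap = {
+            .cap = KVM_CAP_MAX_VCPU_ID,
+            .args[0] = 8,   /* highest APIC ID this VM will use */
+    };
+
+    /* Must be issued before the first KVM_CREATE_VCPU. */
+    ioctl(vm_fd, KVM_ENABLE_CAP, &cap);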
+
+The value can be changed only until KVM_ENABLE_CAP is set to a nonzero
+value or until a vCPU is created. Upon creation of the first vCPU,
+if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM
+uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as
+the maximum APIC ID.
+
 8. Other capabilities. ---------------------- diff --git a/Documentation/x86/buslock.rst b/Documentation/x86/buslock.rst new file mode 100644 index 0000000000000000000000000000000000000000..7c051e714943cb7bc625b332dd36669e13dbb839 --- /dev/null +++ b/Documentation/x86/buslock.rst @@ -0,0 +1,126 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: <isonum.txt>
+
+===============================
+Bus lock detection and handling
+===============================
+
+:Copyright: |copy| 2021 Intel Corporation
+:Authors: - Fenghua Yu
+          - Tony Luck
+
+Problem
+=======
+
+A split lock is any atomic operation whose operand crosses two cache lines.
+Since the operand spans two cache lines and the operation must be atomic,
+the system locks the bus while the CPU accesses the two cache lines.
+
+A bus lock is acquired through either a split locked access to writeback (WB)
+memory or any locked access to non-WB memory. This is typically thousands of
+cycles slower than an atomic operation within a cache line. It also disrupts
+performance on other cores and can bring the whole system to its knees.
+
+Detection
+=========
+
+Intel processors may support either or both of the following hardware
+mechanisms to detect split locks and bus locks.
+
+#AC exception for split lock detection
+--------------------------------------
+
+Beginning with the Tremont Atom CPU, an Alignment Check (#AC) exception
+may be raised when a split lock operation is attempted.
+
+#DB exception for bus lock detection
+------------------------------------
+
+Some CPUs have the ability to notify the kernel by a #DB trap after a user
+instruction acquires a bus lock and is executed. This allows the kernel to
+terminate the application or to enforce throttling.
+
+Software handling
+=================
+
+The kernel #AC and #DB handlers handle bus locks based on the kernel
+parameter "split_lock_detect". Here is a summary of the different options:
+
++------------------+----------------------------+-----------------------+
+|split_lock_detect=|#AC for split lock          |#DB for bus lock       |
++------------------+----------------------------+-----------------------+
+|off               |Do nothing                  |Do nothing             |
++------------------+----------------------------+-----------------------+
+|warn              |Kernel OOPs                 |Warn once per task     |
+|(default)         |Warn once per task and      |and continues to run.  |
+|                  |disable future checking     |                       |
+|                  |When both features are      |                       |
+|                  |supported, warn in #AC      |                       |
++------------------+----------------------------+-----------------------+
+|fatal             |Kernel OOPs                 |Send SIGBUS to user.   |
+|                  |Send SIGBUS to user         |                       |
+|                  |When both features are      |                       |
+|                  |supported, fatal in #AC     |                       |
++------------------+----------------------------+-----------------------+
+|ratelimit:N       |Do nothing                  |Limit bus lock rate to |
+|(0 < N <= 1000)   |                            |N bus locks per second |
+|                  |                            |system wide and warn on|
+|                  |                            |bus locks.             |
++------------------+----------------------------+-----------------------+
+
+Usages
+======
+
+Detecting and handling bus locks is useful in various areas:
+
+It is critical for real time system designers who build consolidated real
+time systems. These systems run hard real time code on some cores and run
+"untrusted" user processes on other cores.
The hard real time code cannot afford
+to have any bus lock from the untrusted processes hurt real time
+performance. To date, the designers have been unable to deploy these
+solutions as they had no way to prevent "untrusted" user code from
+generating split locks and bus locks that block the hard real time code
+from accessing memory while the bus is locked.
+
+It's also useful for general computing to prevent guests or user
+applications from slowing down the overall system by executing
+instructions that lock the bus.
+
+
+Guidance
+========
+off
+---
+
+Disable checking for split lock and bus lock. This option can be useful if
+there are legacy applications that trigger these events at a low rate so
+that mitigation is not needed.
+
+warn
+----
+
+A warning is emitted when a bus lock is detected, which allows the
+offending application to be identified. This is the default behavior.
+
+fatal
+-----
+
+In this case, the bus lock is not tolerated and the process is killed.
+
+ratelimit
+---------
+
+A system wide bus lock rate limit N is specified where 0 < N <= 1000. This
+allows a bus lock rate up to N bus locks per second. When the bus lock rate
+is exceeded, any task which is caught via the buslock #DB exception is
+throttled by enforced sleeps until the rate goes under the limit again.
+
+This is an effective mitigation in cases where a minimal impact can be
+tolerated, but an eventual Denial of Service attack has to be prevented. It
+makes it possible to identify the offending processes and analyze whether
+they are malicious or just badly written.
+
+Selecting a rate limit of 1000 allows the bus to be locked for up to about
+seven million cycles each second (assuming 7000 cycles for each bus
+lock). On a 2 GHz processor that would be about 0.35% system slowdown. diff --git a/Documentation/x86/elf_auxvec.rst b/Documentation/x86/elf_auxvec.rst new file mode 100644 index 0000000000000000000000000000000000000000..18e4744717f9e025e570299ca62fc9d8a78833d6 --- /dev/null +++ b/Documentation/x86/elf_auxvec.rst @@ -0,0 +1,53 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================================
+x86-specific ELF Auxiliary Vectors
+==================================
+
+This document describes the semantics of the x86 auxiliary vectors.
+
+Introduction
+============
+
+ELF Auxiliary vectors enable the kernel to efficiently provide
+configuration-specific parameters to userspace. In this example, a program
+allocates an alternate stack based on the kernel-provided size::
+
+   #include <sys/auxv.h>
+   #include <elf.h>
+   #include <err.h>
+   #include <signal.h>
+   #include <stdlib.h>
+   #include <assert.h>
+
+   #ifndef AT_MINSIGSTKSZ
+   #define AT_MINSIGSTKSZ 51
+   #endif
+
+   ....
+   stack_t ss;
+
+   ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
+   ss.ss_sp = malloc(ss.ss_size);
+   assert(ss.ss_sp);
+
+   ss.ss_flags = 0;
+
+   if (sigaltstack(&ss, NULL))
+        err(1, "sigaltstack");
+
+
+The exposed auxiliary vectors
+=============================
+
+AT_SYSINFO is used for locating the vsyscall entry point. It is not
+exported in 64-bit mode.
+
+AT_SYSINFO_EHDR is the start address of the page containing the vDSO.
+
+AT_MINSIGSTKSZ denotes the minimum stack size required by the kernel to
+deliver a signal to user-space. AT_MINSIGSTKSZ comprehends the space
+consumed by the kernel to accommodate the user context for the current
+hardware configuration. It does not comprehend subsequent user-space stack
+consumption, which must be added by the user. (e.g. Above, user-space adds
+SIGSTKSZ to AT_MINSIGSTKSZ.)
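+On kernels that predate AT_MINSIGSTKSZ, getauxval() returns 0 for an
+unknown tag, so defensive code may want a fallback. A minimal sketch,
+assuming glibc-style getauxval() semantics::
+
+   unsigned long ksz = getauxval(AT_MINSIGSTKSZ);
+
+   /* Fall back to the legacy constant if the vector is absent. */
+   ss.ss_size = (ksz ? ksz : MINSIGSTKSZ) + SIGSTKSZ;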
diff --git a/Documentation/x86/exception-tables.rst b/Documentation/x86/exception-tables.rst index ed6d4b0cf62cf4601bd9a2329f8b84f9f4f3a65a..514f51829da7763503acea933c227c637b8d86db 100644 --- a/Documentation/x86/exception-tables.rst +++ b/Documentation/x86/exception-tables.rst @@ -337,10 +337,4 @@ pointer which points to one of: entry->insn. It is used to distinguish page faults from machine check. -3) ``int ex_handler_ext(const struct exception_table_entry *fixup)`` - This case is used for uaccess_err ... we need to set a flag - in the task structure. Before the handler functions existed this - case was handled by adding a large offset to the fixup to tag - it as special. - More functions can easily be added. diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 971f30a7d166b9fb55fa9b7d57668313809917bf..747b4c6a6f8ef89411a189afb2ecf237d31b8ea5 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -26,9 +26,13 @@ x86-specific Documentation pti mds microcode - resctrl_ui + resctrl tsx_async_abort + buslock usb-legacy-support i386/index x86_64/index sgx + sva + elf_auxvec + xstate diff --git a/Documentation/x86/pfru.rst b/Documentation/x86/pfru.rst new file mode 100644 index 0000000000000000000000000000000000000000..321729f46737a5e883f61ccade7b4784882c5234 --- /dev/null +++ b/Documentation/x86/pfru.rst @@ -0,0 +1,98 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================================================
+The Linux Platform Firmware Runtime Update and Telemetry
+========================================================
+
+According to the specification [1], certain computing systems require
+high Service Level Agreements (SLAs) under which system reboots for
+firmware updates must be kept to a minimum; firmware changes that
+address bug fixes and security updates, or that help debug and root
+cause issues, need to be deployed at runtime. This technology is called
+Intel Seamless Update. The management mode (MM), UEFI runtime services
+and ACPI services handle most of the system runtime functions. Changing
+the MM code execution during runtime is called MM Runtime Update. Since
+the "MM" acronym might be misread as "Memory Management", this driver
+uses the name "Platform Firmware Runtime Update" (PFRU).
+
+PFRU provides the following facilities: performing a runtime firmware
+driver update and activation, and injecting firmware code at runtime
+for dynamic instrumentation. PFRU Telemetry is a service which allows the
+Runtime Update handler to produce telemetry data for upper-layer OS
+consumers at runtime. The OS provides interfaces to let the users query the
+telemetry data via read operations. The specification defines the interface
+and a recommended policy to extract the data; the format and meaning of the
+data are left to individual OEMs and BIOS implementations.
+
+PFRU interfaces
+=====================
+
+The user space tool operates on /dev/pfru/update for code injection and
+driver update. PFRU stands for Platform Firmware Runtime Update, and the /dev/pfru
+directory might be reserved for future usage.
+
+ 1. mmap the capsule file
+    fd_capsule = open("capsule.cap", O_RDONLY);
+    fstat(fd_capsule, &stat);
+    addr = mmap(NULL, stat.st_size, PROT_READ, MAP_PRIVATE, fd_capsule, 0);
+
+ 2. Get the capability information (version control, etc.) from the BIOS via
+    read() and sanity check it in user space.
+    fd_update = open("/dev/pfru/update", O_RDWR);
+    read(fd_update, &cap, sizeof(cap));
+    sanity_check(&cap);
+ 3. Write the capsule file into the runtime update communication buffer
+    // the kernel might return an error if the capsule file is larger
+    // than the communication buffer
+    write(fd_update, addr, stat.st_size);
+
+ 4. Stage the code injection
+    ioctl(fd_update, PFRU_IOC_STAGE);
+
+ 5. Activate the code injection
+    ioctl(fd_update, PFRU_IOC_ACTIVATE);
+
+ 6. Stage and activate the code injection
+    ioctl(fd_update, PFRU_IOC_STAGE_ACTIVATE);
+
+ PFRU_IOC_STAGE: Stage a capsule image from the communication buffer
+ and perform authentication.
+ PFRU_IOC_ACTIVATE: Activate a previously staged capsule image.
+ PFRU_IOC_STAGE_ACTIVATE: Perform both the stage and activation actions.
+
+PFRU Telemetry
+==============
+
+The user space tool operates on /dev/pfru/telemetry for the PFRU telemetry
+log. Sample code:
+
+ 1. Open the telemetry device
+    fd_log = open("/dev/pfru/telemetry", O_RDWR);
+
+ 2. Get the log level, log type and revision_id in one ioctl invocation
+    ioctl(fd_log, PFRU_IOC_GET_LOG_INFO, &info);
+
+ 3. Set the log level, log type and revision_id
+    ioctl(fd_log, PFRU_IOC_SET_LOG_INFO, &info);
+
+ 4. Query the information of the PFRU telemetry log buffer:
+    ioctl(fd_log, PFRU_IOC_GET_DATA_INFO, &data_info);
+    The user is responsible for parsing the result per the specification.
+
+ 5. Read the telemetry data:
+    read(fd_log, buf, data_info.size);
+
+Please refer to tools/testing/selftests/pfru/pfru_test.c for details.
+
+According to [1], the telemetry buffer is a wrap-around buffer: if it
+gets full, the most recent log data will overwrite the old log data. The
+spec also requires that telemetry reads support both full data retrieval
+and delta telemetry data retrieval. Since this requirement is more of a
+policy, the implementation is left to user space. That is, the user is
+advised to re-read the telemetry parameters such as chunk1_size,
+chunk2_size and rollover_cnt in the data_info structure to make sure that
+no more data was appended while the buffer was being read. Note that the
+telemetry log only contains valid data after the runtime update has been
+run at least once; otherwise an errno code of EBUSY is returned.
+
+[1] https://uefi.org/sites/default/files/resources/Intel_MM_OS_Interface_Spec_Rev100.pdf diff --git a/Documentation/x86/resctrl_ui.rst b/Documentation/x86/resctrl.rst similarity index 89% rename from Documentation/x86/resctrl_ui.rst rename to Documentation/x86/resctrl.rst index 5368cedfb5309dbffe7528805c13b180e197cc2a..71a531061e4e2d1d5116a744ae30136efdd8757e 100644 --- a/Documentation/x86/resctrl_ui.rst +++ b/Documentation/x86/resctrl.rst @@ -138,6 +138,18 @@ with respect to allocation: non-linear. This field is purely informational only. +"thread_throttle_mode": + Indicator on Intel systems of how tasks running on threads + of a physical core are throttled in cases where they + request different memory bandwidth percentages: + + "max": + the smallest percentage is applied + to all threads + "per-thread": + bandwidth percentages are directly applied to + the threads running on the core + If RDT monitoring is available there will be an "L3_MON" directory with the following files: @@ -364,8 +376,10 @@ to the next control step available on the hardware. The bandwidth throttling is a core specific mechanism on some of Intel SKUs. Using a high bandwidth and a low bandwidth setting on two threads -sharing a core will result in both threads being throttled to use the -low bandwidth.
The fact that Memory bandwidth allocation(MBA) is a core +sharing a core may result in both threads being throttled to use the +low bandwidth (see "thread_throttle_mode"). + +The fact that Memory bandwidth allocation(MBA) may be a core specific mechanism where as memory bandwidth monitoring(MBM) is done at the package level may lead to confusion when users try to apply control via the MBA and then monitor the bandwidth to see if the controls are @@ -1195,3 +1209,96 @@ View the llc occupancy snapshot:: # cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy 11234000 + +Intel RDT Errata +================ + +Intel MBM Counters May Report System Memory Bandwidth Incorrectly +----------------------------------------------------------------- + +Errata SKX99 for Skylake server and BDF102 for Broadwell server. + +Problem: Intel Memory Bandwidth Monitoring (MBM) counters track metrics +according to the assigned Resource Monitor ID (RMID) for that logical +core. The IA32_QM_CTR register (MSR 0xC8E), used to report these +metrics, may report incorrect system bandwidth for certain RMID values. + +Implication: Due to the errata, system memory bandwidth may not match +what is reported. + +Workaround: MBM total and local readings are corrected according to the +following correction factor table: + ++---------------+---------------+---------------+-----------------+ +|core count |rmid count |rmid threshold |correction factor| ++---------------+---------------+---------------+-----------------+ +|1 |8 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|2 |16 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|3 |24 |15 |0.969650 | ++---------------+---------------+---------------+-----------------+ +|4 |32 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|6 |48 |31 |0.969650 | ++---------------+---------------+---------------+-----------------+ +|7 |56 |47 |1.142857 | ++---------------+---------------+---------------+-----------------+ +|8 |64 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|9 |72 |63 |1.185115 | ++---------------+---------------+---------------+-----------------+ +|10 |80 |63 |1.066553 | ++---------------+---------------+---------------+-----------------+ +|11 |88 |79 |1.454545 | ++---------------+---------------+---------------+-----------------+ +|12 |96 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|13 |104 |95 |1.230769 | ++---------------+---------------+---------------+-----------------+ +|14 |112 |95 |1.142857 | ++---------------+---------------+---------------+-----------------+ +|15 |120 |95 |1.066667 | ++---------------+---------------+---------------+-----------------+ +|16 |128 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|17 |136 |127 |1.254863 | ++---------------+---------------+---------------+-----------------+ +|18 |144 |127 |1.185255 | ++---------------+---------------+---------------+-----------------+ +|19 |152 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|20 |160 |127 |1.066667 | ++---------------+---------------+---------------+-----------------+ +|21 |168 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|22 |176 |159 |1.454334 | ++---------------+---------------+---------------+-----------------+ +|23 |184 |0 |1.000000 | 
++---------------+---------------+---------------+-----------------+ +|24 |192 |127 |0.969744 | ++---------------+---------------+---------------+-----------------+ +|25 |200 |191 |1.280246 | ++---------------+---------------+---------------+-----------------+ +|26 |208 |191 |1.230921 | ++---------------+---------------+---------------+-----------------+ +|27 |216 |0 |1.000000 | ++---------------+---------------+---------------+-----------------+ +|28 |224 |191 |1.143118 | ++---------------+---------------+---------------+-----------------+ + +If rmid > rmid threshold, MBM total and local values should be multiplied +by the correction factor. + +See: + +1. Erratum SKX99 in Intel Xeon Processor Scalable Family Specification Update: +http://web.archive.org/web/20200716124958/https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-spec-update.html + +2. Erratum BDF102 in Intel Xeon E5-2600 v4 Processor Product Family Specification Update: +http://web.archive.org/web/20191125200531/https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/xeon-e5-v4-spec-update.pdf + +3. The errata in Intel Resource Director Technology (Intel RDT) on 2nd Generation Intel Xeon Scalable Processors Reference Manual: +https://software.intel.com/content/www/us/en/develop/articles/intel-resource-director-technology-rdt-reference-manual.html + +for further information. diff --git a/Documentation/x86/sva.rst b/Documentation/x86/sva.rst new file mode 100644 index 0000000000000000000000000000000000000000..2e9b8b0f9a0f491c6773db0604f389e6741397c9 --- /dev/null +++ b/Documentation/x86/sva.rst @@ -0,0 +1,286 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================== +Shared Virtual Addressing (SVA) with ENQCMD +=========================================== + +Background +========== + +Shared Virtual Addressing (SVA) allows the processor and device to use the +same virtual addresses avoiding the need for software to translate virtual +addresses to physical addresses. SVA is what PCIe calls Shared Virtual +Memory (SVM). + +In addition to the convenience of using application virtual addresses +by the device, it also doesn't require pinning pages for DMA. +PCIe Address Translation Services (ATS) along with Page Request Interface +(PRI) allow devices to function much the same way as the CPU handling +application page-faults. For more information please refer to the PCIe +specification Chapter 10: ATS Specification. + +Use of SVA requires IOMMU support in the platform. IOMMU is also +required to support the PCIe features ATS and PRI. ATS allows devices +to cache translations for virtual addresses. The IOMMU driver uses the +mmu_notifier() support to keep the device TLB cache and the CPU cache in +sync. When an ATS lookup fails for a virtual address, the device should +use the PRI in order to request the virtual address to be paged into the +CPU page tables. The device must use ATS again in order the fetch the +translation before use. + +Shared Hardware Workqueues +========================== + +Unlike Single Root I/O Virtualization (SR-IOV), Scalable IOV (SIOV) permits +the use of Shared Work Queues (SWQ) by both applications and Virtual +Machines (VM's). This allows better hardware utilization vs. hard +partitioning resources that could result in under utilization. 
In order to +allow the hardware to distinguish the context for which work is being +executed in the hardware by SWQ interface, SIOV uses Process Address Space +ID (PASID), which is a 20-bit number defined by the PCIe SIG. + +PASID value is encoded in all transactions from the device. This allows the +IOMMU to track I/O on a per-PASID granularity in addition to using the PCIe +Resource Identifier (RID) which is the Bus/Device/Function. + + +ENQCMD +====== + +ENQCMD is a new instruction on Intel platforms that atomically submits a +work descriptor to a device. The descriptor includes the operation to be +performed, virtual addresses of all parameters, virtual address of a completion +record, and the PASID (process address space ID) of the current process. + +ENQCMD works with non-posted semantics and carries a status back if the +command was accepted by hardware. This allows the submitter to know if the +submission needs to be retried or other device specific mechanisms to +implement fairness or ensure forward progress should be provided. + +ENQCMD is the glue that ensures applications can directly submit commands +to the hardware and also permits hardware to be aware of application context +to perform I/O operations via use of PASID. + +Process Address Space Tagging +============================= + +A new thread-scoped MSR (IA32_PASID) provides the connection between +user processes and the rest of the hardware. When an application first +accesses an SVA-capable device, this MSR is initialized with a newly +allocated PASID. The driver for the device calls an IOMMU-specific API +that sets up the routing for DMA and page-requests. + +For example, the Intel Data Streaming Accelerator (DSA) uses +iommu_sva_bind_device(), which will do the following: + +- Allocate the PASID, and program the process page-table (%cr3 register) in the + PASID context entries. +- Register for mmu_notifier() to track any page-table invalidations to keep + the device TLB in sync. For example, when a page-table entry is invalidated, + the IOMMU propagates the invalidation to the device TLB. This will force any + future access by the device to this virtual address to participate in + ATS. If the IOMMU responds with proper response that a page is not + present, the device would request the page to be paged in via the PCIe PRI + protocol before performing I/O. + +This MSR is managed with the XSAVE feature set as "supervisor state" to +ensure the MSR is updated during context switch. + +PASID Management +================ + +The kernel must allocate a PASID on behalf of each process which will use +ENQCMD and program it into the new MSR to communicate the process identity to +platform hardware. ENQCMD uses the PASID stored in this MSR to tag requests +from this process. When a user submits a work descriptor to a device using the +ENQCMD instruction, the PASID field in the descriptor is auto-filled with the +value from MSR_IA32_PASID. Requests for DMA from the device are also tagged +with the same PASID. The platform IOMMU uses the PASID in the transaction to +perform address translation. The IOMMU APIs setup the corresponding PASID +entry in IOMMU with the process address used by the CPU (e.g. %cr3 register in +x86). + +The MSR must be configured on each logical CPU before any application +thread can interact with a device. Threads that belong to the same +process share the same page tables, thus the same MSR value. 
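+As a sketch only (this assumes the _enqcmd() intrinsic exposed by recent
+compilers via immintrin.h and a build with -menqcmd; the portal pointer
+would come from an earlier mmap() of the device, and the 64-byte
+descriptor layout is device specific)::
+
+    #include <immintrin.h>
+
+    static void submit(void *portal, const void *desc)
+    {
+            /* The intrinsic returns nonzero (ZF set) when the device
+             * did not accept the command; real code should bound the
+             * retries or back off rather than spin forever.
+             */
+            while (_enqcmd(portal, desc))
+                    ;
+    }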
+ +PASID Life Cycle Management +=========================== + +PASID is initialized as INVALID_IOASID (-1) when a process is created. + +Only processes that access SVA-capable devices need to have a PASID +allocated. This allocation happens when a process opens/binds an SVA-capable +device but finds no PASID for this process. Subsequent binds of the same, or +other devices will share the same PASID. + +Although the PASID is allocated to the process by opening a device, +it is not active in any of the threads of that process. It's loaded to the +IA32_PASID MSR lazily when a thread tries to submit a work descriptor +to a device using the ENQCMD. + +That first access will trigger a #GP fault because the IA32_PASID MSR +has not been initialized with the PASID value assigned to the process +when the device was opened. The Linux #GP handler notes that a PASID has +been allocated for the process, and so initializes the IA32_PASID MSR +and returns so that the ENQCMD instruction is re-executed. + +On fork(2) or exec(2) the PASID is removed from the process as it no +longer has the same address space that it had when the device was opened. + +On clone(2) the new task shares the same address space, so will be +able to use the PASID allocated to the process. The IA32_PASID is not +preemptively initialized as the PASID value might not be allocated yet or +the kernel does not know whether this thread is going to access the device +and the cleared IA32_PASID MSR reduces context switch overhead by xstate +init optimization. Since #GP faults have to be handled on any threads that +were created before the PASID was assigned to the mm of the process, newly +created threads might as well be treated in a consistent way. + +Due to complexity of freeing the PASID and clearing all IA32_PASID MSRs in +all threads in unbind, free the PASID lazily only on mm exit. + +If a process does a close(2) of the device file descriptor and munmap(2) +of the device MMIO portal, then the driver will unbind the device. The +PASID is still marked VALID in the PASID_MSR for any threads in the +process that accessed the device. But this is harmless as without the +MMIO portal they cannot submit new work to the device. + +Relationships +============= + + * Each process has many threads, but only one PASID. + * Devices have a limited number (~10's to 1000's) of hardware workqueues. + The device driver manages allocating hardware workqueues. + * A single mmap() maps a single hardware workqueue as a "portal" and + each portal maps down to a single workqueue. + * For each device with which a process interacts, there must be + one or more mmap()'d portals. + * Many threads within a process can share a single portal to access + a single device. + * Multiple processes can separately mmap() the same portal, in + which case they still share one device hardware workqueue. + * The single process-wide PASID is used by all threads to interact + with all devices. There is not, for instance, a PASID for each + thread or each thread<->device pair. + +FAQ +=== + +* What is SVA/SVM? + +Shared Virtual Addressing (SVA) permits I/O hardware and the processor to +work in the same address space, i.e., to share it. Some call it Shared +Virtual Memory (SVM), but Linux community wanted to avoid confusing it with +POSIX Shared Memory and Secure Virtual Machines which were terms already in +circulation. + +* What is a PASID? + +A Process Address Space ID (PASID) is a PCIe-defined Transaction Layer Packet +(TLP) prefix. 
A PASID is a 20-bit number allocated and managed by the OS. +PASID is included in all transactions between the platform and the device. + +* How are shared workqueues different? + +Traditionally, in order for userspace applications to interact with hardware, +there is a separate hardware instance required per process. For example, +consider doorbells as a mechanism of informing hardware about work to process. +Each doorbell is required to be spaced 4k (or page-size) apart for process +isolation. This requires hardware to provision that space and reserve it in +MMIO. This doesn't scale as the number of threads becomes quite large. The +hardware also manages the queue depth for Shared Work Queues (SWQ), and +consumers don't need to track queue depth. If there is no space to accept +a command, the device will return an error indicating retry. + +A user should check Deferrable Memory Write (DMWr) capability on the device +and only submits ENQCMD when the device supports it. In the new DMWr PCIe +terminology, devices need to support DMWr completer capability. In addition, +it requires all switch ports to support DMWr routing and must be enabled by +the PCIe subsystem, much like how PCIe atomic operations are managed for +instance. + +SWQ allows hardware to provision just a single address in the device. When +used with ENQCMD to submit work, the device can distinguish the process +submitting the work since it will include the PASID assigned to that +process. This helps the device scale to a large number of processes. + +* Is this the same as a user space device driver? + +Communicating with the device via the shared workqueue is much simpler +than a full blown user space driver. The kernel driver does all the +initialization of the hardware. User space only needs to worry about +submitting work and processing completions. + +* Is this the same as SR-IOV? + +Single Root I/O Virtualization (SR-IOV) focuses on providing independent +hardware interfaces for virtualizing hardware. Hence, it's required to be +almost fully functional interface to software supporting the traditional +BARs, space for interrupts via MSI-X, its own register layout. +Virtual Functions (VFs) are assisted by the Physical Function (PF) +driver. + +Scalable I/O Virtualization builds on the PASID concept to create device +instances for virtualization. SIOV requires host software to assist in +creating virtual devices; each virtual device is represented by a PASID +along with the bus/device/function of the device. This allows device +hardware to optimize device resource creation and can grow dynamically on +demand. SR-IOV creation and management is very static in nature. Consult +references below for more details. + +* Why not just create a virtual function for each app? + +Creating PCIe SR-IOV type Virtual Functions (VF) is expensive. VFs require +duplicated hardware for PCI config space and interrupts such as MSI-X. +Resources such as interrupts have to be hard partitioned between VFs at +creation time, and cannot scale dynamically on demand. The VFs are not +completely independent from the Physical Function (PF). Most VFs require +some communication and assistance from the PF driver. SIOV, in contrast, +creates a software-defined device where all the configuration and control +aspects are mediated via the slow path. The work submission and completion +happen without any mediation. + +* Does this support virtualization? + +ENQCMD can be used from within a guest VM. 
In these cases, the VMM helps +with setting up a translation table to translate from Guest PASID to Host +PASID. Please consult the ENQCMD instruction set reference for more +details. + +* Does memory need to be pinned? + +When devices support SVA along with platform hardware such as IOMMU +supporting such devices, there is no need to pin memory for DMA purposes. +Devices that support SVA also support other PCIe features that remove the +pinning requirement for memory. + +Device TLB support - Device requests the IOMMU to lookup an address before +use via Address Translation Service (ATS) requests. If the mapping exists +but there is no page allocated by the OS, IOMMU hardware returns that no +mapping exists. + +Device requests the virtual address to be mapped via Page Request +Interface (PRI). Once the OS has successfully completed the mapping, it +returns the response back to the device. The device requests again for +a translation and continues. + +IOMMU works with the OS in managing consistency of page-tables with the +device. When removing pages, it interacts with the device to remove any +device TLB entry that might have been cached before removing the mappings from +the OS. + +References +========== + +VT-D: +https://01.org/blogs/ashokraj/2018/recent-enhancements-intel-virtualization-technology-directed-i/o-intel-vt-d + +SIOV: +https://01.org/blogs/2019/assignable-interfaces-intel-scalable-i/o-virtualization-linux + +ENQCMD in ISE: +https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf + +DSA spec: +https://software.intel.com/sites/default/files/341204-intel-data-streaming-accelerator-spec.pdf diff --git a/Documentation/x86/xstate.rst b/Documentation/x86/xstate.rst new file mode 100644 index 0000000000000000000000000000000000000000..5cec7fb558d60346d2b55039c2dd7882575ea3aa --- /dev/null +++ b/Documentation/x86/xstate.rst @@ -0,0 +1,74 @@ +Using XSTATE features in user space applications +================================================ + +The x86 architecture supports floating-point extensions which are +enumerated via CPUID. Applications consult CPUID and use XGETBV to +evaluate which features have been enabled by the kernel XCR0. + +Up to AVX-512 and PKRU states, these features are automatically enabled by +the kernel if available. Features like AMX TILE_DATA (XSTATE component 18) +are enabled by XCR0 as well, but the first use of related instruction is +trapped by the kernel because by default the required large XSTATE buffers +are not allocated automatically. + +Using dynamically enabled XSTATE features in user space applications +-------------------------------------------------------------------- + +The kernel provides an arch_prctl(2) based mechanism for applications to +request the usage of such features. The arch_prctl(2) options related to +this are: + +-ARCH_GET_XCOMP_SUPP + + arch_prctl(ARCH_GET_XCOMP_SUPP, &features); + + ARCH_GET_XCOMP_SUPP stores the supported features in userspace storage of + type uint64_t. The second argument is a pointer to that storage. + +-ARCH_GET_XCOMP_PERM + + arch_prctl(ARCH_GET_XCOMP_PERM, &features); + + ARCH_GET_XCOMP_PERM stores the features for which the userspace process + has permission in userspace storage of type uint64_t. The second argument + is a pointer to that storage. + +-ARCH_REQ_XCOMP_PERM + + arch_prctl(ARCH_REQ_XCOMP_PERM, feature_nr); + + ARCH_REQ_XCOMP_PERM allows to request permission for a dynamically enabled + feature or a feature set. 
A feature set can be mapped to a facility, e.g.
+  AMX, and can require one or more XSTATE components to be enabled.
+
+  The feature argument is the number of the highest XSTATE component which
+  is required for a facility to work.
+
+When requesting permission for a feature, the kernel checks the
+availability. The kernel ensures that sigaltstacks in the process's tasks
+are large enough to accommodate the resulting large signal frame. It
+enforces this both during ARCH_REQ_XCOMP_PERM and during any subsequent
+sigaltstack(2) calls. If an installed sigaltstack is smaller than the
+resulting sigframe size, ARCH_REQ_XCOMP_PERM results in -ENOSUPP. Also,
+sigaltstack(2) results in -ENOMEM if the requested altstack is too small
+for the permitted features.
+
+Permission, when granted, is valid per process. Permissions are inherited
+on fork(2) and cleared on exec(3).
+
+The first use of an instruction related to a dynamically enabled feature is
+trapped by the kernel. The trap handler checks whether the process has
+permission to use the feature. If the process has no permission then the
+kernel sends SIGILL to the application. If the process has permission then
+the handler allocates a larger xstate buffer for the task so the large
+state can be context switched. In the unlikely case that the allocation
+fails, the kernel sends SIGSEGV.
+
+Dynamic features in signal frames
+---------------------------------
+
+Dynamically enabled features are not written to the signal frame upon signal
+entry if the feature is in its initial configuration. This differs from
+non-dynamic features which are always written regardless of their
+configuration. Signal handlers can examine the XSAVE buffer's XSTATE_BV
+field to determine if a feature was written. diff --git a/MAINTAINERS b/MAINTAINERS index 974064e82ca9e130b42b500141ce587db09b88ab..4bb2cfd5800ec5cfc2ffbe32a77bff76b9f1d3ce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8283,6 +8283,14 @@ Q: https://patchwork.kernel.org/project/linux-dmaengine/list/ S: Supported F: drivers/dma/ioat* +INTEL IADX DRIVER +M: Dave Jiang +L: dmaengine@vger.kernel.org +S: Supported +F: drivers/dma/idxd/* +F: include/uapi/linux/idxd.h +F: include/linux/idxd.h + INTEL IDLE DRIVER M: Jacob Pan M: Len Brown @@ -8305,8 +8313,7 @@ M: Lu Baolu L: iommu@lists.linux-foundation.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git S: Supported -F: drivers/iommu/dmar.c -F: drivers/iommu/intel*.[ch] +F: drivers/iommu/intel/ F: include/linux/intel-iommu.h F: include/linux/intel-svm.h @@ -8413,6 +8420,12 @@ F: drivers/mfd/intel_soc_pmic* F: include/linux/mfd/intel_msic.h F: include/linux/mfd/intel_soc_pmic* +INTEL PMT DRIVER +M: "David E.
Box" +S: Maintained +F: drivers/mfd/intel_pmt.c +F: drivers/platform/x86/intel_pmt_* + INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT M: Stanislav Yakovlev L: linux-wireless@vger.kernel.org @@ -8573,6 +8586,7 @@ L: iommu@lists.linux-foundation.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git S: Maintained F: Documentation/devicetree/bindings/iommu/ +F: Documentation/userspace-api/iommu.rst F: drivers/iommu/ F: include/linux/iommu.h F: include/linux/of_iommu.h diff --git a/arch/Kconfig b/arch/Kconfig index a8df66e6454422b5cc1c726281661cbeab68c9f0..6cb98cd17acfadc9f833eb069259835debaac9ff 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -264,11 +264,18 @@ config ARCH_HAS_SET_DIRECT_MAP bool # -# Select if arch has an uncached kernel segment and provides the -# uncached_kernel_address / cached_kernel_address symbols to use it +# Select if the architecture provides the arch_dma_set_uncached symbol to +# either provide an uncached segement alias for a DMA allocation, or +# to remap the page tables in place. # -config ARCH_HAS_UNCACHED_SEGMENT - select ARCH_HAS_DMA_PREP_COHERENT +config ARCH_HAS_DMA_SET_UNCACHED + bool + +# +# Select if the architectures provides the arch_dma_clear_uncached symbol +# to undo an in-place page table remap for uncached access. +# +config ARCH_HAS_DMA_CLEAR_UNCACHED bool # Select if arch init_task must go in the __init_task_data section @@ -915,27 +922,6 @@ config STRICT_MODULE_RWX config ARCH_HAS_PHYS_TO_DMA bool -config ARCH_HAS_REFCOUNT - bool - help - An architecture selects this when it has implemented refcount_t - using open coded assembly primitives that provide an optimized - refcount_t implementation, possibly at the expense of some full - refcount state checks of CONFIG_REFCOUNT_FULL=y. - - The refcount overflow check behavior, however, must be retained. - Catching overflows is the primary security concern for protecting - against bugs in reference counts. - -config REFCOUNT_FULL - bool "Perform full reference count validation at the expense of speed" - help - Enabling this switches the refcounting infrastructure from a fast - unchecked atomic_t implementation to a fully state checked - implementation, which can be (slightly) slower but provides protections - against various use-after-free conditions that can be used in - security flaw exploits. 
- config HAVE_ARCH_COMPILER_H bool help @@ -983,6 +969,9 @@ config RELR config ARCH_HAS_MEM_ENCRYPT bool +config DYNAMIC_SIGFRAME + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index f94c732fedebb46b7a6f4ef9b7303adbfd5c1316..0021580d79adbd577e5829841323ec1e889a1e30 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -71,10 +71,10 @@ static int pci_mmap_resource(struct kobject *kobj, struct pci_bus_region bar; int i; - for (i = 0; i < PCI_ROM_RESOURCE; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) if (res == &pdev->resource[i]) break; - if (i >= PCI_ROM_RESOURCE) + if (i >= PCI_STD_NUM_BARS) return -ENODEV; if (res->flags & IORESOURCE_MEM && iomem_is_exclusive(res->start)) @@ -115,7 +115,7 @@ void pci_remove_resource_files(struct pci_dev *pdev) { int i; - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct bin_attribute *res_attr; res_attr = pdev->res_attr[i]; @@ -232,7 +232,7 @@ int pci_create_resource_files(struct pci_dev *pdev) int retval; /* Expose the PCI resources from this device as files */ - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { /* skip empty resources */ if (!pci_resource_len(pdev, i)) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 48b81d015d8a86107eec61c2f1816f5cc65539b4..5b514913a6552f163fc7cea5b8bfe576b9091787 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -251,8 +251,11 @@ copy_thread(unsigned long clone_flags, unsigned long usp, childti->pcb.ksp = (unsigned long) childstack; childti->pcb.flags = 1; /* set FEN, clear everything else */ + fpu_clone(p); + if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ + p->thread.pkru = pkru_get_init_value(); memset(childstack, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs)); childstack->r26 = (unsigned long) ret_from_kernel_thread; @@ -262,6 +265,13 @@ copy_thread(unsigned long clone_flags, unsigned long usp, childti->pcb.usp = 0; return 0; } + + /* + * Clone current's PKRU value from hardware. tsk->thread.pkru + * is only valid when scheduled out. + */ + p->thread.pkru = read_pkru(); + /* Note: if CLONE_SETTLS is not set, then we must inherit the value from the parent, which will have been set by the block copy in dup_task_struct. 
This is non-intuitive, but is diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 8383155c8c824f486c2cadd7e46178ca627bfe5a..4d7b671c8ff4aca60392c8913e9f882a4a4ded86 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,7 +6,6 @@ config ARC def_bool y select ARC_TIMERS - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 73a7e88a1e9264c042418fe3a7cc070bb4581682..e947572a521ec08557c1580992b67f1887f9db24 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -48,8 +48,8 @@ void arch_dma_prep_coherent(struct page *page, size_t size) * upper layer functions (in include/linux/dma-mapping.h) */ -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: @@ -69,8 +69,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9aa88715f196cf45a4629b59daacf5c85b5c6159..1905d58b7920e4cf046d4789ce71a7ad94ff7825 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -7,7 +7,6 @@ config ARM select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE @@ -118,7 +117,6 @@ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC - select REFCOUNT_FULL select RTC_LIB select SYS_SUPPORTS_APM_EMULATION # Above selects are sorted alphabetically; please add new ones diff --git a/arch/arm/include/asm/dma-direct.h b/arch/arm/include/asm/dma-direct.h index b67e5fc1fe436135e933a76faa393117d604ba80..bca0de567534396bf97c09c49a83dd7636392ac0 100644 --- a/arch/arm/include/asm/dma-direct.h +++ b/arch/arm/include/asm/dma-direct.h @@ -2,35 +2,16 @@ #ifndef ASM_ARM_DMA_DIRECT_H #define ASM_ARM_DMA_DIRECT_H 1 -static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { unsigned int offset = paddr & ~PAGE_MASK; return pfn_to_dma(dev, __phys_to_pfn(paddr)) + offset; } -static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) { unsigned int offset = dev_addr & ~PAGE_MASK; return __pfn_to_phys(dma_to_pfn(dev, dev_addr)) + offset; } -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) -{ - u64 limit, mask; - - if (!dev->dma_mask) - return 0; - - mask = *dev->dma_mask; - - limit = (mask + 1) & ~mask; - if (limit && size > limit) - return 0; - - if ((addr | (addr + size - 1)) & ~mask) - return 0; - - return 1; -} - #endif /* ASM_ARM_DMA_DIRECT_H */ diff --git a/arch/arm/mach-imx/cpuidle-imx6q.c b/arch/arm/mach-imx/cpuidle-imx6q.c index 39a7d93936417a7a44354cb77ba42ed2f09498d0..24dd5bbe60e43c1302806541d77b4bba4604da93 100644 --- a/arch/arm/mach-imx/cpuidle-imx6q.c +++ b/arch/arm/mach-imx/cpuidle-imx6q.c @@ -62,13 +62,13 @@ static struct 
cpuidle_driver imx6q_cpuidle_driver = { */ void imx6q_cpuidle_fec_irqs_used(void) { - imx6q_cpuidle_driver.states[1].disabled = true; + cpuidle_driver_state_disabled(&imx6q_cpuidle_driver, 1, true); } EXPORT_SYMBOL_GPL(imx6q_cpuidle_fec_irqs_used); void imx6q_cpuidle_fec_irqs_unused(void) { - imx6q_cpuidle_driver.states[1].disabled = false; + cpuidle_driver_state_disabled(&imx6q_cpuidle_driver, 1, false); } EXPORT_SYMBOL_GPL(imx6q_cpuidle_fec_irqs_unused); diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c index 2447427cb4a8f2e9510ec4fe60944a52b2fd324c..69f3fa270fbe396410c088d82130595aceaa5639 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra20.c +++ b/arch/arm/mach-tegra/cpuidle-tegra20.c @@ -203,7 +203,7 @@ void tegra20_cpuidle_pcie_irqs_in_use(void) { pr_info_once( "Disabling cpuidle LP2 state, since PCIe IRQs are in use\n"); - tegra_idle_driver.states[1].disabled = true; + cpuidle_driver_state_disabled(&tegra_idle_driver, 1, true); } int __init tegra20_cpuidle_init(void) diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index 287ef898a55e11990c20d83c9a8f97cacc137861..43c6d66b6e733a334e15bdb6bf9d8f8ced4d075a 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -176,6 +176,8 @@ static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist const struct dma_map_ops arm_nommu_dma_ops = { .alloc = arm_nommu_dma_alloc, .free = arm_nommu_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_nommu_dma_mmap, .map_page = arm_nommu_dma_map_page, .unmap_page = arm_nommu_dma_unmap_page, diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 27576c7b836ee6ecf3cb210749bdad46f898ee38..b182a59a3a4ee7553506fd5646654fcc8c43880b 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -182,6 +182,8 @@ static void arm_dma_sync_single_for_device(struct device *dev, const struct dma_map_ops arm_dma_ops = { .alloc = arm_dma_alloc, .free = arm_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_dma_mmap, .get_sgtable = arm_dma_get_sgtable, .map_page = arm_dma_map_page, @@ -209,6 +211,8 @@ static int arm_coherent_dma_mmap(struct device *dev, struct vm_area_struct *vma, const struct dma_map_ops arm_coherent_dma_ops = { .alloc = arm_coherent_dma_alloc, .free = arm_coherent_dma_free, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, .mmap = arm_coherent_dma_mmap, .get_sgtable = arm_dma_get_sgtable, .map_page = arm_coherent_dma_map_page, @@ -529,7 +533,7 @@ static void *__alloc_from_pool(size_t size, struct page **ret_page) static bool __in_atomic_pool(void *start, size_t size) { - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } static int __free_from_pool(void *start, size_t size) @@ -2332,26 +2336,20 @@ void arch_teardown_dma_ops(struct device *dev) } #ifdef CONFIG_SWIOTLB -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_page_cpu_to_dev(phys_to_page(paddr), paddr & (PAGE_SIZE - 1), size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum 
dma_data_direction dir) { __dma_page_dev_to_cpu(phys_to_page(paddr), paddr & (PAGE_SIZE - 1), size, dir); } -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return dma_to_pfn(dev, dma_addr); -} - void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 38fa917c8585c7e6b6e2efe75d785885f41260a2..a6a2514e5fe8fb0370b164128c5d6570ccd73109 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -70,20 +70,20 @@ static void dma_cache_maint(dma_addr_t handle, size_t size, u32 op) * pfn_valid returns true the pages is local and we can use the native * dma-direct functions, otherwise we call the Xen specific version. */ -void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +void xen_dma_sync_for_cpu(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (pfn_valid(PFN_DOWN(handle))) - arch_sync_dma_for_cpu(dev, paddr, size, dir); + arch_sync_dma_for_cpu(paddr, size, dir); else if (dir != DMA_TO_DEVICE) dma_cache_maint(handle, size, GNTTAB_CACHE_INVAL); } -void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +void xen_dma_sync_for_device(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (pfn_valid(PFN_DOWN(handle))) - arch_sync_dma_for_device(dev, paddr, size, dir); + arch_sync_dma_for_device(paddr, size, dir); else if (dir == DMA_FROM_DEVICE) dma_cache_maint(handle, size, GNTTAB_CACHE_INVAL); else diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 09d370f8c98965d8267cf239c9a038629dbda5af..c5d482b4392d06af9113a5105e375b6e74bb25d4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -12,7 +12,6 @@ config ARM64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_FAST_MULTIPLIER @@ -183,7 +182,6 @@ config ARM64 select PCI_SYSCALL if PCI select POWER_RESET select POWER_SUPPLY - select REFCOUNT_FULL select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index c9a867ac32d48d7cd04c17e499afd7e97727df68..b39b520ee46530dc32e457e9d5372251183407b4 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -858,4 +858,7 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set # CONFIG_FTRACE is not set -CONFIG_MEMTEST=y +CONFIG_INTEL_IDXD_BUS=m +CONFIG_INTEL_IDXD=m +CONFIG_CRYPTO_DEV_IAX_CRYPTO=m +# CONFIG_INTEL_IDXD_COMPAT is not set diff --git a/arch/arm64/configs/oc.config b/arch/arm64/configs/oc.config index 9157c78e3899850d04f881b509683701a8a89aa8..c91e389cd8fe3212a1e0e885c87f9f8370beb33b 100644 --- a/arch/arm64/configs/oc.config +++ b/arch/arm64/configs/oc.config @@ -407,6 +407,7 @@ CONFIG_IP_DCCP=m CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_RDMA=m +CONFIG_DMATEST=m CONFIG_RDS_TCP=m CONFIG_TIPC=m CONFIG_TIPC_MEDIA_IB=y diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 9239416e93d4e9a9787ea9d5801914c075c359a9..6c45350e33aa5a47ce0ea74aaed61c945c1a101f 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -13,14 +13,14 @@ #include -void 
arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_map_area(phys_to_virt(paddr), size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_unmap_area(phys_to_virt(paddr), size, dir); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 99bc0289ab2b660fa1a003551879f7d6b0256cd1..05fbeb89650245b69f4abedebb627a16f6112756 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1061,6 +1061,8 @@ int arch_add_memory(int nid, u64 start, u64 size, __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); + memblock_clear_nomap(start, size); + return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, restrictions); } diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c index b319808e8f6bd3948790aee38711a3472d5b7565..a5909091cb14244f635d73a9b6ad76e3a03b6e19 100644 --- a/arch/c6x/mm/dma-coherent.c +++ b/arch/c6x/mm/dma-coherent.c @@ -140,7 +140,7 @@ void __init coherent_mem_init(phys_addr_t start, u32 size) sizeof(long)); } -static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size, +static void c6x_dma_sync(phys_addr_t paddr, size_t size, enum dma_data_direction dir) { BUG_ON(!valid_dma_direction(dir)); @@ -160,14 +160,14 @@ static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - return c6x_dma_sync(dev, paddr, size, dir); + return c6x_dma_sync(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - return c6x_dma_sync(dev, paddr, size, dir); + return c6x_dma_sync(paddr, size, dir); } diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 06e85b56545427de4f1883d95a59319f817c85d3..8f6571ae27c867ad350c1eb2779e4e65b5fbd0b3 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -58,8 +58,8 @@ void arch_dma_prep_coherent(struct page *page, size_t size) cache_op(page_to_phys(page), size, dma_wbinv_set_zero_range); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: @@ -74,8 +74,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c index f561b127c4b43caa9324805e9c0998491b06ef27..25f388d9cfcc36650454ecd55217bff5c6eac7da 100644 --- a/arch/hexagon/kernel/dma.c +++ b/arch/hexagon/kernel/dma.c @@ -55,8 +55,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, gen_pool_free(coherent_pool, (unsigned long) vaddr, 
size); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *addr = phys_to_virt(paddr); diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 16714477eef429847cf5a57da5180f83b0167d44..69ef5c990dd18e2f9fc7f7548c1017604b23e039 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -33,7 +33,7 @@ config IA64 select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING - select ARCH_HAS_DMA_COHERENT_TO_PFN + select DMA_NONCOHERENT_MMAP select ARCH_HAS_SYNC_DMA_FOR_CPU select VIRT_TO_BUS select GENERIC_IRQ_PROBE @@ -57,6 +57,7 @@ config IA64 select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH select NUMA if !FLATMEM + select PCI_MSI_ARCH_FALLBACKS default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 4a3262795890443625712445b0a797d21b35f438..09ef9ce9988d1fa218864d8e0c1193c75a38ba72 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -19,9 +19,3 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, { dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); } - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return page_to_pfn(virt_to_page(cpu_addr)); -} diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index ee50506d86f426cbe1c30c58ef277449a59e159e..df6d3dfa9d820637a6be2f005d7f22ab0e7d1a9f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -73,8 +73,8 @@ __ia64_sync_icache_dcache (pte_t pte) * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. 
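Across all of these architectures the conversion is the same: the struct device * argument was never used for cache maintenance, so the hooks shrink to (paddr, size, dir). A sketch of the resulting hook shape on a hypothetical write-back architecture; the example_cache_*() helpers stand in for per-arch cache primitives and are not part of the patch:

/* Sketch: post-change hook signature; only the physical range and the
 * transfer direction are needed. */
void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
			      enum dma_data_direction dir)
{
	void *vaddr = phys_to_virt(paddr);

	switch (dir) {
	case DMA_TO_DEVICE:
		example_cache_wback(vaddr, size);
		break;
	case DMA_FROM_DEVICE:
		example_cache_inv(vaddr, size);
		break;
	case DMA_BIDIRECTIONAL:
		example_cache_wback_inv(vaddr, size);
		break;
	default:
		break;
	}
}
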
*/ -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { unsigned long pfn = PHYS_PFN(paddr); diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c index 3fab684cc0db0b3f67ef346f2e7a15989e2d0260..871a0e11da341ada53ba4b816f98b676a1bb6ea2 100644 --- a/arch/m68k/kernel/dma.c +++ b/arch/m68k/kernel/dma.c @@ -61,8 +61,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, #endif /* CONFIG_MMU && !CONFIG_COLDFIRE */ -void arch_sync_dma_for_device(struct device *dev, phys_addr_t handle, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t handle, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index c9c4be822456b4320800571e11dba5023ab67bf1..261c26df1c9ff19df521ea23ec3fed93048a4ffa 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -4,7 +4,6 @@ config MICROBLAZE select ARCH_32BIT_OFF_T select ARCH_NO_SWAP select ARCH_HAS_BINFMT_FLAT if !MMU - select ARCH_HAS_DMA_COHERENT_TO_PFN if MMU select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c index a89c2d4ed5ffc74dfa54cb864885d3596e9bbf6c..d7bebd04247b72b797185cca5494dd7ca8755fea 100644 --- a/arch/microblaze/kernel/dma.c +++ b/arch/microblaze/kernel/dma.c @@ -15,7 +15,7 @@ #include #include -static void __dma_sync(struct device *dev, phys_addr_t paddr, size_t size, +static void __dma_sync(phys_addr_t paddr, size_t size, enum dma_data_direction direction) { switch (direction) { @@ -31,14 +31,14 @@ static void __dma_sync(struct device *dev, phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - __dma_sync(dev, paddr, size, dir); + __dma_sync(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - __dma_sync(dev, paddr, size, dir); + __dma_sync(paddr, size, dir); } diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index 8c5f0c332d8b64ed4cf2008a292e05a39cfa0546..e09b66e43cb63f436f68a0444b32aca7a71dcb5b 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -40,7 +40,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size) #define UNCACHED_SHADOW_MASK 0 #endif /* CONFIG_XILINX_UNCACHED_SHADOW */ -void *uncached_kernel_address(void *ptr) +void *arch_dma_set_uncached(void *ptr, size_t size) { unsigned long addr = (unsigned long)ptr; @@ -49,11 +49,4 @@ void *uncached_kernel_address(void *ptr) pr_warn("ERROR: Your cache coherent area is CACHED!!!\n"); return (void *)addr; } - -void *cached_kernel_address(void *ptr) -{ - unsigned long addr = (unsigned long)ptr; - - return (void *)(addr & ~UNCACHED_SHADOW_MASK); -} #endif /* CONFIG_MMU */ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6ecdc690f7336a98cf433a42e2435a0916f019cd..57f957694dc1567c1f33d7a6e2ea7c0ab649f594 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -82,6 +82,7 @@ config MIPS select MODULES_USE_ELF_RELA if MODULES && 
64BIT select MODULES_USE_ELF_REL if MODULES select PERF_USE_VMALLOC + select PCI_MSI_ARCH_FALLBACKS select RTC_LIB select SYSCTL_EXCEPTION_TRACE select VIRT_TO_BUS @@ -1133,11 +1134,12 @@ config DMA_NONCOHERENT # significant advantages. # select ARCH_HAS_DMA_WRITE_COMBINE + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_UNCACHED_SEGMENT - select NEED_DMA_MAP_STATE - select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_DMA_SET_UNCACHED + select DMA_NONCOHERENT_MMAP select DMA_NONCOHERENT_CACHE_SYNC + select NEED_DMA_MAP_STATE config SYS_HAS_EARLY_PRINTK bool diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c index 3d13c77c125f4a8b7fa7097446e7c550f43ecc19..ba2a5d33dfd3fa0d309a24de758d2fcf188a8f91 100644 --- a/arch/mips/bmips/dma.c +++ b/arch/mips/bmips/dma.c @@ -52,7 +52,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t pa) return pa; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { struct bmips_dma_range *r; @@ -64,7 +64,7 @@ phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) return dma_addr; } -void arch_sync_dma_for_cpu_all(struct device *dev) +void arch_sync_dma_for_cpu_all(void) { void __iomem *cbr = BMIPS_GET_CBR(); u32 cfg; diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c index 14ea680d180e07339ea8c9ff8baac144ea2cf364..388b13ba2558c2e939076789718637943848dbbc 100644 --- a/arch/mips/cavium-octeon/dma-octeon.c +++ b/arch/mips/cavium-octeon/dma-octeon.c @@ -177,7 +177,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return paddr; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { #ifdef CONFIG_PCI if (dev && dev_is_pci(dev)) diff --git a/arch/mips/include/asm/dma-direct.h b/arch/mips/include/asm/dma-direct.h index b5c240806e1bb72b4b1b35d2d8ed13ac3e562f64..8e178651c638c279fd74f431e08408fc0baaa9b8 100644 --- a/arch/mips/include/asm/dma-direct.h +++ b/arch/mips/include/asm/dma-direct.h @@ -2,15 +2,7 @@ #ifndef _MIPS_DMA_DIRECT_H #define _MIPS_DMA_DIRECT_H 1 -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) -{ - if (!dev->dma_mask) - return false; - - return addr + size - 1 <= *dev->dma_mask; -} - dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr); -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr); +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr); #endif /* _MIPS_DMA_DIRECT_H */ diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index a01e14955187e40140b7b065ca665e42ecb2b172..c64a297e82b3c33323d19fb769ba4450fde5f32d 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -592,7 +592,7 @@ static dma_addr_t jazz_dma_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return vdma_alloc(phys, size); } @@ -600,7 +600,7 @@ static void jazz_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_cpu(dev, vdma_log2phys(dma_addr), size, dir); + arch_sync_dma_for_cpu(vdma_log2phys(dma_addr), size, dir); vdma_free(dma_addr); } @@ -612,7 +612,7 @@ static int jazz_dma_map_sg(struct device *dev, struct 
scatterlist *sglist, for_each_sg(sglist, sg, nents, i) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); sg->dma_address = vdma_alloc(sg_phys(sg), sg->length); if (sg->dma_address == DMA_MAPPING_ERROR) @@ -631,8 +631,7 @@ static void jazz_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, sg, nents, i) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, - dir); + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); vdma_free(sg->dma_address); } } @@ -640,13 +639,13 @@ static void jazz_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, static void jazz_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { - arch_sync_dma_for_device(dev, vdma_log2phys(addr), size, dir); + arch_sync_dma_for_device(vdma_log2phys(addr), size, dir); } static void jazz_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { - arch_sync_dma_for_cpu(dev, vdma_log2phys(addr), size, dir); + arch_sync_dma_for_cpu(vdma_log2phys(addr), size, dir); } static void jazz_dma_sync_sg_for_device(struct device *dev, @@ -656,7 +655,7 @@ static void jazz_dma_sync_sg_for_device(struct device *dev, int i; for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } static void jazz_dma_sync_sg_for_cpu(struct device *dev, @@ -666,7 +665,7 @@ static void jazz_dma_sync_sg_for_cpu(struct device *dev, int i; for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); } const struct dma_map_ops jazz_dma_ops = { diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index 1d4d57dd9acf8ccff46780daa3107a16b1aad120..fcea92d95d86099d8fa5d00e52a1e825eee901c2 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -27,7 +27,7 @@ * R10000 and R12000 are used in such systems, the SGI IP28 Indigo² rsp. * SGI IP32 aka O2. 
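The jazzdma hunks around this point show the knock-on effect on scatterlist paths: sg_phys() already yields exactly the physical address the hook wants, so the device argument simply drops out. A condensed sketch of the loop shape after the change:

/* Sketch: syncing a scatterlist for the device without a struct device. */
#include <linux/scatterlist.h>

static void example_sync_sg_for_device(struct scatterlist *sgl, int nents,
				       enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nents, i)
		arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
}
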
*/ -static inline bool cpu_needs_post_dma_flush(struct device *dev) +static inline bool cpu_needs_post_dma_flush(void) { switch (boot_cpu_type()) { case CPU_R10000: @@ -49,22 +49,11 @@ void arch_dma_prep_coherent(struct page *page, size_t size) dma_cache_wback_inv((unsigned long)page_address(page), size); } -void *uncached_kernel_address(void *addr) +void *arch_dma_set_uncached(void *addr, size_t size) { return (void *)(__pa(addr) + UNCAC_BASE); } -void *cached_kernel_address(void *addr) -{ - return __va(addr) - UNCAC_BASE; -} - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return page_to_pfn(virt_to_page(cached_kernel_address(cpu_addr))); -} - static inline void dma_sync_virt(void *addr, size_t size, enum dma_data_direction dir) { @@ -118,17 +107,17 @@ static inline void dma_sync_phys(phys_addr_t paddr, size_t size, } while (left); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { dma_sync_phys(paddr, size, dir); } #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - if (cpu_needs_post_dma_flush(dev)) + if (cpu_needs_post_dma_flush()) dma_sync_phys(paddr, size, dir); } #endif diff --git a/arch/mips/pci/fixup-sb1250.c b/arch/mips/pci/fixup-sb1250.c index 8a41b359cf900429b96bc3dad1173bd32f1fabec..40efc990cdceb8f1cd448b13ae5ae1d9ba347ed7 100644 --- a/arch/mips/pci/fixup-sb1250.c +++ b/arch/mips/pci/fixup-sb1250.c @@ -21,22 +21,22 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_PCI, /* * The BCM1250, etc. PCI host bridge does not support DAC on its 32-bit - * bus, so we set the bus's DMA mask accordingly. However the HT link + * bus, so we set the bus's DMA limit accordingly. However the HT link * down the artificial PCI-HT bridge supports 40-bit addressing and the * SP1011 HT-PCI bridge downstream supports both DAC and a 64-bit bus * width, so we record the PCI-HT bridge's secondary and subordinate bus - * numbers and do not set the mask for devices present in the inclusive + * numbers and do not set the limit for devices present in the inclusive * range of those. 
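Here, as in the microblaze and nios2 hunks, the uncached_kernel_address()/cached_kernel_address() pair collapses into a single arch_dma_set_uncached(addr, size) hook. A sketch of how the generic allocator is assumed to drive it; the flow and error handling are reduced to the essentials and are not a verbatim copy of the common code:

/* Sketch (assumed generic-code flow): allocate cacheable pages, flush
 * them once, then hand out the uncached alias the arch hook returns. */
#include <linux/dma-noncoherent.h>
#include <linux/mm.h>

static void *example_alloc_uncached(size_t size, gfp_t gfp)
{
	struct page *page = alloc_pages(gfp, get_order(size));
	void *ret;

	if (!page)
		return NULL;
	arch_dma_prep_coherent(page, size);	/* writeback + invalidate */
	ret = arch_dma_set_uncached(page_address(page), size);
	if (IS_ERR(ret)) {
		__free_pages(page, get_order(size));
		return NULL;
	}
	return ret;
}
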
*/ -struct sb1250_bus_dma_mask_exclude { +struct sb1250_bus_dma_limit_exclude { bool set; unsigned char start; unsigned char end; }; -static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) +static int sb1250_bus_dma_limit(struct pci_dev *dev, void *data) { - struct sb1250_bus_dma_mask_exclude *exclude = data; + struct sb1250_bus_dma_limit_exclude *exclude = data; bool exclude_this; bool ht_bridge; @@ -55,7 +55,7 @@ static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) exclude->start, exclude->end); } else { dev_dbg(&dev->dev, "disabling DAC for device"); - dev->dev.bus_dma_mask = DMA_BIT_MASK(32); + dev->dev.bus_dma_limit = DMA_BIT_MASK(32); } return 0; @@ -63,9 +63,9 @@ static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) static void quirk_sb1250_pci_dac(struct pci_dev *dev) { - struct sb1250_bus_dma_mask_exclude exclude = { .set = false }; + struct sb1250_bus_dma_limit_exclude exclude = { .set = false }; - pci_walk_bus(dev->bus, sb1250_bus_dma_mask, &exclude); + pci_walk_bus(dev->bus, sb1250_bus_dma_limit, &exclude); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_PCI, quirk_sb1250_pci_dac); diff --git a/arch/mips/pci/pci-ar2315.c b/arch/mips/pci/pci-ar2315.c index 0fed6fc17fe4090e40b3422a1316d1e6c4ec8d70..89b908ff499161d2f29e04448aa69a4c6a819cc9 100644 --- a/arch/mips/pci/pci-ar2315.c +++ b/arch/mips/pci/pci-ar2315.c @@ -175,7 +175,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return paddr + ar2315_dev_offset(dev); } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { return dma_addr - ar2315_dev_offset(dev); } diff --git a/arch/mips/sgi-ip32/ip32-dma.c b/arch/mips/sgi-ip32/ip32-dma.c index fa7b17cb53853ed04e7b452dbf3d19ba24a30005..160317294d97a93752348663a3de9fe93d66dfa6 100644 --- a/arch/mips/sgi-ip32/ip32-dma.c +++ b/arch/mips/sgi-ip32/ip32-dma.c @@ -27,7 +27,7 @@ dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) return dma_addr; } -phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr = dma_addr & RAM_OFFSET_MASK; diff --git a/arch/nds32/kernel/dma.c b/arch/nds32/kernel/dma.c index 4206d4b6c8cef40adfe7f59957e64012521fb35e..69d762182d49bfbd23ff57bc64e11b9328214519 100644 --- a/arch/nds32/kernel/dma.c +++ b/arch/nds32/kernel/dma.c @@ -46,8 +46,8 @@ static inline void cache_op(phys_addr_t paddr, size_t size, } while (left); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_FROM_DEVICE: @@ -61,8 +61,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 44b5da37e8bdc1b4fb123a0344f254b0c6ccf65a..2fc4ed210b5f0446ce526f1c063d91d4f347b6b6 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -2,9 +2,10 @@ config NIOS2 def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_UNCACHED_SEGMENT + select 
ARCH_HAS_DMA_SET_UNCACHED select ARCH_NO_SWAP select TIMER_OF select GENERIC_ATOMIC64 diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c index 9cb238664584c6cef9b96809db406fed93beb2ec..fd887d5f3f9a786eb3c7fbde2ef103fe9cbc07ed 100644 --- a/arch/nios2/mm/dma-mapping.c +++ b/arch/nios2/mm/dma-mapping.c @@ -18,8 +18,8 @@ #include #include -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *vaddr = phys_to_virt(paddr); @@ -42,8 +42,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *vaddr = phys_to_virt(paddr); @@ -67,7 +67,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size) flush_dcache_range(start, start + size); } -void *uncached_kernel_address(void *ptr) +void *arch_dma_set_uncached(void *ptr, size_t size) { unsigned long addr = (unsigned long)ptr; @@ -75,13 +75,3 @@ void *uncached_kernel_address(void *ptr) return (void *)ptr; } - -void *cached_kernel_address(void *ptr) -{ - unsigned long addr = (unsigned long)ptr; - - addr &= ~CONFIG_NIOS2_IO_REGION_BASE; - addr |= CONFIG_NIOS2_KERNEL_REGION_BASE; - - return (void *)ptr; -} diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index 4d5b8bd1d795684ae51a9eb7cb6aa7acb729dda2..adec711ad39d5bafdab35cc07a87758cca4d5eef 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -125,7 +125,7 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr, free_pages_exact(vaddr, size); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t addr, size_t size, +void arch_sync_dma_for_device(phys_addr_t addr, size_t size, enum dma_data_direction dir) { unsigned long cl; diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ca35d9a76e5062ea12e150310fdf3bda035e0893..a60d47fd4d55f7d598ce7033a93996ea3e385bc7 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -439,14 +439,14 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)__va(dma_handle), order); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { flush_kernel_dcache_range((unsigned long)phys_to_virt(paddr), size); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { flush_kernel_dcache_range((unsigned long)phys_to_virt(paddr), size); } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 757175ccf53c0023ab048d7f8c3838530adfdf0d..c0c7d51fc3ee0588c11b55c4de67586c05e80abb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -242,6 +242,7 @@ config PPC select OLD_SIGACTION if PPC32 select OLD_SIGSUSPEND select PCI_DOMAINS if PCI + select PCI_MSI_ARCH_FALLBACKS select PCI_SYSCALL if PCI select PPC_DAWR if PPC64 select RTC_LIB diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h index a2912b47102cf3a1f410726e116d060c739f44d7..e29e8a236b8dff399fac84e9293fd0ac6169b4de 100644 --- 
a/arch/powerpc/include/asm/dma-direct.h +++ b/arch/powerpc/include/asm/dma-direct.h @@ -2,15 +2,6 @@ #ifndef ASM_POWERPC_DMA_DIRECT_H #define ASM_POWERPC_DMA_DIRECT_H 1 -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) -{ - if (!dev->dma_mask) - return false; - - return addr + size - 1 <= - min_not_zero(*dev->dma_mask, dev->bus_dma_mask); -} - static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { if (!dev) diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 2a82984356f81ffd3407361a50f8773adb272215..5ab4f868e919b8dd8f1efe75ddfefc94383a68e2 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -104,14 +104,14 @@ static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir) #endif } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_sync_page(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_sync_page(paddr, size, dir); } diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6f013e418834987c5b40218ce974e3d344357eb0..f5f973077c28c431902399c6cbe20deffd9de70f 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -529,6 +529,7 @@ static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) } } cpuhw->bhrb_stack.nr = u_index; + cpuhw->bhrb_stack.hw_idx = -1ULL; return; } @@ -2135,7 +2136,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_WEIGHT && ppmu->get_mem_weight) - ppmu->get_mem_weight(&data.weight); + ppmu->get_mem_weight(&data.weight.full); if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index f0330ce498d1e1b0e0ba67d6f0442396bfa95b93..97af19141aed72802a74fd972bde170c272a6e2b 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -459,7 +459,6 @@ config NOT_COHERENT_CACHE bool depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index ff0e2b156cb5fab9b2639148ee31400eafd26982..617a443d673dad02e0e1c7b1f7bb5ddbb6e3ff5c 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -115,8 +115,8 @@ static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); - pdev->dev.bus_dma_mask = - hose->dma_window_base_cur + hose->dma_window_size; + pdev->dev.bus_dma_limit = + hose->dma_window_base_cur + hose->dma_window_size - 1; } static void setup_swiotlb_ops(struct pci_controller *hose) @@ -135,7 +135,7 @@ static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) * mapping that allows addressing any RAM address from across PCI. 
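The sb1250 and fsl_pci hunks nearby capture the semantic shift from bus_dma_mask (a power-of-two-style bitmask) to bus_dma_limit (an inclusive highest address): a window of size bytes starting at base now ends at base + size - 1, and 0 still means "unrestricted". A one-function sketch with illustrative values:

/* Sketch: expressing a bridge DMA window as an inclusive limit. */
static void example_set_window(struct device *dev, u64 base, u64 size)
{
	/* Highest DMA'able address, not a mask. */
	dev->bus_dma_limit = base + size - 1;
}
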
*/ if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { - dev->bus_dma_mask = 0; + dev->bus_dma_limit = 0; dev->archdata.dma_offset = pci64_dma_offset; } } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 43a81d0ad5074dc8fd22adba001ce61d428b6f3e..6bda8c9b519d2c26ac3cc6fe1699588f6a7608c2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -182,6 +182,7 @@ config S390 select OLD_SIGSUSPEND3 select PCI_DOMAINS if PCI select PCI_MSI if PCI + select PCI_MSI_ARCH_FALLBACKS select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 38d64030aacf69dc6cd2c0b62b13c67d79a3d4ea..2e60c80395ab084afbe4713fbc8816f42bea3f60 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -62,7 +62,6 @@ CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y -CONFIG_REFCOUNT_FULL=y CONFIG_LOCK_EVENT_COUNTS=y CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 6087a4e9b2bfaf9667d719565c33cde983acc092..b05187ce5dbdc59cc72d69ba3cb8e43b0e220f57 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -2,9 +2,6 @@ #ifndef __ASM_S390_PCI_H #define __ASM_S390_PCI_H -/* must be set before including pci_clp.h */ -#define PCI_BAR_COUNT 6 - #include #include #include @@ -138,7 +135,7 @@ struct zpci_dev { char res_name[16]; bool mio_capable; - struct zpci_bar_struct bars[PCI_BAR_COUNT]; + struct zpci_bar_struct bars[PCI_STD_NUM_BARS]; u64 start_dma; /* Start of available DMA addresses */ u64 end_dma; /* End of available DMA addresses */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index 50359172cc488b538845ebbfe8f2c32bcc35985b..bd2cb4ea7d93d3081e7b3486e5b79e3f9f7b76a2 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -77,7 +77,7 @@ struct mio_info { struct { u64 wb; u64 wt; - } addr[PCI_BAR_COUNT]; + } addr[PCI_STD_NUM_BARS]; u32 reserved[6]; } __packed; @@ -98,9 +98,9 @@ struct clp_rsp_query_pci { u16 util_str_avail : 1; /* utility string available? 
*/ u16 pfgid : 8; /* pci function group id */ u32 fid; /* pci function id */ - u8 bar_size[PCI_BAR_COUNT]; + u8 bar_size[PCI_STD_NUM_BARS]; u16 pchid; - __le32 bar[PCI_BAR_COUNT]; + __le32 bar[PCI_STD_NUM_BARS]; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ u32 : 16; u8 fmb_len; diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 6105b1b6e49b71016511abee2e741de4a146fef2..b5af0b891326c4b0144491d635d94a10311765c9 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -43,7 +43,7 @@ static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); static DEFINE_SPINLOCK(zpci_domain_lock); #define ZPCI_IOMAP_ENTRIES \ - min(((unsigned long) ZPCI_NR_DEVICES * PCI_BAR_COUNT / 2), \ + min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ ZPCI_IOMAP_MAX_ENTRIES) static DEFINE_SPINLOCK(zpci_iomap_lock); @@ -294,7 +294,7 @@ static void __iomem *pci_iomap_range_mio(struct pci_dev *pdev, int bar, void __iomem *pci_iomap_range(struct pci_dev *pdev, int bar, unsigned long offset, unsigned long max) { - if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + if (bar >= PCI_STD_NUM_BARS || !pci_resource_len(pdev, bar)) return NULL; if (static_branch_likely(&have_mio)) @@ -324,7 +324,7 @@ static void __iomem *pci_iomap_wc_range_mio(struct pci_dev *pdev, int bar, void __iomem *pci_iomap_wc_range(struct pci_dev *pdev, int bar, unsigned long offset, unsigned long max) { - if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + if (bar >= PCI_STD_NUM_BARS || !pci_resource_len(pdev, bar)) return NULL; if (static_branch_likely(&have_mio)) @@ -416,7 +416,7 @@ static void zpci_map_resources(struct pci_dev *pdev) resource_size_t len; int i; - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { len = pci_resource_len(pdev, i); if (!len) continue; @@ -451,7 +451,7 @@ static void zpci_unmap_resources(struct pci_dev *pdev) if (zpci_use_mio(zdev)) return; - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { len = pci_resource_len(pdev, i); if (!len) continue; @@ -514,7 +514,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, snprintf(zdev->res_name, sizeof(zdev->res_name), "PCI Bus %04x:%02x", zdev->domain, ZPCI_BUS_NR); - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (!zdev->bars[i].size) continue; entry = zpci_alloc_iomap(zdev); @@ -551,7 +551,7 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) { int i; - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (!zdev->bars[i].size || !zdev->bars[i].res) continue; @@ -573,7 +573,7 @@ int pcibios_add_device(struct pci_dev *pdev) pdev->dev.dma_ops = &s390_pci_dma_ops; zpci_map_resources(pdev); - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { res = &pdev->resource[i]; if (res->parent || !res->flags) continue; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 20e093f863297032d687c442b0c2b30727d0ed10..25208fa95426043ff388841320d7aeaebf3be56b 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -145,7 +145,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, { int i; - for (i = 0; i < PCI_BAR_COUNT; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { zdev->bars[i].val = le32_to_cpu(response->bar[i]); zdev->bars[i].size = response->bar_size[i]; } @@ -164,8 +164,8 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, sizeof(zdev->util_str)); } zdev->mio_capable = response->mio_addr_avail; - for (i = 0; i < 
PCI_BAR_COUNT; i++) { - if (!(response->mio.valid & (1 << (PCI_BAR_COUNT - i - 1)))) + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!(response->mio.valid & (1 << (PCI_STD_NUM_BARS - i - 1)))) continue; zdev->bars[i].mio_wb = (void __iomem *) response->mio.addr[i].wb; diff --git a/arch/sh/kernel/cpu/shmobile/cpuidle.c b/arch/sh/kernel/cpu/shmobile/cpuidle.c index dbd2cdec2ddb65c7cb305643f81195d947ededfa..b0f9c8f8fd146adb96addae6240f488519d2118c 100644 --- a/arch/sh/kernel/cpu/shmobile/cpuidle.c +++ b/arch/sh/kernel/cpu/shmobile/cpuidle.c @@ -67,7 +67,7 @@ static struct cpuidle_driver cpuidle_driver = { .enter = cpuidle_sleep_enter, .name = "C2", .desc = "SuperH Sleep Mode [SF]", - .disabled = true, + .flags = CPUIDLE_FLAG_UNUSABLE, }, { .exit_latency = 2300, @@ -76,7 +76,7 @@ static struct cpuidle_driver cpuidle_driver = { .enter = cpuidle_sleep_enter, .name = "C3", .desc = "SuperH Mobile Standby Mode [SF]", - .disabled = true, + .flags = CPUIDLE_FLAG_UNUSABLE, }, }, .safe_state_index = 0, @@ -86,10 +86,10 @@ static struct cpuidle_driver cpuidle_driver = { int __init sh_mobile_setup_cpuidle(void) { if (sh_mobile_sleep_supported & SUSP_SH_SF) - cpuidle_driver.states[1].disabled = false; + cpuidle_driver.states[1].flags = CPUIDLE_FLAG_NONE; if (sh_mobile_sleep_supported & SUSP_SH_STANDBY) - cpuidle_driver.states[2].disabled = false; + cpuidle_driver.states[2].flags = CPUIDLE_FLAG_NONE; return cpuidle_register(&cpuidle_driver, NULL); } diff --git a/arch/sh/kernel/dma-coherent.c b/arch/sh/kernel/dma-coherent.c index b17514619b7e1f7de0d2df82efeaf0ac3b8144e8..eeb25a4fa55f24e91071403b87da8f6d014878ec 100644 --- a/arch/sh/kernel/dma-coherent.c +++ b/arch/sh/kernel/dma-coherent.c @@ -25,7 +25,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * Pages from the page allocator may have data present in * cache. So flush the cache before using uncached memory. */ - arch_sync_dma_for_device(dev, virt_to_phys(ret), size, + arch_sync_dma_for_device(virt_to_phys(ret), size, DMA_BIDIRECTIONAL); ret_nocache = (void __force *)ioremap_nocache(virt_to_phys(ret), size); @@ -59,8 +59,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, iounmap(vaddr); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *addr = sh_cacheop_vaddr(phys_to_virt(paddr)); diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 349e27771ceaff55f8dc0ca22647767e46eedfc0..2717701a0bfca50938aab205fd055150758c8101 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -42,6 +42,7 @@ config SPARC select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA select PCI_SYSCALL if PCI + select PCI_MSI_ARCH_FALLBACKS select ODD_RT_SIGACTION select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index f89603855f1ec4e81381881d0839070745335336..e59461d03b9a5ef8ffed637ddb8c5c26b6178fbe 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -366,8 +366,8 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, /* IIep is write-through, not flushing on cpu to device transfer. 
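The s390 conversions above retire the private PCI_BAR_COUNT in favour of the generic PCI_STD_NUM_BARS, i.e. the six standard BARs of a type-0 config header. A sketch of the canonical iteration after the rename:

/* Sketch: walking the standard BARs with the generic constant. */
#include <linux/pci.h>

static void example_log_bars(struct pci_dev *pdev)
{
	int i;

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		if (!pci_resource_len(pdev, i))
			continue;	/* unimplemented BAR */
		dev_info(&pdev->dev, "BAR%d: %pR\n", i, &pdev->resource[i]);
	}
}
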
*/ -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (dir != PCI_DMA_TODEVICE) dma_make_coherent(paddr, PAGE_ALIGN(size)); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3ece88cebd687f773311a9b32e2bbc86d4854899..cb1cc0ecbd1ad04cbd0ec2e7b121676b396a9215 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -73,7 +73,6 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL - select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_SET_MEMORY @@ -102,6 +101,7 @@ config X86 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG select DCACHE_WORD_ACCESS + select DYNAMIC_SIGFRAME select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS @@ -215,6 +215,7 @@ config X86 select NEED_SG_DMA_LENGTH select PCI_DOMAINS if PCI select PCI_LOCKLESS_CONFIG if PCI + select PCI_MSI_ARCH_FALLBACKS select PERF_EVENTS select RTC_LIB select RTC_MC146818_LIB @@ -455,6 +456,7 @@ config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) select KERNFS + select PROC_CPU_RESCTRL if PROC_FS help Enable x86 CPU resource control support. @@ -2515,6 +2517,22 @@ config MODIFY_LDT_SYSCALL Saying 'N' here may make sense for embedded or server kernels. +config STRICT_SIGALTSTACK_SIZE + bool "Enforce strict size checking for sigaltstack" + depends on DYNAMIC_SIGFRAME + help + For historical reasons MINSIGSTKSZ is a constant which became + already too small with AVX512 support. Add a mechanism to + enforce strict checking of the sigaltstack size against the + real size of the FPU frame. This option enables the check + by default. It can also be controlled via the kernel command + line option 'strict_sas_size' independent of this config + switch. Enabling it might break existing applications which + allocate a too small sigaltstack but 'work' because they + never get a signal delivered. + + Say 'N' unless you want to really enforce this check. 
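A hypothetical sketch of the check the STRICT_SIGALTSTACK_SIZE help text describes: once the signal frame is dynamic (e.g. AVX-512 state), the constant MINSIGSTKSZ no longer bounds the real frame, so strict mode validates the registered stack against the computed frame size. The names strict_sigaltstack_size and example_sigframe_size() below are illustrative, not the patch's API:

/* Sketch (hypothetical): reject a sigaltstack smaller than the real
 * signal frame when strict checking is enabled. */
static bool strict_sigaltstack_size;	/* set from 'strict_sas_size=' */

static bool example_sigaltstack_valid(size_t ss_size)
{
	if (strict_sigaltstack_size)
		return ss_size > example_sigframe_size();
	/* Legacy behaviour: only the historical constant is enforced. */
	return ss_size >= MINSIGSTKSZ;
}
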
+ source "kernel/livepatch/Kconfig" endmenu @@ -3005,9 +3023,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -config X86_DEV_DMA_OPS - bool - source "drivers/firmware/Kconfig" source "arch/x86/kvm/Kconfig" diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 82bc60c8acb240c29f925df7dde03ac724edf3b4..f2db8c5e4b0610fdc6ef7ba7dfb58a2dc9e53635 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -554,7 +554,11 @@ setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_s case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: - e820_type = E820_TYPE_RAM; + if (efi_soft_reserve_enabled() && + (d->attribute & EFI_MEMORY_SP)) + e820_type = E820_TYPE_SOFT_RESERVED; + else + e820_type = E820_TYPE_RAM; break; case EFI_ACPI_MEMORY_NVS: diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 2e53c056ba20c16eda669d304e0da61346f56970..da0eedd5635d3bb3c95ac069c06e2ddad629cb33 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -132,8 +132,14 @@ char *skip_spaces(const char *str) #include "../../../../lib/ctype.c" #include "../../../../lib/cmdline.c" +enum parse_mode { + PARSE_MEMMAP, + PARSE_EFI, +}; + static int -parse_memmap(char *p, unsigned long long *start, unsigned long long *size) +parse_memmap(char *p, unsigned long long *start, unsigned long long *size, + enum parse_mode mode) { char *oldp; @@ -156,8 +162,29 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) *start = memparse(p + 1, &p); return 0; case '@': - /* memmap=nn@ss specifies usable region, should be skipped */ - *size = 0; + if (mode == PARSE_MEMMAP) { + /* + * memmap=nn@ss specifies usable region, should + * be skipped + */ + *size = 0; + } else { + unsigned long long flags; + + /* + * efi_fake_mem=nn@ss:attr the attr specifies + * flags that might imply a soft-reservation. 
+ */ + *start = memparse(p + 1, &p); + if (p && *p == ':') { + p++; + if (kstrtoull(p, 0, &flags) < 0) + *size = 0; + else if (flags & EFI_MEMORY_SP) + return 0; + } + *size = 0; + } /* Fall through */ default: /* @@ -172,7 +199,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) return -EINVAL; } -static void mem_avoid_memmap(char *str) +static void mem_avoid_memmap(enum parse_mode mode, char *str) { static int i; @@ -187,7 +214,7 @@ static void mem_avoid_memmap(char *str) if (k) *k++ = 0; - rc = parse_memmap(str, &start, &size); + rc = parse_memmap(str, &start, &size, mode); if (rc < 0) break; str = k; @@ -238,7 +265,6 @@ static void parse_gb_huge_pages(char *param, char *val) } } - static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); @@ -271,7 +297,7 @@ static void handle_mem_options(void) } if (!strcmp(param, "memmap")) { - mem_avoid_memmap(val); + mem_avoid_memmap(PARSE_MEMMAP, val); } else if (strstr(param, "hugepages")) { parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { @@ -284,6 +310,8 @@ static void handle_mem_options(void) goto out; mem_limit = mem_size; + } else if (!strcmp(param, "efi_fake_mem")) { + mem_avoid_memmap(PARSE_EFI, val); } } @@ -760,6 +788,10 @@ process_efi_entries(unsigned long minimum, unsigned long image_size) if (md->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + continue; + if (efi_mirror_found && !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) continue; diff --git a/arch/x86/configs/oc.config b/arch/x86/configs/oc.config index d190a290bbf022617501d9c95d818044316f636b..26bd155d4d6004c5f363b348a6e3ef34072463b5 100644 --- a/arch/x86/configs/oc.config +++ b/arch/x86/configs/oc.config @@ -22,6 +22,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_IOASIDS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y @@ -101,9 +102,14 @@ CONFIG_ACPI_HMAT=y CONFIG_ACPI_APEI=y CONFIG_ACPI_APEI_GHES=y CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_EDR=y CONFIG_ACPI_APEI_MEMORY_FAILURE=y CONFIG_ACPI_APEI_EINJ=m CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_ACPI_PFRU=y +CONFIG_ACPI_PFRU_TELEMETRY=y +CONFIG_ACPI_PRMT=y CONFIG_ACPI_EXTLOG=m CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y @@ -134,6 +140,7 @@ CONFIG_VHOST_VSOCK=m CONFIG_OPROFILE=m CONFIG_OPROFILE_EVENT_MULTIPLEX=y CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y @@ -775,6 +782,7 @@ CONFIG_IXGBE_DCB=y CONFIG_IXGBEVF=m CONFIG_I40E=m CONFIG_I40EVF=m +CONFIG_ICE=m CONFIG_IGC=m CONFIG_JME=m CONFIG_MVMDIO=m @@ -1440,6 +1448,7 @@ CONFIG_SECURITY_YAMA=y # CONFIG_INTEGRITY is not set CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,selinux" CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_DEFLATE=y CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_RSA=y @@ -1537,3 +1546,14 @@ CONFIG_EARLY_PRINTK_DBGP=y CONFIG_X86_DECODER_SELFTEST=y CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_UNWINDER_FRAME_POINTER=y +# Intel SPR +CONFIG_MFD_INTEL_PMT=y +CONFIG_INTEL_IDXD_BUS=m +CONFIG_INTEL_IDXD=m +CONFIG_INTEL_IDXD_SVM=y +CONFIG_INTEL_IDXD_PERFMON=y +CONFIG_VFIO_MDEV_IDXD=m +CONFIG_IOMMU_DEFAULT_PASSTHROUGH=y +CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON=y +CONFIG_CRYPTO_DEV_IAX_CRYPTO=m +CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS=y diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 94794c41b2858aec3c6bf73806edc811d286f801..af598e2f362cd4510f55ee8233e773ce25934521 
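For reference, the command-line shapes the extended KASLR parser above distinguishes (values illustrative; 0x40000 is the EFI_MEMORY_SP attribute bit):

    memmap=512M$0x100000000          reserved region ('#' and '!' behave
                                     likewise), always avoided
    memmap=512M@0x100000000          usable RAM, size forced to 0 and skipped
    efi_fake_mem=4G@9G:0x40000       avoided only when the attribute includes
                                     EFI_MEMORY_SP, since that range will be
                                     soft-reserved rather than general RAM
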
100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -223,6 +223,8 @@ static bool check_hw_exists(void) if (ret) goto msr_fail; for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (fixed_counter_disabled(i)) + continue; if (val & (0x03 << i*4)) { bios_fail = 1; val_fail = val; @@ -360,6 +362,7 @@ void x86_release_hardware(void) if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); + release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } @@ -1499,6 +1502,8 @@ void perf_event_print_debug(void) cpu, idx, prev_left); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1937,7 +1942,9 @@ static int __init init_hw_perf_events(void) pr_info("... generic registers: %d\n", x86_pmu.num_counters); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); /* @@ -2354,6 +2361,13 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) x86_pmu.sched_task(ctx, sched_in); } +static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + if (x86_pmu.swap_task_ctx) + x86_pmu.swap_task_ctx(prev, next); +} + void perf_check_microcode(void) { if (x86_pmu.check_microcode) @@ -2407,7 +2421,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), + .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, @@ -2532,7 +2546,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent /* 32-bit process in 64-bit kernel. 
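The callchain rework continuing below replaces sized __copy_from_user_nmi() calls with typed __get_user() fetches through properly typed __user pointers. A self-contained sketch of the resulting walk; the struct and depth handling are simplified stand-ins, not the exact kernel code:

/* Sketch: frame-pointer walk fetching each field with __get_user();
 * validity checks are reduced to the essentials. */
struct example_frame {
	struct example_frame __user *next;
	unsigned long return_address;
};

static void example_walk(const struct example_frame __user *fp,
			 unsigned int max_depth)
{
	struct example_frame frame;

	pagefault_disable();
	while (fp && max_depth--) {
		if (__get_user(frame.next, &fp->next))
			break;
		if (__get_user(frame.return_address, &fp->return_address))
			break;
		/* record frame.return_address here */
		fp = frame.next;
	}
	pagefault_enable();
}
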
*/ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; - const void __user *fp; + const struct stack_frame_ia32 __user *fp; if (!test_thread_flag(TIF_IA32)) return 0; @@ -2543,18 +2557,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); while (entry->nr < entry->max_stack) { - unsigned long bytes; - frame.next_frame = 0; - frame.return_address = 0; - if (!valid_user_frame(fp, sizeof(frame))) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); - if (bytes != 0) + if (__get_user(frame.next_frame, &fp->next_frame)) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); - if (bytes != 0) + if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); @@ -2575,7 +2583,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; - const unsigned long __user *fp; + const struct stack_frame __user *fp; if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ @@ -2588,7 +2596,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; - fp = (unsigned long __user *)regs->bp; + fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -2600,19 +2608,12 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs pagefault_disable(); while (entry->nr < entry->max_stack) { - unsigned long bytes; - - frame.next_frame = NULL; - frame.return_address = 0; - if (!valid_user_frame(fp, sizeof(frame))) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); - if (bytes != 0) + if (__get_user(frame.next_frame, &fp->next_frame)) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp)); - if (bytes != 0) + if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index e67a5886336c101b65c7a83769157fcf3fd3d648..10bde6c5abb2cb442271b41810b2e80885534d47 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o -intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o uncore_discovery.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f0ac975f3d2232120f9061a03f7af37ab1e31d20..4a67c42ac7a9ee0d236aa743d1fb76fbeddba17c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -275,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_spr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + +static struct event_constraint 
intel_spr_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + + INTEL_EVENT_CONSTRAINT(0x2e, 0xff), + INTEL_EVENT_CONSTRAINT(0x3c, 0xff), + /* + * Generally event codes < 0x90 are restricted to counters 0-3. + * The 0x2E and 0x3C are exception, which has no restriction. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1), + INTEL_EVENT_CONSTRAINT(0xce, 0x1), + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), + /* + * Generally event codes >= 0x90 are likely to have no restrictions. + * The exception are defined as above. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff), + + EVENT_CONSTRAINT_END +}; + + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -314,11 +363,15 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); -EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); -EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); -EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); -EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); -EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); +EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84"); +EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85"); +EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86"); +EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87"); static struct attribute *snb_events_attrs[] = { EVENT_PTR(td_slots_issued), @@ -384,6 +437,108 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +static __initconst const u64 spr_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe124, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_MISS) ] = 
0xe424, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe12, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + [ C(RESULT_MISS) ] = 0xe13, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = 0xe11, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4c4, + [ C(RESULT_MISS) ] = 0x4c5, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, +}; + +static __initconst const u64 spr_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10001, + [ C(RESULT_MISS) ] = 0x3fbfc00001, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x3f3ffc0002, + [ C(RESULT_MISS) ] = 0x3f3fc00002, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10c000001, + [ C(RESULT_MISS) ] = 0x3fb3000001, + }, + }, +}; + /* * Notes on the events: * - data reads do not include code reads (comparable to earlier tables) @@ -2114,18 +2269,6 @@ static void intel_tfa_pmu_enable_all(int added) intel_pmu_enable_all(added); } -static void enable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() | - DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - -static void disable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() & - ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - static inline u64 intel_pmu_get_status(void) { u64 status; @@ -2317,8 +2460,8 @@ static void __icl_update_topdown_event(struct perf_event *event, } } -static void update_saved_topdown_regs(struct perf_event *event, - u64 slots, u64 metrics) +static void update_saved_topdown_regs(struct perf_event *event, u64 slots, + u64 metrics, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2327,7 +2470,7 @@ static void update_saved_topdown_regs(struct perf_event *event, event->hw.saved_slots = slots; event->hw.saved_metric = metrics; - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2342,7 +2485,8 @@ static void update_saved_topdown_regs(struct perf_event *event, * The PERF_METRICS and Fixed counter 3 are read separately. The values may be * modify by a NMI. PMU has to be disabled before calling this function. 
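 * As an illustrative example of the arithmetic involved (numbers made
 * up): each metric is an 8-bit fraction of the SLOTS count, so with
 * slots = 1000 and a RETIRING byte of 0x80 in PERF_METRICS, the
 * retiring event accounts for roughly 1000 * 0x80 / 0xff ~= 502 slots.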
*/ -static u64 icl_update_topdown_event(struct perf_event *event) + +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2358,7 +2502,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) /* read PERF_METRICS */ rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2384,7 +2528,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) * Don't need to reset the PERF_METRICS and Fixed counter 3. * Because the values will be restored in next schedule in. */ - update_saved_topdown_regs(event, slots, metrics); + update_saved_topdown_regs(event, slots, metrics, metric_end); reset = false; } @@ -2393,12 +2537,18 @@ static u64 icl_update_topdown_event(struct perf_event *event) wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); wrmsrl(MSR_PERF_METRICS, 0); if (event) - update_saved_topdown_regs(event, 0, 0); + update_saved_topdown_regs(event, 0, 0, metric_end); } return slots; } +static u64 icl_update_topdown_event(struct perf_event *event) +{ + return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + + x86_pmu.num_topdown_events - 1); +} + static void intel_pmu_read_topdown_event(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2553,8 +2703,11 @@ static void intel_pmu_reset(void) wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } if (ds) ds->bts_index = ds->bts_buffer_base; @@ -2677,95 +2830,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) return handled; } -static bool disable_counter_freezing = true; -static int __init intel_perf_counter_freezing_setup(char *s) -{ - bool res; - - if (kstrtobool(s, &res)) - return -EINVAL; - - disable_counter_freezing = !res; - return 1; -} -__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup); - -/* - * Simplified handler for Arch Perfmon v4: - * - We rely on counter freezing/unfreezing to enable/disable the PMU. - * This is done automatically on PMU ack. - * - Ack the PMU only after the APIC. - */ - -static int intel_pmu_handle_irq_v4(struct pt_regs *regs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int handled = 0; - bool bts = false; - u64 status; - int pmu_enabled = cpuc->enabled; - int loops = 0; - - /* PMU has been disabled because of counter freezing */ - cpuc->enabled = 0; - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - bts = true; - intel_bts_disable_local(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - } - status = intel_pmu_get_status(); - if (!status) - goto done; -again: - intel_pmu_lbr_read(); - if (++loops > 100) { - static bool warned; - - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } - - - handled += handle_pmi_common(regs, status); -done: - /* Ack the PMI in the APIC */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* - * The counters start counting immediately while ack the status. - * Make it as close as possible to IRET. 
This avoids bogus - * freezing on Skylake CPUs. - */ - if (status) { - intel_pmu_ack_status(status); - } else { - /* - * CPU may issues two PMIs very close to each other. - * When the PMI handler services the first one, the - * GLOBAL_STATUS is already updated to reflect both. - * When it IRETs, the second PMI is immediately - * handled and it sees clear status. At the meantime, - * there may be a third PMI, because the freezing bit - * isn't set since the ack in first PMI handlers. - * Double check if there is more work to be done. - */ - status = intel_pmu_get_status(); - if (status) - goto again; - } - - if (bts) - intel_bts_enable_local(); - cpuc->enabled = pmu_enabled; - return handled; -} - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -3531,6 +3595,26 @@ static int core_pmu_hw_config(struct perf_event *event) return intel_pmu_bts_config(event); } +#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \ + ((x86_pmu.num_topdown_events - 1) << 8)) + +static bool is_available_metric_event(struct perf_event *event) +{ + return is_metric_event(event) && + event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX; +} + +static inline bool is_mem_loads_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01); +} + +static inline bool is_mem_loads_aux_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82); +} + + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3604,7 +3688,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.config & X86_ALL_EVENT_FLAGS) return -EINVAL; - if (is_metric_event(event)) { + if (is_available_metric_event(event)) { struct perf_event *leader = event->group_leader; /* The metric events don't support sampling. */ @@ -3633,6 +3717,33 @@ static int intel_pmu_hw_config(struct perf_event *event) } } + /* + * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR + * doesn't function quite right. As a work-around it needs to always be + * co-scheduled with an auxiliary event X86_CONFIG(.event=0x03, .umask=0x82). + * The actual count of the auxiliary event is irrelevant; it just needs + * to be active to make the load latency event function correctly. + * + * In a group, the auxiliary event must be in front of the load latency + * event. This ordering rule keeps the check below simple, because perf + * does not yet see the complete group at this point.
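 + *
 + * An illustrative way to satisfy this from the perf tool (standard
 + * event group syntax; the encodings are the two listed above):
 + *
 + *   perf record -d -e '{cpu/event=0x03,umask=0x82/,cpu/event=0xcd,umask=0x1,ldlat=3/}' -- <workload>
 + *
 + * i.e. the auxiliary event opens the group and the load latency
 + * event follows it.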
+ */ + if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX && + (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) && + is_mem_loads_event(event)) { + struct perf_event *leader = event->group_leader; + struct perf_event *sibling = NULL; + + if (!is_mem_loads_aux_event(leader)) { + for_each_sibling_event(sibling, leader) { + if (is_mem_loads_aux_event(sibling)) + break; + } + if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list)) + return -ENODATA; + } + } + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) return 0; @@ -3820,6 +3931,29 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return hsw_get_event_constraints(cpuc, idx, event); } +static struct event_constraint * +spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = icl_get_event_constraints(cpuc, idx, event); + + /* + * The :ppp indicates the Precise Distribution (PDist) facility, which + * is only supported on GP counter 0. If a :ppp event cannot be + * scheduled on GP counter 0, error out. + */ + if (event->attr.precise_ip == 3) { + if (c->idxmsk64 & BIT_ULL(0)) + return &counter0_constraint; + + return &emptyconstraint; + } + + return c; +} + static struct event_constraint * glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -3909,6 +4043,14 @@ static u64 nhm_limit_period(struct perf_event *event, u64 left) return max(left, 32ULL); } +static u64 spr_limit_period(struct perf_event *event, u64 left) +{ + if (event->attr.precise_ip == 3) + return max(left, 128ULL); + + return left; +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -4050,9 +4192,6 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); - if (x86_pmu.counter_freezing) - enable_counter_freeze(); - /* Disable perf metrics if any added CPU doesn't support it. */ if (x86_pmu.intel_cap.perf_metrics) { union perf_capabilities perf_cap; @@ -4123,9 +4262,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); - - if (x86_pmu.counter_freezing) - disable_counter_freeze(); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -4154,6 +4290,12 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx, intel_pmu_lbr_sched_task(ctx, sched_in); } +static void intel_pmu_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + intel_pmu_lbr_swap_task_ctx(prev, next); +} + static int intel_pmu_check_period(struct perf_event *event, u64 value) { return intel_pmu_has_bts_period(event, value) ?
-EINVAL : 0; @@ -4249,6 +4391,11 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dead = intel_pmu_cpu_dead, .check_period = intel_pmu_check_period, + + .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4289,10 +4436,16 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = intel_guest_get_msrs, .sched_task = intel_pmu_sched_task, + .swap_task_ctx = intel_pmu_swap_task_ctx, .check_period = intel_pmu_check_period, .aux_output_match = intel_pmu_aux_output_match, + + .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __init void intel_clovertown_quirk(void) @@ -4503,39 +4656,6 @@ static __init void intel_nehalem_quirk(void) } } -static const struct x86_cpu_desc counter_freezing_ucodes[] = { - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), - {} -}; - -static bool intel_counter_freezing_broken(void) -{ - return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes); -} - -static __init void intel_counter_freezing_quirk(void) -{ - /* Check if it's already disabled */ - if (disable_counter_freezing) - return; - - /* - * If the system starts with the wrong ucode, leave the - * counter-freezing feature permanently disabled. - */ - if (intel_counter_freezing_broken()) { - pr_info("PMU counter freezing disabled due to CPU errata," - "please upgrade microcode\n"); - x86_pmu.counter_freezing = false; - x86_pmu.handle_irq = intel_pmu_handle_irq; - } -} - /* * enable software workaround for errata: * SNB: BJ122 @@ -4645,6 +4765,42 @@ static struct attribute *icl_tsx_events_attrs[] = { NULL, }; + +EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2"); +EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82"); + +static struct attribute *spr_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_spr), + EVENT_PTR(mem_ld_aux), + NULL, +}; + +static struct attribute *spr_td_events_attrs[] = { + EVENT_PTR(slots), + EVENT_PTR(td_retiring), + EVENT_PTR(td_bad_spec), + EVENT_PTR(td_fe_bound), + EVENT_PTR(td_be_bound), + EVENT_PTR(td_heavy_ops), + EVENT_PTR(td_br_mispredict), + EVENT_PTR(td_fetch_lat), + EVENT_PTR(td_mem_bound), + NULL, +}; + +static struct attribute *spr_tsx_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_capacity_read), + EVENT_PTR(tx_capacity_write), + EVENT_PTR(tx_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + NULL, +}; + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -4868,7 +5024,7 @@ __init int intel_pmu_init(void) union cpuid10_eax eax; union cpuid10_ebx ebx; struct event_constraint *c; - unsigned int unused; + unsigned int fixed_mask; struct extra_reg *er; bool pmem = false; int version, i; @@ -4890,7 +5046,7 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. 
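 * From Arch Perfmon version 5 onwards, CPUID.0AH:ECX is additionally
 * a bit mask of the fixed counters that actually exist. As an
 * illustrative value, fixed_mask = 0xb means fixed counters 0, 1 and
 * 3 are present: fls(0xb) = 4 sets num_counters_fixed below, and the
 * missing counter 2 is later masked off via intel_ctrl.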
*/ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; @@ -4914,15 +5070,15 @@ __init int intel_pmu_init(void) * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: */ - if (version > 1) { + if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, assume); - } - if (version >= 4) - x86_pmu.counter_freezing = !disable_counter_freezing; + fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + } else if (version >= 5) + x86_pmu.num_counters_fixed = fls(fixed_mask); if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -4931,6 +5087,14 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + x86_pmu.lbr_read = intel_pmu_lbr_read_32; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) + intel_pmu_arch_lbr_init(); + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ @@ -5037,7 +5201,6 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_ATOM_GOLDMONT_D: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -5064,7 +5227,6 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, @@ -5408,12 +5570,50 @@ __init int intel_pmu_init(void) x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 4; x86_pmu.update_topdown_event = icl_update_topdown_event; x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; pr_cont("Icelake events, "); name = "icelake"; break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + pmem = true; + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + x86_pmu.event_constraints = intel_spr_event_constraints; + x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; + x86_pmu.extra_regs = intel_spr_extra_regs; + x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = spr_get_event_constraints; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
+ hsw_format_attr : nhm_format_attr; + extra_skl_attr = skl_format_attr; + mem_attr = spr_events_attrs; + td_attr = spr_td_events_attrs; + tsx_attr = spr_tsx_events_attrs; + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 8; + x86_pmu.update_topdown_event = icl_update_topdown_event; + x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + pr_cont("Sapphire Rapids events, "); + name = "sapphire_rapids"; + break; + default: switch (x86_pmu.version) { case 1: @@ -5456,8 +5656,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; } - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED; if (x86_pmu.event_constraints) { /* @@ -5470,13 +5669,22 @@ __init int intel_pmu_init(void) * events to the generic counters. */ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + /* + * Disable topdown slots and metrics events, + * if slots event is not in CPUID. + */ + if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl)) + c->idxmsk64 = 0; c->weight = hweight64(c->idxmsk64); continue; } - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + if (c->cmask == FIXED_EVENT_FLAGS) { + /* Disabled fixed counters which are not in CPUID */ + c->idxmsk64 &= x86_pmu.intel_ctrl; + + if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } c->idxmsk64 &= ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); @@ -5490,7 +5698,7 @@ __init int intel_pmu_init(void) * Check all LBT MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ - if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) x86_pmu.lbr_nr = 0; for (i = 0; i < x86_pmu.lbr_nr; i++) { if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && @@ -5522,13 +5730,6 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } - /* - * For arch perfmon 4 use counter freezing to avoid - * several MSR accesses in the PMI. - */ - if (x86_pmu.counter_freezing) - x86_pmu.handle_irq = intel_pmu_handle_irq_v4; - if (x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 5965d341350caa039d1fe07a31327d9692ebc413..4bb139db1a7b5b97a04567c811a96509e81a1a08 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -35,7 +35,9 @@ union intel_x86_pebs_dse { unsigned int ld_dse:4; unsigned int ld_stlb_miss:1; unsigned int ld_locked:1; - unsigned int ld_reserved:26; + unsigned int ld_data_blk:1; + unsigned int ld_addr_blk:1; + unsigned int ld_reserved:24; }; struct { unsigned int st_l1d_hit:1; @@ -44,6 +46,12 @@ union intel_x86_pebs_dse { unsigned int st_locked:1; unsigned int st_reserved2:26; }; + struct { + unsigned int st_lat_dse:4; + unsigned int st_lat_stlb_miss:1; + unsigned int st_lat_locked:1; + unsigned int ld_reserved3:26; + }; }; @@ -197,6 +205,63 @@ static u64 load_latency_data(u64 status) if (dse.ld_locked) val |= P(LOCK, LOCKED); + /* + * Ice Lake and earlier models do not support block infos. 
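 + *
 + * An illustrative decode (made-up status value): status = 0x45 yields
 + * dse.ld_dse = 5 and dse.ld_data_blk = 1 (bit 6, described below), so
 + * SPR folds pebs_data_source[5] and P(BLK, DATA) into val, whereas the
 + * pre-SPR path here can only tag the sample with P(BLK, NA).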
+ */ + if (!x86_pmu.pebs_block) { + val |= P(BLK, NA); + return val; + } + /* + * bit 6: load was blocked since its data could not be forwarded + * from a preceding store + */ + if (dse.ld_data_blk) + val |= P(BLK, DATA); + + /* + * bit 7: load was blocked due to potential address conflict with + * a preceding store + */ + if (dse.ld_addr_blk) + val |= P(BLK, ADDR); + + if (!dse.ld_data_blk && !dse.ld_addr_blk) + val |= P(BLK, NA); + + return val; +} + +static u64 store_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.st_lat_dse]; + + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.st_lat_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.st_lat_locked) + val |= P(LOCK, LOCKED); + + val |= P(BLK, NA); + return val; } @@ -867,6 +932,28 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_spr_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), + INTEL_PLD_CONSTRAINT(0x1cd, 0xfe), + INTEL_PSD_CONSTRAINT(0x2cd, 0x1), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. + */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -951,13 +1038,14 @@ static void adaptive_pebs_record_size_update(void) if (pebs_data_cfg & PEBS_DATACFG_XMMS) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); cpuc->pebs_record_size = sz; } #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ - PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_WEIGHT_TYPE | \ PERF_SAMPLE_TRANSACTION) static u64 pebs_update_adaptive_cfg(struct perf_event *event) @@ -983,7 +1071,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && (attr->sample_regs_intr & PEBS_GP_REGS); - tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == x86_pmu.rtm_abort_event); @@ -1327,6 +1415,8 @@ static u64 get_data_src(struct perf_event *event, u64 aux) if (fl & PERF_X86_EVENT_PEBS_LDLAT) val = load_latency_data(aux); + else if (fl & PERF_X86_EVENT_PEBS_STLAT) + val = store_latency_data(aux); else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) val = precise_datala_hsw(event, aux); else if (fst) @@ -1361,8 +1451,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + data->weight.full = pebs->lat; /* * data.data_src encodes the data source @@ -1454,8 +1544,8 @@ static void 
setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_get_tsx_weight(pebs->tsx_tuning); + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, @@ -1499,6 +1589,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +#define PEBS_LATENCY_MASK 0xffff +#define PEBS_CACHE_LATENCY_OFFSET 32 + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -1569,9 +1662,27 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT) - data->weight = meminfo->latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 weight = meminfo->latency; + + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { + data->weight.var2_w = weight & PEBS_LATENCY_MASK; + weight >>= PEBS_CACHE_LATENCY_OFFSET; + } + + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. + */ + if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = weight ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } else { + data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } + } if (sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = get_data_src(event, meminfo->aux); @@ -1592,10 +1703,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_LBRS) { - struct pebs_lbr *lbr = next_record; + struct lbr_entry *lbr = next_record; int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + next_record = next_record + num_lbr * sizeof(struct lbr_entry); if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 73dbd30f82d33ffb2cf398f1852c8dca930f421f..faa798839971a5e392b9ce7f6b38d977d98f1bbb 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,17 +8,6 @@ #include "../perf_event.h" -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, -}; - static const enum { LBR_EIP_FLAGS = 1, LBR_TSX = 2, @@ -143,8 +132,54 @@ enum { X86_BR_IRQ |\ X86_BR_INT) +/* + * Intel LBR_CTL bits + * + * Hardware branch filter for Arch LBR + */ +#define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */ +#define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */ +#define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */ +#define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */ +#define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */ +#define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */ +#define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */ +#define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */ +#define ARCH_LBR_RETURN_BIT 21 /* capture near returns */ +#define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */ + +#define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT) 
+#define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT) +#define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT) +#define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT) +#define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT) +#define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT) +#define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT) +#define ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT) +#define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT) +#define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT) + +#define ARCH_LBR_ANY \ + (ARCH_LBR_JCC |\ + ARCH_LBR_REL_JMP |\ + ARCH_LBR_IND_JMP |\ + ARCH_LBR_REL_CALL |\ + ARCH_LBR_IND_CALL |\ + ARCH_LBR_RETURN |\ + ARCH_LBR_OTHER_BRANCH) + +#define ARCH_LBR_CTL_MASK 0x7f000e + static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); +static __always_inline bool is_lbr_call_stack_bit_set(u64 config) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return !!(config & ARCH_LBR_CALL_STACK); + + return !!(config & LBR_CALL_STACK); +} + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -168,33 +203,46 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + debugctl |= DEBUGCTLMSR_LBR; /* * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions * may cause superfluous increase/decrease of LBR_TOS. */ - if (!(lbr_select & LBR_CALL_STACK)) + if (is_lbr_call_stack_bit_set(lbr_select)) + debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + else debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } static void __intel_pmu_lbr_disable(void) { u64 debugctl; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + wrmsrl(MSR_ARCH_LBR_CTL, 0); + return; + } + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } -static void intel_pmu_lbr_reset_32(void) +void intel_pmu_lbr_reset_32(void) { int i; @@ -202,7 +250,7 @@ static void intel_pmu_lbr_reset_32(void) wrmsrl(x86_pmu.lbr_from + i, 0); } -static void intel_pmu_lbr_reset_64(void) +void intel_pmu_lbr_reset_64(void) { int i; @@ -210,10 +258,16 @@ static void intel_pmu_lbr_reset_64(void) wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); + wrmsrl(x86_pmu.lbr_info + i, 0); } } +static void intel_pmu_arch_lbr_reset(void) +{ + /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ + wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); +} + void intel_pmu_lbr_reset(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -221,10 +275,7 @@ void intel_pmu_lbr_reset(void) if (!x86_pmu.lbr_nr) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); + x86_pmu.lbr_reset(); cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; @@ -308,69 +359,97 @@ static u64 lbr_from_signext_quirk_rd(u64 
val) return val; } -static inline void wrlbr_from(unsigned int idx, u64 val) +static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); wrmsrl(x86_pmu.lbr_from + idx, val); } -static inline void wrlbr_to(unsigned int idx, u64 val) +static __always_inline void wrlbr_to(unsigned int idx, u64 val) { wrmsrl(x86_pmu.lbr_to + idx, val); } -static inline u64 rdlbr_from(unsigned int idx) +static __always_inline void wrlbr_info(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_info + idx, val); +} + +static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->from; + rdmsrl(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } -static inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->to; + rdmsrl(x86_pmu.lbr_to + idx, val); return val; } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) +{ + u64 val; + + if (lbr) + return lbr->info; + + rdmsrl(x86_pmu.lbr_info + idx, val); + + return val; +} + +static inline void +wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + wrlbr_from(idx, lbr->from); + wrlbr_to(idx, lbr->to); + if (need_info) + wrlbr_info(idx, lbr->info); +} + +static inline bool +rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + u64 from = rdlbr_from(idx, NULL); + + /* Don't read invalid entry */ + if (!from) + return false; + + lbr->from = from; + lbr->to = rdlbr_to(idx, NULL); + if (need_info) + lbr->info = rdlbr_info(idx, NULL); + + return true; +} + +void intel_pmu_lbr_restore(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; int i; unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - tos = task_ctx->tos; - /* - * Does not restore the LBR registers, if - * - No one else touched them, and - * - Did not enter C6 - */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } + u64 tos = task_ctx->tos; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -378,55 +457,172 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + wrlbr_info(lbr_idx, 0); } wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; if (cpuc->lbr_select) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void intel_pmu_arch_lbr_restore(void *ctx) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned lbr_idx, mask; - u64 tos, from; + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct 
lbr_entry *entries = task_ctx->entries; int i; - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; + /* Fast reset the LBRs before restore if the call stack is not full. */ + if (!entries[x86_pmu.lbr_nr - 1].from) + intel_pmu_arch_lbr_reset(); + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!entries[i].from) + break; + wrlbr_all(&entries[i], i, true); + } +} + +/* + * Restore the Architecture LBR state from the xsave area in the perf + * context data for the task via the XRSTORS instruction. + */ +static void intel_pmu_arch_lbr_xrstors(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + +static __always_inline bool lbr_is_reset_in_cstate(void *ctx) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL); + + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); +} + +static void __intel_pmu_lbr_restore(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_context_opt(ctx)->lbr_callstack_users == 0 || + task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); return; } + /* + * Does not restore the LBR registers, if + * - No one else touched them, and + * - Was not cleared in Cstate + */ + if ((ctx == cpuc->last_task_ctx) && + (task_context_opt(ctx)->log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(ctx)) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; + return; + } + + x86_pmu.lbr_restore(ctx); + + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; +} + +void intel_pmu_lbr_save(void *ctx) +{ + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + unsigned lbr_idx, mask; + u64 tos; + int i; + mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - from = rdlbr_from(lbr_idx); - if (!from) + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info)) break; - task_ctx->lbr_from[i] = from; - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; - task_ctx->lbr_stack_state = LBR_VALID; - - cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->log_id; if (cpuc->lbr_select) - rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); +} + +static void intel_pmu_arch_lbr_save(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!rdlbr_all(&entries[i], i, true)) + break; + } + + /* LBR call stack is not full. Reset is required in restore. */ + if (i < x86_pmu.lbr_nr) + entries[x86_pmu.lbr_nr - 1].from = 0; +} + +/* + * Save the Architecture LBR state to the xsave area in the perf + * context data for the task via the XSAVES instruction. 
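 + *
 + * Compared with intel_pmu_arch_lbr_save() above, which can take up to
 + * three MSR reads per LBR entry, a single XSAVES stores the entire LBR
 + * state component (LBR_CTL, LBR_DEPTH, the LER records and every
 + * entry) in one shot.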
+ */ +static void intel_pmu_arch_lbr_xsaves(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + +static void __intel_pmu_lbr_save(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_context_opt(ctx)->lbr_callstack_users == 0) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; + return; + } + + x86_pmu.lbr_save(ctx); + + task_context_opt(ctx)->lbr_stack_state = LBR_VALID; + + cpuc->last_task_ctx = ctx; + cpuc->last_log_id = ++task_context_opt(ctx)->log_id; +} + +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + void *prev_ctx_data, *next_ctx_data; + + swap(prev->task_ctx_data, next->task_ctx_data); + + /* + * Architecture specific synchronization makes sense in + * case both prev->task_ctx_data and next->task_ctx_data + * pointers are allocated. + */ + + prev_ctx_data = next->task_ctx_data; + next_ctx_data = prev->task_ctx_data; + + if (!prev_ctx_data || !next_ctx_data) + return; + + swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, + task_context_opt(next_ctx_data)->lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; + void *task_ctx; if (!cpuc->lbr_users) return; @@ -462,8 +658,8 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { + struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; @@ -473,10 +669,8 @@ void intel_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; - } + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -502,21 +696,41 @@ void intel_pmu_lbr_add(struct perf_event *event) perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + kmem_cache && !cpuc->lbr_xsave && + (cpuc->lbr_users != cpuc->lbr_pebs_users)) + cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL); +} + +void release_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + if (kmem_cache && cpuc->lbr_xsave) { + kmem_cache_free(kmem_cache, cpuc->lbr_xsave); + cpuc->lbr_xsave = NULL; + } + } } void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; - } + event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; @@ -553,7 +767,7 @@ void intel_pmu_lbr_disable_all(void) 
__intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; u64 tos = intel_pmu_lbr_tos(); @@ -582,6 +796,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.hw_idx = tos; } /* @@ -589,7 +804,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) * is the same as the linear address, allowing us to merge the LIP and EIP * LBR formats. */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; @@ -612,8 +827,8 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; - from = rdlbr_from(lbr_idx); - to = rdlbr_to(lbr_idx); + from = rdlbr_from(lbr_idx, NULL); + to = rdlbr_to(lbr_idx, NULL); /* * Read LBR call stack entries @@ -625,7 +840,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + info = rdlbr_info(lbr_idx, NULL); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -677,6 +892,94 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) out++; } cpuc->lbr_stack.nr = out; + cpuc->lbr_stack.hw_idx = tos; +} + +static __always_inline int get_lbr_br_type(u64 info) +{ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) + return 0; + + return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; +} + +static __always_inline bool get_lbr_mispred(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !!(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_predicted(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_cycles(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) + return 0; + + return info & LBR_INFO_CYCLES; +} + +static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, + struct lbr_entry *entries) +{ + struct perf_branch_entry *e; + struct lbr_entry *lbr; + u64 from, to, info; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr = entries ? &entries[i] : NULL; + e = &cpuc->lbr_entries[i]; + + from = rdlbr_from(i, lbr); + /* + * Read LBR entries until invalid entry (0s) is detected. 
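 + * e.g. with x86_pmu.lbr_nr = 32 but only five records captured since
 + * the last reset, the walk stops at the first zeroed 'from' address
 + * and cpuc->lbr_stack.nr ends up as 5 rather than 32.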
+ */ + if (!from) + break; + + to = rdlbr_to(i, lbr); + info = rdlbr_info(i, lbr); + + e->from = from; + e->to = to; + e->mispred = get_lbr_mispred(info); + e->predicted = get_lbr_predicted(info); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = get_lbr_cycles(info); + e->type = get_lbr_br_type(info); + e->reserved = 0; + } + + cpuc->lbr_stack.nr = i; +} + +static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) +{ + intel_pmu_store_lbr(cpuc, NULL); +} + +static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) +{ + struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave; + + if (!xsave) { + intel_pmu_store_lbr(cpuc, NULL); + return; + } + xsaves(&xsave->xsave, XFEATURE_MASK_LBR); + + intel_pmu_store_lbr(cpuc, xsave->lbr.entries); } void intel_pmu_lbr_read(void) @@ -693,10 +996,7 @@ void intel_pmu_lbr_read(void) cpuc->lbr_users == cpuc->lbr_pebs_users) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); + x86_pmu.lbr_read(cpuc); intel_pmu_lbr_filter(cpuc); } @@ -796,6 +1096,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + reg->config = mask; + return 0; + } + /* * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate * in suppress mode. So LBR_SELECT should be set to @@ -1052,6 +1357,27 @@ common_branch_type(int type) return PERF_BR_UNKNOWN; } +enum { + ARCH_LBR_BR_TYPE_JCC = 0, + ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, + ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2, + ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3, + ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4, + ARCH_LBR_BR_TYPE_NEAR_RET = 5, + ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET, + + ARCH_LBR_BR_TYPE_MAP_MAX = 16, +}; + +static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = { + [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC, + [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP, + [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP, + [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL, + [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL, + [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET, +}; + /* * implement actual branch filter based on user demand. * Hardware may not exactly satisfy that request, thus @@ -1064,7 +1390,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) { u64 from, to; int br_sel = cpuc->br_sel; - int i, j, type; + int i, j, type, to_plm; bool compress = false; /* if sampling all branches, then nothing to filter */ @@ -1076,8 +1402,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; + type = cpuc->lbr_entries[i].type; - type = branch_type(from, to, cpuc->lbr_entries[i].abort); + /* + * Parse the branch type recorded in the LBR_x_INFO MSR. + * OTHER_BRANCH decoding is not supported for now; the + * OTHER_BRANCH branch type still relies on software decoding. + */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) { + to_plm = kernel_ip(to) ?
X86_BR_KERNEL : X86_BR_USER; + type = arch_lbr_br_type_map[type] | to_plm; + } else + type = branch_type(from, to, cpuc->lbr_entries[i].abort); if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { if (cpuc->lbr_entries[i].in_tx) type |= X86_BR_IN_TX; @@ -1112,25 +1449,18 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - cpuc->lbr_stack.nr = x86_pmu.lbr_nr; - for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr->lbr[i].info; - struct perf_branch_entry *e = &cpuc->lbr_entries[i]; + /* Cannot get TOS for large PEBS and Arch LBR */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) || + (cpuc->n_pebs == cpuc->n_large_pebs)) + cpuc->lbr_stack.hw_idx = -1ULL; + else + cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); - e->from = lbr->lbr[i].from; - e->to = lbr->lbr[i].to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); - e->in_tx = !!(info & LBR_INFO_IN_TX); - e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->reserved = 0; - } + intel_pmu_store_lbr(cpuc, lbr); intel_pmu_lbr_filter(cpuc); } @@ -1187,6 +1517,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; +static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_RETURN | + ARCH_LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL, +}; + /* core */ void __init intel_pmu_lbr_init_core(void) { @@ -1240,9 +1590,17 @@ void __init intel_pmu_lbr_init_snb(void) */ } +static inline struct kmem_cache * +create_lbr_kmem_cache(size_t size, size_t align) +{ + return kmem_cache_create("x86_lbr", size, align, 0, NULL); +} + /* haswell */ void intel_pmu_lbr_init_hsw(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1251,6 +1609,8 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + if (lbr_from_signext_quirk_needed()) static_branch_enable(&lbr_from_quirk_key); } @@ -1258,14 +1618,19 @@ void intel_pmu_lbr_init_hsw(void) /* skylake */ __init void intel_pmu_lbr_init_skl(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 32; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + x86_pmu.lbr_info = MSR_LBR_INFO_0; x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + /* * SW branch filter usage: * - support syscall, sysret capture. 
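The hunk below sizes the LBR stack from CPUID leaf 0x1C, where EAX[7:0] is a bit mask of the supported depths: bit n set means a depth of (n + 1) * 8 entries is valid, so fls(mask) * 8 yields the maximum. A minimal user-space sketch of the same enumeration (not part of the patch; it assumes a CPU that advertises Arch LBR and the GCC/Clang cpuid.h helpers):

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;
		unsigned int mask, max_depth = 0;

		/* Leaf 0x1C enumerates Architectural LBR capabilities. */
		if (!__get_cpuid_count(0x1c, 0, &eax, &ebx, &ecx, &edx))
			return 1;

		/* Bit n of EAX[7:0] set => a depth of (n + 1) * 8 is valid. */
		mask = eax & 0xff;
		if (mask)
			max_depth = (32 - __builtin_clz(mask)) * 8; /* fls(mask) * 8 */

		printf("max Arch LBR depth: %u entries\n", max_depth);
		return 0;
	}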
@@ -1333,6 +1698,131 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +/* + * LBR state size is variable based on the max number of registers. + * This calculates the expected state size, which should match + * what the hardware enumerates for the size of XFEATURE_LBR. + */ +static inline unsigned int get_lbr_state_size(void) +{ + return sizeof(struct arch_lbr_state) + + x86_pmu.lbr_nr * sizeof(struct lbr_entry); +} + +static bool is_arch_lbr_xsave_available(void) +{ + if (!boot_cpu_has(X86_FEATURE_XSAVES)) + return false; + + /* + * Check the LBR state with the corresponding software structure. + * Disable LBR XSAVES support if the size doesn't match. + */ + if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size())) + return false; + + return true; +} + +void __init intel_pmu_arch_lbr_init(void) +{ + struct pmu *pmu = x86_get_pmu(); + union cpuid28_eax eax; + union cpuid28_ebx ebx; + union cpuid28_ecx ecx; + unsigned int unused_edx; + bool arch_lbr_xsave; + size_t size; + u64 lbr_nr; + + /* Arch LBR Capabilities */ + cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx); + + lbr_nr = fls(eax.split.lbr_depth_mask) * 8; + if (!lbr_nr) + goto clear_arch_lbr; + + /* Apply the max depth of Arch LBR */ + if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) + goto clear_arch_lbr; + + x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; + x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset; + x86_pmu.lbr_lip = eax.split.lbr_lip; + x86_pmu.lbr_cpl = ebx.split.lbr_cpl; + x86_pmu.lbr_filter = ebx.split.lbr_filter; + x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack; + x86_pmu.lbr_mispred = ecx.split.lbr_mispred; + x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; + x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_nr = lbr_nr; + + + arch_lbr_xsave = is_arch_lbr_xsave_available(); + if (arch_lbr_xsave) { + size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) + + get_lbr_state_size(); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, + XSAVE_ALIGNMENT); + } + + if (!pmu->task_ctx_cache) { + arch_lbr_xsave = false; + + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0); + } + + x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; + x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; + x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0; + + /* LBR callstack requires both CPL and Branch Filtering support */ + if (!x86_pmu.lbr_cpl || + !x86_pmu.lbr_filter || + !x86_pmu.lbr_call_stack) + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP; + + if (!x86_pmu.lbr_cpl) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP; + } else if (!x86_pmu.lbr_filter) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP; + } + + x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK; + x86_pmu.lbr_ctl_map = arch_lbr_ctl_map; + + if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter) + x86_pmu.lbr_ctl_map = NULL; + + x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; + if (arch_lbr_xsave) { + x86_pmu.lbr_save = 
intel_pmu_arch_lbr_xsaves; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave; + pr_cont("XSAVE "); + } else { + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; + } + + pr_cont("Architectural LBR, "); + + return; + +clear_arch_lbr: + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); +} + /** * x86_perf_get_lbr - get the LBR records information * @@ -1347,7 +1837,7 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->nr = x86_pmu.lbr_nr; lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; - lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? MSR_LBR_INFO_0 : 0; + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; return 0; } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7fb1f904ab6398d12a61ecdbb4661166f571af50..b02a900deb65e26a7fa3689cc5910829254b0535 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -4,19 +4,26 @@ #include #include #include "uncore.h" +#include "uncore_discovery.h" -static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static bool uncore_no_discover; +module_param(uncore_no_discover, bool, 0); +MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism " + "(default: enable the discovery mechanism)."); +struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; +/* The PCI driver for the device which the uncore doesn't own. */ +struct pci_driver *uncore_pci_sub_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -29,21 +36,21 @@ struct event_constraint uncore_constraint_empty = MODULE_LICENSE("GPL"); -int uncore_pcibus_to_physid(struct pci_bus *bus) +int uncore_pcibus_to_dieid(struct pci_bus *bus) { struct pci2phy_map *map; - int phys_id = -1; + int die_id = -1; raw_spin_lock(&pci2phy_map_lock); list_for_each_entry(map, &pci2phy_map_head, list) { if (map->segment == pci_domain_nr(bus)) { - phys_id = map->pbus_to_physid[bus->number]; + die_id = map->pbus_to_dieid[bus->number]; break; } } raw_spin_unlock(&pci2phy_map_lock); - return phys_id; + return die_id; } static void uncore_free_pcibus_map(void) @@ -84,7 +91,7 @@ struct pci2phy_map *__find_pci2phy_map(int segment) alloc = NULL; map->segment = segment; for (i = 0; i < 256; i++) - map->pbus_to_physid[i] = -1; + map->pbus_to_dieid[i] = -1; list_add_tail(&map->list, &pci2phy_map_head); end: @@ -108,7 +115,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? 
pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -132,6 +139,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box, if (!box->io_addr) return 0; + if (!uncore_mmio_is_valid_offset(box, event->hw.event_base)) + return 0; + return readq(box->io_addr + event->hw.event_base); } @@ -327,7 +337,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, uncore_pmu_init_hrtimer(box); box->cpu = -1; - box->pci_phys_id = -1; box->dieid = -1; /* set default hrtimer timeout */ @@ -825,6 +834,45 @@ static const struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; +void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu) +{ + struct intel_uncore_type *type = pmu->type; + + if (type->num_boxes == 1) + sprintf(pmu_name, "uncore_type_%u", type->type_id); + else { + sprintf(pmu_name, "uncore_type_%u_%d", + type->type_id, type->box_ids[pmu->pmu_idx]); + } +} + +static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu) +{ + struct intel_uncore_type *type = pmu->type; + + /* + * No uncore block name in discovery table. + * Use uncore_type_&typeid_&boxid as name. + */ + if (!type->name) { + uncore_get_alias_name(pmu->name, pmu); + return; + } + + if (type->num_boxes == 1) { + if (strlen(type->name) > 0) + sprintf(pmu->name, "uncore_%s", type->name); + else + sprintf(pmu->name, "uncore"); + } else { + /* + * Use the box ID from the discovery table if applicable. + */ + sprintf(pmu->name, "uncore_%s_%d", type->name, + type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx); + } +} + static int uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -843,21 +891,15 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } - if (pmu->type->num_boxes == 1) { - if (strlen(pmu->type->name) > 0) - sprintf(pmu->name, "uncore_%s", pmu->type->name); - else - sprintf(pmu->name, "uncore"); - } else { - sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, - pmu->pmu_idx); - } + uncore_get_pmu_name(pmu); ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (!ret) @@ -877,7 +919,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -887,6 +929,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -895,6 +940,10 @@ static void uncore_type_exit(struct intel_uncore_type *type) kfree(type->pmus); type->pmus = NULL; } + if (type->box_ids) { + kfree(type->box_ids); + type->box_ids = NULL; + } kfree(type->events_group); type->events_group = NULL; } @@ -915,7 +964,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? 
i : -1; @@ -954,6 +1003,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: @@ -978,65 +1030,93 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) } /* - * add a pci uncore device + * Get the die information of a PCI device. + * @pdev: The PCI device. + * @die: The die id which the device maps to. */ -static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die) { - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu = NULL; - struct intel_uncore_box *box; - int phys_id, die, ret; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) - return -ENODEV; - - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); - if (die < 0) + *die = uncore_pcibus_to_dieid(pdev->bus); + if (*die < 0) return -EINVAL; - if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { - int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + return 0; +} - uncore_extra_pci_dev[die].dev[idx] = pdev; - pci_set_drvdata(pdev, NULL); - return 0; +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) && + pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) && + pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl)) + return &type->pmus[i]; + } + } } - type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + return NULL; +} - /* - * Some platforms, e.g. Knights Landing, use a common PCI device ID - * for multiple instances of an uncore PMU device type. We should check - * PCI slot and func to indicate the uncore box. - */ - if (id->driver_data & ~0xffff) { - struct pci_driver *pci_drv = pdev->driver; - const struct pci_device_id *ids = pci_drv->id_table; - unsigned int devfn; - - while (ids && ids->vendor) { - if ((ids->vendor == pdev->vendor) && - (ids->device == pdev->device)) { - devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), - UNCORE_PCI_DEV_FUNC(ids->driver_data)); - if (devfn == pdev->devfn) { - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - break; - } +/* + * Find the PMU of a PCI device. + * @pdev: The PCI device. + * @ids: The ID table of the available PCI devices with a PMU. + * If NULL, search the whole uncore_pci_uncores. 
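When no ID table is given, the lookup falls back to scanning every discovery-enumerated type and comparing the caller's device against the bus/device/function packed into each unit control address. A minimal sketch of that comparison, assuming the UNCORE_DISCOVERY_PCI_* accessors defined in uncore_discovery.h further down:

/* Sketch: does this discovery-table unit control value belong to pdev? */
static bool demo_box_ctl_matches(u64 box_ctl, struct pci_dev *pdev)
{
	return pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) &&
	       pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) &&
	       pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl);
}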
+ */ +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) +{ + struct intel_uncore_pmu *pmu = NULL; + struct intel_uncore_type *type; + kernel_ulong_t data; + unsigned int devfn; + + if (!ids) + return uncore_pci_find_dev_pmu_from_types(pdev); + + while (ids && ids->vendor) { + if ((ids->vendor == pdev->vendor) && + (ids->device == pdev->device)) { + data = ids->driver_data; + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(data), + UNCORE_PCI_DEV_FUNC(data)); + if (devfn == pdev->devfn) { + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(data)]; + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(data)]; + break; } - ids++; } - if (pmu == NULL) - return -ENODEV; - } else { - /* - * for performance monitoring unit with multiple boxes, - * each box has a different function id. - */ - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + ids++; } + return pmu; +} + +/* + * Register the PMU for a PCI device + * @pdev: The PCI device. + * @type: The corresponding PMU type of the device. + * @pmu: The corresponding PMU of the device. + * @die: The die id which the device maps to. + */ +static int uncore_pci_pmu_register(struct pci_dev *pdev, + struct intel_uncore_type *type, + struct intel_uncore_pmu *pmu, + int die) +{ + struct intel_uncore_box *box; + int ret; if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) return -EINVAL; @@ -1051,12 +1131,10 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id WARN_ON_ONCE(pmu->func_id != pdev->devfn); atomic_inc(&box->refcnt); - box->pci_phys_id = phys_id; box->dieid = die; box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); - pci_set_drvdata(pdev, box); pmu->boxes[die] = box; if (atomic_inc_return(&pmu->activeboxes) > 1) @@ -1065,7 +1143,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id /* First active box registers the pmu */ ret = uncore_pmu_register(pmu); if (ret) { - pci_set_drvdata(pdev, NULL); pmu->boxes[die] = NULL; uncore_box_exit(box); kfree(box); @@ -1073,18 +1150,82 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id return ret; } +/* + * add a pci uncore device + */ +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu = NULL; + int die, ret; + + ret = uncore_pci_get_dev_die_info(pdev, &die); + if (ret) + return ret; + + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + + uncore_extra_pci_dev[die].dev[idx] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + + /* + * Some platforms, e.g. Knights Landing, use a common PCI device ID + * for multiple instances of an uncore PMU device type. We should check + * PCI slot and func to indicate the uncore box. + */ + if (id->driver_data & ~0xffff) { + struct pci_driver *pci_drv = pdev->driver; + + pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); + if (pmu == NULL) + return -ENODEV; + } else { + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + } + + ret = uncore_pci_pmu_register(pdev, type, pmu, die); + + pci_set_drvdata(pdev, pmu->boxes[die]); + + return ret; +} + +/* + * Unregister the PMU of a PCI device + * @pmu: The corresponding PMU is unregistered. + * @die: The die id which the device maps to. 
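Registration and removal are symmetric around the activeboxes count: the first box of a type to appear registers the perf PMU, and the last one to disappear unregisters it. A condensed, hypothetical view of that contract, using the names from the code above but not the patch itself:

/* Condensed sketch of the activeboxes refcounting contract. */
static void demo_add_box(struct intel_uncore_pmu *pmu,
			 struct intel_uncore_box *box, int die)
{
	pmu->boxes[die] = box;
	if (atomic_inc_return(&pmu->activeboxes) == 1)
		uncore_pmu_register(pmu);	/* first box registers the PMU */
}

static void demo_del_box(struct intel_uncore_pmu *pmu, int die)
{
	struct intel_uncore_box *box = pmu->boxes[die];

	pmu->boxes[die] = NULL;
	if (atomic_dec_return(&pmu->activeboxes) == 0)
		uncore_pmu_unregister(pmu);	/* last box tears it down */
	kfree(box);
}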
+ */ +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die) +{ + struct intel_uncore_box *box = pmu->boxes[die]; + + pmu->boxes[die] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); + uncore_box_exit(box); + kfree(box); +} + static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; - int i, phys_id, die; + int i, die; - phys_id = uncore_pcibus_to_physid(pdev->bus); + if (uncore_pci_get_dev_die_info(pdev, &die)) + return; box = pci_get_drvdata(pdev); if (!box) { - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { if (uncore_extra_pci_dev[die].dev[i] == pdev) { uncore_extra_pci_dev[die].dev[i] = NULL; @@ -1096,15 +1237,133 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; pci_set_drvdata(pdev, NULL); - pmu->boxes[box->dieid] = NULL; - if (atomic_dec_return(&pmu->activeboxes) == 0) - uncore_pmu_unregister(pmu); - uncore_box_exit(box); - kfree(box); + + uncore_pci_pmu_unregister(pmu, die); +} + +static int uncore_bus_notify(struct notifier_block *nb, + unsigned long action, void *data, + const struct pci_device_id *ids) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct intel_uncore_pmu *pmu; + int die; + + /* Unregister the PMU when the device is going to be deleted. */ + if (action != BUS_NOTIFY_DEL_DEVICE) + return NOTIFY_DONE; + + pmu = uncore_pci_find_dev_pmu(pdev, ids); + if (!pmu) + return NOTIFY_DONE; + + if (uncore_pci_get_dev_die_info(pdev, &die)) + return NOTIFY_DONE; + + uncore_pci_pmu_unregister(pmu, die); + + return NOTIFY_OK; +} + +static int uncore_pci_sub_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, + uncore_pci_sub_driver->id_table); +} + +static struct notifier_block uncore_pci_sub_notifier = { + .notifier_call = uncore_pci_sub_bus_notify, +}; + +static void uncore_pci_sub_driver_init(void) +{ + const struct pci_device_id *ids = uncore_pci_sub_driver->id_table; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pci_sub_dev; + bool notify = false; + unsigned int devfn; + int die; + + while (ids && ids->vendor) { + pci_sub_dev = NULL; + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(ids->driver_data)]; + /* + * Search the available device, and register the + * corresponding PMU. 
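Because the sub-driver never binds to these devices, there is no .remove() callback to rely on; teardown is observed through a PCI bus notifier instead, as wired up around uncore_pci_sub_driver_init(). The bare shape of that mechanism, reduced to a hypothetical minimum:

#include <linux/notifier.h>
#include <linux/pci.h>

/* Hypothetical minimal notifier: react only to device deletion. */
static int demo_bus_notify(struct notifier_block *nb, unsigned long action,
			   void *data)
{
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);

	if (action != BUS_NOTIFY_DEL_DEVICE)
		return NOTIFY_DONE;

	/* look up and unregister the PMU state tied to pdev here */
	return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_bus_notify,
};

/* Registered with: bus_register_notifier(&pci_bus_type, &demo_nb); */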
+ */ + while ((pci_sub_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + ids->device, pci_sub_dev))) { + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), + UNCORE_PCI_DEV_FUNC(ids->driver_data)); + if (devfn != pci_sub_dev->devfn) + continue; + + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; + if (!pmu) + continue; + + if (uncore_pci_get_dev_die_info(pci_sub_dev, &die)) + continue; + + if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, + die)) + notify = true; + } + ids++; + } + + if (notify && bus_register_notifier(&pci_bus_type, &uncore_pci_sub_notifier)) + notify = false; + + if (!notify) + uncore_pci_sub_driver = NULL; +} + +static int uncore_pci_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, NULL); +} + +static struct notifier_block uncore_pci_notifier = { + .notifier_call = uncore_pci_bus_notify, +}; + + +static void uncore_pci_pmus_register(void) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pdev; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl), + UNCORE_DISCOVERY_PCI_BUS(box_ctl), + UNCORE_DISCOVERY_PCI_DEVFN(box_ctl)); + if (!pdev) + continue; + pmu = &type->pmus[i]; + + uncore_pci_pmu_register(pdev, type, pmu, die); + } + } + } + + bus_register_notifier(&pci_bus_type, &uncore_pci_notifier); } static int __init uncore_pci_init(void) @@ -1112,7 +1371,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1123,12 +1382,18 @@ static int __init uncore_pci_init(void) if (ret) goto errtype; - uncore_pci_driver->probe = uncore_pci_probe; - uncore_pci_driver->remove = uncore_pci_remove; + if (uncore_pci_driver) { + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; - ret = pci_register_driver(uncore_pci_driver); - if (ret) - goto errtype; + ret = pci_register_driver(uncore_pci_driver); + if (ret) + goto errtype; + } else + uncore_pci_pmus_register(); + + if (uncore_pci_sub_driver) + uncore_pci_sub_driver_init(); pcidrv_registered = true; return 0; @@ -1147,7 +1412,12 @@ static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; - pci_unregister_driver(uncore_pci_driver); + if (uncore_pci_sub_driver) + bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier); + if (uncore_pci_driver) + pci_unregister_driver(uncore_pci_driver); + else + bus_unregister_notifier(&pci_bus_type, &uncore_pci_notifier); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev); uncore_free_pcibus_map(); @@ -1400,6 +1670,7 @@ struct intel_uncore_init_fun { void (*cpu_init)(void); int (*pci_init)(void); void (*mmio_init)(void); + bool use_discovery; }; static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { @@ -1482,6 +1753,19 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { .mmio_init = snr_uncore_mmio_init, }; +static const struct intel_uncore_init_fun spr_uncore_init __initconst = { + .cpu_init = 
spr_uncore_cpu_init, + .pci_init = spr_uncore_pci_init, + .mmio_init = spr_uncore_mmio_init, + .use_discovery = true, +}; + +static const struct intel_uncore_init_fun generic_uncore_init __initconst = { + .cpu_init = intel_uncore_generic_uncore_cpu_init, + .pci_init = intel_uncore_generic_uncore_pci_init, + .mmio_init = intel_uncore_generic_uncore_mmio_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1513,6 +1797,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE, icl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_D, icx_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_X, icx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SAPPHIRERAPIDS_X, spr_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_D, snr_uncore_init), {}, }; @@ -1525,16 +1810,26 @@ static int __init intel_uncore_init(void) struct intel_uncore_init_fun *uncore_init; int pret = 0, cret = 0, mret = 0, ret; - id = x86_match_cpu(intel_uncore_match); - if (!id) - return -ENODEV; - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); + + id = x86_match_cpu(intel_uncore_match); + if (!id) { + if (!uncore_no_discover && intel_uncore_has_discovery_tables()) + uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init; + else + return -ENODEV; + } else { + uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_no_discover && uncore_init->use_discovery) + return -ENODEV; + if (uncore_init->use_discovery && !intel_uncore_has_discovery_tables()) + return -ENODEV; + } - uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { pret = uncore_init->pci_init(); if (!pret) @@ -1551,8 +1846,10 @@ static int __init intel_uncore_init(void) mret = uncore_mmio_init(); } - if (cret && pret && mret) - return -ENODEV; + if (cret && pret && mret) { + ret = -ENODEV; + goto free_discovery; + } /* Install hotplug callbacks to setup the targets for each package */ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, @@ -1567,6 +1864,8 @@ static int __init intel_uncore_init(void) uncore_types_exit(uncore_msr_uncores); uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); +free_discovery: + intel_uncore_clear_discovery_tables(); return ret; } module_init(intel_uncore_init); @@ -1577,5 +1876,6 @@ static void __exit intel_uncore_exit(void) uncore_types_exit(uncore_msr_uncores); uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); + intel_uncore_clear_discovery_tables(); } module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 97bb76605f7acefa424db34c21a36c0ac4fae06d..9e66271a8d7002909293ad55997530654cd9221d 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -50,6 +50,7 @@ struct intel_uncore_type { int perf_ctr_bits; int fixed_ctr_bits; int num_freerunning_types; + int type_id; unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; @@ -57,14 +58,21 @@ struct intel_uncore_type { unsigned fixed_ctr; unsigned fixed_ctl; unsigned box_ctl; + u64 *box_ctls; /* Unit ctrl addr of the first box of each die */ union { unsigned msr_offset; unsigned mmio_offset; }; + unsigned mmio_map_size; 
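With discovery in the picture, intel_uncore_init() has three outcomes: a known model uses its bound init functions (and may additionally require the discovery tables), an unknown model falls back to the generic discovery driver, and uncore_no_discover vetoes both discovery paths. An illustrative restatement of that selection, separate from the patch:

/* Illustrative only: the init-path decision made in intel_uncore_init(). */
static const struct intel_uncore_init_fun *
demo_pick_init(const struct x86_cpu_id *id, bool no_discover, bool has_tables)
{
	const struct intel_uncore_init_fun *init;

	if (!id)	/* unknown model: only the generic driver can work */
		return (!no_discover && has_tables) ? &generic_uncore_init : NULL;

	init = (const struct intel_uncore_init_fun *)id->driver_data;
	if (init->use_discovery && (no_discover || !has_tables))
		return NULL;	/* model needs discovery but it is unavailable */

	return init;
}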
unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; - unsigned *msr_offsets; + union { + unsigned *msr_offsets; + unsigned *pci_offsets; + unsigned *mmio_offsets; + }; + unsigned *box_ids; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -72,7 +80,19 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* + * Uncore PMU would store relevant platform topology configuration here + * to identify which platform component each PMON block of that type is + * supposed to monitor. + */ + u64 *topology; + /* + * Optional callbacks for managing mapping of Uncore units to PMONs + */ + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] @@ -111,7 +131,6 @@ struct intel_uncore_extra_reg { }; struct intel_uncore_box { - int pci_phys_id; int dieid; /* Logical die ID */ int n_active; /* number of active events */ int n_events; @@ -160,15 +179,27 @@ struct freerunning_counters { struct pci2phy_map { struct list_head list; int segment; - int pbus_to_physid[256]; + int pbus_to_dieid[256]; }; struct pci2phy_map *__find_pci2phy_map(int segment); -int uncore_pcibus_to_physid(struct pci_bus *bus); +int uncore_pcibus_to_dieid(struct pci_bus *bus); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr) +#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n)) + +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) + #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ @@ -196,6 +227,18 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box, + unsigned long offset) +{ + if (offset < box->pmu->type->mmio_map_size) + return true; + + pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n", + offset, box->pmu->type->name); + + return false; +} + static inline unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) { @@ -510,11 +553,14 @@ struct event_constraint * uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); +void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu); +extern struct intel_uncore_type *empty_uncore[]; extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct intel_uncore_type **uncore_mmio_uncores; extern struct pci_driver *uncore_pci_driver; +extern struct pci_driver *uncore_pci_sub_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; @@ -551,6 +597,9 @@ void snr_uncore_mmio_init(void); int 
icx_uncore_pci_init(void); void icx_uncore_cpu_init(void); void icx_uncore_mmio_init(void); +int spr_uncore_pci_init(void); +void spr_uncore_cpu_init(void); +void spr_uncore_mmio_init(void); /* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c new file mode 100644 index 0000000000000000000000000000000000000000..3049c646fa209f4c962c5126f2253a8ebe74c092 --- /dev/null +++ b/arch/x86/events/intel/uncore_discovery.c @@ -0,0 +1,622 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Support Intel uncore PerfMon discovery mechanism. + * Copyright(c) 2021 Intel Corporation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "uncore.h" +#include "uncore_discovery.h" + +static struct rb_root discovery_tables = RB_ROOT; +static int num_discovered_types[UNCORE_ACCESS_MAX]; + +static bool has_generic_discovery_table(void) +{ + struct pci_dev *dev; + int dvsec; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, UNCORE_DISCOVERY_TABLE_DEVICE, NULL); + if (!dev) + return false; + + /* A discovery table device has the unique capability ID. */ + dvsec = pci_find_next_ext_capability(dev, 0, UNCORE_EXT_CAP_ID_DISCOVERY); + pci_dev_put(dev); + if (dvsec) + return true; + + return false; +} + +static int logical_die_id; + +static int get_device_die_id(struct pci_dev *dev) +{ + int cpu, node = pcibus_to_node(dev->bus); + + /* + * If the NUMA info is not available, assume that the logical die id is + * continuous in the order in which the discovery table devices are + * detected. + */ + if (node < 0) + return logical_die_id++; + + for_each_cpu(cpu, cpumask_of_node(node)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) + return c->logical_die_id; + } + + /* + * All CPUs of a node may be offlined. For this case, + * the PCI and MMIO type of uncore blocks which are + * enumerated by the device will be unavailable. + */ + return -1; +} + +#define __node_2_type(cur) \ + rb_entry((cur), struct intel_uncore_discovery_type, node) + +static inline int __type_cmp(const void *key, const struct rb_node *b) +{ + struct intel_uncore_discovery_type *type_b = __node_2_type(b); + const u16 *type_id = key; + + if (type_b->type > *type_id) + return -1; + else if (type_b->type < *type_id) + return 1; + + return 0; +} + +static inline struct intel_uncore_discovery_type * +search_uncore_discovery_type(u16 type_id) +{ + struct rb_node *node = rb_find(&type_id, &discovery_tables, __type_cmp); + + return (node) ? 
__node_2_type(node) : NULL; +} + +static inline bool __type_less(struct rb_node *a, const struct rb_node *b) +{ + return (__node_2_type(a)->type < __node_2_type(b)->type); +} + +static struct intel_uncore_discovery_type * +add_uncore_discovery_type(struct uncore_unit_discovery *unit) +{ + struct intel_uncore_discovery_type *type; + + if (unit->access_type >= UNCORE_ACCESS_MAX) { + pr_warn("Unsupported access type %d\n", unit->access_type); + return NULL; + } + + type = kzalloc(sizeof(struct intel_uncore_discovery_type), GFP_KERNEL); + if (!type) + return NULL; + + type->box_ctrl_die = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL); + if (!type->box_ctrl_die) + goto free_type; + + type->access_type = unit->access_type; + num_discovered_types[type->access_type]++; + type->type = unit->box_type; + + rb_add(&type->node, &discovery_tables, __type_less); + + return type; + +free_type: + kfree(type); + + return NULL; + +} + +static struct intel_uncore_discovery_type * +get_uncore_discovery_type(struct uncore_unit_discovery *unit) +{ + struct intel_uncore_discovery_type *type; + + type = search_uncore_discovery_type(unit->box_type); + if (type) + return type; + + return add_uncore_discovery_type(unit); +} + +static void +uncore_insert_box_info(struct uncore_unit_discovery *unit, + int die, bool parsed) +{ + struct intel_uncore_discovery_type *type; + unsigned int *box_offset, *ids; + int i; + + if (WARN_ON_ONCE(!unit->ctl || !unit->ctl_offset || !unit->ctr_offset)) + return; + + if (parsed) { + type = search_uncore_discovery_type(unit->box_type); + if (WARN_ON_ONCE(!type)) + return; + /* Store the first box of each die */ + if (!type->box_ctrl_die[die]) + type->box_ctrl_die[die] = unit->ctl; + return; + } + + type = get_uncore_discovery_type(unit); + if (!type) + return; + + box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + if (!box_offset) + return; + + ids = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + if (!ids) + goto free_box_offset; + + /* Store generic information for the first box */ + if (!type->num_boxes) { + type->box_ctrl = unit->ctl; + type->box_ctrl_die[die] = unit->ctl; + type->num_counters = unit->num_regs; + type->counter_width = unit->bit_width; + type->ctl_offset = unit->ctl_offset; + type->ctr_offset = unit->ctr_offset; + *ids = unit->box_id; + goto end; + } + + for (i = 0; i < type->num_boxes; i++) { + ids[i] = type->ids[i]; + box_offset[i] = type->box_offset[i]; + + if (WARN_ON_ONCE(unit->box_id == ids[i])) + goto free_ids; + } + ids[i] = unit->box_id; + box_offset[i] = unit->ctl - type->box_ctrl; + kfree(type->ids); + kfree(type->box_offset); +end: + type->ids = ids; + type->box_offset = box_offset; + type->num_boxes++; + return; + +free_ids: + kfree(ids); + +free_box_offset: + kfree(box_offset); + +} + +static int parse_discovery_table(struct pci_dev *dev, int die, + u32 bar_offset, bool *parsed) +{ + struct uncore_global_discovery global; + struct uncore_unit_discovery unit; + void __iomem *io_addr; + resource_size_t addr; + unsigned long size; + u32 val; + int i; + + pci_read_config_dword(dev, bar_offset, &val); + + if (val & UNCORE_DISCOVERY_MASK) + return -EINVAL; + + addr = (resource_size_t)(val & ~UNCORE_DISCOVERY_MASK); + size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE; + io_addr = ioremap(addr, size); + if (!io_addr) + return -ENOMEM; + + /* Read Global Discovery State */ + memcpy_fromio(&global, io_addr, sizeof(struct uncore_global_discovery)); + if (uncore_discovery_invalid_unit(global)) { + pr_info("Invalid Global 
Discovery State: 0x%llx 0x%llx 0x%llx\n", + global.table1, global.ctl, global.table3); + iounmap(io_addr); + return -EINVAL; + } + iounmap(io_addr); + + size = (1 + global.max_units) * global.stride * 8; + io_addr = ioremap(addr, size); + if (!io_addr) + return -ENOMEM; + + /* Parsing Unit Discovery State */ + for (i = 0; i < global.max_units; i++) { + memcpy_fromio(&unit, io_addr + (i + 1) * (global.stride * 8), + sizeof(struct uncore_unit_discovery)); + + if (uncore_discovery_invalid_unit(unit)) + continue; + + if (unit.access_type >= UNCORE_ACCESS_MAX) + continue; + + uncore_insert_box_info(&unit, die, *parsed); + } + + *parsed = true; + iounmap(io_addr); + return 0; +} + +bool intel_uncore_has_discovery_tables(void) +{ + u32 device, val, entry_id, bar_offset; + int die, dvsec = 0, ret = true; + struct pci_dev *dev = NULL; + bool parsed = false; + + if (has_generic_discovery_table()) + device = UNCORE_DISCOVERY_TABLE_DEVICE; + else + device = PCI_ANY_ID; + + /* + * Start a new search and iterates through the list of + * the discovery table devices. + */ + while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) { + while ((dvsec = pci_find_next_ext_capability(dev, dvsec, UNCORE_EXT_CAP_ID_DISCOVERY))) { + pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC_OFFSET, &val); + entry_id = val & UNCORE_DISCOVERY_DVSEC_ID_MASK; + if (entry_id != UNCORE_DISCOVERY_DVSEC_ID_PMON) + continue; + + pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC2_OFFSET, &val); + + if (val & ~UNCORE_DISCOVERY_DVSEC2_BIR_MASK) { + ret = false; + goto err; + } + bar_offset = UNCORE_DISCOVERY_BIR_BASE + + (val & UNCORE_DISCOVERY_DVSEC2_BIR_MASK) * UNCORE_DISCOVERY_BIR_STEP; + + die = get_device_die_id(dev); + if (die < 0) + continue; + + parse_discovery_table(dev, die, bar_offset, &parsed); + } + } + + /* None of the discovery tables are available */ + if (!parsed) + ret = false; +err: + pci_dev_put(dev); + + return ret; +} + +void intel_uncore_clear_discovery_tables(void) +{ + struct intel_uncore_discovery_type *type, *next; + + rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) { + kfree(type->box_ctrl_die); + kfree(type); + } +} + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh, thresh, "config:24-31"); + +static struct attribute *generic_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh.attr, + NULL, +}; + +static const struct attribute_group generic_uncore_format_group = { + .name = "format", + .attrs = generic_uncore_formats_attr, +}; + +void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_INT); +} + +void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ); +} + +void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), 0); +} + +static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event 
*hwc = &event->hw; + + wrmsrl(hwc->config_base, 0); +} + +static struct intel_uncore_ops generic_uncore_msr_ops = { + .init_box = intel_generic_uncore_msr_init_box, + .disable_box = intel_generic_uncore_msr_disable_box, + .enable_box = intel_generic_uncore_msr_enable_box, + .disable_event = intel_generic_uncore_msr_disable_event, + .enable_event = intel_generic_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT); +} + +void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ); +} + +void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, 0); +} + +static void intel_generic_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, 0); +} + +u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops generic_uncore_pci_ops = { + .init_box = intel_generic_uncore_pci_init_box, + .disable_box = intel_generic_uncore_pci_disable_box, + .enable_box = intel_generic_uncore_pci_enable_box, + .disable_event = intel_generic_uncore_pci_disable_event, + .enable_event = intel_generic_uncore_pci_enable_event, + .read_counter = intel_generic_uncore_pci_read_counter, +}; + +#define UNCORE_GENERIC_MMIO_SIZE 0x4000 + +static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) +{ + struct intel_uncore_type *type = box->pmu->type; + + if (!type->box_ctls || !type->box_ctls[box->dieid] || !type->mmio_offsets) + return 0; + + return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx]; +} + +void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + unsigned int box_ctl = generic_uncore_mmio_box_ctl(box); + struct intel_uncore_type *type = box->pmu->type; + resource_size_t addr; + + if (!box_ctl) { + pr_warn("Uncore type %d box %d: Invalid box control address.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx]); + return; + } + + addr = box_ctl; + box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE); + if (!box->io_addr) { + pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx], + (unsigned long long)addr); + return; + } + + writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr); +} + +void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + 
return; + + writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr); +} + +void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(0, box->io_addr); +} + +static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(hwc->config, box->io_addr + hwc->config_base); +} + +void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(0, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops generic_uncore_mmio_ops = { + .init_box = intel_generic_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = intel_generic_uncore_mmio_disable_box, + .enable_box = intel_generic_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = intel_generic_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +static bool uncore_update_uncore_type(enum uncore_access_type type_id, + struct intel_uncore_type *uncore, + struct intel_uncore_discovery_type *type) +{ + uncore->type_id = type->type; + uncore->num_boxes = type->num_boxes; + uncore->num_counters = type->num_counters; + uncore->perf_ctr_bits = type->counter_width; + uncore->box_ids = type->ids; + + switch (type_id) { + case UNCORE_ACCESS_MSR: + uncore->ops = &generic_uncore_msr_ops; + uncore->perf_ctr = (unsigned int)type->box_ctrl + type->ctr_offset; + uncore->event_ctl = (unsigned int)type->box_ctrl + type->ctl_offset; + uncore->box_ctl = (unsigned int)type->box_ctrl; + uncore->msr_offsets = type->box_offset; + break; + case UNCORE_ACCESS_PCI: + uncore->ops = &generic_uncore_pci_ops; + uncore->perf_ctr = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctr_offset; + uncore->event_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctl_offset; + uncore->box_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl); + uncore->box_ctls = type->box_ctrl_die; + uncore->pci_offsets = type->box_offset; + break; + case UNCORE_ACCESS_MMIO: + uncore->ops = &generic_uncore_mmio_ops; + uncore->perf_ctr = (unsigned int)type->ctr_offset; + uncore->event_ctl = (unsigned int)type->ctl_offset; + uncore->box_ctl = (unsigned int)type->box_ctrl; + uncore->box_ctls = type->box_ctrl_die; + uncore->mmio_offsets = type->box_offset; + uncore->mmio_map_size = UNCORE_GENERIC_MMIO_SIZE; + break; + default: + return false; + } + + return true; +} + +struct intel_uncore_type ** +intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra) +{ + struct intel_uncore_discovery_type *type; + struct intel_uncore_type **uncores; + struct intel_uncore_type *uncore; + struct rb_node *node; + int i = 0; + + uncores = kcalloc(num_discovered_types[type_id] + num_extra + 1, + sizeof(struct intel_uncore_type *), GFP_KERNEL); + if (!uncores) + return empty_uncore; + + for (node = rb_first(&discovery_tables); node; node = rb_next(node)) { + type = rb_entry(node, struct intel_uncore_discovery_type, node); + if (type->access_type != type_id) + continue; + + uncore = kzalloc(sizeof(struct intel_uncore_type), GFP_KERNEL); + if (!uncore) + break; + + uncore->event_mask = GENERIC_PMON_RAW_EVENT_MASK; + uncore->format_group = &generic_uncore_format_group; + + if (!uncore_update_uncore_type(type_id, uncore, type)) { + 
kfree(uncore); + continue; + } + uncores[i++] = uncore; + } + + return uncores; +} + +void intel_uncore_generic_uncore_cpu_init(void) +{ + uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR, 0); +} + +int intel_uncore_generic_uncore_pci_init(void) +{ + uncore_pci_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI, 0); + + return 0; +} + +void intel_uncore_generic_uncore_mmio_init(void) +{ + uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO, 0); +} diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h new file mode 100644 index 0000000000000000000000000000000000000000..7280c8a3c831099aca25cac0c0eec6eddce0c6f9 --- /dev/null +++ b/arch/x86/events/intel/uncore_discovery.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* Generic device ID of a discovery table device */ +#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 +/* Capability ID for a discovery table device */ +#define UNCORE_EXT_CAP_ID_DISCOVERY 0x23 +/* First DVSEC offset */ +#define UNCORE_DISCOVERY_DVSEC_OFFSET 0x8 +/* Mask of the supported discovery entry type */ +#define UNCORE_DISCOVERY_DVSEC_ID_MASK 0xffff +/* PMON discovery entry type ID */ +#define UNCORE_DISCOVERY_DVSEC_ID_PMON 0x1 +/* Second DVSEC offset */ +#define UNCORE_DISCOVERY_DVSEC2_OFFSET 0xc +/* Mask of the discovery table BAR offset */ +#define UNCORE_DISCOVERY_DVSEC2_BIR_MASK 0x7 +/* Discovery table BAR base offset */ +#define UNCORE_DISCOVERY_BIR_BASE 0x10 +/* Discovery table BAR step */ +#define UNCORE_DISCOVERY_BIR_STEP 0x4 +/* Mask of the discovery table offset */ +#define UNCORE_DISCOVERY_MASK 0xf +/* Global discovery table size */ +#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20 + +#define UNCORE_DISCOVERY_PCI_DOMAIN(data) ((data >> 28) & 0x7) +#define UNCORE_DISCOVERY_PCI_BUS(data) ((data >> 20) & 0xff) +#define UNCORE_DISCOVERY_PCI_DEVFN(data) ((data >> 12) & 0xff) +#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff) + + +#define uncore_discovery_invalid_unit(unit) \ + (!unit.table1 || !unit.ctl || !unit.table3 || \ + unit.table1 == -1ULL || unit.ctl == -1ULL || \ + unit.table3 == -1ULL) + +#define GENERIC_PMON_CTL_EV_SEL_MASK 0x000000ff +#define GENERIC_PMON_CTL_UMASK_MASK 0x0000ff00 +#define GENERIC_PMON_CTL_EDGE_DET (1 << 18) +#define GENERIC_PMON_CTL_INVERT (1 << 23) +#define GENERIC_PMON_CTL_TRESH_MASK 0xff000000 +#define GENERIC_PMON_RAW_EVENT_MASK (GENERIC_PMON_CTL_EV_SEL_MASK | \ + GENERIC_PMON_CTL_UMASK_MASK | \ + GENERIC_PMON_CTL_EDGE_DET | \ + GENERIC_PMON_CTL_INVERT | \ + GENERIC_PMON_CTL_TRESH_MASK) + +#define GENERIC_PMON_BOX_CTL_FRZ (1 << 0) +#define GENERIC_PMON_BOX_CTL_RST_CTRL (1 << 8) +#define GENERIC_PMON_BOX_CTL_RST_CTRS (1 << 9) +#define GENERIC_PMON_BOX_CTL_INT (GENERIC_PMON_BOX_CTL_RST_CTRL | \ + GENERIC_PMON_BOX_CTL_RST_CTRS) + +enum uncore_access_type { + UNCORE_ACCESS_MSR = 0, + UNCORE_ACCESS_MMIO, + UNCORE_ACCESS_PCI, + + UNCORE_ACCESS_MAX, +}; + +struct uncore_global_discovery { + union { + u64 table1; + struct { + u64 type : 8, + stride : 8, + max_units : 10, + __reserved_1 : 36, + access_type : 2; + }; + }; + + u64 ctl; /* Global Control Address */ + + union { + u64 table3; + struct { + u64 status_offset : 8, + num_status : 16, + __reserved_2 : 40; + }; + }; +}; + +struct uncore_unit_discovery { + union { + u64 table1; + struct { + u64 num_regs : 8, + ctl_offset : 8, + bit_width : 8, + ctr_offset : 8, + status_offset : 8, + __reserved_1 : 22, + access_type : 2; + }; + }; + + u64 ctl; /* Unit Control Address 
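Each unit entry is three u64 words, with the first and third carved into bit-fields by the unions above so the parser never shifts by hand. Decoding a hypothetical first word manually shows the layout:

/* Hypothetical table1 word and its hand decode (values illustrative). */
u64 table1 = 0x10300804ULL;
/* num_regs    = table1 & 0xff         = 0x04 -> 4 counters           */
/* ctl_offset  = (table1 >> 8) & 0xff  = 0x08 -> control 0 at +0x08   */
/* bit_width   = (table1 >> 16) & 0xff = 0x30 -> 48-bit counters      */
/* ctr_offset  = (table1 >> 24) & 0xff = 0x10 -> counter 0 at +0x10   */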
*/ + + union { + u64 table3; + struct { + u64 box_type : 16, + box_id : 16, + __reserved_2 : 32; + }; + }; +}; + +struct intel_uncore_discovery_type { + struct rb_node node; + enum uncore_access_type access_type; + u64 box_ctrl; /* Unit ctrl addr of the first box */ + u64 *box_ctrl_die; /* Unit ctrl addr of the first box of each die */ + u16 type; /* Type ID of the uncore block */ + u8 num_counters; + u8 counter_width; + u8 ctl_offset; /* Counter Control 0 offset */ + u8 ctr_offset; /* Counter 0 offset */ + u16 num_boxes; /* number of boxes for the uncore block */ + unsigned int *ids; /* Box IDs */ + unsigned int *box_offset; /* Box offset */ +}; + +bool intel_uncore_has_discovery_tables(void); +void intel_uncore_clear_discovery_tables(void); +void intel_uncore_generic_uncore_cpu_init(void); +int intel_uncore_generic_uncore_pci_init(void); +void intel_uncore_generic_uncore_mmio_init(void); + +void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box); + +void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event); + +void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event); +u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event); + +struct intel_uncore_type ** +intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index aec6e63c6a04afcc82ebb8fc40198c346b738055..c57d5b0d169c33b8f5014efc4dfbeeb3fad4dc2b 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -431,6 +431,7 @@ static const struct attribute_group snb_uncore_imc_format_group = { static void snb_uncore_imc_init_box(struct intel_uncore_box *box) { + struct intel_uncore_type *type = box->pmu->type; struct pci_dev *pdev = box->pci_dev; int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; resource_size_t addr; @@ -446,7 +447,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) addr &= ~(PAGE_SIZE - 1); - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } @@ -567,7 +571,7 @@ int snb_pci2phy_map_init(int devid) pci_dev_put(dev); return -ENOMEM; } - map->pbus_to_physid[bus] = 0; + map->pbus_to_dieid[bus] = 0; raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); @@ -602,6 +606,7 @@ static struct intel_uncore_type snb_uncore_imc = { .num_counters = 2, .num_boxes = 1, .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = snb_uncore_imc_freerunning, .event_descs = snb_uncore_imc_events, .format_group = &snb_uncore_imc_format_group, diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c index 0aa40798252b6ae57e46c0b1173e4d1eb886a0cd..3fe2d6970f90a70e3249dd9bcd98d461e12aecdd 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* SandyBridge-EP/IvyTown uncore support */ #include "uncore.h" +#include "uncore_discovery.h" /* SNB-EP pci bus to socket mapping */ #define SNBEP_CPUNODEID 0x40 @@ -273,6 +274,30 @@ #define SKX_CPUNODEID 0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. + * | Bit | Default | Description + * | [63] | 00h | VALID - When set, indicates the CPU bus + * numbers have been initialized. (RO) + * |[62:48]| --- | Reserved + * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * CPUBUSNO(1). (RO) + * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * CPUBUSNO(0). (RO) + */ +#define SKX_MSR_CPU_BUS_NUMBER 0x300 +#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63) +#define BUS_NUM_STRIDE 8 + /* SKX CHA */ #define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) @@ -418,6 +443,17 @@ #define ICX_NUMBER_IMC_CHN 2 #define ICX_IMC_MEM_STRIDE 0x4 +/* SPR */ +#define SPR_RAW_EVENT_MASK_EXT 0xffffff + +/* SPR CHA */ +#define SPR_CHA_PMON_CTL_TID_EN (1 << 16) +#define SPR_CHA_PMON_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SPR_CHA_PMON_CTL_TID_EN) +#define SPR_CHA_PMON_BOX_FILTER_TID 0x3ff + +#define SPR_C0_MSR_PMON_BOX_FILTER0 0x200e + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); @@ -430,6 +466,7 @@ DEFINE_UNCORE_FORMAT_ATTR(umask_ext4, umask, "config:8-15,32-55"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en2, tid_en, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35"); DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); @@ -1329,7 +1366,7 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid, segment; + int i, bus, nodeid, segment, die_id; struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1365,7 +1402,11 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool */ for (i = 0; i < 8; i++) { if (nodeid == ((config >> (3 * i)) & 0x7)) { - map->pbus_to_physid[bus] = i; + if (topology_max_die_per_package() > 1) + die_id = i; + else + die_id = topology_phys_to_logical_pkg(i); + map->pbus_to_dieid[bus] = die_id; break; } } @@ -1382,17 +1423,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool i = -1; if (reverse) { for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if 
(map->pbus_to_dieid[bus] >= 0) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } else { for (bus = 0; bus <= 255; bus++) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] >= 0) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } } @@ -3603,6 +3644,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; +static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +{ + return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); +} + +static umode_t +skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */ + return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode; +} + +static ssize_t skx_iio_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_bus *bus = pci_find_next_bus(NULL); + struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + + /* + * Current implementation is for single segment configuration hence it's + * safe to take the segment value from the first available root bus. + */ + return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), + skx_iio_stack(uncore_pmu, die)); +} + +static int skx_msr_cpu_bus_read(int cpu, u64 *topology) +{ + u64 msr_value; + + if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || + !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) + return -ENXIO; + + *topology = msr_value; + + return 0; +} + +static int die_to_cpu(int die) +{ + int res = 0, cpu, current_die; + /* + * Using cpus_read_lock() to ensure cpu is not going down between + * looking at cpu_online_mask. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + current_die = topology_logical_die_id(cpu); + if (current_die == die) { + res = cpu; + break; + } + } + cpus_read_unlock(); + return res; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + int i, ret; + struct pci_bus *bus = NULL; + + /* + * Verified single-segment environments only; disabled for multiple + * segment topologies for now except VMD domains. + * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. + */ + while ((bus = pci_find_next_bus(bus)) + && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff)) + ; + if (bus) + return -EPERM; + + type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + for (i = 0; i < uncore_max_dies(); i++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]); + if (ret) { + kfree(type->topology); + type->topology = NULL; + return ret; + } + } + + return 0; +} + +static struct attribute_group skx_iio_mapping_group = { + .is_visible = skx_iio_mapping_visible, +}; + +static const struct attribute_group *skx_iio_attr_update[] = { + &skx_iio_mapping_group, + NULL, +}; + +static int skx_iio_set_mapping(struct intel_uncore_type *type) +{ + char buf[64]; + int ret; + long die = -1; + struct attribute **attrs = NULL; + struct dev_ext_attribute *eas = NULL; + + ret = skx_iio_get_topology(type); + if (ret) + return ret; + + /* One more for NULL. 
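Each die is represented by one dev_ext_attribute whose ->var field carries the die number into the show() callback, and the attrs array is allocated one slot longer so it ends in the NULL that sysfs scans for. A minimal sketch of a single entry, with demo_show standing in for skx_iio_mapping_show:

/* Minimal sketch: a per-die attribute that recovers its die id from ->var. */
static ssize_t demo_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	long die = (long)to_dev_ext_attribute(attr)->var;

	return sprintf(buf, "%ld\n", die);
}

static struct dev_ext_attribute demo_die0 = {
	.attr = __ATTR(die0, 0444, demo_show, NULL),
	.var  = (void *)0L,	/* die number, read back in demo_show() */
};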
*/ + attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); + if (!attrs) + goto err; + + eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); + if (!eas) + goto err; + + for (die = 0; die < uncore_max_dies(); die++) { + sprintf(buf, "die%ld", die); + sysfs_attr_init(&eas[die].attr.attr); + eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); + if (!eas[die].attr.attr.name) + goto err; + eas[die].attr.attr.mode = 0444; + eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.store = NULL; + eas[die].var = (void *)die; + attrs[die] = &eas[die].attr.attr; + } + skx_iio_mapping_group.attrs = attrs; + + return 0; +err: + for (; die >= 0; die--) + kfree(eas[die].attr.attr.name); + kfree(eas); + kfree(attrs); + kfree(type->topology); + type->attr_update = NULL; + return -ENOMEM; +} + +static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + struct attribute **attr = skx_iio_mapping_group.attrs; + + if (!attr) + return; + + for (; *attr; attr++) + kfree((*attr)->name); + kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs)); + kfree(skx_iio_mapping_group.attrs); + skx_iio_mapping_group.attrs = NULL; + kfree(type->topology); +} + static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, @@ -3617,6 +3822,9 @@ static struct intel_uncore_type skx_uncore_iio = { .constraints = skx_uncore_iio_constraints, .ops = &skx_uncore_iio_ops, .format_group = &skx_uncore_iio_format_group, + .attr_update = skx_iio_attr_update, + .set_mapping = skx_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; enum perf_uncore_iio_freerunning_type_id { @@ -4387,36 +4595,35 @@ int snr_uncore_pci_init(void) return 0; } -static struct pci_dev *snr_uncore_get_mc_dev(int id) +#define SNR_MC_DEVICE_ID 0x3451 + +static struct pci_dev *snr_uncore_get_mc_dev(unsigned int device, int id) { struct pci_dev *mc_dev = NULL; - int phys_id, pkg; + int pkg; while (1) { - mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev); + mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, mc_dev); if (!mc_dev) break; - phys_id = uncore_pcibus_to_physid(mc_dev->bus); - if (phys_id < 0) - continue; - pkg = topology_phys_to_logical_pkg(phys_id); - if (pkg < 0) - continue; - else if (pkg == id) + pkg = uncore_pcibus_to_dieid(mc_dev->bus); + if (pkg == id) break; } return mc_dev; } -static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, - unsigned int box_ctl, int mem_offset) +static int snr_uncore_mmio_map(struct intel_uncore_box *box, + unsigned int box_ctl, int mem_offset, + unsigned int device) { - struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + struct pci_dev *pdev = snr_uncore_get_mc_dev(device, box->dieid); + struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; u32 pci_dword; if (!pdev) - return; + return -ENODEV; pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword); addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; @@ -4426,17 +4633,28 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, addr += box_ctl; - box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); - if (!box->io_addr) - return; + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) { + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + return -EINVAL; + } - writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); + return 0; +} + +static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, + unsigned int box_ctl, int mem_offset, + unsigned int device) +{ + if (!snr_uncore_mmio_map(box, box_ctl, 
mem_offset, device)) + writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); } static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) { __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), - SNR_IMC_MMIO_MEM0_OFFSET); + SNR_IMC_MMIO_MEM0_OFFSET, + SNR_MC_DEVICE_ID); } static void snr_uncore_mmio_disable_box(struct intel_uncore_box *box) @@ -4471,6 +4689,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config | SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); } @@ -4483,6 +4704,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config, box->io_addr + hwc->config_base); } @@ -4521,6 +4745,7 @@ static struct intel_uncore_type snr_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &snr_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -4561,6 +4786,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 1, .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = snr_imc_freerunning, .ops = &snr_uncore_imc_freerunning_ops, .event_descs = snr_uncore_imc_freerunning_events, @@ -4951,7 +5177,8 @@ static void icx_uncore_imc_init_box(struct intel_uncore_box *box) int mem_offset = (box->pmu->pmu_idx / ICX_NUMBER_IMC_CHN) * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET; - __snr_uncore_mmio_init_box(box, box_ctl, mem_offset); + __snr_uncore_mmio_init_box(box, box_ctl, mem_offset, + SNR_MC_DEVICE_ID); } static struct intel_uncore_ops icx_uncore_mmio_ops = { @@ -4978,6 +5205,7 @@ static struct intel_uncore_type icx_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &icx_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -5020,7 +5248,8 @@ static void icx_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET; - __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), mem_offset); + snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box), + mem_offset, SNR_MC_DEVICE_ID); } static struct intel_uncore_ops icx_uncore_imc_freerunning_ops = { @@ -5035,6 +5264,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = { .num_counters = 5, .num_boxes = 4, .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = icx_imc_freerunning, .ops = &icx_uncore_imc_freerunning_ops, .event_descs = icx_uncore_imc_freerunning_events, @@ -5053,3 +5283,497 @@ void icx_uncore_mmio_init(void) } /* end of ICX uncore support */ + +/* SPR uncore support */ + +static void spr_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config); +} + +static void spr_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra 
*reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, 0); + + wrmsrl(hwc->config_base, 0); +} + +static int spr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + bool tie_en = !!(event->hw.config & SPR_CHA_PMON_CTL_TID_EN); + struct intel_uncore_type *type = box->pmu->type; + + if (tie_en) { + reg1->reg = SPR_C0_MSR_PMON_BOX_FILTER0 + + HSWEP_CBO_MSR_OFFSET * type->box_ids[box->pmu->pmu_idx]; + reg1->config = event->attr.config1 & SPR_CHA_PMON_BOX_FILTER_TID; + reg1->idx = 0; + } + + return 0; +} + +static struct intel_uncore_ops spr_uncore_chabox_ops = { + .init_box = intel_generic_uncore_msr_init_box, + .disable_box = intel_generic_uncore_msr_disable_box, + .enable_box = intel_generic_uncore_msr_enable_box, + .disable_event = spr_uncore_msr_disable_event, + .enable_event = spr_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = spr_cha_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct attribute *spr_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext4.attr, + &format_attr_tid_en2.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid5.attr, + NULL, +}; +static const struct attribute_group spr_uncore_chabox_format_group = { + .name = "format", + .attrs = spr_uncore_cha_formats_attr, +}; + +static ssize_t alias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); + char pmu_name[UNCORE_PMU_NAME_LEN]; + + uncore_get_alias_name(pmu_name, pmu); + return sysfs_emit(buf, "%s\n", pmu_name); +} + +static DEVICE_ATTR_RO(alias); + +static struct attribute *uncore_alias_attrs[] = { + &dev_attr_alias.attr, + NULL +}; + +ATTRIBUTE_GROUPS(uncore_alias); + +static struct intel_uncore_type spr_uncore_chabox = { + .name = "cha", + .event_mask = SPR_CHA_PMON_EVENT_MASK, + .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, + .num_shared_regs = 1, + .ops = &spr_uncore_chabox_ops, + .format_group = &spr_uncore_chabox_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type spr_uncore_iio = { + .name = "iio", + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, + .format_group = &snr_uncore_iio_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct attribute *spr_uncore_raw_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext4.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static const struct attribute_group spr_uncore_raw_format_group = { + .name = "format", + .attrs = spr_uncore_raw_formats_attr, +}; + +#define SPR_UNCORE_COMMON_FORMAT() \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, \ + .format_group = &spr_uncore_raw_format_group, \ + .attr_update = uncore_alias_groups + +static struct intel_uncore_type spr_uncore_irp = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "irp", + +}; + +static struct intel_uncore_type spr_uncore_m2pcie = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "m2pcie", +}; + +static struct intel_uncore_type spr_uncore_pcu = { + .name = "pcu", + .attr_update = uncore_alias_groups, +}; + +static void spr_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + 
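
The MMIO enable hook opened here (its body, plus the spr_uncore_mmio_ops and spr_uncore_imc definitions, continues just below) splits fixed and general-purpose counters: the fixed IMC counter needs no event selection, so only the enable bit is written for it, while general counters are programmed with their event configuration word. The following standalone sketch is illustrative only; the helper is a hypothetical stand-in for uncore_pmc_fixed(), and the enable-bit value is an assumption matching SNBEP_PMON_CTL_EN.

#include <stdbool.h>
#include <stdint.h>

#define DEMO_PMON_CTL_EN	(1u << 22)	/* assumed SNBEP_PMON_CTL_EN bit position */

/* Hypothetical stand-in for uncore_pmc_fixed(): the fixed counter sits at a
 * dedicated index beyond the general-purpose counter range. */
static bool demo_counter_is_fixed(int idx, int num_general)
{
	return idx >= num_general;
}

/* Mirror of the enable split: the fixed counter gets only the enable bit,
 * general counters get their event configuration (the write stands in for
 * writel() to the counter's control register). */
static void demo_mmio_enable(volatile uint32_t *ctl, int idx, int num_general,
			     uint32_t config)
{
	if (demo_counter_is_fixed(idx, num_general))
		*ctl = DEMO_PMON_CTL_EN;
	else
		*ctl = config;
}
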
+ if (!box->io_addr) + return; + + if (uncore_pmc_fixed(hwc->idx)) + writel(SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); + else + writel(hwc->config, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops spr_uncore_mmio_ops = { + .init_box = intel_generic_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = intel_generic_uncore_mmio_disable_box, + .enable_box = intel_generic_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = spr_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +static struct intel_uncore_type spr_uncore_imc = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "imc", + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .ops = &spr_uncore_mmio_ops, +}; + +static void spr_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32)); + pci_write_config_dword(pdev, hwc->config_base, (u32)hwc->config); +} + +static struct intel_uncore_ops spr_uncore_pci_ops = { + .init_box = intel_generic_uncore_pci_init_box, + .disable_box = intel_generic_uncore_pci_disable_box, + .enable_box = intel_generic_uncore_pci_enable_box, + .disable_event = intel_generic_uncore_pci_disable_event, + .enable_event = spr_uncore_pci_enable_event, + .read_counter = intel_generic_uncore_pci_read_counter, +}; + +#define SPR_UNCORE_PCI_COMMON_FORMAT() \ + SPR_UNCORE_COMMON_FORMAT(), \ + .ops = &spr_uncore_pci_ops + +static struct intel_uncore_type spr_uncore_m2m = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "m2m", +}; + +static struct intel_uncore_type spr_uncore_upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "upi", +}; + +static struct intel_uncore_type spr_uncore_m3upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "m3upi", +}; + +static struct intel_uncore_type spr_uncore_mdf = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "mdf", +}; + +#define UNCORE_SPR_NUM_UNCORE_TYPES 12 +#define UNCORE_SPR_IIO 1 +#define UNCORE_SPR_IMC 6 + +static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { + &spr_uncore_chabox, + &spr_uncore_iio, + &spr_uncore_irp, + &spr_uncore_m2pcie, + &spr_uncore_pcu, + NULL, + &spr_uncore_imc, + &spr_uncore_m2m, + &spr_uncore_upi, + &spr_uncore_m3upi, + NULL, + &spr_uncore_mdf, +}; + +enum perf_uncore_spr_iio_freerunning_type_id { + SPR_IIO_MSR_IOCLK, + SPR_IIO_MSR_BW_IN, + SPR_IIO_MSR_BW_OUT, + + SPR_IIO_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters spr_iio_freerunning[] = { + [SPR_IIO_MSR_IOCLK] = { 0x340e, 0x1, 0x10, 1, 48 }, + [SPR_IIO_MSR_BW_IN] = { 0x3800, 0x1, 0x10, 8, 48 }, + [SPR_IIO_MSR_BW_OUT] = { 0x3808, 0x1, 0x10, 8, 48 }, +}; + +static struct uncore_event_desc spr_uncore_iio_freerunning_events[] = { + /* Free-Running IIO CLOCKS Counter */ + INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), + /* Free-Running IIO BANDWIDTH IN Counters */ + INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), + 
INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), + /* Free-Running IIO BANDWIDTH OUT Counters */ + INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x31"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x32"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x33"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4, "event=0xff,umask=0x34"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5, "event=0xff,umask=0x35"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6, "event=0xff,umask=0x36"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7, "event=0xff,umask=0x37"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type spr_uncore_iio_free_running = { + .name = "iio_free_running", + .num_counters = 17, + .num_freerunning_types = SPR_IIO_FREERUNNING_TYPE_MAX, + .freerunning = spr_iio_freerunning, + .ops = &skx_uncore_iio_freerunning_ops, + .event_descs = spr_uncore_iio_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +enum perf_uncore_spr_imc_freerunning_type_id { + SPR_IMC_DCLK, + SPR_IMC_PQ_CYCLES, + + SPR_IMC_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters spr_imc_freerunning[] = { + [SPR_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 }, + [SPR_IMC_PQ_CYCLES] = { 0x2318, 0x8, 0, 2, 48 }, +}; + +static struct uncore_event_desc spr_uncore_imc_freerunning_events[] = { + INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), + + INTEL_UNCORE_EVENT_DESC(rpq_cycles, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(wpq_cycles, "event=0xff,umask=0x21"), + { /* end: all zeroes */ }, +}; + +#define 
SPR_MC_DEVICE_ID 0x3251 + +static void spr_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET; + + snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box), + mem_offset, SPR_MC_DEVICE_ID); +} + +static struct intel_uncore_ops spr_uncore_imc_freerunning_ops = { + .init_box = spr_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type spr_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .mmio_map_size = SNR_IMC_MMIO_SIZE, + .num_freerunning_types = SPR_IMC_FREERUNNING_TYPE_MAX, + .freerunning = spr_imc_freerunning, + .ops = &spr_uncore_imc_freerunning_ops, + .event_descs = spr_uncore_imc_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +#define UNCORE_SPR_MSR_EXTRA_UNCORES 1 +#define UNCORE_SPR_MMIO_EXTRA_UNCORES 1 + +static struct intel_uncore_type *spr_msr_uncores[UNCORE_SPR_MSR_EXTRA_UNCORES] = { + &spr_uncore_iio_free_running, +}; + +static struct intel_uncore_type *spr_mmio_uncores[UNCORE_SPR_MMIO_EXTRA_UNCORES] = { + &spr_uncore_imc_free_running, +}; + +static void uncore_type_customized_copy(struct intel_uncore_type *to_type, + struct intel_uncore_type *from_type) +{ + if (!to_type || !from_type) + return; + + if (from_type->name) + to_type->name = from_type->name; + if (from_type->fixed_ctr_bits) + to_type->fixed_ctr_bits = from_type->fixed_ctr_bits; + if (from_type->event_mask) + to_type->event_mask = from_type->event_mask; + if (from_type->event_mask_ext) + to_type->event_mask_ext = from_type->event_mask_ext; + if (from_type->fixed_ctr) + to_type->fixed_ctr = from_type->fixed_ctr; + if (from_type->fixed_ctl) + to_type->fixed_ctl = from_type->fixed_ctl; + if (from_type->fixed_ctr_bits) + to_type->fixed_ctr_bits = from_type->fixed_ctr_bits; + if (from_type->num_shared_regs) + to_type->num_shared_regs = from_type->num_shared_regs; + if (from_type->constraints) + to_type->constraints = from_type->constraints; + if (from_type->ops) + to_type->ops = from_type->ops; + if (from_type->event_descs) + to_type->event_descs = from_type->event_descs; + if (from_type->format_group) + to_type->format_group = from_type->format_group; + if (from_type->attr_update) + to_type->attr_update = from_type->attr_update; +} + +static struct intel_uncore_type ** +uncore_get_uncores(enum uncore_access_type type_id, int num_extra, + struct intel_uncore_type **extra) +{ + struct intel_uncore_type **types, **start_types; + int i; + + start_types = types = intel_uncore_generic_init_uncores(type_id, num_extra); + + /* Only copy the customized features */ + for (; *types; types++) { + if ((*types)->type_id >= UNCORE_SPR_NUM_UNCORE_TYPES) + continue; + uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]); + } + + for (i = 0; i < num_extra; i++, types++) + *types = extra[i]; + + return start_types; +} + +static struct intel_uncore_type * +uncore_find_type_by_id(struct intel_uncore_type **types, int type_id) +{ + for (; *types; types++) { + if (type_id == (*types)->type_id) + return *types; + } + + return NULL; +} + +static int uncore_type_max_boxes(struct intel_uncore_type **types, + int type_id) +{ + struct intel_uncore_type *type; + int i, max = 0; + + type = uncore_find_type_by_id(types, type_id); + if (!type) + return 0; + + for (i = 0; i < type->num_boxes; i++) { + if (type->box_ids[i] > max) + max = 
type->box_ids[i]; + } + + return max + 1; +} + +void spr_uncore_cpu_init(void) +{ + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, + UNCORE_SPR_MSR_EXTRA_UNCORES, + spr_msr_uncores); + + spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO); +} + +int spr_uncore_pci_init(void) +{ + uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL); + return 0; +} + +void spr_uncore_mmio_init(void) +{ + int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true); + + if (ret) + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL); + else { + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, + UNCORE_SPR_MMIO_EXTRA_UNCORES, + spr_mmio_uncores); + + spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2; + } +} + +/* end of SPR uncore support */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5648023e3a6529d540d57b5af9e6b3bfbb1ed387..325c74704fa87ca1af67cd3a7646f1435e1eb7f9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,7 @@ #include +#include #include /* To enable MSR tracing please use the generic trace points. */ @@ -80,6 +81,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ #define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ #define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -204,6 +206,17 @@ struct intel_excl_cntrs { struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, +}; + enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, @@ -260,11 +273,15 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; + union { + struct er_account *lbr_sel; + struct er_account *lbr_ctl; + }; u64 br_sel; - struct x86_perf_task_context *last_task_ctx; + void *last_task_ctx; int last_log_id; int lbr_select; + void *lbr_xsave; /* * Intel host/guest exclude bits @@ -427,6 +444,10 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) +#define INTEL_PSD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT) + #define INTEL_PST_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) @@ -665,7 +686,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, - counter_freezing :1; + enabled_ack :1; /* * sysfs attrs */ @@ -706,7 +727,8 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1; + pebs_no_isolation :1, + pebs_block :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; @@ -720,12 +742,36 @@ struct x86_pmu { * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, - lbr_nr; /* LBR base regs and size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - 
const int *lbr_sel_map; /* lbr_select mappings */ + lbr_info, lbr_nr; /* LBR base regs and size */ + union { + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + u64 lbr_ctl_mask; /* LBR_CTL valid bits */ + }; + union { + const int *lbr_sel_map; /* lbr_select mappings */ + int *lbr_ctl_map; /* LBR_CTL mappings */ + }; bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + /* + * Intel Architectural LBR CPUID Enumeration + */ + unsigned int lbr_depth_mask:8; + unsigned int lbr_deep_c_reset:1; + unsigned int lbr_lip:1; + unsigned int lbr_cpl:1; + unsigned int lbr_filter:1; + unsigned int lbr_call_stack:1; + unsigned int lbr_mispred:1; + unsigned int lbr_timed_lbr:1; + unsigned int lbr_br_type:1; + + void (*lbr_reset)(void); + void (*lbr_read)(struct cpu_hw_events *cpuc); + void (*lbr_save)(void *ctx); + void (*lbr_restore)(void *ctx); + /* * Intel PT/LBR/BTS are exclusive */ @@ -734,9 +780,18 @@ struct x86_pmu { /* * Intel perf metrics */ + int num_topdown_events; u64 (*update_topdown_event)(struct perf_event *event); int (*set_topdown_event_period)(struct perf_event *event); + /* + * perf task context (i.e. struct perf_event_context::task_ctx_data) + * switch helper to bridge calls from perf/core to perf/x86. + * See struct pmu::swap_task_ctx() usage for examples; + */ + void (*swap_task_ctx)(struct perf_event_context *prev, + struct perf_event_context *next); + /* * AMD bits */ @@ -762,16 +817,44 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); }; +struct x86_perf_task_context_opt { + int lbr_callstack_users; + int lbr_stack_state; + int log_id; +}; + struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; u64 lbr_sel; int tos; int valid_lbrs; - int lbr_callstack_users; - int lbr_stack_state; - int log_id; + struct x86_perf_task_context_opt opt; + struct lbr_entry lbr[MAX_LBR_ENTRIES]; +}; + +struct x86_perf_task_context_arch_lbr { + struct x86_perf_task_context_opt opt; + struct lbr_entry entries[]; +}; + +/* + * Add padding to guarantee the 64-byte alignment of the state buffer. + * + * The structure is dynamically allocated. The size of the LBR state may vary + * based on the number of LBR registers. + * + * Do not put anything after the LBR state. + */ +struct x86_perf_task_context_arch_lbr_xsave { + struct x86_perf_task_context_opt opt; + + union { + struct xregs_state xsave; + struct { + struct fxregs_state i387; + struct xstate_header header; + struct arch_lbr_state lbr; + } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT))); + }; }; #define x86_add_quirk(func_) \ @@ -793,6 +876,8 @@ do { \ #define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define PMU_FL_TFA 0x20 /* deal with TSX force abort */ #define PMU_FL_PAIR 0x40 /* merge counters for large incr. 
events */ +#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */ +#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -822,6 +907,14 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ struct pmu *x86_get_pmu(void); extern struct x86_pmu x86_pmu __read_mostly; +static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; + + return &((struct x86_perf_task_context *)ctx)->opt; +} + static inline bool x86_pmu_has_lbr_callstack(void) { return x86_pmu.lbr_sel_map && @@ -974,6 +1067,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +static inline bool fixed_counter_disabled(int i) +{ + return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED)); +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); @@ -1034,6 +1132,8 @@ void release_ds_buffers(void); void reserve_ds_buffers(void); +void release_lbr_buffers(void); + extern struct event_constraint bts_constraint; extern struct event_constraint vlbr_constraint; @@ -1069,6 +1169,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[]; extern struct event_constraint intel_icl_pebs_event_constraints[]; +extern struct event_constraint intel_spr_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); @@ -1087,16 +1189,23 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next); + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); +void intel_pmu_lbr_reset_32(void); + +void intel_pmu_lbr_reset_64(void); + void intel_pmu_lbr_add(struct perf_event *event); void intel_pmu_lbr_del(struct perf_event *event); @@ -1107,6 +1216,14 @@ void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_save(void *ctx); + +void intel_pmu_lbr_restore(void *ctx); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); @@ -1123,6 +1240,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_arch_lbr_init(void); + void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); @@ -1158,6 +1277,10 @@ static inline void release_ds_buffers(void) { } +static inline void release_lbr_buffers(void) +{ +} + static inline int intel_pmu_init(void) { return 0; diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index c2ede2f3b27702ae4dab76217626447e7b05ec70..136a1e847254eab25b3236c42b633e038431a6d7 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -10,6 +10,11 @@ not_visible(struct kobject *kobj, struct attribute *attr, int i) return 0; } +/* + * Accepts msr[] array 
with non populated entries as long as either + * msr[i].msr is 0 or msr[i].grp is NULL. Note that the default sysfs + * visibility is visible when group->is_visible callback is set. + */ unsigned long perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) { @@ -24,8 +29,16 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + /* skip entry with no group */ + if (!grp) + continue; + grp->is_visible = not_visible; + /* skip unpopulated entry */ + if (!msr[bit].msr) + continue; + if (msr[bit].test && !msr[bit].test(bit, data)) continue; /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 9050d7b8abc5a5354069c8f2c9d482f74eccfc7a..b4952869355a0629ddf663b6483bbcf9d6d67024 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -118,9 +118,17 @@ struct rapl_pmus { struct rapl_pmu *pmus[]; }; +enum rapl_unit_quirk { + RAPL_UNIT_QUIRK_NONE, + RAPL_UNIT_QUIRK_INTEL_HSW, + RAPL_UNIT_QUIRK_INTEL_SPR, +}; + struct rapl_model { + struct perf_msr *rapl_msrs; unsigned long events; - bool apply_quirk; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; }; /* 1/2^hw_unit Joule */ @@ -129,7 +137,7 @@ static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static struct perf_msr rapl_msrs[]; +static struct perf_msr *rapl_msrs; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { @@ -446,9 +454,16 @@ static struct attribute *rapl_events_cores[] = { NULL, }; +static umode_t +rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return 0; +} + static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_pkg[] = { @@ -461,6 +476,7 @@ static struct attribute *rapl_events_pkg[] = { static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_ram[] = { @@ -473,6 +489,7 @@ static struct attribute *rapl_events_ram[] = { static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_gpu[] = { @@ -485,6 +502,7 @@ static struct attribute *rapl_events_gpu[] = { static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_psys[] = { @@ -497,6 +515,7 @@ static struct attribute *rapl_events_psys[] = { static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, + .is_visible = rapl_not_visible, }; static bool test_msr(int idx, void *data) @@ -504,7 +523,7 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } -static struct perf_msr rapl_msrs[] = { +static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, @@ -566,26 +585,39 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_check_hw_unit(bool apply_quirk) +static int rapl_check_hw_unit(struct 
rapl_model *rm) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + switch (rm->unit_quirk) { /* * DRAM domain on HSW server and KNL has fixed energy unit which can be * different than the unit from power unit MSR. See * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ - if (apply_quirk) + case RAPL_UNIT_QUIRK_INTEL_HSW: rapl_hw_unit[PERF_RAPL_RAM] = 16; - + break; + /* + * SPR shares the same DRAM domain energy unit as HSW, plus it + * also has a fixed energy unit for Psys domain. + */ + case RAPL_UNIT_QUIRK_INTEL_SPR: + rapl_hw_unit[PERF_RAPL_RAM] = 16; + rapl_hw_unit[PERF_RAPL_PSYS] = 0; + break; + default: + break; + } + /* * Calculate the timer rate: * Use reference of 200W for scaling the timeout to avoid counter @@ -666,14 +698,16 @@ static struct rapl_model model_snb = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), - .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { @@ -681,20 +715,25 @@ static struct rapl_model model_hsw = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), - .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = true, + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { .events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = true, + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { @@ -703,7 +742,18 @@ static struct rapl_model model_skl = { BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), - .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, +}; + +static struct rapl_model model_spr = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -732,6 +782,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SAPPHIRERAPIDS_X, model_spr), {}, }; @@ -748,10 +799,13 @@ static int __init rapl_pmu_init(void) return -ENODEV; rm = (struct rapl_model *) id->driver_data; + + rapl_msrs = rm->rapl_msrs; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false, (void *) &rm->events); - ret = rapl_check_hw_unit(rm->apply_quirk); + ret = rapl_check_hw_unit(rm); if (ret) return ret; diff 
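
The unit handling above keeps the existing 1/2^hw_unit Joule convention: bits 12:8 of the power-unit MSR give the per-domain energy exponent, and the quirks merely force that exponent (16 for the DRAM domain on HSW/KNL and SPR, 0 for the Psys domain on SPR). A minimal user-space-style sketch of the resulting scaling, assuming the exponent has already been obtained as in rapl_check_hw_unit():

#include <stdint.h>

/*
 * Illustrative only: one energy-status count is 1/2^hw_unit Joule, so a
 * counter delta converts to Joules by dividing by 2^hw_unit.  With the
 * DRAM quirk (hw_unit = 16) one count is roughly 15.3 microjoules.
 */
static double rapl_delta_to_joules(uint64_t delta, unsigned int hw_unit)
{
	return (double)delta / (double)(1ULL << hw_unit);
}
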
--git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 30416d7f19d4f8b94248e9e4e149fb21b5e26f8b..55c38af27465760fda4d88ec0a879d3c48f74bd9 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -36,70 +35,57 @@ #include #include +static inline void reload_segments(struct sigcontext_32 *sc) +{ + unsigned int cur; + + savesegment(gs, cur); + if ((sc->gs | 0x03) != cur) + load_gs_index(sc->gs | 0x03); + savesegment(fs, cur); + if ((sc->fs | 0x03) != cur) + loadsegment(fs, sc->fs | 0x03); + savesegment(ds, cur); + if ((sc->ds | 0x03) != cur) + loadsegment(ds, sc->ds | 0x03); + savesegment(es, cur); + if ((sc->es | 0x03) != cur) + loadsegment(es, sc->es | 0x03); +} + /* * Do a signal return; undo the signal stack. */ -#define loadsegment_gs(v) load_gs_index(v) -#define loadsegment_fs(v) loadsegment(fs, v) -#define loadsegment_ds(v) loadsegment(ds, v) -#define loadsegment_es(v) loadsegment(es, v) - -#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) -#define set_user_seg(seg, v) loadsegment_##seg(v) - -#define COPY(x) { \ - get_user_ex(regs->x, &sc->x); \ -} - -#define GET_SEG(seg) ({ \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - tmp; \ -}) - -#define COPY_SEG_CPL3(seg) do { \ - regs->seg = GET_SEG(seg) | 3; \ -} while (0) - -#define RELOAD_SEG(seg) { \ - unsigned int pre = (seg) | 3; \ - unsigned int cur = get_user_seg(seg); \ - if (pre != cur) \ - set_user_seg(seg, pre); \ -} - -static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_32 __user *sc) +static bool ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_32 __user *usc) { - unsigned int tmpflags, err = 0; - u16 gs, fs, es, ds; - void __user *buf; - u32 tmp; + struct sigcontext_32 sc; + unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - get_user_try { - gs = GET_SEG(gs); - fs = GET_SEG(fs); - ds = GET_SEG(ds); - es = GET_SEG(es); + if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) + return false; - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); COPY(ax); - /* Don't touch extended registers */ + /* Get only the ia32 registers. */ + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; - get_user_ex(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - /* disable syscall checks */ - regs->orig_ax = -1; - - get_user_ex(tmp, &sc->fpstate); - buf = compat_ptr(tmp); - } get_user_catch(err); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; /* * Reload fs and gs if they have changed in the signal @@ -107,12 +93,8 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, * the handler, but does not clobber them at least in the * normal case. 
*/ - RELOAD_SEG(gs); - RELOAD_SEG(fs); - RELOAD_SEG(ds); - RELOAD_SEG(es); - - err |= fpu__restore_sig(buf, 1); + reload_segments(&sc); + err = fpu__restore_sig(compat_ptr(sc.fpstate), 1); force_iret(); @@ -128,15 +110,12 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) - || (_COMPAT_NSIG_WORDS > 1 - && __copy_from_user((((char *) &set.sig) + 4), - &frame->extramask, - sizeof(frame->extramask)))) + || __get_user(((__u32 *)&set)[1], &frame->extramask[0])) goto badframe; set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->sc)) + if (!ia32_restore_sigcontext(regs, &frame->sc)) goto badframe; return regs->ax; @@ -155,12 +134,12 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) @@ -177,44 +156,51 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) * Set up a signal frame. */ -static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, - void __user *fpstate, - struct pt_regs *regs, unsigned int mask) +#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) + +static __always_inline int +__unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc, + void __user *fpstate, + struct pt_regs *regs, unsigned int mask) { - int err = 0; - - put_user_try { - put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs); - put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs); - put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds); - put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es); - - put_user_ex(regs->di, &sc->di); - put_user_ex(regs->si, &sc->si); - put_user_ex(regs->bp, &sc->bp); - put_user_ex(regs->sp, &sc->sp); - put_user_ex(regs->bx, &sc->bx); - put_user_ex(regs->dx, &sc->dx); - put_user_ex(regs->cx, &sc->cx); - put_user_ex(regs->ax, &sc->ax); - put_user_ex(current->thread.trap_nr, &sc->trapno); - put_user_ex(current->thread.error_code, &sc->err); - put_user_ex(regs->ip, &sc->ip); - put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->sp, &sc->sp_at_signal); - put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); - - put_user_ex(ptr_to_compat(fpstate), &sc->fpstate); - - /* non-iBCS2 extensions.. 
*/ - put_user_ex(mask, &sc->oldmask); - put_user_ex(current->thread.cr2, &sc->cr2); - } put_user_catch(err); + unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault); + unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault); + + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); + + unsafe_put_user(ptr_to_compat(fpstate), &sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; - return err; +Efault: + return -EFAULT; } +#define unsafe_put_sigcontext32(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext32(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0) + /* * Determine which stack to use.. */ @@ -238,8 +224,8 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_32 __user *) sp; - if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size) < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, + math_size)) return (void __user *) -1L; sp -= frame_size; @@ -254,8 +240,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, { struct sigframe_ia32 __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; /* copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -268,22 +253,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, 0x80cd, /* int $0x80 */ }; - frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; - - if (__put_user(sig, &frame->sig)) - return -EFAULT; - - if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) - return -EFAULT; - - if (_COMPAT_NSIG_WORDS > 1) { - if (__copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); if (ksig->ka.sa.sa_flags & SA_RESTORER) { restorer = ksig->ka.sa.sa_restorer; @@ -296,19 +266,20 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, restorer = &frame->retcode; } - put_user_try { - put_user_ex(ptr_to_compat(restorer), &frame->pretcode); - - /* - * These are actually not used anymore, but left because some - * gdb versions depend on them as a marker. 
- */ - put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); - } put_user_catch(err); - - if (err) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + /* + * These are actually not used anymore, but left because some + * gdb versions depend on them as a marker. + */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + user_access_end(); + /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; @@ -325,6 +296,9 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, regs->ss = __USER32_DS; return 0; +Efault: + user_access_end(); + return -EFAULT; } int ia32_setup_rt_frame(int sig, struct ksignal *ksig, @@ -332,10 +306,9 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe_ia32 __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; - /* __copy_to_user optimizes that into a single 8 byte store */ + /* unsafe_put_user optimizes that into a single 8 byte store */ static const struct { u8 movl; u32 val; @@ -348,44 +321,40 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, 0, }; - frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - put_user_try { - put_user_ex(sig, &frame->sig); - put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo); - put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault); + unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault); - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); + /* Create the ucontext. */ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - else - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - put_user_ex(ptr_to_compat(restorer), &frame->pretcode); - - /* - * Not actually used anymore, but left because some gdb - * versions need it. - */ - put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); - } put_user_catch(err); - - err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false); - err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + + /* + * Not actually used anymore, but left because some gdb + * versions need it. 
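
Both setup paths now follow the user_access_begin()/unsafe_put_user() pattern that replaces put_user_try/put_user_ex: a single range check opens the user-access window, every store bails to a local Efault label on failure, and user_access_end() closes the window on both the success and failure paths. A stripped-down sketch of the same shape, using a hypothetical two-field frame layout:

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

struct demo_frame {		/* hypothetical user-visible layout */
	u32	sig;
	u32	flags;
};

static int demo_fill_frame(struct demo_frame __user *frame, u32 sig, u32 flags)
{
	/* One access_ok()-style check covers every store below. */
	if (!user_access_begin(frame, sizeof(*frame)))
		return -EFAULT;

	unsafe_put_user(sig, &frame->sig, Efault);
	unsafe_put_user(flags, &frame->flags, Efault);

	user_access_end();
	return 0;

Efault:
	user_access_end();
	return -EFAULT;
}
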
+ */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); + user_access_end(); + + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) return -EFAULT; /* Set up registers for signal handler */ @@ -404,4 +373,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, regs->ss = __USER32_DS; return 0; +Efault: + user_access_end(); + return -EFAULT; } diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5bef1575708dc46e4644f7f8bf91941cbb84f63a..91e62f6b18eb00c42f9504b344d82cb4ffc41371 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -296,11 +296,10 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - /* dest_logical is used by the IPI functions */ - u32 dest_logical; u32 disable_esr; - u32 irq_delivery_mode; - u32 irq_dest_mode; + + enum apic_delivery_modes delivery_mode; + bool dest_mode_logical; u32 (*calc_dest_apicid)(unsigned int cpu); @@ -510,6 +509,12 @@ static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } static inline void apic_smt_update(void) { } #endif +struct msi_msg; +struct irq_cfg; + +extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar); + extern void irq_enter(void); extern void irq_exit(void); diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 05e694ed838642e6e7e40b665ebe766a1d69385b..5716f22f81ac4b05abc33882fa34749732c6e52a 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -432,15 +432,13 @@ struct local_apic { #define BAD_APICID 0xFFFFu #endif -enum ioapic_irq_destination_types { - dest_Fixed = 0, - dest_LowestPrio = 1, - dest_SMI = 2, - dest__reserved_1 = 3, - dest_NMI = 4, - dest_INIT = 5, - dest__reserved_2 = 6, - dest_ExtINT = 7 +enum apic_delivery_modes { + APIC_DELIVERY_MODE_FIXED = 0, + APIC_DELIVERY_MODE_LOWESTPRIO = 1, + APIC_DELIVERY_MODE_SMI = 2, + APIC_DELIVERY_MODE_NMI = 4, + APIC_DELIVERY_MODE_INIT = 5, + APIC_DELIVERY_MODE_EXTINT = 7, }; #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 12933d40620799bb5f455b95914d2afadeb187f6..13dd803c743808599285d8a58de5992fbadf7b72 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -117,34 +117,21 @@ # define CC_OUT(c) [_cc_ ## c] "=qm" #endif +#ifdef __KERNEL__ + +# include + /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ - .long (handler) - . 
; \ + .long type ; \ .popsection -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - -# define _ASM_EXTABLE_EX(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) - -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -152,33 +139,15 @@ .popsection #else -# define _EXPAND_EXTABLE_HANDLE(x) #x -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ - " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ + " .long " __stringify(type) " \n" \ " .popsection\n" -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - -# define _ASM_EXTABLE_EX(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) - -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif @@ -193,4 +162,17 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif +#define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT) + +#define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS) + +#define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY) + +#define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) + +#endif /* __KERNEL__ */ #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index adc6cc86b06201761a2a19e2fcdb306d631241be..fc18a4817f16dc56c46abf0323810789af508cac 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -40,4 +40,25 @@ int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); +#ifdef CONFIG_CPU_SUP_INTEL +extern void __init sld_setup(struct cpuinfo_x86 *c); +extern void switch_to_sld(unsigned long tifn); +extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); +extern bool handle_guest_split_lock(unsigned long ip); +extern void handle_bus_lock(struct pt_regs *regs); +#else +static inline void __init sld_setup(struct cpuinfo_x86 *c) {} +static inline void switch_to_sld(unsigned long tifn) {} +static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + return false; +} + +static inline bool handle_guest_split_lock(unsigned long ip) +{ + return false; +} + +static inline void handle_bus_lock(struct pt_regs *regs) {} +#endif #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 
f885908107f53a0ee2126445d4642cd990566d4a..0778e487c780a42453065f96144e1964207ad8b9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -272,6 +272,7 @@ #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -285,8 +286,11 @@ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ +#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ @@ -345,9 +349,11 @@ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ +#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ #define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ @@ -363,11 +369,19 @@ #define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ +#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ /* diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index a8f6c809d9b13c7ed533967994cfb675254850fb..3e6c75a6d070fd2ea8cb334a526c24524802e42b 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ 
-11,16 +11,6 @@ struct dev_archdata { #endif }; -#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) -struct dma_domain { - struct list_head node; - const struct dma_map_ops *dma_ops; - int domain_nr; -}; -void add_dma_domain(struct dma_domain *domain); -void del_dma_domain(struct dma_domain *domain); -#endif - struct pdev_archdata { }; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 8473c786632de978f60d002ed6f4e78da066b65d..6cec2044e9b125b4946981bcdf04eb1a62997c81 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -62,6 +62,12 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif +#ifdef CONFIG_INTEL_IOMMU_SVM +# define DISABLE_ENQCMD 0 +#else +# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) +#endif + #ifdef CONFIG_X86_SGX # define DISABLE_SGX 0 #else @@ -87,7 +93,8 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ + DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index c3aa4b5e49e2544f319b4afe5d2da700f9553f57..314f75d886d08a979dd63ebce1d150926959d202 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -28,6 +28,14 @@ enum e820_type { */ E820_TYPE_PRAM = 12, + /* + * Special-purpose memory is indicated to the system via the + * EFI_MEMORY_SP attribute. Define an e820 translation of this + * memory type for the purpose of reserving this range and + * marking it with the IORES_DESC_SOFT_RESERVED designation. 
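As a rough illustration of the translation described above (not part of this hunk): once an e820 type exists for EFI_MEMORY_SP ranges, turning it into an iomem resource is a plain table lookup. The helper name and the neighbouring cases below are assumptions for illustration only.

static unsigned long sketch_e820_type_to_iores_desc(enum e820_type type)
{
	switch (type) {
	case E820_TYPE_ACPI:		return IORES_DESC_ACPI_TABLES;
	case E820_TYPE_NVS:		return IORES_DESC_ACPI_NV_STORAGE;
	case E820_TYPE_PMEM:		return IORES_DESC_PERSISTENT_MEMORY;
	case E820_TYPE_PRAM:		return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
	case E820_TYPE_SOFT_RESERVED:	return IORES_DESC_SOFT_RESERVED; /* new type below */
	default:			return IORES_DESC_NONE;
	}
}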
+ */ + E820_TYPE_SOFT_RESERVED = 0xefffffff, + /* * Reserved RAM used by the kernel itself if * CONFIG_INTEL_TXT=y is enabled, memory of this type diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 43a82e59c59d2657d9e319361e7285a15da2fac7..d028e9acdf1c05964970d467a347aaeff987d875 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -140,7 +140,6 @@ extern void efi_delete_dummy_variable(void); extern void efi_switch_mm(struct mm_struct *mm); extern void efi_recover_from_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -extern void efi_reserve_boot_services(void); struct efi_setup_data { u64 fw_vendor; @@ -244,6 +243,8 @@ static inline bool efi_is_64bit(void) extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); +extern void efi_find_mirror(void); +extern void efi_reserve_boot_services(void); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} static inline bool efi_reboot_required(void) @@ -254,6 +255,20 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } +static inline void efi_find_mirror(void) +{ +} +static inline void efi_reserve_boot_services(void) +{ +} #endif /* CONFIG_EFI */ +#ifdef CONFIG_EFI_FAKE_MEMMAP +extern void __init efi_fake_memmap_early(void); +#else +static inline void efi_fake_memmap_early(void) +{ +} +#endif + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ae7f38bdce0574da9dbf11b39a8e43ea1b7b60d2..ca00132fe1224b6c2a0182482da90b5ff3ea9fb5 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -313,6 +313,7 @@ do { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* @@ -329,6 +330,7 @@ extern unsigned long task_size_32bit(void); extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); +extern unsigned long get_sigframe_size(void); #ifdef CONFIG_X86_32 @@ -350,6 +352,7 @@ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ @@ -358,6 +361,7 @@ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) #define AT_SYSINFO 32 diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index 1f0cbc52937ca571e6b517af01b988bbe218fff0..93f400eb728f87db79be82676401fd7d6d225c1b 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -1,12 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EXTABLE_H #define _ASM_X86_EXTABLE_H + +#include + /* - * The exception table consists of triples of addresses relative to the - * exception table entry itself. The first address is of an instruction - * that is allowed to fault, the second is the target at which the program - * should continue. The third is a handler function to deal with the fault - * caused by the instruction in the first field. + * The exception table consists of two addresses relative to the + * exception table entry itself and a type selector field. 
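A simplified sketch of how a type selector like this can be consumed; the EX_TYPE_* names come from the extable_fixup_types.h header added further down in this patch, while the handler functions here are placeholders for illustration, not the patch's actual fixup code.

static int sketch_fixup_exception(struct pt_regs *regs, int trapnr)
{
	const struct exception_table_entry *e = search_exception_tables(regs->ip);

	if (!e)
		return 0;	/* no fixup registered for this faulting IP */

	switch (e->type) {
	case EX_TYPE_DEFAULT:
		return handle_default(e, regs);		/* just jump to e->fixup */
	case EX_TYPE_UACCESS:
		return handle_uaccess(e, regs, trapnr);	/* faulting user access */
	case EX_TYPE_RDMSR:
	case EX_TYPE_WRMSR:
		return handle_msr(e, regs);		/* failed MSR access */
	default:
		return 0;
	}
}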
+ * + * The first address is of an instruction that is allowed to fault, the + * second is the target at which the program should continue. + * + * The type entry is used by fixup_exception() to select the handler to + * deal with the fault caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -15,7 +21,7 @@ */ struct exception_table_entry { - int insn, fixup, handler; + int insn, fixup, type; }; struct pt_regs; @@ -25,21 +31,27 @@ struct pt_regs; do { \ (a)->fixup = (b)->fixup + (delta); \ (b)->fixup = (tmp).fixup - (delta); \ - (a)->handler = (b)->handler + (delta); \ - (b)->handler = (tmp).handler - (delta); \ + (a)->type = (b)->type; \ + (b)->type = (tmp).type; \ } while (0) -enum handler_type { - EX_HANDLER_NONE, - EX_HANDLER_FAULT, - EX_HANDLER_UACCESS, - EX_HANDLER_OTHER -}; - extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); -extern enum handler_type ex_get_fault_handler_type(unsigned long ip); +extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); +#ifdef CONFIG_X86_MCE +extern void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); +#else +static inline void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { } +#endif + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs); +#else +static inline bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs) { return false; } +#endif + #endif diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h new file mode 100644 index 0000000000000000000000000000000000000000..409524d5d2eb1f4b14507f2c221bfe640f78c5cd --- /dev/null +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H +#define _ASM_X86_EXTABLE_FIXUP_TYPES_H + +#define EX_TYPE_NONE 0 +#define EX_TYPE_DEFAULT 1 +#define EX_TYPE_FAULT 2 +#define EX_TYPE_UACCESS 3 +#define EX_TYPE_COPY 4 +#define EX_TYPE_CLEAR_FS 5 +#define EX_TYPE_FPU_RESTORE 6 +#define EX_TYPE_WRMSR 7 +#define EX_TYPE_RDMSR 8 +#define EX_TYPE_BPF 9 + +#define EX_TYPE_WRMSR_IN_MCE 10 +#define EX_TYPE_RDMSR_IN_MCE 11 + +#define EX_TYPE_DEFAULT_MCE_SAFE 12 +#define EX_TYPE_FAULT_MCE_SAFE 13 + +#endif diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 06e767bca0c14f78f386dd8086fbf085dec547e6..18c0d76b5565828c36b7bf00fe328c6cbb4326a3 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -12,6 +12,8 @@ #define _ASM_X86_FPU_API_H #include +#include + /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods @@ -36,21 +38,36 @@ static inline void kernel_fpu_begin(void) } /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. * A context switch will (and softirq might) save CPU's FPU registers to - * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. 
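A minimal usage sketch of the rule stated above (illustrative only; the actual state manipulation is elided, only the bracketing matters):

static void sketch_edit_current_fpstate(void)
{
	fpregs_lock();
	/*
	 * Neither a context switch nor (on !RT) softirq kernel_fpu usage
	 * can save or clobber the registers here, so the live registers
	 * and current's in-memory FPU state stay consistent while they
	 * are edited.
	 */
	/* ... modify FPU registers or the in-memory state ... */
	fpregs_unlock();
}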
+ * + * local_bh_disable() protects against both preemption and soft interrupts + * on !RT kernels. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. + * + * Disabling preemption also serializes against kernel_fpu_begin(). */ static inline void fpregs_lock(void) { - preempt_disable(); - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); } static inline void fpregs_unlock(void) { - local_bh_enable(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } #ifdef CONFIG_X86_DEBUG_FPU @@ -73,4 +90,75 @@ extern void switch_fpu_return(void); */ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); +/* + * Tasks that are not using SVA have mm->pasid set to zero to note that they + * will not have the valid bit set in MSR_IA32_PASID while they are running. + */ +#define PASID_DISABLED 0 + +/* Trap handling */ +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern void fpu_sync_fpstate(struct fpu *fpu); +extern void fpu_reset_from_exception_fixup(void); + +/* Boot, hotplug and resume */ +extern void fpu__init_cpu(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); +#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif + +/* State tracking */ +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* Process cleanup */ +#ifdef CONFIG_X86_64 +extern void fpstate_free(struct fpu *fpu); +#else +static inline void fpstate_free(struct fpu *fpu) { } +#endif + +/* fpstate-related functions which are exported to KVM */ +extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); + +extern u64 xstate_get_guest_group_perm(void); + +/* KVM specific functions */ +extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); +extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); +extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); +extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures); + +#ifdef CONFIG_X86_64 +extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd); +extern void fpu_sync_guest_vmexit_xfd_state(void); +#else +static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { } +static inline void fpu_sync_guest_vmexit_xfd_state(void) { } +#endif + +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); + +static inline void fpstate_set_confidential(struct fpu_guest *gfpu) +{ + gfpu->fpstate->is_confidential = true; +} + +static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) +{ + return gfpu->fpstate->is_confidential; +} + +/* prctl */ +struct task_struct; +extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); + +extern void fpu_idle_fpregs(void); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h index 00eac7f1529b08db11d65830cfd14e38a495d75e..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -1,650 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes , May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_FPU_INTERNAL_H -#define _ASM_X86_FPU_INTERNAL_H - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * High level FPU state handling functions: - */ -extern void fpu__prepare_read(struct fpu *fpu); -extern void fpu__prepare_write(struct fpu *fpu); -extern void fpu__save(struct fpu *fpu); -extern int fpu__restore_sig(void __user *buf, int ia32_frame); -extern void fpu__drop(struct fpu *fpu); -extern int fpu__copy(struct task_struct *dst, struct task_struct *src); -extern void fpu__clear(struct fpu *fpu); -extern int fpu__exception_code(struct fpu *fpu, int trap_nr); -extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); - -/* - * Boot time FPU initialization functions: - */ -extern void fpu__init_cpu(void); -extern void fpu__init_system_xstate(void); -extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpu__init_check_bugs(void); -extern void fpu__resume_cpu(void); -extern u64 fpu__get_supported_xfeatures_mask(void); - -/* - * Debugging facility: - */ -#ifdef CONFIG_X86_DEBUG_FPU -# define WARN_ON_FPU(x) WARN_ON_ONCE(x) -#else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) -#endif - -/* - * FPU related CPU feature flag helper routines: - */ -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has(X86_FEATURE_FXSR); -} - -/* - * fpstate handling functions: - */ - -extern union fpregs_state init_fpstate; - -extern void fpstate_init(union fpregs_state *state); -#ifdef CONFIG_MATH_EMULATION -extern void fpstate_init_soft(struct swregs_state *soft); -#else -static inline void fpstate_init_soft(struct swregs_state *soft) {} -#endif - -static inline void fpstate_init_xstate(struct xregs_state *xsave) -{ - /* - * XRSTORS requires these bits set in xcomp_bv, or it will - * trigger #GP: - */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; -} - -static inline void fpstate_init_fxstate(struct fxregs_state *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} -extern void fpstate_sanitize_xstate(struct fpu *fpu); - -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - \ - might_fault(); \ - \ - asm volatile(ASM_STAC "\n" \ - "1:" #insn "\n\t" \ - "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn_err(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn(insn, output, input...) 
\ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ - : output : input) - -static inline int copy_fregs_to_user(struct fregs_state __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - -} - -static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void copy_kernel_to_fregs(struct fregs_state *fx) -{ - kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int copy_kernel_to_fregs_err(struct fregs_state *fx) -{ - return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int copy_user_to_fregs(struct fregs_state __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void copy_fxregs_to_kernel(struct fpu *fpu) -{ - if (IS_ENABLED(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); - else - asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); -} - -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -#define XSTATE_OP(op, st, lmask, hmask, err) \ - asm volatile("1:" op "\n\t" \ - "xor %[err], %[err]\n" \ - "2:\n\t" \ - ".pushsection .fixup,\"ax\"\n\t" \ - "3: movl $-2,%[err]\n\t" \ - "jmp 2b\n\t" \ - ".popsection\n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. - * - * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT - * supports modified optimization which is not supported by XSAVE. - * - * We use XSAVE as a fallback. - * - * The 661 label is defined in the ALTERNATIVE* macros as the address of the - * original instruction which gets replaced. We need to use it here as the - * address of the instruction where we might get an exception at. 
- */ -#define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ - XSAVEOPT, X86_FEATURE_XSAVEOPT, \ - XSAVES, X86_FEATURE_XSAVES) \ - "\n" \ - "xor %[err], %[err]\n" \ - "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact - * XSAVE area format. - */ -#define XSTATE_XRESTORE(st, lmask, hmask) \ - asm volatile(ALTERNATIVE(XRSTOR, \ - XRSTORS, X86_FEATURE_XSAVES) \ - "\n" \ - "3:\n" \ - _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ - : \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - else - XSTATE_OP(XSAVE, xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - /* - * We should never fault when copying from a kernel buffer, and the FPU - * state we set at boot time should be valid. - */ - WARN_ON_FPU(err); -} - -/* - * Save processor xstate to xsave area. - */ -static inline void copy_xregs_to_kernel(struct xregs_state *xstate) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON_FPU(!alternatives_patched); - - XSTATE_XSAVE(xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - -/* - * Restore processor xstate from xsave area. - */ -static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - - XSTATE_XRESTORE(xstate, lmask, hmask); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int copy_xregs_to_user(struct xregs_state __user *buf) -{ - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->header, sizeof(buf->header)); - if (unlikely(err)) - return -EFAULT; - - stac(); - XSTATE_OP(XSAVE, buf, -1, -1, err); - clac(); - - return err; -} - -/* - * Restore xstate from user space xsave area. 
- */ -static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) -{ - struct xregs_state *xstate = ((__force struct xregs_state *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - stac(); - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from kernel space xsave area, return an error code instead of - * an exception. - */ -static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - return err; -} - -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact and we can - * keep registers active. - * - * The legacy FNSAVE instruction cleared all FPU state - * unconditionally, so registers are essentially destroyed. - * Modern FPU state can be kept in registers, if there are - * no pending FP exceptions. - */ -static inline int copy_fpregs_to_fpstate(struct fpu *fpu) -{ - if (likely(use_xsave())) { - copy_xregs_to_kernel(&fpu->state.xsave); - - /* - * AVX512 state is tracked here because its use is - * known to slow the max clock speed of the core. - */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) - fpu->avx512_timestamp = jiffies; - return 1; - } - - if (likely(use_fxsr())) { - copy_fxregs_to_kernel(fpu); - return 1; - } - - /* - * Legacy FPU register saving, FNSAVE always clears FPU registers, - * so we have to mark them inactive: - */ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - - return 0; -} - -static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) -{ - if (use_xsave()) { - copy_kernel_to_xregs(&fpstate->xsave, mask); - } else { - if (use_fxsr()) - copy_kernel_to_fxregs(&fpstate->fxsave); - else - copy_kernel_to_fregs(&fpstate->fsave); - } -} - -static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) -{ - /* - * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is - * pending. Clear the x87 state here by setting it to fixed values. - * "m" is a random variable that should be in L1. - */ - if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { - asm volatile( - "fnclex\n\t" - "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (fpstate)); - } - - __copy_kernel_to_fpregs(fpstate, -1); -} - -extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); - -/* - * FPU context switch related helper methods: - */ - -DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); - -/* - * The in-register FPU state for an FPU context on a CPU is assumed to be - * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx - * matches the FPU. - * - * If the FPU register state is valid, the kernel can skip restoring the - * FPU state from memory. - * - * Any code that clobbers the FPU registers or updates the in-memory - * FPU state for a task MUST let the rest of the kernel know that the - * FPU registers are no longer valid for this task. - * - * Either one of these invalidation functions is enough. Invalidate - * a resource you control: CPU if using the CPU for something else - * (with preemption disabled), FPU for the current task, or a task that - * is prevented from running by the current task. 
- */ -static inline void __cpu_invalidate_fpregs_state(void) -{ - __this_cpu_write(fpu_fpregs_owner_ctx, NULL); -} - -static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) -{ - fpu->last_cpu = -1; -} - -static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - -/* - * These generally need preemption protection to work, - * do try to avoid using these on their own: - */ -static inline void fpregs_deactivate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - trace_x86_fpu_regs_deactivated(fpu); -} - -static inline void fpregs_activate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, fpu); - trace_x86_fpu_regs_activated(fpu); -} - -/* - * Internal helper, do not use directly. Use switch_fpu_return() instead. - */ -static inline void __fpregs_load_activate(void) -{ - struct fpu *fpu = &current->thread.fpu; - int cpu = smp_processor_id(); - - if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) - return; - - if (!fpregs_state_valid(fpu, cpu)) { - copy_kernel_to_fpregs(&fpu->state); - fpregs_activate(fpu); - fpu->last_cpu = cpu; - } - clear_thread_flag(TIF_NEED_FPU_LOAD); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state. - * This is done within the context of the old process. - * - * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state - * will get loaded on return to userspace, or when the kernel needs it. - * - * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers - * are saved in the current thread's FPU register state. - * - * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not - * hold current()'s FPU registers. It is required to load the - * registers before returning to userland or using the content - * otherwise. - * - * The FPU context is only stored/restored for a user task and - * PF_KTHREAD is used to distinguish between kernel and user threads. - */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) -{ - if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { - if (!copy_fpregs_to_fpstate(old_fpu)) - old_fpu->last_cpu = -1; - else - old_fpu->last_cpu = cpu; - - /* But leave fpu_fpregs_owner_ctx! */ - trace_x86_fpu_regs_deactivated(old_fpu); - } -} - -/* - * Misc helper functions: - */ - -/* - * Load PKRU from the FPU context if available. Delay loading of the - * complete FPU state until the return to userland. - */ -static inline void switch_fpu_finish(struct fpu *new_fpu) -{ - u32 pkru_val = init_pkru_value; - struct pkru_state *pk; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return; - - set_thread_flag(TIF_NEED_FPU_LOAD); - - if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) - return; - - /* - * PKRU state is switched eagerly because it needs to be valid before we - * return to userland e.g. for a copy_to_user() operation.
- */ - if (current->mm) { - pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); - if (pk) - pkru_val = pk->pkru; - } - __write_pkru(pkru_val); -} - -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ - asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); -} - -#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h index d5bdffb9d27fedefbec05a8c0adafa592ff359d2..4f928d6a367b8f14c6f399529ad758d4e8bcea14 100644 --- a/arch/x86/include/asm/fpu/regset.h +++ b/arch/x86/include/asm/fpu/regset.h @@ -8,8 +8,8 @@ #include extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; +extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, xstateregs_set; diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h new file mode 100644 index 0000000000000000000000000000000000000000..99a8820e8cc4cf63fb4b1fc65b6aff72fc04d15f --- /dev/null +++ b/arch/x86/include/asm/fpu/sched.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_SCHED_H +#define _ASM_X86_FPU_SCHED_H + +#include + +#include +#include + +#include + +extern void save_fpregs_to_fpstate(struct fpu *fpu); +extern void fpu__drop(struct fpu *fpu); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags); +extern void fpu_flush_thread(void); + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state. + * This is done within the context of the old process. + * + * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state + * will get loaded on return to userspace, or when the kernel needs it. + * + * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers + * are saved in the current thread's FPU register state. + * + * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not + * hold current()'s FPU registers. It is required to load the + * registers before returning to userland or using the content + * otherwise. + * + * The FPU context is only stored/restored for a user task and + * PF_KTHREAD is used to distinguish between kernel and user threads. + */ +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU) && + !(current->flags & PF_KTHREAD)) { + save_fpregs_to_fpstate(old_fpu); + /* + * The save operation preserved register state, so the + * fpu_fpregs_owner_ctx is still @old_fpu. Store the + * current CPU number in @old_fpu, so the next return + * to user space can avoid the FPU register restore + * when it returns on the same CPU and still owns the + * context. + */ + old_fpu->last_cpu = cpu; + + trace_x86_fpu_regs_deactivated(old_fpu); + } +} + +/* + * Delay loading of the complete FPU state until the return to userland. + * PKRU is handled separately.
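A condensed sketch of how the two stages described above slot into a context switch; the caller shown here is hypothetical and the real __switch_to() does considerably more.

static void sketch_context_switch(struct task_struct *prev, int cpu)
{
	/* Stage 1: still running as @prev; save its registers to memory. */
	switch_fpu_prepare(&prev->thread.fpu, cpu);

	/* ... switch stacks, segments, TLS, etc. ... */

	/*
	 * Stage 2: do not touch the FPU for the incoming task now; just
	 * flag that its registers must be loaded before the next return
	 * to user space (TIF_NEED_FPU_LOAD).
	 */
	switch_fpu_finish();
}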
+ */ +static inline void switch_fpu_finish(void) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU)) + set_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 7fb516b6893a8fea46895275c8496b484446e333..22b0273a8bf1e0b0446d0a272fa5928b778ac49a 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -5,6 +5,11 @@ #ifndef _ASM_X86_FPU_SIGNAL_H #define _ASM_X86_FPU_SIGNAL_H +#include +#include + +#include + #ifdef CONFIG_X86_64 # include # include @@ -29,6 +34,14 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size); -extern void fpu__init_prepare_fx_sw_frame(void); +unsigned long fpu__get_fpstate_size(void); + +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern void fpu__clear_user_states(struct fpu *fpu); +extern bool fpu__restore_sig(void __user *buf, int ia32_frame); + +extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask); + +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f098f6cab94bfc215bb7c820bf3e77c400604f6e..9a1cc1761817a299ba8588bb9823cb519cf07c6f 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -114,6 +114,15 @@ enum xfeature { XFEATURE_Hi16_ZMM, XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, + XFEATURE_PASID, + XFEATURE_RSRVD_COMP_11, + XFEATURE_RSRVD_COMP_12, + XFEATURE_RSRVD_COMP_13, + XFEATURE_RSRVD_COMP_14, + XFEATURE_LBR, + XFEATURE_RSRVD_COMP_16, + XFEATURE_XTILE_CFG, + XFEATURE_XTILE_DATA, XFEATURE_MAX, }; @@ -128,12 +137,23 @@ enum xfeature { #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) +#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) +#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) +#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ | XFEATURE_MASK_ZMM_Hi256 \ | XFEATURE_MASK_Hi16_ZMM) +#ifdef CONFIG_X86_64 +# define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA \ + | XFEATURE_MASK_XTILE_CFG) +#else +# define XFEATURE_MASK_XTILE (0) +#endif + #define FIRST_EXTENDED_XFEATURE XFEATURE_YMM struct reg_128_bit { @@ -145,6 +165,9 @@ struct reg_256_bit { struct reg_512_bit { u8 regbytes[512/8]; }; +struct reg_1024_byte { + u8 regbytes[1024]; +}; /* * State component 2: @@ -229,6 +252,51 @@ struct pkru_state { u32 pad; } __packed; +/* + * State component 15: Architectural LBR configuration state. + * The size of Arch LBR state depends on the number of LBRs (lbr_depth). + */ + +struct lbr_entry { + u64 from; + u64 to; + u64 info; +}; + +struct arch_lbr_state { + u64 lbr_ctl; + u64 lbr_depth; + u64 ler_from; + u64 ler_to; + u64 ler_info; + struct lbr_entry entries[]; +} __packed; + +/* + * State component 17: 64-byte tile configuration register. + */ +struct xtile_cfg { + u64 tcfg[8]; +} __packed; + +/* + * State component 18: 1KB tile data register. + * Each register represents 16 64-byte rows of the matrix + * data. But the number of registers depends on the actual + * implementation. 
+ */ +struct xtile_data { + struct reg_1024_byte tmm; +} __packed; + +/* + * State component 10 is supervisor state used for context-switching the + * PASID state. + */ +struct ia32_pasid_state { + u64 pasid; +} __packed; + struct xstate_header { u64 xfeatures; u64 xcomp_bv; @@ -273,6 +341,93 @@ union fpregs_state { u8 __padding[PAGE_SIZE]; }; +struct fpstate { + /* @kernel_size: The size of the kernel register image */ + unsigned int size; + + /* @user_size: The size in non-compacted UABI format */ + unsigned int user_size; + + /* @xfeatures: xfeatures for which the storage is sized */ + u64 xfeatures; + + /* @user_xfeatures: xfeatures valid in UABI buffers */ + u64 user_xfeatures; + + /* @xfd: xfeatures disabled to trap userspace use. */ + u64 xfd; + + /* @is_valloc: Indicator for dynamically allocated state */ + unsigned int is_valloc : 1; + + /* @is_guest: Indicator for guest state (KVM) */ + unsigned int is_guest : 1; + + /* + * @is_confidential: Indicator for KVM confidential mode. + * The FPU registers are restored by the + * vmentry firmware from encrypted guest + * memory. On vmexit the FPU registers are + * saved by firmware to encrypted guest memory + * and the registers are scrubbed before + * returning to the host. So there is no + * content which is worth saving and restoring. + * The fpstate has to be there so that + * preemption and softirq FPU usage works + * without special casing. + */ + unsigned int is_confidential : 1; + + /* @in_use: State is in use */ + unsigned int in_use : 1; + + /* @regs: The register state union for all supported formats */ + union fpregs_state regs; + + /* @regs is dynamically sized! Don't add anything after @regs! */ +} __aligned(64); + +#define FPU_GUEST_PERM_LOCKED BIT_ULL(63) + +struct fpu_state_perm { + /* + * @__state_perm: + * + * This bitmap indicates the permission for state components, which + * are available to a thread group. The permission prctl() sets the + * enabled state bits in thread_group_leader()->thread.fpu. + * + * All run time operations use the per thread information in the + * currently active fpu.fpstate which contains the xfeature masks + * and sizes for kernel and user space. + * + * This master permission field is only to be used when + * task.fpu.fpstate based checks fail to validate whether the task + * is allowed to expand its xfeatures set, which requires + * allocating a larger sized fpstate buffer. + * + * Do not access this field directly. Use the provided helper + * function. Unlocked access is possible for quick checks. + */ + u64 __state_perm; + + /* + * @__state_size: + * + * The size required for @__state_perm. Only valid to access + * with sighand locked. + */ + unsigned int __state_size; + + /* + * @__user_state_size: + * + * The size required for @__state_perm user part. Only valid to + * access with sighand locked. + */ + unsigned int __user_state_size; +}; + /* * Highest level per task FPU state data structure that * contains the FPU register state plus various FPU @@ -301,19 +456,130 @@ struct fpu { unsigned long avx512_timestamp; /* - * @state: + * @fpstate: * - * In-memory copy of all FPU registers that we save/restore - * over context switches. If the task is using the FPU then - * the registers in the FPU are more recent than this state - * copy. If the task context-switches away then they get - * saved here and represent the FPU state. + * Pointer to the active struct fpstate. Initialized to + * point at @__fpstate below.
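The permission prctl() mentioned above is the user-space side of this scheme; a sketch of how a process might request the dynamically-enabled AMX tile data state before first use (the ARCH_REQ_XCOMP_PERM value and the raw syscall usage are assumptions taken from the related UAPI, not shown in this hunk).

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_REQ_XCOMP_PERM	0x1023	/* assumed arch_prctl() request */
#define XFEATURE_XTILE_DATA	18	/* matches the enum added above */

int sketch_request_amx(void)
{
	/* Grants XTILE_DATA permission to the whole thread group. */
	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA)) {
		perror("ARCH_REQ_XCOMP_PERM");
		return -1;
	}
	return 0;
}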
*/ - union fpregs_state state; + struct fpstate *fpstate; + /* - * WARNING: 'state' is dynamically-sized. Do not put + * @__task_fpstate: + * + * Pointer to an inactive struct fpstate. Initialized to NULL. Is + * used only for KVM support to swap out the regular task fpstate. + */ + struct fpstate *__task_fpstate; + + /* + * @perm: + * + * Permission related information + */ + struct fpu_state_perm perm; + + /* + * @guest_perm: + * + * Permission related information for guest pseudo FPUs + */ + struct fpu_state_perm guest_perm; + + /* + * @__fpstate: + * + * Initial in-memory storage for FPU registers which are saved in + * context switch and when the kernel uses the FPU. The registers + * are restored from this storage on return to user space if they + * no longer contain the task's FPU register state. + */ + struct fpstate __fpstate; + /* + * WARNING: '__fpstate' is dynamically-sized. Do not put * anything after it here. */ }; +/* + * Guest pseudo FPU container + */ +struct fpu_guest { + /* + * @xfeatures: xfeature bitmap of features which are + * currently enabled for the guest vCPU. + */ + u64 xfeatures; + + /* + * @perm: xfeature bitmap of features which are + * permitted to be enabled for the guest + * vCPU. + */ + u64 perm; + + /* + * @xfd_err: Save the guest value. + */ + u64 xfd_err; + + /* + * @uabi_size: Size required for save/restore + */ + unsigned int uabi_size; + + /* + * @fpstate: Pointer to the allocated guest fpstate + */ + struct fpstate *fpstate; +}; + +/* + * FPU state configuration data. Initialized at boot time. Read only after init. + */ +struct fpu_state_config { + /* + * @max_size: + * + * The maximum size of the register state buffer. Includes all + * supported features except independent managed features. + */ + unsigned int max_size; + + /* + * @default_size: + * + * The default size of the register state buffer. Includes all + * supported features except independent managed features and + * features which have to be requested by user space before usage. + */ + unsigned int default_size; + + /* + * @max_features: + * + * The maximum supported features bitmap. Does not include + * independent managed features. + */ + u64 max_features; + + /* + * @default_features: + * + * The default supported features bitmap. Does not include + * independent managed features and features which have to + * be requested by user space before usage. + */ + u64 default_features; + /* + * @legacy_features: + * + * Features which can be reported back to user space + * even without XSAVE support, i.e.
legacy features FP + SSE + */ + u64 legacy_features; +}; + +/* FPU state configuration information */ +extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; + #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h new file mode 100644 index 0000000000000000000000000000000000000000..9656a5bc6feae4b096c93fcdf1e26e806fa7b5d2 --- /dev/null +++ b/arch/x86/include/asm/fpu/xcr.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_XCR_H +#define _ASM_X86_FPU_XCR_H + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 +#define XCR_XFEATURE_IN_USE_MASK 0x00000001 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +/* + * Return a mask of xfeatures which are currently being tracked + * by the processor as being in the initial configuration. + * + * Callers should check X86_FEATURE_XGETBV1. + */ +static inline u64 xfeatures_in_use(void) +{ + return xgetbv(XCR_XFEATURE_IN_USE_MASK); +} + +#endif /* _ASM_X86_FPU_XCR_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c6136d79f8c07f90148a22ed3d8bae7a1e0d0b46..6c56b3eedd6060c203567628a56e002e50699479 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -6,6 +6,7 @@ #include #include +#include #include /* Bit 63 of XCR0 is reserved for future expansion */ @@ -13,6 +14,8 @@ #define XSTATE_CPUID 0x0000000d +#define TILE_CPUID 0x0000001d + #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 @@ -21,41 +24,110 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) -/* Supervisor features */ -#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) +#define XSAVE_ALIGNMENT 64 -/* All currently supported features */ -#define XCNTXT_MASK (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE | \ - XFEATURE_MASK_YMM | \ - XFEATURE_MASK_OPMASK | \ - XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM | \ - XFEATURE_MASK_PKRU | \ - XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) +/* All currently supported user features */ +#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_XTILE) -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif +/* + * Features which are restored when returning to user space. + * PKRU is not restored on return to user space because PKRU + * is switched eagerly in switch_to() and flush_thread() + */ +#define XFEATURE_MASK_USER_RESTORE \ + (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) + +/* Features which are dynamically enabled for a process on request */ +#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA + +/* All currently supported supervisor features */ +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) + +/* + * A supervisor state component may not always contain valuable information, + * and its size may be huge. Saving/restoring such supervisor state components + * at each context switch can cause high CPU and space overhead, which should + * be avoided. 
Such supervisor state components should only be saved/restored + * on demand. The on-demand supervisor features are set in this mask. + * + * Unlike the existing supported supervisor features, an independent supervisor + * feature does not allocate a buffer in task->fpu, and the corresponding + * supervisor state component cannot be saved/restored at each context switch. + * + * To support an independent supervisor feature, a developer should follow the + * dos and don'ts as below: + * - Do dynamically allocate a buffer for the supervisor state component. + * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the + * state component to/from the buffer. + * - Don't set the bit corresponding to the independent supervisor feature in + * IA32_XSS at run time, since it has been set at boot time. + */ +#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR) + +/* + * Unsupported supervisor features. When a supervisor feature in this mask is + * supported in the future, move it to the supported supervisor feature mask. + */ +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) + +/* All supervisor states including supported and unsupported states. */ +#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ + XFEATURE_MASK_INDEPENDENT | \ + XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) + +/* + * The feature mask required to restore FPU state: + * - All user states which are not eagerly switched in switch_to()/exec() + * - The suporvisor states + */ +#define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ + XFEATURE_MASK_SUPERVISOR_SUPPORTED) + +/* + * Features in this mask have space allocated in the signal frame, but may not + * have that space initialized when the feature is in its init state. + */ +#define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_USER_DYNAMIC) -extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); -void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); -int using_compacted_format(void); -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); -int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); - -/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -extern int validate_xstate_header(const struct xstate_header *hdr); +int xfeature_size(int xfeature_nr); + +void xsaves(struct xregs_state *xsave, u64 mask); +void xrstors(struct xregs_state *xsave, u64 mask); + +int xfd_enable_feature(u64 xfd_err); + +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +#endif + +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); + +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return static_branch_unlikely(&__fpu_state_size_dynamic); +} +#else +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return false; +} +#endif #endif diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 6352dee37cdac73dbe4519139fab7f02c4179923..ab9f3dd87c805bda9ec6d107a901b4c4dac7c4fd 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,17 +74,6 @@ extern void 
hpet_disable(void); extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); -struct irq_data; -struct hpet_channel; -struct irq_domain; - -extern void hpet_msi_unmask(struct irq_data *data); -extern void hpet_msi_mask(struct irq_data *data); -extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg); -extern struct irq_domain *hpet_create_irq_domain(int hpet_id); -extern int hpet_assign_irq(struct irq_domain *domain, - struct hpet_channel *hc, int dev_num); - #ifdef CONFIG_HPET_EMULATE_RTC #include diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 4154bc5f6a4ed075952e60d6042889d964649a1c..b466be8673b865caf1859758972f5c8a3753661c 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -58,61 +58,54 @@ struct msi_desc; enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_IOAPIC = 1, X86_IRQ_ALLOC_TYPE_HPET, - X86_IRQ_ALLOC_TYPE_MSI, - X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_PCI_MSI, + X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, X86_IRQ_ALLOC_TYPE_UV, + X86_IRQ_ALLOC_TYPE_DEV_MSI, }; +struct ioapic_alloc_info { + int pin; + int node; + u32 is_level : 1; + u32 active_low : 1; + u32 valid : 1; +}; + +struct uv_alloc_info { + int limit; + int blade; + unsigned long offset; + char *name; + +}; + +/** + * irq_alloc_info - X86 specific interrupt allocation info + * @type: X86 specific allocation type + * @flags: Flags for allocation tweaks + * @devid: Device ID for allocations + * @hwirq: Associated hw interrupt number in the domain + * @mask: CPU mask for vector allocation + * @desc: Pointer to msi descriptor + * @data: Allocation specific data + * + * @ioapic: IOAPIC specific allocation data + * @uv: UV specific allocation data +*/ struct irq_alloc_info { enum irq_alloc_type type; u32 flags; - const struct cpumask *mask; /* CPU mask for vector allocation */ + u32 devid; + irq_hw_number_t hwirq; + const struct cpumask *mask; + struct msi_desc *desc; + void *data; + union { - int unused; -#ifdef CONFIG_HPET_TIMER - struct { - int hpet_id; - int hpet_index; - void *hpet_data; - }; -#endif -#ifdef CONFIG_PCI_MSI - struct { - struct pci_dev *msi_dev; - irq_hw_number_t msi_hwirq; - }; -#endif -#ifdef CONFIG_X86_IO_APIC - struct { - int ioapic_id; - int ioapic_pin; - int ioapic_node; - u32 ioapic_trigger : 1; - u32 ioapic_polarity : 1; - u32 ioapic_valid : 1; - struct IO_APIC_route_entry *ioapic_entry; - }; -#endif -#ifdef CONFIG_DMAR_TABLE - struct { - int dmar_id; - void *dmar_data; - }; -#endif -#ifdef CONFIG_X86_UV - struct { - int uv_limit; - int uv_blade; - unsigned long uv_offset; - char *uv_name; - }; -#endif -#if IS_ENABLED(CONFIG_VMD) - struct { - struct msi_desc *desc; - }; -#endif + struct ioapic_alloc_info ioapic; + struct uv_alloc_info uv; }; }; diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index c606c0b7073824ac70f94e1d2fe24ff91320326e..a113c16f7efdf5df2bbe34beda4d96a89d78e2f3 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -86,6 +86,15 @@ #define INTEL_FAM6_COMETLAKE 0xA5 #define INTEL_FAM6_COMETLAKE_L 0xA6 +#define INTEL_FAM6_ROCKETLAKE 0xA7 + +/* Hybrid Core/Atom Processors */ + +#define INTEL_FAM6_LAKEFIELD 0x8A +#define INTEL_FAM6_ALDERLAKE 0x97 + +#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ @@ -111,6 +120,7 @@ #define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ #define 
INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ +#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 6bed97ff6db2da3a7dc94fdaab10420d7cdca73b..67412283c0cd14892852993e812ce76dd55960f0 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -404,4 +404,29 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, extern bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size); +/** + * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units + * @dst: destination, in MMIO space (must be 512-bit aligned) + * @src: source + * @count: number of 512 bits quantities to submit + * + * Submit data from kernel space to MMIO space, in units of 512 bits at a + * time. Order of access is not guaranteed, nor is a memory barrier + * performed afterwards. + * + * Warning: Do not use this helper unless your driver has checked that the CPU + * instruction is supported on the platform. + */ +static inline void iosubmit_cmds512(void __iomem *dst, const void *src, + size_t count) +{ + const u8 *from = src; + const u8 *end = from + count * 64; + + while (from < end) { + movdir64b(dst, from); + from += 64; + } +} + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index fd20a2334885c491c2e4d3f19f8ebc1c29007045..73da644b2f0df6d9417bdb3a8f06d71c7d0e6379 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -13,15 +13,6 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ -/* I/O Unit Redirection Table */ -#define IO_APIC_REDIR_VECTOR_MASK 0x000FF -#define IO_APIC_REDIR_DEST_LOGICAL 0x00800 -#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 -#define IO_APIC_REDIR_SEND_PENDING (1 << 12) -#define IO_APIC_REDIR_REMOTE_IRR (1 << 14) -#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) -#define IO_APIC_REDIR_MASKED (1 << 16) - /* * The structure of the IO-APIC: */ @@ -65,53 +56,39 @@ union IO_APIC_reg_03 { }; struct IO_APIC_route_entry { - __u32 vector : 8, - delivery_mode : 3, /* 000: FIXED - * 001: lowest prio - * 111: ExtINT - */ - dest_mode : 1, /* 0: physical, 1: logical */ - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, /* 0: edge, 1: level */ - mask : 1, /* 0: enabled, 1: disabled */ - __reserved_2 : 15; - - __u32 __reserved_3 : 24, - dest : 8; -} __attribute__ ((packed)); - -struct IR_IO_APIC_route_entry { - __u64 vector : 8, - zero : 3, - index2 : 1, - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, - mask : 1, - reserved : 31, - format : 1, - index : 15; + union { + struct { + u64 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + delivery_status : 1, + active_low : 1, + irr : 1, + is_level : 1, + masked : 1, + reserved_0 : 15, + reserved_1 : 24, + destid_0_7 : 8; + }; + struct { + u64 ir_shared_0 : 8, + ir_zero : 3, + ir_index_15 : 1, + ir_shared_1 : 5, + ir_reserved_0 : 31, + ir_format : 1, + ir_index_0_14 : 15; + }; + struct { + u64 w1 : 32, + w2 : 32; + }; + }; } __attribute__ ((packed)); struct irq_alloc_info; struct ioapic_domain_cfg; -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#define IOAPIC_MASKED 1 -#define IOAPIC_UNMASKED 0 - -#define IOAPIC_POL_HIGH 0 -#define IOAPIC_POL_LOW 1 - -#define IOAPIC_DEST_MODE_PHYSICAL 0 -#define IOAPIC_DEST_MODE_LOGICAL 1 - #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 
b91623d521d9f0ffeb23e825221353cba05e0ced..bf1ed2ddc74bd605408eaea6ffc9ab27b765bdfd 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -2,10 +2,28 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H +#include + +#include + extern int force_iommu, no_iommu; extern int iommu_detected; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) +static inline int __init +arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) +{ + u64 start = rmrr->base_address; + u64 end = rmrr->end_address + 1; + + if (e820__mapped_all(start, end, E820_TYPE_RESERVED)) + return 0; + + pr_err(FW_BUG "No firmware reserved region can cover this RMRR [%#018Lx-%#018Lx], contact BIOS vendor for fixes\n", + start, end - 1); + return -EINVAL; +} + #endif /* _ASM_X86_IOMMU_H */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 4bc985f1e2e4d2f5173a2ee0f7cbb33e0ad9e4a7..7cc49432187fb2975dd305ca50d586ae5d6cd34d 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,11 +44,6 @@ extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -extern struct irq_domain * -irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); -extern struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info); - /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ extern struct irq_domain * arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); @@ -73,17 +68,5 @@ static inline void panic_if_irq_remap(const char *msg) { } -static inline struct irq_domain * -irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) -{ - return NULL; -} - -static inline struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info) -{ - return NULL; -} - #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index c066ffae222b769996b78c13ee5d063dc7c5c544..125c23b7bad394b61c06f1266330149e3a975e0f 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -12,6 +12,9 @@ enum { X86_IRQ_ALLOC_LEGACY = 0x2, }; +extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec); +extern int x86_fwspec_is_hpet(struct irq_fwspec *fwspec); + extern struct irq_domain *x86_vector_domain; extern void init_irq_alloc_info(struct irq_alloc_info *info, @@ -51,9 +54,13 @@ extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); #endif /* CONFIG_X86_IO_APIC */ #ifdef CONFIG_PCI_MSI -extern void arch_init_msi_domain(struct irq_domain *domain); +void x86_create_pci_msi_domain(void); +struct irq_domain *native_create_pci_msi_domain(void); +extern struct irq_domain *x86_pci_msi_default_domain; #else -static inline void arch_init_msi_domain(struct irq_domain *domain) { } +static inline void x86_create_pci_msi_domain(void) { } +#define native_create_pci_msi_domain NULL +#define x86_pci_msi_default_domain NULL #endif #endif diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 77cf6c11f66bd86341ef58f5503d68b24e3b6833..c2a5e44ae33f7695380b9a574986828bfb5dfeaf 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -166,7 +166,10 @@ struct x86_emulate_ops { int (*write_emulated)(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception 
*fault); + struct x86_exception *fault, + bool non_posted); + + int (*np_write_complete)(struct x86_emulate_ctxt *ctxt, bool *retry); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an @@ -233,6 +236,7 @@ struct x86_emulate_ops { }; typedef u32 __attribute__((vector_size(16))) sse128_t; +typedef u32 __attribute__((vector_size(64))) sz512_t; /* Type, address-of, and value of an instruction's operand. */ struct operand { @@ -257,6 +261,7 @@ struct operand { u64 val64; char valptr[sizeof(sse128_t)]; sse128_t vec_val; + char valptr512[sizeof(sz512_t)]; u64 mm_val; void *data; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c52b7073a5ab57a3a4597e9fee248be559f97174..c3948b936db79bf53b2f62c65b40d9196516813d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -37,9 +37,21 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS -#define KVM_MAX_VCPUS 288 +#define KVM_MAX_VCPUS 1024 #define KVM_SOFT_MAX_VCPUS 240 -#define KVM_MAX_VCPU_ID 1023 + +/* + * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs + * might be larger than the actual number of VCPUs because the + * APIC ID encodes CPU topology information. + * + * In the worst case, we'll need less than one extra bit for the + * Core ID, and less than one extra bit for the Package (Die) ID, + * so ratio of 4 should be enough. + */ +#define KVM_VCPU_ID_RATIO 4 +#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) + #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -64,7 +76,7 @@ #define KVM_REQ_PMI KVM_ARCH_REQ(11) #define KVM_REQ_SMI KVM_ARCH_REQ(12) #define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) -#define KVM_REQ_MCLOCK_INPROGRESS \ +#define KVM_REQ_BLOCK_VMENTRY \ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_SCAN_IOAPIC \ KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) @@ -177,6 +189,11 @@ enum { VCPU_SREG_LDTR, }; +enum exit_fastpath_completion { + EXIT_FASTPATH_NONE, + EXIT_FASTPATH_SKIP_EMUL_INS, +}; + #include #define KVM_NR_MEM_OBJS 40 @@ -566,6 +583,8 @@ struct kvm_vcpu_arch { u64 smbase; u64 smi_count; bool tpr_access_reporting; + bool xfd_no_write_intercept; + bool xsaves_enabled; u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; @@ -613,15 +632,14 @@ struct kvm_vcpu_arch { * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The - * "guest_fpu" state here contains the guest FPU context, with the + * "guest_fpstate" state here contains the guest FPU context, with the * host PRKU bits. */ - struct fpu *user_fpu; - struct fpu *guest_fpu; + struct fpu_guest guest_fpu; u64 xcr0; u64 guest_supported_xcr0; - u32 guest_xstate_size; + u64 guest_supported_xss; struct kvm_pio_request pio; void *pio_data; @@ -880,6 +898,7 @@ struct kvm_arch { atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map *apic_map; + bool apic_map_dirty; bool apic_access_page_done; @@ -939,6 +958,12 @@ struct kvm_arch { struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. 
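For a quick check of the arithmetic implied by the constants above: KVM_MAX_VCPU_IDS = KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO = 1024 * 4 = 4096, so the per-VM APIC ID space grows from the previous fixed 1024 values (KVM_MAX_VCPU_ID 1023) to 4096, even though at most 1024 vCPUs can be created.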
+ */ + u32 max_vcpu_ids; }; struct kvm_vm_stat { @@ -1023,6 +1048,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); @@ -1073,8 +1099,9 @@ struct kvm_x86_ops { */ void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); - void (*run)(struct kvm_vcpu *vcpu); - int (*handle_exit)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); @@ -1107,11 +1134,10 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); - bool (*invpcid_supported)(void); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + void (*set_supported_cpuid)(struct kvm_cpuid_entry2 *entry); bool (*has_wbinvd_exit)(void); @@ -1126,10 +1152,7 @@ struct kvm_x86_ops { enum x86_intercept_stage stage); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); - bool (*xsaves_supported)(void); - bool (*umip_emulated)(void); bool (*pt_supported)(void); - bool (*pku_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); @@ -1214,7 +1237,7 @@ struct kvm_x86_ops { uint16_t *vmcs_version); uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); @@ -1228,7 +1251,6 @@ struct kvm_arch_async_pf { }; extern struct kvm_x86_ops *kvm_x86_ops; -extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) @@ -1402,6 +1424,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); @@ -1639,6 +1663,9 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } +void kvm_make_block_vmentry_request(struct kvm *kvm); +void kvm_clear_block_vmentry_request(struct kvm *kvm); + #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b0b54e18aa8bdfcc315646fc19337c48fbf2d98c..124b27c2cf69e082eff00bf87f823494922449e5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,7 @@ #define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */ #define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38) #define MCI_STATUS_CEC(c) (((c) & 
MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT) +#define MCI_STATUS_MSCOD(m) (((m) >> 16) & 0xffff) /* AMD-specific bits */ #define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ @@ -172,7 +173,7 @@ enum mce_notifier_prios { MCE_PRIO_EDAC, MCE_PRIO_NFIT, MCE_PRIO_EXTLOG, - MCE_PRIO_SRAO, + MCE_PRIO_UC, MCE_PRIO_EARLY, MCE_PRIO_CEC }; diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 848ce43b9040d6876f95d808b0a4a2e413949185..6c36956452ca686a24c30cd11b52cc053fb982e4 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -36,6 +36,7 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); +void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -65,6 +66,7 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } +static inline void __init sev_setup_arch(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index e78c7db878018ee4b424c8a49630aedae1cbedb9..d232de747837f949bd52ceadf2a7c970567f9100 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -63,5 +63,6 @@ typedef struct { } void leave_mm(int cpu); +#define leave_mm leave_mm #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 25ddd0916bb2f4d3fd17bcea2bdbc68d7b3e0f2c..2db0c2bc88a818d6337d893b0ec0666cbf15cb9a 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -6,9 +6,58 @@ typedef struct irq_alloc_info msi_alloc_info_t; -int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, +int x86_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); -void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc); +#define arch_msi_prepare x86_msi_prepare + +/* Structs and defines for the X86 specific MSI message format */ + +typedef struct x86_msi_data { + u32 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + reserved : 2, + active_low : 1, + is_level : 1; + + u32 dmar_subhandle; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#define arch_msi_msg_data x86_msi_data + +typedef struct x86_msi_addr_lo { + union { + struct { + u32 reserved_0 : 2, + dest_mode_logical : 1, + redirect_hint : 1, + reserved_1 : 8, + destid_0_7 : 8, + base_address : 12; + }; + struct { + u32 dmar_reserved_0 : 2, + dmar_index_15 : 1, + dmar_subhandle_valid : 1, + dmar_format : 1, + dmar_index_0_14 : 15, + dmar_base_address : 12; + }; + }; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#define arch_msi_msg_addr_lo x86_msi_addr_lo + +#define X86_MSI_BASE_ADDRESS_LOW (0xfee00000 >> 20) + +typedef struct x86_msi_addr_hi { + u32 reserved : 8, + destid_8_31 : 24; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#define arch_msi_msg_addr_hi x86_msi_addr_hi + +#define X86_MSI_BASE_ADDRESS_HIGH (0) + +struct msi_msg; +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 
6a80c597b09f977b126b621bc5bc483d303edd3e..8c5f11299c688f8b305c7ee31ea0a1a47538589e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -41,6 +41,10 @@ /* Intel MSRs. Some also available on other CPUs */ +#define MSR_TEST_CTRL 0x00000033 +#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 +#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) + #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ #define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ @@ -70,6 +74,11 @@ */ #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) +/* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */ +#define MSR_IA32_CORE_CAPS 0x000000cf +#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5 +#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT) + #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) @@ -149,7 +158,23 @@ #define LBR_INFO_MISPRED BIT_ULL(63) #define LBR_INFO_IN_TX BIT_ULL(62) #define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60) #define LBR_INFO_CYCLES 0xffff +#define LBR_INFO_BR_TYPE_OFFSET 56 +#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) + +#define MSR_ARCH_LBR_CTL 0x000014ce +#define ARCH_LBR_CTL_LBREN BIT(0) +#define ARCH_LBR_CTL_CPL_OFFSET 1 +#define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET) +#define ARCH_LBR_CTL_STACK_OFFSET 3 +#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET) +#define ARCH_LBR_CTL_FILTER_OFFSET 16 +#define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET) +#define MSR_ARCH_LBR_DEPTH 0x000014cf +#define MSR_ARCH_LBR_FROM_0 0x00001500 +#define MSR_ARCH_LBR_TO_0 0x00001600 +#define MSR_ARCH_LBR_INFO_0 0x00001200 #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_PEBS_DATA_CFG 0x000003f2 @@ -228,10 +253,14 @@ #define MSR_IA32_LASTINTFROMIP 0x000001dd #define MSR_IA32_LASTINTTOIP 0x000001de +#define MSR_IA32_PASID 0x00000d93 +#define MSR_IA32_PASID_VALID BIT_ULL(31) + /* DEBUGCTLMSR bits (others vary by model): */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) #define DEBUGCTLMSR_BTINT (1UL << 8) @@ -578,6 +607,8 @@ #define MSR_IA32_BNDCFGS_RSVD 0x00000ffc +#define MSR_IA32_XFD 0x000001c4 +#define MSR_IA32_XFD_ERR 0x000001c5 #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) @@ -875,6 +906,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index b40d0295d81290d91bb8505c4b2f6822820d6941..dd4207d7fddb80c268c6b522b0282585d95a979f 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -94,7 +94,7 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr) asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); @@ -104,7 
+104,7 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a"(low), "d" (high) : "memory"); } diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index e662f987dfa2cb093a25e1684e06c0067e78eb58..43e74b7ff65a6e805505291b52d6fa3c33029144 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -27,7 +27,7 @@ struct pci_sysdata { void *fwnode; /* IRQ domain for MSI assignment */ #endif #if IS_ENABLED(CONFIG_VMD) - bool vmd_domain; /* True if in Intel VMD domain */ + struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ #endif }; @@ -37,12 +37,15 @@ extern int noioapicreroute; #ifdef CONFIG_PCI +static inline struct pci_sysdata *to_pci_sysdata(const struct pci_bus *bus) +{ + return bus->sysdata; +} + #ifdef CONFIG_PCI_DOMAINS static inline int pci_domain_nr(struct pci_bus *bus) { - struct pci_sysdata *sd = bus->sysdata; - - return sd->domain; + return to_pci_sysdata(bus)->domain; } static inline int pci_proc_domain(struct pci_bus *bus) @@ -54,24 +57,20 @@ static inline int pci_proc_domain(struct pci_bus *bus) #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) { - struct pci_sysdata *sd = bus->sysdata; - - return sd->fwnode; + return to_pci_sysdata(bus)->fwnode; } #define pci_root_bus_fwnode _pci_root_bus_fwnode #endif +#if IS_ENABLED(CONFIG_VMD) static inline bool is_vmd(struct pci_bus *bus) { -#if IS_ENABLED(CONFIG_VMD) - struct pci_sysdata *sd = bus->sysdata; - - return sd->vmd_domain; -#else - return false; -#endif + return to_pci_sysdata(bus)->vmd_dev != NULL; } +#else +#define is_vmd(bus) false +#endif /* CONFIG_VMD */ /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes @@ -107,17 +106,6 @@ static inline void early_quirks(void) { } #endif extern void pci_iommu_alloc(void); - -#ifdef CONFIG_PCI_MSI -/* implemented in arch/x86/kernel/apic/io_apic. 
*/ -struct msi_desc; -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); -void native_teardown_msi_irq(unsigned int irq); -void native_restore_msi_irqs(struct pci_dev *dev); -#else -#define native_setup_msi_irqs NULL -#define native_teardown_msi_irq NULL -#endif #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 @@ -131,9 +119,7 @@ void native_restore_msi_irqs(struct pci_dev *dev); /* Returns the node based on pci bus */ static inline int __pcibus_to_node(const struct pci_bus *bus) { - const struct pci_sysdata *sd = bus->sysdata; - - return sd->node; + return to_pci_sysdata(bus)->node; } static inline const struct cpumask * diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 73bb404f4d2a23c2e1e01afdc30a132553672b04..490411dba438dcca53517ddc131d8b61242dc007 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -114,9 +114,20 @@ extern const struct pci_raw_ops pci_direct_conf1; extern bool port_cf9_safe; /* arch_initcall level */ +#ifdef CONFIG_PCI_DIRECT extern int pci_direct_probe(void); extern void pci_direct_init(int type); +#else +static inline int pci_direct_probe(void) { return -1; } +static inline void pci_direct_init(int type) { } +#endif + +#ifdef CONFIG_PCI_BIOS extern void pci_pcbios_init(void); +#else +static inline void pci_pcbios_init(void) { } +#endif + extern void __init dmi_check_pciprobe(void); extern void __init dmi_check_skip_isa_align(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 6a174de55b04cbeefcade351ba6d3baea98ce1fc..abb5bb10e0321f99f9a92f943470a9b632199e11 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -142,6 +142,46 @@ union cpuid10_edx { unsigned int full; }; +/* + * Intel Architectural LBR CPUID detection/enumeration details: + */ +union cpuid28_eax { + struct { + /* Supported LBR depth values */ + unsigned int lbr_depth_mask:8; + unsigned int reserved:22; + /* Deep C-state Reset */ + unsigned int lbr_deep_c_reset:1; + /* IP values contain LIP */ + unsigned int lbr_lip:1; + } split; + unsigned int full; +}; + +union cpuid28_ebx { + struct { + /* CPL Filtering Supported */ + unsigned int lbr_cpl:1; + /* Branch Filtering Supported */ + unsigned int lbr_filter:1; + /* Call-stack Mode Supported */ + unsigned int lbr_call_stack:1; + } split; + unsigned int full; +}; + +union cpuid28_ecx { + struct { + /* Mispredict Bit Supported */ + unsigned int lbr_mispred:1; + /* Timed LBRs Supported */ + unsigned int lbr_timed_lbr:1; + /* Branch Type Field Supported */ + unsigned int lbr_br_type:1; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -219,8 +259,12 @@ struct x86_pmu_capability { #define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1) #define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2) #define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3) -#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND -#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \ +#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4) +#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5) +#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6) +#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7) +#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND +#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \ INTEL_PMC_MSK_FIXED_SLOTS) /* @@ -238,8 
+282,14 @@ struct x86_pmu_capability { #define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */ #define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */ #define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */ -#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND -#define INTEL_TD_METRIC_NUM 4 +/* Level 2 metrics */ +#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */ +#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */ +#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */ +#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */ + +#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND +#define INTEL_TD_METRIC_NUM 8 static inline bool is_metric_idx(int idx) { @@ -313,14 +363,6 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; -struct pebs_lbr_entry { - u64 from, to, info; -}; - -struct pebs_lbr { - struct pebs_lbr_entry lbr[0]; /* Variable length */ -}; - /* * IBS cpuid feature detection */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ea85f23d9e2277f1b82baac638f0a4c8513ad982..a5c2dbf21b19437388b143b93d79aaf79a7f06f9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -23,8 +23,9 @@ #ifndef __ASSEMBLY__ #include -#include +#include #include +#include extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -125,35 +126,6 @@ static inline int pte_dirty(pte_t pte) return pte_flags(pte) & _PAGE_DIRTY; } - -static inline u32 read_pkru(void) -{ - if (boot_cpu_has(X86_FEATURE_OSPKE)) - return rdpkru(); - return 0; -} - -static inline void write_pkru(u32 pkru) -{ - struct pkru_state *pk; - - if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return; - - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); - - /* - * The PKRU value in xstate needs to be in sync with the value that is - * written to the CPU. The FPU restore on return to userland would - * otherwise load the previous value again. - */ - fpregs_lock(); - if (pk) - pk->pkru = pkru; - __write_pkru(pkru); - fpregs_unlock(); -} - static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; @@ -1375,32 +1347,6 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #endif #endif -#define PKRU_AD_BIT 0x1 -#define PKRU_WD_BIT 0x2 -#define PKRU_BITS_PER_PKEY 2 - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -extern u32 init_pkru_value; -#else -#define init_pkru_value 0 -#endif - -static inline bool __pkru_allows_read(u32 pkru, u16 pkey) -{ - int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; - return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); -} - -static inline bool __pkru_allows_write(u32 pkru, u16 pkey) -{ - int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; - /* - * Access-disable disables writes too so we need to check - * both bits here. - */ - return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); -} - static inline u16 pte_flags_pkey(unsigned long pte_flags) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 2ff9b98812b7637de7fedadc25aa60d20634e19a..5c7bcaa7962323ba29dc825c00d2d7d8a2234417 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -9,14 +9,14 @@ * will be necessary to ensure that the types that store key * numbers and masks have sufficient capacity. */ -#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 
16 : 1) +#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { - return boot_cpu_has(X86_FEATURE_OSPKE); + return cpu_feature_enabled(X86_FEATURE_OSPKE); } /* @@ -26,7 +26,7 @@ static inline bool arch_pkeys_enabled(void) extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); @@ -37,7 +37,7 @@ extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); @@ -124,7 +124,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); -extern void copy_init_pkru_to_fpregs(void); static inline int vma_pkey(struct vm_area_struct *vma) { diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h new file mode 100644 index 0000000000000000000000000000000000000000..f6c28480dba6716988d1a94005c606f7b06a7c8b --- /dev/null +++ b/arch/x86/include/asm/pkru.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_PKRU_H +#define _ASM_X86_PKRU_H + +#include + +#define PKRU_AD_BIT 0x1u +#define PKRU_WD_BIT 0x2u +#define PKRU_BITS_PER_PKEY 2 + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +extern u32 init_pkru_value; +#define pkru_get_init_value() READ_ONCE(init_pkru_value) +#else +#define init_pkru_value 0 +#define pkru_get_init_value() 0 +#endif + +static inline bool __pkru_allows_read(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; + return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); +} + +static inline bool __pkru_allows_write(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; + /* + * Access-disable disables writes too so we need to check + * both bits here. + */ + return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); +} + + +static inline u32 read_pkru(void) +{ + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) + return rdpkru(); + return 0; +} + +static inline void write_pkru(u32 pkru) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + /* + * WRPKRU is relatively expensive compared to RDPKRU. + * Avoid WRPKRU when it would not change the value. + */ + if (pkru != rdpkru()) + wrpkru(pkru); +} + +static inline void pkru_write_default(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + wrpkru(pkru_get_init_value()); +} + +#endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6574bf1c9dc89c80e3caf795e021c82a75406a1f..e125879d8b3e4ae80d66cb19eb39609760ee731b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -93,15 +93,24 @@ struct cpuinfo_x86 { __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ int cpuid_level; - __u32 x86_capability[NCAPINTS + NBUGINTS]; + /* + * Align to size of unsigned long because the x86_capability array + * is passed to bitops which require the alignment. 
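To make the 2-bit-per-key PKRU layout used by the new pkru.h helpers above concrete, here is a minimal illustrative sketch; it is not part of the patch, and the key number 5 is arbitrary:

static inline bool pkru_example_key5_is_read_only(void)
{
	/* Set only the write-disable bit for protection key 5 (bit 11). */
	u32 pkru = PKRU_WD_BIT << (5 * PKRU_BITS_PER_PKEY);

	/* Reads are still allowed (AD clear), writes are refused (WD set). */
	return __pkru_allows_read(pkru, 5) && !__pkru_allows_write(pkru, 5);
}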
Use unnamed + * union to enforce the array is aligned to size of unsigned long. + */ + union { + __u32 x86_capability[NCAPINTS + NBUGINTS]; + unsigned long x86_capability_alignment; + }; char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ - /* Cache QoS architectural values: */ + /* Cache QoS architectural values, valid only on the BSP: */ int x86_cache_max_rmid; /* max index */ int x86_cache_occ_scale; /* scale to bytes */ + int x86_cache_mbm_width_offset; int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ @@ -426,9 +435,6 @@ DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ -extern unsigned int fpu_kernel_xstate_size; -extern unsigned int fpu_user_xstate_size; - struct perf_event; typedef struct { @@ -486,7 +492,15 @@ struct thread_struct { mm_segment_t addr_limit; unsigned int sig_on_uaccess_err:1; - unsigned int uaccess_err:1; /* uaccess failed */ + + /* + * Protection Keys Register for Userspace. Loaded immediately on + * context switch. Store it in thread_struct to avoid a lookup in + * the tasks's FPU xstate buffer. This value is only valid when a + * task is scheduled out. For 'current' the authoritative source of + * PKRU is the hardware itself. + */ + u32 pkru; /* Floating point and extended processor state */ struct fpu fpu; @@ -496,12 +510,12 @@ struct thread_struct { */ }; -/* Whitelist the FPU state from the task_struct for hardened usercopy. */ +extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); + static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.state); - *size = fpu_kernel_xstate_size; + fpu_thread_struct_whitelist(offset, size); } /* @@ -644,72 +658,6 @@ static __always_inline void cpu_relax(void) rep_nop(); } -/* - * This function forces the icache and prefetched instruction stream to - * catch up with reality in two very specific cases: - * - * a) Text was modified using one virtual address and is about to be executed - * from the same physical page at a different virtual address. - * - * b) Text was modified on a different CPU, may subsequently be - * executed on this CPU, and you want to make sure the new version - * gets executed. This generally means you're calling this in a IPI. - * - * If you're calling this for a different reason, you're probably doing - * it wrong. - */ -static inline void sync_core(void) -{ - /* - * There are quite a few ways to do this. IRET-to-self is nice - * because it works on every CPU, at any CPL (so it's compatible - * with paravirtualization), and it never exits to a hypervisor. - * The only down sides are that it's a bit slow (it seems to be - * a bit more than 2x slower than the fastest options) and that - * it unmasks NMIs. The "push %cs" is needed because, in - * paravirtual environments, __KERNEL_CS may not be a valid CS - * value when we do IRET directly. - * - * In case NMI unmasking or performance ever becomes a problem, - * the next best option appears to be MOV-to-CR2 and an - * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV). - * - * CPUID is the conventional way, but it's nasty: it doesn't - * exist on some 486-like CPUs, and it usually exits to a - * hypervisor. 
- * - * Like all of Linux's memory ordering operations, this is a - * compiler barrier as well. - */ -#ifdef CONFIG_X86_32 - asm volatile ( - "pushfl\n\t" - "pushl %%cs\n\t" - "pushl $1f\n\t" - "iret\n\t" - "1:" - : ASM_CALL_CONSTRAINT : : "memory"); -#else - unsigned int tmp; - - asm volatile ( - UNWIND_HINT_SAVE - "mov %%ss, %0\n\t" - "pushq %q0\n\t" - "pushq %%rsp\n\t" - "addq $8, (%%rsp)\n\t" - "pushfq\n\t" - "mov %%cs, %0\n\t" - "pushq %q0\n\t" - "pushq $1f\n\t" - "iretq\n\t" - UNWIND_HINT_RESTORE - "1:" - : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); -#endif -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void amd_e400_c1e_apic_setup(void); diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 6e81788a30c12394f28f7258db157880890adf24..7950b4ba72f411dcb98f7850ca9b80a2aa7a0ac4 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -36,6 +36,6 @@ void x86_report_nx(void); extern int reboot_force; long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled); + unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h deleted file mode 100644 index 232f856e0db067e285bbec67da28f0ee4996e083..0000000000000000000000000000000000000000 --- a/arch/x86/include/asm/refcount.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef __ASM_X86_REFCOUNT_H -#define __ASM_X86_REFCOUNT_H -/* - * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from - * PaX/grsecurity. - */ -#include -#include - -/* - * This is the first portion of the refcount error handling, which lives in - * .text.unlikely, and is jumped to from the CPU flag check (in the - * following macros). This saves the refcount value location into CX for - * the exception handler to use (in mm/extable.c), and then triggers the - * central refcount exception. The fixup address for the exception points - * back to the regular execution flow in .text. - */ -#define _REFCOUNT_EXCEPTION \ - ".pushsection .text..refcount\n" \ - "111:\tlea %[var], %%" _ASM_CX "\n" \ - "112:\t" ASM_UD2 "\n" \ - ASM_UNREACHABLE \ - ".popsection\n" \ - "113:\n" \ - _ASM_EXTABLE_REFCOUNT(112b, 113b) - -/* Trigger refcount exception if refcount result is negative. */ -#define REFCOUNT_CHECK_LT_ZERO \ - "js 111f\n\t" \ - _REFCOUNT_EXCEPTION - -/* Trigger refcount exception if refcount result is zero or negative. */ -#define REFCOUNT_CHECK_LE_ZERO \ - "jz 111f\n\t" \ - REFCOUNT_CHECK_LT_ZERO - -/* Trigger refcount exception unconditionally. 
*/ -#define REFCOUNT_ERROR \ - "jmp 111f\n\t" \ - _REFCOUNT_EXCEPTION - -static __always_inline void refcount_add(unsigned int i, refcount_t *r) -{ - asm volatile(LOCK_PREFIX "addl %1,%0\n\t" - REFCOUNT_CHECK_LT_ZERO - : [var] "+m" (r->refs.counter) - : "ir" (i) - : "cc", "cx"); -} - -static __always_inline void refcount_inc(refcount_t *r) -{ - asm volatile(LOCK_PREFIX "incl %0\n\t" - REFCOUNT_CHECK_LT_ZERO - : [var] "+m" (r->refs.counter) - : : "cc", "cx"); -} - -static __always_inline void refcount_dec(refcount_t *r) -{ - asm volatile(LOCK_PREFIX "decl %0\n\t" - REFCOUNT_CHECK_LE_ZERO - : [var] "+m" (r->refs.counter) - : : "cc", "cx"); -} - -static __always_inline __must_check -bool refcount_sub_and_test(unsigned int i, refcount_t *r) -{ - bool ret = GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", - REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, e, "er", i, "cx"); - - if (ret) { - smp_acquire__after_ctrl_dep(); - return true; - } - - return false; -} - -static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) -{ - bool ret = GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", - REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, e, "cx"); - - if (ret) { - smp_acquire__after_ctrl_dep(); - return true; - } - - return false; -} - -static __always_inline __must_check -bool refcount_add_not_zero(unsigned int i, refcount_t *r) -{ - int c, result; - - c = atomic_read(&(r->refs)); - do { - if (unlikely(c == 0)) - return false; - - result = c + i; - - /* Did we try to increment from/to an undesirable state? */ - if (unlikely(c < 0 || c == INT_MAX || result < c)) { - asm volatile(REFCOUNT_ERROR - : : [var] "m" (r->refs.counter) - : "cc", "cx"); - break; - } - - } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); - - return c != 0; -} - -static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) -{ - return refcount_add_not_zero(1, r); -} - -#endif diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl.h similarity index 88% rename from arch/x86/include/asm/resctrl_sched.h rename to arch/x86/include/asm/resctrl.h index f6b7fe2833cc72a78bed2c0cad3b59b8c6eb4f23..d60ed0668a59335833ee6fded25f1b60fae3d232 100644 --- a/arch/x86/include/asm/resctrl_sched.h +++ b/arch/x86/include/asm/resctrl.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_RESCTRL_SCHED_H -#define _ASM_X86_RESCTRL_SCHED_H +#ifndef _ASM_X86_RESCTRL_H +#define _ASM_X86_RESCTRL_H #ifdef CONFIG_X86_CPU_RESCTRL @@ -56,19 +56,22 @@ static void __resctrl_sched_in(void) struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); u32 closid = state->default_closid; u32 rmid = state->default_rmid; + u32 tmp; /* * If this task has a closid/rmid assigned, use it. * Else use the closid/rmid assigned to this cpu. 
*/ if (static_branch_likely(&rdt_alloc_enable_key)) { - if (current->closid) - closid = current->closid; + tmp = READ_ONCE(current->closid); + if (tmp) + closid = tmp; } if (static_branch_likely(&rdt_mon_enable_key)) { - if (current->rmid) - rmid = current->rmid; + tmp = READ_ONCE(current->rmid); + if (tmp) + rmid = tmp; } if (closid != state->cur_closid || rmid != state->cur_rmid) { @@ -84,10 +87,13 @@ static inline void resctrl_sched_in(void) __resctrl_sched_in(); } +void resctrl_cpu_detect(struct cpuinfo_x86 *c); + #else static inline void resctrl_sched_in(void) {} +static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ -#endif /* _ASM_X86_RESCTRL_SCHED_H */ +#endif /* _ASM_X86_RESCTRL_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 6669164abadcbd8a0b1b71e4a4ba40f800917dd5..c8610da2d23a97ca72e545a157cc5d3671da7ebb 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -350,7 +350,7 @@ static inline void __loadsegment_fs(unsigned short value) "1: movw %0, %%fs \n" "2: \n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : "rm" (value) : "memory"); } diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index f176114c04d496002d0f6cf5d6e8f6a9a7ddee74..5b1ed650b12489f328c38645cebd66f446603532 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -33,11 +33,7 @@ struct sigframe_ia32 { * legacy application accessing/modifying it. */ struct _fpstate_32 fpstate_unused; -#ifdef CONFIG_IA32_EMULATION - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; -#else /* !CONFIG_IA32_EMULATION */ - unsigned long extramask[_NSIG_WORDS-1]; -#endif /* CONFIG_IA32_EMULATION */ + unsigned int extramask[1]; char retcode[8]; /* fp state follows here */ }; @@ -89,4 +85,6 @@ struct rt_sigframe_x32 { #endif /* CONFIG_X86_64 */ +void __init init_sigframe_size(void); + #endif /* _ASM_X86_SIGFRAME_H */ diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 2fcbd6f33ef74b22b9a3245261a17f13ca6b8855..35e0b579ffcbec607960ea7fdfd973fb391060bd 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -14,9 +14,6 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask); - #ifdef CONFIG_X86_X32_ABI asmlinkage long sys32_x32_rt_sigreturn(void); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index e15f364efbccab5b080f1d2616964ef6b97a9ed5..c0538f82c9a220cfafbb20d7d954eff6eff5053e 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -5,16 +5,6 @@ #include #include -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ -#ifdef CONFIG_X86_LOCAL_APIC -# include -# include -# ifdef CONFIG_X86_IO_APIC -# include -# endif -#endif #include #include diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2e0cdc64cb50dc8fe6c8b6641d62181cb696bdb4..00c667f63126ead6e34622c710e7e38541e7ebe2 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -103,25 +103,13 @@ static inline void wrpkru(u32 pkru) : : "a" (pkru), "c"(ecx), "d"(edx)); } -static inline void __write_pkru(u32 pkru) -{ - /* - * WRPKRU is relatively expensive compared to RDPKRU. 
- * Avoid WRPKRU when it would not change the value. - */ - if (pkru == rdpkru()) - return; - - wrpkru(pkru); -} - #else static inline u32 rdpkru(void) { return 0; } -static inline void __write_pkru(u32 pkru) +static inline void wrpkru(u32 pkru) { } #endif @@ -226,6 +214,78 @@ static inline void clwb(volatile void *__p) #define nop() asm volatile ("nop") +/* The dst parameter must be 64-bytes aligned */ +static inline void movdir64b(void __iomem *dst, const void *src) +{ + const struct { char _[64]; } *__src = src; + struct { char _[64]; } __iomem *__dst = dst; + + /* + * MOVDIR64B %(rdx), rax. + * + * Both __src and __dst must be memory constraints in order to tell the + * compiler that no other memory accesses should be reordered around + * this one. + * + * Also, both must be supplied as lvalues because this tells + * the compiler what the object is (its size) the instruction accesses. + * I.e., not the pointers but what they point to, thus the deref'ing '*'. + */ + asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" + : "+m" (*__dst) + : "m" (*__src), "a" (__dst), "d" (__src)); +} + +/** + * enqcmds - Enqueue a command in supervisor (CPL0) mode + * @dst: destination, in MMIO space (must be 512-bit aligned) + * @src: 512 bits memory operand + * + * The ENQCMDS instruction allows software to write a 512-bit command to + * a 512-bit-aligned special MMIO region that supports the instruction. + * A return status is loaded into the ZF flag in the RFLAGS register. + * ZF = 0 equates to success, and ZF = 1 indicates retry or error. + * + * This function issues the ENQCMDS instruction to submit data from + * kernel space to MMIO space, in a unit of 512 bits. Order of data access + * is not guaranteed, nor is a memory barrier performed afterwards. It + * returns 0 on success and -EAGAIN on failure. + * + * Warning: Do not use this helper unless your driver has checked that the + * ENQCMDS instruction is supported on the platform and the device accepts + * ENQCMDS. + */ +static inline int enqcmds(void __iomem *dst, const void *src) +{ + const struct { char _[64]; } *__src = src; + struct { char _[64]; } __iomem *__dst = dst; + bool zf; + + /* + * ENQCMDS %(rdx), rax + * + * See movdir64b()'s comment on operand specification. + */ + asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90" + CC_SET(z) + : CC_OUT(z) (zf), "+m" (*__dst) + : "m" (*__src), "a" (__dst), "d" (__src)); + + /* Submission failure is indicated via EFLAGS.ZF=1 */ + if (zf) + return -EAGAIN; + + return 0; +} + +static inline void tile_release(void) +{ + /* + * Instruction opcode for TILERELEASE; supported in binutils + * version >= 2.36. 
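As a usage illustration for the enqcmds() helper added above: dst must be a 512-bit-aligned MMIO portal and src a 64-byte command, and callers typically bound the retry on the -EAGAIN (ZF=1) case. This sketch is not part of the patch, and the portal/desc names are made up:

static int submit_one_desc(void __iomem *portal, const void *desc)
{
	int retries = 10;
	int rc;

	do {
		/* 0 on success, -EAGAIN if the device did not accept the command. */
		rc = enqcmds(portal, desc);
	} while (rc == -EAGAIN && --retries);

	return rc;
}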
+ */ + asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0"); +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index 43b5e02a7b4b9b5a24c379d8089ace2cbf685980..ee40c11cd2afb98cee52050e8ba7ea8b8bf2901c 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -6,6 +6,80 @@ #include #include +#ifdef CONFIG_X86_32 +static inline void iret_to_self(void) +{ + asm volatile ( + "pushfl\n\t" + "pushl %%cs\n\t" + "pushl $1f\n\t" + "iret\n\t" + "1:" + : ASM_CALL_CONSTRAINT : : "memory"); +} +#else +static inline void iret_to_self(void) +{ + unsigned int tmp; + + asm volatile ( + UNWIND_HINT_SAVE + "mov %%ss, %0\n\t" + "pushq %q0\n\t" + "pushq %%rsp\n\t" + "addq $8, (%%rsp)\n\t" + "pushfq\n\t" + "mov %%cs, %0\n\t" + "pushq %q0\n\t" + "pushq $1f\n\t" + "iretq\n\t" + UNWIND_HINT_RESTORE + "1:" + : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); +} +#endif /* CONFIG_X86_32 */ + +/* + * This function forces the icache and prefetched instruction stream to + * catch up with reality in two very specific cases: + * + * a) Text was modified using one virtual address and is about to be executed + * from the same physical page at a different virtual address. + * + * b) Text was modified on a different CPU, may subsequently be + * executed on this CPU, and you want to make sure the new version + * gets executed. This generally means you're calling this in a IPI. + * + * If you're calling this for a different reason, you're probably doing + * it wrong. + */ +static inline void sync_core(void) +{ + /* + * There are quite a few ways to do this. IRET-to-self is nice + * because it works on every CPU, at any CPL (so it's compatible + * with paravirtualization), and it never exits to a hypervisor. + * The only down sides are that it's a bit slow (it seems to be + * a bit more than 2x slower than the fastest options) and that + * it unmasks NMIs. The "push %cs" is needed because, in + * paravirtual environments, __KERNEL_CS may not be a valid CS + * value when we do IRET directly. + * + * In case NMI unmasking or performance ever becomes a problem, + * the next best option appears to be MOV-to-CR2 and an + * unconditional jump. That sequence also works on all CPUs, + * but it will fault at CPL3 (i.e. Xen PV). + * + * CPUID is the conventional way, but it's nasty: it doesn't + * exist on some 486-like CPUs, and it usually exits to a + * hypervisor. + * + * Like all of Linux's memory ordering operations, this is a + * compiler barrier as well. + */ + iret_to_self(); +} + /* * Ensure that a core serializing instruction is issued before returning * to user-mode. 
x86 implements return to user-space through sysexit, diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a4de7aa7500fbea03cd187cce92b16df3bc76872..20a1462810b11b3a4de0aadb0bbb3ddf9b7616df 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -92,6 +92,7 @@ struct thread_info { #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ +#define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -122,6 +123,7 @@ struct thread_info { #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) +#define _TIF_SLD (1 << TIF_SLD) #define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) @@ -145,7 +147,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \ - _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD) /* * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index fd86d4bd23dfc0cfacba57004e79a1e907963c57..39c7ebe9a3e73d243680b4c4b442affe9761d3f8 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -22,8 +22,8 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { - __entry->xfeatures = fpu->state.xsave.header.xfeatures; - __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; + __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; + __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 8a0c25c6bf099cf9858568d67e382f940e493a47..db5977174ce7442007d2c70bcafd8519075e49e4 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -6,6 +6,7 @@ #define _ASM_X86_TSC_H #include +#include #define NS_SCALE 10 /* 2^10, carefully chosen */ #define US_SCALE 32 /* 2^32, arbitralrily chosen */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 61d93f062a36e0a567a69ee29ee7337deefa5e67..d11662f753d2dc128f1ef1cf376319f1aa3b1cc5 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -193,23 +193,12 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : : "A" (x), "r" (addr) \ : : label) -#define __put_user_asm_ex_u64(x, addr) \ - asm volatile("\n" \ - "1: movl %%eax,0(%1)\n" \ - "2: movl %%edx,4(%1)\n" \ - "3:" \ - _ASM_EXTABLE_EX(1b, 2b) \ - _ASM_EXTABLE_EX(2b, 3b) \ - : : "A" (x), "r" (addr)) - #define __put_user_x8(x, ptr, __ret_pu) \ asm volatile("call __put_user_8" : "=a" (__ret_pu) \ : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") #else #define __put_user_goto_u64(x, ptr, label) \ __put_user_goto(x, ptr, "q", "", "er", label) -#define __put_user_asm_ex_u64(x, addr) \ - __put_user_asm_ex(x, addr, "q", "", "er") #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) #endif 
@@ -289,31 +278,6 @@ do { \ } \ } while (0) -/* - * This doesn't do __uaccess_begin/end - the exception handling - * around it must do that. - */ -#define __put_user_size_ex(x, ptr, size) \ -do { \ - __chk_user_ptr(ptr); \ - switch (size) { \ - case 1: \ - __put_user_asm_ex(x, ptr, "b", "b", "iq"); \ - break; \ - case 2: \ - __put_user_asm_ex(x, ptr, "w", "w", "ir"); \ - break; \ - case 4: \ - __put_user_asm_ex(x, ptr, "l", "k", "ir"); \ - break; \ - case 8: \ - __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \ - break; \ - default: \ - __put_user_bad(); \ - } \ -} while (0) - #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval, errret) \ ({ \ @@ -335,12 +299,9 @@ do { \ "i" (errret), "0" (retval)); \ }) -#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() #else #define __get_user_asm_u64(x, ptr, retval, errret) \ __get_user_asm(x, ptr, retval, "q", "", "=r", errret) -#define __get_user_asm_ex_u64(x, ptr) \ - __get_user_asm_ex(x, ptr, "q", "", "=r") #endif #define __get_user_size(x, ptr, size, retval, errret) \ @@ -378,53 +339,6 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) -#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile("\n" \ - "1: mov"itype" %2,%"rtype"1\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: mov %3,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ - : "=r" (err), ltype(x) \ - : "m" (__m(addr)), "i" (errret), "0" (err)) - -/* - * This doesn't do __uaccess_begin/end - the exception handling - * around it must do that. - */ -#define __get_user_size_ex(x, ptr, size) \ -do { \ - __chk_user_ptr(ptr); \ - switch (size) { \ - case 1: \ - __get_user_asm_ex(x, ptr, "b", "b", "=q"); \ - break; \ - case 2: \ - __get_user_asm_ex(x, ptr, "w", "w", "=r"); \ - break; \ - case 4: \ - __get_user_asm_ex(x, ptr, "l", "k", "=r"); \ - break; \ - case 8: \ - __get_user_asm_ex_u64(x, ptr); \ - break; \ - default: \ - (x) = __get_user_bad(); \ - } \ -} while (0) - -#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile("1: mov"itype" %1,%"rtype"0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:xor"itype" %"rtype"0,%"rtype"0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_EX(1b, 3b) \ - : ltype(x) : "m" (__m(addr))) - #define __put_user_nocheck(x, ptr, size) \ ({ \ __label__ __pu_label; \ @@ -480,29 +394,6 @@ struct __large_struct { unsigned long buf[100]; }; retval = __put_user_failed(x, addr, itype, rtype, ltype, errret); \ } while (0) -#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile("1: mov"itype" %"rtype"0,%1\n" \ - "2:\n" \ - _ASM_EXTABLE_EX(1b, 2b) \ - : : ltype(x), "m" (__m(addr))) - -/* - * uaccess_try and catch - */ -#define uaccess_try do { \ - current->thread.uaccess_err = 0; \ - __uaccess_begin(); \ - barrier(); - -#define uaccess_try_nospec do { \ - current->thread.uaccess_err = 0; \ - __uaccess_begin_nospec(); \ - -#define uaccess_catch(err) \ - __uaccess_end(); \ - (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \ -} while (0) - /** * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. 
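The uaccess_try/uaccess_catch and {get,put}_user_ex machinery removed above is generally superseded by user_access_begin()/user_access_end() sections with the label-based unsafe accessors, or by plain get_user()/put_user() where a per-access return code is acceptable. A minimal sketch of that replacement pattern, assuming the standard user_access_begin()/unsafe_get_user() API rather than anything introduced by this patch:

static int read_user_u32(u32 __user *uptr, u32 *val)
{
	if (!user_access_begin(uptr, sizeof(*uptr)))
		return -EFAULT;

	unsafe_get_user(*val, uptr, Efault);
	user_access_end();
	return 0;

Efault:
	user_access_end();
	return -EFAULT;
}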
@@ -552,28 +443,6 @@ struct __large_struct { unsigned long buf[100]; }; #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -/* - * {get|put}_user_try and catch - * - * get_user_try { - * get_user_ex(...); - * } get_user_catch(err) - */ -#define get_user_try uaccess_try_nospec -#define get_user_catch(err) uaccess_catch(err) - -#define get_user_ex(x, ptr) do { \ - unsigned long __gue_val; \ - __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \ - (x) = (__force __typeof__(*(ptr)))__gue_val; \ -} while (0) - -#define put_user_try uaccess_try -#define put_user_catch(err) uaccess_catch(err) - -#define put_user_ex(x, ptr) \ - __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long @@ -694,15 +563,6 @@ extern struct movsl_mask { # include #endif -/* - * We rely on the nested NMI work to allow atomic faults from the NMI path; the - * nested NMI paths are careful to preserve CR2. - * - * Caller must use pagefault_enable/disable, or run in interrupt context, - * and also do a uaccess_ok() check - */ -#define __copy_from_user_nmi __copy_from_user_inatomic - /* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big fat warning: you have to not only do the access_ok() diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index ba2dc19306303728e1cf8652a20d982dce1e4e2a..388a40660c7b517f53c5685ae28844b9b4cacb87 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -23,33 +23,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u8 *)to, from, ret, - "b", "b", "=q", 1); - __uaccess_end(); - return ret; - case 2: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u16 *)to, from, ret, - "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 4: - ret = 0; - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u32 *)to, from, ret, - "l", "k", "=r", 4); - __uaccess_end(); - return ret; - } - } return __copy_user_ll(to, (__force const void *)from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 5cd1caa8bc6537c8795218581118c60552128ad8..bc10e3dc64fed755dc5077bb4c6bc3c6984794f2 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -65,117 +65,13 @@ copy_to_user_mcsafe(void *to, const void *from, unsigned len) static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { - int ret = 0; - - if (!__builtin_constant_p(size)) - return copy_user_generic(dst, (__force void *)src, size); - switch (size) { - case 1: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src, - ret, "b", "b", "=q", 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src, - ret, "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src, - ret, "l", "k", "=r", 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 
*)dst, (u64 __user *)src, - ret, "q", "", "=r", 8); - __uaccess_end(); - return ret; - case 10: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", "=r", 10); - if (likely(!ret)) - __get_user_asm_nozero(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); - __uaccess_end(); - return ret; - case 16: - __uaccess_begin_nospec(); - __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", "=r", 16); - if (likely(!ret)) - __get_user_asm_nozero(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); - __uaccess_end(); - return ret; - default: - return copy_user_generic(dst, (__force void *)src, size); - } + return copy_user_generic(dst, (__force void *)src, size); } static __always_inline __must_check unsigned long raw_copy_to_user(void __user *dst, const void *src, unsigned long size) { - int ret = 0; - - if (!__builtin_constant_p(size)) - return copy_user_generic((__force void *)dst, src, size); - switch (size) { - case 1: - __uaccess_begin(); - __put_user_asm(*(u8 *)src, (u8 __user *)dst, - ret, "b", "b", "iq", 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __put_user_asm(*(u16 *)src, (u16 __user *)dst, - ret, "w", "w", "ir", 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __put_user_asm(*(u32 *)src, (u32 __user *)dst, - ret, "l", "k", "ir", 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 8); - __uaccess_end(); - return ret; - case 10: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 10); - if (likely(!ret)) { - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); - } - __uaccess_end(); - return ret; - case 16: - __uaccess_begin(); - __put_user_asm(*(u64 *)src, (u64 __user *)dst, - ret, "q", "", "er", 16); - if (likely(!ret)) { - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); - } - __uaccess_end(); - return ret; - default: - return copy_user_generic((__force void *)dst, src, size); - } + return copy_user_generic((__force void *)dst, src, size); } static __always_inline __must_check diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 1835767aa3356bf87f3b2d7ec4db2112223c8c09..083910dc5011461b44eaaa7f9cd4bc493161601a 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -28,6 +28,7 @@ #define CPU_BASED_RDTSC_EXITING 0x00001000 #define CPU_BASED_CR3_LOAD_EXITING 0x00008000 #define CPU_BASED_CR3_STORE_EXITING 0x00010000 +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS 0x00020000 #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 @@ -66,11 +67,17 @@ #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_PT_CONCEAL_VMX 0x00080000 #define SECONDARY_EXEC_XSAVES 0x00100000 +#define SECONDARY_EXEC_PASID_TRANSLATION 0x00200000 #define SECONDARY_EXEC_PT_USE_GPA 0x01000000 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000 +/* + * Definitions of Tertiary Processor-Based VM-Execution Controls. 
+ */ +#define TERTIARY_EXEC_IPI_VIRT BIT((3*32+4) & 0x1f) + #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 @@ -151,6 +158,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc) enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NV = 0x00000002, + LAST_PID_POINTER_INDEX = 0x00000008, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -214,6 +222,14 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + PASID_DIR0 = 0x00002038, + PASID_DIR0_HIGH = 0x00002039, + PASID_DIR1 = 0x0000203a, + PASID_DIR1_HIGH = 0x0000203b, + PID_POINTER_TABLE = 0x00002042, + PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -589,4 +605,42 @@ enum vmx_l1d_flush_state { extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; +/* + * The VMCS PASID Translation Table is a two-level data structure, including + * High/Low PASID Directory and PASID Table. Different fields of the Guest + * PASID are used to locate the PASID Table Entry which has the Host PASID. + * + * High PASID Directory Select - Guest PASID Bit19 + * PASID Directory Entry Index - Guest PASID Bit18-10 + * PASID Table Entry Index - Guest PASID Bit9-0 + */ +#define pasid_high_dir_select(gpasid) (((gpasid) >> 19) & 0x1) +#define pasid_de_idx(gpasid) (((gpasid) >> 10) & 0x1ff) +#define pasid_te_idx(gpasid) ((gpasid) & 0x3ff) +#define MAX_PASID (0xfffff) +/* + * PASID Directory Entry + * + * PASID Table Pointer - PASID Directory Entry Bit M-1:12 + * PASID Table Present - PASID Directory Entry Bit0 + */ +#define PASID_DE_TAB_PTR (((u64)-1) << 12) +#define PASID_DE_TAB_PRESENT (1ULL << 0) +#define PASID_DE_NUM 512 +#define pasid_de_table_ptr(pde) (*(pde) & PASID_DE_TAB_PTR) +#define pasid_de_table_present(pde) (*(pde) & PASID_DE_TAB_PRESENT) + +/* + * PASID Table Entry + * + * Host PASID Valid - PASID Table Entry Bit31 + * Host PASID - PASID Table Entry Bit19-0 + */ +#define PASID_TE_VALID (1ULL << 31) +#define PASID_TE_HOST_PASID (0xfffff) +#define PASID_TE_NUM 1024 +#define pasid_te_hpasid_valid(pte) (*(pte) & PASID_TE_VALID) +#define pasid_te_hpasid(pte) (*(pte) & PASID_TE_HOST_PASID) + #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 96d9cd2086104c9cccddedec7a7bf1bc108d32be..92341731f203f4191f824f509927ed9d35c1dcdd 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -8,6 +8,7 @@ struct mpc_bus; struct mpc_cpu; struct mpc_table; struct cpuinfo_x86; +struct irq_domain; /** * struct x86_init_mpparse - platform specific mpparse ops @@ -53,6 +54,7 @@ struct x86_init_resources { * @trap_init: platform specific trap setup * @intr_mode_select: interrupt delivery mode selection * @intr_mode_init: interrupt delivery mode setup + * @create_pci_msi_domain: Create the PCI/MSI interrupt domain */ struct x86_init_irqs { void (*pre_vector_init)(void); @@ -60,6 +62,7 @@ struct x86_init_irqs { void (*trap_init)(void); void (*intr_mode_select)(void); void (*intr_mode_init)(void); + struct irq_domain *(*create_pci_msi_domain)(void); }; /** @@ -286,7 +289,6 @@ struct pci_dev; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*teardown_msi_irq)(unsigned int irq); void
(*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); }; diff --git a/arch/x86/include/uapi/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h index 580e3c567046703c5e8752512a52d9e6ab88106b..6beb55bbefa40b475fea660075be2836b1771246 100644 --- a/arch/x86/include/uapi/asm/auxvec.h +++ b/arch/x86/include/uapi/asm/auxvec.h @@ -12,9 +12,9 @@ /* entries in ARCH_DLINFO: */ #if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) -# define AT_VECTOR_SIZE_ARCH 2 +# define AT_VECTOR_SIZE_ARCH 3 #else /* else it's non-compat x86-64 */ -# define AT_VECTOR_SIZE_ARCH 1 +# define AT_VECTOR_SIZE_ARCH 2 #endif #endif /* _ASM_X86_AUXVEC_H */ diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h index d95d080b30e3ea833a5ddbe47f95e1195c6870a1..0007ba077c0c2beac746d7d51e9e00809cfb4fed 100644 --- a/arch/x86/include/uapi/asm/debugreg.h +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -24,6 +24,7 @@ #define DR_TRAP3 (0x8) /* db3 */ #define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) +#define DR_BUS_LOCK (0x800) /* bus_lock */ #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 503d3f42da1676791d2c4f4a70bfad35743daf4c..58204fa6dd75d44d164af6eb9b53e22f23caa30b 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -337,9 +337,23 @@ struct kvm_debugregs { __u64 reserved[9]; }; -/* for KVM_CAP_XSAVE */ +/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */ struct kvm_xsave { + /* + * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes + * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * respectively, when invoked on the vm file descriptor. + * + * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * will always be at least 4096. Currently, it is only greater + * than 4096 if a dynamic feature has been enabled with + * ``arch_prctl()``, but this may change in the future. + * + * The offsets of the state save areas in struct kvm_xsave follow + * the contents of CPUID leaf 0xD on the host. 
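+	 *
+	 * A minimal userspace sketch (hypothetical vm_fd/vcpu_fd, error
+	 * handling elided; illustrative only, not part of this patch):
+	 *
+	 *	int sz = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE2);
+	 *	struct kvm_xsave *xs = calloc(1, sz < 4096 ? 4096 : sz);
+	 *	ioctl(vcpu_fd, KVM_GET_XSAVE2, xs);
+	 *	ioctl(vcpu_fd, KVM_SET_XSAVE, xs);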
+ */ __u32 region[1024]; + __u32 extra[0]; }; #define KVM_MAX_XCRS 16 @@ -396,6 +410,9 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 5a6aac9fa41f7b565e5c688462000256f00cbb64..500b96e71f1868dd097a2a796c8da89b5f85bed3 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -2,16 +2,22 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 -#define ARCH_GET_CPUID 0x1011 -#define ARCH_SET_CPUID 0x1012 +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 -#define ARCH_MAP_VDSO_X32 0x2001 -#define ARCH_MAP_VDSO_32 0x2002 -#define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 +#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 + +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 3eb8411ab60efb14e4890c711569bfe3259c8687..66d8f64d75a9ce0f9b378e5892d58071bbcb02c9 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -88,6 +88,8 @@ #define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 +#define EXIT_REASON_ENQCMD_PASID 72 +#define EXIT_REASON_ENQCMDS_PASID 73 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -148,7 +150,9 @@ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ - { EXIT_REASON_TPAUSE, "TPAUSE" } + { EXIT_REASON_TPAUSE, "TPAUSE" }, \ + { EXIT_REASON_ENQCMD_PASID, "ENQCMD_PASID" }, \ + { EXIT_REASON_ENQCMDS_PASID, "ENQCMDS_PASID" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e0b2510aa0b8bb0d982b0aa62637f692fe496755..a7e013bff99f4e0924ede65ae85b5559150c8ac8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a6ac3712db8bcc4afbe1e61273121e289b9627a1..5181d87fd122217c2827b0eba56dc0cc3e55f023 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -185,13 +185,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || !dma_capable(dev, addr, size); + return force_iommu || !dma_capable(dev, addr, size, true); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !dma_capable(dev, addr, size); + return !dma_capable(dev, addr, size, true); } /* Map a single continuous physical area into the IOMMU. 
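The two-level lookup defined by the new vmx.h macros above amounts to a small table walk. A minimal sketch, assuming the PASID directories and tables are already mapped into the host address space, that entries are 64 bits wide, and that the table pointer in a directory entry is directly dereferenceable; the helper name is hypothetical and not part of the patch:

static int guest_pasid_to_host(u64 *dir_lo, u64 *dir_hi, u32 gpasid, u32 *hpasid)
{
	u64 *dir, *pde, *pte;

	if (gpasid > MAX_PASID)
		return -EINVAL;

	/* Guest PASID bit 19 selects the high or low directory. */
	dir = pasid_high_dir_select(gpasid) ? dir_hi : dir_lo;

	/* Guest PASID bits 18-10 index the directory. */
	pde = &dir[pasid_de_idx(gpasid)];
	if (!pasid_de_table_present(pde))
		return -ENOENT;

	/* Guest PASID bits 9-0 index the PASID table. */
	pte = (u64 *)pasid_de_table_ptr(pde) + pasid_te_idx(gpasid);
	if (!pasid_te_hpasid_valid(pte))
		return -ENOENT;

	*hpasid = pasid_te_hpasid(pte);
	return 0;
}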
@@ -332,7 +332,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, int i; if (iommu_start == -1) - return -1; + return -ENOMEM; for_each_sg(start, s, nelems, i) { unsigned long pages, addr; @@ -381,13 +381,13 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s, *ps, *start_sg, *sgmap; - int need = 0, nextneed, i, out, start; + int need = 0, nextneed, i, out, start, ret; unsigned long pages = 0; unsigned int seg_size; unsigned int max_seg_size; if (nents == 0) - return 0; + return -EINVAL; out = 0; start = 0; @@ -415,8 +415,9 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!iommu_merge || !nextneed || !need || s->offset || (s->length + seg_size > max_seg_size) || (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(dev, start_sg, i - start, - sgmap, pages, need) < 0) + ret = dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need); + if (ret < 0) goto error; out++; @@ -433,7 +434,8 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); ps = s; } - if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) + ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need); + if (ret < 0) goto error; out++; flush_gart(); @@ -457,9 +459,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, panic("dma_map_sg: overflow on %lu pages\n", pages); iommu_full(dev, pages << PAGE_SHIFT, dir); - for_each_sg(sg, s, nents, i) - s->dma_address = DMA_MAPPING_ERROR; - return 0; + return ret; } /* allocate and map a coherent mapping */ @@ -469,7 +469,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, { void *vaddr; - vaddr = dma_direct_alloc_pages(dev, size, dma_addr, flag, attrs); + vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); if (!vaddr || !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) return vaddr; @@ -481,7 +481,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, goto out_free; return vaddr; out_free: - dma_direct_free_pages(dev, size, vaddr, *dma_addr, attrs); + dma_direct_free(dev, size, vaddr, *dma_addr, attrs); return NULL; } @@ -491,7 +491,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); - dma_direct_free_pages(dev, size, vaddr, dma_addr, attrs); + dma_direct_free(dev, size, vaddr, dma_addr, attrs); } static int no_agp; @@ -681,6 +681,8 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, }; static void gart_iommu_shutdown(void) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7fafa859e9f259c6a4fb154ae3377c0bf1fcacf5..dfff7f6440cac90124cd5d12439e0f8d0c98d5e1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -1481,6 +1482,9 @@ void __init apic_intr_mode_init(void) break; } + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); + apic_bsp_setup(upmode); } @@ -1640,7 +1644,7 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef 
CONFIG_X86_32 - if (apic->dest_logical) { + if (apic->dest_mode_logical) { int logical_apicid, ldr_apicid; /* @@ -2529,6 +2533,41 @@ int hard_smp_processor_id(void) return read_apic_id(); } +void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical; + msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF; + + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED; + msg->arch_data.vector = cfg->vector; + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. Anything else would + * just be writing to memory if it tried that, and needs IR to + * address higher APIC IDs. + */ + if (dmar) + msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8; + else + WARN_ON_ONCE(cfg->dest_apicid > 0xFF); +} + +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) +{ + u32 dest = msg->arch_addr_lo.destid_0_7; + + if (extid) + dest |= msg->arch_addr_hi.destid_8_31 << 8; + return dest; +} +EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7862b152a052b5754df1c6866ebca724c65114ea..8f72b4351c9fe017735d2b04098cae96afdfcffd 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ static void _flat_send_IPI_mask(unsigned long mask, int vector) unsigned long flags; local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -113,15 +113,13 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -206,15 +204,13 @@ static struct apic apic_physflat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = physflat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 98c9bb75d1854d9ca14c0c43779c8da161dbb954..bd982beb53ab4a6e3905c5404d82934e5ff23320 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -94,19 +94,15 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* logical delivery broadcast to all CPUs: */ - 
.irq_dest_mode = 1, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = noop_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, - .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index cdf45b4700f283967c9606a152eb98d05e69fea3..8a685864f0cb10f61d6f2587eb3b52f6f4d60178 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -246,15 +246,13 @@ static const struct apic apic_numachip1 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -295,15 +293,13 @@ static const struct apic apic_numachip2 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 38b5b51d42f6a33773e55e4ad4214356c86591d7..77555f66c14d7747e8d42137c7d81f58076efe65 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -9,6 +9,7 @@ #include #include +#include #include "local.h" @@ -126,16 +127,13 @@ static struct apic apic_bigsmp __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* phys delivery to target CPU: */ - .irq_dest_mode = 0, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = bigsmp_check_apicid_used, + .check_apicid_used = bigsmp_check_apicid_used, .init_apic_ldr = bigsmp_init_apic_ldr, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0edcf69659eee656b73c6daa38a064dfa3047ff1..41055e93d90254379656c5bba5f744589eb751b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,6 +48,7 @@ #include /* time_after() */ #include #include +#include #include #include @@ -63,7 +64,6 @@ #include #include #include - #include #define for_each_ioapic(idx) \ @@ -89,12 +89,12 @@ struct irq_pin_list { }; struct mp_chip_data { - struct list_head irq_2_pin; - struct IO_APIC_route_entry entry; - int trigger; - int polarity; + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; + bool is_level; + bool active_low; + bool 
isa_irq; u32 count; - bool isa_irq; }; struct mp_ioapic_gsi { @@ -299,31 +299,26 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + entry.w1 = io_apic_read(apic, 0x10 + 2 * pin); + entry.w2 = io_apic_read(apic, 0x11 + 2 * pin); - return eu.entry; + return entry; } static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - eu.entry = __ioapic_read_entry(apic, pin); + entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; + return entry; } /* @@ -334,11 +329,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) */ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu = {{0, 0}}; - - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); } static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) @@ -357,12 +349,12 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) */ static void ioapic_mask_entry(int apic, int pin) { + struct IO_APIC_route_entry e = { .masked = true }; unsigned long flags; - union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -435,20 +427,15 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void io_apic_modify_irq(struct mp_chip_data *data, - int mask_and, int mask_or, +static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, void (*final)(struct irq_pin_list *entry)) { - union entry_union eu; struct irq_pin_list *entry; - eu.entry = data->entry; - eu.w1 &= mask_and; - eu.w1 |= mask_or; - data->entry = eu.entry; + data->entry.masked = masked; for_each_irq_pin(entry, data->irq_2_pin) { - io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1); if (final) final(entry); } @@ -472,13 +459,13 @@ static void mask_ioapic_irq(struct irq_data *irq_data) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, true, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, false, NULL); } static void unmask_ioapic_irq(struct irq_data *irq_data) @@ -519,8 +506,8 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. 
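	 * This is the fallback for IO-APIC versions without a dedicated
	 * EOI register: writing the entry back masked and edge-triggered
	 * clears the remote-IRR bit, after which the saved original
	 * entry is restored.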
*/ - entry1.mask = IOAPIC_MASKED; - entry1.trigger = IOAPIC_EDGE; + entry1.masked = true; + entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); @@ -548,15 +535,15 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) /* Check delivery_mode to be sure we're not clearing an SMI pin */ entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) + if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI) return; /* * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -569,8 +556,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. */ - if (entry.trigger == IOAPIC_EDGE) { - entry.trigger = IOAPIC_LEVEL; + if (!entry.is_level) { + entry.is_level = true; ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -672,8 +659,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); } } @@ -758,44 +745,7 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -#ifdef CONFIG_EISA -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < nr_legacy_irqs()) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always active high edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (IOAPIC_EDGE) -#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always active low level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (IOAPIC_LEVEL) -#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) - -static int irq_polarity(int idx) +static bool irq_active_low(int idx) { int bus = mp_irqs[idx].srcbus; @@ -804,127 +754,176 @@ static int irq_polarity(int idx) */ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { case MP_IRQPOL_DEFAULT: - /* conforms to spec, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - return default_ISA_polarity(idx); - else - return default_PCI_polarity(idx); + /* + * Conforms to spec, ie. bus-type dependent polarity. PCI + * defaults to low active. [E]ISA defaults to high active. 
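+	 *
+	 * I.e.:
+	 *	bus in mp_bus_not_pci ([E]ISA)  -> active high -> false
+	 *	bus not in mp_bus_not_pci (PCI) -> active low  -> true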
+ */ + return !test_bit(bus, mp_bus_not_pci); case MP_IRQPOL_ACTIVE_HIGH: - return IOAPIC_POL_HIGH; + return false; case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); /* fall through */ case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to gcc stupidity */ - return IOAPIC_POL_LOW; + return true; } } #ifdef CONFIG_EISA -static int eisa_irq_trigger(int idx, int bus, int trigger) +/* + * EISA Edge/Level control register, ELCR + */ +static bool EISA_ELCR(unsigned int irq) +{ + if (irq < nr_legacy_irqs()) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return false; +} + +/* + * EISA interrupts are always active high and can be edge or level + * triggered depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must be + * read in from the ELCR. + */ +static bool eisa_irq_is_level(int idx, int bus, bool level) { switch (mp_bus_id_to_type[bus]) { case MP_BUS_PCI: case MP_BUS_ISA: - return trigger; + return level; case MP_BUS_EISA: - return default_EISA_trigger(idx); + return EISA_ELCR(mp_irqs[idx].srcbusirq); } pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); - return IOAPIC_LEVEL; + return true; } #else -static inline int eisa_irq_trigger(int idx, int bus, int trigger) +static inline bool eisa_irq_is_level(int idx, int bus, bool level) { - return trigger; + return level; } #endif -static int irq_trigger(int idx) +static bool irq_is_level(int idx) { int bus = mp_irqs[idx].srcbus; - int trigger; + bool level; /* * Determine IRQ trigger mode (edge or level sensitive): */ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { case MP_IRQTRIG_DEFAULT: - /* conforms to spec, ie. bus-type dependent trigger mode */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); + /* + * Conforms to spec, ie. bus-type dependent trigger + * mode. PCI defaults to level, ISA to edge.
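+	 *
+	 * I.e.:
+	 *	bus in mp_bus_not_pci (ISA)     -> edge  -> false
+	 *	bus not in mp_bus_not_pci (PCI) -> level -> true
+	 * (EISA conforming entries are refined below via the ELCR.)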
+ */ + level = !test_bit(bus, mp_bus_not_pci); /* Take EISA into account */ - return eisa_irq_trigger(idx, bus, trigger); + return eisa_irq_is_level(idx, bus, level); case MP_IRQTRIG_EDGE: - return IOAPIC_EDGE; + return false; case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); /* fall through */ case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to gcc stupidity */ - return IOAPIC_LEVEL; + return true; } } +static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_is_level(idx); + *polarity = irq_active_low(idx); + return 0; +} + +#ifdef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low) +{ + *is_level = *active_low = 0; + return __acpi_get_override_irq(gsi, (bool *)is_level, + (bool *)active_low); +} +#endif + void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity) { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info->ioapic_node = node; - info->ioapic_trigger = trigger; - info->ioapic_polarity = polarity; - info->ioapic_valid = 1; + info->ioapic.node = node; + info->ioapic.is_level = trigger; + info->ioapic.active_low = polarity; + info->ioapic.valid = 1; } -#ifndef CONFIG_ACPI -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); -#endif - static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, struct irq_alloc_info *src, u32 gsi, int ioapic_idx, int pin) { - int trigger, polarity; + bool level, pol_low; copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - dst->ioapic_id = mpc_ioapic_id(ioapic_idx); - dst->ioapic_pin = pin; - dst->ioapic_valid = 1; - if (src && src->ioapic_valid) { - dst->ioapic_node = src->ioapic_node; - dst->ioapic_trigger = src->ioapic_trigger; - dst->ioapic_polarity = src->ioapic_polarity; + dst->devid = mpc_ioapic_id(ioapic_idx); + dst->ioapic.pin = pin; + dst->ioapic.valid = 1; + if (src && src->ioapic.valid) { + dst->ioapic.node = src->ioapic.node; + dst->ioapic.is_level = src->ioapic.is_level; + dst->ioapic.active_low = src->ioapic.active_low; } else { - dst->ioapic_node = NUMA_NO_NODE; - if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic_trigger = trigger; - dst->ioapic_polarity = polarity; + dst->ioapic.node = NUMA_NO_NODE; + if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) { + dst->ioapic.is_level = level; + dst->ioapic.active_low = pol_low; } else { /* * PCI interrupts are always active low level * triggered. */ - dst->ioapic_trigger = IOAPIC_LEVEL; - dst->ioapic_polarity = IOAPIC_POL_LOW; + dst->ioapic.is_level = true; + dst->ioapic.active_low = true; } } } static int ioapic_alloc_attr_node(struct irq_alloc_info *info) { - return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; + return (info && info->ioapic.valid) ?
info->ioapic.node : NUMA_NO_NODE; } -static void mp_register_handler(unsigned int irq, unsigned long trigger) +static void mp_register_handler(unsigned int irq, bool level) { irq_flow_handler_t hdl; bool fasteoi; - if (trigger) { + if (level) { irq_set_status_flags(irq, IRQ_LEVEL); fasteoi = true; } else { @@ -946,14 +945,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic_trigger != data->trigger) - mp_register_handler(irq, info->ioapic_trigger); - data->entry.trigger = data->trigger = info->ioapic_trigger; - data->entry.polarity = data->polarity = info->ioapic_polarity; + if (info->ioapic.is_level != data->is_level) + mp_register_handler(irq, info->ioapic.is_level); + data->entry.is_level = data->is_level = info->ioapic.is_level; + data->entry.active_low = data->active_low = info->ioapic.active_low; } - return data->trigger == info->ioapic_trigger && - data->polarity == info->ioapic_polarity; + return data->is_level == info->ioapic.is_level && + data->active_low == info->ioapic.active_low; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -1015,7 +1014,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, if (!mp_check_pin_attr(irq, info)) return -EBUSY; if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, - info->ioapic_pin)) + info->ioapic.pin)) return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; @@ -1242,10 +1241,9 @@ void ioapic_zap_locks(void) static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { - int i; - char buf[256]; struct IO_APIC_route_entry entry; - struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + char buf[256]; + int i; printk(KERN_DEBUG "IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { @@ -1253,20 +1251,20 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", i, - entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", - entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", - entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.masked ? "disabled" : "enabled ", + entry.is_level ? "level" : "edge ", + entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); - if (ir_entry->format) + if (entry.ir_format) { printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index2 << 15) | ir_entry->index, - ir_entry->zero); - else - printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, - entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? - "logical " : "physical", - entry.dest, entry.delivery_mode); + (entry.ir_index_15 << 15) | entry.ir_index_0_14, + entry.ir_zero); + } else { + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physical", + entry.destid_0_7, entry.delivery_mode); + } } } @@ -1391,7 +1389,8 @@ void __init enable_IO_APIC(void) /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. 
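	 * (ExtINT is the legacy virtual-wire mode: the 8259A PIC output
	 * is routed through this IO-APIC pin.)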
*/ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + if (!entry.masked && + entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; @@ -1435,12 +1434,12 @@ void native_restore_boot_irq_mode(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = IOAPIC_UNMASKED; - entry.trigger = IOAPIC_EDGE; - entry.polarity = IOAPIC_POL_HIGH; - entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry.delivery_mode = dest_ExtINT; - entry.dest = read_apic_id(); + entry.masked = false; + entry.is_level = false; + entry.active_low = false; + entry.dest_mode_logical = false; + entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry.destid_0_7 = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1719,13 +1718,13 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; + struct IO_APIC_route_entry e; int pin; pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); + e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { + if (e.irr) { raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } @@ -1872,21 +1871,58 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +/* + * The I/OAPIC is just a device for generating MSI messages from legacy + * interrupt pins. Various fields of the RTE translate into bits of the + * resulting MSI which had a historical meaning. + * + * With interrupt remapping, many of those bits have different meanings + * in the underlying MSI, but the way that the I/OAPIC transforms them + * from its RTE to the MSI message is the same. This function allows + * the parent IRQ domain to compose the MSI message, then takes the + * relevant bits to put them in the appropriate places in the RTE in + * order to generate that message when the IRQ happens. + * + * The setup here relies on a preconfigured route entry (is_level, + * active_low, masked) because the parent domain is merely composing the + * generic message routing information which is used for the MSI. + */ +static void ioapic_setup_msg_from_msi(struct irq_data *irq_data, + struct IO_APIC_route_entry *entry) +{ + struct msi_msg msg; + + /* Let the parent domain compose the MSI message */ + irq_chip_compose_msi_msg(irq_data, &msg); + + /* + * - Real vector + * - DMAR/IR: 8bit subhandle (ioapic.pin) + * - AMD/IR: 8bit IRTE index + */ + entry->vector = msg.arch_data.vector; + /* Delivery mode (for DMAR/IR all 0) */ + entry->delivery_mode = msg.arch_data.delivery_mode; + /* Destination mode or DMAR/IR index bit 15 */ + entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical; + /* DMAR/IR: 1, 0 for all other modes */ + entry->ir_format = msg.arch_addr_lo.dmar_format; + /* + * DMAR/IR: index bit 0-14. + * + * All other modes have bit 0-6 of dmar_index_0_14 cleared and the + * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7). + */ + entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14; +} + static void ioapic_configure_entry(struct irq_data *irqd) { struct mp_chip_data *mpd = irqd->chip_data; - struct irq_cfg *cfg = irqd_cfg(irqd); struct irq_pin_list *entry; - /* - * Only update when the parent is the vector domain, don't touch it - * if the parent is the remapping domain. Check the installed - * ioapic chip to verify that. 
- */ - if (irqd->chip == &ioapic_chip) { - mpd->entry.dest = cfg->dest_apicid; - mpd->entry.vector = cfg->vector; - } + ioapic_setup_msg_from_msi(irqd, &mpd->entry); + for_each_irq_pin(entry, mpd->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } @@ -1942,7 +1978,7 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, * irrelevant because the IO-APIC treats them as fire and * forget. */ - if (rentry.irr && rentry.trigger) { + if (rentry.irr && rentry.is_level) { *state = true; break; } @@ -2067,12 +2103,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry1.mask = IOAPIC_UNMASKED; - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = IOAPIC_EDGE; + entry1.dest_mode_logical = true; + entry1.masked = false; + entry1.destid_0_7 = hard_smp_processor_id(); + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry1.active_low = entry0.active_low; + entry1.is_level = false; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2115,8 +2151,8 @@ static int mp_alloc_timer_irq(int ioapic, int pin) struct irq_alloc_info info; ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); - info.ioapic_id = mpc_ioapic_id(ioapic); - info.ioapic_pin = pin; + info.devid = mpc_ioapic_id(ioapic); + info.ioapic.pin = pin; mutex_lock(&ioapic_mutex); irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); mutex_unlock(&ioapic_mutex); @@ -2201,9 +2237,9 @@ static inline void __init check_timer(void) * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ - int idx; - idx = find_irq_entry(apic1, pin1, mp_INT); - if (idx != -1 && irq_trigger(idx)) + int idx = find_irq_entry(apic1, pin1, mp_INT); + + if (idx != -1 && irq_is_level(idx)) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); @@ -2307,36 +2343,37 @@ static inline void __init check_timer(void) static int mp_irqdomain_create(int ioapic) { - struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); struct fwnode_handle *fn; - char *name = "IO-APIC"; + struct irq_fwspec fwspec; if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info.ioapic_id = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_ir_irq_domain(&info); - if (!parent) - parent = x86_vector_domain; - else - name = "IO-APIC-IR"; - /* Handle device tree enumerated APICs proper */ if (cfg->dev) { fn = of_node_to_fwnode(cfg->dev); } else { - fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) return -ENOMEM; } + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = mpc_ioapic_id(ioapic); + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENODEV; + } + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); @@ -2610,30 +2647,6 @@ static int io_apic_get_version(int ioapic) return reg_01.bits.version; } -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) -{ - int ioapic, pin, idx; - - if (skip_ioapic_setup) - return -1; - - ioapic = 
mp_find_ioapic(gsi); - if (ioapic < 0) - return -1; - - pin = mp_find_ioapic_pin(ioapic, gsi); - if (pin < 0) - return -1; - - idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - return -1; - - *trigger = irq_trigger(idx); - *polarity = irq_polarity(idx); - return 0; -} - /* * This function updates target affinity of IOAPIC interrupts to include * the CPUs which came online during SMP bringup. @@ -2956,45 +2969,50 @@ int mp_ioapic_registered(u32 gsi_base) static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { - if (info && info->ioapic_valid) { - data->trigger = info->ioapic_trigger; - data->polarity = info->ioapic_polarity; - } else if (acpi_get_override_irq(gsi, &data->trigger, - &data->polarity) < 0) { + if (info && info->ioapic.valid) { + data->is_level = info->ioapic.is_level; + data->active_low = info->ioapic.active_low; + } else if (__acpi_get_override_irq(gsi, &data->is_level, + &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. */ - data->trigger = IOAPIC_LEVEL; - data->polarity = IOAPIC_POL_LOW; + data->is_level = true; + data->active_low = true; } } -static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, - struct IO_APIC_route_entry *entry) +/* + * Configure the I/O-APIC specific fields in the routing entry. + * + * This is important to set up the I/O-APIC specific bits (is_level, + * active_low, masked) because the underlying parent domain will only + * provide the routing information and is oblivious of the I/O-APIC + * specific bits. + * + * The entry is just preconfigured at this point and not written into the + * RTE. This happens later during activation which will fill in the actual + * routing information. + */ +static void mp_preconfigure_entry(struct mp_chip_data *data) { + struct IO_APIC_route_entry *entry = &data->entry; + memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = cfg->dest_apicid; - entry->vector = cfg->vector; - entry->trigger = data->trigger; - entry->polarity = data->polarity; + entry->is_level = data->is_level; + entry->active_low = data->active_low; /* * Mask level triggered irqs. Edge triggered irqs are masked * by the irq core code in case they fire. */
*/ - if (data->trigger == IOAPIC_LEVEL) - entry->mask = IOAPIC_MASKED; - else - entry->mask = IOAPIC_UNMASKED; + entry->masked = data->is_level; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - int ret, ioapic, pin; - struct irq_cfg *cfg; - struct irq_data *irq_data; - struct mp_chip_data *data; struct irq_alloc_info *info = arg; + struct mp_chip_data *data; + struct irq_data *irq_data; + int ret, ioapic, pin; unsigned long flags; if (!info || nr_irqs > 1) @@ -3004,7 +3022,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; ioapic = mp_irqdomain_ioapic_idx(domain); - pin = info->ioapic_pin; + pin = info->ioapic.pin; if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) return -EEXIST; @@ -3012,7 +3030,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic_entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -3020,28 +3037,26 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, } INIT_LIST_HEAD(&data->irq_2_pin); - irq_data->hwirq = info->ioapic_pin; + irq_data->hwirq = info->ioapic.pin; irq_data->chip = (domain->parent == x86_vector_domain) ? &ioapic_chip : &ioapic_ir_chip; irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - cfg = irqd_cfg(irq_data); add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + mp_preconfigure_entry(data); + mp_register_handler(virq, data->is_level); + local_irq_save(flags); - if (info->ioapic_entry) - mp_setup_entry(cfg, data, info->ioapic_entry); - mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); local_irq_restore(flags); apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", - ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, - virq, data->trigger, data->polarity, cfg->dest_apicid); - + "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, + data->is_level, data->active_low); return 0; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 6ca0f91372fd2093d09ea1d9d07122ab30422f25..d1fb874fbe64bafac3248120872ecbdb94ebae3b 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -2,6 +2,7 @@ #include #include +#include #include "local.h" @@ -259,7 +260,7 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -278,7 +279,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, continue; __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); } local_irq_restore(flags); } @@ -296,7 +297,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 
04797f05ce942d183e8c512dd86f947d11556691..a997d849509a66c996872504e14dc3b4b60fc26a 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -10,6 +10,7 @@ #include +#include #include /* APIC flat 64 */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index a20873bbbed6739c19e6ec7921699e012e7715cb..5691a2bc6d257599526ef41d87f0dbfef52089c4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -15,46 +15,18 @@ #include #include #include -#include #include #include #include #include -static struct irq_domain *msi_default_domain; - -static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - MSI_DATA_DELIVERY_FIXED | - MSI_DATA_VECTOR(cfg->vector); -} - -static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) -{ - __irq_msi_compose_msg(irqd_cfg(data), msg); -} +struct irq_domain *x86_pci_msi_default_domain __ro_after_init; static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { struct msi_msg msg[2] = { [1] = { }, }; - __irq_msi_compose_msg(cfg, msg); + __irq_msi_compose_msg(cfg, msg, false); irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); } @@ -86,11 +58,13 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * The quirk bit is not set in this case. * - The new vector is the same as the old vector * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The interrupt is not yet started up * - The new destination CPU is the same as the old destination CPU */ if (!irqd_msi_nomask_quirk(irqd) || cfg->vector == old_cfg.vector || old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + !irqd_is_started(irqd) || cfg->dest_apicid == old_cfg.dest_apicid) { irq_msi_update_msg(irqd, cfg); return ret; @@ -176,69 +150,44 @@ static struct irq_chip pci_msi_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +static void pci_msi_prepare(struct device *dev, msi_alloc_info_t *arg) { - struct irq_domain *domain; - struct irq_alloc_info info; + struct msi_desc *desc = first_msi_entry(dev); - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_MSI; - info.msi_dev = dev; - - domain = irq_remapping_get_irq_domain(&info); - if (domain == NULL) - domain = msi_default_domain; - if (domain == NULL) - return -ENOSYS; - - return msi_domain_alloc_irqs(domain, &dev->dev, nvec); -} - -void native_teardown_msi_irq(unsigned int irq) -{ - irq_domain_free_irqs(irq, 1); + if (desc->msi_attrib.is_msix) { + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + } else { + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + } } -static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) +static void dev_msi_prepare(struct device *dev, msi_alloc_info_t *arg) { - 
return arg->msi_hwirq; + arg->type = X86_IRQ_ALLOC_TYPE_DEV_MSI; } -int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, +int x86_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { - struct pci_dev *pdev = to_pci_dev(dev); - struct msi_desc *desc = first_pci_msi_entry(pdev); - init_irq_alloc_info(arg, NULL); - arg->msi_dev = pdev; - if (desc->msi_attrib.is_msix) { - arg->type = X86_IRQ_ALLOC_TYPE_MSIX; - } else { - arg->type = X86_IRQ_ALLOC_TYPE_MSI; - arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; - } - return 0; -} -EXPORT_SYMBOL_GPL(pci_msi_prepare); + if (dev_is_pci(dev)) + pci_msi_prepare(dev, arg); + else + dev_msi_prepare(dev, arg); -void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) -{ - arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); + return 0; } -EXPORT_SYMBOL_GPL(pci_msi_set_desc); +EXPORT_SYMBOL_GPL(x86_msi_prepare); static struct msi_domain_ops pci_msi_domain_ops = { - .get_hwirq = pci_msi_get_hwirq, - .msi_prepare = pci_msi_prepare, - .set_desc = pci_msi_set_desc, + .msi_prepare = x86_msi_prepare, }; static struct msi_domain_info pci_msi_domain_info = { @@ -250,25 +199,32 @@ static struct msi_domain_info pci_msi_domain_info = { .handler_name = "edge", }; -void __init arch_init_msi_domain(struct irq_domain *parent) +struct irq_domain * __init native_create_pci_msi_domain(void) { struct fwnode_handle *fn; + struct irq_domain *d; if (disable_apic) - return; + return NULL; fn = irq_domain_alloc_named_fwnode("PCI-MSI"); - if (fn) { - msi_default_domain = - pci_msi_create_irq_domain(fn, &pci_msi_domain_info, - parent); - } - if (!msi_default_domain) { + if (!fn) + return NULL; + + d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, + x86_vector_domain); + if (!d) { irq_domain_free_fwnode(fn); - pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); + pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); } else { - msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; } + return d; +} + +void __init x86_create_pci_msi_domain(void) +{ + x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); } #ifdef CONFIG_IRQ_REMAP @@ -279,7 +235,8 @@ static struct irq_chip pci_msi_ir_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static struct msi_domain_info pci_msi_ir_domain_info = { @@ -308,6 +265,17 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, #endif #ifdef CONFIG_DMAR_TABLE +/* + * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the + * high bits of the destination APIC ID. This can't be done in the general + * case for MSIs as it would be targeting real memory above 4GiB not the + * APIC. 
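+ *
+ * For example, with cfg->dest_apicid == 0x1234, __irq_msi_compose_msg()
+ * stores destid_0_7 = 0x34 in arch_addr_lo and, only when dmar == true,
+ * destid_8_31 = 0x12 in arch_addr_hi.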
+ */ +static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, true); +} + static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -320,35 +288,30 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; -static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->dmar_id; -} - static int dmar_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, - handle_edge_irq, arg->dmar_data, "edge"); + irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); return 0; } static struct msi_domain_ops dmar_msi_domain_ops = { - .get_hwirq = dmar_msi_get_hwirq, .msi_init = dmar_msi_init, }; static struct msi_domain_info dmar_msi_domain_info = { .ops = &dmar_msi_domain_ops, .chip = &dmar_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, }; static struct irq_domain *dmar_get_irq_domain(void) @@ -383,8 +346,9 @@ int dmar_alloc_hwirq(int id, int node, void *arg) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_DMAR; - info.dmar_id = id; - info.dmar_data = arg; + info.devid = id; + info.hwirq = id; + info.data = arg; return irq_domain_alloc_irqs(domain, 1, node, &info); } @@ -394,121 +358,3 @@ void dmar_free_hwirq(int irq) irq_domain_free_irqs(irq, 1); } #endif - -/* - * MSI message composition - */ -#ifdef CONFIG_HPET_TIMER -static inline int hpet_dev_id(struct irq_domain *domain) -{ - struct msi_domain_info *info = msi_get_domain_info(domain); - - return (int)(long)info->data; -} - -static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) -{ - hpet_msi_write(irq_data_get_irq_handler_data(data), msg); -} - -static struct irq_chip hpet_msi_controller __ro_after_init = { - .name = "HPET-MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, - .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->hpet_index; -} - -static int hpet_msi_init(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq, - irq_hw_number_t hwirq, msi_alloc_info_t *arg) -{ - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, - handle_edge_irq, arg->hpet_data, "edge"); - - return 0; -} - -static void hpet_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); -} - -static struct msi_domain_ops hpet_msi_domain_ops = { - .get_hwirq = hpet_msi_get_hwirq, - .msi_init = hpet_msi_init, - .msi_free = hpet_msi_free, -}; - -static struct msi_domain_info hpet_msi_domain_info = { - .ops = &hpet_msi_domain_ops, - 
.chip = &hpet_msi_controller, -}; - -struct irq_domain *hpet_create_irq_domain(int hpet_id) -{ - struct msi_domain_info *domain_info; - struct irq_domain *parent, *d; - struct irq_alloc_info info; - struct fwnode_handle *fn; - - if (x86_vector_domain == NULL) - return NULL; - - domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); - if (!domain_info) - return NULL; - - *domain_info = hpet_msi_domain_info; - domain_info->data = (void *)(long)hpet_id; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_id = hpet_id; - parent = irq_remapping_get_ir_irq_domain(&info); - if (parent == NULL) - parent = x86_vector_domain; - else - hpet_msi_controller.name = "IR-HPET-MSI"; - - fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, - hpet_id); - if (!fn) { - kfree(domain_info); - return NULL; - } - - d = msi_create_irq_domain(fn, domain_info, parent); - if (!d) { - irq_domain_free_fwnode(fn); - kfree(domain_info); - } - return d; -} - -int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, - int dev_num) -{ - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_data = hc; - info.hpet_id = hpet_dev_id(domain); - info.hpet_index = dev_num; - - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); -} -#endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 67b33d67002f9dbe52d267ae30719bc17756b3a6..3a179e238a9c90f161667c0924e2298581214bf1 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -68,16 +69,13 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -169,9 +167,6 @@ void __init default_setup_apic_routing(void) if (apic->setup_apic_routing) apic->setup_apic_routing(); - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 29f0e0984557ebb152d7fcf971398d0e1cf5b63b..fdf09158bc4588c077dfb1d4717e82c68ef56ace 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -31,9 +31,6 @@ void __init default_setup_apic_routing(void) break; } } - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index bf6662d37a3346b50605320e633bfc2fccd469cf..0e1e33f4734bbc1893fffcd3ff5b40dfea65eb19 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -633,7 +633,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) 
{ + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "IO-APIC-", 8) && + simple_strtol(fwname+8, NULL, 10) == fwspec->param[0]; + } + return to_of_node(fwspec->fwnode) && + of_device_is_compatible(to_of_node(fwspec->fwnode), + "intel,ce4100-ioapic"); +} + +int x86_fwspec_is_hpet(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "HPET-MSI-", 9) && + simple_strtol(fwname+9, NULL, 10) == fwspec->param[0]; + } + return 0; +} + +static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + /* + * HPET and I/OAPIC cannot be parented in the vector domain + * if IRQ remapping is enabled. APIC IDs above 15 bits are + * only permitted if IRQ remapping is enabled, so check that. + */ + if (apic->apic_id_valid(32768)) + return 0; + + return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); +} + static const struct irq_domain_ops x86_vector_domain_ops = { + .select = x86_vector_select, .alloc = x86_vector_alloc_irqs, .free = x86_vector_free_irqs, .activate = x86_vector_activate, @@ -711,8 +754,6 @@ int __init arch_early_irq_init(void) BUG_ON(x86_vector_domain == NULL); irq_set_default_host(x86_vector_domain); - arch_init_msi_domain(x86_vector_domain); - BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); /* @@ -817,10 +858,17 @@ void apic_ack_edge(struct irq_data *irqd) apic_ack_irq(irqd); } +static void x86_vector_msi_compose_msg(struct irq_data *data, + struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, false); +} + static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, .irq_retrigger = apic_retrigger_irq, }; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 7eec3c154fa2477bb4fb5b268a814521df39c01e..f4da9bb69a8859ff10824315388aeb49c2ccfad9 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -63,7 +63,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) if (!dest) continue; - __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } @@ -186,15 +186,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 032a00e5d9fa6603e8e7662ebdf9d93c5b6a4901..6bde05a86b4edde5b0ea91b4f088424a97c4a216 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -160,15 +160,13 @@ static struct apic apic_x2apic_phys __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - 
.irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e6230af1986468d2f0a03464988dcd576e7ca081..83cf4e4bdf56e9ff3248eb57015e38230995813c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -651,7 +651,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .irq_dest_mode = 0, /* Physical */ .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, + .dest_logical = APIC_DEST_PHYSICAL, .check_apicid_used = NULL, .init_apic_ldr = uv_init_apic_ldr, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index eea739a66a4b68e15c6e508d8aec284f7c208539..e18408e98b84fd81a33517ca3e514ff046c4c576 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -574,6 +575,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; } } + + resctrl_cpu_detect(c); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b290aa9bfb1cc48f53230096c5a68e9b4d89c154..dcb1d2452550719cae8ef021342b31491517d8fb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8ec156923ebbd3fac263cf3bdf246dd041addc4d..4ec0c7dc250db692071db15dff4550fc432fb30a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -39,7 +40,7 @@ #include #include #include -#include +#include #include #include #include @@ -57,6 +58,7 @@ #ifdef CONFIG_X86_LOCAL_APIC #include #endif +#include #ifdef CONFIG_NUMA_AWARE_SPINLOCKS #include @@ -464,27 +466,22 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { - struct pkru_state *pk; + if (c == &boot_cpu_data) { + if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU)) + return; + /* + * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid + * bit to be set. Enforce it. + */ + setup_force_cpu_cap(X86_FEATURE_OSPKE); - /* check the boot processor, plus compile options for PKU: */ - if (!cpu_feature_enabled(X86_FEATURE_PKU)) - return; - /* checks the actual processor's cpuid bits: */ - if (!cpu_has(c, X86_FEATURE_PKU)) - return; - if (pku_disabled) + } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) { return; + } cr4_set_bits(X86_CR4_PKE); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (pk) - pk->pkru = init_pkru_value; - /* - * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE - * cpuid bit to be set. We need to ensure that we - * update that bit in this CPU's "cpu_info". 
- */ - set_cpu_cap(c, X86_FEATURE_OSPKE); + /* Load the default PKRU value */ + pkru_write_default(); } #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -585,8 +582,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; -__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; +/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */ +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); +__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); void load_percpu_segment(int cpu) { @@ -892,30 +890,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } -static void init_cqm(struct cpuinfo_x86 *c) -{ - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - return; - } - - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = cpuid_ebx(0xf); - - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || - cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { - u32 eax, ebx, ecx, edx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } -} - void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -983,7 +957,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. @@ -1243,6 +1216,59 @@ static void detect_nopl(void) #endif } +/* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + char *argptr = arg; + int arglen, res, bit; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + + pr_info("Clearing CPUID bits:"); + do { + res = get_option(&argptr, &bit); + if (res == 0 || res == 3) + break; + + /* If the argument was too long, the last bit may be cut off */ + if (res == 1 && arglen >= sizeof(arg)) + break; + + if (bit >= 0 && bit < NCAPINTS * 32) { + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + setup_clear_cpu_cap(bit); + } + } while (res == 2); + pr_cont("\n"); +} + /* * Do minimum CPU detection early. 
* Fields really needed: vendor, cpuid_level, family, model, mask, @@ -1278,6 +1304,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + cpu_parse_early_param(); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1295,8 +1322,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); + sld_setup(c); + fpu__init_system(c); + init_sigframe_size(); + #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says @@ -1367,7 +1398,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) * where GS is unused by the prev and next threads. * * Since neither vendor documents this anywhere that I can see, - * detect it directly instead of hardcoding the choice by + * detect it directly instead of hard-coding the choice by * vendor. * * I've designated AMD's behavior as the "bug" because it's @@ -1446,20 +1477,6 @@ static void generic_identify(struct cpuinfo_x86 *c) #endif } -static void x86_init_cache_qos(struct cpuinfo_x86 *c) -{ - /* - * The heavy lifting of max_rmid and cache_occ_scale are handled - * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu - * in case CQM bits really aren't there in this CPU. - */ - if (c != &boot_cpu_data) { - boot_cpu_data.x86_cache_max_rmid = - min(boot_cpu_data.x86_cache_max_rmid, - c->x86_cache_max_rmid); - } -} - /* * Validate that ACPI/mptables have the same information about the * effective APIC id and update the package map. @@ -1579,7 +1596,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif x86_init_rdrand(c); - x86_init_cache_qos(c); setup_pku(c); /* @@ -1706,9 +1722,8 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in fpu__init_parse_early_param. - * But we need to keep a dummy __setup around otherwise it would - * show up as an environment variable for init. + * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy + * function prevents it from becoming an environment variable for init. 
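+ * + * E.g. "clearcpuid=154,155" is parsed by cpu_parse_early_param(), which + * clears feature bits 154 and 155 via setup_clear_cpu_cap(); by the time + * setup_clearcpuid() runs there is nothing left to do.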
*/ static __init int setup_clearcpuid(char *arg) { diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 3cbe24ca80abd9b1033657a080da305e29d31a3c..91ae604925f3f4d53dbe7f70423915388fd48875 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,11 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, + { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, {} }; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 62e9a982adaf92bad1394142ff58d99d74f81f75..dc0840aae26c14e8c8b6fb9ae6a7b4996a9edc5e 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f3f42e2169923e41890b694e8ab571327a35a575..468783c45cb594e2e75e1eac254b67fd00cf11ca 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,10 @@ #include #include #include +#include +#include +#include +#include #ifdef CONFIG_X86_64 #include @@ -31,6 +36,28 @@ #include #endif +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, + sld_ratelimit, +}; + +/* + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + /* * Just in case our CPU detection goes bad, or you have a weird system, * allow a way to override the automatic disabling of MPX.
@@ -341,6 +368,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) detect_ht_early(c); } +static void bsp_init_intel(struct cpuinfo_x86 *c) +{ + resctrl_cpu_detect(c); +} + #ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 @@ -652,6 +684,9 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } +static void split_lock_init(void); +static void bus_lock_init(void); + static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -769,6 +804,9 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_enable(); if (tsx_ctrl_state == TSX_CTRL_DISABLE) tsx_disable(); + + split_lock_init(); + bus_lock_init(); } #ifdef CONFIG_X86_32 @@ -1025,8 +1063,326 @@ static const struct cpu_dev intel_cpu_dev = { #endif .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, + .c_bsp_init = bsp_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); + +#undef pr_fmt +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, +}; + +static struct ratelimit_state bld_ratelimit; + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } + + return len == arglen; +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init sld_state_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + sld_state = state; +} + +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. 
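+ * + * E.g. under sld_warn, a sibling that trips #AC has split_lock_warn() clear + * the enable bit core-wide; the other sibling then also runs with detection + * off until a context switch calls switch_to_sld() and re-enables it. Such + * transient glitches are accepted.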
+ */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +static void split_lock_init(void) +{ + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void split_lock_warn(unsigned long ip) +{ + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + + /* + * Disable the split lock detection for this task so it can make + * progress and set TIF_SLD so the detection is re-enabled via + * switch_to_sld() when the task is scheduled out. + */ + sld_update_msr(false); + set_tsk_thread_flag(current, TIF_SLD); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +static void bus_lock_init(void) +{ + u64 val; + + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) || + (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) + return; + + /* + * Enable #DB for bus lock. All bus locks are handled in #DB except + * split locks are handled in #AC in the fatal case. + */ + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + +/* + * This function is called only when switching between tasks with + * different split-lock detection modes. It sets the MSR for the + * mode of the new task. This is right most of the time, but since + * the MSR is shared by hyperthreads on a physical core there can + * be glitches when the two threads need different modes. + */ +void switch_to_sld(unsigned long tifn) +{ + sld_update_msr(!(tifn & _TIF_SLD)); +} + +/* + * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should + * only be trusted if it is confirmed that a CPU model implements a + * specific feature at a particular bit position. + * + * The possible driver data field values: + * + * - 0: CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. 
+ * + * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use + * bit 5 to enumerate the per-core split-lock detection feature. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_X, X86_FEATURE_ANY, 0}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_L, X86_FEATURE_ANY, 0}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_D, X86_FEATURE_ANY, 0}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT_D, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT_L, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_TIGERLAKE_L, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_TIGERLAKE, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_SAPPHIRERAPIDS_X, X86_FEATURE_ANY, 1}, + {X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE, X86_FEATURE_ANY, 1}, + {} +}; + +static void __init split_lock_setup(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + m = x86_match_cpu(split_lock_cpu_ids); + if (!m) + return; + + switch (m->driver_data) { + case 0: + break; + case 1: + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)) + return; + break; + default: + return; + } + + cpu_model_supports_sld = true; + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: warning on user-space bus_locks\n"); + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? 
+ " from non-WB" : ""); + } + break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2e6371a2f123f85e70a10597b6ffd8cb55ac2879..4529f1103bef8b57aa8380293a55f44052397fae 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -167,8 +168,6 @@ void mce_inject_log(struct mce *m) } EXPORT_SYMBOL_GPL(mce_inject_log); -static struct notifier_block mce_srao_nb; - void mce_register_decode_chain(struct notifier_block *nb) { if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) @@ -379,13 +378,16 @@ static int msr_to_offset(u32 msr) return -1; } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { - pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } show_stack_regs(regs); @@ -393,21 +395,28 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, while (true) cpu_relax(); - - return true; } /* MSR access wrappers used for error injection */ -static u64 mce_rdmsrl(u32 msr) +static noinstr u64 mce_rdmsrl(u32 msr) { DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + u64 ret; + + instrumentation_begin(); + offset = msr_to_offset(msr); if (offset < 0) - return 0; - return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + + instrumentation_end(); + + return ret; } /* @@ -417,41 +426,28 @@ static u64 mce_rdmsrl(u32 msr) */ asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, - regs->ip, (void *)regs->ip); - - show_stack_regs(regs); - - panic("MCA architectural violation!\n"); - - while (true) - cpu_relax(); - - return true; -} - -static void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrl(u32 msr, u64 v) { u32 low, high; if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + instrumentation_begin(); + + offset = msr_to_offset(msr); if (offset >= 0) *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + 
return; } @@ -461,7 +457,7 @@ static void mce_wrmsrl(u32 msr, u64 v) /* See comment in mce_rdmsrl() */ asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) : : "c" (msr), "a"(low), "d" (high) : "memory"); } @@ -617,28 +613,30 @@ static struct notifier_block early_nb = { .priority = MCE_PRIO_EARLY, }; -static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, +static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct mce *mce = (struct mce *)data; unsigned long pfn; - if (!mce) + if (!mce || !mce_usable_address(mce)) return NOTIFY_DONE; - if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { - pfn = mce->addr >> PAGE_SHIFT; - if (!memory_failure(pfn, 0)) { - set_mce_nospec(pfn, whole_page(mce)); - mce->kflags |= MCE_HANDLED_UC; - } + if (mce->severity != MCE_AO_SEVERITY && + mce->severity != MCE_DEFERRED_SEVERITY) + return NOTIFY_DONE; + + pfn = mce->addr >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) { + set_mce_nospec(pfn, whole_page(mce)); + mce->kflags |= MCE_HANDLED_UC; } return NOTIFY_OK; } -static struct notifier_block mce_srao_nb = { - .notifier_call = srao_decode_notifier, - .priority = MCE_PRIO_SRAO, +static struct notifier_block mce_uc_nb = { + .notifier_call = uc_decode_notifier, + .priority = MCE_PRIO_UC, }; static int mce_default_notifier(struct notifier_block *nb, unsigned long val, @@ -1214,6 +1212,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin static void kill_me_now(struct callback_head *ch) { + struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me); + + p->mce_count = 0; force_sig(SIGBUS); } @@ -1221,36 +1222,65 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; + p->mce_count = 0; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); return; } - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. 
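+ * + * For any other error, recovery has failed: report it and let kill_me_now() + * deliver SIGBUS to the current task.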
+ */ + if (ret == -EHWPOISON) + return; + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); } -static void queue_task_work(struct mce *m, int kill_it) +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) { - current->mce_addr = m->addr; - current->mce_kflags = m->kflags; - current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(m); + int count = ++current->mce_count; - if (kill_it) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + /* First call, save all the details */ + if (count == 1) { + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + current->mce_kill_me.func = func; + } + + /* Ten is likely overkill. Don't expect more than two faults before task_work() */ + if (count > 10) + mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + + /* Second or later call, make sure page address matches the one from first call */ + if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) + mce_panic("Consecutive machine checks to different user pages", m, msg); + + /* Do not call task_work_add() more than once */ + if (count > 1) + return; task_work_add(current, &current->mce_kill_me, true); } @@ -1401,8 +1431,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* If this triggers there is no way to recover. Die hard.
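* In this user-mode branch the queued task_work runs when the task returns * to user space, hence the requirement of being on the task stack with * user-mode regs.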
*/ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, kill_it); - + if (kill_it) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* * Handle an MCE which has happened in kernel space but from @@ -1419,7 +1451,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, kill_it); + queue_task_work(&m, msg, kill_me_never); } out_ist: @@ -1893,6 +1925,8 @@ bool filter_mce(struct mce *m) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return amd_filter_mce(m); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return intel_filter_mce(m); return false; } @@ -2048,7 +2082,7 @@ int __init mcheck_init(void) { mcheck_intel_therm_init(); mce_register_decode_chain(&early_nb); - mce_register_decode_chain(&mce_srao_nb); + mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f2350967a898a3c06890d4c454ae5a500d7d24dd..04081400df5c4b6e7a84a21fed6510c5373e6c90 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -517,3 +517,20 @@ void mce_intel_feature_clear(struct cpuinfo_x86 *c) { intel_clear_lmce(); } + +bool intel_filter_mce(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */ + if ((c->x86 == 6) && + ((c->x86_model == INTEL_FAM6_HASWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_L) || + (c->x86_model == INTEL_FAM6_BROADWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_G)) && + (m->bank == 0) && + ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) + return true; + + return false; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 4f90f720b28151cffe9bba6cc8554dc2e84302b9..9e2237f68c923b9bf2d3c680a1c54566d231f026 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -46,11 +46,13 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval); bool mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); +bool intel_filter_mce(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } +static inline bool intel_filter_mce(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -171,17 +173,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); #else -static inline bool amd_filter_mce(struct mce *m) { return false; }; +static inline bool amd_filter_mce(struct mce *m) { return false; } #endif -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); - -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); - #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 2c5ea37a2e9b3b6cc6077d5b38c2ee5699aae4e4..6fa66c2b94b223e9f81a184b89b56cc02648779d 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ 
b/arch/x86/kernel/cpu/mce/severity.c @@ -273,25 +273,25 @@ static bool is_copy_from_user(struct pt_regs *regs) */ static int error_context(struct mce *m, struct pt_regs *regs) { - enum handler_type t; - if ((m->cs & 3) == 3) return IN_USER; if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; - t = ex_get_fault_handler_type(m->ip); - if (t == EX_HANDLER_FAULT) { - m->kflags |= MCE_IN_KERNEL_RECOV; - return IN_KERNEL_RECOV; - } - if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { - m->kflags |= MCE_IN_KERNEL_RECOV; + switch (ex_get_fixup_type(m->ip)) { + case EX_TYPE_UACCESS: + case EX_TYPE_COPY: + if (!regs || !is_copy_from_user(regs)) + return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; + fallthrough; + case EX_TYPE_FAULT_MCE_SAFE: + case EX_TYPE_DEFAULT_MCE_SAFE: + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + default: + return IN_KERNEL; } - - return IN_KERNEL; } static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 87a34b6e06a2bab0612310d4265afc997fbdbdb6..53857855aa0e0be7df23b928915a499e63bf1fae 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include "internal.h" /* Mutex to protect rdtgroup access. */ @@ -57,127 +57,57 @@ static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); -#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) -struct rdt_resource rdt_resources_all[] = { +struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L3] = { - .rid = RDT_RESOURCE_L3, - .name = "L3", - .domains = domain_init(RDT_RESOURCE_L3), - .msr_base = MSR_IA32_L3_CBM_BASE, - .msr_update = cat_wrmsr, - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 1, - .cbm_idx_offset = 0, + .r_resctrl = { + .rid = RDT_RESOURCE_L3, + .name = "L3", + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + }, + .domains = domain_init(RDT_RESOURCE_L3), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - [RDT_RESOURCE_L3DATA] = - { - .rid = RDT_RESOURCE_L3DATA, - .name = "L3DATA", - .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 0, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - [RDT_RESOURCE_L3CODE] = - { - .rid = RDT_RESOURCE_L3CODE, - .name = "L3CODE", - .domains = domain_init(RDT_RESOURCE_L3CODE), - .msr_base = MSR_IA32_L3_CBM_BASE, - .msr_update = cat_wrmsr, - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 1, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L2] = { - .rid = RDT_RESOURCE_L2, - .name = "L2", - .domains = domain_init(RDT_RESOURCE_L2), - .msr_base = MSR_IA32_L2_CBM_BASE, - .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 1, - .cbm_idx_offset = 0, + .r_resctrl = { + .rid = RDT_RESOURCE_L2, + .name = "L2", + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + }, + .domains = domain_init(RDT_RESOURCE_L2), + .parse_ctrlval = parse_cbm, 
+ .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - [RDT_RESOURCE_L2DATA] = - { - .rid = RDT_RESOURCE_L2DATA, - .name = "L2DATA", - .domains = domain_init(RDT_RESOURCE_L2DATA), .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 0, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - [RDT_RESOURCE_L2CODE] = - { - .rid = RDT_RESOURCE_L2CODE, - .name = "L2CODE", - .domains = domain_init(RDT_RESOURCE_L2CODE), - .msr_base = MSR_IA32_L2_CBM_BASE, - .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 1, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_MBA] = { - .rid = RDT_RESOURCE_MBA, - .name = "MB", - .domains = domain_init(RDT_RESOURCE_MBA), - .cache_level = 3, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .r_resctrl = { + .rid = RDT_RESOURCE_MBA, + .name = "MB", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_MBA), + .parse_ctrlval = parse_bw, + .format_str = "%d=%*u", + .fflags = RFTYPE_RES_MB, + }, }, }; -static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) -{ - return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; -} - /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. @@ -198,7 +128,8 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) */ static inline void cache_alloc_hsw_probe(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r = &hw_res->r_resctrl; u32 l, h, max_cbm = BIT_MASK(20) - 1; if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) @@ -210,7 +141,7 @@ static inline void cache_alloc_hsw_probe(void) if (l != max_cbm) return; - r->num_closid = 4; + hw_res->num_closid = 4; r->default_ctrl = max_cbm; r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; @@ -224,7 +155,7 @@ static inline void cache_alloc_hsw_probe(void) bool is_mba_sc(struct rdt_resource *r) { if (!r) - return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc; + return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; return r->membw.mba_sc; } @@ -252,25 +183,33 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r) static bool __get_mem_config_intel(struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx, ecx, max_delay; cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; - r->membw.max_delay = eax.split.max_delay + 1; + hw_res->num_closid = edx.split.cos_max + 1; + max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; - r->membw.mbm_width = MBM_CNTR_WIDTH; + r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; - r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; - r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + r->membw.min_bw = MAX_MBA_BW - max_delay; + r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; + r->membw.arch_needs_linear = false; } r->data_width = 3; + if 
(boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) + r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + else + r->membw.throttle_mode = THREAD_THROTTLE_MAX; + thread_throttle_mode_init(); + r->alloc_capable = true; r->alloc_enabled = true; @@ -279,18 +218,18 @@ static bool __get_mem_config_intel(struct rdt_resource *r) static bool __rdt_get_mem_config_amd(struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; + hw_res->num_closid = edx.split.cos_max + 1; r->default_ctrl = MAX_MBA_BW_AMD; /* AMD does not use delay */ - r->membw.delay_linear = false; + r->membw.arch_needs_linear = false; - r->membw.mbm_width = MBM_CNTR_WIDTH_AMD; r->membw.min_bw = 0; r->membw.bw_gran = 1; /* Max value is 2048, Data width should be 4 in decimal */ @@ -304,12 +243,13 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; + hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; r->cache.shareable_bits = ebx & r->default_ctrl; @@ -318,56 +258,35 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->alloc_enabled = true; } -static void rdt_get_cdp_config(int level, int type) +static void rdt_get_cdp_config(int level) { - struct rdt_resource *r_l = &rdt_resources_all[level]; - struct rdt_resource *r = &rdt_resources_all[type]; - - r->num_closid = r_l->num_closid / 2; - r->cache.cbm_len = r_l->cache.cbm_len; - r->default_ctrl = r_l->default_ctrl; - r->cache.shareable_bits = r_l->cache.shareable_bits; - r->data_width = (r->cache.cbm_len + 3) / 4; - r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. 
*/
*/ - r->alloc_enabled = false; + rdt_resources_all[level].cdp_enabled = false; + rdt_resources_all[level].r_resctrl.cdp_capable = true; } static void rdt_get_cdp_l3_config(void) { - rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA); - rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE); + rdt_get_cdp_config(RDT_RESOURCE_L3); } static void rdt_get_cdp_l2_config(void) { - rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA); - rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); -} - -static int get_cache_id(int cpu, int level) -{ - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - int i; - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == level) - return ci->info_list[i].id; - } - - return -1; + rdt_get_cdp_config(RDT_RESOURCE_L2); } static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + i, d->ctrl_val[i]); + wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } /* @@ -389,19 +308,23 @@ mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); + wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r)); } static void cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); + wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) @@ -417,16 +340,22 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) return NULL; } +u32 resctrl_arch_get_num_closid(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->num_closid; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); struct rdt_resource *r = m->res; int cpu = smp_processor_id(); struct rdt_domain *d; d = get_domain_from_cpu(cpu, r); if (d) { - r->msr_update(d, m, r); + hw_res->msr_update(d, m, r); return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", @@ -468,6 +397,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int i; /* @@ -476,7 +406,7 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) * For Memory Allocation: Set b/w requested to 100% * and the bandwidth in MBps to U32_MAX */ - for (i = 0; i < r->num_closid; i++, dc++, dm++) { + for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) { *dc = r->default_ctrl; *dm = MBA_MAX_MBPS; } @@ -484,26 +414,30 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; u32 *dc, *dm; - dc = 
kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), + GFP_KERNEL); if (!dc) return -ENOMEM; - dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); + dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val), + GFP_KERNEL); if (!dm) { kfree(dc); return -ENOMEM; } - d->ctrl_val = dc; - d->mbps_val = dm; + hw_dom->ctrl_val = dc; + hw_dom->mbps_val = dm; setup_default_ctrlval(r, dc, dm); m.low = 0; - m.high = r->num_closid; - r->msr_update(d, &m, r); + m.high = hw_res->num_closid; + hw_res->msr_update(d, &m, r); return 0; } @@ -558,37 +492,43 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct list_head *add_pos = NULL; + struct rdt_hw_domain *hw_dom; struct rdt_domain *d; d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } if (d) { cpumask_set_cpu(cpu, &d->cpu_mask); + if (r->cache.arch_has_per_cpu_cfg) + rdt_domain_reconfigure_cdp(r); return; } - d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); - if (!d) + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!hw_dom) return; + d = &hw_dom->d_resctrl; d->id = id; cpumask_set_cpu(cpu, &d->cpu_mask); rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(d); + kfree(hw_dom); return; } if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(d); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); + kfree(hw_dom); return; } @@ -604,14 +544,16 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) static void domain_remove_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + struct rdt_hw_domain *hw_dom; struct rdt_domain *d; d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } + hw_dom = resctrl_to_arch_dom(d); cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { @@ -644,16 +586,16 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (d->plr) d->plr->d = NULL; - kfree(d->ctrl_val); - kfree(d->mbps_val); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); - kfree(d); + kfree(hw_dom); return; } - if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0); @@ -730,13 +672,8 @@ static int resctrl_offline_cpu(unsigned int cpu) static __init void rdt_init_padding(void) { struct rdt_resource *r; - int cl; for_each_alloc_capable_rdt_resource(r) { - cl = strlen(r->name); - if (cl > max_name_width) - max_name_width = cl; - if (r->data_width > max_data_width) max_data_width = r->data_width; } @@ -825,19 +762,22 @@ static bool __init rdt_cpu_has(int flag) static __init bool get_mem_config(void) { + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA]; + if (!rdt_cpu_has(X86_FEATURE_MBA)) return false; if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - 
return __get_mem_config_intel(&rdt_resources_all[RDT_RESOURCE_MBA]); + return __get_mem_config_intel(&hw_res->r_resctrl); else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return __rdt_get_mem_config_amd(&rdt_resources_all[RDT_RESOURCE_MBA]); + return __rdt_get_mem_config_amd(&hw_res->r_resctrl); return false; } static __init bool get_rdt_alloc_resources(void) { + struct rdt_resource *r; bool ret = false; if (rdt_alloc_capable) @@ -847,14 +787,16 @@ static __init bool get_rdt_alloc_resources(void) return false; if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + rdt_get_cache_alloc_cfg(1, r); if (rdt_cpu_has(X86_FEATURE_CDP_L3)) rdt_get_cdp_l3_config(); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); + r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl; + rdt_get_cache_alloc_cfg(2, r); if (rdt_cpu_has(X86_FEATURE_CDP_L2)) rdt_get_cdp_l2_config(); ret = true; @@ -868,6 +810,8 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) @@ -878,7 +822,7 @@ static __init bool get_rdt_mon_resources(void) if (!rdt_mon_features) return false; - return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); + return !rdt_get_mon_l3_config(r); } static __init void __check_quirks_intel(void) @@ -893,6 +837,10 @@ static __init void __check_quirks_intel(void) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); + fallthrough; + case INTEL_FAM6_BROADWELL_X: + intel_rdt_mbm_apply_quirk(); + break; } } @@ -912,40 +860,40 @@ static __init bool get_rdt_resources(void) static __init void rdt_init_res_defs_intel(void) { + struct rdt_hw_resource *hw_res; struct rdt_resource *r; for_each_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + if (r->rid == RDT_RESOURCE_L3 || - r->rid == RDT_RESOURCE_L3DATA || - r->rid == RDT_RESOURCE_L3CODE || - r->rid == RDT_RESOURCE_L2 || - r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_intel; - else if (r->rid == RDT_RESOURCE_MBA) { - r->msr_base = MSR_IA32_MBA_THRTL_BASE; - r->msr_update = mba_wrmsr_intel; - r->parse_ctrlval = parse_bw_intel; + r->rid == RDT_RESOURCE_L2) { + r->cache.arch_has_sparse_bitmaps = false; + r->cache.arch_has_empty_bitmaps = false; + r->cache.arch_has_per_cpu_cfg = false; + } else if (r->rid == RDT_RESOURCE_MBA) { + hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; + hw_res->msr_update = mba_wrmsr_intel; } } } static __init void rdt_init_res_defs_amd(void) { + struct rdt_hw_resource *hw_res; struct rdt_resource *r; for_each_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + if (r->rid == RDT_RESOURCE_L3 || - r->rid == RDT_RESOURCE_L3DATA || - r->rid == RDT_RESOURCE_L3CODE || - r->rid == RDT_RESOURCE_L2 || - r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_amd; - else if (r->rid == RDT_RESOURCE_MBA) { - r->msr_base = MSR_IA32_MBA_BW_BASE; - r->msr_update = mba_wrmsr_amd; - r->parse_ctrlval = parse_bw_amd; + r->rid == RDT_RESOURCE_L2) { + r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_empty_bitmaps = true; + 
r->cache.arch_has_per_cpu_cfg = true; + } else if (r->rid == RDT_RESOURCE_MBA) { + hw_res->msr_base = MSR_IA32_MBA_BW_BASE; + hw_res->msr_update = mba_wrmsr_amd; } } } @@ -960,6 +908,36 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +/* Runs once on the BSP during boot. */ +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + c->x86_cache_mbm_width_offset = eax & 0xff; + + if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset) + c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD; + } +} + static int __init resctrl_late_init(void) { struct rdt_resource *r; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 055c8613b5317afd6c550cab54ca94507b4e196a..87666275eed920654868ccf3fd9bb2a0b6b81004 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -21,53 +21,6 @@ #include #include "internal.h" -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and maximum bandwidth values specified by - * the hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate_amd(char *buf, unsigned long *data, - struct rdt_resource *r) -{ - unsigned long bw; - int ret; - - ret = kstrtoul(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); - return false; - } - - if (bw < r->membw.min_bw || bw > r->default_ctrl) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) -{ - unsigned long bw_val; - - if (d->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); - return -EINVAL; - } - - if (!bw_validate_amd(data->buf, &bw_val, r)) - return -EINVAL; - - d->new_ctrl = bw_val; - d->have_new_ctrl = true; - - return 0; -} - /* * Check whether MBA bandwidth percentage value is correct. The value is * checked against the minimum and max bandwidth values specified by the @@ -82,7 +35,7 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. 
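A note on the validation that follows: parse_bw() keeps Intel's bw_validate(), but the new membw.arch_needs_linear flag lets platforms without the linear-scale requirement pass non-linear configurations through. The check itself range-checks the percentage and rounds it up to the hardware's control step. A minimal standalone model of that round-and-range logic (names and values are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Standalone model of the MBA percentage check: range-check the
 * user value and round it up to the next control step (bw_gran). */
static bool bw_validate_model(unsigned long bw, unsigned long min_bw,
                              unsigned long max_bw, unsigned long bw_gran,
                              unsigned long *out)
{
        if (bw < min_bw || bw > max_bw)
                return false;           /* "MB value out of range" */

        /* roundup() to the granularity the hardware implements */
        *out = ((bw + bw_gran - 1) / bw_gran) * bw_gran;
        return true;
}

int main(void)
{
        unsigned long v;

        /* e.g. min_bw=10, default_ctrl=100, bw_gran=10: 37% -> 40% */
        if (bw_validate_model(37, 10, 100, 10, &v))
                printf("rounded to %lu%%\n", v);
        return 0;
}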
*/ - if (!r->membw.delay_linear) { + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } @@ -104,31 +57,36 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_domain *d) { + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; unsigned long bw_val; - if (d->have_new_ctrl) { + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { rdt_last_cmd_printf("Duplicate domain %d\n", d->id); return -EINVAL; } if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = bw_val; - d->have_new_ctrl = true; + cfg->new_ctrl = bw_val; + cfg->have_new_ctrl = true; return 0; } /* - * Check whether a cache bit mask is valid. The SDM says: + * Check whether a cache bit mask is valid. + * For Intel the SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. + * AMD allows non-contiguous bitmasks. */ -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { unsigned long first_bit, zero_bit, val; unsigned int cbm_len = r->cache.cbm_len; @@ -140,7 +98,8 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) return false; } - if (val == 0 || val > r->default_ctrl) { + if ((!r->cache.arch_has_empty_bitmaps && val == 0) || + val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } @@ -148,7 +107,9 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + /* Are non-contiguous bitmaps allowed? */ + if (!r->cache.arch_has_sparse_bitmaps && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; } @@ -163,41 +124,20 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) return true; } -/* - * Check whether a cache bit mask is valid. AMD allows non-contiguous - * bitmasks - */ -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r) -{ - unsigned long val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if (val > r->default_ctrl) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - *data = val; - return true; -} - /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. 
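For the merged cbm_validate() above: empty masks are rejected unless arch_has_empty_bitmaps is set, and sparse masks unless arch_has_sparse_bitmaps is set. Contiguity is established by finding the first set bit, the first zero after it, and asking whether any set bit follows that zero. A standalone model of the test, with the kernel's find_first_bit()/find_next_zero_bit()/find_next_bit() open-coded:

#include <stdbool.h>
#include <stdio.h>

/* Model of the contiguity test: a mask is contiguous when no '1'
 * appears after the first '0' that follows the first '1'. */
static bool cbm_is_contiguous(unsigned long val, unsigned int cbm_len)
{
        unsigned int i = 0;

        while (i < cbm_len && !(val & (1UL << i)))
                i++;                            /* first set bit */
        while (i < cbm_len && (val & (1UL << i)))
                i++;                            /* first zero after it */
        while (i < cbm_len) {
                if (val & (1UL << i))
                        return false;           /* non-consecutive 1-bits */
                i++;
        }
        return true;
}

int main(void)
{
        printf("0x0ff0: %d\n", cbm_is_contiguous(0x0ff0, 16)); /* 1 */
        printf("0x0f0f: %d\n", cbm_is_contiguous(0x0f0f, 16)); /* 0 */
        return 0;
}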
*/ -int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, +int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { struct rdtgroup *rdtgrp = data->rdtgrp; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; u32 cbm_val; - if (d->have_new_ctrl) { + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { rdt_last_cmd_printf("Duplicate domain %d\n", d->id); return -EINVAL; } @@ -212,7 +152,7 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, return -EINVAL; } - if (!r->cbm_validate(data->buf, &cbm_val, r)) + if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || @@ -226,12 +166,12 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, * The CBM may not overlap with the CBM of another closid if * either is exclusive. */ - if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { rdt_last_cmd_puts("Overlaps with exclusive group\n"); return -EINVAL; } - if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { rdt_last_cmd_puts("Overlaps with other group\n"); @@ -239,8 +179,8 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, } } - d->new_ctrl = cbm_val; - d->have_new_ctrl = true; + cfg->new_ctrl = cbm_val; + cfg->have_new_ctrl = true; return 0; } @@ -251,9 +191,12 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, * separated by ";". The "id" is in decimal, and must match one of * the "id"s for this resource. */ -static int parse_line(char *line, struct rdt_resource *r, +static int parse_line(char *line, struct resctrl_schema *s, struct rdtgroup *rdtgrp) { + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; @@ -279,9 +222,10 @@ static int parse_line(char *line, struct rdt_resource *r, if (d->id == dom_id) { data.buf = dom; data.rdtgrp = rdtgrp; - if (r->parse_ctrlval(&data, r, d)) + if (r->parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + cfg = &d->staged_config[t]; /* * In pseudo-locking setup mode and just * parsed a valid CBM that should be @@ -290,9 +234,9 @@ static int parse_line(char *line, struct rdt_resource *r, * the required initialization for single * region and return. */ - rdtgrp->plr->r = r; + rdtgrp->plr->s = s; rdtgrp->plr->d = d; - rdtgrp->plr->cbm = d->new_ctrl; + rdtgrp->plr->cbm = cfg->new_ctrl; d->plr = rdtgrp->plr; return 0; } @@ -302,28 +246,72 @@ static int parse_line(char *line, struct rdt_resource *r, return -EINVAL; } -int update_domains(struct rdt_resource *r, int closid) +static u32 get_config_index(u32 closid, enum resctrl_conf_type type) +{ + switch (type) { + default: + case CDP_NONE: + return closid; + case CDP_CODE: + return closid * 2 + 1; + case CDP_DATA: + return closid * 2; + } +} + +static bool apply_config(struct rdt_hw_domain *hw_dom, + struct resctrl_staged_config *cfg, u32 idx, + cpumask_var_t cpu_mask, bool mba_sc) +{ + struct rdt_domain *dom = &hw_dom->d_resctrl; + u32 *dc = !mba_sc ? 
hw_dom->ctrl_val : hw_dom->mbps_val; + + if (cfg->new_ctrl != dc[idx]) { + cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); + dc[idx] = cfg->new_ctrl; + + return true; + } + + return false; +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) { + struct resctrl_staged_config *cfg; + struct rdt_hw_domain *hw_dom; struct msr_param msr_param; + enum resctrl_conf_type t; cpumask_var_t cpu_mask; struct rdt_domain *d; bool mba_sc; - u32 *dc; int cpu; + u32 idx; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - msr_param.low = closid; - msr_param.high = msr_param.low + 1; - msr_param.res = r; - mba_sc = is_mba_sc(r); + msr_param.res = NULL; list_for_each_entry(d, &r->domains, list) { - dc = !mba_sc ? d->ctrl_val : d->mbps_val; - if (d->have_new_ctrl && d->new_ctrl != dc[closid]) { - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - dc[closid] = d->new_ctrl; + hw_dom = resctrl_to_arch_dom(d); + for (t = 0; t < CDP_NUM_TYPES; t++) { + cfg = &hw_dom->d_resctrl.staged_config[t]; + if (!cfg->have_new_ctrl) + continue; + + idx = get_config_index(closid, t); + if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc)) + continue; + + if (!msr_param.res) { + msr_param.low = idx; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + } else { + msr_param.low = min(msr_param.low, idx); + msr_param.high = max(msr_param.high, idx + 1); + } } } @@ -350,11 +338,11 @@ int update_domains(struct rdt_resource *r, int closid) static int rdtgroup_parse_resource(char *resname, char *tok, struct rdtgroup *rdtgrp) { - struct rdt_resource *r; + struct resctrl_schema *s; - for_each_alloc_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid) - return parse_line(tok, r, rdtgrp); + list_for_each_entry(s, &resctrl_schema_all, list) { + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) + return parse_line(tok, s, rdtgrp); } rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); return -EINVAL; @@ -363,6 +351,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct resctrl_schema *s; struct rdtgroup *rdtgrp; struct rdt_domain *dom; struct rdt_resource *r; @@ -393,9 +382,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { - list_for_each_entry(dom, &r->domains, list) - dom->have_new_ctrl = false; + list_for_each_entry(s, &resctrl_schema_all, list) { + list_for_each_entry(dom, &s->res->domains, list) + memset(dom->staged_config, 0, sizeof(dom->staged_config)); } while ((tok = strsep(&buf, "\n")) != NULL) { @@ -415,8 +404,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { - ret = update_domains(r, rdtgrp->closid); + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret) goto out; } @@ -437,19 +427,31 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return ret ?: nbytes; } -static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type type) { + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = get_config_index(closid, type); + + if (!is_mba_sc(r)) + return hw_dom->ctrl_val[idx]; + return hw_dom->mbps_val[idx]; +} + 
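get_config_index() above is the whole CDP story in one place: a single closid fans out to two hardware control slots when CDP is on, data at closid * 2 and code at closid * 2 + 1, and maps straight through otherwise. That halving of usable closids is also why struct resctrl_schema carries its own num_closid. A tiny standalone mirror of the mapping (enum names here are illustrative stand-ins for the resctrl_conf_type values):

#include <stdio.h>

enum conf_type { CONF_NONE, CONF_CODE, CONF_DATA };

/* Mirror of the CDP index mapping shown above. */
static unsigned int config_index(unsigned int closid, enum conf_type t)
{
        switch (t) {
        case CONF_CODE: return closid * 2 + 1;
        case CONF_DATA: return closid * 2;
        default:        return closid;
        }
}

int main(void)
{
        /* closid 3 with CDP enabled touches MSR slots 6 (data) and 7 (code) */
        printf("data=%u code=%u\n",
               config_index(3, CONF_DATA), config_index(3, CONF_CODE));
        return 0;
}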
+static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) +{ + struct rdt_resource *r = schema->res; struct rdt_domain *dom; bool sep = false; u32 ctrl_val; - seq_printf(s, "%*s:", max_name_width, r->name); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(dom, &r->domains, list) { if (sep) seq_puts(s, ";"); - ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] : - dom->mbps_val[closid]); + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; @@ -460,16 +462,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { + struct resctrl_schema *schema; struct rdtgroup *rdtgrp; - struct rdt_resource *r; int ret = 0; u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - for_each_alloc_enabled_rdt_resource(r) - seq_printf(s, "%s:uninitialized\n", r->name); + list_for_each_entry(schema, &resctrl_schema_all, list) { + seq_printf(s, "%s:uninitialized\n", schema->name); + } } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { if (!rdtgrp->plr->d) { rdt_last_cmd_clear(); @@ -477,15 +480,15 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, ret = -ENODEV; } else { seq_printf(s, "%s:%d=%x\n", - rdtgrp->plr->r->name, + rdtgrp->plr->s->res->name, rdtgrp->plr->d->id, rdtgrp->plr->cbm); } } else { closid = rdtgrp->closid; - for_each_alloc_enabled_rdt_resource(r) { - if (closid < r->num_closid) - show_doms(s, r, closid); + list_for_each_entry(schema, &resctrl_schema_all, list) { + if (closid < schema->num_closid) + show_doms(s, schema, closid); } } } else { @@ -495,14 +498,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first) +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. 
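show_doms() above now resolves every domain's value through resctrl_arch_get_config() instead of indexing ctrl_val/mbps_val directly, but the output format it produces is unchanged. A standalone model of one schemata line, with the domain ids and values made up for illustration:

#include <stdio.h>

int main(void)
{
        /* Model of one schemata line: "<name>:<id>=<val>;<id>=<val>" */
        const char *name = "L3";
        unsigned int ids[]  = { 0, 1 };
        unsigned int vals[] = { 0xff, 0xf0 };
        int sep = 0;

        printf("%*s:", 8, name);        /* max_name_width-style padding */
        for (int i = 0; i < 2; i++) {
                if (sep)
                        printf(";");
                printf("%d=%x", ids[i], vals[i]);
                sep = 1;
        }
        printf("\n");                   /* e.g. "      L3:0=ff;1=f0" */
        return 0;
}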
*/ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->r = r; rr->d = d; rr->val = 0; rr->first = first; @@ -513,6 +518,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; + struct rdt_hw_resource *hw_res; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; @@ -532,21 +538,22 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) domid = md.u.domid; evtid = md.u.evtid; - r = &rdt_resources_all[resid]; + hw_res = &rdt_resources_all[resid]; + r = &hw_res->r_resctrl; d = rdt_find_domain(r, domid, NULL); if (IS_ERR_OR_NULL(d)) { ret = -ENOENT; goto out; } - mon_event_read(&rr, d, rdtgrp, evtid, false); + mon_event_read(&rr, r, d, rdtgrp, evtid, false); if (rr.val & RMID_VAL_ERROR) seq_puts(m, "Error\n"); else if (rr.val & RMID_VAL_UNAVAIL) seq_puts(m, "Unavailable\n"); else - seq_printf(m, "%llu\n", rr.val * r->mon_scale); + seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale); out: rdtgroup_kn_unlock(of->kn); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 499cb2e727a09db94e4b43a41385d43f43902395..2f92015bf12952d88d4ad7f99f71daa2c0d52e5a 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_RESCTRL_INTERNAL_H #define _ASM_X86_RESCTRL_INTERNAL_H +#include #include #include #include @@ -31,16 +32,23 @@ #define CQM_LIMBOCHECK_INTERVAL 1000 -#define MBM_CNTR_WIDTH 24 +#define MBM_CNTR_WIDTH_BASE 24 #define MBM_CNTR_WIDTH_AMD 44 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 #define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 +#define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) struct rdt_fs_context { @@ -64,6 +72,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * struct mon_evt - Entry in the event list of a resource * @evtid: event id * @name: name of the event + * @list: entry in &rdt_resource->evt_list */ struct mon_evt { u32 evtid; @@ -72,10 +81,13 @@ struct mon_evt { }; /** - * struct mon_data_bits - Monitoring details for each event file - * @rid: Resource id associated with the event file. 
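The MBM_CNTR_WIDTH_OFFSET_MAX bound introduced above follows directly from the MSR layout: bits 63 and 62 of IA32_QM_CTR are the error/unavailable flags, so at most 62 bits can hold counter data, and the hardware reports its width as an offset from the 24-bit base. A quick standalone check of the arithmetic (the offset value 20 mirrors MBM_CNTR_WIDTH_OFFSET_AMD; any reported offset is just an example):

#include <stdio.h>

#define MBM_CNTR_WIDTH_BASE        24
#define MBM_CNTR_WIDTH_OFFSET_MAX  (62 - MBM_CNTR_WIDTH_BASE)

int main(void)
{
        /* CPUID (or the AMD fallback) reports the width as an offset
         * from the base; base + offset can never exceed 62 because
         * bits 63/62 of the counter MSR are flag bits. */
        unsigned int offset = 20;       /* e.g. MBM_CNTR_WIDTH_OFFSET_AMD */
        unsigned int width = MBM_CNTR_WIDTH_BASE;

        if (offset > 0 && offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
                width += offset;        /* 24 + 20 = 44-bit counter */
        printf("counter width: %u bits (max offset %d)\n",
               width, MBM_CNTR_WIDTH_OFFSET_MAX);
        return 0;
}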
+ * union mon_data_bits - Monitoring details for each event file + * @priv: Used to store monitoring event data in @u + * as kernfs private data + * @rid: Resource id associated with the event file * @evtid: Event id associated with the event file * @domid: The domain to which the event file belongs + * @u: Name of the bit fields struct */ union mon_data_bits { void *priv; @@ -88,6 +100,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; + struct rdt_resource *r; struct rdt_domain *d; int evtid; bool first; @@ -98,6 +111,7 @@ extern unsigned int resctrl_cqm_threshold; extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; +extern struct list_head resctrl_schema_all; enum rdt_group_type { RDTCTRL_GROUP = 0, @@ -112,6 +126,7 @@ enum rdt_group_type { * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes * * The mode of a resource group enables control over the allowed overlap * between allocations associated with different resource groups (classes @@ -135,7 +150,7 @@ enum rdtgrp_mode { /** * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn kernlfs node for the mon_data directory + * @mon_data_kn: kernfs node for the mon_data directory * @parent: parent rdtgrp * @crdtgrp_list: child rdtgroup node list * @rmid: rmid for this rdtgroup @@ -149,8 +164,8 @@ struct mongroup { /** * struct pseudo_lock_region - pseudo-lock region information - * @r: RDT resource to which this pseudo-locked region - * belongs + * @s: Resctrl schema for the resource to which this + * pseudo-locked region belongs * @d: RDT domain to which this pseudo-locked region * belongs * @cbm: bitmask of the pseudo-locked region @@ -170,7 +185,7 @@ struct mongroup { * @pm_reqs: Power management QoS requests related to this region */ struct pseudo_lock_region { - struct rdt_resource *r; + struct resctrl_schema *s; struct rdt_domain *d; u32 cbm; wait_queue_head_t lock_thread_wq; @@ -257,7 +272,7 @@ void __exit rdtgroup_exit(void); struct rftype { char *name; umode_t mode; - struct kernfs_ops *kf_ops; + const struct kernfs_ops *kf_ops; unsigned long flags; unsigned long fflags; @@ -275,11 +290,11 @@ struct rftype { /** * struct mbm_state - status for each MBM counter in each domain * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) - * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting - * @prev_bw The most recent bandwidth in MBps - * @delta_bw Difference between the current and previous bandwidth - * @delta_comp Indicates whether to compute the delta_bw + * @prev_bw: The most recent bandwidth in MBps + * @delta_bw: Difference between the current and previous bandwidth + * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { u64 chunks; @@ -291,44 +306,25 @@ struct mbm_state { }; /** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @rmid_busy_llc: - * bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth - * @mbm_over: worker to 
periodically read MBM h/w counters - * @cqm_limbo: worker to periodically read CQM h/w counters - * @mbm_work_cpu: - * worker cpu for MBM h/w counters - * @cqm_work_cpu: - * worker cpu for CQM h/w counters + * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share + * a resource + * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - * @plr: pseudo-locked region (if any) associated with domain + * + * Members of this structure are accessed via helpers that provide abstraction. */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; - struct delayed_work mbm_over; - struct delayed_work cqm_limbo; - int mbm_work_cpu; - int cqm_work_cpu; +struct rdt_hw_domain { + struct rdt_domain d_resctrl; u32 *ctrl_val; u32 *mbps_val; - u32 new_ctrl; - bool have_new_ctrl; - struct pseudo_lock_region *plr; }; +static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) +{ + return container_of(r, struct rdt_hw_domain, d_resctrl); +} + /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use @@ -337,48 +333,8 @@ struct rdt_domain { */ struct msr_param { struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - * @shareable_bits: Bitmask of shareable resource with other - * executing entities - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; - unsigned int shareable_bits; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. - * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mbm_width: memory B/W monitor counter width - * @mba_sc: True if MBA software controller(mba_sc) is enabled - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - u32 mbm_width; - bool mba_sc; - u32 *mb_map; + u32 low; + u32 high; }; static inline bool is_llc_occupancy_enabled(void) @@ -413,112 +369,99 @@ struct rdt_parse_data { }; /** - * struct rdt_resource - attributes of an RDT resource - * @rid: The index of the resource - * @alloc_enabled: Is allocation enabled on this machine - * @mon_enabled: Is monitoring enabled for this feature - * @alloc_capable: Is allocation available on this machine - * @mon_capable: Is monitor feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. 
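The struct rdt_hw_domain split above relies on the usual embedded-struct idiom: the generic rdt_domain is the first member of the arch wrapper, the filesystem code only ever sees a pointer to the generic part, and resctrl_to_arch_dom() recovers the wrapper with container_of(). A self-contained model of the round trip (struct names shortened, contents reduced to the minimum):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Model of the split: the generic part is embedded in the
 * arch-private wrapper, so either pointer can reach the other. */
struct rdt_domain_model { int id; };

struct rdt_hw_domain_model {
        struct rdt_domain_model d_resctrl;   /* seen by resctrl */
        unsigned int *ctrl_val;              /* arch-private */
};

static struct rdt_hw_domain_model *
to_arch_dom(struct rdt_domain_model *d)
{
        return container_of(d, struct rdt_hw_domain_model, d_resctrl);
}

int main(void)
{
        struct rdt_hw_domain_model hw = { .d_resctrl = { .id = 1 } };
        struct rdt_domain_model *d = &hw.d_resctrl;

        /* filesystem code passes 'd' around; arch code recovers 'hw' */
        printf("same object: %d\n", to_arch_dom(d) == &hw);
        return 0;
}

The same idiom backs resctrl_to_arch_res() and the rewritten for_each_rdt_resource() walker, which steps the array of wrappers and hands back only the embedded r_resctrl member each time.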
+ * struct rdt_hw_resource - arch private attributes of a resctrl resource + * @r_resctrl: Attributes of the resource used directly by resctrl. + * @num_closid: Maximum number of closid this hardware can support, + * regardless of CDP. This is exposed via + * resctrl_arch_get_num_closid() to avoid confusion + * with struct resctrl_schema's property of the same name, + * which has been corrected for features like CDP. * @msr_base: Base MSR address for CBMs * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - * @cbm_validate Cache bitmask validate function - * @evt_list: List of monitoring events - * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes - * @fflags: flags to choose base and info files + * @mbm_width: Monitor width, to detect and correct for overflow. + * @cdp_enabled: CDP state of this resource */ -struct rdt_resource { - int rid; - bool alloc_enabled; - bool mon_enabled; - bool alloc_capable; - bool mon_capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; +struct rdt_hw_resource { + struct rdt_resource r_resctrl; + u32 num_closid; unsigned int msr_base; void (*msr_update) (struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - const char *format_str; - int (*parse_ctrlval)(struct rdt_parse_data *data, - struct rdt_resource *r, - struct rdt_domain *d); - bool (*cbm_validate)(char *buf, u32 *data, struct rdt_resource *r); - struct list_head evt_list; - int num_rmid; unsigned int mon_scale; - unsigned long fflags; + unsigned int mbm_width; + bool cdp_enabled; }; -int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, +static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) +{ + return container_of(r, struct rdt_hw_resource, r_resctrl); +} + +int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d); -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; -extern struct rdt_resource rdt_resources_all[]; +extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); extern struct dentry *debugfs_resctrl; -enum { +enum resctrl_res_level { RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, RDT_RESOURCE_L2, - RDT_RESOURCE_L2DATA, - RDT_RESOURCE_L2CODE, RDT_RESOURCE_MBA, /* Must be the last */ RDT_NUM_RESOURCES, }; +static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res); + + hw_res++; + return &hw_res->r_resctrl; +} + +static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) +{ + return rdt_resources_all[l].cdp_enabled; +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); + +/* + * To return the common struct rdt_resource, which is contained in struct + * rdt_hw_resource, walk the 
resctrl member of struct rdt_hw_resource. + */ #define for_each_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) + for (r = &rdt_resources_all[0].r_resctrl; \ + r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \ + r = resctrl_inc(r)) #define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->alloc_capable || r->mon_capable) #define for_each_alloc_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->alloc_capable) #define for_each_mon_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->mon_capable) #define for_each_alloc_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->alloc_enabled) #define for_each_mon_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->mon_enabled) /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ @@ -547,6 +490,7 @@ union cpuid_0x10_x_edx { void rdt_last_cmd_clear(void); void rdt_last_cmd_puts(const char *s); +__printf(1, 2) void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); @@ -561,7 +505,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, unsigned long cbm, int closid, bool exclusive); unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, unsigned long cbm); @@ -576,7 +520,6 @@ void rdt_pseudo_lock_release(void); int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); -int update_domains(struct rdt_resource *r, int closid); int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); @@ -588,11 +531,13 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id); void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); +void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); @@ -600,8 +545,7 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r); -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource 
*r); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void __init thread_throttle_mode_init(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 50f683ecd2c6ceeb71e7eb911c38d7d3868d6277..2a968ee87fb5b6670b53eeac1fdaa522be680b73 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -64,6 +64,69 @@ unsigned int rdt_mon_features; */ unsigned int resctrl_cqm_threshold; +#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) + +/* + * The correction factor table is documented in Documentation/x86/resctrl.rst. + * If rmid > rmid threshold, MBM total and local values should be multiplied + * by the correction factor. + * + * The original table is modified for better code: + * + * 1. The threshold 0 is changed to rmid count - 1 so don't do correction + * for the case. + * 2. MBM total and local correction table indexed by core counter which is + * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. + * 3. The correction factor is normalized to 2^20 (1048576) so it's faster + * to calculate corrected value by shifting: + * corrected_value = (original_value * correction_factor) >> 20 + */ +static const struct mbm_correction_factor_table { + u32 rmidthreshold; + u64 cf; +} mbm_cf_table[] __initconst = { + {7, CF(1.000000)}, + {15, CF(1.000000)}, + {15, CF(0.969650)}, + {31, CF(1.000000)}, + {31, CF(1.066667)}, + {31, CF(0.969650)}, + {47, CF(1.142857)}, + {63, CF(1.000000)}, + {63, CF(1.185115)}, + {63, CF(1.066553)}, + {79, CF(1.454545)}, + {95, CF(1.000000)}, + {95, CF(1.230769)}, + {95, CF(1.142857)}, + {95, CF(1.066667)}, + {127, CF(1.000000)}, + {127, CF(1.254863)}, + {127, CF(1.185255)}, + {151, CF(1.000000)}, + {127, CF(1.066667)}, + {167, CF(1.000000)}, + {159, CF(1.454334)}, + {183, CF(1.000000)}, + {127, CF(0.969744)}, + {191, CF(1.280246)}, + {191, CF(1.230921)}, + {215, CF(1.000000)}, + {191, CF(1.143118)}, +}; + +static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; +static u64 mbm_cf __read_mostly; + +static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) +{ + /* Correct MBM value. 
*/ + if (rmid > mbm_cf_rmidthreshold) + val = (val * mbm_cf) >> 20; + + return val; +} + static inline struct rmid_entry *__rmid_entry(u32 rmid) { struct rmid_entry *entry; @@ -111,7 +174,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free) struct rdt_resource *r; u32 crmid = 1, nrmid; - r = &rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that @@ -169,7 +232,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) int cpu; u64 val; - r = &rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; entry->busy = 0; cpu = get_cpu(); @@ -214,17 +277,17 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { - u64 shift, chunks; + u64 shift = 64 - width, chunks; - shift = 64 - rdt_resources_all[RDT_RESOURCE_MBA].membw.mbm_width; chunks = (cur_msr << shift) - (prev_msr << shift); - return chunks >>= shift; + return chunks >> shift; } static int __mon_event_count(u32 rmid, struct rmid_read *rr) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); struct mbm_state *m; u64 chunks, tval; @@ -257,11 +320,12 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval); + chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width); m->chunks += chunks; m->prev_msr = tval; - rr->val += m->chunks; + rr->val += get_corrected_mbm_count(rmid, m->chunks); + return 0; } @@ -271,7 +335,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) */ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); struct mbm_state *m = &rr->d->mbm_local[rmid]; u64 tval, cur_bw, chunks; @@ -279,8 +343,8 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) return; - chunks = mbm_overflow_count(m->prev_bw_msr, tval); - cur_bw = (chunks * r->mon_scale) >> 20; + chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width); + cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); @@ -353,6 +417,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; + struct rdt_hw_resource *hw_r_mba; + struct rdt_hw_domain *hw_dom_mba; u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; @@ -362,7 +428,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) if (!is_mbm_local_enabled()) return; - r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; + hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; + r_mba = &hw_r_mba->r_resctrl; closid = rgrp->closid; rmid = rgrp->mon.rmid; pmbm_data = &dom_mbm->mbm_local[rmid]; @@ -372,11 +439,16 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) pr_warn_once("Failure to get domain for MBA update\n"); return; } + hw_dom_mba = resctrl_to_arch_dom(dom_mba); cur_bw = pmbm_data->prev_bw; - user_bw = dom_mba->mbps_val[closid]; + user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); delta_bw = pmbm_data->delta_bw; - cur_msr_val = 
dom_mba->ctrl_val[closid]; + /* + * resctrl_arch_get_config() chooses the mbps/ctrl value to return + * based on is_mba_sc(). For now, reach into the hw_dom. + */ + cur_msr_val = hw_dom_mba->ctrl_val[closid]; /* * For Ctrl groups read data from child monitor groups. @@ -411,9 +483,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) return; } - cur_msr = r_mba->msr_base + closid; + cur_msr = hw_r_mba->msr_base + closid; wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); - dom_mba->ctrl_val[closid] = new_msr_val; + hw_dom_mba->ctrl_val[closid] = new_msr_val; /* * Delta values are updated dynamically package wise for each @@ -432,11 +504,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) } } -static void mbm_update(struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) { struct rmid_read rr; rr.first = false; + rr.r = r; rr.d = d; /* @@ -474,20 +547,14 @@ void cqm_handle_limbo(struct work_struct *work) mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3]; - d = get_domain_from_cpu(cpu, r); - - if (!d) { - pr_warn_once("Failure to get domain for limbo worker\n"); - goto out_unlock; - } + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); if (has_busy_rmid(r, d)) schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -508,6 +575,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdtgroup *prgrp, *crgrp; int cpu = smp_processor_id(); struct list_head *head; + struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -515,16 +583,15 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); - if (!d) - goto out_unlock; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -612,11 +679,19 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int cl_size = boot_cpu_data.x86_cache_size; int ret; - r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + hw_res->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); /* * A reasonable upper limit on the max threshold is the number @@ -628,7 +703,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid; /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ - resctrl_cqm_threshold /= r->mon_scale; + resctrl_cqm_threshold /= hw_res->mon_scale; ret = dom_data_init(r); if (ret) @@ -641,3 +716,17 @@ int 
rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } + +void __init intel_rdt_mbm_apply_quirk(void) +{ + int cf_index; + + cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; + if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { + pr_info("No MBM correction factor available\n"); + return; + } + + mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; + mbm_cf = mbm_cf_table[cf_index].cf; +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d7623e1b927d1d9fbdc53c59dd8ed4c06c9e64dd..730b5d4b3f7ba2090bf850d52f4c7a861075939a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include "../../events/perf_event.h" /* For X86_CONFIG() */ @@ -49,6 +49,7 @@ static struct class *pseudo_lock_class; /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * @void: It takes no parameters. * * Capture the list of platforms that have been validated to support * pseudo-locking. This includes testing to ensure pseudo-locked regions @@ -162,7 +163,7 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor) } /** - * pseudo_lock_pm_req - A power management QoS request list entry + * struct pseudo_lock_pm_req - A power management QoS request list entry * @list: Entry within the @pm_reqs list for a pseudo-locked region * @req: PM QoS request */ @@ -184,6 +185,7 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) /** * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * @plr: Pseudo-locked region * * To prevent the cache from being affected by power management entering * C6 has to be avoided. This is accomplished by requesting a latency @@ -196,6 +198,8 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) * the ACPI latencies need to be considered while keeping in mind that C2 * may be set to map to deeper sleep states. In this case the latency * requirement needs to prevent entering C2 also. + * + * Return: 0 on success, <0 on failure */ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) { @@ -246,7 +250,7 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) plr->line_size = 0; kfree(plr->kmem); plr->kmem = NULL; - plr->r = NULL; + plr->s = NULL; if (plr->d) plr->d->plr = NULL; plr->d = NULL; @@ -290,10 +294,10 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) ci = get_cpu_cacheinfo(plr->cpu); - plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm); + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == plr->r->cache_level) { + if (ci->info_list[i].level == plr->s->res->cache_level) { plr->line_size = ci->info_list[i].coherency_line_size; return 0; } @@ -520,7 +524,7 @@ static int pseudo_lock_fn(void *_rdtgrp) /** * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @r: resource group being queried + * @rdtgrp: resource group being queried * * Return: 1 if monitor groups have been created for this resource * group, 0 otherwise. @@ -684,8 +688,8 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * resource, the portion of cache used by it should be made * unavailable to all future allocations from both resources. 
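Tying together the correction-factor machinery above: intel_rdt_mbm_apply_quirk() derives a table index from the RMID count, and get_corrected_mbm_count() applies the factor as a multiply plus a 20-bit shift, since the table normalizes each factor to 2^20. A worked standalone example (the max_rmid value 199 is made up; with it, (199 + 1) / 8 - 1 = 24, which lands on the {191, CF(1.280246)} entry of the table above):

#include <stdint.h>
#include <stdio.h>

#define CF(cf) ((uint64_t)(1048576 * (cf) + 0.5))

int main(void)
{
        /* Index selection as in intel_rdt_mbm_apply_quirk(). */
        unsigned int cf_index = (199 + 1) / 8 - 1;      /* 24 */

        /* Apply a factor as get_corrected_mbm_count() does: the table
         * stores factor * 2^20, so correction is a multiply + shift. */
        uint64_t cf = CF(1.280246);
        uint64_t chunks = 1000000;

        printf("index %u, corrected %llu\n", cf_index,
               (unsigned long long)((chunks * cf) >> 20));
        return 0;
}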
*/ - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled || - rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) { + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || + resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { rdt_last_cmd_puts("CDP enabled\n"); return -EINVAL; } @@ -796,7 +800,7 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm unsigned long cbm_b; if (d->plr) { - cbm_len = d->plr->r->cache.cbm_len; + cbm_len = d->plr->s->res->cache.cbm_len; cbm_b = d->plr->cbm; if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) return true; @@ -1140,6 +1144,8 @@ static int measure_l3_residency(void *_plr) /** * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * @rdtgrp: Resource group to which the pseudo-locked region belongs. + * @sel: Selector of which measurement to perform on a pseudo-locked region. * * The measurement of latency to access a pseudo-locked region should be * done from a cpu that is associated with that pseudo-locked region. diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 28f786289fce4cc145dc8ae89890dc4e39804fb0..60f0dc5e955ad64dc3e3cf70cf5ac63eb14eb9e4 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -29,7 +29,7 @@ #include -#include +#include #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); @@ -39,6 +39,9 @@ static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); +/* list of entries for the schemata file */ +LIST_HEAD(resctrl_schema_all); + /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; @@ -100,12 +103,12 @@ int closids_supported(void) static void closid_init(void) { - struct rdt_resource *r; - int rdt_min_closid = 32; + struct resctrl_schema *s; + u32 rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_alloc_enabled_rdt_resource(r) - rdt_min_closid = min(rdt_min_closid, r->num_closid); + list_for_each_entry(s, &resctrl_schema_all, list) + rdt_min_closid = min(rdt_min_closid, s->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -240,13 +243,13 @@ static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, return -EINVAL; } -static struct kernfs_ops rdtgroup_kf_single_ops = { +static const struct kernfs_ops rdtgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .write = rdtgroup_file_write, .seq_show = rdtgroup_seqfile_show, }; -static struct kernfs_ops kf_mondata_ops = { +static const struct kernfs_ops kf_mondata_ops = { .atomic_write_len = PAGE_SIZE, .seq_show = rdtgroup_mondata_show, }; @@ -563,11 +566,11 @@ static int __rdtgroup_move_task(struct task_struct *tsk, */ if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else if (rdtgrp->type == RDTMON_GROUP) { if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else { rdt_last_cmd_puts("Can't move task to different control group\n"); return -EINVAL; @@ -591,6 +594,18 @@ static int __rdtgroup_move_task(struct task_struct *tsk, return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return 
(rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /** * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group * @r: Resource group @@ -606,8 +621,7 @@ int rdtgroup_tasks_assigned(struct rdtgroup *r) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { ret = 1; break; } @@ -705,8 +719,7 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) + if (is_closid_match(t, r) || is_rmid_match(t, r)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -728,6 +741,92 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +#ifdef CONFIG_PROC_CPU_RESCTRL + +/* + * A task can only be part of one resctrl control group and of one monitor + * group which is associated to that control group. + * + * 1) res: + * mon: + * + * resctrl is not available. + * + * 2) res:/ + * mon: + * + * Task is part of the root resctrl control group, and it is not associated + * to any monitor group. + * + * 3) res:/ + * mon:mon0 + * + * Task is part of the root resctrl control group and monitor group mon0. + * + * 4) res:group0 + * mon: + * + * Task is part of resctrl control group group0, and it is not associated + * to any monitor group. + * + * 5) res:group0 + * mon:mon1 + * + * Task is part of resctrl control group group0 and monitor group mon1. + */ +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + struct rdtgroup *rdtg; + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + + /* Return empty if resctrl has not been mounted. */ + if (!static_branch_unlikely(&rdt_enable_key)) { + seq_puts(s, "res:\nmon:\n"); + goto unlock; + } + + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { + struct rdtgroup *crg; + + /* + * Task information is only relevant for shareable + * and exclusive groups. + */ + if (rdtg->mode != RDT_MODE_SHAREABLE && + rdtg->mode != RDT_MODE_EXCLUSIVE) + continue; + + if (rdtg->closid != tsk->closid) + continue; + + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", + rdtg->kn->name); + seq_puts(s, "mon:"); + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, + mon.crdtgrp_list) { + if (tsk->rmid != crg->mon.rmid) + continue; + seq_printf(s, "%s", crg->kn->name); + break; + } + seq_putc(s, '\n'); + goto unlock; + } + /* + * The above search should succeed. Otherwise return + * with an error. 
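The res:/mon: format documented above is read back through /proc/<pid>/resctrl (available when the kernel is built with CONFIG_PROC_CPU_RESCTRL). A minimal userspace reader, assuming only that the file exists for the current task:

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/resctrl", "r");

        if (!f) {
                perror("fopen");        /* kernel built without the option? */
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* e.g. "res:/\nmon:\n" */
        fclose(f);
        return 0;
}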
+ */ + ret = -ENOENT; +unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} +#endif + static int rdt_last_cmd_status_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -746,16 +845,17 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; - seq_printf(seq, "%d\n", r->num_closid); + seq_printf(seq, "%u\n", s->num_closid); return 0; } static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%x\n", r->default_ctrl); return 0; @@ -764,7 +864,8 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->cache.min_cbm_bits); return 0; @@ -773,7 +874,8 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, static int rdt_shareable_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%x\n", r->cache.shareable_bits); return 0; @@ -796,38 +898,40 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, static int rdt_bit_usage_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; /* * Use unsigned long even though only 32 bits are used to ensure * test_bit() is used safely. 
*/ unsigned long sw_shareable = 0, hw_shareable = 0; unsigned long exclusive = 0, pseudo_locked = 0; + struct rdt_resource *r = s->res; struct rdt_domain *dom; int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; bool sep = false; - u32 *ctrl; + u32 ctrl_val; mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->domains, list) { if (sep) seq_putc(seq, ';'); - ctrl = dom->ctrl_val; sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < closids_supported(); i++, ctrl++) { + for (i = 0; i < closids_supported(); i++) { if (!closid_allocated(i)) continue; + ctrl_val = resctrl_arch_get_config(r, dom, i, + s->conf_type); mode = rdtgroup_mode_by_closid(i); switch (mode) { case RDT_MODE_SHAREABLE: - sw_shareable |= *ctrl; + sw_shareable |= ctrl_val; break; case RDT_MODE_EXCLUSIVE: - exclusive |= *ctrl; + exclusive |= ctrl_val; break; case RDT_MODE_PSEUDO_LOCKSETUP: /* @@ -874,7 +978,8 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.min_bw); return 0; @@ -905,7 +1010,8 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.bw_gran); return 0; @@ -914,7 +1020,8 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of, static int rdt_delay_linear_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.delay_linear); return 0; @@ -924,8 +1031,23 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale); + + return 0; +} - seq_printf(seq, "%u\n", resctrl_cqm_threshold * r->mon_scale); +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + seq_puts(seq, "per-thread\n"); + else + seq_puts(seq, "max\n"); return 0; } @@ -933,7 +1055,7 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct rdt_resource *r = of->kn->parent->priv; + struct rdt_hw_resource *hw_res; unsigned int bytes; int ret; @@ -944,7 +1066,8 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, if (bytes > (boot_cpu_data.x86_cache_size * 1024)) return -EINVAL; - resctrl_cqm_threshold = bytes / r->mon_scale; + hw_res = resctrl_to_arch_res(of->kn->parent->priv); + resctrl_cqm_threshold = bytes / hw_res->mon_scale; return nbytes; } @@ -969,76 +1092,17 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, return 0; } -/** - * rdt_cdp_peer_get - Retrieve CDP peer if it exists - * @r: RDT resource to which RDT 
domain @d belongs - * @d: Cache instance for which a CDP peer is requested - * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer) - * Used to return the result. - * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer) - * Used to return the result. - * - * RDT resources are managed independently and by extension the RDT domains - * (RDT resource instances) are managed independently also. The Code and - * Data Prioritization (CDP) RDT resources, while managed independently, - * could refer to the same underlying hardware. For example, - * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache. - * - * When provided with an RDT resource @r and an instance of that RDT - * resource @d rdt_cdp_peer_get() will return if there is a peer RDT - * resource and the exact instance that shares the same hardware. - * - * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists. - * If a CDP peer was found, @r_cdp will point to the peer RDT resource - * and @d_cdp will point to the peer RDT domain. - */ -static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, - struct rdt_resource **r_cdp, - struct rdt_domain **d_cdp) +static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) { - struct rdt_resource *_r_cdp = NULL; - struct rdt_domain *_d_cdp = NULL; - int ret = 0; - - switch (r->rid) { - case RDT_RESOURCE_L3DATA: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE]; - break; - case RDT_RESOURCE_L3CODE: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA]; - break; - case RDT_RESOURCE_L2DATA: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE]; - break; - case RDT_RESOURCE_L2CODE: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA]; - break; + switch (my_type) { + case CDP_CODE: + return CDP_DATA; + case CDP_DATA: + return CDP_CODE; default: - ret = -ENOENT; - goto out; + case CDP_NONE: + return CDP_NONE; } - - /* - * When a new CPU comes online and CDP is enabled then the new - * RDT domains (if any) associated with both CDP RDT resources - * are added in the same CPU online routine while the - * rdtgroup_mutex is held. It should thus not happen for one - * RDT domain to exist and be associated with its RDT CDP - * resource but there is no RDT domain associated with the - * peer RDT CDP resource. Hence the WARN. - */ - _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); - if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) { - _r_cdp = NULL; - _d_cdp = NULL; - ret = -EINVAL; - } - -out: - *r_cdp = _r_cdp; - *d_cdp = _d_cdp; - - return ret; } /** @@ -1062,11 +1126,11 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, * Return: false if CBM does not overlap, true if it does. 
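At the core of the overlap check that follows is a simple predicate: two allocations conflict when their bitmasks intersect and at least one side claims exclusivity. The kernel's __rdtgroup_cbm_overlaps() additionally walks every allocated closid, skips pseudo-lock setup groups, and (via resctrl_peer_type()) repeats the test against the CDP peer type; the sketch below is only the core predicate, with made-up masks:

#include <stdbool.h>
#include <stdio.h>

/* Model of the overlap rule: two allocations conflict when their
 * bitmasks intersect and either side is exclusive. */
static bool cbm_overlaps(unsigned long a, bool a_excl,
                         unsigned long b, bool b_excl)
{
        return (a & b) && (a_excl || b_excl);
}

int main(void)
{
        /* 0x0f0 vs 0x0c3: bits 6-7 intersect */
        printf("exclusive: %d\n", cbm_overlaps(0x0f0, true, 0x0c3, false));
        printf("shareable: %d\n", cbm_overlaps(0x0f0, false, 0x0c3, false));
        return 0;
}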
*/ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - unsigned long cbm, int closid, bool exclusive) + unsigned long cbm, int closid, + enum resctrl_conf_type type, bool exclusive) { enum rdtgrp_mode mode; unsigned long ctrl_b; - u32 *ctrl; int i; /* Check for any overlap with regions used by hardware directly */ @@ -1077,9 +1141,8 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d } /* Check for overlap with other resource groups */ - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { - ctrl_b = *ctrl; + for (i = 0; i < closids_supported(); i++) { + ctrl_b = resctrl_arch_get_config(r, d, i, type); mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && mode != RDT_MODE_PSEUDO_LOCKSETUP) { @@ -1099,7 +1162,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d /** * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @r: Resource to which domain instance @d belongs. + * @s: Schema for the resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. * @closid: Intended closid for @cbm. @@ -1117,19 +1180,19 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d * * Return: true if CBM overlap detected, false if there is no overlap */ -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, unsigned long cbm, int closid, bool exclusive) { - struct rdt_resource *r_cdp; - struct rdt_domain *d_cdp; + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + struct rdt_resource *r = s->res; - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive)) + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, + exclusive)) return true; - if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0) + if (!resctrl_arch_get_cdp_enabled(r->rid)) return false; - - return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive); + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); } /** @@ -1147,17 +1210,21 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; + struct resctrl_schema *s; struct rdt_resource *r; bool has_cache = false; struct rdt_domain *d; + u32 ctrl; - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; if (r->rid == RDT_RESOURCE_MBA) continue; has_cache = true; list_for_each_entry(d, &r->domains, list) { - if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], - rdtgrp->closid, false)) { + ctrl = resctrl_arch_get_config(r, d, closid, + s->conf_type); + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { rdt_last_cmd_puts("Schemata overlaps\n"); return false; } @@ -1288,6 +1355,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { + struct resctrl_schema *schema; struct rdtgroup *rdtgrp; struct rdt_resource *r; struct rdt_domain *d; @@ -1309,8 +1377,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ret = -ENODEV; } else { seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->r->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->s->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, 
rdtgrp->plr->d, rdtgrp->plr->cbm); seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); @@ -1318,18 +1386,19 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(schema, &resctrl_schema_all, list) { + r = schema->res; sep = false; - seq_printf(s, "%*s:", max_name_width, r->name); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(d, &r->domains, list) { if (sep) seq_putc(s, ';'); if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - ctrl = (!is_mba_sc(r) ? - d->ctrl_val[rdtgrp->closid] : - d->mbps_val[rdtgrp->closid]); + ctrl = resctrl_arch_get_config(r, d, + rdtgrp->closid, + schema->conf_type); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -1426,6 +1495,17 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. + */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, { .name = "max_threshold_occupancy", .mode = 0644, @@ -1496,7 +1576,7 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); for (rft = rfts; rft < rfts + len; rft++) { - if ((fflags & rft->fflags) == rft->fflags) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); if (ret) goto error; @@ -1513,6 +1593,33 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) return ret; } +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +void __init thread_throttle_mode_init(void) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); + if (!rft) + return; + + rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; +} + /** * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file * @r: The resource group with which the file is associated. 
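Not part of the patch: the hunk above registers thread_throttle_mode with empty fflags and only fills them in from thread_throttle_mode_init() once platform discovery confirms per-thread MBA, while rdtgroup_add_files() now skips zero-fflags entries. A minimal userspace model of that register-now, expose-later pattern (all names and flag values below are invented for illustration):

#include <stdio.h>
#include <string.h>

struct file_entry {
	const char *name;
	unsigned long fflags;		/* 0 means "not exposed yet" */
};

static struct file_entry files[] = {
	{ .name = "min_bandwidth",        .fflags = 0x1 },
	{ .name = "thread_throttle_mode", .fflags = 0x0 },	/* deferred */
};

static struct file_entry *lookup(const char *name)
{
	for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); i++)
		if (!strcmp(files[i].name, name))
			return &files[i];
	return NULL;
}

int main(void)
{
	struct file_entry *fe = lookup("thread_throttle_mode");

	/* Platform discovery found the capability: expose the file now. */
	if (fe)
		fe->fflags = 0x1;

	/* Only entries with non-zero fflags are created. */
	for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); i++)
		if (files[i].fflags)
			printf("creating %s\n", files[i].name);
	return 0;
}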
@@ -1610,14 +1717,14 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } -static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, +static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; int ret; kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, r); + kn_info->mode, priv); if (IS_ERR(kn_subdir)) return PTR_ERR(kn_subdir); @@ -1634,6 +1741,7 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { + struct resctrl_schema *s; struct rdt_resource *r; unsigned long fflags; char name[32]; @@ -1648,9 +1756,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (ret) goto out_destroy; - for_each_alloc_enabled_rdt_resource(r) { + /* loop over enabled controls, these are all alloc_enabled */ + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; fflags = r->fflags | RF_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } @@ -1720,7 +1830,7 @@ static void l2_qos_cfg_update(void *arg) static inline bool is_mba_linear(void) { - return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear; + return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear; } static int set_cache_qos_cfg(int level, bool enable) @@ -1741,10 +1851,15 @@ static int set_cache_qos_cfg(int level, bool enable) if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - r_l = &rdt_resources_all[level]; + r_l = &rdt_resources_all[level].r_resctrl; list_for_each_entry(d, &r_l->domains, list) { - /* Pick one CPU from each domain instance to update MSR */ - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + if (r_l->cache.arch_has_per_cpu_cfg) + /* Pick all the CPUs in the domain instance */ + for_each_cpu(cpu, &d->cpu_mask) + cpumask_set_cpu(cpu, cpu_mask); + else + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); } cpu = get_cpu(); /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. 
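The set_cache_qos_cfg() change above now updates either one CPU per cache domain or every CPU in the domain, depending on cache.arch_has_per_cpu_cfg. A rough standalone sketch of that selection, with a plain bitmask standing in for cpumask_t (the helper and values are illustrative only):

#include <stdio.h>

struct domain {
	unsigned long cpu_mask;		/* CPUs backing this cache domain */
};

/* Which CPUs must run the QOS_CFG MSR update for this domain? */
static unsigned long cpus_to_update(const struct domain *d, int per_cpu_cfg)
{
	if (per_cpu_cfg)
		return d->cpu_mask;			/* every CPU in the domain */
	return d->cpu_mask & -d->cpu_mask;		/* lowest set bit: any one CPU */
}

int main(void)
{
	struct domain d = { .cpu_mask = 0xf };		/* CPUs 0-3 share the cache */

	printf("one CPU per domain: %#lx\n", cpus_to_update(&d, 0));	/* 0x1 */
	printf("per-CPU config:     %#lx\n", cpus_to_update(&d, 1));	/* 0xf */
	return 0;
}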
*/ @@ -1762,14 +1877,16 @@ static int set_cache_qos_cfg(int level, bool enable) /* Restore the qos cfg state when a domain comes online */ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) { - if (!r->alloc_capable) + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (!r->cdp_capable) return; - if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA]) - l2_qos_cfg_update(&r->alloc_enabled); + if (r->rid == RDT_RESOURCE_L2) + l2_qos_cfg_update(&hw_res->cdp_enabled); - if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA]) - l3_qos_cfg_update(&r->alloc_enabled); + if (r->rid == RDT_RESOURCE_L3) + l3_qos_cfg_update(&hw_res->cdp_enabled); } /* @@ -1780,7 +1897,8 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) */ static int set_mba_sc(bool mba_sc) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA]; + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + struct rdt_hw_domain *hw_dom; struct rdt_domain *d; if (!is_mbm_enabled() || !is_mba_linear() || @@ -1788,73 +1906,60 @@ static int set_mba_sc(bool mba_sc) return -EINVAL; r->membw.mba_sc = mba_sc; - list_for_each_entry(d, &r->domains, list) - setup_default_ctrlval(r, d->ctrl_val, d->mbps_val); + list_for_each_entry(d, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(d); + setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val); + } return 0; } -static int cdp_enable(int level, int data_type, int code_type) +static int cdp_enable(int level) { - struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; - struct rdt_resource *r_lcode = &rdt_resources_all[code_type]; - struct rdt_resource *r_l = &rdt_resources_all[level]; + struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; int ret; - if (!r_l->alloc_capable || !r_ldata->alloc_capable || - !r_lcode->alloc_capable) + if (!r_l->alloc_capable) return -EINVAL; ret = set_cache_qos_cfg(level, true); - if (!ret) { - r_l->alloc_enabled = false; - r_ldata->alloc_enabled = true; - r_lcode->alloc_enabled = true; - } + if (!ret) + rdt_resources_all[level].cdp_enabled = true; + return ret; } -static int cdpl3_enable(void) +static void cdp_disable(int level) { - return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE); -} + struct rdt_hw_resource *r_hw = &rdt_resources_all[level]; -static int cdpl2_enable(void) -{ - return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, - RDT_RESOURCE_L2CODE); + if (r_hw->cdp_enabled) { + set_cache_qos_cfg(level, false); + r_hw->cdp_enabled = false; + } } -static void cdp_disable(int level, int data_type, int code_type) +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) { - struct rdt_resource *r = &rdt_resources_all[level]; + struct rdt_hw_resource *hw_res = &rdt_resources_all[l]; - r->alloc_enabled = r->alloc_capable; + if (!hw_res->r_resctrl.cdp_capable) + return -EINVAL; - if (rdt_resources_all[data_type].alloc_enabled) { - rdt_resources_all[data_type].alloc_enabled = false; - rdt_resources_all[code_type].alloc_enabled = false; - set_cache_qos_cfg(level, false); - } -} + if (enable) + return cdp_enable(l); -static void cdpl3_disable(void) -{ - cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE); -} + cdp_disable(l); -static void cdpl2_disable(void) -{ - cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE); + return 0; } static void cdp_disable_all(void) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) - cdpl3_disable(); - if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) - cdpl2_disable(); + if 
(resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); } /* @@ -1932,10 +2037,10 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) int ret = 0; if (ctx->enable_cdpl2) - ret = cdpl2_enable(); + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); if (!ret && ctx->enable_cdpl3) - ret = cdpl3_enable(); + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); if (!ret && ctx->enable_mba_mbps) ret = set_mba_sc(true); @@ -1943,6 +2048,92 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) return ret; } +static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) +{ + struct resctrl_schema *s; + const char *suffix = ""; + int ret, cl; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->res = r; + s->num_closid = resctrl_arch_get_num_closid(r); + if (resctrl_arch_get_cdp_enabled(r->rid)) + s->num_closid /= 2; + + s->conf_type = type; + switch (type) { + case CDP_CODE: + suffix = "CODE"; + break; + case CDP_DATA: + suffix = "DATA"; + break; + case CDP_NONE: + suffix = ""; + break; + } + + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); + if (ret >= sizeof(s->name)) { + kfree(s); + return -EINVAL; + } + + cl = strlen(s->name); + + /* + * If CDP is supported by this resource, but not enabled, + * include the suffix. This ensures the tabular format of the + * schemata file does not change between mounts of the filesystem. + */ + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) + cl += 4; + + if (cl > max_name_width) + max_name_width = cl; + + INIT_LIST_HEAD(&s->list); + list_add(&s->list, &resctrl_schema_all); + + return 0; +} + +static int schemata_list_create(void) +{ + struct rdt_resource *r; + int ret = 0; + + for_each_alloc_enabled_rdt_resource(r) { + if (resctrl_arch_get_cdp_enabled(r->rid)) { + ret = schemata_list_add(r, CDP_CODE); + if (ret) + break; + + ret = schemata_list_add(r, CDP_DATA); + } else { + ret = schemata_list_add(r, CDP_NONE); + } + + if (ret) + break; + } + + return ret; +} + +static void schemata_list_destroy(void) +{ + struct resctrl_schema *s, *tmp; + + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { + list_del(&s->list); + kfree(s); + } +} + static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); @@ -1964,11 +2155,17 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_cdp; + ret = schemata_list_create(); + if (ret) { + schemata_list_destroy(); + goto out_mba; + } + closid_init(); ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret < 0) - goto out_mba; + goto out_schemata_free; if (rdt_mon_capable) { ret = mongroup_create_dir(rdtgroup_default.kn, @@ -2001,7 +2198,7 @@ static int rdt_get_tree(struct fs_context *fc) static_branch_enable_cpuslocked(&rdt_enable_key); if (is_mbm_enabled()) { - r = &rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; list_for_each_entry(dom, &r->domains, list) mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); } @@ -2018,6 +2215,8 @@ static int rdt_get_tree(struct fs_context *fc) kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); +out_schemata_free: + schemata_list_destroy(); out_mba: if (ctx->enable_mba_mbps) set_mba_sc(false); @@ -2110,6 +2309,8 @@ static int rdt_init_fs_context(struct fs_context *fc) static int reset_all_ctrls(struct rdt_resource *r) { + 
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom; struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; @@ -2120,7 +2321,7 @@ static int reset_all_ctrls(struct rdt_resource *r) msr_param.res = r; msr_param.low = 0; - msr_param.high = r->num_closid; + msr_param.high = hw_res->num_closid; /* * Disable resource control for this resource by setting all @@ -2128,10 +2329,11 @@ static int reset_all_ctrls(struct rdt_resource *r) * from each domain to update the MSRs below. */ list_for_each_entry(d, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(d); cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - for (i = 0; i < r->num_closid; i++) - d->ctrl_val[i] = r->default_ctrl; + for (i = 0; i < hw_res->num_closid; i++) + hw_dom->ctrl_val[i] = r->default_ctrl; } cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ @@ -2146,18 +2348,6 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); -} - /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -2175,22 +2365,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - t->closid = to->closid; - t->rmid = to->mon.rmid; + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. */ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); @@ -2277,6 +2463,7 @@ static void rdt_kill_sb(struct super_block *sb) rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; + schemata_list_destroy(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); @@ -2369,7 +2556,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, goto out_destroy; if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; @@ -2511,23 +2698,24 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) * Set the RDT domain up to start off with all usable allocations. That is, * all shareable and unused bits. All-zero CBM is invalid. 
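Before the rewritten __init_one_rdt_domain() below stages a new group's CBM, it effectively collects every bit claimed by other CLOSIDs (and any pseudo-locked region) and hands the new group the shareable bits plus whatever is left unused. The bit arithmetic, reduced to a standalone sketch with made-up values (the kernel additionally forces the result to be contiguous via cbm_ensure_valid(), omitted here):

#include <assert.h>

#define CBM_LEN		11			/* e.g. an 11-way cache mask */
#define CBM_MASK	((1u << CBM_LEN) - 1)

static unsigned int initial_cbm(unsigned int shareable_bits,
				unsigned int used_by_others)
{
	unsigned int used_b   = shareable_bits | used_by_others;
	unsigned int unused_b = (used_b ^ CBM_MASK) & CBM_MASK;

	/* New group: all shareable bits plus everything nobody uses. */
	return (shareable_bits | unused_b) & CBM_MASK;
}

int main(void)
{
	/* Hardware shares bits 0-1, other groups occupy bits 2-6. */
	assert(initial_cbm(0x003, 0x07c) == (0x003 | 0x780));
	return 0;
}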
*/ -static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, +static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 closid) { - struct rdt_resource *r_cdp = NULL; - struct rdt_domain *d_cdp = NULL; + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; u32 used_b = 0, unused_b = 0; unsigned long tmp_cbm; enum rdtgrp_mode mode; - u32 peer_ctl, *ctrl; + u32 peer_ctl, ctrl_val; int i; - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - d->have_new_ctrl = false; - d->new_ctrl = r->cache.shareable_bits; + cfg = &d->staged_config[t]; + cfg->have_new_ctrl = false; + cfg->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { + for (i = 0; i < closids_supported(); i++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) @@ -2542,35 +2730,38 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, * usage to ensure there is no overlap * with an exclusive group. */ - if (d_cdp) - peer_ctl = d_cdp->ctrl_val[i]; + if (resctrl_arch_get_cdp_enabled(r->rid)) + peer_ctl = resctrl_arch_get_config(r, d, i, + peer_type); else peer_ctl = 0; - used_b |= *ctrl | peer_ctl; + ctrl_val = resctrl_arch_get_config(r, d, i, + s->conf_type); + used_b |= ctrl_val | peer_ctl; if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl | peer_ctl; + cfg->new_ctrl |= ctrl_val | peer_ctl; } } if (d->plr && d->plr->cbm > 0) used_b |= d->plr->cbm; unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - d->new_ctrl |= unused_b; + cfg->new_ctrl |= unused_b; /* * Force the initial CBM to be valid, user can * modify the CBM based on system availability. */ - d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r); + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); /* * Assign the u32 CBM to an unsigned long to ensure that * bitmap_weight() does not access out-of-bound memory. */ - tmp_cbm = d->new_ctrl; + tmp_cbm = cfg->new_ctrl; if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id); return -ENOSPC; } - d->have_new_ctrl = true; + cfg->have_new_ctrl = true; return 0; } @@ -2585,13 +2776,13 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, * If there are no more shareable bits available on any domain then * the entire allocation will fail. */ -static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) { struct rdt_domain *d; int ret; - list_for_each_entry(d, &r->domains, list) { - ret = __init_one_rdt_domain(d, r, closid); + list_for_each_entry(d, &s->res->domains, list) { + ret = __init_one_rdt_domain(d, s, closid); if (ret < 0) return ret; } @@ -2602,30 +2793,34 @@ static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) /* Initialize MBA resource with default values. */ static void rdtgroup_init_mba(struct rdt_resource *r) { + struct resctrl_staged_config *cfg; struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { - d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; - d->have_new_ctrl = true; + cfg = &d->staged_config[CDP_NONE]; + cfg->new_ctrl = is_mba_sc(r) ? 
MBA_MAX_MBPS : r->default_ctrl; + cfg->have_new_ctrl = true; } } /* Initialize the RDT group's allocations. */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { + struct resctrl_schema *s; struct rdt_resource *r; int ret; - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; if (r->rid == RDT_RESOURCE_MBA) { rdtgroup_init_mba(r); } else { - ret = rdtgroup_init_cat(r, rdtgrp->closid); + ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) return ret; } - ret = update_domains(r, rdtgrp->closid); + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; @@ -2639,7 +2834,6 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) { @@ -2750,15 +2944,12 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) * to monitor a subset of tasks and cpus in its parent ctrl_mon group. */ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, - const char *name, - umode_t mode) + const char *name, umode_t mode) { struct rdtgroup *rdtgrp, *prgrp; int ret; - ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, - &rdtgrp); + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); if (ret) return ret; @@ -2780,7 +2971,6 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, * to allocate and monitor resources. */ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, const char *name, umode_t mode) { struct rdtgroup *rdtgrp; @@ -2788,8 +2978,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, u32 closid; int ret; - ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, - &rdtgrp); + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); if (ret) return ret; @@ -2863,20 +3052,19 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * subdirectory */ if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); /* * If RDT monitoring is supported and the parent directory is a valid * "mon_groups" directory, add a monitoring subdirectory. 
*/ if (rdt_mon_capable && is_mon_groups(parent_kn, name)) - return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; } -static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { struct rdtgroup *prdtgrp = rdtgrp->mon.parent; int cpu; @@ -2908,8 +3096,7 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, return 0; } -static int rdtgroup_ctrl_remove(struct kernfs_node *kn, - struct rdtgroup *rdtgrp) +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) { rdtgrp->flags = RDT_DELETED; list_del(&rdtgrp->rdtgroup_list); @@ -2918,8 +3105,7 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn, return 0; } -static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { int cpu; @@ -2946,7 +3132,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); - rdtgroup_ctrl_remove(kn, rdtgrp); + rdtgroup_ctrl_remove(rdtgrp); /* * Free all the child monitor group rmids. @@ -2983,13 +3169,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) rdtgrp != &rdtgroup_default) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(kn, rdtgrp); + ret = rdtgroup_ctrl_remove(rdtgrp); } else { - ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); } } else if (rdtgrp->type == RDTMON_GROUP && is_mon_groups(parent_kn, kn->name)) { - ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); } else { ret = -EPERM; } @@ -3002,13 +3188,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) seq_puts(seq, ",cdp"); - if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) seq_puts(seq, ",cdpl2"); - if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA])) + if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) seq_puts(seq, ",mba_MBps"); return 0; @@ -3089,7 +3275,7 @@ int __init rdtgroup_init(void) * It may also be ok since that would enable debugging of RDT before * resctrl is mounted. * The reason why the debugfs directory is created here and not in - * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and * during the debugfs directory creation also &sb->s_type->i_mutex_key * (the lockdep class of inode->i_rwsem). Other filesystem * interactions (eg. 
SyS_getdents) have the lock ordering: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index adf9b71386effa99207724997efd6eff34f3c10b..5cc4b5d43f4595992e61a20593329f3d9daddf8a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -35,6 +35,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 8d85e00bb40a5ad54e4164563b57941931f9e3ce..6a4cb71c24983ef65ac60d15718f7570cfa4c4f3 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -183,31 +184,31 @@ static unsigned int ioapic_id; struct of_ioapic_type { u32 out_type; - u32 trigger; - u32 polarity; + u32 is_level; + u32 active_low; }; static struct of_ioapic_type of_ioapic_type[] = { { - .out_type = IRQ_TYPE_EDGE_RISING, - .trigger = IOAPIC_EDGE, - .polarity = 1, + .out_type = IRQ_TYPE_EDGE_FALLING, + .is_level = 0, + .active_low = 1, }, { - .out_type = IRQ_TYPE_LEVEL_LOW, - .trigger = IOAPIC_LEVEL, - .polarity = 0, + .out_type = IRQ_TYPE_LEVEL_HIGH, + .is_level = 1, + .active_low = 0, }, { - .out_type = IRQ_TYPE_LEVEL_HIGH, - .trigger = IOAPIC_LEVEL, - .polarity = 1, + .out_type = IRQ_TYPE_LEVEL_LOW, + .is_level = 1, + .active_low = 1, }, { - .out_type = IRQ_TYPE_EDGE_FALLING, - .trigger = IOAPIC_EDGE, - .polarity = 0, + .out_type = IRQ_TYPE_EDGE_RISING, + .is_level = 0, + .active_low = 0, }, }; @@ -227,9 +228,9 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; it = &of_ioapic_type[type_index]; - ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); - tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); - tmp.ioapic_pin = fwspec->param[0]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low); + tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic.pin = fwspec->param[0]; return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7da2bcd2b8eb019ebffbf3325fc45cf896981c82..9976106b57ececec5d183c86f1732e84708b9010 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -190,6 +190,7 @@ static void __init e820_print_type(enum e820_type type) case E820_TYPE_RAM: /* Fall through: */ case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; @@ -1037,6 +1038,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; case E820_TYPE_PMEM: return "Persistent Memory"; case E820_TYPE_RESERVED: return "Reserved"; + case E820_TYPE_SOFT_RESERVED: return "Soft Reserved"; default: return "Unknown E820 type"; } } @@ -1052,6 +1054,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) case 
E820_TYPE_PRAM: /* Fall-through: */ case E820_TYPE_PMEM: /* Fall-through: */ case E820_TYPE_RESERVED: /* Fall-through: */ + case E820_TYPE_SOFT_RESERVED: /* Fall-through: */ default: return IORESOURCE_MEM; } } @@ -1064,6 +1067,7 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; + case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ @@ -1078,11 +1082,12 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) return true; /* - * Treat persistent memory like device memory, i.e. reserve it - * for exclusive use of a driver + * Treat persistent memory and other special memory ranges like + * device memory, i.e. reserve it for exclusive use of a driver */ switch (type) { case E820_TYPE_RESERVED: + case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; @@ -1285,6 +1290,9 @@ void __init e820__memblock_setup(void) if (end != (resource_size_t)end) continue; + if (entry->type == E820_TYPE_SOFT_RESERVED) + memblock_reserve(entry->addr, entry->size); + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) continue; diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 2954fab15e51c5372135e383f2924136baeffcba..794e70151203722b55050b392902279f63df7a30 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,7 +2,7 @@ /* * x86 FPU bug checks: */ -#include +#include /* * Boot time CPU/FPU FDIV bug detection code: diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h new file mode 100644 index 0000000000000000000000000000000000000000..958accf2ccf07fa513424c87ad691ab6499e681d --- /dev/null +++ b/arch/x86/kernel/fpu/context.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_CONTEXT_H +#define __X86_KERNEL_FPU_CONTEXT_H + +#include +#include + +/* Functions related to FPU context tracking */ + +/* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. 
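The ownership rules described in the comment above reduce to a two-part validity test: the per-CPU owner pointer must still name this task's FPU, and the task must not have run anywhere else since its registers were loaded. A compact userspace model of fpregs_state_valid() and the two invalidation paths (plain arrays stand in for per-CPU variables):

#include <assert.h>
#include <stddef.h>

struct fpu {
	int last_cpu;				/* CPU that last held these registers */
};

static struct fpu *fpregs_owner[2];		/* stand-in for fpu_fpregs_owner_ctx */

static int fpregs_valid(struct fpu *fpu, int cpu)
{
	return fpregs_owner[cpu] == fpu && fpu->last_cpu == cpu;
}

int main(void)
{
	struct fpu task_fpu = { .last_cpu = -1 };

	/* Task's state is loaded on CPU 0. */
	fpregs_owner[0] = &task_fpu;
	task_fpu.last_cpu = 0;
	assert(fpregs_valid(&task_fpu, 0));

	/* kernel_fpu_begin() style: the CPU is used for something else. */
	fpregs_owner[0] = NULL;
	assert(!fpregs_valid(&task_fpu, 0));

	/* Migration: CPU 1 never held this task's registers. */
	assert(!fpregs_valid(&task_fpu, 1));
	return 0;
}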
+ */ +static inline void __cpu_invalidate_fpregs_state(void) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); +} + +static inline void fpregs_activate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); +} + +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + * + * XFD state is handled in restore_fpregs_from_fpstate(). + */ + restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); + + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8c9b202f3e6db304769d02e3d6d1cf71f87595dd..ff41bd6a458c7e70e55dc3aa4bbede1b4b0228dc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,8 +6,9 @@ * General FPU state handling cleanups * Gareth Hughes , May 2000 */ -#include +#include #include +#include #include #include #include @@ -15,15 +16,30 @@ #include #include +#include + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); +#endif + +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __read_mostly; +struct fpstate init_fpstate __ro_after_init; /* * Track whether the kernel is using the FPU state @@ -82,6 +98,321 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); +/* + * Save the FPU register state in fpu->fpstate->regs. The register state is + * preserved. + * + * Must be called with fpregs_lock() held. + * + * The legacy FNSAVE instruction clears all FPU state unconditionally, so + * register state has to be reloaded. That might be a pointless exercise + * when the FPU is going to be used by another task right after that. But + * this only affects 20+ years old 32bit systems and avoids conditionals all + * over the place. + * + * FXSAVE and all XSAVE variants preserve the FPU register state. + */ +void save_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + os_xsave(fpu->fpstate); + + /* + * AVX512 state is tracked here because its use is + * known to slow the max clock speed of the core. 
+ */ + if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + fpu->avx512_timestamp = jiffies; + return; + } + + if (likely(use_fxsr())) { + fxsave(&fpu->fpstate->regs.fxsave); + return; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to reload them from the memory state. + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); +} + +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) +{ + /* + * AMD K7/K8 and later CPUs up to Zen don't save/restore + * FDP/FIP/FOP unless an exception is pending. Clear the x87 state + * here by setting it to fixed values. "m" is a random variable + * that should be in L1. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpstate)); + } + + if (use_xsave()) { + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. + */ + mask = fpu_kernel_cfg.max_features & mask; + + os_xrstor(fpstate, mask); + } else { + if (use_fxsr()) + fxrstor(&fpstate->regs.fxsave); + else + frstor(&fpstate->regs.fsave); + } +} + +void fpu_reset_from_exception_fixup(void) +{ + restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); +} + +#if IS_ENABLED(CONFIG_KVM) +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); + +static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +{ + struct fpu_state_perm *fpuperm; + u64 perm; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + spin_lock_irq(¤t->sighand->siglock); + fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + perm = fpuperm->__state_perm; + + /* First fpstate allocation locks down permissions. 
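fpu_init_guest_permissions() below snapshots the group leader's permission word under siglock and sets a lock bit so later permission changes are refused, while the guest is handed the mask without the lock bit. A reduced model of that handshake (the lock bit and feature values are invented):

#include <assert.h>
#include <stdint.h>

#define PERM_LOCKED	(1ull << 63)		/* illustrative lock bit */

struct perm {
	uint64_t state_perm;
};

/* First guest fpstate allocation: freeze the current permissions. */
static uint64_t lock_guest_perm(struct perm *leader)
{
	uint64_t perm = leader->state_perm;

	leader->state_perm = perm | PERM_LOCKED;
	return perm & ~PERM_LOCKED;		/* what the guest may use */
}

int main(void)
{
	struct perm leader = { .state_perm = 0x207 };	/* some xfeature bits */
	uint64_t guest = lock_guest_perm(&leader);

	assert(guest == 0x207);
	assert(leader.state_perm & PERM_LOCKED);	/* later changes are refused */
	return 0;
}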
*/ + WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); + + spin_unlock_irq(&current->sighand->siglock); + + gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; +} + +bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fpstate; + unsigned int size; + + size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); + if (!fpstate) + return false; + + /* Leave xfd to 0 (the reset value defined by spec) */ + __fpstate_reset(fpstate, 0); + fpstate_init_user(fpstate); + fpstate->is_valloc = true; + fpstate->is_guest = true; + + gfpu->fpstate = fpstate; + gfpu->xfeatures = fpu_user_cfg.default_features; + gfpu->perm = fpu_user_cfg.default_features; + gfpu->uabi_size = fpu_user_cfg.default_size; + fpu_init_guest_permissions(gfpu); + + return true; +} +EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); + +void fpu_free_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fps = gfpu->fpstate; + + if (!fps) + return; + + if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + return; + + gfpu->fpstate = NULL; + vfree(fps); +} +EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); + +/* + * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable + * @guest_fpu: Pointer to the guest FPU container + * @xfeatures: Features requested by guest CPUID + * + * Enable all dynamic xfeatures according to guest perm and requested CPUID. + * + * Return: 0 on success, error code otherwise + */ +int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) +{ + lockdep_assert_preemption_enabled(); + + /* Nothing to do if all requested features are already enabled. */ + xfeatures &= ~guest_fpu->xfeatures; + if (!xfeatures) + return 0; + + return __xfd_enable_feature(xfeatures, guest_fpu); +} +EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); + +#ifdef CONFIG_X86_64 +void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) +{ + fpregs_lock(); + guest_fpu->fpstate->xfd = xfd; + if (guest_fpu->fpstate->in_use) + xfd_update_state(guest_fpu->fpstate); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); + +/** + * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state + * + * Must be invoked from KVM after a VMEXIT before enabling interrupts when + * XFD write emulation is disabled. This is required because the guest can + * freely modify XFD and the state at VMEXIT is not guaranteed to be the + * same as the state on VMENTER. So software state has to be updated before + * any operation which depends on it can take place. + * + * Note: It can be invoked unconditionally even when write emulation is + * enabled for the price of a then pointless MSR read. 
+ */ +void fpu_sync_guest_vmexit_xfd_state(void) +{ + struct fpstate *fps = current->thread.fpu.fpstate; + + lockdep_assert_irqs_disabled(); + if (fpu_state_size_dynamic()) { + rdmsrl(MSR_IA32_XFD, fps->xfd); + __this_cpu_write(xfd_state, fps->xfd); + } +} +EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif /* CONFIG_X86_64 */ + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + + if (!cur_fps->is_confidential) { + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + +void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u32 pkru) +{ + struct fpstate *kstate = gfpu->fpstate; + union fpregs_state *ustate = buf; + struct membuf mb = { .p = buf, .left = size }; + + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { + __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + } else { + memcpy(&ustate->fxsave, &kstate->regs.fxsave, + sizeof(ustate->fxsave)); + /* Make it restorable on a XSAVE enabled host */ + ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; + } +} +EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); + +int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, + u64 xcr0, u32 *vpkru) +{ + struct fpstate *kstate = gfpu->fpstate; + const union fpregs_state *ustate = buf; + struct pkru_state *xpkru; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) + return -EINVAL; + if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + return 0; + } + + if (ustate->xsave.header.xfeatures & ~xcr0) + return -EINVAL; + + ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); + if (ret) + return ret; + + /* Retrieve PKRU if not in init state */ + if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); + *vpkru = xpkru->pkru; + } + return 0; +} +EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +#endif /* CONFIG_KVM */ + void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); @@ -94,11 +425,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) if (!(current->flags & PF_KTHREAD) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); - /* - * Ignore return value -- we don't care if reg state - * is clobbered. 
- */ - copy_fpregs_to_fpstate(¤t->thread.fpu); + save_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); @@ -121,92 +448,166 @@ void kernel_fpu_end(void) EXPORT_SYMBOL_GPL(kernel_fpu_end); /* - * Save the FPU state (mark it for reload if necessary): - * - * This only ever gets called for the current task. + * Sync the FPU register state to current's memory register state when the + * current task owns the FPU. The hardware register state is preserved. */ -void fpu__save(struct fpu *fpu) +void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); fpregs_lock(); trace_x86_fpu_before_save(fpu); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - if (!copy_fpregs_to_fpstate(fpu)) { - copy_kernel_to_fpregs(&fpu->state); - } - } + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); } +static inline unsigned int init_fpstate_copy_size(void) +{ + if (!use_xsave()) + return fpu_kernel_cfg.default_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.regs.xsave); +} + +static inline void fpstate_init_fxstate(struct fpstate *fpstate) +{ + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; +} + /* * Legacy x87 fpstate state init: */ -static inline void fpstate_init_fstate(struct fregs_state *fp) +static inline void fpstate_init_fstate(struct fpstate *fpstate) { - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; } -void fpstate_init(union fpregs_state *state) +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(struct fpstate *fpstate) { - if (!static_cpu_has(X86_FEATURE_FPU)) { - fpstate_init_soft(&state->soft); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpstate_init_soft(&fpstate->regs.soft); return; } - memset(state, 0, fpu_kernel_xstate_size); + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); - if (static_cpu_has(X86_FEATURE_XSAVES)) - fpstate_init_xstate(&state->xsave); - if (static_cpu_has(X86_FEATURE_FXSR)) - fpstate_init_fxstate(&state->fxsave); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpstate_init_fxstate(fpstate); else - fpstate_init_fstate(&state->fsave); + fpstate_init_fstate(fpstate); +} + +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) +{ + /* Initialize sizes and feature masks */ + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->user_size = fpu_user_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->user_xfeatures = fpu_user_cfg.default_features; + fpstate->xfd = xfd; +} + +void fpstate_reset(struct fpu *fpu) +{ + /* Set the fpstate pointer to the default fpstate */ + fpu->fpstate = &fpu->__fpstate; + __fpstate_reset(fpu->fpstate, init_fpstate.xfd); + + /* Initialize the permission related info in fpu */ + fpu->perm.__state_perm = fpu_kernel_cfg.default_features; + fpu->perm.__state_size = fpu_kernel_cfg.default_size; + fpu->perm.__user_state_size = fpu_user_cfg.default_size; + /* Same defaults for guests */ + fpu->guest_perm = fpu->perm; } -EXPORT_SYMBOL_GPL(fpstate_init); -int fpu__copy(struct task_struct *dst, struct task_struct *src) +static inline void fpu_inherit_perms(struct fpu *dst_fpu) { 
+ if (fpu_state_size_dynamic()) { + struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + + spin_lock_irq(¤t->sighand->siglock); + /* Fork also inherits the permissions of the parent */ + dst_fpu->perm = src_fpu->perm; + dst_fpu->guest_perm = src_fpu->guest_perm; + spin_unlock_irq(¤t->sighand->siglock); + } +} + +/* Clone current's FPU state on fork */ +int fpu_clone(struct task_struct *dst, unsigned long clone_flags) +{ + struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; - struct fpu *src_fpu = &src->thread.fpu; + /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; - if (!static_cpu_has(X86_FEATURE_FPU)) + fpstate_reset(dst_fpu); + + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; - WARN_ON_FPU(src_fpu != ¤t->thread.fpu); + /* + * Enforce reload for user space tasks and prevent kernel threads + * from trying to save the FPU registers on context switch. + */ + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* - * Don't let 'init optimized' areas of the XSAVE area - * leak into the child task: + * No FPU state inheritance for kernel threads and IO + * worker threads. */ - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + if (dst->flags & (PF_KTHREAD)) { + /* Clear out the minimal state */ + memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, + init_fpstate_copy_size()); + return 0; + } /* - * If the FPU registers are not current just memcpy() the state. - * Otherwise save current FPU registers directly into the child's FPU - * context, without any memory-to-memory copying. + * If a new feature is added, ensure all dynamic features are + * caller-saved from here! + */ + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); + + /* + * Save the default portion of the current FPU state into the + * clone. Assume all dynamic features to be defined as caller- + * saved, which enables skipping both the expansion of fpstate + * and the copying of any dynamic state. * - * ( The function 'fails' in the FNSAVE case, which destroys - * register contents so we have to load them back. ) + * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because + * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - - else if (!copy_fpregs_to_fpstate(dst_fpu)) - copy_kernel_to_fpregs(&dst_fpu->state); - + fpregs_restore_userregs(); + save_fpregs_to_fpstate(dst_fpu); + if (!(clone_flags & CLONE_THREAD)) + fpu_inherit_perms(dst_fpu); fpregs_unlock(); - set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); + /* + * Children never inherit PASID state. + * Force it to have its init value: + */ + if (use_xsave()) + dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -215,60 +616,13 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) } /* - * Activate the current task's in-memory FPU context, - * if it has not been used before: + * Whitelist the FPU register state embedded into task_struct for hardened + * usercopy. */ -static void fpu__initialize(struct fpu *fpu) +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); - - set_thread_flag(TIF_NEED_FPU_LOAD); - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); -} - -/* - * This function must be called before we read a task's fpstate. 
- * - * There's two cases where this gets called: - * - * - for the current task (when coredumping), in which case we have - * to save the latest FPU registers into the fpstate, - * - * - or it's called for stopped tasks (ptrace), in which case the - * registers were already saved by the context-switch code when - * the task scheduled out. - * - * If the task has used the FPU before then save it. - */ -void fpu__prepare_read(struct fpu *fpu) -{ - if (fpu == ¤t->thread.fpu) - fpu__save(fpu); -} - -/* - * This function must be called before we write a task's fpstate. - * - * Invalidate any cached FPU registers. - * - * After this function call, after registers in the fpstate are - * modified and the child task has woken up, the child task will - * restore the modified FPU state from the modified context. If we - * didn't clear its cached status here then the cached in-registers - * state pending on its former CPU could be restored, corrupting - * the modifications. - */ -void fpu__prepare_write(struct fpu *fpu) -{ - /* - * Only stopped child tasks can be used to modify the FPU - * state in the fpstate buffer: - */ - WARN_ON_FPU(fpu == ¤t->thread.fpu); - - /* Invalidate any cached state: */ - __fpu_invalidate_fpregs_state(fpu); + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + *size = fpu_kernel_cfg.default_size; } /* @@ -298,47 +652,91 @@ void fpu__drop(struct fpu *fpu) } /* - * Clear FPU registers by setting them up from - * the init fpstate: + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it. */ -static inline void copy_init_fpstate_to_fpregs(void) +static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { - fpregs_lock(); - if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, -1); - else if (static_cpu_has(X86_FEATURE_FXSR)) - copy_kernel_to_fxregs(&init_fpstate.fxsave); + os_xrstor(&init_fpstate, features_mask); + else if (use_fxsr()) + fxrstor(&init_fpstate.regs.fxsave); else - copy_kernel_to_fregs(&init_fpstate.fsave); + frstor(&init_fpstate.regs.fsave); - if (boot_cpu_has(X86_FEATURE_OSPKE)) - copy_init_pkru_to_fpregs(); + pkru_write_default(); +} - fpregs_mark_activate(); +/* + * Reset current->fpu memory state to the init values. + */ +static void fpu_reset_fpregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_lock(); + fpu__drop(fpu); + /* + * This does not change the actual hardware registers. It just + * resets the memory image and sets TIF_NEED_FPU_LOAD so a + * subsequent return to usermode will reload the registers from the + * task's memory image. + * + * Do not use fpstate_init() here. Just copy init_fpstate which has + * the correct content already except for PKRU. + * + * PKRU handling does not rely on the xstate when restoring for + * user space as PKRU is eagerly written in switch_to() and + * flush_thread(). + */ + memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); + set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } /* - * Clear the FPU state back to init state. - * - * Called by sys_execve(), by the signal handler code and by various - * error paths. + * Reset current's user FPU states to the init states. current's + * supervisor states, if any, are not modified by this function. The + * caller guarantees that the XSTATE header in memory is intact. 
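fpu__clear_user_states() below reinitializes only the user-visible features and leaves supervisor state untouched; conceptually it is a masked restore from the init image. A toy model of that masking (the user/supervisor split below is invented, the real mask is XFEATURE_MASK_USER_RESTORE):

#include <assert.h>
#include <stdint.h>

#define XFEATURE_USER		0x0000ffffull	/* illustrative split */
#define XFEATURE_SUPERVISOR	0xffff0000ull

/* Reset the user features of @cur back to @init, keep supervisor bits. */
static uint64_t clear_user_states(uint64_t cur, uint64_t init)
{
	return (cur & XFEATURE_SUPERVISOR) | (init & XFEATURE_USER);
}

int main(void)
{
	uint64_t cur  = 0x00030007;	/* supervisor bits 16-17, user bits 0-2 */
	uint64_t init = 0x00000001;	/* init image: only x87 in use */

	assert(clear_user_states(cur, init) == 0x00030001);
	return 0;
}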
*/ -void fpu__clear(struct fpu *fpu) +void fpu__clear_user_states(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ + WARN_ON_FPU(fpu != ¤t->thread.fpu); - fpu__drop(fpu); + fpregs_lock(); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpu_reset_fpregs(); + fpregs_unlock(); + return; + } + + /* + * Ensure that current's supervisor states are loaded into their + * corresponding registers. + */ + if (xfeatures_mask_supervisor() && + !fpregs_state_valid(fpu, smp_processor_id())) + os_xrstor_supervisor(fpu->fpstate); + + /* Reset user states in registers. */ + restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* - * Make sure fpstate is cleared and initialized. + * Now all FPU registers have their desired values. Inform the FPU + * state machine that current's FPU registers are in the hardware + * registers. The memory image does not need to be updated because + * any operation relying on it has to save the registers first when + * current's FPU is marked active. */ - fpu__initialize(fpu); - if (static_cpu_has(X86_FEATURE_FPU)) - copy_init_fpstate_to_fpregs(); + fpregs_mark_activate(); + fpregs_unlock(); } +void fpu_flush_thread(void) +{ + fpstate_reset(¤t->thread.fpu); + fpu_reset_fpregs(); +} /* * Load FPU context before returning to userspace. */ @@ -347,7 +745,7 @@ void switch_fpu_return(void) if (!static_cpu_has(X86_FEATURE_FPU)) return; - __fpregs_load_activate(); + fpregs_restore_userregs(); } EXPORT_SYMBOL_GPL(switch_fpu_return); @@ -377,7 +775,6 @@ void fpregs_mark_activate(void) fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } -EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: @@ -400,11 +797,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { - cwd = fpu->state.fxsave.cwd; - swd = fpu->state.fxsave.swd; + cwd = fpu->fpstate->regs.fxsave.cwd; + swd = fpu->fpstate->regs.fxsave.swd; } else { - cwd = (unsigned short)fpu->state.fsave.cwd; - swd = (unsigned short)fpu->state.fsave.swd; + cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; + swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; @@ -418,7 +815,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) - mxcsr = fpu->state.fxsave.mxcsr; + mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } @@ -447,3 +844,17 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) */ return 0; } + +/* + * Initialize register state that may prevent from entering low-power idle. + * This function will be invoked from the cpuidle driver only when needed. 
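fpu_idle_fpregs() below drops AMX tile state before deep idle because live tiles keep the core out of low-power states; afterwards the registers no longer belong to the task and must be reloaded on the way back to user space. A small model of that decision (the XTILE bit position is illustrative):

#include <stdio.h>
#include <stdint.h>

#define XFEATURE_MASK_XTILE	(1ull << 18)	/* illustrative bit */

struct cpu_state {
	uint64_t xfeatures_in_use;
	int fpregs_owned;
};

static void idle_fpregs(struct cpu_state *c)
{
	if (c->xfeatures_in_use & XFEATURE_MASK_XTILE) {
		/* tile_release() equivalent: drop the tile data ... */
		c->xfeatures_in_use &= ~XFEATURE_MASK_XTILE;
		/* ... and mark the registers as no longer owned. */
		c->fpregs_owned = 0;
	}
}

int main(void)
{
	struct cpu_state c = { .xfeatures_in_use = XFEATURE_MASK_XTILE,
			       .fpregs_owned = 1 };

	idle_fpregs(&c);
	printf("tiles released, fpregs owned: %d\n", c.fpregs_owned);
	return 0;
}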
+ */ +void fpu_idle_fpregs(void) +{ + /* Note: AMX_TILE being enabled implies XGETBV1 support */ + if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && + (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { + tile_release(); + fpregs_deactivate(¤t->thread.fpu); + } +} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index b271da0fa2193de5ff65012c7810d85c9af40e70..621f4b6cac4a33525edbbf00bac4a280781490be 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -2,15 +2,18 @@ /* * x86 FPU boot time init code: */ -#include +#include #include #include -#include #include #include #include +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + /* * Initialize the registers found in all CPUs, CR0 and CR4: */ @@ -35,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); else #endif asm volatile ("fninit"); @@ -90,7 +93,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) /* * Boot time FPU feature detection code: */ -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) @@ -122,23 +125,14 @@ static void __init fpu__init_system_mxcsr(void) static void __init fpu__init_system_generic(void) { /* - * Set up the legacy init FPU context. (xstate init might overwrite this - * with a more modern format, if the CPU supports it.) + * Set up the legacy init FPU context. Will be updated when the + * CPU supports XSAVE[S]. */ - fpstate_init(&init_fpstate); + fpstate_init_user(&init_fpstate); fpu__init_system_mxcsr(); } -/* - * Size of the FPU context state. All tasks in the system use the - * same context size, regardless of what portion they use. - * This is inherent to the XSAVE architecture which puts all state - * components into a single, continuous memory block: - */ -unsigned int fpu_kernel_xstate_size; -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); - /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -163,13 +157,13 @@ static void __init fpu__init_task_struct_size(void) * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + task_size -= sizeof(current->thread.fpu.__fpstate.regs); /* * Add back the dynamically-calculated register state * size. */ - task_size += fpu_kernel_xstate_size; + task_size += fpu_kernel_cfg.default_size; /* * We dynamically size 'struct fpu', so we require that @@ -178,7 +172,7 @@ static void __init fpu__init_task_struct_size(void) * you hit a compile error here, check the structure to * see if something got added to the end. */ - CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); @@ -193,100 +187,34 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + unsigned int size; /* - * Note that xstate sizes might be overwritten later during - * fpu__init_system_xstate(). 
+ * Note that the size configuration might be overwritten later + * during fpu__init_system_xstate(). */ - - if (!boot_cpu_has(X86_FEATURE_FPU)) { - fpu_kernel_xstate_size = sizeof(struct swregs_state); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + size = sizeof(struct swregs_state); + } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) { + size = sizeof(struct fxregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE; } else { - if (boot_cpu_has(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = - sizeof(struct fxregs_state); - else - fpu_kernel_xstate_size = - sizeof(struct fregs_state); + size = sizeof(struct fregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FP; } - fpu_user_xstate_size = fpu_kernel_xstate_size; -} - -/* - * Find supported xfeatures based on cpu features and command-line input. - * This must be called after fpu__init_parse_early_param() is called and - * xfeatures_mask is enumerated. - */ -u64 __init fpu__get_supported_xfeatures_mask(void) -{ - return XCNTXT_MASK; + fpu_kernel_cfg.max_size = size; + fpu_kernel_cfg.default_size = size; + fpu_user_cfg.max_size = size; + fpu_user_cfg.default_size = size; + fpstate_reset(¤t->thread.fpu); } -/* Legacy code to initialize eager fpu mode. */ -static void __init fpu__init_system_ctx_switch(void) +static void __init fpu__init_init_fpstate(void) { - static bool on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; -} - -/* - * We parse fpu parameters early because fpu__init_system() is executed - * before parse_early_param(). - */ -static void __init fpu__init_parse_early_param(void) -{ - char arg[128]; - char *argptr = arg; - int arglen, res, bit; - -#ifdef CONFIG_X86_32 - if (cmdline_find_option_bool(boot_command_line, "no387")) -#ifdef CONFIG_MATH_EMULATION - setup_clear_cpu_cap(X86_FEATURE_FPU); -#else - pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); -#endif - - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) - setup_clear_cpu_cap(X86_FEATURE_FXSR); -#endif - - if (cmdline_find_option_bool(boot_command_line, "noxsave")) - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - - if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; - - pr_info("Clearing CPUID bits:"); - do { - res = get_option(&argptr, &bit); - if (res == 0 || res == 3) - break; - - /* If the argument was too long, the last bit may be cut off */ - if (res == 1 && arglen >= sizeof(arg)) - break; - - if (bit >= 0 && bit < NCAPINTS * 32) { - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); - setup_clear_cpu_cap(bit); - } - } while (res == 2); - pr_cont("\n"); + /* Bring init_fpstate size and features up to date */ + init_fpstate.size = fpu_kernel_cfg.max_size; + init_fpstate.xfeatures = fpu_kernel_cfg.max_features; } /* @@ -295,7 +223,7 @@ static void __init fpu__init_parse_early_param(void) */ void __init fpu__init_system(struct cpuinfo_x86 *c) { - fpu__init_parse_early_param(); + fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(c); /* @@ -306,8 +234,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); - fpu__init_system_xstate(); + fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - - fpu__init_system_ctx_switch(); + 
fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..dbdb31f55fc7ff83165167741304bec103c5a42b --- /dev/null +++ b/arch/x86/kernel/fpu/internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_INTERNAL_H +#define __X86_KERNEL_FPU_INTERNAL_H + +extern struct fpstate init_fpstate; + +/* CPU feature check wrappers */ +static __always_inline __pure bool use_xsave(void) +{ + return cpu_feature_enabled(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return cpu_feature_enabled(X86_FEATURE_FXSR); +} + +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + +/* Used in init.c */ +extern void fpstate_init_user(struct fpstate *fpstate); +extern void fpstate_reset(struct fpu *fpu); + +#endif diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h new file mode 100644 index 0000000000000000000000000000000000000000..17c26b164c6321ac21717dd70eb567e72abfcb3c --- /dev/null +++ b/arch/x86/kernel/fpu/legacy.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_LEGACY_H +#define __X86_KERNEL_FPU_LEGACY_H + +#include + +extern unsigned int mxcsr_feature_mask; + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + \ + might_fault(); \ + \ + asm volatile(ASM_STAC "\n" \ + "1: " #insn "\n" \ + "2: " ASM_CLAC "\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn(insn, output, input...) 
\ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ + : output : input) + +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + +} + +static inline void fxrstor(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_safe(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void frstor(struct fregs_state *fx) +{ + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_safe(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + +#endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index d652b939ccfb5865f8a8383e04e71adfe450c1e1..75ffaef8c2991993597a764360a0260715f21d13 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -2,11 +2,17 @@ /* * FPU register's regset abstraction, for ptrace, core dumps, etc. */ -#include +#include +#include + +#include #include #include -#include -#include + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -26,20 +32,58 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r return 0; } +/* + * The regset get() functions are invoked from: + * + * - coredump to dump the current task's fpstate. If the current task + * owns the FPU then the memory state has to be synchronized and the + * FPU register state preserved. Otherwise fpstate is already in sync. + * + * - ptrace to dump fpstate of a stopped task, in which case the registers + * have already been saved to fpstate on context switch. + */ +static void sync_fpstate(struct fpu *fpu) +{ + if (fpu == ¤t->thread.fpu) + fpu_sync_fpstate(fpu); +} + +/* + * Invalidate cached FPU registers before modifying the stopped target + * task's fpstate. + * + * This forces the target task on resume to restore the FPU registers from + * modified fpstate. Otherwise the task might skip the restore and operate + * with the cached FPU registers which discards the modifications. 
+ */ +static void fpu_force_restore(struct fpu *fpu) +{ + /* + * Only stopped child tasks can be used to modify the FPU + * state in the fpstate buffer: + */ + WARN_ON_FPU(fpu == ¤t->thread.fpu); + + __fpu_invalidate_fpregs_state(fpu); +} + int xfpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; - if (!boot_cpu_has(X86_FEATURE_FXSR)) + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; - fpu__prepare_read(fpu); - fpstate_sanitize_xstate(fpu); + sync_fpstate(fpu); + + if (!use_xsave()) { + return membuf_write(&to, &fpu->fpstate->regs.fxsave, + sizeof(fpu->fpstate->regs.fxsave)); + } - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); + return 0; } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -47,67 +91,51 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; + struct fxregs_state newstate; int ret; - if (!boot_cpu_has(X86_FEATURE_FXSR)) + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; - fpu__prepare_write(fpu); - fpstate_sanitize_xstate(fpu); + /* No funny business with partial or oversized writes is permitted. */ + if (pos != 0 || count != sizeof(newstate)) + return -EINVAL; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1); + if (ret) + return ret; - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + /* Do not allow an invalid MXCSR value. */ + if (newstate.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; - /* - * update the header bits in the xsave header, indicating the - * presence of FP and SSE state. - */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpu_force_restore(fpu); - return ret; + /* Copy the state */ + memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); + + /* Clear xmm8..15 for 32-bit callers */ + BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); + if (in_ia32_syscall()) + memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16); + + /* Mark FP and SSE as in use when XSAVE is enabled */ + if (use_xsave()) + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + + return 0; } int xstateregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - struct fpu *fpu = &target->thread.fpu; - struct xregs_state *xsave; - int ret; - - if (!boot_cpu_has(X86_FEATURE_XSAVE)) + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - xsave = &fpu->state.xsave; - - fpu__prepare_read(fpu); + sync_fpstate(&target->thread.fpu); - if (using_compacted_format()) { - if (kbuf) - ret = copy_xstate_to_kernel(kbuf, xsave, pos, count); - else - ret = copy_xstate_to_user(ubuf, xsave, pos, count); - } else { - fpstate_sanitize_xstate(fpu); - /* - * Copy the 48 bytes defined by the software into the xsave - * area in the thread struct, so that we can copy the whole - * area to user using one user_regset_copyout(). 
- */ - memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - - /* - * Copy the xstate memory layout. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - } - return ret; + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); + return 0; } int xstateregs_set(struct task_struct *target, const struct user_regset *regset, @@ -115,44 +143,34 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct xregs_state *xsave; + struct xregs_state *tmpbuf = NULL; int ret; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; /* * A whole standard-format XSAVE buffer is needed: */ - if ((pos != 0) || (count < fpu_user_xstate_size)) + if (pos != 0 || count != fpu_user_cfg.max_size) return -EFAULT; - xsave = &fpu->state.xsave; + if (!kbuf) { + tmpbuf = vmalloc(count); + if (!tmpbuf) + return -ENOMEM; - fpu__prepare_write(fpu); - - if (using_compacted_format()) { - if (kbuf) - ret = copy_kernel_to_xstate(xsave, kbuf); - else - ret = copy_user_to_xstate(xsave, ubuf); - } else { - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - if (!ret) - ret = validate_xstate_header(&xsave->header); + if (copy_from_user(tmpbuf, ubuf, count)) { + ret = -EFAULT; + goto out; + } } - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - /* - * In case of failure, mark all states as init: - */ - if (ret) - fpstate_init(&fpu->state); + fpu_force_restore(fpu); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); +out: + vfree(tmpbuf); return ret; } @@ -228,10 +246,10 @@ static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) * FXSR floating point environment conversions. 
*/ -void -convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +static void __convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk, + struct fxregs_state *fxsave) { - struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -265,6 +283,12 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); +} + void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) @@ -293,32 +317,34 @@ void convert_to_fxsr(struct fxregs_state *fxsave, } int fpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; + struct fxregs_state fxsave, *fx; - fpu__prepare_read(fpu); + sync_fpstate(fpu); - if (!boot_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, to); - if (!boot_cpu_has(X86_FEATURE_FXSR)) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { + return membuf_write(&to, &fpu->fpstate->regs.fsave, + sizeof(struct fregs_state)); + } - fpstate_sanitize_xstate(fpu); + if (use_xsave()) { + struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) }; - if (kbuf && pos == 0 && count == sizeof(env)) { - convert_from_fxsr(kbuf, target); - return 0; + /* Handle init state optimized xstate correctly */ + copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); + fx = &fxsave; + } else { + fx = &fpu->fpstate->regs.fxsave; } - convert_from_fxsr(&env, target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + __convert_from_fxsr(&env, target, fx); + return membuf_write(&to, &env, sizeof(env)); } int fpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -329,47 +355,32 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__prepare_write(fpu); - fpstate_sanitize_xstate(fpu); + /* No funny business with partial or oversized writes is permitted. 
*/ + if (pos != 0 || count != sizeof(struct user_i387_ia32_struct)) + return -EINVAL; - if (!boot_cpu_has(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!boot_cpu_has(X86_FEATURE_FXSR)) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (ret) + return ret; - if (pos > 0 || count < sizeof(env)) - convert_from_fxsr(&env, target); + fpu_force_restore(fpu); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); - if (!ret) - convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); + else + memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); /* - * update the header bit in the xsave header, indicating the + * Update the header bit in the xsave header, indicating the * presence of FP. */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; - return ret; -} - -/* - * FPU state for core dumps. - * This is only used for a.out dumps now. - * It is declared generically using elf_fpregset_t (which is - * struct user_i387_struct) but is in fact only used for 32-bit - * dumps, so on 64-bit it is really struct user_i387_ia32_struct. - */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) -{ - struct task_struct *tsk = current; + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; - return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); + return 0; } -EXPORT_SYMBOL(dump_fpu); #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 400a05e1c1c519988a348be099c1160f68b0cbd8..8fd5f6a9482808439ee688f22e95dfc587ef0a82 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -7,37 +7,40 @@ #include #include -#include #include #include #include #include +#include #include -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_for_xstate(struct fxregs_state __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) +static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); + void __user *fpstate = fxbuf; unsigned int magic2; - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; + if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) + return false; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > fpu_user_xstate_size || + fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) - return -1; + goto setfx; /* * Check for the presence of second magic word at the end of memory @@ -45,26 +48,34 @@ static inline int check_for_xstate(struct fxregs_state __user *buf, * fpstate layout with out copying the extended state information * in the memory layout. 
*/ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; - - return 0; + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) + return false; + + if (likely(magic2 == FP_XSTATE_MAGIC2)) + return true; +setfx: + trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); + + /* Set the parameters for fx only state */ + fx_sw->magic1 = 0; + fx_sw->xstate_size = sizeof(struct fxregs_state); + fx_sw->xfeatures = XFEATURE_MASK_FPSSE; + return true; } /* * Signal frame handlers. */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - copy_fxregs_to_kernel(&tsk->thread.fpu); + fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -72,33 +83,54 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; + return false; } else { struct fregs_state __user *fp = buf; u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; + return false; } - return 0; + return true; } -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This is saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, + struct fpstate *fpstate) +{ + sw_bytes->magic1 = FP_XSTATE_MAGIC1; + sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; + sw_bytes->xfeatures = fpstate->user_xfeatures; + sw_bytes->xstate_size = fpstate->user_size; + + if (ia32_frame) + sw_bytes->extended_size += sizeof(struct fregs_state); +} + +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, + struct fpstate *fpstate) { struct xregs_state __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; + struct _fpx_sw_bytes sw_bytes = {}; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? 
&fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + save_sw_bytes(&sw_bytes, ia32_frame, fpstate); + err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) - return err; + return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -121,23 +153,17 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - return err; + return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { - int err; - if (use_xsave()) - err = copy_xregs_to_user(buf); - else if (use_fxsr()) - err = copy_fxregs_to_user((struct fxregs_state __user *) buf); + return xsave_to_user_sigframe(buf); + if (use_fxsr()) + return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = copy_fregs_to_user((struct fregs_state __user *) buf); - - if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) - err = -EFAULT; - return err; + return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* @@ -150,10 +176,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Try to save it directly to the user frame with disabled page fault handler. - * If this fails then do the slow path where the FPU state is first saved to - * task's fpu->state and then copy it to the user frame pointed to by the - * aligned pointer 'buf_fx'. + * Save it directly to the user frame with disabled page fault handler. If + * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -161,23 +185,37 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); + struct fpstate *fpstate = tsk->thread.fpu.fpstate; + bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); + if (!static_cpu_has(X86_FEATURE_FPU)) { + struct user_i387_ia32_struct fp; + + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, + .left = sizeof(fp)}); + return !copy_to_user(buf, &fp, sizeof(fp)); + } + if (!access_ok(buf, size)) - return -EACCES; + return false; - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_32 __user *) buf) ? -1 : 1; + if (use_xsave()) { + struct xregs_state __user *xbuf = buf_fx; + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + if (__clear_user(&xbuf->header, sizeof(xbuf->header))) + return false; + } retry: /* * Load the FPU registers if they are not valid for the current task. 
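Editor sketch (not part of the patch): the signal.c hunks above drop the boot-time fx_sw_reserved templates in favour of save_sw_bytes(), which derives the sw_reserved bookkeeping from the task's fpstate at signal delivery time. The standalone C sketch below mirrors that size arithmetic only; the struct name, the fregs_state size, the magic constant and the 832-byte image are illustrative assumptions, not values taken from a live system.

    /* Illustrative only: size bookkeeping analogous to save_sw_bytes(). */
    #include <stdbool.h>
    #include <stdio.h>

    #define EXAMPLE_MAGIC2_SIZE     4       /* trailing 32-bit magic word */
    #define EXAMPLE_FSAVE_HDR_SIZE  112     /* legacy fsave header, assumed size */

    struct example_sw_bytes {
            unsigned int magic1;
            unsigned int extended_size;     /* whole area incl. magic2 (+ fsave prefix) */
            unsigned long long xfeatures;
            unsigned int xstate_size;       /* xsave image only; magic2 sits right after */
    };

    static void fill_sw_bytes(struct example_sw_bytes *sw, bool ia32_frame,
                              unsigned int user_size, unsigned long long user_xfeatures)
    {
            sw->magic1        = 0x46505853u;        /* assumed FP_XSTATE_MAGIC1 value */
            sw->xfeatures     = user_xfeatures;
            sw->xstate_size   = user_size;
            sw->extended_size = user_size + EXAMPLE_MAGIC2_SIZE;

            /* 32-bit frames carry a legacy fsave header in front of buf_fx. */
            if (ia32_frame)
                    sw->extended_size += EXAMPLE_FSAVE_HDR_SIZE;
    }

    int main(void)
    {
            struct example_sw_bytes sw;

            /* e.g. an 832-byte user xstate image with FP|SSE|YMM enabled */
            fill_sw_bytes(&sw, false, 832, 0x7);
            printf("xstate_size=%u, magic2 at offset %u, extended_size=%u\n",
                   sw.xstate_size, sw.xstate_size, sw.extended_size);
            return 0;
    }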
@@ -187,7 +225,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - __fpregs_load_activate(); + fpregs_restore_userregs(); pagefault_disable(); ret = copy_fpregs_to_sigframe(buf_fx); @@ -195,252 +233,280 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); if (ret) { - if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; - return -EFAULT; + return false; } /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; + if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) + return false; - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) + return false; - return 0; + return true; } -static inline void -sanitize_restored_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, - u64 xfeatures, int fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, + u64 xrestore, bool fx_only) { - struct xregs_state *xsave = &state->xsave; - struct xstate_header *header = &xsave->header; - if (use_xsave()) { - /* - * Note: we don't need to zero the reserved bits in the - * xstate_header here because we either didn't copy them at all, - * or we checked earlier that they aren't set. - */ + u64 init_bv = ufeatures & ~xrestore; + int ret; - /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. - */ - if (fx_only) - header->xfeatures = XFEATURE_MASK_FPSSE; + if (likely(!fx_only)) + ret = xrstor_from_user_sigframe(buf, xrestore); else - header->xfeatures &= xfeatures; - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; + ret = fxrstor_from_user_sigframe(buf); - if (ia32_env) - convert_to_fxsr(&state->fxsave, ia32_env); + if (!ret && unlikely(init_bv)) + os_xrstor(&init_fpstate, init_bv); + return ret; + } else if (use_fxsr()) { + return fxrstor_from_user_sigframe(buf); + } else { + return frstor_from_user_sigframe(buf); } } /* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. + * Attempt to restore the FPU registers directly from user memory. + * Pagefaults are handled and any errors returned are fatal. 
*/ -static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { - if (use_xsave()) { - if (fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_fxregs(buf); - } else { - u64 init_bv = xfeatures_mask & ~xbv; - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_xregs(buf, xbv); - } - } else if (use_fxsr()) { - return copy_user_to_fxregs(buf); - } else - return copy_user_to_fregs(buf); -} + struct fpu *fpu = ¤t->thread.fpu; + int ret; -static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) -{ - struct user_i387_ia32_struct *envp = NULL; - int state_size = fpu_kernel_xstate_size; - int ia32_fxstate = (buf != buf_fx); - struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - struct user_i387_ia32_struct env; - u64 xfeatures = 0; - int fx_only = 0; - int ret = 0; +retry: + fpregs_lock(); + /* Ensure that XFD is up to date */ + xfd_update_state(fpu->fpstate); + pagefault_disable(); + ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, + xrestore, fx_only); + pagefault_enable(); - ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || - IS_ENABLED(CONFIG_IA32_EMULATION)); + if (unlikely(ret)) { + /* + * The above did an FPU restore operation, restricted to + * the user portion of the registers, and failed, but the + * microcode might have modified the FPU registers + * nevertheless. + * + * If the FPU registers do not belong to current, then + * invalidate the FPU register state otherwise the task + * might preempt current and return to user space with + * corrupted FPU registers. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); + + /* Try to handle #PF, but anything else is fatal. */ + if (ret != X86_TRAP_PF) + return false; - if (!buf) { - fpu__clear(fpu); - return 0; + if (!fault_in_pages_readable(buf, size)) + goto retry; + return false; } - if (!access_ok(buf, size)) - return -EACCES; + /* + * Restore supervisor states: previous context switch etc has done + * XSAVES and saved the supervisor states in the kernel buffer from + * which they can be restored now. + * + * It would be optimal to handle this with a single XRSTORS, but + * this does not work because the rest of the FPU registers have + * been restored from a user buffer directly. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) + os_xrstor_supervisor(fpu->fpstate); + + fpregs_mark_activate(); + fpregs_unlock(); + return true; +} - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; +static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) +{ + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + struct user_i387_ia32_struct env; + bool success, fx_only = false; + union fpregs_state *fpregs; + unsigned int state_size; + u64 user_xfeatures = 0; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. 
- */ - state_size = sizeof(struct fxregs_state); - fx_only = 1; - trace_x86_fpu_xstate_check_failed(fpu); - } else { - state_size = fx_sw_user.xstate_size; - xfeatures = fx_sw_user.xfeatures; - } + + if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + return false; + + fx_only = !fx_sw_user.magic1; + state_size = fx_sw_user.xstate_size; + user_xfeatures = fx_sw_user.xfeatures; + } else { + user_xfeatures = XFEATURE_MASK_FPSSE; + state_size = fpu->fpstate->user_size; + } + + if (likely(!ia32_fxstate)) { + /* Restore the FPU registers directly from user memory. */ + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); } /* - * The current state of the FPU registers does not matter. By setting - * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate - * is not modified on context switch and that the xstate is considered - * to be loaded again on return to userland (overriding last_cpu avoids - * the optimisation). + * Copy the legacy state because the FP portion of the FX frame has + * to be ignored for histerical raisins. The legacy state is folded + * in once the larger state has been copied. */ - set_thread_flag(TIF_NEED_FPU_LOAD); - __fpu_invalidate_fpregs_state(fpu); + if (__copy_from_user(&env, buf, sizeof(env))) + return false; - if ((unsigned long)buf_fx % 64) - fx_only = 1; /* - * For 32-bit frames with fxstate, copy the fxstate so it can be - * reconstructed later. + * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is + * not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). */ - if (ia32_fxstate) { - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - goto err_out; - envp = &env; - } else { + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { /* - * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause - * page faults. If it does, fall back to the slow path below, - * going through the kernel buffer with the enabled pagefault - * handler. + * If supervisor states are available then save the + * hardware state in current's fpstate so that the + * supervisor state is preserved. Save the full state for + * simplicity. There is no point in optimizing this by only + * saving the supervisor states and then shuffle them to + * the right place in memory. It's ia32 mode. Shrug. */ - fpregs_lock(); - pagefault_disable(); - ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); - pagefault_enable(); - if (!ret) { - fpregs_mark_activate(); - fpregs_unlock(); - return 0; - } - fpregs_deactivate(fpu); - fpregs_unlock(); + if (xfeatures_mask_supervisor()) + os_xsave(fpu->fpstate); + set_thread_flag(TIF_NEED_FPU_LOAD); } + __fpu_invalidate_fpregs_state(fpu); + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); - + fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; - - if (using_compacted_format()) { - ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) + return false; + } else { + if (__copy_from_user(&fpregs->fxsave, buf_fx, + sizeof(fpregs->fxsave))) + return false; + + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. 
*/ + if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) + return false; } else { - ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - - if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&fpu->state.xsave.header); + /* Mask invalid bits out for historical reasons (broken hardware). */ + fpregs->fxsave.mxcsr &= mxcsr_feature_mask; } - if (ret) - goto err_out; - - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); - - fpregs_lock(); - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); - } else if (use_fxsr()) { - ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); - if (ret) { - ret = -EFAULT; - goto err_out; - } + /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ + if (use_xsave()) + fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + } - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + /* Fold the legacy FP storage */ + convert_to_fxsr(&fpregs->fxsave, &env); - fpregs_lock(); - if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - } + fpregs_lock(); + if (use_xsave()) { + /* + * Remove all UABI feature bits not set in user_xfeatures + * from the memory xstate header which makes the full + * restore below bring them into init state. This works for + * fx_only mode as well because that has only FP and SSE + * set in user_xfeatures. + * + * Preserve supervisor states! + */ + u64 mask = user_xfeatures | xfeatures_mask_supervisor(); - ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); + fpregs->xsave.header.xfeatures &= mask; + success = !os_xrstor_safe(fpu->fpstate, + fpu_kernel_cfg.max_features); } else { - ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); - if (ret) - goto err_out; - - fpregs_lock(); - ret = copy_kernel_to_fregs_err(&fpu->state.fsave); + success = !fxrstor_safe(&fpregs->fxsave); } - if (!ret) + + if (likely(success)) fpregs_mark_activate(); - else - fpregs_deactivate(fpu); - fpregs_unlock(); -err_out: - if (ret) - fpu__clear(fpu); - return ret; + fpregs_unlock(); + return success; } -static inline int xstate_sigframe_size(void) +static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { - return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : - fpu_user_xstate_size; + unsigned int size = fpstate->user_size; + + return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; } /* * Restore FPU state from a sigframe: */ -int fpu__restore_sig(void __user *buf, int ia32_frame) +bool fpu__restore_sig(void __user *buf, int ia32_frame) { + struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; - int size = xstate_sigframe_size(); + bool ia32_fxstate = false; + bool success = false; + unsigned int size; + + if (unlikely(!buf)) { + fpu__clear_user_states(fpu); + return true; + } + size = xstate_sigframe_size(fpu->fpstate); + + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); + + /* + * Only FXSR enabled systems need the FX state quirk. + * FRSTOR does not need it and can use the fast path. 
+ */ if (ia32_frame && use_fxsr()) { buf_fx = buf + sizeof(struct fregs_state); size += sizeof(struct fregs_state); + ia32_fxstate = true; } - return __fpu__restore_sig(buf, buf_fx, size); + if (!access_ok(buf, size)) + goto out; + + if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { + success = !fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + } else { + success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + } + +out: + if (unlikely(!success)) + fpu__clear_user_states(fpu); + return success; } unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(); + unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { @@ -452,28 +518,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -void fpu__init_prepare_fx_sw_frame(void) + +unsigned long __init fpu__get_fpstate_size(void) { - int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; + unsigned long ret = fpu_user_cfg.max_size; - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; - fx_sw_reserved.xstate_size = fpu_user_xstate_size; + if (use_xsave()) + ret += FP_XSTATE_MAGIC2_SIZE; - if (IS_ENABLED(CONFIG_IA32_EMULATION) || - IS_ENABLED(CONFIG_X86_32)) { - int fsave_header_size = sizeof(struct fregs_state); + /* + * This space is needed on (most) 32-bit kernels, or when a 32-bit + * app is running on a 64-bit kernel. To keep things simple, just + * assume the worst case and always include space for 'freg_state', + * even for 64-bit apps on 64-bit kernels. This wastes a bit of + * space, but keeps the code simple. 
+ */ + if ((IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) && use_fxsr()) + ret += sizeof(struct fregs_state); - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size = size + fsave_header_size; - } + return ret; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 735d1f1bbabc7f234b16976e7b4e161a5b892135..e13b4fea4d68764bf3fdb906c790f83588f09430 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -4,21 +4,34 @@ * * Author: Suresh Siddha */ +#include #include #include #include +#include #include #include #include +#include +#include #include -#include -#include #include -#include +#include +#include #include -#include +#include +#include + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace @@ -37,37 +50,42 @@ static const char *xfeature_names[] = "AVX-512 ZMM_Hi256" , "Processor Trace (unused)" , "Protection Keys User registers", + "PASID state", + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "AMX Tile config" , + "AMX Tile data" , "unknown xstate feature" , }; -static short xsave_cpuid_features[] __initdata = { - X86_FEATURE_FPU, - X86_FEATURE_XMM, - X86_FEATURE_AVX, - X86_FEATURE_MPX, - X86_FEATURE_MPX, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_INTEL_PT, - X86_FEATURE_PKU, +static unsigned short xsave_cpuid_features[] __initdata = { + [XFEATURE_FP] = X86_FEATURE_FPU, + [XFEATURE_SSE] = X86_FEATURE_XMM, + [XFEATURE_YMM] = X86_FEATURE_AVX, + [XFEATURE_BNDREGS] = X86_FEATURE_MPX, + [XFEATURE_BNDCSR] = X86_FEATURE_MPX, + [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, + [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, + [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, + [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, + [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, + [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; -/* - * Mask of xstate features supported by the CPU and the kernel: - */ -u64 xfeatures_mask __read_mostly; - -static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; +static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; -/* - * The XSAVE area of kernel can be in standard or compacted format; - * it is always in standard format for user mode. This is the user - * mode standard format size used for signal and ptrace frames. - */ -unsigned int fpu_user_xstate_size; +#define XSTATE_FLAG_SUPERVISOR BIT(0) +#define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. 
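Editor sketch (not part of the patch): the first xstate.c hunk above replaces the cached xstate_comp_offsets[] array with per-feature size and flag tables (XSTATE_FLAG_SUPERVISOR / XSTATE_FLAG_ALIGNED64, both taken from CPUID leaf 0xD), and the later hunks compute compacted-format offsets on demand by walking xcomp_bv. The standalone C sketch below shows that walk with a made-up component table; the sizes, flags and feature indices are assumptions for illustration, not real CPUID data.

    /* Illustrative only: compacted-offset walk analogous to xfeature_get_offset(). */
    #include <stdint.h>
    #include <stdio.h>

    #define FXSAVE_SIZE        512
    #define XSAVE_HDR_SIZE     64
    #define FIRST_EXT_FEATURE  2        /* 0 = FP, 1 = SSE live in the legacy area */
    #define FLAG_ALIGNED64     0x2      /* CPUID(0xD, i).ECX bit 1 */

    /* Made-up component table: sizes in bytes and ECX-style flags. */
    static const unsigned int example_sizes[10] = {
            [2] = 256, [3] = 64, [4] = 64, [5] = 64,
            [6] = 512, [7] = 1024, [9] = 8,
    };
    static const unsigned int example_flags[10] = {
            [7] = FLAG_ALIGNED64,       /* pretend this component wants 64-byte alignment */
    };

    static unsigned int compacted_offset(uint64_t xcomp_bv, int feature)
    {
            unsigned int offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
            int i;

            /* Walk every enabled extended component up to 'feature'. */
            for (i = FIRST_EXT_FEATURE; i <= feature; i++) {
                    if (!(xcomp_bv & (1ULL << i)))
                            continue;                   /* absent, occupies no space */
                    if (example_flags[i] & FLAG_ALIGNED64)
                            offs = (offs + 63) & ~63u;  /* component starts 64-byte aligned */
                    if (i == feature)
                            break;                      /* 'offs' is where 'feature' begins */
                    offs += example_sizes[i];
            }
            return offs;
    }

    int main(void)
    {
            uint64_t xcomp_bv = (1ULL << 2) | (1ULL << 7) | (1ULL << 9);

            /* 576 + 256 (comp 2) + 1024 (comp 7, already aligned) = 1856 */
            printf("component 9 starts at offset %u\n", compacted_offset(xcomp_bv, 9));
            return 0;
    }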
@@ -76,7 +94,7 @@ unsigned int fpu_user_xstate_size; */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -107,104 +125,41 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); -static int xfeature_is_supervisor(int xfeature_nr) +static bool xfeature_is_aligned64(int xfeature_nr) { - /* - * We currently do not support supervisor states, but if - * we did, we could find out like this. - * - * SDM says: If state component 'i' is a user state component, - * ECX[0] return 0; if state component i is a supervisor - * state component, ECX[0] returns 1. - */ - u32 eax, ebx, ecx, edx; - - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return !!(ecx & 1); + return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; } -static int xfeature_is_user(int xfeature_nr) +static bool xfeature_is_supervisor(int xfeature_nr) { - return !xfeature_is_supervisor(xfeature_nr); + return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } -/* - * When executing XSAVEOPT (or other optimized XSAVE instructions), if - * a processor implementation detects that an FPU state component is still - * (or is again) in its initialized state, it may clear the corresponding - * bit in the header.xfeatures field, and can skip the writeout of registers - * to the corresponding memory layout. - * - * This means that when the bit is zero, the state component might still contain - * some previous - non-initialized register state. - * - * Before writing xstate information to user-space we sanitize those components, - * to always ensure that the memory layout of a feature will be in the init state - * if the corresponding header bit is zero. This is to ensure that user-space doesn't - * see some stale state in the memory layout during signal handling, debugging etc. - */ -void fpstate_sanitize_xstate(struct fpu *fpu) +static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) { - struct fxregs_state *fx = &fpu->state.fxsave; - int feature_bit; - u64 xfeatures; - - if (!use_xsaveopt()) - return; - - xfeatures = fpu->state.xsave.header.xfeatures; + unsigned int offs, i; /* - * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is up to date. + * Non-compacted format and legacy features use the cached fixed + * offsets. */ - if ((xfeatures & xfeatures_mask) == xfeatures_mask) - return; + if (!cpu_feature_enabled(X86_FEATURE_XSAVES) || xfeature <= XFEATURE_SSE) + return xstate_offsets[xfeature]; /* - * FP is in init state + * Compacted format offsets depend on the actual content of the + * compacted xsave area which is determined by the xcomp_bv header + * field. 
*/ - if (!(xfeatures & XFEATURE_MASK_FP)) { - fx->cwd = 0x37f; - fx->swd = 0; - fx->twd = 0; - fx->fop = 0; - fx->rip = 0; - fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); - } - - /* - * SSE is in init state - */ - if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(&fx->xmm_space[0], 0, 256); - - /* - * First two features are FPU and SSE, which above we handled - * in a special way already: - */ - feature_bit = 0x2; - xfeatures = (xfeatures_mask & ~xfeatures) >> 2; - - /* - * Update all the remaining memory layouts according to their - * standard xstate layout, if their header bit is in the init - * state: - */ - while (xfeatures) { - if (xfeatures & 0x1) { - int offset = xstate_comp_offsets[feature_bit]; - int size = xstate_sizes[feature_bit]; - - memcpy((void *)fx + offset, - (void *)&init_fpstate.xsave + offset, - size); - } - - xfeatures >>= 1; - feature_bit++; + offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for_each_extended_xfeature(i, xcomp_bv) { + if (xfeature_is_aligned64(i)) + offs = ALIGN(offs, 64); + if (i == xfeature) + break; + offs += xstate_sizes[i]; } + return offs; } /* @@ -213,40 +168,49 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; + + cr4_set_bits(X86_CR4_OSXSAVE); + /* - * Make it clear that XSAVES supervisor states are not yet - * implemented should anyone expect it to work by changing - * bits in XFEATURE_MASK_* macros and XCR0. + * Must happen after CR4 setup and before xsetbv() to allow KVM + * lazy passthrough. Write independent of the dynamic state static + * key as that does not work on the boot CPU. This also ensures + * that any stale state is wiped out from XFD. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), - "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + if (cpu_feature_enabled(X86_FEATURE_XFD)) + wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); - cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + /* + * MSR_IA32_XSS sets supervisor states managed by XSAVES. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_independent()); + } } -/* - * Note that in the future we will likely need a pair of - * functions here: one for user xstates and the other for - * system xstates. For now, they are the same. - */ -static int xfeature_enabled(enum xfeature xfeature) +static bool xfeature_enabled(enum xfeature xfeature) { - return !!(xfeatures_mask & (1UL << xfeature)); + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. */ -static void __init setup_xstate_features(void) +static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, i; - /* start at the beginnning of the "extended state" */ + /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* @@ -254,32 +218,37 @@ static void __init setup_xstate_features(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. 
*/ - xstate_offsets[0] = 0; - xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); - xstate_offsets[1] = xstate_sizes[0]; - xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, + xmm_space); + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_sizes[i] = eax; + xstate_flags[i] = ecx; + /* - * If an xfeature is supervisor state, the offset - * in EBX is invalid. We leave it to -1. + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. */ - if (xfeature_is_user(i)) - xstate_offsets[i] = ebx; + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; - xstate_sizes[i] = eax; /* - * In our xstate size checks, we assume that the - * highest-numbered xstate feature has the - * highest offset in the buffer. Ensure it does. + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. */ WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); + "x86/fpu: misordered xstate at %d\n", last_good_offset); + last_good_offset = xstate_offsets[i]; } } @@ -306,6 +275,9 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); + print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_XTILE_CFG); + print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* @@ -318,137 +290,103 @@ static void __init print_xstate_features(void) } while (0) /* - * We could cache this like xstate_size[], but we only use - * it here, so it would be a waste of space. + * Print out xstate component offsets and sizes */ -static int xfeature_is_aligned(int xfeature_nr) +static void __init print_xstate_offset_size(void) { - u32 eax, ebx, ecx, edx; + int i; - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - /* - * The value returned by ECX[1] indicates the alignment - * of state component 'i' when the compacted format - * of the extended region of an XSAVE area is used: - */ - return !!(ecx & 2); + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", + i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), + i, xstate_sizes[i]); + } } /* - * This function sets up offsets and sizes of all extended states in - * xsave area. This supports both standard format and compacted format - * of the xsave aread. + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. 
*/ -static void __init setup_xstate_comp(void) +static __init void os_xrstor_booting(struct xregs_state *xstate) { - unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; - int i; + u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* - * The FP xstates and SSE xstates are legacy states. They are always - * in the fixed offsets in the xsave area in either compacted form - * or standard form. + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); - - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) { - xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } - } - return; - } - - xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = - FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; - - if (i > FIRST_EXTENDED_XFEATURE) { - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; - - if (xfeature_is_aligned(i)) - xstate_comp_offsets[i] = - ALIGN(xstate_comp_offsets[i], 64); - } - } + WARN_ON_FPU(err); } /* - * Print out xstate component offsets and sizes + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. */ -static void __init print_xstate_offset_size(void) -{ - int i; - - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", - i, xstate_comp_offsets[i], i, xstate_sizes[i]); - } -} +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_PASID | \ + XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED) != + XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; - setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); /* * Init all the features state with header.xfeatures being 0x0 */ - copy_kernel_to_xregs_booting(&init_fpstate.xsave); - - /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. 
- */ - copy_xregs_to_kernel_booting(&init_fpstate.xsave); -} - -static int xfeature_uncompacted_offset(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; + os_xrstor_booting(&init_fpstate.regs.xsave); /* - * Only XSAVES supports supervisor states and it uses compacted - * format. Checking a supervisor state's uncompacted offset is - * an error. + * All components are now in init state. Read the state back so + * that init_fpstate contains all non-zero init state. This only + * works with XSAVE, but not with XSAVEOPT and XSAVES because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVES is available because XSAVES uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. */ - if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) { - WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); - return -1; - } - - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return ebx; + fxsave(&init_fpstate.regs.fxsave); } -static int xfeature_size(int xfeature_nr) +int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; @@ -457,25 +395,12 @@ static int xfeature_size(int xfeature_nr) return eax; } -/* - * 'XSAVES' implies two different things: - * 1. saving of supervisor/system state - * 2. using the compacted format - * - * Use this function when dealing with the compacted format so - * that it is obvious which aspect of 'XSAVES' is being handled - * by the calling code. - */ -int using_compacted_format(void) -{ - return boot_cpu_has(X86_FEATURE_XSAVES); -} - /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr, + struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -495,7 +420,7 @@ int validate_xstate_header(const struct xstate_header *hdr) return 0; } -static void __xstate_dump_leaves(void) +static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; @@ -530,12 +455,73 @@ static void __xstate_dump_leaves(void) } \ } while (0) +/** + * check_xtile_data_against_struct - Check tile data state size. + * + * Calculate the state size by multiplying the single tile size which is + * recorded in a C struct, and the number of tiles that the CPU informs. + * Compare the provided size with the calculation. + * + * @size: The tile data state size + * + * Returns: 0 on success, -EINVAL on mismatch. + */ +static int __init check_xtile_data_against_struct(int size) +{ + u32 max_palid, palid, state_size; + u32 eax, ebx, ecx, edx; + u16 max_tile; + + /* + * Check the maximum palette id: + * eax: the highest numbered palette subleaf. + */ + cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + + /* + * Cross-check each tile size and find the maximum number of + * supported tiles. 
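The cross-check described above can be mirrored from user space with CPUID leaf 0x1D (the value behind TILE_CPUID); a sketch assuming an AMX-capable CPU and GCC's <cpuid.h>:

#include <cpuid.h>
#include <stdio.h>

#define TILE_CPUID 0x1d

int main(void)
{
    unsigned int eax, ebx, ecx, edx, max_palid, palid;

    /* Subleaf 0: EAX reports the highest numbered palette. */
    __cpuid_count(TILE_CPUID, 0, max_palid, ebx, ecx, edx);

    for (palid = 1; palid <= max_palid; palid++) {
        __cpuid_count(TILE_CPUID, palid, eax, ebx, ecx, edx);
        /* EAX[31:16]: bytes per tile, EBX[31:16]: number of tiles. */
        printf("palette %u: %u tiles * %u bytes = %u byte XTILE_DATA\n",
               palid, ebx >> 16, eax >> 16, (ebx >> 16) * (eax >> 16));
    }
    return 0;
}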
+ */ + for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { + u16 tile_size, max; + + /* + * Check the tile size info: + * eax[31:16]: bytes per tile + * ebx[31:16]: the max names (or max number of tiles) + */ + cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx); + tile_size = eax >> 16; + max = ebx >> 16; + + if (tile_size != sizeof(struct xtile_data)) { + pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), + sizeof(struct xtile_data), tile_size); + __xstate_dump_leaves(); + return -EINVAL; + } + + if (max > max_tile) + max_tile = max; + } + + state_size = sizeof(struct xtile_data) * max_tile; + if (size != state_size) { + pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), state_size, size); + __xstate_dump_leaves(); + return -EINVAL; + } + return 0; +} + /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ -static void check_xstate_against_struct(int nr) +static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. @@ -552,6 +538,12 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); + XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); + XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); + + /* The tile data size varies between implementations. */ + if (nr == XFEATURE_XTILE_DATA) + check_xtile_data_against_struct(sz); /* * Make *SURE* to add any feature numbers in below if @@ -560,65 +552,67 @@ static void check_xstate_against_struct(int nr) */ if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); + return false; } + return true; +} + +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int topmost = fls64(xfeatures) - 1; + unsigned int offset = xstate_offsets[topmost]; + + if (topmost <= XFEATURE_SSE) + return sizeof(struct xregs_state); + + if (compacted) + offset = xfeature_get_offset(xfeatures, topmost); + return offset + xstate_sizes[topmost]; } /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. + * + * Independent XSAVE features allocate their own buffers and are not + * covered by these checks. Only the size of the buffer for task->fpu + * is checked here. */ -static void do_extra_xstate_size_checks(void) +static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - - check_xstate_against_struct(i); + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!check_xstate_against_struct(i)) + return false; /* * Supervisor state components can be managed only by - * XSAVES, which is compacted-format only.
- */ - if (!using_compacted_format()) - XSTATE_WARN_ON(xfeature_is_supervisor(i)); - - /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) - paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); - /* - * The offset of a given state in the non-compacted - * format is given to us in a CPUID leaf. We check - * them for being ordered (increasing offsets) in - * setup_xstate_features(). - */ - if (!using_compacted_format()) - paranoid_xstate_size = xfeature_uncompacted_offset(i); - /* - * The compacted-format offset always depends on where - * the previous state ended. + * XSAVES. */ - paranoid_xstate_size += xfeature_size(i); + if (!compacted && xfeature_is_supervisor(i)) { + XSTATE_WARN_ON(1); + return false; + } } - XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); + size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); + XSTATE_WARN_ON(size != kernel_size); + return size == kernel_size; } - /* - * Get total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. - * - * Note that we do not currently set any bits on IA32_XSS so - * 'XCR0 | IA32_XSS == XCR0' for now. */ static unsigned int __init get_xsaves_size(void) { @@ -635,7 +629,34 @@ static unsigned int __init get_xsaves_size(void) return ebx; } -static unsigned int __init get_xsave_size(void) +/* + * Get the total size of the enabled xstates without the independent supervisor + * features. + */ +static unsigned int __init get_xsaves_size_no_independent(void) +{ + u64 mask = xfeatures_mask_independent(); + unsigned int size; + + if (!mask) + return get_xsaves_size(); + + /* Disable independent features. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + + /* + * Ask the hardware what size is required of the buffer. + * This is the size required for the task->fpu buffer. + */ + size = get_xsaves_size(); + + /* Re-enable independent features so XSAVES will work on them again. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + + return size; +} + +static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* @@ -653,44 +674,54 @@ static unsigned int __init get_xsave_size(void) * Will the runtime-enumerated 'xstate_size' fit in the init * task's statically-allocated buffer? 
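Both of the size checks above lean on xstate_calculate_size() from the previous hunk: a buffer ends where its highest-numbered enabled component ends. A worked standard-format example with illustrative layout values (PKRU, component 9, at offset 2688 with 8 bytes of state):

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's cached layout arrays. */
static unsigned int xstate_offsets[64] = { [9] = 2688 };
static unsigned int xstate_sizes[64]   = { [9] = 8 };

static unsigned int calc_size_std(uint64_t xfeatures)
{
    unsigned int topmost = 63 - __builtin_clzll(xfeatures);

    if (topmost <= 1)            /* only FP/SSE enabled */
        return 512 + 64;         /* legacy area + xstate header */

    return xstate_offsets[topmost] + xstate_sizes[topmost];
}

int main(void)
{
    /* FP, SSE, YMM, PKRU: the buffer ends right after PKRU, 2696 bytes. */
    printf("%u\n", calc_size_std((1ULL << 0) | (1ULL << 1) |
                                 (1ULL << 2) | (1ULL << 9)));
    return 0;
}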
*/ -static bool is_supported_xstate_size(unsigned int test_xstate_size) +static bool __init is_supported_xstate_size(unsigned int test_xstate_size) { - if (test_xstate_size <= sizeof(union fpregs_state)) + if (test_xstate_size <= sizeof(init_fpstate.regs)) return true; pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(union fpregs_state), test_xstate_size); + sizeof(init_fpstate.regs), test_xstate_size); return false; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size; - unsigned int xsave_size; + unsigned int user_size, kernel_size, kernel_default_size; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); - xsave_size = get_xsave_size(); + /* Uncompacted user space size */ + user_size = get_xsave_size_user(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size(); + /* + * XSAVES kernel size includes supervisor states and + * uses compacted format when available. + * + * XSAVE does not support supervisor states so + * kernel and user size is identical. + */ + if (compacted) + kernel_size = get_xsaves_size_no_independent(); else - possible_xstate_size = xsave_size; + kernel_size = user_size; - /* Ensure we have the space to store all enabled: */ - if (!is_supported_xstate_size(possible_xstate_size)) + kernel_default_size = + xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); + + /* Ensure we have the space to store all default enabled features. */ + if (!is_supported_xstate_size(kernel_default_size)) return -EINVAL; - /* - * The size is OK, we are definitely going to use xsave, - * make it known to the world that we need more space. - */ - fpu_kernel_xstate_size = possible_xstate_size; - do_extra_xstate_size_checks(); + if (!paranoid_xstate_size_valid(kernel_size)) + return -EINVAL; + + fpu_kernel_cfg.max_size = kernel_size; + fpu_user_cfg.max_size = user_size; + + fpu_kernel_cfg.default_size = kernel_default_size; + fpu_user_cfg.default_size = + xstate_calculate_size(fpu_user_cfg.default_features, false); - /* - * User space is always in standard format. - */ - fpu_user_xstate_size = xsave_size; return 0; } @@ -698,27 +729,38 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { - xfeatures_mask = 0; + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + /* Restore the legacy size. */ + fpu_kernel_cfg.max_size = legacy_size; + fpu_kernel_cfg.default_size = legacy_size; + fpu_user_cfg.max_size = legacy_size; + fpu_user_cfg.default_size = legacy_size; + + /* + * Prevent enabling the static branch which enables writes to the + * XFD MSR. + */ + init_fpstate.xfd = 0; + + fpstate_reset(&current->thread.fpu); } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ -void __init fpu__init_system_xstate(void) +void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu __initdata = 1; + u64 xfeatures; int err; int i; - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; @@ -735,16 +777,26 @@ void __init fpu__init_system_xstate(void) return; } + /* + * Find user xstates supported by the processor.
+ */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); + fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); + + /* + * Find supervisor xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); - if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + fpu_kernel_cfg.max_features); goto out_disable; } @@ -752,38 +804,82 @@ void __init fpu__init_system_xstate(void) * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { - if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask &= ~BIT(i); + unsigned short cid = xsave_cpuid_features[i]; + + /* Careful: X86_FEATURE_FPU is 0! */ + if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) + fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } - xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + if (!cpu_feature_enabled(X86_FEATURE_XFD)) + fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; + + fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + + /* Clean out dynamic features from default */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + /* Store it for paranoia check at the end */ + xfeatures = fpu_kernel_cfg.max_features; + + /* + * Initialize the default XFD state in init_fpstate and enable the + * dynamic sizing mechanism if dynamic states are available. The + * static key cannot be enabled here because this runs before + * jump_label_init(). This is delayed to an initcall. + */ + init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); + + /* Cache size, offset and flags for initialization */ + setup_xstate_cache(); + err = init_xstate_size(); if (err) goto out_disable; + /* Reset the state for the current task */ + fpstate_reset(&current->thread.fpu); + /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); + update_regset_xstate_info(fpu_user_cfg.max_size, + fpu_user_cfg.max_features); - fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); - setup_xstate_comp(); - print_xstate_offset_size(); + /* + * Paranoia check whether something in the setup modified the + * xfeatures mask.
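The mask derivation earlier in this hunk is plain bit arithmetic; a condensed user-space model with abbreviated stand-ins for the XFEATURE_MASK_* constants and a made-up CPUID result:

#include <stdint.h>
#include <stdio.h>

#define BIT(n)          (1ULL << (n))
/* Abbreviated, illustrative versions of the kernel's mask constants. */
#define USER_SUPPORTED  (BIT(0) | BIT(1) | BIT(2) | BIT(9) | BIT(17) | BIT(18))
#define SUPER_SUPPORTED BIT(10)                 /* PASID */
#define USER_DYNAMIC    BIT(18)                 /* XTILE_DATA */

int main(void)
{
    uint64_t cpuid = USER_SUPPORTED | SUPER_SUPPORTED;  /* made up */
    uint64_t kmax  = cpuid & (USER_SUPPORTED | SUPER_SUPPORTED);
    uint64_t umax  = kmax & USER_SUPPORTED;

    /* Dynamic states are permission-gated, so they start out removed
     * from the default masks (and hence the default buffer sizes). */
    printf("kernel: max %#llx default %#llx\n",
           (unsigned long long)kmax, (unsigned long long)(kmax & ~USER_DYNAMIC));
    printf("user:   max %#llx default %#llx\n",
           (unsigned long long)umax, (unsigned long long)(umax & ~USER_DYNAMIC));
    return 0;
}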
+ */ + if (xfeatures != fpu_kernel_cfg.max_features) { + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + xfeatures, fpu_kernel_cfg.max_features); + goto out_disable; + } + + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask, - fpu_kernel_xstate_size, + fpu_kernel_cfg.max_features, + fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ - fpu__init_disable_system_xstate(); + fpu__init_disable_system_xstate(legacy_size); } /* @@ -794,8 +890,20 @@ void fpu__resume_cpu(void) /* * Restore XCR0 on xsave capable CPUs: */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); + + /* + * Restore IA32_XSS. The same CPUID bit enumerates support + * of XSAVES and MSR_IA32_XSS. + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_independent()); + } + + if (fpu_state_size_dynamic()) + wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* @@ -805,13 +913,19 @@ void fpu__resume_cpu(void) */ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - if (!xfeature_enabled(xfeature_nr)) { - WARN_ON_FPU(1); + u64 xcomp_bv = xsave->header.xcomp_bv; + + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) + return NULL; } - return (void *)xsave + xstate_comp_offsets[xfeature_nr]; + return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } + /* * Given the xsave area and a state inside, this function returns the * address of the state. @@ -840,11 +954,11 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that pcntxt_mask is - * what we write to the XCR0 register. + * have not enabled. */ - WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), - "get of unsupported state"); + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. @@ -861,7 +975,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); /* * This wraps up the common operations that need to occur when retrieving @@ -888,9 +1001,9 @@ const void *get_xsave_field_ptr(int xfeature_nr) * fpu__save() takes the CPU's xstate registers * and saves them off to the 'fpu memory buffer. */ - fpu__save(fpu); + fpu_sync_fpstate(fpu); - return get_xsave_addr(&fpu->state.xsave, xfeature_nr); + return get_xsave_addr(&fpu->fpstate->regs.xsave, xfeature_nr); } #ifdef CONFIG_ARCH_HAS_PKEYS @@ -900,17 +1013,16 @@ const void *get_xsave_field_ptr(int xfeature_nr) * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) + unsigned long init_val) { - u32 old_pkru; - int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); - u32 new_pkru_bits = 0; + u32 old_pkru, new_pkru_bits = 0; + int pkey_shift; /* * This check implies XSAVE support. 
OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* @@ -918,7 +1030,8 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * values originating from in-kernel users. Complain * if a bad value is observed. */ - WARN_ON_ONCE(pkey >= arch_max_pkey()); + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) + return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) @@ -927,6 +1040,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ + pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ @@ -940,313 +1054,709 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ +static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, + void *init_xstate, unsigned int size) +{ + membuf_write(to, from_xstate ? xstate : init_xstate, size); +} + +/** + * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @fpstate: The fpstate buffer from which to copy + * @pkru_val: The PKRU value to store in the PKRU component + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode) +{ + const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); + struct xregs_state *xinit = &init_fpstate.regs.xsave; + struct xregs_state *xsave = &fpstate->regs.xsave; + struct xstate_header header; + unsigned int zerofrom; + u64 mask; + int i; + + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + + /* Mask out the feature bits depending on copy mode */ + switch (copy_mode) { + case XSTATE_COPY_FP: + header.xfeatures &= XFEATURE_MASK_FP; + break; + + case XSTATE_COPY_FX: + header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; + break; + + case XSTATE_COPY_XSAVE: + header.xfeatures &= fpstate->user_xfeatures; + break; + } + + /* Copy FP state up to MXCSR */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, + &xinit->i387, off_mxcsr); + + /* Copy MXCSR when SSE or YMM are set in the feature mask */ + copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), + &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, + MXCSR_AND_FLAGS_SIZE); + + /* Copy the remaining FP state */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, + &to, &xsave->i387.st_space, &xinit->i387.st_space, + sizeof(xsave->i387.st_space)); + + /* Copy the SSE state - shared with YMM, but independently managed */ + copy_feature(header.xfeatures & XFEATURE_MASK_SSE, + &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, + sizeof(xsave->i387.xmm_space)); + + if (copy_mode != XSTATE_COPY_XSAVE) + goto out; + + /* Zero the padding area */ + membuf_zero(&to, sizeof(xsave->i387.padding)); + + /* Copy xsave->i387.sw_reserved */ + membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); + + /* Copy the user space relevant state of @xsave->header */ + 
membuf_write(&to, &header, sizeof(header)); + + zerofrom = offsetof(struct xregs_state, extended_state_area); + + /* + * The ptrace buffer is in non-compacted XSAVE format. In + * non-compacted format disabled features still occupy state space, + * but there is no state to copy from in the compacted + * init_fpstate. The gap tracking will zero these states. + */ + mask = fpstate->user_xfeatures; + + for_each_extended_xfeature(i, mask) { + /* + * If there was a feature or alignment gap, zero the space + * in the destination buffer. + */ + if (zerofrom < xstate_offsets[i]) + membuf_zero(&to, xstate_offsets[i] - zerofrom); + + if (i == XFEATURE_PKRU) { + struct pkru_state pkru = {0}; + /* + * PKRU is not necessarily up to date in the + * XSAVE buffer. Use the provided value. + */ + pkru.pkru = pkru_val; + membuf_write(&to, &pkru, sizeof(pkru)); + } else { + copy_feature(header.xfeatures & BIT_ULL(i), &to, + __raw_xsave_addr(xsave, i), + __raw_xsave_addr(xinit, i), + xstate_sizes[i]); + } + /* + * Keep track of the last copied state in the non-compacted + * target buffer for gap zeroing. + */ + zerofrom = xstate_offsets[i] + xstate_sizes[i]; + } + +out: + if (to.left) + membuf_zero(&to, to.left); +} + +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @tsk: The task from which to copy the saved xstate + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode copy_mode) +{ + __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.pkru, copy_mode); +} + +static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, + const void *kbuf, const void __user *ubuf) +{ + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + return 0; +} + + +static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, + const void __user *ubuf) +{ + struct xregs_state *xsave = &fpstate->regs.xsave; + unsigned int offset, size; + struct xstate_header hdr; + u64 mask; + int i; + + offset = offsetof(struct xregs_state, header); + if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) + return -EFAULT; + + if (validate_user_xstate_header(&hdr, fpstate)) + return -EINVAL; + + /* Validate MXCSR when any of the related features is in use */ + mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; + if (hdr.xfeatures & mask) { + u32 mxcsr[2]; + + offset = offsetof(struct fxregs_state, mxcsr); + if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) + return -EFAULT; + + /* Reserved bits in MXCSR must be zero. */ + if (mxcsr[0] & ~mxcsr_feature_mask) + return -EINVAL; + + /* SSE and YMM require MXCSR even when FP is not in use. 
*/ + if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { + xsave->i387.mxcsr = mxcsr[0]; + xsave->i387.mxcsr_mask = mxcsr[1]; + } + } + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (hdr.xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) + return -EFAULT; + } + } + + /* + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= hdr.xfeatures; + + return 0; +} + +/* + * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] + * format and copy to the target thread. Used by ptrace and KVM. + */ +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) +{ + return copy_uabi_to_xstate(fpstate, kbuf, NULL); +} + /* - * Weird legacy quirk: SSE and YMM states store information in the - * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP - * area is marked as unused in the xfeatures header, we need to copy - * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + * Convert from a sigreturn standard-format user-space buffer to kernel + * XSAVE[S] format and copy to the target thread. This is called from the + * sigreturn() and rt_sigreturn() system calls. */ -static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, + const void __user *ubuf) { - if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return copy_uabi_to_xstate(fpstate, NULL, ubuf); +} + +static bool validate_independent_components(u64 mask) +{ + u64 xchk; + + if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; - if (xfeatures & XFEATURE_MASK_FP) + xchk = ~xfeatures_mask_independent(); + + if (WARN_ON_ONCE(!mask || mask & xchk)) return false; return true; } -static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count) +/** + * xsaves - Save selected components to a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to save + * + * The @xstate buffer must be 64 byte aligned and correctly initialized as + * XSAVES does not write the full xstate header. Before first use the + * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer + * can #GP. + * + * The feature mask must be a subset of the independent features. + */ +void xsaves(struct xregs_state *xstate, u64 mask) { - if (*pos < to) { - unsigned size = to - *pos; - - if (size > *count) - size = *count; - memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size); - *kbuf += size; - *pos += size; - *count -= size; - } + int err; + + if (!validate_independent_components(mask)) + return; + + XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); +} + +/** + * xrstors - Restore selected components from a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to restore + * + * The @xstate buffer must be 64 byte aligned and correctly initialized + * otherwise XRSTORS from that buffer can #GP. + * + * Proper usage is to restore the state which was saved with + * xsaves() into @xstate. + * + * The feature mask must be a subset of the independent features. 
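validate_independent_components(), used by both xsaves() and xrstors() here, boils down to two mask tests. A compact model, assuming XFEATURE_LBR (bit 15) is the only entry in XFEATURE_MASK_INDEPENDENT, as on current hardware:

#include <stdbool.h>
#include <stdint.h>

#define MASK_LBR         (1ULL << 15)
#define MASK_INDEPENDENT MASK_LBR

/* The request must be non-empty and must not stray outside the
 * independent feature set; everything else goes through os_xsave(). */
static bool independent_request_ok(uint64_t mask)
{
    return mask && !(mask & ~MASK_INDEPENDENT);
}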
+ */ +void xrstors(struct xregs_state *xstate, u64 mask) +{ + int err; + + if (!validate_independent_components(mask)) + return; + + XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); } -static void copy_part(unsigned offset, unsigned size, void *from, - void **kbuf, unsigned *pos, unsigned *count) +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) { - fill_gap(offset, kbuf, pos, count); - if (size > *count) - size = *count; - if (size) { - memcpy(*kbuf, from, size); - *kbuf += size; - *pos += size; - *count -= size; - } + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); } +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_DEBUG_FPU /* - * Convert from kernel XSAVES compacted format to standard format and copy - * to a kernel-space ptrace buffer. - * - * It supports partial copy but pos always starts from zero. This is called - * from xstateregs_get() and there we check the CPU has XSAVES. + * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask + * can safely operate on the @fpstate buffer. */ -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) +static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) { - struct xstate_header header; - const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr); - unsigned count = size_total; - int i; + u64 xfd = __this_cpu_read(xfd_state); + + if (fpstate->xfd == xfd) + return true; + + /* + * The XFD MSR does not match fpstate->xfd. That's invalid when + * the passed in fpstate is current's fpstate. + */ + if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + return false; /* - * Currently copy_regset_to_user() starts from pos 0: + * XRSTOR(S) from init_fpstate are always correct as it will just + * bring all components into init state and not read from the + * buffer. XSAVE(S) raises #PF after init. */ - if (unlikely(offset_start != 0)) - return -EFAULT; + if (fpstate == &init_fpstate) + return rstor; /* - * The destination is a ptrace buffer; we put in only user xstates: + * XSAVE(S): clone(), fpu_swap_kvm_fpu() + * XRSTORS(S): fpu_swap_kvm_fpu() */ - memset(&header, 0, sizeof(header)); - header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; - - if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(0, off_mxcsr, - &xsave->i387, &kbuf, &offset_start, &count); - if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)) - copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE, - &xsave->i387.mxcsr, &kbuf, &offset_start, &count); - if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(offsetof(struct fxregs_state, st_space), 128, - &xsave->i387.st_space, &kbuf, &offset_start, &count); - if (header.xfeatures & XFEATURE_MASK_SSE) - copy_part(xstate_offsets[XFEATURE_SSE], 256, - &xsave->i387.xmm_space, &kbuf, &offset_start, &count); + /* - * Fill xsave->i387.sw_reserved value for ptrace frame: + * No XSAVE/XRSTOR instructions (except XSAVE itself) touch + * the buffer area for XFD-disabled state components. */ - copy_part(offsetof(struct fxregs_state, sw_reserved), 48, - xstate_fx_sw_bytes, &kbuf, &offset_start, &count); + mask &= ~xfd; + /* - * Copy xregs_state->header: + * Remove features which are valid in fpstate. They + * have space allocated in fpstate. 
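The decision this check builds up to (completed in the next hunk) is again mask arithmetic; a stand-alone model of xstate_op_valid()'s three inputs:

#include <stdbool.h>
#include <stdint.h>

/* Model of the debug check: a component named in RFBM is harmless if
 * XFD disarms it or if the buffer has space allocated for it. */
static bool xsave_op_fits(uint64_t rfbm, uint64_t xfd, uint64_t buf_xfeatures)
{
    rfbm &= ~xfd;               /* XFD-armed components are not touched */
    rfbm &= ~buf_xfeatures;     /* allocated components are fine */
    return rfbm == 0;           /* anything left could overrun the buffer */
}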
*/ - copy_part(offsetof(struct xregs_state, header), sizeof(header), - &header, &kbuf, &offset_start, &count); + mask &= ~fpstate->xfeatures; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - /* - * Copy only in-use xstates: - */ - if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, i); - - copy_part(xstate_offsets[i], xstate_sizes[i], - src, &kbuf, &offset_start, &count); - } + /* + * Any remaining state components in 'mask' might be written + * by XSAVE/XRSTOR. Fail validation if anything is left. + */ + return !mask; +} - } - fill_gap(size_total, &kbuf, &offset_start, &count); +void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) +{ + WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); +} +#endif /* CONFIG_X86_DEBUG_FPU */ +static int __init xfd_update_static_branch(void) +{ + /* + * If init_fpstate.xfd has bits set then dynamic features are + * available and the dynamic sizing must be enabled. + */ + if (init_fpstate.xfd) + static_branch_enable(&__fpu_state_size_dynamic); return 0; } +arch_initcall(xfd_update_static_branch) -static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) +void fpstate_free(struct fpu *fpu) { - if (!size) - return 0; - - if (offset < size_total) { - unsigned int copy = min(size, size_total - offset); - - if (__copy_to_user(ubuf + offset, data, copy)) - return -EFAULT; - } - return 0; + if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) + vfree(fpu->fpstate); } -/* - * Convert from kernel XSAVES compacted format to standard format and copy - * to a user-space buffer. It supports partial copy but pos always starts from - * zero. This is called from xstateregs_get() and there we check the CPU - * has XSAVES. +/** + * fpstate_realloc - Reallocate struct fpstate for the requested new features + * + * @xfeatures: A bitmap of xstate features which extend the enabled features + * of that task + * @ksize: The required size for the kernel buffer + * @usize: The required size for user space buffers + * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations + * + * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer + * terminates quickly, vfree()-induced IPIs may be a concern, but tasks + * with large states are likely to live longer. + * + * Returns: 0 on success, -ENOMEM on allocation error. */ -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize, struct fpu_guest *guest_fpu) { - unsigned int offset, size; - int ret, i; - struct xstate_header header; + struct fpu *fpu = &current->thread.fpu; + struct fpstate *curfps, *newfps = NULL; + unsigned int fpsize; + bool in_use; - /* - * Currently copy_regset_to_user() starts from pos 0: - */ - if (unlikely(offset_start != 0)) - return -EFAULT; + fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); + + newfps = vzalloc(fpsize); + if (!newfps) + return -ENOMEM; + newfps->size = ksize; + newfps->user_size = usize; + newfps->is_valloc = true; /* - * The destination is a ptrace buffer; we put in only user xstates: + * When a guest FPU is supplied, use @guest_fpu->fpstate + * as reference independent of whether it is in use or not. + */ - memset(&header, 0, sizeof(header)); - header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + curfps = guest_fpu ?
guest_fpu->fpstate : fpu->fpstate; + /* Determine whether @curfps is the active fpstate */ + in_use = fpu->fpstate == curfps; + + if (guest_fpu) { + newfps->is_guest = true; + newfps->is_confidential = curfps->is_confidential; + newfps->in_use = curfps->in_use; + guest_fpu->xfeatures |= xfeatures; + guest_fpu->uabi_size = usize; + } + + fpregs_lock(); /* - * Copy xregs_state->header: + * If @curfps is in use, ensure that the current state is in the + * registers before swapping fpstate as that might invalidate it + * due to layout changes. */ - offset = offsetof(struct xregs_state, header); - size = sizeof(header); + if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); - ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); - if (ret) - return ret; + newfps->xfeatures = curfps->xfeatures | xfeatures; + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + newfps->xfd = curfps->xfd & ~xfeatures; - for (i = 0; i < XFEATURE_MAX; i++) { - /* - * Copy only in-use xstates: - */ - if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, i); + /* Do the final updates within the locked region */ + xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); - offset = xstate_offsets[i]; - size = xstate_sizes[i]; + if (guest_fpu) { + guest_fpu->fpstate = newfps; + /* If curfps is active, update the FPU fpstate pointer */ + if (in_use) + fpu->fpstate = newfps; + } else { + fpu->fpstate = newfps; + } - /* The next component has to fit fully into the output buffer: */ - if (offset + size > size_total) - break; + if (in_use) + xfd_update_state(fpu->fpstate); + fpregs_unlock(); - ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); - if (ret) - return ret; - } + /* Only free valloc'ed state */ + if (curfps && curfps->is_valloc) + vfree(curfps); - } + return 0; +} - if (xfeatures_mxcsr_quirk(header.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); +static int validate_sigaltstack(unsigned int usize) +{ + struct task_struct *thread, *leader = current->group_leader; + unsigned long framesize = get_sigframe_size(); + + lockdep_assert_held(&current->sighand->siglock); + + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ + framesize -= fpu_user_cfg.max_size; + framesize += usize; + for_each_thread(leader, thread) { + if (thread->sas_ss_size && thread->sas_ss_size < framesize) + return -ENOSPC; } + return 0; +} +static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) +{ /* + * This deliberately does not exclude !XSAVES as we still might + * decide to optionally context switch XCR0 or talk the silicon + * vendors into extending XFD for the pre AMX states, especially + * AVX512.
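validate_sigaltstack() earlier in this hunk recomputes the worst-case signal frame size with the prospective user state size swapped in. A simplified model of that arithmetic, with hypothetical inputs standing in for get_sigframe_size() and the kernel's thread iteration:

/* Model: the current frame estimate includes fpu_user_cfg.max_size;
 * substitute the requested user state size and check every thread's
 * registered sigaltstack against the new requirement. */
static int sigaltstack_fits(unsigned long framesize,
                            unsigned int cur_max_usize,
                            unsigned int new_usize,
                            const unsigned long *ss_size, int nthreads)
{
    unsigned long need = framesize - cur_max_usize + new_usize;
    int i;

    for (i = 0; i < nthreads; i++) {
        /* 0 means no sigaltstack registered; nothing to check. */
        if (ss_size[i] && ss_size[i] < need)
            return -1;          /* -ENOSPC in the kernel */
    }
    return 0;
}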
*/ - offset = offsetof(struct fxregs_state, sw_reserved); - size = sizeof(xstate_fx_sw_bytes); + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + struct fpu *fpu = &current->group_leader->thread.fpu; + struct fpu_state_perm *perm; + unsigned int ksize, usize; + u64 mask; + int ret = 0; + + /* Check whether fully enabled */ + if ((permitted & requested) == requested) + return 0; - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); - if (ret) - return ret; + /* Calculate the resulting kernel state size */ + mask = permitted | requested; + /* Take supervisor states into account on the host */ + if (!guest) + mask |= xfeatures_mask_supervisor(); + ksize = xstate_calculate_size(mask, compacted); + + /* Calculate the resulting user state size */ + mask &= XFEATURE_MASK_USER_SUPPORTED; + usize = xstate_calculate_size(mask, false); + + if (!guest) { + ret = validate_sigaltstack(usize); + if (ret) + return ret; + } - return 0; + perm = guest ? &fpu->guest_perm : &fpu->perm; + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ + WRITE_ONCE(perm->__state_perm, mask); + /* Protected by sighand lock */ + perm->__state_size = ksize; + perm->__user_state_size = usize; + return ret; } /* - * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set(). + * Permissions array to map facilities with more than one component */ -int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +static const u64 xstate_prctl_req[XFEATURE_MAX] = { + [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, +}; + +static int xstate_request_perm(unsigned long idx, bool guest) { - unsigned int offset, size; - int i; - struct xstate_header hdr; + u64 permitted, requested; + int ret; - offset = offsetof(struct xregs_state, header); - size = sizeof(hdr); + if (idx >= XFEATURE_MAX) + return -EINVAL; - memcpy(&hdr, kbuf + offset, size); + /* + * Look up the facility mask which can require more than + * one xstate component. + */ + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); + requested = xstate_prctl_req[idx]; + if (!requested) + return -EOPNOTSUPP; - if (validate_xstate_header(&hdr)) - return -EINVAL; + if ((fpu_user_cfg.max_features & requested) != requested) + return -EOPNOTSUPP; - for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); + /* Lockless quick check */ + permitted = xstate_get_group_perm(guest); + if ((permitted & requested) == requested) + return 0; - if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, i); + /* Protect against concurrent modifications */ + spin_lock_irq(&current->sighand->siglock); + permitted = xstate_get_group_perm(guest); - offset = xstate_offsets[i]; - size = xstate_sizes[i]; + /* First vCPU allocation locks the permissions.
*/ + if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) + ret = -EBUSY; + else + ret = __xstate_request_perm(permitted, requested, guest); + spin_unlock_irq(&current->sighand->siglock); + return ret; +} - memcpy(dst, kbuf + offset, size); - } +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + struct fpu_state_perm *perm; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + if (!guest_fpu) + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; } - if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - memcpy(&xsave->i387.mxcsr, kbuf + offset, size); + /* Protect against concurrent modifications */ + spin_lock_irq(&current->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { + spin_unlock_irq(&current->sighand->siglock); + return -EPERM; } + fpu = &current->group_leader->thread.fpu; + perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; + ksize = perm->__state_size; + usize = perm->__user_state_size; + /* - * The state that came in from userspace was user-state only. - * Mask all the user states out of 'xfeatures': + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + spin_unlock_irq(&current->sighand->siglock); /* - * Add back in the features that came in from userspace: + * Try to allocate a new fpstate. If that fails there is no way + * out. */ - xsave->header.xfeatures |= hdr.xfeatures; - + if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) + return -EFAULT; return 0; } -/* - * Convert from a ptrace or sigreturn standard-format user-space buffer to - * kernel XSAVES format and copy to the target thread. This is called from - * xstateregs_set(), as well as potentially from the sigreturn() and - * rt_sigreturn() system calls. - */ -int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) +int xfd_enable_feature(u64 xfd_err) { - unsigned int offset, size; - int i; - struct xstate_header hdr; - - offset = offsetof(struct xregs_state, header); - size = sizeof(hdr); + return __xfd_enable_feature(xfd_err, NULL); +} - if (__copy_from_user(&hdr, ubuf + offset, size)) - return -EFAULT; +#else /* CONFIG_X86_64 */ +static inline int xstate_request_perm(unsigned long idx, bool guest) +{ + return -EPERM; +} +#endif /* !CONFIG_X86_64 */ - if (validate_xstate_header(&hdr)) - return -EINVAL; +u64 xstate_get_guest_group_perm(void) +{ + return xstate_get_group_perm(true); +} +EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); + +/** + * fpu_xstate_prctl - xstate permission operations + * @tsk: Redundant pointer to current + * @option: A subfunction of arch_prctl() + * @arg2: option argument + * Return: 0 if successful; otherwise, an error code + * + * Option arguments: + * + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info + * ARCH_REQ_XCOMP_PERM: Facility number requested + * + * For facilities which require more than one XSTATE component, the request + * must be the highest state component number related to that facility, + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
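From user space the permission handshake described in this kernel-doc is two arch_prctl() calls. An example requesting AMX tile data (state component 18), assuming uapi headers new enough to define ARCH_REQ_XCOMP_PERM and ARCH_GET_XCOMP_PERM:

#include <asm/prctl.h>          /* ARCH_REQ_XCOMP_PERM et al. */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#define XFEATURE_XTILE_DATA 18

int main(void)
{
    unsigned long long permitted = 0;

    /* Request permission for the dynamic AMX state before first use. */
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA)) {
        perror("ARCH_REQ_XCOMP_PERM");
        return 1;
    }

    if (!syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &permitted))
        printf("permitted xfeatures: %#llx\n", permitted);
    return 0;
}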
+ */ +long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +{ + u64 __user *uptr = (u64 __user *)arg2; + u64 permitted, supported; + unsigned long idx = arg2; + bool guest = false; - for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); + if (tsk != current) + return -EPERM; - if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, i); + switch (option) { + case ARCH_GET_XCOMP_SUPP: + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; + return put_user(supported, uptr); - offset = xstate_offsets[i]; - size = xstate_sizes[i]; + case ARCH_GET_XCOMP_PERM: + /* + * Lockless snapshot as it can also change right after + * dropping the lock. + */ + permitted = xstate_get_host_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } - } + case ARCH_GET_XCOMP_GUEST_PERM: + permitted = xstate_get_guest_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); - if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) - return -EFAULT; - } + case ARCH_REQ_XCOMP_GUEST_PERM: + guest = true; + fallthrough; - /* - * The state that came in from userspace was user-state only. - * Mask all the user states out of 'xfeatures': - */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + case ARCH_REQ_XCOMP_PERM: + if (!IS_ENABLED(CONFIG_X86_64)) + return -EOPNOTSUPP; - /* - * Add back in the features that came in from userspace: - */ - xsave->header.xfeatures |= hdr.xfeatures; + return xstate_request_perm(idx, guest); - return 0; + default: + return -EINVAL; + } } #ifdef CONFIG_PROC_PID_ARCH_STATUS diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h new file mode 100644 index 0000000000000000000000000000000000000000..67ed6bbc19b8dc09acc5941ae0467e8fbb9e8bf8 --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_XSTATE_H +#define __X86_KERNEL_FPU_XSTATE_H + +#include <asm/cpufeature.h> +#include <asm/fpu/xstate.h> +#include <asm/fpu/xcr.h> + +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u64, xfd_state); +#endif + +static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; +} + +static inline u64 xstate_get_group_perm(bool guest) +{ + struct fpu *fpu = &current->group_leader->thread.fpu; + struct fpu_state_perm *perm; + + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ + perm = guest ?
&fpu->guest_perm : &fpu->perm; + return READ_ONCE(perm->__state_perm); +} + +static inline u64 xstate_get_host_group_perm(void) +{ + return xstate_get_group_perm(false); +} + +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode mode); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); +extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); + + +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system_xstate(unsigned int legacy_size); + +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + +static inline u64 xfeatures_mask_supervisor(void) +{ + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_independent(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_INDEPENDENT; +} + +/* XSAVE/XRSTOR wrapper functions */ + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. + */ +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact + * format and supervisor states in addition to modified optimization in + * XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_2(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. 
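The raw opcode macros above are the kernel-side counterpart of what compilers expose as XSAVE intrinsics. For reference, a user-space save of the XCR0-enabled state, assuming GCC or Clang built with -mxsave:

#include <cpuid.h>
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;
    void *buf;

    /* CPUID 0xD.0: EBX is the standard-format size needed for the
     * currently enabled (XCR0) features. */
    __cpuid_count(0xd, 0, eax, ebx, ecx, edx);

    if (posix_memalign(&buf, 64, ebx))  /* XSAVE needs 64-byte alignment */
        return 1;
    memset(buf, 0, ebx);                /* header must start out zeroed */

    _xsave(buf, _xgetbv(0));            /* RFBM = everything XCR0 enables */
    printf("saved %u bytes of xstate\n", ebx);

    free(buf);
    return 0;
}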
+ */ +#define XSTATE_XRESTORE(st, lmask, hmask) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + : \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) +extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); +#else +static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } +#endif + +#ifdef CONFIG_X86_64 +static inline void xfd_update_state(struct fpstate *fpstate) +{ + if (fpu_state_size_dynamic()) { + u64 xfd = fpstate->xfd; + + if (__this_cpu_read(xfd_state) != xfd) { + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); + } + } +} + +extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); +#else +static inline void xfd_update_state(struct fpstate *fpstate) { } + +static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { + return -EPERM; +} +#endif + +/* + * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. + */ +static inline void os_xsave(struct fpstate *fpstate) +{ + u64 mask = fpstate->xfeatures; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + WARN_ON_FPU(!alternatives_patched); + xfd_validate_state(fpstate, mask, false); + + XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. + */ +static inline void os_xrstor(struct fpstate *fpstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + xfd_validate_state(fpstate, mask, true); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* Restore of supervisor state. Does not require XFD */ +static inline void os_xrstor_supervisor(struct fpstate *fpstate) +{ + u64 mask = xfeatures_mask_supervisor(); + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* + * XSAVE itself always writes all requested xfeatures. Removing features + * from the request bitmap reduces the features which are written. + * Generate a mask of features which must be written to a sigframe. The + * unset features can be optimized away and not written. + * + * This optimization is user-visible. Only use for states where + * uninitialized sigframe contents are tolerable, like dynamic features. + * + * Users of buffers produced with this optimization must check XSTATE_BV + * to determine which features have been optimized out. + */ +static inline u64 xfeatures_need_sigframe_write(void) +{ + u64 xfeatures_to_write; + + /* In-use features must be written: */ + xfeatures_to_write = xfeatures_in_use(); + + /* Also write all non-optimizable sigframe features: */ + xfeatures_to_write |= XFEATURE_MASK_USER_SUPPORTED & + ~XFEATURE_MASK_SIGFRAME_INITOPT; + + return xfeatures_to_write; +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area.
+ * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. + */ +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +{ + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + struct fpstate *fpstate = current->thread.fpu.fpstate; + u64 mask = fpstate->user_xfeatures; + u32 lmask; + u32 hmask; + int err; + + /* Optimize away writing unnecessary xfeatures: */ + if (fpu_state_size_dynamic()) + mask &= xfeatures_need_sigframe_write(); + + lmask = mask; + hmask = mask >> 32; + xfd_validate_state(fpstate, mask, false); + + stac(); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + xfd_validate_state(current->thread.fpu.fpstate, mask, true); + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) +{ + struct xregs_state *xstate = &fpstate->regs.xsave; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + /* Ensure that XFD is up to date */ + xfd_update_state(fpstate); + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + + +#endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 024c3053dbbab673d4d23f7e78726022b03620b5..1b72aa2a7d5fa08b32430f599d02b98dd68c9a1e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c6f791bc481eb19d5fb4cb34989c2b485c82c038..f6a782c732253999bd4d057d51bf77e397df4354 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -50,7 +51,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; -#ifdef CONFIG_PCI_MSI +#ifdef CONFIG_GENERIC_MSI_IRQ static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif @@ -467,9 +468,8 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) /* * HPET MSI Support */ -#ifdef CONFIG_PCI_MSI - -void hpet_msi_unmask(struct irq_data *data) +#ifdef CONFIG_GENERIC_MSI_IRQ +static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -479,7 +479,7 @@ void hpet_msi_unmask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_mask(struct irq_data *data) +static void hpet_msi_mask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -489,12 +489,122 @@ void hpet_msi_mask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) +static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg 
*msg) { hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} + +static struct irq_chip hpet_msi_controller __ro_after_init = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct fwnode_handle *fn; + struct irq_fwspec fwspec; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = hpet_id; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + return NULL; + } + if (parent != x86_vector_domain) + hpet_msi_controller.name = "IR-HPET-MSI"; + + d = msi_create_irq_domain(fn, domain_info, parent); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } + return d; +} + +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} + +static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} + static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { struct hpet_channel *hc = clockevent_to_channel(evt); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 16919a9671fa93f89aac7ed279097898f6516b33..41df22cdb99be4951498a8f71e316381e8500b1c 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 3ad34f01de2a6779b134600b8837cf04465da0b5..20f73d3e47e4aecf27afd969b109685e2a9ee1b1 100644 --- 
a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index afac7ccce72f43da8b3048f0fcb8d357676e9575..db509e1134ceaaa7e28c561a0eed998a761f5ebf 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include #include diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fa4352dce491c855ba6f0a1866390c750b4b42a9..3a75d665d43c4299cafb2131ac67651a442ad890 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -146,7 +146,7 @@ rootfs_initcall(pci_iommu_init); static int via_no_dac_cb(struct pci_dev *pdev, void *data) { - pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); + pdev->dev.bus_dma_limit = DMA_BIT_MASK(32); return 0; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0105fd785e9ad3a95d902c45ad27012cd10d9912..a3fb6e2480b6c49e93a2baa2a65273fd55a5b4c8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -31,7 +31,9 @@ #include #include #include -#include +#include +#include +#include #include #include #include @@ -100,10 +102,20 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif + /* Drop the copied pointer to current's fpstate */ + dst->thread.fpu.fpstate = NULL; - return fpu__copy(dst, src); + return 0; } +#ifdef CONFIG_X86_64 +void arch_release_task_struct(struct task_struct *tsk) +{ + if (fpu_state_size_dynamic()) + fpstate_free(&tsk->thread.fpu); +} +#endif + /* * Free current thread data structures etc.. */ @@ -132,6 +144,15 @@ void exit_thread(struct task_struct *tsk) fpu__drop(fpu); } +static void pkru_flush_thread(void) +{ + /* + * If PKRU is enabled the default PKRU value has to be loaded into + * the hardware right here (similar to context switch). + */ + pkru_write_default(); +} + static int set_new_tls(struct task_struct *p, unsigned long tls) { struct user_desc __user *utls = (struct user_desc __user *)tls; @@ -204,6 +225,8 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, frame->flags = X86_EFLAGS_FIXED; #endif + fpu_clone(p, clone_flags); + /* Kernel thread ? 
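 */

/*
 * [Editor's aside -- illustrative sketch, not part of the patch] The
 * shape of the arch_dup_task_struct()/fpu_clone() change above, reduced
 * to a self-contained model: the parent is duplicated wholesale, then a
 * pointer that must not be shared is cleared before the child gets its
 * own state. All toy_* names are invented for illustration.
 */
struct toy_task { int tid; void *fpstate; };

static int toy_dup_task(struct toy_task *dst, const struct toy_task *src)
{
        *dst = *src;            /* memcpy-style duplication */
        dst->fpstate = NULL;    /* drop the copied pointer ... */
        return 0;               /* ... the clone path sets up the child's own */
}

/*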
*/ if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); @@ -241,7 +264,8 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear(&tsk->thread.fpu); + fpu_flush_thread(); + pkru_flush_thread(); } void disable_TSC(void) @@ -629,6 +653,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } + + if ((tifp ^ tifn) & _TIF_SLD) + switch_to_sld(tifn); } /* @@ -958,13 +985,19 @@ unsigned long get_wchan(struct task_struct *p) } long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled) + unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, cpuid_enabled); + return set_cpuid_mode(task, arg2); + case ARCH_GET_XCOMP_SUPP: + case ARCH_GET_XCOMP_PERM: + case ARCH_REQ_XCOMP_PERM: + case ARCH_GET_XCOMP_GUEST_PERM: + case ARCH_REQ_XCOMP_GUEST_PERM: + return fpu_xstate_prctl(task, option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6c7d905271566be8e8760ea9611621d465ea701d..670e5c1b6b86c018b9cc60758dff88917d725e3a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include "process.h" @@ -224,7 +224,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5db6a0737e6d94941812a5ff1bcb14d08ac7ef30..8e148bef009b7d5bb3b8e1ee26376783741a1bbe 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -42,7 +42,8 @@ #include #include -#include +#include +#include #include #include #include @@ -53,7 +54,7 @@ #include #include #include -#include +#include #include #include #ifdef CONFIG_IA32_EMULATION @@ -137,7 +138,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) d3, d6, d7); } - if (boot_cpu_has(X86_FEATURE_OSPKE)) + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); } @@ -340,6 +341,29 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } +/* + * Store prev's PKRU value and load next's PKRU value if they differ. PKRU + * is not XSTATE managed on context switch because that would require a + * lookup in the task's FPU xsave buffer and require to keep that updated + * in various places. + */ +static __always_inline void x86_pkru_load(struct thread_struct *prev, + struct thread_struct *next) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + /* Stash the prev task's value: */ + prev->pkru = rdpkru(); + + /* + * PKRU writes are slightly expensive. Avoid them when not + * strictly necessary: + */ + if (prev->pkru != next->pkru) + wrpkru(next->pkru); +} + static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { @@ -591,13 +615,15 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) x86_fsgsbase_load(prev, next); + x86_pkru_load(prev, next); + /* * Switch the PDA and FPU contexts. 
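 */

/*
 * [Editor's aside -- illustrative sketch, not part of the patch] What
 * the x86_pkru_load() hunk above protects: each thread's PKRU gates
 * access to pkey-tagged mappings, so it must be switched like any other
 * register. A minimal user-space sketch of tagging memory with a
 * protection key; assumes a PKU-capable CPU and a libc that exposes the
 * pkey_* wrappers (glibc >= 2.27):
 */
#define _GNU_SOURCE
#include <sys/mman.h>

static void *alloc_write_protected_page(size_t pagesz)
{
        void *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);

        if (p == MAP_FAILED || pkey < 0 ||
            pkey_mprotect(p, pagesz, PROT_READ | PROT_WRITE, pkey))
                return NULL;
        return p;       /* writes fault until this thread's PKRU allows them */
}

/*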
*/ this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2366dbeaf2633c1e44bf7294e43f54c7d2f860f0..5ffe13fcf45e23eb9803654082e9afe93aaf5670 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -30,9 +30,9 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -417,26 +417,12 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) static int genregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (kbuf) { - unsigned long *k = kbuf; - while (count >= sizeof(*k)) { - *k++ = getreg(target, pos); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - unsigned long __user *u = ubuf; - while (count >= sizeof(*u)) { - if (__put_user(getreg(target, pos), u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + int reg; + for (reg = 0; to.left; reg++) + membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); return 0; } @@ -698,15 +684,14 @@ static int ioperm_active(struct task_struct *target, static int ioperm_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (!target->thread.io_bitmap_ptr) + unsigned long *iobm = target->thread.io_bitmap_ptr; + + if (!iobm) return -ENXIO; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.io_bitmap_ptr, - 0, IO_BITMAP_BYTES); + return membuf_write(&to, iobm, IO_BITMAP_BYTES); } /* @@ -1009,28 +994,15 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) static int genregs32_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (kbuf) { - compat_ulong_t *k = kbuf; - while (count >= sizeof(*k)) { - getreg32(target, pos, k++); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - compat_ulong_t __user *u = ubuf; - while (count >= sizeof(*u)) { - compat_ulong_t word; - getreg32(target, pos, &word); - if (__put_user(word, u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + int reg; + for (reg = 0; to.left; reg++) { + u32 val; + getreg32(target, reg * 4, &val); + membuf_store(&to, val); + } return 0; } @@ -1240,25 +1212,25 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .get = genregs_get, .set = genregs_set + .regset_get = genregs_get, .set = genregs_set }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_i387_struct) / sizeof(long), + .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .get = xstateregs_get, + .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET_IOPERM64] = 
{ .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), - .active = ioperm_active, .get = ioperm_get + .active = ioperm_active, .regset_get = ioperm_get }, }; @@ -1281,24 +1253,24 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = genregs32_get, .set = genregs32_set + .regset_get = genregs32_get, .set = genregs32_set }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set + .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, - .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), + .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .get = xstateregs_get, + .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET_TLS] = { @@ -1307,13 +1279,13 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .size = sizeof(struct user_desc), .align = sizeof(struct user_desc), .active = regset_tls_active, - .get = regset_tls_get, .set = regset_tls_set + .regset_get = regset_tls_get, .set = regset_tls_set }, [REGSET_IOPERM32] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = ioperm_active, .get = ioperm_get + .active = ioperm_active, .regset_get = ioperm_get }, }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 366875a828b32cabb67c6524f4ebf722e19d7dd6..1e331a0868ace2e6794d2b0c7d1a261fe0686e70 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -79,6 +79,7 @@ #include #include +#include #include #include #include @@ -1120,19 +1121,23 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); + /* + * Needs to run after memblock setup because it needs the physical + * memory size. + */ + sev_setup_arch(); + reserve_bios_regions(); - if (efi_enabled(EFI_MEMMAP)) { - efi_fake_memmap(); - efi_find_mirror(); - efi_esrt_init(); + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be - * called after ExitBootServices(). This is, in fact, a lie. - */ - efi_reserve_boot_services(); - } + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. 
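 */

/*
 * [Editor's aside -- illustrative sketch, not part of the patch] The
 * contract behind the ->regset_get()/struct membuf conversion in the
 * ptrace.c hunks above: the buffer carries its own "room left" counter,
 * so getters loop on to.left instead of juggling pos/count/kbuf/ubuf.
 * Self-contained model, not the kernel's implementation:
 */
#include <stddef.h>
#include <string.h>

struct toy_membuf { void *p; size_t left; };

static size_t toy_membuf_write(struct toy_membuf *s, const void *v, size_t size)
{
        if (size > s->left)
                size = s->left; /* silently truncate, like membuf_write() */
        memcpy(s->p, v, size);
        s->p = (char *)s->p + size;
        s->left -= size;
        return s->left;
}

/*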
+ */ + efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ e820__memblock_alloc_reserved_mpc_new(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2fdbf5ef8c39c439aaae3411c519993aca0ba442..9fe6b025f4bd5ca8fae329b46f9691e96feb6975 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -29,8 +29,8 @@ #include #include -#include #include +#include #include #include #include @@ -39,6 +39,7 @@ #ifdef CONFIG_X86_64 #include #include +#include #endif /* CONFIG_X86_64 */ #include @@ -47,24 +48,6 @@ #include #include -#define COPY(x) do { \ - get_user_ex(regs->x, &sc->x); \ -} while (0) - -#define GET_SEG(seg) ({ \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - tmp; \ -}) - -#define COPY_SEG(seg) do { \ - regs->seg = GET_SEG(seg); \ -} while (0) - -#define COPY_SEG_CPL3(seg) do { \ - regs->seg = GET_SEG(seg) | 3; \ -} while (0) - #ifdef CONFIG_X86_64 /* * If regs->ss will cause an IRET fault, change it. Otherwise leave it @@ -92,53 +75,58 @@ static void force_valid_ss(struct pt_regs *regs) ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) regs->ss = __USER_DS; } +# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) +#else +# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif -static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *sc, - unsigned long uc_flags) +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) { - unsigned long buf_val; - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; + struct sigcontext sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - get_user_try { + if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) + return false; #ifdef CONFIG_X86_32 - set_user_gs(regs, GET_SEG(gs)); - COPY_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); + set_user_gs(regs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; #endif /* CONFIG_X86_32 */ - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); COPY(ax); + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; #ifdef CONFIG_X86_64 - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; #endif /* CONFIG_X86_64 */ - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); - - get_user_ex(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; - get_user_ex(buf_val, &sc->fpstate); - buf = (void __user *)buf_val; - } get_user_catch(err); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; #ifdef CONFIG_X86_64 /* @@ -149,76 +137,89 @@ static int restore_sigcontext(struct pt_regs *regs, force_valid_ss(regs); #endif - err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); - force_iret(); - return err; + return fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } -int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, +static __always_inline int 
+__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int err = 0; - - put_user_try { - #ifdef CONFIG_X86_32 - put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); - put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); - put_user_ex(regs->es, (unsigned int __user *)&sc->es); - put_user_ex(regs->ds, (unsigned int __user *)&sc->ds); + unsafe_put_user(get_user_gs(regs), + (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(regs->di, &sc->di); - put_user_ex(regs->si, &sc->si); - put_user_ex(regs->bp, &sc->bp); - put_user_ex(regs->sp, &sc->sp); - put_user_ex(regs->bx, &sc->bx); - put_user_ex(regs->dx, &sc->dx); - put_user_ex(regs->cx, &sc->cx); - put_user_ex(regs->ax, &sc->ax); + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); #ifdef CONFIG_X86_64 - put_user_ex(regs->r8, &sc->r8); - put_user_ex(regs->r9, &sc->r9); - put_user_ex(regs->r10, &sc->r10); - put_user_ex(regs->r11, &sc->r11); - put_user_ex(regs->r12, &sc->r12); - put_user_ex(regs->r13, &sc->r13); - put_user_ex(regs->r14, &sc->r14); - put_user_ex(regs->r15, &sc->r15); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); #endif /* CONFIG_X86_64 */ - put_user_ex(current->thread.trap_nr, &sc->trapno); - put_user_ex(current->thread.error_code, &sc->err); - put_user_ex(regs->ip, &sc->ip); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); #ifdef CONFIG_X86_32 - put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->sp, &sc->sp_at_signal); - put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); #else /* !CONFIG_X86_32 */ - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); - put_user_ex(regs->ss, &sc->ss); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); - - /* non-iBCS2 extensions.. 
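 */

/*
 * [Editor's aside -- illustrative sketch, not part of the patch] The
 * canonical shape of the put_user_ex() -> unsafe_put_user() conversion
 * above: one user_access_begin() opens the user-access window (STAC),
 * every unsafe_put_user() branches to the Efault label on a fault, and
 * both exit paths close the window with user_access_end() (CLAC).
 * Kernel-context sketch reduced to a single store; example_store_oldmask()
 * is a made-up name:
 */
static int example_store_oldmask(struct sigcontext __user *sc,
                                 unsigned long mask)
{
        if (!user_access_begin(sc, sizeof(*sc)))
                return -EFAULT;
        unsafe_put_user(mask, &sc->oldmask, Efault);
        user_access_end();
        return 0;
Efault:
        user_access_end();
        return -EFAULT;
}

/*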
*/ - put_user_ex(mask, &sc->oldmask); - put_user_ex(current->thread.cr2, &sc->cr2); - } put_user_catch(err); + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); - return err; + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; } +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + /* * Set up a signal frame. */ +/* x86 ABI requires 16-byte alignment */ +#define FRAME_ALIGNMENT 16UL + +#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1) + /* * Determine which stack to use.. */ @@ -229,9 +230,9 @@ static unsigned long align_sigframe(unsigned long sp) * Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - sp = ((sp + 4) & -16ul) - 4; + sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; #else /* !CONFIG_X86_32 */ - sp = round_down(sp, 16) - 8; + sp = round_down(sp, FRAME_ALIGNMENT) - 8; #endif return sp; } @@ -241,11 +242,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { /* Default to using normal stack */ + bool nested_altstack = on_sig_stack(regs->sp); + bool entering_altstack = false; unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int onsigstack = on_sig_stack(sp); - int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -253,15 +254,23 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) + /* + * This checks nested_altstack via sas_ss_flags(). Sensible + * programs use SS_AUTODISARM, which disables that check, and + * programs that don't use SS_AUTODISARM get compatible. + */ + if (sas_ss_flags(sp) == 0) { sp = current->sas_ss_sp + current->sas_ss_size; + entering_altstack = true; + } } else if (IS_ENABLED(CONFIG_X86_32) && - !onsigstack && + !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; + entering_altstack = true; } sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), @@ -274,12 +283,18 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. 
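 */

/*
 * [Editor's aside -- illustrative sketch, not part of the patch] The
 * i386 rule in align_sigframe() above, in numbers: the ABI wants
 * ((sp + 4) & 15) == 0 at handler entry, so the frame lands on an
 * address ending in 0xc. Compilable check; toy_* names are invented:
 */
#include <assert.h>

static unsigned long toy_align_sigframe_32(unsigned long sp)
{
        return ((sp + 4) & -16UL) - 4;
}

static void toy_check(void)
{
        /* e.g. 0xffffd123 -> 0xffffd11c; 0xffffd11c + 4 is 16-byte aligned */
        assert(((toy_align_sigframe_32(0xffffd123UL) + 4) & 15) == 0);
}

/*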
*/ - if (onsigstack && !likely(on_sig_stack(sp))) + if (unlikely((nested_altstack || entering_altstack) && + !__on_sig_stack(sp))) { + + if (show_unhandled_signals && printk_ratelimit()) + pr_info("%s[%d] overflowed sigaltstack\n", + current->comm, task_pid_nr(current)); + return (void __user *)-1L; + } /* save i387 and extended state */ - ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); - if (ret < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) return (void __user *)-1L; return (void __user *)sp; @@ -314,26 +329,16 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, { struct sigframe __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; + void __user *fp = NULL; - if (__put_user(sig, &frame->sig)) - return -EFAULT; + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - if (_NSIG_WORDS > 1) { - if (__copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } - + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; @@ -343,7 +348,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ - err |= __put_user(restorer, &frame->pretcode); + unsafe_put_user(restorer, &frame->pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 @@ -352,10 +357,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. */ - err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; + unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); + user_access_end(); /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -370,6 +373,10 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, regs->cs = __USER_CS; return 0; + +Efault: + user_access_end(); + return -EFAULT; } static int __setup_rt_frame(int sig, struct ksignal *ksig, @@ -377,50 +384,45 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - put_user_try { - put_user_ex(sig, &frame->sig); - put_user_ex(&frame->info, &frame->pinfo); - put_user_ex(&frame->uc, &frame->puc); + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(&frame->info, &frame->pinfo, Efault); + unsafe_put_user(&frame->uc, &frame->puc, Efault); - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); + /* Create the ucontext. 
*/ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - put_user_ex(restorer, &frame->pretcode); + /* Set up to return from userspace. */ + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, &frame->pretcode, Efault); - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); - } put_user_catch(err); + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) + if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ @@ -436,6 +438,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->cs = __USER_CS; return 0; +Efault: + user_access_end(); + return -EFAULT; } #else /* !CONFIG_X86_32 */ static unsigned long frame_uc_flags(struct pt_regs *regs) @@ -459,43 +464,34 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, struct rt_sigframe __user *frame; void __user *fp = NULL; unsigned long uc_flags; - int err = 0; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } - uc_flags = frame_uc_flags(regs); - - put_user_try { - /* Create the ucontext. */ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. 
*/ - /* x86-64 should always use SA_RESTORER. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - err |= -EFAULT; - } - } put_user_catch(err); - - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) - return -EFAULT; - /* Set up registers for signal handler */ regs->di = sig; /* In case the signal handler was declared without prototypes */ @@ -532,6 +528,10 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, force_valid_ss(regs); return 0; + +Efault: + user_access_end(); + return -EFAULT; } #endif /* CONFIG_X86_32 */ @@ -543,44 +543,33 @@ static int x32_setup_rt_frame(struct ksignal *ksig, struct rt_sigframe_x32 __user *frame; unsigned long uc_flags; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + void __user *fp = NULL; - if (!access_ok(frame, sizeof(*frame))) + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) - return -EFAULT; - } + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); uc_flags = frame_uc_flags(regs); - put_user_try { - /* Create the ucontext. */ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); - put_user_ex(0, &frame->uc.uc__pad0); - - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = ksig->ka.sa.sa_restorer; - } else { - /* could use a vstub here */ - restorer = NULL; - err |= -EFAULT; - } - put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); - } put_user_catch(err); + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); - if (err) - return -EFAULT; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + return -EFAULT; + } /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -599,6 +588,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, #endif /* CONFIG_X86_X32_ABI */ return 0; +#ifdef CONFIG_X86_X32_ABI +Efault: + user_access_end(); + return -EFAULT; +#endif } /* @@ -615,9 +609,8 @@ SYSCALL_DEFINE0(sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) + if (__get_user(set.sig[0], &frame->sc.oldmask) || + __get_user(set.sig[1], &frame->extramask[0])) goto badframe; set_current_blocked(&set); @@ -626,7 +619,7 @@ SYSCALL_DEFINE0(sigreturn) * x86_32 has no uc_flags bits relevant to restore_sigcontext. 
* Save a few cycles by skipping the __get_user. */ - if (restore_sigcontext(regs, &frame->sc, 0)) + if (!restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -647,14 +640,14 @@ SYSCALL_DEFINE0(rt_sigreturn) frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -667,6 +660,64 @@ SYSCALL_DEFINE0(rt_sigreturn) return 0; } +/* + * There are four different struct types for signal frame: sigframe_ia32, + * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case + * -- the largest size. It means the size for 64-bit apps is a bit more + * than needed, but this keeps the code simple. + */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32) +#else +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe) +#endif + +/* + * The FP state frame contains an XSAVE buffer which must be 64-byte aligned. + * If a signal frame starts at an unaligned address, extra space is required. + * This is the max alignment padding, conservatively. + */ +#define MAX_XSAVE_PADDING 63UL + +/* + * The frame data is composed of the following areas and laid out as: + * + * ------------------------- + * | alignment padding | + * ------------------------- + * | (f)xsave frame | + * ------------------------- + * | fsave header | + * ------------------------- + * | alignment padding | + * ------------------------- + * | siginfo + ucontext | + * ------------------------- + */ + +/* max_frame_size tells userspace the worst case signal stack size. */ +static unsigned long __ro_after_init max_frame_size; +static unsigned int __ro_after_init fpu_default_state_size; + +void __init init_sigframe_size(void) +{ + fpu_default_state_size = fpu__get_fpstate_size(); + + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; + + max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; + + /* Userspace expects an aligned size. */ + max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); + + pr_info("max sigframe size: %lu\n", max_frame_size); +} + +unsigned long get_sigframe_size(void) +{ + return max_frame_size; +} + static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && @@ -763,7 +814,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. 
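 */

/*
 * [Editor's aside -- illustrative sketch, not part of the patch] The
 * userspace counterpart of max_frame_size/init_sigframe_size() above:
 * size the alternate signal stack from the aux vector instead of
 * hardcoding SIGSTKSZ. Sketch; assumes a libc that defines
 * AT_MINSIGSTKSZ and a kernel that publishes it (0 is returned when
 * the entry is absent):
 */
#define _GNU_SOURCE
#include <signal.h>
#include <stdlib.h>
#include <sys/auxv.h>

static int setup_altstack(void)
{
        stack_t ss = { 0 };
        unsigned long min = getauxval(AT_MINSIGSTKSZ);

        ss.ss_size = min + SIGSTKSZ;    /* headroom on top of the minimum */
        ss.ss_sp = malloc(ss.ss_size);
        return ss.ss_sp ? sigaltstack(&ss, NULL) : -1;
}

/*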
 */
-	fpu__clear(fpu);
+	fpu__clear_user_states(fpu);
 	}
 	signal_setup_done(failed, ksig, stepping);
 }
@@ -838,6 +889,62 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 	force_sig(SIGSEGV);
 }
 
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE
+static bool strict_sigaltstack_size __ro_after_init = true;
+#else
+static bool strict_sigaltstack_size __ro_after_init = false;
+#endif
+
+static int __init strict_sas_size(char *arg)
+{
+	return kstrtobool(arg, &strict_sigaltstack_size);
+}
+__setup("strict_sas_size", strict_sas_size);
+
+/*
+ * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512
+ * exceeds that size already. As such programs might never use the
+ * sigaltstack they just continued to work. While always checking against
+ * the real size would be correct, this might be considered a regression.
+ *
+ * Therefore avoid the sanity check, unless enforced by kernel
+ * configuration or command line option.
+ *
+ * When dynamic FPU features are supported, the check is also enforced when
+ * the task has permissions to use dynamic features. Tasks which have no
+ * permission are checked against the size of the non-dynamic feature set
+ * if strict checking is enabled. This avoids forcing all tasks on the
+ * system to allocate large sigaltstacks even if they are never going
+ * to use a dynamic feature. As this is serialized via sighand::siglock
+ * any permission request for a dynamic feature either happened already
+ * or will see the newly installed sigaltstack size in the permission checks.
+ */
+bool sigaltstack_size_valid(size_t ss_size)
+{
+	unsigned long fsize = max_frame_size - fpu_default_state_size;
+	u64 mask;
+
+	lockdep_assert_held(&current->sighand->siglock);
+
+	if (!fpu_state_size_dynamic() && !strict_sigaltstack_size)
+		return true;
+
+	fsize += current->group_leader->thread.fpu.perm.__user_state_size;
+	if (likely(ss_size > fsize))
+		return true;
+
+	if (strict_sigaltstack_size)
+		return ss_size > fsize;
+
+	mask = current->group_leader->thread.fpu.perm.__state_perm;
+	if (mask & XFEATURE_MASK_USER_DYNAMIC)
+		return ss_size > fsize;
+
+	return true;
+}
+#endif /* CONFIG_DYNAMIC_SIGFRAME */
+
 #ifdef CONFIG_X86_X32_ABI
 asmlinkage long sys32_x32_rt_sigreturn(void)
 {
@@ -850,14 +957,14 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
 	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
-	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+	if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
 		goto badframe;
 	if (__get_user(uc_flags, &frame->uc.uc_flags))
 		goto badframe;
 
 	set_current_blocked(&set);
 
-	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+	if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
 		goto badframe;
 
 	if (compat_restore_altstack(&frame->uc.uc_stack))
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8367bd7a9a810cc303bd3e7644a8accb3e5add22..8ee307fe07ed18e44cfc442583c0ad41b02cd6cd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -69,7 +69,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -458,29 +458,52 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (c->phys_proc_id == o->phys_proc_id &&
+	    c->cpu_die_id == o->cpu_die_id)
+		return true;
+	return false;
+}
+
+/*
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node.
If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return true; + return false; +} + /* - * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. * - * These are Intel CPUs that enumerate an LLC that is shared by - * multiple NUMA nodes. The LLC on these systems is shared for - * off-package data access but private to the NUMA node (half - * of the package) for on-package access. + * Any Intel CPU that has multiple nodes per package and does not + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. * - * CPUID (the source of the information about the LLC) can only - * enumerate the cache as being shared *or* unshared, but not - * this particular configuration. The CPU in this case enumerates - * the cache to be shared across the entire package (spanning both - * NUMA nodes). + * When in SNC mode, these CPUs enumerate an LLC that is shared + * by multiple NUMA nodes. The LLC is shared for off-package data + * access but private to the NUMA node (half of the package) for + * on-package access. CPUID (the source of the information about + * the LLC) can only enumerate the cache as shared or unshared, + * but not this particular configuration. */ -static const struct x86_cpu_id snc_cpu[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, +static const struct x86_cpu_id intel_cod_cpu[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_HASWELL_X, 0, 0, 0}, /* COD */ + { X86_VENDOR_INTEL, 6, INTEL_FAM6_BROADWELL_X, 0, 0, 0}, /* COD */ + { X86_VENDOR_INTEL, 6, 0, 0, 1, 0}, /* SNC */ {} }; static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + bool intel_snc = id && id->driver_data; /* Do not match if we do not have a valid APICID for cpu: */ if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) @@ -495,32 +518,12 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * means 'c' does not share the LLC of 'o'. This will be * reflected to userspace. */ - if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) return false; return topology_sane(c, o, "llc"); } -/* - * Unlike the other levels, we do not enforce keeping a - * multicore group inside a NUMA node. If this happens, we will - * discard the MC level of the topology later. 
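 */

/*
 * [Editor's aside -- illustrative sketch, not part of the patch] The
 * trick in the intel_cod_cpu[] hunk above: one match table in which
 * driver_data doubles as the "this is SNC, not COD" flag, so match_llc()
 * only needs "id && id->driver_data". Self-contained model of that
 * lookup; real code goes through x86_match_cpu(), and all toy_* names
 * and the model-0-means-any convention are invented here:
 */
#include <stdbool.h>

struct toy_cpu_id { int family; int model; unsigned long driver_data; };

static bool toy_is_snc(const struct toy_cpu_id *tbl, int family, int model)
{
        for (; tbl->family; tbl++) {
                if (tbl->family == family &&
                    (tbl->model == 0 || tbl->model == model))
                        return tbl->driver_data;        /* 0: COD, 1: SNC */
        }
        return false;   /* not an affected CPU */
}

/*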
- */ -static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if (c->phys_proc_id == o->phys_proc_id) - return true; - return false; -} - -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if ((c->phys_proc_id == o->phys_proc_id) && - (c->cpu_die_id == o->cpu_die_id)) - return true; - return false; -} - #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) @@ -592,14 +595,23 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); + if (match_pkg(c, o) && !topology_same_node(c, o)) + x86_has_numa_in_package = true; + if ((i == cpu) || (has_smt && match_smt(c, o))) link_mask(topology_sibling_cpumask, cpu, i); if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); } + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. @@ -613,8 +625,7 @@ void set_cpu_sibling_map(int cpu) /* * Does this new cpu bringup a new core? */ - if (cpumask_weight( - topology_sibling_cpumask(cpu)) == 1) { + if (threads == 1) { /* * for each core in package, increment * the booted_cores for this new cpu @@ -631,16 +642,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_pkg(c, o) && !topology_same_node(c, o)) - x86_has_numa_in_package = true; - - if ((i == cpu) || (has_mp && match_die(c, o))) - link_mask(topology_die_cpumask, cpu, i); } - - threads = cpumask_weight(topology_sibling_cpumask(cpu)); - if (threads > __max_smt_threads) - __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ @@ -753,13 +755,14 @@ static void __init smp_quirk_init_udelay(void) int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { + u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; unsigned long send_status, accept_status = 0; int maxlvt; /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); + apic_icr_write(APIC_DM_NMI | dm, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -986,10 +989,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, if (!boot_error) { enable_start_cpu0 = 1; *cpu0_nmi_registered = 1; - if (apic->dest_logical == APIC_DEST_LOGICAL) - id = cpu0_logical_apicid; - else - id = apicid; + id = apic->dest_mode_logical ? 
cpu0_logical_apicid : apicid; boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6d83b4b857e6a78f909a42c1ec3cf4d4a68d11e1..2fd698e28e4d5d23629fbcdbbd2ce40b89c9baef 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -91,7 +91,8 @@ struct stack_frame_user { }; static int -copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) +copy_stack_frame(const struct stack_frame_user __user *fp, + struct stack_frame_user *frame) { int ret; @@ -100,7 +101,8 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) ret = 1; pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + if (__get_user(frame->next_fp, &fp->next_fp) || + __get_user(frame->ret_addr, &fp->ret_addr)) ret = 0; pagefault_enable(); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 71d3fef1edc92e7e10180a84ea36b5c81443e58c..64a496a0687f60b3ae98fcf4797a786912c08f90 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -256,36 +256,16 @@ int regset_tls_active(struct task_struct *target, } int regset_tls_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { const struct desc_struct *tls; + struct user_desc v; + int pos; - if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || - (pos % sizeof(struct user_desc)) != 0 || - (count % sizeof(struct user_desc)) != 0) - return -EINVAL; - - pos /= sizeof(struct user_desc); - count /= sizeof(struct user_desc); - - tls = &target->thread.tls_array[pos]; - - if (kbuf) { - struct user_desc *info = kbuf; - while (count-- > 0) - fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, - tls++); - } else { - struct user_desc __user *u_info = ubuf; - while (count-- > 0) { - struct user_desc info; - fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); - if (__copy_to_user(u_info++, &info, sizeof(info))) - return -EFAULT; - } + for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) { + fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls); + membuf_write(&to, &v, sizeof(v)); } - return 0; } diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h index 3a76e1d3535e612d1887f330358099d91b54d3e3..fc39447a0c1ab60bcf1ea144e9d01be59f65ea68 100644 --- a/arch/x86/kernel/tls.h +++ b/arch/x86/kernel/tls.h @@ -12,7 +12,7 @@ #include extern user_regset_active_fn regset_tls_active; -extern user_regset_get_fn regset_tls_get; +extern user_regset_get2_fn regset_tls_get; extern user_regset_set_fn regset_tls_set; #endif /* _ARCH_X86_KERNEL_TLS_H */ diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index be5bc2e47c71e864a0500a9ad64d81247bf8782e..f3cad0d8c6a8d71fee15ca90f5ab8b327dae0eec 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -31,6 +31,7 @@ #include #include #include +#include #include static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 18648baeccf7402046ccc45d2ed23d31e7854917..2309527e5f5f99ed3b4bb056528e2547a1fb3bff 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -39,6 +39,7 @@ #include #include #include +#include #if defined(CONFIG_EDAC) #include @@ -51,7 +52,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -180,7 +182,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = 
current;
-
 	if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
 		return;
@@ -212,6 +213,21 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 	}
 }
 
+/*
+ * POSIX requires providing the address of the faulting instruction for
+ * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
+ *
+ * This address is usually regs->ip, but when an uprobe moved the code out
+ * of line then regs->ip points to the XOL code which would confuse
+ * anything which analyzes the fault address vs. the unmodified binary. If
+ * a trap happened in XOL code then uprobe maps regs->ip back to the
+ * original instruction address.
+ */
+static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
+{
+	return (void __user *)uprobe_get_trap_addr(regs);
+}
+
 #define IP ((void __user *)uprobe_get_trap_addr(regs))
 #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \
 dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
@@ -226,9 +242,29 @@ DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overru
 DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS)
 DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
 DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment)
-DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check)
 #undef IP
 
+dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
+{
+	char *str = "alignment check";
+
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+	if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
+		return;
+
+	if (!user_mode(regs))
+		die("Split lock detected\n", regs, error_code);
+
+	local_irq_enable();
+
+	if (handle_user_split_lock(regs, error_code))
+		return;
+
+	do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
+		error_code, BUS_ADRALN, NULL);
+}
+
 #ifdef CONFIG_VMAP_STACK
 __visible void __noreturn handle_stack_overflow(const char *message,
 						struct pt_regs *regs,
@@ -447,12 +483,66 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
 }
 
+/*
+ * The unprivileged ENQCMD instruction generates #GPs if the
+ * IA32_PASID MSR has not been populated. If possible, populate
+ * the MSR from a PASID previously allocated to the mm.
+ */
+static bool try_fixup_enqcmd_gp(void)
+{
+#ifdef CONFIG_IOMMU_SVA
+	u32 pasid;
+
+	/*
+	 * MSR_IA32_PASID is managed using XSAVE. Directly
+	 * writing to the MSR is only possible when fpregs
+	 * are valid and the fpstate is not. This is
+	 * guaranteed when handling a userspace exception,
+	 * *before* interrupts are re-enabled.
+	 */
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * Hardware without ENQCMD will not generate
+	 * #GPs that can be fixed up here.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+		return false;
+
+	pasid = current->mm->pasid;
+
+	/*
+	 * If the mm has not been allocated a
+	 * PASID, the #GP can not be fixed up.
+	 */
+	if (!pasid_valid(pasid))
+		return false;
+
+	/*
+	 * Did this thread already have its PASID activated?
+	 * If so, the #GP must be from something else.
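 */

/*
 * [Editor's aside -- illustrative sketch, not part of the patch] The
 * value the fixup above writes to MSR_IA32_PASID: the mm's PASID with
 * the architectural Valid bit (bit 31) set. Compilable model of that
 * composition; the 20-bit width follows the PCIe PASID definition, and
 * the toy_* names are invented:
 */
#include <stdint.h>

#define TOY_PASID_VALID (1ULL << 31)    /* mirrors MSR_IA32_PASID_VALID */

static uint64_t toy_pasid_msr_val(uint32_t pasid)
{
        return (pasid & 0xfffff) | TOY_PASID_VALID;
}

/*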
+ */ + if (current->pasid_activated) + return false; + + wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + current->pasid_activated = 1; + + return true; +#else + return false; +#endif +} + dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { const char *desc = "general protection fault"; struct task_struct *tsk; + if (user_mode(regs) && try_fixup_enqcmd_gp()) + return; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); @@ -727,6 +817,11 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; } + + /* #DB for bus lock can only be triggered from userspace. */ + if ((dr6 & DR_BUS_LOCK) && user_mode(regs)) + handle_bus_lock(regs); + si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(regs, error_code, si_code); @@ -767,9 +862,10 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) } /* - * Save the info for the exception handler and clear the error. + * Synchronize the FPU register state to the memory register state + * if necessary. This allows the exception handler to inspect it. */ - fpu__save(fpu); + fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = error_code; @@ -805,11 +901,49 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) cond_local_irq_enable(regs); } +static bool handle_xfd_event(struct pt_regs *regs) +{ + u64 xfd_err; + int err; + + if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) + return false; + + rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + if (!xfd_err) + return false; + + wrmsrl(MSR_IA32_XFD_ERR, 0); + + /* Die if that happens in kernel space */ + if (WARN_ON(!user_mode(regs))) + return false; + + local_irq_enable(); + + err = xfd_enable_feature(xfd_err); + + switch (err) { + case -EPERM: + force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); + break; + case -EFAULT: + force_sig(SIGSEGV); + break; + } + + local_irq_disable(); + return true; +} + dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { unsigned long cr0 = read_cr0(); + if (handle_xfd_event(regs)) + return; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index a76c12b38e925a378cab92dba4ca5638dcf29645..f747021024ca4b0d42df14f020145600a713cbb4 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -98,7 +98,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; - long err = 0; /* * This gets called from entry.S with interrupts disabled, but @@ -114,37 +113,30 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; - if (!access_ok(user, vm86->vm86plus.is_vm86pus ? + if (!user_access_begin(user, vm86->vm86plus.is_vm86pus ? 
sizeof(struct vm86plus_struct) : - sizeof(struct vm86_struct))) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } - - put_user_try { - put_user_ex(regs->pt.bx, &user->regs.ebx); - put_user_ex(regs->pt.cx, &user->regs.ecx); - put_user_ex(regs->pt.dx, &user->regs.edx); - put_user_ex(regs->pt.si, &user->regs.esi); - put_user_ex(regs->pt.di, &user->regs.edi); - put_user_ex(regs->pt.bp, &user->regs.ebp); - put_user_ex(regs->pt.ax, &user->regs.eax); - put_user_ex(regs->pt.ip, &user->regs.eip); - put_user_ex(regs->pt.cs, &user->regs.cs); - put_user_ex(regs->pt.flags, &user->regs.eflags); - put_user_ex(regs->pt.sp, &user->regs.esp); - put_user_ex(regs->pt.ss, &user->regs.ss); - put_user_ex(regs->es, &user->regs.es); - put_user_ex(regs->ds, &user->regs.ds); - put_user_ex(regs->fs, &user->regs.fs); - put_user_ex(regs->gs, &user->regs.gs); - - put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); - } put_user_catch(err); - if (err) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } + sizeof(struct vm86_struct))) + goto Efault; + + unsafe_put_user(regs->pt.bx, &user->regs.ebx, Efault_end); + unsafe_put_user(regs->pt.cx, &user->regs.ecx, Efault_end); + unsafe_put_user(regs->pt.dx, &user->regs.edx, Efault_end); + unsafe_put_user(regs->pt.si, &user->regs.esi, Efault_end); + unsafe_put_user(regs->pt.di, &user->regs.edi, Efault_end); + unsafe_put_user(regs->pt.bp, &user->regs.ebp, Efault_end); + unsafe_put_user(regs->pt.ax, &user->regs.eax, Efault_end); + unsafe_put_user(regs->pt.ip, &user->regs.eip, Efault_end); + unsafe_put_user(regs->pt.cs, &user->regs.cs, Efault_end); + unsafe_put_user(regs->pt.flags, &user->regs.eflags, Efault_end); + unsafe_put_user(regs->pt.sp, &user->regs.esp, Efault_end); + unsafe_put_user(regs->pt.ss, &user->regs.ss, Efault_end); + unsafe_put_user(regs->es, &user->regs.es, Efault_end); + unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); + unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); + unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); + unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + user_access_end(); preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; @@ -159,6 +151,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) lazy_load_gs(vm86->regs32.gs); regs->pt.ax = retval; + return; + +Efault_end: + user_access_end(); +Efault: + pr_alert("could not access userspace vm86 info\n"); + do_exit(SIGSEGV); } static void mark_screen_rdonly(struct mm_struct *mm) @@ -243,6 +242,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) struct kernel_vm86_regs vm86regs; struct pt_regs *regs = current_pt_regs(); unsigned long err = 0; + struct vm86_struct v; err = security_mmap_addr(0); if (err) { @@ -278,39 +278,32 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) if (vm86->saved_sp0) return -EPERM; - if (!access_ok(user_vm86, plus ? 
- sizeof(struct vm86_struct) : - sizeof(struct vm86plus_struct))) + if (copy_from_user(&v, user_vm86, + offsetof(struct vm86_struct, int_revectored))) return -EFAULT; memset(&vm86regs, 0, sizeof(vm86regs)); - get_user_try { - unsigned short seg; - get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); - get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); - get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); - get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); - get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); - get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); - get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); - get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); - get_user_ex(seg, &user_vm86->regs.cs); - vm86regs.pt.cs = seg; - get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); - get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); - get_user_ex(seg, &user_vm86->regs.ss); - vm86regs.pt.ss = seg; - get_user_ex(vm86regs.es, &user_vm86->regs.es); - get_user_ex(vm86regs.ds, &user_vm86->regs.ds); - get_user_ex(vm86regs.fs, &user_vm86->regs.fs); - get_user_ex(vm86regs.gs, &user_vm86->regs.gs); - - get_user_ex(vm86->flags, &user_vm86->flags); - get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); - get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); - } get_user_catch(err); - if (err) - return err; + + vm86regs.pt.bx = v.regs.ebx; + vm86regs.pt.cx = v.regs.ecx; + vm86regs.pt.dx = v.regs.edx; + vm86regs.pt.si = v.regs.esi; + vm86regs.pt.di = v.regs.edi; + vm86regs.pt.bp = v.regs.ebp; + vm86regs.pt.ax = v.regs.eax; + vm86regs.pt.ip = v.regs.eip; + vm86regs.pt.cs = v.regs.cs; + vm86regs.pt.flags = v.regs.eflags; + vm86regs.pt.sp = v.regs.esp; + vm86regs.pt.ss = v.regs.ss; + vm86regs.es = v.regs.es; + vm86regs.ds = v.regs.ds; + vm86regs.fs = v.regs.fs; + vm86regs.gs = v.regs.gs; + + vm86->flags = v.flags; + vm86->screen_bitmap = v.screen_bitmap; + vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, &user_vm86->int_revectored, diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1838b10a299cf360ba73d9624a5026a7ccafaeba..32ea56412e71550a17efea3e4036b5eeac4cb406 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -24,6 +24,7 @@ #include #include #include +#include void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -59,7 +60,8 @@ struct x86_init_ops x86_init __initdata = { .intr_init = native_init_IRQ, .trap_init = x86_init_noop, .intr_mode_select = apic_intr_mode_select, - .intr_mode_init = apic_intr_mode_init + .intr_mode_init = apic_intr_mode_init, + .create_pci_msi_domain = native_create_pci_msi_domain, }, .oem = { @@ -127,28 +129,10 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi __ro_after_init = { - .setup_msi_irqs = native_setup_msi_irqs, - .teardown_msi_irq = native_teardown_msi_irq, - .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, }; /* MSI arch specific hooks */ -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - return x86_msi.setup_msi_irqs(dev, nvec, type); -} - -void arch_teardown_msi_irqs(struct pci_dev *dev) -{ - x86_msi.teardown_msi_irqs(dev); -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - x86_msi.teardown_msi_irq(irq); -} - void arch_restore_msi_irqs(struct pci_dev *dev) { x86_msi.restore_msi_irqs(dev); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6fa946f983c921afc08c3c4ea18dd6a67d0ab8de..4d90f3cb4b461351caa8325e97de0d24014f4da4 100644 --- 
a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -24,7 +24,14 @@ #include "trace.h" #include "pmu.h" -static u32 xstate_required_size(u64 xstate_bv, bool compacted) +/* + * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be + * aligned to sizeof(unsigned long) because it's not accessed via bitops. + */ +u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; +EXPORT_SYMBOL_GPL(kvm_cpu_caps); + +u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; @@ -34,7 +41,11 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); - offset = compacted ? ret : ebx; + /* ECX[1]: 64B alignment in compacted form */ + if (compacted) + offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret; + else + offset = ebx; ret = max(ret, offset + eax); } @@ -62,7 +73,8 @@ u64 kvm_supported_xcr0(void) return xcr0; } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit +#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) int kvm_update_cpuid(struct kvm_vcpu *vcpu) { @@ -74,48 +86,54 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return 0; /* Update OSXSAVE bit */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) { - best->ecx &= ~F(OSXSAVE); - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= F(OSXSAVE); - } + if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) + cpuid_entry_change(best, X86_FEATURE_OSXSAVE, + kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); - best->edx &= ~F(APIC); - if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE) - best->edx |= F(APIC); + cpuid_entry_change(best, X86_FEATURE_APIC, + vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); if (apic) { - if (best->ecx & F(TSC_DEADLINE_TIMER)) + if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; else apic->lapic_timer.timer_mode_mask = 1 << 17; } best = kvm_find_cpuid_entry(vcpu, 7, 0); - if (best) { - /* Update OSPKE bit */ - if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) { - best->ecx &= ~F(OSPKE); - if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) - best->ecx |= F(OSPKE); - } - } + if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) + cpuid_entry_change(best, X86_FEATURE_OSPKE, + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & kvm_supported_xcr0(); - vcpu->arch.guest_xstate_size = best->ebx = - xstate_required_size(vcpu->arch.xcr0, false); + best->ebx = xstate_required_size(vcpu->arch.xcr0, false); } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); - if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) - best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + if (best) { + if (cpuid_entry_has(best, X86_FEATURE_XSAVES) || + cpuid_entry_has(best, X86_FEATURE_XSAVEC)) { + u64 xstate = vcpu->arch.xcr0 | vcpu->arch.ia32_xss; + + best->ebx = xstate_required_size(xstate, true); + } + + if (!cpuid_entry_has(best, X86_FEATURE_XSAVES)) { + best->ecx = 0; + best->edx = 0; + } + vcpu->arch.guest_supported_xss = + (((u64)best->edx << 32) | best->ecx) & supported_xss; + + } else { + vcpu->arch.guest_supported_xss = 0; + } /* * The existing code assumes virtual address is 48-bit or 57-bit in the @@ -129,6 +147,25 @@ int 
kvm_update_cpuid(struct kvm_vcpu *vcpu) return -EINVAL; } + /* + * Exposing dynamic xfeatures to the guest requires additional + * enabling in the FPU, e.g. to expand the guest XSAVE state size. + */ + best = kvm_find_cpuid_entry(vcpu, 0xd, 0); + if (best) { + u64 xfeatures; + int r; + + xfeatures = best->eax | ((u64)best->edx << 32); + xfeatures &= XFEATURE_MASK_USER_DYNAMIC; + if (xfeatures) { + r = fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, + xfeatures); + if (r) + return r; + } + } + best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) @@ -136,12 +173,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); - if (best) { - if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT) - best->ecx |= F(MWAIT); - else - best->ecx &= ~F(MWAIT); - } + if (best) + cpuid_entry_change(best, X86_FEATURE_MWAIT, + vcpu->arch.ia32_misc_enable_msr & + MSR_IA32_MISC_ENABLE_MWAIT); } /* Update physical-address width */ @@ -173,8 +208,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) break; } } - if (entry && (entry->edx & F(NX)) && !is_efer_nx()) { - entry->edx &= ~F(NX); + if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { + cpuid_entry_clear(entry, X86_FEATURE_NX); printk(KERN_INFO "kvm: guest NX capability removed\n"); } } @@ -281,14 +316,177 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return r; } -static void cpuid_mask(u32 *word, int wordnum) +/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ +static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf) { - *word &= boot_cpu_data.x86_capability[wordnum]; + const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); + struct kvm_cpuid_entry2 entry; + + reverse_cpuid_check(leaf); + + cpuid_count(cpuid.function, cpuid.index, + &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); + + kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); +} + +static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + BUILD_BUG_ON(leaf < NCAPINTS); + + kvm_cpu_caps[leaf] = mask; + + __kvm_cpu_cap_mask(leaf); +} + +static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + BUILD_BUG_ON(leaf >= NCAPINTS); + + kvm_cpu_caps[leaf] &= mask; + + __kvm_cpu_cap_mask(leaf); +} + +void kvm_set_cpu_caps(void) +{ + unsigned int f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned int f_gbpages = F(GBPAGES); + unsigned int f_lm = F(LM); + unsigned int f_xfd = F(XFD); +#else + unsigned int f_gbpages = 0; + unsigned int f_lm = 0; + unsigned int f_xfd = 0; +#endif + memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); + + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > + sizeof(boot_cpu_data.x86_capability)); + + memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, + sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps))); + + kvm_cpu_cap_mask(CPUID_1_ECX, + /* + * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not* + * advertised to guests via CPUID! 
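+ * A guest that executes MONITOR/MWAIT anyway sees NOP behavior rather + * than a #UD.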
+ */ + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C) | F(RDRAND) + ); + + kvm_cpu_cap_mask(CPUID_1_EDX, + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */ + ); + + kvm_cpu_cap_mask(CPUID_7_0_EBX, + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | + F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/ + ); + + kvm_cpu_cap_mask(CPUID_7_ECX, + F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | + F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ + ); + /* Set LA57 based on hardware capability. */ + if (cpuid_ecx(7) & F(LA57)) + kvm_cpu_cap_set(X86_FEATURE_LA57); + + kvm_cpu_cap_mask(CPUID_7_EDX, + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR) | + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FSRM) + ); + + kvm_cpu_cap_mask(CPUID_7_1_EAX, + F(AVX512_BF16) + ); + + kvm_cpu_cap_mask(CPUID_D_1_EAX, + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd + ); + + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | + F(TOPOEXT) | F(PERFCTR_CORE) + ); + + kvm_cpu_cap_mask(CPUID_8000_0001_EDX, + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) + ); + + if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) + kvm_cpu_cap_set(X86_FEATURE_GBPAGES); + + kvm_cpu_cap_mask(CPUID_8000_0008_EBX, + F(CLZERO) | F(XSAVEERPTR) | + F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) + ); + + /* + * Hide all SVM features by default, SVM will set the cap bits for + * features it emulates and/or exposes for L1. 
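+ * (The SVM code does that via kvm_cpu_cap_set().)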
+ */ + kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0); + + kvm_cpu_cap_mask(CPUID_C000_0001_EDX, + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN) + ); } +EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); + +struct kvm_cpuid_array { + struct kvm_cpuid_entry2 *entries; + const int maxnent; + int nent; +}; -static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) +static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, + u32 function, u32 index) { + struct kvm_cpuid_entry2 *entry; + + if (array->nent >= array->maxnent) + return NULL; + + entry = &array->entries[array->nent++]; + entry->function = function; entry->index = index; entry->flags = 0; @@ -310,16 +508,21 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, case 0x14: case 0x17: case 0x18: + case 0x1d: + case 0x1e: case 0x1f: case 0x8000001d: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; break; } + + return entry; } -static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, - u32 func, int *nent, int maxnent) +static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) { + struct kvm_cpuid_entry2 *entry = &array->entries[array->nent]; + entry->function = func; entry->index = 0; entry->flags = 0; @@ -327,17 +530,17 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, switch (func) { case 0: entry->eax = 7; - ++*nent; + ++array->nent; break; case 1: entry->ecx = F(MOVBE); - ++*nent; + ++array->nent; break; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->eax = 0; entry->ecx = F(RDPID); - ++*nent; + ++array->nent; default: break; } @@ -345,223 +548,59 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, return 0; } -static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) -{ - unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; - unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; - unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; - unsigned f_la57; - unsigned f_pku = kvm_x86_ops->pku_supported() ? 
F(PKU) : 0; - - /* cpuid 7.0.ebx */ - const u32 kvm_cpuid_7_0_ebx_x86_features = - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; - - /* cpuid 7.0.ecx*/ - const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | - F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | - F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; - - /* cpuid 7.0.edx*/ - const u32 kvm_cpuid_7_0_edx_x86_features = - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR); - - /* cpuid 7.1.eax */ - const u32 kvm_cpuid_7_1_eax_x86_features = - F(AVX512_BF16); - - switch (index) { - case 0: - entry->eax = min(entry->eax, 1u); - entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_7_0_EBX); - /* TSC_ADJUST is emulated */ - entry->ebx |= F(TSC_ADJUST); - - entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; - f_la57 = entry->ecx & F(LA57); - cpuid_mask(&entry->ecx, CPUID_7_ECX); - /* Set LA57 based on hardware capability. */ - entry->ecx |= f_la57; - entry->ecx |= f_umip; - entry->ecx |= f_pku; - /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) - entry->ecx &= ~F(PKU); - - entry->edx &= kvm_cpuid_7_0_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_7_EDX); - if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) - entry->edx |= F(SPEC_CTRL); - if (boot_cpu_has(X86_FEATURE_STIBP)) - entry->edx |= F(INTEL_STIBP); - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->edx |= F(SPEC_CTRL_SSBD); - /* - * We emulate ARCH_CAPABILITIES in software even - * if the host doesn't support it. - */ - entry->edx |= F(ARCH_CAPABILITIES); - break; - case 1: - entry->eax &= kvm_cpuid_7_1_eax_x86_features; - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - default: - WARN_ON_ONCE(1); - entry->eax = 0; - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - } -} - -static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, - int *nent, int maxnent) +static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) { - int r; - unsigned f_nx = is_efer_nx() ? F(NX) : 0; -#ifdef CONFIG_X86_64 - unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) - ? F(GBPAGES) : 0; - unsigned f_lm = F(LM); -#else - unsigned f_gbpages = 0; - unsigned f_lm = 0; -#endif - unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; - unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; + struct kvm_cpuid_entry2 *entry; + int r, i, max_idx; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; - /* cpuid 1.edx */ - const u32 kvm_cpuid_1_edx_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */; - /* cpuid 0x80000001.edx */ - const u32 kvm_cpuid_8000_0001_edx_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); - /* cpuid 1.ecx */ - const u32 kvm_cpuid_1_ecx_x86_features = - /* NOTE: MONITOR (and MWAIT) are emulated as NOP, - * but *not* advertised to guests via CPUID ! */ - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | - F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C) | F(RDRAND); - /* cpuid 0x80000001.ecx */ - const u32 kvm_cpuid_8000_0001_ecx_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | F(PERFCTR_CORE); - - /* cpuid 0x80000008.ebx */ - const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(CLZERO) | F(XSAVEERPTR) | - F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); - - /* cpuid 0xC0000001.edx */ - const u32 kvm_cpuid_C000_0001_edx_x86_features = - F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | - F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | - F(PMM) | F(PMM_EN); - - /* cpuid 0xD.1.eax */ - const u32 kvm_cpuid_D_1_eax_x86_features = - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; - /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); r = -E2BIG; - if (WARN_ON(*nent >= maxnent)) + entry = do_host_cpuid(array, function, 0); + if (WARN_ON(!entry)) goto out; - do_host_cpuid(entry, function, 0); - ++*nent; - switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ entry->eax = min(entry->eax, 0x1fU); break; case 1: - entry->edx &= kvm_cpuid_1_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_1_EDX); - entry->ecx &= kvm_cpuid_1_ecx_x86_features; - cpuid_mask(&entry->ecx, CPUID_1_ECX); + cpuid_entry_override(entry, CPUID_1_EDX); + cpuid_entry_override(entry, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ - entry->ecx |= F(X2APIC); + cpuid_entry_set(entry, X86_FEATURE_X2APIC); break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. 
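(The low byte of EAX from the first read gives the repeat count, hence the "entry->eax & 0xff" loop below.)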
This forces us to get_cpu() before * issuing the first command, and also to emulate this annoying behavior * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - + case 2: entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times; ++t) { - if (*nent >= maxnent) - goto out; - do_host_cpuid(&entry[t], function, 0); - ++*nent; + for (i = 1, max_idx = entry->eax & 0xff; i < max_idx; ++i) { + entry = do_host_cpuid(array, function, 0); + if (!entry) + goto out; } break; - } /* functions 4 and 0x8000001d have additional index. */ case 4: - case 0x8000001d: { - int i, cache_type; - - /* read more entries until cache_type is zero */ - for (i = 1; ; ++i) { - if (*nent >= maxnent) + case 0x8000001d: + /* + * Read entries until the cache type in the previous entry is + * zero, i.e. indicates an invalid entry. + */ + for (i = 1; entry->eax & 0x1f; ++i) { + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; - - cache_type = entry[i - 1].eax & 0x1f; - if (!cache_type) - break; - do_host_cpuid(&entry[i], function, i); - ++*nent; } break; - } case 6: /* Thermal management */ entry->eax = 0x4; /* allow ARAT */ entry->ebx = 0; @@ -569,22 +608,37 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = 0; break; /* function 7 has additional index. */ - case 7: { - int i; + case 7: + entry->eax = min(entry->eax, 1u); + cpuid_entry_override(entry, CPUID_7_0_EBX); + cpuid_entry_override(entry, CPUID_7_ECX); + cpuid_entry_override(entry, CPUID_7_EDX); + + /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ + cpuid_entry_set(entry, X86_FEATURE_TSC_ADJUST); + cpuid_entry_set(entry, X86_FEATURE_ARCH_CAPABILITIES); + + if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) + cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL); + if (boot_cpu_has(X86_FEATURE_STIBP)) + cpuid_entry_set(entry, X86_FEATURE_INTEL_STIBP); + if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) + cpuid_entry_set(entry, X86_FEATURE_SPEC_CTRL_SSBD); - for (i = 0; ; ) { - do_cpuid_7_mask(&entry[i], i); - if (i == entry->eax) + for (i = 1, max_idx = entry->eax; i <= max_idx; i++) { + if (WARN_ON_ONCE(i > 1)) break; - if (*nent >= maxnent) + + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; - ++i; - do_host_cpuid(&entry[i], function, i); - ++*nent; + cpuid_entry_override(entry, CPUID_7_1_EAX); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; } break; - } case 9: break; case 0xa: { /* Architectural Performance Monitoring */ @@ -621,79 +675,104 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, * thus they can be handled by common code. */ case 0x1f: - case 0xb: { - int i; - + case 0xb: /* - * We filled in entry[0] for CPUID(EAX=, - * ECX=00H) above. If its level type (ECX[15:8]) is - * zero, then the leaf is unimplemented, and we're - * done. Otherwise, continue to populate entries - * until the level type (ECX[15:8]) of the previously - * added entry is zero. + * Populate entries until the level type (ECX[15:8]) of the + * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is + * the starting entry, filled by the primary do_host_cpuid(). 
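+ * (A level type of 1 is SMT, 2 is core; a type of zero is invalid and + * ends the list.)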
*/ - for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { - if (*nent >= maxnent) + for (i = 1; entry->ecx & 0xff00; ++i) { + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; - - do_host_cpuid(&entry[i], function, i); - ++*nent; } break; - } case 0xd: { - int idx, i; + int idx; u64 supported = kvm_supported_xcr0(); + u64 guest_perm = xstate_get_guest_group_perm(); - entry->eax &= supported; + entry->eax &= supported & guest_perm; entry->ebx = xstate_required_size(supported, false); entry->ecx = entry->ebx; - entry->edx &= supported >> 32; + entry->edx &= (supported & guest_perm) >> 32; if (!supported) break; - for (idx = 1, i = 1; idx < 64; ++idx) { - u64 mask = ((u64)1 << idx); - if (*nent >= maxnent) + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_D_1_EAX); + if (entry->eax & (F(XSAVES)|F(XSAVEC))) + entry->ebx = xstate_required_size(supported | supported_xss, + true); + else { + WARN_ON_ONCE(supported_xss != 0); + entry->ebx = 0; + } + entry->ecx &= supported_xss; + entry->edx &= supported_xss >> 32; + + for (idx = 2; idx < 64; ++idx) { + bool s_state; + if (supported & BIT_ULL(idx)) + s_state = false; + else if (supported_xss & BIT_ULL(idx)) + s_state = true; + else + continue; + + entry = do_host_cpuid(array, function, idx); + if (!entry) goto out; - do_host_cpuid(&entry[i], function, idx); - if (idx == 1) { - entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; - cpuid_mask(&entry[i].eax, CPUID_D_1_EAX); - entry[i].ebx = 0; - if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) - entry[i].ebx = - xstate_required_size(supported, - true); - } else { - if (entry[i].eax == 0 || !(supported & mask)) - continue; - if (WARN_ON_ONCE(entry[i].ecx & 1)) - continue; + /* + * The supported check above should have filtered out + * invalid sub-leafs. Only valid sub-leafs should + * reach this point, and they should have a non-zero + * save state size. Furthermore, check whether the + * processor agrees with supported_xcr0/supported_xss + * on whether this is an XCR0- or IA32_XSS-managed area. 
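+ * (CPUID.0xD.idx:ECX bit 0 is set for IA32_XSS-managed components and + * clear for XCR0-managed ones, which is what s_state encodes.)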
+ */ + if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { + --array->nent; + continue; } - entry[i].ecx = 0; - entry[i].edx = 0; - ++*nent; - ++i; + entry->edx = 0; } break; } /* Intel PT */ - case 0x14: { - int t, times = entry->eax; + case 0x14: + if (!f_intel_pt) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } - if (!f_intel_pt) + for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { + if (!do_host_cpuid(array, function, i)) + goto out; + } + break; + /* Intel AMX TILE */ + case 0x1d: + if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; + } - for (t = 1; t <= times; ++t) { - if (*nent >= maxnent) + for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { + if (!do_host_cpuid(array, function, i)) goto out; - do_host_cpuid(&entry[t], function, t); - ++*nent; } break; - } + case 0x1e: /* TMUL information */ + if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + break; case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; @@ -728,10 +807,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0x8000001f); break; case 0x80000001: - entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_8000_0001_EDX); - entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; - cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); + cpuid_entry_override(entry, CPUID_8000_0001_EDX); + cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -749,31 +826,29 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + cpuid_entry_override(entry, CPUID_8000_0008_EBX); /* * AMD has separate bits for each SPEC_CTRL bit. * arch/x86/kernel/cpu/bugs.c is kind enough to * record that in cpufeatures so use them. */ if (boot_cpu_has(X86_FEATURE_IBPB)) - entry->ebx |= F(AMD_IBPB); + cpuid_entry_set(entry, X86_FEATURE_AMD_IBPB); if (boot_cpu_has(X86_FEATURE_IBRS)) - entry->ebx |= F(AMD_IBRS); + cpuid_entry_set(entry, X86_FEATURE_AMD_IBRS); if (boot_cpu_has(X86_FEATURE_STIBP)) - entry->ebx |= F(AMD_STIBP); - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(AMD_SSBD); + cpuid_entry_set(entry, X86_FEATURE_AMD_STIBP); + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + cpuid_entry_set(entry, X86_FEATURE_AMD_SSBD); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - entry->ebx |= F(AMD_SSB_NO); + cpuid_entry_set(entry, X86_FEATURE_AMD_SSB_NO); /* * The preference is to use SPEC CTRL MSR instead of the * VIRT_SPEC MSR. 
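* VIRT_SSBD is thus advertised only when the host has the legacy * LS_CFG mechanism but lacks the architectural AMD_SSBD bit.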
*/ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && !boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(VIRT_SSBD); + cpuid_entry_set(entry, X86_FEATURE_VIRT_SSBD); break; } case 0x80000019: @@ -788,8 +863,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: - entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); + cpuid_entry_override(entry, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ @@ -801,7 +875,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, break; } - kvm_x86_ops->set_supported_cpuid(function, entry); + kvm_x86_ops->set_supported_cpuid(entry); r = 0; @@ -811,28 +885,44 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, return r; } -static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, - int *nent, int maxnent, unsigned int type) +static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func, + unsigned int type) { - if (*nent >= maxnent) + if (array->nent >= array->maxnent) return -E2BIG; if (type == KVM_GET_EMULATED_CPUID) - return __do_cpuid_func_emulated(entry, func, nent, maxnent); + return __do_cpuid_func_emulated(array, func); - return __do_cpuid_func(entry, func, nent, maxnent); + return __do_cpuid_func(array, func); } #undef F -struct kvm_cpuid_param { - u32 func; - bool (*qualifier)(const struct kvm_cpuid_param *param); -}; +#define CENTAUR_CPUID_SIGNATURE 0xC0000000 -static bool is_centaur_cpu(const struct kvm_cpuid_param *param) +static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, + unsigned int type) { - return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; + u32 limit; + int r; + + if (func == CENTAUR_CPUID_SIGNATURE && + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) + return 0; + + r = do_cpuid_func(array, func, type); + if (r) + return r; + + limit = array->entries[array->nent - 1].eax; + for (func = func + 1; func <= limit; ++func) { + r = do_cpuid_func(array, func, type); + if (r) + break; + } + + return r; } static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, @@ -866,62 +956,42 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type) { - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG, i; - u32 func; - static const struct kvm_cpuid_param param[] = { - { .func = 0 }, - { .func = 0x80000000 }, - { .func = 0xC0000000, .qualifier = is_centaur_cpu }, - { .func = KVM_CPUID_SIGNATURE }, + static const u32 funcs[] = { + 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE, + }; + + struct kvm_cpuid_array array = { + .nent = 0, + .maxnent = cpuid->nent, }; + int r, i; if (cpuid->nent < 1) - goto out; + return -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; - r = -ENOMEM; - cpuid_entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2), + array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2), cpuid->nent)); - if (!cpuid_entries) - goto out; - - r = 0; - for (i = 0; i < ARRAY_SIZE(param); i++) { - const struct kvm_cpuid_param *ent = ¶m[i]; - - if (ent->qualifier && !ent->qualifier(ent)) - continue; - - r = do_cpuid_func(&cpuid_entries[nent], ent->func, - &nent, cpuid->nent, type); - - if (r) - goto out_free; - - limit = cpuid_entries[nent - 1].eax; 
- for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) - r = do_cpuid_func(&cpuid_entries[nent], func, - &nent, cpuid->nent, type); + if (!array.entries) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(funcs); i++) { + r = get_cpuid_func(&array, funcs[i], type); if (r) goto out_free; } + cpuid->nent = array.nent; - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; + if (copy_to_user(entries, array.entries, + array.nent * sizeof(struct kvm_cpuid_entry2))) + r = -EFAULT; out_free: - vfree(cpuid_entries); -out: + vfree(array.entries); return r; } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 7dec43b2c4205680925bca103541287e78dfb450..d2bd1b4e5d5165ca87da771ad42160509f31aaf0 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,6 +6,22 @@ #include #include +/* + * Hardware-defined CPUID leafs that are scattered in the kernel, but need to + * be directly used by KVM. Note, these word values conflict with the kernel's + * "bug" caps, but KVM doesn't use those. + */ +enum kvm_only_cpuid_leafs { + NR_KVM_CPU_CAPS = NCAPINTS, + + NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, +}; + +#define X86_KVM_FEATURE(w, f) ((w)*32 + (f)) + +extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; +void kvm_set_cpu_caps(void); + int kvm_update_cpuid(struct kvm_vcpu *vcpu); bool kvm_mpx_supported(void); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, @@ -25,6 +41,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit); +u32 xstate_required_size(u64 xstate_bv, bool compacted); + int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) @@ -53,28 +71,69 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, }; -static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) +/* + * Reverse CPUID and its derivatives can only be used for hardware-defined + * feature words, i.e. words whose bits directly correspond to a CPUID leaf. + * Retrieving a feature bit or masking guest CPUID from a Linux-defined word + * is nonsensical as the bit number/mask is an arbitrary software-defined value + * and can't be used by KVM to query/control guest capabilities. And obviously + * the leaf being queried must have an entry in the lookup table. + */ +static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) { - unsigned x86_leaf = x86_feature / 32; - + BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); +} - return reverse_cpuid[x86_leaf]; +/* + * Translate feature bits that are scattered in the kernel's cpufeatures word + * into KVM feature words that align with hardware's definitions. 
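+ * Nothing is remapped yet, so the helper below is the identity; + * scattered features gain explicit translations here as KVM needs them.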
+ */ +static __always_inline u32 __feature_translate(int x86_feature) +{ + return x86_feature; } -static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline u32 __feature_leaf(int x86_feature) { - struct kvm_cpuid_entry2 *entry; - const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); + return __feature_translate(x86_feature) / 32; +} - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); - if (!entry) - return NULL; +/* + * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain + * the hardware defined bit number (stored in bits 4:0) and a software defined + * "word" (stored in bits 31:5). The word is used to index into arrays of + * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has(). + */ +static __always_inline u32 __feature_bit(int x86_feature) +{ + x86_feature = __feature_translate(x86_feature); + + reverse_cpuid_check(x86_feature / 32); + return 1 << (x86_feature & 31); +} + +#define feature_bit(name) __feature_bit(X86_FEATURE_##name) + +static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature) +{ + unsigned int x86_leaf = __feature_leaf(x86_feature); - switch (cpuid.reg) { + reverse_cpuid_check(x86_leaf); + return reverse_cpuid[x86_leaf]; +} + +static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, + u32 reg) +{ + switch (reg) { case CPUID_EAX: return &entry->eax; case CPUID_EBX: @@ -89,9 +148,86 @@ static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsi } } -static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); + + return __cpuid_entry_get_reg(entry, cpuid.reg); +} + +static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) { - int *reg; + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + return *reg & __feature_bit(x86_feature); +} + +static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + return cpuid_entry_get(entry, x86_feature); +} + +static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + *reg &= ~__feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + *reg |= __feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature, + bool set) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + /* + * Open coded instead of using cpuid_entry_{clear,set}() to coerce the + * compiler into using CMOV instead of Jcc when possible. 
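+ * (With a constant feature argument the helper can then compile down + * to a short branch-free sequence.)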
+ */ + if (set) + *reg |= __feature_bit(x86_feature); + else + *reg &= ~__feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, + enum cpuid_leafs leaf) +{ + u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); + + BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps)); + *reg = kvm_cpu_caps[leaf]; +} + +static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + if (!entry) + return NULL; + + return __cpuid_entry_get_reg(entry, cpuid.reg); +} + +static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + u32 *reg; if (x86_feature == X86_FEATURE_XSAVE && !static_cpu_has(X86_FEATURE_XSAVE)) @@ -101,16 +237,17 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ if (!reg) return false; - return *reg & bit(x86_feature); + return *reg & __feature_bit(x86_feature); } -static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - int *reg; + u32 *reg; reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) - *reg &= ~bit(x86_feature); + *reg &= ~__feature_bit(x86_feature); } static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) @@ -179,4 +316,38 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; } +static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) +{ + unsigned int x86_leaf = __feature_leaf(x86_feature); + + reverse_cpuid_check(x86_leaf); + kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); +} + +static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) +{ + unsigned int x86_leaf = __feature_leaf(x86_feature); + + reverse_cpuid_check(x86_leaf); + kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); +} + +static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) +{ + unsigned int x86_leaf = __feature_leaf(x86_feature); + + reverse_cpuid_check(x86_leaf); + return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); +} + +static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature) +{ + return !!kvm_cpu_cap_get(x86_feature); +} + +static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) +{ + if (boot_cpu_has(x86_feature)) + kvm_cpu_cap_set(x86_feature); +} #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 60c8dcb907a500d692fc72001734296f59f469af..d415435122b0f6763c2b07ce8b5f9ed716b20c91 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -26,6 +26,7 @@ #include #include +#include "cpuid.h" #include "x86.h" #include "tss.h" #include "mmu.h" @@ -171,6 +172,7 @@ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ +#define NonPostedWrite ((u64)1 << 50) /* Instruction does non-posted write */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ @@ -1484,12 +1486,15 @@ static int segmented_write(struct x86_emulate_ctxt *ctxt, { int rc; ulong linear; + bool non_posted = false; rc = linearize(ctxt, addr, size, true, 
&linear); if (rc != X86EMUL_CONTINUE) return rc; + if (ctxt->d & NonPostedWrite) + non_posted = true; return ctxt->ops->write_emulated(ctxt, linear, data, size, - &ctxt->exception); + &ctxt->exception, non_posted); } static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, @@ -2382,7 +2387,7 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) eax = 0x80000001; ecx = 0; ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - return edx & bit(X86_FEATURE_LM); + return edx & feature_bit(LM); #else return false; #endif @@ -3651,7 +3656,120 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -#define FFL(x) bit(X86_FEATURE_##x) +static int em_enqcmd(struct x86_emulate_ctxt *ctxt) +{ + u64 dest_va = ctxt->dst.val; + int rc = X86EMUL_CONTINUE; + bool retry = false; + + if (ctxt->ops->np_write_complete(ctxt, &retry)) { + /* Don't do writeback on the return path */ + ctxt->d &= ~NoWrite; + if (retry) + ctxt->eflags |= X86_EFLAGS_ZF; + else + ctxt->eflags &= ~X86_EFLAGS_ZF; + + return rc; + } + + if (dest_va & 0x3f) { + printk("dest VA is not 64-byte aligned\n"); + } + ctxt->op_bytes = 64; + ctxt->src.bytes = 64; + ctxt->dst.bytes = 64; + + ctxt->dst.type = OP_MEM; + ctxt->dst.addr.mem.ea = dest_va; + ctxt->dst.addr.mem.seg = ctxt->src.addr.mem.seg; + + rc = segmented_read(ctxt, ctxt->src.addr.mem, + ctxt->src.valptr512, ctxt->src.bytes); + if (rc != X86EMUL_CONTINUE) { + printk("ENQCMD: read src data failed rc %d\n", rc); + goto done; + } + + /* TODO: Get PASID from PASID MSR */ + + /* TODO: Translate PASID through VMCS PASID translation table */ + + memcpy(ctxt->dst.valptr512, ctxt->src.valptr512, sizeof(ctxt->src.valptr512)); +done: + return rc; +} + +static int em_enqcmds(struct x86_emulate_ctxt *ctxt) +{ + u64 dest_va = ctxt->dst.val; + int rc = X86EMUL_CONTINUE; + bool retry = false; + + if (ctxt->ops->np_write_complete(ctxt, &retry)) { + /* Don't do writeback on the return path */ + ctxt->d &= ~NoWrite; + if (retry) + ctxt->eflags |= X86_EFLAGS_ZF; + else + ctxt->eflags &= ~X86_EFLAGS_ZF; + + return rc; + } + + if (dest_va & 0x3f) { + printk("dest VA is not 64-byte aligned\n"); + } + ctxt->op_bytes = 64; + ctxt->src.bytes = 64; + ctxt->dst.bytes = 64; + + ctxt->dst.type = OP_MEM; + ctxt->dst.addr.mem.ea = dest_va; + ctxt->dst.addr.mem.seg = ctxt->src.addr.mem.seg; + + rc = segmented_read(ctxt, ctxt->src.addr.mem, + ctxt->src.valptr512, ctxt->src.bytes); + if (rc != X86EMUL_CONTINUE) { + printk("ENQCMDS: read src data failed rc %d\n", rc); + goto done; + } + + /* TODO: Translate PASID through VMCS PASID translation table */ + memcpy(ctxt->dst.valptr512, ctxt->src.valptr512, sizeof(ctxt->src.valptr512)); +done: + return rc; +} + +static int em_movdir64b(struct x86_emulate_ctxt *ctxt) +{ + u64 dest_va = ctxt->dst.val; + int rc = X86EMUL_CONTINUE; + + if (dest_va & 0x3f) { + printk("dest VA is not 64-byte aligned\n"); + } + ctxt->op_bytes = 64; + ctxt->src.bytes = 64; + ctxt->dst.bytes = 64; + + ctxt->dst.type = OP_MEM; + ctxt->dst.addr.mem.ea = dest_va; + ctxt->dst.addr.mem.seg = ctxt->src.addr.mem.seg; + + rc = segmented_read(ctxt, ctxt->src.addr.mem, + ctxt->src.valptr512, ctxt->src.bytes); + if (rc != X86EMUL_CONTINUE) { + printk("MOVDIR64B: read src data failed rc %d\n", rc); + goto done; + } + + memcpy(ctxt->dst.valptr512, ctxt->src.valptr512, sizeof(ctxt->src.valptr512)); +done: + return rc; +} + +#define FFL(x) __feature_bit(X86_FEATURE_##x) static int em_movbe(struct x86_emulate_ctxt *ctxt) { @@ -4940,6 +5058,15 @@ static const struct gprefix
three_byte_0f_38_f1 = { ID(0, &instr_dual_0f_38_f1), N, N, N }; +/* MOVDIR64B has alignment requirement on dest but not on source. + * Not sure how to specify Unaligned only on the source addr + */ +static const struct gprefix three_byte_0f_38_f8 = { + N, I(DstReg | SrcMem | Mov | NoAccess | TwoMemOp | Unaligned, em_movdir64b), + I(DstReg | SrcMem | Mov | NoAccess | TwoMemOp | Unaligned | NonPostedWrite, em_enqcmd), + I(DstReg | SrcMem | Mov | NoAccess | TwoMemOp | Unaligned | NonPostedWrite, em_enqcmds) +}; + /* * Insns below are selected by the prefix which indexed by the third opcode * byte. @@ -4953,7 +5080,7 @@ static const struct opcode opcode_map_0f_38[256] = { GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0), GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1), /* 0xf2 - 0xff */ - N, N, X4(N), X8(N) + N, N, X4(N), GP(ModRM, &three_byte_0f_38_f8), N, N, N, X4(N) }; #undef D @@ -5321,6 +5448,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->opcode_len = 3; ctxt->b = insn_fetch(u8, ctxt); opcode = opcode_map_0f_38[ctxt->b]; + if (ctxt->b == 0xf8) + ctxt->op_bytes = 8; } } ctxt->d = opcode.flags; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 26408434b9bcf5b65c75b1a233a8c069dd233cd2..0fa3e8105330c48ef1e988b1616b4ad6e52afb6f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1113,7 +1113,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1131,7 +1131,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 24a6905d60ee2767089874c412927e1c3e757c5d..d7723e5d728a6a100eb6c3520781dd2f9f02109b 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -91,7 +91,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ea1a4e0297dae117832e19e44079e031b7fd5a94..85a60351e16ffa3d28fd8f4bbe74dc10dc67d672 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -43,13 +43,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); + DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID]; + u8 vectors[KVM_MAX_VCPU_IDS]; }; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 5ddcaacef29123229e3776b6081bf0d098ba97d5..9b92a6bb277ba23b618af4a108697a89167c7ea3 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -52,15 +52,15 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; + if (kvm_irq_delivery_to_apic_fast(kvm, src, 
irq, &r, dest_map)) + return r; + if (irq->dest_mode == 0 && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); kvm_for_each_vcpu(i, vcpu, kvm) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3f6b866c644d5028f87728f8bab86ebf73e0e3b1..aa46aad54cf06c172eda9e23b0e774f84077cdd9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -56,15 +56,9 @@ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ -#define APIC_SHORT_MASK 0xc0000 -#define APIC_DEST_NOSHORT 0x0 -#define APIC_DEST_MASK 0x800 #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 -#define APIC_BROADCAST 0xFF -#define X2APIC_BROADCAST 0xFFFFFFFFul - static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ @@ -72,6 +66,40 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); + +static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) +{ + *((u32 *) (regs + reg_off)) = val; +} + +static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +{ + __kvm_lapic_set_reg(apic->regs, reg_off, val); +} + +static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg) +{ + BUILD_BUG_ON(reg != APIC_ICR); + return *((u64 *) (regs + reg)); +} + +static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg) +{ + return __kvm_lapic_get_reg64(apic->regs, reg); +} + +static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val) +{ + BUILD_BUG_ON(reg != APIC_ICR); + *((u64 *) (regs + reg)) = val; +} + +static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic, + int reg, u64 val) +{ + __kvm_lapic_set_reg64(apic->regs, reg, val); +} static inline int apic_test_vector(int vec, void *bitmap) { @@ -167,14 +195,28 @@ static void kvm_apic_map_free(struct rcu_head *rcu) kvfree(map); } -static void recalculate_apic_map(struct kvm *kvm) +void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; u32 max_id = 255; /* enough space for any xAPIC ID */ + if (!kvm->arch.apic_map_dirty) { + /* + * Read kvm->arch.apic_map_dirty before + * kvm->arch.apic_map + */ + smp_rmb(); + return; + } + mutex_lock(&kvm->arch.apic_map_lock); + if (!kvm->arch.apic_map_dirty) { + /* Someone else has updated the map. 
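+ * It was rebuilt while this task waited on apic_map_lock, so there + * is nothing left to do.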
*/ + mutex_unlock(&kvm->arch.apic_map_lock); + return; + } kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) @@ -239,6 +281,12 @@ static void recalculate_apic_map(struct kvm *kvm) old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); + /* + * Write kvm->arch.apic_map before + * clearing kvm->arch.apic_map_dirty + */ + smp_wmb(); + kvm->arch.apic_map_dirty = false; mutex_unlock(&kvm->arch.apic_map_lock); if (old) @@ -260,20 +308,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) else static_key_slow_inc(&apic_sw_disabled.key); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { kvm_lapic_set_reg(apic, APIC_LDR, id); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) @@ -289,7 +337,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) @@ -1195,10 +1243,13 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) } EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); -static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) +void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { struct kvm_lapic_irq irq; + /* KVM has no delay and should always clear the BUSY/PENDING flag. */ + WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); + irq.vector = icr_low & APIC_VECTOR_MASK; irq.delivery_mode = icr_low & APIC_MODE_MASK; irq.dest_mode = icr_low & APIC_DEST_MASK; @@ -1215,6 +1266,7 @@ static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } +EXPORT_SYMBOL_GPL(kvm_apic_send_ipi); static u32 apic_get_tmcct(struct kvm_lapic *apic) { @@ -1298,8 +1350,8 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) -int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data) +static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) { unsigned char alignment = offset & 0xf; u32 result; @@ -1317,7 +1369,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) | APIC_REG_MASK(APIC_ESR) | APIC_REG_MASK(APIC_ICR) | - APIC_REG_MASK(APIC_ICR2) | APIC_REG_MASK(APIC_LVTT) | APIC_REG_MASK(APIC_LVTTHMR) | APIC_REG_MASK(APIC_LVTPC) | @@ -1328,9 +1379,16 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); - /* ARBPRI is not valid on x2APIC */ + /* + * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR + * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be + * manually handled by the caller.
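+	 * (x2APIC reads of ICR are instead routed through
+	 * kvm_lapic_msr_read(), added below, which special-cases APIC_ICR
+	 * and returns the full 64-bit value via kvm_lapic_get_reg64().)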
+ */ if (!apic_x2apic_mode(apic)) - valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); + valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) | + APIC_REG_MASK(APIC_ICR2); + else + WARN_ON_ONCE(offset == APIC_ICR); if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) return 1; @@ -1352,7 +1410,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, } return 0; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { @@ -1599,13 +1656,18 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } +static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict) +{ + return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count; +} + static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) { ktime_t now, remaining; u64 ns_remaining_old, ns_remaining_new; - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; + apic->lapic_timer.period = + tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); limit_periodic_timer_frequency(apic); now = ktime_get(); @@ -1623,14 +1685,15 @@ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_diviso apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); } -static bool set_target_expiration(struct kvm_lapic *apic) +static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) { ktime_t now; u64 tscl = rdtsc(); + s64 deadline; now = ktime_get(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; + apic->lapic_timer.period = + tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); if (!apic->lapic_timer.period) { apic->lapic_timer.tscdeadline = 0; @@ -1638,10 +1701,32 @@ static bool set_target_expiration(struct kvm_lapic *apic) } limit_periodic_timer_frequency(apic); + deadline = apic->lapic_timer.period; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + if (unlikely(count_reg != APIC_TMICT)) { + deadline = tmict_to_ns(apic, + kvm_lapic_get_reg(apic, count_reg)); + if (unlikely(deadline <= 0)) + deadline = apic->lapic_timer.period; + else if (unlikely(deadline > apic->lapic_timer.period)) { + pr_info_ratelimited( + "kvm: vcpu %i: requested lapic timer restore with " + "starting count register %#x=%u (%lld ns) > initial count (%lld ns). 
" + "Using initial count to start timer.\n", + apic->vcpu->vcpu_id, + count_reg, + kvm_lapic_get_reg(apic, count_reg), + deadline, apic->lapic_timer.period); + kvm_lapic_set_reg(apic, count_reg, 0); + deadline = apic->lapic_timer.period; + } + } + } apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + - nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); - apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + nsec_to_cycles(apic->vcpu, deadline); + apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline); return true; } @@ -1823,17 +1908,22 @@ void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) restart_apic_timer(apic); } -static void start_apic_timer(struct kvm_lapic *apic) +static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg) { atomic_set(&apic->lapic_timer.pending, 0); if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - && !set_target_expiration(apic)) + && !set_target_expiration(apic, count_reg)) return; restart_apic_timer(apic); } +static void start_apic_timer(struct kvm_lapic *apic) +{ + __start_apic_timer(apic, APIC_TMICT); +} + static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val); @@ -1847,7 +1937,7 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) } } -int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) +static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -1880,7 +1970,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_DFR: if (!apic_x2apic_mode(apic)) { kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } else ret = 1; break; @@ -1907,16 +1997,18 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } case APIC_ICR: + WARN_ON_ONCE(apic_x2apic_mode(apic)); + /* No delay here, so we always clear the pending bit */ - val &= ~(1 << 12); - apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); + val &= ~APIC_ICR_BUSY; + kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); kvm_lapic_set_reg(apic, APIC_ICR, val); break; - case APIC_ICR2: - if (!apic_x2apic_mode(apic)) - val &= 0xff000000; - kvm_lapic_set_reg(apic, APIC_ICR2, val); + if (apic_x2apic_mode(apic)) + ret = 1; + else + kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000); break; case APIC_LVT0: @@ -1986,9 +2078,15 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } + /* + * Recalculate APIC maps if necessary, e.g. if the software enable bit + * was toggled, the APIC ID changed, etc... The maps are marked dirty + * on relevant changes, i.e. this is a nop for most writes. + */ + kvm_recalculate_apic_map(apic->vcpu->kvm); + return ret; } -EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) @@ -2032,15 +2130,27 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = 0; + struct kvm_lapic *apic = vcpu->arch.apic; + u64 val; - /* hw has done the conditional check and inst decode */ - offset &= 0xff0; + if (apic_x2apic_mode(apic)) { + /* + * When guest APIC is in x2APIC mode and IPI virtualization + * is enabled, accessing APIC_ICR may cause trap-like VM-exit + * on Intel hardware. 
Other offsets are not possible. + */ + if (WARN_ON_ONCE(offset != APIC_ICR)) + return; - kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val); + kvm_lapic_msr_read(apic, offset, &val); + kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); + trace_kvm_apic_write(APIC_ICR, val); + } else { + val = kvm_lapic_get_reg(apic, offset); - /* TODO: optimize to just emulate side effect w/o one more write */ - kvm_lapic_reg_write(vcpu->arch.apic, offset, val); + /* TODO: optimize to just emulate side effect w/o one more write */ + kvm_lapic_reg_write(apic, offset, (u32)val); + } } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); @@ -2134,7 +2244,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) static_key_slow_dec_deferred(&apic_hw_disabled); } else { static_key_slow_inc(&apic_hw_disabled.key); - recalculate_apic_map(vcpu->kvm); + vcpu->kvm->arch.apic_map_dirty = true; } } @@ -2160,6 +2270,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic) return; + vcpu->kvm->arch.apic_map_dirty = false; /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); @@ -2185,8 +2296,12 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, 0); kvm_lapic_set_reg(apic, APIC_ESR, 0); - kvm_lapic_set_reg(apic, APIC_ICR, 0); - kvm_lapic_set_reg(apic, APIC_ICR2, 0); + if (!apic_x2apic_mode(apic)) { + kvm_lapic_set_reg(apic, APIC_ICR, 0); + kvm_lapic_set_reg(apic, APIC_ICR2, 0); + } else { + kvm_lapic_set_reg64(apic, APIC_ICR, 0); + } kvm_lapic_set_reg(apic, APIC_TDCR, 0); kvm_lapic_set_reg(apic, APIC_TMICT, 0); for (i = 0; i < 8; i++) { @@ -2212,6 +2327,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; + + kvm_recalculate_apic_map(vcpu->kvm); } /* @@ -2404,6 +2521,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, if (apic_x2apic_mode(vcpu->arch.apic)) { u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); + u64 icr; if (vcpu->kvm->arch.x2apic_format) { if (*id != vcpu->vcpu_id) @@ -2415,9 +2533,21 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, *id <<= 24; } - /* In x2APIC mode, the LDR is fixed and based on the id */ - if (set) + /* + * In x2APIC mode, the LDR is fixed and based on the id. And + * ICR is internally a single 64-bit register, but needs to be + * split to ICR+ICR2 in userspace for backwards compatibility. + */ + if (set) { *ldr = kvm_apic_calc_x2apic_ldr(*id); + + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + } else { + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + } } return 0; @@ -2426,6 +2556,14 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + + /* + * Get calculated timer current count for remaining timer period (if + * any) and store it in the returned register set. 
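+	 * The register page is not kept up to date while the timer is
+	 * running; __apic_read(APIC_TMCCT) derives the count from the time
+	 * remaining on the hrtimer. The restore side passes APIC_TMCCT to
+	 * __start_apic_timer() so this snapshot resumes the timer with the
+	 * remaining period instead of restarting from TMICT.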
+ */ + __kvm_lapic_set_reg(s->regs, APIC_TMCCT, + __apic_read(vcpu->arch.apic, APIC_TMCCT)); + return kvm_apic_state_fixup(vcpu, s, false); } @@ -2434,17 +2572,18 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) struct kvm_lapic *apic = vcpu->arch.apic; int r; - kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); r = kvm_apic_state_fixup(vcpu, s, true); - if (r) + if (r) { + kvm_recalculate_apic_map(vcpu->kvm); return r; + } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); - recalculate_apic_map(vcpu->kvm); + kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); @@ -2452,7 +2591,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); - start_apic_timer(apic); + __start_apic_timer(apic, APIC_TMCCT); apic->irr_pending = true; apic->isr_count = vcpu->arch.apicv_active ? 1 : count_vectors(apic->regs + APIC_ISR); @@ -2606,76 +2745,88 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) return 0; } -int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) { - struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = (msr - APIC_BASE_MSR) << 4; + data &= ~APIC_ICR_BUSY; - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) - return 1; + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + kvm_lapic_set_reg64(apic, APIC_ICR, data); + trace_kvm_apic_write(APIC_ICR, data); + return 0; +} - if (reg == APIC_ICR2) +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) +{ + u32 low; + + if (reg == APIC_ICR) { + *data = kvm_lapic_get_reg64(apic, APIC_ICR); + return 0; + } + + if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; - /* if this is ICR write vector before command */ + *data = low; + + return 0; +} + +static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data) +{ + /* + * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and + * can be written as such; all other registers remain accessible only + * through 32-bit reads/writes.
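+	 * Worked example: the x2APIC ICR is MSR 0x830, so the offset
+	 * computed by the callers is (0x830 - APIC_BASE_MSR) << 4 =
+	 * (0x830 - 0x800) << 4 = 0x300 = APIC_ICR.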
+ */ if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_x2apic_icr_write(apic, data); + return kvm_lapic_reg_write(apic, reg, (u32)data); } -int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; + u32 reg = (msr - APIC_BASE_MSR) << 4; if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_DFR || reg == APIC_ICR2) - return 1; + return kvm_lapic_msr_write(apic, reg, data); +} - if (kvm_lapic_reg_read(apic, reg, 4, &low)) +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4; + + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_ICR) - kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); - *data = (((u64)high) << 32) | low; + if (reg == APIC_DFR) + return 1; - return 0; + return kvm_lapic_msr_read(apic, reg, data); } int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { - struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu)) return 1; - /* if this is ICR write vector before command */ - if (reg == APIC_ICR) - kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(apic, reg, (u32)data); + return kvm_lapic_msr_write(vcpu->arch.apic, reg, data); } int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) { - struct kvm_lapic *apic = vcpu->arch.apic; - u32 low, high = 0; - if (!lapic_in_kernel(vcpu)) return 1; - if (kvm_lapic_reg_read(apic, reg, 4, &low)) - return 1; - if (reg == APIC_ICR) - kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high); - - *data = (((u64)high) << 32) | low; - - return 0; + return kvm_lapic_msr_read(vcpu->arch.apic, reg, data); } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 1f5014852e208e1c38c9d0c7c57fa0ae8fc6d110..7b5bb9bc9e5bd5ba19806edb438397f2d1025586 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,12 +10,19 @@ #define KVM_APIC_SIPI 1 #define KVM_APIC_LVT_NUM 6 +#define APIC_SHORT_MASK 0xc0000 +#define APIC_DEST_NOSHORT 0x0 +#define APIC_DEST_MASK 0x800 + #define KVM_APIC_SHORT_MASK 0xc0000 #define KVM_APIC_DEST_MASK 0x800 #define APIC_BUS_CYCLE_NS 1 #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) +#define APIC_BROADCAST 0xFF +#define X2APIC_BROADCAST 0xFFFFFFFFul + enum lapic_mode { LAPIC_MODE_DISABLED = 0, LAPIC_MODE_INVALID = X2APIC_ENABLE, @@ -77,10 +84,8 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); -int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); @@ -111,6 +116,8 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t 
vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data); +void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); @@ -122,7 +129,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_init(void); void kvm_lapic_exit(void); @@ -149,14 +156,14 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) apic->irr_pending = true; } -static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) +static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off) { - return *((u32 *) (apic->regs + reg_off)); + return *((u32 *) (regs + reg_off)); } -static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) { - *((u32 *) (apic->regs + reg_off)) = val; + return __kvm_lapic_get_reg(apic->regs, reg_off); } extern struct static_key kvm_no_apic_vcpu; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 47c27c6e3842636981d82d5a5706d34b01a6f5c4..7a73eb05d6160d2dcfd9580425f2077f8e9d95a3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -247,6 +247,22 @@ static struct kmem_cache *pte_list_desc_cache; static struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; +/* + * 1) Each EPT page table entry requires 8 bytes, so one physical page can + *    store 512 (4096/8) entries. With 4K pages, the number of MMU pages + *    required to map 760G of physical memory is therefore + *    (760 * 2^30) / 4096 / 512 = 389120. + * 2) 760G is used because the maximum physical memory of the host is 768G. + * 3) In practice, since THP is enabled, this number may never be reached. + */ +u32 kvm_mmu_limit_nr = 389120; +EXPORT_SYMBOL(kvm_mmu_limit_nr); +u32 kvm_mmu_reclaim_try_times = 0; +EXPORT_SYMBOL(kvm_mmu_reclaim_try_times); +u32 kvm_mmu_reclaim_times = 0; +EXPORT_SYMBOL(kvm_mmu_reclaim_times); + static u64 __read_mostly shadow_nx_mask; static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; @@ -5602,18 +5618,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type = EMULTYPE_ALLOW_RETRY; emulate: - /* - * On AMD platforms, under certain conditions insn_len may be zero on #NPF. - * This can happen if a guest gets a page-fault on data access but the HW - * table walker is not able to read the instruction page (e.g instruction - * page is not present in memory). In those cases we simply restart the - * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
- */ - if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu)) - return 1; - } - return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } @@ -6240,7 +6244,15 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) static unsigned long mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); + unsigned long shrink_count = + percpu_counter_read_positive(&kvm_total_used_mmu_pages); + + kvm_mmu_reclaim_try_times++; + if (shrink_count < kvm_mmu_limit_nr) + return 0; + kvm_mmu_reclaim_times++; + + return shrink_count; } static struct shrinker mmu_shrinker = { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a20fc1ba607f3b3df6547f42cf80389667b55c1a..6092ffa649af648f27f62c8d6ede837224491a29 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -385,7 +385,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) + if (unlikely(__get_user(pte, ptep_user))) goto error; walker->ptep_user[walker->level - 1] = ptep_user; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b9d14fdbd2d8109312f9fe0409a50f75a1f5ee0b..ac5a32320c9605f8f800c759489b6a2350311726 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1040,7 +1040,7 @@ static bool valid_msr_intercept(u32 index) return false; } -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { u8 bit_write; unsigned long tmp; @@ -1059,7 +1059,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) return !!test_bit(bit_write, &tmp); } -static void set_msr_interception(u32 *msrpm, unsigned msr, +static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write) { u8 bit_read, bit_write; @@ -1085,20 +1085,36 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, msrpm[offset] = tmp; } -static void svm_vcpu_init_msrpm(u32 *msrpm) +static u32 *svm_vcpu_alloc_msrpm(void) { - int i; + struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + u32 *msrpm; + + if (!pages) + return NULL; + msrpm = page_address(pages); memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + return msrpm; +} + +static void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +{ + int i; + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { if (!direct_access_msrs[i].always) continue; - - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); } } +static void svm_vcpu_free_msrpm(u32 *msrpm) +{ + __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); +} + static void add_msr_offset(u32 offset) { int i; @@ -1142,26 +1158,26 @@ static void init_msrpm_offsets(void) } } -static void svm_enable_lbrv(struct vcpu_svm *svm) +static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, 
MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -static void svm_disable_lbrv(struct vcpu_svm *svm) +static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } static void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -1340,6 +1356,24 @@ static __init void svm_adjust_mmio_mask(void) kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); } +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + supported_xss = 0; + + /* CPUID 0x80000001 */ + if (nested) + kvm_cpu_cap_set(X86_FEATURE_SVM); + + /* CPUID 0x8000000A */ + /* Support next_rip if host supports it */ + kvm_cpu_cap_check_and_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1450,6 +1484,8 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + svm_set_cpu_caps(); + return 0; err: @@ -2186,9 +2222,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; struct page *page; - struct page *msrpm_pages; struct page *hsave_page; - struct page *nested_msrpm_pages; int err; BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, @@ -2200,46 +2234,27 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } - svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; + if (!fpu_alloc_guest_fpstate(&svm->vcpu.arch.guest_fpu)) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); goto free_partial_svm; } - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.guest_fpu) { - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); - err = -ENOMEM; - goto free_user_fpu; - } - err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) - goto free_svm; + goto free_guest_fpu; err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) goto uninit; - msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) - goto free_page1; - - nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!nested_msrpm_pages) - goto free_page2; - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!hsave_page) - goto free_page3; + goto free_page1; err = avic_init_vcpu(svm); if (err) - goto free_page4; + goto free_page2; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. 
@@ -2248,11 +2263,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->nested.hsave = page_address(hsave_page); - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + svm->msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->msrpm) + goto free_page2; + + svm_vcpu_init_msrpm(&svm->vcpu, svm->msrpm); + + svm->nested.msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->nested.msrpm) + goto free_page3; - svm->nested.msrpm = page_address(nested_msrpm_pages); - svm_vcpu_init_msrpm(svm->nested.msrpm); + /* The nested msrpm only needs the L1 pass-through MSR state */ + svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); svm->vmcb = page_address(page); clear_page(svm->vmcb); @@ -2264,20 +2286,16 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; -free_page4: - __free_page(hsave_page); free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); + svm_vcpu_free_msrpm(svm->msrpm); free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); + __free_page(hsave_page); free_page1: __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); -free_svm: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); +free_guest_fpu: + fpu_free_guest_fpstate(&svm->vcpu.arch.guest_fpu); free_partial_svm: kmem_cache_free(kvm_vcpu_cache, svm); out: @@ -2308,8 +2326,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -4345,7 +4362,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && @@ -4360,7 +4377,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -4424,9 +4441,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb->save.dbgctl = data; mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) - svm_enable_lbrv(svm); + svm_enable_lbrv(vcpu); else - svm_disable_lbrv(svm); + svm_disable_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: svm->nested.hsave_msr = data; @@ -4536,18 +4553,18 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) switch (id) { case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* - * AVIC hardware handles the generation of - * IPIs when the specified Message Type is Fixed - * (also known as fixed delivery mode) and - * the Trigger Mode is edge-triggered. The hardware - * also supports self and broadcast delivery modes - * specified via the Destination Shorthand(DSH) - * field of the ICRL. Logical and physical APIC ID - * formats are supported. All other IPI types cause - * a #VMEXIT, which needs to emulated.
+ * Emulate IPIs that are not handled by AVIC hardware, which + * only virtualizes Fixed, Edge-Triggered INTRs. The exit is + * a trap, e.g. ICR holds the correct value and RIP has been + * advanced, KVM is responsible only for emulating the IPI. + * Sadly, hardware may sometimes leave the BUSY flag set, in + * which case KVM needs to emulate the ICR write as well in + * order to clear the BUSY flag. */ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); + if (icrl & APIC_ICR_BUSY) + kvm_apic_write_nodecode(&svm->vcpu, APIC_ICR); + else + kvm_apic_send_ipi(apic, icrl, icrh); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { int i; @@ -4705,30 +4722,28 @@ static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) svm->dfr_reg = dfr; } -static int avic_unaccel_trap_write(struct vcpu_svm *svm) +static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = svm->vcpu.arch.apic; - u32 offset = svm->vmcb->control.exit_info_1 & + u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK; switch (offset) { case APIC_ID: - if (avic_handle_apic_id_update(&svm->vcpu)) + if (avic_handle_apic_id_update(vcpu)) return 0; break; case APIC_LDR: - if (avic_handle_ldr_update(&svm->vcpu)) + if (avic_handle_ldr_update(vcpu)) return 0; break; case APIC_DFR: - avic_handle_dfr_update(&svm->vcpu); + avic_handle_dfr_update(vcpu); break; default: break; } - kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); - + kvm_apic_write_nodecode(vcpu, offset); return 1; } @@ -4777,7 +4792,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) if (trap) { /* Handling Trap */ WARN_ONCE(!write, "svm: Handling trap read.\n"); - ret = avic_unaccel_trap_write(svm); + ret = avic_unaccel_trap_write(&svm->vcpu); } else { /* Handling Fault */ ret = kvm_emulate_instruction(&svm->vcpu, 0); @@ -4978,7 +4993,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = control->exit_info_2; } -static int handle_exit(struct kvm_vcpu *vcpu) +static int handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -5036,7 +5052,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) __func__, svm->vmcb->control.exit_int_info, exit_code); - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + skip_emulated_instruction(vcpu); + return 1; + } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); dump_vmcb(vcpu); @@ -5653,8 +5672,14 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -static void svm_vcpu_run(struct kvm_vcpu *vcpu) +static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + return EXIT_FASTPATH_NONE; +} + +static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) +{ + enum exit_fastpath_completion exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -5666,7 +5691,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) * again. */ if (unlikely(svm->nested.exit_required)) - return; + return EXIT_FASTPATH_NONE; /* * Disable singlestep if we're injecting an interrupt/exception. 
@@ -5691,7 +5716,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; clgi(); - kvm_load_guest_xcr0(vcpu); + kvm_load_guest_xsave_state(vcpu); if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.timer_advance_ns) @@ -5841,10 +5866,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); stgi(); /* Any pending NMI will happen here */ + exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@ -5873,6 +5899,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); mark_all_clean(svm->vmcb); + return exit_fastpath; } STACK_FRAME_NON_STANDARD(svm_vcpu_run); @@ -5950,6 +5977,9 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVES); + /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); @@ -5959,40 +5989,32 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } -#define F(x) bit(X86_FEATURE_##x) - -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +/* + * Vendor specific emulation must be handled via ->set_supported_cpuid(), not + * svm_set_cpu_caps(), as capabilities configured during hardware_setup() are + * masked against hardware/kernel support, i.e. they'd be lost. + * + * Note, setting a flag based on a *different* feature, e.g. setting VIRT_SSBD + * if LS_CFG_SSBD or AMD_SSBD is supported, is effectively emulation. 
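+ * The 0x80000008 case below is exactly that: VIRT_SSBD is set when
+ * LS_CFG_SSBD or AMD_SSBD is supported, so it has to stay here rather
+ * than move into svm_set_cpu_caps().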
+ */ +static void svm_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) { - switch (func) { + switch (entry->function) { case 0x1: if (avic) - entry->ecx &= ~bit(X86_FEATURE_X2APIC); - break; - case 0x80000001: - if (nested) - entry->ecx |= (1 << 2); /* Set SVM bit */ + cpuid_entry_clear(entry, X86_FEATURE_X2APIC); break; case 0x80000008: if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(VIRT_SSBD); + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + cpuid_entry_set(entry, X86_FEATURE_VIRT_SSBD); break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Per default do not support any - additional features */ - - /* Support next_rip if host supports it */ - if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= F(NRIPS); - - /* Support NPT for the guest if enabled */ - if (npt_enabled) - entry->edx |= F(NPT); - + cpuid_entry_override(entry, CPUID_8000_000A_EDX); break; case 0x8000001F: /* Support memory encryption cpuid if host supports it */ @@ -6013,26 +6035,11 @@ static bool svm_rdtscp_supported(void) return boot_cpu_has(X86_FEATURE_RDTSCP); } -static bool svm_invpcid_supported(void) -{ - return false; -} - static bool svm_mpx_supported(void) { return false; } -static bool svm_xsaves_supported(void) -{ - return false; -} - -static bool svm_umip_emulated(void) -{ - return false; -} - static bool svm_pt_supported(void) { return false; @@ -6043,11 +6050,6 @@ static bool svm_has_wbinvd_exit(void) return true; } -static bool svm_pku_supported(void) -{ - return false; -} - #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -7168,12 +7170,10 @@ static int svm_unregister_enc_region(struct kvm *kvm, return ret; } -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { - unsigned long cr4 = kvm_read_cr4(vcpu); - bool smep = cr4 & X86_CR4_SMEP; - bool smap = cr4 & X86_CR4_SMAP; - bool is_user = svm_get_cpl(vcpu) == 3; + bool smep, smap, is_user; + unsigned long cr4; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -7215,6 +7215,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) * instruction pointer so we will not able to workaround it. Lets * print the error and request to kill the guest. */ + if (likely(!insn || insn_len)) + return true; + + /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. 
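+	 * (No memslot backing RIP's gfn means the instruction bytes cannot
+	 * be fetched at all, so applying the erratum workaround is
+	 * pointless; emulation will simply fail with an internal error.)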
+ */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + + cr4 = kvm_read_cr4(vcpu); + smep = cr4 & X86_CR4_SMEP; + smap = cr4 & X86_CR4_SMAP; + is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { if (!sev_guest(vcpu->kvm)) return true; @@ -7336,12 +7350,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpuid_update = svm_cpuid_update, .rdtscp_supported = svm_rdtscp_supported, - .invpcid_supported = svm_invpcid_supported, .mpx_supported = svm_mpx_supported, - .xsaves_supported = svm_xsaves_supported, - .umip_emulated = svm_umip_emulated, .pt_supported = svm_pt_supported, - .pku_supported = svm_pku_supported, .set_supported_cpuid = svm_set_supported_cpuid, @@ -7377,7 +7387,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + .can_emulate_instruction = svm_can_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, }; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7c741a0c5f808466354c7e786d81f2e6acdf12ee..af14b0953b9b4c5187b8aaa1f60c99a0fade2b5d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -192,13 +192,13 @@ TRACE_EVENT(kvm_cpuid, * Tracepoint for apic access. */ TRACE_EVENT(kvm_apic, - TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), + TP_PROTO(unsigned int rw, unsigned int reg, u64 val), TP_ARGS(rw, reg, val), TP_STRUCT__entry( __field( unsigned int, rw ) __field( unsigned int, reg ) - __field( unsigned int, val ) + __field( u64, val ) ), TP_fast_assign( @@ -207,7 +207,7 @@ TRACE_EVENT(kvm_apic, __entry->val = val; ), - TP_printk("apic_%s %s = 0x%x", + TP_printk("apic_%s %s = 0x%llx", __entry->rw ? 
"write" : "read", __print_symbolic(__entry->reg, kvm_trace_symbol_apic), __entry->val) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f486e260624740e8fd57683b001beb4737593c60..6ccf08a2d9fb74d63d9ee5a8b75778be6bf14e8c 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -12,6 +12,7 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_ipiv; extern bool __read_mostly enable_apicv; extern int __read_mostly pt_mode; @@ -55,6 +56,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -128,6 +130,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -146,11 +154,6 @@ static inline bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } -static inline bool vmx_pku_supported(void) -{ - return boot_cpu_has(X86_FEATURE_PKU); -} - static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -253,6 +256,12 @@ static inline bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static inline bool cpu_has_vmx_pasid_trans(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PASID_TRANSLATION; +} + static inline bool vmx_waitpkg_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -272,6 +281,11 @@ static inline bool cpu_has_vmx_apicv(void) cpu_has_vmx_posted_intr(); } +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 72359709cdc1741beff8656b20f0432895388717..f4d77f4786fc98aaaac2332d0e80c38f21e74dd2 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -305,8 +305,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { + vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmcs_conf->cpu_based_3rd_exec_ctrl = 0; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 07ebf6882a458ab7f85c03d0dd22aa759e6085eb..892504b1e96503be4871b7b56d41606975570fae 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -49,6 +49,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); */ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define EVMCS1_UNSUPPORTED_2NDEXEC \ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 
3f63bd7421aca6b3fe663311f4c1d0504b71a742..e9ca618c8ed42c11a03e4021d2adae231ea5d935 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3058,7 +3058,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool evaluate_pending_interrupts; - u32 exit_reason = EXIT_REASON_INVALID_STATE; + union vmx_exit_reason exit_reason = { + .basic = EXIT_REASON_INVALID_STATE, + .failed_vmentry = 1, + }; u32 exit_qual; evaluate_pending_interrupts = exec_controls_get(vmx) & @@ -3118,7 +3121,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, goto vmentry_fail_vmexit_guest_mode; if (from_vmentry) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; exit_qual = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); @@ -3186,7 +3189,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); - vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; + vmcs12->vm_exit_reason = exit_reason.full; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; @@ -4407,7 +4410,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (pt_mode == PT_MODE_HOST_GUEST) { vmx->pt_desc.guest.ctl = 0; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); } return 0; @@ -5113,7 +5116,12 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); fail: - nested_vmx_vmexit(vcpu, vmx->exit_reason, + /* + * This is effectively a reflected VM-Exit, as opposed to a synthesized + * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode + * EXIT_REASON_VMFUNC as the exit reason. + */ + nested_vmx_vmexit(vcpu, vmx->exit_reason.full, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); return 1; @@ -5181,7 +5189,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. */ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, u32 exit_reason) + struct vmcs12 *vmcs12, + union vmx_exit_reason exit_reason) { u32 msr_index = kvm_rcx_read(vcpu); gpa_t bitmap; @@ -5195,7 +5204,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; - if (exit_reason == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; @@ -5325,7 +5334,8 @@ static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, * should handle it ourselves in L0 (and then continue L2). Only call this * when in is_guest_mode (L2). 
*/ -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) +bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason) { u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5354,14 +5364,14 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) */ nested_mark_vmcs12_pages_dirty(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason.full, vmcs_readl(EXIT_QUALIFICATION), vmx->idt_vectoring_info, intr_info, vmcs_read32(VM_EXIT_INTR_ERROR_CODE), KVM_ISA_VMX); - switch ((u16)exit_reason) { + switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: if (is_nmi(intr_info)) return false; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index b8521c451bb0413ae14171e17032ee244c338eee..7e960f084c521b2b2f4ca06d88dffe88e48ef4de 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -24,7 +24,8 @@ void nested_vmx_vcpu_setup(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); +bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); @@ -74,7 +75,7 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) * Reflect a VM Exit into L1. */ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, - u32 exit_reason) + union vmx_exit_reason exit_reason) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); @@ -83,7 +84,7 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT * we need to query the in-kernel LAPIC. 
*/ - WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); + WARN_ON(exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT); if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { @@ -93,7 +94,7 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, vmcs_read32(VM_EXIT_INTR_ERROR_CODE); } - nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, + nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, vmcs_readl(EXIT_QUALIFICATION)); return 1; } diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 481ad879197b867f2e46927a8c5c5cbf501918d4..962806a4c47818d81e3b9281a9731e70917c41a0 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -48,6 +48,7 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u64 tertiary_exec; }; /* @@ -111,6 +112,11 @@ static inline bool is_machine_check(u32 intr_info) (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); } +static inline bool is_nm_fault(u32 intr_info) +{ + return is_exception_n(intr_info, NM_VECTOR); +} + /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0564c05c1ce0588e7b323312c48236780aa7914f..6c79798d2da2073e669a71af624996aae08c68b0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -33,7 +35,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -51,6 +54,7 @@ #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" +#include "irq.h" #include "mmu.h" #include "nested.h" #include "ops.h" @@ -98,6 +102,9 @@ module_param(fasteoi, bool, S_IRUGO); bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -106,8 +113,6 @@ module_param(enable_apicv, bool, S_IRUGO); static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); -static u64 __read_mostly host_xss; - bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); @@ -343,7 +348,7 @@ module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); void vmx_vmexit(void); @@ -779,6 +784,14 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; + /* + * Disabling xfd interception indicates that dynamic xfeatures + * might be used in the guest. Always trap #NM in this case + * so that the guest's xfd_err can be saved in a timely manner.
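+	 * The sequence (see the MSR_IA32_XFD handling in vmx_set_msr()
+	 * below): the first guest WRMSR(IA32_XFD) with a non-zero value
+	 * disables write interception, sets xfd_no_write_intercept and
+	 * calls back into update_exception_bitmap(), which then traps #NM
+	 * here.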
+ */ + if (vcpu->arch.xfd_no_write_intercept) + eb |= (1u << NM_VECTOR); + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1377,13 +1390,31 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx->host_debugctlmsr = get_debugctlmsr(); } +static bool vmx_can_use_vtd_pi(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm) && enable_apicv && + kvm_arch_has_assigned_device(kvm) && + irq_remapping_cap(IRQ_POSTING_CAP); +} + +static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) +{ + /* + * The default posted interrupt vector does nothing when + * invoked outside guest mode. Return whether a blocked vCPU + * can be the target of posted interrupts, as is the case when + * using either IPI virtualization or VT-d PI, so that the + * notification vector is switched to the one that calls + * back to the pi_wakeup_handler() function. + */ + return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); +} + static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_needs_pi_wakeup(vcpu)) return; /* Set SN when the vCPU is preempted */ @@ -1539,6 +1570,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +{ + return true; +} + static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip, orig_rip; @@ -1552,7 +1588,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { + to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { orig_rip = kvm_rip_read(vcpu); rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); #ifdef CONFIG_X86_64 @@ -1631,11 +1667,6 @@ static bool vmx_rdtscp_supported(void) return cpu_has_vmx_rdtscp(); } -static bool vmx_invpcid_supported(void) -{ - return cpu_has_vmx_invpcid(); -} - /* * Swap MSR entry in host/guest MSR entry array. 
*/ @@ -1741,6 +1772,24 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, return !(val & ~valid_bits); } +static void __maybe_unused vmx_get_xsave_msr(struct msr_data *msr_info) +{ + local_irq_disable(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + rdmsrl(msr_info->index, msr_info->data); + local_irq_enable(); +} + +static void __maybe_unused vmx_set_xsave_msr(struct msr_data *msr_info) +{ + local_irq_disable(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + wrmsrl(msr_info->index, msr_info->data); + local_irq_enable(); +} + static int vmx_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { @@ -1824,14 +1873,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) - return 1; - msr_info->data = vcpu->arch.ia32_xss; - break; case MSR_IA32_RTIT_CTL: if (pt_mode != PT_MODE_HOST_GUEST) return 1; @@ -1925,6 +1966,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KERNEL_GS_BASE: vmx_write_guest_kernel_gs_base(vmx, data); break; + case MSR_IA32_XFD: + ret = kvm_set_msr_common(vcpu, msr_info); + /* + * Always intercepting WRMSR could incur non-negligible + * overhead given xfd might be changed frequently in + * guest context switch. Disable write interception + * upon the first write with a non-zero value (indicating + * potential usage on dynamic xfeatures). Also update + * exception bitmap to trap #NM for proper virtualization + * of guest xfd_err. + */ + if (!ret && data) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, + MSR_TYPE_RW); + vcpu->arch.xfd_no_write_intercept = true; + update_exception_bitmap(vcpu); + } + break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu)) @@ -1993,7 +2052,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; @@ -2022,8 +2081,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, - MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) @@ -2066,25 +2124,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) - return 1; - /* - * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. 
- */ - if (data != 0) - return 1; - vcpu->arch.ia32_xss = data; - if (vcpu->arch.ia32_xss != host_xss) - add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_XSS); - break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || vmx_rtit_ctl_check(vcpu, data) || @@ -2092,7 +2131,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if ((pt_mode != PT_MODE_HOST_GUEST) || @@ -2314,6 +2353,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2322,6 +2370,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; @@ -2343,7 +2392,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2377,6 +2427,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_ENCLS_EXITING; + if (boot_cpu_has(X86_FEATURE_ENQCMD)) + opt2 |= SECONDARY_EXEC_PASID_TRANSLATION; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2415,6 +2467,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, "1-setting enable VPID VM-execution control\n"); } + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); + } + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; @@ -2502,6 +2561,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -3581,9 +3641,11 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) @@ -3619,9 +3681,11 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit } } -static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, 
+static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) @@ -3657,13 +3721,13 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm } } -static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type, bool value) +static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value) { if (value) - vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + vmx_enable_intercept_for_msr(vcpu, msr, type); else - vmx_disable_intercept_for_msr(msr_bitmap, msr, type); + vmx_disable_intercept_for_msr(vcpu, msr, type); } static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) @@ -3681,8 +3745,8 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, - u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, + unsigned long *msr_bitmap, u8 mode) { int msr; @@ -3697,11 +3761,13 @@ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. */ - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } } @@ -3717,30 +3783,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) return; if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + vmx_update_msr_bitmap_x2apic(vcpu, msr_bitmap, mode); vmx->msr_bitmap_mode = mode; } -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + struct vcpu_vmx *vmx = to_vmx(vcpu); bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); u32 i; - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, - MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); for (i = 0; i < vmx->pt_desc.addr_range; i++) { - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_B 
+ i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } } @@ -3957,15 +4017,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + + if (kvm_vcpu_apicv_active(vcpu)) { + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); } if (cpu_has_vmx_msr_bitmap()) @@ -3998,6 +4062,19 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; +} static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { @@ -4044,6 +4121,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + vcpu->arch.xsaves_enabled = xsaves_enabled; + if (!xsaves_enabled) exec_control &= ~SECONDARY_EXEC_XSAVES; @@ -4072,7 +4151,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_invpcid_supported()) { + if (cpu_has_vmx_invpcid()) { /* Exposing INVPCID only when PCID is exposed */ bool invpcid_enabled = guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && @@ -4140,6 +4219,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } + if (cpu_has_vmx_pasid_trans()) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_ENQCMD)) + exec_control &= ~SECONDARY_EXEC_PASID_TRANSLATION; + } + vmx->secondary_exec_control = exec_control; } @@ -4153,6 +4237,35 @@ static void ept_set_mmio_spte_mask(void) VMX_EPT_MISCONFIG_WX_VALUE, 0); } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 /* @@ -4160,6 +4273,8 @@ static void ept_set_mmio_spte_mask(void) */ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); int i; if (nested) @@ 
-4181,6 +4296,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) secondary_exec_controls_set(vmx, vmx->secondary_exec_control); } + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); @@ -4193,7 +4311,12 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; @@ -4605,6 +4728,26 @@ static int handle_machine_check(struct kvm_vcpu *vcpu) return 1; } +/* + * If the host has split lock detection disabled, then #AC is + * unconditionally injected into the guest, which is the pre split lock + * detection behaviour. + * + * If the host has split lock detection enabled then #AC is + * only injected into the guest when: + * - Guest CPL == 3 (user mode) + * - Guest has #AC detection enabled in CR0 + * - Guest EFLAGS has AC bit set + */ +static inline bool guest_inject_ac(struct kvm_vcpu *vcpu) +{ + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return true; + + return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4619,6 +4762,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ + /* + * Queue the exception here instead of in handle_nm_fault_irqoff(). + * This ensures the nested_vmx check is not skipped so vmexit can + * be reflected to L1 (when it intercepts #NM) before reaching this + * point. + */ + if (is_nm_fault(intr_info)) { + kvm_queue_exception(vcpu, NM_VECTOR); + return 1; + } + if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); @@ -4670,9 +4824,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { - case AC_VECTOR: - kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); - return 1; case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & @@ -4701,6 +4852,20 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; + case AC_VECTOR: + if (guest_inject_ac(vcpu)) { + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; + } + + /* + * Handle split lock. Depending on detection mode this will + * either warn and disable split lock detection for this + * task or force SIGBUS on it. 
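guest_inject_ac() above folds the split-lock policy into one predicate. A userspace sketch of the same decision, with plain inputs replacing boot_cpu_has(), vmx_get_cpl(), kvm_read_cr0_bits() and kvm_get_rflags():

#include <stdbool.h>
#include <stdio.h>

static bool should_inject_ac(bool host_sld_on, int guest_cpl,
                             bool cr0_am, bool eflags_ac)
{
        /* Host split-lock detection off: forward #AC unconditionally. */
        if (!host_sld_on)
                return true;
        /* Otherwise only a legacy alignment-check #AC goes to the guest. */
        return guest_cpl == 3 && cr0_am && eflags_ac;
}

int main(void)
{
        printf("legacy #AC: %d\n", should_inject_ac(true, 3, true, true));
        printf("split-lock #AC: %d\n", should_inject_ac(true, 0, false, false));
        return 0;
}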
+ */ + if (handle_guest_split_lock(kvm_rip_read(vcpu))) + return 1; + fallthrough; default: kvm_run->exit_reason = KVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; break; } return 0; } @@ -5094,9 +5259,16 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) static int handle_apic_write(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 offset = exit_qualification & 0xfff; - /* APIC-write VM exit is trap-like and thus no need to adjust IP */ + /* + * APIC-write VM-Exit is trap-like, so KVM doesn't need to advance RIP, + * and hardware has already done any necessary aliasing, offset + * adjustments, etc. for the access. I.e. the correct value has already + * been written to the vAPIC page for the correct 16-byte chunk. KVM + * needs only to retrieve the register value and emulate the access. + */ + u32 offset = exit_qualification & 0xff0; + kvm_apic_write_nodecode(vcpu, offset); return 1; } @@ -5552,6 +5724,27 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } +static int handle_enqcmd_pasid(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + kvm_debug_ratelimited("[%s] VM exit_qualification=0x%lx\n", __func__, + vmcs_readl(EXIT_QUALIFICATION)); + + /* + * A valid PASID translation should exist before the guest attempts to + * execute ENQCMD/ENQCMDS. Otherwise, a VM exit is triggered when the + * CPU executes ENQCMD/ENQCMDS in non-root mode but fails to translate + * the guest PASID latched in the IA32_PASID MSR. In that case, set + * EFLAGS.ZF to indicate the failure to the guest and skip the + * instruction. + */ + flags = vmx_get_rflags(vcpu); + flags |= X86_EFLAGS_ZF; + vmx_set_rflags(vcpu, flags); + + return kvm_skip_emulated_instruction(vcpu); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume.
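The 0xff0 mask in handle_apic_write() above is the visible effect of the new comment: only the 16-byte-aligned register chunk is reported, not the byte offset. A tiny standalone illustration:

#include <stdint.h>
#include <stdio.h>

static uint32_t apic_write_offset(uint64_t exit_qualification)
{
        /* Keep the 16-byte-aligned register offset within the 4K vAPIC page. */
        return (uint32_t)exit_qualification & 0xff0;
}

int main(void)
{
        /* A write reported at page offset 0x83c maps to register chunk 0x830. */
        printf("%#x\n", apic_write_offset(0x83c));
        return 0;
}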
Otherwise they set the kvm_run parameter to indicate what needs @@ -5608,6 +5801,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, + [EXIT_REASON_ENQCMD_PASID] = handle_enqcmd_pasid, + [EXIT_REASON_ENQCMDS_PASID] = handle_enqcmd_pasid, }; static const int kvm_vmx_max_exit_handlers = @@ -5696,6 +5891,7 @@ void dump_vmcs(void) { u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; u64 efer; int i, n; @@ -5711,9 +5907,16 @@ void dump_vmcs(void) pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); efer = vmcs_read64(GUEST_IA32_EFER); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("*** Guest State ***\n"); pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", @@ -5797,9 +6000,10 @@ void dump_vmcs(void) vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), @@ -5855,13 +6059,15 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + u16 exit_handler_index; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + trace_kvm_exit(exit_reason.full, vcpu, KVM_ISA_VMX); /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more @@ -5880,11 +6086,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) return nested_vmx_reflect_vmexit(vcpu, exit_reason); - if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + if (exit_reason.failed_vmentry) { dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason - = exit_reason; + = exit_reason.full; return 0; } @@ -5904,18 +6110,18 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) * will cause infinite loop. 
*/ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_PML_FULL && - exit_reason != EXIT_REASON_APIC_ACCESS && - exit_reason != EXIT_REASON_TASK_SWITCH)) { + (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && + exit_reason.basic != EXIT_REASON_EPT_VIOLATION && + exit_reason.basic != EXIT_REASON_PML_FULL && + exit_reason.basic != EXIT_REASON_APIC_ACCESS && + exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[1] = exit_reason.full; vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; - if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { vcpu->run->internal.ndata++; vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -5942,18 +6148,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } } - if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { + exit_handler_index = array_index_nospec((u16)exit_reason.basic, + kvm_vmx_max_exit_handlers); + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + skip_emulated_instruction(vcpu); + return 1; + } else if (exit_reason.basic < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_handler_index]) { + return kvm_vmx_exit_handlers[exit_handler_index](vcpu); + } else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason); + exit_reason.full); dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; vcpu->run->internal.ndata = 1; - vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[0] = exit_reason.full; return 0; } } @@ -6210,20 +6421,41 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupts are enabled, so that + * the MSR value is not clobbered by host activity before the guest + * has a chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing the exception is done in vmx_handle_exit. See the comment there.
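handle_nm_fault_irqoff(), shown just above and continued below, snapshots MSR_IA32_XFD_ERR only while interrupts are still off and only when the guest's XFD is non-zero. A compilable model of that guard; rdmsr_xfd_err() is a fake stand-in for rdmsrl(MSR_IA32_XFD_ERR, ...):

#include <stdint.h>
#include <stdio.h>

struct guest_fpu { uint64_t xfd; uint64_t xfd_err; };

/* Fake stand-in for rdmsrl(MSR_IA32_XFD_ERR, ...). */
static uint64_t rdmsr_xfd_err(void)
{
        return 1ull << 18;      /* pretend: XTILEDATA fault recorded */
}

static void save_xfd_err_on_nm(struct guest_fpu *fpu)
{
        /*
         * Only read XFD_ERR when the guest's XFD is non-zero; otherwise
         * the #NM cannot be an XFD-induced fault (or XFD is unsupported).
         */
        if (fpu->xfd)
                fpu->xfd_err = rdmsr_xfd_err();
}

int main(void)
{
        struct guest_fpu fpu = { .xfd = 1ull << 18, .xfd_err = 0 };

        save_xfd_err_on_nm(&fpu);
        printf("xfd_err=%#llx\n", (unsigned long long)fpu.xfd_err);
        return 0;
}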
+ */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) + if (is_page_fault(vmx->exit_intr_info)) { vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); - + /* if exit due to NM, handle before interrupts are enabled */ + } else if (is_nm_fault(vmx->exit_intr_info)) { + handle_nm_fault_irqoff(&vmx->vcpu); /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(vmx->exit_intr_info)) + } else if (is_machine_check(vmx->exit_intr_info)) { kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(vmx->exit_intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@ -6280,9 +6512,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); - else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); } @@ -6478,10 +6710,11 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; + enum exit_fastpath_completion exit_fastpath; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -6491,7 +6724,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) - return; + return EXIT_FASTPATH_NONE; if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -6529,7 +6762,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - kvm_load_guest_xcr0(vcpu); + kvm_load_guest_xsave_state(vcpu); pt_guest_enter(vmx); @@ -6618,23 +6851,30 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_exit(vmx); - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; - vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); - if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + vmx->exit_reason.full = vmx->fail ? 
0xdead : vmcs_read32(VM_EXIT_REASON); + if ((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY) kvm_machine_check(); - if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - return; + if (vmx->fail || (vmx->exit_reason.failed_vmentry)) + return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); + + if (!is_guest_mode(vcpu) && vmx->exit_reason.basic == EXIT_REASON_MSR_WRITE) + exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); + else + exit_fastpath = EXIT_FASTPATH_NONE; + + return exit_fastpath; } static struct kvm *vmx_vm_alloc(void) @@ -6666,8 +6906,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vmx); } @@ -6675,6 +6914,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; struct vcpu_vmx *vmx; + struct kvm_vcpu *vcpu; unsigned long *msr_bitmap; int cpu; @@ -6685,20 +6925,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; - goto free_partial_vcpu; - } - - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.guest_fpu) { + if (!fpu_alloc_guest_fpstate(&vmx->vcpu.arch.guest_fpu)) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; - goto free_user_fpu; + goto free_partial_vcpu; } vmx->vpid = allocate_vpid(); @@ -6733,19 +6963,21 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_msrs; msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_TSC, MSR_TYPE_R); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(kvm)) { - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + 
vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } + if (cpu_has_vmx_pasid_trans()) + vmx_disable_intercept_for_msr(&vmx->vcpu, MSR_IA32_PASID, MSR_TYPE_RW); vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; @@ -6787,6 +7019,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->ept_pointer = INVALID_PAGE; + vcpu = &vmx->vcpu; + + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return &vmx->vcpu; free_vmcs: @@ -6799,9 +7037,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: free_vpid(vmx->vpid); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); + fpu_free_guest_fpstate(&vmx->vcpu.arch.guest_fpu); free_partial_vcpu: kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); @@ -6810,9 +7046,253 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" +static inline struct page *vmx_pasid_dir_page(struct kvm_vmx *kvm_vmx, + ioasid_t gpasid) +{ + if (!kvm_vmx->pasid_dirs) + return NULL; + + return pasid_high_dir_select(gpasid) ? 
+ &kvm_vmx->pasid_dirs[1] : &kvm_vmx->pasid_dirs[0]; +} + +/* Hold pasid_lock when invoking this function. */ +static u32 *vmx_find_pasid_table_entry(struct kvm_vmx *kvm_vmx, ioasid_t gpasid) +{ + struct page *pd_page = vmx_pasid_dir_page(kvm_vmx, gpasid); + u64 *pd_base, *pde; + u32 *pt_base; + + if (!pd_page) { + kvm_err("%s: PASID directory not found\n", __func__); + return ERR_PTR(-ENOENT); + } + + pd_base = page_address(pd_page); + pde = pd_base + pasid_de_idx(gpasid); + + if (!pasid_de_table_present(pde)) + return NULL; + + pt_base = __va(pasid_de_table_ptr(pde)); + return pt_base + pasid_te_idx(gpasid); +} + +/* Hold pasid_lock when invoking this function. */ +static u32 *vmx_alloc_pasid_table_entry(struct kvm_vmx *kvm_vmx, + ioasid_t gpasid) +{ + struct page *pt_page, *pd_page = vmx_pasid_dir_page(kvm_vmx, gpasid); + u64 *pd_base, *pde; + u32 *pt_base; + + if (!pd_page) { + kvm_err("%s: PASID directory not found\n", __func__); + return ERR_PTR(-ENOENT); + } + + pd_base = page_address(pd_page); + pde = pd_base + pasid_de_idx(gpasid); + + pt_page = alloc_page(GFP_ATOMIC | __GFP_ZERO); + if (!pt_page) { + kvm_err("%s: failed to allocate PASID table entry\n", __func__); + return ERR_PTR(-ENOMEM); + } + + pt_base = page_address(pt_page); + *pde = (u64)page_to_phys(pt_page) | PASID_DE_TAB_PRESENT; + return pt_base + pasid_te_idx(gpasid); +} + +static int vmx_set_pasid_trans(struct kvm_vmx *kvm_vmx, ioasid_t gpasid, + ioasid_t hpasid) +{ + int ret = 0; + u32 *pte; + + spin_lock(&kvm_vmx->pasid_lock); + + pte = vmx_find_pasid_table_entry(kvm_vmx, gpasid); + if (IS_ERR(pte)) { + ret = PTR_ERR(pte); + goto done; + } else if (!pte) { + pte = vmx_alloc_pasid_table_entry(kvm_vmx, gpasid); + if (IS_ERR(pte)) { + ret = PTR_ERR(pte); + goto done; + } + } + + WARN_ON(pasid_te_hpasid_valid(pte)); + + ioasid_get_locked(NULL, hpasid); + *pte = hpasid | PASID_TE_VALID; + +done: + kvm_clear_block_vmentry_request(&kvm_vmx->kvm); + spin_unlock(&kvm_vmx->pasid_lock); + return ret; +} + +static int vmx_clear_pasid_trans(struct kvm_vmx *kvm_vmx, ioasid_t gpasid, + ioasid_t hpasid) +{ + ioasid_t old_hpasid; + int ret = 0; + u32 *pte; + + spin_lock(&kvm_vmx->pasid_lock); + + kvm_make_block_vmentry_request(&kvm_vmx->kvm); + + pte = vmx_find_pasid_table_entry(kvm_vmx, gpasid); + if (IS_ERR(pte)) { + ret = PTR_ERR(pte); + goto done; + } else if (!pte || !pasid_te_hpasid_valid(pte)) { + WARN_ON(1); + goto done; + } + + old_hpasid = pasid_te_hpasid(pte); + WARN_ON(old_hpasid != hpasid); + + kvm_make_block_vmentry_request(&kvm_vmx->kvm); + + *pte = 0; + ioasid_put_locked(NULL, old_hpasid); + + kvm_clear_block_vmentry_request(&kvm_vmx->kvm); +done: + spin_unlock(&kvm_vmx->pasid_lock); + return ret; +} + +static int vmx_handle_ioasid_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct ioasid_nb_args *args = (struct ioasid_nb_args *)data; + struct kvm_vmx *kvm_vmx = container_of(nb, struct kvm_vmx, pasid_nb); + ioasid_t gpasid = args->spid; + ioasid_t hpasid = args->id; + int r = -1; + + kvm_debug("%s: event %lu hpasid %u gpasid %u\n", __func__, + event, hpasid, gpasid); + + if (hpasid > MAX_PASID || gpasid > MAX_PASID) + return NOTIFY_DONE; + + switch (event) { + case IOASID_NOTIFY_BIND: + r = vmx_set_pasid_trans(kvm_vmx, gpasid, hpasid); + break; + case IOASID_NOTIFY_UNBIND: + r = vmx_clear_pasid_trans(kvm_vmx, gpasid, hpasid); + break; + } + + return r ?
NOTIFY_DONE : NOTIFY_OK; +} + +#define PASID_DIRS_ORDER 1 + +static void vmx_vcpu_pasid_trans_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + struct mm_struct *mm = get_task_mm(current); + int ret = 0; + + /* + * Initialize a per-VM PASID translation table and start monitoring + * IOASID events. + */ + spin_lock(&kvm_vmx->pasid_lock); + if (kvm_vmx->pasid_dirs) { + /* skip this as the table has already been initialized */ + goto done; + } + + kvm_vmx->pasid_dirs = alloc_pages(GFP_ATOMIC | __GFP_ZERO, + PASID_DIRS_ORDER); + if (!kvm_vmx->pasid_dirs) { + kvm_err("%s: failed to allocate PASID directory\n", __func__); + ret = -ENOMEM; + goto done; + } + + kvm_vmx->pasid_nb.notifier_call = vmx_handle_ioasid_event; + kvm_vmx->pasid_nb.priority = IOASID_PRIO_CPU; + kvm_vmx->mm = mm; + + ret = ioasid_register_notifier_mm(kvm_vmx->mm, &kvm_vmx->pasid_nb); + if (ret) { + __free_pages(kvm_vmx->pasid_dirs, PASID_DIRS_ORDER); + kvm_vmx->pasid_dirs = NULL; + } + +done: + spin_unlock(&kvm_vmx->pasid_lock); + mmput(mm); + + if (!ret) { + vmcs_write64(PASID_DIR0, page_to_phys(&kvm_vmx->pasid_dirs[0])); + vmcs_write64(PASID_DIR1, page_to_phys(&kvm_vmx->pasid_dirs[1])); + } +} + +static void vmx_vm_pasid_tables_free(struct page *pd_page) +{ + u64 *pde, *pd_base = page_address(pd_page); + u32 *pte, *pt_base; + ioasid_t hpasid; + + /* + * Before freeing the PASID translation table, traverse all table entries + * to make sure that KVM doesn't hold a reference to any hpasid. + */ + for (pde = pd_base; pde < pd_base + PASID_DE_NUM; pde++) { + if (!pasid_de_table_present(pde)) + continue; + + pt_base = __va(pasid_de_table_ptr(pde)); + + for (pte = pt_base; pte < pt_base + PASID_TE_NUM; pte++) { + if (pasid_te_hpasid_valid(pte)) { + hpasid = pasid_te_hpasid(pte); + + /* + * Drop the reference on this hpasid and + * remove it from the PASID translation table.
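The two-level walk used by vmx_find_pasid_table_entry() and vmx_vm_pasid_tables_free() implies a split of the guest PASID into a directory select, a directory-entry index and a table-entry index. The pasid_high_dir_select()/pasid_de_idx()/pasid_te_idx() helpers are not part of this hunk, so the bit widths below are an assumption chosen only to be consistent with the sizes used here (two 4K directory pages of u64 entries, 4K tables of u32 entries, 20-bit PASIDs):

#include <stdint.h>
#include <stdio.h>

/* Assumed split: bit 19 -> directory page, bits 18:10 -> PDE, bits 9:0 -> PTE. */
static unsigned high_dir_select(uint32_t gpasid) { return (gpasid >> 19) & 0x1; }
static unsigned de_idx(uint32_t gpasid)          { return (gpasid >> 10) & 0x1ff; }
static unsigned te_idx(uint32_t gpasid)          { return gpasid & 0x3ff; }

int main(void)
{
        uint32_t gpasid = 0x812a5;

        printf("dir=%u pde=%u pte=%u\n", high_dir_select(gpasid),
               de_idx(gpasid), te_idx(gpasid));
        return 0;
}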
+ */ + *pte = 0; + ioasid_put(NULL, hpasid); + } + } + + *pde = 0; + free_page((unsigned long)pt_base); + } +} + +static void vmx_vm_pasid_trans_destroy(struct kvm_vmx *kvm_vmx) +{ + if (!kvm_vmx->pasid_dirs) + return; + + ioasid_unregister_notifier_mm(kvm_vmx->mm, &kvm_vmx->pasid_nb); + + vmx_vm_pasid_tables_free(&kvm_vmx->pasid_dirs[0]); + vmx_vm_pasid_tables_free(&kvm_vmx->pasid_dirs[1]); + __free_pages(kvm_vmx->pasid_dirs, PASID_DIRS_ORDER); +} + static int vmx_vm_init(struct kvm *kvm) { spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + spin_lock_init(&to_kvm_vmx(kvm)->pasid_lock); if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -6843,6 +7323,16 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (cpu_has_vmx_pasid_trans()) + vmx_vm_pasid_trans_destroy(kvm_vmx); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + static int __init vmx_check_processor_compat(void) { struct vmcs_config vmcs_conf; @@ -6949,27 +7439,27 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); - cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); - cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); - cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); - cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); - cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); - cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); - cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); - cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); - cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); - cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); - cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); - cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); - cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); - cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); - cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); - cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); - cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, 
feature_bit(UMIP)); #undef cr4_fixed1_update } @@ -7064,6 +7554,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ + vcpu->arch.xsaves_enabled = false; + if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); vmcs_set_secondary_exec_control(vmx); @@ -7084,12 +7577,61 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); + + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + + if (cpu_has_vmx_pasid_trans() && + guest_cpuid_has(vcpu, X86_FEATURE_ENQCMD)) + vmx_vcpu_pasid_trans_init(vcpu); +} + +/* + * Vendor specific emulation must be handled via ->set_supported_cpuid(), not + * vmx_set_cpu_caps(), as capabilities configured during hardware_setup() are + * masked against hardware/kernel support, i.e. they'd be lost. + */ +static void vmx_set_supported_cpuid(struct kvm_cpuid_entry2 *entry) +{ } -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static __init void vmx_set_cpu_caps(void) { - if (func == 1 && nested) - entry->ecx |= bit(X86_FEATURE_VMX); + kvm_set_cpu_caps(); + + /* CPUID 0x1 */ + if (nested) + kvm_cpu_cap_set(X86_FEATURE_VMX); + + /* CPUID 0x7 */ + if (kvm_mpx_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); + if (cpu_has_vmx_invpcid()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); + if (vmx_pt_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + + /* PKU is not yet implemented for shadow paging. */ + if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE)) + kvm_cpu_cap_check_and_set(X86_FEATURE_PKU); + + if (vmx_umip_emulated()) + kvm_cpu_cap_set(X86_FEATURE_UMIP); + + /* CPUID 0xD.1 */ + supported_xss = 0; + if (!vmx_xsaves_supported()) + kvm_cpu_cap_clear(X86_FEATURE_XSAVES); + + /* CPUID 0x80000001 */ + if (!cpu_has_vmx_rdtscp()) + kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); + + if (!cpu_has_vmx_pasid_trans()) + kvm_cpu_cap_clear(X86_FEATURE_ENQCMD); + else if (kvm_cpu_cap_has(X86_FEATURE_ENQCMD)) + supported_xss |= XFEATURE_MASK_PASID; } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -7365,11 +7907,6 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - WARN_ON(irqs_disabled()); local_irq_disable(); if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { @@ -7465,9 +8002,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct vcpu_data vcpu_info; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) + if (!vmx_can_use_vtd_pi(kvm)) return 0; idx = srcu_read_lock(&kvm->irq_srcu); @@ -7597,11 +8132,6 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } -static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) -{ - return false; -} - static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon; @@ -7632,9 +8162,6 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - if 
(!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7693,6 +8220,9 @@ static __init int hardware_setup(void) kvm_x86_ops->sync_pir_to_irr = NULL; } + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; @@ -7768,6 +8298,8 @@ static __init int hardware_setup(void) return r; } + vmx_set_cpu_caps(); + r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); @@ -7794,9 +8326,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_emulated_msr = vmx_has_emulated_msr, .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, .vm_alloc = vmx_vm_alloc, .vm_free = vmx_vm_free, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, @@ -7878,7 +8412,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpuid_update = vmx_cpuid_update, .rdtscp_supported = vmx_rdtscp_supported, - .invpcid_supported = vmx_invpcid_supported, .set_supported_cpuid = vmx_set_supported_cpuid, @@ -7892,10 +8425,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, .mpx_supported = vmx_mpx_supported, - .xsaves_supported = vmx_xsaves_supported, - .umip_emulated = vmx_umip_emulated, .pt_supported = vmx_pt_supported, - .pku_supported = vmx_pku_supported, .request_immediate_exit = vmx_request_immediate_exit, @@ -7932,7 +8462,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_vmcs12_pages = NULL, .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, }; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a1919ec7fd1082c43a6c067f42b0a54ba0b9b5d4..965d4dc31674f1d171f5939a17907c366c5333e0 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -85,6 +85,29 @@ struct pt_desc { struct pt_ctx guest; }; +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 reserved26 : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. 
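The union vmx_exit_reason added to vmx.h above lets the exit-reason word be read either whole (.full) or by field. A standalone decode example; it assumes LSB-first bitfield packing, which is what this layout relies on for x86:

#include <stdint.h>
#include <stdio.h>

/* Same layout as the union above, with the reserved gap collapsed. */
union exit_reason {
        struct {
                uint32_t basic             : 16;
                uint32_t reserved          : 11;
                uint32_t enclave_mode      : 1;
                uint32_t smi_pending_mtf   : 1;
                uint32_t smi_from_vmx_root : 1;
                uint32_t reserved30        : 1;
                uint32_t failed_vmentry    : 1;
        };
        uint32_t full;
};

int main(void)
{
        /* 0x80000021: failed VM-entry (bit 31) with basic reason 0x21. */
        union exit_reason er = { .full = 0x80000021 };

        printf("basic=%#x failed_vmentry=%u\n",
               (unsigned)er.basic, (unsigned)er.failed_vmentry);
        return 0;
}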
@@ -245,7 +268,7 @@ struct vcpu_vmx { int vpid; bool emulation_required; - u32 exit_reason; + union vmx_exit_reason exit_reason; /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -299,6 +322,13 @@ struct kvm_vmx { enum ept_pointers_status ept_pointers_match; spinlock_t ept_pointer_lock; + + struct page *pasid_dirs; + spinlock_t pasid_lock; + struct notifier_block pasid_nb; + struct mm_struct *mm; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -331,12 +361,14 @@ bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 +#define PID_TABLE_ENTRY_VALID 1 + static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, @@ -400,31 +432,36 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return vmx->loaded_vmcs->controls_shadow.lname; \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } -BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) -BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) -BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) static inline void vmx_segment_cache_clear(struct vcpu_vmx 
*vmx) { @@ -518,4 +555,9 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) void dump_vmcs(void); +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 153659e8f40394715da82c98f3d818342b4b8b67..b3e0518586897addabe6e078af995dc79321d39e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -60,8 +60,11 @@ #include #include #include +#include #include -#include /* Ugh! */ +#include +#include +#include #include #include #include @@ -78,6 +81,8 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) + #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -108,6 +113,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + struct kvm_x86_ops *kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -179,6 +187,11 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +u64 __read_mostly host_xss; +EXPORT_SYMBOL_GPL(host_xss); +u64 __read_mostly supported_xss; +EXPORT_SYMBOL_GPL(supported_xss); + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -224,9 +237,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -struct kmem_cache *x86_fpu_cache; -EXPORT_SYMBOL_GPL(x86_fpu_cache); - static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) @@ -358,6 +368,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } kvm_lapic_set_base(vcpu, msr_info->data); + kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); @@ -527,6 +538,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ + pr_info("inject a TF to VM, current_nr:%x\n", nr); + WARN_ON(1); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -539,6 +552,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, * exception.pending = true so that the double fault * can trigger a nested vmexit. 
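For reference on the exception-merging logic instrumented in kvm_multiple_exception() above: a second contributory exception escalates to #DF, and a fault while #DF is pending escalates to a triple fault. The sketch below is a deliberately reduced model of that ladder (the real function also handles payloads, reinjection and the contributory-class checks):

#include <stdio.h>

#define DF_VECTOR 8

enum outcome { DELIVER, ESCALATE_TO_DF, TRIPLE_FAULT };

/*
 * Reduced model: pending_nr is the exception already queued (-1 if none),
 * nr is the new one; both are assumed to be contributory.
 */
static enum outcome merge_exception(int pending_nr, int nr)
{
        (void)nr;
        if (pending_nr < 0)
                return DELIVER;          /* nothing pending: just queue it */
        if (pending_nr == DF_VECTOR)
                return TRIPLE_FAULT;     /* #DF + fault -> shutdown */
        return ESCALATE_TO_DF;           /* two contributory faults -> #DF */
}

int main(void)
{
        printf("%d %d %d\n", merge_exception(-1, 14),
               merge_exception(14, 13), merge_exception(DF_VECTOR, 14));
        return 0;
}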
*/ + pr_info("inject a DF to VM, prev_nr:%x, current_nr:%x\n", + prev_nr, nr); + WARN_ON(1); vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; vcpu->arch.exception.has_error_code = true; @@ -604,8 +620,11 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { + WARN_ON_ONCE(fault->vector != PF_VECTOR); + if (mmu_is_nested(vcpu) && !fault->nested_page_fault) vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); else @@ -613,6 +632,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau return fault->nested_page_fault; } +EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -824,41 +844,48 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && vcpu->arch.pkru != vcpu->arch.host_pkru) - __write_pkru(vcpu->arch.pkru); + write_pkru(vcpu->arch.pkru); } -EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) - __write_pkru(vcpu->arch.host_pkru); + write_pkru(vcpu->arch.host_pkru); } - if (vcpu->guest_xcr0_loaded) { + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, host_xss); } + } -EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -893,6 +920,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } + + if ((xcr0 & XFEATURE_MASK_XTILE) && + ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE)) + return 1; + vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) @@ -931,10 +963,10 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) reserved_bits |= X86_CR4_PKE; if (!cpu_has(c, X86_FEATURE_LA57) && - !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) + !kvm_cpu_cap_has(X86_FEATURE_LA57)) reserved_bits |= X86_CR4_LA57; - if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) + if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_cpu_cap_has(X86_FEATURE_UMIP)) reserved_bits |= X86_CR4_UMIP; return reserved_bits; @@ -1242,6 +1274,16 @@ static const 
u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_IA32_XSS, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; @@ -1606,6 +1648,50 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +/* + * The fast path for frequent and performance-sensitive WRMSR emulation, + * i.e. sending IPIs. Handling the IPI send early in the VM-Exit flow + * reduces the latency of a virtual IPI by avoiding the expensive bits of + * transitioning from guest to host, e.g. reacquiring KVM's SRCU lock. This + * is in contrast to the other cases, which must be handled after interrupts + * are enabled on the host. + */ +static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +{ + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) + return 1; + + if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && + ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && + ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && + ((u32)(data >> 32) != X2APIC_BROADCAST)) + return kvm_x2apic_icr_write(vcpu->arch.apic, data); + return 1; +} + +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +{ + u32 msr = kvm_register_read(vcpu, VCPU_REGS_RCX); + u64 data; + int ret = 0; + + switch (msr) { + case APIC_BASE_MSR + (APIC_ICR >> 4): + data = kvm_read_edx_eax(vcpu); + ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + break; + default: + return EXIT_FASTPATH_NONE; + } + + if (!ret) { + trace_kvm_msr_write(msr, data); + return EXIT_FASTPATH_SKIP_EMUL_INS; + } + + return EXIT_FASTPATH_NONE; +} +EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); + /* * Adapt set_msr() to msr_io()'s calling convention */ @@ -2253,10 +2339,21 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } -void kvm_make_mclock_inprogress_request(struct kvm *kvm) +void kvm_make_block_vmentry_request(struct kvm *kvm) { - kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); + kvm_make_all_cpus_request(kvm, KVM_REQ_BLOCK_VMENTRY); } +EXPORT_SYMBOL_GPL(kvm_make_block_vmentry_request); + +void kvm_clear_block_vmentry_request(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_clear_request(KVM_REQ_BLOCK_VMENTRY, vcpu); +} +EXPORT_SYMBOL_GPL(kvm_clear_block_vmentry_request); static void kvm_gen_update_masterclock(struct kvm *kvm) { @@ -2266,7 +2363,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) struct kvm_arch *ka = &kvm->arch; spin_lock(&ka->pvclock_gtod_sync_lock); - kvm_make_mclock_inprogress_request(kvm); + kvm_make_block_vmentry_request(kvm); /* no guest entries from this point */ pvclock_update_vm_gtod_copy(kvm); @@ -2274,8 +2371,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_clear_block_vmentry_request(kvm);
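handle_fastpath_set_x2apic_icr_irqoff() above only accepts the cheap, common IPI shape. A standalone sketch that decodes the same conditions straight from the 64-bit x2APIC ICR value (delivery mode in bits 10:8, destination mode in bit 11, shorthand in bits 19:18, destination in bits 63:32); the kernel uses the APIC_* mask macros instead of open-coded shifts:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool icr_fastpath_eligible(uint64_t data)
{
        uint32_t dest = (uint32_t)(data >> 32);        /* x2APIC destination */
        bool no_shorthand = ((data >> 18) & 0x3) == 0; /* APIC_DEST_NOSHORT  */
        bool physical = ((data >> 11) & 0x1) == 0;     /* APIC_DEST_PHYSICAL */
        bool fixed = ((data >> 8) & 0x7) == 0;         /* APIC_DM_FIXED      */

        return no_shorthand && physical && fixed && dest != 0xffffffffu;
}

int main(void)
{
        /* Fixed-mode physical IPI, vector 0xf2, to APIC ID 3: eligible. */
        printf("%d\n", icr_fastpath_eligible(((uint64_t)3 << 32) | 0xf2));
        /* Broadcast destination: falls back to the slow path. */
        printf("%d\n", icr_fastpath_eligible(((uint64_t)0xffffffffu << 32) | 0xf2));
        return 0;
}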
spin_unlock(&ka->pvclock_gtod_sync_lock); #endif @@ -2793,6 +2889,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + /* + * We do support PT if kvm_x86_ops->pt_supported(), but we do + * not support IA32_XSS[bit 8]. Guests will have to use + * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT + * MSRs. + */ + if (data & ~vcpu->arch.guest_supported_xss) + return 1; + if (vcpu->arch.ia32_xss != data) { + vcpu->arch.ia32_xss = data; + kvm_update_cpuid(vcpu); + } + break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; @@ -2852,7 +2965,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: - if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; @@ -2932,6 +3045,30 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.msr_misc_features_enables = data; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~(XFEATURE_MASK_USER_DYNAMIC & + vcpu->arch.guest_supported_xcr0)) + return 1; + + fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~(XFEATURE_MASK_USER_DYNAMIC & + vcpu->arch.guest_supported_xcr0)) + return 1; + + vcpu->arch.guest_fpu.xfd_err = data; + break; +#endif default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -3118,6 +3255,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + msr_info->data = vcpu->arch.ia32_xss; + break; case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -3176,6 +3319,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.xfd_err; + break; +#endif default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -3195,6 +3354,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_get_msr_common); +/* + * If new features pass XSS-managed MSRs through to the guest, separate + * checks must be added here so that the feature-dependent guest MSRs are + * loaded before they are accessed. + */ +static bool is_xsaves_msr(u32 index) +{ + + /* + * Add the check for your feature like this: + * return index == MSR_IA32_U_CET; + */ + + return false; +} + /* * Read or write a bunch of msrs. All parameters are kernel addresses.
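[Note, not part of the patch: __msr_io() below is what backs the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls, so a VMM restoring the newly added XFD state on the destination of a migration would issue roughly the following. This is a hedged userspace sketch: vcpu_fd and saved_xfd are placeholders, error handling is omitted, and MSR_IA32_XFD is defined locally because the MSR index is not part of the KVM uapi headers.]

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        #define MSR_IA32_XFD 0x000001c4   /* per the SDM; assumption, not from uapi */

        /* illustrative only: host-initiated restore of the guest's IA32_XFD */
        static void restore_xfd(int vcpu_fd, __u64 saved_xfd)
        {
                struct { struct kvm_msrs hdr; struct kvm_msr_entry e[1]; } m = {
                        .hdr.nmsrs = 1,
                        .e[0] = { .index = MSR_IA32_XFD, .data = saved_xfd },
                };

                ioctl(vcpu_fd, KVM_SET_MSRS, &m);   /* host_initiated == true in KVM */
        }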
* @@ -3205,11 +3380,20 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { + bool fpu_loaded = false; int i; - for (i = 0; i < msrs->nmsrs; ++i) + for (i = 0; i < msrs->nmsrs; ++i) { + if (vcpu && !fpu_loaded && supported_xss && + is_xsaves_msr(entries[i].index)) { + kvm_load_guest_fpu(vcpu); + fpu_loaded = true; + } if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; + } + if (fpu_loaded) + kvm_put_guest_fpu(vcpu); return i; } @@ -3322,6 +3506,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_SYS_ATTRIBUTES: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -3357,7 +3542,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -3384,6 +3569,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops->nested_enable_evmcs != NULL; break; + case KVM_CAP_XSAVE2: { + u64 guest_perm = xstate_get_guest_group_perm(); + + r = xstate_required_size(kvm_supported_xcr0() & guest_perm, false); + if (r < sizeof(struct kvm_xsave)) + r = sizeof(struct kvm_xsave); + break; + } default: break; } @@ -3391,6 +3584,49 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } +static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user*)(unsigned long)attr->addr; + + if ((u64)(unsigned long)uaddr != attr->addr) + return ERR_PTR_USR(-EFAULT); + return uaddr; +} + +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +{ + u64 __user *uaddr = kvm_get_attr_addr(attr); + + if (attr->group) + return -ENXIO; + + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + if (put_user(kvm_supported_xcr0(), uaddr)) + return -EFAULT; + return 0; + default: + return -ENXIO; + break; + } +} + +static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) +{ + if (attr->group) + return -ENXIO; + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + return 0; + default: + return -ENXIO; + } +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3477,6 +3713,22 @@ long kvm_arch_dev_ioctl(struct file *filp, r = msr_io(NULL, argp, do_get_msr_feature, 1); break; } + case KVM_GET_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_get_attr(&attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_has_attr(&attr); + break; + } default: r = -EINVAL; } @@ -3955,134 +4207,37 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -#define XSTATE_COMPACTION_ENABLED (1ULL << 63) - -static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = xsave->header.xfeatures; - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. 
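[Note, not part of the patch: taken together, the KVM_GET_DEVICE_ATTR(KVM_X86_XCOMP_GUEST_SUPP) system ioctl and the KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) sizing added above give userspace a flow roughly like the sketch below for fetching a vCPU's extended state. kvm_fd/vcpu_fd are assumed to be open already, error handling is omitted, and on kernels with the dynamic-feature permission scheme userspace would additionally request guest permission (e.g. via arch_prctl(ARCH_REQ_XCOMP_GUEST_PERM, ...)) before creating vCPUs.]

        #include <linux/kvm.h>
        #include <stdint.h>
        #include <stdlib.h>
        #include <sys/ioctl.h>

        /* illustrative only: size and fetch one vCPU's XSAVE state */
        static void *fetch_xsave2(int kvm_fd, int vcpu_fd)
        {
                uint64_t xcomp = 0;
                struct kvm_device_attr attr = {
                        .group = 0,
                        .attr  = KVM_X86_XCOMP_GUEST_SUPP,
                        .addr  = (uint64_t)&xcomp,
                };
                int size;
                void *buf;

                ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);   /* supported guest XCR0 */

                size = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE2);
                buf  = calloc(1, size);      /* always >= sizeof(struct kvm_xsave) */
                ioctl(vcpu_fd, KVM_GET_XSAVE2, buf);
                return buf;
        }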
- */ - memcpy(dest, xsave, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV */ - xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; - *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; - - /* - * Copy each region from the possibly compacted offset to the - * non-compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *src = get_xsave_addr(xsave, xfeature_nr); - - if (src) { - u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - if (xfeature_nr == XFEATURE_PKRU) - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - else - memcpy(dest + offset, src, size); - - } + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; - valid -= xfeature_mask; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } -static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(xsave, src, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xfeatures = xstate_bv; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Copy each region from the non-compacted offset to the - * possibly compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) { - u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - if (xfeature_nr == XFEATURE_PKRU) - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - else - memcpy(dest, src + offset, size); - } + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; - valid -= xfeature_mask; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + state, size, vcpu->arch.pkru); } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - memset(guest_xsave, 0, sizeof(struct kvm_xsave)); - fill_xsave((u8 *) guest_xsave->region, vcpu); - } else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu->state.fxsave, - sizeof(struct fxregs_state)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XFEATURE_MASK_FPSSE; - } -} - -#define XSAVE_MXCSR_OFFSET 24 - static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv = - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return 0; - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - /* - * Here we allow setting states that are not present in - * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility - * with old userspace. 
- */ - if (xstate_bv & ~kvm_supported_xcr0() || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - load_xsave(vcpu, (u8 *)guest_xsave->region); - } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu->state.fxsave, - guest_xsave->region, sizeof(struct fxregs_state)); - } - return 0; + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, + guest_xsave->region, + kvm_supported_xcr0(), &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -4389,6 +4544,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { + r = -EINVAL; + if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) + break; + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) @@ -4403,7 +4562,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = memdup_user(argp, sizeof(*u.xsave)); + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = memdup_user(argp, size); if (IS_ERR(u.xsave)) { r = PTR_ERR(u.xsave); goto out_nofree; @@ -4412,6 +4573,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } + + case KVM_GET_XSAVE2: { + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, size)) + break; + + r = 0; + break; + } + case KVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; @@ -4890,6 +5070,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -5278,7 +5472,17 @@ static void kvm_init_msr_list(void) if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; + break; } + case MSR_IA32_XSS: + if (!supported_xss) + continue; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + continue; + break; default: break; } @@ -5311,7 +5515,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, int n; do { - n = min(len, 8); + n = min(len, 64); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) @@ -5331,7 +5535,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) int n; do { - n = min(len, 8); + n = min(len, 64); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) @@ -5570,6 +5774,9 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + if (unlikely(!kvm_x86_ops->can_emulate_instruction(vcpu, NULL, 0))) + return 1; + if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && @@ -5692,7 +5899,11 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, { struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; - memcpy(vcpu->run->mmio.data, frag->data, 
min(8u, frag->len)); + if (frag->len > 8) { + memcpy(vcpu->run->mmio.np_data, frag->data, min(64u, frag->len)); + } else { + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); + } return X86EMUL_CONTINUE; } @@ -5766,7 +5977,8 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, - const struct read_write_emulator_ops *ops) + const struct read_write_emulator_ops *ops, + bool non_posted) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; @@ -5808,8 +6020,14 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, vcpu->mmio_needed = 1; vcpu->mmio_cur_fragment = 0; - vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); + vcpu->run->mmio.len = min(64u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; + + if (non_posted && ops->write) { + vcpu->mmio_is_write |= MMIO_NONPOSTED_WRITE; + vcpu->run->mmio.is_write = vcpu->mmio_is_write; + } + vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = gpa; @@ -5823,17 +6041,31 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { return emulator_read_write(ctxt, addr, val, bytes, - exception, &read_emultor); + exception, &read_emultor, false); } static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *exception) + struct x86_exception *exception, + bool non_posted) { return emulator_read_write(ctxt, addr, (void *)val, bytes, - exception, &write_emultor); + exception, &write_emultor, non_posted); +} + +static int emulator_np_write_complete(struct x86_emulate_ctxt *ctxt, bool *retry) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + if (vcpu->mmio_nonposted_write_completed) { + vcpu->mmio_nonposted_write_completed = 0; + *retry = !!(vcpu->run->mmio.is_write & MMIO_NONPOSTED_DEFERRED); + return 1; + } + + return 0; } #define CMPXCHG_TYPE(t, ptr, old, new) \ @@ -5855,6 +6087,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, { struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + u64 page_line_mask; gpa_t gpa; char *kaddr; bool exchanged; @@ -5869,7 +6102,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + /* + * Emulate the atomic as a straight write to avoid #AC if SLD is + * enabled in the host and the access splits a cache line. 
+ */ + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + page_line_mask = ~(cache_line_size() - 1); + else + page_line_mask = PAGE_MASK; + + if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) goto emul_write; if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) @@ -5906,7 +6148,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(ctxt, addr, new, bytes, exception); + return emulator_write_emulated(ctxt, addr, new, bytes, exception, false); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -6308,6 +6550,7 @@ static const struct x86_emulate_ops emulate_ops = { .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, + .np_write_complete = emulator_np_write_complete, .cmpxchg_emulated = emulator_cmpxchg_emulated, .invlpg = emulator_invlpg, .pio_in_emulated = emulator_pio_in_emulated, @@ -6366,7 +6609,7 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, &ctxt->exception); + return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, @@ -6717,7 +6960,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + bool write_fault_to_spt; + + if (unlikely(!kvm_x86_ops->can_emulate_instruction(vcpu, insn, insn_len))) + return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -6725,6 +6971,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. 
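[Note, not part of the patch: a worked example of the page_line_mask check above, with illustrative numbers and a 64-byte cache line.]

        /*
         * gpa = 0x103c, bytes = 8:
         *
         *   SLD enabled:  page_line_mask = ~(64 - 1)
         *                 gpa           & mask = 0x1000
         *                 (gpa + 8 - 1) & mask = 0x1040  -> differ, goto emul_write
         *
         *   SLD disabled: page_line_mask = PAGE_MASK
         *                 both ANDs give 0x1000          -> map the page, do the cmpxchg
         */

The effect is that an atomic access which would split a cache line is demoted to a plain emulated write rather than risking an #AC on the host.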
*/ + write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); @@ -7015,7 +7262,7 @@ static void kvm_hyperv_tsc_notifier(void) mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - kvm_make_mclock_inprogress_request(kvm); + kvm_make_block_vmentry_request(kvm); hyperv_stop_tsc_emulation(); @@ -7034,8 +7281,7 @@ static void kvm_hyperv_tsc_notifier(void) kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_clear_block_vmentry_request(kvm); spin_unlock(&ka->pvclock_gtod_sync_lock); } @@ -7292,18 +7538,11 @@ int kvm_arch_init(void *opaque) } r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, - NULL); - if (!x86_fpu_cache) { - printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); - goto out; - } shared_msrs = alloc_percpu(struct kvm_shared_msrs); if (!shared_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out_free_x86_fpu_cache; + goto out; } r = kvm_mmu_module_init(); @@ -7336,8 +7575,6 @@ int kvm_arch_init(void *opaque) out_free_percpu: free_percpu(shared_msrs); -out_free_x86_fpu_cache: - kmem_cache_destroy(x86_fpu_cache); out: return r; } @@ -7361,7 +7598,6 @@ void kvm_arch_exit(void) kvm_x86_ops = NULL; kvm_mmu_module_exit(); free_percpu(shared_msrs); - kmem_cache_destroy(x86_fpu_cache); } int kvm_vcpu_halt(struct kvm_vcpu *vcpu) @@ -7541,7 +7777,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, - &ctxt->exception); + &ctxt->exception, false); } static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) @@ -8052,6 +8288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); + enum exit_fastpath_completion exit_fastpath; bool req_immediate_exit = false; @@ -8260,6 +8497,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -8270,7 +8510,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - kvm_x86_ops->run(vcpu); + exit_fastpath = kvm_x86_ops->run(vcpu); /* * Do this here before restoring debug registers on the host. And @@ -8302,8 +8542,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + /* + * Sync xfd before calling handle_exit_irqoff() which may + * rely on the fact that guest_fpu::xfd is up-to-date (e.g. + * in #NM irqoff handler). + */ + if (vcpu->arch.xfd_no_write_intercept) + fpu_sync_guest_vmexit_xfd_state(); + kvm_x86_ops->handle_exit_irqoff(vcpu); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, 0); + /* * Consume any pending interrupts, including the possible source of * VM-Exit on SVM and any ticks that occur between VM-Exit and now. 
@@ -8346,7 +8597,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_from_vapic(vcpu); vcpu->arch.gpa_available = false; - r = kvm_x86_ops->handle_exit(vcpu); + r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); return r; cancel_injection: @@ -8495,9 +8746,14 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Complete previous fragment */ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; - len = min(8u, frag->len); - if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, len); + len = min(64u, frag->len); + if (!vcpu->mmio_is_write) { + if (len > 8) { + memcpy(frag->data, run->mmio.np_data, len); + } else { + memcpy(frag->data, run->mmio.data, len); + } + } if (frag->len <= 8) { /* Switch to the next fragment. */ @@ -8514,64 +8770,46 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->mmio_needed = 0; /* FIXME: return into emulator if single-stepping. */ - if (vcpu->mmio_is_write) + if (vcpu->mmio_is_write == MMIO_WRITE) return 1; - vcpu->mmio_read_completed = 1; + if (vcpu->mmio_is_write == MMIO_NONPOSTED_WRITE) + vcpu->mmio_nonposted_write_completed = 1; + else + vcpu->mmio_read_completed = 1; + return complete_emulated_io(vcpu); } run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; - if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, min(8u, frag->len)); - run->mmio.len = min(8u, frag->len); + if (vcpu->mmio_is_write) { + if (frag->len > 8) { + memcpy(run->mmio.np_data, frag->data, min(64u, frag->len)); + } else { + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + } + } + run->mmio.len = min(64u, frag->len); run->mmio.is_write = vcpu->mmio_is_write; vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; } -static void kvm_save_current_fpu(struct fpu *fpu) -{ - /* - * If the target FPU state is not resident in the CPU registers, just - * memcpy() from current, else save CPU state directly to the target. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&fpu->state, ¤t->thread.fpu.state, - fpu_kernel_xstate_size); - else - copy_fpregs_to_fpstate(fpu); -} - /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.user_fpu); - - /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); - - fpregs_mark_activate(); - fpregs_unlock(); - + /* + * Exclude PKRU from restore as restored separately in + * kvm_x86_ops.run(). + */ + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. 
*/ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.guest_fpu); - - copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -9065,9 +9303,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return 0; + vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -9085,9 +9326,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return 0; + vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -9143,11 +9387,6 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - /* * Ensure guest xcr0 is valid for loading */ @@ -9169,16 +9408,26 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) free_cpumask_var(wbinvd_dirty_mask); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (kvm_check_tsc_unstable() && kvm->created_vcpus) + pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); + + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + + return kvm_x86_ops->vcpu_precreate(kvm); +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - printk_once(KERN_WARNING - "kvm: SMP vm created on host with unstable TSC; " - "guest TSC will not be reliable\n"); - vcpu = kvm_x86_ops->vcpu_create(kvm, id); return vcpu; @@ -9263,8 +9512,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (kvm_mpx_supported()) { - void *mpx_state_buffer; + if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; /* * To avoid have the INIT path from kvm_apic_has_events() that be @@ -9272,14 +9521,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDREGS); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDCSR); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + if (init_event) kvm_load_guest_fpu(vcpu); } @@ -9414,10 +9659,21 @@ int kvm_arch_hardware_setup(void) { int r; + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + r = kvm_x86_ops->hardware_setup(); if (r != 0) 
return r; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + supported_xss = 0; + else + supported_xss &= host_xss; + + if (!kvm_pasid_supported()) + kvm_cpu_cap_clear(X86_FEATURE_ENQCMD); + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); if (kvm_has_tsc_control) { @@ -9510,8 +9766,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) fx_init(vcpu); - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c520d373790a21c9bc05bdd8e65744cf4cde9163..29bbb0aaf125b2e99ccded08e1f5db679ee0503b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -144,11 +144,6 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu) return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu); } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; @@ -289,14 +284,24 @@ bool kvm_vector_hashing_enabled(void); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); + #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ - | XFEATURE_MASK_PKRU) + | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); +extern u64 host_xss; +extern u64 supported_xss; + +static inline bool kvm_pasid_supported(void) +{ + return supported_xss & XFEATURE_MASK_PASID; +} + extern unsigned int min_timer_period_us; extern bool enable_vmware_backdoor; @@ -366,8 +371,8 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); #endif diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 497b4a8414b38b306149bc8a725fab8d4e3ba7ee..6a53e579d6839de4804dcae9a7bb743b915769ca 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -224,6 +224,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Don't try to copy the tail if machine check happened * * Input: + * eax trap number written by ex_handler_copy() * rdi destination * rsi source * rdx count @@ -233,22 +234,17 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) */ ALIGN; .Lcopy_user_handle_tail: - movl %edx,%ecx - cmp $18,%eax /* check if X86_TRAP_MC */ + cmp $18,%eax je 3f + + movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret - /* - * Return zero to pretend that this copy succeeded. This - * is counter-intuitive, but needed to prevent the code - * in lib/iov_iter.c from retrying and running back into - * the poison cache line again. The machine check handler - * will ensure that a SIGBUS is sent to the task. 
- */ -3: xorl %eax,%eax +3: + movl %edx,%eax ASM_CLAC ret diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index dc2fb886db2bcf722037d94c2bd34c7d4af57b0e..2cb18ef244270b312de75c5169f847d4f44b6d0c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -288,9 +288,9 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe) .previous - _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) - _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) + _ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE) _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) _ASM_EXTABLE(.L_write_words, .E_write_words) _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 034748459482a945fff91f3e9554e46bccdf4a30..d62662bdd4604398656559cb23955a7d6b217a42 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft) void finit(void) { - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index a873da6b46d6b2bb138c58b201d8ca5fb31da4c8..7fe56c594aa627e5de65199ba85bbae6d4d340ea 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "fpu_system.h" #include "fpu_emu.h" @@ -640,7 +640,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -689,12 +689,10 @@ int fpregs_soft_set(struct task_struct *target, int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; const void *space = s387->st_space; - int ret; int offset = (S387->ftop & 7) * 10, other = 80 - offset; RE_ENTRANT_CHECK_OFF; @@ -709,18 +707,11 @@ int fpregs_soft_get(struct task_struct *target, S387->fos |= 0xffff0000; #endif /* PECULIAR_486 */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct swregs_state, st_space)); - - /* Copy all registers in stack order. 
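[Note, not part of the patch: the .Lcopy_user_handle_tail change above alters what callers observe when the source contains a poisoned cache line. A rough caller-side sketch, with made-up names, inside some function that returns -EFAULT on a short copy:]

        size_t left = copy_from_user(buf, user_src, len);

        /*
         * Old tail fixup: a #MC made 'left' == 0, i.e. the copy claimed
         * success and only the SIGBUS from the MCE handler reported the
         * data loss.  New tail fixup: 'left' is the byte count that was
         * never copied, so ordinary short-copy handling applies:
         */
        if (left)
                return -EFAULT;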
*/ - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - space + offset, 0, other); - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - space, 0, offset); + membuf_write(&to, s387, offsetof(struct swregs_state, st_space)); + membuf_write(&to, space + offset, other); + membuf_write(&to, space, offset); RE_ENTRANT_CHECK_ON; - return ret; + return 0; } diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 70d35c20094517aee19b3577c806d6488d21955b..94c4023092f3a6a431404ab455fd8e848ede6a88 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -144,7 +144,7 @@ extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d); extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d); extern int FPU_round_to_int(FPU_REG *r, u_char tag); extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s); -extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address); +extern void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address); extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d); extern void fsave(fpu_addr_modes addr_modes, u_char __user *data_address); extern int FPU_tagof(FPU_REG *ptr); diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 9b41391867dcafb90472bb9de2a2ff6372c2731a..eec3e4805c75e595653dbba3b98e6fbc2a51f49f 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d) return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; } -#define I387 (¤t->thread.fpu.state) +#define I387 (¤t->thread.fpu.fpstate->regs) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index f15263e158e8e669d82398bd40b3f2f8415cd88d..4092df79de4f50d919a6519185ab089e929a5f55 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -240,7 +240,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, fix-up operations. */ return 1; case 022: /* frstor m94/108byte */ - frstor(addr_modes, (u_char __user *) data_address); + FPU_frstor(addr_modes, (u_char __user *) data_address); /* Ensure that the values just loaded are not changed by fix-up operations. 
*/ return 1; diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index fe6246ff98870580ce11a2b2178b68b731c4036c..2de1094ed4d72792aa2f4a5f76170ea43a677523 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -1117,7 +1117,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) return s; } -void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) +void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address) { int i, regnr; u_char __user *s = fldenv(addr_modes, data_address); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 3626bae87d40468f43d9faad461ee6be4a7c5a40..ce621329491261330b064f0e054a7636c2547219 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -4,94 +4,29 @@ #include #include -#include +#include #include #include -typedef bool (*ex_handler_t)(const struct exception_table_entry *, - struct pt_regs *, int, unsigned long, - unsigned long); - static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } -static inline ex_handler_t -ex_fixup_handler(const struct exception_table_entry *x) -{ - return (ex_handler_t)((unsigned long)&x->handler + x->handler); -} -__visible bool ex_handler_default(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); return true; } -EXPORT_SYMBOL(ex_handler_default); -__visible bool ex_handler_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { - regs->ip = ex_fixup_addr(fixup); regs->ax = trapnr; - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL_GPL(ex_handler_fault); - -/* - * Handler for UD0 exception following a failed test against the - * result of a refcount inc/dec/add/sub. - */ -__visible bool ex_handler_refcount(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - /* First unconditionally saturate the refcount. */ - *(int *)regs->cx = INT_MIN / 2; - - /* - * Strictly speaking, this reports the fixup destination, not - * the fault location, and not the actually overflowing - * instruction, which is the instruction before the "js", but - * since that instruction could be a variety of lengths, just - * report the location after the overflow, which should be close - * enough for finding the overflow, as it's at least back in - * the function, having returned from .text.unlikely. - */ - regs->ip = ex_fixup_addr(fixup); - - /* - * This function has been called because either a negative refcount - * value was seen by any of the refcount functions, or a zero - * refcount value was seen by refcount_dec(). - * - * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result - * wrapped around) will be set. Additionally, seeing the refcount - * reach 0 will set ZF (Zero Flag: result was zero). In each of - * these cases we want a report, since it's a boundary condition. - * The SF case is not reported since it indicates post-boundary - * manipulations below zero or above INT_MAX. 
And if none of the - * flags are set, something has gone very wrong, so report it. - */ - if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { - bool zero = regs->flags & X86_EFLAGS_ZF; - - refcount_error_report(regs, zero ? "hit zero" : "overflow"); - } else if ((regs->flags & X86_EFLAGS_SF) == 0) { - /* Report if none of OF, ZF, nor SF are set. */ - refcount_error_report(regs, "unexpected saturation"); - } - - return true; -} -EXPORT_SYMBOL(ex_handler_refcount); /* * Handler for when we fail to restore a task's FPU state. We should never get @@ -103,77 +38,47 @@ EXPORT_SYMBOL(ex_handler_refcount); * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. */ -__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __copy_kernel_to_fpregs(&init_fpstate, -1); + fpu_reset_from_exception_fixup(); return true; } -EXPORT_SYMBOL_GPL(ex_handler_fprestore); -__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_uaccess(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - return true; -} -EXPORT_SYMBOL(ex_handler_uaccess); - -__visible bool ex_handler_ext(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - /* Special hack for uaccess_err */ - current->thread.uaccess_err = 1; - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_ext); -__visible bool ex_handler_copy(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_copy(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - regs->ax = trapnr; - return true; + return ex_handler_fault(fixup, regs, trapnr); } -EXPORT_SYMBOL(ex_handler_copy); -__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) show_stack_regs(regs); /* Pretend that the read succeeded and returned 0. 
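[Note, not part of the patch: with the rework in this hunk, a fixup site no longer names a handler function; it tags its exception-table entry with one of the EX_TYPE_* constants and fixup_exception() dispatches on that type. A minimal sketch of an unchecked MSR read using the new annotation; the function and its name are illustrative, and it assumes _ASM_EXTABLE_TYPE() is usable from inline assembly as in the upstream rework.]

        /* On a faulting RDMSR, ex_handler_rdmsr_unsafe() warns once and fakes a zero read. */
        static unsigned long long rdmsr_unchecked(unsigned int msr)
        {
                unsigned int lo, hi;

                asm volatile("1: rdmsr\n"
                             "2:\n"
                             _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
                             : "=a" (lo), "=d" (hi) : "c" (msr));

                return ((unsigned long long)hi << 32) | lo;
        }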
*/ - regs->ip = ex_fixup_addr(fixup); regs->ax = 0; regs->dx = 0; - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); -__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, @@ -181,45 +86,29 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup show_stack_regs(regs); /* Pretend that the write succeeded. */ - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); -__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_clear_fs); -enum handler_type ex_get_fault_handler_type(unsigned long ip) +int ex_get_fixup_type(unsigned long ip) { - const struct exception_table_entry *e; - ex_handler_t handler; + const struct exception_table_entry *e = search_exception_tables(ip); - e = search_exception_tables(ip); - if (!e) - return EX_HANDLER_NONE; - handler = ex_fixup_handler(e); - if (handler == ex_handler_fault) - return EX_HANDLER_FAULT; - else if (handler == ex_handler_uaccess || handler == ex_handler_copy) - return EX_HANDLER_UACCESS; - else - return EX_HANDLER_OTHER; + return e ? 
e->type : EX_TYPE_NONE; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct exception_table_entry *e; - ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -239,8 +128,35 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, if (!e) return 0; - handler = ex_fixup_handler(e); - return handler(e, regs, trapnr, error_code, fault_addr); + switch (e->type) { + case EX_TYPE_DEFAULT: + case EX_TYPE_DEFAULT_MCE_SAFE: + return ex_handler_default(e, regs); + case EX_TYPE_FAULT: + case EX_TYPE_FAULT_MCE_SAFE: + return ex_handler_fault(e, regs, trapnr); + case EX_TYPE_UACCESS: + return ex_handler_uaccess(e, regs, trapnr); + case EX_TYPE_COPY: + return ex_handler_copy(e, regs, trapnr); + case EX_TYPE_CLEAR_FS: + return ex_handler_clear_fs(e, regs); + case EX_TYPE_FPU_RESTORE: + return ex_handler_fprestore(e, regs); + case EX_TYPE_RDMSR: + return ex_handler_rdmsr_unsafe(e, regs); + case EX_TYPE_WRMSR: + return ex_handler_wrmsr_unsafe(e, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(e, regs); + case EX_TYPE_RDMSR_IN_MCE: + ex_handler_msr_mce(regs, false); + break; + case EX_TYPE_WRMSR_IN_MCE: + ex_handler_msr_mce(regs, true); + break; + } + BUG(); } extern unsigned int early_recursion_flag; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 61562a6c310c32f5820820bed8882cb6a084fc9d..b1cfb26d06c74f0ae953f9636f9198b6394acea2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -971,7 +971,7 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, /* This code is always called on the current mm */ bool foreign = false; - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return false; if (error_code & X86_PF_PK) return true; diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 7b558939b89c154e46f363c315db0b68ccb12583..b5b1491a26cc75f63e271a2f62e7fcfaf5b1cfaa 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -196,6 +196,37 @@ void __init sme_early_init(void) swiotlb_force = SWIOTLB_FORCE; } +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!sev_active()) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. + * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. 
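[Note, not part of the patch: the "6% of guest memory, clamped" SWIOTLB sizing below works out roughly as follows, assuming the usual 64MB IO_TLB_DEFAULT_SIZE.]

        guest memory    6% of it     clamp_val(size, 64MB, SZ_1G)
        512 MB          ~31 MB       64 MB    (raised to the default)
        4 GB            ~246 MB      ~246 MB
        64 GB           ~3.9 GB      1 GB     (capped at SZ_1G)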
+ * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); +} + static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) { pgprot_t old_prot, new_prot; @@ -367,7 +398,7 @@ bool force_dma_unencrypted(struct device *dev) if (sme_active()) { u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask)); u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask, - dev->bus_dma_mask); + dev->bus_dma_limit); if (dma_dev_mask <= dma_enc_mask) return true; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 895fb7a9294d4fa3a2c404fff42544f5710245c5..7621c8ad20890c8236663236cb57036c6ab7ed91 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index c6f84c0b5d7a5ddae9efcb24e75592555648296a..b5945b62d92ac90a0b4763c06d0b4c56b8cd01e8 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -10,7 +10,6 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ -#include /* init_fpstate */ int __execute_only_pkey(struct mm_struct *mm) { @@ -125,22 +124,6 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); -/* - * Called from the FPU code when creating a fresh set of FPU - * registers. This is called from a very specific context where - * we know the FPU regstiers are safe for use and we can use PKRU - * directly. - */ -void copy_init_pkru_to_fpregs(void) -{ - u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); - /* - * Override the PKRU state that came from 'init_fpstate' - * with the baseline from the process. - */ - write_pkru(init_pkru_value_snapshot); -} - static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -154,7 +137,6 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, static ssize_t init_pkru_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - struct pkru_state *pk; char buf[32]; ssize_t len; u32 new_init_pkru; @@ -177,10 +159,6 @@ static ssize_t init_pkru_write_file(struct file *file, return -EINVAL; WRITE_ONCE(init_pkru_value, new_init_pkru); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (!pk) - return -EINVAL; - pk->pkru = new_init_pkru; return count; } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 851359b7edc571ef392c2ddacc9a628fd852f0a6..14b44ac910ad6f68df98023000e526ddeccbdaec 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -422,21 +422,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); load_new_mm_cr3(next->pgd, new_asid, true); - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) 
- */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ load_new_mm_cr3(next->pgd, new_asid, false); - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } /* Make sure we write CR3 before loaded_mm. */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f85b7365a7709581ce59d5ddb7161eecb98e78d3..ef97b0a8429da726096d234d8df2b36e5fff57d2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -641,9 +641,7 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } -static bool ex_handler_bpf(const struct exception_table_entry *x, - struct pt_regs *regs, int trapnr, - unsigned long error_code, unsigned long fault_addr) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { u32 reg = x->fixup >> 8; @@ -1067,12 +1065,7 @@ st: if (is_imm8(insn->off)) } ex->insn = delta; - delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; - if (!is_simm32(delta)) { - pr_err("extable->handler doesn't fit into 32-bit\n"); - return -EFAULT; - } - ex->handler = delta; + ex->type = EX_TYPE_BPF; if (dst_reg > BPF_REG_9) { pr_err("verifier error\n"); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index b339ce5e7f597f8b6e2aa2ce28ad3c084a421ead..ad81de4ed7b2f77200b28c3d020ca37f03120350 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -19,6 +21,7 @@ #include #include #include +#include unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; @@ -135,7 +138,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev) * resource so the kernel doesn't attempt to assign * it later on in pci_assign_unassigned_resources */ - for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { bar_r = &dev->resource[bar]; if (bar_r->start == 0 && bar_r->end != 0) { bar_r->flags = 0; @@ -627,43 +630,6 @@ unsigned int pcibios_assign_all_busses(void) return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 
1 : 0; } -#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) -static LIST_HEAD(dma_domain_list); -static DEFINE_SPINLOCK(dma_domain_list_lock); - -void add_dma_domain(struct dma_domain *domain) -{ - spin_lock(&dma_domain_list_lock); - list_add(&domain->node, &dma_domain_list); - spin_unlock(&dma_domain_list_lock); -} -EXPORT_SYMBOL_GPL(add_dma_domain); - -void del_dma_domain(struct dma_domain *domain) -{ - spin_lock(&dma_domain_list_lock); - list_del(&domain->node); - spin_unlock(&dma_domain_list_lock); -} -EXPORT_SYMBOL_GPL(del_dma_domain); - -static void set_dma_domain_ops(struct pci_dev *pdev) -{ - struct dma_domain *domain; - - spin_lock(&dma_domain_list_lock); - list_for_each_entry(domain, &dma_domain_list, node) { - if (pci_domain_nr(pdev->bus) == domain->domain_nr) { - pdev->dev.dma_ops = domain->dma_ops; - break; - } - } - spin_unlock(&dma_domain_list_lock); -} -#else -static void set_dma_domain_ops(struct pci_dev *pdev) {} -#endif - static void set_dev_domain_options(struct pci_dev *pdev) { if (is_vmd(pdev->bus)) @@ -672,8 +638,9 @@ static void set_dev_domain_options(struct pci_dev *pdev) int pcibios_add_device(struct pci_dev *dev) { - struct setup_data *data; struct pci_setup_rom *rom; + struct irq_domain *msidom; + struct setup_data *data; u64 pa_data; pa_data = boot_params.hdr.setup_data; @@ -699,8 +666,21 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = data->next; memunmap(data); } - set_dma_domain_ops(dev); set_dev_domain_options(dev); + + /* + * Setup the initial MSI domain of the device. If the underlying + * bus has a PCI/MSI irqdomain associated use the bus domain, + * otherwise set the default domain. This ensures that special irq + * domains e.g. VMD are preserved. The default ensures initial + * operation if irq remapping is not active. If irq remapping is + * active it will overwrite the domain pointer when the device is + * associated to a remapping domain. + */ + msidom = dev_get_msi_domain(&dev->bus->dev); + if (!msidom) + msidom = x86_pci_msi_default_domain; + dev_set_msi_domain(&dev->dev, msidom); return 0; } @@ -738,3 +718,83 @@ int pci_ext_cfg_avail(void) else return 0; } + +#if IS_ENABLED(CONFIG_VMD) +struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) +{ + if (is_vmd(dev->bus)) + return to_pci_sysdata(dev->bus)->vmd_dev; + + return dev; +} +#endif + +/* + * We want to figure out which context we are running in. But the hardware + * does not introduce a reliable way (instruction, CPUID leaf, MSR, whatever) + * which can be manipulated by the VMM to let the OS figure out where it runs. + * So we go with the below probably on_bare_metal() function as a replacement + * for definitely on_bare_metal() to go forward only for the very simple reason + * that this is the only option we have. 
+ */ +static const char * const vmm_vendor_name[] = { + "QEMU", "Bochs", "KVM", "Xen", "VMware", "VMW", "VMware Inc.", + "innotek GmbH", "Oracle Corporation", "Parallels", "BHYVE" +}; + +static void read_type0_virtual_machine(const struct dmi_header *dm, void *p) +{ + u8 *data = (u8 *)dm + 0x13; + + /* BIOS Information (Type 0) */ + if (dm->type != 0 || dm->length < 0x14) + return; + + /* Bit 4 of BIOS Characteristics Extension Byte 2*/ + if (*data & BIT(4)) + *((bool *)p) = true; +} + +static bool smbios_virtual_machine(void) +{ + bool bit_present = false; + + dmi_walk(read_type0_virtual_machine, &bit_present); + + return bit_present; +} + +static bool on_bare_metal(struct device *dev) +{ + int i; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + if (smbios_virtual_machine()) + return false; + + if (iommu_capable(dev->bus, IOMMU_CAP_VIOMMU_HINT)) + return false; + + for (i = 0; i < ARRAY_SIZE(vmm_vendor_name); i++) + if (dmi_match(DMI_SYS_VENDOR, vmm_vendor_name[i])) + return false; + + pr_info("System running on bare metal, report to bugzilla.kernel.org if not the case."); + + return true; +} + +bool arch_support_pci_device_msi(struct pci_dev *pdev) +{ + /* + * When we are running in a VMM context, the device IMS could only be + * enabled when the underlying hardware supports interrupt isolation + * of the subdevice, or any mechanism (trap, hypercall) is added so + * that changes in the interrupt message store could be managed by the + * VMM. For now, we only support the device IMS when we are running on + * the bare metal. + */ + return on_bare_metal(&pdev->dev); +} diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 5fc617edf10856b88fda394c3a7456a0b9888ec7..00bfa1ebad6c78f430f6cba36ce0004e65ea0a18 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -3,16 +3,17 @@ #include #include #include +#include /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ static __init int pci_arch_init(void) { -#ifdef CONFIG_PCI_DIRECT - int type = 0; + int type; + + x86_create_pci_msi_domain(); type = pci_direct_probe(); -#endif if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); @@ -20,18 +21,16 @@ static __init int pci_arch_init(void) if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) return 0; -#ifdef CONFIG_PCI_BIOS pci_pcbios_init(); -#endif + /* * don't check for raw_pci_ops here because we want pcbios as last * fallback, yet it's needed to run first to set pcibios_last_bus * in case legacy PCI probing is used. otherwise detecting peer busses * fails. 
*/ -#ifdef CONFIG_PCI_DIRECT pci_direct_init(type); -#endif + if (!raw_pci_ops && !raw_pci_ext_ops) printk(KERN_ERR "PCI: Fatal: No config space access function found\n"); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index eea5a0f3b959b7913d19a800ee845c2842a335c9..0b2c6ec9bd347bea5544b342bbce38636192fc4d 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -215,7 +215,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { struct irq_alloc_info info; - int polarity; + bool polarity_low; int ret; u8 gsi; @@ -230,7 +230,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) switch (intel_mid_identify_cpu()) { case INTEL_MID_CPU_CHIP_TANGIER: - polarity = IOAPIC_POL_HIGH; + polarity_low = false; /* Special treatment for IRQ0 */ if (gsi == 0) { @@ -252,11 +252,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) } break; default: - polarity = IOAPIC_POL_LOW; + polarity_low = true; break; } - ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to @@ -383,7 +383,7 @@ static void pci_fixed_bar_fixup(struct pci_dev *dev) PCI_DEVFN(2, 2) == dev->devfn) return; - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { pci_read_config_dword(dev, offset + 8 + (i * 4), &size); dev->resource[i].end = dev->resource[i].start + size - 1; dev->resource[i].flags |= IORESOURCE_PCI_FIXED; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 5c11ae66b5d8ed872aee750230a026e86059fd52..0be4b0d99c7bd9470c641157078a386b9b68565f 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -391,12 +391,6 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) /* Free the IRQ's and the msidesc using the generic code. */ default_teardown_msi_irqs(dev); } - -static void xen_teardown_msi_irq(unsigned int irq) -{ - xen_destroy_irq(irq); -} - #endif int __init pci_xen_init(void) @@ -416,7 +410,6 @@ int __init pci_xen_init(void) #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs; pci_msi_ignore_mask = 1; #endif @@ -441,7 +434,11 @@ void __init xen_msi_init(void) } x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + /* + * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely + * controlled by the hypervisor. 
+ */ + pci_msi_ignore_mask = 1; } #endif @@ -476,7 +473,6 @@ int __init pci_xen_initial_domain(void) #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; pci_msi_ignore_mask = 1; #endif diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ae1c5baf27cd22ba8666cf4118f3423b409d2fff..89ae6adfc4c4db4871ac500e949b916f9807a575 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -130,6 +130,9 @@ void __init efi_find_mirror(void) efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -147,14 +150,18 @@ void __init efi_find_mirror(void) /* * Tell the kernel about the EFI memory map. This might include - * more than the max 128 entries that can fit in the e820 legacy - * (zeropage) memory map. + * more than the max 128 entries that can fit in the passed in e820 + * legacy (zeropage) memory map, but the kernel's e820 table can hold + * E820_MAX_ENTRIES. */ static void __init do_add_efi_memmap(void) { efi_memory_desc_t *md; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -166,7 +173,10 @@ static void __init do_add_efi_memmap(void) case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: - if (md->attribute & EFI_MEMORY_WB) + if (efi_soft_reserve_enabled() + && (md->attribute & EFI_MEMORY_SP)) + e820_type = E820_TYPE_SOFT_RESERVED; + else if (md->attribute & EFI_MEMORY_WB) e820_type = E820_TYPE_RAM; else e820_type = E820_TYPE_RESERVED; @@ -192,11 +202,36 @@ static void __init do_add_efi_memmap(void) e820_type = E820_TYPE_RESERVED; break; } + e820__range_add(start, size, e820_type); } e820__update_table(e820_table); } +/* + * Given add_efi_memmap defaults to 0 and there there is no alternative + * e820 mechanism for soft-reserved memory, import the full EFI memory + * map if soft reservations are present and enabled. Otherwise, the + * mechanism to disable the kernel's consideration of EFI_MEMORY_SP is + * the efi=nosoftreserve option. + */ +static bool do_efi_soft_reserve(void) +{ + efi_memory_desc_t *md; + + if (!efi_enabled(EFI_MEMMAP)) + return false; + + if (!efi_soft_reserve_enabled()) + return false; + + for_each_efi_memory_desc(md) + if (md->type == EFI_CONVENTIONAL_MEMORY && + (md->attribute & EFI_MEMORY_SP)) + return true; + return false; +} + int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; @@ -226,9 +261,11 @@ int __init efi_memblock_x86_reserve_range(void) if (rv) return rv; - if (add_efi_memmap) + if (add_efi_memmap || do_efi_soft_reserve()) do_add_efi_memmap(); + efi_fake_memmap_early(); + WARN(efi.memmap.desc_version != 1, "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", efi.memmap.desc_version); @@ -781,6 +818,15 @@ static bool should_map_region(efi_memory_desc_t *md) if (IS_ENABLED(CONFIG_X86_32)) return false; + /* + * EFI specific purpose memory may be reserved by default + * depending on kernel config and boot options. 
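Pulling the soft-reserve handling together, the classification order for an EFI conventional-memory descriptor can be restated compactly. A standalone sketch using the UEFI attribute bit values; the e820 enum constants are local stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFI_MEMORY_WB   0x8ULL      /* write-back cacheable */
#define EFI_MEMORY_SP   0x40000ULL  /* "specific purpose" (UEFI 2.8) */

enum e820_type { E820_RAM, E820_RESERVED, E820_SOFT_RESERVED };

/* Soft reservation takes precedence over treating the range as plain RAM. */
static enum e820_type classify_conventional(uint64_t attr, bool soft_reserve)
{
    if (soft_reserve && (attr & EFI_MEMORY_SP))
        return E820_SOFT_RESERVED;
    if (attr & EFI_MEMORY_WB)
        return E820_RAM;
    return E820_RESERVED;
}

int main(void)
{
    /* SP wins even when the range is also write-back RAM. */
    printf("%d\n", classify_conventional(EFI_MEMORY_WB | EFI_MEMORY_SP, true));
    return 0;
}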
+ */ + if (md->type == EFI_CONVENTIONAL_MEMORY && + efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return false; + /* * Map all of RAM so that we can access arguments in the 1:1 * mapping when making EFI runtime calls. diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index aefe845dff596c55788e8a904de111450c31d1dc..f8f0220b6a665a1c26b9a2e15f12cb6c4fb37f2f 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -318,6 +318,9 @@ void __init efi_reserve_boot_services(void) { efi_memory_desc_t *md; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index abb6075397f0510b5b6e5b70f322086c8873cba7..18ca2261cc9ab5fd3008fda5cec83cdfba1efc41 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -90,15 +90,15 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); if (ret >= 0) { - if (info->uv_limit == UV_AFFINITY_CPU) + if (info->uv.limit == UV_AFFINITY_CPU) irq_set_status_flags(virq, IRQ_NO_BALANCING); else irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - chip_data->pnode = uv_blade_to_pnode(info->uv_blade); - chip_data->offset = info->uv_offset; + chip_data->pnode = uv_blade_to_pnode(info->uv.blade); + chip_data->offset = info->uv.offset; irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data, - handle_percpu_irq, NULL, info->uv_name); + handle_percpu_irq, NULL, info->uv.name); } else { kfree(chip_data); } @@ -193,10 +193,10 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, init_irq_alloc_info(&info, cpumask_of(cpu)); info.type = X86_IRQ_ALLOC_TYPE_UV; - info.uv_limit = limit; - info.uv_blade = mmr_blade; - info.uv_offset = mmr_offset; - info.uv_name = irq_name; + info.uv.limit = limit; + info.uv.blade = mmr_blade; + info.uv.offset = mmr_offset; + info.uv.name = irq_name; return irq_domain_alloc_irqs(domain, 1, uv_blade_to_memory_nid(mmr_blade), &info); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 915bb163976324edd41b615b08fd8ecb95c2c33c..8d7569bb704aa90bcaf284d9406052cfe186decd 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 5e53bfbe5823b78cb6666ce9c3fcf563eed3e47e..fe12fab61535847a02bf83b140aad8e69d5e6cc0 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -158,15 +159,12 @@ static struct apic xen_pv_apic = { .apic_id_valid = xen_id_always_valid, .apic_id_registered = xen_id_always_registered, - /* .irq_delivery_mode - used in native_compose_msi_msg only */ - /* .irq_dest_mode - used in native_compose_msi_msg only */ + /* .delivery_mode and .dest_mode_logical not used by XENPV */ .disable_esr = 0, - /* .dest_logical - default_send_IPI_ use it but we use our own. 
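The uv_irq.c hunks above are mechanical fallout of moving the UV parameters of struct irq_alloc_info into a per-type union member, so callers write info.uv.limit instead of info.uv_limit. A condensed sketch of the layout change; field names follow the patch, the surrounding types are simplified stand-ins:

struct uv_alloc_info {
    int            limit;    /* UV_AFFINITY_* */
    int            blade;
    unsigned long  offset;
    char          *name;
};

struct irq_alloc_info_sketch {
    int type;                         /* e.g. X86_IRQ_ALLOC_TYPE_UV */
    union {
        struct uv_alloc_info uv;      /* accessed as info.uv.limit, ... */
        /* other allocation types keep their own structs here */
    };
};

int main(void)
{
    struct irq_alloc_info_sketch info = { .type = 1 };

    info.uv.limit = 0;                /* was info.uv_limit before the change */
    return info.uv.limit;
}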
*/ - .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ + .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ - .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ .setup_apic_routing = NULL, .cpu_present_to_apicid = xen_cpu_present_to_apicid, diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 6024fafed1642d0c2a5d09143a55849dbdb69aa0..3c4d0b6431caf61cfcf26f0aee419f9e6fb0165b 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 0cebe5db691d93b103d2e18a7da96c7161c76f9d..38d1cd4b2088a715f8e020d08dc186ec7c18e025 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 154979d62b73c7e3e6aa54cc7497244a9e6375c2..2b86a2a042368c674eaf62b2d3a44e2019ba5f5a 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -44,8 +44,8 @@ static void do_cache_op(phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: @@ -62,8 +62,8 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 5559ef26406051de94d21df9a77389ee73143564..8140180fcdaa80103047027bf60efd76487d4f38 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -320,12 +320,6 @@ config ACPI_THERMAL To compile this driver as a module, choose M here: the module will be called thermal. -config ACPI_NUMA - bool "NUMA support" - depends on NUMA - depends on (X86 || IA64 || ARM64) - default y if IA64 || ARM64 - config ACPI_CUSTOM_DSDT_FILE string "Custom DSDT Table file to include" default "" @@ -474,10 +468,10 @@ config ACPI_REDUCED_HARDWARE_ONLY If you are unsure what to do, do not enable this option. source "drivers/acpi/nfit/Kconfig" -source "drivers/acpi/hmat/Kconfig" - +source "drivers/acpi/numa/Kconfig" source "drivers/acpi/apei/Kconfig" source "drivers/acpi/dptf/Kconfig" +source "drivers/acpi/pfru/Kconfig" config ACPI_WATCHDOG bool @@ -594,3 +588,8 @@ config X86_PM_TIMER You should nearly always say Y here because many modern systems require this timer. 
+ +config ACPI_PRMT + bool "Platform Runtime Mechanism Support" + depends on EFI && X86_64 + default y diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index ef1ac4d127dabc9d78bf215ccd442ddaab616d58..87006b9fe01786f30c07802138525a67e942d8fd 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -55,12 +55,12 @@ acpi-$(CONFIG_X86) += acpi_cmos_rtc.o acpi-$(CONFIG_X86) += x86/apple.o acpi-$(CONFIG_X86) += x86/utils.o acpi-$(CONFIG_DEBUG_FS) += debugfs.o -acpi-$(CONFIG_ACPI_NUMA) += numa.o acpi-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o acpi-y += acpi_lpat.o acpi-$(CONFIG_ACPI_LPIT) += acpi_lpit.o acpi-$(CONFIG_ACPI_GENERIC_GSI) += irq.o acpi-$(CONFIG_ACPI_WATCHDOG) += acpi_watchdog.o +acpi-$(CONFIG_ACPI_PRMT) += prmt.o # Address translation acpi-$(CONFIG_ACPI_ADXL) += acpi_adxl.o @@ -80,7 +80,7 @@ obj-$(CONFIG_ACPI_PROCESSOR) += processor.o obj-$(CONFIG_ACPI) += container.o obj-$(CONFIG_ACPI_THERMAL) += thermal.o obj-$(CONFIG_ACPI_NFIT) += nfit/ -obj-$(CONFIG_ACPI_HMAT) += hmat/ +obj-$(CONFIG_ACPI_NUMA) += numa/ obj-$(CONFIG_ACPI) += acpi_memhotplug.o obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o obj-$(CONFIG_ACPI_BATTERY) += battery.o @@ -94,6 +94,7 @@ obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_acpi.o obj-$(CONFIG_ACPI_SPCR_TABLE) += spcr.o obj-$(CONFIG_ACPI_DEBUGGER_USER) += acpi_dbg.o obj-$(CONFIG_ACPI_PPTT) += pptt.o +obj-$(CONFIG_ACPI_PFRU) += pfru/ # processor has its own "processor." module_param namespace processor-y := processor_driver.o diff --git a/drivers/acpi/acpica/acutils.h b/drivers/acpi/acpica/acutils.h index 601808be86d18e7a73ad4d731dd0eae164aefc8a..201e2d0dc01e95ac45ee06529f5085e6206d9319 100644 --- a/drivers/acpi/acpica/acutils.h +++ b/drivers/acpi/acpica/acutils.h @@ -735,6 +735,8 @@ const char *acpi_ah_match_uuid(u8 *data); */ #if (defined ACPI_ASL_COMPILER || defined ACPI_EXEC_APP || defined ACPI_HELP_APP) void acpi_ut_convert_string_to_uuid(char *in_string, u8 *uuid_buffer); + +acpi_status acpi_ut_convert_uuid_to_string(char *uuid_buffer, char *out_string); #endif #endif /* _ACUTILS_H */ diff --git a/drivers/acpi/acpica/dbdisply.c b/drivers/acpi/acpica/dbdisply.c index 30ab62b0fec875d195224bd6b02ac2d9886689a3..33b3b8f9b21982e752260a9cb5a9ca835e5c5e62 100644 --- a/drivers/acpi/acpica/dbdisply.c +++ b/drivers/acpi/acpica/dbdisply.c @@ -51,6 +51,8 @@ static acpi_adr_space_type acpi_gbl_space_id_list[] = { ACPI_ADR_SPACE_IPMI, ACPI_ADR_SPACE_GPIO, ACPI_ADR_SPACE_GSBUS, + ACPI_ADR_SPACE_PLATFORM_COMM, + ACPI_ADR_SPACE_PLATFORM_RT, ACPI_ADR_SPACE_DATA_TABLE, ACPI_ADR_SPACE_FIXED_HARDWARE }; diff --git a/drivers/acpi/acpica/exfield.c b/drivers/acpi/acpica/exfield.c index d3d2dbfba680c18d533caf861eb26a910dbc0db1..4de2c103a0f287627894cf502653f35f0dfb432d 100644 --- a/drivers/acpi/acpica/exfield.c +++ b/drivers/acpi/acpica/exfield.c @@ -138,7 +138,9 @@ acpi_ex_read_data_from_field(struct acpi_walk_state *walk_state, || obj_desc->field.region_obj->region.space_id == ACPI_ADR_SPACE_GSBUS || obj_desc->field.region_obj->region.space_id == - ACPI_ADR_SPACE_IPMI)) { + ACPI_ADR_SPACE_IPMI + || obj_desc->field.region_obj->region.space_id == + ACPI_ADR_SPACE_PLATFORM_RT)) { /* SMBus, GSBus, IPMI serial */ @@ -295,7 +297,9 @@ acpi_ex_write_data_to_field(union acpi_operand_object *source_desc, || obj_desc->field.region_obj->region.space_id == ACPI_ADR_SPACE_GSBUS || obj_desc->field.region_obj->region.space_id == - ACPI_ADR_SPACE_IPMI)) { + ACPI_ADR_SPACE_IPMI + || obj_desc->field.region_obj->region.space_id == + ACPI_ADR_SPACE_PLATFORM_RT)) { /* SMBus, GSBus, IPMI serial */ 
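The exfield.c change above extends an existing pattern: accesses to SMBus, GSBus, and IPMI operation regions already detour through the buffered "serial bus" path, and PlatformRtMechanism now joins that set. A standalone restatement of the dispatch; the space IDs are stand-in values, not ACPICA's ACPI_ADR_SPACE_* constants:

#include <stdbool.h>
#include <stdio.h>

enum space_id {
    SPACE_SMBUS, SPACE_GSBUS, SPACE_IPMI, SPACE_PLATFORM_RT, SPACE_MEMORY
};

static bool is_serial_style_space(enum space_id id)
{
    switch (id) {
    case SPACE_SMBUS:
    case SPACE_GSBUS:
    case SPACE_IPMI:
    case SPACE_PLATFORM_RT:  /* newly routed through the buffered path */
        return true;
    default:
        return false;
    }
}

int main(void)
{
    printf("%d\n", is_serial_style_space(SPACE_PLATFORM_RT)); /* prints 1 */
    return 0;
}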
diff --git a/drivers/acpi/acpica/exserial.c b/drivers/acpi/acpica/exserial.c index c5aa4b0deb70b488f394bfdb9e1017de40c5615e..fd36aaada6a0add8be0c18b1336638798ed92f76 100644 --- a/drivers/acpi/acpica/exserial.c +++ b/drivers/acpi/acpica/exserial.c @@ -195,6 +195,12 @@ acpi_ex_read_serial_bus(union acpi_operand_object *obj_desc, function = ACPI_READ | (accessor_type << 16); break; + case ACPI_ADR_SPACE_PLATFORM_RT: + + buffer_length = ACPI_PRM_INPUT_BUFFER_SIZE; + function = ACPI_READ; + break; + default: return_ACPI_STATUS(AE_AML_INVALID_SPACE_ID); } @@ -311,6 +317,12 @@ acpi_ex_write_serial_bus(union acpi_operand_object *source_desc, function = ACPI_WRITE | (accessor_type << 16); break; + case ACPI_ADR_SPACE_PLATFORM_RT: + + buffer_length = ACPI_PRM_INPUT_BUFFER_SIZE; + function = ACPI_WRITE; + break; + default: return_ACPI_STATUS(AE_AML_INVALID_SPACE_ID); } diff --git a/drivers/acpi/acpica/utdecode.c b/drivers/acpi/acpica/utdecode.c index 65beaa23766929974d2576d7ef4d025af202fe1a..1a41bd048eedc1080677f62cac332c16e40ab9a6 100644 --- a/drivers/acpi/acpica/utdecode.c +++ b/drivers/acpi/acpica/utdecode.c @@ -78,7 +78,8 @@ const char *acpi_gbl_region_types[ACPI_NUM_PREDEFINED_REGIONS] = { "IPMI", /* 0x07 */ "GeneralPurposeIo", /* 0x08 */ "GenericSerialBus", /* 0x09 */ - "PlatformCommChannel" /* 0x0A */ + "PlatformCommChannel", /* 0x0A */ + "PlatformRtMechanism" /* 0x0B */ }; const char *acpi_ut_get_region_name(u8 space_id) diff --git a/drivers/acpi/acpica/utuuid.c b/drivers/acpi/acpica/utuuid.c index 0a7cf800764313cf508c19e25ac00b5a3c2ac9dd..1ccd9ceac51a0dbeba68c26c4d0ace6649ae6246 100644 --- a/drivers/acpi/acpica/utuuid.c +++ b/drivers/acpi/acpica/utuuid.c @@ -61,4 +61,45 @@ void acpi_ut_convert_string_to_uuid(char *in_string, u8 *uuid_buffer) 1]); } } + +/******************************************************************************* + * + * FUNCTION: acpi_ut_convert_uuid_to_string + * + * PARAMETERS: uuid_buffer - 16-byte UUID buffer + * out_string - 36-byte formatted UUID string + * + * RETURN: Status + * + * DESCRIPTION: Convert 16-byte UUID buffer to 36-byte formatted UUID string + * out_string must be 37 bytes to include null terminator. 
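For orientation, the conversion documented in the utuuid.c header below is plain mixed-endian GUID formatting. Assuming the 16-byte buffer uses the usual ACPI/UEFI GUID layout (little-endian data1/data2/data3, final eight bytes in order), an equivalent standalone sketch is:

#include <stdio.h>

static void uuid_to_string(const unsigned char u[16], char out[37])
{
    snprintf(out, 37,
             "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
             u[3], u[2], u[1], u[0],   /* data1: little-endian u32 */
             u[5], u[4],               /* data2: little-endian u16 */
             u[7], u[6],               /* data3: little-endian u16 */
             u[8], u[9],               /* clock_seq bytes, in order */
             u[10], u[11], u[12], u[13], u[14], u[15]); /* node */
}

int main(void)
{
    const unsigned char u[16] = {
        0x78, 0x56, 0x34, 0x12, 0xbc, 0x9a, 0xf0, 0xde,
        0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77
    };
    char out[37];   /* 36 characters plus the NUL terminator */

    uuid_to_string(u, out);
    printf("%s\n", out); /* 12345678-9abc-def0-0011-223344556677 */
    return 0;
}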
+ * + ******************************************************************************/ + +acpi_status acpi_ut_convert_uuid_to_string(char *uuid_buffer, char *out_string) +{ + u32 i; + + if (!uuid_buffer || !out_string) { + return (AE_BAD_PARAMETER); + } + + for (i = 0; i < UUID_BUFFER_LENGTH; i++) { + out_string[acpi_gbl_map_to_uuid_offset[i]] = + acpi_ut_hex_to_ascii_char(uuid_buffer[i], 4); + + out_string[acpi_gbl_map_to_uuid_offset[i] + 1] = + acpi_ut_hex_to_ascii_char(uuid_buffer[i], 0); + } + + /* Insert required hyphens (dashes) */ + + out_string[UUID_HYPHEN1_OFFSET] = + out_string[UUID_HYPHEN2_OFFSET] = + out_string[UUID_HYPHEN3_OFFSET] = + out_string[UUID_HYPHEN4_OFFSET] = '-'; + + out_string[UUID_STRING_LENGTH] = 0; /* Null terminate */ + return (AE_OK); +} #endif diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index bc95a5eebd137c86ead416813434f9ecd906fe72..e2943cb5c292f3ab49842f6a6396647e60d42978 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -503,7 +503,6 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) node = iort_get_iort_node(dev->fwnode); if (node) return node; - /* * if not, then it should be a platform device defined in * DSDT/SSDT (with Named Component node in IORT) @@ -594,13 +593,13 @@ static int __maybe_unused iort_find_its_base(u32 its_id, phys_addr_t *base) /** * iort_dev_find_its_id() - Find the ITS identifier for a device * @dev: The device. - * @req_id: Device's requester ID + * @id: Device's ID * @idx: Index of the ITS identifier list. * @its_id: ITS identifier. * * Returns: 0 on success, appropriate error value otherwise */ -static int iort_dev_find_its_id(struct device *dev, u32 req_id, +static int iort_dev_find_its_id(struct device *dev, u32 id, unsigned int idx, int *its_id) { struct acpi_iort_its_group *its; @@ -610,7 +609,7 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id, if (!node) return -ENXIO; - node = iort_node_map_id(node, req_id, NULL, IORT_MSI_TYPE); + node = iort_node_map_id(node, id, NULL, IORT_MSI_TYPE); if (!node) return -ENXIO; @@ -633,19 +632,20 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id, * * Returns: the MSI domain for this device, NULL otherwise */ -struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) +struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, + enum irq_domain_bus_token bus_token) { struct fwnode_handle *handle; int its_id; - if (iort_dev_find_its_id(dev, req_id, 0, &its_id)) + if (iort_dev_find_its_id(dev, id, 0, &its_id)) return NULL; handle = iort_find_domain_token(its_id); if (!handle) return NULL; - return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI); + return irq_find_matching_fwnode(handle, bus_token); } static void iort_set_device_domain(struct device *dev, @@ -1062,8 +1062,8 @@ static int rc_dma_get_range(struct device *dev, u64 *size) */ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { - u64 mask, dmaaddr = 0, size = 0, offset = 0; - int ret, msb; + u64 end, mask, dmaaddr = 0, size = 0, offset = 0; + int ret; /* * If @dev is expected to be DMA-capable then the bus code that created @@ -1090,19 +1090,13 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) } if (!ret) { - msb = fls64(dmaaddr + size - 1); - /* - * Round-up to the power-of-two mask or set - * the mask to the whole 64-bit address space - * in case the DMA region covers the full - * memory window. - */ - mask = msb == 64 ? 
U64_MAX : (1ULL << msb) - 1; /* - * Limit coherent and dma mask based on size - * retrieved from firmware. + * Limit coherent and dma mask based on size retrieved from + * firmware. */ - dev->bus_dma_mask = mask; + end = dmaaddr + size - 1; + mask = DMA_BIT_MASK(ilog2(end) + 1); + dev->bus_dma_limit = end; dev->coherent_dma_mask = mask; *dev->dma_mask = mask; } @@ -1261,7 +1255,7 @@ static int __init arm_smmu_v3_set_proximity(struct device *dev, smmu = (struct acpi_iort_smmu_v3 *)node->node_data; if (smmu->flags & ACPI_IORT_SMMU_V3_PXM_VALID) { - int dev_node = acpi_map_pxm_to_node(smmu->pxm); + int dev_node = pxm_to_node(smmu->pxm); if (dev_node != NUMA_NO_NODE && !node_online(dev_node)) return -EINVAL; diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 54002670cb7a15b59ef3ea424e8af6e2c199c3fc..ccfa8d8f4834222a468fa0ec24416030a1005f0d 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "internal.h" @@ -302,6 +303,8 @@ static void acpi_bus_osc_support(void) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; + if (IS_ENABLED(CONFIG_ACPI_PRMT)) + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT; #ifdef CONFIG_X86 if (boot_cpu_has(X86_FEATURE_HWP)) { @@ -1238,6 +1241,7 @@ static int __init acpi_init(void) acpi_kobj = NULL; } + init_prmt(); result = acpi_bus_init(); if (result) { disable_acpi(); diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 9d78f29cf9967f59395af4dae409e26103889e7d..5a6b8ad10eeabcd55fd65a8ccf42fb39b2efd5d1 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2952,8 +2952,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { ndr_desc->numa_node = acpi_map_pxm_to_online_node( spa->proximity_domain); - ndr_desc->target_node = acpi_map_pxm_to_node( - spa->proximity_domain); + ndr_desc->target_node = pxm_to_node(spa->proximity_domain); } else { ndr_desc->numa_node = NUMA_NO_NODE; ndr_desc->target_node = NUMA_NO_NODE; diff --git a/drivers/acpi/hmat/Kconfig b/drivers/acpi/numa/Kconfig similarity index 75% rename from drivers/acpi/hmat/Kconfig rename to drivers/acpi/numa/Kconfig index 95a29964dbeae7a17892d1aad09fcfe4a4ba81e9..fcf2e556d69d21477466f610408f668eb8303442 100644 --- a/drivers/acpi/hmat/Kconfig +++ b/drivers/acpi/numa/Kconfig @@ -1,8 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 +config ACPI_NUMA + bool "NUMA support" + depends on NUMA + depends on (X86 || IA64 || ARM64) + default y if IA64 || ARM64 + config ACPI_HMAT bool "ACPI Heterogeneous Memory Attribute Table Support" depends on ACPI_NUMA select HMEM_REPORTING + select MEMREGION help If set, this option has the kernel parse and report the platform's ACPI HMAT (Heterogeneous Memory Attributes Table), diff --git a/drivers/acpi/numa/Makefile b/drivers/acpi/numa/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..517a6c689a947ca006c1d97525646bb9434f36aa --- /dev/null +++ b/drivers/acpi/numa/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ACPI_NUMA) += srat.o +obj-$(CONFIG_ACPI_HMAT) += hmat.o diff --git a/drivers/acpi/hmat/hmat.c b/drivers/acpi/numa/hmat.c similarity index 82% rename from drivers/acpi/hmat/hmat.c rename to drivers/acpi/numa/hmat.c index 0f1c939b7e9010dce680922868b0e9f6891f9cf7..63b04721dafa8f659044d661e402560c3e8a13a8 100644 --- a/drivers/acpi/hmat/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -8,12 +8,18 @@ * the 
applicable attributes with the node's interfaces. */ +#define pr_fmt(fmt) "acpi/hmat: " fmt +#define dev_fmt(fmt) "acpi/hmat: " fmt + #include #include #include #include #include +#include +#include #include +#include #include #include #include @@ -49,6 +55,7 @@ struct memory_target { struct list_head node; unsigned int memory_pxm; unsigned int processor_pxm; + struct resource memregions; struct node_hmem_attrs hmem_attrs; struct list_head caches; struct node_cache_attrs cache_attrs; @@ -104,22 +111,36 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm) list_add_tail(&initiator->node, &initiators); } -static __init void alloc_memory_target(unsigned int mem_pxm) +static __init void alloc_memory_target(unsigned int mem_pxm, + resource_size_t start, resource_size_t len) { struct memory_target *target; target = find_mem_target(mem_pxm); - if (target) - return; - - target = kzalloc(sizeof(*target), GFP_KERNEL); - if (!target) - return; + if (!target) { + target = kzalloc(sizeof(*target), GFP_KERNEL); + if (!target) + return; + target->memory_pxm = mem_pxm; + target->processor_pxm = PXM_INVAL; + target->memregions = (struct resource) { + .name = "ACPI mem", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, + }; + list_add_tail(&target->node, &targets); + INIT_LIST_HEAD(&target->caches); + } - target->memory_pxm = mem_pxm; - target->processor_pxm = PXM_INVAL; - list_add_tail(&target->node, &targets); - INIT_LIST_HEAD(&target->caches); + /* + * There are potentially multiple ranges per PXM, so record each + * in the per-target memregions resource tree. + */ + if (!__request_region(&target->memregions, start, len, "memory target", + IORESOURCE_MEM)) + pr_warn("failed to reserve %#llx - %#llx in pxm: %d\n", + start, start + len, mem_pxm); } static __init const char *hmat_data_type(u8 type) @@ -272,7 +293,7 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, u8 type, mem_hier; if (hmat_loc->header.length < sizeof(*hmat_loc)) { - pr_notice("HMAT: Unexpected locality header length: %d\n", + pr_notice("HMAT: Unexpected locality header length: %u\n", hmat_loc->header.length); return -EINVAL; } @@ -284,12 +305,12 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, total_size = sizeof(*hmat_loc) + sizeof(*entries) * ipds * tpds + sizeof(*inits) * ipds + sizeof(*targs) * tpds; if (hmat_loc->header.length < total_size) { - pr_notice("HMAT: Unexpected locality header length:%d, minimum required:%d\n", + pr_notice("HMAT: Unexpected locality header length:%u, minimum required:%u\n", hmat_loc->header.length, total_size); return -EINVAL; } - pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%d Target Domains:%d Base:%lld\n", + pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%u Target Domains:%u Base:%lld\n", hmat_loc->flags, hmat_data_type(type), ipds, tpds, hmat_loc->entry_base_unit); @@ -302,7 +323,7 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, value = hmat_normalize(entries[init * tpds + targ], hmat_loc->entry_base_unit, type); - pr_info(" Initiator-Target[%d-%d]:%d%s\n", + pr_info(" Initiator-Target[%u-%u]:%u%s\n", inits[init], targs[targ], value, hmat_data_type_suffix(type)); @@ -329,13 +350,13 @@ static __init int hmat_parse_cache(union acpi_subtable_headers *header, u32 attrs; if (cache->header.length < sizeof(*cache)) { - pr_notice("HMAT: Unexpected cache header length: %d\n", + pr_notice("HMAT: Unexpected cache header length: %u\n", cache->header.length); return -EINVAL; } 
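The reworked alloc_memory_target() above records every SRAT range for a proximity domain instead of only the first, keeping them as children of a per-target resource tree. A toy standalone illustration of the bookkeeping, where a linked list stands in for the kernel's sorted resource tree:

#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long long start, end; struct range *sibling; };
struct target { int pxm; struct range *child; };

static void add_range(struct target *t, unsigned long long start,
                      unsigned long long len)
{
    struct range *r = malloc(sizeof(*r));

    if (!r)
        return;
    r->start = start;
    r->end = start + len - 1;
    r->sibling = t->child;   /* the kernel keeps a proper resource tree */
    t->child = r;
}

int main(void)
{
    struct target t = { .pxm = 2, .child = NULL };
    struct range *r;

    add_range(&t, 0x100000000ULL, 0x80000000ULL);
    add_range(&t, 0x200000000ULL, 0x80000000ULL);
    for (r = t.child; r; r = r->sibling)
        printf("pxm %d: %#llx-%#llx\n", t.pxm, r->start, r->end);
    return 0;
}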
attrs = cache->cache_attributes; - pr_info("HMAT: Cache: Domain:%d Size:%llu Attrs:%08x SMBIOS Handles:%d\n", + pr_info("HMAT: Cache: Domain:%u Size:%llu Attrs:%08x SMBIOS Handles:%d\n", cache->memory_PD, cache->cache_size, attrs, cache->number_of_SMBIOShandles); @@ -390,17 +411,17 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade struct memory_target *target = NULL; if (p->header.length != sizeof(*p)) { - pr_notice("HMAT: Unexpected address range header length: %d\n", + pr_notice("HMAT: Unexpected address range header length: %u\n", p->header.length); return -EINVAL; } if (hmat_revision == 1) - pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%d Memory Domain:%d\n", + pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%d Memory Domain:%u\n", p->reserved3, p->reserved4, p->flags, p->processor_PD, p->memory_PD); else - pr_info("HMAT: Memory Flags:%04x Processor Domain:%d Memory Domain:%d\n", + pr_info("HMAT: Memory Flags:%04x Processor Domain:%d Memory Domain:%u\n", p->flags, p->processor_PD, p->memory_PD); if ((hmat_revision == 1 && p->flags & ACPI_HMAT_MEMORY_PD_VALID) || @@ -418,7 +439,7 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade pr_debug("HMAT: Invalid Processor Domain\n"); return -EINVAL; } - target->processor_pxm = p_node; + target->processor_pxm = p->processor_PD; } return 0; @@ -453,7 +474,7 @@ static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header, return -EINVAL; if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) return 0; - alloc_memory_target(ma->proximity_domain); + alloc_memory_target(ma->proximity_domain, ma->base_address, ma->length); return 0; } @@ -614,10 +635,91 @@ static void hmat_register_target_perf(struct memory_target *target) node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); } +static void hmat_register_target_device(struct memory_target *target, + struct resource *r) +{ + /* define a clean / non-busy resource for the platform device */ + struct resource res = { + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, + }; + struct platform_device *pdev; + struct memregion_info info; + int rc, id; + + rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + if (rc != REGION_INTERSECTS) + return; + + id = memregion_alloc(GFP_KERNEL); + if (id < 0) { + pr_err("memregion allocation failure for %pr\n", &res); + return; + } + + pdev = platform_device_alloc("hmem", id); + if (!pdev) { + pr_err("hmem device allocation failure for %pr\n", &res); + goto out_pdev; + } + + pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm); + info = (struct memregion_info) { + .target_node = acpi_map_pxm_to_node(target->memory_pxm), + }; + rc = platform_device_add_data(pdev, &info, sizeof(info)); + if (rc < 0) { + pr_err("hmem memregion_info allocation failure for %pr\n", &res); + goto out_pdev; + } + + rc = platform_device_add_resources(pdev, &res, 1); + if (rc < 0) { + pr_err("hmem resource allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "device add failed for %pr\n", &res); + goto out_resource; + } + + return; + +out_resource: + put_device(&pdev->dev); +out_pdev: + memregion_free(id); +} + +static void hmat_register_target_devices(struct memory_target *target) +{ + struct resource *res; + + /* + * Do not bother creating devices if no driver is available to + * consume them. 
+ */ + if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) + return; + + for (res = target->memregions.child; res; res = res->sibling) + hmat_register_target_device(target, res); +} + static void hmat_register_target(struct memory_target *target) { int nid = pxm_to_node(target->memory_pxm); + /* + * Devices may belong to either an offline or online + * node, so unconditionally add them. + */ + hmat_register_target_devices(target); + /* * Skip offline nodes. This can happen when memory * marked EFI_MEMORY_SP, "specific purpose", is applied @@ -678,11 +780,21 @@ static __init void hmat_free_structures(void) struct target_cache *tcache, *cnext; list_for_each_entry_safe(target, tnext, &targets, node) { + struct resource *res, *res_next; + list_for_each_entry_safe(tcache, cnext, &target->caches, node) { list_del(&tcache->node); kfree(tcache); } + list_del(&target->node); + res = target->memregions.child; + while (res) { + res_next = res->sibling; + __release_region(&target->memregions, res->start, + resource_size(res)); + res = res_next; + } kfree(target); } @@ -749,4 +861,4 @@ static __init int hmat_init(void) acpi_put_table(tbl); return 0; } -subsys_initcall(hmat_init); +device_initcall(hmat_init); diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa/srat.c similarity index 100% rename from drivers/acpi/numa.c rename to drivers/acpi/numa/srat.c diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index d1e666ef3fcc886eb48772419aa891b78b039bc6..ddc8baa34864e7dfb0b8be2f5df6bb0ddd704fb1 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -131,6 +131,7 @@ static struct pci_osc_bit_struct pci_osc_support_bit[] = { { OSC_PCI_CLOCK_PM_SUPPORT, "ClockPM" }, { OSC_PCI_SEGMENT_GROUPS_SUPPORT, "Segments" }, { OSC_PCI_MSI_SUPPORT, "MSI" }, + { OSC_PCI_EDR_SUPPORT, "EDR" }, { OSC_PCI_HPX_TYPE_3_SUPPORT, "HPX-Type3" }, }; @@ -141,6 +142,7 @@ static struct pci_osc_bit_struct pci_osc_control_bit[] = { { OSC_PCI_EXPRESS_AER_CONTROL, "AER" }, { OSC_PCI_EXPRESS_CAPABILITY_CONTROL, "PCIeCapability" }, { OSC_PCI_EXPRESS_LTR_CONTROL, "LTR" }, + { OSC_PCI_EXPRESS_DPC_CONTROL, "DPC" }, }; static void decode_osc_bits(struct acpi_pci_root *root, char *msg, u32 word, @@ -440,6 +442,8 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, support |= OSC_PCI_ASPM_SUPPORT | OSC_PCI_CLOCK_PM_SUPPORT; if (pci_msi_enabled()) support |= OSC_PCI_MSI_SUPPORT; + if (IS_ENABLED(CONFIG_PCIE_EDR)) + support |= OSC_PCI_EDR_SUPPORT; decode_osc_support(root, "OS supports", support); status = acpi_pci_osc_support(root, support); @@ -479,13 +483,17 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, if (IS_ENABLED(CONFIG_HOTPLUG_PCI_SHPC)) control |= OSC_PCI_SHPC_NATIVE_HP_CONTROL; - if (pci_aer_available()) { - if (aer_acpi_firmware_first()) - dev_info(&device->dev, - "PCIe AER handled by firmware\n"); - else - control |= OSC_PCI_EXPRESS_AER_CONTROL; - } + if (pci_aer_available()) + control |= OSC_PCI_EXPRESS_AER_CONTROL; + + /* + * Per the Downstream Port Containment Related Enhancements ECN to + * the PCI Firmware Spec, r3.2, sec 4.5.1, table 4-5, + * OSC_PCI_EXPRESS_DPC_CONTROL indicates the OS supports both DPC + * and EDR. 
+ */ + if (IS_ENABLED(CONFIG_PCIE_DPC) && IS_ENABLED(CONFIG_PCIE_EDR)) + control |= OSC_PCI_EXPRESS_DPC_CONTROL; requested = control; status = acpi_pci_osc_control_set(handle, &control, @@ -916,6 +924,8 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, host_bridge->native_pme = 0; if (!(root->osc_control_set & OSC_PCI_EXPRESS_LTR_CONTROL)) host_bridge->native_ltr = 0; + if (!(root->osc_control_set & OSC_PCI_EXPRESS_DPC_CONTROL)) + host_bridge->native_dpc = 0; /* * Evaluate the "PCI Boot Configuration" _DSM Function. If it @@ -923,7 +933,7 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, * assignments made by firmware for this host bridge. */ obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 1, - IGNORE_PCI_BOOT_CONFIG_DSM, NULL); + DSM_PCI_PRESERVE_BOOT_CONFIG, NULL); if (obj && obj->type == ACPI_TYPE_INTEGER && obj->integer.value == 0) host_bridge->preserve_config = 1; ACPI_FREE(obj); diff --git a/drivers/acpi/pfru/Kconfig b/drivers/acpi/pfru/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..e2934058884ebf893000dc2841151cc8b8e7c4c4 --- /dev/null +++ b/drivers/acpi/pfru/Kconfig @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0 +config ACPI_PFRU + tristate "ACPI Platform Firmware Runtime Update (PFRU)" + depends on 64BIT + help + In order to reduce the system reboot times and update the platform firmware + in time, Platform Firmware Runtime Update is leveraged to patch the system + without reboot. This driver supports Platform Firmware Runtime Update, + which is composed of two parts: code injection and driver update. + + For more information, see: + + + To compile this driver as a module, choose M here: + the module will be called pfru_update. + +config ACPI_PFRU_TELEMETRY + tristate "ACPI Platform Firmware Runtime Update Telemetry Service" + depends on ACPI_PFRU + help + PFRU(Platform Firmware Runtime Update) Telemetry Service is part of + RoT(Root of Trust), which allows Platform Firmware Runtime Update handler + and other PFRU drivers to produce telemetry data to upper layer OS consumer + at runtime. + + For more information, see: + + + If unsure, please say N. 
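For context on how the telemetry service added below is meant to be consumed: the driver registers a misc device with nodename "pfru/telemetry", so with devtmpfs the log should surface at /dev/pfru/telemetry and be readable like a regular file. A hypothetical user-space consumer; the path and buffer size are assumptions, and the real log size comes from the PFRU_LOG_IOC_GET_DATA_INFO ioctl:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[4096];   /* real size via PFRU_LOG_IOC_GET_DATA_INFO */
    ssize_t n;
    int fd = open("/dev/pfru/telemetry", O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    n = read(fd, buf, sizeof(buf)); /* driver memremaps the MM log chunk */
    if (n > 0)
        fwrite(buf, 1, (size_t)n, stdout);
    close(fd);
    return 0;
}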
diff --git a/drivers/acpi/pfru/Makefile b/drivers/acpi/pfru/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..30060ba320ea5e84b3c5f4b0fd334ce3ca56fda4 --- /dev/null +++ b/drivers/acpi/pfru/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ACPI_PFRU) += pfru_update.o +obj-$(CONFIG_ACPI_PFRU_TELEMETRY) += pfru_telemetry.o diff --git a/drivers/acpi/pfru/pfru_telemetry.c b/drivers/acpi/pfru/pfru_telemetry.c new file mode 100644 index 0000000000000000000000000000000000000000..5b80eee891c5e9e42a9efbd910c2e1da15fecdfc --- /dev/null +++ b/drivers/acpi/pfru/pfru_telemetry.c @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACPI Platform Firmware Runtime Update + * Telemetry Service Device Driver + * + * Copyright (C) 2021 Intel Corporation + * Author: Chen Yu + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pfru_telem_device { + struct device *dev; + guid_t uuid; + struct telem_info info; +}; + +static struct pfru_telem_device telem_dev; +static struct pfru_telem_device *get_pfru_telem_dev(void) +{ + return &telem_dev; +} + +static int get_pfru_data_info(struct telem_data_info *data_info, + int log_type) +{ + struct pfru_telem_device *pf_telem_dev; + union acpi_object *out_obj, in_obj, in_buf; + acpi_handle handle; + int i, ret = -EINVAL; + + pf_telem_dev = get_pfru_telem_dev(); + handle = ACPI_HANDLE(pf_telem_dev->dev); + + memset(data_info, 0, sizeof(*data_info)); + memset(&in_obj, 0, sizeof(in_obj)); + memset(&in_buf, 0, sizeof(in_buf)); + in_obj.type = ACPI_TYPE_PACKAGE; + in_obj.package.count = 1; + in_obj.package.elements = &in_buf; + in_buf.type = ACPI_TYPE_INTEGER; + in_buf.integer.value = log_type; + + out_obj = acpi_evaluate_dsm_typed(handle, &pf_telem_dev->uuid, + pf_telem_dev->info.log_revid, FUNC_GET_DATA, + &in_obj, ACPI_TYPE_PACKAGE); + if (!out_obj) { + pr_err("Failed to invoke _DSM\n"); + return -EINVAL; + } + + for (i = 0; i < out_obj->package.count; i++) { + union acpi_object *obj = &out_obj->package.elements[i]; + + switch (i) { + case 0: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->status = obj->integer.value; + break; + case 1: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->ext_status = obj->integer.value; + break; + case 2: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->max_data_size = obj->integer.value; + break; + case 3: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk1_addr_lo = obj->integer.value; + break; + case 4: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk1_addr_hi = obj->integer.value; + break; + case 5: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk1_size = obj->integer.value; + break; + case 6: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk2_addr_lo = obj->integer.value; + break; + case 7: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk2_addr_hi = obj->integer.value; + break; + case 8: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->chunk2_size = obj->integer.value; + break; + case 9: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->rollover_cnt = obj->integer.value; + break; + case 10: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + data_info->reset_cnt = obj->integer.value; + 
break; + } + } + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int set_pfru_log_level(int level) +{ + union acpi_object *out_obj, *obj, in_obj, in_buf; + struct pfru_telem_device *pf_telem_dev; + enum dsm_status status; + acpi_handle handle; + int ret = -EINVAL; + + pf_telem_dev = get_pfru_telem_dev(); + handle = ACPI_HANDLE(pf_telem_dev->dev); + + memset(&in_obj, 0, sizeof(in_obj)); + memset(&in_buf, 0, sizeof(in_buf)); + in_obj.type = ACPI_TYPE_PACKAGE; + in_obj.package.count = 1; + in_obj.package.elements = &in_buf; + in_buf.type = ACPI_TYPE_INTEGER; + in_buf.integer.value = level; + + out_obj = acpi_evaluate_dsm_typed(handle, &pf_telem_dev->uuid, + pf_telem_dev->info.log_revid, FUNC_SET_LEV, + &in_obj, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + obj = &out_obj->package.elements[0]; + status = obj->integer.value; + if (status) { + pr_err("get MM telemetry level error status %d\n", + status); + goto free_acpi_buffer; + } + + obj = &out_obj->package.elements[1]; + status = obj->integer.value; + if (status) { + pr_err("get MM telemetry level error extend status %d\n", + status); + goto free_acpi_buffer; + } + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int get_pfru_log_level(int *level) +{ + struct pfru_telem_device *pf_telem_dev; + union acpi_object *out_obj, *obj; + enum dsm_status status; + acpi_handle handle; + int ret = -EINVAL; + + pf_telem_dev = get_pfru_telem_dev(); + handle = ACPI_HANDLE(pf_telem_dev->dev); + out_obj = acpi_evaluate_dsm_typed(handle, &pf_telem_dev->uuid, + pf_telem_dev->info.log_revid, FUNC_GET_LEV, + NULL, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + obj = &out_obj->package.elements[0]; + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + status = obj->integer.value; + if (status) { + pr_err("get MM telemetry level error status %d\n", + status); + goto free_acpi_buffer; + } + + obj = &out_obj->package.elements[1]; + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + status = obj->integer.value; + if (status) { + pr_err("get MM telemetry level error status %d\n", + status); + goto free_acpi_buffer; + } + + obj = &out_obj->package.elements[2]; + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + *level = obj->integer.value; + + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int valid_log_level(int level) +{ + return (level == LOG_ERR) || (level == LOG_WARN) || + (level == LOG_INFO) || (level == LOG_VERB); +} + +static int valid_log_type(int type) +{ + return (type == LOG_EXEC_IDX) || (type == LOG_HISTORY_IDX); +} + +static long pfru_telemetry_ioctl(struct file *file, unsigned int cmd, unsigned long arg) + +{ + struct pfru_telem_device *pf_telem_dev; + struct telem_data_info data_info; + struct telem_info info; + void __user *p; + int ret = 0; + + pf_telem_dev = get_pfru_telem_dev(); + p = (void __user *)arg; + + switch (cmd) { + case PFRU_LOG_IOC_SET_INFO: + if (copy_from_user(&info, p, sizeof(info))) + return -EFAULT; + if (valid_revid(info.log_revid)) + pf_telem_dev->info.log_revid = info.log_revid; + + if (valid_log_level(info.log_level)) { + ret = set_pfru_log_level(info.log_level); + if (ret) + return ret; + pf_telem_dev->info.log_level = info.log_level; + } + if (valid_log_type(info.log_type)) + pf_telem_dev->info.log_type = info.log_type; + break; + case PFRU_LOG_IOC_GET_INFO: + ret = get_pfru_log_level(&info.log_level); + if (ret) + return ret; + info.log_type = pf_telem_dev->info.log_type; 
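The helpers above all follow one parsing pattern: a _DSM call returns an ACPI package whose positional elements are type-checked one by one before being copied out. A toy standalone analogue of that walk; the types and field set are simplified stand-ins:

#include <stdint.h>
#include <stdio.h>

enum obj_type { OBJ_INT, OBJ_BUF };
struct obj { enum obj_type type; uint64_t value; };
struct data_info { uint64_t status, ext_status, max_data_size; };

/* Positional, type-checked unpacking, element by element. */
static int parse_package(const struct obj *pkg, int count,
                         struct data_info *out)
{
    uint64_t *fields[] = { &out->status, &out->ext_status,
                           &out->max_data_size };
    int i;

    for (i = 0; i < count && i < 3; i++) {
        if (pkg[i].type != OBJ_INT)
            return -1;   /* mirrors the goto free_acpi_buffer path */
        *fields[i] = pkg[i].value;
    }
    return 0;
}

int main(void)
{
    struct obj pkg[] = { { OBJ_INT, 0 }, { OBJ_INT, 0 }, { OBJ_INT, 4096 } };
    struct data_info info;

    if (!parse_package(pkg, 3, &info))
        printf("max_data_size=%llu\n",
               (unsigned long long)info.max_data_size);
    return 0;
}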
+ info.log_revid = pf_telem_dev->info.log_revid; + if (copy_to_user(p, &info, sizeof(info))) + ret = -EFAULT; + break; + case PFRU_LOG_IOC_GET_DATA_INFO: + ret = get_pfru_data_info(&data_info, pf_telem_dev->info.log_type); + if (ret) + return ret; + if (copy_to_user(p, &data_info, sizeof(struct telem_data_info))) + ret = -EFAULT; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +static ssize_t pfru_telemetry_read(struct file *filp, char __user *ubuf, + size_t size, loff_t *off) +{ + struct pfru_telem_device *pf_telem_dev; + struct telem_data_info info; + phys_addr_t base_addr; + int buf_size, ret; + char *buf_ptr; + + if (*off < 0) + return -EINVAL; + + pf_telem_dev = get_pfru_telem_dev(); + + ret = get_pfru_data_info(&info, pf_telem_dev->info.log_type); + if (ret) { + pr_err("Could not get telemetry data info %d\n", ret); + return ret; + } + + base_addr = (phys_addr_t)(info.chunk2_addr_lo | + (info.chunk2_addr_hi << 32)); + if (!base_addr) { + pr_err("Telemetry data not ready\n"); + return -EBUSY; + } + + buf_size = info.max_data_size; + if (*off >= buf_size) + return 0; + + buf_ptr = memremap(base_addr, buf_size, MEMREMAP_WB); + if (IS_ERR(buf_ptr)) + return PTR_ERR(buf_ptr); + + size = min_t(size_t, size, buf_size - *off); + + ret = -EFAULT; + if (copy_to_user(ubuf, buf_ptr + *off, size)) + goto out; + ret = 0; +out: + memunmap(buf_ptr); + + return ret ? ret : size; +} + +#ifdef CONFIG_COMPAT +static long compat_pfru_telemetry_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return pfru_telemetry_ioctl(filep, cmd, arg); +} +#endif + +static const struct file_operations acpi_pfru_telemetry_fops = { + .owner = THIS_MODULE, + .read = pfru_telemetry_read, + .unlocked_ioctl = pfru_telemetry_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_pfru_telemetry_ioctl, +#endif + .llseek = noop_llseek, +}; + +static struct miscdevice pfru_telemetry_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "pfru_telemetry", + .nodename = "pfru/telemetry", + .fops = &acpi_pfru_telemetry_fops, +}; + +static int acpi_pfru_telemetry_remove(struct platform_device *pdev) +{ + misc_deregister(&pfru_telemetry_misc_dev); + + return 0; +} + +static int acpi_pfru_telemetry_probe(struct platform_device *pdev) +{ + struct pfru_telem_device *pf_telem_dev; + acpi_handle handle; + int ret; + + pf_telem_dev = get_pfru_telem_dev(); + + ret = guid_parse(PFRU_TELEMETRY_UUID, &pf_telem_dev->uuid); + if (ret) + return ret; + + pf_telem_dev->info.log_revid = 1; + pf_telem_dev->dev = &pdev->dev; + handle = ACPI_HANDLE(pf_telem_dev->dev); + if (!acpi_has_method(handle, "_DSM")) { + pr_err("Missing _DSM\n"); + return -ENODEV; + } + + ret = misc_register(&pfru_telemetry_misc_dev); + if (ret) + return ret; + + return 0; +} + +static const struct acpi_device_id acpi_pfru_telemetry_ids[] = { + {"INTC1081", 0}, + {} +}; +MODULE_DEVICE_TABLE(acpi, acpi_pfru_telemetry_ids); + +static struct platform_driver acpi_pfru_telemetry_driver = { + .driver = { + .name = "pfru_telemetry", + .acpi_match_table = acpi_pfru_telemetry_ids, + }, + .probe = acpi_pfru_telemetry_probe, + .remove = acpi_pfru_telemetry_remove, +}; +module_platform_driver(acpi_pfru_telemetry_driver); + +MODULE_DESCRIPTION("Platform Firmware Runtime Update Telemetry Service device driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/acpi/pfru/pfru_update.c b/drivers/acpi/pfru/pfru_update.c new file mode 100644 index 0000000000000000000000000000000000000000..ac6ac0152e4d4c64e081e0ea0a0f733f4629a1d8 --- /dev/null +++ 
b/drivers/acpi/pfru/pfru_update.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACPI Platform Firmware Runtime Update Device Driver + * + * Copyright (C) 2021 Intel Corporation + * Author: Chen Yu + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pfru_device { + guid_t uuid; + int rev_id; + struct device *dev; +}; + +static struct pfru_device pfru_dev; +static struct pfru_device *get_pfru_dev(void) +{ + return &pfru_dev; +} + +static int query_capability(struct update_cap_info *cap) +{ + union acpi_object *out_obj; + struct pfru_device *pf_dev; + acpi_handle handle; + int i, ret = -EINVAL; + + pf_dev = get_pfru_dev(); + handle = ACPI_HANDLE(pf_dev->dev); + + out_obj = acpi_evaluate_dsm_typed(handle, &pf_dev->uuid, + pf_dev->rev_id, FUNC_QUERY_UPDATE_CAP, + NULL, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + for (i = 0; i < out_obj->package.count; i++) { + union acpi_object *obj = &out_obj->package.elements[i]; + + switch (i) { + case 0: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->status = obj->integer.value; + break; + case 1: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->update_cap = obj->integer.value; + break; + case 2: + if (obj->type != ACPI_TYPE_BUFFER) + goto free_acpi_buffer; + memcpy(&cap->code_type, obj->buffer.pointer, + obj->buffer.length); + break; + case 3: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->fw_version = obj->integer.value; + break; + case 4: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->code_rt_version = obj->integer.value; + break; + case 5: + if (obj->type != ACPI_TYPE_BUFFER) + goto free_acpi_buffer; + memcpy(&cap->drv_type, obj->buffer.pointer, + obj->buffer.length); + break; + case 6: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->drv_rt_version = obj->integer.value; + break; + case 7: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + cap->drv_svn = obj->integer.value; + break; + case 8: + if (obj->type != ACPI_TYPE_BUFFER) + goto free_acpi_buffer; + memcpy(&cap->platform_id, obj->buffer.pointer, + obj->buffer.length); + break; + case 9: + if (obj->type != ACPI_TYPE_BUFFER) + goto free_acpi_buffer; + memcpy(&cap->oem_id, obj->buffer.pointer, + obj->buffer.length); + break; + } + } + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int query_buffer(struct com_buf_info *info) +{ + union acpi_object *out_obj; + struct pfru_device *pf_dev; + acpi_handle handle; + int i, ret = -EINVAL; + + pf_dev = get_pfru_dev(); + handle = ACPI_HANDLE(pf_dev->dev); + + out_obj = acpi_evaluate_dsm_typed(handle, &pf_dev->uuid, + pf_dev->rev_id, FUNC_QUERY_BUF, + NULL, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + for (i = 0; i < out_obj->package.count; i++) { + union acpi_object *obj = &out_obj->package.elements[i]; + + switch (i) { + case 0: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->status = obj->integer.value; + break; + case 1: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->ext_status = obj->integer.value; + break; + case 2: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->addr_lo = obj->integer.value; + break; + case 3: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + info->addr_hi = obj->integer.value; + break; + case 4: + if (obj->type != ACPI_TYPE_INTEGER) + goto 
free_acpi_buffer; + info->buf_size = obj->integer.value; + break; + } + } + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static int get_image_type(efi_manage_capsule_image_header_t *img_hdr, + int *type) +{ + int ret; + guid_t code_inj_id, drv_update_id, *image_type_id; + + ret = guid_parse(PFRU_CODE_INJ_UUID, &code_inj_id); + if (ret) + return ret; + ret = guid_parse(PFRU_DRV_UPDATE_UUID, &drv_update_id); + if (ret) + return ret; + /* check whether this is a code injection or driver update */ + image_type_id = &img_hdr->image_type_id; + if (guid_equal(image_type_id, &code_inj_id)) + *type = CODE_INJECT_TYPE; + else if (guid_equal(image_type_id, &drv_update_id)) + *type = DRIVER_UPDATE_TYPE; + else + return -EINVAL; + + return 0; +} + +/* + * The (u64 hw_ins) was introduced in UEFI spec version 2, + * and (u64 capsule_support) was introduced in version 3. + * The size needs to be adjusted accordingly. + */ +static int adjust_efi_size(efi_manage_capsule_image_header_t *img_hdr, + int *size) +{ + int tmp_size = *size; + + tmp_size += sizeof(efi_manage_capsule_image_header_t); + switch (img_hdr->ver) { + case 1: + tmp_size -= 2 * sizeof(u64); + break; + case 2: + tmp_size -= sizeof(u64); + break; + default: + /* only support version 1 and 2 */ + return -EINVAL; + } + *size = tmp_size; + + return 0; +} + +/* + * Sanity check if the capsule image has a newer version than current one. + * Return: true if it is valid, false otherwise. + */ +static bool valid_version(const void *data, struct update_cap_info *cap) +{ + struct payload_hdr *payload_hdr; + efi_capsule_header_t *cap_hdr; + efi_manage_capsule_header_t *m_hdr; + efi_manage_capsule_image_header_t *m_img_hdr; + efi_image_auth_t *auth; + int type, size, ret; + + cap_hdr = (efi_capsule_header_t *)data; + size = cap_hdr->headersize; + m_hdr = (efi_manage_capsule_header_t *)(data + size); + /* + * Current data structure size plus variable array indicated + * by number of (emb_drv_cnt + payload_cnt) + */ + size += sizeof(efi_manage_capsule_header_t) + + (m_hdr->emb_drv_cnt + m_hdr->payload_cnt) * sizeof(u64); + m_img_hdr = (efi_manage_capsule_image_header_t *)(data + size); + + ret = get_image_type(m_img_hdr, &type); + if (ret) + return false; + + ret = adjust_efi_size(m_img_hdr, &size); + if (ret) + return false; + + auth = (efi_image_auth_t *)(data + size); + size += sizeof(u64) + auth->auth_info.hdr.len; + payload_hdr = (struct payload_hdr *)(data + size); + + /* Finally, compare the version. 
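The size adjustment above encodes how the management-capsule image header grew across UEFI revisions: v2 added the u64 hw_ins field and v3 added u64 capsule_support, so older capsules advance the parse cursor by less. A standalone restatement; the v3 header size is a stand-in constant, not the real sizeof:

#include <stdint.h>
#include <stdio.h>

#define IMG_HDR_V3_SIZE 64 /* stand-in for sizeof(efi_manage_capsule_image_header_t) */

static int effective_img_hdr_size(uint32_t ver, int *size)
{
    int sz = IMG_HDR_V3_SIZE;

    switch (ver) {
    case 1:
        sz -= 2 * sizeof(uint64_t); /* neither hw_ins nor capsule_support */
        break;
    case 2:
        sz -= sizeof(uint64_t);     /* hw_ins present, capsule_support not */
        break;
    default:
        return -1;                  /* only v1 and v2 capsules are handled */
    }
    *size += sz;
    return 0;
}

int main(void)
{
    int size = 0;

    if (!effective_img_hdr_size(1, &size))
        printf("v1 image header advances the cursor by %d bytes\n", size);
    return 0;
}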
*/ + if (type == CODE_INJECT_TYPE) + return payload_hdr->rt_ver >= cap->code_rt_version; + else + return payload_hdr->rt_ver >= cap->drv_rt_version; +} + +static void parse_update_result(struct updated_result *result) +{ + pr_debug("Update result:\n"); + pr_debug("Status:%d\n", result->status); + pr_debug("Extended Status:%d\n", result->ext_status); + pr_debug("Authentication Time Low:%ld\n", result->low_auth_time); + pr_debug("Authentication Time High:%ld\n", result->high_auth_time); + pr_debug("Execution Time Low:%ld\n", result->low_exec_time); + pr_debug("Execution Time High:%ld\n", result->high_exec_time); +} + +static int start_acpi_update(int action) +{ + union acpi_object *out_obj, in_obj, in_buf; + struct updated_result update_result; + acpi_handle handle; + struct pfru_device *pf_dev; + int i, ret = -EINVAL; + + pf_dev = get_pfru_dev(); + memset(&in_obj, 0, sizeof(in_obj)); + memset(&in_buf, 0, sizeof(in_buf)); + in_obj.type = ACPI_TYPE_PACKAGE; + in_obj.package.count = 1; + in_obj.package.elements = &in_buf; + in_buf.type = ACPI_TYPE_INTEGER; + in_buf.integer.value = action; + + handle = ACPI_HANDLE(pf_dev->dev); + out_obj = acpi_evaluate_dsm_typed(handle, &pf_dev->uuid, + pf_dev->rev_id, FUNC_START, + &in_obj, ACPI_TYPE_PACKAGE); + if (!out_obj) + return -EINVAL; + + for (i = 0; i < out_obj->package.count; i++) { + union acpi_object *obj = &out_obj->package.elements[i]; + + switch (i) { + case 0: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.status = obj->integer.value; + break; + case 1: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.ext_status = obj->integer.value; + break; + case 2: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.low_auth_time = obj->integer.value; + break; + case 3: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.high_auth_time = obj->integer.value; + break; + case 4: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.low_exec_time = obj->integer.value; + break; + case 5: + if (obj->type != ACPI_TYPE_INTEGER) + goto free_acpi_buffer; + update_result.high_exec_time = obj->integer.value; + break; + } + } + parse_update_result(&update_result); + ret = 0; + +free_acpi_buffer: + ACPI_FREE(out_obj); + + return ret; +} + +static long pfru_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *p; + int ret = 0, rev; + struct pfru_device *pf_dev; + + pf_dev = get_pfru_dev(); + + p = (void __user *)arg; + + switch (cmd) { + case PFRU_IOC_SET_REV: + if (copy_from_user(&rev, p, sizeof(unsigned int))) + return -EFAULT; + if (!valid_revid(rev)) + return -EFAULT; + pf_dev->rev_id = rev; + break; + case PFRU_IOC_STAGE: + ret = start_acpi_update(START_STAGE); + if (ret) + return ret; + break; + case PFRU_IOC_ACTIVATE: + ret = start_acpi_update(START_ACTIVATE); + if (ret) + return ret; + break; + case PFRU_IOC_STAGE_ACTIVATE: + ret = start_acpi_update(START_STAGE_ACTIVATE); + if (ret) + return ret; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static long compat_pfru_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return pfru_ioctl(filep, cmd, arg); +} +#endif + +static int pfru_open(struct inode *inode, struct file *file) +{ + return capable(CAP_SYS_RAWIO) ? 
stream_open(inode, file) : -EPERM; +} + +static ssize_t pfru_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct update_cap_info cap; + struct com_buf_info info; + phys_addr_t phy_addr; + struct iov_iter iter; + struct iovec iov; + char *buf_ptr; + int ret; + + ret = query_buffer(&info); + if (ret) + return ret; + + if (len > info.buf_size) + return -EINVAL; + + iov.iov_base = (void __user *)buf; + iov.iov_len = len; + iov_iter_init(&iter, WRITE, &iov, 1, len); + + /* map the communication buffer */ + phy_addr = (phys_addr_t)(info.addr_lo | (info.addr_hi << 32)); + buf_ptr = memremap(phy_addr, info.buf_size, MEMREMAP_WB); + if (!buf_ptr) + return -ENOMEM; + if (!copy_from_iter_full(buf_ptr, len, &iter)) { + pr_err("error! could not read capsule file\n"); + ret = -EINVAL; + goto unmap; + } + + /* Check if the capsule header has a valid version number. */ + ret = query_capability(&cap); + if (ret) + goto unmap; + + if (cap.status != DSM_SUCCEED) { + ret = -EBUSY; + goto unmap; + } + if (!valid_version(buf_ptr, &cap)) { + ret = -EINVAL; + goto unmap; + } + ret = 0; +unmap: + memunmap(buf_ptr); + + return ret ?: len; +} + +static ssize_t pfru_read(struct file *filp, char __user *ubuf, + size_t size, loff_t *off) +{ + struct update_cap_info cap; + int ret; + + ret = query_capability(&cap); + if (ret) + return ret; + + size = min_t(size_t, size, sizeof(cap)); + + if (copy_to_user(ubuf, &cap, size)) + return -EFAULT; + + return size; +} + +static const struct file_operations acpi_pfru_fops = { + .owner = THIS_MODULE, + .write = pfru_write, + .read = pfru_read, + .open = pfru_open, + .unlocked_ioctl = pfru_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_pfru_ioctl, +#endif + .llseek = noop_llseek, +}; + +static struct miscdevice pfru_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "pfru_update", + .nodename = "pfru/update", + .fops = &acpi_pfru_fops, +}; + +static int acpi_pfru_remove(struct platform_device *pdev) +{ + misc_deregister(&pfru_misc_dev); + + return 0; +} + +static int acpi_pfru_probe(struct platform_device *pdev) +{ + acpi_handle handle; + struct pfru_device *pf_dev; + int ret; + + pf_dev = get_pfru_dev(); + + ret = guid_parse(PFRU_UUID, &pf_dev->uuid); + if (ret) + return ret; + /* default rev id is 1 */ + pf_dev->rev_id = 1; + pf_dev->dev = &pdev->dev; + handle = ACPI_HANDLE(pf_dev->dev); + if (!acpi_has_method(handle, "_DSM")) { + pr_err("Missing _DSM\n"); + return -ENODEV; + } + + ret = misc_register(&pfru_misc_dev); + if (ret) + return ret; + + return 0; +} + +static const struct acpi_device_id acpi_pfru_ids[] = { + {"INTC1080", 0}, + {} +}; +MODULE_DEVICE_TABLE(acpi, acpi_pfru_ids); + +static struct platform_driver acpi_pfru_driver = { + .driver = { + .name = "pfru_update", + .acpi_match_table = acpi_pfru_ids, + }, + .probe = acpi_pfru_probe, + .remove = acpi_pfru_remove, +}; +module_platform_driver(acpi_pfru_driver); + +MODULE_DESCRIPTION("Platform Firmware Runtime Update device driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c new file mode 100644 index 0000000000000000000000000000000000000000..3b3022b94ae389c2c5fd98b436009669b9936205 --- /dev/null +++ b/drivers/acpi/prmt.c @@ -0,0 +1,818 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Author: Erik Kaneda + * Copyright 2020 Intel Corporation + * + * prmt.c + * + * Each PRM service is an executable that is run in a restricted environment + * that is invoked by writing to the PlatformRtMechanism OperationRegion from + * AML bytecode. 
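+ * The handlers themselves are described by the ACPI PRMT table and are
+ * invoked through the EFI runtime services virtual mapping.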
+ * + * init_prmt initializes the Platform Runtime Mechanism (PRM) services by + * processing data in the PRMT as well as registering an ACPI OperationRegion + * handler for the PlatformRtMechanism subtype. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma pack(1) +struct prm_mmio_addr_range { + u64 phys_addr; + u64 virt_addr; + u32 length; +}; + +struct prm_mmio_info { + u64 mmio_count; + struct prm_mmio_addr_range addr_ranges[]; +}; + +struct prm_buffer { + u8 prm_status; + u64 efi_status; + u8 prm_cmd; + guid_t handler_guid; +}; + +struct prm_context_buffer { + char signature[ACPI_NAMESEG_SIZE]; + u16 revision; + u16 reserved; + guid_t identifier; + u64 static_data_buffer; + struct prm_mmio_info *mmio_ranges; +}; + +struct prm_handler_export_descriptor { + guid_t handler_guid; + char handler_name[128]; +}; + +struct prm_module_export_descriptor { + char signature[8]; + u16 revision; + u16 handler_count; + guid_t platform_guid; + guid_t identifier; + struct prm_handler_export_descriptor handlers[]; +}; + +struct prm_image_section { + u64 base; + u64 size; + u64 attr; +}; + +struct prm_handler_param_buffer { + u32 phase; + u32 reserved; + u64 raw_base; + u64 raw_size; + u64 loaded_base; + u64 loaded_size; + u64 seclist_base; + u64 seclist_size; + guid_t module_guid; + u64 module_info_base; + u64 module_info_size; +}; +#pragma pack() + +LIST_HEAD(prm_module_list); + +struct prm_handler_info { + guid_t guid; + u16 rev; + u64 handler_addr; + u64 static_data_buffer_addr; + u64 acpi_param_buffer_addr; + + struct list_head handler_list; +}; + +struct prm_module_info { + guid_t guid; + u16 major_rev; + u16 minor_rev; + u16 handler_count; + struct prm_mmio_info *mmio_info; + bool updatable; + + struct list_head module_list; + struct prm_handler_info handlers[]; +}; + +static DEFINE_MUTEX(prm_mutex); +/* fake device for request_firmware */ +static struct platform_device *prm_pdev; + +enum prm_state { + PRM_STS_OK = 0, + PRM_STS_NFOUND, + PRM_STS_ERR, + PRM_STS_MAX, +}; + +enum prm_phase { + PRM_PHASE_GET_SIZE = 0, + PRM_PHASE_LOAD_IMAGE, + PRM_PHASE_GET_MODINFO, + PRM_PHASE_MAX, +}; + +static u64 efi_pa_va_lookup(u64 pa) +{ + efi_memory_desc_t *md; + u64 pa_offset = pa & ~PAGE_MASK; + u64 page = pa & PAGE_MASK; + + for_each_efi_memory_desc(md) { + if (md->phys_addr < pa && pa < md->phys_addr + PAGE_SIZE * md->num_pages) + return pa_offset + md->virt_addr + page - md->phys_addr; + } + + return 0; +} + + +#define get_first_handler(a) ((struct acpi_prmt_handler_info *) ((char *) (a) + a->handler_info_offset)) +#define get_next_handler(a) ((struct acpi_prmt_handler_info *) (sizeof(struct acpi_prmt_handler_info) + (char *) a)) + +static int __init +acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_prmt_module_info *module_info; + struct acpi_prmt_handler_info *handler_info; + struct prm_handler_info *th; + struct prm_module_info *tm; + u64 *mmio_count; + u64 cur_handler = 0; + u32 module_info_size = 0; + u64 mmio_range_size = 0; + void *temp_mmio; + + module_info = (struct acpi_prmt_module_info *) header; + module_info_size = struct_size(tm, handlers, module_info->handler_info_count); + tm = kmalloc(module_info_size, GFP_KERNEL); + if (!tm) + goto parse_prmt_out1; + + guid_copy(&tm->guid, (guid_t *) module_info->module_guid); + tm->major_rev = module_info->major_rev; + tm->minor_rev = module_info->minor_rev; + tm->handler_count = module_info->handler_info_count; + tm->updatable = true; + + 
if (module_info->mmio_list_pointer) { + /* + * Each module is associated with a list of addr + * ranges that it can use during the service + */ + mmio_count = (u64 *) memremap(module_info->mmio_list_pointer, 8, MEMREMAP_WB); + if (!mmio_count) + goto parse_prmt_out2; + + mmio_range_size = struct_size(tm->mmio_info, addr_ranges, *mmio_count); + tm->mmio_info = kmalloc(mmio_range_size, GFP_KERNEL); + if (!tm->mmio_info) + goto parse_prmt_out3; + + temp_mmio = memremap(module_info->mmio_list_pointer, mmio_range_size, MEMREMAP_WB); + if (!temp_mmio) + goto parse_prmt_out4; + memmove(tm->mmio_info, temp_mmio, mmio_range_size); + } else { + tm->mmio_info = kmalloc(sizeof(*tm->mmio_info), GFP_KERNEL); + if (!tm->mmio_info) + goto parse_prmt_out2; + + tm->mmio_info->mmio_count = 0; + } + + INIT_LIST_HEAD(&tm->module_list); + list_add(&tm->module_list, &prm_module_list); + + handler_info = get_first_handler(module_info); + do { + th = &tm->handlers[cur_handler]; + + guid_copy(&th->guid, (guid_t *)handler_info->handler_guid); + th->rev = handler_info->revision; + th->handler_addr = efi_pa_va_lookup(handler_info->handler_address); + th->static_data_buffer_addr = efi_pa_va_lookup(handler_info->static_data_buffer_address); + th->acpi_param_buffer_addr = efi_pa_va_lookup(handler_info->acpi_param_buffer_address); + } while (++cur_handler < tm->handler_count && (handler_info = get_next_handler(handler_info))); + + return 0; + +parse_prmt_out4: + kfree(tm->mmio_info); +parse_prmt_out3: + memunmap(mmio_count); +parse_prmt_out2: + kfree(tm); +parse_prmt_out1: + return -ENOMEM; +} + +#define GET_MODULE 0 +#define GET_HANDLER 1 + +static void *find_guid_info(const guid_t *guid, u8 mode) +{ + struct prm_handler_info *cur_handler; + struct prm_module_info *cur_module; + int i = 0; + + list_for_each_entry(cur_module, &prm_module_list, module_list) { + /* + * Module GUID match + */ + if (mode == GET_MODULE && guid_equal(guid, &cur_module->guid)) + return (void *)cur_module; + /* + * Handler GUID match + */ + for (i = 0; i < cur_module->handler_count; ++i) { + cur_handler = &cur_module->handlers[i]; + if (guid_equal(guid, &cur_handler->guid)) { + if (mode == GET_MODULE) + return (void *)cur_module; + else + return (void *)cur_handler; + } + } + } + + return NULL; +} + + +static struct prm_module_info *find_prm_module(const guid_t *guid) +{ + return (struct prm_module_info *)find_guid_info(guid, GET_MODULE); +} + +static struct prm_handler_info *find_prm_handler(const guid_t *guid) +{ + return (struct prm_handler_info *) find_guid_info(guid, GET_HANDLER); +} + +/* In-coming PRM commands */ + +#define PRM_CMD_RUN_SERVICE 0 +#define PRM_CMD_START_TRANSACTION 1 +#define PRM_CMD_END_TRANSACTION 2 + +/* statuses that can be passed back to ASL */ + +#define PRM_HANDLER_SUCCESS 0 +#define PRM_HANDLER_ERROR 1 +#define INVALID_PRM_COMMAND 2 +#define PRM_HANDLER_GUID_NOT_FOUND 3 +#define UPDATE_LOCK_ALREADY_HELD 4 +#define UPDATE_UNLOCK_WITHOUT_LOCK 5 + +/* + * This is the PlatformRtMechanism opregion space handler. + * @function: indicates the read/write. In fact as the PlatformRtMechanism + * message is driven by command, only write is meaningful. + * + * @addr : not used + * @bits : not used. + * @value : it is an in/out parameter. It points to the PRM message buffer. 
+ * @handler_context: not used + */ +static acpi_status acpi_platformrt_space_handler(u32 function, + acpi_physical_address addr, + u32 bits, acpi_integer *value, + void *handler_context, + void *region_context) +{ + struct prm_buffer *buffer = ACPI_CAST_PTR(struct prm_buffer, value); + struct prm_handler_info *handler; + struct prm_module_info *module; + efi_status_t status; + struct prm_context_buffer context; + + /* + * The returned acpi_status will always be AE_OK. Error values will be + * saved in the first byte of the PRM message buffer to be used by ASL. + */ + switch (buffer->prm_cmd) { + case PRM_CMD_RUN_SERVICE: + + mutex_lock(&prm_mutex); + handler = find_prm_handler(&buffer->handler_guid); + module = find_prm_module(&buffer->handler_guid); + if (!handler || !module) { + mutex_unlock(&prm_mutex); + goto invalid_guid; + } + + ACPI_COPY_NAMESEG(context.signature, "PRMC"); + context.revision = 0x0; + context.reserved = 0x0; + context.identifier = handler->guid; + context.static_data_buffer = handler->static_data_buffer_addr; + context.mmio_ranges = module->mmio_info; + + status = efi_call_virt_pointer(handler, handler_addr, + handler->acpi_param_buffer_addr, + &context); + mutex_unlock(&prm_mutex); + if (status == EFI_SUCCESS) { + buffer->prm_status = PRM_HANDLER_SUCCESS; + } else { + buffer->prm_status = PRM_HANDLER_ERROR; + buffer->efi_status = status; + } + break; + + case PRM_CMD_START_TRANSACTION: + + module = find_prm_module(&buffer->handler_guid); + if (!module) + goto invalid_guid; + + if (module->updatable) + module->updatable = false; + else + buffer->prm_status = UPDATE_LOCK_ALREADY_HELD; + break; + + case PRM_CMD_END_TRANSACTION: + + module = find_prm_module(&buffer->handler_guid); + if (!module) + goto invalid_guid; + + if (module->updatable) + buffer->prm_status = UPDATE_UNLOCK_WITHOUT_LOCK; + else + module->updatable = true; + break; + + default: + + buffer->prm_status = INVALID_PRM_COMMAND; + break; + } + + return AE_OK; + +invalid_guid: + buffer->prm_status = PRM_HANDLER_GUID_NOT_FOUND; + return AE_OK; +} + +static int prm_dump_raw_image(const u8 *data, int size) +{ + struct prm_module_export_descriptor *med; + struct prm_handler_export_descriptor *hed; + unsigned char *signature = "PRM_MEDT"; + int i, med_offset, sig_size = 8; + + /* + * Scan "PRM_MEDT" string for module export descriptor structure. + */ + for (i = 0; i <= size - sig_size; i++) { + if (memcmp(data + i, signature, sig_size) == 0) { + med_offset = i; + pr_info("PRM: found module export descriptor structure, offset = 0x%x\n", i); + break; + } + } + + if (i > size - sig_size) { + pr_err("PRM: no module export descriptor structure found\n"); + return PRM_STS_NFOUND; + } + + med = (struct prm_module_export_descriptor *)(data + med_offset); + pr_info("Platform guid : %pUl\n", &med->platform_guid); + pr_info("Module signature : %s\n", med->signature); + pr_info("Module revision : %d\n", med->revision); + pr_info("Module identifier: %pUl\n", &med->identifier); + pr_info("Handler count : %d\n", med->handler_count); + + hed = med->handlers; + for(i = 0; i < med->handler_count; i++) { + pr_info(" handler guid : %pUl\n", &hed->handler_guid); + pr_info(" handler name : %s\n", hed->handler_name); + hed++; + } + + return PRM_STS_OK; +} + +/* + * PE/COFF image section attribute definition. 
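+ * These mirror the PE/COFF section Characteristics flags IMAGE_SCN_CNT_CODE,
+ * IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ and IMAGE_SCN_MEM_WRITE.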
+ */ +#define PRM_IMAGE_SEC_CODE BIT(5) +#define PRM_IMAGE_SEC_MEM_X BIT(29) +#define PRM_IMAGE_SEC_MEM_R BIT(30) +#define PRM_IMAGE_SEC_MEM_W BIT(31) +/* + * BIOS-OS communication phase 1: get loaded image size and section list size + * - input: + * raw image base + * raw image size + * - output: + * loaded image size + * section list size + */ +static int efi_get_loaded_size(struct prm_handler_info *handler, + struct prm_handler_param_buffer *param_buffer) +{ + efi_status_t status; + + param_buffer->phase = PRM_PHASE_GET_SIZE; + /* + * Return loaded image size + */ + status = efi_call_virt_pointer(handler, handler_addr, param_buffer); + if (status != EFI_SUCCESS) { + pr_err("PRM: failed to query loaded image size or module info size, 0x%lx\n", status); + return PRM_STS_ERR; + } + pr_info("PRM: phase:%d, raw_base:0x%llx, raw_size:0x%llx, loaded_size:0x%llx, seclist_size:0x%llx\n", + param_buffer->phase, param_buffer->raw_base, param_buffer->raw_size, + param_buffer->loaded_size, param_buffer->seclist_size); + + return PRM_STS_OK; +} + +/* + * BIOS-OS communication phase 2: parse loaded image + * - input: + * loaded image base + * loaded image size + * section list base + * section list size + * - output: + * section list data structure + * module GUID + * module info size + */ +static int efi_parse_loaded_image(struct prm_handler_info *handler, + struct prm_handler_param_buffer *param_buffer) +{ + struct prm_module_info *module_info; + efi_status_t status; + + param_buffer->phase = PRM_PHASE_LOAD_IMAGE; + param_buffer->loaded_base = (u64)module_alloc(param_buffer->loaded_size); + if (!param_buffer->loaded_base) + return PRM_STS_ERR; + + param_buffer->seclist_base = (u64)kmalloc(param_buffer->seclist_size, GFP_KERNEL); + if (!param_buffer->seclist_base) + goto phase2_out1; + /* + * Return module GUID and module info size + */ + status = efi_call_virt_pointer(handler, handler_addr, param_buffer); + if (status != EFI_SUCCESS) { + pr_err("PRM: failed to parse loaded image\n"); + goto phase2_out2; + } + module_info = find_prm_module(¶m_buffer->module_guid); + if (!module_info) { + pr_err("PRM: no existing module found, update module GUID:%pUl\n", + ¶m_buffer->module_guid); + goto phase2_out2; + } + pr_info("PRM: phase:%d, module_GUID:%pUl, module_info_size:0x%llx\n", + param_buffer->phase, ¶m_buffer->module_guid, param_buffer->module_info_size); + + return PRM_STS_OK; + +phase2_out2: + kfree((void *)param_buffer->seclist_base); +phase2_out1: + module_memfree((void *)param_buffer->loaded_base); + return PRM_STS_ERR; +} + +/* + * BIOS-OS communication phase 3: collect module info + * - input: + * module info base + * module info size + * - output: + * module info data structure + */ +static int efi_collect_module_info(struct prm_handler_info *handler, + struct prm_handler_param_buffer *param_buffer) +{ + struct acpi_prmt_module_info *acpi_module_info; + struct acpi_prmt_handler_info *acpi_handler_info; + efi_status_t status; + int i; + + param_buffer->phase = PRM_PHASE_GET_MODINFO; + param_buffer->module_info_base = (u64)kmalloc(param_buffer->module_info_size, GFP_KERNEL); + if (!param_buffer->module_info_base) + return PRM_STS_ERR; + /* + * Return code section data structure + */ + status = efi_call_virt_pointer(handler, handler_addr, param_buffer); + if (status != EFI_SUCCESS) { + pr_err("PRM: failed to generate module info data structure, 0x%lx\n", status); + kfree((void *)param_buffer->module_info_base); + return PRM_STS_ERR; + } + pr_info("PRM: phase:%d\n", param_buffer->phase); + 
acpi_module_info = (struct acpi_prmt_module_info *)param_buffer->module_info_base; + pr_info("Module GUID : %pUl\n", acpi_module_info->module_guid); + pr_info("Major Rev : %d\n", acpi_module_info->major_rev); + pr_info("Minor Rev : %d\n", acpi_module_info->minor_rev); + pr_info("Handler Count: %d\n", acpi_module_info->handler_info_count); + + acpi_handler_info = get_first_handler(acpi_module_info); + for (i = 0; i < acpi_module_info->handler_info_count; i++) { + pr_info("Handler GUID: %pUl\n", acpi_handler_info->handler_guid); + pr_info("Handler addr: 0x%llx\n", acpi_handler_info->handler_address); + acpi_handler_info++; + } + return PRM_STS_OK; +} + +static int prm_compare_version(struct prm_module_info *old_mod, + struct prm_module_info *new_mod) +{ + /* + * PRM updates are always applied in monotonically increasing fashion. + */ + if (new_mod->major_rev > old_mod->major_rev) + return PRM_STS_OK; + else if (new_mod->major_rev == old_mod->major_rev) { + if (new_mod->minor_rev > old_mod->minor_rev) + return PRM_STS_OK; + } + return PRM_STS_ERR; +} + +static int prm_update_module(struct prm_handler_param_buffer *param_buffer) +{ + struct acpi_prmt_module_info *acpi_mod; + struct acpi_prmt_handler_info *acpi_handler; + struct prm_module_info *old_mod, *new_mod; + struct prm_handler_info *old_handler, *new_handler; + int mod_size, i; + + acpi_mod = (struct acpi_prmt_module_info *)param_buffer->module_info_base; + old_mod = find_prm_module((guid_t *)acpi_mod->module_guid); + /* + * Don't update PRM module if it's already in transaction. + */ + if (!old_mod->updatable) + return PRM_STS_ERR; + + mod_size = struct_size(new_mod, handlers, acpi_mod->handler_info_count); + new_mod = (struct prm_module_info *)kmalloc(mod_size, GFP_KERNEL); + if (!new_mod) + return PRM_STS_ERR; + + guid_copy(&new_mod->guid, (guid_t *)acpi_mod->module_guid); + new_mod->major_rev = acpi_mod->major_rev; + new_mod->minor_rev = acpi_mod->minor_rev; + new_mod->handler_count = acpi_mod->handler_info_count; + new_mod->updatable = true; + + /* + * Don't update PRM module if its version number is not + * greater than the current PRM module. 
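+ * Updates with an equal revision are rejected as well; the replacement
+ * must be strictly newer.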
+ */ + if (prm_compare_version(old_mod, new_mod) != PRM_STS_OK) { + pr_err("PRM: version number not meet the requirement\n"); + kfree(new_mod); + return PRM_STS_ERR; + } + /* + * Inherit mmio resource from existing module + */ + new_mod->mmio_info = old_mod->mmio_info; + + acpi_handler = get_first_handler(acpi_mod); + + for (i = 0; i < acpi_mod->handler_info_count; i++) { + old_handler = &old_mod->handlers[i]; + new_handler = &new_mod->handlers[i]; + + guid_copy(&new_handler->guid, (guid_t *)acpi_handler->handler_guid); + new_handler->rev = acpi_handler->revision; + new_handler->handler_addr = acpi_handler->handler_address; + /* + * Inherit static data buffer and acpi parameter buffer from existing handler + */ + new_handler->static_data_buffer_addr = old_handler->static_data_buffer_addr; + new_handler->acpi_param_buffer_addr = old_handler->acpi_param_buffer_addr; + acpi_handler = get_next_handler(acpi_handler); + pr_info("PRM: count:%d,old addr:0x%llx, new addr:0x%llx\n",i, old_handler->handler_addr, new_handler->handler_addr); + } + + INIT_LIST_HEAD(&new_mod->module_list); + list_replace(&old_mod->module_list, &new_mod->module_list); + kfree(old_mod); + return PRM_STS_OK; +} + +#define PRM_UPDATE_HANDLER_GUID EFI_GUID(0xa13cdd48, 0xc822, 0x4913, 0xb0, 0xcf, 0x3c, 0xb4, 0x68, 0x23, 0xd3, 0x76) + +static int prm_parse_image(const u8 *data, int size) +{ + struct prm_handler_info *update_handler; + struct prm_handler_param_buffer *param_buffer; + struct prm_image_section *sec; + efi_guid_t update_handler_guid = PRM_UPDATE_HANDLER_GUID; + size_t param_size; + int num_pages, num_secs, ret, i; + + update_handler = find_prm_handler(&update_handler_guid); + if (!update_handler) { + pr_err("PRM: failed to find self update handler\n"); + return PRM_STS_NFOUND; + } + pr_info("PRM: found self update handler\n"); + + param_size = sizeof(struct prm_handler_param_buffer); + param_buffer = (struct prm_handler_param_buffer *)kmalloc(param_size, GFP_KERNEL); + if (!param_buffer) + return PRM_STS_ERR; + + memset(param_buffer, 0, param_size); + + param_buffer->raw_base = (u64)data; + param_buffer->raw_size = size; + /* + * BIOS-OS communication phase 1: get loaded image size + */ + ret = efi_get_loaded_size(update_handler, param_buffer); + if (ret != PRM_STS_OK) + goto parse_image_out; + /* + * BIOS-OS communication phase 2: parse loaded image + */ + ret = efi_parse_loaded_image(update_handler, param_buffer); + if (ret != PRM_STS_OK) + goto parse_image_out; + /* + * BIOS-OS communication phase 3: get module info + */ + ret = efi_collect_module_info(update_handler, param_buffer); + if (ret != PRM_STS_OK) + goto parse_image_out; + /* + * PRM image load and parse completed, fix page table for code sections. + * First make the page read-only, and only then make it executable to + * prevent it from being W+X in between. 
+ */ + num_secs = param_buffer->seclist_size / sizeof(struct prm_image_section); + sec = (struct prm_image_section *)param_buffer->seclist_base; + for (i = 0; i < num_secs; i++) { + pr_info("PRM: sec_base:0x%llx, sec_size:0x%llx, sec_attr:0x%llx\n", + sec->base, sec->size, sec->attr); + if (sec->attr & PRM_IMAGE_SEC_CODE) { + num_pages = DIV_ROUND_UP(sec->size, PAGE_SIZE); + set_memory_ro((unsigned long)sec->base, num_pages); + set_memory_x((unsigned long)sec->base, num_pages); + } + sec++; + } + /* + * Update system PRM module list + */ + ret = prm_update_module(param_buffer); + return ret; + +parse_image_out: + kfree(param_buffer); + return PRM_STS_ERR; +} + +static int prm_load_image(struct device *dev) +{ + const struct firmware *firmware; + char name[16]; + int ret; + + sprintf(name, "prm.efi"); + + if (request_firmware_direct(&firmware, name, dev)) { + pr_err("PRM: image %s load failed\n", name); + return PRM_STS_NFOUND; + } + + mutex_lock(&prm_mutex); + ret = prm_dump_raw_image(firmware->data, firmware->size); + if (ret != PRM_STS_OK) { + pr_err("PRM: dump raw image error\n"); + goto load_out; + } + + ret = prm_parse_image(firmware->data, firmware->size); +load_out: + mutex_unlock(&prm_mutex); + release_firmware(firmware); + return ret; +} + +static ssize_t prm_update_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct prm_handler_info *cur_handler; + struct prm_module_info *cur_module; + char *s = buf; + int i = 0; + + mutex_lock(&prm_mutex); + list_for_each_entry(cur_module, &prm_module_list, module_list) { + s += sprintf(s, "Module GUID : %pUl\n", &cur_module->guid); + s += sprintf(s, "Major Rev : %d\n", cur_module->major_rev); + s += sprintf(s, "Minor Rev : %d\n", cur_module->minor_rev); + s += sprintf(s, "Handler Count: %d\n", cur_module->handler_count); + for (i = 0; i < cur_module->handler_count; ++i) { + cur_handler = &cur_module->handlers[i]; + s += sprintf(s, " Handler GUID: %pUl\n", &cur_handler->guid); + s += sprintf(s, " Rev: %d\n", cur_handler->rev); + } + s += sprintf(s, "\n"); + } + mutex_unlock(&prm_mutex); + + return (s - buf); +} + +static ssize_t prm_update_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + unsigned long val; + ssize_t ret = 0; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + if (val != 1) + return size; + + pr_info("PRM: runtime update\n"); + ret = prm_load_image(&prm_pdev->dev); + if (ret == PRM_STS_OK) + ret = size; + + return ret; +} + +static const struct kobj_attribute prm_update_attr = +__ATTR(prm_update, 0644, prm_update_show, prm_update_store); + +void __init init_prmt(void) +{ + struct acpi_table_header *tbl; + acpi_status status; + int mc; + + status = acpi_get_table(ACPI_SIG_PRMT, 0, &tbl); + if (ACPI_FAILURE(status)) + return; + + mc = acpi_table_parse_entries(ACPI_SIG_PRMT, sizeof(struct acpi_table_prmt) + + sizeof (struct acpi_table_prmt_header), + 0, acpi_parse_prmt, 0); + acpi_put_table(tbl); + /* + * Return immediately if PRMT table is not present or no PRM module found. 
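+ * mc is the number of PRMT module entries parsed, or a negative errno
+ * when parsing failed.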
+ */ + if (mc <= 0) + return; + + pr_info("PRM: found %u modules\n", mc); + + prm_pdev = platform_device_register_simple("prm", -1, NULL, 0); + if (IS_ERR(prm_pdev)) + return; + + status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT, + ACPI_ADR_SPACE_PLATFORM_RT, + &acpi_platformrt_space_handler, + NULL, NULL); + if (ACPI_FAILURE(status)) + pr_alert("PRM: OperationRegion handler could not be installed\n"); + + if (sysfs_create_file(acpi_kobj, &prm_update_attr.attr)) + pr_err("PRM: failed to create prm sysfs entry\n"); +} diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index a1b2f3ec49e7957094e3e39add0e011ed9f1da8d..dcc289e30166cf78d2c33932dcbe304ee08bc5ff 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -502,6 +502,19 @@ static int acpi_idle_bm_check(void) return bm_status; } +static void wait_for_freeze(void) +{ +#ifdef CONFIG_X86 + /* No delay is needed if we are in guest */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; +#endif + /* Dummy wait op - must do something useless after P_LVL2 read + because chipsets cannot guarantee that STPCLK# signal + gets asserted in time to freeze execution properly. */ + inl(acpi_gbl_FADT.xpm_timer_block.address); +} + /** * acpi_idle_do_entry - enter idle state using the appropriate method * @cx: cstate data @@ -518,10 +531,7 @@ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) } else { /* IO port based C-state */ inb(cx->address); - /* Dummy wait op - must do something useless after P_LVL2 read - because chipsets cannot guarantee that STPCLK# signal - gets asserted in time to freeze execution properly. */ - inl(acpi_gbl_FADT.xpm_timer_block.address); + wait_for_freeze(); } } @@ -542,8 +552,7 @@ static int acpi_idle_play_dead(struct cpuidle_device *dev, int index) safe_halt(); else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { inb(cx->address); - /* See comment in acpi_idle_do_entry() */ - inl(acpi_gbl_FADT.xpm_timer_block.address); + wait_for_freeze(); } else return -ENODEV; } diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index a08e3eb2a6f9f133451ecee2fdae15058f19e402..1a8c5716ca1860df69284a3fd53b91de93703e05 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1339,6 +1339,52 @@ acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, args_count, args); } +static const char *acpi_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + const struct acpi_device *adev; + struct fwnode_handle *parent; + + /* Is this the root node? */ + parent = fwnode_get_parent(fwnode); + if (!parent) + return "\\"; + + fwnode_handle_put(parent); + + if (is_acpi_data_node(fwnode)) { + const struct acpi_data_node *dn = to_acpi_data_node(fwnode); + + return dn->name; + } + + adev = to_acpi_device_node(fwnode); + if (WARN_ON(!adev)) + return NULL; + + return acpi_device_bid(adev); +} + +static const char * +acpi_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent; + + /* Is this the root node? */ + parent = fwnode_get_parent(fwnode); + if (!parent) + return ""; + + /* Is this 2nd node from the root? */ + parent = fwnode_get_next_parent(parent); + if (!parent) + return ""; + + fwnode_handle_put(parent); + + /* ACPI device or data node. 
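+ * Either way "." separates the node name from its parent in a printed path.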
*/ + return "."; +} + static struct fwnode_handle * acpi_fwnode_get_parent(struct fwnode_handle *fwnode) { @@ -1379,6 +1425,8 @@ acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, .get_parent = acpi_node_get_parent, \ .get_next_child_node = acpi_get_next_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ + .get_name = acpi_fwnode_get_name, \ + .get_name_prefix = acpi_fwnode_get_name_prefix, \ .get_reference_args = acpi_fwnode_get_reference_args, \ .graph_get_next_endpoint = \ acpi_graph_get_next_endpoint, \ diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index b2cafa37df7c058e813068cb42f29aa0a3d04779..1abb5f92f12750c109cb87a8c513bda4234a44e9 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -39,6 +39,7 @@ static int acpi_apic_instance __initdata; enum acpi_subtable_type { ACPI_SUBTABLE_COMMON, ACPI_SUBTABLE_HMAT, + ACPI_SUBTABLE_PRMT, }; struct acpi_subtable_entry { @@ -222,6 +223,8 @@ acpi_get_entry_type(struct acpi_subtable_entry *entry) return entry->hdr->common.type; case ACPI_SUBTABLE_HMAT: return entry->hdr->hmat.type; + case ACPI_SUBTABLE_PRMT: + return 0; } return 0; } @@ -234,6 +237,8 @@ acpi_get_entry_length(struct acpi_subtable_entry *entry) return entry->hdr->common.length; case ACPI_SUBTABLE_HMAT: return entry->hdr->hmat.length; + case ACPI_SUBTABLE_PRMT: + return entry->hdr->prmt.length; } return 0; } @@ -246,6 +251,8 @@ acpi_get_subtable_header_length(struct acpi_subtable_entry *entry) return sizeof(entry->hdr->common); case ACPI_SUBTABLE_HMAT: return sizeof(entry->hdr->hmat); + case ACPI_SUBTABLE_PRMT: + return sizeof(entry->hdr->prmt); } return 0; } @@ -255,6 +262,8 @@ acpi_get_subtable_type(char *id) { if (strncmp(id, ACPI_SIG_HMAT, 4) == 0) return ACPI_SUBTABLE_HMAT; + if (strncmp(id, ACPI_SIG_PRMT, 4) == 0) + return ACPI_SUBTABLE_PRMT; return ACPI_SUBTABLE_COMMON; } diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 8beb418ce167bae9e041c804ad4ff2f6655e03cc..6b657cbec5cdbbd504b8b44feba2083cf2580b03 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -900,7 +900,7 @@ static int ahci_configure_dma_masks(struct pci_dev *pdev, int using_dac) * value, don't extend it here. This happens on STA2X11, for example. * * XXX: manipulating the DMA mask from platform code is completely - * bogus, platform code should use dev->bus_dma_mask instead.. + * bogus, platform code should use dev->bus_dma_limit instead.. */ if (pdev->dma_mask && pdev->dma_mask < DMA_BIT_MASK(32)) return 0; diff --git a/drivers/ata/pata_atp867x.c b/drivers/ata/pata_atp867x.c index cfd0cf2cbca6caaaab2fd2b2d424375bb09a95a2..e01a3a6e4d462a3ed1089ad6ff52cea21252d5ab 100644 --- a/drivers/ata/pata_atp867x.c +++ b/drivers/ata/pata_atp867x.c @@ -422,7 +422,7 @@ static int atp867x_ata_pci_sff_init_host(struct ata_host *host) #ifdef ATP867X_DEBUG atp867x_check_res(pdev); - for (i = 0; i < PCI_ROM_RESOURCE; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) printk(KERN_DEBUG "ATP867X: iomap[%d]=0x%llx\n", i, (unsigned long long)(host->iomap[i])); #endif diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 0514aa7e80e393508037911f884e4de1622e8e49..20190f66ced987c10badfad6012bdfdbe678345b 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -2329,7 +2329,7 @@ static int nv_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) // Make sure this is a SATA controller by counting the number of bars // (NVIDIA SATA controllers will always have six bars). Otherwise, // it's an IDE controller and we ignore it. 
- for (bar = 0; bar < 6; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) if (pci_resource_start(pdev, bar) == 0) return -ENODEV; diff --git a/drivers/base/base.h b/drivers/base/base.h index 0d32544b6f91668c1d3493c875498b4f8e609cb5..6ead2acf1ce9f422f357086779bca4d97fa906c3 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -121,7 +121,6 @@ extern void device_release_driver_internal(struct device *dev, struct device *parent); extern void driver_detach(struct device_driver *drv); -extern int driver_probe_device(struct device_driver *drv, struct device *dev); extern void driver_deferred_probe_del(struct device *dev); static inline int driver_match_device(struct device_driver *drv, struct device *dev) @@ -134,7 +133,6 @@ extern int driver_add_groups(struct device_driver *drv, const struct attribute_group **groups); extern void driver_remove_groups(struct device_driver *drv, const struct attribute_group **groups); -int device_driver_attach(struct device_driver *drv, struct device *dev); void device_driver_detach(struct device *dev); extern char *make_class_name(const char *name, struct kobject *kobj); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index a1d1e82563244c4fc9e512222036651d21130fee..7974bd7c6c13b350c7d4e2f7fead1feb7621eb0c 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -209,15 +209,11 @@ static ssize_t bind_store(struct device_driver *drv, const char *buf, int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); - if (dev && dev->driver == NULL && driver_match_device(drv, dev)) { + if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); - - if (err > 0) { + if (!err) { /* success */ err = count; - } else if (err == 0) { - /* driver didn't accept device */ - err = -ENODEV; } } put_device(dev); diff --git a/drivers/base/core.c b/drivers/base/core.c index 6fea8096edf12c887d179df85e36e3b18b8c02dd..cf98e505efb71840a2b3ad271e02f18d8c14b88e 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1722,7 +1722,11 @@ void device_initialize(struct device *dev) device_pm_init(dev); set_dev_node(dev, -1); #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spin_lock_init(&dev->msi_lock); INIT_LIST_HEAD(&dev->msi_list); + INIT_LIST_HEAD(&dev->dev_msi_list); + dev->msi_last_list = &dev->msi_list; + dev->dev_msi_last_list = &dev->dev_msi_list; #endif INIT_LIST_HEAD(&dev->links.consumers); INIT_LIST_HEAD(&dev->links.suppliers); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index cf7e5b4afc1be41f8ad9c89a1b7ba311cdaaeedd..f98d5411f9b277d54169d45850264d3d1a205f16 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -464,6 +464,8 @@ static void driver_sysfs_remove(struct device *dev) * from a driver's probe() method.) * * This function must be called with the device lock held. + * + * Callers should prefer to use device_driver_attach() instead. */ int device_bind_driver(struct device *dev) { @@ -482,21 +484,42 @@ EXPORT_SYMBOL_GPL(device_bind_driver); static atomic_t probe_count = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue); -static void driver_deferred_probe_add_trigger(struct device *dev, - int local_trigger_count) +static int call_driver_probe(struct device *dev, struct device_driver *drv) { - driver_deferred_probe_add(dev); - /* Did a trigger occur while probing? 
Need to re-trigger if yes */ - if (local_trigger_count != atomic_read(&deferred_trigger_count)) - driver_deferred_probe_trigger(); + int ret = 0; + + if (dev->bus->probe) + ret = dev->bus->probe(dev); + else if (drv->probe) + ret = drv->probe(dev); + + switch (ret) { + case 0: + break; + case -EPROBE_DEFER: + /* Driver requested deferred probing */ + dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); + break; + case -ENODEV: + case -ENXIO: + pr_debug("%s: probe of %s rejects match %d\n", + drv->name, dev_name(dev), ret); + break; + default: + /* driver matched but the probe failed */ + pr_warn("%s: probe of %s failed with error %d\n", + drv->name, dev_name(dev), ret); + break; + } + + return ret; } static int really_probe(struct device *dev, struct device_driver *drv) { - int ret = -EPROBE_DEFER; - int local_trigger_count = atomic_read(&deferred_trigger_count); bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) && !drv->suppress_bind_attrs; + int ret; if (defer_all_probes) { /* @@ -505,17 +528,13 @@ static int really_probe(struct device *dev, struct device_driver *drv) * wait_for_device_probe() right after that to avoid any races. */ dev_dbg(dev, "Driver %s force probe deferral\n", drv->name); - driver_deferred_probe_add(dev); - return ret; + return -EPROBE_DEFER; } ret = device_links_check_suppliers(dev); - if (ret == -EPROBE_DEFER) - driver_deferred_probe_add_trigger(dev, local_trigger_count); if (ret) return ret; - atomic_inc(&probe_count); pr_debug("bus: '%s': %s: probing driver %s with device %s\n", drv->bus->name, __func__, drv->name, dev_name(dev)); if (!list_empty(&dev->devres_head)) { @@ -550,14 +569,14 @@ static int really_probe(struct device *dev, struct device_driver *drv) goto probe_failed; } - if (dev->bus->probe) { - ret = dev->bus->probe(dev); - if (ret) - goto probe_failed; - } else if (drv->probe) { - ret = drv->probe(dev); - if (ret) - goto probe_failed; + ret = call_driver_probe(dev, drv); + if (ret) { + /* + * Return probe errors as positive values so that the callers + * can distinguish them from other errors. + */ + ret = -ret; + goto probe_failed; } if (device_add_groups(dev, drv->dev_groups)) { @@ -592,7 +611,6 @@ static int really_probe(struct device *dev, struct device_driver *drv) dev->pm_domain->sync(dev); driver_bound(dev); - ret = 1; pr_debug("bus: '%s': %s: bound device %s to driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); goto done; @@ -618,31 +636,7 @@ static int really_probe(struct device *dev, struct device_driver *drv) pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); - switch (ret) { - case -EPROBE_DEFER: - /* Driver requested deferred probing */ - dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); - driver_deferred_probe_add_trigger(dev, local_trigger_count); - break; - case -ENODEV: - case -ENXIO: - pr_debug("%s: probe of %s rejects match %d\n", - drv->name, dev_name(dev), ret); - break; - default: - /* driver matched but the probe failed */ - printk(KERN_WARNING - "%s: probe of %s failed with error %d\n", - drv->name, dev_name(dev), ret); - } - /* - * Ignore errors returned by ->probe so that the next driver can try - * its luck. 
- */ - ret = 0; done: - atomic_dec(&probe_count); - wake_up_all(&probe_waitqueue); return ret; } @@ -693,25 +687,14 @@ void wait_for_device_probe(void) } EXPORT_SYMBOL_GPL(wait_for_device_probe); -/** - * driver_probe_device - attempt to bind device & driver together - * @drv: driver to bind a device to - * @dev: device to try to bind to the driver - * - * This function returns -ENODEV if the device is not registered, - * 1 if the device is bound successfully and 0 otherwise. - * - * This function must be called with @dev lock held. When called for a - * USB interface, @dev->parent lock must be held as well. - * - * If the device has a parent, runtime-resume the parent before driver probing. - */ -int driver_probe_device(struct device_driver *drv, struct device *dev) +static int __driver_probe_device(struct device_driver *drv, struct device *dev) { int ret = 0; - if (!device_is_registered(dev)) + if (dev->p->dead || !device_is_registered(dev)) return -ENODEV; + if (dev->driver) + return -EBUSY; pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); @@ -734,6 +717,42 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) return ret; } +/** + * driver_probe_device - attempt to bind device & driver together + * @drv: driver to bind a device to + * @dev: device to try to bind to the driver + * + * This function returns -ENODEV if the device is not registered, -EBUSY if it + * already has a driver, 0 if the device is bound successfully and a positive + * (inverted) error code for failures from the ->probe method. + * + * This function must be called with @dev lock held. When called for a + * USB interface, @dev->parent lock must be held as well. + * + * If the device has a parent, runtime-resume the parent before driver probing. + */ +static int driver_probe_device(struct device_driver *drv, struct device *dev) +{ + int trigger_count = atomic_read(&deferred_trigger_count); + int ret; + + atomic_inc(&probe_count); + ret = __driver_probe_device(drv, dev); + if (ret == -EPROBE_DEFER || ret == EPROBE_DEFER) { + driver_deferred_probe_add(dev); + + /* + * Did a trigger occur while probing? Need to re-trigger if yes + */ + if (trigger_count != atomic_read(&deferred_trigger_count) && + !defer_all_probes) + driver_deferred_probe_trigger(); + } + atomic_dec(&probe_count); + wake_up_all(&probe_waitqueue); + return ret; +} + static inline bool cmdline_requested_async_probing(const char *drv_name) { return parse_option_str(async_probe_drv_names, drv_name); @@ -831,7 +850,14 @@ static int __device_attach_driver(struct device_driver *drv, void *_data) if (data->check_async && async_allowed != data->want_async) return 0; - return driver_probe_device(drv, dev); + /* + * Ignore errors returned by ->probe so that the next driver can try + * its luck. + */ + ret = driver_probe_device(drv, dev); + if (ret < 0) + return ret; + return ret == 0; } static void __device_attach_async_helper(void *_dev, async_cookie_t cookie) @@ -987,43 +1013,34 @@ static void __device_driver_unlock(struct device *dev, struct device *parent) * @dev: Device to attach it to * * Manually attach driver to a device. Will acquire both @dev lock and - * @dev->parent lock if needed. + * @dev->parent lock if needed. Returns 0 on success, -ERR on failure. 
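+ * A probe deferral from the driver is reported to the caller as -EAGAIN.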
*/ int device_driver_attach(struct device_driver *drv, struct device *dev) { - int ret = 0; + int ret; __device_driver_lock(dev, dev->parent); - - /* - * If device has been removed or someone has already successfully - * bound a driver before us just skip the driver probe call. - */ - if (!dev->p->dead && !dev->driver) - ret = driver_probe_device(drv, dev); - + ret = __driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); + /* also return probe errors as normal negative errnos */ + if (ret > 0) + ret = -ret; + if (ret == -EPROBE_DEFER) + return -EAGAIN; return ret; } +EXPORT_SYMBOL_GPL(device_driver_attach); static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_driver *drv; - int ret = 0; + int ret; __device_driver_lock(dev, dev->parent); - drv = dev->p->async_driver; - - /* - * If device has been removed or someone has already successfully - * bound a driver before us just skip the driver probe call. - */ - if (!dev->p->dead && !dev->driver) - ret = driver_probe_device(drv, dev); - + ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret); @@ -1077,7 +1094,9 @@ static int __driver_attach(struct device *dev, void *data) return 0; } - device_driver_attach(drv, dev); + __device_driver_lock(dev, dev->parent); + driver_probe_device(drv, dev); + __device_driver_unlock(dev, dev->parent); return 0; } diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 8da314b81eabbe904aefffdd95270b249d7cc5b7..319785758334b906cfe867d2eb023c4d3ec8a640 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -95,6 +95,8 @@ static void platform_msi_update_chip_ops(struct msi_domain_info *info) chip->irq_mask = irq_chip_mask_parent; if (!chip->irq_unmask) chip->irq_unmask = irq_chip_unmask_parent; + if (!chip->irq_ack) + chip->irq_ack = irq_chip_ack_parent; if (!chip->irq_eoi) chip->irq_eoi = irq_chip_eoi_parent; if (!chip->irq_set_affinity) @@ -410,3 +412,187 @@ int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, return err; } + +#ifdef CONFIG_DEVICE_MSI +/* + * Device specific MSI domain infrastructure for devices which have their + * own resource management and interrupt chip. These devices are not + * related to PCI and contrary to platform MSI they do not share a common + * resource and interrupt chip. They provide their own domain specific + * resource management and interrupt chip. + */ + +static void device_msi_free_msi_entries(struct device *dev) +{ + struct list_head *msi_list = dev_to_dev_msi_list(dev); + struct msi_desc *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, msi_list, list) { + list_del(&entry->list); + free_msi_entry(entry); + } +} + +/** + * device_msi_free_irqs - Free MSI interrupts assigned to a device + * @dev: Pointer to the device + * + * Frees the interrupt and the MSI descriptors. 
+ */ +static void device_msi_free_irqs(struct irq_domain *domain, struct device *dev) +{ + __msi_domain_free_irqs(domain, dev); + device_msi_free_msi_entries(dev); +} + +static void device_msi_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq) +{ + struct msi_desc *entry, *tmp; + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + if (ops->msi_free_irq) + ops->msi_free_irq(domain, dev, irq); + + list_for_each_entry_safe(entry, tmp, dev_to_dev_msi_list(dev), list) { + if (entry->irq == irq) { + list_del(&entry->list); + free_msi_entry(entry); + } + } + + __msi_domain_free_irq(domain, dev, irq); +} + +/** + * device_msi_alloc_irqs - Allocate MSI interrupts for a device + * @dev: Pointer to the device + * @nvec: Number of vectors + * + * Allocates the required number of MSI descriptors and the corresponding + * interrupt descriptors. + */ +static int device_msi_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) +{ + int i, ret = -ENOMEM; + + dev->dev_msi_last_list = dev->dev_msi_list.prev; + + for (i = 0; i < nvec; i++) { + struct msi_desc *entry = alloc_msi_entry(dev, 1, NULL); + + if (!entry) + goto fail; + list_add_tail(&entry->list, dev_to_dev_msi_list(dev)); + } + + ret = __msi_domain_alloc_irqs(domain, dev, nvec); + if (!ret) + return 0; +fail: + device_msi_free_msi_entries(dev); + return ret; +} + +int device_msi_add_irq(struct irq_domain *domain, struct device *dev) +{ + struct msi_desc *entry; + + dev->dev_msi_last_list = dev->dev_msi_list.prev; + + entry = alloc_msi_entry(dev, 1, NULL); + if (!entry) + goto fail; + list_add_tail(&entry->list, dev_to_dev_msi_list(dev)); + + entry = list_last_entry(dev_to_dev_msi_list(dev), struct msi_desc, list); + if (!__msi_domain_alloc_irqs(domain, dev, 1)) + return entry->device_msi.hwirq; + + list_del(&entry->list); +fail: + free_msi_entry(entry); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(device_msi_add_irq); + +static void device_msi_update_dom_ops(struct msi_domain_info *info) +{ + if (!info->ops->domain_alloc_irqs) + info->ops->domain_alloc_irqs = device_msi_alloc_irqs; + if (!info->ops->domain_free_irqs) + info->ops->domain_free_irqs = device_msi_free_irqs; + if (!info->ops->domain_free_irq) + info->ops->domain_free_irq = device_msi_free_irq; + if (!info->ops->msi_prepare) + info->ops->msi_prepare = arch_msi_prepare; +} + +/** + * device_msi_create_msi_irq_domain - Create an irq domain for devices + * @fwnode: Firmware node of the interrupt controller + * @info: MSI domain info to configure the new domain + * @parent: Parent domain + */ +struct irq_domain *device_msi_create_irq_domain(struct fwnode_handle *fn, + struct msi_domain_info *info, + struct irq_domain *parent) +{ + struct irq_domain *domain; + + if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) + platform_msi_update_chip_ops(info); + + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) + device_msi_update_dom_ops(info); + + msi_domain_set_default_info_flags(info); + + domain = msi_create_irq_domain(fn, info, parent); + if (domain) + irq_domain_update_bus_token(domain, DOMAIN_BUS_DEVICE_MSI); + return domain; +} + +#ifdef CONFIG_PCI +#include + +bool __weak arch_support_pci_device_msi(struct pci_dev *pdev) +{ + return false; +} + +/** + * pci_subdevice_msi_create_irq_domain - Create an irq domain for subdevices + * @pdev: Pointer to PCI device for which the subdevice domain is created + * @info: MSI domain info to configure the new domain + */ +struct irq_domain *pci_subdevice_msi_create_irq_domain(struct pci_dev 
*pdev, + struct msi_domain_info *info) +{ + struct irq_domain *domain, *pdev_msi; + struct fwnode_handle *fn; + + if (!arch_support_pci_device_msi(pdev)) + return NULL; + + /* + * Retrieve the MSI domain of the underlying PCI device's MSI + * domain. The PCI device domain's parent domain is also the parent + * domain of the new subdevice domain. + */ + pdev_msi = dev_get_msi_domain(&pdev->dev); + if (!pdev_msi) + return NULL; + + fn = irq_domain_alloc_named_fwnode(dev_name(&pdev->dev)); + if (!fn) + return NULL; + domain = device_msi_create_irq_domain(fn, info, pdev_msi->parent); + if (!domain) + irq_domain_free_fwnode(fn); + return domain; +} +EXPORT_SYMBOL_GPL(pci_subdevice_msi_create_irq_domain); +#endif /* CONFIG_PCI */ +#endif /* CONFIG_DEVICE_MSI */ diff --git a/drivers/base/property.c b/drivers/base/property.c index 81bd01ed4042784b1d6b531720330353626de282..63903cf2819749edfd7fec23e97a71dd2e6bf798 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -556,6 +556,19 @@ int device_add_properties(struct device *dev, } EXPORT_SYMBOL_GPL(device_add_properties); +/** + * fwnode_get_parent - Return parent firwmare node + * @fwnode: Firmware whose parent is retrieved + * + * Return parent firmware node of the given node if possible or %NULL if no + * parent was available. + */ +struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) +{ + return fwnode_call_ptr_op(fwnode, get_parent); +} +EXPORT_SYMBOL_GPL(fwnode_get_parent); + /** * fwnode_get_next_parent - Iterate to the node's parent * @fwnode: Firmware whose parent is retrieved @@ -578,17 +591,27 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) EXPORT_SYMBOL_GPL(fwnode_get_next_parent); /** - * fwnode_get_parent - Return parent firwmare node - * @fwnode: Firmware whose parent is retrieved + * fwnode_get_name - Return the name of a node + * @fwnode: The firmware node * - * Return parent firmware node of the given node if possible or %NULL if no - * parent was available. + * Returns a pointer to the node name. */ -struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) +const char *fwnode_get_name(const struct fwnode_handle *fwnode) { - return fwnode_call_ptr_op(fwnode, get_parent); + return fwnode_call_ptr_op(fwnode, get_name); +} + +/** + * fwnode_get_name_prefix - Return the prefix of node for printing purposes + * @fwnode: The firmware node + * + * Returns the prefix of a node, intended to be printed right before the node. + * The prefix works also as a separator between the nodes. + */ +const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode) +{ + return fwnode_call_ptr_op(fwnode, get_name_prefix); } -EXPORT_SYMBOL_GPL(fwnode_get_parent); /** * fwnode_get_next_child_node - Return the next child node handle for a node @@ -956,6 +979,52 @@ fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); +/** + * fwnode_count_parents - Return the number of parents a node has + * @fwnode: The node the parents of which are to be counted + * + * Returns the number of parents a node has. 
+ */ +unsigned int fwnode_count_parents(const struct fwnode_handle *fwnode) +{ + struct fwnode_handle *__fwnode; + unsigned int count; + + __fwnode = fwnode_get_parent(fwnode); + + for (count = 0; __fwnode; count++) + __fwnode = fwnode_get_next_parent(__fwnode); + + return count; +} +EXPORT_SYMBOL_GPL(fwnode_count_parents); + +/** + * fwnode_get_nth_parent - Return an nth parent of a node + * @fwnode: The node the parent of which is requested + * @depth: Distance of the parent from the node + * + * Returns the nth parent of a node. If there is no parent at the requested + * @depth, %NULL is returned. If @depth is 0, the functionality is equivalent to + * fwnode_handle_get(). For @depth == 1, it is fwnode_get_parent() and so on. + * + * The caller is responsible for calling fwnode_handle_put() for the returned + * node. + */ +struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, + unsigned int depth) +{ + unsigned int i; + + fwnode_handle_get(fwnode); + + for (i = 0; i < depth && fwnode; i++) + fwnode = fwnode_get_next_parent(fwnode); + + return fwnode; +} +EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); + /** * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint * @fwnode: Endpoint firmware node pointing to the remote endpoint diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 4c3b9813b284318462152bb2963f59e85a499a92..0b890f75e59b8dc1eda9a0135b88aeb9c01ea60a 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -71,9 +71,9 @@ software_node_to_swnode(const struct software_node *node) return swnode; } -const struct software_node *to_software_node(struct fwnode_handle *fwnode) +const struct software_node *to_software_node(const struct fwnode_handle *fwnode) { - struct swnode *swnode = to_swnode(fwnode); + const struct swnode *swnode = to_swnode(fwnode); return swnode ? swnode->node : NULL; } @@ -515,6 +515,38 @@ static int software_node_read_string_array(const struct fwnode_handle *fwnode, propname, val, nval); } +static const char * +software_node_get_name(const struct fwnode_handle *fwnode) +{ + const struct swnode *swnode = to_swnode(fwnode); + + if (!swnode) + return "(null)"; + + return kobject_name(&swnode->kobj); +} + +static const char * +software_node_get_name_prefix(const struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent; + const char *prefix; + + parent = fwnode_get_parent(fwnode); + if (!parent) + return ""; + + /* Figure out the prefix from the parents. */ + while (is_software_node(parent)) + parent = fwnode_get_next_parent(parent); + + prefix = fwnode_get_name_prefix(parent); + fwnode_handle_put(parent); + + /* Guess something if prefix was NULL. 
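+ * Falling back to "/" keeps a separator in the printed path.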
*/ + return prefix ?: "/"; +} + static struct fwnode_handle * software_node_get_parent(const struct fwnode_handle *fwnode) { @@ -619,6 +651,8 @@ static const struct fwnode_operations software_node_ops = { .property_present = software_node_property_present, .property_read_int_array = software_node_read_int_array, .property_read_string_array = software_node_read_string_array, + .get_name = software_node_get_name, + .get_name_prefix = software_node_get_name_prefix, .get_parent = software_node_get_parent, .get_next_child_node = software_node_get_next_child, .get_named_child_node = software_node_get_named_child_node, diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 950dbc5635fce9a532824274e660ab23486a4580..751bdff249e31dd05428b8f1063b0fe7d8f4c975 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2021,7 +2021,8 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) * config ref and try to destroy the workqueue from inside the work * queue. */ - flush_workqueue(nbd->recv_workq); + if (nbd->recv_workq) + flush_workqueue(nbd->recv_workq); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c index 34bb88fe0b0a64c55935309c926f17efbdbc6d8a..2c2381a806ae7b8830a190a0aae6d811740af2f0 100644 --- a/drivers/char/ppdev.c +++ b/drivers/char/ppdev.c @@ -678,14 +678,6 @@ static long pp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ret; } -#ifdef CONFIG_COMPAT -static long pp_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return pp_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static int pp_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); @@ -794,9 +786,7 @@ static const struct file_operations pp_fops = { .write = pp_write, .poll = pp_poll, .unlocked_ioctl = pp_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = pp_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = pp_open, .release = pp_release, }; diff --git a/drivers/char/tpm/tpm_vtpm_proxy.c b/drivers/char/tpm/tpm_vtpm_proxy.c index 2f6e087ec49656b97e1838d02416b25b43ddaf8c..91c772e38bb54b31bbf0f22f000e6ca40d247791 100644 --- a/drivers/char/tpm/tpm_vtpm_proxy.c +++ b/drivers/char/tpm/tpm_vtpm_proxy.c @@ -670,20 +670,10 @@ static long vtpmx_fops_ioctl(struct file *f, unsigned int ioctl, } } -#ifdef CONFIG_COMPAT -static long vtpmx_fops_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vtpmx_fops_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations vtpmx_fops = { .owner = THIS_MODULE, .unlocked_ioctl = vtpmx_fops_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vtpmx_fops_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; diff --git a/drivers/cpufreq/Kconfig.powerpc b/drivers/cpufreq/Kconfig.powerpc index 35b4f700f05422a2d96f593411915e05e85d9d4f..58151ca566958ae3702a0c114a015aa99b515ca9 100644 --- a/drivers/cpufreq/Kconfig.powerpc +++ b/drivers/cpufreq/Kconfig.powerpc @@ -48,9 +48,9 @@ config PPC_PASEMI_CPUFREQ PWRficient processors. 
config POWERNV_CPUFREQ - tristate "CPU frequency scaling for IBM POWERNV platform" - depends on PPC_POWERNV - default y - help + tristate "CPU frequency scaling for IBM POWERNV platform" + depends on PPC_POWERNV + default y + help This adds support for CPU frequency switching on IBM POWERNV platform diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index dfa6457deaf6081456fa29ca9b833b5d2eb64c28..a6528388952eca47c2baf81253d2cacc2296122b 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -4,17 +4,17 @@ # config X86_INTEL_PSTATE - bool "Intel P state control" - depends on X86 - select ACPI_PROCESSOR if ACPI - select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO - help - This driver provides a P state for Intel core processors. + bool "Intel P state control" + depends on X86 + select ACPI_PROCESSOR if ACPI + select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO + help + This driver provides a P state for Intel core processors. The driver implements an internal governor and will become - the scaling driver and governor for Sandy bridge processors. + the scaling driver and governor for Sandy bridge processors. When this driver is enabled it will become the preferred - scaling driver for Sandy bridge processors. + scaling driver for Sandy bridge processors. If in doubt, say N. diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 88727b7c0d592cb50c92f085b624a7b659313b27..c0aeedd66f022ae7f92492d220a14685707338fc 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -16,7 +16,7 @@ config CPU_IDLE if CPU_IDLE config CPU_IDLE_MULTIPLE_DRIVERS - bool + bool config CPU_IDLE_GOV_LADDER bool "Ladder governor (for periodic timer tick)" @@ -63,13 +63,13 @@ source "drivers/cpuidle/Kconfig.powerpc" endmenu config HALTPOLL_CPUIDLE - tristate "Halt poll cpuidle driver" - depends on X86 && KVM_GUEST - default y - help - This option enables halt poll cpuidle driver, which allows to poll - before halting in the guest (more efficient than polling in the - host via halt_poll_ns for some scenarios). + tristate "Halt poll cpuidle driver" + depends on X86 && KVM_GUEST + default y + help + This option enables halt poll cpuidle driver, which allows to poll + before halting in the guest (more efficient than polling in the + host via halt_poll_ns for some scenarios). endif diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm index d8530475493cb552e308021b449aaab3d12c5382..62272ecfa771fee1a5a12d428dc9e99c0a84fd84 100644 --- a/drivers/cpuidle/Kconfig.arm +++ b/drivers/cpuidle/Kconfig.arm @@ -3,15 +3,15 @@ # ARM CPU Idle drivers # config ARM_CPUIDLE - bool "Generic ARM/ARM64 CPU idle Driver" - select DT_IDLE_STATES + bool "Generic ARM/ARM64 CPU idle Driver" + select DT_IDLE_STATES select CPU_IDLE_MULTIPLE_DRIVERS - help - Select this to enable generic cpuidle driver for ARM. - It provides a generic idle driver whose idle states are configured - at run-time through DT nodes. The CPUidle suspend backend is - initialized by calling the CPU operations init idle hook - provided by architecture code. + help + Select this to enable generic cpuidle driver for ARM. + It provides a generic idle driver whose idle states are configured + at run-time through DT nodes. The CPUidle suspend backend is + initialized by calling the CPU operations init idle hook + provided by architecture code. 
config ARM_PSCI_CPUIDLE bool "PSCI CPU idle Driver" @@ -25,7 +25,7 @@ config ARM_PSCI_CPUIDLE config ARM_BIG_LITTLE_CPUIDLE bool "Support for ARM big.LITTLE processors" - depends on ARCH_VEXPRESS_TC2_PM || ARCH_EXYNOS + depends on ARCH_VEXPRESS_TC2_PM || ARCH_EXYNOS || COMPILE_TEST depends on MCPM && !ARM64 select ARM_CPU_SUSPEND select CPU_IDLE_MULTIPLE_DRIVERS @@ -51,13 +51,13 @@ config ARM_HIGHBANK_CPUIDLE config ARM_KIRKWOOD_CPUIDLE bool "CPU Idle Driver for Marvell Kirkwood SoCs" - depends on MACH_KIRKWOOD && !ARM64 + depends on (MACH_KIRKWOOD || COMPILE_TEST) && !ARM64 help This adds the CPU Idle driver for Marvell Kirkwood SoCs. config ARM_ZYNQ_CPUIDLE bool "CPU Idle Driver for Xilinx Zynq processors" - depends on ARCH_ZYNQ && !ARM64 + depends on (ARCH_ZYNQ || COMPILE_TEST) && !ARM64 help Select this to enable cpuidle on Xilinx Zynq processors. @@ -65,24 +65,24 @@ config ARM_U8500_CPUIDLE bool "Cpu Idle Driver for the ST-E u8500 processors" depends on ARCH_U8500 && !ARM64 help - Select this to enable cpuidle for ST-E u8500 processors + Select this to enable cpuidle for ST-E u8500 processors. config ARM_AT91_CPUIDLE bool "Cpu Idle Driver for the AT91 processors" default y - depends on ARCH_AT91 && !ARM64 + depends on (ARCH_AT91 || COMPILE_TEST) && !ARM64 help - Select this to enable cpuidle for AT91 processors + Select this to enable cpuidle for AT91 processors. config ARM_EXYNOS_CPUIDLE bool "Cpu Idle Driver for the Exynos processors" - depends on ARCH_EXYNOS && !ARM64 + depends on (ARCH_EXYNOS || COMPILE_TEST) && !ARM64 select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP help - Select this to enable cpuidle for Exynos processors + Select this to enable cpuidle for Exynos processors. config ARM_MVEBU_V7_CPUIDLE bool "CPU Idle Driver for mvebu v7 family processors" - depends on ARCH_MVEBU && !ARM64 + depends on (ARCH_MVEBU || COMPILE_TEST) && !ARM64 help Select this to enable cpuidle on Armada 370, 38x and XP processors. diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index b607278df25b4a09e639922b57a0b642e6366a46..04003b90dc4974ee210e5d4ae4570d295541fe8a 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -89,6 +89,7 @@ * @coupled_cpus: mask of cpus that are part of the coupled set * @requested_state: array of requested states for cpus in the coupled set * @ready_waiting_counts: combined count of cpus in ready or waiting loops + * @abort_barrier: synchronisation point for abort cases * @online_count: count of cpus that are online * @refcnt: reference count of cpuidle devices that are using this struct * @prevent: flag to prevent coupled idle while a cpu is hotplugging @@ -338,7 +339,7 @@ static void cpuidle_coupled_poke(int cpu) /** * cpuidle_coupled_poke_others - wake up all other cpus that may be waiting - * @dev: struct cpuidle_device for this cpu + * @this_cpu: target cpu * @coupled: the struct coupled that contains the current cpu * * Calls cpuidle_coupled_poke on all other online cpus. 
@@ -355,7 +356,7 @@ static void cpuidle_coupled_poke_others(int this_cpu, /** * cpuidle_coupled_set_waiting - mark this cpu as in the wait loop - * @dev: struct cpuidle_device for this cpu + * @cpu: target cpu * @coupled: the struct coupled that contains the current cpu * @next_state: the index in drv->states of the requested state for this cpu * @@ -376,7 +377,7 @@ static int cpuidle_coupled_set_waiting(int cpu, /** * cpuidle_coupled_set_not_waiting - mark this cpu as leaving the wait loop - * @dev: struct cpuidle_device for this cpu + * @cpu: target cpu * @coupled: the struct coupled that contains the current cpu * * Removes the requested idle state for the specified cpuidle device. @@ -412,7 +413,7 @@ static void cpuidle_coupled_set_done(int cpu, struct cpuidle_coupled *coupled) /** * cpuidle_coupled_clear_pokes - spin until the poke interrupt is processed - * @cpu - this cpu + * @cpu: this cpu * * Turns on interrupts and spins until any outstanding poke interrupts have * been processed and the poke bit has been cleared. diff --git a/drivers/cpuidle/cpuidle-clps711x.c b/drivers/cpuidle/cpuidle-clps711x.c index 6e36740f5719ba0fff74aa713ab7728d9362bc8c..fc22c59b6c73b5016a9490786d427e4aea60cd96 100644 --- a/drivers/cpuidle/cpuidle-clps711x.c +++ b/drivers/cpuidle/cpuidle-clps711x.c @@ -37,10 +37,7 @@ static struct cpuidle_driver clps711x_idle_driver = { static int __init clps711x_cpuidle_probe(struct platform_device *pdev) { - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - clps711x_halt = devm_ioremap_resource(&pdev->dev, res); + clps711x_halt = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(clps711x_halt)) return PTR_ERR(clps711x_halt); diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index b0ce9bc78113e6bde532101ffe0beba9c9baf860..3a39a7f48b7712632d65515a79cb1f6ebec95cdc 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -18,6 +18,10 @@ #include #include +static bool force __read_mostly; +module_param(force, bool, 0444); +MODULE_PARM_DESC(force, "Load unconditionally"); + static struct cpuidle_device __percpu *haltpoll_cpuidle_devices; static enum cpuhp_state haltpoll_hp_state; @@ -90,6 +94,11 @@ static void haltpoll_uninit(void) haltpoll_cpuidle_devices = NULL; } +static bool haltpoll_want(void) +{ + return kvm_para_has_hint(KVM_HINTS_REALTIME) || force; +} + static int __init haltpoll_init(void) { int ret; @@ -99,12 +108,11 @@ static int __init haltpoll_init(void) if (boot_option_idle_override != IDLE_NO_OVERRIDE) return -ENODEV; - cpuidle_poll_state_init(drv); - - if (!kvm_para_available() || - !kvm_para_has_hint(KVM_HINTS_REALTIME)) + if (!kvm_para_available() || !haltpoll_want()) return -ENODEV; + cpuidle_poll_state_init(drv); + ret = cpuidle_register_driver(drv); if (ret < 0) return ret; diff --git a/drivers/cpuidle/cpuidle-kirkwood.c b/drivers/cpuidle/cpuidle-kirkwood.c index d23d8f468c12212136861461cf5ca764c93f5668..511c4f46027a941d920cf10fecd4cea9d6dad5c0 100644 --- a/drivers/cpuidle/cpuidle-kirkwood.c +++ b/drivers/cpuidle/cpuidle-kirkwood.c @@ -55,10 +55,7 @@ static struct cpuidle_driver kirkwood_idle_driver = { /* Initialize CPU idle by registering the idle states */ static int kirkwood_cpuidle_probe(struct platform_device *pdev) { - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - ddr_operation_base = devm_ioremap_resource(&pdev->dev, res); + ddr_operation_base = devm_platform_ioremap_resource(pdev, 0); if 
(IS_ERR(ddr_operation_base)) return PTR_ERR(ddr_operation_base); diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe212b39b6ae55f3ffc7714a2842f7e3bd2..1b299e801f749a57daadb27fba502468f225c2a4 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -56,13 +56,10 @@ static u64 get_snooze_timeout(struct cpuidle_device *dev, return default_snooze_timeout; for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - - if (s->disabled || su->disable) + if (dev->states_usage[i].disable) continue; - return s->target_residency * tb_ticks_per_usec; + return drv->states[i].target_residency * tb_ticks_per_usec; } return default_snooze_timeout; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 4c4ce65f399b586af1fab9e066c1ea8517444e2b..0acdbc7b0ef3e8b5abb0e5f663a206e033347ed7 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "cpuidle.h" @@ -75,44 +76,45 @@ int cpuidle_play_dead(void) static int find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, - unsigned int max_latency, + u64 max_latency_ns, unsigned int forbidden_flags, bool s2idle) { - unsigned int latency_req = 0; + u64 latency_req = 0; int i, ret = 0; for (i = 1; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - if (s->disabled || su->disable || s->exit_latency <= latency_req - || s->exit_latency > max_latency - || (s->flags & forbidden_flags) - || (s2idle && !s->enter_s2idle)) + if (dev->states_usage[i].disable || + s->exit_latency_ns <= latency_req || + s->exit_latency_ns > max_latency_ns || + (s->flags & forbidden_flags) || + (s2idle && !s->enter_s2idle)) continue; - latency_req = s->exit_latency; + latency_req = s->exit_latency_ns; ret = i; } return ret; } /** - * cpuidle_use_deepest_state - Set/clear governor override flag. - * @enable: New value of the flag. + * cpuidle_use_deepest_state - Set/unset governor override mode. + * @latency_limit_ns: Idle state exit latency limit (or no override if 0). * - * Set/unset the current CPU to use the deepest idle state (override governors - * going forward if set). + * If @latency_limit_ns is nonzero, set the current CPU to use the deepest idle + * state with exit latency within @latency_limit_ns (override governors going + * forward), or do not override governors if it is zero. */ -void cpuidle_use_deepest_state(bool enable) +void cpuidle_use_deepest_state(u64 latency_limit_ns) { struct cpuidle_device *dev; preempt_disable(); dev = cpuidle_get_device(); if (dev) - dev->use_deepest_state = enable; + dev->forced_idle_latency_limit_ns = latency_limit_ns; preempt_enable(); } @@ -120,11 +122,15 @@ void cpuidle_use_deepest_state(bool enable) * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. * @dev: cpuidle device for the given CPU. + * @latency_limit_ns: Idle state exit latency limit + * + * Return: the index of the deepest available idle state. 
*/ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) { - return find_deepest_state(drv, dev, UINT_MAX, 0, false); + return find_deepest_state(drv, dev, latency_limit_ns, 0, false); } #ifdef CONFIG_SUSPEND @@ -181,7 +187,7 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) * that interrupts won't be enabled when it exits and allows the tick to * be frozen safely. */ - index = find_deepest_state(drv, dev, UINT_MAX, 0, true); + index = find_deepest_state(drv, dev, U64_MAX, 0, true); if (index > 0) enter_s2idle_proper(drv, dev, index); @@ -210,7 +216,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * CPU as a broadcast timer, this call may fail if it is not available. */ if (broadcast && tick_broadcast_enter()) { - index = find_deepest_state(drv, dev, target_state->exit_latency, + index = find_deepest_state(drv, dev, target_state->exit_latency_ns, CPUIDLE_FLAG_TIMER_STOP, false); if (index < 0) { default_idle_call(); @@ -220,6 +226,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, broadcast = false; } + if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED) + leave_mm(dev->cpu); + /* Take note of the planned idle state. */ sched_idle_set_state(target_state); @@ -248,7 +257,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, local_irq_enable(); if (entered_state >= 0) { - s64 diff, delay = drv->states[entered_state].exit_latency; + s64 diff, delay = drv->states[entered_state].exit_latency_ns; int i; /* @@ -256,18 +265,15 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * This can be moved to within driver enter routine, * but that results in multiple copies of same code. */ - diff = ktime_us_delta(time_end, time_start); - if (diff > INT_MAX) - diff = INT_MAX; + diff = ktime_sub(time_end, time_start); - dev->last_residency = (int)diff; - dev->states_usage[entered_state].time += dev->last_residency; + dev->last_residency_ns = diff; + dev->states_usage[entered_state].time_ns += diff; dev->states_usage[entered_state].usage++; - if (diff < drv->states[entered_state].target_residency) { + if (diff < drv->states[entered_state].target_residency_ns) { for (i = entered_state - 1; i >= 0; i--) { - if (drv->states[i].disabled || - dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; /* Shallower states are enabled, so update. */ @@ -276,22 +282,21 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, } } else if (diff > delay) { for (i = entered_state + 1; i < drv->state_count; i++) { - if (drv->states[i].disabled || - dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; /* * Update if a deeper state would have been a * better match for the observed idle duration. 
*/ - if (diff - delay >= drv->states[i].target_residency) + if (diff - delay >= drv->states[i].target_residency_ns) dev->states_usage[entered_state].below++; break; } } } else { - dev->last_residency = 0; + dev->last_residency_ns = 0; } return entered_state; @@ -381,10 +386,10 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv, limit_ns = TICK_NSEC; for (i = 1; i < drv->state_count; i++) { - if (drv->states[i].disabled || dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; - limit_ns = (u64)drv->states[i].target_residency * NSEC_PER_USEC; + limit_ns = drv->states[i].target_residency_ns; break; } @@ -556,7 +561,7 @@ static void __cpuidle_unregister_device(struct cpuidle_device *dev) static void __cpuidle_device_init(struct cpuidle_device *dev) { memset(dev->states_usage, 0, sizeof(dev->states_usage)); - dev->last_residency = 0; + dev->last_residency_ns = 0; dev->next_hrtimer = 0; } @@ -576,6 +581,9 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) return -EINVAL; for (i = 0; i < drv->state_count; i++) { + if (drv->states[i].flags & CPUIDLE_FLAG_UNUSABLE) + dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER; + if (drv->states[i].flags & CPUIDLE_FLAG_OFF) dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_USER; } diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 9db154224999c9e6d7a2704b0b685bdb7658b134..4070e573bf43a49ee638085448d46e705894e74d 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -155,8 +155,6 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) { int i; - drv->refcnt = 0; - /* * Use all possible CPUs as the default, because if the kernel boots * with some CPUs offline and then we online one of them, the CPU @@ -165,16 +163,27 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) if (!drv->cpumask) drv->cpumask = (struct cpumask *)cpu_possible_mask; - /* - * Look for the timer stop flag in the different states, so that we know - * if the broadcast timer has to be set up. The loop is in the reverse - * order, because usually one of the deeper states have this flag set. - */ - for (i = drv->state_count - 1; i >= 0 ; i--) { - if (drv->states[i].flags & CPUIDLE_FLAG_TIMER_STOP) { + for (i = 0; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + /* + * Look for the timer stop flag in the different states and if + * it is found, indicate that the broadcast timer has to be set + * up. + */ + if (s->flags & CPUIDLE_FLAG_TIMER_STOP) drv->bctimer = 1; - break; - } + + /* + * The core will use the target residency and exit latency + * values in nanoseconds, but allow drivers to provide them in + * microseconds too. + */ + if (s->target_residency > 0) + s->target_residency_ns = s->target_residency * NSEC_PER_USEC; + + if (s->exit_latency > 0) + s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; } } @@ -229,9 +238,6 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv) */ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv) { - if (WARN_ON(drv->refcnt > 0)) - return; - if (drv->bctimer) { drv->bctimer = 0; on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer, @@ -339,42 +345,39 @@ struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev) EXPORT_SYMBOL_GPL(cpuidle_get_cpu_driver); /** - * cpuidle_driver_ref - get a reference to the driver. - * - * Increment the reference counter of the cpuidle driver associated with - * the current CPU. 
- * - * Returns a pointer to the driver, or NULL if the current CPU has no driver. + * cpuidle_driver_state_disabled - Disable or enable an idle state + * @drv: cpuidle driver owning the state + * @idx: State index + * @disable: Whether or not to disable the state */ -struct cpuidle_driver *cpuidle_driver_ref(void) +void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, + bool disable) { - struct cpuidle_driver *drv; + unsigned int cpu; - spin_lock(&cpuidle_driver_lock); + mutex_lock(&cpuidle_lock); - drv = cpuidle_get_driver(); - if (drv) - drv->refcnt++; + spin_lock(&cpuidle_driver_lock); - spin_unlock(&cpuidle_driver_lock); - return drv; -} + if (!drv->cpumask) { + drv->states[idx].flags |= CPUIDLE_FLAG_UNUSABLE; + goto unlock; + } -/** - * cpuidle_driver_unref - puts down the refcount for the driver - * - * Decrement the reference counter of the cpuidle driver associated with - * the current CPU. - */ -void cpuidle_driver_unref(void) -{ - struct cpuidle_driver *drv; + for_each_cpu(cpu, drv->cpumask) { + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); - spin_lock(&cpuidle_driver_lock); + if (!dev) + continue; - drv = cpuidle_get_driver(); - if (drv && !WARN_ON(drv->refcnt <= 0)) - drv->refcnt--; + if (disable) + dev->states_usage[idx].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER; + else + dev->states_usage[idx].disable &= ~CPUIDLE_STATE_DISABLED_BY_DRIVER; + } +unlock: spin_unlock(&cpuidle_driver_lock); + + mutex_unlock(&cpuidle_lock); } diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index e9801f26c73270c6665a117a60b1e45d32186217..e48271e117a32852eac4674a1f332f97db682834 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -107,11 +107,14 @@ int cpuidle_register_governor(struct cpuidle_governor *gov) * cpuidle_governor_latency_req - Compute a latency constraint for CPU * @cpu: Target CPU */ -int cpuidle_governor_latency_req(unsigned int cpu) +s64 cpuidle_governor_latency_req(unsigned int cpu) { int global_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); struct device *device = get_cpu_device(cpu); int device_req = dev_pm_qos_raw_resume_latency(device); - return device_req < global_req ? 
device_req : global_req; + if (device_req > global_req) + device_req = global_req; + + return (s64)device_req * NSEC_PER_USEC; } diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c index 7a703d2e006403cc73cd443a326c5adcb3bf58c9..cb2a96eafc02750acc5e7e66fa53cea18a50523a 100644 --- a/drivers/cpuidle/governors/haltpoll.c +++ b/drivers/cpuidle/governors/haltpoll.c @@ -49,7 +49,7 @@ static int haltpoll_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - int latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); if (!drv->state_count || latency_req == 0) { *stop_tick = false; @@ -75,10 +75,9 @@ static int haltpoll_select(struct cpuidle_driver *drv, return 0; } -static void adjust_poll_limit(struct cpuidle_device *dev, unsigned int block_us) +static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns) { unsigned int val; - u64 block_ns = block_us*NSEC_PER_USEC; /* Grow cpu_halt_poll_us if * cpu_halt_poll_us < block_ns < guest_halt_poll_us @@ -115,7 +114,7 @@ static void haltpoll_reflect(struct cpuidle_device *dev, int index) dev->last_state_idx = index; if (index != 0) - adjust_poll_limit(dev, dev->last_residency); + adjust_poll_limit(dev, dev->last_residency_ns); } /** diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 428eeb832fe7cd32cd8fc5fee37c7f980f70586c..8e9058c4ea63e1217ad33d2f61dd8e768187af0a 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -27,8 +27,8 @@ struct ladder_device_state { struct { u32 promotion_count; u32 demotion_count; - u32 promotion_time; - u32 demotion_time; + u64 promotion_time_ns; + u64 demotion_time_ns; } threshold; struct { int promotion_count; @@ -68,9 +68,10 @@ static int ladder_select_state(struct cpuidle_driver *drv, { struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct ladder_device_state *last_state; - int last_residency, last_idx = dev->last_state_idx; + int last_idx = dev->last_state_idx; int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 
1 : 0; - int latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 last_residency; /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) { @@ -80,14 +81,13 @@ static int ladder_select_state(struct cpuidle_driver *drv, last_state = &ldev->states[last_idx]; - last_residency = dev->last_residency - drv->states[last_idx].exit_latency; + last_residency = dev->last_residency_ns - drv->states[last_idx].exit_latency_ns; /* consider promotion */ if (last_idx < drv->state_count - 1 && - !drv->states[last_idx + 1].disabled && !dev->states_usage[last_idx + 1].disable && - last_residency > last_state->threshold.promotion_time && - drv->states[last_idx + 1].exit_latency <= latency_req) { + last_residency > last_state->threshold.promotion_time_ns && + drv->states[last_idx + 1].exit_latency_ns <= latency_req) { last_state->stats.promotion_count++; last_state->stats.demotion_count = 0; if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { @@ -98,13 +98,12 @@ static int ladder_select_state(struct cpuidle_driver *drv, /* consider demotion */ if (last_idx > first_idx && - (drv->states[last_idx].disabled || - dev->states_usage[last_idx].disable || - drv->states[last_idx].exit_latency > latency_req)) { + (dev->states_usage[last_idx].disable || + drv->states[last_idx].exit_latency_ns > latency_req)) { int i; for (i = last_idx - 1; i > first_idx; i--) { - if (drv->states[i].exit_latency <= latency_req) + if (drv->states[i].exit_latency_ns <= latency_req) break; } ladder_do_selection(dev, ldev, last_idx, i); @@ -112,7 +111,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, } if (last_idx > first_idx && - last_residency < last_state->threshold.demotion_time) { + last_residency < last_state->threshold.demotion_time_ns) { last_state->stats.demotion_count++; last_state->stats.promotion_count = 0; if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { @@ -152,9 +151,9 @@ static int ladder_enable_device(struct cpuidle_driver *drv, lstate->threshold.demotion_count = DEMOTION_COUNT; if (i < drv->state_count - 1) - lstate->threshold.promotion_time = state->exit_latency; + lstate->threshold.promotion_time_ns = state->exit_latency_ns; if (i > first_idx) - lstate->threshold.demotion_time = state->exit_latency; + lstate->threshold.demotion_time_ns = state->exit_latency_ns; } return 0; diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index e5a5d0c8d66b1629a69f5e0a5c46f249fd49bf20..c3aa8d6ccee338870da3e514f36959bb295a160d 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,22 +19,12 @@ #include #include -/* - * Please note when changing the tuning values: - * If (MAX_INTERESTING-1) * RESOLUTION > UINT_MAX, the result of - * a scaling operation multiplication may overflow on 32 bit platforms. - * In that case, #define RESOLUTION as ULL to get 64 bit result: - * #define RESOLUTION 1024ULL - * - * The default values do not overflow. 
- */ #define BUCKETS 12 #define INTERVAL_SHIFT 3 #define INTERVALS (1UL << INTERVAL_SHIFT) #define RESOLUTION 1024 #define DECAY 8 -#define MAX_INTERESTING 50000 - +#define MAX_INTERESTING (50000 * NSEC_PER_USEC) /* * Concepts and ideas behind the menu governor @@ -120,14 +110,14 @@ struct menu_device { int needs_update; int tick_wakeup; - unsigned int next_timer_us; + u64 next_timer_ns; unsigned int bucket; unsigned int correction_factor[BUCKETS]; unsigned int intervals[INTERVALS]; int interval_ptr; }; -static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters) +static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters) { int bucket = 0; @@ -140,15 +130,15 @@ static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters if (nr_iowaiters) bucket = BUCKETS/2; - if (duration < 10) + if (duration_ns < 10ULL * NSEC_PER_USEC) return bucket; - if (duration < 100) + if (duration_ns < 100ULL * NSEC_PER_USEC) return bucket + 1; - if (duration < 1000) + if (duration_ns < 1000ULL * NSEC_PER_USEC) return bucket + 2; - if (duration < 10000) + if (duration_ns < 10000ULL * NSEC_PER_USEC) return bucket + 3; - if (duration < 100000) + if (duration_ns < 100000ULL * NSEC_PER_USEC) return bucket + 4; return bucket + 5; } @@ -276,13 +266,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { struct menu_device *data = this_cpu_ptr(&menu_devices); - int latency_req = cpuidle_governor_latency_req(dev->cpu); - int i; - int idx; - unsigned int interactivity_req; + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); unsigned int predicted_us; + u64 predicted_ns; + u64 interactivity_req; unsigned long nr_iowaiters; - ktime_t delta_next; + ktime_t delta, delta_tick; + int i, idx; if (data->needs_update) { menu_update(drv, dev); @@ -290,15 +280,20 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* determine the expected residency time, round up */ - data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next)); + delta = tick_nohz_get_sleep_length(&delta_tick); + if (unlikely(delta < 0)) { + delta = 0; + delta_tick = 0; + } + data->next_timer_ns = delta; nr_iowaiters = nr_iowait_cpu(dev->cpu); - data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); + data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); if (unlikely(drv->state_count <= 1 || latency_req == 0) || - ((data->next_timer_us < drv->states[1].target_residency || - latency_req < drv->states[1].exit_latency) && - !drv->states[0].disabled && !dev->states_usage[0].disable)) { + ((data->next_timer_ns < drv->states[1].target_residency_ns || + latency_req < drv->states[1].exit_latency_ns) && + !dev->states_usage[0].disable)) { /* * In this case state[0] will be used no matter what, so return * it right away and keep the tick running if state[0] is a @@ -308,18 +303,15 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, return 0; } - /* - * Force the result of multiplication to be 64 bits even if both - * operands are 32 bits. - * Make sure to round up for half microseconds. - */ - predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us * - data->correction_factor[data->bucket], - RESOLUTION * DECAY); - /* - * Use the lowest expected idle interval to pick the idle state. - */ - predicted_us = min(predicted_us, get_typical_interval(data, predicted_us)); + /* Round up the result for half microseconds. 
*/ + predicted_us = div_u64(data->next_timer_ns * + data->correction_factor[data->bucket] + + (RESOLUTION * DECAY * NSEC_PER_USEC) / 2, + RESOLUTION * DECAY * NSEC_PER_USEC); + /* Use the lowest expected idle interval to pick the idle state. */ + predicted_ns = (u64)min(predicted_us, + get_typical_interval(data, predicted_us)) * + NSEC_PER_USEC; if (tick_nohz_tick_stopped()) { /* @@ -330,14 +322,15 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the known time till the closest timer event for the idle * state selection. */ - if (predicted_us < TICK_USEC) - predicted_us = ktime_to_us(delta_next); + if (predicted_ns < TICK_NSEC) + predicted_ns = data->next_timer_ns; } else { /* * Use the performance multiplier and the user-configurable * latency_req to determine the maximum exit latency. */ - interactivity_req = predicted_us / performance_multiplier(nr_iowaiters); + interactivity_req = div64_u64(predicted_ns, + performance_multiplier(nr_iowaiters)); if (latency_req > interactivity_req) latency_req = interactivity_req; } @@ -349,27 +342,26 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = -1; for (i = 0; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - if (s->disabled || su->disable) + if (dev->states_usage[i].disable) continue; if (idx == -1) idx = i; /* first enabled state */ - if (s->target_residency > predicted_us) { + if (s->target_residency_ns > predicted_ns) { /* * Use a physical idle state, not busy polling, unless * a timer is going to trigger soon enough. */ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && - s->exit_latency <= latency_req && - s->target_residency <= data->next_timer_us) { - predicted_us = s->target_residency; + s->exit_latency_ns <= latency_req && + s->target_residency_ns <= data->next_timer_ns) { + predicted_ns = s->target_residency_ns; idx = i; break; } - if (predicted_us < TICK_USEC) + if (predicted_ns < TICK_NSEC) break; if (!tick_nohz_tick_stopped()) { @@ -379,7 +371,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * tick in that case and let the governor run * again in the next iteration of the loop. */ - predicted_us = drv->states[idx].target_residency; + predicted_ns = drv->states[idx].target_residency_ns; break; } @@ -389,13 +381,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * closest timer event, select this one to avoid getting * stuck in the shallow one for too long. */ - if (drv->states[idx].target_residency < TICK_USEC && - s->target_residency <= ktime_to_us(delta_next)) + if (drv->states[idx].target_residency_ns < TICK_NSEC && + s->target_residency_ns <= delta_tick) idx = i; return idx; } - if (s->exit_latency > latency_req) + if (s->exit_latency_ns > latency_req) break; idx = i; @@ -409,12 +401,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration is shorter than the tick period length. 
*/ if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) { - unsigned int delta_next_us = ktime_to_us(delta_next); - + predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; - if (idx > 0 && drv->states[idx].target_residency > delta_next_us) { + if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) { /* * The tick is not going to be stopped and the target * residency of the state to be returned is not within @@ -422,12 +412,11 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * tick, so try to correct that. */ for (i = idx - 1; i >= 0; i--) { - if (drv->states[i].disabled || - dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; idx = i; - if (drv->states[i].target_residency <= delta_next_us) + if (drv->states[i].target_residency_ns <= delta_tick) break; } } @@ -463,7 +452,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) struct menu_device *data = this_cpu_ptr(&menu_devices); int last_idx = dev->last_state_idx; struct cpuidle_state *target = &drv->states[last_idx]; - unsigned int measured_us; + u64 measured_ns; unsigned int new_factor; /* @@ -481,7 +470,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * assume the state was never reached and the exit latency is 0. */ - if (data->tick_wakeup && data->next_timer_us > TICK_USEC) { + if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) { /* * The nohz code said that there wouldn't be any events within * the tick boundary (if the tick was stopped), but the idle @@ -491,7 +480,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * have been idle long (but not forever) to help the idle * duration predictor do a better job next time. */ - measured_us = 9 * MAX_INTERESTING / 10; + measured_ns = 9 * MAX_INTERESTING / 10; } else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) && dev->poll_time_limit) { /* @@ -501,28 +490,29 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * the CPU might have been woken up from idle by the next timer. * Assume that to be the case. 
*/ - measured_us = data->next_timer_us; + measured_ns = data->next_timer_ns; } else { /* measured value */ - measured_us = dev->last_residency; + measured_ns = dev->last_residency_ns; /* Deduct exit latency */ - if (measured_us > 2 * target->exit_latency) - measured_us -= target->exit_latency; + if (measured_ns > 2 * target->exit_latency_ns) + measured_ns -= target->exit_latency_ns; else - measured_us /= 2; + measured_ns /= 2; } /* Make sure our coefficients do not exceed unity */ - if (measured_us > data->next_timer_us) - measured_us = data->next_timer_us; + if (measured_ns > data->next_timer_ns) + measured_ns = data->next_timer_ns; /* Update our correction ratio */ new_factor = data->correction_factor[data->bucket]; new_factor -= new_factor / DECAY; - if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING) - new_factor += RESOLUTION * measured_us / data->next_timer_us; + if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING) + new_factor += div64_u64(RESOLUTION * measured_ns, + data->next_timer_ns); else /* * we were idle so long that we count it as a perfect @@ -542,7 +532,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->correction_factor[data->bucket] = new_factor; /* update the repeating-pattern data */ - data->intervals[data->interval_ptr++] = measured_us; + data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); if (data->interval_ptr >= INTERVALS) data->interval_ptr = 0; } diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index c71773c88890bff1779b88c04bc3a8919c9303ae..6deaaf5f05b5765598115132492edc4f654df9da 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -104,7 +104,7 @@ struct teo_cpu { u64 sleep_length_ns; struct teo_idle_state states[CPUIDLE_STATE_MAX]; int interval_idx; - unsigned int intervals[INTERVALS]; + u64 intervals[INTERVALS]; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -117,9 +117,8 @@ static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns); int i, idx_hit = -1, idx_timer = -1; - unsigned int measured_us; + u64 measured_ns; if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { /* @@ -127,23 +126,28 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * enough to the closest timer event expected at the idle state * selection time to be discarded. */ - measured_us = UINT_MAX; + measured_ns = U64_MAX; } else { - unsigned int lat; + u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - lat = drv->states[dev->last_state_idx].exit_latency; - - measured_us = ktime_to_us(cpu_data->time_span_ns); + /* + * The computations below are to determine whether or not the + * (saved) time till the next timer event and the measured idle + * duration fall into the same "bin", so use last_residency_ns + * for that instead of time_span_ns which includes the cpuidle + * overhead. + */ + measured_ns = dev->last_residency_ns; /* * The delay between the wakeup and the first instruction * executed by the CPU is not likely to be worst-case every * time, so take 1/2 of the exit latency as a very rough * approximation of the average of it. 
*/ - if (measured_us >= lat) - measured_us -= lat / 2; + if (measured_ns >= lat_ns) + measured_ns -= lat_ns / 2; else - measured_us /= 2; + measured_ns /= 2; } /* @@ -155,9 +159,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; - if (drv->states[i].target_residency <= sleep_length_us) { + if (drv->states[i].target_residency_ns <= cpu_data->sleep_length_ns) { idx_timer = i; - if (drv->states[i].target_residency <= measured_us) + if (drv->states[i].target_residency_ns <= measured_ns) idx_hit = i; } } @@ -193,30 +197,35 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * Save idle duration values corresponding to non-timer wakeups for * pattern detection. */ - cpu_data->intervals[cpu_data->interval_idx++] = measured_us; + cpu_data->intervals[cpu_data->interval_idx++] = measured_ns; if (cpu_data->interval_idx >= INTERVALS) cpu_data->interval_idx = 0; } +static bool teo_time_ok(u64 interval_ns) +{ + return !tick_nohz_tick_stopped() || interval_ns >= TICK_NSEC; +} + /** * teo_find_shallower_state - Find shallower idle state matching given duration. * @drv: cpuidle driver containing state data. * @dev: Target CPU. * @state_idx: Index of the capping idle state. - * @duration_us: Idle duration value to match. + * @duration_ns: Idle duration value to match. */ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - unsigned int duration_us) + u64 duration_ns) { int i; for (i = state_idx - 1; i >= 0; i--) { - if (drv->states[i].disabled || dev->states_usage[i].disable) + if (dev->states_usage[i].disable) continue; state_idx = i; - if (drv->states[i].target_residency <= duration_us) + if (drv->states[i].target_residency_ns <= duration_ns) break; } return state_idx; @@ -232,8 +241,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - int latency_req = cpuidle_governor_latency_req(dev->cpu); - unsigned int duration_us, hits, misses, early_hits; + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + u64 duration_ns; + unsigned int hits, misses, early_hits; int max_early_idx, prev_max_early_idx, constraint_idx, idx, i; ktime_t delta_tick; @@ -244,8 +254,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, cpu_data->time_span_ns = local_clock(); - cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); - duration_us = ktime_to_us(cpu_data->sleep_length_ns); + duration_ns = tick_nohz_get_sleep_length(&delta_tick); + cpu_data->sleep_length_ns = duration_ns; hits = 0; misses = 0; @@ -257,14 +267,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, for (i = 0; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - if (s->disabled || su->disable) { + if (dev->states_usage[i].disable) { /* * Ignore disabled states with target residencies beyond * the anticipated idle duration. */ - if (s->target_residency > duration_us) + if (s->target_residency_ns > duration_ns) continue; /* @@ -302,8 +311,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * check if the current candidate state is not too * shallow for that role. 
*/ - if (!(tick_nohz_tick_stopped() && - drv->states[idx].target_residency < TICK_USEC)) { + if (teo_time_ok(drv->states[idx].target_residency_ns)) { prev_max_early_idx = max_early_idx; early_hits = cpu_data->states[i].early_hits; max_early_idx = idx; @@ -318,10 +326,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, misses = cpu_data->states[i].misses; } - if (s->target_residency > duration_us) + if (s->target_residency_ns > duration_ns) break; - if (s->exit_latency > latency_req && constraint_idx > i) + if (s->exit_latency_ns > latency_req && constraint_idx > i) constraint_idx = i; idx = i; @@ -329,8 +337,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, misses = cpu_data->states[i].misses; if (early_hits < cpu_data->states[i].early_hits && - !(tick_nohz_tick_stopped() && - drv->states[i].target_residency < TICK_USEC)) { + teo_time_ok(drv->states[i].target_residency_ns)) { prev_max_early_idx = max_early_idx; early_hits = cpu_data->states[i].early_hits; max_early_idx = i; @@ -356,7 +363,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (max_early_idx >= 0) { idx = max_early_idx; - duration_us = drv->states[idx].target_residency; + duration_ns = drv->states[idx].target_residency_ns; } } @@ -378,9 +385,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the current expected idle duration value. */ for (i = 0; i < INTERVALS; i++) { - unsigned int val = cpu_data->intervals[i]; + u64 val = cpu_data->intervals[i]; - if (val >= duration_us) + if (val >= duration_ns) continue; count++; @@ -392,17 +399,17 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * values are in the interesting range. */ if (count > INTERVALS / 2) { - unsigned int avg_us = div64_u64(sum, count); + u64 avg_ns = div64_u64(sum, count); /* * Avoid spending too much time in an idle state that * would be too shallow. */ - if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { - duration_us = avg_us; - if (drv->states[idx].target_residency > avg_us) + if (teo_time_ok(avg_ns)) { + duration_ns = avg_ns; + if (drv->states[idx].target_residency_ns > avg_ns) idx = teo_find_shallower_state(drv, dev, - idx, avg_us); + idx, avg_ns); } } } @@ -412,9 +419,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration is shorter than the tick period length. */ if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) { - unsigned int delta_tick_us = ktime_to_us(delta_tick); - + duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; /* @@ -423,8 +428,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * till the closest timer including the tick, try to correct * that. 
*/ - if (idx > 0 && drv->states[idx].target_residency > delta_tick_us) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us); + if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) + idx = teo_find_shallower_state(drv, dev, idx, delta_tick); } return idx; @@ -468,7 +473,7 @@ static int teo_enable_device(struct cpuidle_driver *drv, memset(cpu_data, 0, sizeof(*cpu_data)); for (i = 0; i < INTERVALS; i++) - cpu_data->intervals[i] = UINT_MAX; + cpu_data->intervals[i] = U64_MAX; return 0; } diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index c8fa5f41dfc4cfdc5c0a33d091f748f747ad1b31..f7e83613ae94abf925003feff989e21b4f1127a8 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -49,9 +49,10 @@ void cpuidle_poll_state_init(struct cpuidle_driver *drv) snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); state->exit_latency = 0; state->target_residency = 0; + state->exit_latency_ns = 0; + state->target_residency_ns = 0; state->power_usage = -1; state->enter = poll_idle; - state->disabled = false; state->flags = CPUIDLE_FLAG_POLLING; } EXPORT_SYMBOL_GPL(cpuidle_poll_state_init); diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 306a6213df1c59d3eb6858dcf88bf4aff2d5338a..55107565b3193f6142eaf84442f0fd0cd1006fae 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -142,6 +142,7 @@ static struct attribute_group cpuidle_attr_group = { /** * cpuidle_add_interface - add CPU global sysfs attributes + * @dev: the target device */ int cpuidle_add_interface(struct device *dev) { @@ -153,6 +154,7 @@ int cpuidle_add_interface(struct device *dev) /** * cpuidle_remove_interface - remove CPU global sysfs attributes + * @dev: the target device */ void cpuidle_remove_interface(struct device *dev) { @@ -255,25 +257,6 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ return sprintf(buf, "%u\n", state->_name);\ } -#define define_store_state_ull_function(_name) \ -static ssize_t store_state_##_name(struct cpuidle_state *state, \ - struct cpuidle_state_usage *state_usage, \ - const char *buf, size_t size) \ -{ \ - unsigned long long value; \ - int err; \ - if (!capable(CAP_SYS_ADMIN)) \ - return -EPERM; \ - err = kstrtoull(buf, 0, &value); \ - if (err) \ - return err; \ - if (value) \ - state_usage->_name = 1; \ - else \ - state_usage->_name = 0; \ - return size; \ -} - #define define_show_state_ull_function(_name) \ static ssize_t show_state_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, \ @@ -292,26 +275,68 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ return sprintf(buf, "%s\n", state->_name);\ } -static ssize_t show_state_default_status(struct cpuidle_state *state, - struct cpuidle_state_usage *state_usage, - char *buf) -{ - return sprintf(buf, "%s\n", - state->flags & CPUIDLE_FLAG_OFF ? 
"disabled" : "enabled"); +#define define_show_state_time_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, \ + char *buf) \ +{ \ + return sprintf(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \ } -define_show_state_function(exit_latency) -define_show_state_function(target_residency) +define_show_state_time_function(exit_latency) +define_show_state_time_function(target_residency) define_show_state_function(power_usage) define_show_state_ull_function(usage) -define_show_state_ull_function(time) define_show_state_str_function(name) define_show_state_str_function(desc) -define_show_state_ull_function(disable) -define_store_state_ull_function(disable) define_show_state_ull_function(above) define_show_state_ull_function(below) +static ssize_t show_state_time(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%llu\n", ktime_to_us(state_usage->time_ns)); +} + +static ssize_t show_state_disable(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%llu\n", + state_usage->disable & CPUIDLE_STATE_DISABLED_BY_USER); +} + +static ssize_t store_state_disable(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + const char *buf, size_t size) +{ + unsigned int value; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = kstrtouint(buf, 0, &value); + if (err) + return err; + + if (value) + state_usage->disable |= CPUIDLE_STATE_DISABLED_BY_USER; + else + state_usage->disable &= ~CPUIDLE_STATE_DISABLED_BY_USER; + + return size; +} + +static ssize_t show_state_default_status(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%s\n", + state->flags & CPUIDLE_FLAG_OFF ? 
"disabled" : "enabled"); +} + define_one_state_ro(name, show_state_name); define_one_state_ro(desc, show_state_desc); define_one_state_ro(latency, show_state_exit_latency); @@ -602,7 +627,7 @@ static struct kobj_type ktype_driver_cpuidle = { /** * cpuidle_add_driver_sysfs - adds the driver name sysfs attribute - * @device: the target device + * @dev: the target device */ static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) { @@ -633,7 +658,7 @@ static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) /** * cpuidle_remove_driver_sysfs - removes the driver name sysfs attribute - * @device: the target device + * @dev: the target device */ static void cpuidle_remove_driver_sysfs(struct cpuidle_device *dev) { diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 1f6308cdf79a2b5f76019e7cf168e50dcf464130..0cf6e4d1e73f3874c37d8506dc8e770e1d1588c7 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -605,6 +605,7 @@ config CRYPTO_DEV_MXS_DCP source "drivers/crypto/qat/Kconfig" source "drivers/crypto/cavium/cpt/Kconfig" source "drivers/crypto/cavium/nitrox/Kconfig" +source "drivers/crypto/iax/Kconfig" config CRYPTO_DEV_CAVIUM_ZIP tristate "Cavium ZIP driver" diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile index afc4753b5d287207e8508cb2cfbfe3a88542d2c8..94a63a0f814fdccfb1450e81ab33619eed598e53 100644 --- a/drivers/crypto/Makefile +++ b/drivers/crypto/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_CRYPTO_DEV_ATMEL_I2C) += atmel-i2c.o obj-$(CONFIG_CRYPTO_DEV_ATMEL_ECC) += atmel-ecc.o obj-$(CONFIG_CRYPTO_DEV_ATMEL_SHA204A) += atmel-sha204a.o obj-$(CONFIG_CRYPTO_DEV_CAVIUM_ZIP) += cavium/ +obj-$(CONFIG_CRYPTO_DEV_IAX_CRYPTO) += iax/ obj-$(CONFIG_CRYPTO_DEV_CCP) += ccp/ obj-$(CONFIG_CRYPTO_DEV_CCREE) += ccree/ obj-$(CONFIG_CRYPTO_DEV_CHELSIO) += chelsio/ diff --git a/drivers/crypto/iax/Kconfig b/drivers/crypto/iax/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..20bce91a9dbf8aafcbd5e0e62d42f22bca960dbf --- /dev/null +++ b/drivers/crypto/iax/Kconfig @@ -0,0 +1,15 @@ +config CRYPTO_DEV_IAX_CRYPTO + tristate "IAX Crypto Driver" + depends on CRYPTO_DEFLATE + depends on INTEL_IDXD + default n + help + This driver supports Intel analytics accelerator hardware. + The module will be called iax_crypto. + +config CRYPTO_DEV_IAX_CRYPTO_STATS + bool "IAX Crypto Stats" + depends on CRYPTO_DEV_IAX_CRYPTO + default n + help + Enable IAX crypto stats diff --git a/drivers/crypto/iax/Makefile b/drivers/crypto/iax/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..532ac9002f9ce964e9edcb1d04ee5e0f3dcfe53e --- /dev/null +++ b/drivers/crypto/iax/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for IAX crypto device drivers +# + +ccflags-y += -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE=IDXD + +obj-$(CONFIG_CRYPTO_DEV_IAX_CRYPTO) := iax_crypto.o +iax_crypto-y := iax_crypto_main.o + +iax_crypto-$(CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS) += iax_crypto_stats.o diff --git a/drivers/crypto/iax/iax_crypto.h b/drivers/crypto/iax/iax_crypto.h new file mode 100644 index 0000000000000000000000000000000000000000..fb6ebd7b18accefd6a6271ef59ff1c8028667a0c --- /dev/null +++ b/drivers/crypto/iax/iax_crypto.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. 
*/ + +#ifndef __IAX_CRYPTO_H__ +#define __IAX_CRYPTO_H__ + +#include +#include +#include +#include "iax_crypto_stats.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define IAX_DECOMP_ENABLE BIT(0) +#define IAX_DECOMP_FLUSH_OUTPUT BIT(1) +#define IAX_DECOMP_CHECK_FOR_EOB BIT(2) +#define IAX_DECOMP_STOP_ON_EOB BIT(3) +#define IAX_DECOMP_SUPPRESS_OUTPUT BIT(9) + +#define IAX_COMP_FLUSH_OUTPUT BIT(1) +#define IAX_COMP_APPEND_EOB BIT(2) + +#define IAX_COMPLETION_TIMEOUT 1000000 + +#define IAX_ANALYTICS_ERROR 0x0a +#define IAX_ERROR_COMP_BUF_OVERFLOW 0x19 +#define IAX_ERROR_WATCHDOG_EXPIRED 0x24 + +#define DYNAMIC_HDR 0x2 +#define DYNAMIC_HDR_SIZE 3 + +#define IAX_COMP_FLAGS (IAX_COMP_FLUSH_OUTPUT | \ + IAX_COMP_APPEND_EOB) + +#define IAX_DECOMP_FLAGS (IAX_DECOMP_ENABLE | \ + IAX_DECOMP_FLUSH_OUTPUT | \ + IAX_DECOMP_CHECK_FOR_EOB | \ + IAX_DECOMP_STOP_ON_EOB) + +struct iax_wq { + struct list_head list; + struct idxd_wq *wq; + + struct iax_device *iax_device; + + u64 comp_calls; + u64 comp_bytes; + u64 decomp_calls; + u64 decomp_bytes; +}; + +/* Representation of IAX device with wqs, populated by probe */ +struct iax_device { + struct list_head list; + struct idxd_device *idxd; + + struct aecs_table_record *aecs_table; + dma_addr_t aecs_table_addr; + + void *aecs_table_unaligned; + dma_addr_t aecs_table_addr_unaligned; + + int n_wq; + struct list_head wqs; + + u64 comp_calls; + u64 comp_bytes; + u64 decomp_calls; + u64 decomp_bytes; +}; + +/* + * Analytics Engine Configuration and State (AECS) contains parameters and + * internal state of the analytics engine. + */ +struct aecs_table_record { + u32 crc; + u32 xor_checksum; + u32 reserved0[5]; + u32 num_output_accum_bits; + u8 output_accum[256]; + u32 ll_sym[286]; + u32 reserved1; + u32 reserved2; + u32 d_sym[30]; + u32 reserved_padding[2]; +}; + +#if defined(CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS) +void global_stats_show(struct seq_file *m); +void device_stats_show(struct seq_file *m, struct iax_device *iax_device); +void reset_iax_crypto_stats(void); +void reset_device_stats(struct iax_device *iax_device); + +#else +static inline void global_stats_show(struct seq_file *m) {} +static inline void device_stats_show(struct seq_file *m, struct iax_device *iax_device) {} +static inline void reset_iax_crypto_stats(void) {} +static inline void reset_device_stats(struct iax_device *iax_device) {} +#endif + +#endif diff --git a/drivers/crypto/iax/iax_crypto_main.c b/drivers/crypto/iax/iax_crypto_main.c new file mode 100644 index 0000000000000000000000000000000000000000..d2e2c3843c402908889596d18a998e179f6ab102 --- /dev/null +++ b/drivers/crypto/iax/iax_crypto_main.c @@ -0,0 +1,1263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "registers.h" +#include "idxd.h" +#include "iax_crypto.h" + +#define IAX_CRYPTO_VER "1.0" + +#define IAX_CRYPTO_WQ_NAME "iax_crypto" +#define IAX_ALG_PRIORITY 300 +#define IAX_AECS_ALIGN 32 + +/* IAX completion timeout value in tsc units */ +static unsigned int iax_completion_timeout = IAX_COMPLETION_TIMEOUT; + +module_param_named(iax_completion_timeout, iax_completion_timeout, uint, 0644); +MODULE_PARM_DESC(iax_completion_timeout, "IAX completion timeout (1000000 cycles default)"); + +/* Verify results of IAX compress or not */ +static bool iax_verify_compress = 1; + +module_param_named(iax_verify_compress, iax_verify_compress, bool, 0644); +MODULE_PARM_DESC(iax_verify_compress, + "Verify IAX compression (value = 1) or not (value = 0)"); + +static LIST_HEAD(iax_devices); +static DEFINE_SPINLOCK(iax_devices_lock); + +struct deflate_generic_ctx { + struct crypto_comp *deflate_generic_tfm; +}; + +static int iax_deflate_generic_init(struct crypto_tfm *tfm) +{ + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_comp *deflate_tfm; + + if (crypto_has_comp("deflate-generic", 0, 0)) + deflate_tfm = crypto_alloc_comp("deflate-generic", 0, 0); + + if (IS_ERR_OR_NULL(deflate_tfm)) { + pr_err("IAX could not alloc %s tfm: errcode = %ld\n", + "deflate-generic", PTR_ERR(deflate_tfm)); + return -ENOMEM; + } + ctx->deflate_generic_tfm = deflate_tfm; + return 0; +} + +static void iax_deflate_generic_exit(struct crypto_tfm *tfm) +{ + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); + + crypto_free_comp(ctx->deflate_generic_tfm); +} + +static int iax_wqs_get(struct iax_device *iax_device) +{ + struct iax_wq *iax_wq; + int n_wqs = 0; + int ret = 0; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) { + mutex_lock(&iax_wq->wq->wq_lock); + ret = idxd_wq_alloc_resources(iax_wq->wq); + if (ret < 0) { + pr_err("%s: WQ resource alloc failed for iax device %d, wq %d: ret=%d\n", __func__, iax_device->idxd->id, iax_wq->wq->id, ret); + mutex_unlock(&iax_wq->wq->wq_lock); + return ret; + } + idxd_wq_get(iax_wq->wq); + mutex_unlock(&iax_wq->wq->wq_lock); + n_wqs++; + } + + return n_wqs; +} + +static void iax_wqs_put(struct iax_device *iax_device) +{ + struct iax_wq *iax_wq; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) { + mutex_lock(&iax_wq->wq->wq_lock); + idxd_wq_free_resources(iax_wq->wq); + idxd_wq_put(iax_wq->wq); + mutex_unlock(&iax_wq->wq->wq_lock); + } +} + +static int iax_all_wqs_get(void) +{ + struct iax_device *iax_device; + int n_wqs = 0; + int ret; + + spin_lock(&iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) { + ret = iax_wqs_get(iax_device); + if (ret < 0) { + spin_unlock(&iax_devices_lock); + return ret; + } + n_wqs += ret; + } + spin_unlock(&iax_devices_lock); + + return n_wqs; +} + +static void iax_all_wqs_put(void) +{ + struct iax_device *iax_device; + + spin_lock(&iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) + iax_wqs_put(iax_device); + spin_unlock(&iax_devices_lock); +} + +static bool iax_crypto_enabled = false; +static int iax_crypto_enable(const char *val, const struct kernel_param *kp) +{ + int ret = 0; + + if (val[0] == '0') { + iax_crypto_enabled = false; + iax_all_wqs_put(); + } else if (val[0] == '1') { + ret = iax_all_wqs_get(); + if (ret == 0) { + pr_info("%s: no wqs available, not enabling iax_crypto\n", __func__); + return ret; + } else if (ret < 0) { + pr_err("%s: iax_crypto enable failed: 
ret=%d\n", __func__, ret); + return ret; + } else + iax_crypto_enabled = true; + } else { + pr_err("%s: iax_crypto failed, bad enable val: ret=%d\n", __func__, -EINVAL); + return -EINVAL; + } + + pr_info("%s: iax_crypto now %s\n", __func__, + iax_crypto_enabled ? "ENABLED" : "DISABLED"); + + return ret; +} +static const struct kernel_param_ops enable_ops = { + .set = iax_crypto_enable, + .get = param_get_bool, +}; +module_param_cb(iax_crypto_enable, &enable_ops, &iax_crypto_enabled, 0644); +MODULE_PARM_DESC(iax_crypto_enable, "Enable (value = 1) or disable (value = 0) iax_crypto"); + +int wq_stats_show(struct seq_file *m, void *v) +{ + struct iax_device *iax_device; + + spin_lock(&iax_devices_lock); + + global_stats_show(m); + + list_for_each_entry(iax_device, &iax_devices, list) + device_stats_show(m, iax_device); + + spin_unlock(&iax_devices_lock); + + return 0; +} + +int iax_crypto_stats_reset(void *data, u64 value) +{ + struct iax_device *iax_device; + + reset_iax_crypto_stats(); + + spin_lock(&iax_devices_lock); + + list_for_each_entry(iax_device, &iax_devices, list) + reset_device_stats(iax_device); + + spin_unlock(&iax_devices_lock); + + return 0; +} + +static struct iax_device *iax_device_alloc(void) +{ + struct iax_device *iax_device; + + iax_device = kzalloc(sizeof(*iax_device), GFP_KERNEL); + if (!iax_device) + return NULL; + + INIT_LIST_HEAD(&iax_device->wqs); + + return iax_device; +} + +static void iax_device_free(struct iax_device *iax_device) +{ + struct iax_wq *iax_wq, *next; + + list_for_each_entry_safe(iax_wq, next, &iax_device->wqs, list) { + list_del(&iax_wq->list); + kfree(iax_wq); // zzzz do this in original code too + } + + kfree(iax_device); +} + +static void free_iax_devices(void) +{ + struct iax_device *iax_device, *next; + + spin_lock(&iax_devices_lock); + list_for_each_entry_safe(iax_device, next, &iax_devices, list) { + list_del(&iax_device->list); + iax_device_free(iax_device); + } + spin_unlock(&iax_devices_lock); +} + +/* IAX number of iax instances found */ +static unsigned int nr_iax; +static unsigned int nr_cpus; +static unsigned int nr_nodes; + +/* Number of physical cpus sharing each iax instance */ +static unsigned int cpus_per_iax; + +/* Per-cpu lookup table for balanced wqs */ +static struct idxd_wq * __percpu *wq_table; + +/* + * Given a cpu, find the closest IAX instance. The idea is to try to + * choose the most appropriate IAX instance for a caller and spread + * available workqueues around to clients. 
+ */ +static inline int cpu_to_iax(int cpu) +{ + const struct cpumask *node_cpus; + int node, n_cpus = 0, test_cpu, iax; + int nr_iax_per_node; + + nr_iax_per_node = nr_iax / nr_nodes; + + for_each_online_node(node) { + node_cpus = cpumask_of_node(node); + if (!cpumask_test_cpu(cpu, node_cpus)) + continue; + + iax = node * nr_iax_per_node; + + for_each_cpu(test_cpu, node_cpus) { + if (test_cpu == cpu) + return iax; + + n_cpus++; + if ((n_cpus % cpus_per_iax) == 0) + iax++; + } + } + + return -1; +} + +static bool iax_has_wq(struct iax_device *iax_device, struct idxd_wq *wq) +{ + struct iax_wq *iax_wq; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) { + if (iax_wq->wq == wq) + return true; + } + + return false; +} + +const u32 fixed_ll_sym[286] = { + 0x40030, 0x40031, 0x40032, 0x40033, 0x40034, 0x40035, 0x40036, 0x40037, + 0x40038, 0x40039, 0x4003A, 0x4003B, 0x4003C, 0x4003D, 0x4003E, 0x4003F, + 0x40040, 0x40041, 0x40042, 0x40043, 0x40044, 0x40045, 0x40046, 0x40047, + 0x40048, 0x40049, 0x4004A, 0x4004B, 0x4004C, 0x4004D, 0x4004E, 0x4004F, + 0x40050, 0x40051, 0x40052, 0x40053, 0x40054, 0x40055, 0x40056, 0x40057, + 0x40058, 0x40059, 0x4005A, 0x4005B, 0x4005C, 0x4005D, 0x4005E, 0x4005F, + 0x40060, 0x40061, 0x40062, 0x40063, 0x40064, 0x40065, 0x40066, 0x40067, + 0x40068, 0x40069, 0x4006A, 0x4006B, 0x4006C, 0x4006D, 0x4006E, 0x4006F, + 0x40070, 0x40071, 0x40072, 0x40073, 0x40074, 0x40075, 0x40076, 0x40077, + 0x40078, 0x40079, 0x4007A, 0x4007B, 0x4007C, 0x4007D, 0x4007E, 0x4007F, + 0x40080, 0x40081, 0x40082, 0x40083, 0x40084, 0x40085, 0x40086, 0x40087, + 0x40088, 0x40089, 0x4008A, 0x4008B, 0x4008C, 0x4008D, 0x4008E, 0x4008F, + 0x40090, 0x40091, 0x40092, 0x40093, 0x40094, 0x40095, 0x40096, 0x40097, + 0x40098, 0x40099, 0x4009A, 0x4009B, 0x4009C, 0x4009D, 0x4009E, 0x4009F, + 0x400A0, 0x400A1, 0x400A2, 0x400A3, 0x400A4, 0x400A5, 0x400A6, 0x400A7, + 0x400A8, 0x400A9, 0x400AA, 0x400AB, 0x400AC, 0x400AD, 0x400AE, 0x400AF, + 0x400B0, 0x400B1, 0x400B2, 0x400B3, 0x400B4, 0x400B5, 0x400B6, 0x400B7, + 0x400B8, 0x400B9, 0x400BA, 0x400BB, 0x400BC, 0x400BD, 0x400BE, 0x400BF, + 0x48190, 0x48191, 0x48192, 0x48193, 0x48194, 0x48195, 0x48196, 0x48197, + 0x48198, 0x48199, 0x4819A, 0x4819B, 0x4819C, 0x4819D, 0x4819E, 0x4819F, + 0x481A0, 0x481A1, 0x481A2, 0x481A3, 0x481A4, 0x481A5, 0x481A6, 0x481A7, + 0x481A8, 0x481A9, 0x481AA, 0x481AB, 0x481AC, 0x481AD, 0x481AE, 0x481AF, + 0x481B0, 0x481B1, 0x481B2, 0x481B3, 0x481B4, 0x481B5, 0x481B6, 0x481B7, + 0x481B8, 0x481B9, 0x481BA, 0x481BB, 0x481BC, 0x481BD, 0x481BE, 0x481BF, + 0x481C0, 0x481C1, 0x481C2, 0x481C3, 0x481C4, 0x481C5, 0x481C6, 0x481C7, + 0x481C8, 0x481C9, 0x481CA, 0x481CB, 0x481CC, 0x481CD, 0x481CE, 0x481CF, + 0x481D0, 0x481D1, 0x481D2, 0x481D3, 0x481D4, 0x481D5, 0x481D6, 0x481D7, + 0x481D8, 0x481D9, 0x481DA, 0x481DB, 0x481DC, 0x481DD, 0x481DE, 0x481DF, + 0x481E0, 0x481E1, 0x481E2, 0x481E3, 0x481E4, 0x481E5, 0x481E6, 0x481E7, + 0x481E8, 0x481E9, 0x481EA, 0x481EB, 0x481EC, 0x481ED, 0x481EE, 0x481EF, + 0x481F0, 0x481F1, 0x481F2, 0x481F3, 0x481F4, 0x481F5, 0x481F6, 0x481F7, + 0x481F8, 0x481F9, 0x481FA, 0x481FB, 0x481FC, 0x481FD, 0x481FE, 0x481FF, + 0x38000, 0x38001, 0x38002, 0x38003, 0x38004, 0x38005, 0x38006, 0x38007, + 0x38008, 0x38009, 0x3800A, 0x3800B, 0x3800C, 0x3800D, 0x3800E, 0x3800F, + 0x38010, 0x38011, 0x38012, 0x38013, 0x38014, 0x38015, 0x38016, 0x38017, + 0x400C0, 0x400C1, 0x400C2, 0x400C3, 0x400C4, 0x400C5 +}; + +const u32 fixed_d_sym[30] = { + 0x28000, 0x28001, 0x28002, 0x28003, 0x28004, 0x28005, 0x28006, 0x28007, + 0x28008, 0x28009, 0x2800A, 
0x2800B, 0x2800C, 0x2800D, 0x2800E, 0x2800F, + 0x28010, 0x28011, 0x28012, 0x28013, 0x28014, 0x28015, 0x28016, 0x28017, + 0x28018, 0x28019, 0x2801A, 0x2801B, 0x2801C, 0x2801D +}; + +static int iax_aecs_alloc(struct iax_device *iax_device) +{ + size_t size = sizeof(struct aecs_table_record) + IAX_AECS_ALIGN; + struct device *dev = &iax_device->idxd->pdev->dev; + u32 bfinal = 1; + u32 offset; + + iax_device->aecs_table_unaligned = dma_alloc_coherent(dev, size, + &iax_device->aecs_table_addr_unaligned, GFP_KERNEL); + if (!iax_device->aecs_table_unaligned) { + iax_device_free(iax_device); + return -ENOMEM; + } + iax_device->aecs_table = PTR_ALIGN(iax_device->aecs_table_unaligned, IAX_AECS_ALIGN); + iax_device->aecs_table_addr = ALIGN(iax_device->aecs_table_addr_unaligned, IAX_AECS_ALIGN); + + /* Configure aecs table using fixed Huffman table */ + iax_device->aecs_table->crc = 0; + iax_device->aecs_table->xor_checksum = 0; + offset = iax_device->aecs_table->num_output_accum_bits / 8; + iax_device->aecs_table->output_accum[offset] = DYNAMIC_HDR | bfinal; + iax_device->aecs_table->num_output_accum_bits = DYNAMIC_HDR_SIZE; + + /* Add Huffman table to aecs */ + memcpy(iax_device->aecs_table->ll_sym, fixed_ll_sym, sizeof(fixed_ll_sym)); + memcpy(iax_device->aecs_table->d_sym, fixed_d_sym, sizeof(fixed_d_sym)); + + return 0; +} + +static void iax_aecs_free(struct iax_device *iax_device) +{ + size_t size = sizeof(struct aecs_table_record) + IAX_AECS_ALIGN; + struct device *dev = &iax_device->idxd->pdev->dev; + + dma_free_coherent(dev, size, + iax_device->aecs_table_unaligned, iax_device->aecs_table_addr_unaligned); +} + +static struct iax_device *add_iax_device(struct idxd_device *idxd) +{ + struct iax_device *iax_device; + + iax_device = iax_device_alloc(); + if (!iax_device) + return NULL; + + iax_device->idxd = idxd; + + if (iax_aecs_alloc(iax_device) < 0) + return NULL; + + list_add_tail(&iax_device->list, &iax_devices); + + nr_iax++; + + return iax_device; +} + +static void del_iax_device(struct iax_device *iax_device) +{ + iax_aecs_free(iax_device); + + list_del(&iax_device->list); + + iax_device_free(iax_device); + + nr_iax--; +} + +static int add_iax_wq(struct iax_device *iax_device, struct idxd_wq *wq) +{ + struct iax_wq *iax_wq; + + iax_wq = kzalloc(sizeof(*iax_wq), GFP_KERNEL); + if (!iax_wq) + return -ENOMEM; + + iax_wq->wq = wq; + iax_wq->iax_device = iax_device; + wq->private_data = iax_wq; + + list_add_tail(&iax_wq->list, &iax_device->wqs); + + iax_device->n_wq++; + + pr_debug("%s: added wq %p to iax %p, n_wq %d\n", __func__, wq, iax_device, iax_device->n_wq); + + return 0; +} + +static void del_iax_wq(struct iax_device *iax_device, struct idxd_wq *wq) +{ + struct iax_wq *iax_wq; + + list_for_each_entry(iax_wq, &iax_device->wqs, list) { + if (iax_wq->wq == wq) { + list_del(&iax_wq->list); + iax_device->n_wq--; + + pr_debug("%s: removed wq %p from iax_device %p, n_wq %d, nr_iax %d\n", __func__, wq, iax_device, iax_device->n_wq, nr_iax); + + if (iax_device->n_wq == 0) + del_iax_device(iax_device); + break; + } + } +} + +static int save_iax_wq(struct idxd_wq *wq) +{ + struct iax_device *iax_device, *found = NULL; + int ret = 0; + + spin_lock(&iax_devices_lock); + list_for_each_entry(iax_device, &iax_devices, list) { + if (iax_device->idxd == wq->idxd) { + /* + * Check to see that we don't already have this wq. + * Shouldn't happen but we don't control probing. 
+			 */
+			if (iax_has_wq(iax_device, wq)) {
+				pr_warn("%s: same wq probed multiple times for iax_device %p\n",
+					__func__, iax_device);
+				goto out;
+			}
+
+			found = iax_device;
+
+			ret = add_iax_wq(iax_device, wq);
+			if (ret)
+				goto out;
+
+			break;
+		}
+	}
+
+	if (!found) {
+		struct iax_device *new;
+
+		new = add_iax_device(wq->idxd);
+		if (!new) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = add_iax_wq(new, wq);
+		if (ret) {
+			del_iax_device(new);
+			goto out;
+		}
+	}
+
+	BUG_ON(nr_iax == 0);
+
+	cpus_per_iax = nr_cpus / nr_iax;
+out:
+	spin_unlock(&iax_devices_lock);
+
+	return ret;
+}
+
+static void clear_wq_table(void)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		*per_cpu_ptr(wq_table, cpu) = NULL;
+
+	pr_debug("%s: cleared wq table\n", __func__);
+}
+
+static void remove_iax_wq(struct idxd_wq *wq)
+{
+	struct iax_device *iax_device;
+
+	spin_lock(&iax_devices_lock);
+	list_for_each_entry(iax_device, &iax_devices, list) {
+		if (iax_has_wq(iax_device, wq)) {
+			del_iax_wq(iax_device, wq);
+			if (nr_iax == 0)
+				clear_wq_table();
+			break;
+		}
+	}
+	spin_unlock(&iax_devices_lock);
+
+	if (nr_iax)
+		cpus_per_iax = nr_cpus / nr_iax;
+	else
+		cpus_per_iax = 0;
+}
+
+static struct idxd_wq *request_iax_wq(int iax)
+{
+	struct iax_device *iax_device, *found_device = NULL;
+	struct idxd_wq *bkup_wq = NULL, *found_wq = NULL;
+	int cur_iax = 0, cur_wq = 0, cur_bkup = 0;
+	struct iax_wq *iax_wq;
+
+	spin_lock(&iax_devices_lock);
+	list_for_each_entry(iax_device, &iax_devices, list) {
+		if (cur_iax != iax) {
+			cur_iax++;
+			continue;
+		}
+
+		found_device = iax_device;
+		pr_debug("%s: getting wq from iax_device %p (%d)\n", __func__,
+			 found_device, cur_iax);
+		break;
+	}
+
+	if (!found_device) {
+		found_device = list_first_entry_or_null(&iax_devices,
+							struct iax_device, list);
+		if (!found_device) {
+			pr_warn("%s: couldn't find any iax devices with wqs!\n", __func__);
+			goto out;
+		}
+		cur_iax = 0;
+		pr_debug("%s: getting wq from only iax_device %p (%d)\n", __func__,
+			 found_device, cur_iax);
+	}
+
+	list_for_each_entry(iax_wq, &found_device->wqs, list) {
+		/* Prefer an unused wq, but fall back to a busy one if that's all there is */
+		if (idxd_wq_refcount(iax_wq->wq) > 0) {
+			bkup_wq = iax_wq->wq;
+			cur_bkup = cur_wq;
+		} else {
+			pr_debug("%s: returning unused wq %p (%d) from iax device %p (%d)\n", __func__, iax_wq->wq, cur_wq, found_device, cur_iax);
+			found_wq = iax_wq->wq;
+			goto out;
+		}
+		cur_wq++;
+	}
+
+	if (bkup_wq) {
+		pr_debug("%s: returning used wq %p (%d) from iax device %p (%d)\n", __func__, bkup_wq, cur_bkup, found_device, cur_iax);
+		found_wq = bkup_wq;
+		goto out;
+	}
+out:
+	spin_unlock(&iax_devices_lock);
+
+	return found_wq;
+}
+
+static inline int check_completion(struct iax_completion_record *comp,
+				   bool compress)
+{
+	char *op_str = compress ? "compress" : "decompress";
"compress" : "decompress"; + int ret = 0; + + while (!comp->status) + cpu_relax(); + + if (comp->status != IAX_COMP_SUCCESS) { + if (comp->status == IAX_ERROR_WATCHDOG_EXPIRED) { + ret = -ETIMEDOUT; + pr_warn("%s: %s timed out, size=0x%x\n", + __func__, op_str, comp->output_size); + update_completion_timeout_errs(); + goto out; + } + + if (comp->status == IAX_ANALYTICS_ERROR && + comp->error_code == IAX_ERROR_COMP_BUF_OVERFLOW && + compress == true) { + ret = -E2BIG; + pr_debug("%s: compressed size > uncompressed size, not compressing, size=0x%x\n", __func__, comp->output_size); + update_completion_comp_buf_overflow_errs(); + goto out; + } + + ret = -EINVAL; + pr_err("%s: iax %s status=0x%x, error=0x%x, size=0x%x\n", + __func__, op_str, ret, comp->error_code, comp->output_size); + print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET, 8, 1, comp, 64, 0); + update_completion_einval_errs(); + + goto out; + } +out: + return ret; +} + +static int iax_compress(struct crypto_tfm *tfm, + const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + dma_addr_t src_addr, dst_addr; + struct idxd_desc *idxd_desc; + struct iax_hw_desc *desc; + struct iax_wq *iax_wq; + u32 compression_crc; + struct idxd_wq *wq; + struct device *dev; + int ret = 0; + + wq = *per_cpu_ptr(wq_table, smp_processor_id()); + if (!wq) { + pr_err("%s: no wq configured for cpu=%d\n", __func__, smp_processor_id()); + return -ENODEV; + } + dev = &wq->idxd->pdev->dev; + + iax_wq = wq->private_data; + + pr_debug("%s: using wq for cpu=%d = wq %p\n", __func__, smp_processor_id(), wq); + + idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(idxd_desc)) { + pr_err("%s: idxd descriptor allocation failed\n", __func__); + pr_warn("%s: iax compress failed: ret=%ld\n", __func__, PTR_ERR(desc)); + + return PTR_ERR(idxd_desc); + } + desc = idxd_desc->iax_hw; + + desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | + IDXD_OP_FLAG_RD_SRC2_AECS | IDXD_OP_FLAG_CC; + desc->opcode = IAX_OPCODE_COMPRESS; + desc->compr_flags = IAX_COMP_FLAGS; +#ifdef SPR_E0 + desc->priv = 1; +#else + desc->priv = 0; +#endif + + src_addr = dma_map_single(dev, (void *)src, slen, DMA_TO_DEVICE); + pr_debug("%s: dma_map_single, src_addr %llx, dev %p, src %p, slen %d\n", __func__, src_addr, dev, src, slen); + if (unlikely(dma_mapping_error(dev, src_addr))) { + pr_debug("%s: dma_map_single err, exiting\n", __func__); + ret = -ENOMEM; + goto err_map_src; + } + + dst_addr = dma_map_single(dev, (void *)dst, *dlen, DMA_FROM_DEVICE); + pr_debug("%s: dma_map_single, dst_addr %llx, dev %p, dst %p, *dlen %d\n", __func__, dst_addr, dev, dst, *dlen); + if (unlikely(dma_mapping_error(dev, dst_addr))) { + pr_debug("%s: dma_map_single err, exiting\n", __func__); + ret = -ENOMEM; + goto err_map_dst; + } + + desc->src1_addr = (u64)src_addr; + desc->src1_size = slen; + desc->dst_addr = (u64)dst_addr; + desc->max_dst_size = *dlen; + desc->src2_addr = iax_wq->iax_device->aecs_table_addr; + desc->src2_size = sizeof(struct aecs_table_record); + desc->completion_addr = idxd_desc->compl_dma; + + ret = idxd_submit_desc(wq, idxd_desc); + if (ret) { + pr_warn("%s: submit_desc failed ret=%d\n", __func__, ret); + goto err; + } + + ret = check_completion(idxd_desc->iax_completion, true); + if (ret) { + pr_warn("%s: check_completion failed ret=%d\n", __func__, ret); + goto err; + } + + dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE); + dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE); + + *dlen = idxd_desc->iax_completion->output_size; + + idxd_free_desc(wq, idxd_desc); 
+	/* Update stats */
+	update_total_comp_calls();
+	update_total_comp_bytes_out(*dlen);
+	update_wq_comp_calls(wq);
+	update_wq_comp_bytes(wq, *dlen);
+
+	if (!iax_verify_compress)
+		goto out;
+
+	idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
+	if (IS_ERR(idxd_desc)) {
+		pr_err("%s: idxd descriptor allocation failed\n", __func__);
+		pr_warn("%s: iax compress (verify) failed: ret=%ld\n", __func__,
+			PTR_ERR(idxd_desc));
+
+		return PTR_ERR(idxd_desc);
+	}
+	desc = idxd_desc->iax_hw;
+
+	/* Verify (optional) - decompress and check crc, suppress dest write */
+
+	desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC;
+	desc->opcode = IAX_OPCODE_DECOMPRESS;
+	desc->max_dst_size = PAGE_SIZE;
+	desc->decompr_flags = IAX_DECOMP_FLAGS | IAX_DECOMP_SUPPRESS_OUTPUT;
+#ifdef SPR_E0
+	desc->priv = 1;
+#else
+	desc->priv = 0;
+#endif
+
+	src_addr = dma_map_single(dev, (void *)src, slen, DMA_TO_DEVICE);
+	pr_debug("%s: dma_map_single, src_addr %llx, dev %p, src %p, slen %d\n", __func__, src_addr, dev, src, slen);
+	if (unlikely(dma_mapping_error(dev, src_addr))) {
+		pr_debug("%s: dma_map_single err, exiting\n", __func__);
+		ret = -ENOMEM;
+		goto err_map_src;
+	}
+
+	dst_addr = dma_map_single(dev, (void *)dst, *dlen, DMA_FROM_DEVICE);
+	pr_debug("%s: dma_map_single, dst_addr %llx, dev %p, dst %p, *dlen %d\n", __func__, dst_addr, dev, dst, *dlen);
+	if (unlikely(dma_mapping_error(dev, dst_addr))) {
+		pr_debug("%s: dma_map_single err, exiting\n", __func__);
+		ret = -ENOMEM;
+		goto err_map_dst;
+	}
+
+	desc->src1_addr = (u64)dst_addr;
+	desc->src1_size = *dlen;
+	desc->dst_addr = (u64)src_addr;
+	desc->max_dst_size = slen;
+	desc->completion_addr = idxd_desc->compl_dma;
+
+	ret = idxd_submit_desc(wq, idxd_desc);
+	if (ret) {
+		pr_warn("%s: submit_desc (verify) failed ret=%d\n", __func__, ret);
+		goto err;
+	}
+
+	ret = check_completion(idxd_desc->iax_completion, false);
+	if (ret) {
+		pr_warn("%s: check_completion (verify) failed ret=%d\n", __func__, ret);
+		goto err;
+	}
+
+	if (compression_crc != idxd_desc->iax_completion->crc) {
+		ret = -EINVAL;
+		pr_err("%s: iax comp/decomp crc mismatch: comp=0x%x, decomp=0x%x\n", __func__,
+		       compression_crc, idxd_desc->iax_completion->crc);
+		print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET, 8, 1, idxd_desc->iax_completion, 64, 0);
+		goto err;
+	}
+
+	dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE);
+	dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE);
+
+	idxd_free_desc(wq, idxd_desc);
+out:
+	return ret;
+err:
+	dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE);
+err_map_dst:
+	dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE);
+err_map_src:
+	idxd_free_desc(wq, idxd_desc);
+	pr_warn("iax compress failed: ret=%d\n", ret);
+
+	goto out;
+}
+
+static int iax_decompress(struct crypto_tfm *tfm,
+			  const u8 *src, unsigned int slen,
+			  u8 *dst, unsigned int *dlen)
+{
+	dma_addr_t src_addr, dst_addr;
+	struct idxd_desc *idxd_desc;
+	struct iax_hw_desc *desc;
+	struct idxd_wq *wq;
+	struct device *dev;
+	int ret = 0;
+
+	wq = *per_cpu_ptr(wq_table, smp_processor_id());
+	if (!wq) {
+		pr_err("%s: no wq configured for cpu=%d\n", __func__, smp_processor_id());
+		return -ENODEV;
+	}
+	dev = &wq->idxd->pdev->dev;
+
+	pr_debug("%s: using wq for cpu=%d = wq %p\n", __func__, smp_processor_id(), wq);
+
+	idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
+	if (IS_ERR(idxd_desc)) {
+		pr_err("%s: idxd descriptor allocation failed\n", __func__);
+		pr_warn("%s: iax decompress failed: ret=%ld\n",
+			__func__, PTR_ERR(idxd_desc));
+
+		return PTR_ERR(idxd_desc);
+	}
+	desc = idxd_desc->iax_hw;
+
+	desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC;
+	desc->opcode = IAX_OPCODE_DECOMPRESS;
+	desc->max_dst_size = PAGE_SIZE;
+	desc->decompr_flags = IAX_DECOMP_FLAGS;
+#ifdef SPR_E0
+	desc->priv = 1;
+#else
+	desc->priv = 0;
+#endif
+
+	src_addr = dma_map_single(dev, (void *)src, slen, DMA_TO_DEVICE);
+	pr_debug("%s: dma_map_single, src_addr %llx, dev %p, src %p, slen %d\n", __func__, src_addr, dev, src, slen);
+	if (unlikely(dma_mapping_error(dev, src_addr))) {
+		pr_debug("%s: dma_map_single err, exiting\n", __func__);
+		ret = -ENOMEM;
+		goto err_map_src;
+	}
+
+	dst_addr = dma_map_single(dev, (void *)dst, *dlen, DMA_FROM_DEVICE);
+	pr_debug("%s: dma_map_single, dst_addr %llx, dev %p, dst %p, *dlen %d\n", __func__, dst_addr, dev, dst, *dlen);
+	if (unlikely(dma_mapping_error(dev, dst_addr))) {
+		pr_debug("%s: dma_map_single err, exiting\n", __func__);
+		ret = -ENOMEM;
+		goto err_map_dst;
+	}
+
+	desc->src1_addr = (u64)src_addr;
+	desc->dst_addr = (u64)dst_addr;
+	desc->max_dst_size = *dlen;
+	desc->src1_size = slen;
+	desc->completion_addr = idxd_desc->compl_dma;
+
+	ret = idxd_submit_desc(wq, idxd_desc);
+	if (ret) {
+		pr_warn("%s: submit_desc failed ret=%d\n", __func__, ret);
+		goto err;
+	}
+
+	ret = check_completion(idxd_desc->iax_completion, false);
+	if (ret) {
+		pr_warn("%s: check_completion failed ret=%d\n", __func__, ret);
+		goto err;
+	}
+
+	dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE);
+	dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE);
+
+	*dlen = idxd_desc->iax_completion->output_size;
+
+	idxd_free_desc(wq, idxd_desc);
+
+	/* Update stats */
+	update_total_decomp_calls();
+	update_total_decomp_bytes_in(slen);
+	update_wq_decomp_calls(wq);
+	update_wq_decomp_bytes(wq, slen);
+out:
+	return ret;
+err:
+	dma_unmap_single(dev, dst_addr, *dlen, DMA_FROM_DEVICE);
+err_map_dst:
+	dma_unmap_single(dev, src_addr, slen, DMA_TO_DEVICE);
+err_map_src:
+	idxd_free_desc(wq, idxd_desc);
+	pr_warn("iax decompress failed: ret=%d\n", ret);
+
+	goto out;
+}
+
+static int iax_comp_compress(struct crypto_tfm *tfm,
+			     const u8 *src, unsigned int slen,
+			     u8 *dst, unsigned int *dlen)
+{
+	struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm);
+	u64 start_time_ns;
+	int ret = 0;
+
+	if (!iax_crypto_enabled) {
+		pr_debug("%s: iax_crypto disabled, using deflate-generic compression\n", __func__);
+		ret = crypto_comp_compress(ctx->deflate_generic_tfm,
+					   src, slen, dst, dlen);
+		return ret;
+	}
+
+	pr_debug("%s: src %p, slen %d, dst %p, dlen %u\n",
+		 __func__, src, slen, dst, *dlen);
+
+	start_time_ns = ktime_get_ns();
+	ret = iax_compress(tfm, src, slen, dst, dlen);
+	update_max_comp_delay_ns(start_time_ns);
+	if (ret != 0)
+		pr_warn("synchronous compress failed ret=%d\n", ret);
+
+	return ret;
+}
+
+static int iax_comp_decompress(struct crypto_tfm *tfm,
+			       const u8 *src, unsigned int slen,
+			       u8 *dst, unsigned int *dlen)
+{
+	struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm);
+	u64 start_time_ns;
+	int ret = 0;
+
+	if (!iax_crypto_enabled) {
+		pr_debug("%s: iax_crypto disabled, using deflate-generic decompression\n", __func__);
+		ret = crypto_comp_decompress(ctx->deflate_generic_tfm,
+					     src, slen, dst, dlen);
+		return ret;
+	}
+
+	pr_debug("%s: src %p, slen %d, dst %p, dlen %u\n",
+		 __func__, src, slen, dst, *dlen);
+
+	start_time_ns = ktime_get_ns();
+	ret = iax_decompress(tfm, src, slen, dst, dlen);
+	update_max_decomp_delay_ns(start_time_ns);
+	if (ret != 0)
pr_warn("synchronous decompress failed ret=%d\n", ret); + + return ret; +} + +static struct crypto_alg iax_comp_deflate = { + .cra_name = "deflate", + .cra_driver_name = "iax_crypto", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_priority = IAX_ALG_PRIORITY, + .cra_module = THIS_MODULE, + .cra_ctxsize = sizeof(struct deflate_generic_ctx), + .cra_init = iax_deflate_generic_init, + .cra_exit = iax_deflate_generic_exit, + .cra_u = { + .compress = { + .coa_compress = iax_comp_compress, + .coa_decompress = iax_comp_decompress + } + } +}; + +static int iax_comp_acompress(struct acomp_req *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); + u64 start_time_ns; + void *src, *dst; + int ret = 0; + + src = kmap_atomic(sg_page(req->src)) + req->src->offset; + dst = kmap_atomic(sg_page(req->dst)) + req->dst->offset; + + if (!iax_crypto_enabled) { + pr_debug("%s: iax_crypto disabled, using deflate-generic compression\n", __func__); + ret = crypto_comp_compress(ctx->deflate_generic_tfm, + src, req->slen, dst, &req->dlen); + kunmap_atomic(src); + kunmap_atomic(dst); + + return ret; + } + + pr_debug("%s: src %p (offset %d), slen %d, dst %p (offset %d), dlen %u\n", + __func__, src, req->src->offset, req->slen, + dst, req->dst->offset, req->dlen); + + start_time_ns = ktime_get_ns(); + ret = iax_compress(tfm, (const u8 *)src, req->slen, (u8 *)dst, &req->dlen); + update_max_acomp_delay_ns(start_time_ns); + + kunmap_atomic(src); + kunmap_atomic(dst); + + if (ret != 0) + pr_warn("asynchronous compress failed ret=%d\n", ret); + + return ret; +} + +static int iax_comp_adecompress(struct acomp_req *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct deflate_generic_ctx *ctx = crypto_tfm_ctx(tfm); + u64 start_time_ns; + void *src, *dst; + int ret; + + src = kmap_atomic(sg_page(req->src)) + req->src->offset; + dst = kmap_atomic(sg_page(req->dst)) + req->dst->offset; + + if (!iax_crypto_enabled) { + pr_debug("%s: iax_crypto disabled, using deflate-generic decompression\n", __func__); + ret = crypto_comp_decompress(ctx->deflate_generic_tfm, + src, req->slen, dst, &req->dlen); + kunmap_atomic(src); + kunmap_atomic(dst); + return ret; + } + + pr_debug("%s: src %p (offset %d), slen %d, dst %p (offset %d), dlen %u\n", + __func__, src, req->src->offset, req->slen, + dst, req->dst->offset, req->dlen); + + start_time_ns = ktime_get_ns(); + ret = iax_decompress(tfm, (const u8 *)src, req->slen, (u8 *)dst, &req->dlen); + update_max_decomp_delay_ns(start_time_ns); + + kunmap_atomic(src); + kunmap_atomic(dst); + + if (ret != 0) + pr_warn("asynchronous decompress failed ret=%d\n", ret); + + return ret; +} + +static struct acomp_alg iax_acomp_deflate = { + .compress = iax_comp_acompress, + .decompress = iax_comp_adecompress, + .base = { + .cra_name = "deflate", + .cra_driver_name = "iax_crypto", + .cra_module = THIS_MODULE, + .cra_ctxsize = sizeof(struct deflate_generic_ctx), + .cra_init = iax_deflate_generic_init, + .cra_exit = iax_deflate_generic_exit, + .cra_priority = IAX_ALG_PRIORITY, + } +}; + +static int iax_register_compression_device(void) +{ + int ret; + + ret = crypto_register_alg(&iax_comp_deflate); + if (ret < 0) { + pr_err("deflate algorithm registration failed\n"); + return ret; + } + + ret = crypto_register_acomp(&iax_acomp_deflate); + if (ret) { + pr_err("deflate algorithm acomp registration failed (%d)\n", ret); + goto err_unregister_alg_deflate; + } + + return ret; + +err_unregister_alg_deflate: + crypto_unregister_alg(&iax_comp_deflate); + + return 
+}
+
+static void iax_unregister_compression_device(void)
+{
+	crypto_unregister_acomp(&iax_acomp_deflate);
+	crypto_unregister_alg(&iax_comp_deflate);
+}
+
+static void rebalance_wq_table(void)
+{
+	int cpu, iax;
+	struct idxd_wq *wq;
+
+	if (nr_iax == 0)
+		return;
+
+	pr_debug("%s: nr_nodes=%d, nr_cpus %d, nr_iax %d, cpus_per_iax %d\n",
+		 __func__, nr_nodes, nr_cpus, nr_iax, cpus_per_iax);
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		iax = cpu_to_iax(cpu);
+		pr_debug("%s: iax=%d\n", __func__, iax);
+
+		BUG_ON(iax == -1);
+
+		wq = request_iax_wq(iax);
+		if (!wq) {
+			pr_err("could not get wq for iax %d!\n", iax);
+			return;
+		}
+
+		*per_cpu_ptr(wq_table, cpu) = wq;
+		pr_debug("%s: assigned wq for cpu=%d = wq %p\n", __func__, cpu, wq);
+	}
+}
+
+static int iax_crypto_probe(struct idxd_dev *idxd_dev)
+{
+	struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev);
+	struct idxd_device *idxd = wq->idxd;
+	struct idxd_driver_data *data = idxd->data;
+	struct device *dev = &idxd_dev->conf_dev;
+	int ret = 0;
+
+	if (idxd->state != IDXD_DEV_ENABLED)
+		return -ENXIO;
+
+	if (data->type != IDXD_TYPE_IAX)
+		return -ENODEV;
+
+	mutex_lock(&wq->wq_lock);
+
+	if (!idxd_wq_driver_name_match(wq, dev)) {
+		pr_warn("%s: wq driver_name match failed: wq driver_name %s, dev driver name %s\n",
+			__func__, wq->driver_name, dev->driver->name);
+		idxd->cmd_status = IDXD_SCMD_WQ_NO_DRV_NAME;
+		ret = -ENODEV;
+		goto err;
+	}
+
+	wq->type = IDXD_WQT_KERNEL;
+
+	ret = __drv_enable_wq(wq);
+	if (ret < 0) {
+		pr_warn("%s: enable wq %d failed: %d\n", __func__, wq->id, ret);
+		ret = -ENXIO;
+		goto err;
+	}
+
+	ret = idxd_wq_init_percpu_ref(wq);
+	if (ret < 0) {
+		idxd->cmd_status = IDXD_SCMD_PERCPU_ERR;
+		pr_warn("%s: WQ percpu_ref setup failed: ret=%d\n", __func__, ret);
+		goto err_ref;
+	}
+
+	ret = save_iax_wq(wq);
+	if (ret)
+		goto err_save;
+
+	rebalance_wq_table();
+out:
+	mutex_unlock(&wq->wq_lock);
+
+	return ret;
+
+err_save:
+	__idxd_wq_quiesce(wq);
+	percpu_ref_exit(&wq->wq_active);
+err_ref:
+	idxd_wq_free_resources(wq);
+	__drv_disable_wq(wq);
+err:
+	wq->type = IDXD_WQT_NONE;
+
+	goto out;
+}
+
+static void iax_crypto_remove(struct idxd_dev *idxd_dev)
+{
+	struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev);
+
+	mutex_lock(&wq->wq_lock);
+
+	__idxd_wq_quiesce(wq);
+	remove_iax_wq(wq);
+	__drv_disable_wq(wq);
+	idxd_wq_free_resources(wq);
+	wq->type = IDXD_WQT_NONE;
+	percpu_ref_exit(&wq->wq_active);
+	rebalance_wq_table();
+
+	mutex_unlock(&wq->wq_lock);
+}
+
+static enum idxd_dev_type dev_types[] = {
+	IDXD_DEV_WQ,
+	IDXD_DEV_NONE,
+};
+
+static struct idxd_device_driver iax_crypto_driver = {
+	.probe = iax_crypto_probe,
+	.remove = iax_crypto_remove,
+	.name = "crypto",
+	.type = dev_types,
+};
+
+static int __init iax_crypto_init_module(void)
+{
+	int ret = 0;
+
+	nr_cpus = num_online_cpus();
+	nr_nodes = num_online_nodes();
+
+	wq_table = alloc_percpu(struct idxd_wq *);
+	if (!wq_table)
+		return -ENOMEM;
+
+	ret = __idxd_driver_register(&iax_crypto_driver, THIS_MODULE,
+				     KBUILD_MODNAME);
+	if (ret) {
+		pr_err("IAX wq sub-driver registration failed\n");
+		goto err_driver_register;
+	}
+
+	ret = iax_register_compression_device();
+	if (ret < 0) {
+		pr_err("IAX compression device registration failed\n");
+		goto err_crypto_register;
+	}
+
+	if (iax_crypto_debugfs_init())
+		pr_warn("debugfs init failed, stats not available\n");
+
+	pr_info("%s: initialized\n", __func__);
+out:
+	return ret;
+
+err_crypto_register:
+	idxd_driver_unregister(&iax_crypto_driver);
+err_driver_register:
+	free_percpu(wq_table);
+
+	goto out;
+}
+
+static void __exit iax_crypto_cleanup_module(void)
+{
+	iax_crypto_debugfs_cleanup();
+	idxd_driver_unregister(&iax_crypto_driver);
+	iax_unregister_compression_device();
+	free_percpu(wq_table);
+	free_iax_devices();
+	pr_info("%s: cleaned up\n", __func__);
+}
+
+MODULE_IMPORT_NS(IDXD);
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_IDXD_DEVICE(0);
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("IAX Crypto Driver");
+
+module_init(iax_crypto_init_module);
+module_exit(iax_crypto_cleanup_module);
diff --git a/drivers/crypto/iax/iax_crypto_stats.c b/drivers/crypto/iax/iax_crypto_stats.c
new file mode 100644
index 0000000000000000000000000000000000000000..525a20d12cc52f816be9dd5bd9b10f524b10ba04
--- /dev/null
+++ b/drivers/crypto/iax/iax_crypto_stats.c
@@ -0,0 +1,269 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/sbitmap.h>
+#include <linux/dmaengine.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/ktime.h>
+#include "../../dma/idxd/idxd.h"
+#include "iax_crypto.h"
+#include "iax_crypto_stats.h"
+
+static u64 total_comp_calls;
+static u64 total_decomp_calls;
+static u64 max_comp_delay_ns;
+static u64 max_decomp_delay_ns;
+static u64 max_acomp_delay_ns;
+static u64 max_adecomp_delay_ns;
+static u64 total_comp_bytes_out;
+static u64 total_decomp_bytes_in;
+static u64 total_completion_einval_errors;
+static u64 total_completion_timeout_errors;
+static u64 total_completion_comp_buf_overflow_errors;
+
+static struct dentry *iax_crypto_debugfs_root;
+
+void update_total_comp_calls(void)
+{
+	total_comp_calls++;
+}
+
+void update_total_comp_bytes_out(int n)
+{
+	total_comp_bytes_out += n;
+}
+
+void update_total_decomp_calls(void)
+{
+	total_decomp_calls++;
+}
+
+void update_total_decomp_bytes_in(int n)
+{
+	total_decomp_bytes_in += n;
+}
+
+void update_completion_einval_errs(void)
+{
+	total_completion_einval_errors++;
+}
+
+void update_completion_timeout_errs(void)
+{
+	total_completion_timeout_errors++;
+}
+
+void update_completion_comp_buf_overflow_errs(void)
+{
+	total_completion_comp_buf_overflow_errors++;
+}
+
+void update_max_comp_delay_ns(u64 start_time_ns)
+{
+	u64 time_diff;
+
+	time_diff = ktime_get_ns() - start_time_ns;
+
+	if (time_diff > max_comp_delay_ns)
+		max_comp_delay_ns = time_diff;
+}
+
+void update_max_decomp_delay_ns(u64 start_time_ns)
+{
+	u64 time_diff;
+
+	time_diff = ktime_get_ns() - start_time_ns;
+
+	if (time_diff > max_decomp_delay_ns)
+		max_decomp_delay_ns = time_diff;
+}
+
+void update_max_acomp_delay_ns(u64 start_time_ns)
+{
+	u64 time_diff;
+
+	time_diff = ktime_get_ns() - start_time_ns;
+
+	if (time_diff > max_acomp_delay_ns)
+		max_acomp_delay_ns = time_diff;
+}
+
+void update_max_adecomp_delay_ns(u64 start_time_ns)
+{
+	u64 time_diff;
+
+	time_diff = ktime_get_ns() - start_time_ns;
+
+	if (time_diff > max_adecomp_delay_ns)
+		max_adecomp_delay_ns = time_diff;
+}
+
+void update_wq_comp_calls(struct idxd_wq *idxd_wq)
+{
+	struct iax_wq *wq = idxd_wq->private_data;
+
+	wq->comp_calls++;
+	wq->iax_device->comp_calls++;
+}
+
+void update_wq_comp_bytes(struct idxd_wq *idxd_wq, int n)
+{
+	struct iax_wq *wq = idxd_wq->private_data;
+
+	wq->comp_bytes += n;
+	wq->iax_device->comp_bytes += n;
+}
+
+void update_wq_decomp_calls(struct idxd_wq *idxd_wq)
+{
+	struct iax_wq *wq = idxd_wq->private_data;
+
+	wq->decomp_calls++;
+	wq->iax_device->decomp_calls++;
+}
+
+void update_wq_decomp_bytes(struct idxd_wq *idxd_wq, int n)
+{
+	struct iax_wq *wq = idxd_wq->private_data;
+
+	wq->decomp_bytes += n;
+	wq->iax_device->decomp_bytes += n;
+}
+
+void reset_iax_crypto_stats(void)
+{
+	total_comp_calls = 0;
+	total_decomp_calls = 0;
+	max_comp_delay_ns = 0;
+	max_decomp_delay_ns = 0;
+	max_acomp_delay_ns = 0;
+	max_adecomp_delay_ns = 0;
+	total_comp_bytes_out = 0;
+	total_decomp_bytes_in = 0;
+	total_completion_einval_errors = 0;
+	total_completion_timeout_errors = 0;
+	total_completion_comp_buf_overflow_errors = 0;
+}
+
+static void reset_wq_stats(struct iax_wq *wq)
+{
+	wq->comp_calls = 0;
+	wq->comp_bytes = 0;
+	wq->decomp_calls = 0;
+	wq->decomp_bytes = 0;
+}
+
+void reset_device_stats(struct iax_device *iax_device)
+{
+	struct iax_wq *iax_wq;
+
+	iax_device->comp_calls = 0;
+	iax_device->comp_bytes = 0;
+	iax_device->decomp_calls = 0;
+	iax_device->decomp_bytes = 0;
+
+	list_for_each_entry(iax_wq, &iax_device->wqs, list)
+		reset_wq_stats(iax_wq);
+}
+
+static void wq_show(struct seq_file *m, struct iax_wq *iax_wq)
+{
+	seq_printf(m, "    name: %s\n", iax_wq->wq->name);
+	seq_printf(m, "    comp_calls: %llu\n", iax_wq->comp_calls);
+	seq_printf(m, "    comp_bytes: %llu\n", iax_wq->comp_bytes);
+	seq_printf(m, "    decomp_calls: %llu\n", iax_wq->decomp_calls);
+	seq_printf(m, "    decomp_bytes: %llu\n\n", iax_wq->decomp_bytes);
+}
+
+void device_stats_show(struct seq_file *m, struct iax_device *iax_device)
+{
+	struct iax_wq *iax_wq;
+
+	seq_puts(m, "iax device:\n");
+	seq_printf(m, "  id: %d\n", iax_device->idxd->id);
+	seq_printf(m, "  n_wqs: %d\n", iax_device->n_wq);
+	seq_printf(m, "  comp_calls: %llu\n", iax_device->comp_calls);
+	seq_printf(m, "  comp_bytes: %llu\n", iax_device->comp_bytes);
+	seq_printf(m, "  decomp_calls: %llu\n", iax_device->decomp_calls);
+	seq_printf(m, "  decomp_bytes: %llu\n", iax_device->decomp_bytes);
+	seq_puts(m, "  wqs:\n");
+
+	list_for_each_entry(iax_wq, &iax_device->wqs, list)
+		wq_show(m, iax_wq);
+}
+
+void global_stats_show(struct seq_file *m)
+{
+	seq_puts(m, "global stats:\n");
+	seq_printf(m, "  total_comp_calls: %llu\n", total_comp_calls);
+	seq_printf(m, "  total_decomp_calls: %llu\n", total_decomp_calls);
+	seq_printf(m, "  total_comp_bytes_out: %llu\n", total_comp_bytes_out);
+	seq_printf(m, "  total_decomp_bytes_in: %llu\n", total_decomp_bytes_in);
+	seq_printf(m, "  total_completion_einval_errors: %llu\n",
+		   total_completion_einval_errors);
+	seq_printf(m, "  total_completion_timeout_errors: %llu\n",
+		   total_completion_timeout_errors);
+	seq_printf(m, "  total_completion_comp_buf_overflow_errors: %llu\n\n",
+		   total_completion_comp_buf_overflow_errors);
+}
+
+static int wq_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, wq_stats_show, file);
+}
+
+const struct file_operations wq_stats_fops = {
+	.open = wq_stats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+DEFINE_DEBUGFS_ATTRIBUTE(wq_stats_reset_fops, NULL, iax_crypto_stats_reset, "%llu\n");
+
+int __init iax_crypto_debugfs_init(void)
+{
+	if (!debugfs_initialized())
+		return -ENODEV;
+
+	iax_crypto_debugfs_root = debugfs_create_dir("iax_crypto", NULL);
+	if (!iax_crypto_debugfs_root)
+		return -ENOMEM;
+
+	debugfs_create_u64("max_comp_delay_ns", 0644,
+			   iax_crypto_debugfs_root, &max_comp_delay_ns);
+	debugfs_create_u64("max_decomp_delay_ns", 0644,
+			   iax_crypto_debugfs_root, &max_decomp_delay_ns);
+	debugfs_create_u64("max_acomp_delay_ns", 0644,
+			   iax_crypto_debugfs_root, &max_acomp_delay_ns);
+	debugfs_create_u64("max_adecomp_delay_ns", 0644,
+			   iax_crypto_debugfs_root, &max_adecomp_delay_ns);
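+
+	/* Cumulative counters below; all stats are cleared via wq_stats_reset */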
+ debugfs_create_u64("total_comp_calls", 0644, + iax_crypto_debugfs_root, &total_comp_calls); + debugfs_create_u64("total_decomp_calls", 0644, + iax_crypto_debugfs_root, &total_decomp_calls); + debugfs_create_u64("total_comp_bytes_out", 0644, + iax_crypto_debugfs_root, &total_comp_bytes_out); + debugfs_create_u64("total_decomp_bytes_in", 0644, + iax_crypto_debugfs_root, &total_decomp_bytes_in); + debugfs_create_file("wq_stats", 0644, iax_crypto_debugfs_root, NULL, + &wq_stats_fops); + debugfs_create_file("wq_stats_reset", 0644, iax_crypto_debugfs_root, NULL, + &wq_stats_reset_fops); + + return 0; +} + +void __exit iax_crypto_debugfs_cleanup(void) +{ + debugfs_remove_recursive(iax_crypto_debugfs_root); +} + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/crypto/iax/iax_crypto_stats.h b/drivers/crypto/iax/iax_crypto_stats.h new file mode 100644 index 0000000000000000000000000000000000000000..524f444e58ffbcddca0029403db8a68c162f4413 --- /dev/null +++ b/drivers/crypto/iax/iax_crypto_stats.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */ + +#ifndef __CRYPTO_DEV_IAX_CRYPTO_STATS_H__ +#define __CRYPTO_DEV_IAX_CRYPTO_STATS_H__ + +#if defined(CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS) +int iax_crypto_debugfs_init(void); +void iax_crypto_debugfs_cleanup(void); + +void update_total_comp_calls(void); +void update_total_comp_bytes_out(int n); +void update_total_decomp_calls(void); +void update_total_decomp_bytes_in(int n); +void update_max_comp_delay_ns(u64 start_time_ns); +void update_max_decomp_delay_ns(u64 start_time_ns); +void update_max_acomp_delay_ns(u64 start_time_ns); +void update_max_adecomp_delay_ns(u64 start_time_ns); +void update_completion_einval_errs(void); +void update_completion_timeout_errs(void); +void update_completion_comp_buf_overflow_errs(void); + +void update_wq_comp_calls(struct idxd_wq *idxd_wq); +void update_wq_comp_bytes(struct idxd_wq *idxd_wq, int n); +void update_wq_decomp_calls(struct idxd_wq *idxd_wq); +void update_wq_decomp_bytes(struct idxd_wq *idxd_wq, int n); + +int wq_stats_show(struct seq_file *m, void *v); +int iax_crypto_stats_reset(void *data, u64 value); + +#else +static inline int iax_crypto_debugfs_init(void) { return 0; } +static inline void iax_crypto_debugfs_cleanup(void) {} + +static inline void update_total_comp_calls(void) {} +static inline void update_total_comp_bytes_out(int n) {} +static inline void update_total_decomp_calls(void) {} +static inline void update_total_decomp_bytes_in(int n) {} +static inline void update_max_comp_delay_ns(u64 start_time_ns) {} +static inline void update_max_decomp_delay_ns(u64 start_time_ns) {} +static inline void update_max_acomp_delay_ns(u64 start_time_ns) {} +static inline void update_max_adecomp_delay_ns(u64 start_time_ns) {} +static inline void update_completion_einval_errs(void) {} +static inline void update_completion_timeout_errs(void) {} +static inline void update_completion_comp_buf_overflow_errs(void) {} + +static inline void update_wq_comp_calls(struct idxd_wq *idxd_wq) {} +static inline void update_wq_comp_bytes(struct idxd_wq *idxd_wq, int n) {} +static inline void update_wq_decomp_calls(struct idxd_wq *idxd_wq) {} +static inline void update_wq_decomp_bytes(struct idxd_wq *idxd_wq, int n) {} + +#endif // CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS + +#endif diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index f33c73e4af41621ba1ec863cb80f733ba9229b8d..3b6c06f073268643d9ba65675faee88a8c593369 100644 --- a/drivers/dax/Kconfig +++ 
b/drivers/dax/Kconfig
@@ -32,19 +32,36 @@ config DEV_DAX_PMEM
 	  Say M if unsure
 
+config DEV_DAX_HMEM
+	tristate "HMEM DAX: direct access to 'specific purpose' memory"
+	depends on EFI_SOFT_RESERVE
+	default DEV_DAX
+	help
+	  EFI 2.8 platforms, and others, may advertise 'specific purpose'
+	  memory. For example, a high bandwidth memory pool. The
+	  indication from platform firmware is meant to reserve the
+	  memory from typical usage by default. This driver creates
+	  device-dax instances for these memory ranges, and that also
+	  enables the possibility to assign them to the DEV_DAX_KMEM
+	  driver to override the reservation and add them to the kernel
+	  "System RAM" pool.
+
+	  Say M if unsure.
+
 config DEV_DAX_KMEM
 	tristate "KMEM DAX: volatile-use of persistent memory"
 	default DEV_DAX
 	depends on DEV_DAX
 	depends on MEMORY_HOTPLUG # for add_memory() and friends
 	help
-	  Support access to persistent memory as if it were RAM. This
-	  allows easier use of persistent memory by unmodified
-	  applications.
+	  Support access to persistent, or other performance
+	  differentiated memory as if it were System RAM. This allows
+	  easier use of persistent memory by unmodified applications, or
+	  adds core kernel memory services to heterogeneous memory types
+	  (HMEM) marked "reserved" by platform firmware.
 
 	  To use this feature, a DAX device must be unbound from the
-	  device_dax driver (PMEM DAX) and bound to this kmem driver
-	  on each boot.
+	  device_dax driver and bound to this kmem driver on each boot.
 
 	  Say N if unsure.
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 81f7d54dadfb34ed470ee90627556172c9fc3f7a..80065b38b3c43f8a7fcfdbf1a072dd1853fa6cac 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -2,9 +2,11 @@
 obj-$(CONFIG_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
+obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o
 
 dax-y := super.o
 dax-y += bus.o
 device_dax-y := device.o
+dax_hmem-y := hmem.o
 
 obj-y += pmem/
diff --git a/drivers/dax/hmem.c b/drivers/dax/hmem.c
new file mode 100644
index 0000000000000000000000000000000000000000..fe7214daf62ef03a79f539d0a7c2a3f3870f1a11
--- /dev/null
+++ b/drivers/dax/hmem.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/platform_device.h>
+#include <linux/memregion.h>
+#include <linux/module.h>
+#include <linux/pfn_t.h>
+#include "bus.h"
+
+static int dax_hmem_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct dev_pagemap pgmap = { };
+	struct dax_region *dax_region;
+	struct memregion_info *mri;
+	struct dev_dax *dev_dax;
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENOMEM;
+
+	mri = dev->platform_data;
+	memcpy(&pgmap.res, res, sizeof(*res));
+
+	dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node,
+				      PMD_SIZE, PFN_DEV|PFN_MAP);
+	if (!dax_region)
+		return -ENOMEM;
+
+	dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap);
+	if (IS_ERR(dev_dax))
+		return PTR_ERR(dev_dax);
+
+	/* child dev_dax instances now own the lifetime of the dax_region */
+	dax_region_put(dax_region);
+	return 0;
+}
+
+static int dax_hmem_remove(struct platform_device *pdev)
+{
+	/* devm handles teardown */
+	return 0;
+}
+
+static struct platform_driver dax_hmem_driver = {
+	.probe = dax_hmem_probe,
+	.remove = dax_hmem_remove,
+	.driver = {
+		.name = "hmem",
+	},
+};
+
+module_platform_driver(dax_hmem_driver);
+
+MODULE_ALIAS("platform:hmem*");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index
7af874b69ffb9e3ea2cfa1cbcc7c4b46be7a4a39..fb75d6af57267fe09d4369b07ce2b3ce1dabff4e 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -15,19 +15,19 @@ menuconfig DMADEVICES be empty in some cases. config DMADEVICES_DEBUG - bool "DMA Engine debugging" - depends on DMADEVICES != n - help - This is an option for use by developers; most people should - say N here. This enables DMA engine core and driver debugging. + bool "DMA Engine debugging" + depends on DMADEVICES != n + help + This is an option for use by developers; most people should + say N here. This enables DMA engine core and driver debugging. config DMADEVICES_VDEBUG - bool "DMA Engine verbose debugging" - depends on DMADEVICES_DEBUG != n - help - This is an option for use by developers; most people should - say N here. This enables deeper (more verbose) debugging of - the DMA engine core and drivers. + bool "DMA Engine verbose debugging" + depends on DMADEVICES_DEBUG != n + help + This is an option for use by developers; most people should + say N here. This enables deeper (more verbose) debugging of + the DMA engine core and drivers. if DMADEVICES @@ -215,28 +215,28 @@ config FSL_EDMA This module can be found on Freescale Vybrid and LS-1 SoCs. config FSL_QDMA - tristate "NXP Layerscape qDMA engine support" - depends on ARM || ARM64 - select DMA_ENGINE - select DMA_VIRTUAL_CHANNELS - select DMA_ENGINE_RAID - select ASYNC_TX_ENABLE_CHANNEL_SWITCH - help - Support the NXP Layerscape qDMA engine with command queue and legacy mode. - Channel virtualization is supported through enqueuing of DMA jobs to, - or dequeuing DMA jobs from, different work queues. - This module can be found on NXP Layerscape SoCs. + tristate "NXP Layerscape qDMA engine support" + depends on ARM || ARM64 + select DMA_ENGINE + select DMA_VIRTUAL_CHANNELS + select DMA_ENGINE_RAID + select ASYNC_TX_ENABLE_CHANNEL_SWITCH + help + Support the NXP Layerscape qDMA engine with command queue and legacy mode. + Channel virtualization is supported through enqueuing of DMA jobs to, + or dequeuing DMA jobs from, different work queues. + This module can be found on NXP Layerscape SoCs. The qdma driver only work on SoCs with a DPAA hardware block. config FSL_RAID - tristate "Freescale RAID engine Support" - depends on FSL_SOC && !ASYNC_TX_ENABLE_CHANNEL_SWITCH - select DMA_ENGINE - select DMA_ENGINE_RAID - ---help--- - Enable support for Freescale RAID Engine. RAID Engine is - available on some QorIQ SoCs (like P5020/P5040). It has - the capability to offload memcpy, xor and pq computation + tristate "Freescale RAID engine Support" + depends on FSL_SOC && !ASYNC_TX_ENABLE_CHANNEL_SWITCH + select DMA_ENGINE + select DMA_ENGINE_RAID + ---help--- + Enable support for Freescale RAID Engine. RAID Engine is + available on some QorIQ SoCs (like P5020/P5040). It has + the capability to offload memcpy, xor and pq computation for raid5/6. config IMG_MDC_DMA @@ -273,6 +273,64 @@ config INTEL_IDMA64 Enable DMA support for Intel Low Power Subsystem such as found on Intel Skylake PCH. +config INTEL_IDXD_BUS + tristate + default INTEL_IDXD + +config INTEL_IDXD + tristate "Intel Data Accelerators support" + depends on PCI && X86_64 && !UML + depends on PCI_MSI + depends on PCI_PASID + depends on SBITMAP + select DMA_ENGINE + help + Enable support for the Intel(R) data accelerators present + in Intel Xeon CPU. + + Say Y if you have such a platform. + + If unsure, say N. 
+
+config INTEL_IDXD_COMPAT
+	bool "Legacy behavior for idxd driver"
+	depends on PCI && X86_64
+	select INTEL_IDXD_BUS
+	help
+	  Compatible driver to support old /sys/bus/dsa/drivers/dsa behavior.
+	  The old behavior performed driver bind/unbind for device and wq
+	  devices all under the dsa driver. The compat driver will emulate
+	  the legacy behavior in order to allow existing support apps (i.e.
+	  accel-config) to continue to function. It is expected that
+	  accel-config v3.2 and earlier will need the compat mode. A distro
+	  with a later accel-config version can disable this compat config.
+
+	  Say Y if you have old applications that require such behavior.
+
+	  If unsure, say N.
+
+# Config symbol that collects all the dependencies that are necessary to
+# support shared virtual memory for the devices supported by idxd.
+config INTEL_IDXD_SVM
+	bool "Accelerator Shared Virtual Memory Support"
+	depends on INTEL_IDXD
+	depends on INTEL_IOMMU_SVM
+	depends on PCI_PRI
+	depends on PCI_PASID
+	depends on PCI_IOV
+
+config INTEL_IDXD_PERFMON
+	bool "Intel Data Accelerators performance monitor support"
+	depends on INTEL_IDXD
+	help
+	  Enable performance monitor (pmu) support for the Intel(R)
+	  data accelerators present in Intel Xeon CPU. With this
+	  enabled, perf can be used to monitor the DSA (Intel Data
+	  Streaming Accelerator) events described in the Intel DSA
+	  spec.
+
+	  If unsure, say N.
+
 config INTEL_IOATDMA
 	tristate "Intel I/OAT DMA support"
 	depends on PCI && X86_64
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index f5ce8665e944e29151e30d040309623956d859c2..9fc038ff6c176449711c6547018a865a423596ef 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -41,6 +41,7 @@
 obj-$(CONFIG_IMX_DMA) += imx-dma.o
 obj-$(CONFIG_IMX_SDMA) += imx-sdma.o
 obj-$(CONFIG_INTEL_IDMA64) += idma64.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioat/
+obj-y += idxd/
 obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
 obj-$(CONFIG_INTEL_MIC_X100_DMA) += mic_x100_dma.o
 obj-$(CONFIG_K3_DMA) += k3dma.o
diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c
index dcbcb712de6e880aa5a577a40f0e3ac95e1485ca..235f1396f968606ddfee61110490de4e16463df4 100644
--- a/drivers/dma/acpi-dma.c
+++ b/drivers/dma/acpi-dma.c
@@ -360,19 +360,12 @@ struct dma_chan *acpi_dma_request_slave_chan_by_index(struct device *dev,
 {
 	struct acpi_dma_parser_data pdata;
 	struct acpi_dma_spec *dma_spec = &pdata.dma_spec;
+	struct acpi_device *adev = ACPI_COMPANION(dev);
 	struct list_head resource_list;
-	struct acpi_device *adev;
 	struct acpi_dma *adma;
 	struct dma_chan *chan = NULL;
 	int found;
-
-	/* Check if the device was enumerated by ACPI */
-	if (!dev)
-		return ERR_PTR(-ENODEV);
-
-	adev = ACPI_COMPANION(dev);
-	if (!adev)
-		return ERR_PTR(-ENODEV);
+	int ret;
 
 	memset(&pdata, 0, sizeof(pdata));
 	pdata.index = index;
@@ -382,9 +375,11 @@ struct dma_chan *acpi_dma_request_slave_chan_by_index(struct device *dev,
 	dma_spec->slave_id = -1;
 
 	INIT_LIST_HEAD(&resource_list);
-	acpi_dev_get_resources(adev, &resource_list,
-			acpi_dma_parse_fixed_dma, &pdata);
+	ret = acpi_dev_get_resources(adev, &resource_list,
+				     acpi_dma_parse_fixed_dma, &pdata);
 	acpi_dev_free_resource_list(&resource_list);
+	if (ret < 0)
+		return ERR_PTR(ret);
 
 	if (dma_spec->slave_id < 0 || dma_spec->chan_id < 0)
 		return ERR_PTR(-ENODEV);
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 4b604086b1b3a933e02baf88ba70fb7bb13a73a4..2cfa8458b51be48d5a6cd1a0309b16ced7d817e9 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -53,18 +53,103 @@
 #include
 #include
+#include
"dmaengine.h" + static DEFINE_MUTEX(dma_list_mutex); static DEFINE_IDA(dma_ida); static LIST_HEAD(dma_device_list); static long dmaengine_ref_count; +/* --- debugfs implementation --- */ +#ifdef CONFIG_DEBUG_FS +#include + +static struct dentry *rootdir; + +static void dmaengine_debug_register(struct dma_device *dma_dev) +{ + dma_dev->dbg_dev_root = debugfs_create_dir(dev_name(dma_dev->dev), + rootdir); + if (IS_ERR(dma_dev->dbg_dev_root)) + dma_dev->dbg_dev_root = NULL; +} + +static void dmaengine_debug_unregister(struct dma_device *dma_dev) +{ + debugfs_remove_recursive(dma_dev->dbg_dev_root); + dma_dev->dbg_dev_root = NULL; +} + +static void dmaengine_dbg_summary_show(struct seq_file *s, + struct dma_device *dma_dev) +{ + struct dma_chan *chan; + + list_for_each_entry(chan, &dma_dev->channels, device_node) { + if (chan->client_count) { + seq_printf(s, " %-13s| %s", dma_chan_name(chan), + chan->dbg_client_name ?: "in-use"); + + if (chan->router) + seq_printf(s, " (via router: %s)\n", + dev_name(chan->router->dev)); + else + seq_puts(s, "\n"); + } + } +} + +static int dmaengine_summary_show(struct seq_file *s, void *data) +{ + struct dma_device *dma_dev = NULL; + + mutex_lock(&dma_list_mutex); + list_for_each_entry(dma_dev, &dma_device_list, global_node) { + seq_printf(s, "dma%d (%s): number of channels: %u\n", + dma_dev->dev_id, dev_name(dma_dev->dev), + dma_dev->chancnt); + + if (dma_dev->dbg_summary_show) + dma_dev->dbg_summary_show(s, dma_dev); + else + dmaengine_dbg_summary_show(s, dma_dev); + + if (!list_is_last(&dma_dev->global_node, &dma_device_list)) + seq_puts(s, "\n"); + } + mutex_unlock(&dma_list_mutex); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dmaengine_summary); + +static void __init dmaengine_debugfs_init(void) +{ + rootdir = debugfs_create_dir("dmaengine", NULL); + + /* /sys/kernel/debug/dmaengine/summary */ + debugfs_create_file("summary", 0444, rootdir, NULL, + &dmaengine_summary_fops); +} +#else +static inline void dmaengine_debugfs_init(void) { } +static inline int dmaengine_debug_register(struct dma_device *dma_dev) +{ + return 0; +} + +static inline void dmaengine_debug_unregister(struct dma_device *dma_dev) { } +#endif /* DEBUG_FS */ + /* --- sysfs implementation --- */ +#define DMA_SLAVE_NAME "slave" + /** * dev_to_dma_chan - convert a device pointer to its sysfs container object - * @dev - device node + * @dev: device node * - * Must be called under dma_list_mutex + * Must be called under dma_list_mutex. 
*/ static struct dma_chan *dev_to_dma_chan(struct device *dev) { @@ -149,10 +234,6 @@ static void chan_dev_release(struct device *dev) struct dma_chan_dev *chan_dev; chan_dev = container_of(dev, typeof(*chan_dev), device); - if (atomic_dec_and_test(chan_dev->idr_ref)) { - ida_free(&dma_ida, chan_dev->dev_id); - kfree(chan_dev->idr_ref); - } kfree(chan_dev); } @@ -164,11 +245,152 @@ static struct class dma_devclass = { /* --- client and device registration --- */ -#define dma_device_satisfies_mask(device, mask) \ - __dma_device_satisfies_mask((device), &(mask)) -static int -__dma_device_satisfies_mask(struct dma_device *device, - const dma_cap_mask_t *want) +/* enable iteration over all operation types */ +static dma_cap_mask_t dma_cap_mask_all; + +/** + * struct dma_chan_tbl_ent - tracks channel allocations per core/operation + * @chan: associated channel for this entry + */ +struct dma_chan_tbl_ent { + struct dma_chan *chan; +}; + +/* percpu lookup table for memory-to-memory offload providers */ +static struct dma_chan_tbl_ent __percpu *channel_table[DMA_TX_TYPE_END]; + +static int __init dma_channel_table_init(void) +{ + enum dma_transaction_type cap; + int err = 0; + + bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); + + /* 'interrupt', 'private', and 'slave' are channel capabilities, + * but are not associated with an operation so they do not need + * an entry in the channel_table + */ + clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); + clear_bit(DMA_PRIVATE, dma_cap_mask_all.bits); + clear_bit(DMA_SLAVE, dma_cap_mask_all.bits); + + for_each_dma_cap_mask(cap, dma_cap_mask_all) { + channel_table[cap] = alloc_percpu(struct dma_chan_tbl_ent); + if (!channel_table[cap]) { + err = -ENOMEM; + break; + } + } + + if (err) { + pr_err("dmaengine dma_channel_table_init failure: %d\n", err); + for_each_dma_cap_mask(cap, dma_cap_mask_all) + free_percpu(channel_table[cap]); + } + + return err; +} +arch_initcall(dma_channel_table_init); + +/** + * dma_chan_is_local - checks if the channel is in the same NUMA-node as the CPU + * @chan: DMA channel to test + * @cpu: CPU index which the channel should be close to + * + * Returns true if the channel is in the same NUMA-node as the CPU. + */ +static bool dma_chan_is_local(struct dma_chan *chan, int cpu) +{ + int node = dev_to_node(chan->device->dev); + return node == NUMA_NO_NODE || + cpumask_test_cpu(cpu, cpumask_of_node(node)); +} + +/** + * min_chan - finds the channel with min count and in the same NUMA-node as the CPU + * @cap: capability to match + * @cpu: CPU index which the channel should be close to + * + * If some channels are close to the given CPU, the one with the lowest + * reference count is returned. Otherwise, CPU is ignored and only the + * reference count is taken into account. + * + * Must be called under dma_list_mutex. + */ +static struct dma_chan *min_chan(enum dma_transaction_type cap, int cpu) +{ + struct dma_device *device; + struct dma_chan *chan; + struct dma_chan *min = NULL; + struct dma_chan *localmin = NULL; + + list_for_each_entry(device, &dma_device_list, global_node) { + if (!dma_has_cap(cap, device->cap_mask) || + dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) { + if (!chan->client_count) + continue; + if (!min || chan->table_count < min->table_count) + min = chan; + + if (dma_chan_is_local(chan, cpu)) + if (!localmin || + chan->table_count < localmin->table_count) + localmin = chan; + } + } + + chan = localmin ? 
localmin : min; + + if (chan) + chan->table_count++; + + return chan; +} + +/** + * dma_channel_rebalance - redistribute the available channels + * + * Optimize for CPU isolation (each CPU gets a dedicated channel for an + * operation type) in the SMP case, and operation isolation (avoid + * multi-tasking channels) in the non-SMP case. + * + * Must be called under dma_list_mutex. + */ +static void dma_channel_rebalance(void) +{ + struct dma_chan *chan; + struct dma_device *device; + int cpu; + int cap; + + /* undo the last distribution */ + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_possible_cpu(cpu) + per_cpu_ptr(channel_table[cap], cpu)->chan = NULL; + + list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) + chan->table_count = 0; + } + + /* don't populate the channel_table if no clients are available */ + if (!dmaengine_ref_count) + return; + + /* redistribute available channels */ + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_online_cpu(cpu) { + chan = min_chan(cap, cpu); + per_cpu_ptr(channel_table[cap], cpu)->chan = chan; + } +} + +static int dma_device_satisfies_mask(struct dma_device *device, + const dma_cap_mask_t *want) { dma_cap_mask_t has; @@ -184,9 +406,9 @@ static struct module *dma_chan_to_owner(struct dma_chan *chan) /** * balance_ref_count - catch up the channel reference count - * @chan - channel to balance ->client_count versus dmaengine_ref_count + * @chan: channel to balance ->client_count versus dmaengine_ref_count * - * balance_ref_count must be called under dma_list_mutex + * Must be called under dma_list_mutex. */ static void balance_ref_count(struct dma_chan *chan) { @@ -198,11 +420,28 @@ static void balance_ref_count(struct dma_chan *chan) } } +static void dma_device_release(struct kref *ref) +{ + struct dma_device *device = container_of(ref, struct dma_device, ref); + + list_del_rcu(&device->global_node); + dma_channel_rebalance(); + + if (device->device_release) + device->device_release(device); +} + +static void dma_device_put(struct dma_device *device) +{ + lockdep_assert_held(&dma_list_mutex); + kref_put(&device->ref, dma_device_release); +} + /** - * dma_chan_get - try to grab a dma channel's parent driver module - * @chan - channel to grab + * dma_chan_get - try to grab a DMA channel's parent driver module + * @chan: channel to grab * - * Must be called under dma_list_mutex + * Must be called under dma_list_mutex. */ static int dma_chan_get(struct dma_chan *chan) { @@ -218,6 +457,12 @@ static int dma_chan_get(struct dma_chan *chan) if (!try_module_get(owner)) return -ENODEV; + ret = kref_get_unless_zero(&chan->device->ref); + if (!ret) { + ret = -ENODEV; + goto module_put_out; + } + /* allocate upon first client reference */ if (chan->device->device_alloc_chan_resources) { ret = chan->device->device_alloc_chan_resources(chan); @@ -233,15 +478,17 @@ static int dma_chan_get(struct dma_chan *chan) return 0; err_out: + dma_device_put(chan->device); +module_put_out: module_put(owner); return ret; } /** - * dma_chan_put - drop a reference to a dma channel's parent driver module - * @chan - channel to release + * dma_chan_put - drop a reference to a DMA channel's parent driver module + * @chan: channel to release * - * Must be called under dma_list_mutex + * Must be called under dma_list_mutex. 
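+ *
+ * Pairs with a successful dma_chan_get(): each put drops one client
+ * reference, frees the channel resources when the count hits zero, and
+ * releases the dma_device and owning module references last.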
*/ static void dma_chan_put(struct dma_chan *chan) { @@ -250,7 +497,6 @@ static void dma_chan_put(struct dma_chan *chan) return; chan->client_count--; - module_put(dma_chan_to_owner(chan)); /* This channel is not in use anymore, free it */ if (!chan->client_count && chan->device->device_free_chan_resources) { @@ -265,6 +511,9 @@ static void dma_chan_put(struct dma_chan *chan) chan->router = NULL; chan->route_data = NULL; } + + dma_device_put(chan->device); + module_put(dma_chan_to_owner(chan)); } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) @@ -288,60 +537,9 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) } EXPORT_SYMBOL(dma_sync_wait); -/** - * dma_cap_mask_all - enable iteration over all operation types - */ -static dma_cap_mask_t dma_cap_mask_all; - -/** - * dma_chan_tbl_ent - tracks channel allocations per core/operation - * @chan - associated channel for this entry - */ -struct dma_chan_tbl_ent { - struct dma_chan *chan; -}; - -/** - * channel_table - percpu lookup table for memory-to-memory offload providers - */ -static struct dma_chan_tbl_ent __percpu *channel_table[DMA_TX_TYPE_END]; - -static int __init dma_channel_table_init(void) -{ - enum dma_transaction_type cap; - int err = 0; - - bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); - - /* 'interrupt', 'private', and 'slave' are channel capabilities, - * but are not associated with an operation so they do not need - * an entry in the channel_table - */ - clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); - clear_bit(DMA_PRIVATE, dma_cap_mask_all.bits); - clear_bit(DMA_SLAVE, dma_cap_mask_all.bits); - - for_each_dma_cap_mask(cap, dma_cap_mask_all) { - channel_table[cap] = alloc_percpu(struct dma_chan_tbl_ent); - if (!channel_table[cap]) { - err = -ENOMEM; - break; - } - } - - if (err) { - pr_err("initialization failure\n"); - for_each_dma_cap_mask(cap, dma_cap_mask_all) - free_percpu(channel_table[cap]); - } - - return err; -} -arch_initcall(dma_channel_table_init); - /** * dma_find_channel - find a channel to carry out the operation - * @tx_type: transaction type + * @tx_type: transaction type */ struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) { @@ -369,97 +567,6 @@ void dma_issue_pending_all(void) } EXPORT_SYMBOL(dma_issue_pending_all); -/** - * dma_chan_is_local - returns true if the channel is in the same numa-node as the cpu - */ -static bool dma_chan_is_local(struct dma_chan *chan, int cpu) -{ - int node = dev_to_node(chan->device->dev); - return node == NUMA_NO_NODE || - cpumask_test_cpu(cpu, cpumask_of_node(node)); -} - -/** - * min_chan - returns the channel with min count and in the same numa-node as the cpu - * @cap: capability to match - * @cpu: cpu index which the channel should be close to - * - * If some channels are close to the given cpu, the one with the lowest - * reference count is returned. Otherwise, cpu is ignored and only the - * reference count is taken into account. - * Must be called under dma_list_mutex. 
- */ -static struct dma_chan *min_chan(enum dma_transaction_type cap, int cpu) -{ - struct dma_device *device; - struct dma_chan *chan; - struct dma_chan *min = NULL; - struct dma_chan *localmin = NULL; - - list_for_each_entry(device, &dma_device_list, global_node) { - if (!dma_has_cap(cap, device->cap_mask) || - dma_has_cap(DMA_PRIVATE, device->cap_mask)) - continue; - list_for_each_entry(chan, &device->channels, device_node) { - if (!chan->client_count) - continue; - if (!min || chan->table_count < min->table_count) - min = chan; - - if (dma_chan_is_local(chan, cpu)) - if (!localmin || - chan->table_count < localmin->table_count) - localmin = chan; - } - } - - chan = localmin ? localmin : min; - - if (chan) - chan->table_count++; - - return chan; -} - -/** - * dma_channel_rebalance - redistribute the available channels - * - * Optimize for cpu isolation (each cpu gets a dedicated channel for an - * operation type) in the SMP case, and operation isolation (avoid - * multi-tasking channels) in the non-SMP case. Must be called under - * dma_list_mutex. - */ -static void dma_channel_rebalance(void) -{ - struct dma_chan *chan; - struct dma_device *device; - int cpu; - int cap; - - /* undo the last distribution */ - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_possible_cpu(cpu) - per_cpu_ptr(channel_table[cap], cpu)->chan = NULL; - - list_for_each_entry(device, &dma_device_list, global_node) { - if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) - continue; - list_for_each_entry(chan, &device->channels, device_node) - chan->table_count = 0; - } - - /* don't populate the channel_table if no clients are available */ - if (!dmaengine_ref_count) - return; - - /* redistribute available channels */ - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_online_cpu(cpu) { - chan = min_chan(cap, cpu); - per_cpu_ptr(channel_table[cap], cpu)->chan = chan; - } -} - int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) { struct dma_device *device; @@ -485,13 +592,25 @@ int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) caps->src_addr_widths = device->src_addr_widths; caps->dst_addr_widths = device->dst_addr_widths; caps->directions = device->directions; + caps->min_burst = device->min_burst; caps->max_burst = device->max_burst; + caps->max_sg_burst = device->max_sg_burst; caps->residue_granularity = device->residue_granularity; caps->descriptor_reuse = device->descriptor_reuse; caps->cmd_pause = !!device->device_pause; caps->cmd_resume = !!device->device_resume; caps->cmd_terminate = !!device->device_terminate_all; + /* + * DMA engine device might be configured with non-uniformly + * distributed slave capabilities per device channels. In this + * case the corresponding driver may provide the device_caps + * callback to override the generic capabilities with + * channel-specific ones. 
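+	 *
+	 * A driver-side sketch (hypothetical callback, illustrative only):
+	 *
+	 *	static void foo_device_caps(struct dma_chan *chan,
+	 *				    struct dma_slave_caps *caps)
+	 *	{
+	 *		if (chan->chan_id == 0)
+	 *			caps->max_burst = 4;
+	 *	}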
+ */ + if (device->device_caps) + device->device_caps(chan, caps); + return 0; } EXPORT_SYMBOL_GPL(dma_get_slave_caps); @@ -502,7 +621,7 @@ static struct dma_chan *private_candidate(const dma_cap_mask_t *mask, { struct dma_chan *chan; - if (mask && !__dma_device_satisfies_mask(dev, mask)) { + if (mask && !dma_device_satisfies_mask(dev, mask)) { dev_dbg(dev->dev, "%s: wrong capabilities\n", __func__); return NULL; } @@ -572,17 +691,16 @@ static struct dma_chan *find_candidate(struct dma_device *device, /** * dma_get_slave_channel - try to get specific channel exclusively - * @chan: target channel + * @chan: target channel */ struct dma_chan *dma_get_slave_channel(struct dma_chan *chan) { - int err = -EBUSY; - /* lock against __dma_request_channel */ mutex_lock(&dma_list_mutex); if (chan->client_count == 0) { struct dma_device *device = chan->device; + int err; dma_cap_set(DMA_PRIVATE, device->cap_mask); device->privatecnt++; @@ -626,10 +744,10 @@ EXPORT_SYMBOL_GPL(dma_get_any_slave_channel); /** * __dma_request_channel - try to allocate an exclusive channel - * @mask: capabilities that the channel must satisfy - * @fn: optional callback to disposition available channels - * @fn_param: opaque parameter to pass to dma_filter_fn - * @np: device node to look for DMA channels + * @mask: capabilities that the channel must satisfy + * @fn: optional callback to disposition available channels + * @fn_param: opaque parameter to pass to dma_filter_fn() + * @np: device node to look for DMA channels * * Returns pointer to appropriate DMA channel on success or NULL. */ @@ -704,11 +822,11 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) if (has_acpi_companion(dev) && !chan) chan = acpi_dma_request_slave_chan_by_name(dev, name); - if (chan) { - /* Valid channel found or requester needs to be deferred */ - if (!IS_ERR(chan) || PTR_ERR(chan) == -EPROBE_DEFER) - return chan; - } + if (PTR_ERR(chan) == -EPROBE_DEFER) + return chan; + + if (!IS_ERR_OR_NULL(chan)) + goto found; /* Try to find the channel via the DMA filter map(s) */ mutex_lock(&dma_list_mutex); @@ -728,31 +846,35 @@ struct dma_chan *dma_request_chan(struct device *dev, const char *name) } mutex_unlock(&dma_list_mutex); - return chan ? chan : ERR_PTR(-EPROBE_DEFER); -} -EXPORT_SYMBOL_GPL(dma_request_chan); + if (IS_ERR(chan)) + return chan; + if (!chan) + return ERR_PTR(-EPROBE_DEFER); -/** - * dma_request_slave_channel - try to allocate an exclusive slave channel - * @dev: pointer to client device structure - * @name: slave channel name - * - * Returns pointer to appropriate DMA channel on success or NULL. 
- */ -struct dma_chan *dma_request_slave_channel(struct device *dev, - const char *name) -{ - struct dma_chan *ch = dma_request_chan(dev, name); - if (IS_ERR(ch)) - return NULL; +found: +#ifdef CONFIG_DEBUG_FS + chan->dbg_client_name = kasprintf(GFP_KERNEL, "%s:%s", dev_name(dev), + name); +#endif + + chan->name = kasprintf(GFP_KERNEL, "dma:%s", name); + if (!chan->name) + return chan; + chan->slave = dev; + + if (sysfs_create_link(&chan->dev->device.kobj, &dev->kobj, + DMA_SLAVE_NAME)) + dev_warn(dev, "Cannot create DMA %s symlink\n", DMA_SLAVE_NAME); + if (sysfs_create_link(&dev->kobj, &chan->dev->device.kobj, chan->name)) + dev_warn(dev, "Cannot create DMA %s symlink\n", chan->name); - return ch; + return chan; } -EXPORT_SYMBOL_GPL(dma_request_slave_channel); +EXPORT_SYMBOL_GPL(dma_request_chan); /** * dma_request_chan_by_mask - allocate a channel satisfying certain capabilities - * @mask: capabilities that the channel must satisfy + * @mask: capabilities that the channel must satisfy * * Returns pointer to appropriate DMA channel on success or an error pointer. */ @@ -786,6 +908,19 @@ void dma_release_channel(struct dma_chan *chan) /* drop PRIVATE cap enabled by __dma_request_channel() */ if (--chan->device->privatecnt == 0) dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask); + + if (chan->slave) { + sysfs_remove_link(&chan->dev->device.kobj, DMA_SLAVE_NAME); + sysfs_remove_link(&chan->slave->kobj, chan->name); + kfree(chan->name); + chan->name = NULL; + chan->slave = NULL; + } + +#ifdef CONFIG_DEBUG_FS + kfree(chan->dbg_client_name); + chan->dbg_client_name = NULL; +#endif mutex_unlock(&dma_list_mutex); } EXPORT_SYMBOL_GPL(dma_release_channel); @@ -830,18 +965,18 @@ void dmaengine_get(void) EXPORT_SYMBOL(dmaengine_get); /** - * dmaengine_put - let dma drivers be removed when ref_count == 0 + * dmaengine_put - let DMA drivers be removed when ref_count == 0 */ void dmaengine_put(void) { - struct dma_device *device; + struct dma_device *device, *_d; struct dma_chan *chan; mutex_lock(&dma_list_mutex); dmaengine_ref_count--; BUG_ON(dmaengine_ref_count < 0); /* drop channel references */ - list_for_each_entry(device, &dma_device_list, global_node) { + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) continue; list_for_each_entry(chan, &device->channels, device_node) @@ -900,15 +1035,111 @@ static int get_dma_id(struct dma_device *device) return 0; } +static int __dma_async_device_channel_register(struct dma_device *device, + struct dma_chan *chan) +{ + int rc; + + chan->local = alloc_percpu(typeof(*chan->local)); + if (!chan->local) + return -ENOMEM; + chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); + if (!chan->dev) { + rc = -ENOMEM; + goto err_free_local; + } + + /* + * When the chan_id is a negative value, we are dynamically adding + * the channel. Otherwise we are static enumerating. 
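+	 * Either way the id comes from the per-device chan_ida, so
+	 * dynamically added channels cannot collide with statically
+	 * enumerated ones.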
+ */ + mutex_lock(&device->chan_mutex); + chan->chan_id = ida_alloc(&device->chan_ida, GFP_KERNEL); + mutex_unlock(&device->chan_mutex); + if (chan->chan_id < 0) { + pr_err("%s: unable to alloc ida for chan: %d\n", + __func__, chan->chan_id); + rc = chan->chan_id; + goto err_free_dev; + } + + chan->dev->device.class = &dma_devclass; + chan->dev->device.parent = device->dev; + chan->dev->chan = chan; + chan->dev->dev_id = device->dev_id; + dev_set_name(&chan->dev->device, "dma%dchan%d", + device->dev_id, chan->chan_id); + rc = device_register(&chan->dev->device); + if (rc) + goto err_out_ida; + chan->client_count = 0; + device->chancnt++; + + return 0; + + err_out_ida: + mutex_lock(&device->chan_mutex); + ida_free(&device->chan_ida, chan->chan_id); + mutex_unlock(&device->chan_mutex); + err_free_dev: + kfree(chan->dev); + err_free_local: + free_percpu(chan->local); + chan->local = NULL; + return rc; +} + +int dma_async_device_channel_register(struct dma_device *device, + struct dma_chan *chan) +{ + int rc; + + rc = __dma_async_device_channel_register(device, chan); + if (rc < 0) + return rc; + + dma_channel_rebalance(); + return 0; +} +EXPORT_SYMBOL_GPL(dma_async_device_channel_register); + +static void __dma_async_device_channel_unregister(struct dma_device *device, + struct dma_chan *chan) +{ + WARN_ONCE(!device->device_release && chan->client_count, + "%s called while %d clients hold a reference\n", + __func__, chan->client_count); + mutex_lock(&dma_list_mutex); + device->chancnt--; + chan->dev->chan = NULL; + mutex_unlock(&dma_list_mutex); + mutex_lock(&device->chan_mutex); + ida_free(&device->chan_ida, chan->chan_id); + mutex_unlock(&device->chan_mutex); + device_unregister(&chan->dev->device); + free_percpu(chan->local); +} + +void dma_async_device_channel_unregister(struct dma_device *device, + struct dma_chan *chan) +{ + __dma_async_device_channel_unregister(device, chan); + dma_channel_rebalance(); +} +EXPORT_SYMBOL_GPL(dma_async_device_channel_unregister); + /** * dma_async_device_register - registers DMA devices found - * @device: &dma_device + * @device: pointer to &struct dma_device + * + * After calling this routine the structure should not be freed except in the + * device_release() callback which will be called after + * dma_async_device_unregister() is called and no further references are taken. 
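+ *
+ * A minimal provider sketch (hypothetical driver, illustrative only):
+ *
+ *	INIT_LIST_HEAD(&dma_dev->channels);
+ *	dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
+ *	dma_dev->device_release = foo_device_release;
+ *	rc = dma_async_device_register(dma_dev);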
*/ int dma_async_device_register(struct dma_device *device) { - int chancnt = 0, rc; + int rc; struct dma_chan* chan; - atomic_t *idr_ref; if (!device) return -ENODEV; @@ -928,6 +1159,13 @@ int dma_async_device_register(struct dma_device *device) return -EIO; } + if (dma_has_cap(DMA_MEMCPY_SG, device->cap_mask) && !device->device_prep_dma_memcpy_sg) { + dev_err(device->dev, + "Device claims capability %s, but op is not defined\n", + "DMA_MEMCPY_SG"); + return -EIO; + } + if (dma_has_cap(DMA_XOR, device->cap_mask) && !device->device_prep_dma_xor) { dev_err(device->dev, "Device claims capability %s, but op is not defined\n", @@ -996,65 +1234,32 @@ int dma_async_device_register(struct dma_device *device) return -EIO; } + if (!device->device_release) + dev_dbg(device->dev, + "WARN: Device release is not defined so it is not safe to unbind this driver while in use\n"); + + kref_init(&device->ref); + /* note: this only matters in the * CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH=n case */ if (device_has_all_tx_types(device)) dma_cap_set(DMA_ASYNC_TX, device->cap_mask); - idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); - if (!idr_ref) - return -ENOMEM; rc = get_dma_id(device); - if (rc != 0) { - kfree(idr_ref); + if (rc != 0) return rc; - } - atomic_set(idr_ref, 0); + mutex_init(&device->chan_mutex); + ida_init(&device->chan_ida); /* represent channels in sysfs. Probably want devs too */ list_for_each_entry(chan, &device->channels, device_node) { - rc = -ENOMEM; - chan->local = alloc_percpu(typeof(*chan->local)); - if (chan->local == NULL) - goto err_out; - chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); - if (chan->dev == NULL) { - free_percpu(chan->local); - chan->local = NULL; + rc = __dma_async_device_channel_register(device, chan); + if (rc < 0) goto err_out; - } - - chan->chan_id = chancnt++; - chan->dev->device.class = &dma_devclass; - chan->dev->device.parent = device->dev; - chan->dev->chan = chan; - chan->dev->idr_ref = idr_ref; - chan->dev->dev_id = device->dev_id; - atomic_inc(idr_ref); - dev_set_name(&chan->dev->device, "dma%dchan%d", - device->dev_id, chan->chan_id); - - rc = device_register(&chan->dev->device); - if (rc) { - free_percpu(chan->local); - chan->local = NULL; - kfree(chan->dev); - atomic_dec(idr_ref); - goto err_out; - } - chan->client_count = 0; } - if (!chancnt) { - dev_err(device->dev, "%s: device has no channels!\n", __func__); - rc = -ENODEV; - goto err_out; - } - - device->chancnt = chancnt; - mutex_lock(&dma_list_mutex); /* take references on public channels */ if (dmaengine_ref_count && !dma_has_cap(DMA_PRIVATE, device->cap_mask)) @@ -1078,13 +1283,14 @@ int dma_async_device_register(struct dma_device *device) dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); + dmaengine_debug_register(device); + return 0; err_out: /* if we never registered a channel just release the idr */ - if (atomic_read(idr_ref) == 0) { + if (!device->chancnt) { ida_free(&dma_ida, device->dev_id); - kfree(idr_ref); return rc; } @@ -1103,30 +1309,30 @@ EXPORT_SYMBOL(dma_async_device_register); /** * dma_async_device_unregister - unregister a DMA device - * @device: &dma_device + * @device: pointer to &struct dma_device * * This routine is called by dma driver exit routines, dmaengine holds module * references to prevent it being called while channels are in use. 
*/ void dma_async_device_unregister(struct dma_device *device) { - struct dma_chan *chan; + struct dma_chan *chan, *n; + + dmaengine_debug_unregister(device); + + list_for_each_entry_safe(chan, n, &device->channels, device_node) + __dma_async_device_channel_unregister(device, chan); mutex_lock(&dma_list_mutex); - list_del_rcu(&device->global_node); + /* + * setting DMA_PRIVATE ensures the device being torn down will not + * be used in the channel_table + */ + dma_cap_set(DMA_PRIVATE, device->cap_mask); dma_channel_rebalance(); + ida_free(&dma_ida, device->dev_id); + dma_device_put(device); mutex_unlock(&dma_list_mutex); - - list_for_each_entry(chan, &device->channels, device_node) { - WARN_ONCE(chan->client_count, - "%s called while %d clients hold a reference\n", - __func__, chan->client_count); - mutex_lock(&dma_list_mutex); - chan->dev->chan = NULL; - mutex_unlock(&dma_list_mutex); - device_unregister(&chan->dev->device); - free_percpu(chan->local); - } } EXPORT_SYMBOL(dma_async_device_unregister); @@ -1140,7 +1346,7 @@ static void dmam_device_release(struct device *dev, void *res) /** * dmaenginem_async_device_register - registers DMA devices found - * @device: &dma_device + * @device: pointer to &struct dma_device * * The operation is managed and will be undone on driver detach. */ @@ -1304,8 +1510,82 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, } EXPORT_SYMBOL(dma_async_tx_descriptor_init); -/* dma_wait_for_async_tx - spin wait for a transaction to complete - * @tx: in-flight transaction to wait on +static inline int desc_check_and_set_metadata_mode( + struct dma_async_tx_descriptor *desc, enum dma_desc_metadata_mode mode) +{ + /* Make sure that the metadata mode is not mixed */ + if (!desc->desc_metadata_mode) { + if (dmaengine_is_metadata_mode_supported(desc->chan, mode)) + desc->desc_metadata_mode = mode; + else + return -ENOTSUPP; + } else if (desc->desc_metadata_mode != mode) { + return -EINVAL; + } + + return 0; +} + +int dmaengine_desc_attach_metadata(struct dma_async_tx_descriptor *desc, + void *data, size_t len) +{ + int ret; + + if (!desc) + return -EINVAL; + + ret = desc_check_and_set_metadata_mode(desc, DESC_METADATA_CLIENT); + if (ret) + return ret; + + if (!desc->metadata_ops || !desc->metadata_ops->attach) + return -ENOTSUPP; + + return desc->metadata_ops->attach(desc, data, len); +} +EXPORT_SYMBOL_GPL(dmaengine_desc_attach_metadata); + +void *dmaengine_desc_get_metadata_ptr(struct dma_async_tx_descriptor *desc, + size_t *payload_len, size_t *max_len) +{ + int ret; + + if (!desc) + return ERR_PTR(-EINVAL); + + ret = desc_check_and_set_metadata_mode(desc, DESC_METADATA_ENGINE); + if (ret) + return ERR_PTR(ret); + + if (!desc->metadata_ops || !desc->metadata_ops->get_ptr) + return ERR_PTR(-ENOTSUPP); + + return desc->metadata_ops->get_ptr(desc, payload_len, max_len); +} +EXPORT_SYMBOL_GPL(dmaengine_desc_get_metadata_ptr); + +int dmaengine_desc_set_metadata_len(struct dma_async_tx_descriptor *desc, + size_t payload_len) +{ + int ret; + + if (!desc) + return -EINVAL; + + ret = desc_check_and_set_metadata_mode(desc, DESC_METADATA_ENGINE); + if (ret) + return ret; + + if (!desc->metadata_ops || !desc->metadata_ops->set_len) + return -ENOTSUPP; + + return desc->metadata_ops->set_len(desc, payload_len); +} +EXPORT_SYMBOL_GPL(dmaengine_desc_set_metadata_len); + +/** + * dma_wait_for_async_tx - spin wait for a transaction to complete + * @tx: in-flight transaction to wait on */ enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) 
@@ -1328,9 +1608,12 @@ dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) } EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); -/* dma_run_dependencies - helper routine for dma drivers to process - * (start) dependent operations on their target channel - * @tx: transaction with dependencies +/** + * dma_run_dependencies - process dependent operations on the target channel + * @tx: transaction with dependencies + * + * Helper routine for DMA drivers to process (start) dependent operations + * on their target channel. */ void dma_run_dependencies(struct dma_async_tx_descriptor *tx) { @@ -1372,8 +1655,11 @@ static int __init dma_bus_init(void) if (err) return err; - return class_register(&dma_devclass); -} -arch_initcall(dma_bus_init); + err = class_register(&dma_devclass); + if (!err) + dmaengine_debugfs_init(); + return err; +} +arch_initcall(dma_bus_init); diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 501c0b063f852d9a38a619940699d71b146399f4..1bfbd64b13717e0f7e1136c191986fc6384c99ac 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -77,6 +77,7 @@ static inline enum dma_status dma_cookie_status(struct dma_chan *chan, state->last = complete; state->used = used; state->residue = 0; + state->in_flight_bytes = 0; } return dma_async_is_complete(cookie, complete, used); } @@ -87,6 +88,13 @@ static inline void dma_set_residue(struct dma_tx_state *state, u32 residue) state->residue = residue; } +static inline void dma_set_in_flight_bytes(struct dma_tx_state *state, + u32 in_flight_bytes) +{ + if (state) + state->in_flight_bytes = in_flight_bytes; +} + struct dmaengine_desc_callback { dma_async_tx_callback callback; dma_async_tx_callback_result callback_result; @@ -171,4 +179,23 @@ dmaengine_desc_callback_valid(struct dmaengine_desc_callback *cb) return (cb->callback) ? true : false; } +struct dma_chan *dma_get_slave_channel(struct dma_chan *chan); +struct dma_chan *dma_get_any_slave_channel(struct dma_device *device); + +#ifdef CONFIG_DEBUG_FS +#include + +static inline struct dentry * +dmaengine_get_debugfs_root(struct dma_device *dma_dev) { + return dma_dev->dbg_dev_root; +} +#else +struct dentry; +static inline struct dentry * +dmaengine_get_debugfs_root(struct dma_device *dma_dev) +{ + return NULL; +} +#endif /* CONFIG_DEBUG_FS */ + #endif diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..817ffa95a9b118b6be7c2f93add23737f5c0f29a --- /dev/null +++ b/drivers/dma/idxd/Makefile @@ -0,0 +1,12 @@ +ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=IDXD + +obj-$(CONFIG_INTEL_IDXD_BUS) += idxd_bus.o +idxd_bus-y := bus.o + +obj-$(CONFIG_INTEL_IDXD) += idxd.o +idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o + +idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o + +obj-$(CONFIG_INTEL_IDXD_COMPAT) += idxd_compat.o +idxd_compat-y := compat.o diff --git a/drivers/dma/idxd/bus.c b/drivers/dma/idxd/bus.c new file mode 100644 index 0000000000000000000000000000000000000000..ef81c3bbe92c1e93e1e7e2d27a6411a97d629d33 --- /dev/null +++ b/drivers/dma/idxd/bus.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. 
*/ +#include +#include +#include +#include +#include "idxd.h" + + +int __idxd_driver_register(struct idxd_device_driver *idxd_drv, struct module *owner, + const char *mod_name) +{ + struct device_driver *drv = &idxd_drv->drv; + + if (!idxd_drv->type) { + pr_debug("driver type not set (%ps)\n", __builtin_return_address(0)); + return -EINVAL; + } + + drv->name = idxd_drv->name; + drv->bus = &dsa_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} +EXPORT_SYMBOL_GPL(__idxd_driver_register); + +void idxd_driver_unregister(struct idxd_device_driver *idxd_drv) +{ + driver_unregister(&idxd_drv->drv); +} +EXPORT_SYMBOL_GPL(idxd_driver_unregister); + +static int idxd_config_bus_match(struct device *dev, + struct device_driver *drv) +{ + struct idxd_device_driver *idxd_drv = + container_of(drv, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + int i = 0; + + while (idxd_drv->type[i] != IDXD_DEV_NONE) { + if (idxd_dev->type == idxd_drv->type[i]) + return 1; + i++; + } + + return 0; +} + +static int idxd_config_bus_probe(struct device *dev) +{ + struct idxd_device_driver *idxd_drv = + container_of(dev->driver, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return idxd_drv->probe(idxd_dev); +} + +static int idxd_config_bus_remove(struct device *dev) +{ + struct idxd_device_driver *idxd_drv = + container_of(dev->driver, struct idxd_device_driver, drv); + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + idxd_drv->remove(idxd_dev); + return 0; +} + +static int idxd_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "MODALIAS=" IDXD_DEVICES_MODALIAS_FMT, 0); +} + +struct bus_type dsa_bus_type = { + .name = "dsa", + .match = idxd_config_bus_match, + .probe = idxd_config_bus_probe, + .remove = idxd_config_bus_remove, + .uevent = idxd_bus_uevent, +}; +EXPORT_SYMBOL_GPL(dsa_bus_type); + +static int __init dsa_bus_init(void) +{ + return bus_register(&dsa_bus_type); +} +module_init(dsa_bus_init); + +static void __exit dsa_bus_exit(void) +{ + bus_unregister(&dsa_bus_type); +} +module_exit(dsa_bus_exit); + +MODULE_DESCRIPTION("IDXD driver dsa_bus_type driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c new file mode 100644 index 0000000000000000000000000000000000000000..202ee1e95464a8469450d016b1b06642eb52bcde --- /dev/null +++ b/drivers/dma/idxd/cdev.c @@ -0,0 +1,394 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" + +struct idxd_cdev_context { + const char *name; + dev_t devt; + struct ida minor_ida; +}; + +/* + * ictx is an array based off of accelerator types. 
enum idxd_type + * is used as index + */ +static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = { + { .name = "dsa" }, + { .name = "iax" } +}; + +struct idxd_user_context { + struct idxd_wq *wq; + struct task_struct *task; + unsigned int pasid; + unsigned int flags; + struct iommu_sva *sva; +}; + +static void idxd_cdev_dev_release(struct device *dev) +{ + struct idxd_cdev *idxd_cdev = dev_to_cdev(dev); + struct idxd_cdev_context *cdev_ctx; + struct idxd_wq *wq = idxd_cdev->wq; + + cdev_ctx = &ictx[wq->idxd->data->type]; + ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); + kfree(idxd_cdev); +} + +static struct device_type idxd_cdev_device_type = { + .name = "idxd_cdev", + .release = idxd_cdev_dev_release, +}; + +static inline struct idxd_cdev *inode_idxd_cdev(struct inode *inode) +{ + struct cdev *cdev = inode->i_cdev; + + return container_of(cdev, struct idxd_cdev, cdev); +} + +static inline struct idxd_wq *inode_wq(struct inode *inode) +{ + struct idxd_cdev *idxd_cdev = inode_idxd_cdev(inode); + + return idxd_cdev->wq; +} + +static int idxd_cdev_open(struct inode *inode, struct file *filp) +{ + struct idxd_user_context *ctx; + struct idxd_device *idxd; + struct idxd_wq *wq; + struct device *dev; + int rc = 0; + struct iommu_sva *sva; + unsigned int pasid; + + wq = inode_wq(inode); + idxd = wq->idxd; + dev = &idxd->pdev->dev; + + dev_dbg(dev, "%s called: %d\n", __func__, idxd_wq_refcount(wq)); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + mutex_lock(&wq->wq_lock); + + if (idxd_wq_refcount(wq) > 0 && wq_dedicated(wq)) { + rc = -EBUSY; + goto failed; + } + + ctx->wq = wq; + filp->private_data = ctx; + + if (device_user_pasid_enabled(idxd)) { + sva = iommu_sva_bind_device(dev, current->mm, NULL); + if (IS_ERR(sva)) { + rc = PTR_ERR(sva); + dev_err(dev, "pasid allocation failed: %d\n", rc); + goto failed; + } + + pasid = iommu_sva_get_pasid(sva); + if (pasid == IOMMU_PASID_INVALID) { + iommu_sva_unbind_device(sva); + rc = -EINVAL; + goto failed; + } + + ctx->sva = sva; + ctx->pasid = pasid; + + if (wq_dedicated(wq)) { + rc = idxd_wq_set_pasid(wq, pasid); + if (rc < 0) { + iommu_sva_unbind_device(sva); + dev_err(dev, "wq set pasid failed: %d\n", rc); + goto failed; + } + } + } + + idxd_wq_get(wq); + mutex_unlock(&wq->wq_lock); + return 0; + + failed: + mutex_unlock(&wq->wq_lock); + kfree(ctx); + return rc; +} + +static int idxd_cdev_release(struct inode *node, struct file *filep) +{ + struct idxd_user_context *ctx = filep->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int rc; + + dev_dbg(dev, "%s called\n", __func__); + filep->private_data = NULL; + + /* Wait for in-flight operations to complete. 
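+	 * For a shared wq, draining the PASID is sufficient. A dedicated
+	 * wq is drained either by the disable issued while clearing its
+	 * PASID or by an explicit wq drain.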
*/ + if (wq_shared(wq)) { + idxd_device_drain_pasid(idxd, ctx->pasid); + } else { + if (device_user_pasid_enabled(idxd)) { + /* The wq disable in the disable pasid function will drain the wq */ + rc = idxd_wq_disable_pasid(wq); + if (rc < 0) + dev_err(dev, "wq disable pasid failed.\n"); + } else { + idxd_wq_drain(wq, NULL); + } + } + + if (ctx->sva) + iommu_sva_unbind_device(ctx->sva); + kfree(ctx); + mutex_lock(&wq->wq_lock); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); + return 0; +} + +static int check_vma(struct idxd_wq *wq, struct vm_area_struct *vma, + const char *func) +{ + struct device *dev = &wq->idxd->pdev->dev; + + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + dev_info_ratelimited(dev, + "%s: %s: mapping too large: %lu\n", + current->comm, func, + vma->vm_end - vma->vm_start); + return -EINVAL; + } + + return 0; +} + +static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct idxd_user_context *ctx = filp->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + phys_addr_t base = pci_resource_start(pdev, IDXD_WQ_BAR); + unsigned long pfn; + int rc; + + dev_dbg(&pdev->dev, "%s called\n", __func__); + rc = check_vma(wq, vma, __func__); + if (rc < 0) + return rc; + + vma->vm_flags |= VM_DONTCOPY; + pfn = (base + idxd_get_wq_portal_full_offset(wq->id, + IDXD_PORTAL_LIMITED, IDXD_IRQ_MSIX)) >> PAGE_SHIFT; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_private_data = ctx; + + return io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, + vma->vm_page_prot); +} + +static __poll_t idxd_cdev_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct idxd_user_context *ctx = filp->private_data; + struct idxd_wq *wq = ctx->wq; + struct idxd_device *idxd = wq->idxd; + __poll_t out = 0; + + poll_wait(filp, &wq->err_queue, wait); + spin_lock(&idxd->dev_lock); + if (idxd->sw_err.valid) + out = EPOLLIN | EPOLLRDNORM; + spin_unlock(&idxd->dev_lock); + + return out; +} + +static const struct file_operations idxd_cdev_fops = { + .owner = THIS_MODULE, + .open = idxd_cdev_open, + .release = idxd_cdev_release, + .mmap = idxd_cdev_mmap, + .poll = idxd_cdev_poll, +}; + +int idxd_cdev_get_major(struct idxd_device *idxd) +{ + return MAJOR(ictx[idxd->data->type].devt); +} + +int idxd_wq_add_cdev(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_cdev *idxd_cdev; + struct cdev *cdev; + struct device *dev; + struct idxd_cdev_context *cdev_ctx; + int rc, minor; + + idxd_cdev = kzalloc(sizeof(*idxd_cdev), GFP_KERNEL); + if (!idxd_cdev) + return -ENOMEM; + + idxd_cdev->idxd_dev.type = IDXD_DEV_CDEV; + idxd_cdev->wq = wq; + cdev = &idxd_cdev->cdev; + dev = cdev_dev(idxd_cdev); + cdev_ctx = &ictx[wq->idxd->data->type]; + minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); + if (minor < 0) { + kfree(idxd_cdev); + return minor; + } + idxd_cdev->minor = minor; + + device_initialize(dev); + dev->parent = wq_confdev(wq); + dev->bus = &dsa_bus_type; + dev->type = &idxd_cdev_device_type; + dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); + + rc = dev_set_name(dev, "%s/wq%u.%u", idxd->data->name_prefix, idxd->id, wq->id); + if (rc < 0) + goto err; + + wq->idxd_cdev = idxd_cdev; + cdev_init(cdev, &idxd_cdev_fops); + rc = cdev_device_add(cdev, dev); + if (rc) { + dev_dbg(&wq->idxd->pdev->dev, "cdev_add failed: %d\n", rc); + goto err; + } + + return 0; + + err: + put_device(dev); + wq->idxd_cdev = NULL; + return rc; +} + +void 
idxd_wq_del_cdev(struct idxd_wq *wq) +{ + struct idxd_cdev *idxd_cdev; + + idxd_cdev = wq->idxd_cdev; + wq->idxd_cdev = NULL; + cdev_device_del(&idxd_cdev->cdev, cdev_dev(idxd_cdev)); + put_device(cdev_dev(idxd_cdev)); +} + +static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + mutex_lock(&wq->wq_lock); + if (!idxd_wq_driver_name_match(wq, dev)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_DRV_NAME; + rc = -ENODEV; + goto err_drv_name; + } + + wq->type = IDXD_WQT_USER; + rc = __drv_enable_wq(wq); + if (rc < 0) + goto err; + + rc = idxd_wq_add_cdev(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_CDEV_ERR; + goto err_cdev; + } + + idxd->cmd_status = 0; + mutex_unlock(&wq->wq_lock); + return 0; + +err_cdev: + __drv_disable_wq(wq); +err: +err_drv_name: + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); + return rc; +} + +static void idxd_user_drv_remove(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + + mutex_lock(&wq->wq_lock); + idxd_wq_del_cdev(wq); + __drv_disable_wq(wq); + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_user_drv = { + .probe = idxd_user_drv_probe, + .remove = idxd_user_drv_remove, + .name = "user", + .type = dev_types, +}; +EXPORT_SYMBOL_GPL(idxd_user_drv); + +int idxd_cdev_register(void) +{ + int rc, i; + + for (i = 0; i < IDXD_TYPE_MAX; i++) { + ida_init(&ictx[i].minor_ida); + rc = alloc_chrdev_region(&ictx[i].devt, 0, MINORMASK, + ictx[i].name); + if (rc) + return rc; + } + + return 0; +} + +void idxd_cdev_remove(void) +{ + int i; + + for (i = 0; i < IDXD_TYPE_MAX; i++) { + unregister_chrdev_region(ictx[i].devt, MINORMASK); + ida_destroy(&ictx[i].minor_ida); + } +} diff --git a/drivers/dma/idxd/compat.c b/drivers/dma/idxd/compat.c new file mode 100644 index 0000000000000000000000000000000000000000..41c4975cd623ea307a8776de215dc485f6da724e --- /dev/null +++ b/drivers/dma/idxd/compat.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. 
*/ +#include +#include +#include +#include +#include "idxd.h" + +extern int device_driver_attach(struct device_driver *drv, struct device *dev); +extern void device_driver_detach(struct device *dev); + +#define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ + struct driver_attribute driver_attr_##_name = \ + __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) + +static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) +{ + struct bus_type *bus = drv->bus; + struct device *dev; + int rc = -ENODEV; + + dev = bus_find_device_by_name(bus, NULL, buf); + if (dev && dev->driver) { + device_driver_detach(dev); + rc = count; + } + + return rc; +} +static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); + +static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) +{ + struct bus_type *bus = drv->bus; + struct device *dev; + struct device_driver *alt_drv = NULL; + int rc = -ENODEV; + struct idxd_dev *idxd_dev; + + dev = bus_find_device_by_name(bus, NULL, buf); + if (!dev || dev->driver || drv != &dsa_drv.drv) + return -ENODEV; + + idxd_dev = confdev_to_idxd_dev(dev); + if (is_idxd_dev(idxd_dev)) { + alt_drv = driver_find("idxd", bus); + } else if (is_idxd_wq_dev(idxd_dev)) { + struct idxd_wq *wq = confdev_to_wq(dev); + + if (is_idxd_wq_kernel(wq)) + alt_drv = driver_find("dmaengine", bus); + else if (is_idxd_wq_user(wq)) + alt_drv = driver_find("user", bus); + } + if (!alt_drv) + return -ENODEV; + + rc = device_driver_attach(alt_drv, dev); + if (rc < 0) + return rc; + + return count; +} +static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); + +static struct attribute *dsa_drv_compat_attrs[] = { + &driver_attr_bind.attr, + &driver_attr_unbind.attr, + NULL, +}; + +static const struct attribute_group dsa_drv_compat_attr_group = { + .attrs = dsa_drv_compat_attrs, +}; + +static const struct attribute_group *dsa_drv_compat_groups[] = { + &dsa_drv_compat_attr_group, + NULL, +}; + +static int idxd_dsa_drv_probe(struct idxd_dev *idxd_dev) +{ + return -ENODEV; +} + +static void idxd_dsa_drv_remove(struct idxd_dev *idxd_dev) +{ +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_NONE, +}; + +struct idxd_device_driver dsa_drv = { + .name = "dsa", + .probe = idxd_dsa_drv_probe, + .remove = idxd_dsa_drv_remove, + .type = dev_types, + .drv = { + .suppress_bind_attrs = true, + .groups = dsa_drv_compat_groups, + }, +}; + +module_idxd_driver(dsa_drv); +MODULE_IMPORT_NS(IDXD); diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c new file mode 100644 index 0000000000000000000000000000000000000000..328b2a498a570c5d069cc965f4145150ec162ed2 --- /dev/null +++ b/drivers/dma/idxd/device.c @@ -0,0 +1,1632 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../dmaengine.h" +#include "idxd.h" +#include "registers.h" + +static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, + u32 *status); +static void idxd_device_wqs_clear_state(struct idxd_device *idxd); +static void idxd_wq_disable_cleanup(struct idxd_wq *wq); + +/* Interrupt control bits */ +void idxd_unmask_error_interrupts(struct idxd_device *idxd) +{ + union genctrl_reg genctrl; + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.softerr_int_en = 1; + genctrl.halt_int_en = 1; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); +} + +void idxd_mask_error_interrupts(struct idxd_device *idxd) +{ + union genctrl_reg genctrl; + + genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); + genctrl.softerr_int_en = 0; + genctrl.halt_int_en = 0; + iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); +} + +static int alloc_desc_batch(struct idxd_wq *wq, struct idxd_desc *desc) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + struct idxd_batch *batch; + unsigned int size, cr_size, num; + + batch = kzalloc_node(sizeof(*batch), GFP_KERNEL, dev_to_node(dev)); + if (!batch) + return -ENOMEM; + + num = wq->max_batch_size; + size = num * sizeof(struct dsa_hw_desc); + batch->descs = dma_alloc_coherent(dev, size, &batch->dma_descs, GFP_KERNEL); + if (!batch->descs) + goto descs_err; + + cr_size = num * idxd->data->compl_size; + batch->crs = dma_alloc_coherent(dev, cr_size, &batch->dma_crs, GFP_KERNEL); + if (!batch->crs) + goto crs_err; + + desc->batch = batch; + + return 0; + +crs_err: + dma_free_coherent(dev, size, batch->descs, batch->dma_descs); +descs_err: + kfree(batch); + dev_warn(dev, "Unable to allocate memory, consider lowering max batch size.\n"); + return -ENOMEM; +} + +static void free_desc_batch(struct idxd_wq *wq, struct idxd_desc *desc) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + unsigned int size, cr_size, num; + struct idxd_batch *batch; + + batch = desc->batch; + if (!batch) + return; + + num = wq->max_batch_size; + size = num * sizeof(struct dsa_hw_desc); + cr_size = num * idxd->data->compl_size; + dma_free_coherent(dev, size, batch->descs, batch->dma_descs); + dma_free_coherent(dev, cr_size, batch->crs, batch->dma_crs); + kfree(batch); +} + +static void free_hw_descs(struct idxd_wq *wq) +{ + int i; + + for (i = 0; i < wq->num_descs; i++) + kfree(wq->hw_descs[i]); + + kfree(wq->hw_descs); +} + +static int alloc_hw_descs(struct idxd_wq *wq, int num) +{ + struct device *dev = &wq->idxd->pdev->dev; + int i; + int node = dev_to_node(dev); + + wq->hw_descs = kcalloc_node(num, sizeof(struct dsa_hw_desc *), + GFP_KERNEL, node); + if (!wq->hw_descs) + return -ENOMEM; + + for (i = 0; i < num; i++) { + wq->hw_descs[i] = kzalloc_node(sizeof(*wq->hw_descs[i]), + GFP_KERNEL, node); + if (!wq->hw_descs[i]) { + free_hw_descs(wq); + return -ENOMEM; + } + } + + return 0; +} + +static void free_descs(struct idxd_wq *wq) +{ + int i; + + for (i = 0; i < wq->num_descs; i++) { + free_desc_batch(wq, wq->descs[i]); + kfree(wq->descs[i]); + } + + kfree(wq->descs); +} + +static int alloc_descs(struct idxd_wq *wq, int num) +{ + struct device *dev = &wq->idxd->pdev->dev; + int i; + int node = dev_to_node(dev); + + wq->descs = kcalloc_node(num, sizeof(struct idxd_desc *), + GFP_KERNEL, node); + if (!wq->descs) + return -ENOMEM; + + for (i = 0; i < num; i++) { + 
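+		/* one idxd_desc per wq descriptor, on the device's NUMA node */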
wq->descs[i] = kzalloc_node(sizeof(*wq->descs[i]), + GFP_KERNEL, node); + if (!wq->descs[i]) { + free_descs(wq); + return -ENOMEM; + } + } + + return 0; +} + +/* WQ control bits */ +int idxd_wq_alloc_resources(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int rc, num_descs, i; + + if (wq->type != IDXD_WQT_KERNEL) + return 0; + + num_descs = wq_dedicated(wq) ? wq->size : wq->threshold; + wq->num_descs = num_descs; + + rc = alloc_hw_descs(wq, num_descs); + if (rc < 0) + return rc; + + wq->compls_size = num_descs * idxd->data->compl_size; + wq->compls = dma_alloc_coherent(dev, wq->compls_size, &wq->compls_addr, GFP_KERNEL); + if (!wq->compls) { + rc = -ENOMEM; + goto fail_alloc_compls; + } + + rc = alloc_descs(wq, num_descs); + if (rc < 0) + goto fail_alloc_descs; + + rc = sbitmap_queue_init_node(&wq->sbq, num_descs, -1, false, GFP_KERNEL, + dev_to_node(dev)); + if (rc < 0) + goto fail_sbitmap_init; + + for (i = 0; i < num_descs; i++) { + struct idxd_desc *desc = wq->descs[i]; + + desc->hw = wq->hw_descs[i]; + if (idxd->data->type == IDXD_TYPE_DSA) { + desc->completion = &wq->compls[i]; + /* pre-allocate batch for descriptor */ + if (alloc_desc_batch(wq, desc)) + goto fail_sbitmap_init; + } else if (idxd->data->type == IDXD_TYPE_IAX) + desc->iax_completion = &wq->iax_compls[i]; + desc->compl_dma = wq->compls_addr + idxd->data->compl_size * i; + desc->id = i; + desc->gen = 1; + desc->wq = wq; + desc->cpu = -1; + } + + return 0; + + fail_sbitmap_init: + free_descs(wq); + fail_alloc_descs: + dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + fail_alloc_compls: + free_hw_descs(wq); + return rc; +} +EXPORT_SYMBOL_GPL(idxd_wq_alloc_resources); + +void idxd_wq_free_resources(struct idxd_wq *wq) +{ + struct device *dev = &wq->idxd->pdev->dev; + + if (wq->type != IDXD_WQT_KERNEL) + return; + + free_hw_descs(wq); + free_descs(wq); + dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + sbitmap_queue_free(&wq->sbq); +} +EXPORT_SYMBOL_GPL(idxd_wq_free_resources); + +int idxd_wq_enable(struct idxd_wq *wq, u32 *status) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 stat; + + if (status) + *status = 0; + + if (wq->state == IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d already enabled\n", wq->id); + return 0; + } + + idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_WQ, wq->id, &stat); + + if (status) + *status = stat; + + if (stat != IDXD_CMDSTS_SUCCESS && + stat != IDXD_CMDSTS_ERR_WQ_ENABLED) { + dev_dbg(dev, "WQ enable failed: %#x\n", stat); + return -ENXIO; + } + + wq->state = IDXD_WQ_ENABLED; + dev_dbg(dev, "WQ %d enabled\n", wq->id); + return 0; +} +EXPORT_SYMBOL_GPL(idxd_wq_enable); + +int idxd_wq_disable(struct idxd_wq *wq, bool reset_config, u32 *status) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 stat, operand; + + dev_dbg(dev, "Disabling WQ %d\n", wq->id); + if (status) + *status = 0; + + /* + * When the wq is in LOCKED state, it means it is disabled but + * also at the same time is "enabled" as far as the user is + * concerned. So a call to disable the hardware can be + * skipped. 
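+	 * The software state update below still runs so the wq
+	 * bookkeeping stays consistent.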
+ */ + if (wq->state == IDXD_WQ_LOCKED) + goto out; + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return 0; + } + + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + idxd_cmd_exec(idxd, IDXD_CMD_DISABLE_WQ, operand, &stat); + + if (status) + *status = stat; + + if (stat != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "WQ disable failed: %#x\n", stat); + return -ENXIO; + } + +out: + if (wq_dedicated(wq) && is_idxd_wq_mdev(wq)) { + wq->state = IDXD_WQ_LOCKED; + } else { + if (reset_config && test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + idxd_wq_disable_cleanup(wq); + wq->state = IDXD_WQ_DISABLED; + } + dev_dbg(dev, "WQ %d disabled\n", wq->id); + return 0; +} +EXPORT_SYMBOL_GPL(idxd_wq_disable); + +int idxd_wq_drain(struct idxd_wq *wq, u32 *status) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 operand, stat; + + if (status) + *status = 0; + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return 0; + } + + dev_dbg(dev, "Draining WQ %d\n", wq->id); + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_WQ, operand, &stat); + + if (status) + *status = stat; + + if (stat != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "WQ drain failed: %#x\n", stat); + return -ENXIO; + } + + dev_dbg(dev, "WQ %d drained\n", wq->id); + return 0; +} +EXPORT_SYMBOL_GPL(idxd_wq_drain); + +void idxd_wq_reset(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 operand; + + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d in wrong state: %d\n", wq->id, wq->state); + return; + } + + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + idxd_cmd_exec(idxd, IDXD_CMD_RESET_WQ, operand, NULL); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + idxd_wq_disable_cleanup(wq); + wq->state = IDXD_WQ_DISABLED; +} +EXPORT_SYMBOL_GPL(idxd_wq_reset); + +int idxd_wq_map_portal(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + resource_size_t start; + + start = pci_resource_start(pdev, IDXD_WQ_BAR); + start += idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED, IDXD_IRQ_MSIX); + + wq->portal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); + if (!wq->portal) + return -ENOMEM; + + return 0; +} + +void idxd_wq_unmap_portal(struct idxd_wq *wq) +{ + struct device *dev = &wq->idxd->pdev->dev; + + devm_iounmap(dev, wq->portal); + wq->portal = NULL; + wq->portal_offset = 0; +} + +void idxd_wqs_unmap_portal(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (wq->portal) + idxd_wq_unmap_portal(wq); + } +} + +int idxd_wq_abort(struct idxd_wq *wq, u32 *status) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 operand, stat; + + dev_dbg(dev, "Abort WQ %d\n", wq->id); + if (wq->state != IDXD_WQ_ENABLED) { + dev_dbg(dev, "WQ %d not active\n", wq->id); + return -ENXIO; + } + + operand = BIT(wq->id % 16) | ((wq->id / 16) << 16); + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_ABORT_WQ, operand); + idxd_cmd_exec(idxd, IDXD_CMD_ABORT_WQ, operand, &stat); + + if (status) + *status = stat; + + if (stat != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "WQ abort failed: %#x\n", stat); + return -ENXIO; + } + + dev_dbg(dev, "WQ %d aborted\n", wq->id); + return 0; +} +EXPORT_SYMBOL_GPL(idxd_wq_abort); + +static 
void __idxd_wq_set_priv_locked(struct idxd_wq *wq, int priv) +{ + struct idxd_device *idxd = wq->idxd; + union wqcfg wqcfg; + unsigned int offset; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PRIV_IDX); + spin_lock(&idxd->dev_lock); + wqcfg.bits[WQCFG_PRIV_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.priv = priv; + wq->wqcfg->bits[WQCFG_PRIV_IDX] = wqcfg.bits[WQCFG_PRIV_IDX]; + iowrite32(wqcfg.bits[WQCFG_PRIV_IDX], idxd->reg_base + offset); + spin_unlock(&idxd->dev_lock); +} + +static void __idxd_wq_set_pasid_locked(struct idxd_wq *wq, int pasid) +{ + struct idxd_device *idxd = wq->idxd; + union wqcfg wqcfg; + unsigned int offset; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + spin_lock(&idxd->dev_lock); + wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.pasid_en = 1; + wqcfg.pasid = pasid; + wq->wqcfg->bits[WQCFG_PASID_IDX] = wqcfg.bits[WQCFG_PASID_IDX]; + iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); + spin_unlock(&idxd->dev_lock); +} + +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) +{ + int rc; + + rc = idxd_wq_disable(wq, false, NULL); + if (rc < 0) + return rc; + + __idxd_wq_set_pasid_locked(wq, pasid); + + rc = idxd_wq_enable(wq, NULL); + if (rc < 0) + return rc; + + return 0; +} +EXPORT_SYMBOL_GPL(idxd_wq_set_pasid); + +int idxd_wq_disable_pasid(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + int rc; + union wqcfg wqcfg; + unsigned int offset; + + rc = idxd_wq_disable(wq, false, NULL); + if (rc < 0) + return rc; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + spin_lock(&idxd->dev_lock); + wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.pasid_en = 0; + wqcfg.pasid = 0; + iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); + spin_unlock(&idxd->dev_lock); + + rc = idxd_wq_enable(wq, NULL); + if (rc < 0) + return rc; + + return 0; +} +EXPORT_SYMBOL_GPL(idxd_wq_disable_pasid); + +static void idxd_wq_disable_cleanup(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + + lockdep_assert_held(&wq->wq_lock); + memset(wq->wqcfg, 0, idxd->wqcfg_size); + wq->type = IDXD_WQT_NONE; + wq->threshold = 0; + wq->priority = 0; + wq->ats_dis = 0; + wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; + clear_bit(WQ_FLAG_DEDICATED, &wq->flags); + clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + memset(wq->name, 0, WQ_NAME_SIZE); + memset(wq->driver_name, 0, WQ_NAME_SIZE); + wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; + idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH); +} + +static void idxd_wq_device_reset_cleanup(struct idxd_wq *wq) +{ + lockdep_assert_held(&wq->wq_lock); + + wq->size = 0; + wq->group = NULL; +} + +static void idxd_wq_ref_release(struct percpu_ref *ref) +{ + struct idxd_wq *wq = container_of(ref, struct idxd_wq, wq_active); + + complete(&wq->wq_dead); +} + +int idxd_wq_init_percpu_ref(struct idxd_wq *wq) +{ + int rc; + + memset(&wq->wq_active, 0, sizeof(wq->wq_active)); + rc = percpu_ref_init(&wq->wq_active, idxd_wq_ref_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (rc < 0) + return rc; + reinit_completion(&wq->wq_dead); + reinit_completion(&wq->wq_resurrect); + return 0; +} +EXPORT_SYMBOL_GPL(idxd_wq_init_percpu_ref); + +void __idxd_wq_quiesce(struct idxd_wq *wq) +{ + lockdep_assert_held(&wq->wq_lock); + reinit_completion(&wq->wq_resurrect); + percpu_ref_kill(&wq->wq_active); + complete_all(&wq->wq_resurrect); + wait_for_completion(&wq->wq_dead); +} +EXPORT_SYMBOL_GPL(__idxd_wq_quiesce); + +void idxd_wq_quiesce(struct 
idxd_wq *wq) +{ + mutex_lock(&wq->wq_lock); + __idxd_wq_quiesce(wq); + mutex_unlock(&wq->wq_lock); +} +EXPORT_SYMBOL_GPL(idxd_wq_quiesce); + +void idxd_wq_setup_pasid(struct idxd_wq *wq, int pasid) +{ + struct idxd_device *idxd = wq->idxd; + int offset; + + lockdep_assert_held(&idxd->dev_lock); + + /* PASID fields are 8 bytes into the WQCFG register */ + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + wq->wqcfg->pasid = pasid; + iowrite32(wq->wqcfg->bits[WQCFG_PASID_IDX], idxd->reg_base + offset); +} +EXPORT_SYMBOL_GPL(idxd_wq_setup_pasid); + +void idxd_wq_setup_priv(struct idxd_wq *wq, int priv) +{ + struct idxd_device *idxd = wq->idxd; + int offset; + + lockdep_assert_held(&idxd->dev_lock); + + /* priv field is 8 bytes into the WQCFG register */ + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PRIV_IDX); + wq->wqcfg->priv = !!priv; + iowrite32(wq->wqcfg->bits[WQCFG_PRIV_IDX], idxd->reg_base + offset); +} +EXPORT_SYMBOL_GPL(idxd_wq_setup_priv); + +/* Device control bits */ +static inline bool idxd_is_enabled(struct idxd_device *idxd) +{ + union gensts_reg gensts; + + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + + if (gensts.state == IDXD_DEVICE_STATE_ENABLED) + return true; + return false; +} + +static inline bool idxd_device_is_halted(struct idxd_device *idxd) +{ + union gensts_reg gensts; + + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + + return (gensts.state == IDXD_DEVICE_STATE_HALT); +} + +/* + * This is function is only used for reset during probe and will + * poll for completion. Once the device is setup with interrupts, + * all commands will be done via interrupt completion. + */ +int idxd_device_init_reset(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + union idxd_command_reg cmd; + + if (idxd_device_is_halted(idxd)) { + dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); + return -ENXIO; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = IDXD_CMD_RESET_DEVICE; + dev_dbg(dev, "%s: sending reset for init.\n", __func__); + spin_lock(&idxd->cmd_lock); + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & + IDXD_CMDSTS_ACTIVE) + cpu_relax(); + spin_unlock(&idxd->cmd_lock); + return 0; +} + +static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, + u32 *status) +{ + union idxd_command_reg cmd; + DECLARE_COMPLETION_ONSTACK(done); + u32 stat; + + if (idxd_device_is_halted(idxd)) { + dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); + if (status) + *status = IDXD_CMDSTS_HW_ERR; + return; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = cmd_code; + cmd.operand = operand; + cmd.int_req = 1; + + spin_lock(&idxd->cmd_lock); + wait_event_lock_irq(idxd->cmd_waitq, + !test_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags), + idxd->cmd_lock); + + dev_dbg(&idxd->pdev->dev, "%s: sending cmd: %#x op: %#x\n", + __func__, cmd_code, operand); + + idxd->cmd_status = 0; + __set_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); + idxd->cmd_done = &done; + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + /* + * After command submitted, release lock and go to sleep until + * the command completes via interrupt. 
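+	 * IDXD_FLAG_CMD_RUNNING keeps other submitters parked on
+	 * cmd_waitq; once the interrupt handler completes 'done', the
+	 * flag is cleared and the waiters are woken below.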
+ */ + spin_unlock(&idxd->cmd_lock); + wait_for_completion(&done); + stat = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + spin_lock(&idxd->cmd_lock); + if (status) + *status = stat; + idxd->cmd_status = stat & GENMASK(7, 0); + + __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); + /* Wake up other pending commands */ + wake_up(&idxd->cmd_waitq); + spin_unlock(&idxd->cmd_lock); +} + +int idxd_device_enable(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + u32 status; + + if (idxd_is_enabled(idxd)) { + dev_dbg(dev, "Device already enabled\n"); + return -ENXIO; + } + + idxd_cmd_exec(idxd, IDXD_CMD_ENABLE_DEVICE, 0, &status); + + /* If the command is successful or if the device was enabled */ + if (status != IDXD_CMDSTS_SUCCESS && + status != IDXD_CMDSTS_ERR_DEV_ENABLED) { + dev_dbg(dev, "%s: err_code: %#x\n", __func__, status); + return -ENXIO; + } + + idxd->state = IDXD_DEV_ENABLED; + return 0; +} + +int idxd_device_disable(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + u32 status; + + if (!idxd_is_enabled(idxd)) { + dev_dbg(dev, "Device is not enabled\n"); + return 0; + } + + idxd_cmd_exec(idxd, IDXD_CMD_DISABLE_DEVICE, 0, &status); + + /* If the command is successful or if the device was disabled */ + if (status != IDXD_CMDSTS_SUCCESS && + !(status & IDXD_CMDSTS_ERR_DIS_DEV_EN)) { + dev_dbg(dev, "%s: err_code: %#x\n", __func__, status); + return -ENXIO; + } + + spin_lock(&idxd->dev_lock); + idxd_device_clear_state(idxd); + idxd->state = IDXD_DEV_DISABLED; + spin_unlock(&idxd->dev_lock); + return 0; +} + +void idxd_device_reset(struct idxd_device *idxd) +{ + idxd_cmd_exec(idxd, IDXD_CMD_RESET_DEVICE, 0, NULL); + spin_lock(&idxd->dev_lock); + idxd_device_clear_state(idxd); + idxd->state = IDXD_DEV_DISABLED; + idxd_unmask_error_interrupts(idxd); + spin_unlock(&idxd->dev_lock); +} + +void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand; + + operand = pasid; + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_DRAIN_PASID, operand); + idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_PASID, operand, NULL); + dev_dbg(dev, "pasid %d drained\n", pasid); +} +EXPORT_SYMBOL_GPL(idxd_device_drain_pasid); + +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, + enum idxd_interrupt_type irq_type) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand, status; + + if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE))) + return -EOPNOTSUPP; + + dev_dbg(dev, "get int handle, idx %d\n", idx); + + operand = idx & GENMASK(15, 0); + if (irq_type == IDXD_IRQ_IMS) + operand |= CMD_INT_HANDLE_IMS; + + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_REQUEST_INT_HANDLE, operand); + + idxd_cmd_exec(idxd, IDXD_CMD_REQUEST_INT_HANDLE, operand, &status); + + if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "request int handle failed: %#x\n", status); + return -ENXIO; + } + + *handle = (status >> IDXD_CMDSTS_RES_SHIFT) & GENMASK(15, 0); + + dev_dbg(dev, "int handle acquired: %u\n", *handle); + return 0; +} + +int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, + enum idxd_interrupt_type irq_type) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand, status; + union idxd_command_reg cmd; + + if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE))) + return -EOPNOTSUPP; + + dev_dbg(dev, "release int handle, handle %d\n", handle); + + memset(&cmd, 0, sizeof(cmd)); + operand = handle & GENMASK(15, 0); + + if (irq_type == 
IDXD_IRQ_IMS) + operand |= CMD_INT_HANDLE_IMS; + + cmd.cmd = IDXD_CMD_RELEASE_INT_HANDLE; + cmd.operand = operand; + + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_RELEASE_INT_HANDLE, operand); + + spin_lock(&idxd->cmd_lock); + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) + cpu_relax(); + status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + spin_unlock(&idxd->cmd_lock); + + if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "release int handle failed: %#x\n", status); + return -ENXIO; + } + + dev_dbg(dev, "int handle released.\n"); + return 0; +} + +/* Device configuration bits */ +static void idxd_engines_clear_state(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_engines; i++) { + engine = idxd->engines[i]; + engine->group = NULL; + } +} + +static void idxd_groups_clear_state(struct idxd_device *idxd) +{ + struct idxd_group *group; + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + memset(&group->grpcfg, 0, sizeof(group->grpcfg)); + group->num_engines = 0; + group->num_wqs = 0; + group->use_rdbuf_limit = false; + group->rdbufs_allowed = 0; + group->rdbufs_reserved = 0; + if (idxd->hw.version < DEVICE_VERSION_2 && !tc_override) { + group->tc_a = 1; + group->tc_b = 1; + } else { + group->tc_a = -1; + group->tc_b = -1; + } + } +} + +static void idxd_device_wqs_clear_state(struct idxd_device *idxd) +{ + int i; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + idxd_wq_disable_cleanup(wq); + idxd_wq_device_reset_cleanup(wq); + } +} + +void idxd_device_clear_state(struct idxd_device *idxd) +{ + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return; + + idxd_groups_clear_state(idxd); + idxd_engines_clear_state(idxd); + idxd_device_wqs_clear_state(idxd); +} + +static void idxd_group_config_write(struct idxd_group *group) +{ + struct idxd_device *idxd = group->idxd; + struct device *dev = &idxd->pdev->dev; + int i; + u32 grpcfg_offset; + + dev_dbg(dev, "Writing group %d cfg registers\n", group->id); + + /* setup GRPWQCFG */ + for (i = 0; i < GRPWQCFG_STRIDES; i++) { + grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i); + iowrite64(group->grpcfg.wqs[i], idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", + group->id, i, grpcfg_offset, + ioread64(idxd->reg_base + grpcfg_offset)); + } + + /* setup GRPENGCFG */ + grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id); + iowrite64(group->grpcfg.engines, idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, + grpcfg_offset, ioread64(idxd->reg_base + grpcfg_offset)); + + /* setup GRPFLAGS */ + grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id); + iowrite32(group->grpcfg.flags.bits, idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n", + group->id, grpcfg_offset, + ioread32(idxd->reg_base + grpcfg_offset)); +} + +static int idxd_groups_config_write(struct idxd_device *idxd) + +{ + union gencfg_reg reg; + int i; + struct device *dev = &idxd->pdev->dev; + + /* Setup bandwidth rdbuf limit */ + if (idxd->hw.gen_cap.config_en && idxd->rdbuf_limit) { + reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + reg.rdbuf_limit = idxd->rdbuf_limit; + iowrite32(reg.bits, idxd->reg_base + IDXD_GENCFG_OFFSET); + } + 
+ dev_dbg(dev, "GENCFG(%#x): %#x\n", IDXD_GENCFG_OFFSET, + ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET)); + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + idxd_group_config_write(group); + } + + return 0; +} + +static bool idxd_device_pasid_priv_enabled(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + if (pdev->pasid_enabled && (pdev->pasid_features & PCI_PASID_CAP_PRIV)) + return true; + return false; +} + +static int idxd_wq_config_write(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + u32 wq_offset; + int i; + + if (!wq->group) + return 0; + + /* + * Instead of memset the entire shadow copy of WQCFG, copy from the hardware after + * wq reset. This will copy back the sticky values that are present on some devices. + */ + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wq_offset = WQCFG_OFFSET(idxd, wq->id, i); + wq->wqcfg->bits[i] |= ioread32(idxd->reg_base + wq_offset); + } + + if (wq->size == 0 && wq->type != IDXD_WQT_NONE) + wq->size = WQ_DEFAULT_QUEUE_DEPTH; + + /* byte 0-3 */ + wq->wqcfg->wq_size = wq->size; + + /* bytes 4-7 */ + wq->wqcfg->wq_thresh = wq->threshold; + + /* byte 8-11 */ + if (wq_dedicated(wq)) + wq->wqcfg->mode = 1; + + /* + * Here the priv bit is set depending on the WQ type. priv = 1 if the + * WQ type is kernel to indicate privileged access. This setting only + * matters for dedicated WQ. According to the DSA spec: + * If the WQ is in dedicated mode, WQ PASID Enable is 1, and the + * Privileged Mode Enable field of the PCI Express PASID capability + * is 0, this field must be 0. + * + * In the case of a dedicated kernel WQ that is not able to support + * the PASID cap, then the configuration will be rejected. 
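+ *
+ * Concretely, the check below returns -EOPNOTSUPP for a dedicated kernel
+ * WQ that has pasid_en set while the PCIe PASID capability does not
+ * report Privileged Mode Enable (see idxd_device_pasid_priv_enabled()).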
+ */ + wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL); + if (wq_dedicated(wq) && wq->wqcfg->pasid_en && + !idxd_device_pasid_priv_enabled(idxd) && + wq->type == IDXD_WQT_KERNEL) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_PRIV; + return -EOPNOTSUPP; + } + + wq->wqcfg->priority = wq->priority; + + if (idxd->hw.gen_cap.block_on_fault && + test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)) + wq->wqcfg->bof = 1; + + if (idxd->hw.wq_cap.wq_ats_support) + wq->wqcfg->wq_ats_disable = wq->ats_dis; + + /* bytes 12-15 */ + wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); + idxd_wqcfg_set_max_batch_shift(idxd->data->type, wq->wqcfg, ilog2(wq->max_batch_size)); + + dev_dbg(dev, "WQ %d CFGs\n", wq->id); + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wq_offset = WQCFG_OFFSET(idxd, wq->id, i); + iowrite32(wq->wqcfg->bits[i], idxd->reg_base + wq_offset); + dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", + wq->id, i, wq_offset, + ioread32(idxd->reg_base + wq_offset)); + } + + return 0; +} + +static int idxd_wqs_config_write(struct idxd_device *idxd) +{ + int i, rc; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + rc = idxd_wq_config_write(wq); + if (rc < 0) + return rc; + } + + return 0; +} + +static void idxd_group_flags_setup(struct idxd_device *idxd) +{ + int i; + + /* TC-A 0 and TC-B 1 should be defaults */ + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + if (group->tc_a == -1) + group->tc_a = group->grpcfg.flags.tc_a = 0; + else + group->grpcfg.flags.tc_a = group->tc_a; + if (group->tc_b == -1) + group->tc_b = group->grpcfg.flags.tc_b = 1; + else + group->grpcfg.flags.tc_b = group->tc_b; + group->grpcfg.flags.use_rdbuf_limit = group->use_rdbuf_limit; + group->grpcfg.flags.rdbufs_reserved = group->rdbufs_reserved; + if (group->rdbufs_allowed) + group->grpcfg.flags.rdbufs_allowed = group->rdbufs_allowed; + else + group->grpcfg.flags.rdbufs_allowed = idxd->max_rdbufs; + } +} + +static int idxd_engines_setup(struct idxd_device *idxd) +{ + int i, engines = 0; + struct idxd_engine *eng; + struct idxd_group *group; + + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + group->grpcfg.engines = 0; + } + + for (i = 0; i < idxd->max_engines; i++) { + eng = idxd->engines[i]; + group = eng->group; + + if (!group) + continue; + + group->grpcfg.engines |= BIT(eng->id); + engines++; + } + + if (!engines) + return -EINVAL; + + return 0; +} + +static int idxd_wqs_setup(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + struct idxd_group *group; + int i, j, configured = 0; + struct device *dev = &idxd->pdev->dev; + + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + for (j = 0; j < 4; j++) + group->grpcfg.wqs[j] = 0; + } + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + group = wq->group; + + if (!wq->group) + continue; + + if (wq_shared(wq) && !device_swq_supported(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SWQ_SUPPORT; + dev_warn(dev, "No shared wq support but configured.\n"); + return -EINVAL; + } + + group->grpcfg.wqs[wq->id / 64] |= BIT(wq->id % 64); + configured++; + } + + if (configured == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NONE_CONFIGURED; + return -EINVAL; + } + + return 0; +} + +int idxd_device_config(struct idxd_device *idxd) +{ + int rc; + + lockdep_assert_held(&idxd->dev_lock); + rc = idxd_wqs_setup(idxd); + if (rc < 0) + return rc; + + rc = idxd_engines_setup(idxd); + if (rc < 0) + return rc; + + idxd_group_flags_setup(idxd); + + rc = idxd_wqs_config_write(idxd); + if (rc < 0) 
+ return rc; + + rc = idxd_groups_config_write(idxd); + if (rc < 0) + return rc; + + return 0; +} + +static int idxd_wq_load_config(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int wqcfg_offset; + int i; + + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, 0); + memcpy_fromio(wq->wqcfg, idxd->reg_base + wqcfg_offset, idxd->wqcfg_size); + + wq->size = wq->wqcfg->wq_size; + wq->threshold = wq->wqcfg->wq_thresh; + + if (wq->wqcfg->mode) + set_bit(WQ_FLAG_DEDICATED, &wq->flags); + + wq->priority = wq->wqcfg->priority; + + if (wq->wqcfg->bof) + set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + + if (wq->wqcfg->mode_support) + set_bit(WQ_FLAG_MODE_1, &wq->flags); + + wq->max_xfer_bytes = 1ULL << wq->wqcfg->max_xfer_shift; + idxd_wq_set_max_batch_size(idxd->data->type, wq, 1U << wq->wqcfg->max_batch_shift); + + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i); + dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wqcfg_offset, wq->wqcfg->bits[i]); + } + + return 0; +} + +static void idxd_group_load_config(struct idxd_group *group) +{ + struct idxd_device *idxd = group->idxd; + struct device *dev = &idxd->pdev->dev; + int i, j, grpcfg_offset; + + /* + * Load WQS bit fields + * Iterate through all 256 bits 64 bits at a time + */ + for (i = 0; i < GRPWQCFG_STRIDES; i++) { + struct idxd_wq *wq; + + grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i); + group->grpcfg.wqs[i] = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", + group->id, i, grpcfg_offset, group->grpcfg.wqs[i]); + + if (i * 64 >= idxd->max_wqs) + break; + + /* Iterate through all 64 bits and check for wq set */ + for (j = 0; j < 64; j++) { + int id = i * 64 + j; + + /* No need to check beyond max wqs */ + if (id >= idxd->max_wqs) + break; + + /* Set group assignment for wq if wq bit is set */ + if (group->grpcfg.wqs[i] & BIT(j)) { + wq = idxd->wqs[id]; + wq->group = group; + } + } + } + + grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id); + group->grpcfg.engines = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, + grpcfg_offset, group->grpcfg.engines); + + /* Iterate through all 64 bits to check engines set */ + for (i = 0; i < 64; i++) { + if (i >= idxd->max_engines) + break; + + if (group->grpcfg.engines & BIT(i)) { + struct idxd_engine *engine = idxd->engines[i]; + + engine->group = group; + } + } + + grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id); + group->grpcfg.flags.bits = ioread32(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n", + group->id, grpcfg_offset, group->grpcfg.flags.bits); +} + +int idxd_device_load_config(struct idxd_device *idxd) +{ + union gencfg_reg reg; + int i, rc; + + reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + idxd->rdbuf_limit = reg.rdbuf_limit; + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + idxd_group_load_config(group); + } + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + rc = idxd_wq_load_config(wq); + if (rc < 0) + return rc; + } + + return 0; +} + +static void idxd_flush_pending_descs(struct idxd_irq_entry *ie) +{ + struct idxd_desc *desc, *itr; + struct llist_node *head; + LIST_HEAD(flist); + enum idxd_complete_type ctype; + + spin_lock(&ie->list_lock); + head = llist_del_all(&ie->pending_llist); + if (head) { + llist_for_each_entry_safe(desc, itr, head, llnode) + list_add_tail(&desc->list, 
&ie->work_list); + } + + list_for_each_entry_safe(desc, itr, &ie->work_list, list) + list_move_tail(&desc->list, &flist); + spin_unlock(&ie->list_lock); + + list_for_each_entry_safe(desc, itr, &flist, list) { + list_del(&desc->list); + ctype = desc->completion->status ? IDXD_COMPLETE_NORMAL : IDXD_COMPLETE_ABORT; + idxd_dma_complete_txd(desc, ctype, true); + } +} + +static void idxd_device_set_perm_entry(struct idxd_device *idxd, + struct idxd_irq_entry *ie) +{ + union msix_perm mperm; + + dev_dbg(&idxd->pdev->dev, "set MSIX_PERM entry for idx %d\n", ie->id); + if (ie->pasid == INVALID_IOASID) + return; + + dev_dbg(&idxd->pdev->dev, "pasid %u for MSIX_PERM\n", idxd->pasid); + mperm.bits = 0; + mperm.pasid = ie->pasid; + mperm.pasid_en = 1; + iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + ie->id * 8); +} + +static void idxd_device_clear_perm_entry(struct idxd_device *idxd, + struct idxd_irq_entry *ie) +{ + dev_dbg(&idxd->pdev->dev, "clear MSIX_PERM entry for idx %d\n", ie->id); + iowrite32(0, idxd->reg_base + idxd->msix_perm_offset + ie->id * 8); +} + +void idxd_wq_free_irq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct idxd_irq_entry *ie = &wq->ie; + + synchronize_irq(ie->vector); + free_irq(ie->vector, ie); + idxd_flush_pending_descs(ie); + if (idxd->request_int_handles) + idxd_device_release_int_handle(idxd, ie->int_handle, IDXD_IRQ_MSIX); + idxd_device_clear_perm_entry(idxd, ie); + ie->vector = -1; + ie->int_handle = INVALID_INT_HANDLE; + ie->pasid = INVALID_IOASID; +} + +int idxd_wq_request_irq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + struct idxd_irq_entry *ie; + int rc; + + ie = &wq->ie; + dev_dbg(dev, "wq %d enabling ie %d irq\n", wq->id, ie->id); + + ie->vector = pci_irq_vector(pdev, ie->id); + ie->pasid = device_pasid_enabled(idxd) ?
idxd->pasid : INVALID_IOASID; + idxd_device_set_perm_entry(idxd, ie); + + rc = request_threaded_irq(ie->vector, NULL, idxd_wq_thread, 0, "idxd-portal", ie); + if (rc < 0) { + dev_err(dev, "Failed to request irq %d.\n", ie->vector); + goto err_irq; + } + + if (idxd->request_int_handles) { + rc = idxd_device_request_int_handle(idxd, ie->id, &ie->int_handle, + IDXD_IRQ_MSIX); + if (rc < 0) + goto err_int_handle; + } else { + ie->int_handle = ie->id; + } + + return 0; + +err_int_handle: + ie->int_handle = INVALID_INT_HANDLE; + free_irq(ie->vector, ie); +err_irq: + idxd_device_clear_perm_entry(idxd, ie); + ie->pasid = INVALID_IOASID; + return rc; +} + +int __drv_enable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int rc = -ENXIO; + + lockdep_assert_held(&wq->wq_lock); + + if (idxd->state != IDXD_DEV_ENABLED) { + idxd->cmd_status = IDXD_SCMD_DEV_NOT_ENABLED; + goto err; + } + + if (wq->state != IDXD_WQ_DISABLED) { + dev_dbg(dev, "wq %d already enabled.\n", wq->id); + idxd->cmd_status = IDXD_SCMD_WQ_ENABLED; + rc = -EBUSY; + goto err; + } + + if (!wq->group) { + dev_dbg(dev, "wq %d not attached to group.\n", wq->id); + idxd->cmd_status = IDXD_SCMD_WQ_NO_GRP; + goto err; + } + + if (strlen(wq->name) == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_NAME; + dev_dbg(dev, "wq %d name not set.\n", wq->id); + goto err; + } + + /* Shared WQ checks */ + if (wq_shared(wq)) { + if (!device_swq_supported(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_SVM; + dev_dbg(dev, "PASID not enabled and shared wq.\n"); + goto err; + } + /* + * Shared wq with the threshold set to 0 means the user + * did not set the threshold or transitioned from a + * dedicated wq but did not set threshold. A value + * of 0 would effectively disable the shared wq. The + * driver does not allow a value of 0 to be set for + * threshold via sysfs. + */ + if (wq->threshold == 0) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_THRESH; + dev_dbg(dev, "Shared wq and threshold 0.\n"); + goto err; + } + } + + /* + * In the event that the WQ is configurable for pasid and priv bits. + * For kernel wq, the driver should setup the pasid, pasid_en, and priv bit. + * However, for non-kernel wq, the driver should only set the pasid_en bit for + * shared wq. A dedicated wq will configure pasid and pasid_en later on so + * there is no need to setup. + */ + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags) || + test_bit(WQ_FLAG_MODE_1, &wq->flags)) { + if (is_idxd_wq_kernel(wq)) { + if (device_pasid_enabled(idxd)) { + u32 pasid = wq_dedicated(wq) ? 
idxd->pasid : 0; + + __idxd_wq_set_pasid_locked(wq, pasid); + } + __idxd_wq_set_priv_locked(wq, 1); + } else { + if (device_user_pasid_enabled(idxd) && wq_shared(wq)) + __idxd_wq_set_pasid_locked(wq, 0); + __idxd_wq_set_priv_locked(wq, 0); + } + } + + rc = 0; + spin_lock(&idxd->dev_lock); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); + spin_unlock(&idxd->dev_lock); + if (rc < 0) { + dev_dbg(dev, "Writing wq %d config failed: %d\n", wq->id, rc); + goto err; + } + + rc = idxd_wq_enable(wq, NULL); + if (rc < 0) { + dev_dbg(dev, "wq %d enabling failed: %d\n", wq->id, rc); + goto err; + } + + rc = idxd_wq_map_portal(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_PORTAL_ERR; + dev_dbg(dev, "wq %d portal mapping failed: %d\n", wq->id, rc); + goto err_map_portal; + } + + wq->client_count = 0; + return 0; + +err_map_portal: + rc = idxd_wq_disable(wq, false, NULL); + if (rc < 0) + dev_dbg(dev, "wq %s disable failed\n", dev_name(wq_confdev(wq))); +err: + return rc; +} +EXPORT_SYMBOL_GPL(__drv_enable_wq); + +int drv_enable_wq(struct idxd_wq *wq) +{ + int rc; + + mutex_lock(&wq->wq_lock); + rc = __drv_enable_wq(wq); + mutex_unlock(&wq->wq_lock); + return rc; +} +EXPORT_SYMBOL_GPL(drv_enable_wq); + +void __drv_disable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + + lockdep_assert_held(&wq->wq_lock); + + if (idxd_wq_refcount(wq)) + dev_warn(dev, "Clients has claim on wq %d: %d\n", + wq->id, idxd_wq_refcount(wq)); + + idxd_wq_unmap_portal(wq); + + idxd_wq_drain(wq, NULL); + idxd_wq_reset(wq); + + wq->client_count = 0; +} +EXPORT_SYMBOL_GPL(__drv_disable_wq); + +void drv_disable_wq(struct idxd_wq *wq) +{ + mutex_lock(&wq->wq_lock); + __drv_disable_wq(wq); + mutex_unlock(&wq->wq_lock); +} +EXPORT_SYMBOL_GPL(drv_disable_wq); + +int idxd_device_drv_probe(struct idxd_dev *idxd_dev) +{ + struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); + int rc = 0; + + /* + * Device should be in disabled state for the idxd_drv to load. If it's in + * enabled state, then the device was altered outside of driver's control. + * If the state is in halted state, then we don't want to proceed. 
+ */ + if (idxd->state != IDXD_DEV_DISABLED) { + idxd->cmd_status = IDXD_SCMD_DEV_ENABLED; + return -ENXIO; + } + + /* Device configuration */ + spin_lock(&idxd->dev_lock); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); + spin_unlock(&idxd->dev_lock); + if (rc < 0) + return -ENXIO; + + /* Start device */ + rc = idxd_device_enable(idxd); + if (rc < 0) + return rc; + + /* Setup DMA device without channels */ + rc = idxd_register_dma_device(idxd); + if (rc < 0) { + idxd_device_disable(idxd); + idxd->cmd_status = IDXD_SCMD_DEV_DMA_ERR; + return rc; + } + + idxd->cmd_status = 0; + return 0; +} + +void idxd_device_drv_remove(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_device *idxd = idxd_dev_to_idxd(idxd_dev); + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + struct device *wq_dev = wq_confdev(wq); + + if (wq->state == IDXD_WQ_DISABLED) + continue; + dev_warn(dev, "Active wq %d on disable %s.\n", i, dev_name(wq_dev)); + device_release_driver(wq_dev); + } + + idxd_unregister_dma_device(idxd); + idxd_device_disable(idxd); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + idxd_device_reset(idxd); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_DSA, + IDXD_DEV_IAX, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver idxd_drv = { + .type = dev_types, + .probe = idxd_device_drv_probe, + .remove = idxd_device_drv_remove, + .name = "idxd", +}; +EXPORT_SYMBOL_GPL(idxd_drv); diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c new file mode 100644 index 0000000000000000000000000000000000000000..1af0e279c549891822df67586f5d21cd2668121a --- /dev/null +++ b/drivers/dma/idxd/dma.c @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../dmaengine.h" +#include "registers.h" +#include "idxd.h" + + +#define DMA_COOKIE_BITS (sizeof(dma_cookie_t) * 8) +/* + * The descriptor id takes the lower 16 bits of the cookie. + */ +#define DESC_ID_BITS 16 +#define DESC_ID_MASK ((1 << DESC_ID_BITS) - 1) +/* + * The 'generation' is in the upper half of the cookie. But dma_cookie_t + * is signed, so we leave the upper-most bit for the sign. Further, we + * need to flag whether a cookie corresponds to an operation that is + * being completed via interrupt to avoid polling it, which takes + * the second most upper bit. So we subtract two bits from the upper half. 
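+ *
+ * As an illustration (dma_cookie_t is a signed 32-bit value): bits [15:0]
+ * carry the descriptor id, bits [29:16] the generation, bit 30 the
+ * interrupt flag, and bit 31 stays clear for the sign, so tx_submit()
+ * builds roughly cookie = (gen << DESC_ID_BITS) | (id & DESC_ID_MASK),
+ * OR'ed with DESC_INTERRUPT_FLAG when completion is interrupt driven.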
+ */ +#define DESC_GEN_MAX ((1 << (DMA_COOKIE_BITS - DESC_ID_BITS - 2)) - 1) +#define DESC_INTERRUPT_FLAG (1 << (DMA_COOKIE_BITS - 2)) + +static inline struct idxd_wq *to_idxd_wq(struct dma_chan *c) +{ + struct idxd_dma_chan *idxd_chan; + + idxd_chan = container_of(c, struct idxd_dma_chan, chan); + return idxd_chan->wq; +} + +void idxd_dma_complete_txd(struct idxd_desc *desc, + enum idxd_complete_type comp_type, + bool free_desc) +{ + struct idxd_device *idxd = desc->wq->idxd; + struct dma_async_tx_descriptor *tx; + struct dmaengine_result res; + int complete = 1; + + if (desc->completion->status == DSA_COMP_SUCCESS) { + res.result = DMA_TRANS_NOERROR; + } else if (desc->completion->status) { + if (idxd->request_int_handles && comp_type != IDXD_COMPLETE_ABORT && + desc->completion->status == DSA_COMP_INT_HANDLE_INVAL && + idxd_queue_int_handle_resubmit(desc)) + return; + res.result = DMA_TRANS_WRITE_FAILED; + } else if (comp_type == IDXD_COMPLETE_ABORT) { + res.result = DMA_TRANS_ABORTED; + } else { + complete = 0; + } + + tx = &desc->txd; + if (complete && tx->cookie) { + dma_cookie_complete(tx); + dma_descriptor_unmap(tx); + dmaengine_desc_get_callback_invoke(tx, &res); + tx->callback = NULL; + tx->callback_result = NULL; + } + + if (free_desc) + idxd_free_desc(desc->wq, desc); +} + +static void op_flag_setup(unsigned long flags, u32 *desc_flags) +{ + *desc_flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; + if (flags & DMA_PREP_INTERRUPT) + *desc_flags |= IDXD_OP_FLAG_RCI; +} + +static inline void set_completion_address(struct idxd_desc *desc, + u64 *compl_addr) +{ + *compl_addr = desc->compl_dma; +} + +static inline void idxd_prep_desc_common(struct idxd_wq *wq, + struct dsa_hw_desc *hw, char opcode, + u64 addr_f1, u64 addr_f2, u64 len, + u64 compl, u32 flags) +{ + hw->flags = flags; + hw->opcode = opcode; + hw->src_addr = addr_f1; + hw->dst_addr = addr_f2; + hw->xfer_size = len; + /* + * For dedicated WQ, this field is ignored and HW will use the WQCFG.priv + * field instead. This field should be set to 1 for kernel descriptors. + */ + hw->priv = 1; + hw->completion_addr = compl; +} + +static struct dma_async_tx_descriptor * +idxd_dma_submit_memcpy(struct dma_chan *c, dma_addr_t dma_dest, + dma_addr_t dma_src, size_t len, unsigned long flags) +{ + struct idxd_wq *wq = to_idxd_wq(c); + u32 desc_flags; + struct idxd_device *idxd = wq->idxd; + struct idxd_desc *desc; + + if (wq->state != IDXD_WQ_ENABLED) + return NULL; + + if (len > idxd->max_xfer_bytes) + return NULL; + + op_flag_setup(flags, &desc_flags); + desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(desc)) + return NULL; + + idxd_prep_desc_common(wq, desc->hw, DSA_OPCODE_MEMMOVE, + dma_src, dma_dest, len, desc->compl_dma, + desc_flags); + + desc->txd.flags = flags; + + return &desc->txd; +} + +static inline int fetch_sg_and_pos(struct scatterlist **sg, size_t *remain, + unsigned int len) +{ + struct scatterlist *next = *sg; + int count = 0; + + *remain -= len; + + while (*remain == 0 && next && !sg_is_last(next)) { + next = sg_next(next); + *remain = sg_dma_len(next); + count++; + } + + *sg = next; + + return count; +} + +/* + * idxd_dma_prep_memcpy_sg - prepare descriptors for a memcpy_sg transcation + * + * @chan: DMA channel + * @dst_sg: Destination scatter list + * @dst_nents: Number of entries in destination scatter list + * @src_sg: Source scatter list + * @src_nents: Number of entries in source scatter list + * @flags: DMA transcation flags + * + * Return: Async transcation descriptor on success and NULL in failure. 
+ * + * DSA batch descriptor and work queue depth can provide large memcpy + * operation. Combined batch descriptor with WQ depth to support scatter + * list. + */ +static struct dma_async_tx_descriptor * +idxd_dma_prep_memcpy_sg(struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags) +{ + struct idxd_wq *wq = to_idxd_wq(chan); + struct idxd_desc *desc; + struct idxd_batch *batch; + dma_addr_t dma_dst, dma_src; + size_t dst_avail, src_avail, len; + u32 desc_flags; + int i; + + /* sanity check */ + if (unlikely(!dst_sg || !src_sg)) + return NULL; + if (unlikely(dst_nents == 0 || src_nents == 0)) + return NULL; + + if (min(dst_nents, src_nents) > wq->max_batch_size) + return NULL; + + dst_avail = sg_dma_len(dst_sg); + src_avail = sg_dma_len(src_sg); + + if (dst_nents == 1 && src_nents == 1) { + if (unlikely(dst_avail != src_avail)) + return NULL; + + return idxd_dma_submit_memcpy(chan, sg_dma_address(dst_sg), + sg_dma_address(src_sg), dst_avail, flags); + } + + desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK); + if (IS_ERR(desc)) + return NULL; + + /* + * DSA Batch descriptor has a set of descriptors in array + * is called 'batch'. fill up 'batch' field with some + * descriptors of DSA_OPCODE_MEMMOVE until max_batch_size + * or scatter list is consumed. + */ + batch = desc->batch; + for (i = 0; i < wq->max_batch_size; i++) { + dma_dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) - + dst_avail; + dma_src = sg_dma_address(src_sg) + sg_dma_len(src_sg) - + src_avail; + + len = min_t(size_t, dst_avail, src_avail); + len = min_t(size_t, len, wq->idxd->max_xfer_bytes); + + memset(batch->descs + i, 0, sizeof(struct dsa_hw_desc)); + idxd_prep_desc_common(wq, batch->descs + i, DSA_OPCODE_MEMMOVE, + dma_src, dma_dst, len, 0, IDXD_OP_FLAG_CC); + batch->num++; + + dst_nents -= fetch_sg_and_pos(&dst_sg, &dst_avail, len); + src_nents -= fetch_sg_and_pos(&src_sg, &src_avail, len); + + /* entries or src or dst consumed */ + if (!dst_nents || !src_nents || + !min_t(size_t, dst_avail, src_avail)) { + break; + } + } + + /* prepare DSA_OPCODE_BATCH */ + op_flag_setup(flags, &desc_flags); + idxd_prep_desc_common(wq, desc->hw, DSA_OPCODE_BATCH, + batch->dma_descs, 0, batch->num, + desc->compl_dma, desc_flags); + + return &desc->txd; +} + +static int idxd_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct idxd_wq *wq = to_idxd_wq(chan); + struct device *dev = &wq->idxd->pdev->dev; + + idxd_wq_get(wq); + dev_dbg(dev, "%s: client_count: %d\n", __func__, + idxd_wq_refcount(wq)); + return 0; +} + +static void idxd_dma_free_chan_resources(struct dma_chan *chan) +{ + struct idxd_wq *wq = to_idxd_wq(chan); + struct device *dev = &wq->idxd->pdev->dev; + + idxd_wq_put(wq); + dev_dbg(dev, "%s: client_count: %d\n", __func__, + idxd_wq_refcount(wq)); +} + + +static enum dma_status idxd_dma_tx_status(struct dma_chan *dma_chan, + dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + u8 status; + struct idxd_wq *wq; + struct idxd_desc *desc; + u32 idx; + + memset(txstate, 0, sizeof(*txstate)); + + if (dma_submit_error(cookie)) + return DMA_ERROR; + + wq = to_idxd_wq(dma_chan); + + idx = cookie & DESC_ID_MASK; + if (idx >= wq->num_descs) + return DMA_ERROR; + + desc = wq->descs[idx]; + + if (desc->txd.cookie != cookie) { + /* + * The user asked about an old transaction + */ + return DMA_COMPLETE; + } + + /* + * For descriptors completed via interrupt, we can't go + * look at the completion status directly because it races + 
* with the IRQ handler recyling the descriptor. However, + * since in this case we can rely on the interrupt handler + * to invalidate the cookie when the command completes we + * know that if we get here, the command is still in + * progress. + */ + if ((cookie & DESC_INTERRUPT_FLAG) != 0) + return DMA_IN_PROGRESS; + + status = desc->completion->status & DSA_COMP_STATUS_MASK; + + if (status) { + /* + * Check against the original status as ABORT is software defined + * and 0xff, which DSA_COMP_STATUS_MASK can mask out. + */ + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); + else + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL, true); + + return DMA_COMPLETE; + } + + return DMA_IN_PROGRESS; +} + + +/* + * issue_pending() does not need to do anything since tx_submit() does the job + * already. + */ +static void idxd_dma_issue_pending(struct dma_chan *dma_chan) +{ +} + +static dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct dma_chan *c = tx->chan; + struct idxd_wq *wq = to_idxd_wq(c); + dma_cookie_t cookie; + int rc; + struct idxd_desc *desc = container_of(tx, struct idxd_desc, txd); + + cookie = (desc->gen << DESC_ID_BITS) | (desc->id & DESC_ID_MASK); + + if ((desc->hw->flags & IDXD_OP_FLAG_RCI) != 0) + cookie |= DESC_INTERRUPT_FLAG; + + if (desc->gen == DESC_GEN_MAX) + desc->gen = 1; + else + desc->gen++; + + tx->cookie = cookie; + + rc = idxd_submit_desc(wq, desc); + if (rc < 0) { + idxd_free_desc(wq, desc); + return rc; + } + + return cookie; +} + +static void idxd_dma_release(struct dma_device *device) +{ + struct idxd_dma_dev *idxd_dma = container_of(device, struct idxd_dma_dev, dma); + + kfree(idxd_dma); +} + +int idxd_register_dma_device(struct idxd_device *idxd) +{ + struct idxd_dma_dev *idxd_dma; + struct dma_device *dma; + struct device *dev = &idxd->pdev->dev; + int rc; + + idxd_dma = kzalloc_node(sizeof(*idxd_dma), GFP_KERNEL, dev_to_node(dev)); + if (!idxd_dma) + return -ENOMEM; + + dma = &idxd_dma->dma; + INIT_LIST_HEAD(&dma->channels); + dma->dev = dev; + + /* + * claim device max_segment_size to HugePage/THP size, otherwise + * DMA-API debug code would complain it's longer than default. + */ + idxd_dma->dma_parms.max_segment_size = HPAGE_PMD_SIZE; + dma->dev->dma_parms = &idxd_dma->dma_parms; + + dma_cap_set(DMA_PRIVATE, dma->cap_mask); + dma->device_release = idxd_dma_release; + + if (idxd->hw.opcap.bits[0] & IDXD_OPCAP_MEMMOVE) { + dma_cap_set(DMA_MEMCPY, dma->cap_mask); + dma->device_prep_dma_memcpy = idxd_dma_submit_memcpy; + } + + if (idxd->hw.opcap.bits[0] & IDXD_OPCAP_BATCH) { + dma_cap_set(DMA_MEMCPY_SG, dma->cap_mask); + dma->device_prep_dma_memcpy_sg = idxd_dma_prep_memcpy_sg; + } + + dma->device_tx_status = idxd_dma_tx_status; + dma->device_issue_pending = idxd_dma_issue_pending; + dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources; + dma->device_free_chan_resources = idxd_dma_free_chan_resources; + + rc = dma_async_device_register(dma); + if (rc < 0) { + kfree(idxd_dma); + return rc; + } + + idxd_dma->idxd = idxd; + /* + * This pointer is protected by the refs taken by the dma_chan. It will remain valid + * as long as there are outstanding channels. 
+ */ + idxd->idxd_dma = idxd_dma; + return 0; +} + +void idxd_unregister_dma_device(struct idxd_device *idxd) +{ + dma_async_device_unregister(&idxd->idxd_dma->dma); +} + +int idxd_register_dma_channel(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct dma_device *dma = &idxd->idxd_dma->dma; + struct device *dev = &idxd->pdev->dev; + struct idxd_dma_chan *idxd_chan; + struct dma_chan *chan; + int rc, i; + + idxd_chan = kzalloc_node(sizeof(*idxd_chan), GFP_KERNEL, dev_to_node(dev)); + if (!idxd_chan) + return -ENOMEM; + + chan = &idxd_chan->chan; + chan->device = dma; + list_add_tail(&chan->device_node, &dma->channels); + + for (i = 0; i < wq->num_descs; i++) { + struct idxd_desc *desc = wq->descs[i]; + + dma_async_tx_descriptor_init(&desc->txd, chan); + desc->txd.tx_submit = idxd_dma_tx_submit; + } + + rc = dma_async_device_channel_register(dma, chan); + if (rc < 0) { + kfree(idxd_chan); + return rc; + } + + wq->idxd_chan = idxd_chan; + idxd_chan->wq = wq; + get_device(wq_confdev(wq)); + + return 0; +} + +void idxd_unregister_dma_channel(struct idxd_wq *wq) +{ + struct idxd_dma_chan *idxd_chan = wq->idxd_chan; + struct dma_chan *chan = &idxd_chan->chan; + struct idxd_dma_dev *idxd_dma = wq->idxd->idxd_dma; + + dma_async_device_channel_unregister(&idxd_dma->dma, chan); + list_del(&chan->device_node); + kfree(wq->idxd_chan); + wq->idxd_chan = NULL; + put_device(wq_confdev(wq)); +} + +static int idxd_dmaengine_drv_probe(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + mutex_lock(&wq->wq_lock); + if (!idxd_wq_driver_name_match(wq, dev)) { + idxd->cmd_status = IDXD_SCMD_WQ_NO_DRV_NAME; + rc = -ENODEV; + goto err_drv_name; + } + + wq->type = IDXD_WQT_KERNEL; + + rc = __drv_enable_wq(wq); + if (rc < 0) { + dev_dbg(dev, "Enable wq %d failed: %d\n", wq->id, rc); + rc = -ENXIO; + goto err; + } + + rc = idxd_wq_request_irq(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_IRQ_ERR; + dev_dbg(dev, "WQ %d irq setup failed: %d\n", wq->id, rc); + goto err_irq; + } + + rc = idxd_wq_alloc_resources(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_WQ_RES_ALLOC_ERR; + dev_dbg(dev, "WQ resource alloc failed\n"); + goto err_res_alloc; + } + + rc = idxd_wq_init_percpu_ref(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_PERCPU_ERR; + dev_dbg(dev, "percpu_ref setup failed\n"); + goto err_ref; + } + + rc = idxd_register_dma_channel(wq); + if (rc < 0) { + idxd->cmd_status = IDXD_SCMD_DMA_CHAN_ERR; + dev_dbg(dev, "Failed to register dma channel\n"); + goto err_dma; + } + + idxd->cmd_status = 0; + mutex_unlock(&wq->wq_lock); + return 0; + +err_dma: + __idxd_wq_quiesce(wq); + percpu_ref_exit(&wq->wq_active); +err_ref: + idxd_wq_free_resources(wq); +err_res_alloc: + idxd_wq_free_irq(wq); +err_irq: + __drv_disable_wq(wq); +err: +err_drv_name: + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); + return rc; +} + +static void idxd_dmaengine_drv_remove(struct idxd_dev *idxd_dev) +{ + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + + mutex_lock(&wq->wq_lock); + __idxd_wq_quiesce(wq); + idxd_unregister_dma_channel(wq); + idxd_wq_free_resources(wq); + idxd_wq_free_irq(wq); + __drv_disable_wq(wq); + percpu_ref_exit(&wq->wq_active); + wq->type = IDXD_WQT_NONE; + mutex_unlock(&wq->wq_lock); +} + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +struct idxd_device_driver 
idxd_dmaengine_drv = { + .probe = idxd_dmaengine_drv_probe, + .remove = idxd_dmaengine_drv_remove, + .name = "dmaengine", + .type = dev_types, +}; +EXPORT_SYMBOL_GPL(idxd_dmaengine_drv); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h new file mode 100644 index 0000000000000000000000000000000000000000..4a2bac349b4b1a3cca84e17711283a5f9ad9a99f --- /dev/null +++ b/drivers/dma/idxd/idxd.h @@ -0,0 +1,733 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#ifndef _IDXD_H_ +#define _IDXD_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" + +#define IDXD_DRIVER_VERSION "1.00" + +extern struct kmem_cache *idxd_desc_pool; +extern bool tc_override; + +struct idxd_wq; +struct idxd_dev; + +enum idxd_dev_type { + IDXD_DEV_NONE = -1, + IDXD_DEV_DSA = 0, + IDXD_DEV_IAX, + IDXD_DEV_WQ, + IDXD_DEV_GROUP, + IDXD_DEV_ENGINE, + IDXD_DEV_CDEV, + IDXD_DEV_MAX_TYPE, +}; + +struct idxd_dev { + struct device conf_dev; + enum idxd_dev_type type; +}; + +#define IDXD_REG_TIMEOUT 50 +#define IDXD_DRAIN_TIMEOUT 5000 + +enum idxd_type { + IDXD_TYPE_UNKNOWN = -1, + IDXD_TYPE_DSA = 0, + IDXD_TYPE_IAX, + IDXD_TYPE_MAX, +}; + +#define IDXD_NAME_SIZE 128 +#define IDXD_PMU_EVENT_MAX 64 + +struct idxd_device_ops { + void (*notify_error)(struct idxd_wq *wq); +}; + +#define IDXD_ENQCMDS_RETRIES 32 +#define IDXD_ENQCMDS_MAX_RETRIES 64 + +struct idxd_device_driver { + const char *name; + enum idxd_dev_type *type; + int (*probe)(struct idxd_dev *idxd_dev); + void (*remove)(struct idxd_dev *idxd_dev); + struct device_driver drv; + struct idxd_device_ops *ops; +}; + +extern struct idxd_device_driver dsa_drv; +extern struct idxd_device_driver idxd_drv; +extern struct idxd_device_driver idxd_dmaengine_drv; +extern struct idxd_device_driver idxd_user_drv; + +#define INVALID_INT_HANDLE -1 +struct idxd_irq_entry { + int id; + int vector; + struct llist_head pending_llist; + struct list_head work_list; + /* + * Lock to protect access between irq thread process descriptor + * and irq thread processing error descriptor. 
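+ * Entries drained from pending_llist are moved onto work_list under
+ * this lock (see idxd_flush_pending_descs()).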
+ */ + spinlock_t list_lock; + int int_handle; + ioasid_t pasid; +}; + +struct idxd_group { + struct idxd_dev idxd_dev; + struct idxd_device *idxd; + struct grpcfg grpcfg; + int id; + int num_engines; + int num_wqs; + bool use_rdbuf_limit; + u8 rdbufs_allowed; + u8 rdbufs_reserved; + int tc_a; + int tc_b; +}; + +struct idxd_pmu { + struct idxd_device *idxd; + + struct perf_event *event_list[IDXD_PMU_EVENT_MAX]; + int n_events; + + DECLARE_BITMAP(used_mask, IDXD_PMU_EVENT_MAX); + + struct pmu pmu; + char name[IDXD_NAME_SIZE]; + int cpu; + + int n_counters; + int counter_width; + int n_event_categories; + + bool per_counter_caps_supported; + unsigned long supported_event_categories; + + unsigned long supported_filters; + int n_filters; + + struct hlist_node cpuhp_node; +}; + +#define IDXD_MAX_PRIORITY 0xf + +enum idxd_wq_state { + IDXD_WQ_DISABLED = 0, + IDXD_WQ_ENABLED, + IDXD_WQ_LOCKED, +}; + +enum idxd_wq_flag { + WQ_FLAG_DEDICATED = 0, + WQ_FLAG_BLOCK_ON_FAULT, + WQ_FLAG_MODE_1, +}; + +enum idxd_wq_type { + IDXD_WQT_NONE = 0, + IDXD_WQT_KERNEL, + IDXD_WQT_USER, + IDXD_WQT_MDEV, +}; + +struct idxd_cdev { + struct idxd_wq *wq; + struct cdev cdev; + struct idxd_dev idxd_dev; + int minor; +}; + +#define IDXD_ALLOCATED_BATCH_SIZE 128U +#define WQ_NAME_SIZE 1024 +#define WQ_TYPE_SIZE 10 + +#define WQ_DEFAULT_QUEUE_DEPTH 16 +#define WQ_DEFAULT_MAX_XFER SZ_2M +#define WQ_DEFAULT_MAX_BATCH 32 + +enum idxd_op_type { + IDXD_OP_BLOCK = 0, + IDXD_OP_NONBLOCK = 1, +}; + +enum idxd_complete_type { + IDXD_COMPLETE_NORMAL = 0, + IDXD_COMPLETE_ABORT, + IDXD_COMPLETE_DEV_FAIL, +}; + +struct idxd_dma_chan { + struct dma_chan chan; + struct idxd_wq *wq; +}; + +struct idxd_wq { + void __iomem *portal; + u32 portal_offset; + unsigned int enqcmds_retries; + struct percpu_ref wq_active; + struct completion wq_dead; + struct completion wq_resurrect; + struct idxd_dev idxd_dev; + struct idxd_cdev *idxd_cdev; + struct wait_queue_head err_queue; + struct idxd_device *idxd; + int id; + struct idxd_irq_entry ie; + enum idxd_wq_type type; + struct idxd_group *group; + int client_count; + struct mutex wq_lock; /* mutex for workqueue */ + u32 size; + u32 threshold; + u32 priority; + enum idxd_wq_state state; + unsigned long flags; + union wqcfg *wqcfg; + struct dsa_hw_desc **hw_descs; + int num_descs; + union { + struct dsa_completion_record *compls; + struct iax_completion_record *iax_compls; + }; + dma_addr_t compls_addr; + int compls_size; + struct idxd_desc **descs; + struct sbitmap_queue sbq; + struct idxd_dma_chan *idxd_chan; + char name[WQ_NAME_SIZE + 1]; + u64 max_xfer_bytes; + u32 max_batch_size; + bool ats_dis; + char driver_name[WQ_NAME_SIZE + 1]; + + void *private_data; + struct list_head vdcm_list; +}; + +struct idxd_engine { + struct idxd_dev idxd_dev; + int id; + struct idxd_group *group; + struct idxd_device *idxd; +}; + +/* shadow registers */ +struct idxd_hw { + u32 version; + union gen_cap_reg gen_cap; + union wq_cap_reg wq_cap; + union group_cap_reg group_cap; + union engine_cap_reg engine_cap; + struct opcap opcap; + u32 cmd_cap; +}; + +enum idxd_device_state { + IDXD_DEV_HALTED = -1, + IDXD_DEV_DISABLED = 0, + IDXD_DEV_ENABLED, +}; + +enum idxd_device_flag { + IDXD_FLAG_CONFIGURABLE = 0, + IDXD_FLAG_CMD_RUNNING, + IDXD_FLAG_PASID_ENABLED, + IDXD_FLAG_USER_PASID_ENABLED, + IDXD_FLAG_IMS_SUPPORTED, +}; + +struct idxd_dma_dev { + struct idxd_device *idxd; + struct dma_device dma; + struct device_dma_parameters dma_parms; +}; + +struct idxd_driver_data { + const char *name_prefix; + enum idxd_type type; 
+ struct device_type *dev_type; + int compl_size; + int align; +}; + +struct idxd_device { + struct idxd_dev idxd_dev; + struct idxd_driver_data *data; + struct list_head list; + struct idxd_hw hw; + enum idxd_device_state state; + unsigned long flags; + int id; + int major; + u32 cmd_status; + struct idxd_irq_entry ie; /* misc irq, msix 0 */ + + struct pci_dev *pdev; + void __iomem *reg_base; + void __iomem *portal_base; + + spinlock_t dev_lock; /* spinlock for device */ + spinlock_t cmd_lock; /* spinlock for device commands */ + struct completion *cmd_done; + struct idxd_group **groups; + struct idxd_wq **wqs; + struct idxd_engine **engines; + + unsigned int pasid; + + int num_groups; + int irq_cnt; + bool request_int_handles; + + u32 ims_offset; + u32 msix_perm_offset; + u32 wqcfg_offset; + u32 grpcfg_offset; + u32 perfmon_offset; + + u64 max_xfer_bytes; + u32 max_batch_size; + int ims_size; + int max_groups; + int max_engines; + int max_rdbufs; + int max_wqs; + int max_wq_size; + int rdbuf_limit; + int nr_rdbufs; /* non-reserved read buffers */ + unsigned int wqcfg_size; + + union sw_err_reg sw_err; + wait_queue_head_t cmd_waitq; + + struct idxd_dma_dev *idxd_dma; + struct workqueue_struct *wq; + struct work_struct work; + + struct irq_domain *ims_domain; + struct vfio_pci_core_device vfio_pdev; + struct kref mdev_kref; + struct mutex kref_lock; + bool mdev_host_init; + int *new_handles; + struct ida vdev_ida; + + struct idxd_pmu *idxd_pmu; +}; + +/** + * IDXD batch field for SW Batch descriptor + * @descs: Descriptor list address + * @dma_descs: DMA address for descs + * @crs: completion record list address + * @dma_crs: DMA address for completion records + * @num: Number of descs in batch + */ +struct idxd_batch { + struct dsa_hw_desc *descs; + dma_addr_t dma_descs; + struct dsa_completion_record *crs; + dma_addr_t dma_crs; + u32 num; +}; + +/* IDXD software descriptor */ +struct idxd_desc { + union { + struct dsa_hw_desc *hw; + struct iax_hw_desc *iax_hw; + }; + dma_addr_t desc_dma; + union { + struct dsa_completion_record *completion; + struct iax_completion_record *iax_completion; + }; + dma_addr_t compl_dma; + struct dma_async_tx_descriptor txd; + struct llist_node llnode; + struct list_head list; + u16 id; + u16 gen; + int cpu; + struct idxd_wq *wq; + + struct idxd_batch *batch; +}; + +/* + * This is software defined error for the completion status. We overload the error code + * that will never appear in completion status and only SWERR register. 
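+ * IDXD_COMP_DESC_ABORT is 0xff so that DSA_COMP_STATUS_MASK strips it;
+ * paths that must tell an abort apart therefore compare the unmasked
+ * status (see idxd_dma_tx_status()).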
+ */ +enum idxd_completion_status { + IDXD_COMP_DESC_ABORT = 0xff, +}; + +#define idxd_confdev(idxd) &idxd->idxd_dev.conf_dev +#define wq_confdev(wq) &wq->idxd_dev.conf_dev +#define engine_confdev(engine) &engine->idxd_dev.conf_dev +#define group_confdev(group) &group->idxd_dev.conf_dev +#define cdev_dev(cdev) &cdev->idxd_dev.conf_dev + +#define confdev_to_idxd_dev(dev) container_of(dev, struct idxd_dev, conf_dev) +#define idxd_dev_to_idxd(idxd_dev) container_of(idxd_dev, struct idxd_device, idxd_dev) +#define idxd_dev_to_wq(idxd_dev) container_of(idxd_dev, struct idxd_wq, idxd_dev) + +static inline struct idxd_device *confdev_to_idxd(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return idxd_dev_to_idxd(idxd_dev); +} + +static inline struct idxd_wq *confdev_to_wq(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return idxd_dev_to_wq(idxd_dev); +} + +static inline struct idxd_engine *confdev_to_engine(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_engine, idxd_dev); +} + +static inline struct idxd_group *confdev_to_group(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_group, idxd_dev); +} + +static inline struct idxd_cdev *dev_to_cdev(struct device *dev) +{ + struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev); + + return container_of(idxd_dev, struct idxd_cdev, idxd_dev); +} + +static inline void idxd_dev_set_type(struct idxd_dev *idev, int type) +{ + if (type >= IDXD_DEV_MAX_TYPE) { + idev->type = IDXD_DEV_NONE; + return; + } + + idev->type = type; +} + +static inline struct idxd_irq_entry *idxd_get_ie(struct idxd_device *idxd, int idx) +{ + return (idx == 0) ? 
&idxd->ie : &idxd->wqs[idx - 1]->ie; +} + +static inline struct idxd_wq *ie_to_wq(struct idxd_irq_entry *ie) +{ + return container_of(ie, struct idxd_wq, ie); +} + +static inline struct idxd_device *ie_to_idxd(struct idxd_irq_entry *ie) +{ + return container_of(ie, struct idxd_device, ie); +} + +extern struct bus_type dsa_bus_type; + +extern bool support_enqcmd; +extern struct ida idxd_ida; +extern struct device_type dsa_device_type; +extern struct device_type iax_device_type; +extern struct device_type idxd_wq_device_type; +extern struct device_type idxd_engine_device_type; +extern struct device_type idxd_group_device_type; + +static inline bool is_idxd_wq_mdev(struct idxd_wq *wq) +{ + return (wq->type == IDXD_WQT_MDEV); +} + +static inline bool is_dsa_dev(struct idxd_dev *idxd_dev) +{ + return idxd_dev->type == IDXD_DEV_DSA; +} + +static inline bool is_iax_dev(struct idxd_dev *idxd_dev) +{ + return idxd_dev->type == IDXD_DEV_IAX; +} + +static inline bool is_idxd_dev(struct idxd_dev *idxd_dev) +{ + return is_dsa_dev(idxd_dev) || is_iax_dev(idxd_dev); +} + +static inline bool is_idxd_wq_dev(struct idxd_dev *idxd_dev) +{ + return idxd_dev->type == IDXD_DEV_WQ; +} + +static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) +{ + if (wq->type == IDXD_WQT_KERNEL && strcmp(wq->name, "dmaengine") == 0) + return true; + return false; +} + +static inline bool is_idxd_wq_user(struct idxd_wq *wq) +{ + return wq->type == IDXD_WQT_USER; +} + +static inline bool is_idxd_wq_kernel(struct idxd_wq *wq) +{ + return wq->type == IDXD_WQT_KERNEL; +} + +static inline bool wq_dedicated(struct idxd_wq *wq) +{ + return test_bit(WQ_FLAG_DEDICATED, &wq->flags); +} + +static inline bool wq_shared(struct idxd_wq *wq) +{ + return !test_bit(WQ_FLAG_DEDICATED, &wq->flags); +} + +static inline bool device_pasid_enabled(struct idxd_device *idxd) +{ + return test_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); +} + +static inline bool device_user_pasid_enabled(struct idxd_device *idxd) +{ + return test_bit(IDXD_FLAG_USER_PASID_ENABLED, &idxd->flags); +} + +static inline bool device_swq_supported(struct idxd_device *idxd) +{ + return (support_enqcmd && device_pasid_enabled(idxd)); +} + +enum idxd_portal_prot { + IDXD_PORTAL_UNLIMITED = 0, + IDXD_PORTAL_LIMITED, +}; + +enum idxd_interrupt_type { + IDXD_IRQ_MSIX = 0, + IDXD_IRQ_IMS, +}; + +static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot, + enum idxd_interrupt_type irq_type) +{ + return prot * 0x1000 + irq_type * 0x2000; +} + +static inline int idxd_get_wq_portal_full_offset(int wq_id, + enum idxd_portal_prot prot, + enum idxd_interrupt_type irq_type) +{ + return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot, irq_type); +} + +#define IDXD_PORTAL_MASK (PAGE_SIZE - 1) + +/* + * Even though this function can be accessed by multiple threads, it is safe to use. + * At worst the address gets used more than once before it gets incremented. We don't + * hit a threshold until iops becomes many million times a second. So the occasional + * reuse of the same address is tolerable compare to using an atomic variable. This is + * safe on a system that has atomic load/store for 32bit integers. Given that this is an + * Intel iEP device, that should not be a problem. 
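+ *
+ * Each call below returns the current offset and advances it by
+ * sizeof(struct dsa_raw_desc), wrapping within the portal page via
+ * IDXD_PORTAL_MASK, so concurrent submitters are merely spread across
+ * the portal rather than strictly serialized.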
+ */ +static inline void __iomem *idxd_wq_portal_addr(struct idxd_wq *wq) +{ + int ofs = wq->portal_offset; + + wq->portal_offset = (ofs + sizeof(struct dsa_raw_desc)) & IDXD_PORTAL_MASK; + return wq->portal + ofs; +} + +static inline void idxd_wq_get(struct idxd_wq *wq) +{ + wq->client_count++; +} + +static inline void idxd_wq_put(struct idxd_wq *wq) +{ + wq->client_count--; +} + +static inline int idxd_wq_refcount(struct idxd_wq *wq) +{ + return wq->client_count; +}; + +/* + * Intel IAA does not support batch processing. + * The max batch size of device, max batch size of wq and + * max batch shift of wqcfg should be always 0 on IAA. + */ +static inline void idxd_set_max_batch_size(int idxd_type, struct idxd_device *idxd, + u32 max_batch_size) +{ + if (idxd_type == IDXD_TYPE_IAX) + idxd->max_batch_size = 0; + else + idxd->max_batch_size = max_batch_size; +} + +static inline void idxd_wq_set_max_batch_size(int idxd_type, struct idxd_wq *wq, + u32 max_batch_size) +{ + if (idxd_type == IDXD_TYPE_IAX) + wq->max_batch_size = 0; + else + wq->max_batch_size = max_batch_size; +} + +static inline void idxd_wqcfg_set_max_batch_shift(int idxd_type, union wqcfg *wqcfg, + u32 max_batch_shift) +{ + if (idxd_type == IDXD_TYPE_IAX) + wqcfg->max_batch_shift = 0; + else + wqcfg->max_batch_shift = max_batch_shift; +} + +#define MODULE_ALIAS_IDXD_DEVICE(type) MODULE_ALIAS("idxd:t" __stringify(type) "*") +#define IDXD_DEVICES_MODALIAS_FMT "idxd:t%d" + +static inline int idxd_wq_driver_name_match(struct idxd_wq *wq, struct device *dev) +{ + return (strncmp(wq->driver_name, dev->driver->name, strlen(dev->driver->name)) == 0); +} + +int __must_check __idxd_driver_register(struct idxd_device_driver *idxd_drv, + struct module *module, const char *mod_name); +#define idxd_driver_register(driver) \ + __idxd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) + +void idxd_driver_unregister(struct idxd_device_driver *idxd_drv); + +#define module_idxd_driver(__idxd_driver) \ + module_driver(__idxd_driver, idxd_driver_register, idxd_driver_unregister) + +int idxd_register_bus_type(void); +void idxd_unregister_bus_type(void); +int idxd_register_devices(struct idxd_device *idxd); +void idxd_unregister_devices(struct idxd_device *idxd); +int idxd_register_driver(void); +void idxd_unregister_driver(void); +void idxd_wqs_quiesce(struct idxd_device *idxd); +bool idxd_queue_int_handle_resubmit(struct idxd_desc *desc); + +/* device interrupt control */ +irqreturn_t idxd_misc_thread(int vec, void *data); +irqreturn_t idxd_wq_thread(int irq, void *data); +void idxd_mask_error_interrupts(struct idxd_device *idxd); +void idxd_unmask_error_interrupts(struct idxd_device *idxd); + +/* device control */ +int idxd_register_idxd_drv(void); +void idxd_unregister_idxd_drv(void); +int idxd_device_drv_probe(struct idxd_dev *idxd_dev); +void idxd_device_drv_remove(struct idxd_dev *idxd_dev); +int drv_enable_wq(struct idxd_wq *wq); +int __drv_enable_wq(struct idxd_wq *wq); +void drv_disable_wq(struct idxd_wq *wq); +void __drv_disable_wq(struct idxd_wq *wq); +int idxd_device_init_reset(struct idxd_device *idxd); +int idxd_device_enable(struct idxd_device *idxd); +int idxd_device_disable(struct idxd_device *idxd); +void idxd_device_reset(struct idxd_device *idxd); +void idxd_device_clear_state(struct idxd_device *idxd); +int idxd_device_config(struct idxd_device *idxd); +void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); +int idxd_device_load_config(struct idxd_device *idxd); +int idxd_device_request_int_handle(struct 
idxd_device *idxd, int idx, int *handle, + enum idxd_interrupt_type irq_type); +int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, + enum idxd_interrupt_type irq_type); + +/* work queue control */ +void idxd_wqs_unmap_portal(struct idxd_device *idxd); +int idxd_wq_alloc_resources(struct idxd_wq *wq); +void idxd_wq_free_resources(struct idxd_wq *wq); +int idxd_wq_enable(struct idxd_wq *wq, u32 *status); +int idxd_wq_disable(struct idxd_wq *wq, bool reset_config, u32 *status); +int idxd_wq_drain(struct idxd_wq *wq, u32 *status); +void idxd_wq_reset(struct idxd_wq *wq); +int idxd_wq_map_portal(struct idxd_wq *wq); +void idxd_wq_unmap_portal(struct idxd_wq *wq); +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid); +int idxd_wq_disable_pasid(struct idxd_wq *wq); +void __idxd_wq_quiesce(struct idxd_wq *wq); +void idxd_wq_quiesce(struct idxd_wq *wq); +int idxd_wq_init_percpu_ref(struct idxd_wq *wq); +void idxd_wq_free_irq(struct idxd_wq *wq); +int idxd_wq_request_irq(struct idxd_wq *wq); +int idxd_wq_abort(struct idxd_wq *wq, u32 *status); +void idxd_wq_setup_pasid(struct idxd_wq *wq, int pasid); +void idxd_wq_setup_priv(struct idxd_wq *wq, int priv); + +/* submission */ +int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); +struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype); +void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc); +int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc); + +/* dmaengine */ +int idxd_register_dma_device(struct idxd_device *idxd); +void idxd_unregister_dma_device(struct idxd_device *idxd); +int idxd_register_dma_channel(struct idxd_wq *wq); +void idxd_unregister_dma_channel(struct idxd_wq *wq); +void idxd_parse_completion_status(u8 status, enum dmaengine_tx_result *res); +void idxd_dma_complete_txd(struct idxd_desc *desc, + enum idxd_complete_type comp_type, bool free_desc); + +/* cdev */ +int idxd_cdev_register(void); +void idxd_cdev_remove(void); +int idxd_cdev_get_major(struct idxd_device *idxd); +int idxd_wq_add_cdev(struct idxd_wq *wq); +void idxd_wq_del_cdev(struct idxd_wq *wq); + +/* perfmon */ +#if IS_ENABLED(CONFIG_INTEL_IDXD_PERFMON) +int perfmon_pmu_init(struct idxd_device *idxd); +void perfmon_pmu_remove(struct idxd_device *idxd); +void perfmon_counter_overflow(struct idxd_device *idxd); +void perfmon_init(void); +void perfmon_exit(void); +#else +static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; } +static inline void perfmon_pmu_remove(struct idxd_device *idxd) {} +static inline void perfmon_counter_overflow(struct idxd_device *idxd) {} +static inline void perfmon_init(void) {} +static inline void perfmon_exit(void) {} +#endif + +#endif diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c new file mode 100644 index 0000000000000000000000000000000000000000..bd44a998195b428254ca2d0f05ecdf57c8bfea5e --- /dev/null +++ b/drivers/dma/idxd/init.c @@ -0,0 +1,785 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. 
*/
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/aer.h>
+#include <linux/fs.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/intel-svm.h>
+#include <linux/iommu.h>
+#include <uapi/linux/idxd.h>
+#include <linux/dmaengine.h>
+#include "../dmaengine.h"
+#include "registers.h"
+#include "idxd.h"
+#include "perfmon.h"
+
+MODULE_VERSION(IDXD_DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_IMPORT_NS(IDXD);
+
+static bool sva = true;
+module_param(sva, bool, 0644);
+MODULE_PARM_DESC(sva, "Toggle SVA support on/off");
+
+bool tc_override;
+module_param(tc_override, bool, 0644);
+MODULE_PARM_DESC(tc_override, "Override traffic class defaults");
+
+#define DRV_NAME "idxd"
+
+bool support_enqcmd;
+DEFINE_IDA(idxd_ida);
+
+static struct idxd_driver_data idxd_driver_data[] = {
+	[IDXD_TYPE_DSA] = {
+		.name_prefix = "dsa",
+		.type = IDXD_TYPE_DSA,
+		.compl_size = sizeof(struct dsa_completion_record),
+		.align = 32,
+		.dev_type = &dsa_device_type,
+	},
+	[IDXD_TYPE_IAX] = {
+		.name_prefix = "iax",
+		.type = IDXD_TYPE_IAX,
+		.compl_size = sizeof(struct iax_completion_record),
+		.align = 64,
+		.dev_type = &iax_device_type,
+	},
+};
+
+static struct pci_device_id idxd_pci_tbl[] = {
+	/* DSA ver 1.0 platforms */
+	{ PCI_DEVICE_DATA(INTEL, DSA_SPR0, &idxd_driver_data[IDXD_TYPE_DSA]) },
+
+	/* IAX ver 1.0 platforms */
+	{ PCI_DEVICE_DATA(INTEL, IAX_SPR0, &idxd_driver_data[IDXD_TYPE_IAX]) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, idxd_pci_tbl);
+
+static int idxd_setup_interrupts(struct idxd_device *idxd)
+{
+	struct pci_dev *pdev = idxd->pdev;
+	struct device *dev = &pdev->dev;
+	struct idxd_irq_entry *ie;
+	int i, msixcnt;
+	int rc = 0;
+
+	msixcnt = pci_msix_vec_count(pdev);
+	if (msixcnt < 0) {
+		dev_err(dev, "Not MSI-X interrupt capable.\n");
+		return -ENOSPC;
+	}
+	idxd->irq_cnt = msixcnt;
+
+	rc = pci_alloc_irq_vectors(pdev, msixcnt, msixcnt, PCI_IRQ_MSIX);
+	if (rc != msixcnt) {
+		dev_err(dev, "Failed enabling %d MSI-X entries: %d\n", msixcnt, rc);
+		return -ENOSPC;
+	}
+	dev_dbg(dev, "Enabled %d MSI-X vectors\n", msixcnt);
+
+	ie = idxd_get_ie(idxd, 0);
+	ie->vector = pci_irq_vector(pdev, 0);
+	rc = request_threaded_irq(ie->vector, NULL, idxd_misc_thread, 0, "idxd-misc", ie);
+	if (rc < 0) {
+		dev_err(dev, "Failed to allocate misc interrupt.\n");
+		goto err_misc_irq;
+	}
+	dev_dbg(dev, "Requested idxd-misc handler on msix vector %d\n", ie->vector);
+
+	for (i = 0; i < idxd->max_wqs; i++) {
+		int msix_idx = i + 1;
+
+		ie = idxd_get_ie(idxd, msix_idx);
+		ie->id = msix_idx;
+		ie->int_handle = INVALID_INT_HANDLE;
+		ie->pasid = INVALID_IOASID;
+
+		spin_lock_init(&ie->list_lock);
+		init_llist_head(&ie->pending_llist);
+		INIT_LIST_HEAD(&ie->work_list);
+	}
+
+	idxd_unmask_error_interrupts(idxd);
+	return 0;
+
+ err_misc_irq:
+	idxd_mask_error_interrupts(idxd);
+	pci_free_irq_vectors(pdev);
+	dev_err(dev, "No usable interrupts\n");
+	return rc;
+}
+
+static void idxd_cleanup_interrupts(struct idxd_device *idxd)
+{
+	struct pci_dev *pdev = idxd->pdev;
+	struct idxd_irq_entry *ie;
+	int msixcnt;
+
+	msixcnt = pci_msix_vec_count(pdev);
+	if (msixcnt <= 0)
+		return;
+
+	ie = idxd_get_ie(idxd, 0);
+	idxd_mask_error_interrupts(idxd);
+	free_irq(ie->vector, ie);
+	pci_free_irq_vectors(pdev);
+}
+
+static int idxd_setup_wqs(struct idxd_device *idxd)
+{
+	struct device *dev = &idxd->pdev->dev;
+	struct idxd_wq *wq;
+	struct device *conf_dev;
+	int i, rc;
+
+	idxd->wqs = kcalloc_node(idxd->max_wqs, sizeof(struct idxd_wq *),
+				 GFP_KERNEL, dev_to_node(dev));
+	if (!idxd->wqs)
+		return -ENOMEM;
+
+	for (i = 0; i
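/*
 * Vector layout set up by idxd_setup_interrupts() above: MSI-X entry 0
 * carries the device "misc" interrupt (errors, command completion,
 * interrupt handle revoke), and entries 1..max_wqs map one per work
 * queue, so the completion vector for wq N is idxd_get_ie(idxd, N + 1).
 */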
< idxd->max_wqs; i++) { + wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev)); + if (!wq) { + rc = -ENOMEM; + goto err; + } + + idxd_dev_set_type(&wq->idxd_dev, IDXD_DEV_WQ); + conf_dev = wq_confdev(wq); + wq->id = i; + wq->idxd = idxd; + device_initialize(wq_confdev(wq)); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_wq_device_type; + rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id); + if (rc < 0) { + put_device(conf_dev); + goto err; + } + + mutex_init(&wq->wq_lock); + init_waitqueue_head(&wq->err_queue); + init_completion(&wq->wq_dead); + init_completion(&wq->wq_resurrect); + INIT_LIST_HEAD(&wq->vdcm_list); + wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; + idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH); + wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; + wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); + if (!wq->wqcfg) { + put_device(conf_dev); + rc = -ENOMEM; + goto err; + } + idxd->wqs[i] = wq; + } + + return 0; + + err: + while (--i >= 0) { + wq = idxd->wqs[i]; + conf_dev = wq_confdev(wq); + put_device(conf_dev); + } + return rc; +} + +static int idxd_setup_engines(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + struct device *dev = &idxd->pdev->dev; + struct device *conf_dev; + int i, rc; + + idxd->engines = kcalloc_node(idxd->max_engines, sizeof(struct idxd_engine *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->engines) + return -ENOMEM; + + for (i = 0; i < idxd->max_engines; i++) { + engine = kzalloc_node(sizeof(*engine), GFP_KERNEL, dev_to_node(dev)); + if (!engine) { + rc = -ENOMEM; + goto err; + } + + idxd_dev_set_type(&engine->idxd_dev, IDXD_DEV_ENGINE); + conf_dev = engine_confdev(engine); + engine->id = i; + engine->idxd = idxd; + device_initialize(conf_dev); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_engine_device_type; + rc = dev_set_name(conf_dev, "engine%d.%d", idxd->id, engine->id); + if (rc < 0) { + put_device(conf_dev); + goto err; + } + + idxd->engines[i] = engine; + } + + return 0; + + err: + while (--i >= 0) { + engine = idxd->engines[i]; + conf_dev = engine_confdev(engine); + put_device(conf_dev); + } + return rc; +} + +static int idxd_setup_groups(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + struct device *conf_dev; + struct idxd_group *group; + int i, rc; + + idxd->groups = kcalloc_node(idxd->max_groups, sizeof(struct idxd_group *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->groups) + return -ENOMEM; + + for (i = 0; i < idxd->max_groups; i++) { + group = kzalloc_node(sizeof(*group), GFP_KERNEL, dev_to_node(dev)); + if (!group) { + rc = -ENOMEM; + goto err; + } + + idxd_dev_set_type(&group->idxd_dev, IDXD_DEV_GROUP); + conf_dev = group_confdev(group); + group->id = i; + group->idxd = idxd; + device_initialize(conf_dev); + conf_dev->parent = idxd_confdev(idxd); + conf_dev->bus = &dsa_bus_type; + conf_dev->type = &idxd_group_device_type; + rc = dev_set_name(conf_dev, "group%d.%d", idxd->id, group->id); + if (rc < 0) { + put_device(conf_dev); + goto err; + } + + idxd->groups[i] = group; + if (idxd->hw.version < DEVICE_VERSION_2 && !tc_override) { + group->tc_a = 1; + group->tc_b = 1; + } else { + group->tc_a = -1; + group->tc_b = -1; + } + } + + return 0; + + err: + while (--i >= 0) { + group = idxd->groups[i]; + put_device(group_confdev(group)); + } + return rc; +} + +static void idxd_cleanup_internals(struct idxd_device *idxd) +{ + int i; + + for (i = 0; 
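/*
 * A note on the unwind convention used by the setup helpers above: once
 * device_initialize() has run on a conf_dev, the object is owned by the
 * driver core and must be released with put_device() (which ends up in
 * the device type's ->release() callback to free the memory), never
 * kfree() directly. That is why every error path here walks the
 * already-initialized entries with put_device().
 */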
i < idxd->max_groups; i++) + put_device(group_confdev(idxd->groups[i])); + for (i = 0; i < idxd->max_engines; i++) + put_device(engine_confdev(idxd->engines[i])); + for (i = 0; i < idxd->max_wqs; i++) + put_device(wq_confdev(idxd->wqs[i])); + destroy_workqueue(idxd->wq); +} + +static int idxd_setup_internals(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int rc, i; + + init_waitqueue_head(&idxd->cmd_waitq); + + rc = idxd_setup_wqs(idxd); + if (rc < 0) + goto err_wqs; + + rc = idxd_setup_engines(idxd); + if (rc < 0) + goto err_engine; + + rc = idxd_setup_groups(idxd); + if (rc < 0) + goto err_group; + + idxd->wq = create_workqueue(dev_name(dev)); + if (!idxd->wq) { + rc = -ENOMEM; + goto err_wkq_create; + } + + return 0; + + err_wkq_create: + for (i = 0; i < idxd->max_groups; i++) + put_device(group_confdev(idxd->groups[i])); + err_group: + for (i = 0; i < idxd->max_engines; i++) + put_device(engine_confdev(idxd->engines[i])); + err_engine: + for (i = 0; i < idxd->max_wqs; i++) + put_device(wq_confdev(idxd->wqs[i])); + err_wqs: + return rc; +} + +static void idxd_read_table_offsets(struct idxd_device *idxd) +{ + union offsets_reg offsets; + struct device *dev = &idxd->pdev->dev; + + offsets.bits[0] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET); + offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET + sizeof(u64)); + idxd->grpcfg_offset = offsets.grpcfg * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD Group Config Offset: %#x\n", idxd->grpcfg_offset); + idxd->wqcfg_offset = offsets.wqcfg * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n", idxd->wqcfg_offset); + idxd->msix_perm_offset = offsets.msix_perm * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", idxd->msix_perm_offset); + idxd->perfmon_offset = offsets.perfmon * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset); + idxd->ims_offset = offsets.ims * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD IMS Offset: %#x\n", idxd->ims_offset); +} + +static void idxd_check_ims(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + /* verify that we have IMS vectors supported by device */ + if (idxd->hw.gen_cap.max_ims_mult) { + idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL; + dev_dbg(&pdev->dev, "IMS size: %u\n", idxd->ims_size); + set_bit(IDXD_FLAG_IMS_SUPPORTED, &idxd->flags); + dev_dbg(&pdev->dev, "IMS supported for device\n"); + return; + } + + dev_dbg(&pdev->dev, "IMS unsupported for device\n"); +} + +static void idxd_read_caps(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i; + + /* reading generic capabilities */ + idxd->hw.gen_cap.bits = ioread64(idxd->reg_base + IDXD_GENCAP_OFFSET); + dev_dbg(dev, "gen_cap: %#llx\n", idxd->hw.gen_cap.bits); + + if (idxd->hw.gen_cap.cmd_cap) { + idxd->hw.cmd_cap = ioread32(idxd->reg_base + IDXD_CMDCAP_OFFSET); + dev_dbg(dev, "cmd_cap: %#x\n", idxd->hw.cmd_cap); + } + + /* reading command capabilities */ + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) + idxd->request_int_handles = true; + + idxd->max_xfer_bytes = 1ULL << idxd->hw.gen_cap.max_xfer_shift; + dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes); + idxd_set_max_batch_size(idxd->data->type, idxd, 1U << idxd->hw.gen_cap.max_batch_shift); + dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size); + idxd_check_ims(idxd); + if (idxd->hw.gen_cap.config_en) + set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags); + + /* reading group capabilities */ + idxd->hw.group_cap.bits = + 
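/*
 * Worked example of the decoding above, with illustrative values only:
 * a table-offset field of grpcfg == 4 yields grpcfg_offset ==
 * 4 * IDXD_TABLE_MULT (0x100) == 0x400 from BAR0; gen_cap.max_xfer_shift
 * == 20 yields max_xfer_bytes == 1ULL << 20 == 1 MiB; and
 * max_batch_shift == 10 allows batches of up to 1024 descriptors.
 */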
ioread64(idxd->reg_base + IDXD_GRPCAP_OFFSET); + dev_dbg(dev, "group_cap: %#llx\n", idxd->hw.group_cap.bits); + idxd->max_groups = idxd->hw.group_cap.num_groups; + dev_dbg(dev, "max groups: %u\n", idxd->max_groups); + idxd->max_rdbufs = idxd->hw.group_cap.total_rdbufs; + dev_dbg(dev, "max read buffers: %u\n", idxd->max_rdbufs); + idxd->nr_rdbufs = idxd->max_rdbufs; + + /* read engine capabilities */ + idxd->hw.engine_cap.bits = + ioread64(idxd->reg_base + IDXD_ENGCAP_OFFSET); + dev_dbg(dev, "engine_cap: %#llx\n", idxd->hw.engine_cap.bits); + idxd->max_engines = idxd->hw.engine_cap.num_engines; + dev_dbg(dev, "max engines: %u\n", idxd->max_engines); + + /* read workqueue capabilities */ + idxd->hw.wq_cap.bits = ioread64(idxd->reg_base + IDXD_WQCAP_OFFSET); + dev_dbg(dev, "wq_cap: %#llx\n", idxd->hw.wq_cap.bits); + idxd->max_wq_size = idxd->hw.wq_cap.total_wq_size; + dev_dbg(dev, "total workqueue size: %u\n", idxd->max_wq_size); + idxd->max_wqs = idxd->hw.wq_cap.num_wqs; + dev_dbg(dev, "max workqueues: %u\n", idxd->max_wqs); + idxd->wqcfg_size = 1 << (idxd->hw.wq_cap.wqcfg_size + IDXD_WQCFG_MIN); + dev_dbg(dev, "wqcfg size: %u\n", idxd->wqcfg_size); + + /* reading operation capabilities */ + for (i = 0; i < 4; i++) { + idxd->hw.opcap.bits[i] = ioread64(idxd->reg_base + + IDXD_OPCAP_OFFSET + i * sizeof(u64)); + dev_dbg(dev, "opcap[%d]: %#llx\n", i, idxd->hw.opcap.bits[i]); + } +} + +static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data) +{ + struct device *dev = &pdev->dev; + struct device *conf_dev; + struct idxd_device *idxd; + int rc; + + idxd = kzalloc_node(sizeof(*idxd), GFP_KERNEL, dev_to_node(dev)); + if (!idxd) + return NULL; + + conf_dev = idxd_confdev(idxd); + idxd->pdev = pdev; + idxd->data = data; + idxd_dev_set_type(&idxd->idxd_dev, idxd->data->type); + idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); + if (idxd->id < 0) + return NULL; + + device_initialize(conf_dev); + conf_dev->parent = dev; + conf_dev->bus = &dsa_bus_type; + conf_dev->type = idxd->data->dev_type; + rc = dev_set_name(conf_dev, "%s%d", idxd->data->name_prefix, idxd->id); + if (rc < 0) { + put_device(conf_dev); + return NULL; + } + + spin_lock_init(&idxd->dev_lock); + spin_lock_init(&idxd->cmd_lock); + + return idxd; +} + +static int idxd_enable_system_pasid(struct idxd_device *idxd) +{ + u32 pasid; + + pasid = iommu_enable_pasid_dma(&idxd->pdev->dev); + if (pasid == INVALID_IOASID) { + dev_err(&idxd->pdev->dev, "No DMA PASID.\n"); + return -ENXIO; + } + idxd->pasid = pasid; + + return 0; +} + +static void idxd_disable_system_pasid(struct idxd_device *idxd) +{ + iommu_disable_pasid_dma(&idxd->pdev->dev); +} + +static int idxd_probe(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + struct device *dev = &pdev->dev; + int rc; + + dev_dbg(dev, "%s entered and resetting device\n", __func__); + rc = idxd_device_init_reset(idxd); + if (rc < 0) + return rc; + + dev_dbg(dev, "IDXD reset complete\n"); + + if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) { + rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA); + if (rc) + dev_warn(dev, "Failed to enable user PASID.\n"); + else + set_bit(IDXD_FLAG_USER_PASID_ENABLED, &idxd->flags); + rc = idxd_enable_system_pasid(idxd); + if (rc < 0) + dev_warn(dev, "Failed to enable PASID. 
No SVA support: %d\n", rc);
+		else
+			set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags);
+	} else {
+		dev_warn(dev, "Unable to turn on SVA feature.\n");
+	}
+
+	idxd_read_caps(idxd);
+	idxd_read_table_offsets(idxd);
+
+	rc = idxd_setup_internals(idxd);
+	if (rc)
+		goto err;
+
+	/* If the configs are readonly, then load them from the device */
+	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) {
+		dev_dbg(dev, "Loading RO device config\n");
+		rc = idxd_device_load_config(idxd);
+		if (rc < 0)
+			goto err_config;
+	}
+
+	rc = idxd_setup_interrupts(idxd);
+	if (rc)
+		goto err_config;
+
+	idxd->major = idxd_cdev_get_major(idxd);
+
+	rc = perfmon_pmu_init(idxd);
+	if (rc < 0)
+		dev_warn(dev, "Failed to initialize perfmon. No PMU support: %d\n", rc);
+
+	dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id);
+	return 0;
+
+ err_config:
+	idxd_cleanup_internals(idxd);
+ err:
+	if (device_pasid_enabled(idxd))
+		idxd_disable_system_pasid(idxd);
+	if (device_user_pasid_enabled(idxd))
+		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
+	return rc;
+}
+
+static void idxd_cleanup(struct idxd_device *idxd)
+{
+	struct device *dev = &idxd->pdev->dev;
+
+	perfmon_pmu_remove(idxd);
+	idxd_cleanup_interrupts(idxd);
+	idxd_cleanup_internals(idxd);
+	if (device_pasid_enabled(idxd))
+		idxd_disable_system_pasid(idxd);
+	if (device_user_pasid_enabled(idxd))
+		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
+}
+
+static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	struct idxd_device *idxd;
+	struct idxd_driver_data *data = (struct idxd_driver_data *)id->driver_data;
+	int rc;
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		return rc;
+
+	dev_dbg(dev, "Alloc IDXD context\n");
+	idxd = idxd_alloc(pdev, data);
+	if (!idxd) {
+		rc = -ENOMEM;
+		goto err_idxd_alloc;
+	}
+
+	dev_dbg(dev, "Mapping BARs\n");
+	idxd->reg_base = pci_iomap(pdev, IDXD_MMIO_BAR, 0);
+	if (!idxd->reg_base) {
+		rc = -ENOMEM;
+		goto err_iomap;
+	}
+
+	idxd->portal_base = pcim_iomap(pdev, IDXD_WQ_BAR, 0);
+	if (!idxd->portal_base) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	dev_dbg(dev, "Set DMA masks\n");
+	rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+	if (rc)
+		goto err;
+
+	dev_dbg(dev, "Set PCI master\n");
+	pci_set_master(pdev);
+	pci_set_drvdata(pdev, idxd);
+
+	idxd->hw.version = ioread32(idxd->reg_base + IDXD_VER_OFFSET);
+	rc = idxd_probe(idxd);
+	if (rc) {
+		dev_err(dev, "Intel(R) IDXD DMA Engine init failed\n");
+		goto err;
+	}
+
+	rc = idxd_register_devices(idxd);
+	if (rc) {
+		dev_err(dev, "IDXD sysfs setup failed\n");
+		goto err_dev_register;
+	}
+
+	dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n",
+		 idxd->hw.version);
+
+	return 0;
+
+ err_dev_register:
+	idxd_cleanup(idxd);
+ err:
+	pci_iounmap(pdev, idxd->reg_base);
+ err_iomap:
+	put_device(idxd_confdev(idxd));
+ err_idxd_alloc:
+	pci_disable_device(pdev);
+	return rc;
+}
+
+void idxd_wqs_quiesce(struct idxd_device *idxd)
+{
+	struct idxd_wq *wq;
+	int i;
+
+	for (i = 0; i < idxd->max_wqs; i++) {
+		wq = idxd->wqs[i];
+		if (wq->state == IDXD_WQ_ENABLED && wq->type == IDXD_WQT_KERNEL)
+			idxd_wq_quiesce(wq);
+	}
+}
+
+static void idxd_shutdown(struct pci_dev *pdev)
+{
+	struct idxd_device *idxd = pci_get_drvdata(pdev);
+	struct idxd_irq_entry *irq_entry;
+	int rc;
+
+	rc = idxd_device_disable(idxd);
+	if (rc)
+		dev_err(&pdev->dev, "Disabling device failed\n");
+
+	irq_entry = &idxd->ie;
+	synchronize_irq(irq_entry->vector);
+	idxd_mask_error_interrupts(idxd);
+	flush_workqueue(idxd->wq);
+}
+
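/*
 * The ordering in idxd_shutdown() above is deliberate: disable the
 * device first so it stops generating completions, then synchronize and
 * mask the misc vector so no handler is mid-flight, and finally flush
 * the driver workqueue so deferred work (descriptor resubmits, interrupt
 * handle revoke handling) has drained before the PCI core tears the
 * function down.
 */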
+static void idxd_remove(struct pci_dev *pdev)
+{
+	struct idxd_device *idxd = pci_get_drvdata(pdev);
+	struct idxd_irq_entry *irq_entry;
+
+	idxd_unregister_devices(idxd);
+	/*
+	 * When ->release() is called for the idxd->conf_dev, it frees all the memory related
+	 * to the idxd context. The driver still needs those bits in order to do the rest of
+	 * the cleanup. However, we do need to unbind the idxd sub-driver. So take a ref
+	 * on the device here to hold off the freeing while allowing the idxd sub-driver
+	 * to unbind.
+	 */
+	get_device(idxd_confdev(idxd));
+	device_unregister(idxd_confdev(idxd));
+	idxd_shutdown(pdev);
+	if (device_pasid_enabled(idxd))
+		idxd_disable_system_pasid(idxd);
+
+	irq_entry = idxd_get_ie(idxd, 0);
+	free_irq(irq_entry->vector, irq_entry);
+	pci_free_irq_vectors(pdev);
+	pci_iounmap(pdev, idxd->reg_base);
+	if (device_user_pasid_enabled(idxd))
+		iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
+	pci_disable_device(pdev);
+	destroy_workqueue(idxd->wq);
+	perfmon_pmu_remove(idxd);
+	put_device(idxd_confdev(idxd));
+}
+
+static struct pci_driver idxd_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = idxd_pci_tbl,
+	.probe = idxd_pci_probe,
+	.remove = idxd_remove,
+	.shutdown = idxd_shutdown,
+};
+
+static int __init idxd_init_module(void)
+{
+	int err;
+
+	/*
+	 * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in
+	 * enumerating the device. We cannot utilize it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+		pr_warn("idxd driver failed to load without MOVDIR64B.\n");
+		return -ENODEV;
+	}
+
+	if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+		pr_warn("Platform does not have ENQCMD(S) support.\n");
+	else
+		support_enqcmd = true;
+
+	perfmon_init();
+
+	err = idxd_driver_register(&idxd_drv);
+	if (err < 0)
+		goto err_idxd_driver_register;
+
+	err = idxd_driver_register(&idxd_dmaengine_drv);
+	if (err < 0)
+		goto err_idxd_dmaengine_driver_register;
+
+	err = idxd_driver_register(&idxd_user_drv);
+	if (err < 0)
+		goto err_idxd_user_driver_register;
+
+	err = idxd_cdev_register();
+	if (err)
+		goto err_cdev_register;
+
+	err = pci_register_driver(&idxd_pci_driver);
+	if (err)
+		goto err_pci_register;
+
+	return 0;
+
+err_pci_register:
+	idxd_cdev_remove();
+err_cdev_register:
+	idxd_driver_unregister(&idxd_user_drv);
+err_idxd_user_driver_register:
+	idxd_driver_unregister(&idxd_dmaengine_drv);
+err_idxd_dmaengine_driver_register:
+	idxd_driver_unregister(&idxd_drv);
+err_idxd_driver_register:
+	return err;
+}
+module_init(idxd_init_module);
+
+static void __exit idxd_exit_module(void)
+{
+	idxd_driver_unregister(&idxd_user_drv);
+	idxd_driver_unregister(&idxd_dmaengine_drv);
+	idxd_driver_unregister(&idxd_drv);
+	pci_unregister_driver(&idxd_pci_driver);
+	idxd_cdev_remove();
+	perfmon_exit();
+}
+module_exit(idxd_exit_module);
diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
new file mode 100644
index 0000000000000000000000000000000000000000..eb0174ee409d2ef786cf8171c6e2d2bd7252d0d9
--- /dev/null
+++ b/drivers/dma/idxd/irq.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2019 Intel Corporation. All rights rsvd.
*/
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <uapi/linux/idxd.h>
+#include "../dmaengine.h"
+#include "idxd.h"
+#include "registers.h"
+
+enum irq_work_type {
+	IRQ_WORK_NORMAL = 0,
+	IRQ_WORK_PROCESS_FAULT,
+};
+
+struct idxd_fault {
+	struct work_struct work;
+	u64 addr;
+	struct idxd_device *idxd;
+};
+
+struct idxd_resubmit {
+	struct work_struct work;
+	struct idxd_desc *desc;
+};
+
+struct idxd_int_handle_revoke {
+	struct work_struct work;
+	struct idxd_device *idxd;
+};
+
+static void idxd_device_reinit(struct work_struct *work)
+{
+	struct idxd_device *idxd = container_of(work, struct idxd_device, work);
+	struct device *dev = &idxd->pdev->dev;
+	int rc, i;
+
+	idxd_device_reset(idxd);
+	rc = idxd_device_config(idxd);
+	if (rc < 0)
+		goto out;
+
+	rc = idxd_device_enable(idxd);
+	if (rc < 0)
+		goto out;
+
+	for (i = 0; i < idxd->max_wqs; i++) {
+		struct idxd_wq *wq = idxd->wqs[i];
+
+		if (wq->state == IDXD_WQ_ENABLED) {
+			rc = idxd_wq_enable(wq, NULL);
+			if (rc < 0) {
+				dev_warn(dev, "Unable to re-enable wq %s\n",
+					 dev_name(wq_confdev(wq)));
+			}
+		}
+	}
+
+	return;
+
+ out:
+	idxd_device_clear_state(idxd);
+}
+
+/*
+ * The function sends a drain descriptor for the interrupt handle. The drain
+ * ensures that all descriptors with this interrupt handle are flushed and the
+ * interrupt will allow the cleanup of the outstanding descriptors.
+ */
+static void idxd_int_handle_revoke_drain(struct idxd_irq_entry *ie)
+{
+	struct idxd_wq *wq = ie_to_wq(ie);
+	struct idxd_device *idxd = wq->idxd;
+	struct device *dev = &idxd->pdev->dev;
+	struct dsa_hw_desc desc = {};
+	void __iomem *portal;
+	int rc;
+
+	/* Issue a simple drain operation with interrupt but no completion record */
+	desc.flags = IDXD_OP_FLAG_RCI;
+	desc.opcode = DSA_OPCODE_DRAIN;
+	desc.priv = 1;
+
+	if (ie->pasid != INVALID_IOASID)
+		desc.pasid = ie->pasid;
+	desc.int_handle = ie->int_handle;
+	portal = idxd_wq_portal_addr(wq);
+
+	/*
+	 * The wmb() makes sure that the descriptor is all there before we
+	 * issue.
+	 */
+	wmb();
+	if (wq_dedicated(wq)) {
+		iosubmit_cmds512(portal, &desc, 1);
+	} else {
+		rc = idxd_enqcmds(wq, portal, &desc);
+		/* This should not fail unless hardware failed. */
+		if (rc < 0)
+			dev_warn(dev, "Failed to submit drain desc on wq %d\n", wq->id);
+	}
+}
+
+static void idxd_abort_invalid_int_handle_descs(struct idxd_irq_entry *ie)
+{
+	LIST_HEAD(flist);
+	struct idxd_desc *d, *t;
+	struct llist_node *head;
+
+	spin_lock(&ie->list_lock);
+	head = llist_del_all(&ie->pending_llist);
+	if (head) {
+		llist_for_each_entry_safe(d, t, head, llnode)
+			list_add_tail(&d->list, &ie->work_list);
+	}
+
+	list_for_each_entry_safe(d, t, &ie->work_list, list) {
+		if (d->completion->status == DSA_COMP_INT_HANDLE_INVAL)
+			list_move_tail(&d->list, &flist);
+	}
+	spin_unlock(&ie->list_lock);
+
+	list_for_each_entry_safe(d, t, &flist, list) {
+		list_del(&d->list);
+		idxd_dma_complete_txd(d, IDXD_COMPLETE_ABORT, true);
+	}
+}
+
+static void idxd_int_handle_revoke(struct work_struct *work)
+{
+	struct idxd_int_handle_revoke *revoke =
+		container_of(work, struct idxd_int_handle_revoke, work);
+	struct idxd_device *idxd = revoke->idxd;
+	struct pci_dev *pdev = idxd->pdev;
+	struct device *dev = &pdev->dev;
+	int i, new_handle, rc;
+
+	if (!idxd->request_int_handles) {
+		kfree(revoke);
+		dev_warn(dev, "Unexpected int handle refresh interrupt.\n");
+		return;
+	}
+
+	/*
+	 * The loop attempts to acquire a new interrupt handle for all interrupt
+	 * vectors that support a handle.
If a new interrupt handle is acquired and the
+	 * wq is kernel type, the driver will kill the percpu_ref to pause all
+	 * ongoing descriptor submissions. The interrupt handle is then changed.
+	 * After the change, the percpu_ref is revived and all the pending
+	 * submissions are woken to try again. A drain is sent for the interrupt
+	 * handle at the end to make sure all invalid int handle descriptors are
+	 * processed.
+	 */
+	for (i = 1; i < idxd->irq_cnt; i++) {
+		struct idxd_irq_entry *ie = idxd_get_ie(idxd, i);
+		struct idxd_wq *wq = ie_to_wq(ie);
+
+		if (ie->int_handle == INVALID_INT_HANDLE)
+			continue;
+
+		rc = idxd_device_request_int_handle(idxd, i, &new_handle, IDXD_IRQ_MSIX);
+		if (rc < 0) {
+			dev_warn(dev, "get int handle %d failed: %d\n", i, rc);
+			/*
+			 * Failed to acquire a new interrupt handle. Kill the WQ
+			 * and release all the pending submitters. The submitters
+			 * will get an error return code and handle it
+			 * appropriately.
+			 */
+			ie->int_handle = INVALID_INT_HANDLE;
+			idxd_wq_quiesce(wq);
+			idxd_abort_invalid_int_handle_descs(ie);
+			continue;
+		}
+
+		/* No change in interrupt handle, nothing needs to be done */
+		if (ie->int_handle == new_handle)
+			continue;
+
+		if (wq->state != IDXD_WQ_ENABLED || wq->type != IDXD_WQT_KERNEL) {
+			/*
+			 * All the MSI-X interrupts are allocated at once during probe.
+			 * Therefore we need to update all interrupts even if the WQ
+			 * isn't supporting interrupt operations.
+			 */
+			ie->int_handle = new_handle;
+			continue;
+		}
+
+		mutex_lock(&wq->wq_lock);
+		reinit_completion(&wq->wq_resurrect);
+
+		/* Kill percpu_ref to pause additional descriptor submissions */
+		percpu_ref_kill(&wq->wq_active);
+
+		/* Wait for all submitters to quiesce before we change the interrupt handle */
+		wait_for_completion(&wq->wq_dead);
+
+		ie->int_handle = new_handle;
+
+		/* Revive percpu ref and wake up all the waiting submitters */
+		percpu_ref_reinit(&wq->wq_active);
+		complete_all(&wq->wq_resurrect);
+		mutex_unlock(&wq->wq_lock);
+
+		/*
+		 * The delay here is to wait for all possible MOVDIR64B that
+		 * are issued before percpu_ref_kill() has happened to have
+		 * reached the PCIe domain before the drain is issued. The driver
+		 * needs to ensure that the drain descriptor issued does not pass
+		 * all the other issued descriptors that contain the invalid
+		 * interrupt handle in order to ensure that the drain descriptor
+		 * interrupt will allow the cleanup of all the descriptors with
+		 * an invalid interrupt handle.
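	 *
	 * Recap of the per-wq revoke sequence used above (kernel wqs only):
	 *
	 *	percpu_ref_kill(&wq->wq_active);	// block new submitters
	 *	wait_for_completion(&wq->wq_dead);	// wait out in-flight ones
	 *	ie->int_handle = new_handle;		// swap the handle
	 *	percpu_ref_reinit(&wq->wq_active);	// reopen for submission
	 *	complete_all(&wq->wq_resurrect);	// wake blocked submitters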
+ */ + if (wq_dedicated(wq)) + udelay(100); + idxd_int_handle_revoke_drain(ie); + } + kfree(revoke); +} + +static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) +{ + struct device *dev = &idxd->pdev->dev; + union gensts_reg gensts; + u32 val = 0; + int i; + bool err = false; + + if (cause & IDXD_INTC_HALT_STATE) + goto halt; + + if (cause & IDXD_INTC_ERR) { + spin_lock(&idxd->dev_lock); + for (i = 0; i < 4; i++) + idxd->sw_err.bits[i] = ioread64(idxd->reg_base + + IDXD_SWERR_OFFSET + i * sizeof(u64)); + + iowrite64(idxd->sw_err.bits[0] & IDXD_SWERR_ACK, + idxd->reg_base + IDXD_SWERR_OFFSET); + + if (idxd->sw_err.valid && idxd->sw_err.wq_idx_valid) { + int id = idxd->sw_err.wq_idx; + struct idxd_wq *wq = idxd->wqs[id]; + + if (wq->type == IDXD_WQT_USER) + wake_up_interruptible(&wq->err_queue); + } else { + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (wq->type == IDXD_WQT_USER) + wake_up_interruptible(&wq->err_queue); + } + } + + spin_unlock(&idxd->dev_lock); + val |= IDXD_INTC_ERR; + + for (i = 0; i < 4; i++) + dev_warn(dev, "err[%d]: %#16.16llx\n", + i, idxd->sw_err.bits[i]); + err = true; + } + + if (cause & IDXD_INTC_INT_HANDLE_REVOKED) { + struct idxd_int_handle_revoke *revoke; + + val |= IDXD_INTC_INT_HANDLE_REVOKED; + + revoke = kzalloc(sizeof(*revoke), GFP_ATOMIC); + if (revoke) { + revoke->idxd = idxd; + INIT_WORK(&revoke->work, idxd_int_handle_revoke); + queue_work(idxd->wq, &revoke->work); + + } else { + dev_err(dev, "Failed to allocate work for int handle revoke\n"); + idxd_wqs_quiesce(idxd); + } + } + + if (cause & IDXD_INTC_CMD) { + val |= IDXD_INTC_CMD; + complete(idxd->cmd_done); + } + + if (cause & IDXD_INTC_OCCUPY) { + /* Driver does not utilize occupancy interrupt */ + val |= IDXD_INTC_OCCUPY; + } + + if (cause & IDXD_INTC_PERFMON_OVFL) { + val |= IDXD_INTC_PERFMON_OVFL; + perfmon_counter_overflow(idxd); + } + + val ^= cause; + if (val) + dev_warn_once(dev, "Unexpected interrupt cause bits set: %#x\n", + val); + + if (!err) + return 0; + +halt: + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); + if (gensts.state == IDXD_DEVICE_STATE_HALT) { + idxd->state = IDXD_DEV_HALTED; + if (gensts.reset_type == IDXD_DEVICE_RESET_SOFTWARE) { + /* + * If we need a software reset, we will throw the work + * on a system workqueue in order to allow interrupts + * for the device command completions. + */ + INIT_WORK(&idxd->work, idxd_device_reinit); + queue_work(idxd->wq, &idxd->work); + } else { + idxd->state = IDXD_DEV_HALTED; + idxd_wqs_quiesce(idxd); + idxd_wqs_unmap_portal(idxd); + spin_lock(&idxd->dev_lock); + idxd_device_clear_state(idxd); + dev_err(&idxd->pdev->dev, + "idxd halted, need %s.\n", + gensts.reset_type == IDXD_DEVICE_RESET_FLR ? 
+ "FLR" : "system reset"); + spin_unlock(&idxd->dev_lock); + return -ENXIO; + } + } + + return 0; +} + +irqreturn_t idxd_misc_thread(int vec, void *data) +{ + struct idxd_irq_entry *irq_entry = data; + struct idxd_device *idxd = ie_to_idxd(irq_entry); + int rc; + u32 cause; + + cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + if (cause) + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); + + while (cause) { + rc = process_misc_interrupts(idxd, cause); + if (rc < 0) + break; + cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET); + if (cause) + iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); + } + + return IRQ_HANDLED; +} + +static void idxd_int_handle_resubmit_work(struct work_struct *work) +{ + struct idxd_resubmit *irw = container_of(work, struct idxd_resubmit, work); + struct idxd_desc *desc = irw->desc; + struct idxd_wq *wq = desc->wq; + int rc; + + desc->completion->status = 0; + rc = idxd_submit_desc(wq, desc); + if (rc < 0) { + dev_dbg(&wq->idxd->pdev->dev, "Failed to resubmit desc %d to wq %d.\n", + desc->id, wq->id); + /* + * If the error is not -EAGAIN, it means the submission failed due to wq + * has been killed instead of ENQCMDS failure. Here the driver needs to + * notify the submitter of the failure by reporting abort status. + * + * -EAGAIN comes from ENQCMDS failure. idxd_submit_desc() will handle the + * abort. + */ + if (rc != -EAGAIN) { + desc->completion->status = IDXD_COMP_DESC_ABORT; + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, false); + } + idxd_free_desc(wq, desc); + } + kfree(irw); +} + +bool idxd_queue_int_handle_resubmit(struct idxd_desc *desc) +{ + struct idxd_wq *wq = desc->wq; + struct idxd_device *idxd = wq->idxd; + struct idxd_resubmit *irw; + + irw = kzalloc(sizeof(*irw), GFP_KERNEL); + if (!irw) + return false; + + irw->desc = desc; + INIT_WORK(&irw->work, idxd_int_handle_resubmit_work); + queue_work(idxd->wq, &irw->work); + return true; +} + +static void irq_process_pending_llist(struct idxd_irq_entry *irq_entry) +{ + struct idxd_desc *desc, *t; + struct llist_node *head; + + head = llist_del_all(&irq_entry->pending_llist); + if (!head) + return; + + llist_for_each_entry_safe(desc, t, head, llnode) { + u8 status = desc->completion->status & DSA_COMP_STATUS_MASK; + + if (status) { + /* + * Check against the original status as ABORT is software defined + * and 0xff, which DSA_COMP_STATUS_MASK can mask out. + */ + if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true); + continue; + } + + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL, true); + } else { + spin_lock(&irq_entry->list_lock); + list_add_tail(&desc->list, + &irq_entry->work_list); + spin_unlock(&irq_entry->list_lock); + } + } +} + +static void irq_process_work_list(struct idxd_irq_entry *irq_entry) +{ + LIST_HEAD(flist); + struct idxd_desc *desc, *n; + + /* + * This lock protects list corruption from access of list outside of the irq handler + * thread. + */ + spin_lock(&irq_entry->list_lock); + if (list_empty(&irq_entry->work_list)) { + spin_unlock(&irq_entry->list_lock); + return; + } + + list_for_each_entry_safe(desc, n, &irq_entry->work_list, list) { + if (desc->completion->status) { + list_move_tail(&desc->list, &flist); + } + } + + spin_unlock(&irq_entry->list_lock); + + list_for_each_entry(desc, &flist, list) { + /* + * Check against the original status as ABORT is software defined + * and 0xff, which DSA_COMP_STATUS_MASK can mask out. 
+		 */
+		if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) {
+			idxd_dma_complete_txd(desc, IDXD_COMPLETE_ABORT, true);
+			continue;
+		}
+
+		idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL, true);
+	}
+}
+
+irqreturn_t idxd_wq_thread(int irq, void *data)
+{
+	struct idxd_irq_entry *irq_entry = data;
+
+	/*
+	 * There are two lists we are processing. The pending_llist is where
+	 * the submitter adds all the submitted descriptors after sending them
+	 * to the workqueue. It's a lockless singly linked list. The work_list
+	 * is the common linux doubly linked list. We are in a scenario of
+	 * multiple producers and a single consumer. The producers are all
+	 * the kernel submitters of descriptors, and the consumer is the
+	 * kernel irq handler thread for the msix vector when using threaded
+	 * irq. To work with the restrictions of llist to remain lockless,
+	 * we are doing the following steps:
+	 * 1. Iterate through the work_list and process any completed
+	 *    descriptor. Delete the completed entries during iteration.
+	 * 2. llist_del_all() from the pending list.
+	 * 3. Iterate through the llist that was deleted from the pending list
+	 *    and process the completed entries.
+	 * 4. If the entry is still waiting on hardware, list_add_tail() to
+	 *    the work_list.
+	 */
+	irq_process_work_list(irq_entry);
+	irq_process_pending_llist(irq_entry);
+
+	return IRQ_HANDLED;
+}
diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c
new file mode 100644
index 0000000000000000000000000000000000000000..d73004f47cf4b40f0cddf8a57907500884b69e29
--- /dev/null
+++ b/drivers/dma/idxd/perfmon.c
@@ -0,0 +1,662 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */
+
+#include <linux/sched/task.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include "idxd.h"
+#include "perfmon.h"
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+			    char *buf);
+
+static cpumask_t perfmon_dsa_cpu_mask;
+static bool cpuhp_set_up;
+static enum cpuhp_state cpuhp_slot;
+
+/*
+ * perf userspace reads this attribute to determine which cpus to open
+ * counters on. It's connected to perfmon_dsa_cpu_mask, which is
+ * maintained by the cpu hotplug handlers.
+ */
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *perfmon_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+	.attrs = perfmon_cpumask_attrs,
+};
+
+/*
+ * These attributes specify the bits in the config word that the perf
+ * syscall uses to pass the event ids and categories to perfmon.
+ */
+DEFINE_PERFMON_FORMAT_ATTR(event_category, "config:0-3");
+DEFINE_PERFMON_FORMAT_ATTR(event, "config:4-31");
+
+/*
+ * These attributes specify the bits in the config1 word that the perf
+ * syscall uses to pass filter data to perfmon.
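 *
 * With these format attributes a counter can be opened from userspace
 * along the lines of (event and filter values are illustrative only):
 *
 *	perf stat -e dsa0/event_category=0x1,event=0x2,filter_wq=0x1/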
+ */ +DEFINE_PERFMON_FORMAT_ATTR(filter_wq, "config1:0-31"); +DEFINE_PERFMON_FORMAT_ATTR(filter_tc, "config1:32-39"); +DEFINE_PERFMON_FORMAT_ATTR(filter_pgsz, "config1:40-43"); +DEFINE_PERFMON_FORMAT_ATTR(filter_sz, "config1:44-51"); +DEFINE_PERFMON_FORMAT_ATTR(filter_eng, "config1:52-59"); + +#define PERFMON_FILTERS_START 2 +#define PERFMON_FILTERS_MAX 5 + +static struct attribute *perfmon_format_attrs[] = { + &format_attr_idxd_event_category.attr, + &format_attr_idxd_event.attr, + &format_attr_idxd_filter_wq.attr, + &format_attr_idxd_filter_tc.attr, + &format_attr_idxd_filter_pgsz.attr, + &format_attr_idxd_filter_sz.attr, + &format_attr_idxd_filter_eng.attr, + NULL, +}; + +static struct attribute_group perfmon_format_attr_group = { + .name = "format", + .attrs = perfmon_format_attrs, +}; + +static const struct attribute_group *perfmon_attr_groups[] = { + &perfmon_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask); +} + +static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) +{ + return &idxd_pmu->pmu == event->pmu; +} + +static int perfmon_collect_events(struct idxd_pmu *idxd_pmu, + struct perf_event *leader, + bool do_grp) +{ + struct perf_event *event; + int n, max_count; + + max_count = idxd_pmu->n_counters; + n = idxd_pmu->n_events; + + if (n >= max_count) + return -EINVAL; + + if (is_idxd_event(idxd_pmu, leader)) { + idxd_pmu->event_list[n] = leader; + idxd_pmu->event_list[n]->hw.idx = n; + n++; + } + + if (!do_grp) + return n; + + for_each_sibling_event(event, leader) { + if (!is_idxd_event(idxd_pmu, event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + idxd_pmu->event_list[n] = event; + idxd_pmu->event_list[n]->hw.idx = n; + n++; + } + + return n; +} + +static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu, + struct perf_event *event, int idx) +{ + struct idxd_device *idxd = idxd_pmu->idxd; + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->config_base = ioread64(CNTRCFG_REG(idxd, idx)); + hwc->event_base = ioread64(CNTRCFG_REG(idxd, idx)); +} + +static int perfmon_assign_event(struct idxd_pmu *idxd_pmu, + struct perf_event *event) +{ + int i; + + for (i = 0; i < IDXD_PMU_EVENT_MAX; i++) + if (!test_and_set_bit(i, idxd_pmu->used_mask)) + return i; + + return -EINVAL; +} + +/* + * Check whether there are enough counters to satisfy that all the + * events in the group can actually be scheduled at the same time. + * + * To do this, create a fake idxd_pmu object so the event collection + * and assignment functions can be used without affecting the internal + * state of the real idxd_pmu object. 
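 *
 * Illustrative failure case: with n_counters == 4, a leader plus three
 * idxd siblings validates fine, but a fifth idxd event makes
 * perfmon_collect_events() hit max_count and the open fails with
 * -EINVAL, all without touching the live PMU state.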
+ */ +static int perfmon_validate_group(struct idxd_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct idxd_pmu *fake_pmu; + int i, ret = 0, n, idx; + + fake_pmu = kzalloc(sizeof(*fake_pmu), GFP_KERNEL); + if (!fake_pmu) + return -ENOMEM; + + fake_pmu->pmu.name = pmu->pmu.name; + fake_pmu->n_counters = pmu->n_counters; + + n = perfmon_collect_events(fake_pmu, leader, true); + if (n < 0) { + ret = n; + goto out; + } + + fake_pmu->n_events = n; + n = perfmon_collect_events(fake_pmu, event, false); + if (n < 0) { + ret = n; + goto out; + } + + fake_pmu->n_events = n; + + for (i = 0; i < n; i++) { + event = fake_pmu->event_list[i]; + + idx = perfmon_assign_event(fake_pmu, event); + if (idx < 0) { + ret = idx; + goto out; + } + } +out: + kfree(fake_pmu); + + return ret; +} + +static int perfmon_pmu_event_init(struct perf_event *event) +{ + struct idxd_device *idxd; + int ret = 0; + + idxd = event_to_idxd(event); + event->hw.idx = -1; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* sampling not supported */ + if (event->attr.sample_period) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + if (event->pmu != &idxd->idxd_pmu->pmu) + return -EINVAL; + + event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd)); + event->cpu = idxd->idxd_pmu->cpu; + event->hw.config = event->attr.config; + + if (event->group_leader != event) + /* non-group events have themselves as leader */ + ret = perfmon_validate_group(idxd->idxd_pmu, event); + + return ret; +} + +static inline u64 perfmon_pmu_read_counter(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct idxd_device *idxd; + int cntr = hwc->idx; + + idxd = event_to_idxd(event); + + return ioread64(CNTRDATA_REG(idxd, cntr)); +} + +static void perfmon_pmu_event_update(struct perf_event *event) +{ + struct idxd_device *idxd = event_to_idxd(event); + u64 prev_raw_count, new_raw_count, delta, p, n; + int shift = 64 - idxd->idxd_pmu->counter_width; + struct hw_perf_event *hwc = &event->hw; + + do { + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = perfmon_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count); + + n = (new_raw_count << shift); + p = (prev_raw_count << shift); + + delta = ((n - p) >> shift); + + local64_add(delta, &event->count); +} + +void perfmon_counter_overflow(struct idxd_device *idxd) +{ + int i, n_counters, max_loop = OVERFLOW_SIZE; + struct perf_event *event; + unsigned long ovfstatus; + + n_counters = min(idxd->idxd_pmu->n_counters, OVERFLOW_SIZE); + + ovfstatus = ioread32(OVFSTATUS_REG(idxd)); + + /* + * While updating overflowed counters, other counters behind + * them could overflow and be missed in a given pass. + * Normally this could happen at most n_counters times, but in + * theory a tiny counter width could result in continual + * overflows and endless looping. max_loop provides a + * failsafe in that highly unlikely case. + */ + while (ovfstatus && max_loop--) { + /* Figure out which counter(s) overflowed */ + for_each_set_bit(i, &ovfstatus, n_counters) { + unsigned long ovfstatus_clear = 0; + + /* Update event->count for overflowed counter */ + event = idxd->idxd_pmu->event_list[i]; + perfmon_pmu_event_update(event); + /* Writing 1 to OVFSTATUS bit clears it */ + set_bit(i, &ovfstatus_clear); + iowrite32(ovfstatus_clear, OVFSTATUS_REG(idxd)); + } + + ovfstatus = ioread32(OVFSTATUS_REG(idxd)); + } + + /* + * Should never happen. 
If so, it means a counter(s) looped + * around twice while this handler was running. + */ + WARN_ON_ONCE(ovfstatus); +} + +static inline void perfmon_reset_config(struct idxd_device *idxd) +{ + iowrite32(CONFIG_RESET, PERFRST_REG(idxd)); + iowrite32(0, OVFSTATUS_REG(idxd)); + iowrite32(0, PERFFRZ_REG(idxd)); +} + +static inline void perfmon_reset_counters(struct idxd_device *idxd) +{ + iowrite32(CNTR_RESET, PERFRST_REG(idxd)); +} + +static inline void perfmon_reset(struct idxd_device *idxd) +{ + perfmon_reset_config(idxd); + perfmon_reset_counters(idxd); +} + +static void perfmon_pmu_event_start(struct perf_event *event, int mode) +{ + u32 flt_wq, flt_tc, flt_pg_sz, flt_xfer_sz, flt_eng = 0; + u64 cntr_cfg, cntrdata, event_enc, event_cat = 0; + struct hw_perf_event *hwc = &event->hw; + union filter_cfg flt_cfg; + union event_cfg event_cfg; + struct idxd_device *idxd; + int cntr; + + idxd = event_to_idxd(event); + + event->hw.idx = hwc->idx; + cntr = hwc->idx; + + /* Obtain event category and event value from user space */ + event_cfg.val = event->attr.config; + flt_cfg.val = event->attr.config1; + event_cat = event_cfg.event_cat; + event_enc = event_cfg.event_enc; + + /* Obtain filter configuration from user space */ + flt_wq = flt_cfg.wq; + flt_tc = flt_cfg.tc; + flt_pg_sz = flt_cfg.pg_sz; + flt_xfer_sz = flt_cfg.xfer_sz; + flt_eng = flt_cfg.eng; + + if (flt_wq && test_bit(FLT_WQ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_wq, FLTCFG_REG(idxd, cntr, FLT_WQ)); + if (flt_tc && test_bit(FLT_TC, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_tc, FLTCFG_REG(idxd, cntr, FLT_TC)); + if (flt_pg_sz && test_bit(FLT_PG_SZ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_pg_sz, FLTCFG_REG(idxd, cntr, FLT_PG_SZ)); + if (flt_xfer_sz && test_bit(FLT_XFER_SZ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_xfer_sz, FLTCFG_REG(idxd, cntr, FLT_XFER_SZ)); + if (flt_eng && test_bit(FLT_ENG, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_eng, FLTCFG_REG(idxd, cntr, FLT_ENG)); + + /* Read the start value */ + cntrdata = ioread64(CNTRDATA_REG(idxd, cntr)); + local64_set(&event->hw.prev_count, cntrdata); + + /* Set counter to event/category */ + cntr_cfg = event_cat << CNTRCFG_CATEGORY_SHIFT; + cntr_cfg |= event_enc << CNTRCFG_EVENT_SHIFT; + /* Set interrupt on overflow and counter enable bits */ + cntr_cfg |= (CNTRCFG_IRQ_OVERFLOW | CNTRCFG_ENABLE); + + iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr)); +} + +static void perfmon_pmu_event_stop(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + struct idxd_device *idxd; + int i, cntr = hwc->idx; + u64 cntr_cfg; + + idxd = event_to_idxd(event); + + /* remove this event from event list */ + for (i = 0; i < idxd->idxd_pmu->n_events; i++) { + if (event != idxd->idxd_pmu->event_list[i]) + continue; + + for (++i; i < idxd->idxd_pmu->n_events; i++) + idxd->idxd_pmu->event_list[i - 1] = idxd->idxd_pmu->event_list[i]; + --idxd->idxd_pmu->n_events; + break; + } + + cntr_cfg = ioread64(CNTRCFG_REG(idxd, cntr)); + cntr_cfg &= ~CNTRCFG_ENABLE; + iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr)); + + if (mode == PERF_EF_UPDATE) + perfmon_pmu_event_update(event); + + event->hw.idx = -1; + clear_bit(cntr, idxd->idxd_pmu->used_mask); +} + +static void perfmon_pmu_event_del(struct perf_event *event, int mode) +{ + perfmon_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int perfmon_pmu_event_add(struct perf_event *event, int flags) +{ + struct idxd_device *idxd = event_to_idxd(event); + struct idxd_pmu *idxd_pmu = 
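/*
 * (On the update path above: the double shift in perfmon_pmu_event_update()
 * scales a narrow hardware counter into the top bits of a u64 so that
 * wraparound subtracts cleanly. Illustrative numbers for counter_width == 32,
 * i.e. shift == 32: prev == 0xffffffff and new == 0x1 give
 * delta == ((0x1 << 32) - (0xffffffff << 32)) >> 32 == 2, exactly the two
 * increments that occurred across the wrap.)
 */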
idxd->idxd_pmu;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx, n;
+
+	n = perfmon_collect_events(idxd_pmu, event, false);
+	if (n < 0)
+		return n;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_ARCH;
+
+	idx = perfmon_assign_event(idxd_pmu, event);
+	if (idx < 0)
+		return idx;
+
+	perfmon_assign_hw_event(idxd_pmu, event, idx);
+
+	if (flags & PERF_EF_START)
+		perfmon_pmu_event_start(event, 0);
+
+	idxd_pmu->n_events = n;
+
+	return 0;
+}
+
+static void enable_perfmon_pmu(struct idxd_device *idxd)
+{
+	iowrite32(COUNTER_UNFREEZE, PERFFRZ_REG(idxd));
+}
+
+static void disable_perfmon_pmu(struct idxd_device *idxd)
+{
+	iowrite32(COUNTER_FREEZE, PERFFRZ_REG(idxd));
+}
+
+static void perfmon_pmu_enable(struct pmu *pmu)
+{
+	struct idxd_device *idxd = pmu_to_idxd(pmu);
+
+	enable_perfmon_pmu(idxd);
+}
+
+static void perfmon_pmu_disable(struct pmu *pmu)
+{
+	struct idxd_device *idxd = pmu_to_idxd(pmu);
+
+	disable_perfmon_pmu(idxd);
+}
+
+static void skip_filter(int i)
+{
+	int j;
+
+	for (j = i; j < PERFMON_FILTERS_MAX; j++)
+		perfmon_format_attrs[PERFMON_FILTERS_START + j] =
+			perfmon_format_attrs[PERFMON_FILTERS_START + j + 1];
+}
+
+static void idxd_pmu_init(struct idxd_pmu *idxd_pmu)
+{
+	int i;
+
+	for (i = 0 ; i < PERFMON_FILTERS_MAX; i++) {
+		if (!test_bit(i, &idxd_pmu->supported_filters))
+			skip_filter(i);
+	}
+
+	idxd_pmu->pmu.name = idxd_pmu->name;
+	idxd_pmu->pmu.attr_groups = perfmon_attr_groups;
+	idxd_pmu->pmu.task_ctx_nr = perf_invalid_context;
+	idxd_pmu->pmu.event_init = perfmon_pmu_event_init;
+	idxd_pmu->pmu.pmu_enable = perfmon_pmu_enable;
+	idxd_pmu->pmu.pmu_disable = perfmon_pmu_disable;
+	idxd_pmu->pmu.add = perfmon_pmu_event_add;
+	idxd_pmu->pmu.del = perfmon_pmu_event_del;
+	idxd_pmu->pmu.start = perfmon_pmu_event_start;
+	idxd_pmu->pmu.stop = perfmon_pmu_event_stop;
+	idxd_pmu->pmu.read = perfmon_pmu_event_update;
+	idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+	idxd_pmu->pmu.module = THIS_MODULE;
+}
+
+void perfmon_pmu_remove(struct idxd_device *idxd)
+{
+	if (!idxd->idxd_pmu)
+		return;
+
+	cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node);
+	perf_pmu_unregister(&idxd->idxd_pmu->pmu);
+	kfree(idxd->idxd_pmu);
+	idxd->idxd_pmu = NULL;
+}
+
+static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct idxd_pmu *idxd_pmu;
+
+	idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
+
+	/* select the first online CPU as the designated reader */
+	if (cpumask_empty(&perfmon_dsa_cpu_mask)) {
+		cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask);
+		idxd_pmu->cpu = cpu;
+	}
+
+	return 0;
+}
+
+static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct idxd_pmu *idxd_pmu;
+	unsigned int target;
+
+	idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
+
+	if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask))
+		return 0;
+
+	target = cpumask_any_but(cpu_online_mask, cpu);
+
+	/* migrate events if there is a valid target */
+	if (target < nr_cpu_ids)
+		cpumask_set_cpu(target, &perfmon_dsa_cpu_mask);
+	else
+		target = -1;
+
+	perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target);
+
+	return 0;
+}
+
+int perfmon_pmu_init(struct idxd_device *idxd)
+{
+	union idxd_perfcap perfcap;
+	struct idxd_pmu *idxd_pmu;
+	int rc = -ENODEV;
+
+	/*
+	 * perfmon module initialization failed, nothing to do
+	 */
+	if (!cpuhp_set_up)
+		return -ENODEV;
+
+	/*
+	 * If perfmon_offset or num_counters is 0, it means perfmon is
+	 * not
supported on this hardware.
+	 */
+	if (idxd->perfmon_offset == 0)
+		return -ENODEV;
+
+	idxd_pmu = kzalloc(sizeof(*idxd_pmu), GFP_KERNEL);
+	if (!idxd_pmu)
+		return -ENOMEM;
+
+	idxd_pmu->idxd = idxd;
+	idxd->idxd_pmu = idxd_pmu;
+
+	if (idxd->data->type == IDXD_TYPE_DSA) {
+		rc = sprintf(idxd_pmu->name, "dsa%d", idxd->id);
+		if (rc < 0)
+			goto free;
+	} else if (idxd->data->type == IDXD_TYPE_IAX) {
+		rc = sprintf(idxd_pmu->name, "iax%d", idxd->id);
+		if (rc < 0)
+			goto free;
+	} else {
+		goto free;
+	}
+
+	perfmon_reset(idxd);
+
+	perfcap.bits = ioread64(PERFCAP_REG(idxd));
+
+	/*
+	 * If the total perf counter count is 0, stop further registration.
+	 * This is necessary in order to support the driver running on a
+	 * guest which does not have pmon support.
+	 */
+	if (perfcap.num_perf_counter == 0)
+		goto free;
+
+	/* A counter width of 0 means it can't count */
+	if (perfcap.counter_width == 0)
+		goto free;
+
+	/* Overflow interrupt and counter freeze support must be available */
+	if (!perfcap.overflow_interrupt || !perfcap.counter_freeze)
+		goto free;
+
+	/* Number of event categories cannot be 0 */
+	if (perfcap.num_event_category == 0)
+		goto free;
+
+	/*
+	 * We don't support per-counter capabilities for now.
+	 */
+	if (perfcap.cap_per_counter)
+		goto free;
+
+	idxd_pmu->n_event_categories = perfcap.num_event_category;
+	idxd_pmu->supported_event_categories = perfcap.global_event_category;
+	idxd_pmu->per_counter_caps_supported = perfcap.cap_per_counter;
+
+	/* check filter capability. If 0, then filters are not supported */
+	idxd_pmu->supported_filters = perfcap.filter;
+	if (perfcap.filter)
+		idxd_pmu->n_filters = hweight8(perfcap.filter);
+
+	/* Store the total number of counters, categories, and the counter width */
+	idxd_pmu->n_counters = perfcap.num_perf_counter;
+	idxd_pmu->counter_width = perfcap.counter_width;
+
+	idxd_pmu_init(idxd_pmu);
+
+	rc = perf_pmu_register(&idxd_pmu->pmu, idxd_pmu->name, -1);
+	if (rc)
+		goto free;
+
+	rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node);
+	if (rc) {
+		perf_pmu_unregister(&idxd->idxd_pmu->pmu);
+		goto free;
+	}
+out:
+	return rc;
+free:
+	kfree(idxd_pmu);
+	idxd->idxd_pmu = NULL;
+
+	goto out;
+}
+
+void __init perfmon_init(void)
+{
+	int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+					 "driver/dma/idxd/perf:online",
+					 perf_event_cpu_online,
+					 perf_event_cpu_offline);
+	if (WARN_ON(rc < 0))
+		return;
+
+	cpuhp_slot = rc;
+	cpuhp_set_up = true;
+}
+
+void __exit perfmon_exit(void)
+{
+	if (cpuhp_set_up)
+		cpuhp_remove_multi_state(cpuhp_slot);
+}
diff --git a/drivers/dma/idxd/perfmon.h b/drivers/dma/idxd/perfmon.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a081a1bc60587a3ee42ec9b63ad1e3921183f32
--- /dev/null
+++ b/drivers/dma/idxd/perfmon.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2020 Intel Corporation. All rights rsvd.
*/
+
+#ifndef _PERFMON_H_
+#define _PERFMON_H_
+
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/sbitmap.h>
+#include <linux/dmaengine.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/uuid.h>
+#include <linux/idxd.h>
+#include <linux/perf_event.h>
+#include "registers.h"
+
+static inline struct idxd_pmu *event_to_pmu(struct perf_event *event)
+{
+	struct idxd_pmu *idxd_pmu;
+	struct pmu *pmu;
+
+	pmu = event->pmu;
+	idxd_pmu = container_of(pmu, struct idxd_pmu, pmu);
+
+	return idxd_pmu;
+}
+
+static inline struct idxd_device *event_to_idxd(struct perf_event *event)
+{
+	struct idxd_pmu *idxd_pmu;
+	struct pmu *pmu;
+
+	pmu = event->pmu;
+	idxd_pmu = container_of(pmu, struct idxd_pmu, pmu);
+
+	return idxd_pmu->idxd;
+}
+
+static inline struct idxd_device *pmu_to_idxd(struct pmu *pmu)
+{
+	struct idxd_pmu *idxd_pmu;
+
+	idxd_pmu = container_of(pmu, struct idxd_pmu, pmu);
+
+	return idxd_pmu->idxd;
+}
+
+enum dsa_perf_events {
+	DSA_PERF_EVENT_WQ = 0,
+	DSA_PERF_EVENT_ENGINE,
+	DSA_PERF_EVENT_ADDR_TRANS,
+	DSA_PERF_EVENT_OP,
+	DSA_PERF_EVENT_COMPL,
+	DSA_PERF_EVENT_MAX,
+};
+
+enum filter_enc {
+	FLT_WQ = 0,
+	FLT_TC,
+	FLT_PG_SZ,
+	FLT_XFER_SZ,
+	FLT_ENG,
+	FLT_MAX,
+};
+
+#define CONFIG_RESET		0x0000000000000001
+#define CNTR_RESET		0x0000000000000002
+#define CNTR_ENABLE		0x0000000000000001
+#define INTR_OVFL		0x0000000000000002
+
+#define COUNTER_FREEZE		0x00000000FFFFFFFF
+#define COUNTER_UNFREEZE	0x0000000000000000
+#define OVERFLOW_SIZE		32
+
+#define CNTRCFG_ENABLE		BIT(0)
+#define CNTRCFG_IRQ_OVERFLOW	BIT(1)
+#define CNTRCFG_CATEGORY_SHIFT	8
+#define CNTRCFG_EVENT_SHIFT	32
+
+#define PERFMON_TABLE_OFFSET(_idxd)				\
+({								\
+	typeof(_idxd) __idxd = (_idxd);				\
+	((__idxd)->reg_base + (__idxd)->perfmon_offset);	\
+})
+#define PERFMON_REG_OFFSET(idxd, offset)	\
+	(PERFMON_TABLE_OFFSET(idxd) + (offset))
+
+#define PERFCAP_REG(idxd)	(PERFMON_REG_OFFSET(idxd, IDXD_PERFCAP_OFFSET))
+#define PERFRST_REG(idxd)	(PERFMON_REG_OFFSET(idxd, IDXD_PERFRST_OFFSET))
+#define OVFSTATUS_REG(idxd)	(PERFMON_REG_OFFSET(idxd, IDXD_OVFSTATUS_OFFSET))
+#define PERFFRZ_REG(idxd)	(PERFMON_REG_OFFSET(idxd, IDXD_PERFFRZ_OFFSET))
+
+#define FLTCFG_REG(idxd, cntr, flt)	\
+	(PERFMON_REG_OFFSET(idxd, IDXD_FLTCFG_OFFSET) + ((cntr) * 32) + ((flt) * 4))
+
+#define CNTRCFG_REG(idxd, cntr)		\
+	(PERFMON_REG_OFFSET(idxd, IDXD_CNTRCFG_OFFSET) + ((cntr) * 8))
+#define CNTRDATA_REG(idxd, cntr)	\
+	(PERFMON_REG_OFFSET(idxd, IDXD_CNTRDATA_OFFSET) + ((cntr) * 8))
+#define CNTRCAP_REG(idxd, cntr)		\
+	(PERFMON_REG_OFFSET(idxd, IDXD_CNTRCAP_OFFSET) + ((cntr) * 8))
+
+#define EVNTCAP_REG(idxd, category)	\
+	(PERFMON_REG_OFFSET(idxd, IDXD_EVNTCAP_OFFSET) + ((category) * 8))
+
+#define DEFINE_PERFMON_FORMAT_ATTR(_name, _format)			\
+static ssize_t __perfmon_idxd_##_name##_show(struct kobject *kobj,	\
+				struct kobj_attribute *attr,		\
+				char *page)				\
+{									\
+	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);			\
+	return sprintf(page, _format "\n");				\
+}									\
+static struct kobj_attribute format_attr_idxd_##_name =		\
+	__ATTR(_name, 0444, __perfmon_idxd_##_name##_show, NULL)
+
+#endif
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
new file mode 100644
index 0000000000000000000000000000000000000000..0666966fd519582af52f6d5e4c59e2c553fa8db6
--- /dev/null
+++ b/drivers/dma/idxd/registers.h
@@ -0,0 +1,526 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2019 Intel Corporation. All rights rsvd.
*/ +#ifndef _IDXD_REGISTERS_H_ +#define _IDXD_REGISTERS_H_ + +/* PCI Config */ +#define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 +#define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe + +#define DEVICE_VERSION_1 0x100 +#define DEVICE_VERSION_2 0x200 + +#define IDXD_MMIO_BAR 0 +#define IDXD_WQ_BAR 2 +#define IDXD_PORTAL_SIZE PAGE_SIZE + +/* MMIO Device BAR0 Registers */ +#define IDXD_VER_OFFSET 0x00 +#define IDXD_VER_MAJOR_MASK 0xf0 +#define IDXD_VER_MINOR_MASK 0x0f +#define GET_IDXD_VER_MAJOR(x) (((x) & IDXD_VER_MAJOR_MASK) >> 4) +#define GET_IDXD_VER_MINOR(x) ((x) & IDXD_VER_MINOR_MASK) + +union gen_cap_reg { + struct { + u64 block_on_fault:1; + u64 overlap_copy:1; + u64 cache_control_mem:1; + u64 cache_control_cache:1; + u64 cmd_cap:1; + u64 rsvd:3; + u64 dest_readback:1; + u64 drain_readback:1; + u64 rsvd2:6; + u64 max_xfer_shift:5; + u64 max_batch_shift:4; + u64 max_ims_mult:6; + u64 config_en:1; + u64 rsvd3:32; + }; + u64 bits; +} __packed; +#define IDXD_GENCAP_OFFSET 0x10 + +union wq_cap_reg { + struct { + u64 total_wq_size:16; + u64 num_wqs:8; + u64 wqcfg_size:4; + u64 rsvd:20; + u64 shared_mode:1; + u64 dedicated_mode:1; + u64 wq_ats_support:1; + u64 priority:1; + u64 occupancy:1; + u64 occupancy_int:1; + u64 rsvd3:10; + }; + u64 bits; +} __packed; +#define IDXD_WQCAP_OFFSET 0x20 +#define IDXD_WQCFG_MIN 5 + +union group_cap_reg { + struct { + u64 num_groups:8; + u64 total_rdbufs:8; /* formerly total_tokens */ + u64 rdbuf_ctrl:1; /* formerly token_en */ + u64 rdbuf_limit:1; /* formerly token_limit */ + u64 rsvd:46; + }; + u64 bits; +} __packed; +#define IDXD_GRPCAP_OFFSET 0x30 + +union engine_cap_reg { + struct { + u64 num_engines:8; + u64 rsvd:56; + }; + u64 bits; +} __packed; + +#define IDXD_ENGCAP_OFFSET 0x38 + +#define IDXD_OPCAP_NOOP 0x0001 +#define IDXD_OPCAP_BATCH 0x0002 +#define IDXD_OPCAP_MEMMOVE 0x0008 +struct opcap { + u64 bits[4]; +}; + +#define OPCAP_OFS(op) (op - (0x40 * (op >> 6))) +#define OPCAP_BIT(op) (BIT_ULL(OPCAP_OFS(op))) + +#define IDXD_OPCAP_OFFSET 0x40 + +#define IDXD_TABLE_OFFSET 0x60 +union offsets_reg { + struct { + u64 grpcfg:16; + u64 wqcfg:16; + u64 msix_perm:16; + u64 ims:16; + u64 perfmon:16; + u64 rsvd:48; + }; + u64 bits[2]; +} __packed; + +#define IDXD_TABLE_MULT 0x100 + +#define IDXD_GENCFG_OFFSET 0x80 +union gencfg_reg { + struct { + u32 rdbuf_limit:8; + u32 rsvd:4; + u32 user_int_en:1; + u32 rsvd2:19; + }; + u32 bits; +} __packed; + +#define IDXD_GENCTRL_OFFSET 0x88 +union genctrl_reg { + struct { + u32 softerr_int_en:1; + u32 halt_int_en:1; + u32 rsvd:30; + }; + u32 bits; +} __packed; + +#define IDXD_GENSTATS_OFFSET 0x90 +union gensts_reg { + struct { + u32 state:2; + u32 reset_type:2; + u32 rsvd:28; + }; + u32 bits; +} __packed; + +enum idxd_device_status_state { + IDXD_DEVICE_STATE_DISABLED = 0, + IDXD_DEVICE_STATE_ENABLED, + IDXD_DEVICE_STATE_DRAIN, + IDXD_DEVICE_STATE_HALT, +}; + +enum idxd_device_reset_type { + IDXD_DEVICE_RESET_SOFTWARE = 0, + IDXD_DEVICE_RESET_FLR, + IDXD_DEVICE_RESET_WARM, + IDXD_DEVICE_RESET_COLD, +}; + +#define IDXD_INTCAUSE_OFFSET 0x98 +#define IDXD_INTC_ERR 0x01 +#define IDXD_INTC_CMD 0x02 +#define IDXD_INTC_OCCUPY 0x04 +#define IDXD_INTC_PERFMON_OVFL 0x08 +#define IDXD_INTC_HALT_STATE 0x10 +#define IDXD_INTC_INT_HANDLE_REVOKED 0x80000000 + +#define IDXD_CMD_OFFSET 0xa0 +union idxd_command_reg { + struct { + u32 operand:20; + u32 cmd:5; + u32 rsvd:6; + u32 int_req:1; + }; + u32 bits; +} __packed; +#define IDXD_CMD_INT_MASK 0x80000000 + +enum idxd_cmd { + IDXD_CMD_ENABLE_DEVICE = 1, + IDXD_CMD_DISABLE_DEVICE, + IDXD_CMD_DRAIN_ALL, 
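	/*
	 * Sketch of how a command is encoded with union idxd_command_reg
	 * above (field values are illustrative; the real submission path
	 * also serializes on idxd->cmd_lock and waits for the
	 * IDXD_CMDSTS_ACTIVE bit to clear):
	 *
	 *	union idxd_command_reg cmd = { };
	 *
	 *	cmd.cmd = IDXD_CMD_ENABLE_WQ;
	 *	cmd.operand = wq->id;
	 *	cmd.int_req = 1;
	 *	iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET);
	 */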
+ IDXD_CMD_ABORT_ALL, + IDXD_CMD_RESET_DEVICE, + IDXD_CMD_ENABLE_WQ, + IDXD_CMD_DISABLE_WQ, + IDXD_CMD_DRAIN_WQ, + IDXD_CMD_ABORT_WQ, + IDXD_CMD_RESET_WQ, + IDXD_CMD_DRAIN_PASID, + IDXD_CMD_ABORT_PASID, + IDXD_CMD_REQUEST_INT_HANDLE, + IDXD_CMD_RELEASE_INT_HANDLE, + IDXD_CMD_REVOKED_HANDLES_PROCESSED, +}; + +#define CMD_INT_HANDLE_IMS 0x10000 + +#define IDXD_CMDSTS_OFFSET 0xa8 +union cmdsts_reg { + struct { + u8 err; + u16 result; + u8 rsvd:7; + u8 active:1; + }; + u32 bits; +} __packed; +#define IDXD_CMDS_ACTIVE_BIT 31 +#define IDXD_CMDSTS_ACTIVE BIT(IDXD_CMDS_ACTIVE_BIT) +#define IDXD_CMDSTS_ERR_MASK 0xff +#define IDXD_CMDSTS_RES_SHIFT 8 + +enum idxd_cmdsts_err { + IDXD_CMDSTS_SUCCESS = 0, + IDXD_CMDSTS_INVAL_CMD, + IDXD_CMDSTS_INVAL_WQIDX, + IDXD_CMDSTS_HW_ERR, + /* enable device errors */ + IDXD_CMDSTS_ERR_DEV_ENABLED = 0x10, + IDXD_CMDSTS_ERR_CONFIG, + IDXD_CMDSTS_ERR_BUSMASTER_EN, + IDXD_CMDSTS_ERR_PASID_INVAL, + IDXD_CMDSTS_ERR_WQ_SIZE_ERANGE, + IDXD_CMDSTS_ERR_GRP_CONFIG, + IDXD_CMDSTS_ERR_GRP_CONFIG2, + IDXD_CMDSTS_ERR_GRP_CONFIG3, + IDXD_CMDSTS_ERR_GRP_CONFIG4, + /* enable wq errors */ + IDXD_CMDSTS_ERR_DEV_NOTEN = 0x20, + IDXD_CMDSTS_ERR_WQ_ENABLED, + IDXD_CMDSTS_ERR_WQ_SIZE, + IDXD_CMDSTS_ERR_WQ_PRIOR, + IDXD_CMDSTS_ERR_WQ_MODE, + IDXD_CMDSTS_ERR_BOF_EN, + IDXD_CMDSTS_ERR_PASID_EN, + IDXD_CMDSTS_ERR_MAX_BATCH_SIZE, + IDXD_CMDSTS_ERR_MAX_XFER_SIZE, + /* disable device errors */ + IDXD_CMDSTS_ERR_DIS_DEV_EN = 0x31, + /* disable WQ, drain WQ, abort WQ, reset WQ */ + IDXD_CMDSTS_ERR_WQ_NOT_EN, + /* request interrupt handle */ + IDXD_CMDSTS_ERR_INVAL_INT_IDX = 0x41, + IDXD_CMDSTS_ERR_NO_HANDLE, + IDXD_CMDSTS_ERR_INVAL_INT_IDX_RELEASE, +}; + +#define IDXD_CMDCAP_OFFSET 0xb0 + +#define IDXD_SWERR_OFFSET 0xc0 +#define IDXD_SWERR_VALID 0x00000001 +#define IDXD_SWERR_OVERFLOW 0x00000002 +#define IDXD_SWERR_ACK (IDXD_SWERR_VALID | IDXD_SWERR_OVERFLOW) +union sw_err_reg { + struct { + u64 valid:1; + u64 overflow:1; + u64 desc_valid:1; + u64 wq_idx_valid:1; + u64 batch:1; + u64 fault_rw:1; + u64 priv:1; + u64 rsvd:1; + u64 error:8; + u64 wq_idx:8; + u64 rsvd2:8; + u64 operation:8; + u64 pasid:20; + u64 rsvd3:4; + + u64 batch_idx:16; + u64 rsvd4:16; + u64 invalid_flags:32; + + u64 fault_addr; + + u64 rsvd5; + }; + u64 bits[4]; +} __packed; + +union msix_perm { + struct { + u32 rsvd:2; + u32 ignore:1; + u32 pasid_en:1; + u32 rsvd2:8; + u32 pasid:20; + }; + u32 bits; +} __packed; + +#define IDXD_MSIX_PERM_MASK 0xfffff00c +#define IDXD_MSIX_PERM_IGNORE 0x3 +#define MSIX_ENTRY_MASK_INT 0x1 +#define MSIX_ENTRY_CTRL_BYTE 12 + +union group_flags { + struct { + u32 tc_a:3; + u32 tc_b:3; + u32 rsvd:1; + u32 use_rdbuf_limit:1; + u32 rdbufs_reserved:8; + u32 rsvd2:4; + u32 rdbufs_allowed:8; + u32 rsvd3:4; + }; + u32 bits; +} __packed; + +struct grpcfg { + u64 wqs[4]; + u64 engines; + union group_flags flags; +} __packed; + +union wqcfg { + struct { + /* bytes 0-3 */ + u16 wq_size; + u16 rsvd; + + /* bytes 4-7 */ + u16 wq_thresh; + u16 rsvd1; + + /* bytes 8-11 */ + u32 mode:1; /* shared or dedicated */ + u32 bof:1; /* block on fault */ + u32 wq_ats_disable:1; + u32 rsvd2:1; + u32 priority:4; + u32 pasid:20; + u32 pasid_en:1; + u32 priv:1; + u32 rsvd3:2; + + /* bytes 12-15 */ + u32 max_xfer_shift:5; + u32 max_batch_shift:4; + u32 rsvd4:23; + + /* bytes 16-19 */ + u16 occupancy_inth; + u16 occupancy_table_sel:1; + u16 rsvd5:15; + + /* bytes 20-23 */ + u16 occupancy_limit; + u16 occupancy_int_en:1; + u16 rsvd6:15; + + /* bytes 24-27 */ + u16 occupancy; + u16 occupancy_int:1; + u16 rsvd7:12; + u16 
mode_support:1;
+		u16 wq_state:2;
+
+		/* bytes 28-31 */
+		u32 rsvd8;
+	};
+	u32 bits[8];
+} __packed;
+
+enum idxd_wq_hw_state {
+	IDXD_WQ_DEV_DISABLED = 0,
+	IDXD_WQ_DEV_ENABLED,
+	IDXD_WQ_DEV_BUSY,
+};
+
+#define WQCFG_PASID_IDX		2
+#define WQCFG_PRIV_IDX		2
+#define WQCFG_OCCUP_IDX		6
+
+#define WQCFG_MODE_DEDICATED	1
+#define WQCFG_MODE_SHARED	0
+
+#define WQCFG_OCCUP_MASK	0xffff
+
+/*
+ * This macro calculates the offset into the WQCFG register block.
+ * idxd - struct idxd *
+ * n - wq id
+ * ofs - the index of the 32-bit dword for the config register
+ *
+ * The WQCFG register block is divided into groups, one per wq. The n index
+ * moves to the register group for that particular wq. Each register is
+ * 32 bits; ofs is the index of the 32-bit register to access within the
+ * group, e.g. WQCFG_OFFSET(idxd, 3, 2) is wqcfg_offset + 3 * wqcfg_size + 8.
+ */
+#define WQCFG_OFFSET(_idxd_dev, n, ofs) \
+({\
+	typeof(_idxd_dev) __idxd_dev = (_idxd_dev);	\
+	(__idxd_dev)->wqcfg_offset + (n) * (__idxd_dev)->wqcfg_size + sizeof(u32) * (ofs);	\
+})
+
+#define WQCFG_STRIDES(_idxd_dev) ((_idxd_dev)->wqcfg_size / sizeof(u32))
+
+#define GRPCFG_SIZE		64
+#define GRPWQCFG_STRIDES	4
+
+/*
+ * This macro calculates the offset into the GRPCFG register block.
+ * idxd - struct idxd *
+ * n - group id
+ * ofs - the index of the 64-bit qword within the GRPWQCFG sub-register
+ *
+ * The GRPCFG register block is divided into 64-byte groups, one per device
+ * group, each holding the GRPWQCFG (32 bytes), GRPENGCFG (8 bytes) and
+ * GRPFLGCFG sub-registers. The n index moves to the register group for that
+ * particular device group; ofs selects one of the four 64-bit GRPWQCFG
+ * registers within it.
+ */
+#define GRPWQCFG_OFFSET(idxd_dev, n, ofs) ((idxd_dev)->grpcfg_offset +\
+					   (n) * GRPCFG_SIZE + sizeof(u64) * (ofs))
+#define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32)
+#define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40)
+
+/* The following are the performance monitor registers */
+#define IDXD_PERFCAP_OFFSET		0x0
+union idxd_perfcap {
+	struct {
+		u64 num_perf_counter:6;
+		u64 rsvd1:2;
+		u64 counter_width:8;
+		u64 num_event_category:4;
+		u64 global_event_category:16;
+		u64 filter:8;
+		u64 rsvd2:8;
+		u64 cap_per_counter:1;
+		u64 writeable_counter:1;
+		u64 counter_freeze:1;
+		u64 overflow_interrupt:1;
+		u64 rsvd3:8;
+	};
+	u64 bits;
+} __packed;
+
+#define IDXD_EVNTCAP_OFFSET		0x80
+union idxd_evntcap {
+	struct {
+		u64 events:28;
+		u64 rsvd:36;
+	};
+	u64 bits;
+} __packed;
+
+struct idxd_event {
+	union {
+		struct {
+			u32 event_category:4;
+			u32 events:28;
+		};
+		u32 val;
+	};
+} __packed;
+
+#define IDXD_CNTRCAP_OFFSET		0x800
+struct idxd_cntrcap {
+	union {
+		struct {
+			u32 counter_width:8;
+			u32 rsvd:20;
+			u32 num_events:4;
+		};
+		u32 val;
+	};
+	struct idxd_event events[];
+} __packed;
+
+#define IDXD_PERFRST_OFFSET		0x10
+union idxd_perfrst {
+	struct {
+		u32 perfrst_config:1;
+		u32 perfrst_counter:1;
+		u32 rsvd:30;
+	};
+	u32 val;
+} __packed;
+
+#define IDXD_OVFSTATUS_OFFSET		0x30
+#define IDXD_PERFFRZ_OFFSET		0x20
+#define IDXD_CNTRCFG_OFFSET		0x100
+union idxd_cntrcfg {
+	struct {
+		u64 enable:1;
+		u64 interrupt_ovf:1;
+		u64 global_freeze_ovf:1;
+		u64 rsvd1:5;
+		u64 event_category:4;
+		u64 rsvd2:20;
+		u64 events:28;
+		u64 rsvd3:4;
+	};
+	u64 val;
+} __packed;
+
+#define IDXD_FLTCFG_OFFSET		0x300
+
+#define IDXD_CNTRDATA_OFFSET		0x200
+union idxd_cntrdata {
+	struct {
+		u64 event_count_value;
+	};
+	u64 val;
+} __packed;
+
+union event_cfg {
+	struct {
+		u64 event_cat:4;
+		u64 event_enc:28;
+	};
+	u64 val;
+} __packed;
+
+union filter_cfg {
+	struct {
+		u64 wq:32;
+		u64 tc:8;
+		u64 pg_sz:4;
+		u64 xfer_sz:8;
+		u64 eng:8;
+	};
+	u64 val;
+} __packed;
+
+#endif
diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c
new file mode 100644
index 0000000000000000000000000000000000000000..64d0b17cfc28174f46b7e7044622eaf503593101
--- /dev/null
+++ b/drivers/dma/idxd/submit.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */
+#include
+#include
+#include
+#include
+#include
+#include "idxd.h"
+#include "registers.h"
+
+static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu)
+{
+	struct idxd_desc *desc;
+	struct idxd_device *idxd = wq->idxd;
+
+	desc = wq->descs[idx];
+	memset(desc->hw, 0, sizeof(struct dsa_hw_desc));
+	memset(desc->completion, 0, idxd->data->compl_size);
+	if (desc->batch)
+		desc->batch->num = 0;
+	desc->cpu = cpu;
+
+	if (device_pasid_enabled(idxd))
+		desc->hw->pasid = idxd->pasid;
+
+	return desc;
+}
+
+struct idxd_desc *idxd_alloc_desc(struct idxd_wq *wq, enum idxd_op_type optype)
+{
+	int cpu, idx;
+	struct idxd_device *idxd = wq->idxd;
+	DEFINE_SBQ_WAIT(wait);
+	struct sbq_wait_state *ws;
+	struct sbitmap_queue *sbq;
+
+	if (idxd->state != IDXD_DEV_ENABLED)
+		return ERR_PTR(-EIO);
+
+	sbq = &wq->sbq;
+	idx = sbitmap_queue_get(sbq, &cpu);
+	if (idx < 0) {
+		if (optype == IDXD_OP_NONBLOCK)
+			return ERR_PTR(-EAGAIN);
+	} else {
+		return __get_desc(wq, idx, cpu);
+	}
+
+	ws = &sbq->ws[0];
+	for (;;) {
+		sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_INTERRUPTIBLE);
+		if (signal_pending_state(TASK_INTERRUPTIBLE, current))
+			break;
+		idx = sbitmap_queue_get(sbq, &cpu);
+		if (idx >= 0)
+			break;
+		schedule();
+	}
+
+	sbitmap_finish_wait(sbq, ws, &wait);
+	if (idx < 0)
+		return ERR_PTR(-EAGAIN);
+
+	return __get_desc(wq, idx, cpu);
+}
+EXPORT_SYMBOL_GPL(idxd_alloc_desc);
+
+void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc)
+{
+	int cpu = desc->cpu;
+
+	desc->cpu = -1;
+	sbitmap_queue_clear(&wq->sbq, desc->id, cpu);
+}
+EXPORT_SYMBOL_GPL(idxd_free_desc);
+
+static struct idxd_desc *list_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
+					 struct idxd_desc *desc)
+{
+	struct idxd_desc *d, *n;
+
+	lockdep_assert_held(&ie->list_lock);
+	list_for_each_entry_safe(d, n, &ie->work_list, list) {
+		if (d == desc) {
+			list_del(&d->list);
+			return d;
+		}
+	}
+
+	/*
+	 * At this point, the desc that needs to be aborted is held by the
+	 * completion handler, which has taken it off the pending list but has
+	 * not yet added it to the work list. It will be cleaned up by the
+	 * interrupt handler when it sees the IDXD_COMP_DESC_ABORT completion
+	 * status.
+	 */
+	return NULL;
+}
+
+static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
+			     struct idxd_desc *desc)
+{
+	struct idxd_desc *d, *t, *found = NULL;
+	struct llist_node *head;
+	LIST_HEAD(flist);
+
+	desc->completion->status = IDXD_COMP_DESC_ABORT;
+	/*
+	 * Grab the list lock so it will block the irq thread handler. This
+	 * allows the abort code to locate the descriptor that needs to be
+	 * aborted.
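+	 *
+	 * At this point the descriptor can be in one of three places: still on
+	 * the pending llist, already moved to the work list, or held by the
+	 * irq thread in between the two; the last case resolves itself through
+	 * the IDXD_COMP_DESC_ABORT status set above.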
+	 */
+	spin_lock(&ie->list_lock);
+	head = llist_del_all(&ie->pending_llist);
+	if (head) {
+		llist_for_each_entry_safe(d, t, head, llnode) {
+			if (d == desc) {
+				found = desc;
+				continue;
+			}
+
+			if (d->completion->status)
+				list_add_tail(&d->list, &flist);
+			else
+				list_add_tail(&d->list, &ie->work_list);
+		}
+	}
+
+	if (!found)
+		found = list_abort_desc(wq, ie, desc);
+	spin_unlock(&ie->list_lock);
+
+	if (found)
+		idxd_dma_complete_txd(found, IDXD_COMPLETE_ABORT, false);
+
+	/*
+	 * Completing a descriptor returns it to the allocator, where it can be
+	 * acquired by a different process that may then modify desc->list.
+	 * Delete each desc from the list before completing it so the list
+	 * traversal does not get corrupted by the other process.
+	 */
+	list_for_each_entry_safe(d, t, &flist, list) {
+		list_del_init(&d->list);
+		idxd_dma_complete_txd(d, IDXD_COMPLETE_ABORT, true);
+	}
+}
+
+/*
+ * ENQCMDS typically fails when the WQ is inactive or busy. On host submission, the driver
+ * has better control of the number of descriptors being submitted to a shared wq by limiting
+ * the number of driver allocated descriptors to the wq size. However, when the swq is
+ * exported to a guest kernel, it may be shared with multiple guest kernels. This means
+ * the likelihood of getting busy returned on the swq when submitting goes significantly up.
+ * Having a tunable retry mechanism allows the driver to keep trying for a bit before giving
+ * up. The sysfs knob can be tuned by the system administrator.
+ */
+int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc)
+{
+	unsigned int retries = wq->enqcmds_retries;
+	int rc;
+
+	do {
+		rc = enqcmds(portal, desc);
+		if (rc == 0)
+			break;
+		cpu_relax();
+	} while (retries--);
+
+	return rc;
+}
+
+int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
+{
+	struct idxd_device *idxd = wq->idxd;
+	struct idxd_irq_entry *ie = NULL;
+	u32 desc_flags = desc->hw->flags;
+	void __iomem *portal;
+	int rc;
+
+	if (idxd->state != IDXD_DEV_ENABLED)
+		return -EIO;
+
+	if (!percpu_ref_tryget_live(&wq->wq_active)) {
+		wait_for_completion(&wq->wq_resurrect);
+		if (!percpu_ref_tryget_live(&wq->wq_active))
+			return -ENXIO;
+	}
+
+	portal = idxd_wq_portal_addr(wq);
+
+	/*
+	 * The wmb() flushes writes to coherent DMA data before
+	 * possibly triggering a DMA read. The wmb() is necessary
+	 * even on UP because the recipient is a device.
+	 */
+	wmb();
+
+	/*
+	 * Add the descriptor to the lockless pending list for the irq_entry
+	 * that the descriptor was designated to.
+	 */
+	if (desc_flags & IDXD_OP_FLAG_RCI) {
+		ie = &wq->ie;
+		desc->hw->int_handle = ie->int_handle;
+		llist_add(&desc->llnode, &ie->pending_llist);
+	}
+
+	if (wq_dedicated(wq)) {
+		iosubmit_cmds512(portal, desc->hw, 1);
+	} else {
+		rc = idxd_enqcmds(wq, portal, desc->hw);
+		if (rc < 0) {
+			percpu_ref_put(&wq->wq_active);
+			/* abort operation frees the descriptor */
+			if (ie)
+				llist_abort_desc(wq, ie, desc);
+			return rc;
+		}
+	}
+
+	percpu_ref_put(&wq->wq_active);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(idxd_submit_desc);
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..0c955ecaa50d49402e00e3cc0255923664e69944
--- /dev/null
+++ b/drivers/dma/idxd/sysfs.c
@@ -0,0 +1,1615 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2019 Intel Corporation. All rights rsvd.
*/ +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" + +static char *idxd_wq_type_names[] = { + [IDXD_WQT_NONE] = "none", + [IDXD_WQT_KERNEL] = "kernel", + [IDXD_WQT_USER] = "user", + [IDXD_WQT_MDEV] = "mdev", +}; + +/* IDXD engine attributes */ +static ssize_t engine_group_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_engine *engine = confdev_to_engine(dev); + + if (engine->group) + return sysfs_emit(buf, "%d\n", engine->group->id); + else + return sysfs_emit(buf, "%d\n", -1); +} + +static ssize_t engine_group_id_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_engine *engine = confdev_to_engine(dev); + struct idxd_device *idxd = engine->idxd; + long id; + int rc; + struct idxd_group *prevg; + + rc = kstrtol(buf, 10, &id); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (id > idxd->max_groups - 1 || id < -1) + return -EINVAL; + + if (id == -1) { + if (engine->group) { + engine->group->num_engines--; + engine->group = NULL; + } + return count; + } + + prevg = engine->group; + + if (prevg) + prevg->num_engines--; + engine->group = idxd->groups[id]; + engine->group->num_engines++; + + return count; +} + +static struct device_attribute dev_attr_engine_group = + __ATTR(group_id, 0644, engine_group_id_show, + engine_group_id_store); + +static struct attribute *idxd_engine_attributes[] = { + &dev_attr_engine_group.attr, + NULL, +}; + +static const struct attribute_group idxd_engine_attribute_group = { + .attrs = idxd_engine_attributes, +}; + +static const struct attribute_group *idxd_engine_attribute_groups[] = { + &idxd_engine_attribute_group, + NULL, +}; + +static void idxd_conf_engine_release(struct device *dev) +{ + struct idxd_engine *engine = confdev_to_engine(dev); + + kfree(engine); +} + +struct device_type idxd_engine_device_type = { + .name = "engine", + .release = idxd_conf_engine_release, + .groups = idxd_engine_attribute_groups, +}; + +/* Group attributes */ + +static void idxd_set_free_rdbufs(struct idxd_device *idxd) +{ + int i, rdbufs; + + for (i = 0, rdbufs = 0; i < idxd->max_groups; i++) { + struct idxd_group *g = idxd->groups[i]; + + rdbufs += g->rdbufs_reserved; + } + + idxd->nr_rdbufs = idxd->max_rdbufs - rdbufs; +} + +static ssize_t group_read_buffers_reserved_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%u\n", group->rdbufs_reserved); +} + +static ssize_t group_tokens_reserved_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_reserved.\n"); + return group_read_buffers_reserved_show(dev, attr, buf); +} + +static ssize_t group_read_buffers_reserved_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->data->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (val > idxd->max_rdbufs) + return -EINVAL; + + if (val > idxd->nr_rdbufs + group->rdbufs_reserved) + return -EINVAL; + + 
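+	/*
+	 * The checks above let a group shrink its reservation freely, and grow
+	 * it by at most the device's current free read buffer pool; the free
+	 * pool (nr_rdbufs) is then recomputed from all groups' reservations.
+	 */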
group->rdbufs_reserved = val; + idxd_set_free_rdbufs(idxd); + return count; +} + +static ssize_t group_tokens_reserved_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_reserved.\n"); + return group_read_buffers_reserved_store(dev, attr, buf, count); +} + +static struct device_attribute dev_attr_group_tokens_reserved = + __ATTR(tokens_reserved, 0644, group_tokens_reserved_show, + group_tokens_reserved_store); + +static struct device_attribute dev_attr_group_read_buffers_reserved = + __ATTR(read_buffers_reserved, 0644, group_read_buffers_reserved_show, + group_read_buffers_reserved_store); + +static ssize_t group_read_buffers_allowed_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%u\n", group->rdbufs_allowed); +} + +static ssize_t group_tokens_allowed_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_allowed.\n"); + return group_read_buffers_allowed_show(dev, attr, buf); +} + +static ssize_t group_read_buffers_allowed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->data->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (val < 4 * group->num_engines || + val > group->rdbufs_reserved + idxd->nr_rdbufs) + return -EINVAL; + + group->rdbufs_allowed = val; + return count; +} + +static ssize_t group_tokens_allowed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffers_allowed.\n"); + return group_read_buffers_allowed_store(dev, attr, buf, count); +} + +static struct device_attribute dev_attr_group_tokens_allowed = + __ATTR(tokens_allowed, 0644, group_tokens_allowed_show, + group_tokens_allowed_store); + +static struct device_attribute dev_attr_group_read_buffers_allowed = + __ATTR(read_buffers_allowed, 0644, group_read_buffers_allowed_show, + group_read_buffers_allowed_store); + +static ssize_t group_use_read_buffer_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%u\n", group->use_rdbuf_limit); +} + +static ssize_t group_use_token_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see use_read_buffer_limit.\n"); + return group_use_read_buffer_limit_show(dev, attr, buf); +} + +static ssize_t group_use_read_buffer_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->data->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->rdbuf_limit == 0) + return -EPERM; + + 
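+	/*
+	 * Any nonzero value opts the group in to the device-wide read buffer
+	 * limit; the rdbuf_limit == 0 check above rejects the store when no
+	 * such limit has been configured.
+	 */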
group->use_rdbuf_limit = !!val; + return count; +} + +static ssize_t group_use_token_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see use_read_buffer_limit.\n"); + return group_use_read_buffer_limit_store(dev, attr, buf, count); +} + +static struct device_attribute dev_attr_group_use_token_limit = + __ATTR(use_token_limit, 0644, group_use_token_limit_show, + group_use_token_limit_store); + +static struct device_attribute dev_attr_group_use_read_buffer_limit = + __ATTR(use_read_buffer_limit, 0644, group_use_read_buffer_limit_show, + group_use_read_buffer_limit_store); + +static ssize_t group_engines_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + int i, rc = 0; + struct idxd_device *idxd = group->idxd; + + for (i = 0; i < idxd->max_engines; i++) { + struct idxd_engine *engine = idxd->engines[i]; + + if (!engine->group) + continue; + + if (engine->group->id == group->id) + rc += sysfs_emit_at(buf, rc, "engine%d.%d ", idxd->id, engine->id); + } + + if (!rc) + return 0; + rc--; + rc += sysfs_emit_at(buf, rc, "\n"); + + return rc; +} + +static struct device_attribute dev_attr_group_engines = + __ATTR(engines, 0444, group_engines_show, NULL); + +static ssize_t group_work_queues_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + int i, rc = 0; + struct idxd_device *idxd = group->idxd; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (!wq->group) + continue; + + if (wq->group->id == group->id) + rc += sysfs_emit_at(buf, rc, "wq%d.%d ", idxd->id, wq->id); + } + + if (!rc) + return 0; + rc--; + rc += sysfs_emit_at(buf, rc, "\n"); + + return rc; +} + +static struct device_attribute dev_attr_group_work_queues = + __ATTR(work_queues, 0444, group_work_queues_show, NULL); + +static ssize_t group_traffic_class_a_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%d\n", group->tc_a); +} + +static ssize_t group_traffic_class_a_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + long val; + int rc; + + rc = kstrtol(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->hw.version < DEVICE_VERSION_2 && !tc_override) + return -EPERM; + + if (val < 0 || val > 7) + return -EINVAL; + + group->tc_a = val; + return count; +} + +static struct device_attribute dev_attr_group_traffic_class_a = + __ATTR(traffic_class_a, 0644, group_traffic_class_a_show, + group_traffic_class_a_store); + +static ssize_t group_traffic_class_b_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_group *group = confdev_to_group(dev); + + return sysfs_emit(buf, "%d\n", group->tc_b); +} + +static ssize_t group_traffic_class_b_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_group *group = confdev_to_group(dev); + struct idxd_device *idxd = group->idxd; + long val; + int rc; + + rc = kstrtol(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, 
&idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (idxd->hw.version < DEVICE_VERSION_2 && !tc_override) + return -EPERM; + + if (val < 0 || val > 7) + return -EINVAL; + + group->tc_b = val; + return count; +} + +static struct device_attribute dev_attr_group_traffic_class_b = + __ATTR(traffic_class_b, 0644, group_traffic_class_b_show, + group_traffic_class_b_store); + +static struct attribute *idxd_group_attributes[] = { + &dev_attr_group_work_queues.attr, + &dev_attr_group_engines.attr, + &dev_attr_group_use_token_limit.attr, + &dev_attr_group_use_read_buffer_limit.attr, + &dev_attr_group_tokens_allowed.attr, + &dev_attr_group_read_buffers_allowed.attr, + &dev_attr_group_tokens_reserved.attr, + &dev_attr_group_read_buffers_reserved.attr, + &dev_attr_group_traffic_class_a.attr, + &dev_attr_group_traffic_class_b.attr, + NULL, +}; + +static const struct attribute_group idxd_group_attribute_group = { + .attrs = idxd_group_attributes, +}; + +static const struct attribute_group *idxd_group_attribute_groups[] = { + &idxd_group_attribute_group, + NULL, +}; + +static void idxd_conf_group_release(struct device *dev) +{ + struct idxd_group *group = confdev_to_group(dev); + + kfree(group); +} + +struct device_type idxd_group_device_type = { + .name = "group", + .release = idxd_conf_group_release, + .groups = idxd_group_attribute_groups, +}; + +/* IDXD work queue attribs */ +static ssize_t wq_clients_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%d\n", wq->client_count); +} + +static struct device_attribute dev_attr_wq_clients = + __ATTR(clients, 0444, wq_clients_show, NULL); + +static ssize_t wq_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + switch (wq->state) { + case IDXD_WQ_DISABLED: + return sysfs_emit(buf, "disabled\n"); + case IDXD_WQ_ENABLED: + return sysfs_emit(buf, "enabled\n"); + case IDXD_WQ_LOCKED: + return sysfs_emit(buf, "locked\n"); + } + + return sysfs_emit(buf, "unknown\n"); +} + +static struct device_attribute dev_attr_wq_state = + __ATTR(state, 0444, wq_state_show, NULL); + +static ssize_t wq_group_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + if (wq->group) + return sysfs_emit(buf, "%u\n", wq->group->id); + else + return sysfs_emit(buf, "-1\n"); +} + +static ssize_t wq_group_id_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + long id; + int rc; + struct idxd_group *prevg, *group; + + rc = kstrtol(buf, 10, &id); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (id > idxd->max_groups - 1 || id < -1) + return -EINVAL; + + if (id == -1) { + if (wq->group) { + wq->group->num_wqs--; + wq->group = NULL; + } + return count; + } + + group = idxd->groups[id]; + prevg = wq->group; + + if (prevg) + prevg->num_wqs--; + wq->group = group; + group->num_wqs++; + return count; +} + +static struct device_attribute dev_attr_wq_group_id = + __ATTR(group_id, 0644, wq_group_id_show, wq_group_id_store); + +static ssize_t wq_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return 
sysfs_emit(buf, "%s\n", wq_dedicated(wq) ? "dedicated" : "shared"); +} + +static ssize_t wq_mode_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (sysfs_streq(buf, "dedicated")) { + set_bit(WQ_FLAG_DEDICATED, &wq->flags); + wq->threshold = 0; + } else if (sysfs_streq(buf, "shared") && device_swq_supported(idxd)) { + clear_bit(WQ_FLAG_DEDICATED, &wq->flags); + } else { + return -EINVAL; + } + + return count; +} + +static struct device_attribute dev_attr_wq_mode = + __ATTR(mode, 0644, wq_mode_show, wq_mode_store); + +static ssize_t wq_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->size); +} + +static int total_claimed_wq_size(struct idxd_device *idxd) +{ + int i; + int wq_size = 0; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + wq_size += wq->size; + } + + return wq_size; +} + +static ssize_t wq_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + unsigned long size; + struct idxd_device *idxd = wq->idxd; + int rc; + + rc = kstrtoul(buf, 10, &size); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (size + total_claimed_wq_size(idxd) - wq->size > idxd->max_wq_size) + return -EINVAL; + + wq->size = size; + return count; +} + +static struct device_attribute dev_attr_wq_size = + __ATTR(size, 0644, wq_size_show, wq_size_store); + +static ssize_t wq_priority_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->priority); +} + +static ssize_t wq_priority_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + unsigned long prio; + struct idxd_device *idxd = wq->idxd; + int rc; + + rc = kstrtoul(buf, 10, &prio); + if (rc < 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (prio > IDXD_MAX_PRIORITY) + return -EINVAL; + + wq->priority = prio; + return count; +} + +static struct device_attribute dev_attr_wq_priority = + __ATTR(priority, 0644, wq_priority_show, wq_priority_store); + +static ssize_t wq_block_on_fault_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); +} + +static ssize_t wq_block_on_fault_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + bool bof; + int rc; + + if (!idxd->hw.gen_cap.block_on_fault) + return -EOPNOTSUPP; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -ENXIO; + + rc = kstrtobool(buf, &bof); + if (rc < 0) + return rc; + + if (bof) + set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + else + clear_bit(WQ_FLAG_BLOCK_ON_FAULT, 
&wq->flags); + + return count; +} + +static struct device_attribute dev_attr_wq_block_on_fault = + __ATTR(block_on_fault, 0644, wq_block_on_fault_show, + wq_block_on_fault_store); + +static ssize_t wq_threshold_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->threshold); +} + +static ssize_t wq_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc < 0) + return -EINVAL; + + if (val > wq->size || val <= 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -ENXIO; + + if (test_bit(WQ_FLAG_DEDICATED, &wq->flags)) + return -EINVAL; + + wq->threshold = val; + + return count; +} + +static struct device_attribute dev_attr_wq_threshold = + __ATTR(threshold, 0644, wq_threshold_show, wq_threshold_store); + +static ssize_t wq_type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + switch (wq->type) { + case IDXD_WQT_KERNEL: + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_KERNEL]); + case IDXD_WQT_USER: + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_USER]); + case IDXD_WQT_MDEV: + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_MDEV]); + case IDXD_WQT_NONE: + default: + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_NONE]); + } + + return -EINVAL; +} + +static ssize_t wq_type_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + enum idxd_wq_type old_type; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + old_type = wq->type; + if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_NONE])) + wq->type = IDXD_WQT_NONE; + else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_KERNEL])) + wq->type = IDXD_WQT_KERNEL; + else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_USER])) + wq->type = IDXD_WQT_USER; + else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_MDEV])) + wq->type = IDXD_WQT_MDEV; + else + return -EINVAL; + + /* If we are changing queue type, clear the name */ + if (wq->type != old_type) + memset(wq->name, 0, WQ_NAME_SIZE + 1); + + return count; +} + +static struct device_attribute dev_attr_wq_type = + __ATTR(type, 0644, wq_type_show, wq_type_store); + +static ssize_t wq_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%s\n", wq->name); +} + +static ssize_t wq_name_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + char *input, *pos; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) + return -EINVAL; + + input = kstrndup(buf, count, GFP_KERNEL); + if (!input) + return -ENOMEM; + + pos = strim(input); + memset(wq->name, 0, WQ_NAME_SIZE + 1); + sprintf(wq->name, "%s", pos); + kfree(input); + return count; +} + +static struct device_attribute dev_attr_wq_name = + __ATTR(name, 0644, wq_name_show, wq_name_store); + +static ssize_t wq_cdev_minor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = 
confdev_to_wq(dev); + int minor = -1; + + mutex_lock(&wq->wq_lock); + if (wq->idxd_cdev) + minor = wq->idxd_cdev->minor; + mutex_unlock(&wq->wq_lock); + + if (minor == -1) + return -ENXIO; + return sysfs_emit(buf, "%d\n", minor); +} + +static struct device_attribute dev_attr_wq_cdev_minor = + __ATTR(cdev_minor, 0444, wq_cdev_minor_show, NULL); + +static int __get_sysfs_u64(const char *buf, u64 *val) +{ + int rc; + + rc = kstrtou64(buf, 0, val); + if (rc < 0) + return -EINVAL; + + if (*val == 0) + return -EINVAL; + + *val = roundup_pow_of_two(*val); + return 0; +} + +static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%llu\n", wq->max_xfer_bytes); +} + +static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + u64 xfer_size; + int rc; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + rc = __get_sysfs_u64(buf, &xfer_size); + if (rc < 0) + return rc; + + if (xfer_size > idxd->max_xfer_bytes) + return -EINVAL; + + wq->max_xfer_bytes = xfer_size; + + return count; +} + +static struct device_attribute dev_attr_wq_max_transfer_size = + __ATTR(max_transfer_size, 0644, + wq_max_transfer_size_show, wq_max_transfer_size_store); + +static ssize_t wq_max_batch_size_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->max_batch_size); +} + +static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + u64 batch_size; + int rc; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + rc = __get_sysfs_u64(buf, &batch_size); + if (rc < 0) + return rc; + + if (batch_size > idxd->max_batch_size) + return -EINVAL; + + idxd_wq_set_max_batch_size(idxd->data->type, wq, (u32)batch_size); + + return count; +} + +static struct device_attribute dev_attr_wq_max_batch_size = + __ATTR(max_batch_size, 0644, wq_max_batch_size_show, wq_max_batch_size_store); + +static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%u\n", wq->ats_dis); +} + +static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + bool ats_dis; + int rc; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (!idxd->hw.wq_cap.wq_ats_support) + return -EOPNOTSUPP; + + rc = kstrtobool(buf, &ats_dis); + if (rc < 0) + return rc; + + wq->ats_dis = ats_dis; + + return count; +} + +static struct device_attribute dev_attr_wq_ats_disable = + __ATTR(ats_disable, 0644, wq_ats_disable_show, wq_ats_disable_store); + +static ssize_t wq_occupancy_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + struct idxd_device *idxd = wq->idxd; + u32 occup, offset; + + if (!idxd->hw.wq_cap.occupancy) + return -EOPNOTSUPP; + + offset = WQCFG_OFFSET(idxd, wq->id, 
WQCFG_OCCUP_IDX); + occup = ioread32(idxd->reg_base + offset) & WQCFG_OCCUP_MASK; + + return sysfs_emit(buf, "%u\n", occup); +} + +static struct device_attribute dev_attr_wq_occupancy = + __ATTR(occupancy, 0444, wq_occupancy_show, NULL); + +static ssize_t wq_enqcmds_retries_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + if (wq_dedicated(wq)) + return -EOPNOTSUPP; + + return sysfs_emit(buf, "%u\n", wq->enqcmds_retries); +} + +static ssize_t wq_enqcmds_retries_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + int rc; + unsigned int retries; + + if (wq_dedicated(wq)) + return -EOPNOTSUPP; + + rc = kstrtouint(buf, 10, &retries); + if (rc < 0) + return rc; + + if (retries > IDXD_ENQCMDS_MAX_RETRIES) + retries = IDXD_ENQCMDS_MAX_RETRIES; + + wq->enqcmds_retries = retries; + return count; +} + +static struct device_attribute dev_attr_wq_enqcmds_retries = + __ATTR(enqcmds_retries, 0644, wq_enqcmds_retries_show, wq_enqcmds_retries_store); + +static ssize_t wq_driver_name_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + return sysfs_emit(buf, "%s\n", wq->driver_name); +} + +static ssize_t wq_driver_name_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + char *input, *pos; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) + return -EINVAL; + + input = kstrndup(buf, count, GFP_KERNEL); + if (!input) + return -ENOMEM; + + pos = strim(input); + memset(wq->driver_name, 0, WQ_NAME_SIZE + 1); + sprintf(wq->driver_name, "%s", pos); + kfree(input); + return count; +} + +static struct device_attribute dev_attr_wq_driver_name = + __ATTR(driver_name, 0644, wq_driver_name_show, wq_driver_name_store); + +static struct attribute *idxd_wq_attributes[] = { + &dev_attr_wq_clients.attr, + &dev_attr_wq_state.attr, + &dev_attr_wq_group_id.attr, + &dev_attr_wq_mode.attr, + &dev_attr_wq_size.attr, + &dev_attr_wq_priority.attr, + &dev_attr_wq_block_on_fault.attr, + &dev_attr_wq_threshold.attr, + &dev_attr_wq_type.attr, + &dev_attr_wq_name.attr, + &dev_attr_wq_cdev_minor.attr, + &dev_attr_wq_max_transfer_size.attr, + &dev_attr_wq_max_batch_size.attr, + &dev_attr_wq_ats_disable.attr, + &dev_attr_wq_occupancy.attr, + &dev_attr_wq_enqcmds_retries.attr, + &dev_attr_wq_driver_name.attr, + NULL, +}; + +static const struct attribute_group idxd_wq_attribute_group = { + .attrs = idxd_wq_attributes, +}; + +static const struct attribute_group *idxd_wq_attribute_groups[] = { + &idxd_wq_attribute_group, + NULL, +}; + +static void idxd_conf_wq_release(struct device *dev) +{ + struct idxd_wq *wq = confdev_to_wq(dev); + + kfree(wq->wqcfg); + kfree(wq); +} + +struct device_type idxd_wq_device_type = { + .name = "wq", + .release = idxd_conf_wq_release, + .groups = idxd_wq_attribute_groups, +}; + +/* IDXD device attribs */ +static ssize_t version_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%#x\n", idxd->hw.version); +} +static DEVICE_ATTR_RO(version); + +static ssize_t max_work_queues_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_wq_size); 
+} +static DEVICE_ATTR_RO(max_work_queues_size); + +static ssize_t max_groups_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_groups); +} +static DEVICE_ATTR_RO(max_groups); + +static ssize_t max_work_queues_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_wqs); +} +static DEVICE_ATTR_RO(max_work_queues); + +static ssize_t max_engines_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_engines); +} +static DEVICE_ATTR_RO(max_engines); + +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%d\n", dev_to_node(&idxd->pdev->dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static ssize_t max_batch_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_batch_size); +} +static DEVICE_ATTR_RO(max_batch_size); + +static ssize_t max_transfer_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%llu\n", idxd->max_xfer_bytes); +} +static DEVICE_ATTR_RO(max_transfer_size); + +static ssize_t op_cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + int i, rc = 0; + + for (i = 0; i < 4; i++) + rc += sysfs_emit_at(buf, rc, "%#llx ", idxd->hw.opcap.bits[i]); + + rc--; + rc += sysfs_emit_at(buf, rc, "\n"); + return rc; +} +static DEVICE_ATTR_RO(op_cap); + +static ssize_t gen_cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%#llx\n", idxd->hw.gen_cap.bits); +} +static DEVICE_ATTR_RO(gen_cap); + +static ssize_t configurable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)); +} +static DEVICE_ATTR_RO(configurable); + +static ssize_t clients_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + int count = 0, i; + + spin_lock(&idxd->dev_lock); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + count += wq->client_count; + } + spin_unlock(&idxd->dev_lock); + + return sysfs_emit(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(clients); + +static ssize_t pasid_enabled_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", device_pasid_enabled(idxd)); +} +static DEVICE_ATTR_RO(pasid_enabled); + +static ssize_t state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + switch (idxd->state) { + case IDXD_DEV_DISABLED: + return sysfs_emit(buf, "disabled\n"); + case IDXD_DEV_ENABLED: + return sysfs_emit(buf, "enabled\n"); + case IDXD_DEV_HALTED: + return sysfs_emit(buf, "halted\n"); + } + + return sysfs_emit(buf, "unknown\n"); +} 
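+/*
+ * DEVICE_ATTR_RO(state) declares dev_attr_state with mode 0444 and wires up
+ * state_show() as the ->show() callback, equivalent to the explicit __ATTR()
+ * definitions used for the wq and group attributes above.
+ */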
+static DEVICE_ATTR_RO(state); + +static ssize_t errors_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + int i, out = 0; + + spin_lock(&idxd->dev_lock); + for (i = 0; i < 4; i++) + out += sysfs_emit_at(buf, out, "%#018llx ", idxd->sw_err.bits[i]); + spin_unlock(&idxd->dev_lock); + out--; + out += sysfs_emit_at(buf, out, "\n"); + return out; +} +static DEVICE_ATTR_RO(errors); + +static ssize_t max_read_buffers_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->max_rdbufs); +} + +static ssize_t max_tokens_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see max_read_buffers.\n"); + return max_read_buffers_show(dev, attr, buf); +} + +static DEVICE_ATTR_RO(max_tokens); /* deprecated */ +static DEVICE_ATTR_RO(max_read_buffers); + +static ssize_t read_buffer_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->rdbuf_limit); +} + +static ssize_t token_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffer_limit.\n"); + return read_buffer_limit_show(dev, attr, buf); +} + +static ssize_t read_buffer_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + unsigned long val; + int rc; + + rc = kstrtoul(buf, 10, &val); + if (rc < 0) + return -EINVAL; + + if (idxd->state == IDXD_DEV_ENABLED) + return -EPERM; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (!idxd->hw.group_cap.rdbuf_limit) + return -EPERM; + + if (val > idxd->hw.group_cap.total_rdbufs) + return -EINVAL; + + idxd->rdbuf_limit = val; + return count; +} + +static ssize_t token_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + dev_warn_once(dev, "attribute deprecated, see read_buffer_limit\n"); + return read_buffer_limit_store(dev, attr, buf, count); +} + +static DEVICE_ATTR_RW(token_limit); /* deprecated */ +static DEVICE_ATTR_RW(read_buffer_limit); + +static ssize_t cdev_major_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%u\n", idxd->major); +} +static DEVICE_ATTR_RO(cdev_major); + +static ssize_t cmd_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + return sysfs_emit(buf, "%#x\n", idxd->cmd_status); +} + +static ssize_t cmd_status_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_device *idxd = confdev_to_idxd(dev); + + idxd->cmd_status = 0; + return count; +} +static DEVICE_ATTR_RW(cmd_status); + +static struct attribute *idxd_device_attributes[] = { + &dev_attr_version.attr, + &dev_attr_max_groups.attr, + &dev_attr_max_work_queues.attr, + &dev_attr_max_work_queues_size.attr, + &dev_attr_max_engines.attr, + &dev_attr_numa_node.attr, + &dev_attr_max_batch_size.attr, + &dev_attr_max_transfer_size.attr, + &dev_attr_op_cap.attr, + &dev_attr_gen_cap.attr, + &dev_attr_configurable.attr, + &dev_attr_clients.attr, + &dev_attr_pasid_enabled.attr, + &dev_attr_state.attr, + 
&dev_attr_errors.attr,
+	&dev_attr_max_tokens.attr,
+	&dev_attr_max_read_buffers.attr,
+	&dev_attr_token_limit.attr,
+	&dev_attr_read_buffer_limit.attr,
+	&dev_attr_cdev_major.attr,
+	&dev_attr_cmd_status.attr,
+	NULL,
+};
+
+static const struct attribute_group idxd_device_attribute_group = {
+	.attrs = idxd_device_attributes,
+};
+
+static const struct attribute_group *idxd_attribute_groups[] = {
+	&idxd_device_attribute_group,
+	NULL,
+};
+
+static void idxd_conf_device_release(struct device *dev)
+{
+	struct idxd_device *idxd = confdev_to_idxd(dev);
+
+	kfree(idxd->groups);
+	kfree(idxd->wqs);
+	kfree(idxd->engines);
+	ida_free(&idxd_ida, idxd->id);
+	kfree(idxd);
+}
+
+struct device_type dsa_device_type = {
+	.name = "dsa",
+	.release = idxd_conf_device_release,
+	.groups = idxd_attribute_groups,
+};
+
+struct device_type iax_device_type = {
+	.name = "iax",
+	.release = idxd_conf_device_release,
+	.groups = idxd_attribute_groups,
+};
+
+static int idxd_register_engine_devices(struct idxd_device *idxd)
+{
+	struct idxd_engine *engine;
+	int i, j, rc;
+
+	for (i = 0; i < idxd->max_engines; i++) {
+		engine = idxd->engines[i];
+		rc = device_add(engine_confdev(engine));
+		if (rc < 0)
+			goto cleanup;
+	}
+
+	return 0;
+
+cleanup:
+	j = i;
+	for (; i < idxd->max_engines; i++) {
+		engine = idxd->engines[i];
+		put_device(engine_confdev(engine));
+	}
+
+	while (j--) {
+		engine = idxd->engines[j];
+		device_unregister(engine_confdev(engine));
+	}
+	return rc;
+}
+
+static int idxd_register_group_devices(struct idxd_device *idxd)
+{
+	struct idxd_group *group;
+	int i, j, rc;
+
+	for (i = 0; i < idxd->max_groups; i++) {
+		group = idxd->groups[i];
+		rc = device_add(group_confdev(group));
+		if (rc < 0)
+			goto cleanup;
+	}
+
+	return 0;
+
+cleanup:
+	j = i;
+	for (; i < idxd->max_groups; i++) {
+		group = idxd->groups[i];
+		put_device(group_confdev(group));
+	}
+
+	while (j--) {
+		group = idxd->groups[j];
+		device_unregister(group_confdev(group));
+	}
+	return rc;
+}
+
+static int idxd_register_wq_devices(struct idxd_device *idxd)
+{
+	struct idxd_wq *wq;
+	int i, rc, j;
+
+	for (i = 0; i < idxd->max_wqs; i++) {
+		wq = idxd->wqs[i];
+		rc = device_add(wq_confdev(wq));
+		if (rc < 0)
+			goto cleanup;
+	}
+
+	return 0;
+
+cleanup:
+	j = i;
+	for (; i < idxd->max_wqs; i++) {
+		wq = idxd->wqs[i];
+		put_device(wq_confdev(wq));
+	}
+
+	while (j--) {
+		wq = idxd->wqs[j];
+		device_unregister(wq_confdev(wq));
+	}
+	return rc;
+}
+
+int idxd_register_devices(struct idxd_device *idxd)
+{
+	struct device *dev = &idxd->pdev->dev;
+	int rc, i;
+
+	rc = device_add(idxd_confdev(idxd));
+	if (rc < 0)
+		return rc;
+
+	rc = idxd_register_wq_devices(idxd);
+	if (rc < 0) {
+		dev_dbg(dev, "WQ devices registering failed: %d\n", rc);
+		goto err_wq;
+	}
+
+	rc = idxd_register_engine_devices(idxd);
+	if (rc < 0) {
+		dev_dbg(dev, "Engine devices registering failed: %d\n", rc);
+		goto err_engine;
+	}
+
+	rc = idxd_register_group_devices(idxd);
+	if (rc < 0) {
+		dev_dbg(dev, "Group device registering failed: %d\n", rc);
+		goto err_group;
+	}
+
+	return 0;
+
+ err_group:
+	for (i = 0; i < idxd->max_engines; i++)
+		device_unregister(engine_confdev(idxd->engines[i]));
+ err_engine:
+	for (i = 0; i < idxd->max_wqs; i++)
+		device_unregister(wq_confdev(idxd->wqs[i]));
+ err_wq:
+	device_del(idxd_confdev(idxd));
+	return rc;
+}
+
+void idxd_unregister_devices(struct idxd_device *idxd)
+{
+	int i;
+
+	for (i = 0; i < idxd->max_wqs; i++) {
+		struct idxd_wq *wq = idxd->wqs[i];
+
+		device_unregister(wq_confdev(wq));
+	}
+
+	for (i
= 0; i < idxd->max_engines; i++) { + struct idxd_engine *engine = idxd->engines[i]; + + device_unregister(engine_confdev(engine)); + } + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + device_unregister(group_confdev(group)); + } +} + +int idxd_register_bus_type(void) +{ + return bus_register(&dsa_bus_type); +} + +void idxd_unregister_bus_type(void) +{ + bus_unregister(&dsa_bus_type); +} diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 67736c801f3caa69220030b2f1d2bce9b9c0e9f9..5d21916b923a84d65541432260abccc85aee8d62 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1675,8 +1675,7 @@ static enum dma_status sdma_tx_status(struct dma_chan *chan, } spin_unlock_irqrestore(&sdmac->vc.lock, flags); - dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, - residue); + dma_set_residue(txstate, residue); return sdmac->status; } diff --git a/drivers/dma/mmp_tdma.c b/drivers/dma/mmp_tdma.c index 89d90c456c0ce0828febb08652d2a173ab30e301..b4ce2f7effffa51e984f1200b0ce0612d5ee210d 100644 --- a/drivers/dma/mmp_tdma.c +++ b/drivers/dma/mmp_tdma.c @@ -532,8 +532,7 @@ static enum dma_status mmp_tdma_tx_status(struct dma_chan *chan, struct mmp_tdma_chan *tdmac = to_mmp_tdma_chan(chan); tdmac->pos = mmp_tdma_get_pos(tdmac); - dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, - tdmac->buf_len - tdmac->pos); + dma_set_residue(txstate, tdmac->buf_len - tdmac->pos); return tdmac->status; } diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c index 3039bba0e4d559836179fe0f71941da428bd563c..595a65586bef674d35c5aea50b805bf991466a43 100644 --- a/drivers/dma/mxs-dma.c +++ b/drivers/dma/mxs-dma.c @@ -683,8 +683,7 @@ static enum dma_status mxs_dma_tx_status(struct dma_chan *chan, residue -= bar; } - dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, - residue); + dma_set_residue(txstate, residue); return mxs_chan->status; } diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index 4bbf4172b9bf98caef63191e6a2e45fc86528271..0db816eb8080db32c60d21b441540318f1a1a7d9 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -15,6 +15,8 @@ #include #include +#include "dmaengine.h" + static LIST_HEAD(of_dma_list); static DEFINE_MUTEX(of_dma_lock); diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index e73ca303f1a7e37ebdfbb7b877986475f585857b..7dbf4935d58086cf86a518102a372d2aee3f66e1 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -214,7 +214,9 @@ const char * const edac_mem_types[] = { [MEM_DDR4] = "Unbuffered-DDR4", [MEM_RDDR4] = "Registered-DDR4", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", + [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", + [MEM_HBM2] = "High-bandwidth-memory-Gen2", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 8a1913860d087d5b9de8d956eaba3132cc1b85f4..07e443b69d604ee27a0e9db7ddbafdbe5c8cb4b2 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.3" +#define I10NM_REVISION "v0.0.5" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -24,20 +24,271 @@ pci_read_config_dword((d)->uracu, 0xd0, &(reg)) #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) +#define I10NM_GET_SAD(d, offset, i, reg)\ + pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) +#define I10NM_GET_HBM_IMC_BAR(d, reg) \ + 
pci_read_config_dword((d)->uracu, 0xd4, &(reg)) +#define I10NM_GET_CAPID3_CFG(d, reg) \ + pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * 0x4000 + (j) * 4) -#define I10NM_GET_MCDDRTCFG(m, i, j) \ - readl((m)->mbase + 0x20970 + (i) * 0x4000 + (j) * 4) + readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ + (i) * (m)->chan_mmio_sz + (j) * 4) +#define I10NM_GET_MCDDRTCFG(m, i) \ + readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * 0x4000) + readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_AMAP(m, i) \ + readl((m)->mbase + ((m)->hbm_mc ? 0x814 : 0x20814) + \ + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_REG32(m, i, offset) \ + readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_GET_REG64(m, i, offset) \ + readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_SET_REG32(m, i, offset, v) \ + writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg) \ + ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) + +#define I10NM_HBM_IMC_MMIO_SIZE 0x9000 +#define I10NM_IS_HBM_PRESENT(reg) GET_BITFIELD(reg, 27, 30) +#define I10NM_IS_HBM_IMC(reg) GET_BITFIELD(reg, 29, 29) + +#define RETRY_RD_ERR_LOG_UC BIT(1) +#define RETRY_RD_ERR_LOG_NOOVER BIT(14) +#define RETRY_RD_ERR_LOG_EN BIT(15) +#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) +#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) + +#define I10NM_MAX_SAD 16 +#define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) +#define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) static struct list_head *i10nm_edac_list; +static struct res_config *res_cfg; +static int retry_rd_err_log; +static int decoding_via_mca; +static bool mem_cfg_2lm; + +static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr_hbm0[] = {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8}; +static u32 offsets_scrub_spr_hbm1[] = {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8}; +static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand2_spr[] = {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10}; +static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}; +static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}; + +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable, + u32 *offsets_scrub, u32 *offsets_demand, + u32 *offsets_demand2) +{ + u32 s, d, d2; + + s = I10NM_GET_REG32(imc, chan, offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, offsets_demand[0]); + if (offsets_demand2) + d2 = I10NM_GET_REG32(imc, chan, offsets_demand2[0]); + + if (enable) { + /* Save default configurations */ + imc->chan[chan].retry_rd_err_log_s = s; + imc->chan[chan].retry_rd_err_log_d = d; + if (offsets_demand2) + imc->chan[chan].retry_rd_err_log_d2 = d2; + + s &= 
~RETRY_RD_ERR_LOG_NOOVER_UC; + s |= RETRY_RD_ERR_LOG_EN; + d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + d |= RETRY_RD_ERR_LOG_EN; + + if (offsets_demand2) { + d2 &= ~RETRY_RD_ERR_LOG_UC; + d2 |= RETRY_RD_ERR_LOG_NOOVER; + d2 |= RETRY_RD_ERR_LOG_EN; + } + } else { + /* Restore default configurations */ + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) + s |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER) + s |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN)) + s &= ~RETRY_RD_ERR_LOG_EN; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC) + d |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER) + d |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) + d &= ~RETRY_RD_ERR_LOG_EN; + + if (offsets_demand2) { + if (imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_UC) + d2 |= RETRY_RD_ERR_LOG_UC; + if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_NOOVER)) + d2 &= ~RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_EN)) + d2 &= ~RETRY_RD_ERR_LOG_EN; + } + } + + I10NM_SET_REG32(imc, chan, offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, offsets_demand[0], d); + if (offsets_demand2) + I10NM_SET_REG32(imc, chan, offsets_demand2[0], d2); +} + +static void enable_retry_rd_err_log(bool enable) +{ + struct skx_imc *imc; + struct skx_dev *d; + int i, j; + + edac_dbg(2, "\n"); + + list_for_each_entry(d, i10nm_edac_list, list) + for (i = 0; i < I10NM_NUM_IMC; i++) { + imc = &d->imc[i]; + if (!imc->mbase) + continue; + + for (j = 0; j < I10NM_NUM_CHANNELS; j++) { + if (imc->hbm_mc) { + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm0, + res_cfg->offsets_demand_hbm0, + NULL); + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm1, + res_cfg->offsets_demand_hbm1, + NULL); + } else { + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub, + res_cfg->offsets_demand, + res_cfg->offsets_demand2); + } + } + } +} + +static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, + int len, bool scrub_err) +{ + struct skx_imc *imc = &res->dev->imc[res->imc]; + u32 log0, log1, log2, log3, log4; + u32 corr0, corr1, corr2, corr3; + u32 lxg0, lxg1, lxg3, lxg4; + u32 *xffsets = NULL; + u64 log2a, log5; + u64 lxg2a, lxg5; + u32 *offsets; + int n, pch; + + if (!imc->mbase) + return; + + if (imc->hbm_mc) { + pch = res->cs & 1; + + if (pch) + offsets = scrub_err ? res_cfg->offsets_scrub_hbm1 : + res_cfg->offsets_demand_hbm1; + else + offsets = scrub_err ? 
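/*
 * Editor's note: pch (res->cs & 1) appears to select between the two
 * HBM pseudo-channel register banks; this branch is pseudo-channel 0.
 */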
res_cfg->offsets_scrub_hbm0 : + res_cfg->offsets_demand_hbm0; + } else { + if (scrub_err) { + offsets = res_cfg->offsets_scrub; + } else { + offsets = res_cfg->offsets_demand; + xffsets = res_cfg->offsets_demand2; + } + } + + log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); + log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); + log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]); + log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); + log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + + if (xffsets) { + lxg0 = I10NM_GET_REG32(imc, res->channel, xffsets[0]); + lxg1 = I10NM_GET_REG32(imc, res->channel, xffsets[1]); + lxg3 = I10NM_GET_REG32(imc, res->channel, xffsets[3]); + lxg4 = I10NM_GET_REG32(imc, res->channel, xffsets[4]); + lxg5 = I10NM_GET_REG64(imc, res->channel, xffsets[5]); + } + + if (res_cfg->type == SPR) { + log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx", + log0, log1, log2a, log3, log4, log5); + + if (len - n > 0) { + if (xffsets) { + lxg2a = I10NM_GET_REG64(imc, res->channel, xffsets[2]); + n += snprintf(msg + n, len - n, " %.8x %.8x %.16llx %.8x %.8x %.16llx]", + lxg0, lxg1, lxg2a, lxg3, lxg4, lxg5); + } else { + n += snprintf(msg + n, len - n, "]"); + } + } + } else { + log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", + log0, log1, log2, log3, log4, log5); + } + + if (imc->hbm_mc) { + if (pch) { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x2c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x2c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x2c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x2c24); + } else { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x2818); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x281c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x2820); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x2824); + } + } else { + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + } + + if (len - n > 0) + snprintf(msg + n, len - n, + " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", + corr0 & 0xffff, corr0 >> 16, + corr1 & 0xffff, corr1 >> 16, + corr2 & 0xffff, corr2 >> 16, + corr3 & 0xffff, corr3 >> 16); + + /* Clear status bits */ + if (retry_rd_err_log == 2) { + if (log0 & RETRY_RD_ERR_LOG_OVER_UC_V) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } + + if (xffsets && (lxg0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + lxg0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, xffsets[0], lxg0); + } + } +} + static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, unsigned int dev, unsigned int fun) { @@ -61,7 +312,165 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, return pdev; } -static int i10nm_get_all_munits(void) +static bool i10nm_check_2lm(struct res_config *cfg) +{ + struct skx_dev *d; + u32 reg; + int i; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1], + PCI_SLOT(cfg->sad_all_devfn), + PCI_FUNC(cfg->sad_all_devfn)); + if (!d->sad_all) + continue; + + for (i = 0; i < I10NM_MAX_SAD; i++) { + I10NM_GET_SAD(d, cfg->sad_all_offset, i, reg); + if (I10NM_SAD_ENABLE(reg) && I10NM_SAD_NM_CACHEABLE(reg)) { + edac_dbg(2, "2-level memory 
configuration.\n"); + return true; + } + } + } + + return false; +} + +/* + * Check whether the error comes from DDRT by ICX/Tremont/SPR model specific error code. + * Refer to SDM vol3B 17.11.3/17.13.2 Intel IMC MC error codes for IA32_MCi_STATUS. + */ +static bool i10nm_mscod_is_ddrt(u32 mscod) +{ + switch (res_cfg->type) { + case I10NM: + switch (mscod) { + case 0x0106: case 0x0107: + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + break; + case SPR: + switch (mscod) { + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + break; + default: + return false; + } + + return false; +} + +static bool i10nm_mc_decode_available(struct mce *mce) +{ +#define ICX_IMCx_CHy 0x06666000 + u8 bank; + + if (!decoding_via_mca || mem_cfg_2lm) + return false; + + if ((mce->status & (MCI_STATUS_MISCV | MCI_STATUS_ADDRV)) + != (MCI_STATUS_MISCV | MCI_STATUS_ADDRV)) + return false; + + bank = mce->bank; + + switch (res_cfg->type) { + case I10NM: + /* Check whether the bank is one of {13,14,17,18,21,22,25,26} */ + if (!(ICX_IMCx_CHy & (1 << bank))) + return false; + break; + case SPR: + if (bank < 13 || bank > 20) + return false; + break; + default: + return false; + } + + /* DDRT errors can't be decoded from MCA bank registers */ + if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + return false; + + if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + return false; + + return true; +} + +static bool i10nm_mc_decode(struct decoded_addr *res) +{ + struct mce *m = res->mce; + struct skx_dev *d; + u8 bank; + + if (!i10nm_mc_decode_available(m)) + return false; + + list_for_each_entry(d, i10nm_edac_list, list) { + if (d->imc[0].src_id == m->socketid) { + res->socket = m->socketid; + res->dev = d; + break; + } + } + + switch (res_cfg->type) { + case I10NM: + bank = m->bank - 13; + res->imc = bank / 4; + res->channel = bank % 2; + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 39); + res->bank_group = GET_BITFIELD(m->misc, 40, 41); + res->bank_address = GET_BITFIELD(m->misc, 42, 43); + res->bank_group |= GET_BITFIELD(m->misc, 44, 44) << 2; + res->rank = GET_BITFIELD(m->misc, 56, 58); + res->dimm = res->rank >> 2; + res->rank = res->rank % 4; + break; + case SPR: + bank = m->bank - 13; + res->imc = bank / 2; + res->channel = bank % 2; + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 36); + res->bank_group = GET_BITFIELD(m->misc, 37, 38); + res->bank_address = GET_BITFIELD(m->misc, 39, 40); + res->bank_group |= GET_BITFIELD(m->misc, 41, 41) << 2; + res->rank = GET_BITFIELD(m->misc, 57, 57); + res->dimm = GET_BITFIELD(m->misc, 58, 58); + break; + default: + return false; + } + + if (!res->dev) { + skx_printk(KERN_ERR, "No device for src_id %d imc %d\n", + m->socketid, res->imc); + return false; + } + + return true; +} + +static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; void __iomem *mbase; @@ -89,7 +498,7 @@ static int i10nm_get_all_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < I10NM_NUM_IMC; i++) { + for (i = 0; i < I10NM_NUM_DDR_IMC; i++) { mdev = pci_get_dev_wrapper(d->seg, d->bus[0], 12 + i, 0); if (i == 0 && !mdev) { @@ -125,22 +534,144 @@ static int i10nm_get_all_munits(void) return 0; } +static bool 
i10nm_check_hbm_imc(struct skx_dev *d) +{ + u32 reg; + + if (I10NM_GET_CAPID3_CFG(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get capid3_cfg\n"); + return false; + } + + return I10NM_IS_HBM_PRESENT(reg) != 0; +} + +static int i10nm_get_hbm_munits(void) +{ + struct pci_dev *mdev; + void __iomem *mbase; + u32 reg, off, mcmtr; + struct skx_dev *d; + int i, lmc; + u64 base; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3); + if (!d->pcu_cr3) + return -ENODEV; + + if (!i10nm_check_hbm_imc(d)) { + i10nm_printk(KERN_DEBUG, "No hbm memory\n"); + return -ENODEV; + } + + if (I10NM_GET_SCK_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get socket bar\n"); + return -ENODEV; + } + base = I10NM_GET_SCK_MMIO_BASE(reg); + + if (I10NM_GET_HBM_IMC_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get hbm mc bar\n"); + return -ENODEV; + } + base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg); + + lmc = I10NM_NUM_DDR_IMC; + + for (i = 0; i < I10NM_NUM_HBM_IMC; i++) { + mdev = pci_get_dev_wrapper(d->seg, d->bus[0], + 12 + i / 4, 1 + i % 4); + if (i == 0 && !mdev) { + i10nm_printk(KERN_ERR, "No hbm mc found\n"); + return -ENODEV; + } + if (!mdev) + continue; + + d->imc[lmc].mdev = mdev; + off = i * I10NM_HBM_IMC_MMIO_SIZE; + + edac_dbg(2, "hbm mc%d mmio base 0x%llx size 0x%x\n", + lmc, base + off, I10NM_HBM_IMC_MMIO_SIZE); + + mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); + if (!mbase) { + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", + base + off); + return -ENOMEM; + } + + d->imc[lmc].mbase = mbase; + d->imc[lmc].hbm_mc = true; + + mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); + if (!I10NM_IS_HBM_IMC(mcmtr)) { + iounmap(d->imc[lmc].mbase); + d->imc[lmc].mbase = NULL; + d->imc[lmc].hbm_mc = false; + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); + return -ENODEV; + } + + lmc++; + } + } + + return 0; +} + static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xcc, + .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config i10nm_cfg1 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, +}; + +static struct res_config spr_cfg = { + .type = SPR, + .decs_did = 0x3252, + .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x8000, + .hbm_chan_mmio_sz = 0x4000, + .support_ddr5 = true, + .sad_all_devfn = PCI_DEVFN(10, 0), + .sad_all_offset = 0x300, + .offsets_scrub = offsets_scrub_spr, + .offsets_scrub_hbm0 = offsets_scrub_spr_hbm0, + .offsets_scrub_hbm1 = offsets_scrub_spr_hbm1, + .offsets_demand = offsets_demand_spr, + .offsets_demand2 = offsets_demand2_spr, + .offsets_demand_hbm0 = offsets_demand_spr_hbm0, + .offsets_demand_hbm1 = offsets_demand_spr_hbm1, }; static const struct x86_cpu_id i10nm_cpuids[] = { { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT_D, 0, (kernel_ulong_t)&i10nm_cfg0 }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_X, 0, (kernel_ulong_t)&i10nm_cfg0 }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_D, 0, (kernel_ulong_t)&i10nm_cfg1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_SAPPHIRERAPIDS_X, 0, (kernel_ulong_t)&spr_cfg }, { } }; 
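/*
 * Editor's note: a minimal, self-contained user-space sketch (not part of
 * the patch) of the IA32_MCi_MISC bitfield decode done by i10nm_mc_decode()
 * above.  get_bitfield() mirrors the kernel's GET_BITFIELD() macro, the
 * field positions are the ICX (I10NM) ones from the switch above, and the
 * sample MISC value is hypothetical.  The SPR case has the same shape, only
 * with different bit positions (e.g. row is bits 19-36 there).
 */
#include <stdint.h>
#include <stdio.h>

/* Extract bits [lo, hi] of v, inclusive, like the kernel's GET_BITFIELD(). */
static uint64_t get_bitfield(uint64_t v, unsigned int lo, unsigned int hi)
{
	return (v >> lo) & ((1ULL << (hi - lo + 1)) - 1);
}

int main(void)
{
	uint64_t misc = 0x123456789abcdef0ULL;	/* hypothetical IA32_MCi_MISC */

	printf("column:     0x%llx\n",
	       (unsigned long long)(get_bitfield(misc, 9, 18) << 2));
	printf("row:        0x%llx\n",
	       (unsigned long long)get_bitfield(misc, 19, 39));
	printf("bank group: 0x%llx\n",
	       (unsigned long long)(get_bitfield(misc, 40, 41) |
				    (get_bitfield(misc, 44, 44) << 2)));
	return 0;
}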
MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); @@ -155,30 +686,32 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int i10nm_get_dimm_config(struct mem_ctl_info *mci) +static int i10nm_get_dimm_config(struct mem_ctl_info *mci, + struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; struct skx_imc *imc = pvt->imc; + u32 mtr, amap, mcddrtcfg; struct dimm_info *dimm; - u32 mtr, mcddrtcfg; int i, j, ndimms; - for (i = 0; i < I10NM_NUM_CHANNELS; i++) { + for (i = 0; i < imc->num_channels; i++) { if (!imc->mbase) continue; ndimms = 0; - for (j = 0; j < I10NM_NUM_DIMMS; j++) { + amap = I10NM_GET_AMAP(imc, i); + mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); + for (j = 0; j < imc->num_dimms; j++) { dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); - mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j); edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n", mtr, mcddrtcfg, imc->mc, i, j); if (IS_DIMM_PRESENT(mtr)) - ndimms += skx_get_dimm_info(mtr, 0, 0, dimm, - imc, i, j); + ndimms += skx_get_dimm_info(mtr, 0, amap, dimm, + imc, i, j, cfg); else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -267,6 +800,7 @@ static int __init i10nm_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + res_cfg = cfg; /* Newer steppings have different offset for ATOM_TREMONT_D/ICELAKE_X */ if (boot_cpu_data.x86_stepping >= 4) @@ -284,8 +818,12 @@ static int __init i10nm_init(void) return -ENODEV; } - rc = i10nm_get_all_munits(); - if (rc < 0) + mem_cfg_2lm = i10nm_check_2lm(cfg); + skx_set_mem_cfg(mem_cfg_2lm); + + rc = i10nm_get_ddr_munits(); + + if (i10nm_get_hbm_munits() && rc) goto fail; list_for_each_entry(d, i10nm_edac_list, list) { @@ -306,10 +844,19 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; + if (d->imc[i].hbm_mc) { + d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_HBM_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_HBM_DIMMS; + } else { + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_DDR_DIMMS; + } rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, - i10nm_get_dimm_config); + i10nm_get_dimm_config, cfg); if (rc < 0) goto fail; } @@ -323,6 +870,14 @@ static int __init i10nm_init(void) mce_register_decode_chain(&i10nm_mce_dec); setup_i10nm_debug(); + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(i10nm_mc_decode, show_retry_rd_err_log); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(true); + } else { + skx_set_decode(i10nm_mc_decode, NULL); + } + i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); return 0; @@ -334,6 +889,13 @@ static int __init i10nm_init(void) static void __exit i10nm_exit(void) { edac_dbg(2, "\n"); + + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, NULL); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(false); + } + teardown_i10nm_debug(); mce_unregister_decode_chain(&i10nm_mce_dec); skx_adxl_put(); @@ -343,5 +905,36 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +static int set_decoding_via_mca(const char *buf, const struct kernel_param *kp) +{ + unsigned long val; + int ret; + + ret = kstrtoul(buf, 0, &val); + + if (ret || val > 1) 
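/* the parameter is boolean in effect: anything other than 0 or 1 is rejected */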
+ return -EINVAL; + + if (val && mem_cfg_2lm) { + i10nm_printk(KERN_NOTICE, "Decoding errors via MCA banks for 2LM isn't supported yet\n"); + return -EIO; + } + + ret = param_set_int(buf, kp); + + return ret; +} + +static const struct kernel_param_ops decoding_via_mca_param_ops = { + .set = set_decoding_via_mca, + .get = param_get_int, +}; + +module_param_cb(decoding_via_mca, &decoding_via_mca_param_ops, &decoding_via_mca, 0644); +MODULE_PARM_DESC(decoding_via_mca, "decoding_via_mca: 0=off(default), 1=enable"); + +module_param(retry_rd_err_log, int, 0444); +MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)"); + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors"); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 49899108cb008bed2542e08f7d9fb9ed441704d5..1d078c37e856211d72f844e97322683199b6d9d6 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -174,7 +174,7 @@ static bool skx_check_ecc(u32 mcmtr) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int skx_get_dimm_config(struct mem_ctl_info *mci) +static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; u32 mtr, mcmtr, amap, mcddrtcfg; @@ -196,7 +196,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); if (IS_DIMM_PRESENT(mtr)) { - ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j); + ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j, cfg); } else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) { ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -231,7 +231,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) #define SKX_ILV_TARGET(tgt) ((tgt) & 7) static void skx_show_retry_rd_err_log(struct decoded_addr *res, - char *msg, int len) + char *msg, int len, + bool scrub_err) { u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; @@ -703,7 +704,7 @@ static int __init skx_init(void) d->imc[i].node_id = node_id; rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, - skx_get_dimm_config); + skx_get_dimm_config, cfg); if (rc < 0) goto fail; } @@ -711,8 +712,13 @@ static int __init skx_init(void) skx_set_decode(skx_decode, skx_show_retry_rd_err_log); - if (nvdimm_count && skx_adxl_get() == -ENODEV) - skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n"); + if (nvdimm_count && skx_adxl_get() != -ENODEV) { + skx_set_decode(NULL, skx_show_retry_rd_err_log); + } else { + if (nvdimm_count) + skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n"); + skx_set_decode(skx_decode, skx_show_retry_rd_err_log); + } /* Ensure that the OPSTATE is set correctly for POLL or NMI */ opstate_init(); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 8ceb993295cec0f0a2fdda052040c291fb817a8a..296f8b68ded2c1d87f4c1b9f245962356cd1c811 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -23,10 +23,15 @@ #include "skx_common.h" static const char * const component_names[] = { - [INDEX_SOCKET] = "ProcessorSocketId", - [INDEX_MEMCTRL] = "MemoryControllerId", - [INDEX_CHANNEL] = "ChannelId", - [INDEX_DIMM] = "DimmSlotId", + [INDEX_SOCKET] = "ProcessorSocketId", + [INDEX_MEMCTRL] = "MemoryControllerId", + [INDEX_CHANNEL] = "ChannelId", + [INDEX_DIMM] = 
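/*
 * Editor's note: the Nm*-prefixed names added below describe the near-memory
 * (first-level) side of a 2-level memory configuration; see how
 * skx_adxl_decode() selects them when the error hit first-level memory.
 */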
"DimmSlotId", + [INDEX_CS] = "ChipSelect", + [INDEX_NM_MEMCTRL] = "NmMemoryControllerId", + [INDEX_NM_CHANNEL] = "NmChannelId", + [INDEX_NM_DIMM] = "NmDimmSlotId", + [INDEX_NM_CS] = "NmChipSelect", }; static int component_indices[ARRAY_SIZE(component_names)]; @@ -34,12 +39,14 @@ static int adxl_component_count; static const char * const *adxl_component_names; static u64 *adxl_values; static char *adxl_msg; +static unsigned long adxl_nm_bitmap; static char skx_msg[MSG_SIZE]; -static skx_decode_f skx_decode; +static skx_decode_f driver_decode; static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); +static bool skx_mem_cfg_2lm; int __init skx_adxl_get(void) { @@ -56,14 +63,25 @@ int __init skx_adxl_get(void) for (j = 0; names[j]; j++) { if (!strcmp(component_names[i], names[j])) { component_indices[i] = j; + + if (i >= INDEX_NM_FIRST) + adxl_nm_bitmap |= 1 << i; + break; } } - if (!names[j]) + if (!names[j] && i < INDEX_NM_FIRST) goto err; } + if (skx_mem_cfg_2lm) { + if (!adxl_nm_bitmap) + skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n"); + else + edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap); + } + adxl_component_names = names; while (*names++) adxl_component_count++; @@ -99,7 +117,7 @@ void __exit skx_adxl_put(void) kfree(adxl_msg); } -static bool skx_adxl_decode(struct decoded_addr *res) +static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem) { struct skx_dev *d; int i, len = 0; @@ -116,11 +134,23 @@ static bool skx_adxl_decode(struct decoded_addr *res) } res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; - res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; - res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; - res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + if (error_in_1st_level_mem) { + res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? + (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1; + res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ? + (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1; + res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ? + (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1; + res->cs = (adxl_nm_bitmap & BIT_NM_CS) ? 
+ (int)adxl_values[component_indices[INDEX_NM_CS]] : -1; + } else { + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; + res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + res->cs = (int)adxl_values[component_indices[INDEX_CS]]; + } - if (res->imc > NUM_IMC - 1) { + if (res->imc > NUM_IMC - 1 || res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -148,12 +178,19 @@ static bool skx_adxl_decode(struct decoded_addr *res) break; } + res->decoded_by_adxl = true; + return true; } +void skx_set_mem_cfg(bool mem_cfg_2lm) +{ + skx_mem_cfg_2lm = mem_cfg_2lm; +} + void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { - skx_decode = decode; + driver_decode = decode; skx_show_retry_rd_err_log = show_retry_log; } @@ -304,14 +341,27 @@ static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno) + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg) { - int banks = 16, ranks, rows, cols, npages; + int banks, ranks, rows, cols, npages; + enum mem_type mtype; u64 size; ranks = numrank(mtr); rows = numrow(mtr); - cols = numcol(mtr); + cols = imc->hbm_mc ? 6 : numcol(mtr); + + if (imc->hbm_mc) { + banks = 32; + mtype = MEM_HBM2; + } else if (cfg->support_ddr5 && (amap & 0x8)) { + banks = 32; + mtype = MEM_DDR5; + } else { + banks = 16; + mtype = MEM_DDR4; + } /* * Compute size in 8-byte (2^3) words, then shift to MiB (2^20) @@ -332,10 +382,15 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->nr_pages = npages; dimm->grain = 32; dimm->dtype = get_width(mtr); - dimm->mtype = MEM_DDR4; + dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ - snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", - imc->src_id, imc->lmc, chan, dimmno); + + if (imc->hbm_mc) + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u", + imc->src_id, imc->lmc, chan); + else + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", + imc->src_id, imc->lmc, chan, dimmno); return 1; } @@ -390,7 +445,8 @@ int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config) + get_dimm_config_f get_dimm_config, + struct res_config *cfg) { struct mem_ctl_info *mci; struct edac_mc_layer layers[2]; @@ -425,13 +481,15 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, } mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM; + if (cfg->support_ddr5) + mci->mtype_cap |= MEM_FLAG_DDR5; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = mod_str; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; - rc = get_dimm_config(mci); + rc = get_dimm_config(mci, cfg); if (rc < 0) goto fail; @@ -481,6 +539,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); + bool scrub_err = false; bool recoverable; int len; u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); @@ -532,29 +591,30 @@ static void 
skx_mce_output_error(struct mem_ctl_info *mci, break; case 4: optype = "memory scrubbing error"; + scrub_err = true; break; default: optype = "reserved"; break; } } - if (adxl_component_count) { + if (res->decoded_by_adxl) { len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", mscod, errcode, adxl_msg); } else { len = snprintf(skx_msg, MSG_SIZE, - "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x", + "%s%s err_code:0x%04x:0x%04x ProcessorSocketId:0x%x MemoryControllerId:0x%x PhysicalRankId:0x%x Row:0x%x Column:0x%x Bank:0x%x BankGroup:0x%x", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", mscod, errcode, res->socket, res->imc, res->rank, - res->bank_group, res->bank_address, res->row, res->column); + res->row, res->column, res->bank_address, res->bank_group); } if (skx_show_retry_rd_err_log) - skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); + skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err); edac_dbg(0, "%s\n", skx_msg); @@ -565,6 +625,27 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, skx_msg); } +static bool skx_error_in_1st_level_mem(const struct mce *m) +{ + u32 errcode; + + if (!skx_mem_cfg_2lm) + return false; + + errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; + + return errcode == MCACOD_EXT_MEM_ERR; +} + +static bool skx_error_in_mem(const struct mce *m) +{ + u32 errcode; + + errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; + + return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR); +} + int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { @@ -579,18 +660,19 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE; - /* ignore unless this is memory related with an address */ - if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) + /* Ignore unless this is memory related with an address */ + if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; memset(&res, 0, sizeof(res)); + res.mce = mce; res.addr = mce->addr; - if (adxl_component_count) { - if (!skx_adxl_decode(&res)) + /* Try driver decoder first */ + if (!(driver_decode && driver_decode(&res))) { + /* Then try firmware decoder (ACPI DSM methods) */ + if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce)))) return NOTIFY_DONE; - } else if (!skx_decode || !skx_decode(&res)) { - return NOTIFY_DONE; } mci = res.dev->imc[res.imc].mci; @@ -648,6 +730,8 @@ void skx_remove(void) } if (d->util_all) pci_dev_put(d->util_all); + if (d->pcu_cr3) + pci_dev_put(d->pcu_cr3); if (d->sad_all) pci_dev_put(d->sad_all); if (d->uracu) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 78f8c1de0b71c80ddcdb7d64fc4bd63a064d0f3f..312032657264912352a5e804709f3da49978d73e 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -9,6 +9,9 @@ #ifndef _SKX_COMM_EDAC_H #define _SKX_COMM_EDAC_H +#include +#include + #define MSG_SIZE 1024 /* @@ -30,9 +33,17 @@ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_IMC 4 -#define I10NM_NUM_CHANNELS 2 -#define I10NM_NUM_DIMMS 2 +#define I10NM_NUM_DDR_IMC 4 +#define I10NM_NUM_DDR_CHANNELS 2 +#define I10NM_NUM_DDR_DIMMS 2 + +#define 
I10NM_NUM_HBM_IMC 16 +#define I10NM_NUM_HBM_CHANNELS 2 +#define I10NM_NUM_HBM_DIMMS 1 + +#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) +#define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) +#define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) @@ -42,6 +53,33 @@ #define IS_DIMM_PRESENT(r) GET_BITFIELD(r, 15, 15) #define IS_NVDIMM_PRESENT(r, i) GET_BITFIELD(r, i, i) +#define MCI_MISC_ECC_MODE(m) (((m) >> 59) & 15) +#define MCI_MISC_ECC_DDRT 8 /* read from DDRT */ + +/* + * According to Intel Architecture spec vol 3B, + * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding" + * memory errors should fit one of these masks: + * 000f 0000 1mmm cccc (binary) + * 000f 0010 1mmm cccc (binary) [RAM used as cache] + * where: + * f = Correction Report Filtering Bit. If 1, subsequent errors + * won't be shown + * mmm = error type + * cccc = channel + */ +#define MCACOD_MEM_ERR_MASK 0xef80 +/* + * Errors from either the memory of the 1-level memory system or the + * 2nd level memory (the slow "far" memory) of the 2-level memory system. + */ +#define MCACOD_MEM_CTL_ERR 0x80 +/* + * Errors from the 1st level memory (the fast "near" memory as cache) + * of the 2-level memory system. + */ +#define MCACOD_EXT_MEM_ERR 0x280 + /* * Each cpu socket contains some pci devices that provide global * information, and also some that are local to each of the two @@ -54,17 +92,25 @@ struct skx_dev { struct pci_dev *sad_all; struct pci_dev *util_all; struct pci_dev *uracu; /* for i10nm CPU */ + struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ + int chan_mmio_sz; /* for i10nm CPU */ + int num_channels; /* channels per memory controller */ + int num_dimms; /* dimms per channel */ + bool hbm_mc; u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; + u32 retry_rd_err_log_s; + u32 retry_rd_err_log_d; + u32 retry_rd_err_log_d2; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -82,7 +128,8 @@ struct skx_pvt { enum type { SKX, - I10NM + I10NM, + SPR }; enum { @@ -90,10 +137,22 @@ enum { INDEX_MEMCTRL, INDEX_CHANNEL, INDEX_DIMM, + INDEX_CS, + INDEX_NM_FIRST, + INDEX_NM_MEMCTRL = INDEX_NM_FIRST, + INDEX_NM_CHANNEL, + INDEX_NM_DIMM, + INDEX_NM_CS, INDEX_MAX }; +#define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) +#define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) +#define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) +#define BIT_NM_CS BIT_ULL(INDEX_NM_CS) + struct decoded_addr { + struct mce *mce; struct skx_dev *dev; u64 addr; int socket; @@ -103,6 +162,7 @@ struct decoded_addr { int sktways; int chanways; int dimm; + int cs; int rank; int channel_rank; u64 rank_address; @@ -110,6 +170,7 @@ struct decoded_addr { int column; int bank_address; int bank_group; + bool decoded_by_adxl; }; struct res_config { @@ -118,15 +179,33 @@ struct res_config { unsigned int decs_did; /* Default bus number configuration register offset */ int busno_cfg_offset; + /* Per DDR channel memory-mapped I/O size */ + int ddr_chan_mmio_sz; + /* Per HBM channel memory-mapped I/O size */ + int hbm_chan_mmio_sz; + bool support_ddr5; + /* SAD device number and function number */ + unsigned int sad_all_devfn; + int sad_all_offset; + /* Offsets of retry_rd_err_log registers */ 
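/* (Editor's note: each of these arrays holds the per-channel MMIO offsets
    of the six retry_rd_err_log registers; entry [0] is the control/status
    word carrying the EN/NOOVER/UC bits toggled in
    __enable_retry_rd_err_log().) */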
+ u32 *offsets_scrub; + u32 *offsets_scrub_hbm0; + u32 *offsets_scrub_hbm1; + u32 *offsets_demand; + u32 *offsets_demand2; + u32 *offsets_demand_hbm0; + u32 *offsets_demand_hbm1; }; -typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci); +typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, + struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); -typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); +typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err); int __init skx_adxl_get(void); void __exit skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); +void skx_set_mem_cfg(bool mem_cfg_2lm); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); @@ -136,14 +215,16 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list); int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm); int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno); + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg); int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, const char *mod_str); int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config); + get_dimm_config_f get_dimm_config, + struct res_config *cfg); int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 1da7ba18d3993207a2797ff53fc387685c756edc..c777088f5828bce4337fce24bc58c1158cfdbf16 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1646,14 +1646,6 @@ static long fw_device_op_ioctl(struct file *file, return dispatch_ioctl(file->private_data, cmd, (void __user *)arg); } -#ifdef CONFIG_COMPAT -static long fw_device_op_compat_ioctl(struct file *file, - unsigned int cmd, unsigned long arg) -{ - return dispatch_ioctl(file->private_data, cmd, compat_ptr(arg)); -} -#endif - static int fw_device_op_mmap(struct file *file, struct vm_area_struct *vma) { struct client *client = file->private_data; @@ -1795,7 +1787,5 @@ const struct file_operations fw_device_ops = { .mmap = fw_device_op_mmap, .release = fw_device_op_release, .poll = fw_device_op_poll, -#ifdef CONFIG_COMPAT - .compat_ioctl = fw_device_op_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, }; diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 3222645c95b338189bdbdc45cb41dd30425b2f63..4a3355c75dbdd3b901599c56d4a7e549af5bf351 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -75,6 +75,27 @@ config EFI_MAX_FAKE_MEM Ranges can be set up to this value using comma-separated list. The default value is 8. +config EFI_SOFT_RESERVE + bool "Reserve EFI Specific Purpose Memory" + depends on EFI && EFI_STUB && ACPI_HMAT + default ACPI_HMAT + help + On systems that have mixed performance classes of memory EFI + may indicate specific purpose memory with an attribute (See + EFI_MEMORY_SP in UEFI 2.8). A memory range tagged with this + attribute may have unique performance characteristics compared + to the system's general purpose "System RAM" pool. 
On the + expectation that such memory has application specific usage, + and its base EFI memory type is "conventional" answer Y to + arrange for the kernel to reserve it as a "Soft Reserved" + resource, and set aside for direct-access (device-dax) by + default. The memory range can later be optionally assigned to + the page allocator by system administrator policy via the + device-dax kmem facility. Say N to have the kernel treat this + memory as "System RAM" by default. + + If unsure, say Y. + config EFI_PARAMS_FROM_FDT bool help diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 4ac2de4dfa72aaf6e22a351417dd5c1dac207819..554d795270d9ead0cbddbc473a6b87975e2a7fc5 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -20,13 +20,16 @@ obj-$(CONFIG_UEFI_CPER) += cper.o obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ -obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o +obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_map.o obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o obj-$(CONFIG_EFI_TEST) += test/ obj-$(CONFIG_EFI_DEV_PATH_PARSER) += dev-path-parser.o obj-$(CONFIG_APPLE_PROPERTIES) += apple-properties.o obj-$(CONFIG_EFI_RCI2_TABLE) += rci2-table.o +fake_map-y += fake_mem.o +fake_map-$(CONFIG_X86) += x86_fake_mem.o + arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) obj-$(CONFIG_ARM64) += $(arm-obj-y) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 311cd349a8628bbe1e8b8441f5be32dc9ac71204..904fa09e6a6b0341ab3437080a25ed7b228bd806 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -163,6 +163,15 @@ static __init int is_usable_memory(efi_memory_desc_t *md) case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: case EFI_PERSISTENT_MEMORY: + /* + * Special purpose memory is 'soft reserved', which means it + * is set aside initially, but can be hotplugged back in or + * be assigned to the dax driver after boot. + */ + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return false; + /* * According to the spec, these regions are no longer reserved * after calling ExitBootServices(). 
However, we can only use diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index e2ac5fa5531b9f4ce39e96d0953bb82251a49cbe..899b803842bbe4ec471b861ab2f1774a2ae6698c 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -121,6 +121,30 @@ static int __init arm_enable_runtime_services(void) return 0; } + if (efi_soft_reserve_enabled()) { + efi_memory_desc_t *md; + + for_each_efi_memory_desc(md) { + int md_size = md->num_pages << EFI_PAGE_SHIFT; + struct resource *res; + + if (!(md->attribute & EFI_MEMORY_SP)) + continue; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (WARN_ON(!res)) + break; + + res->start = md->phys_addr; + res->end = md->phys_addr + md_size - 1; + res->name = "Soft Reserved"; + res->flags = IORESOURCE_MEM; + res->desc = IORES_DESC_SOFT_RESERVED; + + insert_resource(&iomem_resource, res); + } + } + if (efi_runtime_disabled()) { pr_info("EFI runtime services will be disabled.\n"); return 0; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3a2b607369151027245355911de5d10db63131c0..acde8985862aeb937f271cea491340861915098b 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -81,6 +81,11 @@ bool efi_runtime_disabled(void) return disable_runtime; } +bool __pure __efi_soft_reserve_enabled(void) +{ + return !efi_enabled(EFI_MEM_NO_SOFT_RESERVE); +} + static int __init parse_efi_cmdline(char *str) { if (!str) { @@ -94,6 +99,9 @@ static int __init parse_efi_cmdline(char *str) if (parse_option_str(str, "noruntime")) disable_runtime = true; + if (parse_option_str(str, "nosoftreserve")) + set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); + return 0; } early_param("efi", parse_efi_cmdline); @@ -844,15 +852,16 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT | EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_RO | EFI_MEMORY_WP | EFI_MEMORY_RP | EFI_MEMORY_XP | - EFI_MEMORY_NV | + EFI_MEMORY_NV | EFI_MEMORY_SP | EFI_MEMORY_RUNTIME | EFI_MEMORY_MORE_RELIABLE)) snprintf(pos, size, "|attr=0x%016llx]", (unsigned long long)attr); else snprintf(pos, size, - "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", attr & EFI_MEMORY_RUNTIME ? "RUN" : "", attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "", + attr & EFI_MEMORY_SP ? "SP" : "", attr & EFI_MEMORY_NV ? "NV" : "", attr & EFI_MEMORY_XP ? "XP" : "", attr & EFI_MEMORY_RP ? 
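/* Editor's note: each short token in this snprintf() maps to one
   EFI_MEMORY_* attribute bit; the hunk adds "SP", the UEFI 2.8
   specific-purpose memory attribute. */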
"RP" : "", diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c index e8f71a50ba8961ab3da0cb7efecb85376dad2893..9761f303bcb97a0299d5ccd32169b2324ae31a79 100644 --- a/drivers/firmware/efi/esrt.c +++ b/drivers/firmware/efi/esrt.c @@ -246,6 +246,9 @@ void __init efi_esrt_init(void) int rc; phys_addr_t end; + if (!efi_enabled(EFI_MEMMAP)) + return; + pr_debug("esrt-init: loading.\n"); if (!esrt_table_exists()) return; diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 9501edc0fcfb600175d57e2a127d48e77ec38547..bb9fc70d0cfab97d6f03779ab53e0139709dc5c9 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -17,12 +17,10 @@ #include #include #include -#include +#include "fake_mem.h" -#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM - -static struct efi_mem_range fake_mems[EFI_MAX_FAKEMEM]; -static int nr_fake_mem; +struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM]; +int nr_fake_mem; static int __init cmp_fake_mem(const void *x1, const void *x2) { @@ -44,13 +42,13 @@ void __init efi_fake_memmap(void) void *new_memmap; int i; - if (!nr_fake_mem) + if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem) return; /* count up the number of EFI memory descriptor */ for (i = 0; i < nr_fake_mem; i++) { for_each_efi_memory_desc(md) { - struct range *r = &fake_mems[i].range; + struct range *r = &efi_fake_mems[i].range; new_nr_map += efi_memmap_split_count(md, r); } @@ -70,7 +68,7 @@ void __init efi_fake_memmap(void) } for (i = 0; i < nr_fake_mem; i++) - efi_memmap_insert(&efi.memmap, new_memmap, &fake_mems[i]); + efi_memmap_insert(&efi.memmap, new_memmap, &efi_fake_mems[i]); /* swap into new EFI memmap */ early_memunmap(new_memmap, efi.memmap.desc_size * new_nr_map); @@ -104,22 +102,22 @@ static int __init setup_fake_mem(char *p) if (nr_fake_mem >= EFI_MAX_FAKEMEM) break; - fake_mems[nr_fake_mem].range.start = start; - fake_mems[nr_fake_mem].range.end = start + mem_size - 1; - fake_mems[nr_fake_mem].attribute = attribute; + efi_fake_mems[nr_fake_mem].range.start = start; + efi_fake_mems[nr_fake_mem].range.end = start + mem_size - 1; + efi_fake_mems[nr_fake_mem].attribute = attribute; nr_fake_mem++; if (*p == ',') p++; } - sort(fake_mems, nr_fake_mem, sizeof(struct efi_mem_range), + sort(efi_fake_mems, nr_fake_mem, sizeof(struct efi_mem_range), cmp_fake_mem, NULL); for (i = 0; i < nr_fake_mem; i++) pr_info("efi_fake_mem: add attr=0x%016llx to [mem 0x%016llx-0x%016llx]", - fake_mems[i].attribute, fake_mems[i].range.start, - fake_mems[i].range.end); + efi_fake_mems[i].attribute, efi_fake_mems[i].range.start, + efi_fake_mems[i].range.end); return *p == '\0' ? 
0 : -EINVAL; } diff --git a/drivers/firmware/efi/fake_mem.h b/drivers/firmware/efi/fake_mem.h new file mode 100644 index 0000000000000000000000000000000000000000..d52791af4b1871223402ed606f797749f2ce17f8 --- /dev/null +++ b/drivers/firmware/efi/fake_mem.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __EFI_FAKE_MEM_H__ +#define __EFI_FAKE_MEM_H__ +#include + +#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM + +extern struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM]; +extern int nr_fake_mem; +#endif /* __EFI_FAKE_MEM_H__ */ diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 41213bf5fcf5e8a84619923757fd033cce66761b..4566640de650d41856de686619a4e6e4226d4772 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -146,6 +146,11 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg, continue; case EFI_CONVENTIONAL_MEMORY: + /* Skip soft reserved conventional memory */ + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + /* * Reserve the intersection between this entry and the * region. diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 35dbc2791c973f6281470bce96016f1ce37e95a9..e02579907f2e21abc20511560f1d85feb91527a4 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -32,6 +32,7 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; static int __section(.data) __nokaslr; static int __section(.data) __quiet; static int __section(.data) __novamap; +static bool __section(.data) efi_nosoftreserve; int __pure nokaslr(void) { @@ -45,6 +46,10 @@ int __pure novamap(void) { return __novamap; } +bool __pure __efi_soft_reserve_enabled(void) +{ + return !efi_nosoftreserve; +} #define EFI_MMAP_NR_SLACK_SLOTS 8 @@ -211,6 +216,10 @@ efi_status_t efi_high_alloc(efi_system_table_t *sys_table_arg, if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + if (desc->num_pages < nr_pages) continue; @@ -305,6 +314,10 @@ efi_status_t efi_low_alloc_above(efi_system_table_t *sys_table_arg, if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + if (desc->num_pages < nr_pages) continue; @@ -484,6 +497,12 @@ efi_status_t efi_parse_options(char const *cmdline) __novamap = 1; } + if (IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) && + !strncmp(str, "nosoftreserve", 7)) { + str += strlen("nosoftreserve"); + efi_nosoftreserve = 1; + } + /* Group words together, delimited by "," */ while (*str && *str != ' ' && *str != ',') str++; diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index b4b1d1dcb5fdc0ce690af90e65b1ddfcdf898f74..6c188695e7305ed7ae08f15f3b31f3df0a70ab8a 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -46,6 +46,10 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, if (md->type != EFI_CONVENTIONAL_MEMORY) return 0; + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return 0; + region_end = min((u64)ULONG_MAX, md->phys_addr + md->num_pages*EFI_PAGE_SIZE - 1); first_slot = round_up(md->phys_addr, align); diff --git a/drivers/firmware/efi/x86_fake_mem.c b/drivers/firmware/efi/x86_fake_mem.c new file mode 100644 index 
0000000000000000000000000000000000000000..e5d6d5a1b2401ce39c6fa5a94279e1aa6c6172a2 --- /dev/null +++ b/drivers/firmware/efi/x86_fake_mem.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights reserved. */ +#include +#include +#include "fake_mem.h" + +void __init efi_fake_memmap_early(void) +{ + int i; + + /* + * The late efi_fake_mem() call can handle all requests if + * EFI_MEMORY_SP support is disabled. + */ + if (!efi_soft_reserve_enabled()) + return; + + if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem) + return; + + /* + * Given that efi_fake_memmap() needs to perform memblock + * allocations it needs to run after e820__memblock_setup(). + * However, if efi_fake_mem specifies EFI_MEMORY_SP for a given + * address range that potentially needs to mark the memory as + * reserved prior to e820__memblock_setup(). Update e820 + * directly if EFI_MEMORY_SP is specified for an + * EFI_CONVENTIONAL_MEMORY descriptor. + */ + for (i = 0; i < nr_fake_mem; i++) { + struct efi_mem_range *mem = &efi_fake_mems[i]; + efi_memory_desc_t *md; + u64 m_start, m_end; + + if ((mem->attribute & EFI_MEMORY_SP) == 0) + continue; + + m_start = mem->range.start; + m_end = mem->range.end; + for_each_efi_memory_desc(md) { + u64 start, end; + + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + start = md->phys_addr; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + if (m_start <= end && m_end >= start) + /* fake range overlaps descriptor */; + else + continue; + + /* + * Trim the boundary of the e820 update to the + * descriptor in case the fake range overlaps + * !EFI_CONVENTIONAL_MEMORY + */ + start = max(start, m_start); + end = min(end, m_end); + + if (end <= start) + continue; + e820__range_update(start, end - start + 1, E820_TYPE_RAM, + E820_TYPE_SOFT_RESERVED); + e820__update_table(e820_table); + } + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index e519df3fd2b6fd1d706e58300f2f6e23aa56c257..7cd72376e9a0ca46064034f5edb51294228a2f32 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -195,11 +195,11 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s }) /* GPUVM API */ -int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, unsigned int pasid, +int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, u32 pasid, void **vm, void **process_info, struct dma_fence **ef); int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd, - struct file *filp, unsigned int pasid, + struct file *filp, u32 pasid, void **vm, void **process_info, struct dma_fence **ef); void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index ce30d4e8bf25ff0749c648ef5e7bf6252a320c26..72703f14e4a6c6fe86f15a099a6ec41121c40720 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -234,7 +234,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, unlock_srbm(kgd); } -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 5f459bf5f6222b71030eff48282e31a8c7dce138..0e9e63190269dae40c711727679f324ec353e160 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -258,7 +258,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, unlock_srbm(kgd); } -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 6d2f6144960667c296f636fcb112517887b4e07c..4fc1bddf048a4e6b1186ee794fc5262541131f25 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -214,7 +214,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, unlock_srbm(kgd); } -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 92754cfb980867288810a71197c52fe30c1249b3..75d6c4cef7d5ccad54ba3fe1d563ddbdd8fefe6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -145,7 +145,7 @@ void kgd_gfx_v9_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, unlock_srbm(kgd); } -int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid) { struct amdgpu_device *adev = get_amdgpu_device(kgd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h index 26d8879bff9d04faa31cd77fa65ec873458be969..838ff519d56f021ab5a5ef6e720d6a18148fb911 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h @@ -26,7 +26,7 @@ void kgd_gfx_v9_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); -int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, +int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid, unsigned int vmid); int kgd_gfx_v9_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index f3fa271e3394c0b75070605764505b3710ffe2fc..7979f4c29af2c8f8e99f07a21e340c7969929769 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -923,7 +923,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, return ret; } -int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, unsigned int pasid, +int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, u32 pasid, void **vm, void **process_info, struct dma_fence **ef) { @@ -959,7 +959,7 @@ int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, unsigned int pasi } int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct 
kgd_dev *kgd, - struct file *filp, unsigned int pasid, + struct file *filp, u32 pasid, void **vm, void **process_info, struct dma_fence **ef) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 53734da1c2df12361cb111ae4d309c639ec265e6..470ca5e6120c29b64dc44317f017cb75f551974d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -43,7 +43,7 @@ static DEFINE_IDA(amdgpu_pasid_ida); /* Helper to free pasid from a fence callback */ struct amdgpu_pasid_cb { struct dma_fence_cb cb; - unsigned int pasid; + u32 pasid; }; /** @@ -79,7 +79,7 @@ int amdgpu_pasid_alloc(unsigned int bits) * amdgpu_pasid_free - Free a PASID * @pasid: PASID to free */ -void amdgpu_pasid_free(unsigned int pasid) +void amdgpu_pasid_free(u32 pasid) { trace_amdgpu_pasid_freed(pasid); ida_simple_remove(&amdgpu_pasid_ida, pasid); @@ -105,7 +105,7 @@ static void amdgpu_pasid_free_cb(struct dma_fence *fence, * Free the pasid only after all the fences in resv are signaled. */ void amdgpu_pasid_free_delayed(struct dma_resv *resv, - unsigned int pasid) + u32 pasid) { struct dma_fence *fence, **fences; struct amdgpu_pasid_cb *cb; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h index 8e58325bbca25723bc0e6003b7155ae5af0579c5..0c3b4fa1f93603bcaf9692a94c6db45ddeae2c78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h @@ -71,9 +71,9 @@ struct amdgpu_vmid_mgr { }; int amdgpu_pasid_alloc(unsigned int bits); -void amdgpu_pasid_free(unsigned int pasid); +void amdgpu_pasid_free(u32 pasid); void amdgpu_pasid_free_delayed(struct dma_resv *resv, - unsigned int pasid); + u32 pasid); bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev, struct amdgpu_vmid *id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 59fd9ebf3a58bc6419b93780ed0adba86b69865a..4233e0c80b8e82ab0c6cd3fafd667d2f53ea465e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1052,7 +1052,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, struct amdgpu_fpriv *fpriv = file_priv->driver_priv; struct amdgpu_bo_list *list; struct amdgpu_bo *pd; - unsigned int pasid; + u32 pasid; int handle; if (!fpriv) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index fb47ddc6f7f4e18303129878c7b2f7e9284b4326..bdc6227a6c8fa39713a0ddb73e33cda516d8ce4a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2671,7 +2671,7 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout) * 0 for success, error for failure. */ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int vm_context, unsigned int pasid) + int vm_context, u32 pasid) { struct amdgpu_bo_param bp; struct amdgpu_bo *root; @@ -2822,7 +2822,7 @@ static int amdgpu_vm_check_clean_reserved(struct amdgpu_device *adev, * Returns: * 0 for success, -errno for errors. */ -int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid) +int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, u32 pasid) { bool pte_support_ats = (adev->asic_type == CHIP_RAVEN); int r; @@ -3100,7 +3100,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * @pasid: PASID identifier for VM * @task_info: task_info to fill. 
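* (Editor's note: @pasid is u32 here, matching the unsigned int -> u32
* PASID type conversion applied throughout amdgpu/amdkfd in this series.)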
*/ -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, +void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid, struct amdgpu_task_info *task_info) { struct amdgpu_vm *vm; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 4a64825b53cbd47a3cb68ef12d3cded9153752e6..f303a44f05da9cb507cdaa759df0344805e696c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -345,8 +345,8 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev); long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout); int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int vm_context, unsigned int pasid); -int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid); + int vm_context, u32 pasid); +int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, u32 pasid); void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm, @@ -402,7 +402,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring, struct amdgpu_job *job); void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev); -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, +void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid, struct amdgpu_task_info *task_info); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 177d1e5329a5723482c437381253700c224aeb90..8e48706c76fb825d13c09eb4271171ea02d477a1 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -88,7 +88,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, (const struct cik_ih_ring_entry *)ih_ring_entry; uint32_t context_id = ihre->data & 0xfffffff; unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8; - unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16; + u32 pasid = (ihre->ring_id & 0xffff0000) >> 16; if (pasid == 0) return; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index a3441b0e385b7a32edf8a887dbafe2daaed3d109..8dbe4fa575b4ab032539e9947419102af8c76b9d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -45,7 +45,7 @@ static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) } static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - unsigned int pasid, uint64_t vmid0_address, + u32 pasid, uint64_t vmid0_address, uint32_t *packet_buff, size_t size_in_bytes) { struct pm4__release_mem *rm_packet; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h index a04a1fe1d0d935c389f9a4796b9467d9101fd0e7..f9c6df1fdc5c5c90c8d4ef75b32235a77e6e4109 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h @@ -275,7 +275,7 @@ struct kfd_dbgdev { }; struct kfd_dbgmgr { - unsigned int pasid; + u32 pasid; struct kfd_dev *dev; struct kfd_dbgdev *dbgdev; }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ab69898c9cb72ee1afb372d34ace0857a706a1d4..12935197c5f9bceddba7f9bc2bf33555070169c8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -40,7 +40,7 @@ #define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2) static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, - unsigned int pasid, unsigned int vmid); + u32 pasid, unsigned int vmid); static int execute_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, @@ -837,7 +837,7 @@ static int unregister_process(struct device_queue_manager *dqm, } static int -set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, +set_pasid_vmid_mapping(struct device_queue_manager *dqm, u32 pasid, unsigned int vmid) { return dqm->dev->kfd2kgd->set_pasid_vmid_mapping( @@ -1847,8 +1847,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) kfree(dqm); } -int kfd_process_vm_fault(struct device_queue_manager *dqm, - unsigned int pasid) +int kfd_process_vm_fault(struct device_queue_manager *dqm, u32 pasid) { struct kfd_process_device *pdd; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index d674d4b3340faa3437bf798cd73c05fd01d82411..853b600f588bad41cbd909e43ab940009704869e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -461,7 +461,7 @@ static void set_event_from_interrupt(struct kfd_process *p, } } -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, +void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint32_t valid_id_bits) { struct kfd_event *ev = NULL; @@ -873,7 +873,7 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, } #ifdef KFD_SUPPORT_IOMMU_V2 -void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, +void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid, unsigned long address, bool is_write_requested, bool is_execute_requested) { @@ -950,7 +950,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, } #endif /* KFD_SUPPORT_IOMMU_V2 */ -void kfd_signal_hw_exception_event(unsigned int pasid) +void kfd_signal_hw_exception_event(u32 pasid) { /* * Because we are called from arbitrary context (workqueue) as opposed @@ -971,7 +971,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid) kfd_unref_process(p); } -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, +void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, struct kfd_vm_fault_info *info) { struct kfd_event *ev; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index c7ac6c73af86eb80c1f166bb96286682675a3c4a..c8fe5dbdad55c5c0cf84e8f224fddfea528cd960 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -79,7 +79,7 @@ struct kfd_event { #define KFD_EVENT_TYPE_DEBUG 5 #define KFD_EVENT_TYPE_MEMORY 8 -extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, - uint32_t valid_id_bits); +extern void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, + uint32_t valid_id_bits); #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c index 9266c8e76be7266a4f41b5fd7eaa86a48166f35f..e4080dc6fb6af46fed5738cd6c6cd83cc99defc5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -146,7 +146,7 @@ void kfd_iommu_unbind_process(struct kfd_process *p) } /* Callback for process shutdown invoked by the IOMMU driver */ -static void 
iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) +static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, u32 pasid) { struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); struct kfd_process *p; @@ -192,8 +192,8 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) } /* This function called by IOMMU driver on PPR failure */ -static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, - unsigned long address, u16 flags) +static int iommu_invalid_ppr_cb(struct pci_dev *pdev, u32 pasid, + unsigned long address, u16 flags) { struct kfd_dev *dev; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c index 33b08ff00b5012f9741209e43d0c5d21e644e6ec..c19a2e6fd7c88d1e332f80e634b83e9b15e4f6f1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -51,7 +51,7 @@ unsigned int kfd_get_pasid_limit(void) return 1U << pasid_bits; } -unsigned int kfd_pasid_alloc(void) +u32 kfd_pasid_alloc(void) { int r; @@ -77,7 +77,7 @@ unsigned int kfd_pasid_alloc(void) return r > 0 ? r : 0; } -void kfd_pasid_free(unsigned int pasid) +void kfd_pasid_free(u32 pasid) { if (kfd2kgd) amdgpu_pasid_free(pasid); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index c89326125d71170509531a8ffc434db3101f1908..5645a02f963613de5d2b262ba9ba9cf65dad548a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -687,7 +687,7 @@ struct kfd_process { /* We want to receive a notification when the mm_struct is destroyed */ struct mmu_notifier mmu_notifier; - unsigned int pasid; + u32 pasid; unsigned int doorbell_index; /* @@ -761,7 +761,7 @@ int kfd_process_create_wq(void); void kfd_process_destroy_wq(void); struct kfd_process *kfd_create_process(struct file *filep); struct kfd_process *kfd_get_process(const struct task_struct *); -struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid); struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); void kfd_unref_process(struct kfd_process *p); int kfd_process_evict_queues(struct kfd_process *p); @@ -802,8 +802,8 @@ int kfd_pasid_init(void); void kfd_pasid_exit(void); bool kfd_set_pasid_limit(unsigned int new_limit); unsigned int kfd_get_pasid_limit(void); -unsigned int kfd_pasid_alloc(void); -void kfd_pasid_free(unsigned int pasid); +u32 kfd_pasid_alloc(void); +void kfd_pasid_free(u32 pasid); /* Doorbells */ size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); @@ -886,7 +886,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, enum kfd_queue_type type); void kernel_queue_uninit(struct kernel_queue *kq); -int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); +int kfd_process_vm_fault(struct device_queue_manager *dqm, u32 pasid); /* Process Queue Manager */ struct process_queue_node { @@ -1009,12 +1009,12 @@ int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, uint32_t *wait_result); -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, +void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint32_t valid_id_bits); void kfd_signal_iommu_event(struct kfd_dev *dev, - unsigned int pasid, unsigned long address, - bool is_write_requested, bool is_execute_requested); -void 
kfd_signal_hw_exception_event(unsigned int pasid); + u32 pasid, unsigned long address, + bool is_write_requested, bool is_execute_requested); +void kfd_signal_hw_exception_event(u32 pasid); int kfd_set_event(struct kfd_process *p, uint32_t event_id); int kfd_reset_event(struct kfd_process *p, uint32_t event_id); int kfd_event_page_set(struct kfd_process *p, void *kernel_address, @@ -1025,7 +1025,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint64_t *event_page_offset, uint32_t *event_slot_index); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, +void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, struct kfd_vm_fault_info *info); void kfd_signal_reset_event(struct kfd_dev *dev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index aa0a617b8d445d334af05ec519e71a12edf60b4b..344b4c85e2d1dff7369ca1eff4bdc29787e7aa43 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -903,7 +903,7 @@ void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, } /* This increments the process->ref counter. */ -struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid) { struct kfd_process *p, *ret_p = NULL; unsigned int temp; diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 98b9533e672ba9a3e876fe4cbe11e8f9d23d4c0a..d2a1d05bb070caf11b074dccb6752dff6801ba87 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -246,7 +246,7 @@ struct kfd2kgd_calls { uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); - int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, unsigned int pasid, + int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, u32 pasid, unsigned int vmid); int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug index 41c8e39a73ba85a986cf351d67873aa8df12dc97..e4f03fcb125e46dacbf35613a63e570334f4445e 100644 --- a/drivers/gpu/drm/i915/Kconfig.debug +++ b/drivers/gpu/drm/i915/Kconfig.debug @@ -21,7 +21,6 @@ config DRM_I915_DEBUG depends on DRM_I915 select DEBUG_FS select PREEMPT_COUNT - select REFCOUNT_FULL select I2C_CHARDEV select STACKDEPOT select DRM_DP_AUX_CHARDEV diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index e0e677b2a3a94e8348b70fa7bdc9a3a41ad06140..9439cd995afc94cafe0f4e4bcb1f2ee987eb0212 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -230,7 +230,7 @@ vma_create(struct drm_i915_gem_object *obj, } static struct i915_vma * -vma_lookup(struct drm_i915_gem_object *obj, +i915_vma_lookup(struct drm_i915_gem_object *obj, struct i915_address_space *vm, const struct i915_ggtt_view *view) { @@ -280,7 +280,7 @@ i915_vma_instance(struct drm_i915_gem_object *obj, GEM_BUG_ON(vm->closed); spin_lock(&obj->vma.lock); - vma = vma_lookup(obj, vm, view); + vma = i915_vma_lookup(obj, vm, view); spin_unlock(&obj->vma.lock); /* vma_create() will resolve the race if another creates the vma */ diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c index 
01a89c071bf5c7ab7b8ddd311fb4c3474196efd5..8e63dd7a2f3ec21cbb207148d34b98c2777bc4d8 100644 --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c @@ -24,6 +24,7 @@ #include <linux/pm_domain.h> #include <linux/pm_runtime.h> +#include <linux/iommu.h> #include "gt/intel_gt.h" #include "gt/mock_engine.h" @@ -132,6 +133,9 @@ struct drm_i915_private *mock_gem_device(void) { struct drm_i915_private *i915; struct pci_dev *pdev; +#if IS_ENABLED(CONFIG_IOMMU_API) && defined(CONFIG_INTEL_IOMMU) + struct dev_iommu iommu; +#endif int err; pdev = kzalloc(sizeof(*pdev) + sizeof(*i915), GFP_KERNEL); @@ -145,8 +149,10 @@ struct drm_i915_private *mock_gem_device(void) dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); #if IS_ENABLED(CONFIG_IOMMU_API) && defined(CONFIG_INTEL_IOMMU) - /* hack to disable iommu for the fake device; force identity mapping */ - pdev->dev.archdata.iommu = (void *)-1; + /* HACK HACK HACK to disable iommu for the fake device; force identity mapping */ + memset(&iommu, 0, sizeof(iommu)); + iommu.priv = (void *)-1; + pdev->dev.iommu = &iommu; #endif i915 = (struct drm_i915_private *)(pdev + 1); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c index 985f2990ab0dda4c8f7d0cbc6610f95d13286fb5..13d4d7ac0697b474db64450c42e097b1cfaec91d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c @@ -594,8 +594,7 @@ gk20a_instmem_new(struct nvkm_device *device, int index, nvkm_info(&imem->base.subdev, "using IOMMU\n"); } else { - imem->attrs = DMA_ATTR_NON_CONSISTENT | - DMA_ATTR_WEAK_ORDERING | + imem->attrs = DMA_ATTR_WEAK_ORDERING | DMA_ATTR_WRITE_COMBINE; nvkm_info(&imem->base.subdev, "using DMA API\n"); diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 4711fb191a0721e50c266fd9538dd14e737ca5ea..4f97e6c120595666ece1dd8810d0c389ad05ed6e 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -845,13 +845,6 @@ static long hiddev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return r; } -#ifdef CONFIG_COMPAT -static long hiddev_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - return hiddev_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations hiddev_fops = { .owner = THIS_MODULE, .read = hiddev_read, @@ -861,9 +854,7 @@ static const struct file_operations hiddev_fops = { .release = hiddev_release, .unlocked_ioctl = hiddev_ioctl, .fasync = hiddev_fasync, -#ifdef CONFIG_COMPAT - .compat_ioctl = hiddev_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 603b83ac50852a81b99f5f28fc9a26df8c2a5cb8..2712e699ba08cf2d30415eef43287b36cdc5ec75 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -832,23 +832,13 @@ stm_char_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -#ifdef CONFIG_COMPAT -static long -stm_char_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - return stm_char_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#else -#define stm_char_compat_ioctl NULL -#endif - static const struct file_operations stm_fops = { .open = stm_char_open, .release = stm_char_release, .write = stm_char_write, .mmap = stm_char_mmap, .unlocked_ioctl = stm_char_ioctl, - .compat_ioctl = stm_char_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek =
no_llseek, }; diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4b0ad1864d76e52a55598e47f1643a02b845c857..0cc535b63b94bf757a6e3a66a7561212269ddf5e 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -2,12 +2,13 @@ /* * intel_idle.c - native hardware idle loop for modern Intel processors * - * Copyright (c) 2013, Intel Corporation. + * Copyright (c) 2013 - 2020, Intel Corporation. * Len Brown + * Rafael J. Wysocki */ /* - * intel_idle is a cpuidle driver that loads on specific Intel processors + * intel_idle is a cpuidle driver that loads on all Intel CPUs with MWAIT * in lieu of the legacy ACPI processor_idle driver. The intent is to * make Linux more efficient on these processors, as intel_idle knows * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. @@ -19,17 +20,16 @@ * All CPUs have same idle states as boot CPU * * Chipset BM_STS (bus master status) bit is a NOP - * for preventing entry into deep C-stats + * for preventing entry into deep C-states + * + * CPU will flush caches as needed when entering a C-state via MWAIT + * (in contrast to entering ACPI C3, in which case the WBINVD + * instruction needs to be executed to flush the caches) */ /* * Known limitations * - * The driver currently initializes for_each_online_cpu() upon modprobe. - * It it unaware of subsequent processors hot-added to the system. - * This means that if you boot with maxcpus=n and later online - * processors above n, those processors will use C1 only. - * * ACPI has a .suspend hack to turn off deep c-statees during suspend * to avoid complications with the lapic timer workaround. * Have not seen issues with suspend, but may need same workaround here. @@ -37,7 +37,7 @@ */ /* un-comment DEBUG to enable pr_debug() statements */ -#define DEBUG +/* #define DEBUG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -54,8 +54,9 @@ #include #include #include +#include -#define INTEL_IDLE_VERSION "0.4.1" +#define INTEL_IDLE_VERSION "0.5.1" static struct cpuidle_driver intel_idle_driver = { .name = "intel_idle", @@ -63,12 +64,18 @@ static struct cpuidle_driver intel_idle_driver = { }; /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; +static unsigned int disabled_states_mask; +static unsigned int preferred_states_mask; + +static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; -static unsigned int mwait_substates; +static unsigned long auto_demotion_disable_flags; -#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF -/* Reliable LAPIC Timer States, bit 1 for C1 etc. */ -static unsigned int lapic_timer_reliable_states = (1 << 1); /* Default to only C1 */ +static enum { + C1E_PROMOTION_PRESERVE, + C1E_PROMOTION_ENABLE, + C1E_PROMOTION_DISABLE +} c1e_promotion = C1E_PROMOTION_PRESERVE; struct idle_cpu { struct cpuidle_state *state_table; @@ -83,13 +90,16 @@ struct idle_cpu { bool use_acpi; }; -static const struct idle_cpu *icpu; -static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; -static int intel_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index); -static void intel_idle_s2idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index); -static struct cpuidle_state *cpuidle_state_table; +static const struct idle_cpu *icpu __initdata; +static struct cpuidle_state *cpuidle_state_table __initdata; + +static unsigned int mwait_substates __initdata; + +/* + * Enable interrupts before entering the C-state. 
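+ * (Editorial note: the flag is honoured by intel_idle_irq() below, which
+ * brackets the MWAIT with raw_local_irq_enable()/raw_local_irq_disable().)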
On some platforms and for + * some C-states, this may measurably decrease interrupt latency. + */ +#define CPUIDLE_FLAG_IRQ_ENABLE BIT(14) /* * Enable this state by default even if the ACPI _CST does not list it. @@ -97,12 +107,9 @@ static struct cpuidle_state *cpuidle_state_table; #define CPUIDLE_FLAG_ALWAYS_ENABLE BIT(15) /* - * Set this flag for states where the HW flushes the TLB for us - * and so we don't need cross-calls to keep it consistent. - * If this flag is set, SW flushes the TLB, so even if the - * HW doesn't do the flushing, this flag is safe to use. + * Initialize large xstate for the C6-state entrance. */ -#define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 +#define CPUIDLE_FLAG_INIT_XSTATE BIT(17) /* * MWAIT takes an 8-bit "hint" in EAX "suggesting" @@ -114,12 +121,88 @@ static struct cpuidle_state *cpuidle_state_table; #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) #define MWAIT2flg(eax) ((eax & 0xFF) << 24) +static __always_inline int __intel_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + unsigned long ecx = 1; /* break on interrupt flag */ + + mwait_idle_with_hints(eax, ecx); + + return index; +} + +/** + * intel_idle - Ask the processor to enter the given idle state. + * @dev: cpuidle device of the target CPU. + * @drv: cpuidle driver (assumed to point to intel_idle_driver). + * @index: Target idle state index. + * + * Use the MWAIT instruction to notify the processor that the CPU represented by + * @dev is idle and it can try to enter the idle state corresponding to @index. + * + * If the local APIC timer is not known to be reliable in the target idle state, + * enable one-shot tick broadcasting for the target CPU before executing MWAIT. + * + * Must be called under local_irq_disable(). + */ +static __cpuidle int intel_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + return __intel_idle(dev, drv, index); +} + +static __cpuidle int intel_idle_irq(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + int ret; + + raw_local_irq_enable(); + ret = __intel_idle(dev, drv, index); + raw_local_irq_disable(); + + return ret; +} + +static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + fpu_idle_fpregs(); + return __intel_idle(dev, drv, index); +} + +/** + * intel_idle_s2idle - Ask the processor to enter the given idle state. + * @dev: cpuidle device of the target CPU. + * @drv: cpuidle driver (assumed to point to intel_idle_driver). + * @index: Target idle state index. + * + * Use the MWAIT instruction to notify the processor that the CPU represented by + * @dev is idle and it can try to enter the idle state corresponding to @index. + * + * Invoked as a suspend-to-idle callback routine with frozen user space, frozen + * scheduler tick and suspended scheduler clock on the target CPU. + */ +static __cpuidle void intel_idle_s2idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + unsigned long ecx = 1; /* break on interrupt flag */ + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + + if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) + fpu_idle_fpregs(); + + mwait_idle_with_hints(eax, ecx); +} + /* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. * Thus C0 is a dummy. 
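 *
 * (Editor's illustration, not part of the patch: an entry declared with
 * .flags = MWAIT2flg(0x20) carries the 8-bit hint 0x20 in bits 31:24 of
 * ->flags; flg2MWAIT() recovers it at entry time, and
 * intel_idle_verify_cstate() later in this patch checks nibble
 * MWAIT_HINT2CSTATE(0x20) + 1 == 3 of the CPUID.5 EDX word for a nonzero
 * sub-state count before such a state is registered.)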
*/ -static struct cpuidle_state nehalem_cstates[] = { +static struct cpuidle_state nehalem_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -156,7 +239,7 @@ static struct cpuidle_state nehalem_cstates[] = { .enter = NULL } }; -static struct cpuidle_state snb_cstates[] = { +static struct cpuidle_state snb_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -201,7 +284,7 @@ static struct cpuidle_state snb_cstates[] = { .enter = NULL } }; -static struct cpuidle_state byt_cstates[] = { +static struct cpuidle_state byt_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -246,7 +329,7 @@ static struct cpuidle_state byt_cstates[] = { .enter = NULL } }; -static struct cpuidle_state cht_cstates[] = { +static struct cpuidle_state cht_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -291,7 +374,7 @@ static struct cpuidle_state cht_cstates[] = { .enter = NULL } }; -static struct cpuidle_state ivb_cstates[] = { +static struct cpuidle_state ivb_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -336,7 +419,7 @@ static struct cpuidle_state ivb_cstates[] = { .enter = NULL } }; -static struct cpuidle_state ivt_cstates[] = { +static struct cpuidle_state ivt_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -373,7 +456,7 @@ static struct cpuidle_state ivt_cstates[] = { .enter = NULL } }; -static struct cpuidle_state ivt_cstates_4s[] = { +static struct cpuidle_state ivt_cstates_4s[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -410,7 +493,7 @@ static struct cpuidle_state ivt_cstates_4s[] = { .enter = NULL } }; -static struct cpuidle_state ivt_cstates_8s[] = { +static struct cpuidle_state ivt_cstates_8s[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -447,7 +530,7 @@ static struct cpuidle_state ivt_cstates_8s[] = { .enter = NULL } }; -static struct cpuidle_state hsw_cstates[] = { +static struct cpuidle_state hsw_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -515,7 +598,7 @@ static struct cpuidle_state hsw_cstates[] = { { .enter = NULL } }; -static struct cpuidle_state bdw_cstates[] = { +static struct cpuidle_state bdw_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -584,7 +667,7 @@ static struct cpuidle_state bdw_cstates[] = { .enter = NULL } }; -static struct cpuidle_state skl_cstates[] = { +static struct cpuidle_state skl_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -653,11 +736,11 @@ static struct cpuidle_state skl_cstates[] = { .enter = NULL } }; -static struct cpuidle_state skx_cstates[] = { +static struct cpuidle_state skx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, @@ -682,11 +765,11 @@ static struct cpuidle_state skx_cstates[] = { .enter = NULL } }; -static struct cpuidle_state icx_cstates[] = { +static struct cpuidle_state icx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, @@ -703,15 +786,45 @@ static struct cpuidle_state icx_cstates[] = { .name = "C6", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 128, - .target_residency = 384, + .exit_latency = 170, + .target_residency = 600, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; 
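Editor's aside (not part of the patch): the __initdata tables above and below
are only templates; intel_idle_init_cstates_icpu(), later in this diff, copies
each entry into the registered driver and patches its ->enter() callback from
the flag bits. A condensed, illustrative C sketch of that step:

	static void register_template(struct cpuidle_driver *drv,
				      const struct cpuidle_state *entry)
	{
		/* Whole-structure copy of the template entry. */
		drv->states[drv->state_count] = *entry;

		/* Flag bits select a specialized ->enter() callback. */
		if (entry->flags & CPUIDLE_FLAG_IRQ_ENABLE)
			drv->states[drv->state_count].enter = intel_idle_irq;

		if (entry->flags & CPUIDLE_FLAG_INIT_XSTATE)
			drv->states[drv->state_count].enter = intel_idle_xstate;

		drv->state_count++;
	}

The real loop additionally validates each entry's MWAIT hint against CPUID.5
and applies the states_off mask before bumping drv->state_count.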
-static struct cpuidle_state atom_cstates[] = { +static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | + CPUIDLE_FLAG_INIT_XSTATE, + .exit_latency = 290, + .target_residency = 800, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + +static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", .desc = "MWAIT 0x00", @@ -747,7 +860,7 @@ static struct cpuidle_state atom_cstates[] = { { .enter = NULL } }; -static struct cpuidle_state tangier_cstates[] = { +static struct cpuidle_state tangier_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -791,7 +904,7 @@ static struct cpuidle_state tangier_cstates[] = { { .enter = NULL } }; -static struct cpuidle_state avn_cstates[] = { +static struct cpuidle_state avn_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -811,7 +924,7 @@ static struct cpuidle_state avn_cstates[] = { { .enter = NULL } }; -static struct cpuidle_state knl_cstates[] = { +static struct cpuidle_state knl_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -832,7 +945,7 @@ static struct cpuidle_state knl_cstates[] = { .enter = NULL } }; -static struct cpuidle_state bxt_cstates[] = { +static struct cpuidle_state bxt_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -893,7 +1006,7 @@ static struct cpuidle_state bxt_cstates[] = { .enter = NULL } }; -static struct cpuidle_state dnv_cstates[] = { +static struct cpuidle_state dnv_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", @@ -922,226 +1035,172 @@ static struct cpuidle_state dnv_cstates[] = { .enter = NULL } }; -/** - * intel_idle - * @dev: cpuidle_device - * @drv: cpuidle driver - * @index: index of cpuidle state - * - * Must be called under local_irq_disable(). - */ -static __cpuidle int intel_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) -{ - unsigned long ecx = 1; /* break on interrupt flag */ - struct cpuidle_state *state = &drv->states[index]; - unsigned long eax = flg2MWAIT(state->flags); - unsigned int cstate; - bool uninitialized_var(tick); - int cpu = smp_processor_id(); - - /* - * leave_mm() to avoid costly and often unnecessary wakeups - * for flushing the user TLB's associated with the active mm. - */ - if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(cpu); - - if (!static_cpu_has(X86_FEATURE_ARAT)) { - cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & - MWAIT_CSTATE_MASK) + 1; - tick = false; - if (!(lapic_timer_reliable_states & (1 << (cstate)))) { - tick = true; - tick_broadcast_enter(); - } - } - - mwait_idle_with_hints(eax, ecx); - - if (!static_cpu_has(X86_FEATURE_ARAT) && tick) - tick_broadcast_exit(); - - return index; -} - -/** - * intel_idle_s2idle - simplified "enter" callback routine for suspend-to-idle - * @dev: cpuidle_device - * @drv: cpuidle driver - * @index: state index +/* + * Note, depending on HW and FW revision, SnowRidge SoC may or may not support + * C6, and this is indicated in the CPUID mwait leaf. 
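+ * (Editorial note: concretely, when the CPUID.5 EDX sub-state nibble for
+ * the C6 hint reads zero, intel_idle_verify_cstate() rejects the entry
+ * and the C6 state below is simply not registered on that part.)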
*/ -static void intel_idle_s2idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) -{ - unsigned long ecx = 1; /* break on interrupt flag */ - unsigned long eax = flg2MWAIT(drv->states[index].flags); - - mwait_idle_with_hints(eax, ecx); -} - -static bool intel_idle_verify_cstate(unsigned int mwait_hint) -{ - unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; - unsigned int num_substates = (mwait_substates >> mwait_cstate * 4) & - MWAIT_SUBSTATE_MASK; - - /* Ignore the C-state if there are NO sub-states in CPUID for it. */ - if (num_substates == 0) - return false; - - if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - mark_tsc_unstable("TSC halts in idle states deeper than C2"); - - return true; -} - -static void __setup_broadcast_timer(bool on) -{ - if (on) - tick_broadcast_enable(); - else - tick_broadcast_disable(); -} - -static void auto_demotion_disable(void) -{ - unsigned long long msr_bits; - - rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); - msr_bits &= ~(icpu->auto_demotion_disable_flags); - wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); -} -static void c1e_promotion_disable(void) -{ - unsigned long long msr_bits; - - rdmsrl(MSR_IA32_POWER_CTL, msr_bits); - msr_bits &= ~0x2; - wrmsrl(MSR_IA32_POWER_CTL, msr_bits); -} +static struct cpuidle_state snr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 15, + .target_residency = 25, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 130, + .target_residency = 500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; -static const struct idle_cpu idle_cpu_nehalem = { +static const struct idle_cpu idle_cpu_nehalem __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_nhx = { +static const struct idle_cpu idle_cpu_nhx __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_atom = { +static const struct idle_cpu idle_cpu_atom __initconst = { .state_table = atom_cstates, }; -static const struct idle_cpu idle_cpu_tangier = { +static const struct idle_cpu idle_cpu_tangier __initconst = { .state_table = tangier_cstates, }; -static const struct idle_cpu idle_cpu_lincroft = { +static const struct idle_cpu idle_cpu_lincroft __initconst = { .state_table = atom_cstates, .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, }; -static const struct idle_cpu idle_cpu_snb = { +static const struct idle_cpu idle_cpu_snb __initconst = { .state_table = snb_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_snx = { +static const struct idle_cpu idle_cpu_snx __initconst = { .state_table = snb_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_byt = { +static const struct idle_cpu idle_cpu_byt __initconst = { .state_table = byt_cstates, .disable_promotion_to_c1e = true, .byt_auto_demotion_disable_flag 
= true, }; -static const struct idle_cpu idle_cpu_cht = { +static const struct idle_cpu idle_cpu_cht __initconst = { .state_table = cht_cstates, .disable_promotion_to_c1e = true, .byt_auto_demotion_disable_flag = true, }; -static const struct idle_cpu idle_cpu_ivb = { +static const struct idle_cpu idle_cpu_ivb __initconst = { .state_table = ivb_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_ivt = { +static const struct idle_cpu idle_cpu_ivt __initconst = { .state_table = ivt_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_hsw = { +static const struct idle_cpu idle_cpu_hsw __initconst = { .state_table = hsw_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_hsx = { +static const struct idle_cpu idle_cpu_hsx __initconst = { .state_table = hsw_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_bdw = { +static const struct idle_cpu idle_cpu_bdw __initconst = { .state_table = bdw_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_bdx = { +static const struct idle_cpu idle_cpu_bdx __initconst = { .state_table = bdw_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_skl = { +static const struct idle_cpu idle_cpu_skl __initconst = { .state_table = skl_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_skx = { +static const struct idle_cpu idle_cpu_skx __initconst = { .state_table = skx_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_icx = { +static const struct idle_cpu idle_cpu_icx __initconst = { .state_table = icx_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_avn = { +static const struct idle_cpu idle_cpu_spr __initconst = { + .state_table = spr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + +static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_knl = { +static const struct idle_cpu idle_cpu_knl __initconst = { .state_table = knl_cstates, .use_acpi = true, }; -static const struct idle_cpu idle_cpu_bxt = { +static const struct idle_cpu idle_cpu_bxt __initconst = { .state_table = bxt_cstates, .disable_promotion_to_c1e = true, }; -static const struct idle_cpu idle_cpu_dnv = { +static const struct idle_cpu idle_cpu_dnv __initconst = { .state_table = dnv_cstates, .disable_promotion_to_c1e = true, .use_acpi = true, }; +static const struct idle_cpu idle_cpu_snr __initconst = { + .state_table = snr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nhx), INTEL_CPU_FAM6(NEHALEM, idle_cpu_nehalem), @@ -1174,13 +1233,15 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { INTEL_CPU_FAM6(KABYLAKE_L, idle_cpu_skl), INTEL_CPU_FAM6(KABYLAKE, idle_cpu_skl), INTEL_CPU_FAM6(SKYLAKE_X, idle_cpu_skx), - INTEL_CPU_FAM6(ICELAKE_X, idle_cpu_icx), + INTEL_CPU_FAM6(ICELAKE_X, idle_cpu_icx), + INTEL_CPU_FAM6(ICELAKE_D, idle_cpu_icx), + INTEL_CPU_FAM6(SAPPHIRERAPIDS_X, idle_cpu_spr), INTEL_CPU_FAM6(XEON_PHI_KNL, idle_cpu_knl), INTEL_CPU_FAM6(XEON_PHI_KNM, idle_cpu_knl), INTEL_CPU_FAM6(ATOM_GOLDMONT, idle_cpu_bxt), 
INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, idle_cpu_bxt), INTEL_CPU_FAM6(ATOM_GOLDMONT_D, idle_cpu_dnv), - INTEL_CPU_FAM6(ATOM_TREMONT_D, idle_cpu_dnv), + INTEL_CPU_FAM6(ATOM_TREMONT_D, idle_cpu_snr), {} }; @@ -1192,7 +1253,7 @@ static const struct x86_cpu_id intel_mwait_ids[] __initconst = { {} }; -static bool intel_idle_max_cstate_reached(int cstate) +static bool __init intel_idle_max_cstate_reached(int cstate) { if (cstate + 1 > max_cstate) { pr_info("max_cstate %d reached\n", max_cstate); @@ -1201,6 +1262,20 @@ static bool intel_idle_max_cstate_reached(int cstate) return false; } +static bool __init intel_idle_state_needs_timer_stop(struct cpuidle_state *state) +{ + unsigned long eax = flg2MWAIT(state->flags); + + if (boot_cpu_has(X86_FEATURE_ARAT)) + return false; + + /* + * Switch over to one-shot tick broadcast if the target C-state + * is deeper than C1. + */ + return !!((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK); +} + #ifdef CONFIG_ACPI_PROCESSOR_CSTATE #include <acpi/processor.h> @@ -1208,7 +1283,11 @@ static bool no_acpi __read_mostly; module_param(no_acpi, bool, 0444); MODULE_PARM_DESC(no_acpi, "Do not use ACPI _CST for building the idle states list"); -static struct acpi_processor_power acpi_state_table; +static bool force_use_acpi __read_mostly; /* No effect if no_acpi is set. */ +module_param_named(use_acpi, force_use_acpi, bool, 0444); +MODULE_PARM_DESC(use_acpi, "Use ACPI _CST for building the idle states list"); + +static struct acpi_processor_power acpi_state_table __initdata; /** * intel_idle_cst_usable - Check if the _CST information can be used. @@ -1216,7 +1295,7 @@ static struct acpi_processor_power acpi_state_table; * Check if all of the C-states listed by _CST in the max_cstate range are * ACPI_CSTATE_FFH, which means that they should be entered via MWAIT.
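 *
 * (Editorial note: the check is all-or-nothing - a single _CST entry with
 * a non-FFH entry method, e.g. a SYSTEMIO C2, makes the whole ACPI table
 * unusable, and the driver falls back to its built-in tables, or fails to
 * probe on models that have none.)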
*/ -static bool intel_idle_cst_usable(void) +static bool __init intel_idle_cst_usable(void) { int cstate, limit; @@ -1233,7 +1312,7 @@ static bool intel_idle_cst_usable(void) return true; } -static bool intel_idle_acpi_cst_extract(void) +static bool __init intel_idle_acpi_cst_extract(void) { unsigned int cpu; @@ -1256,19 +1335,18 @@ static bool intel_idle_acpi_cst_extract(void) if (!intel_idle_cst_usable()) continue; - if (!acpi_processor_claim_cst_control()) { - acpi_state_table.count = 0; - return false; - } + if (!acpi_processor_claim_cst_control()) + break; return true; } + acpi_state_table.count = 0; pr_debug("ACPI _CST not found or not usable\n"); return false; } -static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) +static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { int cstate, limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); @@ -1280,7 +1358,7 @@ static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) struct acpi_processor_cx *cx; struct cpuidle_state *state; - if (intel_idle_max_cstate_reached(cstate)) + if (intel_idle_max_cstate_reached(cstate - 1)) break; cx = &acpi_state_table.states[cstate]; @@ -1307,12 +1385,18 @@ static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) if (cx->type > ACPI_STATE_C2) state->flags |= CPUIDLE_FLAG_TLB_FLUSHED; + if (disabled_states_mask & BIT(cstate)) + state->flags |= CPUIDLE_FLAG_OFF; + + if (intel_idle_state_needs_timer_stop(state)) + state->flags |= CPUIDLE_FLAG_TIMER_STOP; + state->enter = intel_idle; state->enter_s2idle = intel_idle_s2idle; } } -static bool intel_idle_off_by_default(u32 mwait_hint) +static bool __init intel_idle_off_by_default(u32 mwait_hint) { int cstate, limit; @@ -1335,85 +1419,20 @@ static bool intel_idle_off_by_default(u32 mwait_hint) return true; } #else /* !CONFIG_ACPI_PROCESSOR_CSTATE */ +#define force_use_acpi (false) + static inline bool intel_idle_acpi_cst_extract(void) { return false; } static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { } static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ -/* - * intel_idle_probe() - */ -static int __init intel_idle_probe(void) -{ - unsigned int eax, ebx, ecx; - const struct x86_cpu_id *id; - - if (max_cstate == 0) { - pr_debug("disabled\n"); - return -EPERM; - } - - id = x86_match_cpu(intel_idle_ids); - if (id) { - if (!boot_cpu_has(X86_FEATURE_MWAIT)) { - pr_debug("Please enable MWAIT in BIOS SETUP\n"); - return -ENODEV; - } - } else { - id = x86_match_cpu(intel_mwait_ids); - if (!id) - return -ENODEV; - } - - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return -ENODEV; - - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); - - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || - !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || - !mwait_substates) - return -ENODEV; - - pr_debug("MWAIT substates: 0x%x\n", mwait_substates); - - icpu = (const struct idle_cpu *)id->driver_data; - if (icpu) { - cpuidle_state_table = icpu->state_table; - if (icpu->use_acpi) - intel_idle_acpi_cst_extract(); - } else if (!intel_idle_acpi_cst_extract()) { - return -ENODEV; - } - - pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", - boot_cpu_data.x86_model); - - return 0; -} - -/* - * intel_idle_cpuidle_devices_uninit() - * Unregisters the cpuidle devices. 
- */ -static void intel_idle_cpuidle_devices_uninit(void) -{ - int i; - struct cpuidle_device *dev; - - for_each_online_cpu(i) { - dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); - cpuidle_unregister_device(dev); - } -} - -/* - * ivt_idle_state_table_update(void) +/** + * ivt_idle_state_table_update - Tune the idle states table for Ivy Town. * - * Tune IVT multi-socket targets - * Assumption: num_sockets == (max_package_num + 1) + * Tune IVT multi-socket targets. + * Assumption: num_sockets == (max_package_num + 1). */ -static void ivt_idle_state_table_update(void) +static void __init ivt_idle_state_table_update(void) { /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ int cpu, package_num, num_sockets = 1; @@ -1436,15 +1455,17 @@ static void ivt_idle_state_table_update(void) /* else, 1 and 2 socket systems use default ivt_cstates */ } -/* - * Translate IRTL (Interrupt Response Time Limit) MSR to usec +/** + * irtl_2_usec - IRTL to microseconds conversion. + * @irtl: IRTL MSR value. + * + * Translate the IRTL (Interrupt Response Time Limit) MSR value to microseconds. */ - -static unsigned int irtl_ns_units[] = { - 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 }; - -static unsigned long long irtl_2_usec(unsigned long long irtl) +static unsigned long long __init irtl_2_usec(unsigned long long irtl) { + static const unsigned int irtl_ns_units[] __initconst = { + 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 + }; unsigned long long ns; if (!irtl) @@ -1452,15 +1473,16 @@ static unsigned long long irtl_2_usec(unsigned long long irtl) ns = irtl_ns_units[(irtl >> 10) & 0x7]; - return div64_u64((irtl & 0x3FF) * ns, 1000); + return div_u64((irtl & 0x3FF) * ns, NSEC_PER_USEC); } -/* - * bxt_idle_state_table_update(void) + +/** + * bxt_idle_state_table_update - Fix up the Broxton idle states table. * - * On BXT, we trust the IRTL to show the definitive maximum latency - * We use the same value for target_residency. + * On BXT, trust the IRTL (Interrupt Response Time Limit) MSR to show the + * definitive maximum latency and use the same value for target_residency. */ -static void bxt_idle_state_table_update(void) +static void __init bxt_idle_state_table_update(void) { unsigned long long msr; unsigned int usec; @@ -1501,13 +1523,13 @@ static void bxt_idle_state_table_update(void) } } -/* - * sklh_idle_state_table_update(void) + +/** + * sklh_idle_state_table_update - Fix up the Sky Lake idle states table. * - * On SKL-H (model 0x5e) disable C8 and C9 if: - * C10 is enabled and SGX disabled + * On SKL-H (model 0x5e) skip C8 and C9 if C10 is enabled and SGX disabled. */ -static void sklh_idle_state_table_update(void) +static void __init sklh_idle_state_table_update(void) { unsigned long long msr; unsigned int eax, ebx, ecx, edx; @@ -1540,19 +1562,82 @@ static void sklh_idle_state_table_update(void) return; } - skl_cstates[5].disabled = 1; /* C8-SKL */ - skl_cstates[6].disabled = 1; /* C9-SKL */ + skl_cstates[5].flags |= CPUIDLE_FLAG_UNUSABLE; /* C8-SKL */ + skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } -/* - * intel_idle_state_table_update() - * - * Update the default state_table for this CPU-id + +/** + * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake + * idle states table. */ +static void __init skx_idle_state_table_update(void) +{ + unsigned long long msr; + + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* + * 000b: C0/C1 (no package C-state support) + * 001b: C2 + * 010b: C6 (non-retention) + * 011b: C6 (retention) + * 111b: No Package C state limits. 
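+ *
+ * (Editorial worked example: a BIOS that caps the package limit at C2
+ * leaves (msr & 0x7) == 1, so the adjustment below applies and the C6
+ * entry keeps core-C6 rather than package-C6 latency numbers.)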
+ */ + if ((msr & 0x7) < 2) { + /* + * Uses the CC6 + PC0 latency and 3 times of + * latency for target_residency if the PC6 + * is disabled in BIOS. This is consistent + * with how intel_idle driver uses _CST + * to set the target_residency. + */ + skx_cstates[2].exit_latency = 92; + skx_cstates[2].target_residency = 276; + } +} -static void intel_idle_state_table_update(void) +/** + * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. + */ +static void __init spr_idle_state_table_update(void) { - switch (boot_cpu_data.x86_model) { + unsigned long long msr; + + /* + * By default, the C6 state assumes the worst-case scenario of package + * C6. However, if PC6 is disabled, we update the numbers to match + * core C6. + */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* Limit value 2 and above allow for PC6. */ + if ((msr & 0x7) < 2) { + spr_cstates[2].exit_latency = 190; + spr_cstates[2].target_residency = 600; + } +} + +static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) +{ + unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; + unsigned int num_substates = (mwait_substates >> mwait_cstate * 4) & + MWAIT_SUBSTATE_MASK; + + /* Ignore the C-state if there are NO sub-states in CPUID for it. */ + if (num_substates == 0) + return false; + + if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halts in idle states deeper than C2"); + + return true; +} + +static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) +{ + int cstate; + switch (boot_cpu_data.x86_model) { case INTEL_FAM6_IVYBRIDGE_X: ivt_idle_state_table_update(); break; @@ -1563,12 +1648,13 @@ static void intel_idle_state_table_update(void) case INTEL_FAM6_SKYLAKE: sklh_idle_state_table_update(); break; + case INTEL_FAM6_SKYLAKE_X: + skx_idle_state_table_update(); + break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + spr_idle_state_table_update(); + break; } -} - -static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) -{ - int cstate; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { unsigned int mwait_hint; @@ -1581,7 +1667,7 @@ static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) break; /* If marked as unusable, skip this state. */ - if (cpuidle_state_table[cstate].disabled != 0) { + if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) { pr_debug("state %s is disabled\n", cpuidle_state_table[cstate].name); continue; @@ -1594,10 +1680,21 @@ static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) /* Structure copy. 
*/ drv->states[drv->state_count] = cpuidle_state_table[cstate]; - if (icpu->use_acpi && intel_idle_off_by_default(mwait_hint) && - !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE)) + if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE) + drv->states[drv->state_count].enter = intel_idle_irq; + + if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_INIT_XSTATE) + drv->states[drv->state_count].enter = intel_idle_xstate; + + if ((disabled_states_mask & BIT(drv->state_count)) || + ((icpu->use_acpi || force_use_acpi) && + intel_idle_off_by_default(mwait_hint) && + !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE))) drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF; + if (intel_idle_state_needs_timer_stop(&drv->states[drv->state_count])) + drv->states[drv->state_count].flags |= CPUIDLE_FLAG_TIMER_STOP; + drv->state_count++; } @@ -1607,17 +1704,17 @@ static void intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) } } -/* - * intel_idle_cpuidle_driver_init() - * allocate, initialize cpuidle_states +/** + * intel_idle_cpuidle_driver_init - Create the list of available idle states. + * @drv: cpuidle driver structure to initialize. */ -static void __init intel_idle_cpuidle_driver_init(void) +static void __init intel_idle_cpuidle_driver_init(struct cpuidle_driver *drv) { - struct cpuidle_driver *drv = &intel_idle_driver; + cpuidle_poll_state_init(drv); - intel_idle_state_table_update(); + if (disabled_states_mask & BIT(0)) + drv->states[0].flags |= CPUIDLE_FLAG_OFF; - cpuidle_poll_state_init(drv); drv->state_count = 1; if (icpu) @@ -1626,10 +1723,39 @@ static void __init intel_idle_cpuidle_driver_init(void) intel_idle_init_cstates_acpi(drv); } -/* - * intel_idle_cpu_init() - * allocate, initialize, register cpuidle_devices - * @cpu: cpu/core to initialize +static void auto_demotion_disable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); + msr_bits &= ~auto_demotion_disable_flags; + wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); +} + +static void c1e_promotion_enable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits |= 0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + +static void c1e_promotion_disable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits &= ~0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + +/** + * intel_idle_cpu_init - Register the target CPU with the cpuidle core. + * @cpu: CPU to initialize. + * + * Register a cpuidle device object for @cpu and update its MSRs in accordance + * with the processor model flags. 
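+ *
+ * (Editorial note: reached via the "idle/intel:online" cpuhp callback,
+ * intel_idle_cpu_online() below, so a CPU hotplugged after boot gets the
+ * same MSR setup as the boot-time CPUs.)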
*/ static int intel_idle_cpu_init(unsigned int cpu) { @@ -1643,13 +1769,12 @@ static int intel_idle_cpu_init(unsigned int cpu) return -EIO; } - if (!icpu) - return 0; - - if (icpu->auto_demotion_disable_flags) + if (auto_demotion_disable_flags) auto_demotion_disable(); - if (icpu->disable_promotion_to_c1e) + if (c1e_promotion == C1E_PROMOTION_ENABLE) + c1e_promotion_enable(); + else if (c1e_promotion == C1E_PROMOTION_DISABLE) c1e_promotion_disable(); return 0; @@ -1659,8 +1784,8 @@ static int intel_idle_cpu_online(unsigned int cpu) { struct cpuidle_device *dev; - if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) - __setup_broadcast_timer(true); + if (!boot_cpu_has(X86_FEATURE_ARAT)) + tick_broadcast_enable(); /* * Some systems can hotplug a cpu at runtime after @@ -1674,23 +1799,77 @@ static int intel_idle_cpu_online(unsigned int cpu) return 0; } +/** + * intel_idle_cpuidle_devices_uninit - Unregister all cpuidle devices. + */ +static void __init intel_idle_cpuidle_devices_uninit(void) +{ + int i; + + for_each_online_cpu(i) + cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i)); +} + static int __init intel_idle_init(void) { + const struct x86_cpu_id *id; + unsigned int eax, ebx, ecx; int retval; /* Do not load intel_idle at all for now if idle= is passed */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) return -ENODEV; - retval = intel_idle_probe(); - if (retval) - return retval; + if (max_cstate == 0) { + pr_debug("disabled\n"); + return -EPERM; + } + + id = x86_match_cpu(intel_idle_ids); + if (id) { + if (!boot_cpu_has(X86_FEATURE_MWAIT)) { + pr_debug("Please enable MWAIT in BIOS SETUP\n"); + return -ENODEV; + } + } else { + id = x86_match_cpu(intel_mwait_ids); + if (!id) + return -ENODEV; + } + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return -ENODEV; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || + !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || + !mwait_substates) + return -ENODEV; + + pr_debug("MWAIT substates: 0x%x\n", mwait_substates); + + icpu = (const struct idle_cpu *)id->driver_data; + if (icpu) { + cpuidle_state_table = icpu->state_table; + auto_demotion_disable_flags = icpu->auto_demotion_disable_flags; + if (icpu->disable_promotion_to_c1e) + c1e_promotion = C1E_PROMOTION_DISABLE; + if (icpu->use_acpi || force_use_acpi) + intel_idle_acpi_cst_extract(); + } else if (!intel_idle_acpi_cst_extract()) { + return -ENODEV; + } + + pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", + boot_cpu_data.x86_model); intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); - if (intel_idle_cpuidle_devices == NULL) + if (!intel_idle_cpuidle_devices) return -ENOMEM; - intel_idle_cpuidle_driver_init(); + intel_idle_cpuidle_driver_init(&intel_idle_driver); + retval = cpuidle_register_driver(&intel_idle_driver); if (retval) { struct cpuidle_driver *drv = cpuidle_get_driver(); @@ -1699,16 +1878,13 @@ static int __init intel_idle_init(void) goto init_driver_fail; } - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ - lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; - retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", intel_idle_cpu_online, NULL); if (retval < 0) goto hp_setup_fail; - pr_debug("lapic_timer_reliable_states 0x%x\n", - lapic_timer_reliable_states); + pr_debug("Local APIC timer is reliable in %s\n", + boot_cpu_has(X86_FEATURE_ARAT) ? 
"all C-states" : "C1"); return 0; @@ -1729,3 +1905,22 @@ device_initcall(intel_idle_init); * is the easiest way (currently) to continue doing that. */ module_param(max_cstate, int, 0444); +/* + * The positions of the bits that are set in this number are the indices of the + * idle states to be disabled by default (as reflected by the names of the + * corresponding idle state directories in sysfs, "state0", "state1" ... + * "state" ..., where is the index of the given state). + */ +module_param_named(states_off, disabled_states_mask, uint, 0444); +MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); +/* + * Some platforms come with mutually exclusive C-states, so that if one is + * enabled, the other C-states must not be used. Example: C1 and C1E on + * Sapphire Rapids platform. This parameter allows for selecting the + * preferred C-states among the groups of mutually exclusive C-states - the + * selected C-states will be registered, the other C-states from the mutually + * exclusive group won't be registered. If the platform has no mutually + * exclusive C-states, this parameter has no effect. + */ +module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); +MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 2aa340c63939d62f5bc1438bfb31070e50d93e49..b140e8c553548f927f2a12deeb0811cde5565433 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -3,6 +3,15 @@ config IOMMU_IOVA tristate +# The IOASID library may also be used by non-IOMMU_API users +config IOASID + tristate + +config IOASID_USER + tristate + depends on IOASID + default n + # IOMMU_API always gets selected by whoever wants it. config IOMMU_API bool @@ -75,20 +84,61 @@ config IOMMU_DEBUGFS debug/iommu directory, and then populate a subdirectory with entries as required. -config IOMMU_DEFAULT_PASSTHROUGH - bool "IOMMU passthrough by default" +choice + prompt "IOMMU default domain type" depends on IOMMU_API - help - Enable passthrough by default, removing the need to pass in - iommu.passthrough=on or iommu=pt through command line. If this - is enabled, you can still disable with iommu.passthrough=off - or iommu=nopt depending on the architecture. + default IOMMU_DEFAULT_DMA_LAZY if AMD_IOMMU || INTEL_IOMMU + default IOMMU_DEFAULT_DMA_STRICT + help + Choose the type of IOMMU domain used to manage DMA API usage by + device drivers. The options here typically represent different + levels of tradeoff between robustness/security and performance, + depending on the IOMMU driver. Not all IOMMUs support all options. + This choice can be overridden at boot via the command line, and for + some devices also at runtime via sysfs. - If unsure, say N here. + If unsure, keep the default. + +config IOMMU_DEFAULT_DMA_STRICT + bool "Translated - Strict" + help + Trusted devices use translation to restrict their access to only + DMA-mapped pages, with strict TLB invalidation on unmap. Equivalent + to passing "iommu.passthrough=0 iommu.strict=1" on the command line. + + Untrusted devices always use this mode, with an additional layer of + bounce-buffering such that they cannot gain access to any unrelated + data within a mapped page. + +config IOMMU_DEFAULT_DMA_LAZY + bool "Translated - Lazy" + help + Trusted devices use translation to restrict their access to only + DMA-mapped pages, but with "lazy" batched TLB invalidation. 
This + mode allows higher performance with some IOMMUs due to reduced TLB + flushing, but at the cost of reduced isolation since devices may be + able to access memory for some time after it has been unmapped. + Equivalent to passing "iommu.passthrough=0 iommu.strict=0" on the + command line. + + If this mode is not supported by the IOMMU driver, the effective + runtime default will fall back to IOMMU_DEFAULT_DMA_STRICT. + +config IOMMU_DEFAULT_PASSTHROUGH + bool "Passthrough" + help + Trusted devices are identity-mapped, giving them unrestricted access + to memory with minimal performance overhead. Equivalent to passing + "iommu.passthrough=1" (historically "iommu=pt") on the command line. + + If this mode is not supported by the IOMMU driver, the effective + runtime default will fall back to IOMMU_DEFAULT_DMA_STRICT. + +endchoice config OF_IOMMU - def_bool y - depends on OF && IOMMU_API + def_bool y + depends on OF && IOMMU_API # IOMMU-agnostic DMA-mapping layer config IOMMU_DMA @@ -98,6 +148,11 @@ config IOMMU_DMA select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH +# Shared Virtual Addressing +config IOMMU_SVA + bool + select IOASID + config FSL_PAMU bool "Freescale IOMMU support" depends on PCI @@ -138,6 +193,7 @@ config AMD_IOMMU select PCI_PASID select IOMMU_API select IOMMU_IOVA + select IOMMU_DMA depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE ---help--- With this option you can enable support for AMD IOMMU hardware in @@ -171,76 +227,7 @@ config AMD_IOMMU_DEBUGFS This option is -NOT- intended for production environments, and should not generally be enabled. -# Intel IOMMU support -config DMAR_TABLE - bool - -config INTEL_IOMMU - bool "Support for Intel IOMMU using DMA Remapping Devices" - depends on PCI_MSI && ACPI && (X86 || IA64) - select IOMMU_API - select IOMMU_IOVA - select NEED_DMA_MAP_STATE - select DMAR_TABLE - select SWIOTLB - help - DMA remapping (DMAR) devices support enables independent address - translations for Direct Memory Access (DMA) from devices. - These DMA remapping devices are reported via ACPI tables - and include PCI device scope covered by these DMA - remapping devices. - -config INTEL_IOMMU_DEBUGFS - bool "Export Intel IOMMU internals in Debugfs" - depends on INTEL_IOMMU && IOMMU_DEBUGFS - help - !!!WARNING!!! - - DO NOT ENABLE THIS OPTION UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!!! - - Expose Intel IOMMU internals in Debugfs. - - This option is -NOT- intended for production environments, and should - only be enabled for debugging Intel IOMMU. - -config INTEL_IOMMU_SVM - bool "Support for Shared Virtual Memory with Intel IOMMU" - depends on INTEL_IOMMU && X86_64 - select PCI_PASID - select MMU_NOTIFIER - help - Shared Virtual Memory (SVM) provides a facility for devices - to access DMA resources through process address space by - means of a Process Address Space ID (PASID). - -config INTEL_IOMMU_DEFAULT_ON - def_bool y - prompt "Enable Intel DMA Remapping Devices by default" - depends on INTEL_IOMMU - help - Selecting this option will enable a DMAR device at boot time if - one is found. If this option is not selected, DMAR support can - be enabled by passing intel_iommu=on to the kernel. - -config INTEL_IOMMU_BROKEN_GFX_WA - bool "Workaround broken graphics drivers (going away soon)" - depends on INTEL_IOMMU && BROKEN && X86 - ---help--- - Current Graphics drivers tend to use physical address - for DMA and avoid using DMA APIs. Setting this config - option permits the IOMMU driver to set a unity map for - all the OS-visible memory. 
Hence the driver can continue - to use physical addresses for DMA, at least until this - option is removed in the 2.6.32 kernel. - -config INTEL_IOMMU_FLOPPY_WA - def_bool y - depends on INTEL_IOMMU && X86 - ---help--- - Floppy disk drivers are known to bypass DMA API calls - thereby failing to work when IOMMU is enabled. This - workaround will setup a 1:1 mapping for the first - 16MiB to make floppy (an ISA device) work. +source "drivers/iommu/intel/Kconfig" config IRQ_REMAP bool "Support for Interrupt Remapping" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 4f405f926e739cdd4d00937f9b57b4d42ee06b30..12abbd02c84a43c0857b8ff473fbfa68161174ff 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +obj-y += intel/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o @@ -7,6 +8,8 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o +obj-$(CONFIG_IOASID) += ioasid.o +obj-$(CONFIG_IOASID_USER) += ioasid_user.o obj-$(CONFIG_IOMMU_IOVA) += iova.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o @@ -15,13 +18,8 @@ obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o -obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o -obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o -obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o -obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o -obj-$(CONFIG_IRQ_REMAP) += intel_irq_remapping.o irq_remapping.o +obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o obj-$(CONFIG_MTK_IOMMU) += mtk_iommu.o obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o @@ -35,3 +33,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o +obj-$(CONFIG_IOMMU_SVA) += iommu-sva-lib.o diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c392930253a30020619cba0591039daf0964bec0..516056835533d45ac5685c1138307c05f7857864 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -88,8 +88,6 @@ const struct iommu_ops amd_iommu_ops; static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; -static const struct dma_map_ops amd_iommu_dma_ops; - /* * general struct to manage commands send to an IOMMU */ @@ -102,21 +100,6 @@ struct kmem_cache *amd_iommu_irq_cache; static void update_domain(struct protection_domain *domain); static int protection_domain_init(struct protection_domain *domain); static void detach_device(struct device *dev); -static void iova_domain_flush_tlb(struct iova_domain *iovad); - -/* - * Data container for a dma_ops specific protection domain - */ -struct dma_ops_domain { - /* generic protection domain information */ - struct protection_domain domain; - - /* IOVA RB-Tree */ - struct iova_domain iovad; -}; - -static struct iova_domain reserved_iova_ranges; -static struct lock_class_key 
reserved_rbtree_key; /**************************************************************************** * @@ -187,12 +170,6 @@ static struct protection_domain *to_pdomain(struct iommu_domain *dom) return container_of(dom, struct protection_domain, domain); } -static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain) -{ - BUG_ON(domain->flags != PD_DMA_OPS_MASK); - return container_of(domain, struct dma_ops_domain, domain); -} - static struct iommu_dev_data *alloc_dev_data(u16 devid) { struct iommu_dev_data *dev_data; @@ -333,16 +310,15 @@ static struct iommu_group *acpihid_device_group(struct device *dev) static bool pci_iommuv2_capable(struct pci_dev *pdev) { static const int caps[] = { - PCI_EXT_CAP_ID_ATS, PCI_EXT_CAP_ID_PRI, PCI_EXT_CAP_ID_PASID, }; int i, pos; - if (pci_ats_disabled()) + if (!pci_ats_supported(pdev)) return false; - for (i = 0; i < 3; ++i) { + for (i = 0; i < 2; ++i) { pos = pci_find_ext_capability(pdev, caps[i]); if (pos == 0) return false; @@ -385,21 +361,9 @@ static bool check_device(struct device *dev) return true; } -static void init_iommu_group(struct device *dev) -{ - struct iommu_group *group; - - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return; - - iommu_group_put(group); -} - static int iommu_init_device(struct device *dev) { struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; int devid; if (dev->archdata.iommu) @@ -409,8 +373,6 @@ static int iommu_init_device(struct device *dev) if (devid < 0) return devid; - iommu = amd_iommu_rlookup_table[devid]; - dev_data = find_dev_data(devid); if (!dev_data) return -ENOMEM; @@ -433,8 +395,6 @@ static int iommu_init_device(struct device *dev) dev->archdata.iommu = dev_data; - iommu_device_link(&iommu->iommu, dev); - return 0; } @@ -452,7 +412,7 @@ static void iommu_ignore_device(struct device *dev) setup_aliases(dev); } -static void iommu_uninit_device(struct device *dev) +static void amd_iommu_uninit_device(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; @@ -471,13 +431,6 @@ static void iommu_uninit_device(struct device *dev) if (dev_data->domain) detach_device(dev); - iommu_device_unlink(&iommu->iommu, dev); - - iommu_group_remove_device(dev); - - /* Remove dma-ops */ - dev->dma_ops = NULL; - /* * We keep dev_data around for unplugged devices and reuse it when the * device is re-plugged - not doing so would introduce a ton of races. 
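A few hunks below, copy_cmd_to_buffer() is reworked so that the new tail offset is computed in a local variable and published only once, instead of updating iommu->cmd_buf_tail in two steps before the MMIO write. The stand-alone sketch below mirrors just that wrap-around arithmetic; the ring type, the sizes, and the ring_push() helper are invented for illustration and are not the driver's real command-buffer protocol.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CMD_SIZE 16u                 /* hypothetical command size */
#define BUF_SIZE (CMD_SIZE * 8u)     /* hypothetical ring capacity */

struct ring {
	uint8_t buf[BUF_SIZE];
	uint32_t tail;               /* producer offset in bytes */
};

/* Mirror of the reworked copy_cmd_to_buffer(): compute the new tail
 * locally, then publish it exactly once at the end. */
static void ring_push(struct ring *r, const uint8_t cmd[CMD_SIZE])
{
	uint32_t tail = r->tail;

	memcpy(r->buf + tail, cmd, CMD_SIZE);
	tail = (tail + CMD_SIZE) % BUF_SIZE;  /* wrap at buffer end */

	/* In the driver, this single store is followed by the MMIO
	 * write that tells the IOMMU about the new tail. */
	r->tail = tail;
}

int main(void)
{
	struct ring r = { .tail = 0 };
	uint8_t cmd[CMD_SIZE] = { 0 };
	int i;

	for (i = 0; i < 10; i++)
		ring_push(&r, cmd);

	printf("tail after 10 pushes: %u\n", r.tail); /* 32 (one wrap) */
	return 0;
}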
@@ -558,10 +511,11 @@ static void amd_iommu_report_page_fault(u16 devid, u16 domain_id, static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { struct device *dev = iommu->iommu.dev; - int type, devid, pasid, flags, tag; + int type, devid, flags, tag; volatile u32 *event = __evt; int count = 0; u64 address; + u32 pasid; retry: type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; @@ -623,8 +577,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) pasid, address, flags); break; case EVENT_TYPE_INV_PPR_REQ: - pasid = ((event[0] >> 16) & 0xFFFF) - | ((event[1] << 6) & 0xF0000); + pasid = PPR_PASID(*((u64 *)__evt)); tag = event[1] & 0x03FF; dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n", PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), @@ -775,7 +728,21 @@ static void iommu_poll_ga_log(struct amd_iommu *iommu) } } } -#endif /* CONFIG_IRQ_REMAP */ + +static void +amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) +{ + if (!irq_remapping_enabled || !dev_is_pci(dev) || + pci_dev_has_special_msi_domain(to_pci_dev(dev))) + return; + + dev_set_msi_domain(dev, iommu->msi_domain); +} + +#else /* CONFIG_IRQ_REMAP */ +static inline void +amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { } +#endif /* !CONFIG_IRQ_REMAP */ #define AMD_IOMMU_INT_MASK \ (MMIO_STATUS_EVT_INT_MASK | \ @@ -859,17 +826,18 @@ static void copy_cmd_to_buffer(struct amd_iommu *iommu, struct iommu_cmd *cmd) { u8 *target; - - target = iommu->cmd_buf + iommu->cmd_buf_tail; - - iommu->cmd_buf_tail += sizeof(*cmd); - iommu->cmd_buf_tail %= CMD_BUFFER_SIZE; + u32 tail; /* Copy command to buffer */ + tail = iommu->cmd_buf_tail; + target = iommu->cmd_buf + tail; memcpy(target, cmd, sizeof(*cmd)); + tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; + iommu->cmd_buf_tail = tail; + /* Tell the IOMMU about it */ - writel(iommu->cmd_buf_tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); } static void build_completion_wait(struct iommu_cmd *cmd, u64 address) @@ -954,7 +922,7 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; } -static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid, +static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid, u64 address, bool size) { memset(cmd, 0, sizeof(*cmd)); @@ -972,7 +940,7 @@ static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid, CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); } -static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, +static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, int qdep, u64 address, bool size) { memset(cmd, 0, sizeof(*cmd)); @@ -992,7 +960,7 @@ static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } -static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid, +static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid, int status, int tag, bool gn) { memset(cmd, 0, sizeof(*cmd)); @@ -1300,12 +1268,6 @@ static void domain_flush_pages(struct protection_domain *domain, __domain_flush_pages(domain, address, size, 0); } -/* Flush the whole IO/TLB for a given protection domain */ -static void domain_flush_tlb(struct protection_domain *domain) -{ - __domain_flush_pages(domain, 0, 
CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); -} - /* Flush the whole IO/TLB for a given protection domain - including PDE */ static void domain_flush_tlb_pde(struct protection_domain *domain) { @@ -1753,43 +1715,6 @@ static unsigned long iommu_unmap_page(struct protection_domain *dom, return unmapped; } -/**************************************************************************** - * - * The next functions belong to the address allocator for the dma_ops - * interface functions. - * - ****************************************************************************/ - - -static unsigned long dma_ops_alloc_iova(struct device *dev, - struct dma_ops_domain *dma_dom, - unsigned int pages, u64 dma_mask) -{ - unsigned long pfn = 0; - - pages = __roundup_pow_of_two(pages); - - if (dma_mask > DMA_BIT_MASK(32)) - pfn = alloc_iova_fast(&dma_dom->iovad, pages, - IOVA_PFN(DMA_BIT_MASK(32)), false); - - if (!pfn) - pfn = alloc_iova_fast(&dma_dom->iovad, pages, - IOVA_PFN(dma_mask), true); - - return (pfn << PAGE_SHIFT); -} - -static void dma_ops_free_iova(struct dma_ops_domain *dma_dom, - unsigned long address, - unsigned int pages) -{ - pages = __roundup_pow_of_two(pages); - address >>= PAGE_SHIFT; - - free_iova_fast(&dma_dom->iovad, address, pages); -} - /**************************************************************************** * * The next functions belong to the domain allocation. A domain is @@ -1866,42 +1791,23 @@ static void free_gcr3_table(struct protection_domain *domain) free_page((unsigned long)domain->gcr3_tbl); } -static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom) -{ - unsigned long flags; - - spin_lock_irqsave(&dom->domain.lock, flags); - domain_flush_tlb(&dom->domain); - domain_flush_complete(&dom->domain); - spin_unlock_irqrestore(&dom->domain.lock, flags); -} - -static void iova_domain_flush_tlb(struct iova_domain *iovad) -{ - struct dma_ops_domain *dom; - - dom = container_of(iovad, struct dma_ops_domain, iovad); - - dma_ops_domain_flush_tlb(dom); -} - /* * Free a domain, only used if something went wrong in the * allocation path and we need to free an already allocated page table */ -static void dma_ops_domain_free(struct dma_ops_domain *dom) +static void dma_ops_domain_free(struct protection_domain *domain) { - if (!dom) + if (!domain) return; - put_iova_domain(&dom->iovad); + iommu_put_dma_cookie(&domain->domain); - free_pagetable(&dom->domain); + free_pagetable(domain); - if (dom->domain.id) - domain_id_free(dom->domain.id); + if (domain->id) + domain_id_free(domain->id); - kfree(dom); + kfree(domain); } /* @@ -1909,35 +1815,30 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also initializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(void) +static struct protection_domain *dma_ops_domain_alloc(void) { - struct dma_ops_domain *dma_dom; + struct protection_domain *domain; - dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); - if (!dma_dom) + domain = kzalloc(sizeof(struct protection_domain), GFP_KERNEL); + if (!domain) return NULL; - if (protection_domain_init(&dma_dom->domain)) - goto free_dma_dom; - - dma_dom->domain.mode = PAGE_MODE_3_LEVEL; - dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); - dma_dom->domain.flags = PD_DMA_OPS_MASK; - if (!dma_dom->domain.pt_root) - goto free_dma_dom; - - init_iova_domain(&dma_dom->iovad, PAGE_SIZE, IOVA_START_PFN); + if (protection_domain_init(domain)) + goto free_domain; - if 
(init_iova_flush_queue(&dma_dom->iovad, iova_domain_flush_tlb, NULL)) - goto free_dma_dom; + domain->mode = PAGE_MODE_3_LEVEL; + domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); + domain->flags = PD_DMA_OPS_MASK; + if (!domain->pt_root) + goto free_domain; - /* Initialize reserved ranges */ - copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad); + if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM) + goto free_domain; - return dma_dom; + return domain; -free_dma_dom: - dma_ops_domain_free(dma_dom); +free_domain: + dma_ops_domain_free(domain); return NULL; } @@ -2158,8 +2059,8 @@ static int pdev_iommuv2_enable(struct pci_dev *pdev) static int attach_device(struct device *dev, struct protection_domain *domain) { - struct pci_dev *pdev; struct iommu_dev_data *dev_data; + struct pci_dev *pdev; unsigned long flags; int ret; @@ -2178,8 +2079,10 @@ static int attach_device(struct device *dev, pdev = to_pci_dev(dev); if (domain->flags & PD_IOMMUV2_MASK) { + struct iommu_domain *def_domain = iommu_get_dma_domain(dev); + ret = -EINVAL; - if (!dev_data->passthrough) + if (def_domain->type != IOMMU_DOMAIN_IDENTITY) goto out; if (dev_data->iommu_v2) { @@ -2261,54 +2164,51 @@ static void detach_device(struct device *dev) spin_unlock_irqrestore(&domain->lock, flags); } -static int amd_iommu_add_device(struct device *dev) +static struct iommu_device *amd_iommu_probe_device(struct device *dev) { - struct iommu_dev_data *dev_data; - struct iommu_domain *domain; + struct iommu_device *iommu_dev; struct amd_iommu *iommu; int ret, devid; - if (!check_device(dev) || get_dev_data(dev)) - return 0; + if (!check_device(dev)) + return ERR_PTR(-ENODEV); devid = get_device_id(dev); if (devid < 0) - return devid; + return ERR_PTR(devid); iommu = amd_iommu_rlookup_table[devid]; + if (get_dev_data(dev)) + return &iommu->iommu; + ret = iommu_init_device(dev); if (ret) { if (ret != -ENOTSUPP) dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); - + iommu_dev = ERR_PTR(ret); iommu_ignore_device(dev); - dev->dma_ops = NULL; - goto out; + } else { + amd_iommu_set_pci_msi_domain(dev, iommu); + iommu_dev = &iommu->iommu; } - init_iommu_group(dev); - dev_data = get_dev_data(dev); + iommu_completion_wait(iommu); - BUG_ON(!dev_data); + return iommu_dev; +} - if (dev_data->iommu_v2) - iommu_request_dm_for_dev(dev); +static void amd_iommu_probe_finalize(struct device *dev) +{ + struct iommu_domain *domain; /* Domains are initialized for this device - have a look what we ended up with */ domain = iommu_get_domain_for_dev(dev); - if (domain->type == IOMMU_DOMAIN_IDENTITY) - dev_data->passthrough = true; - else - dev->dma_ops = &amd_iommu_dma_ops; - -out: - iommu_completion_wait(iommu); - - return 0; + if (domain->type == IOMMU_DOMAIN_DMA) + iommu_setup_dma_ops(dev, 0, U64_MAX); } -static void amd_iommu_remove_device(struct device *dev) +static void amd_iommu_release_device(struct device *dev) { struct amd_iommu *iommu; int devid; @@ -2322,7 +2222,7 @@ static void amd_iommu_remove_device(struct device *dev) iommu = amd_iommu_rlookup_table[devid]; - iommu_uninit_device(dev); + amd_iommu_uninit_device(dev); iommu_completion_wait(iommu); } @@ -2340,37 +2240,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) * *****************************************************************************/ -/* - * In the dma_ops path we only have the struct device. This function - * finds the corresponding IOMMU, the protection domain and the - * requestor id for a given device. 
- * If the device is not yet associated with a domain this is also done - * in this function. - */ -static struct protection_domain *get_domain(struct device *dev) -{ - struct protection_domain *domain; - struct iommu_domain *io_domain; - - if (!check_device(dev)) - return ERR_PTR(-EINVAL); - - domain = get_dev_data(dev)->domain; - if (domain == NULL && get_dev_data(dev)->defer_attach) { - get_dev_data(dev)->defer_attach = false; - io_domain = iommu_get_domain_for_dev(dev); - domain = to_pdomain(io_domain); - attach_device(dev, domain); - } - if (domain == NULL) - return ERR_PTR(-EBUSY); - - if (!dma_ops_domain(domain)) - return ERR_PTR(-EBUSY); - - return domain; -} - static void update_device_table(struct protection_domain *domain) { struct iommu_dev_data *dev_data; @@ -2391,458 +2260,6 @@ static void update_domain(struct protection_domain *domain) domain_flush_complete(domain); } -static int dir2prot(enum dma_data_direction direction) -{ - if (direction == DMA_TO_DEVICE) - return IOMMU_PROT_IR; - else if (direction == DMA_FROM_DEVICE) - return IOMMU_PROT_IW; - else if (direction == DMA_BIDIRECTIONAL) - return IOMMU_PROT_IW | IOMMU_PROT_IR; - else - return 0; -} - -/* - * This function contains common code for mapping of a physically - * contiguous memory region into DMA address space. It is used by all - * mapping functions provided with this IOMMU driver. - * Must be called with the domain lock held. - */ -static dma_addr_t __map_single(struct device *dev, - struct dma_ops_domain *dma_dom, - phys_addr_t paddr, - size_t size, - enum dma_data_direction direction, - u64 dma_mask) -{ - dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start, ret; - unsigned long flags; - unsigned int pages; - int prot = 0; - int i; - - pages = iommu_num_pages(paddr, size, PAGE_SIZE); - paddr &= PAGE_MASK; - - address = dma_ops_alloc_iova(dev, dma_dom, pages, dma_mask); - if (!address) - goto out; - - prot = dir2prot(direction); - - start = address; - for (i = 0; i < pages; ++i) { - ret = iommu_map_page(&dma_dom->domain, start, paddr, - PAGE_SIZE, prot, GFP_ATOMIC); - if (ret) - goto out_unmap; - - paddr += PAGE_SIZE; - start += PAGE_SIZE; - } - address += offset; - - domain_flush_np_cache(&dma_dom->domain, address, size); - -out: - return address; - -out_unmap: - - for (--i; i >= 0; --i) { - start -= PAGE_SIZE; - iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); - } - - spin_lock_irqsave(&dma_dom->domain.lock, flags); - domain_flush_tlb(&dma_dom->domain); - domain_flush_complete(&dma_dom->domain); - spin_unlock_irqrestore(&dma_dom->domain.lock, flags); - - dma_ops_free_iova(dma_dom, address, pages); - - return DMA_MAPPING_ERROR; -} - -/* - * Does the reverse of the __map_single function. 
Must be called with - * the domain lock held too - */ -static void __unmap_single(struct dma_ops_domain *dma_dom, - dma_addr_t dma_addr, - size_t size, - int dir) -{ - dma_addr_t i, start; - unsigned int pages; - - pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - dma_addr &= PAGE_MASK; - start = dma_addr; - - for (i = 0; i < pages; ++i) { - iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); - start += PAGE_SIZE; - } - - if (amd_iommu_unmap_flush) { - unsigned long flags; - - spin_lock_irqsave(&dma_dom->domain.lock, flags); - domain_flush_tlb(&dma_dom->domain); - domain_flush_complete(&dma_dom->domain); - spin_unlock_irqrestore(&dma_dom->domain.lock, flags); - dma_ops_free_iova(dma_dom, dma_addr, pages); - } else { - pages = __roundup_pow_of_two(pages); - queue_iova(&dma_dom->iovad, dma_addr >> PAGE_SHIFT, pages, 0); - } -} - -/* - * The exported map_single function for dma_ops. - */ -static dma_addr_t map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t paddr = page_to_phys(page) + offset; - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - u64 dma_mask; - - domain = get_domain(dev); - if (PTR_ERR(domain) == -EINVAL) - return (dma_addr_t)paddr; - else if (IS_ERR(domain)) - return DMA_MAPPING_ERROR; - - dma_mask = *dev->dma_mask; - dma_dom = to_dma_ops_domain(domain); - - return __map_single(dev, dma_dom, paddr, size, dir, dma_mask); -} - -/* - * The exported unmap_single function for dma_ops. - */ -static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - - domain = get_domain(dev); - if (IS_ERR(domain)) - return; - - dma_dom = to_dma_ops_domain(domain); - - __unmap_single(dma_dom, dma_addr, size, dir); -} - -static int sg_num_pages(struct device *dev, - struct scatterlist *sglist, - int nelems) -{ - unsigned long mask, boundary_size; - struct scatterlist *s; - int i, npages = 0; - - mask = dma_get_seg_boundary(dev); - boundary_size = mask + 1 ? ALIGN(mask + 1, PAGE_SIZE) >> PAGE_SHIFT : - 1UL << (BITS_PER_LONG - PAGE_SHIFT); - - for_each_sg(sglist, s, nelems, i) { - int p, n; - - s->dma_address = npages << PAGE_SHIFT; - p = npages % boundary_size; - n = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE); - if (p + n > boundary_size) - npages += boundary_size - p; - npages += n; - } - - return npages; -} - -/* - * The exported map_sg function for dma_ops (handles scatter-gather - * lists). 
- */ -static int map_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction, - unsigned long attrs) -{ - int mapped_pages = 0, npages = 0, prot = 0, i; - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - struct scatterlist *s; - unsigned long address; - u64 dma_mask; - int ret; - - domain = get_domain(dev); - if (IS_ERR(domain)) - return 0; - - dma_dom = to_dma_ops_domain(domain); - dma_mask = *dev->dma_mask; - - npages = sg_num_pages(dev, sglist, nelems); - - address = dma_ops_alloc_iova(dev, dma_dom, npages, dma_mask); - if (!address) - goto out_err; - - prot = dir2prot(direction); - - /* Map all sg entries */ - for_each_sg(sglist, s, nelems, i) { - int j, pages = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE); - - for (j = 0; j < pages; ++j) { - unsigned long bus_addr, phys_addr; - - bus_addr = address + s->dma_address + (j << PAGE_SHIFT); - phys_addr = (sg_phys(s) & PAGE_MASK) + (j << PAGE_SHIFT); - ret = iommu_map_page(domain, bus_addr, phys_addr, - PAGE_SIZE, prot, - GFP_ATOMIC | __GFP_NOWARN); - if (ret) - goto out_unmap; - - mapped_pages += 1; - } - } - - /* Everything is mapped - write the right values into s->dma_address */ - for_each_sg(sglist, s, nelems, i) { - /* - * Add in the remaining piece of the scatter-gather offset that - * was masked out when we were determining the physical address - * via (sg_phys(s) & PAGE_MASK) earlier. - */ - s->dma_address += address + (s->offset & ~PAGE_MASK); - s->dma_length = s->length; - } - - if (s) - domain_flush_np_cache(domain, s->dma_address, s->dma_length); - - return nelems; - -out_unmap: - dev_err(dev, "IOMMU mapping error in map_sg (io-pages: %d reason: %d)\n", - npages, ret); - - for_each_sg(sglist, s, nelems, i) { - int j, pages = iommu_num_pages(sg_phys(s), s->length, PAGE_SIZE); - - for (j = 0; j < pages; ++j) { - unsigned long bus_addr; - - bus_addr = address + s->dma_address + (j << PAGE_SHIFT); - iommu_unmap_page(domain, bus_addr, PAGE_SIZE); - - if (--mapped_pages == 0) - goto out_free_iova; - } - } - -out_free_iova: - free_iova_fast(&dma_dom->iovad, address >> PAGE_SHIFT, npages); - -out_err: - return 0; -} - -/* - * The exported map_sg function for dma_ops (handles scatter-gather - * lists). - */ -static void unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - unsigned long startaddr; - int npages; - - domain = get_domain(dev); - if (IS_ERR(domain)) - return; - - startaddr = sg_dma_address(sglist) & PAGE_MASK; - dma_dom = to_dma_ops_domain(domain); - npages = sg_num_pages(dev, sglist, nelems); - - __unmap_single(dma_dom, startaddr, npages << PAGE_SHIFT, dir); -} - -/* - * The exported alloc_coherent function for dma_ops. 
- */ -static void *alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag, - unsigned long attrs) -{ - u64 dma_mask = dev->coherent_dma_mask; - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - struct page *page; - - domain = get_domain(dev); - if (PTR_ERR(domain) == -EINVAL) { - page = alloc_pages(flag, get_order(size)); - *dma_addr = page_to_phys(page); - return page_address(page); - } else if (IS_ERR(domain)) - return NULL; - - dma_dom = to_dma_ops_domain(domain); - size = PAGE_ALIGN(size); - dma_mask = dev->coherent_dma_mask; - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - flag |= __GFP_ZERO; - - page = alloc_pages(flag | __GFP_NOWARN, get_order(size)); - if (!page) { - if (!gfpflags_allow_blocking(flag)) - return NULL; - - page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), flag & __GFP_NOWARN); - if (!page) - return NULL; - } - - if (!dma_mask) - dma_mask = *dev->dma_mask; - - *dma_addr = __map_single(dev, dma_dom, page_to_phys(page), - size, DMA_BIDIRECTIONAL, dma_mask); - - if (*dma_addr == DMA_MAPPING_ERROR) - goto out_free; - - return page_address(page); - -out_free: - - if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) - __free_pages(page, get_order(size)); - - return NULL; -} - -/* - * The exported free_coherent function for dma_ops. - */ -static void free_coherent(struct device *dev, size_t size, - void *virt_addr, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - struct page *page; - - page = virt_to_page(virt_addr); - size = PAGE_ALIGN(size); - - domain = get_domain(dev); - if (IS_ERR(domain)) - goto free_mem; - - dma_dom = to_dma_ops_domain(domain); - - __unmap_single(dma_dom, dma_addr, size, DMA_BIDIRECTIONAL); - -free_mem: - if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) - __free_pages(page, get_order(size)); -} - -/* - * This function is called by the DMA layer to find out if we can handle a - * particular device. It is part of the dma_ops. 
- */ -static int amd_iommu_dma_supported(struct device *dev, u64 mask) -{ - if (!dma_direct_supported(dev, mask)) - return 0; - return check_device(dev); -} - -static const struct dma_map_ops amd_iommu_dma_ops = { - .alloc = alloc_coherent, - .free = free_coherent, - .map_page = map_page, - .unmap_page = unmap_page, - .map_sg = map_sg, - .unmap_sg = unmap_sg, - .dma_supported = amd_iommu_dma_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -}; - -static int init_reserved_iova_ranges(void) -{ - struct pci_dev *pdev = NULL; - struct iova *val; - - init_iova_domain(&reserved_iova_ranges, PAGE_SIZE, IOVA_START_PFN); - - lockdep_set_class(&reserved_iova_ranges.iova_rbtree_lock, - &reserved_rbtree_key); - - /* MSI memory range */ - val = reserve_iova(&reserved_iova_ranges, - IOVA_PFN(MSI_RANGE_START), IOVA_PFN(MSI_RANGE_END)); - if (!val) { - pr_err("Reserving MSI range failed\n"); - return -ENOMEM; - } - - /* HT memory range */ - val = reserve_iova(&reserved_iova_ranges, - IOVA_PFN(HT_RANGE_START), IOVA_PFN(HT_RANGE_END)); - if (!val) { - pr_err("Reserving HT range failed\n"); - return -ENOMEM; - } - - /* - * Memory used for PCI resources - * FIXME: Check whether we can reserve the PCI-hole completly - */ - for_each_pci_dev(pdev) { - int i; - - for (i = 0; i < PCI_NUM_RESOURCES; ++i) { - struct resource *r = &pdev->resource[i]; - - if (!(r->flags & IORESOURCE_MEM)) - continue; - - val = reserve_iova(&reserved_iova_ranges, - IOVA_PFN(r->start), - IOVA_PFN(r->end)); - if (!val) { - pci_err(pdev, "Reserve pci-resource range %pR failed\n", r); - return -ENOMEM; - } - } - } - - return 0; -} - int __init amd_iommu_init_api(void) { int ret, err = 0; @@ -2851,10 +2268,6 @@ int __init amd_iommu_init_api(void) if (ret) return ret; - ret = init_reserved_iova_ranges(); - if (ret) - return ret; - err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops); if (err) return err; @@ -2874,12 +2287,6 @@ int __init amd_iommu_init_dma_ops(void) { swiotlb = (iommu_default_passthrough() || sme_me_mask) ? 
1 : 0; iommu_detected = 1; - - if (amd_iommu_unmap_flush) - pr_info("IO/TLB flush on unmap enabled\n"); - else - pr_info("Lazy IO/TLB flushing enabled\n"); - return 0; } @@ -2925,7 +2332,6 @@ static void protection_domain_free(struct protection_domain *domain) static int protection_domain_init(struct protection_domain *domain) { spin_lock_init(&domain->lock); - mutex_init(&domain->api_lock); domain->id = domain_id_alloc(); if (!domain->id) return -ENOMEM; @@ -2956,7 +2362,6 @@ static struct protection_domain *protection_domain_alloc(void) static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) { struct protection_domain *pdomain; - struct dma_ops_domain *dma_domain; switch (type) { case IOMMU_DOMAIN_UNMANAGED: @@ -2977,12 +2382,11 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) break; case IOMMU_DOMAIN_DMA: - dma_domain = dma_ops_domain_alloc(); - if (!dma_domain) { + pdomain = dma_ops_domain_alloc(); + if (!pdomain) { pr_err("Failed to allocate\n"); return NULL; } - pdomain = &dma_domain->domain; break; case IOMMU_DOMAIN_IDENTITY: pdomain = protection_domain_alloc(); @@ -3001,7 +2405,6 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; - struct dma_ops_domain *dma_dom; domain = to_pdomain(dom); @@ -3016,8 +2419,7 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) switch (dom->type) { case IOMMU_DOMAIN_DMA: /* Now release the domain */ - dma_dom = to_dma_ops_domain(domain); - dma_ops_domain_free(dma_dom); + dma_ops_domain_free(domain); break; default: if (domain->mode != PAGE_MODE_NONE) @@ -3073,6 +2475,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return -EINVAL; dev_data = dev->archdata.iommu; + dev_data->defer_attach = false; iommu = amd_iommu_rlookup_table[dev_data->devid]; if (!iommu) @@ -3098,7 +2501,8 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, size_t page_size, int iommu_prot) + phys_addr_t paddr, size_t page_size, int iommu_prot, + gfp_t gfp) { struct protection_domain *domain = to_pdomain(dom); int prot = 0; @@ -3112,9 +2516,7 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, if (iommu_prot & IOMMU_WRITE) prot |= IOMMU_PROT_IW; - mutex_lock(&domain->api_lock); - ret = iommu_map_page(domain, iova, paddr, page_size, prot, GFP_KERNEL); - mutex_unlock(&domain->api_lock); + ret = iommu_map_page(domain, iova, paddr, page_size, prot, gfp); domain_flush_np_cache(domain, iova, page_size); @@ -3126,16 +2528,11 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, struct iommu_iotlb_gather *gather) { struct protection_domain *domain = to_pdomain(dom); - size_t unmap_size; if (domain->mode == PAGE_MODE_NONE) return 0; - mutex_lock(&domain->api_lock); - unmap_size = iommu_unmap_page(domain, iova, page_size); - mutex_unlock(&domain->api_lock); - - return unmap_size; + return iommu_unmap_page(domain, iova, page_size); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, @@ -3145,9 +2542,6 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, unsigned long offset_mask, pte_pgsize; u64 *pte, __pte; - if (domain->mode == PAGE_MODE_NONE) - return iova; - pte = fetch_pte(domain, iova, &pte_pgsize); if (!pte || !IOMMU_PTE_PRESENT(*pte)) @@ -3168,6 +2562,8 @@ static bool amd_iommu_capable(enum iommu_cap cap) return 
(irq_remapping_enabled == 1); case IOMMU_CAP_NOEXEC: return false; + case IOMMU_CAP_VIOMMU_HINT: + return amd_iommu_np_cache; default: break; } @@ -3236,19 +2632,6 @@ static void amd_iommu_put_resv_regions(struct device *dev, kfree(entry); } -static void amd_iommu_apply_resv_region(struct device *dev, - struct iommu_domain *domain, - struct iommu_resv_region *region) -{ - struct dma_ops_domain *dma_dom = to_dma_ops_domain(to_pdomain(domain)); - unsigned long start, end; - - start = IOVA_PFN(region->start); - end = IOVA_PFN(region->start + region->length - 1); - - WARN_ON_ONCE(reserve_iova(&dma_dom->iovad, start, end) == NULL); -} - static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, struct device *dev) { @@ -3273,6 +2656,20 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, amd_iommu_flush_iotlb_all(domain); } +static int amd_iommu_def_domain_type(struct device *dev) +{ + struct iommu_dev_data *dev_data; + + dev_data = get_dev_data(dev); + if (!dev_data) + return 0; + + if (dev_data->iommu_v2) + return IOMMU_DOMAIN_IDENTITY; + + return 0; +} + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, @@ -3282,16 +2679,17 @@ const struct iommu_ops amd_iommu_ops = { .map = amd_iommu_map, .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, - .add_device = amd_iommu_add_device, - .remove_device = amd_iommu_remove_device, + .probe_device = amd_iommu_probe_device, + .release_device = amd_iommu_release_device, + .probe_finalize = amd_iommu_probe_finalize, .device_group = amd_iommu_device_group, .get_resv_regions = amd_iommu_get_resv_regions, .put_resv_regions = amd_iommu_put_resv_regions, - .apply_resv_region = amd_iommu_apply_resv_region, .is_attach_deferred = amd_iommu_is_attach_deferred, .pgsize_bitmap = AMD_IOMMU_PGSIZES, .flush_iotlb_all = amd_iommu_flush_iotlb_all, .iotlb_sync = amd_iommu_iotlb_sync, + .def_domain_type = amd_iommu_def_domain_type, }; /***************************************************************************** @@ -3383,7 +2781,7 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) } EXPORT_SYMBOL(amd_iommu_domain_enable_v2); -static int __flush_pasid(struct protection_domain *domain, int pasid, +static int __flush_pasid(struct protection_domain *domain, u32 pasid, u64 address, bool size) { struct iommu_dev_data *dev_data; @@ -3444,13 +2842,13 @@ static int __flush_pasid(struct protection_domain *domain, int pasid, return ret; } -static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid, +static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid, u64 address) { return __flush_pasid(domain, pasid, address, false); } -int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, +int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address) { struct protection_domain *domain = to_pdomain(dom); @@ -3465,13 +2863,13 @@ int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, } EXPORT_SYMBOL(amd_iommu_flush_page); -static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid) +static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid) { return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, true); } -int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) +int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) { struct protection_domain *domain = to_pdomain(dom); unsigned long flags; @@ -3485,7 +2883,7 @@ int amd_iommu_flush_tlb(struct 
iommu_domain *dom, int pasid) } EXPORT_SYMBOL(amd_iommu_flush_tlb); -static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) +static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) { int index; u64 *pte; @@ -3517,7 +2915,7 @@ static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) return pte; } -static int __set_gcr3(struct protection_domain *domain, int pasid, +static int __set_gcr3(struct protection_domain *domain, u32 pasid, unsigned long cr3) { u64 *pte; @@ -3534,7 +2932,7 @@ static int __set_gcr3(struct protection_domain *domain, int pasid, return __amd_iommu_flush_tlb(domain, pasid); } -static int __clear_gcr3(struct protection_domain *domain, int pasid) +static int __clear_gcr3(struct protection_domain *domain, u32 pasid) { u64 *pte; @@ -3550,7 +2948,7 @@ static int __clear_gcr3(struct protection_domain *domain, int pasid) return __amd_iommu_flush_tlb(domain, pasid); } -int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, +int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, unsigned long cr3) { struct protection_domain *domain = to_pdomain(dom); @@ -3565,7 +2963,7 @@ int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, } EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); -int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) +int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid) { struct protection_domain *domain = to_pdomain(dom); unsigned long flags; @@ -3579,7 +2977,7 @@ int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) } EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); -int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, +int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag) { struct iommu_dev_data *dev_data; @@ -3599,9 +2997,23 @@ EXPORT_SYMBOL(amd_iommu_complete_ppr); struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) { struct protection_domain *pdomain; + struct iommu_domain *io_domain; + struct device *dev = &pdev->dev; + + if (!check_device(dev)) + return NULL; + + pdomain = get_dev_data(dev)->domain; + if (pdomain == NULL && get_dev_data(dev)->defer_attach) { + get_dev_data(dev)->defer_attach = false; + io_domain = iommu_get_domain_for_dev(dev); + pdomain = to_pdomain(io_domain); + attach_device(dev, pdomain); + } + if (pdomain == NULL) + return NULL; - pdomain = get_domain(&pdev->dev); - if (IS_ERR(pdomain)) + if (!dma_ops_domain(pdomain)) return NULL; /* Only return IOMMUv2 domains */ @@ -3638,11 +3050,8 @@ int amd_iommu_device_info(struct pci_dev *pdev, memset(info, 0, sizeof(*info)); - if (!pci_ats_disabled()) { - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); - if (pos) - info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; - } + if (pci_ats_supported(pdev)) + info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (pos) @@ -3963,7 +3372,7 @@ static void free_irte(u16 devid, int index) } static void irte_prepare(void *entry, - u32 delivery_mode, u32 dest_mode, + u32 delivery_mode, bool dest_mode, u8 vector, u32 dest_apicid, int devid) { union irte *irte = (union irte *) entry; @@ -3977,7 +3386,7 @@ static void irte_prepare(void *entry, } static void irte_ga_prepare(void *entry, - u32 delivery_mode, u32 dest_mode, + u32 delivery_mode, bool dest_mode, u8 vector, u32 dest_apicid, int devid) { struct irte_ga *irte = (struct irte_ga *) entry; @@ -4097,69 +3506,18 @@ static void irte_ga_clear_allocated(struct irq_remap_table *table, int index) 
static int get_devid(struct irq_alloc_info *info) { - int devid = -1; - switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: - devid = get_ioapic_devid(info->ioapic_id); - break; + return get_ioapic_devid(info->devid); case X86_IRQ_ALLOC_TYPE_HPET: - devid = get_hpet_devid(info->hpet_id); - break; - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: - devid = get_device_id(&info->msi_dev->dev); - break; - default: - BUG_ON(1); - break; - } - - return devid; -} - -static struct irq_domain *get_ir_irq_domain(struct irq_alloc_info *info) -{ - struct amd_iommu *iommu; - int devid; - - if (!info) - return NULL; - - devid = get_devid(info); - if (devid >= 0) { - iommu = amd_iommu_rlookup_table[devid]; - if (iommu) - return iommu->ir_domain; - } - - return NULL; -} - -static struct irq_domain *get_irq_domain(struct irq_alloc_info *info) -{ - struct amd_iommu *iommu; - int devid; - - if (!info) - return NULL; - - switch (info->type) { - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: - devid = get_device_id(&info->msi_dev->dev); - if (devid < 0) - return NULL; - - iommu = amd_iommu_rlookup_table[devid]; - if (iommu) - return iommu->msi_domain; - break; + return get_hpet_devid(info->devid); + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: + return get_device_id(msi_desc_to_dev(info->desc)); default: - break; + WARN_ON_ONCE(1); + return -1; } - - return NULL; } struct irq_remap_ops amd_iommu_irq_ops = { @@ -4168,18 +3526,22 @@ struct irq_remap_ops amd_iommu_irq_ops = { .disable = amd_iommu_disable, .reenable = amd_iommu_reenable, .enable_faulting = amd_iommu_enable_faulting, - .get_ir_irq_domain = get_ir_irq_domain, - .get_irq_domain = get_irq_domain, }; +static void fill_msi_msg(struct msi_msg *msg, u32 index) +{ + msg->data = index; + msg->address_lo = 0; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; +} + static void irq_remapping_prepare_irte(struct amd_ir_data *data, struct irq_cfg *irq_cfg, struct irq_alloc_info *info, int devid, int index, int sub_handle) { struct irq_2_irte *irte_info = &data->irq_2_irte; - struct msi_msg *msg = &data->msi_entry; - struct IO_APIC_route_entry *entry; struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -4187,31 +3549,16 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, data->irq_2_irte.devid = devid; data->irq_2_irte.index = index + sub_handle; - iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode, - apic->irq_dest_mode, irq_cfg->vector, + iommu->irte_ops->prepare(data->entry, apic->delivery_mode, + apic->dest_mode_logical, irq_cfg->vector, irq_cfg->dest_apicid, devid); switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: - /* Setup IOAPIC entry */ - entry = info->ioapic_entry; - info->ioapic_entry = NULL; - memset(entry, 0, sizeof(*entry)); - entry->vector = index; - entry->mask = 0; - entry->trigger = info->ioapic_trigger; - entry->polarity = info->ioapic_polarity; - /* Mask level triggered irqs. 
*/ - if (info->ioapic_trigger) - entry->mask = 1; - break; - case X86_IRQ_ALLOC_TYPE_HPET: - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = MSI_ADDR_BASE_LO; - msg->data = irte_info->index; + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: + fill_msi_msg(&data->msi_entry, irte_info->index); break; default: @@ -4252,15 +3599,15 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (!info) return -EINVAL; - if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI && - info->type != X86_IRQ_ALLOC_TYPE_MSIX) + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI && + info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX) return -EINVAL; /* * With IRQ remapping enabled, don't need contiguous CPU vectors * to support multiple MSI interrupts. */ - if (info->type == X86_IRQ_ALLOC_TYPE_MSI) + if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; devid = get_devid(info); @@ -4288,15 +3635,16 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, iommu->irte_ops->set_allocated(table, i); } WARN_ON(table->min_index != 32); - index = info->ioapic_pin; + index = info->ioapic.pin; } else { index = -ENOMEM; } - } else if (info->type == X86_IRQ_ALLOC_TYPE_MSI || - info->type == X86_IRQ_ALLOC_TYPE_MSIX) { - bool align = (info->type == X86_IRQ_ALLOC_TYPE_MSI); + } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI || + info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) { + bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); - index = alloc_irq_index(devid, nr_irqs, align, info->msi_dev); + index = alloc_irq_index(devid, nr_irqs, align, + msi_desc_to_pci_dev(info->desc)); } else { index = alloc_irq_index(devid, nr_irqs, false, NULL); } @@ -4309,8 +3657,8 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); - cfg = irqd_cfg(irq_data); - if (!irq_data || !cfg) { + cfg = irq_data ? 
irqd_cfg(irq_data) : NULL; + if (!cfg) { ret = -EINVAL; goto out_free_data; } @@ -4407,7 +3755,26 @@ static void irq_remapping_deactivate(struct irq_domain *domain, irte_info->index); } +static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + struct amd_iommu *iommu; + int devid = -1; + + if (x86_fwspec_is_ioapic(fwspec)) + devid = get_ioapic_devid(fwspec->param[0]); + else if (x86_fwspec_is_hpet(fwspec)) + devid = get_hpet_devid(fwspec->param[0]); + + if (devid < 0) + return 0; + + iommu = amd_iommu_rlookup_table[devid]; + return iommu && iommu->ir_domain == d; +} + static const struct irq_domain_ops amd_ir_domain_ops = { + .select = irq_remapping_select, .alloc = irq_remapping_alloc, .free = irq_remapping_free, .activate = irq_remapping_activate, @@ -4454,8 +3821,8 @@ int amd_iommu_deactivate_guest_mode(void *data) entry->hi.val = 0; entry->lo.fields_remap.valid = valid; - entry->lo.fields_remap.dm = apic->irq_dest_mode; - entry->lo.fields_remap.int_type = apic->irq_delivery_mode; + entry->lo.fields_remap.dm = apic->dest_mode_logical; + entry->lo.fields_remap.int_type = apic->delivery_mode; entry->hi.fields.vector = cfg->vector; entry->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(cfg->dest_apicid); diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index be098f15131e2a898d68879702084445b7a84da3..7f964b8afddb15e7cf50c062db22ee9432d89b83 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -1594,9 +1593,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (ret) return ret; - ret = amd_iommu_create_irq_domain(iommu); - if (ret) - return ret; + if (amd_iommu_irq_remap) { + ret = amd_iommu_create_irq_domain(iommu); + if (ret) + return ret; + } /* * Make sure IOMMU is not considered to translate itself. 
The IVRS @@ -1983,10 +1984,18 @@ static int iommu_setup_msi(struct amd_iommu *iommu) return 0; } -#define XT_INT_DEST_MODE(x) (((x) & 0x1ULL) << 2) -#define XT_INT_DEST_LO(x) (((x) & 0xFFFFFFULL) << 8) -#define XT_INT_VEC(x) (((x) & 0xFFULL) << 32) -#define XT_INT_DEST_HI(x) ((((x) >> 24) & 0xFFULL) << 56) +union intcapxt { + u64 capxt; + struct { + u64 reserved_0 : 2, + dest_mode_logical : 1, + reserved_1 : 5, + destid_0_23 : 24, + vector : 8, + reserved_2 : 16, + destid_24_31 : 8; + }; +} __attribute__ ((packed)); /** * Setup the IntCapXT registers with interrupt routing information @@ -1995,28 +2004,29 @@ static int iommu_setup_msi(struct amd_iommu *iommu) */ static void iommu_update_intcapxt(struct amd_iommu *iommu) { - u64 val; - u32 addr_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET); - u32 addr_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET); - u32 data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET); - bool dm = (addr_lo >> MSI_ADDR_DEST_MODE_SHIFT) & 0x1; - u32 dest = ((addr_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xFF); + struct msi_msg msg; + union intcapxt xt; + u32 destid; - if (x2apic_enabled()) - dest |= MSI_ADDR_EXT_DEST_ID(addr_hi); + msg.address_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET); + msg.address_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET); + msg.data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET); - val = XT_INT_VEC(data & 0xFF) | - XT_INT_DEST_MODE(dm) | - XT_INT_DEST_LO(dest) | - XT_INT_DEST_HI(dest); + destid = x86_msi_msg_get_destid(&msg, x2apic_enabled()); + + xt.capxt = 0ULL; + xt.dest_mode_logical = msg.arch_data.dest_mode_logical; + xt.vector = msg.arch_data.vector; + xt.destid_0_23 = destid & GENMASK(23, 0); + xt.destid_24_31 = destid >> 24; /** * Current IOMMU implementation uses the same IRQ for all * 3 IOMMU interrupts.
*/ - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET); - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET); - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET); } static void _irq_notifier_notify(struct irq_affinity_notify *notify, @@ -3001,8 +3011,10 @@ static int __init parse_amd_iommu_intr(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "fullflush", 9) == 0) + if (strncmp(str, "fullflush", 9) == 0) { + pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n"); amd_iommu_unmap_flush = true; + } if (strncmp(str, "off", 3) == 0) amd_iommu_disabled = true; if (strncmp(str, "on", 2) == 0) diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 92c2ba6468a088bb59b32cc32c09a6847118a8dd..17d7957269ee35fcd5fb69762cba8d4f02ba316f 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -43,12 +43,12 @@ extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); -extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, +extern int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); -extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid); -extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, +extern int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid); +extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, unsigned long cr3); -extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid); +extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid); extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev); #ifdef CONFIG_IRQ_REMAP @@ -64,7 +64,7 @@ static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu) #define PPR_INVALID 0x1 #define PPR_FAILURE 0xf -extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, +extern int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, int status, int tag); static inline bool is_rd890_iommu(struct pci_dev *pdev) diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 76e9d3e2f9f207e8dfe04206e6fa71cbc74d49cb..7dd125f63ce5cdade94b3e26dd26cd3bb8dc9f97 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -395,10 +395,10 @@ #define PD_IOMMUV2_MASK (1UL << 3) /* domain has gcr3 table */ extern bool amd_iommu_dump; -#define DUMP_printk(format, arg...) \ - do { \ - if (amd_iommu_dump) \ - printk(KERN_INFO "AMD-Vi: " format, ## arg); \ +#define DUMP_printk(format, arg...) 
\ + do { \ + if (amd_iommu_dump) \ + pr_info("AMD-Vi: " format, ## arg); \ } while(0); /* global flag if IOMMUs cache non-present entries */ @@ -472,7 +472,6 @@ struct protection_domain { struct iommu_domain domain; /* generic domain handle used by iommu core code */ spinlock_t lock; /* mostly used to lock the page table*/ - struct mutex api_lock; /* protect page tables in the iommu-api path */ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ u64 *pt_root; /* page table root pointer */ @@ -646,7 +645,6 @@ struct iommu_dev_data { struct pci_dev *pdev; u16 devid; /* PCI Device ID */ bool iommu_v2; /* Device can make use of IOMMUv2 */ - bool passthrough; /* Device is identity mapped */ struct { bool enabled; int qdep; @@ -740,12 +738,6 @@ extern u16 amd_iommu_last_bdf; /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* - * If true, the addresses will be flushed on unmap time, not when - * they are reused - */ -extern bool amd_iommu_unmap_flush; - /* Smallest max PASID supported by any IOMMU in the system */ extern u32 amd_iommu_max_pasid; @@ -891,7 +883,7 @@ struct amd_ir_data { }; struct amd_irte_ops { - void (*prepare)(void *, u32, u32, u8, u32, int); + void (*prepare)(void *, u32, bool, u8, u32, int); void (*activate)(void *, u16, u16); void (*deactivate)(void *, u16, u16); void (*set_affinity)(void *, u16, u16, u8, u32); diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 05f3d93cf480c9da792132ff940bded94215a818..a679fe7ecaca8d9e2f44153ed93d92e49498aa84 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -42,7 +42,7 @@ struct pasid_state { struct mmu_notifier mn; /* mmu_notifier handle */ struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ struct device_state *device_state; /* Link to our device_state */ - int pasid; /* PASID index */ + u32 pasid; /* PASID index */ bool invalid; /* Used during setup and teardown of the pasid */ spinlock_t lock; /* Protect pri_queues and @@ -72,7 +72,7 @@ struct fault { struct mm_struct *mm; u64 address; u16 devid; - u16 pasid; + u32 pasid; u16 tag; u16 finish; u16 flags; @@ -152,7 +152,7 @@ static void put_device_state(struct device_state *dev_state) /* Must be called under dev_state->lock */ static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state, - int pasid, bool alloc) + u32 pasid, bool alloc) { struct pasid_state **root, **ptr; int level, index; @@ -186,7 +186,7 @@ static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state static int set_pasid_state(struct device_state *dev_state, struct pasid_state *pasid_state, - int pasid) + u32 pasid) { struct pasid_state **ptr; unsigned long flags; @@ -213,7 +213,7 @@ static int set_pasid_state(struct device_state *dev_state, return ret; } -static void clear_pasid_state(struct device_state *dev_state, int pasid) +static void clear_pasid_state(struct device_state *dev_state, u32 pasid) { struct pasid_state **ptr; unsigned long flags; @@ -231,7 +231,7 @@ static void clear_pasid_state(struct device_state *dev_state, int pasid) } static struct pasid_state *get_pasid_state(struct device_state *dev_state, - int pasid) + u32 pasid) { struct pasid_state **ptr, *ret = NULL; unsigned long flags; @@ -598,7 +598,7 @@ static struct notifier_block ppr_nb = { .notifier_call = ppr_notifier, }; -int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, +int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, struct task_struct 
*task) { struct pasid_state *pasid_state; @@ -619,7 +619,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, return -EINVAL; ret = -EINVAL; - if (pasid < 0 || pasid >= dev_state->max_pasids) + if (pasid >= dev_state->max_pasids) goto out; ret = -ENOMEM; @@ -683,7 +683,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, } EXPORT_SYMBOL(amd_iommu_bind_pasid); -void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid) +void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid) { struct pasid_state *pasid_state; struct device_state *dev_state; @@ -699,7 +699,7 @@ void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid) if (dev_state == NULL) return; - if (pasid < 0 || pasid >= dev_state->max_pasids) + if (pasid >= dev_state->max_pasids) goto out; pasid_state = get_pasid_state(dev_state, pasid); diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 4736d8612d30b732e3b182964e30153e97ce5726..636b8ea54d0cdacc06a3ddb9c7a0137544d9488c 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2147,12 +2147,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) if (!smmu_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&smmu_domain->domain)) { - kfree(smmu_domain); - return NULL; - } - mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); @@ -2183,7 +2177,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - iommu_put_dma_cookie(domain); free_io_pgtable_ops(smmu_domain->pgtbl_ops); /* Free the CD and ASID, if we allocated them */ @@ -2370,26 +2363,20 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) } } -#ifdef CONFIG_PCI_ATS static bool arm_smmu_ats_supported(struct arm_smmu_master *master) { - struct pci_dev *pdev; + struct device *dev = master->dev; struct arm_smmu_device *smmu = master->smmu; - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) || - !(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled()) + if (!(smmu->features & ARM_SMMU_FEAT_ATS)) return false; - pdev = to_pci_dev(master->dev); - return !pdev->untrusted && pdev->ats_cap; -} -#else -static bool arm_smmu_ats_supported(struct arm_smmu_master *master) -{ - return false; + if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS)) + return false; + + return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); } -#endif static void arm_smmu_enable_ats(struct arm_smmu_master *master) { @@ -2681,15 +2668,6 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); switch (domain->type) { - case IOMMU_DOMAIN_UNMANAGED: - switch (attr) { - case DOMAIN_ATTR_NESTING: - *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); - return 0; - default: - return -ENODEV; - } - break; case IOMMU_DOMAIN_DMA: switch (attr) { case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: @@ -2713,23 +2691,6 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, mutex_lock(&smmu_domain->init_mutex); switch (domain->type) { - case IOMMU_DOMAIN_UNMANAGED: - switch (attr) { - case DOMAIN_ATTR_NESTING: - if (smmu_domain->smmu) { - ret = -EPERM; - goto out_unlock; - } - - if (*(int *)data) - smmu_domain->stage = 
ARM_SMMU_DOMAIN_NESTED; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S1; - break; - default: - ret = -ENODEV; - } - break; case IOMMU_DOMAIN_DMA: switch(attr) { case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: @@ -2743,11 +2704,25 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, ret = -EINVAL; } -out_unlock: mutex_unlock(&smmu_domain->init_mutex); return ret; } +static int arm_smmu_enable_nesting(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + int ret = 0; + + mutex_lock(&smmu_domain->init_mutex); + if (smmu_domain->smmu) + ret = -EPERM; + else + smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; + mutex_unlock(&smmu_domain->init_mutex); + + return ret; +} + static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args) { return iommu_fwspec_add_ids(dev, args->args, 1); @@ -2826,6 +2801,7 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .domain_get_attr = arm_smmu_domain_get_attr, .domain_set_attr = arm_smmu_domain_set_attr, + .enable_nesting = arm_smmu_enable_nesting, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = arm_smmu_put_resv_regions, diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 056a5b1d00c18ead6d6594d9e72053f4edc822f3..dcd9a6735ece863c3876e0ea210ca2943c80b2ff 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -861,10 +861,10 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_IDENTITY) - return NULL; + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_IDENTITY) { + if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) + return NULL; + } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -874,12 +874,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) if (!smmu_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && (using_legacy_binding || - iommu_get_dma_cookie(&smmu_domain->domain))) { - kfree(smmu_domain); - return NULL; - } - mutex_init(&smmu_domain->init_mutex); spin_lock_init(&smmu_domain->cb_lock); @@ -894,7 +888,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) * Free the domain resources. We assume that all devices have * already been detached. 
*/ - iommu_put_dma_cookie(domain); arm_smmu_destroy_domain_context(domain); kfree(smmu_domain); } @@ -1474,9 +1467,6 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, switch(domain->type) { case IOMMU_DOMAIN_UNMANAGED: switch (attr) { - case DOMAIN_ATTR_NESTING: - *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); - return 0; default: return -ENODEV; } @@ -1495,6 +1485,21 @@ } } +static int arm_smmu_enable_nesting(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + int ret = 0; + + mutex_lock(&smmu_domain->init_mutex); + if (smmu_domain->smmu) + ret = -EPERM; + else + smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; + mutex_unlock(&smmu_domain->init_mutex); + + return ret; +} + static int arm_smmu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) { @@ -1506,17 +1511,6 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, switch(domain->type) { case IOMMU_DOMAIN_UNMANAGED: switch (attr) { - case DOMAIN_ATTR_NESTING: - if (smmu_domain->smmu) { - ret = -EPERM; - goto out_unlock; - } - - if (*(int *)data) - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S1; - break; default: ret = -ENODEV; } @@ -1593,6 +1587,7 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .domain_get_attr = arm_smmu_domain_get_attr, .domain_set_attr = arm_smmu_domain_set_attr, + .enable_nesting = arm_smmu_enable_nesting, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = arm_smmu_put_resv_regions, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 76bd2309e023451f07aa4e02665fd12e302fbdf9..4221422f8b14fc87a6ae969dd9b81a1ac049dc60 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -21,8 +21,11 @@ #include #include #include +#include #include #include +#include +#include struct iommu_dma_msi_page { struct list_head list; @@ -49,6 +52,31 @@ struct iommu_dma_cookie { struct iommu_domain *fq_domain; }; +static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); +bool iommu_dma_forcedac __read_mostly; + +static int __init iommu_dma_forcedac_setup(char *str) +{ + int ret = kstrtobool(str, &iommu_dma_forcedac); + + if (!ret && iommu_dma_forcedac) + pr_info("Forcing DAC for PCI devices\n"); + return ret; +} +early_param("iommu.forcedac", iommu_dma_forcedac_setup); + +static void iommu_dma_entry_dtor(unsigned long data) +{ + struct page *freelist = (struct page *)data; + + while (freelist) { + unsigned long p = (unsigned long)page_address(freelist); + + freelist = freelist->freelist; + free_page(p); + } +} + static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie) { if (cookie->type == IOMMU_DMA_IOVA_COOKIE) @@ -147,6 +175,80 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) } EXPORT_SYMBOL(iommu_put_dma_cookie); +#define PCI_PASID_MAX 0x100000 /* TODO: Use per dev limits */ +/** + * iommu_enable_pasid_dma() - Enable in-kernel DMA requests with PASID + * @dev: Device to be enabled + * + * DMA requests with PASID are mapped the same way as legacy DMA. If the + * device is in pass-through, the PASID also passes through. If the device + * uses an IOVA-mapped domain, the supervisor PASID points to the same IOVA + * page table. + * + * Return: the kernel PASID to be used for DMA, or INVALID_IOASID on failure + */ +ioasid_t iommu_enable_pasid_dma(struct device *dev) +{ + struct iommu_domain *dom; + u32 pasid; + + if (dev->pasid) { + dev_err(dev, "PASID DMA already enabled\n"); + return dev->pasid; + } + dom = iommu_get_domain_for_dev(dev); + if (!dom || !dom->ops) + return INVALID_IOASID; + + pasid = ioasid_alloc(host_pasid_set, IOASID_ALLOC_BASE, PCI_PASID_MAX, dev); + dev_alert(dev, "%s: PASID %u\n", __func__, pasid); + /* + * Use the reserved kernel PASID for all devices. For now, + * there is no need to have different PASIDs for in-kernel use. + */ + if (!dom->ops->enable_pasid_dma || dom->ops->enable_pasid_dma(dev, pasid)) + return INVALID_IOASID; + /* Used for device IOTLB flush */ + dev->pasid = pasid; + + return pasid; +} +EXPORT_SYMBOL(iommu_enable_pasid_dma); + +/** + * iommu_disable_pasid_dma() - Disable in-kernel DMA requests with PASID + * @dev: Device whose PASID DMA is to be disabled + * + * It is the device driver's responsibility to ensure that no DMA requests + * tagged with the kernel PASID are in flight before calling this function. + * The IOMMU driver ensures that the PASID cache and IOTLB entries related + * to the kernel PASID are cleared and drained. + * + * Return: 0 on success, or an error code on failure + */ +int iommu_disable_pasid_dma(struct device *dev) +{ + struct iommu_domain *dom; + int ret = 0; + + if (!dev->pasid) { + dev_err(dev, "PASID DMA not enabled\n"); + return -ENODEV; + } + dom = iommu_get_domain_for_dev(dev); + if (!dom || !dom->ops->disable_pasid_dma) + return -ENOTSUPP; + + dev_alert(dev, "%s: PASID %u\n", __func__, dev->pasid); + ret = dom->ops->disable_pasid_dma(dev); + if (!ret) + ioasid_free(host_pasid_set, dev->pasid); + dev->pasid = 0; + + return ret; +} +EXPORT_SYMBOL(iommu_disable_pasid_dma); + /** * iommu_dma_get_resv_regions - Reserved region driver helper * @dev: Device from iommu_get_resv_regions() @@ -286,20 +388,49 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad) domain->ops->flush_iotlb_all(domain); } +static bool dev_is_untrusted(struct device *dev) +{ + return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; +} + +/* sysfs updates are serialised by the mutex of the group owning @domain */ +int iommu_dma_init_fq(struct iommu_domain *domain) +{ + struct iommu_dma_cookie *cookie = domain->iova_cookie; + int ret; + + if (cookie->fq_domain) + return 0; + + ret = init_iova_flush_queue(&cookie->iovad, iommu_dma_flush_iotlb_all, + iommu_dma_entry_dtor); + if (ret) { + pr_warn("iova flush queue initialization failed\n"); + return ret; + } + /* + * Prevent incomplete iovad->fq being observable. Pairs with path from + * __iommu_dma_unmap() through iommu_dma_free_iova() to queue_iova() + */ + smp_wmb(); + WRITE_ONCE(cookie->fq_domain, domain); + return 0; +} + /** * iommu_dma_init_domain - Initialise a DMA mapping domain * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() * @base: IOVA at which the mappable address space starts - * @size: Size of IOVA space + * @limit: Last address of the IOVA space * @dev: Device the domain is being initialised for * - * @base and @size should be exact multiples of IOMMU page granularity to + * @base and @limit + 1 should be exact multiples of IOMMU page granularity to * avoid rounding surprises. If necessary, we reserve the page at address 0 * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but * any change which could make prior IOVAs invalid will fail. 
*/ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, - u64 size, struct device *dev) + dma_addr_t limit, struct device *dev) { struct iommu_dma_cookie *cookie = domain->iova_cookie; unsigned long order, base_pfn; @@ -318,7 +449,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, /* Check the domain allows at least some access to the device... */ if (domain->geometry.force_aperture) { if (base > domain->geometry.aperture_end || - base + size <= domain->geometry.aperture_start) { + limit < domain->geometry.aperture_start) { pr_warn("specified DMA range outside IOMMU capability\n"); return -EFAULT; } @@ -340,10 +471,14 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); - if (!cookie->fq_domain && !iommu_domain_get_attr(domain, - DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) { - cookie->fq_domain = domain; - init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, NULL); + if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) && + !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && + attr) { + if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, + iommu_dma_entry_dtor)) + pr_warn("iova flush queue initialization failed\n"); + else + cookie->fq_domain = domain; } if (!dev) @@ -404,14 +539,13 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1))) iova_len = roundup_pow_of_two(iova_len); - if (dev->bus_dma_mask) - dma_limit &= dev->bus_dma_mask; + dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit); if (domain->geometry.force_aperture) dma_limit = min(dma_limit, domain->geometry.aperture_end); /* Try to get PCI devices a SAC address */ - if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev)) + if (dma_limit > DMA_BIT_MASK(32) && !iommu_dma_forcedac && dev_is_pci(dev)) iova = alloc_iova_fast(iovad, iova_len, DMA_BIT_MASK(32) >> shift, false); @@ -423,16 +557,17 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, } static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, - dma_addr_t iova, size_t size) + dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather) { struct iova_domain *iovad = &cookie->iovad; /* The MSI case is only ever cleaning up its most recent allocation */ if (cookie->type == IOMMU_DMA_MSI_COOKIE) cookie->msi_iova -= size; - else if (cookie->fq_domain) /* non-strict mode */ + else if (gather && gather->queued) queue_iova(iovad, iova_pfn(iovad, iova), - size >> iova_shift(iovad), 0); + size >> iova_shift(iovad), + (unsigned long)gather->freelist); else free_iova_fast(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad)); @@ -451,17 +586,38 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, dma_addr -= iova_off; size = iova_align(iovad, size + iova_off); iommu_iotlb_gather_init(&iotlb_gather); + iotlb_gather.queued = READ_ONCE(cookie->fq_domain); unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather); WARN_ON(unmapped != size); - if (!cookie->fq_domain) - iommu_tlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size); + if (!iotlb_gather.queued) + iommu_iotlb_sync(domain, &iotlb_gather); + iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); +} + +static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct iommu_domain *domain = 
iommu_get_dma_domain(dev); + phys_addr_t phys; + + phys = iommu_iova_to_phys(domain, dma_addr); + if (WARN_ON(!phys)) + return; + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) + arch_sync_dma_for_cpu(phys, size, dir); + + __iommu_dma_unmap(dev, dma_addr, size); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, - size_t size, int prot) + size_t size, int prot, dma_addr_t dma_mask) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -469,19 +625,72 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size_t iova_off = iova_offset(iovad, phys); dma_addr_t iova; + if (static_branch_unlikely(&iommu_deferred_attach_enabled) && + iommu_deferred_attach(dev, domain)) + return DMA_MAPPING_ERROR; + size = iova_align(iovad, size + iova_off); - iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev); + iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev); if (!iova) return DMA_MAPPING_ERROR; - if (iommu_map(domain, iova, phys - iova_off, size, prot)) { - iommu_dma_free_iova(cookie, iova, size); + if (iommu_map_atomic(domain, iova, phys - iova_off, size, prot)) { + iommu_dma_free_iova(cookie, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; } +static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, + size_t org_size, dma_addr_t dma_mask, bool coherent, + enum dma_data_direction dir, unsigned long attrs) +{ + int prot = dma_info_to_prot(dir, coherent, attrs); + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t aligned_size = org_size; + void *padding_start; + size_t padding_size; + dma_addr_t iova; + + /* + * If both the physical buffer start address and size are + * page aligned, we don't need to use a bounce page. + */ + if (IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev) && + iova_offset(iovad, phys | org_size)) { + aligned_size = iova_align(iovad, org_size); + phys = swiotlb_tbl_map_single(dev, phys, org_size, + aligned_size, dir, attrs); + + if (phys == DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; + + /* Cleanup the padding area. 
*/ + padding_start = phys_to_virt(phys); + padding_size = aligned_size; + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) { + padding_start += org_size; + padding_size -= org_size; + } + + memset(padding_start, 0, padding_size); + } + + if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, org_size, dir); + + iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask); + if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys)) + swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs); + return iova; +} + static void __iommu_dma_free_pages(struct page **pages, int count) { while (count--) @@ -499,13 +708,16 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, if (!order_mask) return NULL; - pages = kvzalloc(count * sizeof(*pages), GFP_KERNEL); + pages = kvcalloc(count, sizeof(*pages), GFP_KERNEL); if (!pages) return NULL; /* IOMMU can map any pages, so himem can also be used here */ gfp |= __GFP_NOWARN | __GFP_HIGHMEM; + /* It makes no sense to muck about with huge pages */ + gfp &= ~__GFP_COMP; + while (count) { struct page *page = NULL; unsigned int order_size; @@ -526,15 +738,9 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, page = alloc_pages_node(nid, alloc_flags, order); if (!page) continue; - if (!order) - break; - if (!PageCompound(page)) { + if (order) split_page(page, order); - break; - } else if (!split_huge_page(page)) { - break; - } - __free_pages(page, order); + break; } if (!page) { __iommu_dma_free_pages(pages, i); @@ -578,6 +784,10 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, *dma_handle = DMA_MAPPING_ERROR; + if (static_branch_unlikely(&iommu_deferred_attach_enabled) && + iommu_deferred_attach(dev, domain)) + return NULL; + min_size = alloc_sizes & -alloc_sizes; if (min_size < PAGE_SIZE) { min_size = PAGE_SIZE; @@ -610,7 +820,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, arch_dma_prep_coherent(sg_page(sg), sg->length); } - if (iommu_map_sg(domain, iova, sgt.sgl, sgt.orig_nents, ioprot) + if (iommu_map_sg_atomic(domain, iova, sgt.sgl, sgt.orig_nents, ioprot) < size) goto out_free_sg; @@ -628,7 +838,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, out_free_sg: sg_free_table(&sgt); out_free_iova: - iommu_dma_free_iova(cookie, iova, size); + iommu_dma_free_iova(cookie, iova, size, NULL); out_free_pages: __iommu_dma_free_pages(pages, count); return NULL; @@ -654,11 +864,15 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, { phys_addr_t phys; - if (dev_is_dma_coherent(dev)) + if (dev_is_dma_coherent(dev) && !dev_is_untrusted(dev)) return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - arch_sync_dma_for_cpu(dev, phys, size, dir); + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_cpu(phys, size, dir); + + if (is_swiotlb_buffer(phys)) + swiotlb_sync_single_for_cpu(dev, phys, size, dir); } static void iommu_dma_sync_single_for_device(struct device *dev, @@ -666,11 +880,15 @@ static void iommu_dma_sync_single_for_device(struct device *dev, { phys_addr_t phys; - if (dev_is_dma_coherent(dev)) + if (dev_is_dma_coherent(dev) && !dev_is_untrusted(dev)) return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - arch_sync_dma_for_device(dev, phys, size, dir); + if (is_swiotlb_buffer(phys)) + swiotlb_sync_single_for_device(dev, phys, size, dir); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(phys, size, dir); } static void 
iommu_dma_sync_sg_for_cpu(struct device *dev, @@ -680,11 +898,13 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg; int i; - if (dev_is_dma_coherent(dev)) - return; - - for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + if (dev_is_untrusted(dev)) + for_each_sg(sgl, sg, nelems, i) + iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), + sg->length, dir); + else if (!dev_is_dma_coherent(dev)) + for_each_sg(sgl, sg, nelems, i) + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); } static void iommu_dma_sync_sg_for_device(struct device *dev, @@ -694,11 +914,14 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg; int i; - if (dev_is_dma_coherent(dev)) - return; - - for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); + if (dev_is_untrusted(dev)) + for_each_sg(sgl, sg, nelems, i) + iommu_dma_sync_single_for_device(dev, + sg_dma_address(sg), + sg->length, dir); + else if (!dev_is_dma_coherent(dev)) + for_each_sg(sgl, sg, nelems, i) + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, @@ -707,22 +930,15 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, { phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); - int prot = dma_info_to_prot(dir, coherent, attrs); - dma_addr_t dma_handle; - dma_handle =__iommu_dma_map(dev, phys, size, prot); - if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - dma_handle != DMA_MAPPING_ERROR) - arch_sync_dma_for_device(dev, phys, size, dir); - return dma_handle; + return __iommu_dma_map_swiotlb(dev, phys, size, dma_get_mask(dev), + coherent, dir, attrs); } static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - iommu_dma_sync_single_for_cpu(dev, dma_handle, size, dir); - __iommu_dma_unmap(dev, dma_handle, size); + __iommu_dma_unmap_swiotlb(dev, dma_handle, size, dir, attrs); } /* @@ -800,6 +1016,39 @@ static void __invalidate_sg(struct scatterlist *sg, int nents) } } +static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) + __iommu_dma_unmap_swiotlb(dev, sg_dma_address(s), + sg_dma_len(s), dir, attrs); +} + +static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + sg_dma_address(s) = __iommu_dma_map_swiotlb(dev, sg_phys(s), + s->length, dma_get_mask(dev), + dev_is_dma_coherent(dev), dir, attrs); + if (sg_dma_address(s) == DMA_MAPPING_ERROR) + goto out_unmap; + sg_dma_len(s) = s->length; + } + + return nents; + +out_unmap: + iommu_dma_unmap_sg_swiotlb(dev, sg, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); + return -EIO; +} + /* * The DMA API client is passing in a scatterlist which could describe * any old buffer layout, but the IOMMU API requires everything to be @@ -818,8 +1067,18 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, dma_addr_t iova; size_t iova_len = 0; unsigned long mask = dma_get_seg_boundary(dev); + ssize_t ret; int i; + if (static_branch_unlikely(&iommu_deferred_attach_enabled)) { + 
ret = iommu_deferred_attach(dev, domain); + if (ret) + goto out; + } + + if (dev_is_untrusted(dev)) + return iommu_dma_map_sg_swiotlb(dev, sg, nents, dir, attrs); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) iommu_dma_sync_sg_for_device(dev, sg, nents, dir); @@ -863,23 +1122,29 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, } iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev); - if (!iova) + if (!iova) { + ret = -ENOMEM; goto out_restore_sg; + } /* * We'll leave any physical concatenation to the IOMMU driver's * implementation - it knows better than we do. */ - if (iommu_map_sg(domain, iova, sg, nents, prot) < iova_len) + ret = iommu_map_sg_atomic(domain, iova, sg, nents, prot); + if (ret < iova_len) goto out_free_iova; return __finalise_sg(dev, sg, nents, iova); out_free_iova: - iommu_dma_free_iova(cookie, iova, iova_len); + iommu_dma_free_iova(cookie, iova, iova_len, NULL); out_restore_sg: __invalidate_sg(sg, nents); - return 0; +out: + if (ret != -ENOMEM) + return -EINVAL; + return ret; } static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, @@ -889,6 +1154,11 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, struct scatterlist *tmp; int i; + if (dev_is_untrusted(dev)) { + iommu_dma_unmap_sg_swiotlb(dev, sg, nents, dir, attrs); + return; + } + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) iommu_dma_sync_sg_for_cpu(dev, sg, nents, dir); @@ -910,7 +1180,8 @@ static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs) { return __iommu_dma_map(dev, phys, size, - dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO); + dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, + dma_get_mask(dev)); } static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, @@ -927,7 +1198,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) /* Non-coherent atomic allocation? Easy */ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, alloc_size)) + dma_free_from_pool(dev, cpu_addr, alloc_size)) return; if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) { @@ -1010,13 +1281,15 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !gfpflags_allow_blocking(gfp) && !coherent) - cpu_addr = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr, + gfp, NULL); else cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs); if (!cpu_addr) return NULL; - *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot); + *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot, + dev->coherent_dma_mask); if (*handle == DMA_MAPPING_ERROR) { __iommu_dma_free(dev, size, cpu_addr); return NULL; @@ -1093,6 +1366,8 @@ static unsigned long iommu_dma_get_merge_boundary(struct device *dev) static const struct dma_map_ops iommu_dma_ops = { .alloc = iommu_dma_alloc, .free = iommu_dma_free, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, .mmap = iommu_dma_mmap, .get_sgtable = iommu_dma_get_sgtable, .map_page = iommu_dma_map_page, @@ -1112,7 +1387,7 @@ static const struct dma_map_ops iommu_dma_ops = { * The IOMMU core code allocates the default DMA domain, which the underlying * IOMMU driver needs to support via the dma-iommu layer. 
*/ -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size) +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); @@ -1123,8 +1398,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size) * The IOMMU core code allocates the default DMA domain, which the * underlying IOMMU driver needs to support via the dma-iommu layer. */ - if (domain->type == IOMMU_DOMAIN_DMA) { - if (iommu_dma_init_domain(domain, dma_base, size, dev)) + if (iommu_is_dma_domain(domain)) { + if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev)) goto out_err; dev->dma_ops = &iommu_dma_ops; } @@ -1167,7 +1442,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return msi_page; out_free_iova: - iommu_dma_free_iova(cookie, iova, size); + iommu_dma_free_iova(cookie, iova, size, NULL); out_free_page: kfree(msi_page); return NULL; @@ -1220,6 +1495,9 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc, static int iommu_dma_init(void) { + if (is_kdump_kernel()) + static_branch_enable(&iommu_deferred_attach_enabled); + return iova_cache_get(); } arch_initcall(iommu_dma_init); diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 06828e2698d5bf0e5053a81f4c9bf9c9692a3c56..be9102454e1c962107542b52f0c17d46d8df0051 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -473,53 +473,6 @@ static int update_domain_mapping(struct fsl_dma_domain *dma_domain, u32 wnd_nr) return ret; } -static int disable_domain_win(struct fsl_dma_domain *dma_domain, u32 wnd_nr) -{ - struct device_domain_info *info; - int ret = 0; - - list_for_each_entry(info, &dma_domain->devices, link) { - if (dma_domain->win_cnt == 1 && dma_domain->enabled) { - ret = pamu_disable_liodn(info->liodn); - if (!ret) - dma_domain->enabled = 0; - } else { - ret = pamu_disable_spaace(info->liodn, wnd_nr); - } - } - - return ret; -} - -static void fsl_pamu_window_disable(struct iommu_domain *domain, u32 wnd_nr) -{ - struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); - unsigned long flags; - int ret; - - spin_lock_irqsave(&dma_domain->domain_lock, flags); - if (!dma_domain->win_arr) { - pr_debug("Number of windows not configured\n"); - spin_unlock_irqrestore(&dma_domain->domain_lock, flags); - return; - } - - if (wnd_nr >= dma_domain->win_cnt) { - pr_debug("Invalid window index\n"); - spin_unlock_irqrestore(&dma_domain->domain_lock, flags); - return; - } - - if (dma_domain->win_arr[wnd_nr].valid) { - ret = disable_domain_win(dma_domain, wnd_nr); - if (!ret) { - dma_domain->win_arr[wnd_nr].valid = 0; - dma_domain->mapped--; - } - } - - spin_unlock_irqrestore(&dma_domain->domain_lock, flags); -} static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot) @@ -1044,7 +997,6 @@ static const struct iommu_ops fsl_pamu_ops = { .attach_dev = fsl_pamu_attach_device, .detach_dev = fsl_pamu_detach_device, .domain_window_enable = fsl_pamu_window_enable, - .domain_window_disable = fsl_pamu_window_disable, .iova_to_phys = fsl_pamu_iova_to_phys, .domain_set_attr = fsl_pamu_set_domain_attr, .domain_get_attr = fsl_pamu_get_domain_attr, diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index f0fe5030acd361a418cbc95d35bb2e11b7366dfa..2f0be3f8119f36de9feeee018e3cddc56b085f1c 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -52,7 +52,7 @@ static int 
hyperv_ir_set_affinity(struct irq_data *data, return ret; entry = data->chip_data; - entry->dest = cfg->dest_apicid; + entry->destid_0_7 = cfg->dest_apicid; entry->vector = cfg->vector; send_cleanup_vector(cfg); @@ -89,20 +89,6 @@ static int hyperv_irq_remapping_alloc(struct irq_domain *domain, irq_data->chip = &hyperv_ir_chip; - /* - * If there is interrupt remapping function of IOMMU, setting irq - * affinity only needs to change IRTE of IOMMU. But Hyper-V doesn't - * support interrupt remapping function, setting irq affinity of IO-APIC - * interrupts still needs to change IO-APIC registers. But ioapic_ - * configure_entry() will ignore value of cfg->vector and cfg-> - * dest_apicid when IO-APIC's parent irq domain is not the vector - * domain.(See ioapic_configure_entry()) In order to setting vector - * and dest_apicid to IO-APIC register, IO-APIC entry pointer is saved - * in the chip_data and hyperv_irq_remapping_activate()/hyperv_ir_set_ - * affinity() set vector and dest_apicid directly into IO-APIC entry. - */ - irq_data->chip_data = info->ioapic_entry; - /* * Hypver-V IO APIC irq affinity should be in the scope of * ioapic_max_cpumask because no irq remapping support. @@ -125,7 +111,7 @@ static int hyperv_irq_remapping_activate(struct irq_domain *domain, struct irq_cfg *cfg = irqd_cfg(irq_data); struct IO_APIC_route_entry *entry = irq_data->chip_data; - entry->dest = cfg->dest_apicid; + entry->destid_0_7 = cfg->dest_apicid; entry->vector = cfg->vector; return 0; @@ -182,18 +168,9 @@ static int __init hyperv_enable_irq_remapping(void) return IRQ_REMAP_X2APIC_MODE; } -static struct irq_domain *hyperv_get_ir_irq_domain(struct irq_alloc_info *info) -{ - if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) - return ioapic_ir_domain; - else - return NULL; -} - struct irq_remap_ops hyperv_irq_remap_ops = { .prepare = hyperv_prepare_irq_remapping, .enable = hyperv_enable_irq_remapping, - .get_ir_irq_domain = hyperv_get_ir_irq_domain, }; #endif diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..80a95acdd03a6b2a748a50558d8ef19c74bd21e5 --- /dev/null +++ b/drivers/iommu/intel/Kconfig @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Intel IOMMU support +config DMAR_TABLE + bool + +config DMAR_PERF + bool + +config DMAR_DEBUG + bool + +config INTEL_IOMMU + bool "Support for Intel IOMMU using DMA Remapping Devices" + depends on PCI_MSI && ACPI && (X86 || IA64) + select IOMMU_API + select IOMMU_IOVA + select NEED_DMA_MAP_STATE + select DMAR_TABLE + select SWIOTLB + select IOASID + select IOASID_USER + select IOMMU_DMA + select PCI_ATS + help + DMA remapping (DMAR) devices support enables independent address + translations for Direct Memory Access (DMA) from devices. + These DMA remapping devices are reported via ACPI tables + and include PCI device scope covered by these DMA + remapping devices. + +if INTEL_IOMMU + +config INTEL_IOMMU_DEBUGFS + bool "Export Intel IOMMU internals in Debugfs" + depends on INTEL_IOMMU && IOMMU_DEBUGFS + select DMAR_PERF + select DMAR_DEBUG + help + !!!WARNING!!! + + DO NOT ENABLE THIS OPTION UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!!! + + Expose Intel IOMMU internals in Debugfs. + + This option is -NOT- intended for production environments, and should + only be enabled for debugging Intel IOMMU. 
+ +config INTEL_IOMMU_SVM + bool "Support for Shared Virtual Memory with Intel IOMMU" + depends on X86_64 + select PCI_PASID + select PCI_PRI + select MMU_NOTIFIER + select IOASID + select IOMMU_SVA + help + Shared Virtual Memory (SVM) provides a facility for devices + to access DMA resources through process address space by + means of a Process Address Space ID (PASID). + +config INTEL_IOMMU_DEFAULT_ON + bool "Enable Intel DMA Remapping Devices by default" + default y + help + Selecting this option will enable a DMAR device at boot time if + one is found. If this option is not selected, DMAR support can + be enabled by passing intel_iommu=on to the kernel. + +config INTEL_IOMMU_BROKEN_GFX_WA + bool "Workaround broken graphics drivers (going away soon)" + depends on BROKEN && X86 + help + Current Graphics drivers tend to use physical address + for DMA and avoid using DMA APIs. Setting this config + option permits the IOMMU driver to set a unity map for + all the OS-visible memory. Hence the driver can continue + to use physical addresses for DMA, at least until this + option is removed in the 2.6.32 kernel. + +config INTEL_IOMMU_FLOPPY_WA + def_bool y + depends on X86 + help + Floppy disk drivers are known to bypass DMA API calls + thereby failing to work when IOMMU is enabled. This + workaround will setup a 1:1 mapping for the first + 16MiB to make floppy (an ISA device) work. + +config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON + bool "Enable Intel IOMMU scalable mode by default" + default y + help + Selecting this option will enable by default the scalable mode if + hardware presents the capability. The scalable mode is defined in + VT-d 3.0. The scalable mode capability could be checked by reading + /sys/devices/virtual/iommu/dmar*/intel-iommu/ecap. If this option + is not selected, scalable mode support could also be enabled by + passing intel_iommu=sm_on to the kernel. If not sure, please use + the default value. 
+ +endif # INTEL_IOMMU diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..fa0dae16441cb50e712973063df5c6bde935ccf9 --- /dev/null +++ b/drivers/iommu/intel/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DMAR_TABLE) += dmar.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o +obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o +obj-$(CONFIG_DMAR_PERF) += perf.o +obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o +obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o +obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c new file mode 100644 index 0000000000000000000000000000000000000000..eace338c7dbac6df852b6d9da98c58f77d3ff689 --- /dev/null +++ b/drivers/iommu/intel/cap_audit.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cap_audit.c - audit iommu capabilities for boot time and hot plug + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Kyung Min Park + * Lu Baolu + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include +#include "cap_audit.h" + +static u64 intel_iommu_cap_sanity; +static u64 intel_iommu_ecap_sanity; + +static inline void check_irq_capabilities(struct intel_iommu *a, + struct intel_iommu *b) +{ + CHECK_FEATURE_MISMATCH(a, b, cap, pi_support, CAP_PI_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, eim_support, ECAP_EIM_MASK); +} + +static inline void check_dmar_capabilities(struct intel_iommu *a, + struct intel_iommu *b) +{ + MINIMAL_FEATURE_IOMMU(b, cap, CAP_MAMV_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_NFR_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_SLLPS_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_FRO_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_MGAW_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_SAGAW_MASK); + MINIMAL_FEATURE_IOMMU(b, cap, CAP_NDOMS_MASK); + MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_PSS_MASK); + MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_MHMV_MASK); + MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_IRO_MASK); + + CHECK_FEATURE_MISMATCH(a, b, cap, 5lp_support, CAP_FL5LP_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, fl1gp_support, CAP_FL1GP_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, read_drain, CAP_RD_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, write_drain, CAP_WD_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, pgsel_inv, CAP_PSI_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, zlr, CAP_ZLR_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, caching_mode, CAP_CM_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, phmr, CAP_PHMR_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, plmr, CAP_PLMR_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, rwbf, CAP_RWBF_MASK); + CHECK_FEATURE_MISMATCH(a, b, cap, afl, CAP_AFL_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, rps, ECAP_RPS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, smpwc, ECAP_SMPWC_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, flts, ECAP_FLTS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, slts, ECAP_SLTS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, nwfs, ECAP_NWFS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, slads, ECAP_SLADS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, vcs, ECAP_VCS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, smts, ECAP_SMTS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, pds, ECAP_PDS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, dit, ECAP_DIT_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, pasid, ECAP_PASID_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, eafs, ECAP_EAFS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, srs, ECAP_SRS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, ers, ECAP_ERS_MASK); + 
CHECK_FEATURE_MISMATCH(a, b, ecap, prs, ECAP_PRS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, nest, ECAP_NEST_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, mts, ECAP_MTS_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, sc_support, ECAP_SC_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, pass_through, ECAP_PT_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, dev_iotlb_support, ECAP_DT_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, qis, ECAP_QI_MASK); + CHECK_FEATURE_MISMATCH(a, b, ecap, coherent, ECAP_C_MASK); +} + +static int cap_audit_hotplug(struct intel_iommu *iommu, enum cap_audit_type type) +{ + bool mismatch = false; + u64 old_cap = intel_iommu_cap_sanity; + u64 old_ecap = intel_iommu_ecap_sanity; + + if (type == CAP_AUDIT_HOTPLUG_IRQR) { + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pi_support, CAP_PI_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eim_support, ECAP_EIM_MASK); + goto out; + } + + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, 5lp_support, CAP_FL5LP_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl1gp_support, CAP_FL1GP_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, read_drain, CAP_RD_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, write_drain, CAP_WD_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pgsel_inv, CAP_PSI_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, zlr, CAP_ZLR_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, caching_mode, CAP_CM_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, phmr, CAP_PHMR_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, plmr, CAP_PLMR_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, rwbf, CAP_RWBF_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, afl, CAP_AFL_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, rps, ECAP_RPS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smpwc, ECAP_SMPWC_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, flts, ECAP_FLTS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slts, ECAP_SLTS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nwfs, ECAP_NWFS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slads, ECAP_SLADS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, vcs, ECAP_VCS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smts, ECAP_SMTS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pds, ECAP_PDS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dit, ECAP_DIT_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pasid, ECAP_PASID_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eafs, ECAP_EAFS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, srs, ECAP_SRS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, ers, ECAP_ERS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, prs, ECAP_PRS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nest, ECAP_NEST_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, mts, ECAP_MTS_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, sc_support, ECAP_SC_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pass_through, ECAP_PT_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dev_iotlb_support, ECAP_DT_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, qis, ECAP_QI_MASK); + CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, coherent, ECAP_C_MASK); + + /* Abort hot plug if the hot plug iommu feature is smaller than global */ + MINIMAL_FEATURE_HOTPLUG(iommu, cap, max_amask_val, CAP_MAMV_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, num_fault_regs, CAP_NFR_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, super_page_val, CAP_SLLPS_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, 
fault_reg_offset, CAP_FRO_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, mgaw, CAP_MGAW_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, sagaw, CAP_SAGAW_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, cap, ndoms, CAP_NDOMS_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, ecap, pss, ECAP_PSS_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, ecap, max_handle_mask, ECAP_MHMV_MASK, mismatch); + MINIMAL_FEATURE_HOTPLUG(iommu, ecap, iotlb_offset, ECAP_IRO_MASK, mismatch); + +out: + if (mismatch) { + intel_iommu_cap_sanity = old_cap; + intel_iommu_ecap_sanity = old_ecap; + return -EFAULT; + } + + return 0; +} + +static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) +{ + struct dmar_drhd_unit *d; + struct intel_iommu *i; + + rcu_read_lock(); + if (list_empty(&dmar_drhd_units)) + goto out; + + for_each_active_iommu(i, d) { + if (!iommu) { + intel_iommu_ecap_sanity = i->ecap; + intel_iommu_cap_sanity = i->cap; + iommu = i; + continue; + } + + if (type == CAP_AUDIT_STATIC_DMAR) + check_dmar_capabilities(iommu, i); + else + check_irq_capabilities(iommu, i); + } + + /* + * If the system is sane to support scalable mode, either SL or FL + * should be sane. + */ + if (intel_cap_smts_sanity() && + !intel_cap_flts_sanity() && !intel_cap_slts_sanity()) + return -EOPNOTSUPP; + +out: + rcu_read_unlock(); + return 0; +} + +int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu) +{ + switch (type) { + case CAP_AUDIT_STATIC_DMAR: + case CAP_AUDIT_STATIC_IRQR: + return cap_audit_static(iommu, type); + case CAP_AUDIT_HOTPLUG_DMAR: + case CAP_AUDIT_HOTPLUG_IRQR: + return cap_audit_hotplug(iommu, type); + default: + break; + } + + return -EFAULT; +} + +bool intel_cap_smts_sanity(void) +{ + return ecap_smts(intel_iommu_ecap_sanity); +} + +bool intel_cap_pasid_sanity(void) +{ + return ecap_pasid(intel_iommu_ecap_sanity); +} + +bool intel_cap_nest_sanity(void) +{ + return ecap_nest(intel_iommu_ecap_sanity); +} + +bool intel_cap_flts_sanity(void) +{ + return ecap_flts(intel_iommu_ecap_sanity); +} + +bool intel_cap_slad_sanity(void) +{ + return ecap_slts(intel_iommu_ecap_sanity) && + ecap_slads(intel_iommu_ecap_sanity); +} + +bool intel_cap_slts_sanity(void) +{ + return ecap_slts(intel_iommu_ecap_sanity); +} diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h new file mode 100644 index 0000000000000000000000000000000000000000..66c59d7e713ccd8b92449e0b53069b95ba15a1a3 --- /dev/null +++ b/drivers/iommu/intel/cap_audit.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * cap_audit.h - audit iommu capabilities header + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Kyung Min Park + */ + +/* + * Capability Register Mask + */ +#define CAP_FL5LP_MASK BIT_ULL(60) +#define CAP_PI_MASK BIT_ULL(59) +#define CAP_FL1GP_MASK BIT_ULL(56) +#define CAP_RD_MASK BIT_ULL(55) +#define CAP_WD_MASK BIT_ULL(54) +#define CAP_MAMV_MASK GENMASK_ULL(53, 48) +#define CAP_NFR_MASK GENMASK_ULL(47, 40) +#define CAP_PSI_MASK BIT_ULL(39) +#define CAP_SLLPS_MASK GENMASK_ULL(37, 34) +#define CAP_FRO_MASK GENMASK_ULL(33, 24) +#define CAP_ZLR_MASK BIT_ULL(22) +#define CAP_MGAW_MASK GENMASK_ULL(21, 16) +#define CAP_SAGAW_MASK GENMASK_ULL(12, 8) +#define CAP_CM_MASK BIT_ULL(7) +#define CAP_PHMR_MASK BIT_ULL(6) +#define CAP_PLMR_MASK BIT_ULL(5) +#define CAP_RWBF_MASK BIT_ULL(4) +#define CAP_AFL_MASK BIT_ULL(3) +#define CAP_NDOMS_MASK GENMASK_ULL(2, 0) + +/* + * Extended Capability Register Mask + */ +#define ECAP_RPS_MASK BIT_ULL(49) +#define 
ECAP_SMPWC_MASK BIT_ULL(48) +#define ECAP_FLTS_MASK BIT_ULL(47) +#define ECAP_SLTS_MASK BIT_ULL(46) +#define ECAP_SLADS_MASK BIT_ULL(45) +#define ECAP_VCS_MASK BIT_ULL(44) +#define ECAP_SMTS_MASK BIT_ULL(43) +#define ECAP_PDS_MASK BIT_ULL(42) +#define ECAP_DIT_MASK BIT_ULL(41) +#define ECAP_PASID_MASK BIT_ULL(40) +#define ECAP_PSS_MASK GENMASK_ULL(39, 35) +#define ECAP_EAFS_MASK BIT_ULL(34) +#define ECAP_NWFS_MASK BIT_ULL(33) +#define ECAP_SRS_MASK BIT_ULL(31) +#define ECAP_ERS_MASK BIT_ULL(30) +#define ECAP_PRS_MASK BIT_ULL(29) +#define ECAP_NEST_MASK BIT_ULL(26) +#define ECAP_MTS_MASK BIT_ULL(25) +#define ECAP_MHMV_MASK GENMASK_ULL(23, 20) +#define ECAP_IRO_MASK GENMASK_ULL(17, 8) +#define ECAP_SC_MASK BIT_ULL(7) +#define ECAP_PT_MASK BIT_ULL(6) +#define ECAP_EIM_MASK BIT_ULL(4) +#define ECAP_DT_MASK BIT_ULL(2) +#define ECAP_QI_MASK BIT_ULL(1) +#define ECAP_C_MASK BIT_ULL(0) + +/* Capabilities related to nested translation */ +#define VTD_CAP_MASK (CAP_FL1GP_MASK | CAP_FL5LP_MASK) + +#define VTD_ECAP_MASK (ECAP_PRS_MASK | ECAP_ERS_MASK | \ + ECAP_SRS_MASK | ECAP_EAFS_MASK | \ + ECAP_PASID_MASK) + +/* + * u64 intel_iommu_cap_sanity, intel_iommu_ecap_sanity will be adjusted as each + * IOMMU gets audited. + */ +#define DO_CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ +do { \ + if (cap##_##feature(a) != cap##_##feature(b)) { \ + intel_iommu_##cap##_sanity &= ~(MASK); \ + pr_info("IOMMU feature %s inconsistent", #feature); \ + } \ +} while (0) + +#define CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ + DO_CHECK_FEATURE_MISMATCH((a)->cap, (b)->cap, cap, feature, MASK) + +#define CHECK_FEATURE_MISMATCH_HOTPLUG(b, cap, feature, MASK) \ +do { \ + if (cap##_##feature(intel_iommu_##cap##_sanity)) \ + DO_CHECK_FEATURE_MISMATCH(intel_iommu_##cap##_sanity, \ + (b)->cap, cap, feature, MASK); \ +} while (0) + +#define MINIMAL_FEATURE_IOMMU(iommu, cap, MASK) \ +do { \ + u64 min_feature = intel_iommu_##cap##_sanity & (MASK); \ + min_feature = min_t(u64, min_feature, (iommu)->cap & (MASK)); \ + intel_iommu_##cap##_sanity = (intel_iommu_##cap##_sanity & ~(MASK)) | \ + min_feature; \ +} while (0) + +#define MINIMAL_FEATURE_HOTPLUG(iommu, cap, feature, MASK, mismatch) \ +do { \ + if ((intel_iommu_##cap##_sanity & (MASK)) > \ + (cap##_##feature((iommu)->cap))) \ + mismatch = true; \ + else \ + (iommu)->cap = ((iommu)->cap & ~(MASK)) | \ + (intel_iommu_##cap##_sanity & (MASK)); \ +} while (0) + +enum cap_audit_type { + CAP_AUDIT_STATIC_DMAR, + CAP_AUDIT_STATIC_IRQR, + CAP_AUDIT_HOTPLUG_DMAR, + CAP_AUDIT_HOTPLUG_IRQR, +}; + +bool intel_cap_smts_sanity(void); +bool intel_cap_pasid_sanity(void); +bool intel_cap_nest_sanity(void); +bool intel_cap_flts_sanity(void); +bool intel_cap_slad_sanity(void); +bool intel_cap_slts_sanity(void); + +static inline bool scalable_mode_support(void) +{ + return (intel_iommu_sm && intel_cap_smts_sanity()); +} + +static inline bool pasid_mode_support(void) +{ + return scalable_mode_support() && intel_cap_pasid_sanity(); +} + +static inline bool nested_mode_support(void) +{ + return scalable_mode_support() && intel_cap_nest_sanity(); +} + +static inline bool slad_support(void) +{ + return scalable_mode_support() && intel_cap_slad_sanity(); +} + +int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu); diff --git a/drivers/iommu/intel-iommu-debugfs.c b/drivers/iommu/intel/debugfs.c similarity index 56% rename from drivers/iommu/intel-iommu-debugfs.c rename to drivers/iommu/intel/debugfs.c index 
bdf095e9dbe0308e6ce0a0af99543d6b0a879847..2c1b9e5698169dd0b2570888a078a7562ff7b5b3 100644 --- a/drivers/iommu/intel-iommu-debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -5,6 +5,7 @@ * Authors: Gayatri Kammela * Sohil Mehta * Jacob Pan + * Lu Baolu */ #include @@ -14,7 +15,8 @@ #include -#include "intel-pasid.h" +#include "pasid.h" +#include "perf.h" struct tbl_walk { u16 bus; @@ -30,6 +32,9 @@ struct iommu_regset { const char *regs; }; +#define DEBUG_BUFFER_SIZE 1024 +static char debug_buf[DEBUG_BUFFER_SIZE]; + #define IOMMU_REGSET_ENTRY(_reg_) \ { DMAR_##_reg_##_REG, __stringify(_reg_) } @@ -300,6 +305,140 @@ static int dmar_translation_struct_show(struct seq_file *m, void *unused) } DEFINE_SHOW_ATTRIBUTE(dmar_translation_struct); +static inline unsigned long level_to_directory_size(int level) +{ + return BIT_ULL(VTD_PAGE_SHIFT + VTD_STRIDE_SHIFT * (level - 1)); +} + +static inline void +dump_page_info(struct seq_file *m, unsigned long iova, u64 *path) +{ + seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\n", + iova >> VTD_PAGE_SHIFT, path[5], path[4], + path[3], path[2], path[1]); +} + +static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, + int level, unsigned long start, + u64 *path) +{ + int i; + + if (level > 5 || level < 1) + return; + + for (i = 0; i < BIT_ULL(VTD_STRIDE_SHIFT); + i++, pde++, start += level_to_directory_size(level)) { + if (!dma_pte_present(pde)) + continue; + + path[level] = pde->val; + if (dma_pte_superpage(pde) || level == 1) + dump_page_info(m, start, path); + else + pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)), + level - 1, start, path); + path[level] = 0; + } +} + +static int show_device_domain_translation(struct device *dev, void *data) +{ + struct dmar_domain *domain = find_domain(dev); + struct seq_file *m = data; + u64 path[6] = { 0 }; + + if (!domain) + return 0; + + seq_printf(m, "Device %s with pasid %d @0x%llx\n", + dev_name(dev), domain->default_pasid, + (u64)virt_to_phys(domain->pgd)); + seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n"); + + pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); + seq_putc(m, '\n'); + + return 0; +} + +static int domain_translation_struct_show(struct seq_file *m, void *unused) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&device_domain_lock, flags); + ret = bus_for_each_dev(&pci_bus_type, NULL, m, + show_device_domain_translation); + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} +DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); + +static void invalidation_queue_entry_show(struct seq_file *m, + struct intel_iommu *iommu) +{ + int index, shift = qi_shift(iommu); + struct qi_desc *desc; + int offset; + + if (ecap_smts(iommu->ecap)) + seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tqw2\t\t\tqw3\t\t\tstatus\n"); + else + seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tstatus\n"); + + for (index = 0; index < QI_LENGTH; index++) { + offset = index << shift; + desc = iommu->qi->desc + offset; + if (ecap_smts(iommu->ecap)) + seq_printf(m, "%5d\t%016llx\t%016llx\t%016llx\t%016llx\t%016x\n", + index, desc->qw0, desc->qw1, + desc->qw2, desc->qw3, + iommu->qi->desc_status[index]); + else + seq_printf(m, "%5d\t%016llx\t%016llx\t%016x\n", + index, desc->qw0, desc->qw1, + iommu->qi->desc_status[index]); + } +} + +static int invalidation_queue_show(struct seq_file *m, void *unused) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + unsigned long flags; + struct q_inval *qi; + int shift; + + 
rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + qi = iommu->qi; + shift = qi_shift(iommu); + + if (!qi || !ecap_qis(iommu->ecap)) + continue; + + seq_printf(m, "Invalidation queue on IOMMU: %s\n", iommu->name); + + raw_spin_lock_irqsave(&qi->q_lock, flags); + seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n", + (u64)virt_to_phys(qi->desc), + dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift, + dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift); + seq_printf(m, "No. of received PRQs: %ld\n", iommu->num_prqs); + seq_printf(m, "No. of PRR SUCCESS sent: %ld\n", iommu->num_prrs); + seq_printf(m, "No. of PRR INVALID sent: %ld\n", iommu->num_prri); + invalidation_queue_entry_show(m, iommu); + raw_spin_unlock_irqrestore(&qi->q_lock, flags); + seq_putc(m, '\n'); + } + rcu_read_unlock(); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(invalidation_queue); + #ifdef CONFIG_IRQ_REMAP static void ir_tbl_remap_entry_show(struct seq_file *m, struct intel_iommu *iommu) @@ -406,6 +545,219 @@ static int ir_translation_struct_show(struct seq_file *m, void *unused) DEFINE_SHOW_ATTRIBUTE(ir_translation_struct); #endif +static void latency_show_one(struct seq_file *m, struct 
void __init intel_iommu_debugfs_init(void) { struct dentry *intel_iommu_debug = debugfs_create_dir("intel", @@ -415,8 +767,17 @@ void __init intel_iommu_debugfs_init(void) &iommu_regset_fops); debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug, NULL, &dmar_translation_struct_fops); + debugfs_create_file("domain_translation_struct", 0444, + intel_iommu_debug, NULL, + &domain_translation_struct_fops); + debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug, + NULL, &invalidation_queue_fops); #ifdef CONFIG_IRQ_REMAP debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, NULL, &ir_translation_struct_fops); #endif + debugfs_create_file("qi_done", 0644, intel_iommu_debug, + NULL, &qi_done_fops); + debugfs_create_file("dmar_perf_latency", 0644, intel_iommu_debug, + NULL, &dmar_perf_latency_fops); } diff --git a/drivers/iommu/dmar.c b/drivers/iommu/intel/dmar.c similarity index 79% rename from drivers/iommu/dmar.c rename to drivers/iommu/intel/dmar.c index 1b9795743276d378fe4f961760234eae4963e9f8..659f0cfc6680fad1e8e18f8139d9a5f4c864a59b 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/intel/dmar.c
@@ -31,8 +31,10 @@ #include #include #include +#include -#include "irq_remapping.h" +#include "../irq_remapping.h" +#include "perf.h" typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *); struct dmar_res_callback { @@ -66,6 +68,8 @@ static void free_iommu(struct intel_iommu *iommu); extern const struct iommu_ops intel_iommu_ops; +int qi_done_no_cpu_relax __read_mostly; + static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd) { /* @@ -147,8 +151,6 @@ dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event) } else { info = kzalloc(size, GFP_KERNEL); if (!info) { - pr_warn("Out of memory when allocating notify_info " - "for %s.\n", pci_name(dev)); if (dmar_dev_scope_status == 0) dmar_dev_scope_status = -ENOMEM; return NULL; @@ -252,7 +254,7 @@ int dmar_insert_dev_scope(struct dmar_pci_notify_info *info, info->dev->hdr_type != PCI_HEADER_TYPE_NORMAL) || (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE && (info->dev->hdr_type == PCI_HEADER_TYPE_NORMAL && - info->dev->class >> 8 != PCI_CLASS_BRIDGE_OTHER))) { + info->dev->class >> 16 != PCI_BASE_CLASS_BRIDGE))) { pr_warn("Device scope type does not match for %s\n", pci_name(info->dev)); return -EINVAL; @@ -316,6 +318,9 @@ static int dmar_pci_bus_add_dev(struct dmar_pci_notify_info *info) if (ret < 0 && dmar_dev_scope_status == 0) dmar_dev_scope_status = ret; + if (ret >= 0) + intel_irq_remap_add_device(info); + return ret; } @@ -330,6 +335,13 @@ static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info) dmar_iommu_notify_scope_dev(info); } +static inline void vf_inherit_msi_domain(struct pci_dev *pdev) +{ + struct pci_dev *physfn = pci_physfn(pdev); + + dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&physfn->dev)); +} + static int dmar_pci_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -339,8 +351,20 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb, /* Only care about add/remove events for physical functions. * For VFs we actually do the lookup based on the corresponding * PF in device_to_iommu() anyway. */ - if (pdev->is_virtfn) + if (pdev->is_virtfn) { + /* + * Ensure that the VF device inherits the irq domain of the + * PF device. Ideally the device would inherit the domain + * from the bus, but DMAR can have multiple units per bus + * which makes this impossible. The VF 'bus' could inherit + * from the PF device, but that's yet another x86'sism to + * inflict on everybody else. 
+ */ + if (action == BUS_NOTIFY_ADD_DEVICE) + vf_inherit_msi_domain(pdev); return NOTIFY_DONE; + } + if (action != BUS_NOTIFY_ADD_DEVICE && action != BUS_NOTIFY_REMOVED_DEVICE) return NOTIFY_DONE; @@ -363,7 +387,7 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb, static struct notifier_block dmar_pci_bus_nb = { .notifier_call = dmar_pci_bus_notifier, - .priority = INT_MIN, + .priority = INT_MAX, }; static struct dmar_drhd_unit * @@ -380,7 +404,7 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd) return NULL; } -/** +/* * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition * structure which uniquely represent one DMA remapping hardware unit * present in the platform @@ -473,7 +497,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) rhsa = (struct acpi_dmar_rhsa *)header; for_each_drhd_unit(drhd) { if (drhd->reg_base_addr == rhsa->base_address) { - int node = acpi_map_pxm_to_node(rhsa->proximity_domain); + int node = pxm_to_node(rhsa->proximity_domain); if (!node_online(node)) node = NUMA_NO_NODE; @@ -503,6 +527,7 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) struct acpi_dmar_reserved_memory *rmrr; struct acpi_dmar_atsr *atsr; struct acpi_dmar_rhsa *rhsa; + struct acpi_dmar_satc *satc; switch (header->type) { case ACPI_DMAR_TYPE_HARDWARE_UNIT: @@ -532,6 +557,10 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) /* We don't print this here because we need to sanity-check it first. So print it in dmar_parse_one_andd() instead. */ break; + case ACPI_DMAR_TYPE_SATC: + satc = container_of(header, struct acpi_dmar_satc, header); + pr_info("SATC flags: 0x%x\n", satc->flags); + break; } } @@ -619,6 +648,7 @@ parse_dmar_table(void) .cb[ACPI_DMAR_TYPE_ROOT_ATS] = &dmar_parse_one_atsr, .cb[ACPI_DMAR_TYPE_HARDWARE_AFFINITY] = &dmar_parse_one_rhsa, .cb[ACPI_DMAR_TYPE_NAMESPACE] = &dmar_parse_one_andd, + .cb[ACPI_DMAR_TYPE_SATC] = &dmar_parse_one_satc, }; /* @@ -906,8 +936,11 @@ int __init detect_intel_iommu(void) } #ifdef CONFIG_X86 - if (!ret) + if (!ret) { x86_init.iommu.iommu_init = intel_iommu_init; + x86_platform.iommu_shutdown = intel_iommu_shutdown; + } + #endif if (dmar_tbl) { @@ -961,6 +994,8 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) warn_invalid_dmar(phys_addr, " returns all ones"); goto unmap; } + if (ecap_vcs(iommu->ecap)) + iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), @@ -1127,6 +1162,12 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) return err; } +static inline void dmar_free_fault_wq(struct intel_iommu *iommu) +{ + if (iommu->fault_wq) + destroy_workqueue(iommu->fault_wq); +} + static void free_iommu(struct intel_iommu *iommu) { if (intel_iommu_enabled && !iommu->drhd->ignored) { @@ -1143,6 +1184,7 @@ static void free_iommu(struct intel_iommu *iommu) free_irq(iommu->irq, iommu); dmar_free_hwirq(iommu->irq); iommu->irq = 0; + dmar_free_fault_wq(iommu); } if (iommu->qi) { @@ -1171,18 +1213,76 @@ static inline void reclaim_free_desc(struct q_inval *qi) } } -static int qi_check_fault(struct intel_iommu *iommu, int index) +static const char *qi_type_string(u8 type) +{ + switch (type) { + case QI_CC_TYPE: + return "Context-cache Invalidation"; + case QI_IOTLB_TYPE: + return "IOTLB Invalidation"; + case QI_DIOTLB_TYPE: + return "Device-TLB Invalidation"; + case QI_IEC_TYPE: + return "Interrupt Entry Cache Invalidation"; + case QI_IWD_TYPE: + return 
"Invalidation Wait"; + case QI_EIOTLB_TYPE: + return "PASID-based IOTLB Invalidation"; + case QI_PC_TYPE: + return "PASID-cache Invalidation"; + case QI_DEIOTLB_TYPE: + return "PASID-based Device-TLB Invalidation"; + case QI_PGRP_RESP_TYPE: + return "Page Group Response"; + default: + return "UNKNOWN"; + } +} + +static void qi_dump_fault(struct intel_iommu *iommu, u32 fault) +{ + unsigned int head = dmar_readl(iommu->reg + DMAR_IQH_REG); + u64 iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); + struct qi_desc *desc = iommu->qi->desc + head; + + if (fault & DMA_FSTS_IQE) + pr_err("VT-d detected Invalidation Queue Error: Reason %llx", + DMAR_IQER_REG_IQEI(iqe_err)); + if (fault & DMA_FSTS_ITE) + pr_err("VT-d detected Invalidation Time-out Error: SID %llx", + DMAR_IQER_REG_ITESID(iqe_err)); + if (fault & DMA_FSTS_ICE) + pr_err("VT-d detected Invalidation Completion Error: SID %llx", + DMAR_IQER_REG_ICESID(iqe_err)); + + pr_err("QI HEAD: %s qw0 = 0x%llx, qw1 = 0x%llx\n", + qi_type_string(desc->qw0 & 0xf), + (unsigned long long)desc->qw0, + (unsigned long long)desc->qw1); + + head = ((head >> qi_shift(iommu)) + QI_LENGTH - 1) % QI_LENGTH; + head <<= qi_shift(iommu); + desc = iommu->qi->desc + head; + + pr_err("QI PRIOR: %s qw0 = 0x%llx, qw1 = 0x%llx\n", + qi_type_string(desc->qw0 & 0xf), + (unsigned long long)desc->qw0, + (unsigned long long)desc->qw1); +} + +static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) { u32 fault; int head, tail; struct q_inval *qi = iommu->qi; - int wait_index = (index + 1) % QI_LENGTH; int shift = qi_shift(iommu); if (qi->desc_status[wait_index] == QI_ABORT) return -EAGAIN; fault = readl(iommu->reg + DMAR_FSTS_REG); + if (fault & (DMA_FSTS_IQE | DMA_FSTS_ITE | DMA_FSTS_ICE)) + qi_dump_fault(iommu, fault); /* * If IQE happens, the head points to the descriptor associated @@ -1199,12 +1299,10 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) * used by software as private data. We won't print * out these two qw's for security consideration. */ - pr_err("VT-d detected invalid descriptor: qw0 = %llx, qw1 = %llx\n", - (unsigned long long)desc->qw0, - (unsigned long long)desc->qw1); memcpy(desc, qi->desc + (wait_index << shift), 1 << shift); writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Queue Error (IQE) cleared\n"); return -EINVAL; } } @@ -1221,6 +1319,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH; writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Time-out Error (ITE) cleared\n"); do { if (qi->desc_status[head] == QI_IN_USE) @@ -1232,58 +1331,100 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) return -EAGAIN; } - if (fault & DMA_FSTS_ICE) + if (fault & DMA_FSTS_ICE) { writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Completion Error (ICE) cleared\n"); + } return 0; } /* - * Submit the queued invalidation descriptor to the remapping - * hardware unit and wait for its completion. + * Function to submit invalidation descriptors of all types to the queued + * invalidation interface(QI). Multiple descriptors can be submitted at a + * time, a wait descriptor will be appended to each submission to ensure + * hardware has completed the invalidation before return. Wait descriptors + * can be part of the submission but it will not be polled for completion. 
 */ -int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options) { - int rc; struct q_inval *qi = iommu->qi; - int offset, shift, length; + s64 devtlb_start_ktime = 0; + s64 iotlb_start_ktime = 0; + s64 iec_start_ktime = 0; struct qi_desc wait_desc; int wait_index, index; unsigned long flags; + int offset, shift; + int rc, i; + u64 type; if (!qi) return 0; + type = desc->qw0 & GENMASK_ULL(3, 0); + + if ((type == QI_IOTLB_TYPE || type == QI_EIOTLB_TYPE) && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IOTLB)) + iotlb_start_ktime = ktime_to_ns(ktime_get()); + + if ((type == QI_DIOTLB_TYPE || type == QI_DEIOTLB_TYPE) && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_DEVTLB)) + devtlb_start_ktime = ktime_to_ns(ktime_get()); + + if (type == QI_IEC_TYPE && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IEC)) + iec_start_ktime = ktime_to_ns(ktime_get()); + restart: rc = 0; raw_spin_lock_irqsave(&qi->q_lock, flags); - while (qi->free_cnt < 3) { + /* + * Check if we have enough empty slots in the queue to submit, + * the calculation is based on: + * # of desc + 1 wait desc + 1 space between head and tail + */ + while (qi->free_cnt < count + 2) { raw_spin_unlock_irqrestore(&qi->q_lock, flags); cpu_relax(); raw_spin_lock_irqsave(&qi->q_lock, flags); } index = qi->free_head; - wait_index = (index + 1) % QI_LENGTH; + wait_index = (index + count) % QI_LENGTH; shift = qi_shift(iommu); - length = 1 << shift; - qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE; + for (i = 0; i < count; i++) { + offset = ((index + i) % QI_LENGTH) << shift; + memcpy(qi->desc + offset, &desc[i], 1 << shift); + qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE; + trace_qi_submit(iommu, desc[i].qw0, desc[i].qw1, + desc[i].qw2, desc[i].qw3); + } + qi->desc_status[wait_index] = QI_IN_USE; - offset = index << shift; - memcpy(qi->desc + offset, desc, length); wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE; + if (options & QI_OPT_WAIT_DRAIN) + wait_desc.qw0 |= QI_IWD_PRQ_DRAIN; wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]); wait_desc.qw2 = 0; wait_desc.qw3 = 0; + /* + * Account Page Group Response descriptors (type 9, QI_PGRP_RESP_TYPE): + * a response code of 0 in qw0 bits 15:12 is SUCCESS, 1 is INVALID + * REQUEST; these pair with the counters shown in the debugfs + * invalidation_queue file. + */ + if ((desc[count - 1].qw0 & 0xF) == 9) { + if ((desc[count - 1].qw0 & 0xF000) == 0) + iommu->num_prrs++; + else if ((desc[count - 1].qw0 & 0xF000) == 0x1000) + iommu->num_prri++; + } + offset = wait_index << shift; - memcpy(qi->desc + offset, &wait_desc, length); + memcpy(qi->desc + offset, &wait_desc, 1 << shift); - qi->free_head = (qi->free_head + 2) % QI_LENGTH; - qi->free_cnt -= 2; + qi->free_head = (qi->free_head + count + 1) % QI_LENGTH; + qi->free_cnt -= count + 1; /* * update the HW tail register indicating the presence of @@ -1299,16 +1440,19 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) * a deadlock where the interrupt context can wait indefinitely * for free slots in the queue. */ - rc = qi_check_fault(iommu, index); + rc = qi_check_fault(iommu, index, wait_index); if (rc) break; - raw_spin_unlock(&qi->q_lock); - cpu_relax(); - raw_spin_lock(&qi->q_lock); + if (!qi_done_no_cpu_relax) { + raw_spin_unlock(&qi->q_lock); + cpu_relax(); + raw_spin_lock(&qi->q_lock); + } } - qi->desc_status[index] = QI_DONE; + for (i = 0; i < count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1316,6 +1460,18 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) if (rc == -EAGAIN) goto restart; + if (iotlb_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_IOTLB, + ktime_to_ns(ktime_get()) - iotlb_start_ktime); + + if (devtlb_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_DEVTLB, + ktime_to_ns(ktime_get()) - devtlb_start_ktime); + + if (iec_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_IEC, + ktime_to_ns(ktime_get()) - iec_start_ktime); + return rc; }
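[Illustration only, not part of the patch: a hypothetical caller batching two domain-selective IOTLB invalidations in a single qi_submit_sync() call, assuming the QI_IOTLB_* helpers and DMA_TLB_DSI_FLUSH from include/linux/intel-iommu.h. Batching pays one lock acquisition, one wait descriptor and one completion poll for both descriptors, which is why the slot check above waits for count + 2 free entries (count descriptors, one wait descriptor, one slot kept empty between head and tail):

static void example_flush_two_domains(struct intel_iommu *iommu,
				      u16 did0, u16 did1)
{
	struct qi_desc desc[2] = {};

	/* two domain-selective IOTLB invalidation descriptors */
	desc[0].qw0 = QI_IOTLB_DID(did0) |
		      QI_IOTLB_GRAN(DMA_TLB_DSI_FLUSH) | QI_IOTLB_TYPE;
	desc[1].qw0 = QI_IOTLB_DID(did1) |
		      QI_IOTLB_GRAN(DMA_TLB_DSI_FLUSH) | QI_IOTLB_TYPE;

	/* one submission; the wait descriptor is appended internally */
	qi_submit_sync(iommu, desc, 2, 0);
}
]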
@@ -1332,7 +1488,7 @@ void qi_global_iec(struct intel_iommu *iommu) desc.qw3 = 0; /* should never fail */ - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, @@ -1346,7 +1502,7 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, @@ -1370,7 +1526,7 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, @@ -1392,7 +1548,102 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); +} + +/* PASID-based IOTLB invalidation */ +void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih) +{ + struct qi_desc desc = {.qw2 = 0, .qw3 = 0}; + + /* + * npages == -1 means a PASID-selective invalidation; otherwise a + * positive value selects Page-selective-within-PASID invalidation. + * 0 is not a valid input. + */ + if (WARN_ON(!npages)) { + pr_err("Invalid input npages = %ld\n", npages); + return; + } + + if (npages == -1) { + desc.qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc.qw1 = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(npages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); + + if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) + addr = ALIGN_DOWN(addr, align); + + desc.qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc.qw1 = QI_EIOTLB_ADDR(addr) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); + } + + qi_submit_sync(iommu, &desc, 1, 0); +} + +/* PASID-based device IOTLB Invalidate */ +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, unsigned int size_order) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + + /* + * If the S bit is 0, only a single page is flushed.
If the S bit is set, + * the least significant zero bit in the address indicates the invalidation + * range (VT-d spec 6.5.2.6): e.g. a zero at address bit 12 indicates 8KB, + * a zero at bit 13 indicates 16KB. A size_order of 0 means PAGE_SIZE (4KB). + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. + */ + if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) + pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", + addr, size_order); + + /* Take page address */ + desc.qw1 = QI_DEV_EIOTLB_ADDR(addr); + + if (size_order) { + /* + * Any 0 bits in the address below the size_order bit could be + * taken as the least significant zero bit and shrink the range; + * set them to 1s so the invalidation is not smaller than desired. + */ + desc.qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, + VTD_PAGE_SHIFT); + /* Clear size_order bit to indicate size */ + desc.qw1 &= ~mask; + /* Set the S bit to indicate flushing more than 1 page */ + desc.qw1 |= QI_DEV_EIOTLB_SIZE; + } + + qi_submit_sync(iommu, &desc, 1, 0); +} + +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, + u64 granu, u32 pasid) +{ + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | + QI_PC_GRAN(granu) | QI_PC_TYPE; + qi_submit_sync(iommu, &desc, 1, 0); } /* @@ -1615,6 +1866,31 @@ static const char *irq_remap_fault_reasons[] = "Blocked an interrupt request due to source-id verification failure", }; +/* fault data and status */ +enum intel_iommu_fault_reason { + INTEL_IOMMU_FAULT_REASON_SW, + INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT, + INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT, + INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID, + INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH, + INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS, + INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS, + INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID, + INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID, + INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_RTP, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_CTP, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_PTE, + NR_INTEL_IOMMU_FAULT_REASON, +}; + +/* fault reasons that are allowed to be reported outside IOMMU subsystem */ +#define INTEL_IOMMU_FAULT_REASON_ALLOWED \ + ((1ULL << INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH) | \ + (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS) | \ + (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS)) + + static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) { if (fault_reason >= 0x20 && (fault_reason - 0x20 < @@ -1699,26 +1975,132 @@ void dmar_msi_read(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } +static enum iommu_fault_reason to_iommu_fault_reason(u8 reason) +{ + if (reason >= NR_INTEL_IOMMU_FAULT_REASON) { + pr_warn("unknown DMAR fault reason %d\n", reason); + return IOMMU_FAULT_REASON_UNKNOWN; + } + switch (reason) { + case INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH: + return IOMMU_FAULT_REASON_OOR_ADDRESS; + case INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID: + case INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS: + case INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS: + return IOMMU_FAULT_REASON_PERMISSION; + /* REVISIT: Internal IOMMU fault reasons are reported as + * unknown to the device. Need to sort through all SM reasons.
+ */ + case INTEL_IOMMU_FAULT_REASON_SW: + case INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID: + case INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID: + default: + return IOMMU_FAULT_REASON_UNKNOWN; + } +} + +struct dmar_fault_work { + struct work_struct fault_work; + struct intel_iommu *iommu; + u64 addr; + int type; + int fault_type; + enum intel_iommu_fault_reason reason; + u16 sid; +}; + +static void report_fault_to_device(struct work_struct *work) +{ + struct dmar_fault_work *dfw = container_of(work, struct dmar_fault_work, + fault_work); + struct iommu_fault_event event; + struct pci_dev *pdev; + u8 bus, devfn; + + memset(&event, 0, sizeof(struct iommu_fault_event)); + bus = PCI_BUS_NUM(dfw->sid); + devfn = PCI_DEVFN(PCI_SLOT(dfw->sid), PCI_FUNC(dfw->sid)); + /* + * we need to check if the fault reporting is requested for the + * offending device. + */ + pdev = pci_get_domain_bus_and_slot(dfw->iommu->segment, bus, devfn); + if (!pdev) { + pr_warn("No PCI device found for source ID %x\n", dfw->sid); + goto free_work; + } + /* + * unrecoverable fault is reported per IOMMU, notifier handler can + * resolve PCI device based on source ID. + */ + event.fault.event.reason = to_iommu_fault_reason(dfw->reason); + event.fault.event.addr = dfw->addr; + event.fault.type = IOMMU_FAULT_DMA_UNRECOV; + event.fault.event.perm = dfw->type ? IOMMU_READ : IOMMU_WRITE; + iommu_report_device_fault(&pdev->dev, &event); + pci_dev_put(pdev); + +free_work: + kfree(dfw); +} + static int dmar_fault_do_one(struct intel_iommu *iommu, int type, - u8 fault_reason, int pasid, u16 source_id, + u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) { const char *reason; int fault_type; + struct dmar_fault_work *dfw; reason = dmar_get_fault_reason(fault_reason, &fault_type); - if (fault_type == INTR_REMAP) - pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index %llx [fault reason %02d] %s\n", - source_id >> 8, PCI_SLOT(source_id & 0xFF), - PCI_FUNC(source_id & 0xFF), addr >> 48, - fault_reason, reason); - else - pr_err("[%s] Request device [%02x:%02x.%d] PASID %x fault addr %llx [fault reason %02d] %s\n", + if (fault_type == INTR_REMAP) { + pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr >> 48, + fault_reason, reason); + + return 0; + } + + if (pasid == INVALID_IOASID) + pr_err("[%s NO_PASID] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", type ? "DMA Read" : "DMA Write", source_id >> 8, PCI_SLOT(source_id & 0xFF), - PCI_FUNC(source_id & 0xFF), pasid, addr, + PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); + else + pr_err("[%s PASID 0x%x] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + type ? 
"DMA Read" : "DMA Write", pasid, + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr, + fault_reason, reason); + + dmar_fault_dump_ptes(iommu, source_id, addr, pasid); + + /* check if fault reason is permitted to report outside IOMMU */ + if (!((1 << fault_reason) & INTEL_IOMMU_FAULT_REASON_ALLOWED)) + return 0; + + dfw = kmalloc(sizeof(*dfw), GFP_ATOMIC); + if (!dfw) + return -ENOMEM; + + INIT_WORK(&dfw->fault_work, report_fault_to_device); + dfw->addr = addr; + dfw->type = type; + dfw->fault_type = fault_type; + dfw->reason = fault_reason; + dfw->sid = source_id; + dfw->iommu = iommu; + if (!queue_work(iommu->fault_wq, &dfw->fault_work)) { + kfree(dfw); + return -EBUSY; + } + return 0; } @@ -1750,7 +2132,8 @@ irqreturn_t dmar_fault(int irq, void *dev_id) u8 fault_reason; u16 source_id; u64 guest_addr; - int type, pasid; + u32 pasid; + int type; u32 data; bool pasid_present; @@ -1784,7 +2167,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id) if (!ratelimited) /* Using pasid -1 if pasid is not present */ dmar_fault_do_one(iommu, type, fault_reason, - pasid_present ? pasid : -1, + pasid_present ? pasid : INVALID_IOASID, source_id, guest_addr); fault_index++; @@ -1801,10 +2184,28 @@ irqreturn_t dmar_fault(int irq, void *dev_id) return IRQ_HANDLED; } -int dmar_set_interrupt(struct intel_iommu *iommu) +static int dmar_set_fault_wq(struct intel_iommu *iommu) +{ + if (iommu->fault_wq) + return 0; + + iommu->fault_wq = alloc_ordered_workqueue(iommu->name, 0); + if (!iommu->fault_wq) + return -ENOMEM; + + return 0; +} + +int dmar_set_interrupt(struct intel_iommu *iommu, bool queue_fault) { int irq, ret; + /* fault can be reported back to device drivers via a wq */ + if (queue_fault) { + ret = dmar_set_fault_wq(iommu); + if (ret) + pr_err("Failed to create fault handling workqueue\n"); + } /* * Check if the fault interrupt is already initialized. 
 */ @@ -1820,8 +2221,10 @@ int dmar_set_interrupt(struct intel_iommu *iommu) } ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu); - if (ret) + if (ret) { pr_err("Can't request irq\n"); + dmar_free_fault_wq(iommu); + } return ret; } @@ -1835,7 +2238,7 @@ int __init enable_drhd_fault_handling(void) */ for_each_iommu(iommu, drhd) { u32 fault_status; - int ret = dmar_set_interrupt(iommu); + int ret = dmar_set_interrupt(iommu, false); if (ret) { pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n", @@ -1938,6 +2341,7 @@ static guid_t dmar_hp_guid = #define DMAR_DSM_FUNC_DRHD 1 #define DMAR_DSM_FUNC_ATSR 2 #define DMAR_DSM_FUNC_RHSA 3 +#define DMAR_DSM_FUNC_SATC 4 static inline bool dmar_detect_dsm(acpi_handle handle, int func) { @@ -1955,6 +2359,7 @@ static int dmar_walk_dsm_resource(acpi_handle handle, int func, [DMAR_DSM_FUNC_DRHD] = ACPI_DMAR_TYPE_HARDWARE_UNIT, [DMAR_DSM_FUNC_ATSR] = ACPI_DMAR_TYPE_ROOT_ATS, [DMAR_DSM_FUNC_RHSA] = ACPI_DMAR_TYPE_HARDWARE_AFFINITY, + [DMAR_DSM_FUNC_SATC] = ACPI_DMAR_TYPE_SATC, }; if (!dmar_detect_dsm(handle, func)) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel/iommu.c similarity index 65% rename from drivers/iommu/intel-iommu.c rename to drivers/iommu/intel/iommu.c index 953d86ca6d2b253cb02ae793e5d814da4677667c..5d06cc2b740c76f462fefec8786ab44a583c468d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -31,7 +31,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -41,17 +43,29 @@ #include #include #include -#include +#include #include #include #include -#include -#include "irq_remapping.h" -#include "intel-pasid.h" +#include "../irq_remapping.h" +#include "../iommu-sva-lib.h" +#include "pasid.h" +#include "cap_audit.h" #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE +/* PRS_Allocation */ +static int prs_allocation = 32; + +/* PRQ_Size: use a size large enough to avoid PRQ overflow. On SPR the sum + * of DSA, IAX/A and QAT is 512 + 256 + 64 = 832, which rounds up to the + * next power of two, 1024. Since we process the PRQ once and then move QH, + * we double that to 2048 entries. Each 4K page holds 128 entries, so we + * need 64KB in total to prevent PRQ overflow in the worst case. + * For the default of 32 entries per device, 8KB should be enough. + */ +int prq_size_page_order = 4; #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) @@ -67,8 +81,8 @@ #define MAX_AGAW_WIDTH 64 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) -#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1) -#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1) +#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) +#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR to match. That way, we can use 'unsigned long' for PFNs with impunity. 
*/ @@ -103,6 +117,9 @@ */ #define INTEL_IOMMU_PGSIZES (~0xFFFUL) +/* PASIDs used by host SVM */ +struct ioasid_set *host_pasid_set; + static inline int agaw_to_level(int agaw) { return agaw + 2; @@ -168,11 +185,16 @@ static inline unsigned long virt_to_dma_pfn(void *p) return page_to_dma_pfn(virt_to_page(p)); } +static int dev_satc_state(struct pci_dev *dev); + /* global iommu list, set NULL for ignored DMAR units */ static struct intel_iommu **g_iommus; static void __init check_tylersburg_isoch(void); static int rwbf_quirk; +static inline struct device_domain_info * +dmar_search_domain_by_dev_info(int segment, int bus, int devfn); +static bool scalable_mode_support(void); /* * set to 1 to panic kernel if can't successfully enable VT-d @@ -295,17 +317,7 @@ static inline void context_clear_entry(struct context_entry *context) */ static struct dmar_domain *si_domain; static int hw_pass_through = 1; - -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) - -/* - * This is a DMA domain allocated through the iommu domain allocation - * interface. But one or more devices belonging to this domain have - * been chosen to use a private domain. We should avoid to use the - * map/unmap/iova_to_phys APIs on it. - */ -#define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) +static int intel_caching_mode; #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ @@ -328,8 +340,18 @@ struct dmar_atsr_unit { u8 include_all:1; /* include all ports */ }; +struct dmar_satc_unit { + struct list_head list; /* list of SATC units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + struct dmar_dev_scope *devices; /* target devices */ + struct intel_iommu *iommu; /* the corresponding iommu */ + int devices_cnt; /* target device count */ + u8 atc_required:1; /* ATS is required */ +}; + static LIST_HEAD(dmar_atsr_units); static LIST_HEAD(dmar_rmrr_units); +static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) @@ -341,72 +363,50 @@ static void domain_exit(struct dmar_domain *domain); static void domain_remove_dev_info(struct dmar_domain *domain); static void dmar_remove_one_dev_info(struct device *dev); static void __dmar_remove_one_dev_info(struct device_domain_info *info); -static void domain_context_clear(struct intel_iommu *iommu, - struct device *dev); -static int domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu); -static bool device_is_rmrr_locked(struct device *dev); static int intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev); static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); -#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON -int dmar_disabled = 0; -#else -int dmar_disabled = 1; -#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/ +int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); +int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); + +/* == 0 --> use FL for IOVA (default), != 0 --> use SL for IOVA */ +static int default_iova = 0; -int intel_iommu_sm; int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); static int dmar_map_gfx = 1; -static int dmar_forcedac; static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int iommu_identity_mapping; -static int intel_no_bounce; +static int iommu_skip_te_disable; -#define IDENTMAP_ALL 1 #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA 4 int intel_iommu_gfx_mapped; EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 
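[Illustration only, not part of the patch: the PRQ sizing comment above does its capacity math in prose; the sketch below restates it, assuming the 32-byte (256-bit) page request descriptor size from the VT-d spec, i.e. 4KB / 32 = 128 entries per page:

/* Hypothetical helper: queue capacity for a given prq_size_page_order. */
#define PRQ_DESC_SIZE	32U

static unsigned int prq_entries(unsigned int page_order)
{
	/* order 0: 4KB = 128 entries ... order 4: 64KB = 2048 entries */
	return (4096U << page_order) / PRQ_DESC_SIZE;
}

With the default order of 4 this yields 2048 entries, twice the 1024 needed for the SPR worst case (512 + 256 + 64 = 832, rounded up to a power of two).]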
-#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) -static DEFINE_SPINLOCK(device_domain_lock); -static LIST_HEAD(device_domain_list); - -#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \ - to_pci_dev(d)->untrusted) - -/* - * Iterate over elements in device_domain_list and call the specified - * callback @fn against each element. - */ -int for_each_device_domain(int (*fn)(struct device_domain_info *info, - void *data), void *data) +struct device_domain_info *get_domain_info(struct device *dev) { - int ret = 0; - unsigned long flags; struct device_domain_info *info; - spin_lock_irqsave(&device_domain_lock, flags); - list_for_each_entry(info, &device_domain_list, global) { - ret = fn(info, data); - if (ret) { - spin_unlock_irqrestore(&device_domain_lock, flags); - return ret; - } - } - spin_unlock_irqrestore(&device_domain_lock, flags); + if (!dev) + return NULL; - return 0; + info = dev_iommu_priv_get(dev); + if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO)) + return NULL; + + return info; } +DEFINE_SPINLOCK(device_domain_lock); +static LIST_HEAD(device_domain_list); + const struct iommu_ops intel_iommu_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) @@ -428,16 +428,11 @@ static void init_translation_status(struct intel_iommu *iommu) iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; } -/* Convert generic 'struct iommu_domain to private struct dmar_domain */ -static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) -{ - return container_of(dom, struct dmar_domain, domain); -} - static int __init intel_iommu_setup(char *str) { if (!str) return -EINVAL; + while (*str) { if (!strncmp(str, "on", 2)) { dmar_disabled = 0; @@ -450,8 +445,8 @@ static int __init intel_iommu_setup(char *str) dmar_map_gfx = 0; pr_info("Disable GFX device mapping\n"); } else if (!strncmp(str, "forcedac", 8)) { - pr_info("Forcing DAC for PCI devices\n"); - dmar_forcedac = 1; + pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); + iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { pr_info("Disable batched IOTLB flush\n"); intel_iommu_strict = 1; @@ -459,25 +454,68 @@ static int __init intel_iommu_setup(char *str) pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; } else if (!strncmp(str, "sm_on", 5)) { - pr_info("Intel-IOMMU: scalable mode supported\n"); + pr_info("Enable scalable mode if hardware supports\n"); intel_iommu_sm = 1; + } else if (!strncmp(str, "sm_off", 6)) { + pr_info("Scalable mode is disallowed\n"); + intel_iommu_sm = 0; + } else if (!strncmp(str, "prq_size_4kb", 12)) { + prq_size_page_order = 0; + } else if (!strncmp(str, "prq_size_8kb", 12)) { + prq_size_page_order = 1; + } else if (!strncmp(str, "prq_size_16kb", 13)) { + prq_size_page_order = 2; + } else if (!strncmp(str, "prq_size_32kb", 13)) { + prq_size_page_order = 3; + } else if (!strncmp(str, "prq_size_64kb", 13)) { + prq_size_page_order = 4; + } else if (!strncmp(str, "prs_allocation_32", 17)) { + prs_allocation = 32; + } else if (!strncmp(str, "prs_allocation_64", 17)) { + prs_allocation = 64; + } else if (!strncmp(str, "prs_allocation_128", 18)) { + prs_allocation = 128; + } else if (!strncmp(str, "prs_allocation_256", 18)) { + prs_allocation = 256; + } else if (!strncmp(str, "prs_allocation_512", 18)) { + prs_allocation = 512; } else if (!strncmp(str, "tboot_noforce", 13)) { - printk(KERN_INFO - "Intel-IOMMU: not forcing on after tboot. 
This could expose security risk for tboot\n"); + pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; - } else if (!strncmp(str, "nobounce", 8)) { - pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); - intel_no_bounce = 1; + } else if (!strncmp(str, "qi_done_no_cpu_relax", 20)) { + pr_info("Intel-IOMMU: no cpu_relax() in qi\n"); + qi_done_no_cpu_relax = 1; + } else if (!strncmp(str, "iova_sl", 7)) { + pr_info("Intel-IOMMU: default SL IOVA enabled\n"); + default_iova = 1; + } else { + pr_notice("Unknown option - '%s'\n", str); } str += strcspn(str, ","); while (*str == ',') str++; } - return 0; + + return 1; } __setup("intel_iommu=", intel_iommu_setup); +static int __init intel_prs_allocation_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (kstrtoint(str, 10, &prs_allocation) < 0) { + pr_warn("prs_allocation: invalid parameter '%s', keeping %d\n", + str, prs_allocation); + return 1; + } + + pr_info("prs_allocation set to %d\n", prs_allocation); + + return 1; +} +__setup("prs_allocation=", intel_prs_allocation_setup);
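[For reference, not from the patch: all of the new knobs are reachable from the kernel command line. An illustrative invocation could be

    intel_iommu=on,sm_on,prq_size_64kb,qi_done_no_cpu_relax prs_allocation=128

where the prq_size_* options select prq_size_page_order (order 4 = 64KB here), and the PRI allocation handed to pci_enable_pri() can be set either via the intel_iommu=prs_allocation_* variants or the standalone prs_allocation= parameter.]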
 static struct kmem_cache *iommu_domain_cache; static struct kmem_cache *iommu_devinfo_cache; @@ -549,7 +587,12 @@ static inline void free_devinfo_mem(void *vaddr) static inline int domain_type_is_si(struct dmar_domain *domain) { - return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY; + return domain->domain.type == IOMMU_DOMAIN_IDENTITY; +} + +static inline bool domain_use_first_level(struct dmar_domain *domain) +{ + return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; } static inline int domain_pfn_supported(struct dmar_domain *domain, @@ -563,7 +606,7 @@ static inline int domain_pfn_supported(struct dmar_domain *domain, static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; - int agaw = -1; + int agaw; sagaw = cap_sagaw(iommu->cap); for (agaw = width_to_agaw(max_gaw); @@ -599,7 +642,7 @@ struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) int iommu_id; /* si_domain and vm domain should not get here. */ - if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA)) + if (WARN_ON(!iommu_is_dma_domain(&domain->domain))) return NULL; for_each_domain_iommu(iommu_id, domain) @@ -624,12 +667,12 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) bool found = false; int i; - domain->iommu_coherency = 1; + domain->iommu_coherency = true; for_each_domain_iommu(i, domain) { found = true; if (!iommu_paging_structure_coherency(g_iommus[i])) { - domain->iommu_coherency = 0; + domain->iommu_coherency = false; break; } } @@ -640,24 +683,31 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = 0; + domain->iommu_coherency = false; break; } } rcu_read_unlock(); } -static int domain_update_iommu_snooping(struct intel_iommu *skip) +static bool domain_update_iommu_snooping(struct intel_iommu *skip) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - int ret = 1; + bool ret = true; rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (iommu != skip) { - if (!ecap_sc_support(iommu->ecap)) { - ret = 0; + /* + * If the hardware is operating in the scalable mode, + * the snooping control is always supported since we + * always set PASID-table-entry.PGSNP bit if the domain + * is managed outside (UNMANAGED). + */ + if (!sm_supported(iommu) && + !ecap_sc_support(iommu->ecap)) { + ret = false; break; } } @@ -667,21 +717,27 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip) return ret; } -static int domain_update_iommu_superpage(struct intel_iommu *skip) +static int domain_update_iommu_superpage(struct dmar_domain *domain, + struct intel_iommu *skip) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - int mask = 0xf; + int mask = 0x3; - if (!intel_iommu_superpage) { + if (!intel_iommu_superpage) return 0; - } /* set iommu_superpage to the smallest common denominator */ rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (iommu != skip) { - mask &= cap_super_page_val(iommu->cap); + if (domain && domain_use_first_level(domain)) { + if (!cap_fl1gp_support(iommu->cap)) + mask = 0x1; + } else { + mask &= cap_super_page_val(iommu->cap); + } + if (!mask) break; } @@ -691,12 +747,81 @@ static int domain_update_iommu_superpage(struct intel_iommu *skip) return fls(mask); } +static int domain_update_device_node(struct dmar_domain *domain) +{ + struct device_domain_info *info; + int nid = NUMA_NO_NODE; + + assert_spin_locked(&device_domain_lock); + + if (list_empty(&domain->devices)) + return NUMA_NO_NODE; + + list_for_each_entry(info, &domain->devices, link) { + if (!info->dev) + continue; + + /* + * There could be multiple device numa nodes, as devices within + * the same domain may sit behind different IOMMUs. There is no + * perfect answer in such a situation, so we use a first-come, + * first-served policy. + */ + nid = dev_to_node(info->dev); + if (nid != NUMA_NO_NODE) + break; + } + + return nid; +} + +static void domain_update_iotlb(struct dmar_domain *domain); + +/* Return the super pagesize bitmap if supported. */ +static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) +{ + unsigned long bitmap = 0; + + /* + * 1-level super page supports page size of 2MiB, 2-level super page + * supports page size of both 2MiB and 1GiB. + */ + if (domain->iommu_superpage == 1) + bitmap |= SZ_2M; + else if (domain->iommu_superpage == 2) + bitmap |= SZ_2M | SZ_1G; + + return bitmap; +} + /* Some capabilities may be different across iommus */ static void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain->iommu_snooping = domain_update_iommu_snooping(NULL); - domain->iommu_superpage = domain_update_iommu_superpage(NULL); + domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); + + /* + * If RHSA is missing, we should default to the device numa domain + * as fall back. + */ + if (domain->nid == NUMA_NO_NODE) + domain->nid = domain_update_device_node(domain); + + /* + * First-level translation restricts the input-address to a + * canonical address (i.e., address bits 63:N have the same + * value as address bit [N-1], where N is 48-bits with 4-level + * paging and 57-bits with 5-level paging). Hence, skip bit + * [N-1]. 
+ */ + if (domain_use_first_level(domain)) + domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); + else + domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); + + domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); + domain_update_iotlb(domain); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, @@ -733,9 +858,9 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, return &context[devfn]; } -static int iommu_dummy(struct device *dev) +static bool attach_deferred(struct device *dev) { - return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; + return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; } /** @@ -765,28 +890,68 @@ is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) return false; } -static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) +static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) +{ + struct dmar_drhd_unit *drhd; + u32 vtbar; + int rc; + + /* We know that this device on this chipset has its own IOMMU. + * If we find it under a different IOMMU, then the BIOS is lying + * to us. Hope that the IOMMU for this device is actually + * disabled, and it needs no translation... + */ + rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); + if (rc) { + /* "can't" happen */ + dev_info(&pdev->dev, "failed to run vt-d quirk\n"); + return false; + } + vtbar &= 0xffff0000; + + /* we know that this iommu should be at offset 0xa000 from vtbar */ + drhd = dmar_find_matched_drhd_unit(pdev); + if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { + pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return true; + } + + return false; +} + +static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) +{ + if (!iommu || iommu->drhd->ignored) + return true; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->vendor == PCI_VENDOR_ID_INTEL && + pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && + quirk_ioat_snb_local_iommu(pdev)) + return true; + } + + return false; +} +struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; + struct pci_dev *pdev = NULL; struct intel_iommu *iommu; struct device *tmp; - struct pci_dev *pdev = NULL; u16 segment = 0; int i; - if (iommu_dummy(dev)) + if (!dev) return NULL; if (dev_is_pci(dev)) { struct pci_dev *pf_pdev; - pdev = to_pci_dev(dev); - -#ifdef CONFIG_X86 - /* VMD child devices currently cannot be handled individually */ - if (is_vmd(pdev->bus)) - return NULL; -#endif + pdev = pci_real_dma_dev(to_pci_dev(dev)); /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. 
*/ @@ -797,7 +962,7 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf dev = &ACPI_COMPANION(dev)->dev; rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { + for_each_iommu(iommu, drhd) { if (pdev && segment != drhd->segment) continue; @@ -811,8 +976,10 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf if (pdev && pdev->is_virtfn) goto got_pdev; - *bus = drhd->devices[i].bus; - *devfn = drhd->devices[i].devfn; + if (bus && devfn) { + *bus = drhd->devices[i].bus; + *devfn = drhd->devices[i].devfn; + } goto out; } @@ -822,13 +989,18 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf if (pdev && drhd->include_all) { got_pdev: - *bus = pdev->bus->number; - *devfn = pdev->devfn; + if (bus && devfn) { + *bus = pdev->bus->number; + *devfn = pdev->devfn; + } goto out; } } iommu = NULL; out: + if (iommu_is_dummy(iommu, dev)) + iommu = NULL; + rcu_read_unlock(); return iommu; @@ -884,10 +1056,126 @@ static void free_context_table(struct intel_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } +#ifdef CONFIG_DMAR_DEBUG +static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn) +{ + struct device_domain_info *info; + struct dma_pte *parent, *pte; + struct dmar_domain *domain; + int offset, level; + + info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); + if (!info || !info->domain) { + pr_info("device [%02x:%02x.%d] not probed\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + return; + } + + domain = info->domain; + level = agaw_to_level(domain->agaw); + parent = domain->pgd; + if (!parent) { + pr_info("no page table setup\n"); + return; + } + + while (1) { + offset = pfn_level_offset(pfn, level); + pte = &parent[offset]; + if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { + pr_info("PTE not present at level %d\n", level); + break; + } + + pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); + + if (level == 1) + break; + + parent = phys_to_virt(dma_pte_addr(pte)); + level--; + } +} + +void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid) +{ + struct pasid_dir_entry *dir, *pde; + struct pasid_entry *entries, *pte; + struct context_entry *ctx_entry; + struct root_entry *rt_entry; + u8 devfn = source_id & 0xff; + u8 bus = source_id >> 8; + int i, dir_index, index; + + pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); + + /* root entry dump */ + rt_entry = &iommu->root_entry[bus]; + if (!rt_entry) { + pr_info("root table entry is not present\n"); + return; + } + + if (sm_supported(iommu)) + pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", + rt_entry->hi, rt_entry->lo); + else + pr_info("root entry: 0x%016llx", rt_entry->lo); + + /* context entry dump */ + ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); + if (!ctx_entry) { + pr_info("context table entry is not present\n"); + return; + } + + pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", + ctx_entry->hi, ctx_entry->lo); + + /* legacy mode does not require PASID entries */ + if (!sm_supported(iommu)) + goto pgtable_walk; + + /* get the pointer to pasid directory entry */ + dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + if (!dir) { + pr_info("pasid directory entry is not present\n"); + return; + } + /* For request-without-pasid, get the pasid from context entry */ + if (intel_iommu_sm && pasid == INVALID_IOASID) + pasid = PASID_RID2PASID; + + dir_index = pasid >> 
PASID_PDE_SHIFT; + pde = &dir[dir_index]; + pr_info("pasid dir entry: 0x%016llx\n", pde->val); + + /* get the pointer to the pasid table entry */ + entries = get_pasid_table_from_pde(pde); + if (!entries) { + pr_info("pasid table entry is not present\n"); + return; + } + index = pasid & PASID_PTE_MASK; + pte = &entries[index]; + for (i = 0; i < ARRAY_SIZE(pte->val); i++) + pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); + +pgtable_walk: + pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn); +} +#endif + static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, unsigned long pfn, int *target_level) { struct dma_pte *parent, *pte; + /* + * level == 5: 5 level page table; + * level == 4: 4 level page table; + * level == 3: 3 level page table; + */ int level = agaw_to_level(domain->agaw); int offset; @@ -919,6 +1207,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; + if (domain_use_first_level(domain)) { + pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; + if (iommu_is_dma_domain(&domain->domain)) + pteval |= DMA_FL_PTE_ACCESS; + } if (cmpxchg64(&pte->val, 0ULL, pteval)) /* Someone else set it while we were thinking; use theirs. */ free_pgtable_page(tmp_page); @@ -1151,17 +1444,17 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, pages can only be freed after the IOTLB flush has been done. */ static struct page *domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, - unsigned long last_pfn) + unsigned long last_pfn, + struct page *freelist) { - struct page *freelist; - BUG_ON(!domain_pfn_supported(domain, start_pfn)); BUG_ON(!domain_pfn_supported(domain, last_pfn)); BUG_ON(start_pfn > last_pfn); /* we don't need lock here; nobody else touches the iova range */ freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), - domain->pgd, 0, start_pfn, last_pfn, NULL); + domain->pgd, 0, start_pfn, last_pfn, + freelist); /* free pgd */ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { @@ -1185,13 +1478,6 @@ static void dma_free_pagelist(struct page *freelist) } } -static void iova_entry_free(unsigned long data) -{ - struct page *freelist = (struct page *)data; - - dma_free_pagelist(freelist); -} - /* iommu handling */ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { @@ -1234,6 +1520,11 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) readl, (sts & DMA_GSTS_RTPS), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } void iommu_flush_write_buffer(struct intel_iommu *iommu) @@ -1374,17 +1665,22 @@ static void domain_update_iotlb(struct dmar_domain *domain) assert_spin_locked(&device_domain_lock); - list_for_each_entry(info, &domain->devices, link) { - struct pci_dev *pdev; - - if (!info->dev || !dev_is_pci(info->dev)) - continue; - - pdev = to_pci_dev(info->dev); - if (pdev->ats_enabled) { + list_for_each_entry(info, &domain->devices, link) + if (info->ats_enabled) { has_iotlb_device = true; break; } + + if (!has_iotlb_device) { + struct subdev_domain_info *sinfo; + + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + if (info && info->ats_enabled) 
{ + has_iotlb_device = true; + break; + } + } } domain->has_iotlb_device = has_iotlb_device; @@ -1426,11 +1722,10 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) if (info->pri_supported && (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && - !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) + !pci_reset_pri(pdev) && !pci_enable_pri(pdev, prs_allocation)) info->pri_enabled = 1; #endif - if (!pdev->untrusted && info->ats_supported && - pci_ats_page_aligned(pdev) && + if (info->ats_supported && pci_ats_page_aligned(pdev) && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { info->ats_enabled = 1; domain_update_iotlb(info->domain); @@ -1466,29 +1761,62 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info) #endif } +static void __iommu_flush_dev_iotlb(struct device_domain_info *info, + u64 addr, unsigned int mask) +{ + u16 sid, qdep; + + if (!info || !info->ats_enabled) + return; + + sid = info->bus << 8 | info->devfn; + qdep = info->ats_qdep; + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, addr, mask); +} + static void iommu_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, unsigned mask) { - u16 sid, qdep; unsigned long flags; struct device_domain_info *info; + struct subdev_domain_info *sinfo; if (!domain->has_iotlb_device) return; spin_lock_irqsave(&device_domain_lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (!info->ats_enabled) - continue; + list_for_each_entry(info, &domain->devices, link) + __iommu_flush_dev_iotlb(info, addr, mask); - sid = info->bus << 8 | info->devfn; - qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, - qdep, addr, mask); + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + __iommu_flush_dev_iotlb(info, addr, mask); } spin_unlock_irqrestore(&device_domain_lock, flags); } +static void domain_flush_piotlb(struct intel_iommu *iommu, + struct dmar_domain *domain, + u64 addr, unsigned long npages, bool ih) +{ + u16 did = domain->iommu_did[iommu->seq_id]; + + if (domain->default_pasid) + qi_flush_piotlb(iommu, did, domain->default_pasid, + addr, npages, ih); + if (domain->kernel_pasid && !domain_type_is_si(domain)) { + /* + * REVISIT: we only do PASID-based IOTLB invalidation for first + * level; SL for PASID (e.g. vIOMMU pass-through) may be added + * in the future, so this branch does not get hit today. + */ + qi_flush_piotlb(iommu, did, domain->kernel_pasid, + addr, npages, ih); + } + if (!list_empty(&domain->devices)) + qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); +} + static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, struct dmar_domain *domain, unsigned long pfn, unsigned int pages, @@ -1502,18 +1830,23 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, if (ih) ih = 1 << 6; - /* - * Fallback to domain selective flush if no PSI support or the size is - * too big. - * PSI requires page size to be 2 ^ x, and the base address is naturally - * aligned to the size - */ - if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, - DMA_TLB_PSI_FLUSH); + + if (domain_use_first_level(domain)) { + domain_flush_piotlb(iommu, domain, addr, pages, ih); + } else { + /* + * Fallback to domain selective flush if no PSI support or + * the size is too big. PSI requires page size to be 2 ^ x, + * and the base address is naturally aligned to the size. 
+ */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, + DMA_TLB_PSI_FLUSH); + } /* * In caching mode, changes of pages from non-present to present require @@ -1528,25 +1861,30 @@ static inline void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, unsigned long pfn, unsigned int pages) { - /* It's a non-present to present mapping. Only flush if caching mode */ - if (cap_caching_mode(iommu->cap)) + /* + * It's a non-present to present mapping. Only flush if caching mode + * and second level. + */ + if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); else iommu_flush_write_buffer(iommu); } -static void iommu_flush_iova(struct iova_domain *iovad) +static void intel_flush_iotlb_all(struct iommu_domain *domain) { - struct dmar_domain *domain; + struct dmar_domain *dmar_domain = to_dmar_domain(domain); int idx; - domain = container_of(iovad, struct dmar_domain, iovad); - - for_each_domain_iommu(idx, domain) { + for_each_domain_iommu(idx, dmar_domain) { struct intel_iommu *iommu = g_iommus[idx]; - u16 did = domain->iommu_did[iommu->seq_id]; + u16 did = dmar_domain->iommu_did[iommu->seq_id]; - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + if (domain_use_first_level(dmar_domain)) + domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0); + else + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); if (!cap_caching_mode(iommu->cap)) iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), @@ -1595,6 +1933,10 @@ static void iommu_disable_translation(struct intel_iommu *iommu) u32 sts; unsigned long flag; + if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && + (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) + return; + raw_spin_lock_irqsave(&iommu->register_lock, flag); iommu->gcmd &= ~DMA_GCMD_TE; writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); @@ -1619,11 +1961,8 @@ static int iommu_init_domains(struct intel_iommu *iommu) spin_lock_init(&iommu->lock); iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); - if (!iommu->domain_ids) { - pr_err("%s: Allocating domain id array failed\n", - iommu->name); + if (!iommu->domain_ids) return -ENOMEM; - } size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); iommu->domains = kzalloc(size, GFP_KERNEL); @@ -1712,10 +2051,35 @@ static void free_dmar_iommu(struct intel_iommu *iommu) if (ecap_prs(iommu->ecap)) intel_svm_finish_prq(iommu); } + if (vccap_pasid(iommu->vccap)) + ioasid_unregister_allocator(&iommu->pasid_allocator); + #endif } -static struct dmar_domain *alloc_domain(int flags) +/* + * Check and return whether first level is used by default for + * DMA translation. 
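+ * The choice depends on the hardware capabilities and the domain
+ * type; see the checks below.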
+ */ +static bool first_level_by_default(unsigned int type) +{ + /* Change IOVA mapping to SL instead of FL */ + if (default_iova) + return false; + + /* Only SL is available in legacy mode */ + if (!scalable_mode_support()) + return false; + + /* Only one level (either FL or SL) is available, just use it */ + if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) + return intel_cap_flts_sanity(); + + /* Both levels are available, decide based on the domain type */ + return type != IOMMU_DOMAIN_UNMANAGED; +} + +static struct dmar_domain *alloc_domain(unsigned int type) { struct dmar_domain *domain; @@ -1725,9 +2089,11 @@ static struct dmar_domain *alloc_domain(int flags) memset(domain, 0, sizeof(*domain)); domain->nid = NUMA_NO_NODE; - domain->flags = flags; + if (first_level_by_default(type)) + domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->subdevices); return domain; } @@ -1743,7 +2109,6 @@ static int domain_attach_iommu(struct dmar_domain *domain, assert_spin_locked(&iommu->lock); domain->iommu_refcnt[iommu->seq_id] += 1; - domain->iommu_count += 1; if (domain->iommu_refcnt[iommu->seq_id] == 1) { ndomains = cap_ndoms(iommu->cap); num = find_first_zero_bit(iommu->domain_ids, ndomains); @@ -1751,7 +2116,6 @@ static int domain_attach_iommu(struct dmar_domain *domain, if (num >= ndomains) { pr_err("%s: No free domain ids\n", iommu->name); domain->iommu_refcnt[iommu->seq_id] -= 1; - domain->iommu_count -= 1; return -ENOSPC; } @@ -1767,16 +2131,15 @@ return 0; } -static int domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +static void domain_detach_iommu(struct dmar_domain *domain, + struct intel_iommu *iommu) { - int num, count; + int num; assert_spin_locked(&device_domain_lock); assert_spin_locked(&iommu->lock); domain->iommu_refcnt[iommu->seq_id] -= 1; - count = --domain->iommu_count; if (domain->iommu_refcnt[iommu->seq_id] == 0) { num = domain->iommu_did[iommu->seq_id]; clear_bit(num, iommu->domain_ids); @@ -1785,55 +2148,6 @@ static int domain_detach_iommu(struct dmar_domain *domain, domain_update_iommu_cap(domain); domain->iommu_did[iommu->seq_id] = 0; } - - return count; -} - -static struct iova_domain reserved_iova_list; -static struct lock_class_key reserved_rbtree_key; - -static int dmar_init_reserved_ranges(void) -{ - struct pci_dev *pdev = NULL; - struct iova *iova; - int i; - - init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN); - - lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, - &reserved_rbtree_key); - - /* IOAPIC ranges shouldn't be accessed by DMA */ - iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START), - IOVA_PFN(IOAPIC_RANGE_END)); - if (!iova) { - pr_err("Reserve IOAPIC range failed\n"); - return -ENODEV; - } - - /* Reserve all PCI MMIO to avoid peer-to-peer access */ - for_each_pci_dev(pdev) { - struct resource *r; - - for (i = 0; i < PCI_NUM_RESOURCES; i++) { - r = &pdev->resource[i]; - if (!r->flags || !(r->flags & IORESOURCE_MEM)) - continue; - iova = reserve_iova(&reserved_iova_list, - IOVA_PFN(r->start), - IOVA_PFN(r->end)); - if (!iova) { - pci_err(pdev, "Reserve iova for %pR failed\n", r); - return -ENODEV; - } - } - } - return 0; -} - -static void domain_reserve_special_ranges(struct dmar_domain *domain) -{ - copy_reserved_iova(&reserved_iova_list, &domain->iovad); } static inline int guestwidth_to_adjustwidth(int gaw) @@ -1850,76 +2164,17 @@ 
static inline int guestwidth_to_adjustwidth(int gaw) return agaw; } -static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, - int guest_width) -{ - int adjust_width, agaw; - unsigned long sagaw; - int err; - - init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - - err = init_iova_flush_queue(&domain->iovad, - iommu_flush_iova, iova_entry_free); - if (err) - return err; - - domain_reserve_special_ranges(domain); - - /* calculate AGAW */ - if (guest_width > cap_mgaw(iommu->cap)) - guest_width = cap_mgaw(iommu->cap); - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - agaw = width_to_agaw(adjust_width); - sagaw = cap_sagaw(iommu->cap); - if (!test_bit(agaw, &sagaw)) { - /* hardware doesn't support it, choose a bigger one */ - pr_debug("Hardware doesn't support agaw %d\n", agaw); - agaw = find_next_bit(&sagaw, 5, agaw); - if (agaw >= 5) - return -ENODEV; - } - domain->agaw = agaw; - - if (ecap_coherent(iommu->ecap)) - domain->iommu_coherency = 1; - else - domain->iommu_coherency = 0; - - if (ecap_sc_support(iommu->ecap)) - domain->iommu_snooping = 1; - else - domain->iommu_snooping = 0; - - if (intel_iommu_superpage) - domain->iommu_superpage = fls(cap_super_page_val(iommu->cap)); - else - domain->iommu_superpage = 0; - - domain->nid = iommu->node; - - /* always allocate the top pgd */ - domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); - if (!domain->pgd) - return -ENOMEM; - __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE); - return 0; -} - static void domain_exit(struct dmar_domain *domain) { /* Remove associated devices and clear attached or cached domains */ domain_remove_dev_info(domain); - /* destroy iovas */ - put_iova_domain(&domain->iovad); - if (domain->pgd) { struct page *freelist; - freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); + freelist = domain_unmap(domain, 0, + DOMAIN_MAX_PFN(domain->gaw), NULL); dma_free_pagelist(freelist); } @@ -1952,7 +2207,6 @@ static inline void context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) { context->hi |= pasid & ((1 << 20) - 1); - context->hi |= (1 << 20); } /* @@ -2227,65 +2481,90 @@ static inline int hardware_largepage_caps(struct dmar_domain *domain, return level; } -static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - struct scatterlist *sg, unsigned long phys_pfn, - unsigned long nr_pages, int prot) +/* + * Ensure that old small page tables are removed to make room for superpage(s). + * We're going to add new large pages, so make sure we don't remove their parent + * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
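+ * (The walk below clears any present entries in the range and
+ * flushes the IOTLB on every IOMMU in the domain.)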
+ */ +static void switch_to_super_page(struct dmar_domain *domain, + unsigned long start_pfn, + unsigned long end_pfn, int level) +{ + unsigned long lvl_pages = lvl_to_nr_pages(level); + struct dma_pte *pte = NULL; + int i; + + while (start_pfn <= end_pfn) { + if (!pte) + pte = pfn_to_dma_pte(domain, start_pfn, &level); + + if (dma_pte_present(pte)) { + dma_pte_free_pagetable(domain, start_pfn, + start_pfn + lvl_pages - 1, + level + 1); + + for_each_domain_iommu(i, domain) + iommu_flush_iotlb_psi(g_iommus[i], domain, + start_pfn, lvl_pages, + 0, 0); + } + + pte++; + start_pfn += lvl_pages; + if (first_pte_in_page(pte)) + pte = NULL; + } +} + +static int +__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phys_pfn, unsigned long nr_pages, int prot) { struct dma_pte *first_pte = NULL, *pte = NULL; - phys_addr_t uninitialized_var(pteval); - unsigned long sg_res = 0; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; + phys_addr_t pteval; + u64 attr; BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; - prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; - - if (!sg) { - sg_res = nr_pages; - pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; + attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); + attr |= DMA_FL_PTE_PRESENT; + if (domain_use_first_level(domain)) { + attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; + if (prot & DMA_PTE_WRITE) + attr |= DMA_FL_PTE_DIRTY; } + pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; + while (nr_pages > 0) { uint64_t tmp; - if (!sg_res) { - unsigned int pgoff = sg->offset & ~PAGE_MASK; - - sg_res = aligned_nrpages(sg->offset, sg->length); - sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff; - sg->dma_length = sg->length; - pteval = (sg_phys(sg) - pgoff) | prot; - phys_pfn = pteval >> VTD_PAGE_SHIFT; - } - if (!pte) { - largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, + phys_pfn, nr_pages); - first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); if (!pte) return -ENOMEM; + first_pte = pte; + + lvl_pages = lvl_to_nr_pages(largepage_lvl); + /* It is large page*/ if (largepage_lvl > 1) { - unsigned long nr_superpages, end_pfn; + unsigned long end_pfn; + unsigned long pages_to_remove; pteval |= DMA_PTE_LARGE_PAGE; - lvl_pages = lvl_to_nr_pages(largepage_lvl); - - nr_superpages = sg_res / lvl_pages; - end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; - - /* - * Ensure that old small page tables are - * removed to make room for superpage(s). - * We're adding new large pages, so make sure - * we don't remove their parent tables. 
- */ - dma_pte_free_pagetable(domain, iov_pfn, end_pfn, - largepage_lvl + 1); + pages_to_remove = min_t(unsigned long, nr_pages, + nr_pte_to_next_page(pte) * lvl_pages); + end_pfn = iov_pfn + pages_to_remove - 1; + switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); } else { pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; } @@ -2306,80 +2585,40 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, WARN_ON(1); } - lvl_pages = lvl_to_nr_pages(largepage_lvl); - - BUG_ON(nr_pages < lvl_pages); - BUG_ON(sg_res < lvl_pages); - nr_pages -= lvl_pages; iov_pfn += lvl_pages; phys_pfn += lvl_pages; pteval += lvl_pages * VTD_PAGE_SIZE; - sg_res -= lvl_pages; /* If the next PTE would be the first in a new page, then we - need to flush the cache on the entries we've just written. - And then we'll need to recalculate 'pte', so clear it and - let it get set again in the if (!pte) block above. - - If we're done (!nr_pages) we need to flush the cache too. - - Also if we've been setting superpages, we may need to - recalculate 'pte' and switch back to smaller pages for the - end of the mapping, if the trailing size is not enough to - use another superpage (i.e. sg_res < lvl_pages). */ + * need to flush the cache on the entries we've just written. + * And then we'll need to recalculate 'pte', so clear it and + * let it get set again in the if (!pte) block above. + * + * If we're done (!nr_pages) we need to flush the cache too. + * + * Also if we've been setting superpages, we may need to + * recalculate 'pte' and switch back to smaller pages for the + * end of the mapping, if the trailing size is not enough to + * use another superpage (i.e. nr_pages < lvl_pages). + */ pte++; if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && sg_res < lvl_pages)) { + (largepage_lvl > 1 && nr_pages < lvl_pages)) { domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); pte = NULL; } - - if (!sg_res && nr_pages) - sg = sg_next(sg); - } - return 0; -} - -static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - struct scatterlist *sg, unsigned long phys_pfn, - unsigned long nr_pages, int prot) -{ - int iommu_id, ret; - struct intel_iommu *iommu; - - /* Do the real mapping first */ - ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot); - if (ret) - return ret; - - for_each_domain_iommu(iommu_id, domain) { - iommu = g_iommus[iommu_id]; - __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); } return 0; } -static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - struct scatterlist *sg, unsigned long nr_pages, - int prot) -{ - return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); -} - -static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, - int prot) -{ - return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); -} - -static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) +static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) { - unsigned long flags; + struct intel_iommu *iommu = info->iommu; struct context_entry *context; + unsigned long flags; u16 did_old; if (!iommu) @@ -2391,7 +2630,16 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn spin_unlock_irqrestore(&iommu->lock, flags); return; } - did_old = context_domain_id(context); + + if (sm_supported(iommu)) { + if (hw_pass_through && 
domain_type_is_si(info->domain)) + did_old = FLPT_DEFAULT_DID; + else + did_old = info->domain->iommu_did[iommu->seq_id]; + } else { + did_old = context_domain_id(context); + } + context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock_irqrestore(&iommu->lock, flags); @@ -2400,11 +2648,17 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); + + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); + iommu->flush.flush_iotlb(iommu, did_old, 0, 0, DMA_TLB_DSI_FLUSH); + + __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); } static inline void unlink_domain_info(struct device_domain_info *info) @@ -2413,7 +2667,7 @@ static inline void unlink_domain_info(struct device_domain_info *info) list_del(&info->link); list_del(&info->global); if (info->dev) - info->dev->archdata.iommu = NULL; + dev_iommu_priv_set(info->dev, NULL); } static void domain_remove_dev_info(struct dmar_domain *domain) @@ -2427,28 +2681,21 @@ static void domain_remove_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags); } -/* - * find_domain - * Note: we use struct device->archdata.iommu stores the info - */ -static struct dmar_domain *find_domain(struct device *dev) +struct dmar_domain *find_domain(struct device *dev) { struct device_domain_info *info; - if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) { - struct iommu_domain *domain; + if (unlikely(!dev || !dev->iommu)) + return NULL; - dev->archdata.iommu = NULL; - domain = iommu_get_domain_for_dev(dev); - if (domain) - intel_iommu_attach_device(domain, dev); - } + if (unlikely(attach_deferred(dev))) + return NULL; /* No lock here, assumes no domain exit in normal case */ - info = dev->archdata.iommu; - + info = get_domain_info(dev); if (likely(info)) return info->domain; + return NULL; } @@ -2458,13 +2705,55 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn) struct device_domain_info *info; list_for_each_entry(info, &device_domain_list, global) - if (info->iommu->segment == segment && info->bus == bus && + if (info->segment == segment && info->bus == bus && info->devfn == devfn) return info; return NULL; } +static int domain_setup_first_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, + u32 pasid) +{ + struct dma_pte *pgd = domain->pgd; + int agaw, level; + int flags = 0; + + /* + * Skip top levels of page tables for iommu which has + * less agaw than default. Unnecessary for PT mode. 
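+ * (Each loop iteration below descends one page-table level until
+ * the effective agaw matches the IOMMU's agaw.)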
+ */ + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + pgd = phys_to_virt(dma_pte_addr(pgd)); + if (!dma_pte_present(pgd)) + return -ENOMEM; + } + + level = agaw_to_level(agaw); + if (level != 4 && level != 5) + return -EINVAL; + + if (pasid != PASID_RID2PASID) + flags |= PASID_FLAG_SUPERVISOR_MODE; + if (level == 5) + flags |= PASID_FLAG_FL5LP; + + if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) + flags |= PASID_FLAG_PAGE_SNOOP; + + return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, + domain->iommu_did[iommu->seq_id], + flags); +} + +static bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, int bus, int devfn, struct device *dev, @@ -2479,8 +2768,18 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!info) return NULL; - info->bus = bus; - info->devfn = devfn; + if (!dev_is_real_dma_subdevice(dev)) { + info->bus = bus; + info->devfn = devfn; + info->segment = iommu->segment; + } else { + struct pci_dev *pdev = to_pci_dev(dev); + + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->segment = pci_domain_nr(pdev->bus); + } + info->ats_supported = info->pasid_supported = info->pri_supported = 0; info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; info->ats_qdep = 0; @@ -2489,16 +2788,14 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, info->iommu = iommu; info->pasid_table = NULL; info->auxd_enabled = 0; - INIT_LIST_HEAD(&info->auxiliary_domains); + INIT_LIST_HEAD(&info->subdevices); if (dev && dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(info->dev); - if (!pdev->untrusted && - !pci_ats_disabled() && - ecap_dev_iotlb_support(iommu->ecap) && - pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) && - dmar_find_matched_atsr_unit(pdev)) + if (ecap_dev_iotlb_support(iommu->ecap) && + pci_ats_supported(pdev) && + dmar_ats_supported(pdev, iommu)) info->ats_supported = 1; if (sm_supported(iommu)) { @@ -2509,7 +2806,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, } if (info->ats_supported && ecap_prs(iommu->ecap) && - pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI)) + pci_pri_supported(pdev)) info->pri_supported = 1; } } @@ -2520,7 +2817,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!found) { struct device_domain_info *info2; - info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); + info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, + info->devfn); if (info2) { found = info2->domain; info2->dev = dev; @@ -2547,7 +2845,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, list_add(&info->link, &domain->devices); list_add(&info->global, &device_domain_list); if (dev) - dev->archdata.iommu = info; + dev_iommu_priv_set(dev, info); spin_unlock_irqrestore(&device_domain_lock, flags); /* PASID table is mandatory for a PCI device in scalable mode. 
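+ * (Below, the RID2PASID entry is programmed for pass-through,
+ * first-level or second-level translation, depending on the domain.)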
*/ @@ -2564,6 +2862,9 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (hw_pass_through && domain_type_is_si(domain)) ret = intel_pasid_setup_pass_through(iommu, domain, dev, PASID_RID2PASID); + else if (domain_use_first_level(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + PASID_RID2PASID); else ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); @@ -2584,158 +2885,21 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, return domain; } -static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque) -{ - *(u16 *)opaque = alias; - return 0; -} - -static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw) -{ - struct device_domain_info *info; - struct dmar_domain *domain = NULL; - struct intel_iommu *iommu; - u16 dma_alias; - unsigned long flags; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return NULL; - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - - pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias); - - spin_lock_irqsave(&device_domain_lock, flags); - info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus), - PCI_BUS_NUM(dma_alias), - dma_alias & 0xff); - if (info) { - iommu = info->iommu; - domain = info->domain; - } - spin_unlock_irqrestore(&device_domain_lock, flags); - - /* DMA alias already has a domain, use it */ - if (info) - goto out; - } - - /* Allocate and initialize new domain for the device */ - domain = alloc_domain(0); - if (!domain) - return NULL; - if (domain_init(domain, iommu, gaw)) { - domain_exit(domain); - return NULL; - } - -out: - return domain; -} - -static struct dmar_domain *set_domain_for_dev(struct device *dev, - struct dmar_domain *domain) -{ - struct intel_iommu *iommu; - struct dmar_domain *tmp; - u16 req_id, dma_alias; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return NULL; - - req_id = ((u16)bus << 8) | devfn; - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - - pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias); - - /* register PCI DMA alias device */ - if (req_id != dma_alias) { - tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias), - dma_alias & 0xff, NULL, domain); - - if (!tmp || tmp != domain) - return tmp; - } - } - - tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); - if (!tmp || tmp != domain) - return tmp; - - return domain; -} - static int iommu_domain_identity_map(struct dmar_domain *domain, - unsigned long long start, - unsigned long long end) + unsigned long first_vpfn, + unsigned long last_vpfn) { - unsigned long first_vpfn = start >> VTD_PAGE_SHIFT; - unsigned long last_vpfn = end >> VTD_PAGE_SHIFT; - - if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn), - dma_to_mm_pfn(last_vpfn))) { - pr_err("Reserving iova failed\n"); - return -ENOMEM; - } - - pr_debug("Mapping reserved region %llx-%llx\n", start, end); /* * RMRR range might have overlap with physical memory range, * clear it first */ dma_pte_clear_range(domain, first_vpfn, last_vpfn); - return __domain_mapping(domain, first_vpfn, NULL, + return __domain_mapping(domain, first_vpfn, first_vpfn, last_vpfn - first_vpfn + 1, DMA_PTE_READ|DMA_PTE_WRITE); } -static int domain_prepare_identity_map(struct device *dev, - struct dmar_domain *domain, - unsigned long long start, - unsigned long long end) -{ - /* For _hardware_ passthrough, don't bother. 
But for software - passthrough, we do it anyway -- it may indicate a memory - range which is reserved in E820, so which didn't get set - up to start with in si_domain */ - if (domain == si_domain && hw_pass_through) { - dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n", - start, end); - return 0; - } - - dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end); - - if (end < start) { - WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n" - "BIOS vendor: %s; Ver: %s; Product Version: %s\n", - dmi_get_system_info(DMI_BIOS_VENDOR), - dmi_get_system_info(DMI_BIOS_VERSION), - dmi_get_system_info(DMI_PRODUCT_VERSION)); - return -EIO; - } - - if (end >> agaw_to_width(domain->agaw)) { - WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n" - "BIOS vendor: %s; Ver: %s; Product Version: %s\n", - agaw_to_width(domain->agaw), - dmi_get_system_info(DMI_BIOS_VENDOR), - dmi_get_system_info(DMI_BIOS_VERSION), - dmi_get_system_info(DMI_PRODUCT_VERSION)); - return -EIO; - } - - return iommu_domain_identity_map(domain, start, end); -} - static int md_domain_init(struct dmar_domain *domain, int guest_width); static int __init si_domain_init(int hw) @@ -2744,7 +2908,7 @@ static int __init si_domain_init(int hw) struct device *dev; int i, nid, ret; - si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); + si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); if (!si_domain) return -EFAULT; @@ -2762,7 +2926,8 @@ static int __init si_domain_init(int hw) for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { ret = iommu_domain_identity_map(si_domain, - PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + mm_to_dma_pfn(start_pfn), + mm_to_dma_pfn(end_pfn)); if (ret) return ret; } @@ -2782,7 +2947,9 @@ static int __init si_domain_init(int hw) end >> agaw_to_width(si_domain->agaw))) continue; - ret = iommu_domain_identity_map(si_domain, start, end); + ret = iommu_domain_identity_map(si_domain, + mm_to_dma_pfn(start >> PAGE_SHIFT), + mm_to_dma_pfn(end >> PAGE_SHIFT)); if (ret) return ret; } @@ -2791,17 +2958,6 @@ static int __init si_domain_init(int hw) return 0; } -static int identity_mapping(struct device *dev) -{ - struct device_domain_info *info; - - info = dev->archdata.iommu; - if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO) - return (info->domain == si_domain); - - return 0; -} - static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) { struct dmar_domain *ndomain; @@ -2900,6 +3056,13 @@ static bool device_is_rmrr_locked(struct device *dev) return true; } +static bool device_has_satc(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return dev_satc_state(pdev) >= 0; +} + /* * Return the required default domain type for a specific device. * @@ -2916,47 +3079,19 @@ static int device_def_domain_type(struct device *dev) if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); - /* - * Prevent any device marked as untrusted from getting - * placed into the statically identity mapping domain. - */ - if (pdev->untrusted) - return IOMMU_DOMAIN_DMA; - if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) return IOMMU_DOMAIN_IDENTITY; if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) return IOMMU_DOMAIN_IDENTITY; - /* - * We want to start off with all devices in the 1:1 domain, and - * take them out later if we find they can't access all of memory. 
- * - * However, we can't do this for PCI devices behind bridges, - * because all PCI devices behind the same bridge will end up - * with the same source-id on their transactions. - * - * Practically speaking, we can't change things around for these - * devices at run-time, because we can't be sure there'll be no - * DMA transactions in flight for any of their siblings. - * - * So PCI devices (unless they're on the root bus) as well as - * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of - * the 1:1 domain, just in _case_ one of their siblings turns out - * not to be able to map all of memory. - */ - if (!pci_is_pcie(pdev)) { - if (!pci_is_root_bus(pdev->bus)) - return IOMMU_DOMAIN_DMA; - if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI) - return IOMMU_DOMAIN_DMA; - } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE) - return IOMMU_DOMAIN_DMA; + if (device_has_satc(dev)) { + dev_info(dev, "Use identity domain for SATC devices"); + return IOMMU_DOMAIN_IDENTITY; + } } - return (iommu_identity_mapping & IDENTMAP_ALL) ? - IOMMU_DOMAIN_IDENTITY : 0; + return 0; } static void intel_iommu_init_qi(struct intel_iommu *iommu) @@ -3178,6 +3313,85 @@ static int copy_translation_tables(struct intel_iommu *iommu) return ret; } +#ifdef CONFIG_INTEL_IOMMU_SVM +static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) +{ + struct intel_iommu *iommu = data; + ioasid_t ioasid; + + if (!iommu) + return INVALID_IOASID; + /* + * The VT-d virtual command interface always uses the full 20-bit + * PASID range. The host can partition the guest PASID range based + * on policy, but that is out of the guest's control. + */ + if (min < IOASID_ALLOC_BASE || max > intel_pasid_max_id) + return INVALID_IOASID; + + if (vcmd_alloc_pasid(iommu, &ioasid)) + return INVALID_IOASID; + + return ioasid; } + +static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) +{ + struct intel_iommu *iommu = data; + + if (!iommu) + return; + /* + * Sanity checking of the ioasid owner is done at the upper layer, + * e.g. VFIO. We can only free the PASID when all the devices are + * unbound. + */ + if (IS_ERR(ioasid_find(host_pasid_set, ioasid, NULL))) { + pr_err("IOASID %d to be freed but not in system set\n", ioasid); + return; + } + vcmd_free_pasid(iommu, ioasid); +} + +static void register_pasid_allocator(struct intel_iommu *iommu) +{ + /* + * If we are running in the host, there is no need for a custom + * allocator, since PASIDs are allocated system-wide by the host. + */ + if (!cap_caching_mode(iommu->cap)) + return; + + if (!sm_supported(iommu)) { + pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); + return; + } + + /* + * Register a custom PASID allocator if we are running in a guest; + * guest PASIDs must be obtained via the virtual command interface. + * There can be multiple vIOMMUs in each guest but only one allocator + * is active. All vIOMMU allocators will eventually be calling the same + * host allocator. + */ + if (!vccap_pasid(iommu->vccap)) + return; + + pr_info("Register custom PASID allocator\n"); + iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; + iommu->pasid_allocator.free = intel_vcmd_ioasid_free; + iommu->pasid_allocator.pdata = (void *)iommu; + if (ioasid_register_allocator(&iommu->pasid_allocator)) { + pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); + /* + * Disable scalable mode on this IOMMU if there + * is no custom allocator. Mixing SM-capable and + * non-SM vIOMMUs is not supported. 
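+ * (intel_iommu_sm is global, so a failure on one vIOMMU disables
+ * scalable mode for all of them.)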
+ */ + intel_iommu_sm = 0; + } +} +#endif + static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; @@ -3210,11 +3424,14 @@ static int __init init_dmars(void) g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), GFP_KERNEL); if (!g_iommus) { - pr_err("Allocating global iommu array failed\n"); ret = -ENOMEM; goto error; } + ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); + if (ret) + goto free_iommu; + for_each_iommu(iommu, drhd) { if (drhd->ignored) { iommu_disable_translation(iommu); @@ -3285,16 +3502,14 @@ static int __init init_dmars(void) if (!ecap_pass_through(iommu->ecap)) hw_pass_through = 0; + if (cap_caching_mode(iommu->cap)) + intel_caching_mode = 1; if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) { pr_info("Disable batched IOTLB flush due to virtualization"); intel_iommu_strict = 1; } - -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) - intel_svm_init(iommu); -#endif + intel_svm_check(iommu); } /* @@ -3304,14 +3519,12 @@ static int __init init_dmars(void) */ for_each_active_iommu(iommu, drhd) { iommu_flush_write_buffer(iommu); +#ifdef CONFIG_INTEL_IOMMU_SVM + register_pasid_allocator(iommu); +#endif iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } - if (iommu_default_passthrough()) - iommu_identity_mapping |= IDENTMAP_ALL; - #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA dmar_map_gfx = 0; #endif @@ -3325,6 +3538,32 @@ static int __init init_dmars(void) if (ret) goto free_iommu; + /* PASID is needed for scalable mode irrespective of SVM */ + if (scalable_mode_support()) { + ioasid_install_capacity(intel_pasid_max_id); + /* We should not run out of IOASIDs at boot */ + host_pasid_set = ioasid_set_alloc(NULL, PID_MAX_DEFAULT, + IOASID_SET_TYPE_NULL); + if (IS_ERR_OR_NULL(host_pasid_set)) { + pr_err("Failed to allocate host PASID set %lu\n", + PTR_ERR(host_pasid_set)); + intel_iommu_sm = 0; + } else { + intel_svm_add_pasid_notifier(); + /* TODO: where to free? */ + /* If we do this allocation in a guest, it may fail because + * guest allocations go through the host, while PASID #0 + * should have been allocated before the VM booted; hence + * this check. In the future, we may want to let ioasid + * provide a way to reserve only PASID #0 in its own ioasid + * space. 
+ */ + if (!cap_caching_mode(iommu->cap)) + ioasid_alloc(host_pasid_set, PASID_RID2PASID, + PASID_RID2PASID, NULL); + } + } + /* * for each drhd * enable fault log @@ -3358,7 +3597,7 @@ static int __init init_dmars(void) goto free_iommu; } #endif - ret = dmar_set_interrupt(iommu); + ret = dmar_set_interrupt(iommu, true); if (ret) goto free_iommu; } @@ -3370,2646 +3609,2980 @@ static int __init init_dmars(void) disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } - + ioasid_set_free(host_pasid_set); kfree(g_iommus); error: return ret; } -/* This takes a number of _MM_ pages, not VTD pages */ -static unsigned long intel_alloc_iova(struct device *dev, - struct dmar_domain *domain, - unsigned long nrpages, uint64_t dma_mask) +static inline int iommu_domain_cache_init(void) { - unsigned long iova_pfn; + int ret = 0; - /* Restrict dma_mask to the width that the iommu can handle */ - dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask); - /* Ensure we reserve the whole size-aligned region */ - nrpages = __roundup_pow_of_two(nrpages); + iommu_domain_cache = kmem_cache_create("iommu_domain", + sizeof(struct dmar_domain), + 0, + SLAB_HWCACHE_ALIGN, - if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { - /* - * First try to allocate an io virtual address in - * DMA_BIT_MASK(32) and if that fails then try allocating - * from higher range - */ - iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, - IOVA_PFN(DMA_BIT_MASK(32)), false); - if (iova_pfn) - return iova_pfn; - } - iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, - IOVA_PFN(dma_mask), true); - if (unlikely(!iova_pfn)) { - dev_err_once(dev, "Allocating %ld-page iova failed\n", - nrpages); - return 0; + NULL); + if (!iommu_domain_cache) { + pr_err("Couldn't create iommu_domain cache\n"); + ret = -ENOMEM; } - return iova_pfn; + return ret; } -static struct dmar_domain *get_private_domain_for_dev(struct device *dev) +static inline int iommu_devinfo_cache_init(void) { - struct dmar_domain *domain, *tmp; - struct dmar_rmrr_unit *rmrr; - struct device *i_dev; - int i, ret; + int ret = 0; - /* Device shouldn't be attached by any domains. 
*/ - domain = find_domain(dev); - if (domain) - return NULL; + iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", + sizeof(struct device_domain_info), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!iommu_devinfo_cache) { + pr_err("Couldn't create devinfo cache\n"); + ret = -ENOMEM; + } - domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH); - if (!domain) - goto out; + return ret; +} - /* We have a new domain - setup possible RMRRs for the device */ - rcu_read_lock(); - for_each_rmrr_units(rmrr) { - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, i_dev) { - if (i_dev != dev) - continue; +static int __init iommu_init_mempool(void) +{ + int ret; + ret = iova_cache_get(); + if (ret) + return ret; - ret = domain_prepare_identity_map(dev, domain, - rmrr->base_address, - rmrr->end_address); - if (ret) - dev_err(dev, "Mapping reserved region failed\n"); - } - } - rcu_read_unlock(); + ret = iommu_domain_cache_init(); + if (ret) + goto domain_error; - tmp = set_domain_for_dev(dev, domain); - if (!tmp || domain != tmp) { - domain_exit(domain); - domain = tmp; - } + ret = iommu_devinfo_cache_init(); + if (!ret) + return ret; -out: - if (!domain) - dev_err(dev, "Allocating domain failed\n"); - else - domain->domain.type = IOMMU_DOMAIN_DMA; + kmem_cache_destroy(iommu_domain_cache); +domain_error: + iova_cache_put(); - return domain; + return -ENOMEM; } -/* Check if the dev needs to go through non-identity map and unmap process.*/ -static bool iommu_need_mapping(struct device *dev) +static void __init iommu_exit_mempool(void) { - int ret; + kmem_cache_destroy(iommu_devinfo_cache); + kmem_cache_destroy(iommu_domain_cache); + iova_cache_put(); +} - if (iommu_dummy(dev)) - return false; +static void __init init_no_remapping_devices(void) +{ + struct dmar_drhd_unit *drhd; + struct device *dev; + int i; - ret = identity_mapping(dev); - if (ret) { - u64 dma_mask = *dev->dma_mask; + for_each_drhd_unit(drhd) { + if (!drhd->include_all) { + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) + break; + /* ignore DMAR unit if no devices exist */ + if (i == drhd->devices_cnt) + drhd->ignored = 1; + } + } - if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask) - dma_mask = dev->coherent_dma_mask; + for_each_active_drhd_unit(drhd) { + if (drhd->include_all) + continue; - if (dma_mask >= dma_direct_get_required_mask(dev)) - return false; + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) + if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) + break; + if (i < drhd->devices_cnt) + continue; - /* - * 32 bit DMA is removed from si_domain and fall back to - * non-identity mapping. - */ - dmar_remove_one_dev_info(dev); - ret = iommu_request_dma_domain_for_dev(dev); - if (ret) { - struct iommu_domain *domain; - struct dmar_domain *dmar_domain; + /* This IOMMU has *only* gfx devices. 
Either bypass it or + set the gfx_mapped flag, as appropriate */ + drhd->gfx_dedicated = 1; + if (!dmar_map_gfx) + drhd->ignored = 1; + } +} - domain = iommu_get_domain_for_dev(dev); - if (domain) { - dmar_domain = to_dmar_domain(domain); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - } - dmar_remove_one_dev_info(dev); - get_private_domain_for_dev(dev); +#ifdef CONFIG_SUSPEND +static int init_iommu_hw(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + + for_each_active_iommu(iommu, drhd) + if (iommu->qi) + dmar_reenable_qi(iommu); + + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + /* + * we always have to disable PMRs or DMA may fail on + * this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + continue; } - dev_info(dev, "32bit DMA uses non-identity mapping\n"); + iommu_flush_write_buffer(iommu); + iommu_set_root_entry(iommu); + iommu_enable_translation(iommu); + iommu_disable_protect_mem_regions(iommu); } - return true; + return 0; } -static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, - size_t size, int dir, u64 dma_mask) +static void iommu_flush_all(void) { - struct dmar_domain *domain; - phys_addr_t start_paddr; - unsigned long iova_pfn; - int prot = 0; - int ret; + struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - unsigned long paddr_pfn = paddr >> PAGE_SHIFT; - BUG_ON(dir == DMA_NONE); + for_each_active_iommu(iommu, drhd) { + iommu->flush.flush_context(iommu, 0, 0, 0, + DMA_CCMD_GLOBAL_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, + DMA_TLB_GLOBAL_FLUSH); + } +} - domain = find_domain(dev); - if (!domain) - return DMA_MAPPING_ERROR; +static int iommu_suspend(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + unsigned long flag; - iommu = domain_get_iommu(domain); - size = aligned_nrpages(paddr, size); + for_each_active_iommu(iommu, drhd) { + iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), + GFP_KERNEL); + if (!iommu->iommu_state) + goto nomem; + } - iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask); - if (!iova_pfn) - goto error; + iommu_flush_all(); - /* - * Check if DMAR supports zero-length reads on write only - * mappings.. - */ - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ - !cap_zlr(iommu->cap)) - prot |= DMA_PTE_READ; - if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) - prot |= DMA_PTE_WRITE; - /* - * paddr - (paddr + size) might be partial page, we should map the whole - * page. 
Note: if two part of one page are separately mapped, we - * might have two guest_addr mapping to the same host paddr, but this - * is not a big problem - */ - ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), - mm_to_dma_pfn(paddr_pfn), size, prot); - if (ret) - goto error; + for_each_active_iommu(iommu, drhd) { + iommu_disable_translation(iommu); - start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT; - start_paddr += paddr & ~PAGE_MASK; + raw_spin_lock_irqsave(&iommu->register_lock, flag); - trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT); + iommu->iommu_state[SR_DMAR_FECTL_REG] = + readl(iommu->reg + DMAR_FECTL_REG); + iommu->iommu_state[SR_DMAR_FEDATA_REG] = + readl(iommu->reg + DMAR_FEDATA_REG); + iommu->iommu_state[SR_DMAR_FEADDR_REG] = + readl(iommu->reg + DMAR_FEADDR_REG); + iommu->iommu_state[SR_DMAR_FEUADDR_REG] = + readl(iommu->reg + DMAR_FEUADDR_REG); - return start_paddr; + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + return 0; -error: - if (iova_pfn) - free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); - dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n", - size, (unsigned long long)paddr, dir); - return DMA_MAPPING_ERROR; -} +nomem: + for_each_active_iommu(iommu, drhd) + kfree(iommu->iommu_state); -static dma_addr_t intel_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - if (iommu_need_mapping(dev)) - return __intel_map_single(dev, page_to_phys(page) + offset, - size, dir, *dev->dma_mask); - return dma_direct_map_page(dev, page, offset, size, dir, attrs); + return -ENOMEM; } -static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +static void iommu_resume(void) { - if (iommu_need_mapping(dev)) - return __intel_map_single(dev, phys_addr, size, dir, - *dev->dma_mask); - return dma_direct_map_resource(dev, phys_addr, size, dir, attrs); -} + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + unsigned long flag; -static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) -{ - struct dmar_domain *domain; - unsigned long start_pfn, last_pfn; - unsigned long nrpages; - unsigned long iova_pfn; - struct intel_iommu *iommu; - struct page *freelist; - struct pci_dev *pdev = NULL; - - domain = find_domain(dev); - BUG_ON(!domain); + if (init_iommu_hw()) { + if (force_on) + panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); + else + WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); + return; + } - iommu = domain_get_iommu(domain); + for_each_active_iommu(iommu, drhd) { - iova_pfn = IOVA_PFN(dev_addr); + raw_spin_lock_irqsave(&iommu->register_lock, flag); - nrpages = aligned_nrpages(dev_addr, size); - start_pfn = mm_to_dma_pfn(iova_pfn); - last_pfn = start_pfn + nrpages - 1; + writel(iommu->iommu_state[SR_DMAR_FECTL_REG], + iommu->reg + DMAR_FECTL_REG); + writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], + iommu->reg + DMAR_FEDATA_REG); + writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], + iommu->reg + DMAR_FEADDR_REG); + writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], + iommu->reg + DMAR_FEUADDR_REG); - if (dev_is_pci(dev)) - pdev = to_pci_dev(dev); - - freelist = domain_unmap(domain, start_pfn, last_pfn); - if (intel_iommu_strict || (pdev && pdev->untrusted) || - !has_iova_flush_queue(&domain->iovad)) { - iommu_flush_iotlb_psi(iommu, domain, start_pfn, - nrpages, !freelist, 0); - /* free iova */ - 
free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); - dma_free_pagelist(freelist); - } else { - queue_iova(&domain->iovad, iova_pfn, nrpages, - (unsigned long)freelist); - /* - * queue up the release of the unmap to save the 1/6th of the - * cpu used up by the iotlb flush operation... - */ + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } - trace_unmap_single(dev, dev_addr, size); + for_each_active_iommu(iommu, drhd) + kfree(iommu->iommu_state); } -static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - if (iommu_need_mapping(dev)) - intel_unmap(dev, dev_addr, size); - else - dma_direct_unmap_page(dev, dev_addr, size, dir, attrs); -} +static struct syscore_ops iommu_syscore_ops = { + .resume = iommu_resume, + .suspend = iommu_suspend, +}; -static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +static void __init init_iommu_pm_ops(void) { - if (iommu_need_mapping(dev)) - intel_unmap(dev, dev_addr, size); + register_syscore_ops(&iommu_syscore_ops); } -static void *intel_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs) -{ - struct page *page = NULL; - int order; +#else +static inline void init_iommu_pm_ops(void) {} +#endif /* CONFIG_PM */ - if (!iommu_need_mapping(dev)) - return dma_direct_alloc(dev, size, dma_handle, flags, attrs); +static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) +{ + if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || + !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || + rmrr->end_address <= rmrr->base_address || + arch_rmrr_sanity_check(rmrr)) + return -EINVAL; - size = PAGE_ALIGN(size); - order = get_order(size); + return 0; +} - if (gfpflags_allow_blocking(flags)) { - unsigned int count = size >> PAGE_SHIFT; +int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) +{ + struct acpi_dmar_reserved_memory *rmrr; + struct dmar_rmrr_unit *rmrru; - page = dma_alloc_from_contiguous(dev, count, order, - flags & __GFP_NOWARN); + rmrr = (struct acpi_dmar_reserved_memory *)header; + if (rmrr_sanity_check(rmrr)) { + pr_warn(FW_BUG + "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + rmrr->base_address, rmrr->end_address, + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); } - if (!page) - page = alloc_pages(flags, order); - if (!page) - return NULL; - memset(page_address(page), 0, size); - - *dma_handle = __intel_map_single(dev, page_to_phys(page), size, - DMA_BIDIRECTIONAL, - dev->coherent_dma_mask); - if (*dma_handle != DMA_MAPPING_ERROR) - return page_address(page); - if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) - __free_pages(page, order); + rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); + if (!rmrru) + goto out; - return NULL; -} + rmrru->hdr = header; -static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - int order; - struct page *page = virt_to_page(vaddr); + rmrru->base_address = rmrr->base_address; + rmrru->end_address = rmrr->end_address; - if (!iommu_need_mapping(dev)) - return dma_direct_free(dev, size, vaddr, dma_handle, attrs); + rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), + ((void *)rmrr) + 
rmrr->header.length, + &rmrru->devices_cnt); + if (rmrru->devices_cnt && rmrru->devices == NULL) + goto free_rmrru; - size = PAGE_ALIGN(size); - order = get_order(size); + list_add(&rmrru->list, &dmar_rmrr_units); - intel_unmap(dev, dma_handle, size); - if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) - __free_pages(page, order); + return 0; +free_rmrru: + kfree(rmrru); +out: + return -ENOMEM; } -static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir, - unsigned long attrs) +static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) { - dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK; - unsigned long nrpages = 0; - struct scatterlist *sg; - int i; - - if (!iommu_need_mapping(dev)) - return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs); + struct dmar_atsr_unit *atsru; + struct acpi_dmar_atsr *tmp; - for_each_sg(sglist, sg, nelems, i) { - nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg)); + list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, + dmar_rcu_check()) { + tmp = (struct acpi_dmar_atsr *)atsru->hdr; + if (atsr->segment != tmp->segment) + continue; + if (atsr->header.length != tmp->header.length) + continue; + if (memcmp(atsr, tmp, atsr->header.length) == 0) + return atsru; } - intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT); - - trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT); + return NULL; } -static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, - enum dma_data_direction dir, unsigned long attrs) +int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) { - int i; - struct dmar_domain *domain; - size_t size = 0; - int prot = 0; - unsigned long iova_pfn; - int ret; - struct scatterlist *sg; - unsigned long start_vpfn; - struct intel_iommu *iommu; - - BUG_ON(dir == DMA_NONE); - if (!iommu_need_mapping(dev)) - return dma_direct_map_sg(dev, sglist, nelems, dir, attrs); + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; - domain = find_domain(dev); - if (!domain) + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) return 0; - iommu = domain_get_iommu(domain); - - for_each_sg(sglist, sg, nelems, i) - size += aligned_nrpages(sg->offset, sg->length); - - iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), - *dev->dma_mask); - if (!iova_pfn) { - sglist->dma_length = 0; + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (atsru) return 0; - } + + atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); + if (!atsru) + return -ENOMEM; /* - * Check if DMAR supports zero-length reads on write only - * mappings.. + * If memory is allocated from slab by ACPI _DSM method, we need to + * copy the memory content because the memory buffer will be freed + * on return. 
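+ * (The copy is placed in the tail of the atsru allocation,
+ * at atsru + 1.)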
*/ - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ - !cap_zlr(iommu->cap)) - prot |= DMA_PTE_READ; - if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) - prot |= DMA_PTE_WRITE; - - start_vpfn = mm_to_dma_pfn(iova_pfn); - - ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); - if (unlikely(ret)) { - dma_pte_free_pagetable(domain, start_vpfn, - start_vpfn + size - 1, - agaw_to_level(domain->agaw) + 1); - free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); - return 0; + atsru->hdr = (void *)(atsru + 1); + memcpy(atsru->hdr, hdr, hdr->length); + atsru->include_all = atsr->flags & 0x1; + if (!atsru->include_all) { + atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), + (void *)atsr + atsr->header.length, + &atsru->devices_cnt); + if (atsru->devices_cnt && atsru->devices == NULL) { + kfree(atsru); + return -ENOMEM; + } } - trace_map_sg(dev, iova_pfn << PAGE_SHIFT, - sg_phys(sglist), size << VTD_PAGE_SHIFT); - - return nelems; -} + list_add_rcu(&atsru->list, &dmar_atsr_units); -static u64 intel_get_required_mask(struct device *dev) -{ - if (!iommu_need_mapping(dev)) - return dma_direct_get_required_mask(dev); - return DMA_BIT_MASK(32); + return 0; } -static const struct dma_map_ops intel_dma_ops = { - .alloc = intel_alloc_coherent, - .free = intel_free_coherent, - .map_sg = intel_map_sg, - .unmap_sg = intel_unmap_sg, - .map_page = intel_map_page, - .unmap_page = intel_unmap_page, - .map_resource = intel_map_resource, - .unmap_resource = intel_unmap_resource, - .dma_supported = dma_direct_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, - .get_required_mask = intel_get_required_mask, -}; - -static void -bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir, enum dma_sync_target target) +static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) { - struct dmar_domain *domain; - phys_addr_t tlb_addr; - - domain = find_domain(dev); - if (WARN_ON(!domain)) - return; - - tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr); - if (is_swiotlb_buffer(tlb_addr)) - swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target); + dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); + kfree(atsru); } -static dma_addr_t -bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, - enum dma_data_direction dir, unsigned long attrs, - u64 dma_mask) +int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) { - size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); - struct dmar_domain *domain; - struct intel_iommu *iommu; - unsigned long iova_pfn; - unsigned long nrpages; - phys_addr_t tlb_addr; - int prot = 0; - int ret; - - domain = find_domain(dev); - if (WARN_ON(dir == DMA_NONE || !domain)) - return DMA_MAPPING_ERROR; - - iommu = domain_get_iommu(domain); - if (WARN_ON(!iommu)) - return DMA_MAPPING_ERROR; - - nrpages = aligned_nrpages(0, size); - iova_pfn = intel_alloc_iova(dev, domain, - dma_to_mm_pfn(nrpages), dma_mask); - if (!iova_pfn) - return DMA_MAPPING_ERROR; - - /* - * Check if DMAR supports zero-length reads on write only - * mappings.. - */ - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || - !cap_zlr(iommu->cap)) - prot |= DMA_PTE_READ; - if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) - prot |= DMA_PTE_WRITE; - - /* - * If both the physical buffer start address and size are - * page aligned, we don't need to use a bounce page. 
- */ - if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { - tlb_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), - paddr, size, aligned_size, dir, attrs); - if (tlb_addr == DMA_MAPPING_ERROR) { - goto swiotlb_error; - } else { - /* Cleanup the padding area. */ - void *padding_start = phys_to_virt(tlb_addr); - size_t padding_size = aligned_size; - - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) { - padding_start += size; - padding_size -= size; - } + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; - memset(padding_start, 0, padding_size); - } - } else { - tlb_addr = paddr; + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (atsru) { + list_del_rcu(&atsru->list); + synchronize_rcu(); + intel_iommu_free_atsr(atsru); } - ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), - tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot); - if (ret) - goto mapping_error; - - trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size); - - return (phys_addr_t)iova_pfn << PAGE_SHIFT; - -mapping_error: - if (is_swiotlb_buffer(tlb_addr)) - swiotlb_tbl_unmap_single(dev, tlb_addr, size, - aligned_size, dir, attrs); -swiotlb_error: - free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); - dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n", - size, (unsigned long long)paddr, dir); - - return DMA_MAPPING_ERROR; + return 0; } -static void -bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) +int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) { - size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); - struct dmar_domain *domain; - phys_addr_t tlb_addr; - - domain = find_domain(dev); - if (WARN_ON(!domain)) - return; + int i; + struct device *dev; + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; - tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr); - if (WARN_ON(!tlb_addr)) - return; + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (!atsru) + return 0; - intel_unmap(dev, dev_addr, size); - if (is_swiotlb_buffer(tlb_addr)) - swiotlb_tbl_unmap_single(dev, tlb_addr, size, - aligned_size, dir, attrs); + if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { + for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, + i, dev) + return -EBUSY; + } - trace_bounce_unmap_single(dev, dev_addr, size); + return 0; } -static dma_addr_t -bounce_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction dir, unsigned long attrs) +static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) { - return bounce_map_single(dev, page_to_phys(page) + offset, - size, dir, attrs, *dev->dma_mask); -} + struct dmar_satc_unit *satcu; + struct acpi_dmar_satc *tmp; -static dma_addr_t -bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - return bounce_map_single(dev, phys_addr, size, - dir, attrs, *dev->dma_mask); -} + list_for_each_entry_rcu(satcu, &dmar_satc_units, list, + dmar_rcu_check()) { + tmp = (struct acpi_dmar_satc *)satcu->hdr; + if (satc->segment != tmp->segment) + continue; + if (satc->header.length != tmp->header.length) + continue; + if (memcmp(satc, tmp, satc->header.length) == 0) + return satcu; + } -static void -bounce_unmap_page(struct device *dev, dma_addr_t 
dev_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - bounce_unmap_single(dev, dev_addr, size, dir, attrs); + return NULL; } -static void -bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) +int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) { - bounce_unmap_single(dev, dev_addr, size, dir, attrs); -} + struct acpi_dmar_satc *satc; + struct dmar_satc_unit *satcu; -static void -bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, - enum dma_data_direction dir, unsigned long attrs) -{ - struct scatterlist *sg; - int i; + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) + return 0; - for_each_sg(sglist, sg, nelems, i) - bounce_unmap_page(dev, sg->dma_address, - sg_dma_len(sg), dir, attrs); -} + satc = container_of(hdr, struct acpi_dmar_satc, header); + satcu = dmar_find_satc(satc); + if (satcu) + return 0; -static int -bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, - enum dma_data_direction dir, unsigned long attrs) -{ - int i; - struct scatterlist *sg; + satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); + if (!satcu) + return -ENOMEM; - for_each_sg(sglist, sg, nelems, i) { - sg->dma_address = bounce_map_page(dev, sg_page(sg), - sg->offset, sg->length, - dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) - goto out_unmap; - sg_dma_len(sg) = sg->length; + satcu->hdr = (void *)(satcu + 1); + memcpy(satcu->hdr, hdr, hdr->length); + satcu->atc_required = satc->flags & 0x1; + satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), + (void *)satc + satc->header.length, + &satcu->devices_cnt); + if (satcu->devices_cnt && !satcu->devices) { + kfree(satcu); + return -ENOMEM; } + list_add_rcu(&satcu->list, &dmar_satc_units); - return nelems; - -out_unmap: - bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return 0; } -static void -bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) +static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { - bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU); -} + int sp, ret; + struct intel_iommu *iommu = dmaru->iommu; -static void -bounce_sync_single_for_device(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE); -} + if (g_iommus[iommu->seq_id]) + return 0; -static void -bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; + ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); + if (ret) + goto out; - for_each_sg(sglist, sg, nelems, i) - bounce_sync_single(dev, sg_dma_address(sg), - sg_dma_len(sg), dir, SYNC_FOR_CPU); -} + if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { + pr_warn("%s: Doesn't support hardware pass through.\n", + iommu->name); + return -ENXIO; + } + if (!ecap_sc_support(iommu->ecap) && + domain_update_iommu_snooping(iommu)) { + pr_warn("%s: Doesn't support snooping.\n", + iommu->name); + return -ENXIO; + } + sp = domain_update_iommu_superpage(NULL, iommu) - 1; + if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { + pr_warn("%s: Doesn't support large page.\n", + iommu->name); + return -ENXIO; + } -static void -bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - 
for_each_sg(sglist, sg, nelems, i) - bounce_sync_single(dev, sg_dma_address(sg), - sg_dma_len(sg), dir, SYNC_FOR_DEVICE); -} - -static const struct dma_map_ops bounce_dma_ops = { - .alloc = intel_alloc_coherent, - .free = intel_free_coherent, - .map_sg = bounce_map_sg, - .unmap_sg = bounce_unmap_sg, - .map_page = bounce_map_page, - .unmap_page = bounce_unmap_page, - .sync_single_for_cpu = bounce_sync_single_for_cpu, - .sync_single_for_device = bounce_sync_single_for_device, - .sync_sg_for_cpu = bounce_sync_sg_for_cpu, - .sync_sg_for_device = bounce_sync_sg_for_device, - .map_resource = bounce_map_resource, - .unmap_resource = bounce_unmap_resource, - .dma_supported = dma_direct_supported, -}; + /* + * Disable translation if already enabled prior to OS handover. + */ + if (iommu->gcmd & DMA_GCMD_TE) + iommu_disable_translation(iommu); -static inline int iommu_domain_cache_init(void) -{ - int ret = 0; + g_iommus[iommu->seq_id] = iommu; + ret = iommu_init_domains(iommu); + if (ret == 0) + ret = iommu_alloc_root_entry(iommu); + if (ret) + goto out; - iommu_domain_cache = kmem_cache_create("iommu_domain", - sizeof(struct dmar_domain), - 0, - SLAB_HWCACHE_ALIGN, + intel_svm_check(iommu); - NULL); - if (!iommu_domain_cache) { - pr_err("Couldn't create iommu_domain cache\n"); - ret = -ENOMEM; + if (dmaru->ignored) { + /* + * we always have to disable PMRs or DMA may fail on this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + return 0; } - return ret; -} - -static inline int iommu_devinfo_cache_init(void) -{ - int ret = 0; + intel_iommu_init_qi(iommu); + iommu_flush_write_buffer(iommu); - iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", - sizeof(struct device_domain_info), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!iommu_devinfo_cache) { - pr_err("Couldn't create devinfo cache\n"); - ret = -ENOMEM; +#ifdef CONFIG_INTEL_IOMMU_SVM + if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { + ret = intel_svm_enable_prq(iommu); + if (ret) + goto disable_iommu; } +#endif + ret = dmar_set_interrupt(iommu, true); + if (ret) + goto disable_iommu; + iommu_set_root_entry(iommu); + iommu_enable_translation(iommu); + + iommu_disable_protect_mem_regions(iommu); + return 0; + +disable_iommu: + disable_dmar_iommu(iommu); +out: + free_dmar_iommu(iommu); return ret; } -static int __init iommu_init_mempool(void) +int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) { - int ret; - ret = iova_cache_get(); - if (ret) - return ret; - - ret = iommu_domain_cache_init(); - if (ret) - goto domain_error; - - ret = iommu_devinfo_cache_init(); - if (!ret) - return ret; + int ret = 0; + struct intel_iommu *iommu = dmaru->iommu; - kmem_cache_destroy(iommu_domain_cache); -domain_error: - iova_cache_put(); + if (!intel_iommu_enabled) + return 0; + if (iommu == NULL) + return -EINVAL; - return -ENOMEM; -} + if (insert) { + ret = intel_iommu_add(dmaru); + } else { + disable_dmar_iommu(iommu); + free_dmar_iommu(iommu); + } -static void __init iommu_exit_mempool(void) -{ - kmem_cache_destroy(iommu_devinfo_cache); - kmem_cache_destroy(iommu_domain_cache); - iova_cache_put(); + return ret; } -static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev) +static void intel_iommu_free_dmars(void) { - struct dmar_drhd_unit *drhd; - u32 vtbar; - int rc; + struct dmar_rmrr_unit *rmrru, *rmrr_n; + struct dmar_atsr_unit *atsru, *atsr_n; + struct dmar_satc_unit *satcu, *satc_n; - /* We know that this device on this chipset has its own IOMMU. 
- * If we find it under a different IOMMU, then the BIOS is lying - * to us. Hope that the IOMMU for this device is actually - * disabled, and it needs no translation... - */ - rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); - if (rc) { - /* "can't" happen */ - dev_info(&pdev->dev, "failed to run vt-d quirk\n"); - return; + list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { + list_del(&rmrru->list); + dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); + kfree(rmrru); } - vtbar &= 0xffff0000; - /* we know that the this iommu should be at offset 0xa000 from vtbar */ - drhd = dmar_find_matched_drhd_unit(pdev); - if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { - pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); - add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); - pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; + list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { + list_del(&atsru->list); + intel_iommu_free_atsr(atsru); + } + list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { + list_del(&satcu->list); + dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); + kfree(satcu); } } -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu); -static void __init init_no_remapping_devices(void) +/* dev_satc_state - Find if dev is in a DMAR SATC table + * + * return value: + * 1: dev is in SATC table and ATS is required + * 0: dev is in SATC table and ATS is optional + * -1: dev isn't in SATC table + */ +static int dev_satc_state(struct pci_dev *dev) { - struct dmar_drhd_unit *drhd; - struct device *dev; - int i; - - for_each_drhd_unit(drhd) { - if (!drhd->include_all) { - for_each_active_dev_scope(drhd->devices, - drhd->devices_cnt, i, dev) - break; - /* ignore DMAR unit if no devices exist */ - if (i == drhd->devices_cnt) - drhd->ignored = 1; - } - } + int i, ret = -1; + struct device *tmp; + struct dmar_satc_unit *satcu; + struct acpi_dmar_satc *satc; - for_each_active_drhd_unit(drhd) { - if (drhd->include_all) - continue; + dev = pci_physfn(dev); + rcu_read_lock(); - for_each_active_dev_scope(drhd->devices, - drhd->devices_cnt, i, dev) - if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) - break; - if (i < drhd->devices_cnt) + list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { + satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); + if (satc->segment != pci_domain_nr(dev->bus)) continue; - - /* This IOMMU has *only* gfx devices. 
Either bypass it or - set the gfx_mapped flag, as appropriate */ - if (!dmar_map_gfx) { - drhd->ignored = 1; - for_each_active_dev_scope(drhd->devices, - drhd->devices_cnt, i, dev) - dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; - } + for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) + if (to_pci_dev(tmp) == dev) { + if (satc->flags) + ret = 1; + else + ret = 0; + goto out; + } } +out: + rcu_read_unlock(); + return ret; } -#ifdef CONFIG_SUSPEND -static int init_iommu_hw(void) +int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) { - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu = NULL; + int i, ret = 1; + struct pci_bus *bus; + struct pci_dev *bridge = NULL; + struct device *tmp; + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; - for_each_active_iommu(iommu, drhd) - if (iommu->qi) - dmar_reenable_qi(iommu); + dev = pci_physfn(dev); + i = dev_satc_state(dev); + if (i >= 0) { + /* This dev supports ATS as it is in the SATC table. + * When the IOMMU is in legacy mode, enabling ATS is done + * automatically by HW for the device that requires + * ATS, hence the OS should not enable ATS for this device + * to avoid duplicate TLB invalidations. + */ + if (i && !sm_supported(iommu)) + ret = 0; + return ret; + } - for_each_iommu(iommu, drhd) { - if (drhd->ignored) { - /* - * we always have to disable PMRs or DMA may fail on - * this device - */ - if (force_on) - iommu_disable_protect_mem_regions(iommu); - continue; - } + for (bus = dev->bus; bus; bus = bus->parent) { + bridge = bus->self; + /* If it's an integrated device, allow ATS */ + if (!bridge) + return 1; + /* Connected via non-PCIe: no ATS */ + if (!pci_is_pcie(bridge) || + pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) + return 0; + /* If we found the root port, look it up in the ATSR */ + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) + break; + } - iommu_flush_write_buffer(iommu); + rcu_read_lock(); + list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, + dmar_rcu_check()) { + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + if (atsr->segment != pci_domain_nr(dev->bus)) + continue; - iommu_set_root_entry(iommu); + for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) + if (tmp == &bridge->dev) + goto out; - iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); - iommu_enable_translation(iommu); - iommu_disable_protect_mem_regions(iommu); + if (atsru->include_all) + goto out; } + ret = 0; +out: + rcu_read_unlock(); - return 0; + return ret; } -static void iommu_flush_all(void) +int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) { - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; + int ret; + struct dmar_rmrr_unit *rmrru; + struct dmar_atsr_unit *atsru; + struct dmar_satc_unit *satcu; + struct acpi_dmar_atsr *atsr; + struct acpi_dmar_reserved_memory *rmrr; + struct acpi_dmar_satc *satc; - for_each_active_iommu(iommu, drhd) { - iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH); - } -} + if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) + return 0; -static int iommu_suspend(void) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu = NULL; - unsigned long flag; + list_for_each_entry(rmrru, &dmar_rmrr_units, list) { + rmrr = container_of(rmrru->hdr, + struct acpi_dmar_reserved_memory, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = 
dmar_insert_dev_scope(info, (void *)(rmrr + 1), + ((void *)rmrr) + rmrr->header.length, + rmrr->segment, rmrru->devices, + rmrru->devices_cnt); + if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + dmar_remove_dev_scope(info, rmrr->segment, + rmrru->devices, rmrru->devices_cnt); + } + } - for_each_active_iommu(iommu, drhd) { - iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), - GFP_ATOMIC); - if (!iommu->iommu_state) - goto nomem; + list_for_each_entry(atsru, &dmar_atsr_units, list) { + if (atsru->include_all) + continue; + + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), + (void *)atsr + atsr->header.length, + atsr->segment, atsru->devices, + atsru->devices_cnt); + if (ret > 0) + break; + else if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + if (dmar_remove_dev_scope(info, atsr->segment, + atsru->devices, atsru->devices_cnt)) + break; + } + } + list_for_each_entry(satcu, &dmar_satc_units, list) { + satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(satc + 1), + (void *)satc + satc->header.length, + satc->segment, satcu->devices, + satcu->devices_cnt); + if (ret > 0) + break; + else if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + if (dmar_remove_dev_scope(info, satc->segment, + satcu->devices, satcu->devices_cnt)) + break; + } } - iommu_flush_all(); + return 0; +} - for_each_active_iommu(iommu, drhd) { - iommu_disable_translation(iommu); +static int intel_iommu_memory_notifier(struct notifier_block *nb, + unsigned long val, void *v) +{ + struct memory_notify *mhp = v; + unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); + unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + + mhp->nr_pages - 1); - raw_spin_lock_irqsave(&iommu->register_lock, flag); + switch (val) { + case MEM_GOING_ONLINE: + if (iommu_domain_identity_map(si_domain, + start_vpfn, last_vpfn)) { + pr_warn("Failed to build identity map for [%lx-%lx]\n", + start_vpfn, last_vpfn); + return NOTIFY_BAD; + } + break; - iommu->iommu_state[SR_DMAR_FECTL_REG] = - readl(iommu->reg + DMAR_FECTL_REG); - iommu->iommu_state[SR_DMAR_FEDATA_REG] = - readl(iommu->reg + DMAR_FEDATA_REG); - iommu->iommu_state[SR_DMAR_FEADDR_REG] = - readl(iommu->reg + DMAR_FEADDR_REG); - iommu->iommu_state[SR_DMAR_FEUADDR_REG] = - readl(iommu->reg + DMAR_FEUADDR_REG); + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + { + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + struct page *freelist; - raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + freelist = domain_unmap(si_domain, + start_vpfn, last_vpfn, + NULL); + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + iommu_flush_iotlb_psi(iommu, si_domain, + start_vpfn, mhp->nr_pages, + !freelist, 0); + rcu_read_unlock(); + dma_free_pagelist(freelist); + } + break; } - return 0; -nomem: - for_each_active_iommu(iommu, drhd) - kfree(iommu->iommu_state); + return NOTIFY_OK; +} - return -ENOMEM; +static struct notifier_block intel_iommu_memory_nb = { + .notifier_call = intel_iommu_memory_notifier, + .priority = 0 +}; + +static void intel_disable_iommus(void) +{ + struct intel_iommu *iommu = NULL; + struct dmar_drhd_unit *drhd; + + for_each_iommu(iommu, drhd) + iommu_disable_translation(iommu); } -static void iommu_resume(void) +void 
intel_iommu_shutdown(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; - unsigned long flag; - if (init_iommu_hw()) { - if (force_on) - panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); - else - WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); + if (no_iommu || dmar_disabled) return; - } - for_each_active_iommu(iommu, drhd) { + down_write(&dmar_global_lock); - raw_spin_lock_irqsave(&iommu->register_lock, flag); + /* Disable PMRs explicitly here. */ + for_each_iommu(iommu, drhd) + iommu_disable_protect_mem_regions(iommu); - writel(iommu->iommu_state[SR_DMAR_FECTL_REG], - iommu->reg + DMAR_FECTL_REG); - writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], - iommu->reg + DMAR_FEDATA_REG); - writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], - iommu->reg + DMAR_FEADDR_REG); - writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], - iommu->reg + DMAR_FEUADDR_REG); + /* Make sure the IOMMUs are switched off */ + intel_disable_iommus(); - raw_spin_unlock_irqrestore(&iommu->register_lock, flag); - } + up_write(&dmar_global_lock); +} - for_each_active_iommu(iommu, drhd) - kfree(iommu->iommu_state); +static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) +{ + struct iommu_device *iommu_dev = dev_to_iommu_device(dev); + + return container_of(iommu_dev, struct intel_iommu, iommu); } -static struct syscore_ops iommu_syscore_ops = { - .resume = iommu_resume, - .suspend = iommu_suspend, -}; +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + u32 ver = readl(iommu->reg + DMAR_VER_REG); + return sprintf(buf, "%d:%d\n", + DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); +} +static DEVICE_ATTR_RO(version); -static void __init init_iommu_pm_ops(void) +static ssize_t address_show(struct device *dev, + struct device_attribute *attr, char *buf) { - register_syscore_ops(&iommu_syscore_ops); + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sprintf(buf, "%llx\n", iommu->reg_phys); } +static DEVICE_ATTR_RO(address); -#else -static inline void init_iommu_pm_ops(void) {} -#endif /* CONFIG_PM */ +static ssize_t cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sprintf(buf, "%llx\n", iommu->cap); +} +static DEVICE_ATTR_RO(cap); -int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) +static ssize_t ecap_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct acpi_dmar_reserved_memory *rmrr; - struct dmar_rmrr_unit *rmrru; + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sprintf(buf, "%llx\n", iommu->ecap); +} +static DEVICE_ATTR_RO(ecap); - rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); - if (!rmrru) - goto out; +static ssize_t domains_supported_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); +} +static DEVICE_ATTR_RO(domains_supported); - rmrru->hdr = header; - rmrr = (struct acpi_dmar_reserved_memory *)header; - rmrru->base_address = rmrr->base_address; - rmrru->end_address = rmrr->end_address; +static ssize_t domains_used_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, + cap_ndoms(iommu->cap))); +} +static DEVICE_ATTR_RO(domains_used); - rmrru->devices = 
dmar_alloc_dev_scope((void *)(rmrr + 1), - ((void *)rmrr) + rmrr->header.length, - &rmrru->devices_cnt); - if (rmrru->devices_cnt && rmrru->devices == NULL) - goto free_rmrru; +static struct attribute *intel_iommu_attrs[] = { + &dev_attr_version.attr, + &dev_attr_address.attr, + &dev_attr_cap.attr, + &dev_attr_ecap.attr, + &dev_attr_domains_supported.attr, + &dev_attr_domains_used.attr, + NULL, +}; - list_add(&rmrru->list, &dmar_rmrr_units); +static struct attribute_group intel_iommu_group = { + .name = "intel-iommu", + .attrs = intel_iommu_attrs, +}; - return 0; -free_rmrru: - kfree(rmrru); -out: - return -ENOMEM; +const struct attribute_group *intel_iommu_groups[] = { + &intel_iommu_group, + NULL, +}; + +static inline bool has_external_pci(void) +{ + struct pci_dev *pdev = NULL; + + for_each_pci_dev(pdev) + if (pdev->external_facing) + return true; + + return false; } -static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) +static int __init platform_optin_force_iommu(void) { - struct dmar_atsr_unit *atsru; - struct acpi_dmar_atsr *tmp; + if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) + return 0; - list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, - dmar_rcu_check()) { - tmp = (struct acpi_dmar_atsr *)atsru->hdr; - if (atsr->segment != tmp->segment) - continue; - if (atsr->header.length != tmp->header.length) - continue; - if (memcmp(atsr, tmp, atsr->header.length) == 0) - return atsru; - } + if (no_iommu || dmar_disabled) + pr_info("Intel-IOMMU force enabled due to platform opt in\n"); - return NULL; + /* + * If Intel-IOMMU is disabled by default, we will apply identity + * map for all devices except those marked as being untrusted. + */ + if (dmar_disabled) + iommu_set_default_passthrough(false); + + dmar_disabled = 0; + no_iommu = 0; + + return 1; } -int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) +static int __init probe_acpi_namespace_devices(void) { - struct acpi_dmar_atsr *atsr; - struct dmar_atsr_unit *atsru; + struct dmar_drhd_unit *drhd; + /* To avoid a -Wunused-but-set-variable warning. */ + struct intel_iommu *iommu __maybe_unused; + struct device *dev; + int i, ret = 0; - if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) - return 0; + for_each_active_iommu(iommu, drhd) { + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) { + struct acpi_device_physical_node *pn; + struct iommu_group *group; + struct acpi_device *adev; - atsr = container_of(hdr, struct acpi_dmar_atsr, header); - atsru = dmar_find_atsr(atsr); - if (atsru) - return 0; + if (dev->bus != &acpi_bus_type) + continue; - atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); - if (!atsru) + adev = to_acpi_device(dev); + mutex_lock(&adev->physical_node_lock); + list_for_each_entry(pn, + &adev->physical_node_list, node) { + group = iommu_group_get(pn->dev); + if (group) { + iommu_group_put(group); + continue; + } + + pn->dev->bus->iommu_ops = &intel_iommu_ops; + ret = iommu_probe_device(pn->dev); + if (ret) + break; + } + mutex_unlock(&adev->physical_node_lock); + + if (ret) + return ret; + } + } + + return 0; +} + +int __init intel_iommu_init(void) +{ + int ret = -ENODEV; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + /* + * Intel IOMMU is required for a TXT/tboot launch or platform + * opt in, so enforce that. 
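+ * Note that force_on is consulted again below: when it is set, later initialization failures panic instead of being merely logged.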
+ */ + force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || + platform_optin_force_iommu(); + + if (iommu_init_mempool()) { + if (force_on) + panic("tboot: Failed to initialize iommu memory\n"); return -ENOMEM; + } + + down_write(&dmar_global_lock); + if (dmar_table_init()) { + if (force_on) + panic("tboot: Failed to initialize DMAR table\n"); + goto out_free_dmar; + } + + if (dmar_dev_scope_init() < 0) { + if (force_on) + panic("tboot: Failed to initialize DMAR device scope\n"); + goto out_free_dmar; + } + + up_write(&dmar_global_lock); /* - * If memory is allocated from slab by ACPI _DSM method, we need to - * copy the memory content because the memory buffer will be freed - * on return. + * The bus notifier takes the dmar_global_lock, so lockdep will + * complain later when we register it under the lock. */ - atsru->hdr = (void *)(atsru + 1); - memcpy(atsru->hdr, hdr, hdr->length); - atsru->include_all = atsr->flags & 0x1; - if (!atsru->include_all) { - atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), - (void *)atsr + atsr->header.length, - &atsru->devices_cnt); - if (atsru->devices_cnt && atsru->devices == NULL) { - kfree(atsru); - return -ENOMEM; + dmar_register_bus_notifier(); + + down_write(&dmar_global_lock); + + if (!no_iommu) + intel_iommu_debugfs_init(); + + if (no_iommu || dmar_disabled) { + /* + * We exit the function here to ensure IOMMU's remapping and + * mempool aren't setup, which means that the IOMMU's PMRs + * won't be disabled via the call to init_dmars(). So disable + * it explicitly here. The PMRs were setup by tboot prior to + * calling SENTER, but the kernel is expected to reset/tear + * down the PMRs. + */ + if (intel_iommu_tboot_noforce) { + for_each_iommu(iommu, drhd) + iommu_disable_protect_mem_regions(iommu); } + + /* + * Make sure the IOMMUs are switched off, even when we + * boot into a kexec kernel and the previous kernel left + * them enabled + */ + intel_disable_iommus(); + goto out_free_dmar; + } + + if (list_empty(&dmar_rmrr_units)) + pr_info("No RMRR found\n"); + + if (list_empty(&dmar_atsr_units)) + pr_info("No ATSR found\n"); + + if (list_empty(&dmar_satc_units)) + pr_info("No SATC found\n"); + + if (dmar_map_gfx) + intel_iommu_gfx_mapped = 1; + + init_no_remapping_devices(); + + ret = init_dmars(); + if (ret) { + if (force_on) + panic("tboot: Failed to initialize DMARs\n"); + pr_err("Initialization failed\n"); + goto out_free_dmar; + } + up_write(&dmar_global_lock); + + init_iommu_pm_ops(); + + down_read(&dmar_global_lock); + for_each_active_iommu(iommu, drhd) { + iommu_device_sysfs_add(&iommu->iommu, NULL, + intel_iommu_groups, + "%s", iommu->name); + iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); + iommu_device_register(&iommu->iommu); + } + up_read(&dmar_global_lock); + + bus_set_iommu(&pci_bus_type, &intel_iommu_ops); + if (si_domain && !hw_pass_through) + register_memory_notifier(&intel_iommu_memory_nb); + + down_read(&dmar_global_lock); + if (probe_acpi_namespace_devices()) + pr_warn("ACPI name space devices didn't probe correctly\n"); + + /* Finally, we enable the DMA remapping hardware. 
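+ * Translation is only turned on for units that are not ignored and were not already enabled by firmware; PMRs are disabled for every unit either way.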
*/ + for_each_iommu(iommu, drhd) { + if (!drhd->ignored && !translation_pre_enabled(iommu)) + iommu_enable_translation(iommu); + + iommu_disable_protect_mem_regions(iommu); } + up_read(&dmar_global_lock); - list_add_rcu(&atsru->list, &dmar_atsr_units); + pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); + + intel_iommu_enabled = 1; return 0; -} -static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) -{ - dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); - kfree(atsru); +out_free_dmar: + intel_iommu_free_dmars(); + up_write(&dmar_global_lock); + iommu_exit_mempool(); + return ret; } -int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) +static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) { - struct acpi_dmar_atsr *atsr; - struct dmar_atsr_unit *atsru; - - atsr = container_of(hdr, struct acpi_dmar_atsr, header); - atsru = dmar_find_atsr(atsr); - if (atsru) { - list_del_rcu(&atsru->list); - synchronize_rcu(); - intel_iommu_free_atsr(atsru); - } + struct device_domain_info *info = opaque; + domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); return 0; } -int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) +/* + * NB - intel-iommu lacks any sort of reference counting for the users of + * dependent devices. If multiple endpoints have intersecting dependent + * devices, unbinding the driver from any one of them will possibly leave + * the others unable to operate. + */ +static void domain_context_clear(struct device_domain_info *info) { - int i; - struct device *dev; - struct acpi_dmar_atsr *atsr; - struct dmar_atsr_unit *atsru; - - atsr = container_of(hdr, struct acpi_dmar_atsr, header); - atsru = dmar_find_atsr(atsr); - if (!atsru) - return 0; - - if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { - for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, - i, dev) - return -EBUSY; - } + if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) + return; - return 0; + pci_for_each_dma_alias(to_pci_dev(info->dev), + &domain_context_clear_one_cb, info); } -static int intel_iommu_add(struct dmar_drhd_unit *dmaru) +static void __dmar_remove_one_dev_info(struct device_domain_info *info) { - int sp, ret; - struct intel_iommu *iommu = dmaru->iommu; - - if (g_iommus[iommu->seq_id]) - return 0; + struct dmar_domain *domain; + struct intel_iommu *iommu; + unsigned long flags; - if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { - pr_warn("%s: Doesn't support hardware pass through.\n", - iommu->name); - return -ENXIO; - } - if (!ecap_sc_support(iommu->ecap) && - domain_update_iommu_snooping(iommu)) { - pr_warn("%s: Doesn't support snooping.\n", - iommu->name); - return -ENXIO; - } - sp = domain_update_iommu_superpage(iommu) - 1; - if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { - pr_warn("%s: Doesn't support large page.\n", - iommu->name); - return -ENXIO; - } + assert_spin_locked(&device_domain_lock); - /* - * Disable translation if already enabled prior to OS handover. 
- */ - if (iommu->gcmd & DMA_GCMD_TE) - iommu_disable_translation(iommu); + if (WARN_ON(!info)) + return; - g_iommus[iommu->seq_id] = iommu; - ret = iommu_init_domains(iommu); - if (ret == 0) - ret = iommu_alloc_root_entry(iommu); - if (ret) - goto out; + iommu = info->iommu; + domain = info->domain; -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) - intel_svm_init(iommu); -#endif + if (info->dev) { + if (dev_is_pci(info->dev) && sm_supported(iommu)) + intel_pasid_tear_down_entry(iommu, info->dev, + PASID_RID2PASID, false, false); - if (dmaru->ignored) { - /* - * we always have to disable PMRs or DMA may fail on this device - */ - if (force_on) - iommu_disable_protect_mem_regions(iommu); - return 0; + iommu_disable_dev_iotlb(info); + if (!dev_is_real_dma_subdevice(info->dev)) + domain_context_clear(info); + intel_pasid_free_table(info->dev); } - intel_iommu_init_qi(iommu); - iommu_flush_write_buffer(iommu); + unlink_domain_info(info); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { - ret = intel_svm_enable_prq(iommu); - if (ret) - goto disable_iommu; - } -#endif - ret = dmar_set_interrupt(iommu); - if (ret) - goto disable_iommu; + spin_lock_irqsave(&iommu->lock, flags); + domain_detach_iommu(domain, iommu); + spin_unlock_irqrestore(&iommu->lock, flags); - iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); - iommu_enable_translation(iommu); + free_devinfo_mem(info); +} - iommu_disable_protect_mem_regions(iommu); - return 0; +static void dmar_remove_one_dev_info(struct device *dev) +{ + struct device_domain_info *info; + unsigned long flags; -disable_iommu: - disable_dmar_iommu(iommu); -out: - free_dmar_iommu(iommu); - return ret; + spin_lock_irqsave(&device_domain_lock, flags); + info = get_domain_info(dev); + if (info) + __dmar_remove_one_dev_info(info); + spin_unlock_irqrestore(&device_domain_lock, flags); } -int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) +static int md_domain_init(struct dmar_domain *domain, int guest_width) { - int ret = 0; - struct intel_iommu *iommu = dmaru->iommu; + int adjust_width; - if (!intel_iommu_enabled) - return 0; - if (iommu == NULL) - return -EINVAL; + /* calculate AGAW */ + domain->gaw = guest_width; + adjust_width = guestwidth_to_adjustwidth(guest_width); + domain->agaw = width_to_agaw(adjust_width); - if (insert) { - ret = intel_iommu_add(dmaru); - } else { - disable_dmar_iommu(iommu); - free_dmar_iommu(iommu); - } + domain->iommu_coherency = false; + domain->iommu_snooping = false; + domain->iommu_superpage = 0; + domain->max_addr = 0; - return ret; + /* always allocate the top pgd */ + domain->pgd = alloc_pgtable_page(domain->nid); + if (!domain->pgd) + return -ENOMEM; + domain_flush_cache(domain, domain->pgd, PAGE_SIZE); + return 0; } -static void intel_iommu_free_dmars(void) +static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) { - struct dmar_rmrr_unit *rmrru, *rmrr_n; - struct dmar_atsr_unit *atsru, *atsr_n; - - list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { - list_del(&rmrru->list); - dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); - kfree(rmrru); - } + struct dmar_domain *dmar_domain; + struct iommu_domain *domain; - list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { - list_del(&atsru->list); - intel_iommu_free_atsr(atsru); - } -} + switch (type) { + case IOMMU_DOMAIN_DMA: + case IOMMU_DOMAIN_DMA_FQ: + case 
IOMMU_DOMAIN_UNMANAGED: + dmar_domain = alloc_domain(type); + if (!dmar_domain) { + pr_err("Can't allocate dmar_domain\n"); + return NULL; + } + if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + pr_err("Domain initialization failed\n"); + domain_exit(dmar_domain); + return NULL; + } -int dmar_find_matched_atsr_unit(struct pci_dev *dev) -{ - int i, ret = 1; - struct pci_bus *bus; - struct pci_dev *bridge = NULL; - struct device *tmp; - struct acpi_dmar_atsr *atsr; - struct dmar_atsr_unit *atsru; + domain = &dmar_domain->domain; + domain->geometry.aperture_start = 0; + domain->geometry.aperture_end = + __DOMAIN_MAX_ADDR(dmar_domain->gaw); + domain->geometry.force_aperture = true; - dev = pci_physfn(dev); - for (bus = dev->bus; bus; bus = bus->parent) { - bridge = bus->self; - /* If it's an integrated device, allow ATS */ - if (!bridge) - return 1; - /* Connected via non-PCIe: no ATS */ - if (!pci_is_pcie(bridge) || - pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) - return 0; - /* If we found the root port, look it up in the ATSR */ - if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) - break; + return domain; + case IOMMU_DOMAIN_IDENTITY: + return &si_domain->domain; + default: + return NULL; } - rcu_read_lock(); - list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { - atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); - if (atsr->segment != pci_domain_nr(dev->bus)) - continue; + return NULL; +} - for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) - if (tmp == &bridge->dev) - goto out; +static void intel_iommu_domain_free(struct iommu_domain *domain) +{ + if (domain != &si_domain->domain) + domain_exit(to_dmar_domain(domain)); +} - if (atsru->include_all) - goto out; - } - ret = 0; -out: - rcu_read_unlock(); +/* + * Check whether a @domain could be attached to the @dev through the + * aux-domain attach/detach APIs. 
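+ * This is the case only when the device has auxiliary PASID support enabled (info->auxd_enabled) and the domain is an unmanaged one.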
+ */ +inline bool is_aux_domain(struct device *dev, + struct iommu_domain *domain) +{ + struct device_domain_info *info = get_domain_info(dev); - return ret; + return info && info->auxd_enabled && + domain->type == IOMMU_DOMAIN_UNMANAGED; } -int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) +static inline struct subdev_domain_info * +lookup_subdev_info(struct dmar_domain *domain, struct device *dev) { - int ret; - struct dmar_rmrr_unit *rmrru; - struct dmar_atsr_unit *atsru; - struct acpi_dmar_atsr *atsr; - struct acpi_dmar_reserved_memory *rmrr; - - if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) - return 0; + struct subdev_domain_info *sinfo; - list_for_each_entry(rmrru, &dmar_rmrr_units, list) { - rmrr = container_of(rmrru->hdr, - struct acpi_dmar_reserved_memory, header); - if (info->event == BUS_NOTIFY_ADD_DEVICE) { - ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), - ((void *)rmrr) + rmrr->header.length, - rmrr->segment, rmrru->devices, - rmrru->devices_cnt); - if (ret < 0) - return ret; - } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { - dmar_remove_dev_scope(info, rmrr->segment, - rmrru->devices, rmrru->devices_cnt); + if (!list_empty(&domain->subdevices)) { + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + if (sinfo->pdev == dev) + return sinfo; } } - list_for_each_entry(atsru, &dmar_atsr_units, list) { - if (atsru->include_all) - continue; + return NULL; +} - atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); - if (info->event == BUS_NOTIFY_ADD_DEVICE) { - ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), - (void *)atsr + atsr->header.length, - atsr->segment, atsru->devices, - atsru->devices_cnt); - if (ret > 0) - break; - else if (ret < 0) - return ret; - } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { - if (dmar_remove_dev_scope(info, atsr->segment, - atsru->devices, atsru->devices_cnt)) - break; - } +static int auxiliary_link_device(struct dmar_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); + + assert_spin_locked(&device_domain_lock); + if (WARN_ON(!info)) + return -EINVAL; + + if (!sinfo) { + sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); + if (!sinfo) + return -ENOMEM; + sinfo->domain = domain; + sinfo->pdev = dev; + list_add(&sinfo->link_phys, &info->subdevices); + list_add(&sinfo->link_domain, &domain->subdevices); } - return 0; + return ++sinfo->users; } -static int intel_iommu_memory_notifier(struct notifier_block *nb, - unsigned long val, void *v) +static int auxiliary_unlink_device(struct dmar_domain *domain, + struct device *dev) { - struct memory_notify *mhp = v; - unsigned long long start, end; - unsigned long start_vpfn, last_vpfn; + struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); + int ret; - switch (val) { - case MEM_GOING_ONLINE: - start = mhp->start_pfn << PAGE_SHIFT; - end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1; - if (iommu_domain_identity_map(si_domain, start, end)) { - pr_warn("Failed to build identity map for [%llx-%llx]\n", - start, end); - return NOTIFY_BAD; - } - break; + assert_spin_locked(&device_domain_lock); + if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) + return -EINVAL; - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - start_vpfn = mm_to_dma_pfn(mhp->start_pfn); - last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1); - while (start_vpfn <= 
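+ * The failure paths below detach the domain again under this same lock.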
last_vpfn) { - struct iova *iova; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - struct page *freelist; + ret = --sinfo->users; + if (!ret) { + list_del(&sinfo->link_phys); + list_del(&sinfo->link_domain); + kfree(sinfo); + } - iova = find_iova(&si_domain->iovad, start_vpfn); - if (iova == NULL) { - pr_debug("Failed get IOVA for PFN %lx\n", - start_vpfn); - break; - } + return ret; +} - iova = split_and_remove_iova(&si_domain->iovad, iova, - start_vpfn, last_vpfn); - if (iova == NULL) { - pr_warn("Failed to split IOVA PFN [%lx-%lx]\n", - start_vpfn, last_vpfn); - return NOTIFY_BAD; - } +static int aux_domain_add_dev(struct dmar_domain *domain, + struct device *dev) +{ + int ret; + unsigned long flags; + struct intel_iommu *iommu; - freelist = domain_unmap(si_domain, iova->pfn_lo, - iova->pfn_hi); + iommu = device_to_iommu(dev, NULL, NULL); + if (!iommu) + return -ENODEV; - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) - iommu_flush_iotlb_psi(iommu, si_domain, - iova->pfn_lo, iova_size(iova), - !freelist, 0); - rcu_read_unlock(); - dma_free_pagelist(freelist); + if (domain->default_pasid <= 0) { + u32 pasid; - start_vpfn = iova->pfn_hi + 1; - free_iova_mem(iova); + /* No private data needed for the default pasid */ + pasid = ioasid_alloc(host_pasid_set, IOASID_ALLOC_BASE, + pci_max_pasids(to_pci_dev(dev)) - 1, + NULL); + if (pasid == INVALID_IOASID) { + pr_err("Can't allocate default pasid\n"); + return -ENODEV; } - break; + domain->default_pasid = pasid; } - return NOTIFY_OK; -} + spin_lock_irqsave(&device_domain_lock, flags); + ret = auxiliary_link_device(domain, dev); + if (ret <= 0) + goto link_failed; -static struct notifier_block intel_iommu_memory_nb = { - .notifier_call = intel_iommu_memory_notifier, - .priority = 0 -}; + /* + * Subdevices from the same physical device can be attached to the + * same domain. For such cases, only the first subdevice attachment + * needs to go through the full steps in this function. So if ret > + * 1, just goto out. + */ + if (ret > 1) + goto out; -static void free_all_cpu_cached_iovas(unsigned int cpu) -{ - int i; + /* + * iommu->lock must be held to attach domain to iommu and setup the + * pasid entry for second level translation. 
+ */ + spin_lock(&iommu->lock); + ret = domain_attach_iommu(domain, iommu); + if (ret) + goto attach_failed; - for (i = 0; i < g_num_of_iommus; i++) { - struct intel_iommu *iommu = g_iommus[i]; - struct dmar_domain *domain; - int did; + /* Setup the PASID entry for mediated devices: */ + if (domain_use_first_level(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + domain->default_pasid); + else + ret = intel_pasid_setup_second_level(iommu, domain, dev, + domain->default_pasid); + if (ret) + goto table_failed; - if (!iommu) - continue; + spin_unlock(&iommu->lock); +out: + spin_unlock_irqrestore(&device_domain_lock, flags); - for (did = 0; did < cap_ndoms(iommu->cap); did++) { - domain = get_iommu_domain(iommu, (u16)did); + return 0; - if (!domain) - continue; - free_cpu_cached_iovas(cpu, &domain->iovad); - } - } -} +table_failed: + domain_detach_iommu(domain, iommu); +attach_failed: + spin_unlock(&iommu->lock); + auxiliary_unlink_device(domain, dev); +link_failed: + spin_unlock_irqrestore(&device_domain_lock, flags); + if (list_empty(&domain->subdevices) && domain->default_pasid > 0) + ioasid_put(host_pasid_set, domain->default_pasid); -static int intel_iommu_cpu_dead(unsigned int cpu) -{ - free_all_cpu_cached_iovas(cpu); - return 0; + return ret; } -static void intel_disable_iommus(void) +static void aux_domain_remove_dev(struct dmar_domain *domain, + struct device *dev) { - struct intel_iommu *iommu = NULL; - struct dmar_drhd_unit *drhd; + struct device_domain_info *info; + struct intel_iommu *iommu; + unsigned long flags; - for_each_iommu(iommu, drhd) - iommu_disable_translation(iommu); -} + if (!is_aux_domain(dev, &domain->domain)) + return; -static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) -{ - struct iommu_device *iommu_dev = dev_to_iommu_device(dev); + spin_lock_irqsave(&device_domain_lock, flags); + info = get_domain_info(dev); + iommu = info->iommu; - return container_of(iommu_dev, struct intel_iommu, iommu); -} + if (!auxiliary_unlink_device(domain, dev)) { + spin_lock(&iommu->lock); + intel_pasid_tear_down_entry(iommu, dev, + domain->default_pasid, false, false); + domain_detach_iommu(domain, iommu); + spin_unlock(&iommu->lock); + } -static ssize_t intel_iommu_show_version(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct intel_iommu *iommu = dev_to_intel_iommu(dev); - u32 ver = readl(iommu->reg + DMAR_VER_REG); - return sprintf(buf, "%d:%d\n", - DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); -} -static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); + spin_unlock_irqrestore(&device_domain_lock, flags); -static ssize_t intel_iommu_show_address(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct intel_iommu *iommu = dev_to_intel_iommu(dev); - return sprintf(buf, "%llx\n", iommu->reg_phys); + if (list_empty(&domain->subdevices) && domain->default_pasid > 0) + ioasid_put(host_pasid_set, domain->default_pasid); } -static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); -static ssize_t intel_iommu_show_cap(struct device *dev, - struct device_attribute *attr, - char *buf) +static int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev) { - struct intel_iommu *iommu = dev_to_intel_iommu(dev); - return sprintf(buf, "%llx\n", iommu->cap); -} -static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu; + int addr_width; -static ssize_t 
intel_iommu_show_ecap(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct intel_iommu *iommu = dev_to_intel_iommu(dev); - return sprintf(buf, "%llx\n", iommu->ecap); -} -static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); + iommu = device_to_iommu(dev, NULL, NULL); + if (!iommu) + return -ENODEV; -static ssize_t intel_iommu_show_ndoms(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct intel_iommu *iommu = dev_to_intel_iommu(dev); - return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); + if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) && + !ecap_nest(iommu->ecap)) { + dev_err(dev, "%s: iommu does not support nested translation\n", + iommu->name); + return -EINVAL; + } + + /* check if this iommu agaw is sufficient for max mapped address */ + addr_width = agaw_to_width(iommu->agaw); + if (addr_width > cap_mgaw(iommu->cap)) + addr_width = cap_mgaw(iommu->cap); + + if (dmar_domain->max_addr > (1LL << addr_width)) { + dev_err(dev, "%s: iommu width (%d) is not " + "sufficient for the mapped address (%llx)\n", + __func__, addr_width, dmar_domain->max_addr); + return -EFAULT; + } + dmar_domain->gaw = addr_width; + + /* + * Knock out extra levels of page tables if necessary + */ + while (iommu->agaw < dmar_domain->agaw) { + struct dma_pte *pte; + + pte = dmar_domain->pgd; + if (dma_pte_present(pte)) { + dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); + free_pgtable_page(pte); + } + dmar_domain->agaw--; + } + + return 0; } -static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); -static ssize_t intel_iommu_show_ndoms_used(struct device *dev, - struct device_attribute *attr, - char *buf) +static int intel_iommu_attach_device(struct iommu_domain *domain, + struct device *dev) { - struct intel_iommu *iommu = dev_to_intel_iommu(dev); - return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, - cap_ndoms(iommu->cap))); -} -static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); + int ret; -static struct attribute *intel_iommu_attrs[] = { - &dev_attr_version.attr, - &dev_attr_address.attr, - &dev_attr_cap.attr, - &dev_attr_ecap.attr, - &dev_attr_domains_supported.attr, - &dev_attr_domains_used.attr, - NULL, -}; + if (domain->type == IOMMU_DOMAIN_UNMANAGED && + device_is_rmrr_locked(dev)) { + dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); + return -EPERM; + } -static struct attribute_group intel_iommu_group = { - .name = "intel-iommu", - .attrs = intel_iommu_attrs, -}; + if (is_aux_domain(dev, domain)) + return -EPERM; -const struct attribute_group *intel_iommu_groups[] = { - &intel_iommu_group, - NULL, -}; + /* normally dev is not mapped */ + if (unlikely(domain_context_mapped(dev))) { + struct dmar_domain *old_domain; + + old_domain = find_domain(dev); + if (old_domain) + dmar_remove_one_dev_info(dev); + } + + ret = prepare_domain_attach_device(domain, dev); + if (ret) + return ret; + + return domain_add_dev_info(to_dmar_domain(domain), dev); +} -static inline bool has_untrusted_dev(void) +static int intel_iommu_aux_attach_device(struct iommu_domain *domain, + struct device *dev) { - struct pci_dev *pdev = NULL; + int ret; - for_each_pci_dev(pdev) - if (pdev->untrusted) - return true; + if (!is_aux_domain(dev, domain)) + return -EPERM; - return false; + ret = prepare_domain_attach_device(domain, dev); + if (ret) + return ret; + + return aux_domain_add_dev(to_dmar_domain(domain), dev); } -static int __init platform_optin_force_iommu(void) +static void intel_iommu_detach_device(struct iommu_domain *domain, + struct device *dev) { - if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev()) - return 0; + dmar_remove_one_dev_info(dev); +} - if (no_iommu || dmar_disabled) - pr_info("Intel-IOMMU force enabled due to platform opt in\n"); +static void intel_iommu_aux_detach_device(struct iommu_domain *domain, + struct device *dev) +{ + aux_domain_remove_dev(to_dmar_domain(domain), dev); +} + +#ifdef CONFIG_INTEL_IOMMU_SVM +/* + * 2D array for converting and sanitizing IOMMU generic TLB granularity to + * VT-d granularity. Invalidation is typically included in the unmap operation + * as a result of DMA or VFIO unmap. However, for assigned devices the guest + * owns the first level page tables. Invalidations of translation caches in the + * guest are trapped and passed down to the host. + * + * vIOMMU in the guest will only expose first level page tables, therefore + * we do not support IOTLB granularity for requests without PASID (second level). + * + * For example, to find the VT-d granularity encoding for IOTLB + * type and page selective granularity within PASID: + * X: indexed by iommu cache type + * Y: indexed by enum iommu_inv_granularity + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] + */ +static const int +inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { /* - * If Intel-IOMMU is disabled by default, we will apply identity - * map for all devices except those marked as being untrusted. + * PASID based IOTLB invalidation: PASID selective (per PASID), + * page selective (address granularity) */ - if (dmar_disabled) - iommu_identity_mapping |= IDENTMAP_ALL; + {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, + /* PASID based dev TLBs */ + {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, + /* PASID cache */ + {-EINVAL, -EINVAL, -EINVAL} +}; - dmar_disabled = 0; - no_iommu = 0; +static inline int to_vtd_granularity(int type, int granu) +{ + return inv_type_granu_table[type][granu]; +} - return 1; +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) +{ + u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; + + /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. + * IOMMU cache invalidate API passes granu_size in bytes, and the number of + * granules of that size in contiguous memory. 
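+ * E.g. 512 granules of 4KB cover 2MB: nr_pages = 512, so the returned size order is 9.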
+ */ + return order_base_2(nr_pages); } -static int __init probe_acpi_namespace_devices(void) +static int +intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, + struct iommu_cache_invalidate_info *inv_info) { - struct dmar_drhd_unit *drhd; - /* To avoid a -Wunused-but-set-variable warning. */ - struct intel_iommu *iommu __maybe_unused; - struct device *dev; - int i, ret = 0; + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + struct intel_iommu *iommu; + unsigned long flags; + int cache_type; + u8 bus, devfn; + u16 did, sid; + int ret = 0; + u64 size = 0; + bool default_pasid = false; - for_each_active_iommu(iommu, drhd) { - for_each_active_dev_scope(drhd->devices, - drhd->devices_cnt, i, dev) { - struct acpi_device_physical_node *pn; - struct iommu_group *group; - struct acpi_device *adev; + if (!inv_info || !dmar_domain) + return -EINVAL; - if (dev->bus != &acpi_bus_type) - continue; + if (!dev || !dev_is_pci(dev)) + return -ENODEV; - adev = to_acpi_device(dev); - mutex_lock(&adev->physical_node_lock); - list_for_each_entry(pn, - &adev->physical_node_list, node) { - group = iommu_group_get(pn->dev); - if (group) { - iommu_group_put(group); - continue; - } + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; - pn->dev->bus->iommu_ops = &intel_iommu_ops; - ret = iommu_probe_device(pn->dev); - if (ret) - break; - } - mutex_unlock(&adev->physical_node_lock); + if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) + return -EINVAL; - if (ret) - return ret; - } + spin_lock_irqsave(&device_domain_lock, flags); + spin_lock(&iommu->lock); + info = get_domain_info(dev); + if (!info) { + ret = -EINVAL; + goto out_unlock; } + did = dmar_domain->iommu_did[iommu->seq_id]; + sid = PCI_DEVID(bus, devfn); + + /* Size is only valid in address selective invalidation */ + if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) + size = to_vtd_size(inv_info->granu.addr_info.granule_size, + inv_info->granu.addr_info.nb_granules); + + for_each_set_bit(cache_type, + (unsigned long *)&inv_info->cache, + IOMMU_CACHE_INV_TYPE_NR) { + int granu = 0; + u64 pasid = 0; + u64 addr = 0; + + granu = to_vtd_granularity(cache_type, inv_info->granularity); + if (granu == -EINVAL) { + pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", + cache_type, inv_info->granularity); + break; + } - return 0; -} + /* + * PASID is stored in different locations based on the + * granularity. + */ + if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { + if (inv_info->granu.pasid_info.flags & + IOMMU_INV_PASID_FLAGS_PASID) { + pasid = inv_info->granu.pasid_info.pasid; + } else { + pasid = domain_get_pasid(domain, dev); + default_pasid = true; + } + } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { + if (inv_info->granu.addr_info.flags & + IOMMU_INV_ADDR_FLAGS_PASID) { + pasid = inv_info->granu.addr_info.pasid; + } else { + pasid = domain_get_pasid(domain, dev); + default_pasid = true; + } + } -int __init intel_iommu_init(void) -{ - int ret = -ENODEV; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; + if (default_pasid) + ret = ioasid_get(NULL, pasid); + else + ret = ioasid_get_if_owned(pasid); - /* - * Intel IOMMU is required for a TXT/tboot launch or platform - * opt in, so enforce that. 
- */ - force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || - platform_optin_force_iommu(); + if (ret) + goto out_unlock; + + switch (BIT(cache_type)) { + case IOMMU_CACHE_INV_TYPE_IOTLB: + /* HW will ignore LSB bits based on address mask */ + if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && + size && + (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { + pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", + inv_info->granu.addr_info.addr, size); + } - if (iommu_init_mempool()) { - if (force_on) - panic("tboot: Failed to initialize iommu memory\n"); - return -ENOMEM; - } + /* + * If granu is PASID-selective, address is ignored. + * We use npages = -1 to indicate that. + */ + qi_flush_piotlb(iommu, did, pasid, + mm_to_dma_pfn(inv_info->granu.addr_info.addr), + (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, + inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); - down_write(&dmar_global_lock); - if (dmar_table_init()) { - if (force_on) - panic("tboot: Failed to initialize DMAR table\n"); - goto out_free_dmar; - } + if (!info->ats_enabled) + break; + /* + * Always flush device IOTLB if ATS is enabled. vIOMMU + * in the guest may assume IOTLB flush is inclusive, + * which is more efficient. + */ + fallthrough; + case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: + /* + * PASID based device TLB invalidation does not support + * IOMMU_INV_GRANU_PASID granularity but only supports + * IOMMU_INV_GRANU_ADDR. + * The equivalent of that is we set the size to be the + * entire range of 64 bit. User only provides PASID info + * without address info. So we set addr to 0. + */ + if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { + size = 64 - VTD_PAGE_SHIFT; + addr = 0; + } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { + addr = inv_info->granu.addr_info.addr; + } - if (dmar_dev_scope_init() < 0) { - if (force_on) - panic("tboot: Failed to initialize DMAR device scope\n"); - goto out_free_dmar; + if (info->ats_enabled) + qi_flush_dev_iotlb_pasid(iommu, sid, + info->pfsid, pasid, + info->ats_qdep, addr, + size); + else + pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); + break; + default: + dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", + cache_type); + ret = -EINVAL; + } + ioasid_put(NULL, pasid); } +out_unlock: + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); - up_write(&dmar_global_lock); + return ret; +} +#endif - /* - * The bus notifier takes the dmar_global_lock, so lockdep will - * complain later when we register it under the lock. - */ - dmar_register_bus_notifier(); +static int intel_iommu_map(struct iommu_domain *domain, + unsigned long iova, phys_addr_t hpa, + size_t size, int iommu_prot, gfp_t gfp) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + u64 max_addr; + int prot = 0; - down_write(&dmar_global_lock); + if (iommu_prot & IOMMU_READ) + prot |= DMA_PTE_READ; + if (iommu_prot & IOMMU_WRITE) + prot |= DMA_PTE_WRITE; + if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) + prot |= DMA_PTE_SNP; - if (!no_iommu) - intel_iommu_debugfs_init(); + max_addr = iova + size; + if (dmar_domain->max_addr < max_addr) { + u64 end; - if (no_iommu || dmar_disabled) { - /* - * We exit the function here to ensure IOMMU's remapping and - * mempool aren't setup, which means that the IOMMU's PMRs - * won't be disabled via the call to init_dmars(). So disable - * it explicitly here. 
The PMRs were setup by tboot prior to - * calling SENTER, but the kernel is expected to reset/tear - * down the PMRs. - */ - if (intel_iommu_tboot_noforce) { - for_each_iommu(iommu, drhd) - iommu_disable_protect_mem_regions(iommu); + /* check if minimum agaw is sufficient for mapped address */ + end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; + if (end < max_addr) { + pr_err("%s: iommu width (%d) is not " + "sufficient for the mapped address (%llx)\n", + __func__, dmar_domain->gaw, max_addr); + return -EFAULT; } - - /* - * Make sure the IOMMUs are switched off, even when we - * boot into a kexec kernel and the previous kernel left - * them enabled - */ - intel_disable_iommus(); - goto out_free_dmar; + dmar_domain->max_addr = max_addr; } + /* Round up size to next multiple of PAGE_SIZE, if it and + the low bits of hpa would take us onto the next page */ + size = aligned_nrpages(hpa, size); + return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, + hpa >> VTD_PAGE_SHIFT, size, prot); +} - if (list_empty(&dmar_rmrr_units)) - pr_info("No RMRR found\n"); - - if (list_empty(&dmar_atsr_units)) - pr_info("No ATSR found\n"); +static size_t intel_iommu_unmap(struct iommu_domain *domain, + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long start_pfn, last_pfn; + int level = 0; - if (dmar_init_reserved_ranges()) { - if (force_on) - panic("tboot: Failed to reserve iommu ranges\n"); - goto out_free_reserved_range; - } + /* Cope with horrid API which requires us to unmap more than the + size argument if it happens to be a large-page mapping. */ + BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); - if (dmar_map_gfx) - intel_iommu_gfx_mapped = 1; + if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) + size = VTD_PAGE_SIZE << level_to_offset_bits(level); - init_no_remapping_devices(); + start_pfn = iova >> VTD_PAGE_SHIFT; + last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; - ret = init_dmars(); - if (ret) { - if (force_on) - panic("tboot: Failed to initialize DMARs\n"); - pr_err("Initialization failed\n"); - goto out_free_reserved_range; - } - up_write(&dmar_global_lock); + gather->freelist = domain_unmap(dmar_domain, start_pfn, + last_pfn, gather->freelist); -#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB) - /* - * If the system has no untrusted device or the user has decided - * to disable the bounce page mechanisms, we don't need swiotlb. - * Mark this and the pre-allocated bounce pages will be released - * later. 
- */ - if (!has_untrusted_dev() || intel_no_bounce) - swiotlb = 0; -#endif - dma_ops = &intel_dma_ops; + if (dmar_domain->max_addr == iova + size) + dmar_domain->max_addr = iova; - init_iommu_pm_ops(); + iommu_iotlb_gather_add_page(domain, gather, iova, size); - down_read(&dmar_global_lock); - for_each_active_iommu(iommu, drhd) { - iommu_device_sysfs_add(&iommu->iommu, NULL, - intel_iommu_groups, - "%s", iommu->name); - iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); - iommu_device_register(&iommu->iommu); - } - up_read(&dmar_global_lock); + return size; +} - bus_set_iommu(&pci_bus_type, &intel_iommu_ops); - if (si_domain && !hw_pass_through) - register_memory_notifier(&intel_iommu_memory_nb); - cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, - intel_iommu_cpu_dead); +static void intel_iommu_tlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long iova_pfn = IOVA_PFN(gather->start); + size_t size = gather->end - gather->start; + unsigned long start_pfn; + unsigned long nrpages; + int iommu_id; - down_read(&dmar_global_lock); - if (probe_acpi_namespace_devices()) - pr_warn("ACPI name space devices didn't probe correctly\n"); + nrpages = aligned_nrpages(gather->start, size); + start_pfn = mm_to_dma_pfn(iova_pfn); - /* Finally, we enable the DMA remapping hardware. */ - for_each_iommu(iommu, drhd) { - if (!drhd->ignored && !translation_pre_enabled(iommu)) - iommu_enable_translation(iommu); + for_each_domain_iommu(iommu_id, dmar_domain) + iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, + start_pfn, nrpages, !gather->freelist, 0); - iommu_disable_protect_mem_regions(iommu); - } - up_read(&dmar_global_lock); + dma_free_pagelist(gather->freelist); +} - pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); +static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct dma_pte *pte; + int level = 0; + u64 phys = 0; - intel_iommu_enabled = 1; + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); + if (pte && dma_pte_present(pte)) + phys = dma_pte_addr(pte) + + (iova & (BIT_MASK(level_to_offset_bits(level) + + VTD_PAGE_SHIFT) - 1)); - return 0; + return phys; +} -out_free_reserved_range: - put_iova_domain(&reserved_iova_list); -out_free_dmar: - intel_iommu_free_dmars(); - up_write(&dmar_global_lock); - iommu_exit_mempool(); - return ret; +static bool intel_iommu_capable(enum iommu_cap cap) +{ + if (cap == IOMMU_CAP_CACHE_COHERENCY) + return domain_update_iommu_snooping(NULL); + if (cap == IOMMU_CAP_INTR_REMAP) + return irq_remapping_enabled == 1; + if (cap == IOMMU_CAP_VIOMMU_HINT) + return intel_caching_mode; + + return false; } -static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) +static struct iommu_device *intel_iommu_probe_device(struct device *dev) { - struct intel_iommu *iommu = opaque; + struct intel_iommu *iommu; - domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); - return 0; + iommu = device_to_iommu(dev, NULL, NULL); + if (!iommu) + return ERR_PTR(-ENODEV); + + if (translation_pre_enabled(iommu)) + dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); + + return &iommu->iommu; } -/* - * NB - intel-iommu lacks any sort of reference counting for the users of - * dependent devices. 
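The new unmap path above only queues work: the IOTLB flush is deferred to
intel_iommu_tlb_sync() through the gather cookie. A caller-side sketch of
that contract, using the generic iommu_* helpers of this era (the two
IOVAs are assumed to be mapped):

static size_t example_unmap_batch(struct iommu_domain *domain,
				  unsigned long iova, unsigned long iova2)
{
	struct iommu_iotlb_gather gather;
	size_t unmapped;

	iommu_iotlb_gather_init(&gather);
	unmapped  = iommu_unmap_fast(domain, iova,  SZ_4K, &gather);
	unmapped += iommu_unmap_fast(domain, iova2, SZ_4K, &gather);
	/* one flush for both ranges; ends up in intel_iommu_tlb_sync() */
	iommu_iotlb_sync(domain, &gather);

	return unmapped;
}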
If multiple endpoints have intersecting dependent - * devices, unbinding the driver from any one of them will possibly leave - * the others unable to operate. - */ -static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) +static void intel_iommu_release_device(struct device *dev) { - if (!iommu || !dev || !dev_is_pci(dev)) + struct intel_iommu *iommu; + + iommu = device_to_iommu(dev, NULL, NULL); + if (!iommu) return; - pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); + dmar_remove_one_dev_info(dev); + + set_dma_ops(dev, NULL); } -static void __dmar_remove_one_dev_info(struct device_domain_info *info) +static void intel_iommu_probe_finalize(struct device *dev) { - struct dmar_domain *domain; - struct intel_iommu *iommu; - unsigned long flags; + set_dma_ops(dev, NULL); + iommu_setup_dma_ops(dev, 0, U64_MAX); +} - assert_spin_locked(&device_domain_lock); +static void intel_iommu_get_resv_regions(struct device *device, + struct list_head *head) +{ + int prot = DMA_PTE_READ | DMA_PTE_WRITE; + struct iommu_resv_region *reg; + struct dmar_rmrr_unit *rmrr; + struct device *i_dev; + int i; - if (WARN_ON(!info)) - return; + down_read(&dmar_global_lock); + for_each_rmrr_units(rmrr) { + for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, + i, i_dev) { + struct iommu_resv_region *resv; + enum iommu_resv_type type; + size_t length; - iommu = info->iommu; - domain = info->domain; + if (i_dev != device && + !is_downstream_to_pci_bridge(device, i_dev)) + continue; - if (info->dev) { - if (dev_is_pci(info->dev) && sm_supported(iommu)) - intel_pasid_tear_down_entry(iommu, info->dev, - PASID_RID2PASID); + length = rmrr->end_address - rmrr->base_address + 1; - iommu_disable_dev_iotlb(info); - domain_context_clear(iommu, info->dev); - intel_pasid_free_table(info->dev); - } + type = device_rmrr_is_relaxable(device) ? 
+ IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; - unlink_domain_info(info); + resv = iommu_alloc_resv_region(rmrr->base_address, + length, prot, type); + if (!resv) + break; - spin_lock_irqsave(&iommu->lock, flags); - domain_detach_iommu(domain, iommu); - spin_unlock_irqrestore(&iommu->lock, flags); + list_add_tail(&resv->list, head); + } + } + up_read(&dmar_global_lock); - /* free the private domain */ - if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN && - !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) && - list_empty(&domain->devices)) - domain_exit(info->domain); +#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA + if (dev_is_pci(device)) { + struct pci_dev *pdev = to_pci_dev(device); - free_devinfo_mem(info); + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { + reg = iommu_alloc_resv_region(0, 1UL << 24, prot, + IOMMU_RESV_DIRECT_RELAXABLE); + if (reg) + list_add_tail(®->list, head); + } + } +#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ + + reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, + IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, + 0, IOMMU_RESV_MSI); + if (!reg) + return; + list_add_tail(®->list, head); } -static void dmar_remove_one_dev_info(struct device *dev) +int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) { struct device_domain_info *info; + struct context_entry *context; + struct dmar_domain *domain; unsigned long flags; + u64 ctx_lo; + int ret; + + domain = find_domain(dev); + if (!domain) + return -EINVAL; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; - if (info && info != DEFER_DEVICE_DOMAIN_INFO - && info != DUMMY_DEVICE_DOMAIN_INFO) - __dmar_remove_one_dev_info(info); - spin_unlock_irqrestore(&device_domain_lock, flags); -} + spin_lock(&iommu->lock); -static int md_domain_init(struct dmar_domain *domain, int guest_width) -{ - int adjust_width; + ret = -EINVAL; + info = get_domain_info(dev); + if (!info || !info->pasid_supported) + goto out; + + context = iommu_context_addr(iommu, info->bus, info->devfn, 0); + if (WARN_ON(!context)) + goto out; - init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - domain_reserve_special_ranges(domain); + ctx_lo = context[0].lo; - /* calculate AGAW */ - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - domain->agaw = width_to_agaw(adjust_width); + if (!(ctx_lo & CONTEXT_PASIDE)) { + ctx_lo |= CONTEXT_PASIDE; + context[0].lo = ctx_lo; + wmb(); + iommu->flush.flush_context(iommu, + domain->iommu_did[iommu->seq_id], + PCI_DEVID(info->bus, info->devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + } - domain->iommu_coherency = 0; - domain->iommu_snooping = 0; - domain->iommu_superpage = 0; - domain->max_addr = 0; + /* Enable PASID support in the device, if it wasn't already */ + if (!info->pasid_enabled) + iommu_enable_dev_iotlb(info); - /* always allocate the top pgd */ - domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); - if (!domain->pgd) - return -ENOMEM; - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return 0; + ret = 0; + + out: + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; } -static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) +static struct iommu_group *intel_iommu_device_group(struct device *dev) { - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; + if (dev_is_pci(dev)) + return pci_device_group(dev); + return generic_device_group(dev); +} - switch (type) { - case IOMMU_DOMAIN_DMA: - /* fallthrough */ - case 
IOMMU_DOMAIN_UNMANAGED: - dmar_domain = alloc_domain(0); - if (!dmar_domain) { - pr_err("Can't allocate dmar_domain\n"); - return NULL; - } - if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - pr_err("Domain initialization failed\n"); - domain_exit(dmar_domain); - return NULL; - } +static int intel_iommu_enable_auxd(struct device *dev) +{ + struct device_domain_info *info; + struct intel_iommu *iommu; + unsigned long flags; + int ret; - if (type == IOMMU_DOMAIN_DMA && - init_iova_flush_queue(&dmar_domain->iovad, - iommu_flush_iova, iova_entry_free)) { - pr_warn("iova flush queue initialization failed\n"); - intel_iommu_strict = 1; - } + iommu = device_to_iommu(dev, NULL, NULL); + if (!iommu || dmar_disabled) + return -EINVAL; - domain_update_iommu_cap(dmar_domain); + if (!sm_supported(iommu) || !pasid_supported(iommu)) + return -EINVAL; - domain = &dmar_domain->domain; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = - __DOMAIN_MAX_ADDR(dmar_domain->gaw); - domain->geometry.force_aperture = true; + ret = intel_iommu_enable_pasid(iommu, dev); + if (ret) + return -ENODEV; - return domain; - case IOMMU_DOMAIN_IDENTITY: - return &si_domain->domain; - default: - return NULL; - } + spin_lock_irqsave(&device_domain_lock, flags); + info = get_domain_info(dev); + info->auxd_enabled = 1; + spin_unlock_irqrestore(&device_domain_lock, flags); - return NULL; + return 0; } -static void intel_iommu_domain_free(struct iommu_domain *domain) +static int intel_iommu_disable_auxd(struct device *dev) { - if (domain != &si_domain->domain) - domain_exit(to_dmar_domain(domain)); + struct device_domain_info *info; + unsigned long flags; + + spin_lock_irqsave(&device_domain_lock, flags); + info = get_domain_info(dev); + if (!WARN_ON(!info)) + info->auxd_enabled = 0; + spin_unlock_irqrestore(&device_domain_lock, flags); + + return 0; } /* - * Check whether a @domain could be attached to the @dev through the - * aux-domain attach/detach APIs. + * A PCI express designated vendor specific extended capability is defined + * in the section 3.7 of Intel scalable I/O virtualization technical spec + * for system software and tools to detect endpoint devices supporting the + * Intel scalable IO virtualization without host driver dependency. + * + * Returns the address of the matching extended capability structure within + * the device's PCI configuration space or 0 if the device does not support + * it. 
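intel_iommu_enable_auxd() above is reached through the generic device
feature API. A driver-side usage sketch (generic iommu_* wrappers of this
era; error unwinding trimmed):

static int example_attach_aux(struct iommu_domain *domain, struct device *dev)
{
	int ret;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;	/* fall back to an ordinary attach */

	ret = iommu_aux_attach_device(domain, dev);
	if (ret)
		return ret;

	/* the PASID tags all DMA issued through this auxiliary domain */
	ret = iommu_aux_get_pasid(domain, dev);
	return ret < 0 ? ret : 0;
}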
*/ -static inline bool -is_aux_domain(struct device *dev, struct iommu_domain *domain) +static int siov_find_pci_dvsec(struct pci_dev *pdev) { - struct device_domain_info *info = dev->archdata.iommu; + int pos; + u16 vendor, id; - return info && info->auxd_enabled && - domain->type == IOMMU_DOMAIN_UNMANAGED; + pos = pci_find_next_ext_capability(pdev, 0, 0x23); + while (pos) { + pci_read_config_word(pdev, pos + 4, &vendor); + pci_read_config_word(pdev, pos + 8, &id); + if (vendor == PCI_VENDOR_ID_INTEL && id == 5) + return pos; + + pos = pci_find_next_ext_capability(pdev, pos, 0x23); + } + + return 0; } -static void auxiliary_link_device(struct dmar_domain *domain, - struct device *dev) +static bool +intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) { - struct device_domain_info *info = dev->archdata.iommu; + if (feat == IOMMU_DEV_FEAT_AUX) { + int ret; - assert_spin_locked(&device_domain_lock); - if (WARN_ON(!info)) - return; + if (!dev_is_pci(dev) || dmar_disabled || + !scalable_mode_support() || !pasid_mode_support()) + return false; - domain->auxd_refcnt++; - list_add(&domain->auxd, &info->auxiliary_domains); -} + ret = pci_pasid_features(to_pci_dev(dev)); + if (ret < 0) + return false; -static void auxiliary_unlink_device(struct dmar_domain *domain, - struct device *dev) -{ - struct device_domain_info *info = dev->archdata.iommu; + return !!siov_find_pci_dvsec(to_pci_dev(dev)); + } - assert_spin_locked(&device_domain_lock); - if (WARN_ON(!info)) - return; + if (feat == IOMMU_DEV_FEAT_SVA) { + struct device_domain_info *info = get_domain_info(dev); - list_del(&domain->auxd); - domain->auxd_refcnt--; + return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && + info->pasid_supported && info->pri_supported && + info->ats_supported; + } + + if (feat == IOMMU_DEV_FEAT_HWDBM) { + struct device_domain_info *info = get_domain_info(dev); + + /* FL supports dirty bit by default. */ + return domain_use_first_level(info->domain) || + (!domain_use_first_level(info->domain) && slad_support()); + } - if (!domain->auxd_refcnt && domain->default_pasid > 0) - intel_pasid_free_id(domain->default_pasid); + return false; } -static int aux_domain_add_dev(struct dmar_domain *domain, - struct device *dev) +static int +intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) { - int ret; - u8 bus, devfn; - unsigned long flags; - struct intel_iommu *iommu; + if (feat == IOMMU_DEV_FEAT_AUX) + return intel_iommu_enable_auxd(dev); - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; + if (feat == IOMMU_DEV_FEAT_SVA) { + struct device_domain_info *info = get_domain_info(dev); - if (domain->default_pasid <= 0) { - int pasid; + if (!info) + return -EINVAL; - pasid = intel_pasid_alloc_id(domain, PASID_MIN, - pci_max_pasids(to_pci_dev(dev)), - GFP_KERNEL); - if (pasid <= 0) { - pr_err("Can't allocate default pasid\n"); - return -ENODEV; - } - domain->default_pasid = pasid; + if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) + return 0; } - spin_lock_irqsave(&device_domain_lock, flags); - /* - * iommu->lock must be held to attach domain to iommu and setup the - * pasid entry for second level translation. 
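The IOMMU_DEV_FEAT_HWDBM branch above evaluates A || (!A && B), which
reduces to A || B; an equivalent form of the same check (sketch):

	if (feat == IOMMU_DEV_FEAT_HWDBM) {
		struct device_domain_info *info = get_domain_info(dev);

		/* FL has A/D bits natively; SL needs SLAD support */
		return domain_use_first_level(info->domain) || slad_support();
	}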
- */ - spin_lock(&iommu->lock); - ret = domain_attach_iommu(domain, iommu); - if (ret) - goto attach_failed; + return -ENODEV; +} - /* Setup the PASID entry for mediated devices: */ - ret = intel_pasid_setup_second_level(iommu, domain, dev, - domain->default_pasid); - if (ret) - goto table_failed; - spin_unlock(&iommu->lock); +static int +intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat == IOMMU_DEV_FEAT_AUX) + return intel_iommu_disable_auxd(dev); - auxiliary_link_device(domain, dev); + return -ENODEV; +} - spin_unlock_irqrestore(&device_domain_lock, flags); +static bool +intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) +{ + struct device_domain_info *info = get_domain_info(dev); - return 0; + if (feat == IOMMU_DEV_FEAT_AUX) + return scalable_mode_support() && info && info->auxd_enabled; -table_failed: - domain_detach_iommu(domain, iommu); -attach_failed: - spin_unlock(&iommu->lock); - spin_unlock_irqrestore(&device_domain_lock, flags); - if (!domain->auxd_refcnt && domain->default_pasid > 0) - intel_pasid_free_id(domain->default_pasid); + if (feat == IOMMU_DEV_FEAT_HWDBM) + return intel_iommu_dev_has_feat(dev, feat); - return ret; + return false; } -static void aux_domain_remove_dev(struct dmar_domain *domain, - struct device *dev) +static int +intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) { - struct device_domain_info *info; - struct intel_iommu *iommu; - unsigned long flags; + struct dmar_domain *dmar_domain = to_dmar_domain(domain); - if (!is_aux_domain(dev, &domain->domain)) - return; + return dmar_domain->default_pasid > 0 ? + dmar_domain->default_pasid : -EINVAL; +} - spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; - iommu = info->iommu; +int domain_get_pasid(struct iommu_domain *domain, struct device *dev) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); - auxiliary_unlink_device(domain, dev); + if (is_aux_domain(dev, domain)) + return intel_iommu_aux_get_pasid(domain, dev); - spin_lock(&iommu->lock); - intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid); - domain_detach_iommu(domain, iommu); - spin_unlock(&iommu->lock); + return dmar_domain->default_pasid; +} - spin_unlock_irqrestore(&device_domain_lock, flags); +static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, + struct device *dev) +{ + return attach_deferred(dev); } -static int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +static bool domain_use_flush_queue(void) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - int addr_width; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; - - /* check if this iommu agaw is sufficient for max mapped address */ - addr_width = agaw_to_width(iommu->agaw); - if (addr_width > cap_mgaw(iommu->cap)) - addr_width = cap_mgaw(iommu->cap); + bool r = true; - if (dmar_domain->max_addr > (1LL << addr_width)) { - dev_err(dev, "%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, addr_width, dmar_domain->max_addr); - return -EFAULT; - } - dmar_domain->gaw = addr_width; + if (intel_iommu_strict) + return false; /* - * Knock out extra levels of page tables if necessary + * The flush queue implementation does not perform page-selective + * invalidations that are required for efficient TLB flushes in virtual + * environments. 
The benefit of batching is likely to be much lower than + * the overhead of synchronizing the virtual and physical IOMMU + * page-tables. */ - while (iommu->agaw < dmar_domain->agaw) { - struct dma_pte *pte; + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!cap_caching_mode(iommu->cap)) + continue; - pte = dmar_domain->pgd; - if (dma_pte_present(pte)) { - dmar_domain->pgd = (struct dma_pte *) - phys_to_virt(dma_pte_addr(pte)); - free_pgtable_page(pte); - } - dmar_domain->agaw--; + pr_warn_once("IOMMU batching is disabled due to virtualization"); + r = false; + break; } + rcu_read_unlock(); - return 0; + return r; } -static int intel_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) +static int intel_iommu_get_nesting_info(struct iommu_domain *domain, + struct iommu_nesting_info *info) { - int ret; - - if (domain->type == IOMMU_DOMAIN_UNMANAGED && - device_is_rmrr_locked(dev)) { - dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); - return -EPERM; - } + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + u64 cap = VTD_CAP_MASK, ecap = VTD_ECAP_MASK; + struct device_domain_info *domain_info; + struct iommu_nesting_info_vtd vtd; + unsigned int size; - if (is_aux_domain(dev, domain)) - return -EPERM; + if (!info) + return -EINVAL; - /* normally dev is not mapped */ - if (unlikely(domain_context_mapped(dev))) { - struct dmar_domain *old_domain; + if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) + return -ENODEV; - old_domain = find_domain(dev); - if (old_domain) - dmar_remove_one_dev_info(dev); + size = sizeof(struct iommu_nesting_info); + /* + * if provided buffer size is smaller than expected, should + * return 0 and also the expected buffer size to caller. + */ + if (info->argsz < size) { + info->argsz = size; + return 0; } - ret = prepare_domain_attach_device(domain, dev); - if (ret) - return ret; - - return domain_add_dev_info(to_dmar_domain(domain), dev); -} - -static int intel_iommu_aux_attach_device(struct iommu_domain *domain, - struct device *dev) -{ - int ret; - - if (!is_aux_domain(dev, domain)) - return -EPERM; + /* + * arbitrary select the first domain_info as all nesting + * related capabilities should be consistent across iommu + * units. 
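The argsz check above implements a probe-and-retry handshake: an
undersized buffer is not an error, the kernel writes back the size it
needs. A userspace-style sketch (example_query() is hypothetical and
stands in for whatever ioctl carries struct iommu_nesting_info):

	struct iommu_nesting_info probe = { .argsz = 8 };	/* header only */
	struct iommu_nesting_info *full;

	example_query(&probe);		/* undersized: argsz is written back */
	full = calloc(1, probe.argsz);
	full->argsz = probe.argsz;
	example_query(full);		/* large enough to be filled in now */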
+ */ + /* + * Check full-device list first, and then sub-device list + */ + if (!list_empty(&dmar_domain->devices)) + domain_info = list_first_entry(&dmar_domain->devices, + struct device_domain_info, link); + else if (!list_empty(&dmar_domain->subdevices)) { + struct subdev_domain_info *sinfo; + + sinfo = list_first_entry(&dmar_domain->subdevices, + struct subdev_domain_info, link_domain); + domain_info = get_domain_info(sinfo->pdev); + } else + return -ENODEV; - ret = prepare_domain_attach_device(domain, dev); - if (ret) - return ret; + cap &= domain_info->iommu->cap; + ecap &= domain_info->iommu->ecap; - return aux_domain_add_dev(to_dmar_domain(domain), dev); + info->addr_width = dmar_domain->gaw; + info->format = IOMMU_PASID_FORMAT_INTEL_VTD; + /* REVISIT: + * to be precise, may only report SYSWIDE_PASID when pasid is + * supported, also may only report page_resp when PRS is supported + */ + info->features = IOMMU_NESTING_FEAT_BIND_PGTBL | + IOMMU_NESTING_FEAT_CACHE_INVLD | + IOMMU_NESTING_FEAT_PAGE_RESP; + info->pasid_bits = ilog2(intel_pasid_max_id); + memset(&info->padding, 0x0, 12); + + vtd.flags = 0; + memset(&vtd.padding, 0x0, 12); + vtd.cap_reg = cap & VTD_CAP_MASK; + vtd.ecap_reg = ecap & VTD_ECAP_MASK; + + memcpy(&info->vendor.vtd, &vtd, sizeof(vtd)); + return 0; } -static void intel_iommu_detach_device(struct iommu_domain *domain, - struct device *dev) +static int +intel_iommu_domain_get_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) { - dmar_remove_one_dev_info(dev); + switch (domain->type) { + case IOMMU_DOMAIN_UNMANAGED: + switch (attr) { + case DOMAIN_ATTR_NESTING: + { + struct iommu_nesting_info *info = + (struct iommu_nesting_info *)data; + unsigned long flags; + int ret; + + spin_lock_irqsave(&device_domain_lock, flags); + ret = intel_iommu_get_nesting_info(domain, info); + spin_unlock_irqrestore(&device_domain_lock, flags); + return ret; + } + default: + return -ENODEV; + } + case IOMMU_DOMAIN_DMA: + switch (attr) { + case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: + *(int *)data = domain_use_flush_queue(); + return 0; + default: + return -ENODEV; + } + break; + default: + return -EINVAL; + } } -static void intel_iommu_aux_detach_device(struct iommu_domain *domain, - struct device *dev) +/* + * Check that the device does not live on an external facing PCI port that is + * marked as untrusted. Such devices should not be able to apply quirks and + * thus not be able to bypass the IOMMU restrictions. 
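DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE above is meant to be read through the
generic accessor. A consumer sketch (iommu_domain_get_attr() is the
attribute API of this era):

	int use_fq = 0;

	if (!iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
				   &use_fq) && use_fq)
		pr_info("IOTLB flush queue permitted for this domain\n");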
+ */ +static bool risky_device(struct pci_dev *pdev) { - aux_domain_remove_dev(to_dmar_domain(domain), dev); + if (pdev->untrusted) { + pci_info(pdev, + "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", + pdev->vendor, pdev->device); + pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); + return true; + } + return false; } -static int intel_iommu_map(struct iommu_domain *domain, - unsigned long iova, phys_addr_t hpa, - size_t size, int iommu_prot) +static int +intel_iommu_domain_set_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - u64 max_addr; - int prot = 0; - int ret; - - if (iommu_prot & IOMMU_READ) - prot |= DMA_PTE_READ; - if (iommu_prot & IOMMU_WRITE) - prot |= DMA_PTE_WRITE; - if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) - prot |= DMA_PTE_SNP; + unsigned long flags; + int ret = 0; - max_addr = iova + size; - if (dmar_domain->max_addr < max_addr) { - u64 end; + if (domain->type != IOMMU_DOMAIN_UNMANAGED) + return -EINVAL; - /* check if minimum agaw is sufficient for mapped address */ - end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; - if (end < max_addr) { - pr_err("%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, dmar_domain->gaw, max_addr); - return -EFAULT; + switch (attr) { + case DOMAIN_ATTR_NESTING: + spin_lock_irqsave(&device_domain_lock, flags); + if (nested_mode_support() && + list_empty(&dmar_domain->devices)) { + dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; + dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; + } else { + ret = -ENODEV; } - dmar_domain->max_addr = max_addr; + spin_unlock_irqrestore(&device_domain_lock, flags); + break; + default: + ret = -EINVAL; + break; } - /* Round up size to next multiple of PAGE_SIZE, if it and - the low bits of hpa would take us onto the next page */ - size = aligned_nrpages(hpa, size); - ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, - hpa >> VTD_PAGE_SHIFT, size, prot); + return ret; } -static size_t intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *gather) +static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct page *freelist = NULL; - unsigned long start_pfn, last_pfn; - unsigned int npages; - int iommu_id, level = 0; - - /* Cope with horrid API which requires us to unmap more than the - size argument if it happens to be a large-page mapping. 
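The set_attr handler above only accepts DOMAIN_ATTR_NESTING while the
domain's device list is empty, so nesting must be requested between
allocation and the first attach. An ordering sketch (hypothetical helper,
not from this patch):

static struct iommu_domain *example_alloc_nested(struct device *dev)
{
	struct iommu_domain *domain = iommu_domain_alloc(dev->bus);
	int enable = 1;

	if (!domain)
		return NULL;
	/* must precede the first attach, or -ENODEV results */
	if (iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &enable) ||
	    iommu_attach_device(domain, dev)) {
		iommu_domain_free(domain);
		return NULL;
	}
	return domain;
}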
*/ - BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); - - if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) - size = VTD_PAGE_SIZE << level_to_offset_bits(level); - - start_pfn = iova >> VTD_PAGE_SHIFT; - last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; - - freelist = domain_unmap(dmar_domain, start_pfn, last_pfn); + unsigned long pages = aligned_nrpages(iova, size); + unsigned long pfn = iova >> VTD_PAGE_SHIFT; + struct intel_iommu *iommu; + int iommu_id; - npages = last_pfn - start_pfn + 1; + for_each_domain_iommu(iommu_id, dmar_domain) { + iommu = g_iommus[iommu_id]; + __mapping_notify_one(iommu, dmar_domain, pfn, pages); + } +} - for_each_domain_iommu(iommu_id, dmar_domain) - iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, - start_pfn, npages, !freelist, 0); +static int __setup_slade(struct iommu_domain *domain, + struct device_domain_info *info, bool enable) +{ + u32 pasid; + int ret = 0; - dma_free_pagelist(freelist); + pasid = domain_get_pasid(domain, info->dev); - if (dmar_domain->max_addr == iova + size) - dmar_domain->max_addr = iova; + spin_lock(&info->iommu->lock); + ret = intel_pasid_setup_slade(info->dev, to_dmar_domain(domain), pasid, enable); + spin_unlock(&info->iommu->lock); - return size; + return ret; } -static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) +static int +__domain_clear_dirty_log(struct dmar_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift); + +static int intel_iommu_set_hwdbm(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct dma_pte *pte; - int level = 0; - u64 phys = 0; + struct device_domain_info *info; + struct subdev_domain_info *sinfo; + unsigned long flags; + int ret = 0; - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); - if (pte && dma_pte_present(pte)) - phys = dma_pte_addr(pte) + - (iova & (BIT_MASK(level_to_offset_bits(level) + - VTD_PAGE_SHIFT) - 1)); + if (domain_use_first_level(dmar_domain)) { + /* FL supports A/D bits by default. */ + /* TODO: shall we clear FLT existing D bits? 
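First-level tables carry hardware A/D bits natively, while second-level
dirty tracking depends on SLAD. A sketch of the PTE dirty-bit predicate
the sync/clear loops below apply (bit positions taken from those loops:
FL dirty = bit 6, SL dirty = bit 9):

static inline bool example_dma_pte_dirty(const struct dma_pte *pte,
					 bool first_level)
{
	return pte->val & BIT(first_level ? 6 : 9);
}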
*/ + if (enable) + __domain_clear_dirty_log(dmar_domain, iova, size, NULL, + iova, VTD_PAGE_SHIFT); + return 0; + } - return phys; -} + if (!slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } -static inline bool scalable_mode_support(void) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool ret = true; + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &dmar_domain->devices, link) { + ret = __setup_slade(domain, info, enable); + if (ret) + goto out; + } - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!sm_supported(iommu)) { - ret = false; + list_for_each_entry(sinfo, &dmar_domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + ret = __setup_slade(domain, info, enable); + if (ret) break; - } } - rcu_read_unlock(); + +out: + spin_unlock_irqrestore(&device_domain_lock, flags); return ret; } -static inline bool iommu_pasid_support(void) +/* Temporary check */ +static inline bool check_pasid_pt_sre(void) { - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool ret = true; + struct cpuinfo_x86 *c = &cpu_data(0); - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!pasid_supported(iommu)) { - ret = false; - break; - } + if (c->x86_model == 0x8f && c->x86_stepping >= 4) { + pr_debug("SPR E0+, PASID PT SRE enabled"); + return true; } - rcu_read_unlock(); - - return ret; + pr_alert("No PASID PT SRE, in-kernel PASID DMA not supported!!!"); + return false; } -static bool intel_iommu_capable(enum iommu_cap cap) +static int intel_enable_pasid_dma(struct device *dev, u32 pasid) { - if (cap == IOMMU_CAP_CACHE_COHERENCY) - return domain_update_iommu_snooping(NULL) == 1; - if (cap == IOMMU_CAP_INTR_REMAP) - return irq_remapping_enabled == 1; + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct device_domain_info *info; + unsigned long flags; + int ret = 0; - return false; -} + /* + * We don't bail here in that some drivers tie user SVM with + * kernel PASID support. + */ + check_pasid_pt_sre(); -static int intel_iommu_add_device(struct device *dev) -{ - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; - struct intel_iommu *iommu; - struct iommu_group *group; - u8 bus, devfn; - int ret; + info = get_domain_info(dev); + if (!info) + return -ENODEV; - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) + if (!dev_is_pci(dev) || !sm_supported(info->iommu)) + return -EINVAL; + + if (intel_iommu_enable_pasid(info->iommu, dev)) return -ENODEV; - iommu_device_link(&iommu->iommu, dev); + spin_lock_irqsave(&device_domain_lock, flags); + spin_lock(&iommu->lock); + /* + * Store PASID for IOTLB flush, but only needed for non-passthrough + * unmap case. For passthrough, we only need to do IOTLB flush during + * PASID teardown. Flush covers all devices in the same domain as the + * domain ID is the same for the same SL. + */ + info->domain->kernel_pasid = pasid; - if (translation_pre_enabled(iommu)) - dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO; + /* + * Tracks how many attached devices are using the kernel PASID. Clear + * the domain kernel PASID when all users called disable_pasid_dma(). + */ + atomic_inc(&info->domain->kernel_pasid_user); - group = iommu_group_get_for_dev(dev); + /* + * Addressing modes (IOVA vs. PA) is a per device choice made by the + * platform code. We must treat legacy DMA (request w/o PASID) and + * DMA w/ PASID identially in terms of mapping. Here we just set up + * the kernel PASID to match the mapping of RID2PASID/PASID0. 
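The kernel PASID added here is a single per-domain slot with counted
users: every enable takes a reference and the last disable clears the
slot, so concurrent users must agree on one PASID. Lifetime sketch
(fields and helpers from this patch):

	info->domain->kernel_pasid = pasid;		/* enable path */
	atomic_inc(&info->domain->kernel_pasid_user);

	if (atomic_dec_and_test(&info->domain->kernel_pasid_user))
		info->domain->kernel_pasid = 0;		/* last disable */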
+ */ + if (hw_pass_through && domain_type_is_si(info->domain)) { + ret = intel_pasid_setup_pass_through(info->iommu, info->domain, + dev, pasid); + if (ret) + dev_err(dev, "Failed kernel PASID %d in BYPASS", pasid); - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto unlink; + } else if (domain_use_first_level(info->domain)) { + /* We are using FL for IOVA, this is the default option */ + ret = domain_setup_first_level(info->iommu, info->domain, dev, + pasid); + if (ret) + dev_err(dev, "Failed kernel PASID %d IOVA FL", pasid); + } else { + ret = intel_pasid_setup_second_level(info->iommu, info->domain, + dev, pasid); + if (ret) + dev_err(dev, "Failed kernel SPASID %d IOVA SL", pasid); } - iommu_group_put(group); + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); - domain = iommu_get_domain_for_dev(dev); - dmar_domain = to_dmar_domain(domain); - if (domain->type == IOMMU_DOMAIN_DMA) { - if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) { - ret = iommu_request_dm_for_dev(dev); - if (ret) { - dmar_remove_one_dev_info(dev); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - domain_add_dev_info(si_domain, dev); - dev_info(dev, - "Device uses a private identity domain.\n"); - } + return ret; +} + +static int +__domain_sync_dirty_log(struct dmar_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct dma_pte *pte = NULL; + unsigned long nr_pages = size >> VTD_PAGE_SHIFT; + unsigned long lvl_pages = 0; + unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT; + unsigned long offset; + unsigned int largepage_lvl = 0; + unsigned int nbits; + + if (bitmap_pgshift != VTD_PAGE_SHIFT) + return -EINVAL; + + while (nr_pages > 0) { + largepage_lvl = 0; + + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + if (!pte || !dma_pte_present(pte)) + return -EINVAL; + + lvl_pages = lvl_to_nr_pages(largepage_lvl); + BUG_ON(nr_pages < lvl_pages); + + if (!(pte->val & DMA_PTE_WRITE)) { + pr_debug("The 0x%lx pte is READ ONLY.\n", iova); + goto skip; } - } else { - if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) { - ret = iommu_request_dma_domain_for_dev(dev); - if (ret) { - dmar_remove_one_dev_info(dev); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - if (!get_private_domain_for_dev(dev)) { - dev_warn(dev, - "Failed to get a private domain.\n"); - ret = -ENOMEM; - goto unlink; - } - dev_info(dev, - "Device uses a private dma domain.\n"); + if (!domain_use_first_level(domain)) { + if (!(pte->val & BIT(9))) { + pr_debug("SL: The 0x%lx pte is not dirty.\n", iova); + goto skip; + } + } else { + if (!(pte->val & BIT(6))) { + pr_debug("FL: The 0x%lx pte is not dirty.\n", iova); + goto skip; } } - } - if (device_needs_bounce(dev)) { - dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n"); - set_dma_ops(dev, &bounce_dma_ops); + if (bitmap) { + nbits = lvl_pages; + offset = (iova - base_iova) >> bitmap_pgshift; + bitmap_set(bitmap, offset, nbits); + } + +skip: + nr_pages -= lvl_pages; + iov_pfn += lvl_pages; + iova += lvl_pages * VTD_PAGE_SIZE; } return 0; - -unlink: - iommu_device_unlink(&iommu->iommu, dev); - return ret; } -static void intel_iommu_remove_device(struct device *dev) +/* + * Make sure the quering iova has been mapped and is being used. Otherwise, + * there may be fatal error if another thread free the visiting pages. 
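A worked example for the bitmap bookkeeping above: one dirty 2MiB
superpage (lvl_pages = 512) marks 512 consecutive bits at the
4KiB-granule offset of its IOVA in the caller's bitmap:

	unsigned long offset = (iova - base_iova) >> VTD_PAGE_SHIFT;

	bitmap_set(bitmap, offset, 512);	/* 2MiB / 4KiB = 512 bits */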
+ */ +static int intel_iommu_sync_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) { - struct intel_iommu *iommu; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return; - - dmar_remove_one_dev_info(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); - iommu_group_remove_device(dev); + if (!domain_use_first_level(dmar_domain) && !slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } - iommu_device_unlink(&iommu->iommu, dev); + __domain_sync_dirty_log(dmar_domain, iova, size, bitmap, + base_iova, bitmap_pgshift); - if (device_needs_bounce(dev)) - set_dma_ops(dev, NULL); + return 0; } -static void intel_iommu_get_resv_regions(struct device *device, - struct list_head *head) -{ - int prot = DMA_PTE_READ | DMA_PTE_WRITE; - struct iommu_resv_region *reg; - struct dmar_rmrr_unit *rmrr; - struct device *i_dev; - int i; - - down_read(&dmar_global_lock); - for_each_rmrr_units(rmrr) { - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, i_dev) { - struct iommu_resv_region *resv; - enum iommu_resv_type type; - size_t length; +static int +__domain_clear_dirty_log(struct dmar_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + struct dma_pte *pte = NULL; + unsigned long nr_pages = size >> VTD_PAGE_SHIFT; + unsigned long lvl_pages = 0; + unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT; + unsigned long offset; + unsigned int largepage_lvl = 0; + unsigned int nbits; + int iommu_id, i; + unsigned long start_pfn = iov_pfn; + bool cleared = false; - if (i_dev != device && - !is_downstream_to_pci_bridge(device, i_dev)) - continue; + if (bitmap_pgshift != VTD_PAGE_SHIFT) + return -EINVAL; - length = rmrr->end_address - rmrr->base_address + 1; + while (nr_pages > 0) { + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, + 0, nr_pages); - type = device_rmrr_is_relaxable(device) ? 
- IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + if (!pte || !dma_pte_present(pte)) + return -EINVAL; - resv = iommu_alloc_resv_region(rmrr->base_address, - length, prot, type); - if (!resv) - break; + lvl_pages = lvl_to_nr_pages(largepage_lvl); + BUG_ON(nr_pages < lvl_pages); - list_add_tail(&resv->list, head); + if (!(pte->val & DMA_PTE_WRITE)) { + pr_warn("The 0x%lx pte is READ ONLY.\n", iova); + goto skip; } - } - up_read(&dmar_global_lock); - -#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA - if (dev_is_pci(device)) { - struct pci_dev *pdev = to_pci_dev(device); - if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { - reg = iommu_alloc_resv_region(0, 1UL << 24, prot, - IOMMU_RESV_DIRECT_RELAXABLE); - if (reg) - list_add_tail(®->list, head); + /* Ensure all corresponding bits are set */ + if (bitmap) { + nbits = lvl_pages; + offset = (iova - base_iova) >> bitmap_pgshift; + for (i = offset; i < offset + nbits; i++) { + if (!test_bit(i, bitmap)) + goto skip; + } } - } -#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ - reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, - IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, - 0, IOMMU_RESV_MSI); - if (!reg) - return; - list_add_tail(®->list, head); + if (!domain_use_first_level(domain)) + test_and_clear_bit(9, (unsigned long *)&pte->val); + else + test_and_clear_bit(6, (unsigned long *)&pte->val); + + cleared = true; +skip: + nr_pages -= lvl_pages; + iov_pfn += lvl_pages; + iova += lvl_pages * VTD_PAGE_SIZE; + } + + if (cleared) + for_each_domain_iommu(iommu_id, domain) + iommu_flush_iotlb_psi(g_iommus[iommu_id], domain, + start_pfn, size >> VTD_PAGE_SHIFT, 1, 0); + + return 0; } -static void intel_iommu_put_resv_regions(struct device *dev, - struct list_head *head) +/* + * Make sure the clearing iova has been mapped and is being used. Otherwise, + * there may be fatal error if another thread free the visiting pages. 
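The all-bits-set guard above ensures a superpage's D bit is cleared only
when the caller's bitmap covers every 4KiB page it spans, so unharvested
dirty state is never dropped. The same guard written with
find_next_zero_bit() (equivalent sketch):

	if (find_next_zero_bit(bitmap, offset + nbits, offset) <
	    offset + nbits)
		goto skip;	/* an unharvested page remains: keep D bit */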
+ */ +static int intel_iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) { - struct iommu_resv_region *entry, *next; + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + int ret = 0; + + if (!domain_use_first_level(dmar_domain) && !slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } - list_for_each_entry_safe(entry, next, head, list) - kfree(entry); + ret = __domain_clear_dirty_log(dmar_domain, iova, size, bitmap, + base_iova, bitmap_pgshift); + return ret; } -int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) +static int intel_disable_pasid_dma(struct device *dev) { + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); struct device_domain_info *info; - struct context_entry *context; - struct dmar_domain *domain; unsigned long flags; - u64 ctx_lo; - int ret; + int ret = 0; - domain = find_domain(dev); - if (!domain) + info = get_domain_info(dev); + if (!info) + return -ENODEV; + + if (!dev_is_pci(dev) || !sm_supported(info->iommu)) return -EINVAL; spin_lock_irqsave(&device_domain_lock, flags); spin_lock(&iommu->lock); - ret = -EINVAL; - info = dev->archdata.iommu; - if (!info || !info->pasid_supported) - goto out; - - context = iommu_context_addr(iommu, info->bus, info->devfn, 0); - if (WARN_ON(!context)) - goto out; - - ctx_lo = context[0].lo; - - if (!(ctx_lo & CONTEXT_PASIDE)) { - ctx_lo |= CONTEXT_PASIDE; - context[0].lo = ctx_lo; - wmb(); - iommu->flush.flush_context(iommu, - domain->iommu_did[iommu->seq_id], - PCI_DEVID(info->bus, info->devfn), - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - } - - /* Enable PASID support in the device, if it wasn't already */ - if (!info->pasid_enabled) - iommu_enable_dev_iotlb(info); + /* Tear down kernel PASID for this device */ + intel_pasid_tear_down_entry(info->iommu, info->dev, + info->domain->kernel_pasid, false, + false); + /* Clear the domain kernel PASID when there is no users */ + if (atomic_dec_and_test(&info->domain->kernel_pasid_user)) + info->domain->kernel_pasid = 0; - ret = 0; - - out: spin_unlock(&iommu->lock); spin_unlock_irqrestore(&device_domain_lock, flags); - return ret; } -static void intel_iommu_apply_resv_region(struct device *dev, - struct iommu_domain *domain, - struct iommu_resv_region *region) +static int +__domain_split_block(struct dmar_domain *domain, struct intel_iommu *iommu, + unsigned long iova, size_t size) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long start, end; - - start = IOVA_PFN(region->start); - end = IOVA_PFN(region->start + region->length - 1); + struct dma_pte *pte = NULL; + unsigned long nr_pages = size >> VTD_PAGE_SHIFT; + unsigned long lvl_pages = 0, child_lvl_pages = 0; + unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT; + unsigned int largepage_lvl = 0; + unsigned int create_pages = 0; + int i; + unsigned long page_pfn; + unsigned long phys_pfn; + phys_addr_t pteval; + u64 attr; + bool splitted = false; - WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end)); -} + spin_lock(&iommu->lock); + while (nr_pages > 0) { + /* + * largepage_lvl == 1: 4KB page; + * largepage_lvl == 2: 2MB page; + * largepage_lvl == 3: 1GB page; + */ + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, + 0, nr_pages); -static struct iommu_group *intel_iommu_device_group(struct device *dev) -{ - if (dev_is_pci(dev)) - return pci_device_group(dev); - return generic_device_group(dev); -} + lvl_pages = 
lvl_to_nr_pages(largepage_lvl); + BUG_ON(nr_pages < lvl_pages); -#ifdef CONFIG_INTEL_IOMMU_SVM -struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) -{ - struct intel_iommu *iommu; - u8 bus, devfn; + if (largepage_lvl <= 1) { + /* It is not large page*/ + pr_debug("The 0x%lx pte is not super page. largepage_lvl=%d\n", + iova, largepage_lvl); + goto skip; + } - if (iommu_dummy(dev)) { - dev_warn(dev, - "No IOMMU translation for device; cannot enable SVM\n"); - return NULL; - } + /* Get the current level pte, e.g. if largepage_lvl == 1, + * we get "SL-PDE: 4KB page" item which is assigned to pte. + */ + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + if (!pte || !dma_pte_present(pte)) + return -EINVAL; - iommu = device_to_iommu(dev, &bus, &devfn); - if ((!iommu)) { - dev_err(dev, "No IOMMU for device; cannot enable SVM\n"); - return NULL; - } + if (!(pte->val & DMA_PTE_WRITE)) { + pr_warn("The 0x%lx pte is READ ONLY.\n", iova); + goto skip; + } - return iommu; -} -#endif /* CONFIG_INTEL_IOMMU_SVM */ + /* If the page is super, split it. */ + if (!dma_pte_superpage(pte)) { + pr_warn("The 0x%lx pte is not super page.\n", iova); + goto skip; + } -static int intel_iommu_enable_auxd(struct device *dev) -{ - struct device_domain_info *info; - struct intel_iommu *iommu; - unsigned long flags; - u8 bus, devfn; - int ret; + phys_pfn = dma_pte_addr(pte); + page_pfn = iov_pfn; + attr = pte->val & VTD_ATTR_MASK; + pteval = ((phys_addr_t)phys_pfn) | attr; + pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; + if (largepage_lvl == 2) { + /* 2MB: Create 512 4KB items, i.e. split 2MB to 512 4KB pages */ + create_pages = 512; + } else { + /* 1GB: Create 512*512 4KB items */ + create_pages = 512 * 512; + } - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu || dmar_disabled) - return -EINVAL; + /* Change big page level to 4KB level */ + largepage_lvl = 1; - if (!sm_supported(iommu) || !pasid_supported(iommu)) - return -EINVAL; + /* Establish page table and get the 4KB page item */ + pte = NULL; + for (i = 0; i < create_pages; i++) { + if (!pte) { + pte = pfn_to_dma_pte(domain, page_pfn, &largepage_lvl); + if (!pte) + return -ENOMEM; + } + cmpxchg64_local(&pte->val, 0ULL, pteval); + /* Always 1 here */ + child_lvl_pages = lvl_to_nr_pages(largepage_lvl); + page_pfn += child_lvl_pages; + phys_pfn += child_lvl_pages; + pteval += child_lvl_pages * VTD_PAGE_SIZE; + pte++; + pr_debug("%s: Big page splitted to 4KB pages, pteval=0x%llx, pte=0x%llx\n", + __func__, pteval, (uint64_t)pte); + if (first_pte_in_page(pte)) + pte = NULL; + } - ret = intel_iommu_enable_pasid(iommu, dev); - if (ret) - return -ENODEV; + splitted = true; +skip: + nr_pages -= lvl_pages; + iov_pfn += lvl_pages; + iova += lvl_pages * VTD_PAGE_SIZE; + } + spin_unlock(&iommu->lock); - spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; - info->auxd_enabled = 1; - spin_unlock_irqrestore(&device_domain_lock, flags); + if (splitted) + return 1; return 0; } -static int intel_iommu_disable_auxd(struct device *dev) +static int intel_iommu_split_block(struct iommu_domain *domain, + unsigned long iova, size_t size) { + struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct device_domain_info *info; + struct subdev_domain_info *sinfo; + int ret = 0; unsigned long flags; - spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; - if (!WARN_ON(!info)) - info->auxd_enabled = 0; - spin_unlock_irqrestore(&device_domain_lock, flags); - - return 0; -} - -/* - * A PCI express designated vendor 
specific extended capability is defined - * in the section 3.7 of Intel scalable I/O virtualization technical spec - * for system software and tools to detect endpoint devices supporting the - * Intel scalable IO virtualization without host driver dependency. - * - * Returns the address of the matching extended capability structure within - * the device's PCI configuration space or 0 if the device does not support - * it. - */ -static int siov_find_pci_dvsec(struct pci_dev *pdev) -{ - int pos; - u16 vendor, id; - - pos = pci_find_next_ext_capability(pdev, 0, 0x23); - while (pos) { - pci_read_config_word(pdev, pos + 4, &vendor); - pci_read_config_word(pdev, pos + 8, &id); - if (vendor == PCI_VENDOR_ID_INTEL && id == 5) - return pos; - - pos = pci_find_next_ext_capability(pdev, pos, 0x23); + if (!domain_use_first_level(dmar_domain) && !slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; } - return 0; -} + /* Return if size is less than 2MB */ + if (size < 0x200000) + return 0; -static bool -intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) -{ - if (feat == IOMMU_DEV_FEAT_AUX) { - int ret; + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &dmar_domain->devices, link) { + pr_debug("%s: split for %02x:%02x.%d, iova=0x%lx, size=0x%zx\n", + __func__, info->bus, PCI_SLOT(info->devfn), PCI_FUNC(info->devfn), + iova, size); - if (!dev_is_pci(dev) || dmar_disabled || - !scalable_mode_support() || !iommu_pasid_support()) - return false; + ret = __domain_split_block(dmar_domain, info->iommu, iova, size); + if (ret && ret != 1) + goto out; + } - ret = pci_pasid_features(to_pci_dev(dev)); - if (ret < 0) - return false; + list_for_each_entry(sinfo, &dmar_domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + pr_debug("%s: split for subdev %02x:%02x.%d, iova=0x%lx, size=0x%zx\n", + __func__, info->bus, PCI_SLOT(info->devfn), PCI_FUNC(info->devfn), + iova, size); - return !!siov_find_pci_dvsec(to_pci_dev(dev)); + ret = __domain_split_block(dmar_domain, info->iommu, iova, size); + if (ret && ret != 1) + break; } - return false; +out: + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; } static int -intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) +__domain_merge_pages(struct dmar_domain *domain, struct intel_iommu *iommu, + unsigned long iova, phys_addr_t paddr, size_t size) { - if (feat == IOMMU_DEV_FEAT_AUX) - return intel_iommu_enable_auxd(dev); + struct dma_pte *pte = NULL; + unsigned int largepage_lvl = 0; + unsigned long iov_pfn = iova >> VTD_PAGE_SHIFT; + unsigned long start_pfn = iov_pfn; + unsigned long end_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; + struct page *freelist; + int prot = 0; + int ret = 0; - return -ENODEV; -} + /* Construct the big page again */ + largepage_lvl = 1; + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + if (!pte || !dma_pte_present(pte)) + return -EINVAL; -static int -intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) -{ - if (feat == IOMMU_DEV_FEAT_AUX) - return intel_iommu_disable_auxd(dev); + if (pte->val & DMA_PTE_READ) + prot |= DMA_PTE_READ; + if (pte->val & DMA_PTE_WRITE) + prot |= DMA_PTE_WRITE; + if (pte->val & DMA_PTE_SNP) + prot |= DMA_PTE_SNP; - return -ENODEV; -} + pr_debug("%s: start_pfn=0x%lx, end_pfn=0x%lx, prot=0x%x, size=0x%zx, paddr=0x%llx\n", + __func__, start_pfn, end_pfn, prot, size, paddr); -static bool -intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features 
feat) -{ - struct device_domain_info *info = dev->archdata.iommu; + /* Free the splitted 4KB pages */ + freelist = domain_unmap(domain, start_pfn, end_pfn, NULL); + dma_free_pagelist(freelist); - if (feat == IOMMU_DEV_FEAT_AUX) - return scalable_mode_support() && info && info->auxd_enabled; + ret = __domain_mapping(domain, start_pfn, paddr >> VTD_PAGE_SHIFT, + size >> VTD_PAGE_SHIFT, prot); - return false; + return ret; } -static int -intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) +static int intel_iommu_merge_pages(struct iommu_domain *domain, unsigned long iova, + phys_addr_t phys, size_t size) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + struct subdev_domain_info *sinfo; + int ret = 0; + unsigned long flags; - return dmar_domain->default_pasid > 0 ? - dmar_domain->default_pasid : -EINVAL; -} + if (!domain_use_first_level(dmar_domain) && !slad_support()) { + pr_err("Don't support SLAD\n"); + return -EINVAL; + } -static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, - struct device *dev) -{ - return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO; -} + /* Return if size is less than 2MB */ + if (size < 0x200000) + return 0; -/* - * Check that the device does not live on an external facing PCI port that is - * marked as untrusted. Such devices should not be able to apply quirks and - * thus not be able to bypass the IOMMU restrictions. - */ -static bool risky_device(struct pci_dev *pdev) -{ - if (pdev->untrusted) { - pci_info(pdev, - "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", - pdev->vendor, pdev->device); - pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); - return true; + spin_lock_irqsave(&device_domain_lock, flags); + + list_for_each_entry(info, &dmar_domain->devices, link) { + pr_debug("%s: merge for %02x:%02x.%d, iova=0x%lx, phys=0x%llx, size=0x%zx\n", + __func__, info->bus, PCI_SLOT(info->devfn), PCI_FUNC(info->devfn), + iova, phys, size); + + ret = __domain_merge_pages(dmar_domain, info->iommu, iova, phys, size); + if (ret) + goto out; } - return false; + + list_for_each_entry(sinfo, &dmar_domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + pr_debug("%s: merge for subdev %02x:%02x.%d, iova=0x%lx, phys=0x%llx, size=0x%zx\n", + __func__, info->bus, PCI_SLOT(info->devfn), PCI_FUNC(info->devfn), + iova, phys, size); + + ret = __domain_merge_pages(dmar_domain, info->iommu, iova, phys, size); + if (ret) + break; + } + +out: + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; } const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, + .domain_set_attr = intel_iommu_domain_set_attr, + .domain_get_attr = intel_iommu_domain_get_attr, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, .aux_attach_dev = intel_iommu_aux_attach_device, .aux_detach_dev = intel_iommu_aux_detach_device, .aux_get_pasid = intel_iommu_aux_get_pasid, .map = intel_iommu_map, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, .unmap = intel_iommu_unmap, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, - .add_device = intel_iommu_add_device, - .remove_device = intel_iommu_remove_device, + .probe_device = intel_iommu_probe_device, + .probe_finalize = intel_iommu_probe_finalize, + .release_device = intel_iommu_release_device, 
.get_resv_regions = intel_iommu_get_resv_regions, - .put_resv_regions = intel_iommu_put_resv_regions, - .apply_resv_region = intel_iommu_apply_resv_region, + .put_resv_regions = generic_iommu_put_resv_regions, .device_group = intel_iommu_device_group, .dev_has_feat = intel_iommu_dev_has_feat, .dev_feat_enabled = intel_iommu_dev_feat_enabled, .dev_enable_feat = intel_iommu_dev_enable_feat, .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, - .pgsize_bitmap = INTEL_IOMMU_PGSIZES, + .def_domain_type = device_def_domain_type, + .pgsize_bitmap = SZ_4K, +#ifdef CONFIG_INTEL_IOMMU_SVM + .cache_invalidate = intel_iommu_sva_invalidate, + .sva_bind_gpasid = intel_svm_bind_gpasid, + .sva_unbind_gpasid = intel_svm_unbind_gpasid, + .sva_bind = intel_svm_bind, + .sva_unbind = intel_svm_unbind, + .sva_get_pasid = intel_svm_get_pasid, + .page_response = intel_svm_page_response, +#endif + .merge_pages = intel_iommu_merge_pages, + .split_block = intel_iommu_split_block, + .set_hwdbm = intel_iommu_set_hwdbm, + .sync_dirty_log = intel_iommu_sync_dirty_log, + .clear_dirty_log = intel_iommu_clear_dirty_log, + .enable_pasid_dma = intel_enable_pasid_dma, + .disable_pasid_dma = intel_disable_pasid_dma, }; static void quirk_iommu_igfx(struct pci_dev *dev) @@ -6104,13 +6677,34 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) /* we have to ensure the gfx device is idle before we flush */ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); intel_iommu_strict = 1; - } + } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); +static void quirk_igfx_skip_te_disable(struct pci_dev *dev) +{ + unsigned short ver; + + if (!IS_GFX_DEVICE(dev)) + return; + + ver = (dev->device >> 8) & 0xff; + if (ver != 0x45 && ver != 0x46 && ver != 0x4c && + ver != 0x4e && ver != 0x8a && ver != 0x98 && + ver != 0x9a) + return; + + if (risky_device(dev)) + return; + + pci_info(dev, "Skip IOMMU disabling for graphics\n"); + iommu_skip_te_disable = 1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); + /* On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH DMAR unit for the Azalia sound device, but not give it any TLB entries, which causes it to deadlock. Check for that. 
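quirk_igfx_skip_te_disable() above follows the usual early-quirk shape: a
header fixup runs before driver binding, bails out via risky_device() so
an untrusted external port cannot trigger it, and only sets a flag that
the IOMMU init path consumes later. Skeleton of the pattern
(example_quirk is illustrative):

static void example_quirk(struct pci_dev *pdev)
{
	if (risky_device(pdev))
		return;
	pci_info(pdev, "example IOMMU quirk applied\n");
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, example_quirk);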
We do diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel/irq_remapping.c similarity index 92% rename from drivers/iommu/intel_irq_remapping.c rename to drivers/iommu/intel/irq_remapping.c index 5dcc81b1df623a3e477bb914756884b832f723ea..f969570fb0a99718687339abff101322477beff1 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -15,13 +15,14 @@ #include #include #include +#include #include #include #include #include -#include -#include "irq_remapping.h" +#include "../irq_remapping.h" +#include "cap_audit.h" enum irq_mode { IRQ_REMAPPING, @@ -151,7 +152,7 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) desc.qw2 = 0; desc.qw3 = 0; - return qi_submit_sync(&desc, iommu); + return qi_submit_sync(iommu, &desc, 1, 0); } static int modify_irte(struct irq_2_iommu *irq_iommu, @@ -203,35 +204,33 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, return rc; } -static struct intel_iommu *map_hpet_to_ir(u8 hpet_id) +static struct intel_iommu *map_hpet_to_iommu(u8 hpet_id) { int i; - for (i = 0; i < MAX_HPET_TBS; i++) + for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu) return ir_hpet[i].iommu; + } return NULL; } -static struct intel_iommu *map_ioapic_to_ir(int apic) +static struct intel_iommu *map_ioapic_to_iommu(int apic) { int i; - for (i = 0; i < MAX_IO_APICS; i++) + for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].id == apic && ir_ioapic[i].iommu) return ir_ioapic[i].iommu; + } return NULL; } -static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) +static struct irq_domain *map_dev_to_ir(struct pci_dev *dev) { - struct dmar_drhd_unit *drhd; + struct dmar_drhd_unit *drhd = dmar_find_matched_drhd_unit(dev); - drhd = dmar_find_matched_drhd_unit(dev); - if (!drhd) - return NULL; - - return drhd->iommu; + return drhd ? drhd->iommu->ir_msi_domain : NULL; } static int clear_entries(struct irq_2_iommu *irq_iommu) @@ -736,6 +735,9 @@ static int __init intel_prepare_irq_remapping(void) if (dmar_table_init() < 0) return -ENODEV; + if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL)) + return -ENODEV; + if (!dmar_ir_support()) return -ENODEV; @@ -1001,7 +1003,7 @@ static int __init parse_ioapics_under_ir(void) for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { int ioapic_id = mpc_ioapic_id(ioapic_idx); - if (!map_ioapic_to_ir(ioapic_id)) { + if (!map_ioapic_to_iommu(ioapic_id)) { pr_err(FW_BUG "ioapic %d has no mapping iommu, " "interrupt remapping will be disabled\n", ioapic_id); @@ -1086,12 +1088,28 @@ static int reenable_irq_remapping(int eim) return -1; } +/* + * Store the MSI remapping domain pointer in the device if enabled. + * + * This is called from dmar_pci_bus_add_dev() so it works even when DMA + * remapping is disabled. Only update the pointer if the device is not + * already handled by a non default PCI/MSI interrupt domain. This protects + * e.g. VMD devices. 
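A toy model of the guard-then-set flow described here, with made-up structures standing in for struct device and the irq domains; only the shape of the check mirrors intel_irq_remap_add_device():

#include <stdbool.h>
#include <stdio.h>

/* Only install the remapping MSI domain when remapping is enabled and
 * the device has no special (non-default) MSI domain already. */
struct toy_dev { const char *msi_domain; bool has_special_msi_domain; };

static void add_device(struct toy_dev *dev, const char *ir_domain, bool remap_enabled)
{
	if (!remap_enabled || dev->has_special_msi_domain)
		return;
	dev->msi_domain = ir_domain;
}

int main(void)
{
	struct toy_dev vmd = { .has_special_msi_domain = true };
	struct toy_dev nic = { 0 };

	add_device(&vmd, "intel-ir-msi", true);
	add_device(&nic, "intel-ir-msi", true);
	printf("vmd: %s, nic: %s\n",
	       vmd.msi_domain ? vmd.msi_domain : "(default)",
	       nic.msi_domain ? nic.msi_domain : "(default)");
	return 0;
}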
+ */ +void intel_irq_remap_add_device(struct dmar_pci_notify_info *info) +{ + if (!irq_remapping_enabled || pci_dev_has_special_msi_domain(info->dev)) + return; + + dev_set_msi_domain(&info->dev->dev, map_dev_to_ir(info->dev)); +} + static void prepare_irte(struct irte *irte, int vector, unsigned int dest) { memset(irte, 0, sizeof(*irte)); irte->present = 1; - irte->dst_mode = apic->irq_dest_mode; + irte->dst_mode = apic->dest_mode_logical; /* * Trigger mode in the IRTE will always be edge, and for IO-APIC, the * actual level or edge trigger will be setup in the IO-APIC @@ -1100,67 +1118,18 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) * irq migration in the presence of interrupt-remapping. */ irte->trigger_mode = 0; - irte->dlvry_mode = apic->irq_delivery_mode; + irte->dlvry_mode = apic->delivery_mode; irte->vector = vector; irte->dest_id = IRTE_DEST(dest); irte->redir_hint = 1; } -static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info) -{ - struct intel_iommu *iommu = NULL; - - if (!info) - return NULL; - - switch (info->type) { - case X86_IRQ_ALLOC_TYPE_IOAPIC: - iommu = map_ioapic_to_ir(info->ioapic_id); - break; - case X86_IRQ_ALLOC_TYPE_HPET: - iommu = map_hpet_to_ir(info->hpet_id); - break; - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: - iommu = map_dev_to_ir(info->msi_dev); - break; - default: - BUG_ON(1); - break; - } - - return iommu ? iommu->ir_domain : NULL; -} - -static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) -{ - struct intel_iommu *iommu; - - if (!info) - return NULL; - - switch (info->type) { - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: - iommu = map_dev_to_ir(info->msi_dev); - if (iommu) - return iommu->ir_msi_domain; - break; - default: - break; - } - - return NULL; -} - struct irq_remap_ops intel_irq_remap_ops = { .prepare = intel_prepare_irq_remapping, .enable = intel_enable_irq_remapping, .disable = disable_irq_remapping, .reenable = reenable_irq_remapping, .enable_faulting = enable_drhd_fault_handling, - .get_ir_irq_domain = intel_get_ir_irq_domain, - .get_irq_domain = intel_get_irq_domain, }; static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) @@ -1270,66 +1239,58 @@ static struct irq_chip intel_ir_chip = { .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, }; +static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.dmar_base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dmar_subhandle_valid = true; + msg->arch_addr_lo.dmar_format = true; + msg->arch_addr_lo.dmar_index_0_14 = index & 0x7FFF; + msg->arch_addr_lo.dmar_index_15 = !!(index & 0x8000); + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + + msg->arch_data.dmar_subhandle = subhandle; +} + static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, struct irq_cfg *irq_cfg, struct irq_alloc_info *info, int index, int sub_handle) { - struct IR_IO_APIC_route_entry *entry; struct irte *irte = &data->irte_entry; - struct msi_msg *msg = &data->msi_entry; prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); + switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ - set_ioapic_sid(irte, info->ioapic_id); + set_ioapic_sid(irte, info->devid); apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", - 
info->ioapic_id, irte->present, irte->fpd, + info->devid, irte->present, irte->fpd, irte->dst_mode, irte->redir_hint, irte->trigger_mode, irte->dlvry_mode, irte->avail, irte->vector, irte->dest_id, irte->sid, irte->sq, irte->svt); - - entry = (struct IR_IO_APIC_route_entry *)info->ioapic_entry; - info->ioapic_entry = NULL; - memset(entry, 0, sizeof(*entry)); - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - entry->format = 1; - entry->index = (index & 0x7fff); - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = info->ioapic_pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = info->ioapic_trigger; - entry->polarity = info->ioapic_polarity; - if (info->ioapic_trigger) - entry->mask = 1; /* Mask level triggered irqs. */ + sub_handle = info->ioapic.pin; break; - case X86_IRQ_ALLOC_TYPE_HPET: - case X86_IRQ_ALLOC_TYPE_MSI: - case X86_IRQ_ALLOC_TYPE_MSIX: - if (info->type == X86_IRQ_ALLOC_TYPE_HPET) - set_hpet_sid(irte, info->hpet_id); - else - set_msi_sid(irte, info->msi_dev); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(index) | - MSI_ADDR_IR_INDEX2(index); + set_hpet_sid(irte, info->devid); + break; + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: + set_msi_sid(irte, + pci_real_dma_dev(msi_desc_to_pci_dev(info->desc))); + break; + case X86_IRQ_ALLOC_TYPE_DEV_MSI: + set_msi_sid(irte, to_pci_dev(info->desc->dev->parent)); break; - default: BUG_ON(1); break; } + fill_msi_msg(&data->msi_entry, index, sub_handle); } static void intel_free_irq_resources(struct irq_domain *domain, @@ -1367,15 +1328,16 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, if (!info || !iommu) return -EINVAL; - if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI && - info->type != X86_IRQ_ALLOC_TYPE_MSIX) + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI && + info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX && + info->type != X86_IRQ_ALLOC_TYPE_DEV_MSI) return -EINVAL; /* * With IRQ remapping enabled, don't need contiguous CPU vectors * to support multiple MSI interrupts. 
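fill_msi_msg() above splits the IRTE index across a 15-bit low field and a single spill-over bit in the remappable MSI address. A runnable sketch of the packing and unpacking (0x9abc is an arbitrary example index):

#include <stdint.h>
#include <stdio.h>

/* Mirrors fill_msi_msg(): dmar_index_0_14 carries the low 15 bits of
 * the IRTE index, dmar_index_15 carries bit 15. */
struct toy_msi_addr {
	uint32_t index_0_14 : 15;
	uint32_t index_15   : 1;
};

int main(void)
{
	uint32_t index = 0x9abc;   /* example IRTE index */
	struct toy_msi_addr a = {
		.index_0_14 = index & 0x7fff,
		.index_15   = !!(index & 0x8000),
	};
	uint32_t back = a.index_0_14 | ((uint32_t)a.index_15 << 15);

	printf("index 0x%x -> low 0x%x, high %u, rebuilt 0x%x\n",
	       index, (unsigned)a.index_0_14, (unsigned)a.index_15, back);
	return 0;
}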
*/ - if (info->type == X86_IRQ_ALLOC_TYPE_MSI) + if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); @@ -1456,7 +1418,22 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain, modify_irte(&data->irq_2_iommu, &entry); } +static int intel_irq_remapping_select(struct irq_domain *d, + struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + struct intel_iommu *iommu = NULL; + + if (x86_fwspec_is_ioapic(fwspec)) + iommu = map_ioapic_to_iommu(fwspec->param[0]); + else if (x86_fwspec_is_hpet(fwspec)) + iommu = map_hpet_to_iommu(fwspec->param[0]); + + return iommu && d == iommu->ir_domain; +} + static const struct irq_domain_ops intel_ir_domain_ops = { + .select = intel_irq_remapping_select, .alloc = intel_irq_remapping_alloc, .free = intel_irq_remapping_free, .activate = intel_irq_remapping_activate, @@ -1471,6 +1448,10 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu) int ret; int eim = x2apic_enabled(); + ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu); + if (ret) + return ret; + if (eim && !ecap_eim_support(iommu->ecap)) { pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n", iommu->reg_phys, iommu->ecap); diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel/pasid.c similarity index 42% rename from drivers/iommu/intel-pasid.c rename to drivers/iommu/intel/pasid.c index e7cb0b8a73327083fbd7c9dd85382d3143cb7fb7..6ca39da14c91ccb88d20062f5919aa17a853ec12 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * intel-pasid.c - PASID idr, table and entry manipulation * * Copyright (C) 2018 Intel Corporation @@ -19,101 +19,74 @@ #include #include -#include "intel-pasid.h" +#include "pasid.h" +#include "cap_audit.h" /* * Intel IOMMU system wide PASID name space: */ -static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; -static DEFINE_IDR(pasid_idr); - -int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp) -{ - int ret, min, max; - - min = max_t(int, start, PASID_MIN); - max = min_t(int, end, intel_pasid_max_id); - WARN_ON(in_interrupt()); - idr_preload(gfp); - spin_lock(&pasid_lock); - ret = idr_alloc(&pasid_idr, ptr, min, max, GFP_ATOMIC); - spin_unlock(&pasid_lock); - idr_preload_end(); +int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) +{ + unsigned long flags; + u8 status_code; + int ret = 0; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + *pasid = VCMD_VRSP_RESULT_PASID(res); + break; + case VCMD_VRSP_SC_NO_PASID_AVAIL: + pr_info("IOMMU: %s: No PASID available\n", iommu->name); + ret = -ENOSPC; + break; + default: + ret = -ENODEV; + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } return ret; } -void intel_pasid_free_id(int pasid) -{ - spin_lock(&pasid_lock); - idr_remove(&pasid_idr, pasid); - spin_unlock(&pasid_lock); -} - -void *intel_pasid_lookup_id(int pasid) -{ - void *p; - - spin_lock(&pasid_lock); - p = idr_find(&pasid_idr, pasid); - spin_unlock(&pasid_lock); - - return p; +void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) +{ + 
unsigned long flags; + u8 status_code; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, + VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + break; + case VCMD_VRSP_SC_INVALID_PASID: + pr_info("IOMMU: %s: Invalid PASID\n", iommu->name); + break; + default: + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } } /* * Per device pasid table management: */ -static inline void -device_attach_pasid_table(struct device_domain_info *info, - struct pasid_table *pasid_table) -{ - info->pasid_table = pasid_table; - list_add(&info->table, &pasid_table->dev); -} - -static inline void -device_detach_pasid_table(struct device_domain_info *info, - struct pasid_table *pasid_table) -{ - info->pasid_table = NULL; - list_del(&info->table); -} - -struct pasid_table_opaque { - struct pasid_table **pasid_table; - int segment; - int bus; - int devfn; -}; - -static int search_pasid_table(struct device_domain_info *info, void *opaque) -{ - struct pasid_table_opaque *data = opaque; - - if (info->iommu->segment == data->segment && - info->bus == data->bus && - info->devfn == data->devfn && - info->pasid_table) { - *data->pasid_table = info->pasid_table; - return 1; - } - - return 0; -} - -static int get_alias_pasid_table(struct pci_dev *pdev, u16 alias, void *opaque) -{ - struct pasid_table_opaque *data = opaque; - - data->segment = pci_domain_nr(pdev->bus); - data->bus = PCI_BUS_NUM(alias); - data->devfn = alias & 0xff; - - return for_each_device_domain(&search_pasid_table, data); -} /* * Allocate a pasid table for @dev. 
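The sizing arithmetic in intel_pasid_alloc_table() below packs one 8-byte directory entry per 2^PASID_PDE_SHIFT PASIDs, then rounds the usable PASID space back up from the page order. A worked, standalone version of the same computation for the full 20-bit PASID space:

#include <stdio.h>

#define PASID_PDE_SHIFT 6   /* 64 PASIDs per directory entry */
#define PAGE_SHIFT      12

int main(void)
{
	unsigned int max_pasid = 1u << 20;   /* full 20-bit PASID space */
	/* 8-byte entry per 64 PASIDs: size = max_pasid / 64 * 8 */
	unsigned int size = max_pasid >> (PASID_PDE_SHIFT - 3);
	unsigned int order = 0;

	while ((1u << (order + PAGE_SHIFT)) < size)   /* get_order(size) */
		order++;

	/* 128 KiB directory -> order 5 -> max_pasid rounds back to 1 << 20 */
	printf("dir size %u bytes, order %u, max_pasid %u\n",
	       size, order, 1u << (order + PAGE_SHIFT + 3));
	return 0;
}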
It should be called in a @@ -123,31 +96,21 @@ int intel_pasid_alloc_table(struct device *dev) { struct device_domain_info *info; struct pasid_table *pasid_table; - struct pasid_table_opaque data; struct page *pages; - int max_pasid = 0; - int ret, order; - int size; + u32 max_pasid = 0; + int order, size; might_sleep(); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) return -EINVAL; - /* DMA alias device already has a pasid table, use it: */ - data.pasid_table = &pasid_table; - ret = pci_for_each_dma_alias(to_pci_dev(dev), - &get_alias_pasid_table, &data); - if (ret) - goto attach_out; - pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL); if (!pasid_table) return -ENOMEM; - INIT_LIST_HEAD(&pasid_table->dev); if (info->pasid_supported) - max_pasid = min_t(int, pci_max_pasids(to_pci_dev(dev)), + max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)), intel_pasid_max_id); size = max_pasid >> (PASID_PDE_SHIFT - 3); @@ -162,9 +125,7 @@ int intel_pasid_alloc_table(struct device *dev) pasid_table->table = page_address(pages); pasid_table->order = order; pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); - -attach_out: - device_attach_pasid_table(info, pasid_table); + info->pasid_table = pasid_table; return 0; } @@ -177,15 +138,12 @@ void intel_pasid_free_table(struct device *dev) struct pasid_entry *table; int i, max_pde; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !dev_is_pci(dev) || !info->pasid_table) return; pasid_table = info->pasid_table; - device_detach_pasid_table(info, pasid_table); - - if (!list_empty(&pasid_table->dev)) - return; + info->pasid_table = NULL; /* Free scalable mode PASID directory tables: */ dir = pasid_table->table; @@ -203,25 +161,25 @@ struct pasid_table *intel_pasid_get_table(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) return NULL; return info->pasid_table; } -int intel_pasid_get_dev_max_id(struct device *dev) +static int intel_pasid_get_dev_max_id(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_table) return 0; return info->pasid_table->max_pasid; } -struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) +static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) { struct device_domain_info *info; struct pasid_table *pasid_table; @@ -230,28 +188,33 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) int dir_index, index; pasid_table = intel_pasid_get_table(dev); - if (WARN_ON(!pasid_table || pasid < 0 || - pasid >= intel_pasid_get_dev_max_id(dev))) + if (WARN_ON(!pasid_table || pasid >= intel_pasid_get_dev_max_id(dev))) return NULL; dir = pasid_table->table; - info = dev->archdata.iommu; + info = get_domain_info(dev); dir_index = pasid >> PASID_PDE_SHIFT; index = pasid & PASID_PTE_MASK; - spin_lock(&pasid_lock); +retry: entries = get_pasid_table_from_pde(&dir[dir_index]); if (!entries) { entries = alloc_pgtable_page(info->iommu->node); - if (!entries) { - spin_unlock(&pasid_lock); + if (!entries) return NULL; - } - WRITE_ONCE(dir[dir_index].val, - (u64)virt_to_phys(entries) | PASID_PTE_PRESENT); + /* + * The pasid directory table entry won't be freed after + * allocation. No worry about the race with free and + * clear. However, this entry might be populated by others + * while we are preparing it. 
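The install step just below is a classic publish-or-retry: compare-and-swap the new page into the empty slot, and on failure free our copy and take the winner's. A userspace sketch of the same pattern with GCC/Clang builtins (cmpxchg64() is the kernel's equivalent):

#include <stdint.h>
#include <stdio.h>

static uint64_t slot;   /* stands in for dir[dir_index].val */

/* Publish our value only if the slot is still empty; otherwise the
 * caller discards its copy and uses the already-published one. */
static uint64_t install(uint64_t mine)
{
	uint64_t expected = 0;

	if (__atomic_compare_exchange_n(&slot, &expected, mine, 0,
					__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		return mine;    /* we won: our page is published */
	return expected;        /* we lost: free ours, take theirs */
}

int main(void)
{
	printf("first:  0x%llx\n", (unsigned long long)install(0x1000 | 1));
	printf("second: 0x%llx\n", (unsigned long long)install(0x2000 | 1));
	return 0;
}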
Use theirs with a retry. + */ + if (cmpxchg64(&dir[dir_index].val, 0ULL, + (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { + free_pgtable_page(entries); + goto retry; + } } - spin_unlock(&pasid_lock); return &entries[index]; } @@ -271,15 +234,50 @@ static inline void pasid_clear_entry(struct pasid_entry *pe) WRITE_ONCE(pe->val[7], 0); } -static void intel_pasid_clear_entry(struct device *dev, int pasid) +static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], PASID_PTE_FPD); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static void +intel_pasid_clear_entry(struct intel_iommu *iommu, struct device *dev, + u32 pasid, bool fault_ignore, bool keep_pte) { struct pasid_entry *pe; + bool keep_slt = false; + u64 pe_val; + u8 pgtt = 0; pe = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pe)) return; - pasid_clear_entry(pe); + /* + * The guest may reboot from scalable mode to legacy mode. During this + * phase, there is no chance to setup SLT. So, we should only reset PGTT + * from NESTED to SL and keep other bits when unbind gpasid is executed. + */ + pe_val = READ_ONCE(pe->val[0]); + pgtt = (pe_val >> 6) & 0x7; + keep_slt = (pgtt == PASID_ENTRY_PGTT_NESTED || + pgtt == PASID_ENTRY_PGTT_SL_ONLY); + if (keep_slt && keep_pte) { + pe_val &= 0xfffffffffffffebf; + WRITE_ONCE(pe->val[0], pe_val); + return; + } + + if (fault_ignore && pasid_pte_is_present(pe)) + pasid_clear_entry_with_fpd(pe); + else + pasid_clear_entry(pe); } static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) @@ -357,6 +355,15 @@ static inline void pasid_set_sre(struct pasid_entry *pe) pasid_set_bits(&pe->val[2], 1 << 0, 1); } +/* + * Setup the WPE(Write Protect Enable) field (Bit 132) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_wpe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); +} + /* * Setup the P(Present) field (Bit 0) of a scalable mode PASID * entry. @@ -375,6 +382,16 @@ static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) pasid_set_bits(&pe->val[1], 1 << 23, value << 23); } +/* + * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode + * PASID entry. + */ +static inline void +pasid_set_pgsnp(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); +} + /* * Setup the First Level Page table Pointer field (Bit 140~191) * of a scalable mode PASID entry. @@ -395,42 +412,48 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } -static void -pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, - u16 did, int pasid) +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_eafe(struct pasid_entry *pe) { - struct qi_desc desc; - - desc.qw0 = QI_PC_DID(did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid); - desc.qw1 = 0; - desc.qw2 = 0; - desc.qw3 = 0; + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} - qi_submit_sync(&desc, iommu); +/* + * Setup Second Level Access/Dirty bit Enable field (Bit 9) of a + * scalable mode PASID entry. 
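The magic mask in the keep_slt path above falls out of the PGTT field layout: PGTT occupies bits 8:6 of val[0], NESTED is 011b, and clearing bits 6 and 8 leaves 010b, which is SL_ONLY. A runnable check of that derivation:

#include <stdint.h>
#include <stdio.h>

/* PGTT lives in bits 8:6 of val[0] (see the '(pe_val >> 6) & 0x7'
 * reads above). Downgrading NESTED (011b) to SL_ONLY (010b) needs
 * exactly bits 6 and 8 cleared, which is where 0xfffffffffffffebf
 * comes from. */
int main(void)
{
	uint64_t mask = ~((1ULL << 6) | (1ULL << 8));
	uint64_t val = (3ULL << 6) | 1;   /* PGTT = NESTED, Present */

	printf("mask = 0x%016llx\n", (unsigned long long)mask);
	printf("pgtt = %llu -> %llu\n",
	       (unsigned long long)((val >> 6) & 0x7),
	       (unsigned long long)(((val & mask) >> 6) & 0x7));
	return 0;
}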
+ */ +static inline void pasid_set_slade(struct pasid_entry *pe, bool value) +{ + pasid_set_bits(&pe->val[0], 1 << 9, value << 9); } static void -iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) +pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, + u16 did, u32 pasid) { struct qi_desc desc; - desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE; + desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) | + QI_PC_PASID(pasid) | QI_PC_TYPE; desc.qw1 = 0; desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void devtlb_invalidation_with_pasid(struct intel_iommu *iommu, - struct device *dev, int pasid) + struct device *dev, u32 pasid) { struct device_domain_info *info; u16 sid, qdep, pfsid; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->ats_enabled) return; @@ -438,40 +461,112 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, qdep = info->ats_qdep; pfsid = info->pfsid; - qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); + /* + * When PASID 0 is used, it indicates RID2PASID(DMA request w/o PASID), + * devTLB flush w/o PASID should be used. For non-zero PASID under + * SVA usage, device could do DMA with multiple PASIDs. It is more + * efficient to flush devTLB specific to the PASID. + */ + if (pasid == PASID_RID2PASID) + qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); + else + qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT); + /* + * Flush the kernel PASID if used by the device. This is the case where + * a device driver uses IOVA via DMA map APIs for request with PASID. + */ + if (dev->pasid) + qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, dev->pasid, qdep, 0, + 64 - VTD_PAGE_SHIFT); +} + +static void +flush_iotlb_all(struct intel_iommu *iommu, struct device *dev, + u16 did, u32 pasid, u64 type) +{ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + + if (type) + iommu->flush.flush_iotlb(iommu, did, 0, 0, type); + else + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + if (!cap_caching_mode(iommu->cap)) + devtlb_invalidation_with_pasid(iommu, dev, pasid); } -void intel_pasid_tear_down_entry(struct intel_iommu *iommu, - struct device *dev, int pasid) +void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, + u32 pasid, bool fault_ignore, bool keep_pte) { struct pasid_entry *pte; u16 did; + u64 pe_val; + u16 pgtt_type; pte = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pte)) return; + if (!(pte->val[0] & PASID_PTE_PRESENT)) + return; + did = pasid_get_domain_id(pte); - intel_pasid_clear_entry(dev, pasid); + + pe_val = READ_ONCE(pte->val[0]); + pgtt_type = (pe_val >> 6) & 0x7; + + intel_pasid_clear_entry(iommu, dev, pasid, fault_ignore, keep_pte); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); + if (pgtt_type == PASID_ENTRY_PGTT_FL_ONLY || + pgtt_type == PASID_ENTRY_PGTT_PT) + flush_iotlb_all(iommu, dev, did, pasid, 0); + else + flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH); +} - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); +/* + * This function flushes cache for a newly setup pasid table entry. 
+ * Caller of it should not modify the in-use pasid table entries. + */ +static void pasid_flush_caches(struct intel_iommu *iommu, + struct pasid_entry *pte, + u32 pasid, u16 did) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + if (cap_caching_mode(iommu->cap)) { + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + } else { + iommu_flush_write_buffer(iommu); + } } +static inline int pasid_enable_wpe(struct pasid_entry *pte) +{ + unsigned long cr0 = read_cr0(); + + /* CR0.WP is normally set but just to be sure */ + if (unlikely(!(cr0 & X86_CR0_WP))) { + pr_err_ratelimited("No CPU write protect!\n"); + return -EINVAL; + } + pasid_set_wpe(pte); + + return 0; +}; + /* * Set up the scalable mode pasid table entry for first only * translation type. */ int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, pgd_t *pgd, - int pasid, u16 did, int flags) + u32 pasid, u16 did, int flags) { struct pasid_entry *pte; @@ -496,40 +591,53 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return -EINVAL; } pasid_set_sre(pte); + if (pasid_enable_wpe(pte)) + return -EINVAL; + } -#ifdef CONFIG_X86 - /* Both CPU and IOMMU paging mode need to match */ - if (cpu_feature_enabled(X86_FEATURE_LA57)) { + if (flags & PASID_FLAG_FL5LP) { if (cap_5lp_support(iommu->cap)) { pasid_set_flpm(pte, 1); } else { - pr_err("VT-d has no 5-level paging support for CPU\n"); + pr_err("No 5-level paging support for first-level\n"); pasid_clear_entry(pte); return -EINVAL; } } -#endif /* CONFIG_X86 */ + + if (flags & PASID_FLAG_PAGE_SNOOP) + pasid_set_pgsnp(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, 1); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); pasid_set_present(pte); + pasid_flush_caches(iommu, pte, pasid, did); - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); + return 0; +} - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); +/* + * Skip top levels of page tables for iommu which has less agaw + * than default. Unnecessary for PT mode. + */ +static inline int iommu_skip_agaw(struct dmar_domain *domain, + struct intel_iommu *iommu, + struct dma_pte **pgd) +{ + int agaw; + + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + *pgd = phys_to_virt(dma_pte_addr(*pgd)); + if (!dma_pte_present(*pgd)) + return -EINVAL; } - return 0; + return agaw; } /* @@ -537,7 +645,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, */ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, - struct device *dev, int pasid) + struct device *dev, u32 pasid) { struct pasid_entry *pte; struct dma_pte *pgd; @@ -555,17 +663,11 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EINVAL; } - /* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. 
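A toy model of iommu_skip_agaw() above: when the domain page table is deeper than the IOMMU supports, descend through top-level entries until the widths match (the structures here are illustrative, not the real dma_pte):

#include <stdio.h>

struct toy_pte { struct toy_pte *next; int present; };

/* Walk down one level per excess agaw, failing if a top-level
 * entry is absent, just as the real helper returns -EINVAL. */
static int skip_agaw(int domain_agaw, int iommu_agaw, struct toy_pte **pgd)
{
	int agaw;

	for (agaw = domain_agaw; agaw > iommu_agaw; agaw--) {
		if (!(*pgd)->present || !(*pgd)->next)
			return -1;
		*pgd = (*pgd)->next;
	}
	return agaw;
}

int main(void)
{
	struct toy_pte l4 = { .present = 1 };              /* 4-level root */
	struct toy_pte l5 = { .next = &l4, .present = 1 }; /* 5-level root */
	struct toy_pte *pgd = &l5;

	/* Domain built 5-level (agaw 3), IOMMU does 4-level (agaw 2). */
	printf("agaw after skip: %d, dropped one level: %d\n",
	       skip_agaw(3, 2, &pgd), pgd == &l4);
	return 0;
}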
- */ pgd = domain->pgd; - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) { - dev_err(dev, "Invalid domain page table\n"); - return -EINVAL; - } + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err(dev, "Invalid domain page table\n"); + return -EINVAL; } pgd_val = virt_to_phys(pgd); @@ -581,26 +683,23 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, 2); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) + pasid_set_pgsnp(pte); + /* * Since it is a second level only translation setup, we should * set SRE bit as well (addresses are expected to be GPAs). */ - pasid_set_sre(pte); + if (pasid != PASID_RID2PASID) + pasid_set_sre(pte); + if (slad_support()) + pasid_set_slade(pte, true); pasid_set_present(pte); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); - } + pasid_flush_caches(iommu, pte, pasid, did); return 0; } @@ -610,7 +709,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, */ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, - struct device *dev, int pasid) + struct device *dev, u32 pasid) { u16 did = FLPT_DEFAULT_DID; struct pasid_entry *pte; @@ -624,7 +723,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, 4); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -634,16 +733,217 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, */ pasid_set_sre(pte); pasid_set_present(pte); + pasid_flush_caches(iommu, pte, pasid, did); - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); + return 0; +} - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); +static int +intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, + struct iommu_gpasid_bind_data_vtd *pasid_data) +{ + /* + * Not all guest PASID table entry fields are passed down during bind, + * here we only set up the ones that are dependent on guest settings. + * Execution related bits such as NXE, SMEP are not supported. + * Other fields, such as snoop related, are set based on host needs + * regardless of guest settings. 
+ */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) { + if (!ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_sre(pte); + /* Enable write protect WP if guest requested */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE) { + if (pasid_enable_wpe(pte)) + return -EINVAL; + } + } + + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) { + if (!ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_eafe(pte); + } + + /* + * Memory type is only applicable to devices inside processor coherent + * domain. Will add MTS support once coherent devices are available. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) { + pr_warn_ratelimited("No memory type support %s\n", + iommu->name); + return -EINVAL; + } + + return 0; +} + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * This could be used for guest shared virtual address. In this case, the + * first level page tables are used for GVA-GPA translation in the guest, + * second level page tables are used for GPA-HPA translation. + * + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @gpgd: FLPTPTR: First Level Page translation pointer in GPA + * @pasid: PASID to be programmed in the device PASID table + * @pasid_data: Additional PASID info from the guest bind request + * @domain: Domain info for setting up second level page tables + * @addr_width: Address width of the first level (guest) + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + pgd_t *gpgd, u32 pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width) +{ + struct pasid_entry *pte; + struct dma_pte *pgd; + int ret = 0; + u64 pgd_val; + int agaw; + u16 did; + bool pasid_present; + + if (!ecap_nest(iommu->ecap)) { + pr_err_ratelimited("IOMMU: %s: No nested translation support\n", + iommu->name); + return -EINVAL; } + if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) { + pr_err_ratelimited("Domain is not in nesting mode, %x\n", + domain->flags); + return -EINVAL; + } + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -EINVAL; + + /* + * PASID entries with nesting translation type should not be set + * multiple times. If caller tries to setup nesting for a PASID + * entry which is already nested mode, should fail it. + */ + pasid_present = pasid_pte_is_present(pte); + + if (pasid_present && pasid_pte_is_nested(pte)) + return -EBUSY; + + pasid_clear_entry(pte); + + did = domain->iommu_did[iommu->seq_id]; + + if (pasid_present) + flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH); + + /* Sanity checking performed by caller to make sure address + * width matching in two dimensions: + * 1. CPU vs. IOMMU + * 2. Guest vs. Host. 
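The switch below maps the guest address width onto the first-level paging mode bit. A standalone sketch of that mapping (the 48/57 width values follow the usual VT-d ADDR_WIDTH_* definitions and are assumed here):

#include <stdio.h>

#define ADDR_WIDTH_4LEVEL 48
#define ADDR_WIDTH_5LEVEL 57

/* 57-bit means 5-level first-level paging, 48-bit means 4-level;
 * anything else is rejected, as in the switch below. */
static int addr_width_to_flpm(int addr_width)
{
	switch (addr_width) {
	case ADDR_WIDTH_5LEVEL: return 1;   /* pasid_set_flpm(pte, 1) */
	case ADDR_WIDTH_4LEVEL: return 0;   /* pasid_set_flpm(pte, 0) */
	default:                return -1;  /* -EINVAL in the real code */
	}
}

int main(void)
{
	printf("57 -> flpm %d, 48 -> flpm %d, 39 -> %d\n",
	       addr_width_to_flpm(57), addr_width_to_flpm(48),
	       addr_width_to_flpm(39));
	return 0;
}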
+ */ + switch (addr_width) { +#ifdef CONFIG_X86 + case ADDR_WIDTH_5LEVEL: + if (!cpu_feature_enabled(X86_FEATURE_LA57) || + !cap_5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + + pasid_set_flpm(pte, 1); + break; +#endif + case ADDR_WIDTH_4LEVEL: + pasid_set_flpm(pte, 0); + break; + default: + dev_err_ratelimited(dev, "Invalid guest address width %d\n", + addr_width); + return -EINVAL; + } + + /* First level PGD is in GPA, must be supported by the second level */ + if ((uintptr_t)gpgd > domain->max_addr) { + dev_err_ratelimited(dev, + "Guest PGD %lx not supported, max %llx\n", + (uintptr_t)gpgd, domain->max_addr); + return -EINVAL; + } + pasid_set_flptr(pte, (uintptr_t)gpgd); + + ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data); + if (ret) + return ret; + + /* Setup the second level based on the given domain */ + pgd = domain->pgd; + + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err_ratelimited(dev, "Invalid domain page table\n"); + return -EINVAL; + } + pgd_val = virt_to_phys(pgd); + pasid_set_slptr(pte, pgd_val); + pasid_set_fault_enable(pte); + + pasid_set_domain_id(pte, did); + + pasid_set_address_width(pte, agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + if (slad_support()) + pasid_set_slade(pte, true); + pasid_set_present(pte); + pasid_flush_caches(iommu, pte, pasid, did); + + return ret; +} + +/** + * intel_pasid_setup_slade() - Set up Second Level Access/Dirty bit Enable + * field in PASID entry for scalable mode pasid table. + * + * @dev: Device to be set up for translation + * @domain: Domain info for setting up slad enabling + * @pasid: PASID to be programmed in the device PASID table + * @value: Value set to the entry + */ +int intel_pasid_setup_slade(struct device *dev, struct dmar_domain *domain, + u32 pasid, bool value) +{ + struct device_domain_info *info = get_domain_info(dev); + struct intel_iommu *iommu; + struct pasid_entry *pte; + u16 did; + + if (!info || !info->iommu) + return -ENODEV; + + iommu = info->iommu; + did = domain->iommu_did[iommu->seq_id]; + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -ENODEV; + + if (!pasid_pte_is_present(pte)) + return -EINVAL; + + pasid_set_slade(pte, value); + + flush_iotlb_all(iommu, dev, did, pasid, DMA_TLB_DSI_FLUSH); + return 0; } diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel/pasid.h similarity index 55% rename from drivers/iommu/intel-pasid.h rename to drivers/iommu/intel/pasid.h index fc8cd8f17de166a8c96c82c3e73b3e1c3f1572d0..71057dc748131d5e5bead205ca00c4ce79a311f9 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * intel-pasid.h - PASID idr, table and entry header + * pasid.h - PASID idr, table and entry header * * Copyright (C) 2018 Intel Corporation * @@ -10,11 +10,11 @@ #ifndef __INTEL_PASID_H #define __INTEL_PASID_H -#define PASID_RID2PASID 0x0 -#define PASID_MIN 0x1 +#define PASID_RID2PASID IOASID_DMA_NO_PASID #define PASID_MAX 0x100000 #define PASID_PTE_MASK 0x3F #define PASID_PTE_PRESENT 1 +#define PASID_PTE_FPD 2 #define PDE_PFN_MASK PAGE_MASK #define PASID_PDE_SHIFT 6 #define MAX_NR_PASID_BITS 20 @@ -23,6 +23,16 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) +/* Virtual command interface for enlightened pasid management. 
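A response read from DMAR_VCRSP_REG can be decoded with the macros defined next, as vcmd_alloc_pasid() does. A runnable example using an assumed response value:

#include <stdint.h>
#include <stdio.h>

/* Bit 0 is the in-progress flag, bits 2:1 the status code, and the
 * allocated PASID sits in bits 27:8; macros copied from below. */
#define VCMD_VRSP_IP              0x1
#define VCMD_VRSP_SC(e)           (((e) >> 1) & 0x3)
#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff)

int main(void)
{
	uint64_t res = (uint64_t)0x1234 << 8;   /* example: success, PASID 0x1234 */

	if (res & VCMD_VRSP_IP)
		printf("still in progress\n");
	else if (VCMD_VRSP_SC(res) == 0)
		printf("allocated pasid 0x%llx\n",
		       (unsigned long long)VCMD_VRSP_RESULT_PASID(res));
	else
		printf("error sc=%llu\n", (unsigned long long)VCMD_VRSP_SC(res));
	return 0;
}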
*/ +#define VCMD_CMD_ALLOC 0x1 +#define VCMD_CMD_FREE 0x2 +#define VCMD_VRSP_IP 0x1 +#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) +#define VCMD_VRSP_SC_SUCCESS 0 +#define VCMD_VRSP_SC_NO_PASID_AVAIL 2 +#define VCMD_VRSP_SC_INVALID_PASID 2 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 8) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. @@ -36,6 +46,14 @@ * to vmalloc or even module mappings. */ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) +#define PASID_FLAG_NESTED BIT(1) +#define PASID_FLAG_PAGE_SNOOP BIT(2) + +/* + * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- + * level translation, otherwise, 4-level paging will be used. + */ +#define PASID_FLAG_FL5LP BIT(1) struct pasid_dir_entry { u64 val; @@ -45,12 +63,16 @@ struct pasid_entry { u64 val[8]; }; +#define PASID_ENTRY_PGTT_FL_ONLY (1) +#define PASID_ENTRY_PGTT_SL_ONLY (2) +#define PASID_ENTRY_PGTT_NESTED (3) +#define PASID_ENTRY_PGTT_PT (4) + /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ int order; /* page order of pasid table */ - int max_pasid; /* max pasid */ - struct list_head dev; /* device list */ + u32 max_pasid; /* max pasid */ }; /* Get PRESENT bit of a PASID directory entry. */ @@ -75,25 +97,38 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } -extern u32 intel_pasid_max_id; +/* Check if PGTT bits of a PASID table entry is nested. */ +static inline bool pasid_pte_is_nested(struct pasid_entry *pte) +{ + return ((READ_ONCE(pte->val[0]) >> 6) & 0x7) == PASID_ENTRY_PGTT_NESTED; +} + +extern unsigned int intel_pasid_max_id; +extern struct ioasid_set *host_pasid_set; int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp); -void intel_pasid_free_id(int pasid); -void *intel_pasid_lookup_id(int pasid); +void intel_pasid_free_id(u32 pasid); +void *intel_pasid_lookup_id(u32 pasid); int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); -int intel_pasid_get_dev_max_id(struct device *dev); -struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid); int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, pgd_t *pgd, - int pasid, u16 did, int flags); + u32 pasid, u16 did, int flags); int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, - struct device *dev, int pasid); + struct device *dev, u32 pasid); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, - struct device *dev, int pasid); + struct device *dev, u32 pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, u32 pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, - struct device *dev, int pasid); - + struct device *dev, u32 pasid, + bool fault_ignore, bool keep_pte); +int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); +void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); +int intel_pasid_setup_slade(struct device *dev, struct dmar_domain *domain, + u32 pasid, bool value); #endif /* __INTEL_PASID_H */ diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c new file mode 100644 index 
0000000000000000000000000000000000000000..faaa96dda437703a65126eb8732995a4dc52df75 --- /dev/null +++ b/drivers/iommu/intel/perf.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * perf.c - performance monitor + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Lu Baolu + * Fenghua Yu + */ + +#include +#include + +#include "perf.h" + +static DEFINE_SPINLOCK(latency_lock); + +bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + + return lstat && lstat[type].enabled; +} + +int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat; + unsigned long flags; + int ret = -EBUSY; + + if (dmar_latency_enabled(iommu, type)) + return 0; + + spin_lock_irqsave(&latency_lock, flags); + if (!iommu->perf_statistic) { + iommu->perf_statistic = kzalloc(sizeof(*lstat) * DMAR_LATENCY_NUM, + GFP_ATOMIC); + if (!iommu->perf_statistic) { + ret = -ENOMEM; + goto unlock_out; + } + } + + lstat = iommu->perf_statistic; + + if (!lstat[type].enabled) { + lstat[type].enabled = true; + lstat[type].counter[COUNTS_MIN] = UINT_MAX; + ret = 0; + } +unlock_out: + spin_unlock_irqrestore(&latency_lock, flags); + + return ret; +} + +void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + + if (!dmar_latency_enabled(iommu, type)) + return; + + spin_lock_irqsave(&latency_lock, flags); + memset(&lstat[type], 0, sizeof(*lstat) * DMAR_LATENCY_NUM); + spin_unlock_irqrestore(&latency_lock, flags); +} + +void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + u64 min, max; + + if (!dmar_latency_enabled(iommu, type)) + return; + + spin_lock_irqsave(&latency_lock, flags); + if (latency < 100) + lstat[type].counter[COUNTS_10e2]++; + else if (latency < 1000) + lstat[type].counter[COUNTS_10e3]++; + else if (latency < 10000) + lstat[type].counter[COUNTS_10e4]++; + else if (latency < 100000) + lstat[type].counter[COUNTS_10e5]++; + else if (latency < 1000000) + lstat[type].counter[COUNTS_10e6]++; + else if (latency < 10000000) + lstat[type].counter[COUNTS_10e7]++; + else + lstat[type].counter[COUNTS_10e8_plus]++; + + min = lstat[type].counter[COUNTS_MIN]; + max = lstat[type].counter[COUNTS_MAX]; + lstat[type].counter[COUNTS_MIN] = min_t(u64, min, latency); + lstat[type].counter[COUNTS_MAX] = max_t(u64, max, latency); + lstat[type].counter[COUNTS_SUM] += latency; + lstat[type].samples++; + spin_unlock_irqrestore(&latency_lock, flags); +} + +static char *latency_counter_names[] = { + " <0.1us", + " 0.1us-1us", " 1us-10us", " 10us-100us", + " 100us-1ms", " 1ms-10ms", " >=10ms", + " min(us)", " max(us)", " average(us)" +}; + +static char *latency_type_names[] = { + " inv_iotlb", " inv_devtlb", " inv_iec", + " svm_prq" +}; + +int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + int bytes = 0, i, j; + + memset(str, 0, size); + + for (i = 0; i < COUNTS_NUM; i++) + bytes += snprintf(str + bytes, size - bytes, + "%s", latency_counter_names[i]); + + spin_lock_irqsave(&latency_lock, flags); + for (i = 0; i < DMAR_LATENCY_NUM; i++) { + if (!dmar_latency_enabled(iommu, i)) + continue; + + bytes += snprintf(str + bytes, size - bytes, + "\n%s", 
latency_type_names[i]); + + for (j = 0; j < COUNTS_NUM; j++) { + u64 val = lstat[i].counter[j]; + + switch (j) { + case COUNTS_MIN: + if (val == UINT_MAX) + val = 0; + else + val /= 1000; + break; + case COUNTS_MAX: + val /= 1000; + break; + case COUNTS_SUM: + if (lstat[i].samples) + val /= (lstat[i].samples * 1000); + else + val = 0; + break; + default: + break; + } + + bytes += snprintf(str + bytes, size - bytes, + "%12lld", val); + } + } + spin_unlock_irqrestore(&latency_lock, flags); + + return bytes; +} diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h new file mode 100644 index 0000000000000000000000000000000000000000..fd6db8049d1a773fa3c1dd9482f9b9a64b12061f --- /dev/null +++ b/drivers/iommu/intel/perf.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf.h - performance monitor header + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Lu Baolu + */ + +enum latency_type { + DMAR_LATENCY_INV_IOTLB = 0, + DMAR_LATENCY_INV_DEVTLB, + DMAR_LATENCY_INV_IEC, + DMAR_LATENCY_PRQ, + DMAR_LATENCY_NUM +}; + +enum latency_count { + COUNTS_10e2 = 0, /* < 0.1us */ + COUNTS_10e3, /* 0.1us ~ 1us */ + COUNTS_10e4, /* 1us ~ 10us */ + COUNTS_10e5, /* 10us ~ 100us */ + COUNTS_10e6, /* 100us ~ 1ms */ + COUNTS_10e7, /* 1ms ~ 10ms */ + COUNTS_10e8_plus, /* 10ms and plus*/ + COUNTS_MIN, + COUNTS_MAX, + COUNTS_SUM, + COUNTS_NUM +}; + +struct latency_statistic { + bool enabled; + u64 counter[COUNTS_NUM]; + u64 samples; +}; + +#ifdef CONFIG_DMAR_PERF +int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type); +void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type); +bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type); +void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, + u64 latency); +int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); +#else +static inline int +dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) +{ + return -EINVAL; +} + +static inline void +dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type) +{ +} + +static inline bool +dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type) +{ + return false; +} + +static inline void +dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency) +{ +} + +static inline int +dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +{ + return 0; +} +#endif /* CONFIG_DMAR_PERF */ diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel/svm.c similarity index 30% rename from drivers/iommu/intel-svm.c rename to drivers/iommu/intel/svm.c index a3739f626629c8c2ba04915e8f5f426529ddb3e9..a932e0a8cc0f2ba524c36c7cb0faf7d095ee1fd7 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel/svm.c @@ -17,33 +17,26 @@ #include #include #include +#include +#include #include +#include +#include -#include "intel-pasid.h" +#include "pasid.h" +#include "perf.h" static irqreturn_t prq_event_thread(int irq, void *d); +static void intel_svm_drain_prq(struct device *dev, u32 pasid); -int intel_svm_init(struct intel_iommu *iommu) -{ - if (cpu_feature_enabled(X86_FEATURE_GBPAGES) && - !cap_fl1gp_support(iommu->cap)) - return -EINVAL; - - if (cpu_feature_enabled(X86_FEATURE_LA57) && - !cap_5lp_support(iommu->cap)) - return -EINVAL; - - return 0; -} - -#define PRQ_ORDER 0 +extern int prq_size_page_order; int intel_svm_enable_prq(struct intel_iommu *iommu) { struct page *pages; int irq, ret; - pages = alloc_pages(GFP_KERNEL | 
__GFP_ZERO, PRQ_ORDER); + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, prq_size_page_order); if (!pages) { pr_warn("IOMMU: %s: Failed to allocate page request queue\n", iommu->name); @@ -57,7 +50,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) iommu->name); ret = -EINVAL; err: - free_pages((unsigned long)iommu->prq, PRQ_ORDER); + free_pages((unsigned long)iommu->prq, prq_size_page_order); iommu->prq = NULL; return ret; } @@ -76,7 +69,9 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) } dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | prq_size_page_order); + + init_completion(&iommu->prq_complete); return 0; } @@ -93,7 +88,7 @@ int intel_svm_finish_prq(struct intel_iommu *iommu) iommu->pr_irq = 0; } - free_pages((unsigned long)iommu->prq, PRQ_ORDER); + free_pages((unsigned long)iommu->prq, prq_size_page_order); iommu->prq = NULL; return 0; @@ -104,53 +99,168 @@ static void __flush_svm_range_dev(struct intel_svm *svm, unsigned long address, unsigned long pages, int ih) { - struct qi_desc desc; + struct device_domain_info *info = get_domain_info(sdev->dev); - if (pages == -1) { - desc.qw0 = QI_EIOTLB_PASID(svm->pasid) | - QI_EIOTLB_DID(sdev->did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = 0; - } else { - int mask = ilog2(__roundup_pow_of_two(pages)); - - desc.qw0 = QI_EIOTLB_PASID(svm->pasid) | - QI_EIOTLB_DID(sdev->did) | - QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = QI_EIOTLB_ADDR(address) | - QI_EIOTLB_IH(ih) | - QI_EIOTLB_AM(mask); - } - desc.qw2 = 0; - desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); - - if (sdev->dev_iotlb) { - desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | - QI_DEV_EIOTLB_SID(sdev->sid) | - QI_DEV_EIOTLB_QDEP(sdev->qdep) | - QI_DEIOTLB_TYPE; - if (pages == -1) { - desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) | - QI_DEV_EIOTLB_SIZE; - } else if (pages > 1) { - /* The least significant zero bit indicates the size. So, - * for example, an "address" value of 0x12345f000 will - * flush from 0x123440000 to 0x12347ffff (256KiB). */ - unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT); - unsigned long mask = __rounddown_pow_of_two(address ^ last); - - desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) | - (mask - 1)) | QI_DEV_EIOTLB_SIZE; - } else { - desc.qw1 = QI_DEV_EIOTLB_ADDR(address); + if (WARN_ON(!pages)) + return; + + qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih); + if (info->ats_enabled) + qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid, + svm->pasid, sdev->qdep, address, + order_base_2(pages)); +} + +static inline bool intel_svm_capable(struct intel_iommu *iommu) +{ + return iommu->flags & VTD_FLAG_SVM_CAPABLE; +} + +static inline void intel_svm_drop_pasid(ioasid_t pasid, u64 flags) +{ + /* + * Detaching SPID results in UNBIND notification on the set, we must + * do this before dropping the IOASID reference, otherwise the + * notification chain may get destroyed. 
+ */ + if (!(flags & IOMMU_SVA_HPASID_DEF)) + ioasid_detach_spid(pasid); + ioasid_detach_data(pasid); + ioasid_put(NULL, pasid); +} + +static DEFINE_MUTEX(pasid_mutex); +#define pasid_lock_held() lock_is_held(&pasid_mutex.dep_map) + +static void intel_svm_free_async_fn(struct work_struct *work) +{ + struct intel_svm *svm = container_of(work, struct intel_svm, work); + struct intel_svm_dev *sdev, *subdev, *tmp; + LIST_HEAD(subdevs); + u32 pasid = svm->pasid; + + /* + * Unbind all devices associated with this PASID which is + * being freed by other users such as VFIO. + */ + mutex_lock(&pasid_mutex); + list_for_each_entry_rcu(sdev, &svm->devs, list, pasid_lock_held()) { + /* Does not poison forward pointer */ + list_del_rcu(&sdev->list); + spin_lock(&sdev->iommu->lock); + intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, + svm->pasid, true, false); + intel_svm_drain_prq(sdev->dev, svm->pasid); + spin_unlock(&sdev->iommu->lock); + if (is_aux_domain(sdev->dev, &sdev->domain->domain)) { + subdev = kzalloc(sizeof(*subdev), GFP_KERNEL); + if (!subdev) { + dev_err_ratelimited(sdev->dev, "Failed to record for fault data del %u\n", pasid); + continue; + } + subdev->dev = sdev->dev; + + kfree_rcu(sdev, rcu); + /* + * Record the sdev and delete device_fault_data outside pasid_mutex + * protection to avoid race with page response and prq reporting. + */ + list_add_tail(&subdev->list, &subdevs); } - desc.qw2 = 0; - desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); } + /* + * We may not be the last user to drop the reference but since + * the PASID is in FREE_PENDING state, no one can get new reference. + * Therefore, we can safely free the private data svm. + */ + intel_svm_drop_pasid(svm->pasid, 0); + + /* + * Free before unbind can only happen with host PASIDs used for + * guest SVM. We get here because ioasid_free is called with + * outstanding references. So we need to drop the reference + * such that the PASID can be reclaimed. unbind_gpasid() after this + * will not result in dropping refcount since the private data is + * already detached. + */ + kfree(svm); + + mutex_unlock(&pasid_mutex); + + list_for_each_entry_safe(subdev, tmp, &subdevs, list) { + list_del(&subdev->list); + /* + * Partial assignment needs to delete fault data + */ + dev_dbg(subdev->dev, "try to del fault data for %u\n", pasid); + iommu_delete_device_fault_data(subdev->dev, pasid); + kfree(subdev); + } +} + + +static int pasid_status_change(struct notifier_block *nb, + unsigned long code, void *data) +{ + struct ioasid_nb_args *args = (struct ioasid_nb_args *)data; + struct intel_svm *svm = (struct intel_svm *)args->pdata; + int ret = NOTIFY_DONE; + + /* + * Notification private data is a choice of vendor driver when the + * IOASID is allocated or attached after allocation. When the data + * type changes, we must make modifications here accordingly. + */ + if (code == IOASID_NOTIFY_FREE) { + /* + * If PASID UNBIND happens before FREE, private data of the + * IOASID should be NULL, then we don't need to do anything. 
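A toy model of this notifier's contract: a FREE event only validates the private data and queues deferred work, never freeing in notifier context (all names below are illustrative stand-ins):

#include <stdio.h>

enum { NOTIFY_DONE, NOTIFY_OK, IOASID_NOTIFY_FREE = 100 };

struct toy_svm { unsigned int pasid; int work_queued; };

/* Validate pdata and PASID, then queue cleanup work, the same shape
 * as pasid_status_change() handing off to ioasid_queue_work(). */
static int on_event(unsigned long code, unsigned int id, struct toy_svm *svm)
{
	if (code != IOASID_NOTIFY_FREE || !svm || id != svm->pasid)
		return NOTIFY_DONE;
	svm->work_queued = 1;
	return NOTIFY_OK;
}

int main(void)
{
	struct toy_svm svm = { .pasid = 5 };

	printf("ret=%d queued=%d\n",
	       on_event(IOASID_NOTIFY_FREE, 5, &svm), svm.work_queued);
	return 0;
}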
+ */ + if (!svm) + goto done; + if (args->id != svm->pasid) { + pr_warn("Notify PASID does not match data %d : %d\n", + args->id, svm->pasid); + goto done; + } + if (!ioasid_queue_work(&svm->work)) + pr_warn("Cleanup work already queued\n"); + return NOTIFY_OK; + } +done: + return ret; +} + +static struct notifier_block pasid_nb = { + .notifier_call = pasid_status_change, +}; + +void intel_svm_add_pasid_notifier(void) +{ + /* Listen to all PASIDs, not specific to a set */ + ioasid_register_notifier(NULL, &pasid_nb); +} + +void intel_svm_check(struct intel_iommu *iommu) +{ + if (!pasid_supported(iommu)) + return; + + if (cpu_feature_enabled(X86_FEATURE_GBPAGES) && + !cap_fl1gp_support(iommu->cap)) { + pr_err("%s SVM disabled, incompatible 1GB page capability\n", + iommu->name); + return; + } + + if (cpu_feature_enabled(X86_FEATURE_LA57) && + !cap_5lp_support(iommu->cap)) { + pr_err("%s SVM disabled, incompatible paging mode\n", + iommu->name); + return; + } + + iommu->flags |= VTD_FLAG_SVM_CAPABLE; } static void intel_flush_svm_range_dev(struct intel_svm *svm, @@ -209,10 +319,9 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * *has* to handle gracefully without affecting other processes. */ rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) { - intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - } + list_for_each_entry_rcu(sdev, &svm->devs, list) + intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, + svm->pasid, true, false); rcu_read_unlock(); } @@ -222,22 +331,357 @@ static const struct mmu_notifier_ops intel_mmuops = { .invalidate_range = intel_invalidate_range, }; -static DEFINE_MUTEX(pasid_mutex); static LIST_HEAD(global_svm_list); -int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) +#define for_each_svm_dev(sdev, svm, d) \ + list_for_each_entry((sdev), &(svm)->devs, list) \ + if ((d) != (sdev)->dev) {} else + +static int pasid_to_svm_sdev(struct device *dev, + struct ioasid_set *set, + unsigned int pasid, + struct intel_svm **rsvm, + struct intel_svm_dev **rsdev) +{ + struct intel_svm_dev *d, *sdev = NULL; + struct intel_svm *svm; + + /* The caller should hold the pasid_mutex lock */ + if (WARN_ON(!mutex_is_locked(&pasid_mutex))) + return -EINVAL; + + if (pasid == INVALID_IOASID || pasid >= PASID_MAX) + return -EINVAL; + + svm = ioasid_find(set, pasid, NULL); + if (IS_ERR(svm)) { + if (pasid == PASID_RID2PASID) { + svm = NULL; + } else { + return PTR_ERR(svm); + } + } + + if (!svm) + goto out; + + /* + * If we found svm for the PASID, there must be at least one device + * bond. 
+ */ + if (WARN_ON(list_empty(&svm->devs))) + return -EINVAL; + + rcu_read_lock(); + list_for_each_entry_rcu(d, &svm->devs, list) { + if (d->dev == dev) { + sdev = d; + break; + } + } + rcu_read_unlock(); + +out: + *rsvm = svm; + *rsdev = sdev; + + return 0; +} + +int intel_svm_bind_gpasid(struct iommu_domain *domain, + struct device *dev, + struct iommu_gpasid_bind_data *data, + void *fault_data) { - struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct intel_svm_dev *sdev = NULL; + struct dmar_domain *dmar_domain; struct device_domain_info *info; - struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; - struct mm_struct *mm = NULL; + unsigned long iflags; + int ret = 0; + struct ioasid_set *pasid_set; + u64 hpasid_org; + + if (WARN_ON(!iommu) || !data) + return -EINVAL; + + if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + return -EINVAL; + + /* IOMMU core ensures argsz is more than the start of the union */ + if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd)) + return -EINVAL; + + /* Make sure no undefined flags are used in vendor data */ + if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1)) + return -EINVAL; + + if (!dev_is_pci(dev)) + return -ENOTSUPP; + + /* Except gIOVA binding, VT-d supports devices with full 20 bit PASIDs only */ + if ((data->flags & IOMMU_SVA_HPASID_DEF) == 0 && + pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) + return -EINVAL; + + dmar_domain = to_dmar_domain(domain); + pasid_set = NULL; //dmar_domain->pasid_set; + + /* + * We only check host PASID range, we have no knowledge to check + * guest PASID range. + */ + if (data->flags & IOMMU_SVA_HPASID_DEF) { + ret = domain_get_pasid(domain, dev); + if (ret < 0) + return ret; + hpasid_org = data->hpasid; + data->hpasid = ret; + /* TODO: may consider to use NULL because host_pasid_set is native scope */ + pasid_set = host_pasid_set; + } else if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) + return -EINVAL; + + info = get_domain_info(dev); + if (!info) + return -EINVAL; + + /* + * Partial assignment needs to add fault data per-pasid. + * Add the fault data in advance as per pasid entry setup it should + * be able to handle prq. And this should be outside of pasid_mutex + * to avoid race with page response and prq reporting. + */ + if (is_aux_domain(dev, domain) && fault_data) { + ret = iommu_add_device_fault_data(dev, data->hpasid, + fault_data); + if (ret) + return ret; + } + + mutex_lock(&pasid_mutex); + ret = pasid_to_svm_sdev(dev, pasid_set, + data->hpasid, &svm, &sdev); + if (ret) + goto out; + + if (sdev) { + /* + * Do not allow multiple bindings of the same device-PASID since + * there is only one SL page tables per PASID. We may revisit + * once sharing PGD across domains are supported. + */ + dev_warn_ratelimited(dev, "Already bound with PASID %u\n", + svm->pasid); + ret = -EBUSY; + goto out; + } + + if (!svm) { + /* We come here when PASID has never been bond to a device. */ + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) { + ret = -ENOMEM; + goto out; + } + svm->pasid = data->hpasid; + if (data->flags & IOMMU_SVA_GPASID_VAL) { + svm->gpasid = data->gpasid; + svm->flags |= SVM_FLAG_GUEST_PASID; + if (!(data->flags & IOMMU_SVA_HPASID_DEF)) + ioasid_attach_spid(data->hpasid, data->gpasid); + } + ioasid_attach_data(data->hpasid, svm); + ioasid_get(NULL, svm->pasid); + /* + * Set up cleanup async work in case IOASID core notify us PASID + * is freed before unbind. 
+		 */
+		INIT_WORK(&svm->work, intel_svm_free_async_fn);
+		INIT_LIST_HEAD_RCU(&svm->devs);
+	}
+	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+	if (!sdev) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	sdev->dev = dev;
+	sdev->sid = PCI_DEVID(info->bus, info->devfn);
+	sdev->iommu = iommu;
+	sdev->domain = dmar_domain;
+
+	/* Only count users if device has aux domains */
+	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+		sdev->users = 1;
+
+	/* For legacy device passthrough gIOVA usage, do not enable PASID */
+	if ((data->flags & IOMMU_SVA_HPASID_DEF) == 0 &&
+	    pci_max_pasids(to_pci_dev(dev)) == PASID_MAX) {
+		/* Set up device context entry for PASID if not enabled already */
+		ret = intel_iommu_enable_pasid(iommu, sdev->dev);
+		if (ret) {
+			dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
+			kfree(sdev);
+			goto out;
+		}
+	}
+
+	/*
+	 * The PASID table is per device for better security. Therefore, for
+	 * each bind of a new device, even with an existing PASID, we need to
+	 * call the nested mode setup function here.
+	 */
+	spin_lock_irqsave(&iommu->lock, iflags);
+	if (data->flags & IOMMU_SVA_SL_ONLY) {
+		ret = intel_pasid_setup_second_level(iommu, dmar_domain, dev, data->hpasid);
+	} else {
+		ret = intel_pasid_setup_nested(iommu, dev,
+					       (pgd_t *)(uintptr_t)data->gpgd,
+					       data->hpasid, &data->vendor.vtd, dmar_domain,
+					       data->addr_width);
+	}
+	spin_unlock_irqrestore(&iommu->lock, iflags);
+	if (ret) {
+		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
+				    data->hpasid, ret);
+		/*
+		 * The PASID entry should be in a cleared state if the nested
+		 * mode setup failed, so we only need to clear the IOASID
+		 * tracking data so that a later free call will succeed.
+		 */
+		kfree(sdev);
+		goto out;
+	}
+
+	svm->flags |= SVM_FLAG_GUEST_MODE;
+
+	init_rcu_head(&sdev->rcu);
+	list_add_rcu(&sdev->list, &svm->devs);
+ out:
+	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
+		ioasid_detach_data(data->hpasid);
+		kfree(svm);
+	}
+
+	if (data->flags & IOMMU_SVA_HPASID_DEF)
+		data->hpasid = hpasid_org;
+
+	mutex_unlock(&pasid_mutex);
+
+	if (ret && is_aux_domain(dev, domain) && fault_data)
+		iommu_delete_device_fault_data(dev,
+			(data->flags & IOMMU_SVA_HPASID_DEF) ? hpasid_org : data->hpasid);
+	return ret;
+}
+
+int intel_svm_unbind_gpasid(struct iommu_domain *domain,
+			    struct device *dev, u32 pasid, u64 user_flags)
+{
+	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
+	struct intel_svm_dev *sdev;
+	struct intel_svm *svm;
+	int ret;
+	struct dmar_domain *dmar_domain;
+	struct ioasid_set *pasid_set;
+	bool keep_pte = false;
+
+	if (WARN_ON(!iommu))
+		return -EINVAL;
+
+	dmar_domain = to_dmar_domain(domain);
+	pasid_set = NULL; // dmar_domain->pasid_set;
+
+	if (user_flags & IOMMU_SVA_HPASID_DEF) {
+		ret = domain_get_pasid(domain, dev);
+		if (ret < 0)
+			return ret;
+		pasid = ret;
+		pasid_set = host_pasid_set;
+		keep_pte = true;
+	}
+
+	mutex_lock(&pasid_mutex);
+	ret = pasid_to_svm_sdev(dev, pasid_set, pasid, &svm, &sdev);
+	if (ret)
+		goto out;
+
+	if (sdev) {
+		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
+			sdev->users--;
+		if (!sdev->users) {
+			list_del_rcu(&sdev->list);
+			intel_pasid_tear_down_entry(iommu, dev,
+						    svm->pasid, false, keep_pte);
+			intel_svm_drain_prq(dev, svm->pasid);
+
+			if (list_empty(&svm->devs)) {
+				/*
+				 * We do not free the IOASID here because the
+				 * IOMMU driver did not allocate it.
+				 * Unlike native SVM, the IOASID for guest use
+				 * was allocated prior to the bind call.
+				 * In any case, if the free call comes before
+				 * the unbind, the IOMMU driver will get
+				 * notified and perform the cleanup.
+				 */
+				intel_svm_drop_pasid(pasid, user_flags);
+				kfree(svm);
+			}
+		}
+	}
+out:
+	mutex_unlock(&pasid_mutex);
+	if (sdev) {
+		/*
+		 * Partial assignment needs to delete the fault data; this must
+		 * be done outside of pasid_mutex protection to avoid racing
+		 * with page response and PRQ reporting.
+		 */
+		if (is_aux_domain(dev, domain))
+			iommu_delete_device_fault_data(dev, pasid);
+		kfree_rcu(sdev, rcu);
+	}
+	return ret;
+}
+
+static void _load_pasid(void *unused)
+{
+// update_pasid();
+}
+
+static void load_pasid(struct mm_struct *mm, u32 pasid)
+{
+	mutex_lock(&mm->context.lock);
+
+	/* Synchronize with READ_ONCE in update_pasid(). */
+	smp_store_release(&mm->pasid, pasid);
+
+	/* Update PASID MSR on all CPUs running the mm's tasks. */
+	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);
+
+	mutex_unlock(&mm->context.lock);
+}
+
+/* Caller must hold pasid_mutex, mm reference */
+static int
+intel_svm_bind_mm(struct device *dev, unsigned int flags,
+		  struct mm_struct *mm, struct intel_svm_dev **sd)
+{
+	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
+	struct intel_svm *svm = NULL, *t;
+	struct device_domain_info *info;
+	struct intel_svm_dev *sdev;
+	unsigned long iflags;
 	int pasid_max;
 	int ret;
 
 	if (!iommu || dmar_disabled)
 		return -EINVAL;
 
+	if (!intel_svm_capable(iommu))
+		return -ENOTSUPP;
+
 	if (dev_is_pci(dev)) {
 		pasid_max = pci_max_pasids(to_pci_dev(dev));
 		if (pasid_max < 0)
@@ -245,44 +689,34 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 	} else
 		pasid_max = 1 << 20;
 
+	/* Binding a supervisor PASID should have mm == NULL */
 	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
-		if (!ecap_srs(iommu->ecap))
+		if (!ecap_srs(iommu->ecap) || mm) {
+			pr_err("Supervisor PASID with user-provided mm.\n");
 			return -EINVAL;
-	} else if (pasid) {
-		mm = get_task_mm(current);
-		BUG_ON(!mm);
+		}
 	}
 
-	mutex_lock(&pasid_mutex);
-	if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
-		struct intel_svm *t;
-
-		list_for_each_entry(t, &global_svm_list, list) {
-			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
-				continue;
-
-			svm = t;
-			if (svm->pasid >= pasid_max) {
-				dev_warn(dev,
-					 "Limited PASID width. Cannot use existing PASID %d\n",
-					 svm->pasid);
-				ret = -ENOSPC;
-				goto out;
-			}
+	list_for_each_entry(t, &global_svm_list, list) {
+		if (t->mm != mm)
+			continue;
 
-			list_for_each_entry(sdev, &svm->devs, list) {
-				if (dev == sdev->dev) {
-					if (sdev->ops != ops) {
-						ret = -EBUSY;
-						goto out;
-					}
-					sdev->users++;
-					goto success;
-				}
-			}
+		svm = t;
+		if (svm->pasid >= pasid_max) {
+			dev_warn(dev,
+				 "Limited PASID width. Cannot use existing PASID %d\n",
+				 svm->pasid);
+			ret = -ENOSPC;
+			goto out;
+		}
 
-			break;
+		/* Find the matching device in the svm list */
+		for_each_svm_dev(sdev, svm, dev) {
+			sdev->users++;
+			goto success;
 		}
+
+		break;
 	}
 
 	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
@@ -291,21 +725,15 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 		goto out;
 	}
 	sdev->dev = dev;
+	sdev->iommu = iommu;
 
 	ret = intel_iommu_enable_pasid(iommu, dev);
-	if (ret || !pasid) {
-		/* If they don't actually want to assign a PASID, this is
-		 * just an enabling check/preparation.
*/ - kfree(sdev); - goto out; - } - - info = dev->archdata.iommu; - if (!info || !info->pasid_supported) { + if (ret) { kfree(sdev); goto out; } + info = get_domain_info(dev); sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); if (info->ats_enabled) { @@ -315,9 +743,9 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ sdev->qdep = 0; } + sdev->domain = info->domain; /* Finish the setup now we know we're keeping it */ sdev->users = 1; - sdev->ops = ops; init_rcu_head(&sdev->rcu); if (!svm) { @@ -327,21 +755,19 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ kfree(sdev); goto out; } - svm->iommu = iommu; if (pasid_max > intel_pasid_max_id) pasid_max = intel_pasid_max_id; - /* Do not use PASID 0 in caching mode (virtualised IOMMU) */ - ret = intel_pasid_alloc_id(svm, - !!cap_caching_mode(iommu->cap), - pasid_max, GFP_KERNEL); - if (ret < 0) { + /* Do not use PASID 0, reserved for RID to PASID */ + svm->pasid = ioasid_alloc(host_pasid_set, IOASID_ALLOC_BASE, + pasid_max - 1, svm); + if (svm->pasid == INVALID_IOASID) { kfree(svm); kfree(sdev); + ret = -ENOSPC; goto out; } - svm->pasid = ret; svm->notifier.ops = &intel_mmuops; svm->mm = mm; svm->flags = flags; @@ -351,145 +777,117 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ if (mm) { ret = mmu_notifier_register(&svm->notifier, mm); if (ret) { - intel_pasid_free_id(svm->pasid); + ioasid_put(host_pasid_set, svm->pasid); kfree(svm); kfree(sdev); goto out; } } - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, iflags); ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, - mm ? 0 : PASID_FLAG_SUPERVISOR_MODE); - spin_unlock(&iommu->lock); + (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | + (cpu_feature_enabled(X86_FEATURE_LA57) ? + PASID_FLAG_FL5LP : 0)); + spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { if (mm) mmu_notifier_unregister(&svm->notifier, mm); - intel_pasid_free_id(svm->pasid); + ioasid_put(host_pasid_set, svm->pasid); kfree(svm); kfree(sdev); goto out; } list_add_tail(&svm->list, &global_svm_list); + if (mm) { + /* The newly allocated pasid is loaded to the mm. */ + load_pasid(mm, svm->pasid); + } } else { /* * Binding a new device with existing PASID, need to setup * the PASID entry. */ - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, iflags); ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, - mm ? 0 : PASID_FLAG_SUPERVISOR_MODE); - spin_unlock(&iommu->lock); + (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | + (cpu_feature_enabled(X86_FEATURE_LA57) ? 
+ PASID_FLAG_FL5LP : 0)); + spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { kfree(sdev); goto out; } } list_add_rcu(&sdev->list, &svm->devs); - - success: - *pasid = svm->pasid; +success: + sdev->pasid = svm->pasid; + sdev->sva.dev = dev; + if (sd) + *sd = sdev; ret = 0; - out: - mutex_unlock(&pasid_mutex); - if (mm) - mmput(mm); +out: return ret; } -EXPORT_SYMBOL_GPL(intel_svm_bind_mm); -int intel_svm_unbind_mm(struct device *dev, int pasid) +/* Caller must hold pasid_mutex */ +static int intel_svm_unbind_mm(struct device *dev, u32 pasid) { struct intel_svm_dev *sdev; struct intel_iommu *iommu; struct intel_svm *svm; int ret = -EINVAL; - mutex_lock(&pasid_mutex); - iommu = intel_svm_device_to_iommu(dev); + iommu = device_to_iommu(dev, NULL, NULL); if (!iommu) goto out; - svm = intel_pasid_lookup_id(pasid); - if (!svm) + ret = pasid_to_svm_sdev(dev, host_pasid_set, + pasid, &svm, &sdev); + if (ret) goto out; - list_for_each_entry(sdev, &svm->devs, list) { - if (dev == sdev->dev) { - ret = 0; - sdev->users--; - if (!sdev->users) { - list_del_rcu(&sdev->list); - /* Flush the PASID cache and IOTLB for this device. - * Note that we do depend on the hardware *not* using - * the PASID any more. Just as we depend on other - * devices never using PASIDs that they have no right - * to use. We have a *shared* PASID table, because it's - * large and has to be physically contiguous. So it's - * hard to be as defensive as we might like. */ - intel_pasid_tear_down_entry(iommu, dev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - kfree_rcu(sdev, rcu); - - if (list_empty(&svm->devs)) { - intel_pasid_free_id(svm->pasid); - if (svm->mm) - mmu_notifier_unregister(&svm->notifier, svm->mm); - - list_del(&svm->list); - - /* We mandate that no page faults may be outstanding - * for the PASID when intel_svm_unbind_mm() is called. - * If that is not obeyed, subtle errors will happen. - * Let's make them less subtle... */ - memset(svm, 0x6b, sizeof(*svm)); - kfree(svm); + if (sdev) { + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + /* Flush the PASID cache and IOTLB for this device. + * Note that we do depend on the hardware *not* using + * the PASID any more. Just as we depend on other + * devices never using PASIDs that they have no right + * to use. We have a *shared* PASID table, because it's + * large and has to be physically contiguous. So it's + * hard to be as defensive as we might like. */ + intel_pasid_tear_down_entry(iommu, dev, + svm->pasid, false, false); + intel_svm_drain_prq(dev, svm->pasid); + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + ioasid_put(host_pasid_set, svm->pasid); + if (svm->mm) { + mmu_notifier_unregister(&svm->notifier, svm->mm); + /* Clear mm's pasid. */ + load_pasid(svm->mm, PASID_DISABLED); } + list_del(&svm->list); + /* We mandate that no page faults may be outstanding + * for the PASID when intel_svm_unbind_mm() is called. + * If that is not obeyed, subtle errors will happen. + * Let's make them less subtle... 
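+			 * (0x6b matches the kernel's POISON_FREE pattern,
+			 * so a use-after-free will stand out.)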
*/ + memset(svm, 0x6b, sizeof(*svm)); + kfree(svm); } - break; } } - out: - mutex_unlock(&pasid_mutex); - +out: return ret; } -EXPORT_SYMBOL_GPL(intel_svm_unbind_mm); - -int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - struct intel_iommu *iommu; - struct intel_svm *svm; - int ret = -EINVAL; - - mutex_lock(&pasid_mutex); - iommu = intel_svm_device_to_iommu(dev); - if (!iommu) - goto out; - - svm = intel_pasid_lookup_id(pasid); - if (!svm) - goto out; - - /* init_mm is used in this case */ - if (!svm->mm) - ret = 1; - else if (atomic_read(&svm->mm->mm_users) > 0) - ret = 1; - else - ret = 0; - - out: - mutex_unlock(&pasid_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid); /* Page request queue descriptor */ struct page_req_dsc { @@ -520,8 +918,6 @@ struct page_req_dsc { u64 priv_data[2]; }; -#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) - static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req) { unsigned long requested = 0; @@ -546,11 +942,169 @@ static bool is_canonical_address(u64 addr) return (((saddr << shift) >> shift) == saddr); } +/** + * intel_svm_drain_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +static void intel_svm_drain_prq(struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + struct pci_dev *pdev; + int head, tail; + u16 sid, did; + int qdep; + + info = get_domain_info(dev); + if (WARN_ON(!info || !dev_is_pci(dev))) + return; + + if (!info->pri_enabled) + return; + + iommu = info->iommu; + domain = info->domain; + pdev = to_pci_dev(dev); + sid = PCI_DEVID(info->bus, info->devfn); + did = domain->iommu_did[iommu->seq_id]; + qdep = pci_ats_queue_depth(pdev); + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. + */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (!req->pasid_present || req->pasid != pasid) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. 
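+	 * The three descriptors below are, in order: an invalidation wait
+	 * with fence to drain the page request queue, a PASID-granular
+	 * IOTLB invalidation, and a device-TLB invalidation for @pasid.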
+	 */
+	memset(desc, 0, sizeof(desc));
+	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+			QI_IWD_FENCE |
+			QI_IWD_TYPE;
+	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
+			QI_EIOTLB_DID(did) |
+			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+			QI_EIOTLB_TYPE;
+	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
+			QI_DEV_EIOTLB_SID(sid) |
+			QI_DEV_EIOTLB_QDEP(qdep) |
+			QI_DEIOTLB_TYPE |
+			QI_DEV_IOTLB_PFSID(info->pfsid);
+qi_retry:
+	reinit_completion(&iommu->prq_complete);
+	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
+	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
+		wait_for_completion(&iommu->prq_complete);
+		goto qi_retry;
+	}
+}
+
+static int prq_to_iommu_prot(struct page_req_dsc *req)
+{
+	int prot = 0;
+
+	if (req->rd_req)
+		prot |= IOMMU_FAULT_PERM_READ;
+	if (req->wr_req)
+		prot |= IOMMU_FAULT_PERM_WRITE;
+	if (req->exe_req)
+		prot |= IOMMU_FAULT_PERM_EXEC;
+	if (req->pm_req)
+		prot |= IOMMU_FAULT_PERM_PRIV;
+
+	return prot;
+}
+
+static int
+intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
+{
+	struct device_domain_info *info;
+	struct iommu_fault_event event;
+
+	if (!dev || !dev_is_pci(dev))
+		return -ENODEV;
+
+	/* Fill in event data for device specific processing */
+	memset(&event, 0, sizeof(struct iommu_fault_event));
+	event.fault.type = IOMMU_FAULT_PAGE_REQ;
+	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
+	event.fault.prm.pasid = desc->pasid;
+	event.fault.prm.grpid = desc->prg_index;
+	event.fault.prm.perm = prq_to_iommu_prot(desc);
+
+	if (desc->lpig)
+		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+	if (desc->pasid_present) {
+		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
+	}
+	if (desc->priv_data_present) {
+		/*
+		 * Set the last-page-in-group bit if private data is present:
+		 * a page response is required, as it is for LPIG.
+		 * iommu_report_device_fault() doesn't understand this
+		 * vendor-specific requirement, thus we set last_page as a
+		 * workaround.
+		 */
+		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
+		memcpy(event.fault.prm.private_data, desc->priv_data,
+		       sizeof(desc->priv_data));
+	}
+
+	/*
+	 * If the device supports PASID-granular scalable mode, report the
+	 * PASID as the vector so that handlers can be dispatched with
+	 * per-vector data.
+	 */
+	info = get_domain_info(dev);
+	if (!list_empty(&info->subdevices)) {
+		dev_dbg(dev, "Aux domain present, assign vector %d\n", desc->pasid);
+		event.vector = desc->pasid;
+	}
+	return iommu_report_device_fault(dev, &event);
+}
+
 static irqreturn_t prq_event_thread(int irq, void *d)
 {
+	struct intel_svm_dev *sdev = NULL;
 	struct intel_iommu *iommu = d;
 	struct intel_svm *svm = NULL;
 	int head, tail, handled = 0;
+	unsigned int flags = 0;
+	s64 start_ktime = 0;
+
+	if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ))
+		start_ktime = ktime_to_ns(ktime_get());
 
 	/* Clear PPR bit before reading head/tail registers, to
 	 * ensure that we get a new interrupt if needed.
 	 */
@@ -559,7 +1113,6 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
 	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
 	while (head != tail) {
-		struct intel_svm_dev *sdev;
 		struct vm_area_struct *vma;
 		struct page_req_dsc *req;
 		struct qi_desc resp;
@@ -567,11 +1120,10 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 		vm_fault_t ret;
 		u64 address;
 
+		iommu->num_prqs++;
 		handled = 1;
-
 		req = &iommu->prq[head / sizeof(*req)];
-
-		result = QI_RESP_FAILURE;
+		result = QI_RESP_INVALID;
 		address = (u64)req->addr << VTD_PAGE_SHIFT;
 		if (!req->pasid_present) {
 			pr_err("%s: Page request without PASID: %08llx %08llx\n",
@@ -579,16 +1131,25 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 			       ((unsigned long long *)req)[1]);
 			goto no_pasid;
 		}
-
+		/* We shall not receive page requests for supervisor SVM */
+		if (req->pm_req && (req->rd_req | req->wr_req)) {
+			pr_err("Unexpected page request in Privilege Mode\n");
+			/* No need to find the matching sdev, same as for bad_req */
+			goto no_pasid;
+		}
+		/* DMA read with exec request is not supported. */
+		if (req->exe_req && req->rd_req) {
+			pr_err("Execution request not supported\n");
+			goto no_pasid;
+		}
 		if (!svm || svm->pasid != req->pasid) {
 			rcu_read_lock();
-			svm = intel_pasid_lookup_id(req->pasid);
+			svm = ioasid_find(NULL, req->pasid, NULL);
 			/* It *can't* go away, because the driver is not permitted
 			 * to unbind the mm while any page faults are outstanding.
 			 * So we only need RCU to protect the internal idr code. */
 			rcu_read_unlock();
-
-			if (!svm) {
+			if (IS_ERR_OR_NULL(svm)) {
 				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
 				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
 				       ((unsigned long long *)req)[1]);
@@ -596,12 +1157,35 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 			}
 		}
 
-		result = QI_RESP_INVALID;
+		if (!sdev || sdev->sid != req->rid) {
+			struct intel_svm_dev *t;
+
+			sdev = NULL;
+			rcu_read_lock();
+			list_for_each_entry_rcu(t, &svm->devs, list) {
+				if (t->sid == req->rid) {
+					sdev = t;
+					break;
+				}
+			}
+			rcu_read_unlock();
+		}
+
+		/*
+		 * If the PRQ is to be handled outside the IOMMU driver, by the
+		 * receiver of the fault notification, skip the page response
+		 * here.
+		 */
+		if (svm->flags & SVM_FLAG_GUEST_MODE) {
+			if (sdev && !intel_svm_prq_report(sdev->dev, req))
+				goto prq_advance;
+			else
+				goto bad_req;
+		}
+
 		/* Since we're using init_mm.pgd directly, we should never take
 		 * any faults on kernel addresses. */
 		if (!svm->mm)
 			goto bad_req;
-
 		/* If address is not canonical, return invalid response */
 		if (!is_canonical_address(address))
 			goto bad_req;
@@ -618,41 +1202,24 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 		if (access_error(vma, req))
 			goto invalid;
 
-		ret = handle_mm_fault(vma, address,
-				      req->wr_req ? FAULT_FLAG_WRITE : 0);
+		flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
+		if (req->wr_req)
+			flags |= FAULT_FLAG_WRITE;
+
+		ret = handle_mm_fault(vma, address, flags);
 		if (ret & VM_FAULT_ERROR)
 			goto invalid;
 
 		result = QI_RESP_SUCCESS;
-	invalid:
+invalid:
 		up_read(&svm->mm->mmap_sem);
 		mmput(svm->mm);
-	bad_req:
-		/* Accounting for major/minor faults? */
-		rcu_read_lock();
-		list_for_each_entry_rcu(sdev, &svm->devs, list) {
-			if (sdev->sid == req->rid)
-				break;
-		}
-		/* Other devices can go away, but the drivers are not permitted
-		 * to unbind while any page faults might be in flight. So it's
-		 * OK to drop the 'lock' here now we have it.
*/ - rcu_read_unlock(); - - if (WARN_ON(&sdev->list == &svm->devs)) - sdev = NULL; - - if (sdev && sdev->ops && sdev->ops->fault_cb) { - int rwxp = (req->rd_req << 3) | (req->wr_req << 2) | - (req->exe_req << 1) | (req->pm_req); - sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, - req->priv_data, rwxp, result); - } +bad_req: /* We get here in the error case where the PASID lookup failed, and these can be NULL. Do not use them below this point! */ sdev = NULL; svm = NULL; - no_pasid: +no_pasid: if (req->lpig || req->priv_data_present) { /* * Per VT-d spec. v3.0 ch7.7, system software must @@ -669,18 +1236,180 @@ static irqreturn_t prq_event_thread(int irq, void *d) QI_PGRP_RESP_TYPE; resp.qw1 = QI_PGRP_IDX(req->prg_index) | QI_PGRP_LPIG(req->lpig); + resp.qw2 = 0; + resp.qw3 = 0; if (req->priv_data_present) memcpy(&resp.qw2, req->priv_data, sizeof(req->priv_data)); - resp.qw2 = 0; - resp.qw3 = 0; - qi_submit_sync(&resp, iommu); + qi_submit_sync(iommu, &resp, 1, 0); } + + if (start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_PRQ, + ktime_to_ns(ktime_get()) - start_ktime); +prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. + */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } + } + + complete(&iommu->prq_complete); + return IRQ_RETVAL(handled); } + +#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) +struct iommu_sva * +intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +{ + struct iommu_sva *sva = ERR_PTR(-EINVAL); + struct intel_svm_dev *sdev = NULL; + unsigned int flags = 0; + int ret; + + /* + * TODO: Consolidate with generic iommu-sva bind after it is merged. + * It will require shared SVM data structures, i.e. combine io_mm + * and intel_svm etc. 
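+	 * For now, @drvdata, when provided, is taken to point at the
+	 * SVM_FLAG_* bits to apply to this bind.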
+ */ + if (drvdata) + flags = *(unsigned int *)drvdata; + mutex_lock(&pasid_mutex); + ret = intel_svm_bind_mm(dev, flags, mm, &sdev); + if (ret) + sva = ERR_PTR(ret); + else if (sdev) + sva = &sdev->sva; + else + WARN(!sdev, "SVM bind succeeded with no sdev!\n"); + + mutex_unlock(&pasid_mutex); + + return sva; +} + +void intel_svm_unbind(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + intel_svm_unbind_mm(sdev->dev, sdev->pasid); + mutex_unlock(&pasid_mutex); +} + +u32 intel_svm_get_pasid(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + u32 pasid; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + pasid = sdev->pasid; + mutex_unlock(&pasid_mutex); + + return pasid; +} + +int intel_svm_page_response(struct iommu_domain *domain, + struct device *dev, + struct iommu_fault_event *evt, + struct iommu_page_response *msg) +{ + struct iommu_fault_page_request *prm; + struct dmar_domain *dmar_domain; + struct intel_svm_dev *sdev = NULL; + struct intel_svm *svm = NULL; + struct intel_iommu *iommu; + bool private_present; + bool pasid_present; + bool last_page; + u8 bus, devfn; + int ret = 0; + u16 sid; + + if (!dev || !dev_is_pci(dev)) + return -ENODEV; + + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; + + if (!msg || !evt) + return -EINVAL; + + mutex_lock(&pasid_mutex); + + prm = &evt->fault.prm; + sid = PCI_DEVID(bus, devfn); + pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; + last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + + if (!pasid_present) { + ret = -EINVAL; + goto out; + } + + if (prm->pasid == 0 || prm->pasid >= PASID_MAX) { + ret = -EINVAL; + goto out; + } + + dmar_domain = to_dmar_domain(domain); + ret = pasid_to_svm_sdev(dev, NULL, // dmar_domain->pasid_set, + prm->pasid, &svm, &sdev); + if (ret || !sdev) { + ret = -ENODEV; + goto out; + } + + /* + * Per VT-d spec. v3.0 ch7.7, system software must respond + * with page group response if private data is present (PDP) + * or last page in group (LPIG) bit is set. This is an + * additional VT-d requirement beyond PCI ATS spec. + */ + if (last_page || private_present) { + struct qi_desc desc; + + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_PDP(private_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw2 = 0; + desc.qw3 = 0; + + if (private_present) { + desc.qw2 = prm->private_data[0]; + desc.qw3 = prm->private_data[1]; + } else if (prm->private_data[0]) { + dmar_latency_update(iommu, DMAR_LATENCY_PRQ, + ktime_to_ns(ktime_get()) - prm->private_data[0]); + } + + qi_submit_sync(iommu, &desc, 1, 0); + } +out: + mutex_unlock(&pasid_mutex); + return ret; +} diff --git a/drivers/iommu/intel-trace.c b/drivers/iommu/intel/trace.c similarity index 100% rename from drivers/iommu/intel-trace.c rename to drivers/iommu/intel/trace.c diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c new file mode 100644 index 0000000000000000000000000000000000000000..7e72acf5eaab030f85010b21b03c55e55393c11a --- /dev/null +++ b/drivers/iommu/io-pgfault.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handle device page faults + * + * Copyright (C) 2020 ARM Ltd. 
+ */
+
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "iommu-sva-lib.h"
+
+/**
+ * struct iopf_queue - IO Page Fault queue
+ * @wq: the fault workqueue
+ * @devices: devices attached to this queue
+ * @lock: protects the device list
+ */
+struct iopf_queue {
+	struct workqueue_struct *wq;
+	struct list_head devices;
+	struct mutex lock;
+};
+
+/**
+ * struct iopf_device_param - IO Page Fault data attached to a device
+ * @dev: the device that owns this param
+ * @queue: IOPF queue
+ * @queue_list: index into queue->devices
+ * @partial: faults that are part of a Page Request Group for which the last
+ *           request hasn't been submitted yet.
+ */
+struct iopf_device_param {
+	struct device *dev;
+	struct iopf_queue *queue;
+	struct list_head queue_list;
+	struct list_head partial;
+};
+
+struct iopf_fault {
+	struct iommu_fault fault;
+	struct list_head list;
+};
+
+struct iopf_group {
+	struct iopf_fault last_fault;
+	struct list_head faults;
+	struct work_struct work;
+	struct device *dev;
+};
+
+static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf,
+			       enum iommu_page_response_code status)
+{
+	struct iommu_page_response resp = {
+		.version		= IOMMU_PAGE_RESP_VERSION_1,
+		.pasid			= iopf->fault.prm.pasid,
+		.grpid			= iopf->fault.prm.grpid,
+		.code			= status,
+	};
+
+	if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
+	    (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID))
+		resp.flags = IOMMU_PAGE_RESP_PASID_VALID;
+
+	return iommu_page_response(dev, &resp);
+}
+
+static enum iommu_page_response_code
+iopf_handle_single(struct iopf_fault *iopf)
+{
+	vm_fault_t ret;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	unsigned int access_flags = 0;
+	unsigned int fault_flags = FAULT_FLAG_REMOTE;
+	struct iommu_fault_page_request *prm = &iopf->fault.prm;
+	enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
+
+	if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID))
+		return status;
+
+	mm = iommu_sva_find(prm->pasid);
+	if (IS_ERR_OR_NULL(mm))
+		return status;
+
+	down_read(&mm->mmap_sem);
+
+	vma = find_extend_vma(mm, prm->addr);
+	if (!vma)
+		/* Unmapped area */
+		goto out_put_mm;
+
+	if (prm->perm & IOMMU_FAULT_PERM_READ)
+		access_flags |= VM_READ;
+
+	if (prm->perm & IOMMU_FAULT_PERM_WRITE) {
+		access_flags |= VM_WRITE;
+		fault_flags |= FAULT_FLAG_WRITE;
+	}
+
+	if (prm->perm & IOMMU_FAULT_PERM_EXEC) {
+		access_flags |= VM_EXEC;
+		fault_flags |= FAULT_FLAG_INSTRUCTION;
+	}
+
+	if (!(prm->perm & IOMMU_FAULT_PERM_PRIV))
+		fault_flags |= FAULT_FLAG_USER;
+
+	if (access_flags & ~vma->vm_flags)
+		/* Access fault */
+		goto out_put_mm;
+
+	ret = handle_mm_fault(vma, prm->addr, fault_flags);
+	status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID :
+		IOMMU_PAGE_RESP_SUCCESS;
+
+out_put_mm:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	return status;
+}
+
+static void iopf_handle_group(struct work_struct *work)
+{
+	struct iopf_group *group;
+	struct iopf_fault *iopf, *next;
+	enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS;
+
+	group = container_of(work, struct iopf_group, work);
+
+	list_for_each_entry_safe(iopf, next, &group->faults, list) {
+		/*
+		 * For the moment, errors are sticky: don't handle subsequent
+		 * faults in the group if there is an error.
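+		 * The group still gets exactly one response, carrying the
+		 * sticky status, via iopf_complete_group() below.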
+ */ + if (status == IOMMU_PAGE_RESP_SUCCESS) + status = iopf_handle_single(iopf); + + if (!(iopf->fault.prm.flags & + IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) + kfree(iopf); + } + + iopf_complete_group(group->dev, &group->last_fault, status); + kfree(group); +} + +/** + * iommu_queue_iopf - IO Page Fault handler + * @fault: fault event + * @cookie: struct device, passed to iommu_register_device_fault_handler. + * + * Add a fault to the device workqueue, to be handled by mm. + * + * This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard + * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't + * expect a response. It may be generated when disabling a PASID (issuing a + * PASID stop request) by some PCI devices. + * + * The PASID stop request is issued by the device driver before unbind(). Once + * it completes, no page request is generated for this PASID anymore and + * outstanding ones have been pushed to the IOMMU (as per PCIe 4.0r1.0 - 6.20.1 + * and 10.4.1.2 - Managing PASID TLP Prefix Usage). Some PCI devices will wait + * for all outstanding page requests to come back with a response before + * completing the PASID stop request. Others do not wait for page responses, and + * instead issue this Stop Marker that tells us when the PASID can be + * reallocated. + * + * It is safe to discard the Stop Marker because it is an optimization. + * a. Page requests, which are posted requests, have been flushed to the IOMMU + * when the stop request completes. + * b. The IOMMU driver flushes all fault queues on unbind() before freeing the + * PASID. + * + * So even though the Stop Marker might be issued by the device *after* the stop + * request completes, outstanding faults will have been dealt with by the time + * the PASID is freed. + * + * Return: 0 on success and <0 on error. + */ +int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) +{ + int ret; + struct iopf_group *group; + struct iopf_fault *iopf, *next; + struct iopf_device_param *iopf_param; + + struct device *dev = cookie; + struct dev_iommu *param = dev->iommu; + + lockdep_assert_held(¶m->lock); + + if (fault->type != IOMMU_FAULT_PAGE_REQ) + /* Not a recoverable page fault */ + return -EOPNOTSUPP; + + /* + * As long as we're holding param->lock, the queue can't be unlinked + * from the device and therefore cannot disappear. + */ + iopf_param = param->iopf_param; + if (!iopf_param) + return -ENODEV; + + if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + iopf = kzalloc(sizeof(*iopf), GFP_KERNEL); + if (!iopf) + return -ENOMEM; + + iopf->fault = *fault; + + /* Non-last request of a group. Postpone until the last one */ + list_add(&iopf->list, &iopf_param->partial); + + return 0; + } + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) { + /* + * The caller will send a response to the hardware. But we do + * need to clean up before leaving, otherwise partial faults + * will be stuck. 
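+		 * cleanup_partial below drops every partial fault queued
+		 * for this group ID.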
+ */ + ret = -ENOMEM; + goto cleanup_partial; + } + + group->dev = dev; + group->last_fault.fault = *fault; + INIT_LIST_HEAD(&group->faults); + list_add(&group->last_fault.list, &group->faults); + INIT_WORK(&group->work, iopf_handle_group); + + /* See if we have partial faults for this group */ + list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { + if (iopf->fault.prm.grpid == fault->prm.grpid) + /* Insert *before* the last fault */ + list_move(&iopf->list, &group->faults); + } + + queue_work(iopf_param->queue->wq, &group->work); + return 0; + +cleanup_partial: + list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { + if (iopf->fault.prm.grpid == fault->prm.grpid) { + list_del(&iopf->list); + kfree(iopf); + } + } + return ret; +} +EXPORT_SYMBOL_GPL(iommu_queue_iopf); + +/** + * iopf_queue_flush_dev - Ensure that all queued faults have been processed + * @dev: the endpoint whose faults need to be flushed. + * + * The IOMMU driver calls this before releasing a PASID, to ensure that all + * pending faults for this PASID have been handled, and won't hit the address + * space of the next process that uses this PASID. The driver must make sure + * that no new fault is added to the queue. In particular it must flush its + * low-level queue before calling this function. + * + * Return: 0 on success and <0 on error. + */ +int iopf_queue_flush_dev(struct device *dev) +{ + int ret = 0; + struct iopf_device_param *iopf_param; + struct dev_iommu *param = dev->iommu; + + if (!param) + return -ENODEV; + + mutex_lock(¶m->lock); + iopf_param = param->iopf_param; + if (iopf_param) + flush_workqueue(iopf_param->queue->wq); + else + ret = -ENODEV; + mutex_unlock(¶m->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); + +/** + * iopf_queue_discard_partial - Remove all pending partial fault + * @queue: the queue whose partial faults need to be discarded + * + * When the hardware queue overflows, last page faults in a group may have been + * lost and the IOMMU driver calls this to discard all partial faults. The + * driver shouldn't be adding new faults to this queue concurrently. + * + * Return: 0 on success and <0 on error. + */ +int iopf_queue_discard_partial(struct iopf_queue *queue) +{ + struct iopf_fault *iopf, *next; + struct iopf_device_param *iopf_param; + + if (!queue) + return -EINVAL; + + mutex_lock(&queue->lock); + list_for_each_entry(iopf_param, &queue->devices, queue_list) { + list_for_each_entry_safe(iopf, next, &iopf_param->partial, + list) { + list_del(&iopf->list); + kfree(iopf); + } + } + mutex_unlock(&queue->lock); + return 0; +} +EXPORT_SYMBOL_GPL(iopf_queue_discard_partial); + +/** + * iopf_queue_add_device - Add producer to the fault queue + * @queue: IOPF queue + * @dev: device to add + * + * Return: 0 on success and <0 on error. 
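+ *
+ * A device can be attached to at most one fault queue at a time; a second
+ * call for the same device fails with -EBUSY.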
+ */ +int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) +{ + int ret = -EBUSY; + struct iopf_device_param *iopf_param; + struct dev_iommu *param = dev->iommu; + + if (!param) + return -ENODEV; + + iopf_param = kzalloc(sizeof(*iopf_param), GFP_KERNEL); + if (!iopf_param) + return -ENOMEM; + + INIT_LIST_HEAD(&iopf_param->partial); + iopf_param->queue = queue; + iopf_param->dev = dev; + + mutex_lock(&queue->lock); + mutex_lock(¶m->lock); + if (!param->iopf_param) { + list_add(&iopf_param->queue_list, &queue->devices); + param->iopf_param = iopf_param; + ret = 0; + } + mutex_unlock(¶m->lock); + mutex_unlock(&queue->lock); + + if (ret) + kfree(iopf_param); + + return ret; +} +EXPORT_SYMBOL_GPL(iopf_queue_add_device); + +/** + * iopf_queue_remove_device - Remove producer from fault queue + * @queue: IOPF queue + * @dev: device to remove + * + * Caller makes sure that no more faults are reported for this device. + * + * Return: 0 on success and <0 on error. + */ +int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) +{ + int ret = -EINVAL; + struct iopf_fault *iopf, *next; + struct iopf_device_param *iopf_param; + struct dev_iommu *param = dev->iommu; + + if (!param || !queue) + return -EINVAL; + + mutex_lock(&queue->lock); + mutex_lock(¶m->lock); + iopf_param = param->iopf_param; + if (iopf_param && iopf_param->queue == queue) { + list_del(&iopf_param->queue_list); + param->iopf_param = NULL; + ret = 0; + } + mutex_unlock(¶m->lock); + mutex_unlock(&queue->lock); + if (ret) + return ret; + + /* Just in case some faults are still stuck */ + list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) + kfree(iopf); + + kfree(iopf_param); + + return 0; +} +EXPORT_SYMBOL_GPL(iopf_queue_remove_device); + +/** + * iopf_queue_alloc - Allocate and initialize a fault queue + * @name: a unique string identifying the queue (for workqueue) + * + * Return: the queue on success and NULL on error. + */ +struct iopf_queue *iopf_queue_alloc(const char *name) +{ + struct iopf_queue *queue; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + + /* + * The WQ is unordered because the low-level handler enqueues faults by + * group. PRI requests within a group have to be ordered, but once + * that's dealt with, the high-level function can handle groups out of + * order. + */ + queue->wq = alloc_workqueue("iopf_queue/%s", WQ_UNBOUND, 0, name); + if (!queue->wq) { + kfree(queue); + return NULL; + } + + INIT_LIST_HEAD(&queue->devices); + mutex_init(&queue->lock); + + return queue; +} +EXPORT_SYMBOL_GPL(iopf_queue_alloc); + +/** + * iopf_queue_free - Free IOPF queue + * @queue: queue to free + * + * Counterpart to iopf_queue_alloc(). The driver must not be queuing faults or + * adding/removing devices on this queue anymore. + */ +void iopf_queue_free(struct iopf_queue *queue) +{ + struct iopf_device_param *iopf_param, *next; + + if (!queue) + return; + + list_for_each_entry_safe(iopf_param, next, &queue->devices, queue_list) + iopf_queue_remove_device(queue, iopf_param->dev); + + destroy_workqueue(queue->wq); + kfree(queue); +} +EXPORT_SYMBOL_GPL(iopf_queue_free); diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c new file mode 100644 index 0000000000000000000000000000000000000000..31030330df948a2177572e834db0e411412947d7 --- /dev/null +++ b/drivers/iommu/ioasid.c @@ -0,0 +1,1390 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I/O Address Space ID allocator. There is one global IOASID space, split into + * sets. 
Users create a set with ioasid_set_alloc, then allocate/free IDs + * with ioasid_alloc, ioasid_put, and ioasid_free. + */ +#include +#include +#include +#include +#include +#include + +/* + * An IOASID can have multiple consumers where each consumer may have + * hardware contexts associated with the IOASID. + * When a status change occurs, like on IOASID deallocation, notifier chains + * are used to keep the consumers in sync. + * This is a publisher-subscriber pattern where publisher can change the + * state of each IOASID, e.g. alloc/free, bind IOASID to a device and mm. + * On the other hand, subscribers get notified for the state change and + * keep local states in sync. + */ +static ATOMIC_NOTIFIER_HEAD(ioasid_notifier); +/* List to hold pending notification block registrations */ +static LIST_HEAD(ioasid_nb_pending_list); + +/* Default to PCIe standard 20 bit PASID */ +#define PCI_PASID_MAX 0x100000 +static ioasid_t ioasid_capacity = PCI_PASID_MAX; +static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX; +static DEFINE_XARRAY_ALLOC(ioasid_sets); + +/* Workqueue for IOASID users to do cleanup upon notification */ +static struct workqueue_struct *ioasid_wq; + +struct ioasid_set_nb { + struct list_head list; + struct notifier_block *nb; + void *token; + struct ioasid_set *set; + bool active; +}; + +enum ioasid_state { + IOASID_STATE_IDLE, + IOASID_STATE_ACTIVE, + IOASID_STATE_FREE_PENDING, +}; + +/** + * struct ioasid_data - Meta data about ioasid + * + * @id: Unique ID + * @spid: Private ID unique within a set + * @refs: Number of active users + * @state: Track state of the IOASID + * @set: ioasid_set of the IOASID belongs to + * @private: Private data associated with the IOASID + * @rcu: For free after RCU grace period + */ +struct ioasid_data { + ioasid_t id; + ioasid_t spid; + enum ioasid_state state; + struct ioasid_set *set; + void *private; + struct rcu_head rcu; + refcount_t refs; +}; + +/* + * struct ioasid_allocator_data - Internal data structure to hold information + * about an allocator. There are two types of allocators: + * + * - Default allocator always has its own XArray to track the IOASIDs allocated. + * - Custom allocators may share allocation helpers with different private data. + * Custom allocators that share the same helper functions also share the same + * XArray. + * Rules: + * 1. Default allocator is always available, not dynamically registered. This is + * to prevent race conditions with early boot code that want to register + * custom allocators or allocate IOASIDs. + * 2. Custom allocators take precedence over the default allocator. + * 3. When all custom allocators sharing the same helper functions are + * unregistered (e.g. due to hotplug), all outstanding IOASIDs must be + * freed. Otherwise, outstanding IOASIDs will be lost and orphaned. + * 4. When switching between custom allocators sharing the same helper + * functions, outstanding IOASIDs are preserved. + * 5. When switching between custom allocator and default allocator, all IOASIDs + * must be freed to ensure unadulterated space for the new allocator. 
+ * + * @ops: allocator helper functions and its data + * @list: registered custom allocators + * @slist: allocators share the same ops but different data + * @flags: attributes of the allocator + * @xa: xarray holds the IOASID space + * @rcu: used for kfree_rcu when unregistering allocator + */ +struct ioasid_allocator_data { + struct ioasid_allocator_ops *ops; + struct list_head list; + struct list_head slist; +#define IOASID_ALLOCATOR_CUSTOM BIT(0) /* Needs framework to track results */ + unsigned long flags; + struct xarray xa; + struct rcu_head rcu; +}; + +static DEFINE_SPINLOCK(ioasid_allocator_lock); +static LIST_HEAD(allocators_list); + +static ioasid_t default_alloc(ioasid_t min, ioasid_t max, void *opaque); +static void default_free(ioasid_t ioasid, void *opaque); + +static struct ioasid_allocator_ops default_ops = { + .alloc = default_alloc, + .free = default_free, +}; + +static struct ioasid_allocator_data default_allocator = { + .ops = &default_ops, + .flags = 0, + .xa = XARRAY_INIT(ioasid_xa, XA_FLAGS_ALLOC), +}; + +static struct ioasid_allocator_data *active_allocator = &default_allocator; + +static ioasid_t default_alloc(ioasid_t min, ioasid_t max, void *opaque) +{ + ioasid_t id; + + if (xa_alloc(&default_allocator.xa, &id, opaque, XA_LIMIT(min, max), GFP_ATOMIC)) { + pr_err("Failed to alloc ioasid from %d to %d\n", min, max); + return INVALID_IOASID; + } + + return id; +} + +static void default_free(ioasid_t ioasid, void *opaque) +{ + struct ioasid_data *ioasid_data; + + ioasid_data = xa_erase(&default_allocator.xa, ioasid); + kfree_rcu(ioasid_data, rcu); +} + +/* Allocate and initialize a new custom allocator with its helper functions */ +static struct ioasid_allocator_data *ioasid_alloc_allocator(struct ioasid_allocator_ops *ops) +{ + struct ioasid_allocator_data *ia_data; + + ia_data = kzalloc(sizeof(*ia_data), GFP_ATOMIC); + if (!ia_data) + return NULL; + + xa_init_flags(&ia_data->xa, XA_FLAGS_ALLOC); + INIT_LIST_HEAD(&ia_data->slist); + ia_data->flags |= IOASID_ALLOCATOR_CUSTOM; + ia_data->ops = ops; + + /* For tracking custom allocators that share the same ops */ + list_add_tail(&ops->list, &ia_data->slist); + + return ia_data; +} + +static bool use_same_ops(struct ioasid_allocator_ops *a, struct ioasid_allocator_ops *b) +{ + return (a->free == b->free) && (a->alloc == b->alloc); +} + +/** + * ioasid_register_allocator - register a custom allocator + * @ops: the custom allocator ops to be registered + * + * Custom allocators take precedence over the default xarray based allocator. + * Private data associated with the IOASID allocated by the custom allocators + * are managed by IOASID framework similar to data stored in xa by default + * allocator. + * + * There can be multiple allocators registered but only one is active. In case + * of runtime removal of a custom allocator, the next one is activated based + * on the registration ordering. + * + * Multiple allocators can share the same alloc() function, in this case the + * IOASID space is shared. + */ +int ioasid_register_allocator(struct ioasid_allocator_ops *ops) +{ + struct ioasid_allocator_data *ia_data; + struct ioasid_allocator_data *pallocator; + int ret = 0; + + spin_lock(&ioasid_allocator_lock); + + ia_data = ioasid_alloc_allocator(ops); + if (!ia_data) { + ret = -ENOMEM; + goto out_unlock; + } + + /* + * No particular preference, we activate the first one and keep + * the later registered allocators in a list in case the first one gets + * removed due to hotplug. 
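+	 * A new allocator can only displace the default one while the
+	 * default xarray is empty, i.e. while no IOASIDs are outstanding.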
+ */ + if (list_empty(&allocators_list)) { + WARN_ON(active_allocator != &default_allocator); + /* Use this new allocator if default is not active */ + if (xa_empty(&active_allocator->xa)) { + rcu_assign_pointer(active_allocator, ia_data); + list_add_tail(&ia_data->list, &allocators_list); + goto out_unlock; + } + pr_warn("Default allocator active with outstanding IOASID\n"); + ret = -EAGAIN; + goto out_free; + } + + /* Check if the allocator is already registered */ + list_for_each_entry(pallocator, &allocators_list, list) { + if (pallocator->ops == ops) { + pr_err("IOASID allocator already registered\n"); + ret = -EEXIST; + goto out_free; + } else if (use_same_ops(pallocator->ops, ops)) { + /* + * If the new allocator shares the same ops, + * then they will share the same IOASID space. + * We should put them under the same xarray. + */ + list_add_tail(&ops->list, &pallocator->slist); + goto out_free; + } + } + list_add_tail(&ia_data->list, &allocators_list); + + spin_unlock(&ioasid_allocator_lock); + return 0; +out_free: + kfree(ia_data); +out_unlock: + spin_unlock(&ioasid_allocator_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ioasid_register_allocator); + +/** + * ioasid_unregister_allocator - Remove a custom IOASID allocator ops + * @ops: the custom allocator to be removed + * + * Remove an allocator from the list, activate the next allocator in + * the order it was registered. Or revert to default allocator if all + * custom allocators are unregistered without outstanding IOASIDs. + */ +void ioasid_unregister_allocator(struct ioasid_allocator_ops *ops) +{ + struct ioasid_allocator_data *pallocator; + struct ioasid_allocator_ops *sops; + + spin_lock(&ioasid_allocator_lock); + if (list_empty(&allocators_list)) { + pr_warn("No custom IOASID allocators active!\n"); + goto exit_unlock; + } + + list_for_each_entry(pallocator, &allocators_list, list) { + if (!use_same_ops(pallocator->ops, ops)) + continue; + + if (list_is_singular(&pallocator->slist)) { + /* No shared helper functions */ + list_del(&pallocator->list); + /* + * All IOASIDs should have been freed before + * the last allocator that shares the same ops + * is unregistered. + */ + WARN_ON(!xa_empty(&pallocator->xa)); + if (list_empty(&allocators_list)) { + pr_info("No custom IOASID allocators, switch to default.\n"); + rcu_assign_pointer(active_allocator, &default_allocator); + } else if (pallocator == active_allocator) { + rcu_assign_pointer(active_allocator, + list_first_entry(&allocators_list, + struct ioasid_allocator_data, list)); + pr_info("IOASID allocator changed"); + } + kfree_rcu(pallocator, rcu); + break; + } + /* + * Find the matching shared ops to delete, + * but keep outstanding IOASIDs + */ + list_for_each_entry(sops, &pallocator->slist, list) { + if (sops == ops) { + list_del(&ops->list); + break; + } + } + break; + } + +exit_unlock: + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_unregister_allocator); + +void ioasid_install_capacity(ioasid_t total) +{ + spin_lock(&ioasid_allocator_lock); + if (ioasid_capacity && ioasid_capacity != PCI_PASID_MAX) { + pr_warn("IOASID capacity is already set.\n"); + goto done_unlock; + } + ioasid_capacity = ioasid_capacity_avail = total; +done_unlock: + spin_unlock(&ioasid_allocator_lock); +} +EXPORT_SYMBOL_GPL(ioasid_install_capacity); + +/** + * @brief Reserve capacity from the system pool + * + * @param nr_ioasid Number of IOASIDs requested to be reserved, 0 means + * reserve all remaining IDs. 
+ *
+ * @return the remaining capacity on success, or errno
+ */
+int ioasid_reserve_capacity(ioasid_t nr_ioasid)
+{
+	int ret = 0;
+
+	spin_lock(&ioasid_allocator_lock);
+	if (nr_ioasid > ioasid_capacity_avail) {
+		ret = -ENOSPC;
+		goto done_unlock;
+	}
+	if (!nr_ioasid)
+		nr_ioasid = ioasid_capacity_avail;
+	ioasid_capacity_avail -= nr_ioasid;
+	ret = nr_ioasid;
+done_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_reserve_capacity);
+
+/**
+ * @brief Return capacity to the system pool
+ * We trust the caller not to return more than it has reserved; we could
+ * also track reservations if needed.
+ *
+ * @param nr_ioasid Number of IOASIDs requested to be returned
+ *
+ * @return the remaining capacity on success, or errno
+ */
+int ioasid_cancel_capacity(ioasid_t nr_ioasid)
+{
+	int ret = 0;
+
+	spin_lock(&ioasid_allocator_lock);
+	if (nr_ioasid + ioasid_capacity_avail > ioasid_capacity) {
+		ret = -EINVAL;
+		goto done_unlock;
+	}
+	ioasid_capacity_avail += nr_ioasid;
+	ret = ioasid_capacity_avail;
+done_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_cancel_capacity);
+
+/**
+ * ioasid_attach_data - Set private data for an allocated ioasid
+ * @ioasid: the ID to set data
+ * @data: the private data
+ *
+ * For an IOASID that is already allocated, private data can be set
+ * via this API. Future lookups can be done via ioasid_find().
+ */
+int ioasid_attach_data(ioasid_t ioasid, void *data)
+{
+	struct ioasid_data *ioasid_data;
+	int ret = 0;
+
+	spin_lock(&ioasid_allocator_lock);
+	ioasid_data = xa_load(&active_allocator->xa, ioasid);
+
+	if (!ioasid_data) {
+		ret = -ENOENT;
+		goto done_unlock;
+	}
+
+	if (ioasid_data->private) {
+		ret = -EBUSY;
+		goto done_unlock;
+	}
+	rcu_assign_pointer(ioasid_data->private, data);
+
+done_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_attach_data);
+
+/**
+ * ioasid_detach_data - Clear the private data of an ioasid
+ *
+ * @ioasid: the IOASID whose private data is to be cleared
+ */
+void ioasid_detach_data(ioasid_t ioasid)
+{
+	struct ioasid_data *ioasid_data;
+
+	spin_lock(&ioasid_allocator_lock);
+	ioasid_data = xa_load(&active_allocator->xa, ioasid);
+
+	if (!ioasid_data) {
+		pr_warn("IOASID %u not found to detach data from\n", ioasid);
+		goto done_unlock;
+	}
+
+	if (ioasid_data->private) {
+		rcu_assign_pointer(ioasid_data->private, NULL);
+		goto done_unlock;
+	}
+
+done_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+	/*
+	 * Wait for readers to stop accessing the old private data,
+	 * so the caller can free it.
+	 */
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ioasid_detach_data);
+
+/**
+ * ioasid_notify - Send notification on a given IOASID for status change.
+ *
+ * @data:	The IOASID data for which the notification will be sent
+ * @cmd:	Notification event sent by IOASID external users, can be
+ *		IOASID_BIND or IOASID_UNBIND.
+ *
+ * @flags:	Special instructions, e.g.
notify within a set or globally, via the
+ *		IOASID_NOTIFY_FLAG_SET or IOASID_NOTIFY_FLAG_ALL flags.
+ * Caller must hold ioasid_allocator_lock and a reference to the IOASID.
+ */
+static int ioasid_notify(struct ioasid_data *data,
+			 enum ioasid_notify_val cmd, unsigned int flags)
+{
+	struct ioasid_nb_args args = { 0 };
+	int ret = 0;
+
+	assert_spin_locked(&ioasid_allocator_lock);
+	if (flags & ~(IOASID_NOTIFY_FLAG_ALL | IOASID_NOTIFY_FLAG_SET))
+		return -EINVAL;
+
+	args.id = data->id;
+	args.set = data->set;
+	args.pdata = data->private;
+	args.spid = data->spid;
+	if (flags & IOASID_NOTIFY_FLAG_ALL)
+		ret = atomic_notifier_call_chain(&ioasid_notifier, cmd, &args);
+	if (flags & IOASID_NOTIFY_FLAG_SET)
+		ret = atomic_notifier_call_chain(&data->set->nh, cmd, &args);
+
+	return ret;
+}
+
+static ioasid_t ioasid_find_by_spid_locked(struct ioasid_set *set, ioasid_t spid, bool get)
+{
+	ioasid_t ioasid = INVALID_IOASID;
+	struct ioasid_data *entry;
+	unsigned long index;
+
+	if (!xa_load(&ioasid_sets, set->id)) {
+		pr_warn("Invalid set\n");
+		goto done;
+	}
+
+	xa_for_each(&set->xa, index, entry) {
+		if (spid == entry->spid) {
+			if (get)
+				refcount_inc(&entry->refs);
+			ioasid = index;
+		}
+	}
+done:
+	return ioasid;
+}
+
+/**
+ * ioasid_attach_spid - Attach an ioasid_set private ID to an IOASID
+ *
+ * @ioasid: the system-wide IOASID to attach
+ * @spid: the ioasid_set private ID of @ioasid
+ *
+ * After attaching an SPID, future lookups can be done via ioasid_find_by_spid().
+ */
+int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid)
+{
+	struct ioasid_data *data;
+	int ret = 0;
+
+	if (spid == INVALID_IOASID)
+		return -EINVAL;
+
+	spin_lock(&ioasid_allocator_lock);
+	data = xa_load(&active_allocator->xa, ioasid);
+
+	if (!data) {
+		pr_err("No IOASID entry %d to attach SPID %d\n",
+		       ioasid, spid);
+		ret = -ENOENT;
+		goto done_unlock;
+	}
+	/* Check if SPID is unique within the set */
+	if (ioasid_find_by_spid_locked(data->set, spid, false) != INVALID_IOASID) {
+		ret = -EINVAL;
+		goto done_unlock;
+	}
+	data->spid = spid;
+	ioasid_notify(data, IOASID_NOTIFY_BIND, IOASID_NOTIFY_FLAG_SET);
+done_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_attach_spid);
+
+void ioasid_detach_spid(ioasid_t ioasid)
+{
+	struct ioasid_data *data;
+
+	spin_lock(&ioasid_allocator_lock);
+	data = xa_load(&active_allocator->xa, ioasid);
+
+	if (!data || data->spid == INVALID_IOASID) {
+		pr_err("Invalid IOASID entry %d to detach\n", ioasid);
+		goto done_unlock;
+	}
+	ioasid_notify(data, IOASID_NOTIFY_UNBIND, IOASID_NOTIFY_FLAG_SET);
+	data->spid = INVALID_IOASID;
+done_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_detach_spid);
+
+/**
+ * ioasid_find_by_spid - Find the system-wide IOASID by a set private ID and
+ *                       its set.
+ *
+ * @set:	the ioasid_set to search within
+ * @spid:	the set private ID
+ * @get:	flag indicating whether to take a reference once found
+ *
+ * Given a set private ID and its IOASID set, find the system-wide IOASID. Take
+ * a reference upon finding the matching IOASID if @get is true. Return
+ * INVALID_IOASID if the IOASID is not found in the set or the set is not valid.
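+ *
+ * When @get is true, the caller is expected to drop the reference again
+ * (e.g. with ioasid_put()) once done with the IOASID.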
+ */ +ioasid_t ioasid_find_by_spid(struct ioasid_set *set, ioasid_t spid, bool get) +{ + ioasid_t ioasid; + + spin_lock(&ioasid_allocator_lock); + ioasid = ioasid_find_by_spid_locked(set, spid, get); + spin_unlock(&ioasid_allocator_lock); + return ioasid; +} +EXPORT_SYMBOL_GPL(ioasid_find_by_spid); + +static inline bool ioasid_set_is_valid(struct ioasid_set *set) +{ + return xa_load(&ioasid_sets, set->id) == set; +} + +static void ioasid_add_pending_nb(struct ioasid_set *set) +{ + struct ioasid_set_nb *curr; + + assert_spin_locked(&ioasid_allocator_lock); + + if (set->type != IOASID_SET_TYPE_MM) + return; + /* + * Check if there are any pending nb requests for the given token, if so + * add them to the notifier chain. + */ + list_for_each_entry(curr, &ioasid_nb_pending_list, list) { + if (curr->token == set->token && !curr->active) { + if (unlikely(!func_ptr_is_kernel_text(curr->nb->notifier_call))) { + pr_warn("Invalid notifier callback"); + continue; + } + atomic_notifier_chain_register(&set->nh, curr->nb); + curr->set = set; + curr->active = true; + } + } +} + +/** + * ioasid_set_alloc - Allocate a new IOASID set for a given token + * + * @token: An optional arbitrary number that can be associated with the + * IOASID set. @token can be NULL if the type is + * IOASID_SET_TYPE_NULL + * @quota: Quota allowed in this set, 0 indicates no limit for the set + * @type: The type of the token used to create the IOASID set + * + * IOASID is limited system-wide resource that requires quota management. + * Token will be stored in the ioasid_set returned. A reference will be taken + * on the newly created set. Subsequent IOASID allocation within the set need + * to use the returned ioasid_set pointer. + */ +struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type) +{ + struct ioasid_set *set; + unsigned long index; + ioasid_t id; + + if (type >= IOASID_SET_TYPE_NR) + return ERR_PTR(-EINVAL); + + /* No limit for the set, use whatever is available on the system */ + if (!quota) + quota = ioasid_capacity_avail; + + spin_lock(&ioasid_allocator_lock); + if (quota > ioasid_capacity_avail) { + pr_warn("Out of IOASID capacity! ask %d, avail %d\n", + quota, ioasid_capacity_avail); + set = ERR_PTR(-ENOSPC); + goto exit_unlock; + } + + /* + * Token is only unique within its types but right now we have only + * mm type. If we have more token types, we have to match type as well. + */ + switch (type) { + case IOASID_SET_TYPE_MM: + if (!token) { + set = ERR_PTR(-EINVAL); + goto exit_unlock; + } + /* Search existing set tokens, reject duplicates */ + xa_for_each(&ioasid_sets, index, set) { + if (set->token == token && set->type == IOASID_SET_TYPE_MM) { + set = ERR_PTR(-EEXIST); + goto exit_unlock; + } + } + break; + case IOASID_SET_TYPE_NULL: + if (!token) + break; + fallthrough; + default: + pr_err("Invalid token and IOASID type\n"); + set = ERR_PTR(-EINVAL); + goto exit_unlock; + } + + set = kzalloc(sizeof(*set), GFP_ATOMIC); + if (!set) { + set = ERR_PTR(-ENOMEM); + goto exit_unlock; + } + + if (xa_alloc(&ioasid_sets, &id, set, + XA_LIMIT(0, ioasid_capacity_avail), + GFP_ATOMIC)) { + kfree(set); + set = ERR_PTR(-ENOSPC); + goto exit_unlock; + } + + set->token = token; + set->type = type; + set->quota = quota; + set->id = id; + atomic_set(&set->nr_ioasids, 0); + ATOMIC_INIT_NOTIFIER_HEAD(&set->nh); + + /* + * Check if there are any pending nb requests for the given token, if so + * add them to the notifier chain. 
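+	 * Notifier blocks registered against this token before the set
+	 * existed are parked on ioasid_nb_pending_list; attach them to the
+	 * new set's chain now.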
+	 */
+	ioasid_add_pending_nb(set);
+	/*
+	 * The per-set XArray is used to store private IDs within the set;
+	 * get it ready for ioasid_set private ID and system-wide IOASID
+	 * allocation results.
+	 */
+	xa_init(&set->xa);
+	ioasid_capacity_avail -= quota;
+
+exit_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+
+	return set;
+}
+EXPORT_SYMBOL_GPL(ioasid_set_alloc);
+
+static int ioasid_set_free_locked(struct ioasid_set *set)
+{
+	struct ioasid_set_nb *curr;
+	int ret = 0;
+
+	if (!ioasid_set_is_valid(set)) {
+		ret = -EINVAL;
+		goto exit_done;
+	}
+
+	if (atomic_read(&set->nr_ioasids)) {
+		ret = -EBUSY;
+		set->free_pending = true;
+		pr_info("Set marked as free_pending, will be released when the last IOASID is reclaimed!\n");
+		goto exit_done;
+	}
+
+	WARN_ON(!xa_empty(&set->xa));
+	/* Restore pending status of the set NBs */
+	list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
+		if (curr->token == set->token) {
+			if (curr->active)
+				curr->active = false;
+			else
+				pr_warn("Set token exists but not active!\n");
+		}
+	}
+
+	/*
+	 * The token is released right away after the ioasid_set is freed.
+	 * If a new set is created immediately with the newly released token,
+	 * it will not allocate the same IOASIDs unless they have been
+	 * reclaimed.
+	 */
+	xa_erase(&ioasid_sets, set->id);
+	kfree_rcu(set, rcu);
+exit_done:
+	return ret;
+}
+
+/**
+ * ioasid_set_free - Free an ioasid_set if empty; restore its notifier
+ *                   blocks to the pending list
+ * @set: the ioasid_set to be freed
+ *
+ * Return: 0 on success, -EINVAL if the set is invalid, or -EBUSY if the set
+ * still contains IOASIDs, in which case it is marked free_pending and
+ * released once the last IOASID is reclaimed.
+ */
+int ioasid_set_free(struct ioasid_set *set)
+{
+	int ret = 0;
+
+	spin_lock(&ioasid_allocator_lock);
+	ret = ioasid_set_free_locked(set);
+	spin_unlock(&ioasid_allocator_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_set_free);
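+
+/*
+ * Illustrative sketch (editor's addition, not part of the original patch):
+ * a typical lifecycle for an mm-type set, using the current process' mm as
+ * the token and a quota of 16:
+ *
+ *	struct ioasid_set *set;
+ *
+ *	set = ioasid_set_alloc(current->mm, 16, IOASID_SET_TYPE_MM);
+ *	if (IS_ERR(set))
+ *		return PTR_ERR(set);
+ *	...allocate and use IOASIDs within the set...
+ *	ioasid_free_all_in_set(set);
+ *	ioasid_set_free(set);
+ */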
+
+/**
+ * ioasid_alloc - Allocate an IOASID
+ * @set: the IOASID set
+ * @min: the minimum ID (inclusive)
+ * @max: the maximum ID (inclusive)
+ * @private: data private to the caller
+ *
+ * Allocate an ID between @min and @max. The @private pointer is stored
+ * internally and can be retrieved with ioasid_find().
+ *
+ * Return: the allocated ID on success, or %INVALID_IOASID on failure.
+ */
+ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
+		      void *private)
+{
+	struct ioasid_data *data;
+	void *adata;
+	ioasid_t id = INVALID_IOASID;
+
+	spin_lock(&ioasid_allocator_lock);
+	/* Check if the IOASID set has been allocated and initialized */
+	if (!set || !ioasid_set_is_valid(set))
+		goto done_unlock;
+
+	if (set->type == IOASID_SET_TYPE_MM && ioasid_cg_charge(set))
+		goto done_unlock;
+
+	if (set->quota <= atomic_read(&set->nr_ioasids)) {
+		pr_err("IOASID set out of quota %d\n", set->quota);
+		goto done_unlock;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_ATOMIC);
+	if (!data)
+		goto done_unlock;
+
+	data->set = set;
+	data->private = private;
+	refcount_set(&data->refs, 1);
+
+	/*
+	 * A custom allocator needs allocator data to perform platform
+	 * specific operations.
+	 */
+	adata = active_allocator->flags & IOASID_ALLOCATOR_CUSTOM ?
+			active_allocator->ops->pdata : data;
+	id = active_allocator->ops->alloc(min, max, adata);
+	if (id == INVALID_IOASID) {
+		pr_err("Failed IOASID allocation %lu\n", active_allocator->flags);
+		goto exit_free;
+	}
+
+	if ((active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) &&
+	    xa_alloc(&active_allocator->xa, &id, data, XA_LIMIT(id, id), GFP_ATOMIC)) {
+		/* A custom allocator needs the framework to store and track allocation results */
+		pr_err("Failed to alloc ioasid from %d\n", id);
+		active_allocator->ops->free(id, active_allocator->ops->pdata);
+		goto exit_free;
+	}
+	data->id = id;
+	data->state = IOASID_STATE_IDLE;
+	data->spid = INVALID_IOASID;
+
+	/* Store the IOASID in the per set data */
+	if (xa_err(xa_store(&set->xa, id, data, GFP_ATOMIC))) {
+		pr_err("Failed to store ioasid %d in set\n", id);
+		active_allocator->ops->free(id, active_allocator->ops->pdata);
+		goto exit_free;
+	}
+	atomic_inc(&set->nr_ioasids);
+	ioasid_notify(data, IOASID_NOTIFY_ALLOC, IOASID_NOTIFY_FLAG_SET);
+	goto done_unlock;
+exit_free:
+	kfree(data);
+	ioasid_cg_uncharge(set);
+done_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+	return id;
+}
+EXPORT_SYMBOL_GPL(ioasid_alloc);
+
+static void ioasid_do_free_locked(struct ioasid_data *data)
+{
+	struct ioasid_data *ioasid_data;
+
+	active_allocator->ops->free(data->id, active_allocator->ops->pdata);
+	/* A custom allocator needs additional steps to free the xa element */
+	if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) {
+		ioasid_data = xa_erase(&active_allocator->xa, data->id);
+		kfree_rcu(ioasid_data, rcu);
+	}
+	atomic_dec(&data->set->nr_ioasids);
+	ioasid_cg_uncharge(data->set);
+	xa_erase(&data->set->xa, data->id);
+	/* Destroy the set if empty */
+	if (data->set->free_pending && !atomic_read(&data->set->nr_ioasids)) {
+		pr_info("%s free set set->id: %u\n", __func__, data->set->id);
+		ioasid_set_free_locked(data->set);
+	}
+}
+
+static void ioasid_free_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+	struct ioasid_data *data;
+
+	data = xa_load(&active_allocator->xa, ioasid);
+	if (!data) {
+		pr_err("Trying to free unknown IOASID %u\n", ioasid);
+		return;
+	}
+	if (data->set != set) {
+		pr_warn("Cannot free IOASID %u due to set ownership\n", ioasid);
+		return;
+	}
+	/* Check if the set exists */
+	if (WARN_ON(!xa_load(&ioasid_sets, data->set->id)))
+		return;
+
+	/* Free is already in progress */
+	if (data->state == IOASID_STATE_FREE_PENDING)
+		return;
+
+	data->state = IOASID_STATE_FREE_PENDING;
+	/*
+	 * If the refcount is 1, there are no users of the IOASID other than
+	 * the IOASID core itself. There is no need to notify anyone.
+	 */
+	if (!refcount_dec_and_test(&data->refs)) {
+		ioasid_notify(data, IOASID_NOTIFY_FREE,
+			      IOASID_NOTIFY_FLAG_SET | IOASID_NOTIFY_FLAG_ALL);
+		return;
+	}
+	ioasid_do_free_locked(data);
+}
+
+/**
+ * ioasid_free - Drop a reference on an IOASID. Free it if the refcount
+ *               drops to 0, including removal from its set and the
+ *               system-wide list.
+ * @set: The ioasid_set to check permission with. If not NULL, the IOASID
+ *       free will fail if the set does not match.
+ * @ioasid: The IOASID to remove
+ *
+ * TODO: return true if all references have been dropped, or false if
+ * asynchronous work is in progress and the IOASID is in FREE_PENDING state.
+ * A wait queue could be used to block the freeing task.
+ */
+void ioasid_free(struct ioasid_set *set, ioasid_t ioasid)
+{
+	spin_lock(&ioasid_allocator_lock);
+	ioasid_free_locked(set, ioasid);
+	spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_free);
+
+/**
+ * ioasid_free_all_in_set - Free all IOASIDs within a set
+ *
+ * @set: the ioasid_set whose IOASIDs are all to be freed
+ *
+ * Free all PASIDs from the system-wide IOASID pool; all subscribers get
+ * notified and perform their own cleanup.
+ * Note that some references to IOASIDs within the set can still be held
+ * after the free call. This is fine in that the IOASIDs will be marked
+ * inactive; the only operation that can still be performed on them is
+ * ioasid_put(). There is no need to track IOASID set states since there is
+ * no reclaim phase.
+ */
+void ioasid_free_all_in_set(struct ioasid_set *set)
+{
+	struct ioasid_data *entry;
+	unsigned long index;
+
+	if (!ioasid_set_is_valid(set))
+		return;
+
+	if (xa_empty(&set->xa))
+		return;
+
+	if (!atomic_read(&set->nr_ioasids))
+		return;
+	spin_lock(&ioasid_allocator_lock);
+	xa_for_each(&set->xa, index, entry) {
+		ioasid_free_locked(set, index);
+		/* Free from the per set private pool */
+		xa_erase(&set->xa, index);
+	}
+	spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_free_all_in_set);
+
+static struct ioasid_set *ioasid_find_mm_set_locked(struct mm_struct *token)
+{
+	struct ioasid_set *set;
+	unsigned long index;
+
+	xa_for_each(&ioasid_sets, index, set) {
+		if (set->type == IOASID_SET_TYPE_MM && set->token == token)
+			return set;
+	}
+
+	return NULL;
+}
+
+/*
+ * ioasid_find_mm_set - Retrieve the IOASID set with the given mm token.
+ */
+struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token)
+{
+	struct ioasid_set *set;
+
+	spin_lock(&ioasid_allocator_lock);
+	set = ioasid_find_mm_set_locked(token);
+	spin_unlock(&ioasid_allocator_lock);
+
+	return set;
+}
+EXPORT_SYMBOL_GPL(ioasid_find_mm_set);
+
+/**
+ * ioasid_set_for_each_ioasid - Iterate over all the IOASIDs within the set
+ * @set: the ioasid_set to iterate over
+ * @fn: the callback to invoke on each IOASID
+ * @data: caller data passed to @fn
+ */
+void ioasid_set_for_each_ioasid(struct ioasid_set *set,
+				void (*fn)(ioasid_t id, void *data),
+				void *data)
+{
+	struct ioasid_data *entry;
+	unsigned long index;
+
+	xa_for_each(&set->xa, index, entry)
+		fn(index, data);
+}
+EXPORT_SYMBOL_GPL(ioasid_set_for_each_ioasid);
+
+int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+	struct ioasid_data *data;
+
+	data = xa_load(&active_allocator->xa, ioasid);
+	if (!data) {
+		pr_err("Trying to get unknown IOASID %u\n", ioasid);
+		return -EINVAL;
+	}
+	if (data->state == IOASID_STATE_FREE_PENDING) {
+		pr_err("Trying to get IOASID %u being freed\n", ioasid);
+		return -EBUSY;
+	}
+
+	/* Check set ownership if the set is non-null */
+	if (set && data->set != set) {
+		pr_err("Trying to get IOASID %u outside the set\n", ioasid);
+		/* data found but does not belong to the set */
+		return -EACCES;
+	}
+	refcount_inc(&data->refs);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ioasid_get_locked);
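+
+/*
+ * Illustrative sketch (editor's addition, not part of the original patch;
+ * drain_one(), quiesce_pasid() and struct my_ctx are hypothetical): walking
+ * every IOASID in a set with the iterator:
+ *
+ *	static void drain_one(ioasid_t id, void *data)
+ *	{
+ *		struct my_ctx *ctx = data;
+ *
+ *		quiesce_pasid(ctx, id);
+ *	}
+ *
+ *	ioasid_set_for_each_ioasid(set, drain_one, ctx);
+ */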
+
+/**
+ * ioasid_get - Obtain a reference to an IOASID
+ * @set: the ioasid_set to check permission against if not NULL
+ * @ioasid: the IOASID to take a reference on
+ *
+ * Return: 0 on success, error if failed.
+ */
+int ioasid_get(struct ioasid_set *set, ioasid_t ioasid)
+{
+	int ret;
+
+	spin_lock(&ioasid_allocator_lock);
+	ret = ioasid_get_locked(set, ioasid);
+	spin_unlock(&ioasid_allocator_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_get);
+
+/**
+ * ioasid_get_if_owned - Obtain a reference to the IOASID if the IOASID
+ *                       belongs to the ioasid_set with the current mm as
+ *                       token
+ * @ioasid: the IOASID to take a reference on
+ *
+ * Return: 0 on success, error if failed.
+ */
+int ioasid_get_if_owned(ioasid_t ioasid)
+{
+	struct ioasid_set *set;
+	int ret;
+
+	spin_lock(&ioasid_allocator_lock);
+	set = ioasid_find_set(ioasid);
+	if (IS_ERR_OR_NULL(set)) {
+		ret = -ENOENT;
+		goto done_unlock;
+	}
+	if (set->type != IOASID_SET_TYPE_MM) {
+		ret = -EINVAL;
+		goto done_unlock;
+	}
+	if (current->mm != set->token) {
+		ret = -EPERM;
+		goto done_unlock;
+	}
+
+	ret = ioasid_get_locked(set, ioasid);
+done_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_get_if_owned);
+
+bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+	struct ioasid_data *data;
+
+	data = xa_load(&active_allocator->xa, ioasid);
+	if (!data) {
+		pr_err("Trying to put unknown IOASID %u\n", ioasid);
+		return false;
+	}
+	if (set && data->set != set) {
+		pr_err("Trying to drop IOASID %u outside the set\n", ioasid);
+		return false;
+	}
+	if (!refcount_dec_and_test(&data->refs))
+		return false;
+
+	ioasid_do_free_locked(data);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(ioasid_put_locked);
+
+/**
+ * ioasid_put - Release a reference to an IOASID
+ * @set: the ioasid_set to check permission against if not NULL
+ * @ioasid: the IOASID to drop a reference on
+ *
+ * Drop a reference to the IOASID and free it when the number of references
+ * drops to zero.
+ *
+ * Return: %true if the IOASID was freed, %false otherwise.
+ */
+bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid)
+{
+	bool ret;
+
+	spin_lock(&ioasid_allocator_lock);
+	ret = ioasid_put_locked(set, ioasid);
+	spin_unlock(&ioasid_allocator_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_put);
+
+/**
+ * ioasid_find_set - Find the ioasid_set of an IOASID
+ * @ioasid: the IOASID to look up
+ *
+ * As long as the IOASID is valid, the set must be valid since the
+ * refcounting is based on the number of IOASIDs in the set.
+ *
+ * Return: the ioasid_set on success, or ERR_PTR(-ENOENT) if the IOASID is
+ * not found.
+ */
+struct ioasid_set *ioasid_find_set(ioasid_t ioasid)
+{
+	struct ioasid_allocator_data *idata;
+	struct ioasid_data *ioasid_data;
+	struct ioasid_set *set = NULL;
+
+	rcu_read_lock();
+	idata = rcu_dereference(active_allocator);
+	ioasid_data = xa_load(&idata->xa, ioasid);
+	if (!ioasid_data) {
+		set = ERR_PTR(-ENOENT);
+		goto unlock;
+	}
+	set = ioasid_data->set;
+unlock:
+	rcu_read_unlock();
+	return set;
+}
+EXPORT_SYMBOL_GPL(ioasid_find_set);
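+
+/*
+ * Illustrative sketch (editor's addition, not part of the original patch;
+ * use_pasid() is hypothetical): holding a reference across use of an
+ * IOASID from another context:
+ *
+ *	if (!ioasid_get(set, pasid)) {
+ *		use_pasid(pasid);
+ *		ioasid_put(set, pasid);	// may free it on the last reference
+ *	}
+ */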
+
+/**
+ * ioasid_find - Find IOASID data
+ * @set: the IOASID set
+ * @ioasid: the IOASID to find
+ * @getter: function to call on the found object
+ *
+ * The optional getter function allows the caller to take a reference to the
+ * found object under the RCU lock. The function can also check if the object
+ * is still valid: if @getter returns false, then the object is invalid and
+ * NULL is returned.
+ *
+ * If the IOASID exists, return the private pointer passed to ioasid_alloc().
+ * Private data can be NULL if not set. Return an error if the IOASID is not
+ * found, or if @set is not NULL and the IOASID does not belong to the set.
+ */
+void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
+		  bool (*getter)(void *))
+{
+	void *priv;
+	struct ioasid_data *ioasid_data;
+	struct ioasid_allocator_data *idata;
+
+	rcu_read_lock();
+	idata = rcu_dereference(active_allocator);
+	ioasid_data = xa_load(&idata->xa, ioasid);
+	if (!ioasid_data) {
+		priv = ERR_PTR(-ENOENT);
+		goto unlock;
+	}
+	if (set && ioasid_data->set != set) {
+		/* data found but does not belong to the set */
+		priv = ERR_PTR(-EACCES);
+		goto unlock;
+	}
+	/* Now the IOASID and its set are verified, we can return the private data */
+	priv = rcu_dereference(ioasid_data->private);
+	if (getter && !getter(priv))
+		priv = NULL;
+unlock:
+	rcu_read_unlock();
+
+	return priv;
+}
+EXPORT_SYMBOL_GPL(ioasid_find);
+
+int ioasid_register_notifier(struct ioasid_set *set, struct notifier_block *nb)
+{
+	int ret;
+
+	spin_lock(&ioasid_allocator_lock);
+	if (set)
+		ret = atomic_notifier_chain_register(&set->nh, nb);
+	else
+		ret = atomic_notifier_chain_register(&ioasid_notifier, nb);
+	spin_unlock(&ioasid_allocator_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_register_notifier);
+
+/*
+ * atomic_notifier_chain_unregister() will eventually call synchronize_rcu(),
+ * which is not suitable inside a lock critical section. Release the lock
+ * before calling atomic_notifier_chain_unregister() and reacquire it
+ * afterwards.
+ */
+static void unregister_notifier_locked(struct ioasid_set *set,
+				       struct notifier_block *nb)
+{
+	assert_spin_locked(&ioasid_allocator_lock);
+
+	/*
+	 * The set might be released once we drop the lock, leaving set->nh
+	 * an invalid reference. Keep a reference to the set before unlocking
+	 * and drop it afterwards.
+	 */
+	if (set) {
+		atomic_inc(&set->nr_ioasids);
+
+		spin_unlock(&ioasid_allocator_lock);
+		atomic_notifier_chain_unregister(&set->nh, nb);
+		spin_lock(&ioasid_allocator_lock);
+
+		if (atomic_dec_and_test(&set->nr_ioasids))
+			ioasid_set_free_locked(set);
+
+		return;
+	}
+
+	spin_unlock(&ioasid_allocator_lock);
+	atomic_notifier_chain_unregister(&ioasid_notifier, nb);
+	spin_lock(&ioasid_allocator_lock);
+}
+
+void ioasid_unregister_notifier(struct ioasid_set *set,
+				struct notifier_block *nb)
+{
+	struct ioasid_set_nb *curr;
+
+	spin_lock(&ioasid_allocator_lock);
+	/*
+	 * The pending list is registered with a token without an ioasid_set,
+	 * therefore it should not be unregistered directly.
+	 */
+	list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
+		if (curr->nb == nb) {
+			pr_warn("Cannot unregister NB from pending list\n");
+			goto out_unlock;
+		}
+	}
+
+	unregister_notifier_locked(set, nb);
+out_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_unregister_notifier);
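+
+/*
+ * Illustrative sketch (editor's addition, not part of the original patch;
+ * my_ioasid_nb() and teardown_contexts() are hypothetical): a notifier
+ * callback reacting to free events on the set tied to the current mm:
+ *
+ *	static int my_ioasid_nb(struct notifier_block *nb,
+ *				unsigned long cmd, void *data)
+ *	{
+ *		struct ioasid_nb_args *args = data;
+ *
+ *		if (cmd == IOASID_NOTIFY_FREE)
+ *			teardown_contexts(args->id);
+ *		return NOTIFY_OK;
+ *	}
+ *
+ *	static struct notifier_block my_nb = { .notifier_call = my_ioasid_nb };
+ *
+ *	ioasid_register_notifier_mm(current->mm, &my_nb);
+ */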
+
+/**
+ * ioasid_register_notifier_mm - Register a notifier block on the IOASID set
+ *                               created with the mm_struct pointer as the
+ *                               token
+ *
+ * @mm: the mm_struct token of the ioasid_set
+ * @nb: notifier block to be registered on the ioasid_set
+ *
+ * This is a variant of ioasid_register_notifier() where the caller intends
+ * to listen to IOASID events belonging to the ioasid_set created under the
+ * same process. The caller is not aware of the ioasid_set and does not need
+ * to hold a reference to it.
+ */
+int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb)
+{
+	struct ioasid_set_nb *curr;
+	struct ioasid_set *set;
+	int ret = 0;
+
+	spin_lock(&ioasid_allocator_lock);
+	/* Check for duplicates, nb is unique per set */
+	list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
+		if (curr->token == mm && curr->nb == nb) {
+			ret = -EBUSY;
+			goto exit_unlock;
+		}
+	}
+
+	if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
+		pr_warn("Registering invalid callback!\n");
+		dump_stack();
+		ret = -EINVAL;
+		goto exit_unlock;
+	}
+	curr = kzalloc(sizeof(*curr), GFP_ATOMIC);
+	if (!curr) {
+		ret = -ENOMEM;
+		goto exit_unlock;
+	}
+
+	curr->token = mm;
+	curr->nb = nb;
+
+	/* Check if the token has an existing set */
+	set = ioasid_find_mm_set_locked(mm);
+	if (!set) {
+		/* Add to the pending list as inactive */
+		curr->active = false;
+	} else {
+		/* REVISIT: Only register an empty set for now. An option can
+		 * be added in the future to play back existing PASIDs.
+		 */
+		if (atomic_read(&set->nr_ioasids)) {
+			pr_warn("IOASID set %d not empty %d\n", set->id,
+				atomic_read(&set->nr_ioasids));
+			ret = -EBUSY;
+			goto exit_free;
+		}
+		curr->active = true;
+		curr->set = set;
+
+		/* Set already created, add to the notifier chain */
+		atomic_notifier_chain_register(&set->nh, nb);
+	}
+
+	list_add(&curr->list, &ioasid_nb_pending_list);
+	goto exit_unlock;
+exit_free:
+	kfree(curr);
+exit_unlock:
+	spin_unlock(&ioasid_allocator_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_register_notifier_mm);
+
+bool ioasid_queue_work(struct work_struct *work)
+{
+	return queue_work(ioasid_wq, work);
+}
+EXPORT_SYMBOL_GPL(ioasid_queue_work);
+
+void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block *nb)
+{
+	struct ioasid_set_nb *curr;
+
+	spin_lock(&ioasid_allocator_lock);
+	list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
+		if (curr->token == mm && curr->nb == nb) {
+			list_del(&curr->list);
+			if (curr->active)
+				unregister_notifier_locked(curr->set, nb);
+			kfree(curr);
+			break;
+		}
+	}
+	spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_unregister_notifier_mm);
+
+static int __init ioasid_init(void)
+{
+	ioasid_wq = alloc_ordered_workqueue("ioasid_wq", 0);
+	if (!ioasid_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit ioasid_cleanup(void)
+{
+	destroy_workqueue(ioasid_wq);
+}
+
+MODULE_AUTHOR("Jean-Philippe Brucker <jean-philippe@linaro.org>");
+MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
+MODULE_DESCRIPTION("IO Address Space ID (IOASID) allocator");
+MODULE_LICENSE("GPL");
+module_init(ioasid_init);
+module_exit(ioasid_cleanup);
diff --git a/drivers/iommu/ioasid_user.c b/drivers/iommu/ioasid_user.c
new file mode 100644
index 0000000000000000000000000000000000000000..95e306cf7f270c6906b96bb2233b15e5668858e9
--- /dev/null
+++ b/drivers/iommu/ioasid_user.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support IOASID allocation/free from user space.
+ *
+ * Copyright (C) 2021 Intel Corporation.
+ * Author: Liu Yi L <yi.l.liu@intel.com>
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/ioasid.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#define DRIVER_VERSION	"0.1"
+#define DRIVER_AUTHOR	"Liu Yi L <yi.l.liu@intel.com>"
+#define DRIVER_DESC	"IOASID management for user space"
+
+/* The current user ioasid uapi supports 31 bits */
+#define IOASID_BITS	31
+
+struct ioasid_user_token {
+	unsigned long long val;
+};
+
+struct ioasid_user {
+	struct kref kref;
+	struct ioasid_set *ioasid_set;
+	struct mutex lock;
+	struct list_head next;
+	struct ioasid_user_token token;
+};
+
+static struct mutex ioasid_user_lock;
+static struct list_head ioasid_user_list;
+
+/* called with ioasid_user_lock held */
+static void ioasid_user_release(struct kref *kref)
+{
+	struct ioasid_user *iuser = container_of(kref, struct ioasid_user, kref);
+
+	ioasid_free_all_in_set(iuser->ioasid_set);
+	list_del(&iuser->next);
+	mutex_unlock(&ioasid_user_lock);
+	ioasid_set_free(iuser->ioasid_set);
+	kfree(iuser);
+}
+
+void ioasid_user_put(struct ioasid_user *iuser)
+{
+	kref_put_mutex(&iuser->kref, ioasid_user_release, &ioasid_user_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_user_put);
+
+static void ioasid_user_get(struct ioasid_user *iuser)
+{
+	kref_get(&iuser->kref);
+}
+
+struct ioasid_user *ioasid_user_get_from_task(struct task_struct *task)
+{
+	struct mm_struct *mm = get_task_mm(task);
+	unsigned long long val = (unsigned long long)mm;
+	struct ioasid_user *iuser;
+	bool found = false;
+
+	if (!mm)
+		return NULL;
+
+	mutex_lock(&ioasid_user_lock);
+	/* Search existing ioasid_user with the current mm pointer */
+	list_for_each_entry(iuser, &ioasid_user_list, next) {
+		if (iuser->token.val == val) {
+			ioasid_user_get(iuser);
+			found = true;
+			break;
+		}
+	}
+
+	mmput(mm);
+
+	mutex_unlock(&ioasid_user_lock);
+	return found ? iuser : NULL;
+}
+EXPORT_SYMBOL_GPL(ioasid_user_get_from_task);
+
+void ioasid_user_for_each_id(struct ioasid_user *iuser, void *data,
+			     void (*fn)(ioasid_t id, void *data))
+{
+	mutex_lock(&iuser->lock);
+	ioasid_set_for_each_ioasid(iuser->ioasid_set, fn, data);
+	mutex_unlock(&iuser->lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_user_for_each_id);
+
+static int ioasid_fops_open(struct inode *inode, struct file *filep)
+{
+	struct mm_struct *mm = get_task_mm(current);
+	unsigned long long val = (unsigned long long)mm;
+	struct ioasid_set *iset;
+	struct ioasid_user *iuser;
+	int ret = 0;
+
+	mutex_lock(&ioasid_user_lock);
+	/* Only allow one open per process */
+	list_for_each_entry(iuser, &ioasid_user_list, next) {
+		if (iuser->token.val == val) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	iuser = kzalloc(sizeof(*iuser), GFP_KERNEL);
+	if (!iuser) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * The IOASID core provides an 'IOASID set' concept to track all
+	 * IOASIDs associated with a token. Here we use mm_struct as
+	 * the token and create an IOASID set per mm_struct. All the
+	 * containers of the process share the same IOASID set.
+	 */
+	iset = ioasid_set_alloc(mm, 1000, IOASID_SET_TYPE_MM);
+	if (IS_ERR(iset)) {
+		kfree(iuser);
+		ret = PTR_ERR(iset);
+		pr_err("%s: failed to allocate IOASID set, ret: %d\n",
+		       __func__, ret);
+		goto out;
+	}
+
+	iuser->ioasid_set = iset;
+	kref_init(&iuser->kref);
+	iuser->token.val = val;
+	mutex_init(&iuser->lock);
+	filep->private_data = iuser;
+
+	list_add(&iuser->next, &ioasid_user_list);
+out:
+	mutex_unlock(&ioasid_user_lock);
+	mmput(mm);
+	return ret;
+}
+
+static int ioasid_fops_release(struct inode *inode, struct file *filep)
+{
+	struct ioasid_user *iuser = filep->private_data;
+
+	filep->private_data = NULL;
+
+	ioasid_user_put(iuser);
+
+	return 0;
+}
+
+static int ioasid_get_info(struct ioasid_user *iuser, unsigned long arg)
+{
+	struct ioasid_info info;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct ioasid_info, ioasid_bits);
+
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz || info.flags)
+		return -EINVAL;
+
+	info.ioasid_bits = IOASID_BITS;
+
+	return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+}
+
+static int ioasid_alloc_request(struct ioasid_user *iuser, unsigned long arg)
+{
+	struct ioasid_alloc_request req;
+	unsigned long minsz;
+	ioasid_t ioasid;
+
+	minsz = offsetofend(struct ioasid_alloc_request, range);
+
+	if (copy_from_user(&req, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (req.argsz < minsz || req.flags)
+		return -EINVAL;
+
+	if (req.range.min > req.range.max ||
+	    req.range.min >= (1 << IOASID_BITS) ||
+	    req.range.max >= (1 << IOASID_BITS))
+		return -EINVAL;
+
+	ioasid = ioasid_alloc(iuser->ioasid_set, req.range.min,
+			      req.range.max, NULL);
+
+	if (ioasid == INVALID_IOASID)
+		return -EINVAL;
+
+	return ioasid;
+}
+
+static int ioasid_free_request(struct ioasid_user *iuser, unsigned long arg)
+{
+	int ioasid;
+
+	if (copy_from_user(&ioasid, (void __user *)arg, sizeof(ioasid)))
+		return -EFAULT;
+
+	if (ioasid < 0)
+		return -EINVAL;
+
+	ioasid_free(iuser->ioasid_set, ioasid);
+
+	return 0;
+}
+
+static long ioasid_fops_unl_ioctl(struct file *filep,
+				  unsigned int cmd, unsigned long arg)
+{
+	struct ioasid_user *iuser = filep->private_data;
+	long ret = -EINVAL;
+
+	if (!iuser)
+		return ret;
+
+	mutex_lock(&iuser->lock);
+
+	switch (cmd) {
+	case IOASID_GET_API_VERSION:
+		ret = IOASID_API_VERSION;
+		break;
+	case IOASID_GET_INFO:
+		ret = ioasid_get_info(iuser, arg);
+		break;
+	case IOASID_REQUEST_ALLOC:
+		ret = ioasid_alloc_request(iuser, arg);
+		break;
+	case IOASID_REQUEST_FREE:
+		ret = ioasid_free_request(iuser, arg);
+		break;
+	default:
+		pr_err("Unsupported cmd %u\n", cmd);
+		break;
+	}
+
+	mutex_unlock(&iuser->lock);
+	return ret;
+}
+
+static const struct file_operations ioasid_user_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ioasid_fops_open,
+	.release	= ioasid_fops_release,
+	.unlocked_ioctl	= ioasid_fops_unl_ioctl,
+};
+
+static struct miscdevice ioasid_user = {
+	.minor = IOASID_MINOR,
+	.name = "ioasid_user",
+	.fops = &ioasid_user_fops,
+	.nodename = "ioasid",
+	.mode = S_IRUGO | S_IWUGO,
+};
+
+static int __init ioasid_user_init(void)
+{
+	int ret;
+
+	mutex_init(&ioasid_user_lock);
+	INIT_LIST_HEAD(&ioasid_user_list);
+
+	ret = misc_register(&ioasid_user);
+	if (ret) {
+		pr_err("ioasid_user: misc device register failed\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit ioasid_user_exit(void)
+{
+	WARN_ON(!list_empty(&ioasid_user_list));
+	misc_deregister(&ioasid_user);
+}
+
+module_init(ioasid_user_init);
+module_exit(ioasid_user_exit);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
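+
+/*
+ * Illustrative sketch (editor's addition, not part of the original patch):
+ * expected user space usage of the /dev/ioasid character device, based on
+ * the ioctls above:
+ *
+ *	int fd = open("/dev/ioasid", O_RDWR);
+ *	struct ioasid_alloc_request req = {
+ *		.argsz = sizeof(req),
+ *		.range = { .min = 1, .max = 1000 },
+ *	};
+ *	int pasid = ioctl(fd, IOASID_REQUEST_ALLOC, &req);
+ *	...
+ *	ioctl(fd, IOASID_REQUEST_FREE, &pasid);
+ *	close(fd);	// releases the set and frees any remaining IOASIDs
+ */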
diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c
new file mode 100644
index 0000000000000000000000000000000000000000..7f97a03a135bf1f4716214d1c307f5cfc60fffd8
--- /dev/null
+++ b/drivers/iommu/iommu-sva-lib.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpers for IOMMU drivers implementing SVA
+ */
+#include <linux/mutex.h>
+#include <linux/sched/mm.h>
+
+#include "iommu-sva-lib.h"
+
+static DEFINE_MUTEX(iommu_sva_lock);
+static struct ioasid_set *iommu_sva_pasid;
+
+/* Must be called before PASID allocations can occur */
+void iommu_sva_init(void)
+{
+	if (iommu_sva_pasid)
+		return;
+	iommu_sva_pasid = ioasid_set_alloc(NULL, 0, IOASID_SET_TYPE_NULL);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_init);
+
+/**
+ * iommu_sva_alloc_pasid - Allocate a PASID for the mm
+ * @mm: the mm
+ * @min: minimum PASID value (inclusive)
+ * @max: maximum PASID value (inclusive)
+ *
+ * Try to allocate a PASID for this mm, or take a reference to the existing one
+ * provided it fits within the [@min, @max] range. On success the PASID is
+ * available in mm->pasid, and must be released with iommu_sva_free_pasid().
+ * @min must be greater than 0, because 0 indicates an unused mm->pasid.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
+{
+	int ret = 0;
+	ioasid_t pasid;
+
+	if (min == INVALID_IOASID || max == INVALID_IOASID ||
+	    min == 0 || max < min)
+		return -EINVAL;
+
+	mutex_lock(&iommu_sva_lock);
+	if (mm->pasid) {
+		if (mm->pasid >= min && mm->pasid <= max)
+			ioasid_get(iommu_sva_pasid, mm->pasid);
+		else
+			ret = -EOVERFLOW;
+	} else {
+		pasid = ioasid_alloc(iommu_sva_pasid, min, max, mm);
+		if (pasid == INVALID_IOASID)
+			ret = -ENOMEM;
+		else
+			mm->pasid = pasid;
+	}
+	mutex_unlock(&iommu_sva_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid);
+
+/**
+ * iommu_sva_free_pasid - Release the mm's PASID
+ * @mm: the mm
+ *
+ * Drop one reference to a PASID allocated with iommu_sva_alloc_pasid()
+ */
+void iommu_sva_free_pasid(struct mm_struct *mm)
+{
+	mutex_lock(&iommu_sva_lock);
+	if (ioasid_put(iommu_sva_pasid, mm->pasid))
+		mm->pasid = 0;
+	mutex_unlock(&iommu_sva_lock);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_free_pasid);
+
+/* ioasid_find getter() requires a void * argument */
+static bool __mmget_not_zero(void *mm)
+{
+	return mmget_not_zero(mm);
+}
+
+/**
+ * iommu_sva_find() - Find mm associated to the given PASID
+ * @pasid: Process Address Space ID assigned to the mm
+ *
+ * On success a reference to the mm is taken, and must be released with mmput().
+ *
+ * Returns the mm corresponding to this PASID, or an error if not found.
+ */
+struct mm_struct *iommu_sva_find(ioasid_t pasid)
+{
+	return ioasid_find(iommu_sva_pasid, pasid, __mmget_not_zero);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_find);
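+
+/*
+ * Illustrative sketch (editor's addition, not part of the original patch;
+ * program_pasid_table() and the range are hypothetical): a driver bind path
+ * allocating a PASID for the current mm within a hardware-supported range:
+ *
+ *	ret = iommu_sva_alloc_pasid(current->mm, 1, (1 << 20) - 1);
+ *	if (ret)
+ *		return ret;
+ *	program_pasid_table(dev, current->mm->pasid);
+ *	...
+ *	iommu_sva_free_pasid(current->mm);
+ */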
diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..95dc3ebc192831d1f0d37091a11c19b4e561b855
--- /dev/null
+++ b/drivers/iommu/iommu-sva-lib.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SVA library for IOMMU drivers
+ */
+#ifndef _IOMMU_SVA_LIB_H
+#define _IOMMU_SVA_LIB_H
+
+#include <linux/ioasid.h>
+#include <linux/mm_types.h>
+
+int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max);
+void iommu_sva_free_pasid(struct mm_struct *mm);
+struct mm_struct *iommu_sva_find(ioasid_t pasid);
+
+/* I/O Page fault */
+struct device;
+struct iommu_fault;
+struct iopf_queue;
+
+#ifdef CONFIG_IOMMU_SVA
+int iommu_queue_iopf(struct iommu_fault *fault, void *cookie);
+
+int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev);
+int iopf_queue_remove_device(struct iopf_queue *queue,
+			     struct device *dev);
+int iopf_queue_flush_dev(struct device *dev);
+struct iopf_queue *iopf_queue_alloc(const char *name);
+void iopf_queue_free(struct iopf_queue *queue);
+int iopf_queue_discard_partial(struct iopf_queue *queue);
+
+#else /* CONFIG_IOMMU_SVA */
+static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
+{
+	return -ENODEV;
+}
+
+static inline int iopf_queue_add_device(struct iopf_queue *queue,
+					struct device *dev)
+{
+	return -ENODEV;
+}
+
+static inline int iopf_queue_remove_device(struct iopf_queue *queue,
+					   struct device *dev)
+{
+	return -ENODEV;
+}
+
+static inline int iopf_queue_flush_dev(struct device *dev)
+{
+	return -ENODEV;
+}
+
+static inline struct iopf_queue *iopf_queue_alloc(const char *name)
+{
+	return NULL;
+}
+
+static inline void iopf_queue_free(struct iopf_queue *queue)
+{
+}
+
+static inline int iopf_queue_discard_partial(struct iopf_queue *queue)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_IOMMU_SVA */
+#endif /* _IOMMU_SVA_LIB_H */
diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c
index e436ff813e7e5bcced5225a11a6a25baa2bb5814..99869217fbec7d862af4bba0e0a6facbe34a7168 100644
--- a/drivers/iommu/iommu-sysfs.c
+++ b/drivers/iommu/iommu-sysfs.c
@@ -87,6 +87,7 @@ int iommu_device_sysfs_add(struct iommu_device *iommu,
 		put_device(iommu->dev);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(iommu_device_sysfs_add);
 
 void iommu_device_sysfs_remove(struct iommu_device *iommu)
 {
@@ -94,6 +95,8 @@ void iommu_device_sysfs_remove(struct iommu_device *iommu)
 	device_unregister(iommu->dev);
 	iommu->dev = NULL;
 }
+EXPORT_SYMBOL_GPL(iommu_device_sysfs_remove);
+
 /*
  * IOMMU drivers can indicate a device is managed by a given IOMMU using
 * this interface.  A link to the device will be created in the "devices"
@@ -119,6 +122,7 @@ int iommu_device_link(struct iommu_device *iommu, struct device *link)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(iommu_device_link);
 
 void iommu_device_unlink(struct iommu_device *iommu, struct device *link)
 {
@@ -128,3 +132,4 @@ void iommu_device_unlink(struct iommu_device *iommu, struct device *link)
 	sysfs_remove_link(&link->kobj, "iommu");
 	sysfs_remove_link_from_group(&iommu->dev->kobj, "devices", dev_name(link));
 }
+EXPORT_SYMBOL_GPL(iommu_device_unlink);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3d7448e7cec9078875ddfae0b633abe2dd6121cd..fbf8e17b07f9746b8be2bdfe9ababfeaed28f442 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -7,7 +7,9 @@
 #define pr_fmt(fmt)    "iommu: " fmt
 
 #include
+#include
 #include
+#include
 #include
 #include
 #include
@@ -22,15 +24,30 @@
 #include
 #include
 #include
+#include
+#include
 #include
 
 static struct kset *iommu_group_kset;
 static DEFINE_IDA(iommu_group_ida);
 static unsigned int iommu_def_domain_type __read_mostly;
-static bool iommu_dma_strict __read_mostly = true;
+static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT);
 static u32 iommu_cmd_line __read_mostly;
 
+/*
+ * Timeout to wait for the page response of a pending page request. This is
+ * intended as a basic safety net in case a pending page request is not
+ * responded to for an exceptionally long time. Devices may also implement
+ * their own protection mechanisms against this exception.
+ * The unit is jiffies, with a range equivalent to 1 - 100 seconds;
+ * the default is 10 seconds. For example, booting with iommu.prq_timeout=20
+ * sets the timeout to 20 seconds.
+ * Setting 0 means no timeout tracking.
+ */
+#define IOMMU_PAGE_RESPONSE_MAX_TIMEOUT (HZ * 100)
+#define IOMMU_PAGE_RESPONSE_DEF_TIMEOUT (HZ * 10)
+static unsigned long prq_timeout = IOMMU_PAGE_RESPONSE_DEF_TIMEOUT;
+
 struct iommu_group {
 	struct kobject kobj;
 	struct kobject *devices_kobj;
@@ -43,6 +60,7 @@ struct iommu_group {
 	int id;
 	struct iommu_domain *default_domain;
 	struct iommu_domain *domain;
+	struct list_head entry;
 };
 
 struct group_device {
@@ -78,6 +96,22 @@ static bool iommu_cmd_line_dma_api(void)
 	return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API);
 }
 
+static int iommu_alloc_default_domain(struct iommu_group *group,
+				      struct device *dev);
+static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
+						 unsigned type);
+static int __iommu_attach_device(struct iommu_domain *domain,
+				 struct device *dev);
+static int __iommu_attach_group(struct iommu_domain *domain,
+				struct iommu_group *group);
+static void __iommu_detach_group(struct iommu_domain *domain,
+				 struct iommu_group *group);
+static int iommu_create_device_direct_mappings(struct iommu_group *group,
+					       struct device *dev);
+static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
+static ssize_t iommu_group_store_type(struct iommu_group *group,
+				      const char *buf, size_t count);
+
 #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store)		\
 struct iommu_group_attribute iommu_group_attr_##_name =	\
 	__ATTR(_name, _mode, _show, _store)
@@ -104,6 +138,7 @@ static const char *iommu_domain_type_str(unsigned int t)
 	case IOMMU_DOMAIN_UNMANAGED:
 		return "Unmanaged";
 	case IOMMU_DOMAIN_DMA:
+	case IOMMU_DOMAIN_DMA_FQ:
 		return "Translated";
 	default:
 		return "Unknown";
@@ -120,7 +155,7 @@ static int __init iommu_subsys_init(void)
 		else
 			iommu_set_default_translated(false);
 
-	if (iommu_default_passthrough() && mem_encrypt_active()) {
+	if (iommu_default_passthrough()) {
		pr_info("Memory encryption detected - Disabling default IOMMU Passthrough\n");
 		iommu_set_default_translated(false);
 	}
@@ -141,6 +176,7 @@ int iommu_device_register(struct iommu_device *iommu)
 	spin_unlock(&iommu_device_lock);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(iommu_device_register);
 
 void iommu_device_unregister(struct iommu_device *iommu)
 {
@@ -148,10 +184,11 @@ void iommu_device_unregister(struct iommu_device *iommu)
 	list_del(&iommu->list);
 	spin_unlock(&iommu_device_lock);
 }
+EXPORT_SYMBOL_GPL(iommu_device_unregister);
 
-static struct iommu_param *iommu_get_dev_param(struct device *dev)
+static struct dev_iommu *dev_iommu_get(struct device *dev)
 {
-	struct iommu_param *param = dev->iommu_param;
+	struct dev_iommu *param = dev->iommu;
 
 	if (param)
 		return param;
@@ -161,53 +198,133 @@ static struct iommu_param *iommu_get_dev_param(struct device *dev)
 		return NULL;
 
 	mutex_init(&param->lock);
-	dev->iommu_param = param;
+	dev->iommu = param;
 	return param;
 }
 
-static void iommu_free_dev_param(struct device *dev)
+static void dev_iommu_free(struct device *dev)
 {
-	kfree(dev->iommu_param);
-	dev->iommu_param = NULL;
+	iommu_fwspec_free(dev);
+	kfree(dev->iommu);
+	dev->iommu = NULL;
 }
 
-int iommu_probe_device(struct device *dev)
+static int __iommu_probe_device(struct device *dev, struct list_head *group_list)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
+	struct iommu_device *iommu_dev;
+	struct iommu_group *group;
 	int ret;
 
-	WARN_ON(dev->iommu_group);
 	if (!ops)
-		return -EINVAL;
+		return -ENODEV;
 
-	if (!iommu_get_dev_param(dev))
+	if (!dev_iommu_get(dev))
 		return -ENOMEM;
 
-	ret = ops->add_device(dev);
+	if (!try_module_get(ops->owner)) {
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	iommu_dev = ops->probe_device(dev);
+	if (IS_ERR(iommu_dev)) {
+		ret = PTR_ERR(iommu_dev);
+		goto out_module_put;
+	}
+
+	dev->iommu->iommu_dev = iommu_dev;
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group)) {
+		ret = PTR_ERR(group);
+		goto out_release;
+	}
+	iommu_group_put(group);
+
+	if (group_list && !group->default_domain && list_empty(&group->entry))
+		list_add_tail(&group->entry, group_list);
+
+	iommu_device_link(iommu_dev, dev);
+
+	return 0;
+
+out_release:
+	ops->release_device(dev);
+
+out_module_put:
+	module_put(ops->owner);
+
+err_free:
+	dev_iommu_free(dev);
+
+	return ret;
+}
+
+int iommu_probe_device(struct device *dev)
+{
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+	struct iommu_group *group;
+	int ret;
+
+	ret = __iommu_probe_device(dev, NULL);
 	if (ret)
-		iommu_free_dev_param(dev);
+		goto err_out;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		goto err_release;
+
+	/*
+	 * Try to allocate a default domain - needs support from the
+	 * IOMMU driver. There are still some drivers which don't
+	 * support default domains, so the return value is not yet
+	 * checked.
+ */ + mutex_lock(&group->mutex); + iommu_alloc_default_domain(group, dev); + mutex_unlock(&group->mutex); + + if (group->default_domain) { + ret = __iommu_attach_device(group->default_domain, dev); + if (ret) { + iommu_group_put(group); + goto err_release; + } + } + + iommu_create_device_direct_mappings(group, dev); + + iommu_group_put(group); + + if (ops->probe_finalize) + ops->probe_finalize(dev); + + return 0; +err_release: + iommu_release_device(dev); + +err_out: return ret; + } void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - if (dev->iommu_group) - ops->remove_device(dev); + if (!dev->iommu) + return; + + iommu_device_unlink(dev->iommu->iommu_dev, dev); - iommu_free_dev_param(dev); -} + ops->release_device(dev); -static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, - unsigned type); -static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev); -static int __iommu_attach_group(struct iommu_domain *domain, - struct iommu_group *group); -static void __iommu_detach_group(struct iommu_domain *domain, - struct iommu_group *group); + iommu_group_remove_device(dev); + module_put(ops->owner); + dev_iommu_free(dev); +} static int __init iommu_set_def_domain_type(char *str) { @@ -233,6 +350,26 @@ static int __init iommu_dma_setup(char *str) } early_param("iommu.strict", iommu_dma_setup); +static int __init iommu_set_prq_timeout(char *str) +{ + int ret; + unsigned long timeout; + + if (!str) + return -EINVAL; + + ret = kstrtoul(str, 10, &timeout); + if (ret) + return ret; + timeout = timeout * HZ; + if (timeout > IOMMU_PAGE_RESPONSE_MAX_TIMEOUT) + return -EINVAL; + prq_timeout = timeout; + + return 0; +} +early_param("iommu.prq_timeout", iommu_set_prq_timeout); + static ssize_t iommu_group_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) { @@ -289,8 +426,8 @@ static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf) * Elements are sorted by start address and overlapping segments * of the same type are merged. 
*/ -int iommu_insert_resv_region(struct iommu_resv_region *new, - struct list_head *regions) +static int iommu_insert_resv_region(struct iommu_resv_region *new, + struct list_head *regions) { struct iommu_resv_region *iter, *tmp, *nr, *top; LIST_HEAD(stack); @@ -419,6 +556,9 @@ static ssize_t iommu_group_show_type(struct iommu_group *group, case IOMMU_DOMAIN_DMA: type = "DMA\n"; break; + case IOMMU_DOMAIN_DMA_FQ: + type = "DMA-FQ\n"; + break; } } strcpy(buf, type); @@ -431,7 +571,8 @@ static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL); static IOMMU_GROUP_ATTR(reserved_regions, 0444, iommu_group_show_resv_regions, NULL); -static IOMMU_GROUP_ATTR(type, 0444, iommu_group_show_type, NULL); +static IOMMU_GROUP_ATTR(type, 0644, iommu_group_show_type, + iommu_group_store_type); static void iommu_group_release(struct kobject *kobj) { @@ -479,6 +620,7 @@ struct iommu_group *iommu_group_alloc(void) group->kobj.kset = iommu_group_kset; mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); + INIT_LIST_HEAD(&group->entry); BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); ret = ida_simple_get(&iommu_group_ida, 0, 0, GFP_KERNEL); @@ -620,8 +762,8 @@ int iommu_group_set_name(struct iommu_group *group, const char *name) } EXPORT_SYMBOL_GPL(iommu_group_set_name); -static int iommu_group_create_direct_mappings(struct iommu_group *group, - struct device *dev) +static int iommu_create_device_direct_mappings(struct iommu_group *group, + struct device *dev) { struct iommu_domain *domain = group->default_domain; struct iommu_resv_region *entry; @@ -629,7 +771,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group, unsigned long pg_size; int ret = 0; - if (!domain || domain->type != IOMMU_DOMAIN_DMA) + if (!domain || !iommu_is_dma_domain(domain)) return 0; BUG_ON(!domain->pgsize_bitmap); @@ -667,7 +809,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group, } - iommu_flush_tlb_all(domain); + iommu_flush_iotlb_all(domain); out: iommu_put_resv_regions(dev, &mappings); @@ -675,6 +817,15 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group, return ret; } +static bool iommu_is_attach_deferred(struct iommu_domain *domain, + struct device *dev) +{ + if (domain->ops->is_attach_deferred) + return domain->ops->is_attach_deferred(domain, dev); + + return false; +} + /** * iommu_group_add_device - add a device to an iommu group * @group: the group into which to add the device (reference should be held) @@ -725,11 +876,9 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev) dev->iommu_group = group; - iommu_group_create_direct_mappings(group, dev); - mutex_lock(&group->mutex); list_add_tail(&device->list, &group->devices); - if (group->domain) + if (group->domain && !iommu_is_attach_deferred(group->domain, dev)) ret = __iommu_attach_device(group->domain, dev); mutex_unlock(&group->mutex); if (ret) @@ -817,17 +966,6 @@ static int iommu_group_device_count(struct iommu_group *group) return ret; } -/** - * iommu_group_for_each_dev - iterate over each device in the group - * @group: the group - * @data: caller opaque data to be passed to callback function - * @fn: caller supplied callback function - * - * This function is called by group users to iterate over group devices. - * Callers should hold a reference count to the group during callback. - * The group->mutex is held across callbacks, which will block calls to - * iommu_group_add/remove_device. 
- */
 static int __iommu_group_for_each_dev(struct iommu_group *group, void *data,
 				      int (*fn)(struct device *, void *))
 {
@@ -842,7 +980,17 @@ static int __iommu_group_for_each_dev(struct iommu_group *group, void *data,
 	return ret;
 }
 
-
+/**
+ * iommu_group_for_each_dev - iterate over each device in the group
+ * @group: the group
+ * @data: caller opaque data to be passed to callback function
+ * @fn: caller supplied callback function
+ *
+ * This function is called by group users to iterate over group devices.
+ * Callers should hold a reference count to the group during callback.
+ * The group->mutex is held across callbacks, which will block calls to
+ * iommu_group_add/remove_device.
+ */
 int iommu_group_for_each_dev(struct iommu_group *group, void *data,
 			     int (*fn)(struct device *, void *))
 {
@@ -887,6 +1035,7 @@ struct iommu_group *iommu_group_ref_get(struct iommu_group *group)
 	kobject_get(group->devices_kobj);
 	return group;
 }
+EXPORT_SYMBOL_GPL(iommu_group_ref_get);
 
 /**
  * iommu_group_put - Decrement group reference
@@ -932,6 +1081,39 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
 }
 EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
 
+static void iommu_dev_fault_timer_fn(struct timer_list *t)
+{
+	struct iommu_fault_param *fparam = from_timer(fparam, t, timer);
+	struct iommu_fault_event *evt;
+	struct iommu_fault_page_request *prm;
+	u64 now;
+
+	now = get_jiffies_64();
+
+	/*
+	 * The goal is to ensure that the driver or guest page fault handler
+	 * (via VFIO) sends page responses on time. Otherwise, limited queue
+	 * resources may be occupied by some unresponsive guests or drivers.
+	 * When the per device pending fault list is not empty, we
+	 * periodically check whether any anticipated page response time has
+	 * expired.
+	 *
+	 * TODO:
+	 * We could do the following if the response time expires:
+	 * 1. send page response code FAILURE to all pending PRQs
+	 * 2. inform the device driver or vfio
+	 * 3. drain in-flight page requests and responses for this device
+	 * 4. clear the pending fault list such that the driver can
+	 *    unregister the fault handler (otherwise it is blocked while
+	 *    pending faults are present).
+	 */
+	list_for_each_entry(evt, &fparam->faults, list) {
+		prm = &evt->fault.prm;
+		if (time_after64(now, evt->expire))
+			pr_err("Page response time expired!, pasid %d gid %d exp %llu now %llu\n",
+			       prm->pasid, prm->grpid, evt->expire, now);
+	}
+	mod_timer(t, now + prq_timeout);
+}
+
 /**
  * iommu_register_device_fault_handler() - Register a device fault handler
  * @dev: the device
@@ -954,7 +1136,8 @@ int iommu_register_device_fault_handler(struct device *dev,
 					iommu_dev_fault_handler_t handler,
 					void *data)
 {
-	struct iommu_param *param = dev->iommu_param;
+	struct dev_iommu *param = dev->iommu;
+	struct iommu_fault_handler_data *hdata;
 	int ret = 0;
 
 	if (!param)
@@ -974,11 +1157,29 @@ int iommu_register_device_fault_handler(struct device *dev,
 		ret = -ENOMEM;
 		goto done_unlock;
 	}
+
 	param->fault_param->handler = handler;
-	param->fault_param->data = data;
+
+	hdata = kzalloc(sizeof(struct iommu_fault_handler_data), GFP_KERNEL);
+	if (!hdata) {
+		kfree(param->fault_param);
+		put_device(dev);
+		ret = -ENOMEM;
+		goto done_unlock;
+	}
+
+	INIT_LIST_HEAD(&param->fault_param->data);
+	/* Default handler data uses reserved vector 0 */
+	hdata->data = data;
+	dev_dbg(dev, "Add IOMMU default handler data %llx\n", (u64)data);
+	list_add(&hdata->list, &param->fault_param->data);
+
 	mutex_init(&param->fault_param->lock);
 	INIT_LIST_HEAD(&param->fault_param->faults);
 
+	if (prq_timeout)
+		timer_setup(&param->fault_param->timer, iommu_dev_fault_timer_fn,
+			    TIMER_DEFERRABLE);
+
 done_unlock:
 	mutex_unlock(&param->lock);
 
@@ -986,6 +1187,118 @@ int iommu_register_device_fault_handler(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
 
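+/*
+ * Illustrative sketch (editor's addition, not part of the original patch;
+ * my_handler, drv and mdev are hypothetical): a driver partitioning fault
+ * reporting by PASID registers one handler, then adds per-PASID data:
+ *
+ *	iommu_register_device_fault_handler(dev, my_handler, drv);
+ *	iommu_add_device_fault_data(dev, pasid, mdev);	// vector == PASID
+ *	...
+ *	iommu_delete_device_fault_data(dev, pasid);
+ *	iommu_unregister_device_fault_handler(dev);
+ */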
+
+/**
+ * iommu_add_device_fault_data() - add handler specific data
+ *
+ * For devices with partitioned resources, we may need to have multiple
+ * handler data that can be identified by the IOMMU driver. This function
+ * allows device drivers to add handler specific data associated with
+ * a vector. When the IOMMU detects a device fault and its vector, handlers
+ * can be invoked with the matching data.
+ * For page request service related to DMA requests with PASID, the vector
+ * is the PASID and the data is PASID associated data such as a mediated
+ * device. Vector 0 is reserved for default handler data when no per vector
+ * data is added to the device handler data list.
+ *
+ * @dev: the device
+ * @vector: identifies fault reporting data
+ * @data: opaque device handler data associated with the fault
+ */
+int iommu_add_device_fault_data(struct device *dev,
+				int vector, void *data)
+{
+	struct dev_iommu *param = dev->iommu;
+	struct iommu_fault_handler_data *hdata;
+	int ret = 0;
+
+	dev_dbg(dev, "%s: vector: %d data: %llx\n", __func__, vector, (u64)data);
+	/*
+	 * The fault handler must have been registered before adding handler
+	 * data. Vector 0 is reserved for the default data associated with
+	 * the handler.
+	 */
+	if (!param || !param->fault_param || !vector)
+		return -EINVAL;
+
+	mutex_lock(&param->lock);
+
+	ret = ioasid_get(NULL, vector);
+	if (ret) {
+		dev_err(dev, "Failed to get vector %d\n", vector);
+		goto unlock;
+	}
+
+	/* The vector must be unique, check if we have the same vector already */
+	list_for_each_entry(hdata, &param->fault_param->data, list) {
+		if (hdata->vector == vector) {
+			dev_err(dev, "IOMMU fault handler data exists for vector %d\n", vector);
+			ret = -EINVAL;
+			goto unlock;
+		}
+	}
+
+	hdata = kzalloc(sizeof(struct iommu_fault_handler_data), GFP_KERNEL);
+	if (!hdata) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	hdata->vector = vector;
+	hdata->data = data;
+	dev_dbg(dev, "Added IOMMU fault handler data %llx for vector %d\n",
+		(u64)data, vector);
+	list_add_tail(&hdata->list, &param->fault_param->data);
+
+unlock:
+	mutex_unlock(&param->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_add_device_fault_data);
+
+/**
+ * iommu_delete_device_fault_data() - delete handler specific data
+ *
+ * For devices with partitioned resources, we may need to have multiple
+ * handler data that can be identified by the IOMMU driver. This function
+ * allows device drivers to delete handler specific data associated with
+ * a vector.
+ * For page request service related to DMA requests with PASID, the vector
+ * is the PASID and the data is PASID associated data such as a mediated
+ * device.
+ * @dev: the device
+ * @vector: identifies fault reporting data to be removed
+ */
+void iommu_delete_device_fault_data(struct device *dev, int vector)
+{
+	struct dev_iommu *param = dev->iommu;
+	struct iommu_fault_handler_data *hdata, *tmp;
+
+	dev_dbg(dev, "%s: vector:%d\n", __func__, vector);
+	/*
+	 * The fault handler must have been registered before adding handler
+	 * data. Vector 0 is reserved for the default data associated with
+	 * the handler.
+	 */
+	if (!param || !param->fault_param || !vector)
+		return;
+
+	mutex_lock(&param->lock);
+
+	list_for_each_entry_safe(hdata, tmp, &param->fault_param->data, list) {
+		if (hdata->vector == vector) {
+			list_del(&hdata->list);
+			kfree(hdata);
+			dev_dbg(dev, "Deleted IOMMU fault handler data for vector %d\n", vector);
+			ioasid_put(NULL, vector); /* vector == pasid */
+			goto unlock;
+		}
+	}
+	dev_err(dev, "Failed to find handler data for vector %d\n", vector);
+
+unlock:
+	mutex_unlock(&param->lock);
+}
+EXPORT_SYMBOL_GPL(iommu_delete_device_fault_data);
+
 /**
  * iommu_unregister_device_fault_handler() - Unregister the device fault handler
  * @dev: the device
@@ -997,8 +1310,10 @@
  */
 int iommu_unregister_device_fault_handler(struct device *dev)
 {
-	struct iommu_param *param = dev->iommu_param;
+	struct dev_iommu *param = dev->iommu;
 	int ret = 0;
+	struct iommu_fault_event *evt, *next;
+	struct iommu_fault_handler_data *hdata, *tmp;
 
 	if (!param)
 		return -EINVAL;
@@ -1010,8 +1325,32 @@ int iommu_unregister_device_fault_handler(struct device *dev)
 
 	/* we cannot unregister handler if there are pending faults */
 	if (!list_empty(&param->fault_param->faults)) {
+		/*
+		 * REVISIT: We should not run into pending faults if we do
+		 * unbind first. The proper termination flow will ensure no
+		 * pending faults, as follows:
+		 * 1. pasid disable and tlb flush
+		 * 2. unbind, free, flush and drain
+		 * 3. unregister fault handler.
+		 */
+		pr_warn("%s: there are pending faults on dev %s; forcibly freeing the fault events and unregistering the fault handler. This change should be reverted once the page response path is ready\n",
+			__func__, dev_name(dev));
+		mutex_lock(&param->fault_param->lock);
+		list_for_each_entry_safe(evt, next, &param->fault_param->faults, list) {
+			dev_dbg(dev, "%s, free fault event: 0x%lx\n", __func__, (unsigned long)evt);
+			list_del(&evt->list);
+			kfree(evt);
+		}
+		mutex_unlock(&param->fault_param->lock);
+/*
 		ret = -EBUSY;
 		goto unlock;
+*/
+	}
+	/* Free any remaining handler data */
+	list_for_each_entry_safe(hdata, tmp, &param->fault_param->data, list) {
+		dev_dbg(dev, "%s: free handler data %llx vector %d\n", __func__,
+			(u64)hdata->data, hdata->vector);
+		list_del(&hdata->list);
+		kfree(hdata);
 	}
 
 	kfree(param->fault_param);
@@ -1037,10 +1376,14 @@ EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
  */
 int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
 {
-	struct iommu_param *param = dev->iommu_param;
+	struct dev_iommu *param = dev->iommu;
 	struct iommu_fault_event *evt_pending = NULL;
+	struct iommu_fault_handler_data *hdata;
 	struct iommu_fault_param *fparam;
+	struct timer_list *tmr;
+	void *handler_data = NULL;
 	int ret = 0;
+	u64 exp;
 
 	if (!param || !evt)
 		return -EINVAL;
@@ -1061,33 +1404,112 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
 			ret = -ENOMEM;
 			goto done_unlock;
 		}
+		/* Keep track of the response expiration time */
+		exp = get_jiffies_64() + prq_timeout;
+		evt_pending->expire = exp;
 		mutex_lock(&fparam->lock);
+		if (list_empty(&fparam->faults) && prq_timeout) {
+			/* First pending event, start the timer */
+			tmr = &dev->iommu->fault_param->timer;
+			WARN_ON(timer_pending(tmr));
+			mod_timer(tmr, exp);
+		}
+
 		list_add_tail(&evt_pending->list, &fparam->faults);
 		mutex_unlock(&fparam->lock);
 	}
 
-	ret = fparam->handler(&evt->fault, fparam->data);
+	if (!evt->vector) {
+		hdata = list_first_entry(&fparam->data,
+					 struct iommu_fault_handler_data, list);
+		handler_data = hdata->data;
+		dev_dbg(dev, "%s: default handler data %llx\n",
+			__func__, (u64)handler_data);
+	} else {
+		/* Find the data matching the vector */
+		list_for_each_entry(hdata, &param->fault_param->data, list) {
+			dev_dbg(dev, "Searching handler data vector %d to match %llu\n",
+				hdata->vector, evt->vector);
+
+			if (hdata->vector == evt->vector) {
+				handler_data = hdata->data;
+				dev_dbg(dev, "IOMMU report data %llx on fault vector %llu\n",
+					(u64)handler_data, evt->vector);
+				break;
+			}
+		}
+	}
+	if (!handler_data) {
+		dev_err(dev, "No valid handler data for vector %llu\n", evt->vector);
+		if (evt_pending)
+			list_del(&evt_pending->list);
+		ret = -ENODEV;
+		goto done_unlock;
+	}
+	dev_dbg(dev, "%s: calling handler with data %llx\n",
+		__func__, (u64)handler_data);
+
+	ret = fparam->handler(&evt->fault, handler_data);
+	trace_dev_fault(dev, &evt->fault);
 	if (ret && evt_pending) {
 		mutex_lock(&fparam->lock);
 		list_del(&evt_pending->list);
 		mutex_unlock(&fparam->lock);
 		kfree(evt_pending);
 	}
 done_unlock:
 	mutex_unlock(&param->lock);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_report_device_fault);
 
-int iommu_page_response(struct device *dev,
-			struct iommu_page_response *msg)
+static int iommu_page_response_prepare_msg(void __user *udata,
+					   struct iommu_page_response *msg)
+{
+	unsigned long minsz, maxsz;
+
+	/* The current kernel data size is the max to be copied from user */
+	maxsz = sizeof(struct iommu_page_response);
+	memset(msg, 0, maxsz);
+	minsz = offsetofend(struct iommu_page_response, code);
+
+	if (copy_from_user(msg, udata, minsz))
+		return -EFAULT;
+
+	if (msg->argsz < minsz)
+		return -EINVAL;
+
+	if (msg->argsz > maxsz)
+		msg->argsz = maxsz;
+
+	if (msg->version != IOMMU_PAGE_RESP_VERSION_1 ||
+	    !(msg->flags & IOMMU_PAGE_RESP_PASID_VALID)) {
+		pr_debug("%s: Invalid ver %x, flags %x\n",
+			 __func__, msg->version, msg->flags);
+		return -EINVAL;
+	}
+
+	/* Copy the remaining user data _after_ minsz, if there is any */
+	if ((msg->argsz - minsz) &&
+	    copy_from_user((void *)msg + minsz, udata + minsz,
+			   msg->argsz - minsz))
+		return -EFAULT;
+
+	return 0;
+}
+
+int iommu_page_response(struct iommu_domain *domain,
+			struct device *dev,
+			void __user *uinfo)
 {
-	bool pasid_valid;
+	bool needs_pasid;
 	int ret = -EINVAL;
+	struct iommu_page_response msg;
 	struct iommu_fault_event *evt;
 	struct iommu_fault_page_request *prm;
-	struct iommu_param *param = dev->iommu_param;
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct dev_iommu *param = dev->iommu;
+	bool has_pasid;
 
 	if (!domain || !domain->ops->page_response)
 		return -ENODEV;
@@ -1095,9 +1517,11 @@ int iommu_page_response(struct device *dev,
 	if (!param || !param->fault_param)
 		return -EINVAL;
 
-	if (msg->version != IOMMU_PAGE_RESP_VERSION_1 ||
-	    msg->flags & ~IOMMU_PAGE_RESP_PASID_VALID)
-		return -EINVAL;
+	ret = iommu_page_response_prepare_msg(uinfo, &msg);
+	if (ret)
+		return ret;
+
+	has_pasid = msg.flags & IOMMU_PAGE_RESP_PASID_VALID;
 
 	/* Only send response if there is a fault report pending */
 	mutex_lock(&param->fault_param->lock);
@@ -1111,21 +1535,39 @@ int iommu_page_response(struct device *dev,
 	 */
 	list_for_each_entry(evt, &param->fault_param->faults, list) {
 		prm = &evt->fault.prm;
-		pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+		if (prm->grpid != msg.grpid)
+			continue;
 
-		if ((pasid_valid && prm->pasid != msg->pasid) ||
-		    prm->grpid != msg->grpid)
+		/*
+		 * If the PASID is required, the corresponding request is
+		 * matched using the group ID, the PASID valid bit and the PASID
+		 * value. Otherwise only the group ID matches request and
+		 * response.
+		 */
+		needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
+		if (needs_pasid && (!has_pasid || msg.pasid != prm->pasid))
 			continue;
 
-		/* Sanitize the reply */
-		msg->flags = pasid_valid ? IOMMU_PAGE_RESP_PASID_VALID : 0;
+		if (!needs_pasid && has_pasid) {
+			/* No big deal, just clear it. */
*/ + msg.flags &= ~IOMMU_PAGE_RESP_PASID_VALID; + msg.pasid = 0; + } - ret = domain->ops->page_response(dev, evt, msg); + ret = domain->ops->page_response(domain, dev, evt, &msg); + trace_dev_page_response(dev, &msg); list_del(&evt->list); kfree(evt); break; } + /* stop response timer if there are no more pending requests */ + if (prq_timeout && list_empty(&param->fault_param->faults) && + timer_pending(&param->fault_param->timer)) { + pr_debug("no pending PRQ, stop timer\n"); + del_timer(&param->fault_param->timer); + } + done_unlock: mutex_unlock(&param->fault_param->lock); return ret; } @@ -1260,6 +1702,7 @@ struct iommu_group *generic_device_group(struct device *dev) { return iommu_group_alloc(); } +EXPORT_SYMBOL_GPL(generic_device_group); /* * Use standard PCI bus topology, isolation features, and DMA alias quirks @@ -1327,6 +1770,7 @@ struct iommu_group *pci_device_group(struct device *dev) /* No shared group found, allocate new */ return iommu_group_alloc(); } +EXPORT_SYMBOL_GPL(pci_device_group); /* Get the IOMMU group for device on fsl-mc bus */ struct iommu_group *fsl_mc_device_group(struct device *dev) @@ -1339,6 +1783,75 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) group = iommu_group_alloc(); return group; } +EXPORT_SYMBOL_GPL(fsl_mc_device_group); + +static int iommu_get_def_domain_type(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) + return IOMMU_DOMAIN_DMA; + + if (ops->def_domain_type) + return ops->def_domain_type(dev); + + return 0; +} + +static int iommu_group_alloc_default_domain(struct bus_type *bus, + struct iommu_group *group, + unsigned int type) +{ + struct iommu_domain *dom; + + dom = __iommu_domain_alloc(bus, type); + if (!dom && type != IOMMU_DOMAIN_DMA) { + dom = __iommu_domain_alloc(bus, IOMMU_DOMAIN_DMA); + if (dom) + pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA", + type, group->name); + } + + if (!dom) + return -ENOMEM; + + group->default_domain = dom; + if (!group->domain) + group->domain = dom; + + if (!iommu_dma_strict) { + int attr = 1; + iommu_domain_set_attr(dom, + DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, + &attr); + } + + return 0; +} + +static int iommu_alloc_default_domain(struct iommu_group *group, + struct device *dev) +{ +#ifdef CONFIG_SMMU_BYPASS_DEV + const struct iommu_ops *ops = dev->bus->iommu_ops; +#endif + unsigned int type = iommu_def_domain_type; + + if (group->default_domain) + return 0; + +#ifdef CONFIG_SMMU_BYPASS_DEV + /* direct allocate required default domain type for some specific devices. */ + if (ops->device_domain_type != NULL) { + if (ops->device_domain_type(dev, &type)) + type = iommu_def_domain_type; + } +#else + type = iommu_get_def_domain_type(dev) ? : iommu_def_domain_type; +#endif + + return iommu_group_alloc_default_domain(dev->bus, group, type); } /** * iommu_group_get_for_dev - Find or create the IOMMU group for a device @@ -1350,14 +1863,12 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) * to the returned IOMMU group, which will already include the provided * device. The reference should be released with iommu_group_put(). 
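+ * + * A minimal caller sketch (hypothetical, error handling elided): + * + *	group = iommu_group_get_for_dev(dev); + *	if (IS_ERR(group)) + *		return PTR_ERR(group); + *	...use the group... + *	iommu_group_put(group);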
*/ -struct iommu_group *iommu_group_get_for_dev(struct device *dev) +static struct iommu_group *iommu_group_get_for_dev(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; int ret; -#ifdef CONFIG_SMMU_BYPASS_DEV - unsigned int type = iommu_def_domain_type; -#endif + group = iommu_group_get(dev); if (group) return group; @@ -1372,53 +1883,16 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) if (IS_ERR(group)) return group; - /* - * Try to allocate a default domain - needs support from the - * IOMMU driver. - */ - if (!group->default_domain) { - struct iommu_domain *dom; - -#ifdef CONFIG_SMMU_BYPASS_DEV - /* direct allocate required default domain type for some specific devices. */ - if (ops->device_domain_type != NULL) { - if (ops->device_domain_type(dev, &type)) - type = iommu_def_domain_type; - } - - dom = __iommu_domain_alloc(dev->bus, type); - if (!dom && type != IOMMU_DOMAIN_DMA) { -#else - dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type); - if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) { -#endif - dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); - if (dom) { - dev_warn(dev, - "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", - iommu_def_domain_type); - } - } - - group->default_domain = dom; - if (!group->domain) - group->domain = dom; - - if (dom && !iommu_dma_strict) { - int attr = 1; - iommu_domain_set_attr(dom, - DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, - &attr); - } - } - ret = iommu_group_add_device(group, dev); - if (ret) { - iommu_group_put(group); - return ERR_PTR(ret); - } + if (ret) + goto out_put_group; return group; + +out_put_group: + iommu_group_put(group); + + return ERR_PTR(ret); } struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) @@ -1426,15 +1900,20 @@ struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) return group->default_domain; } -static int add_iommu_group(struct device *dev, void *data) +static int probe_iommu_group(struct device *dev, void *data) { - int ret = iommu_probe_device(dev); + struct list_head *group_list = data; + struct iommu_group *group; + int ret; - /* - * We ignore -ENODEV errors for now, as they just mean that the - * device is not translated by an IOMMU. We still care about - * other errors and fail to initialize when they happen. 
- */ + /* Device is probed already if in a group */ + group = iommu_group_get(dev); + if (group) { + iommu_group_put(group); + return 0; + } + + ret = __iommu_probe_device(dev, group_list); if (ret == -ENODEV) ret = 0; @@ -1500,10 +1979,147 @@ static int iommu_bus_notifier(struct notifier_block *nb, return 0; } +struct __group_domain_type { + struct device *dev; + unsigned int type; +}; + +static int probe_get_default_domain_type(struct device *dev, void *data) +{ + struct __group_domain_type *gtype = data; + unsigned int type = iommu_get_def_domain_type(dev); + + if (type) { + if (gtype->type && gtype->type != type) { + dev_warn(dev, "Device needs domain type %s, but device %s in the same iommu group requires type %s - using default\n", + iommu_domain_type_str(type), + dev_name(gtype->dev), + iommu_domain_type_str(gtype->type)); + gtype->type = 0; + } + + if (!gtype->dev) { + gtype->dev = dev; + gtype->type = type; + } + } + + return 0; +} + +static void probe_alloc_default_domain(struct bus_type *bus, + struct iommu_group *group) +{ + struct __group_domain_type gtype; + + memset(&gtype, 0, sizeof(gtype)); + + /* Ask for default domain requirements of all devices in the group */ + __iommu_group_for_each_dev(group, &gtype, + probe_get_default_domain_type); + + if (!gtype.type) + gtype.type = iommu_def_domain_type; + + iommu_group_alloc_default_domain(bus, group, gtype.type); + +} + +static int iommu_group_do_dma_attach(struct device *dev, void *data) +{ + struct iommu_domain *domain = data; + int ret = 0; + + if (!iommu_is_attach_deferred(domain, dev)) + ret = __iommu_attach_device(domain, dev); + + return ret; +} + +static int __iommu_group_dma_attach(struct iommu_group *group) +{ + return __iommu_group_for_each_dev(group, group->default_domain, + iommu_group_do_dma_attach); +} + +static int iommu_group_do_probe_finalize(struct device *dev, void *data) +{ + struct iommu_domain *domain = data; + + if (domain->ops->probe_finalize) + domain->ops->probe_finalize(dev); + + return 0; +} + +static void __iommu_group_dma_finalize(struct iommu_group *group) +{ + __iommu_group_for_each_dev(group, group->default_domain, + iommu_group_do_probe_finalize); +} +static int iommu_do_create_direct_mappings(struct device *dev, void *data) +{ + struct iommu_group *group = data; + + iommu_create_device_direct_mappings(group, dev); + + return 0; +} + +static int iommu_group_create_direct_mappings(struct iommu_group *group) +{ + return __iommu_group_for_each_dev(group, group, + iommu_do_create_direct_mappings); +} + +int bus_iommu_probe(struct bus_type *bus) +{ + struct iommu_group *group, *next; + LIST_HEAD(group_list); + int ret; + + /* + * This code-path does not allocate the default domain when + * creating the iommu group, so do it after the groups are + * created. 
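+ * (probe_iommu_group() skips devices that already had a group, so the + * list walked below should only contain groups created by this pass.)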
+ */ + ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); + if (ret) + return ret; + + list_for_each_entry_safe(group, next, &group_list, entry) { + /* Remove item from the list */ + list_del_init(&group->entry); + + mutex_lock(&group->mutex); + + /* Try to allocate default domain */ + probe_alloc_default_domain(bus, group); + + if (!group->default_domain) { + mutex_unlock(&group->mutex); + continue; + } + + iommu_group_create_direct_mappings(group); + + ret = __iommu_group_dma_attach(group); + + mutex_unlock(&group->mutex); + + if (ret) + break; + + __iommu_group_dma_finalize(group); + } + + return ret; +} + static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops) { - int err; struct notifier_block *nb; + int err; nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); if (!nb) @@ -1515,7 +2131,7 @@ static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops) if (err) goto out_free; - err = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + err = bus_iommu_probe(bus); if (err) goto out_err; @@ -1550,6 +2166,11 @@ int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops) { int err; + if (ops == NULL) { + bus->iommu_ops = NULL; + return 0; + } + if (bus->iommu_ops != NULL) return -EBUSY; @@ -1619,6 +2240,11 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, /* Assume all sizes by default; the driver may override this later */ domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; + /* Temporarily avoid -EEXIST while drivers still get their own cookies */ + if (iommu_is_dma_domain(domain) && !domain->iova_cookie && iommu_get_dma_cookie(domain)) { + iommu_domain_free(domain); + domain = NULL; + } return domain; } @@ -1630,6 +2256,7 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc); void iommu_domain_free(struct iommu_domain *domain) { + iommu_put_dma_cookie(domain); domain->ops->domain_free(domain); } EXPORT_SYMBOL_GPL(iommu_domain_free); @@ -1638,9 +2265,6 @@ static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev) { int ret; - if ((domain->ops->is_attach_deferred != NULL) && - domain->ops->is_attach_deferred(domain, dev)) - return 0; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; @@ -1679,11 +2303,296 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_attach_device); +int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) +{ + const struct iommu_ops *ops = domain->ops; + + if (ops->is_attach_deferred && ops->is_attach_deferred(domain, dev)) + return __iommu_attach_device(domain, dev); + + return 0; +} + +/* + * Check flags and other user provided data for valid combinations. We also + * make sure no reserved fields or unused flags are set. This is to ensure + * not breaking userspace in the future when these fields or flags are used. 
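+ * For example, the switch below rejects an address-granularity request + * that also claims PASID-cache scope, and a domain-granularity request + * that targets the device IOTLB.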
+ */ +static int iommu_check_cache_invl_data(struct iommu_cache_invalidate_info *info) +{ + u32 mask; + int i; + + if (info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) + return -EINVAL; + + mask = (1 << IOMMU_CACHE_INV_TYPE_NR) - 1; + if (info->cache & ~mask) + return -EINVAL; + + if (info->granularity >= IOMMU_INV_GRANU_NR) + return -EINVAL; + + switch (info->granularity) { + case IOMMU_INV_GRANU_ADDR: + if (info->cache & IOMMU_CACHE_INV_TYPE_PASID) + return -EINVAL; + + mask = IOMMU_INV_ADDR_FLAGS_PASID | + IOMMU_INV_ADDR_FLAGS_ARCHID | + IOMMU_INV_ADDR_FLAGS_LEAF; + + if (info->granu.addr_info.flags & ~mask) + return -EINVAL; + break; + case IOMMU_INV_GRANU_PASID: + mask = IOMMU_INV_PASID_FLAGS_PASID | + IOMMU_INV_PASID_FLAGS_ARCHID; + if (info->granu.pasid_info.flags & ~mask) + return -EINVAL; + + break; + case IOMMU_INV_GRANU_DOMAIN: + if (info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB) + return -EINVAL; + break; + default: + return -EINVAL; + } + + /* Check reserved padding fields */ + for (i = 0; i < sizeof(info->padding); i++) { + if (info->padding[i]) + return -EINVAL; + } + + return 0; +} + +int iommu_uapi_cache_invalidate(struct iommu_domain *domain, struct device *dev, + void __user *uinfo) +{ + struct iommu_cache_invalidate_info inv_info = { 0 }; + u32 minsz; + int ret; + + if (unlikely(!domain->ops->cache_invalidate)) + return -ENODEV; + + /* + * No new spaces can be added before the variable sized union, the + * minimum size is the offset to the union. + */ + minsz = offsetof(struct iommu_cache_invalidate_info, granu); + + /* Copy minsz from user to get flags and argsz */ + if (copy_from_user(&inv_info, uinfo, minsz)) + return -EFAULT; + + /* Fields before the variable size union are mandatory */ + if (inv_info.argsz < minsz) + return -EINVAL; + + /* PASID and address granu require additional info beyond minsz */ + if (inv_info.granularity == IOMMU_INV_GRANU_PASID && + inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.pasid_info)) + return -EINVAL; + + if (inv_info.granularity == IOMMU_INV_GRANU_ADDR && + inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.addr_info)) + return -EINVAL; + + /* + * User might be using a newer UAPI header which has a larger data + * size, we shall support the existing flags within the current + * size. Copy the remaining user data _after_ minsz but not more + * than the current kernel supported size. 
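+ * (UAPI structs may only grow by appending new fields, so truncating + * the copy at sizeof(inv_info) is always safe here.)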
+ */ + if (copy_from_user((void *)&inv_info + minsz, uinfo + minsz, + min_t(u32, inv_info.argsz, sizeof(inv_info)) - minsz)) + return -EFAULT; + + /* Now the argsz is validated, check the content */ + ret = iommu_check_cache_invl_data(&inv_info); + if (ret) + return ret; + + return domain->ops->cache_invalidate(domain, dev, &inv_info); +} +EXPORT_SYMBOL_GPL(iommu_uapi_cache_invalidate); + +static int iommu_check_bind_data(struct iommu_gpasid_bind_data *data) +{ + u32 mask; + int i; + + if (data->version != IOMMU_GPASID_BIND_VERSION_1) + return -EINVAL; + + /* Check the range of supported formats */ + if (data->format >= IOMMU_PASID_FORMAT_LAST) + return -EINVAL; + + /* Check all flags */ + mask = IOMMU_SVA_GPASID_VAL | IOMMU_SVA_HPASID_DEF | IOMMU_SVA_SL_ONLY; + if (data->flags & ~mask) + return -EINVAL; + + /* Check reserved padding fields */ + for (i = 0; i < sizeof(data->padding); i++) { + if (data->padding[i]) + return -EINVAL; + } + + return 0; +} + +static int iommu_sva_prepare_bind_data(void __user *udata, + struct iommu_gpasid_bind_data *data) +{ + u32 minsz; + + /* + * No new spaces can be added before the variable sized union, the + * minimum size is the offset to the union. + */ + minsz = offsetof(struct iommu_gpasid_bind_data, vendor); + + /* Copy minsz from user to get flags and argsz */ + if (copy_from_user(data, udata, minsz)) + return -EFAULT; + + /* Fields before the variable size union are mandatory */ + if (data->argsz < minsz) + return -EINVAL; + /* + * User might be using a newer UAPI header, we shall let IOMMU vendor + * driver decide on what size it needs. Since the guest PASID bind data + * can be vendor specific, a larger argsz could be the result of an + * extension for one vendor and should not affect another vendor. + * Copy the remaining user data _after_ minsz + */ + if (copy_from_user((void *)data + minsz, udata + minsz, + min_t(u32, data->argsz, sizeof(*data)) - minsz)) + return -EFAULT; + + return iommu_check_bind_data(data); +} + + +/* + * Caller could provide fault_data to differentiate future page + * requests from the device. This is helpful for page request + * handling for partial assignments of physical devices, e.g. + * mediated device assignment or other sub-device solutions. + */ +int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, + void __user *udata, void *fault_data) +{ + struct iommu_gpasid_bind_data data = { 0 }; + int ret; + + if (unlikely(!domain->ops->sva_bind_gpasid)) + return -ENODEV; + + ret = iommu_sva_prepare_bind_data(udata, &data); + if (ret) + return ret; + + ret = ioasid_get(NULL, data.hpasid); + if (ret) + return ret; + + ret = domain->ops->sva_bind_gpasid(domain, dev, &data, fault_data); + ioasid_put(NULL, data.hpasid); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); + +int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid, u64 flags) +{ + pr_warn("%s: FIXME need to clear all pending faults!\n", __func__); +#if 0 + struct dev_iommu *param = dev->iommu; + /* FIXME: clear all pending page requests */ + struct iommu_page_response msg; + int ret = -EINVAL; + struct iommu_fault_event *evt, *tmp; + + if (!domain || !domain->ops->page_response) + return -ENODEV; + + /* + * Device dev_iommu should have been allocated when device is + * added to its iommu_group. 
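+ * (Note: this drain path is compiled out with #if 0 above until the + * page response plumbing is complete; see the pr_warn() at the top.)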
+ */ + if (!param || !param->fault_param) + return -EINVAL; + + /* Only send response if there is a fault report pending */ + mutex_lock(&param->fault_param->lock); + if (list_empty(&param->fault_param->faults)) { + pr_warn("no pending PRQ, drop response\n"); + goto done_unlock; + } + + /* Clear all pending page requests and return response code INVALID */ + list_for_each_entry_safe(evt, tmp, &param->fault_param->faults, list) { + if (evt->fault.prm.pasid == pasid) { + memcpy(&msg.private_data, &evt->private_data, sizeof(evt->private_data)); + msg.pasid = pasid; + msg.flags |= IOMMU_PAGE_RESP_PASID_VALID; + msg.priv_data_present = 1; + msg.grpid = evt->fault.prm.grpid; + msg.code = IOMMU_PAGE_RESP_INVALID; + trace_dev_page_response(dev, &msg); + dev_dbg(dev, "Clear pending PRQ for PASID %d grp %d resp code IR\n", + pasid, msg.grpid); + ret = domain->ops->page_response(dev, &msg, evt); + list_del(&evt->list); + kfree(evt); + } + } + +done_unlock: + mutex_unlock(&param->fault_param->lock); +#endif + if (unlikely(!domain->ops->sva_unbind_gpasid)) + return -ENODEV; + + return domain->ops->sva_unbind_gpasid(domain, dev, pasid, flags); +} +EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); + +int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, + void __user *udata) +{ + struct iommu_gpasid_bind_data data = { 0 }; + int ret; + + if (unlikely(!domain->ops->sva_bind_gpasid)) + return -ENODEV; + + ret = iommu_sva_prepare_bind_data(udata, &data); + if (ret) + return ret; + + ret = ioasid_get(NULL, data.hpasid); + if (ret) + return ret; + ret = iommu_sva_unbind_gpasid(domain, dev, data.hpasid, data.flags); + ioasid_put(NULL, data.hpasid); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid); + static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - if ((domain->ops->is_attach_deferred != NULL) && - domain->ops->is_attach_deferred(domain, dev)) + if (iommu_is_attach_deferred(domain, dev)) return; if (unlikely(domain->ops->detach_dev == NULL)) @@ -1829,7 +2738,10 @@ EXPORT_SYMBOL_GPL(iommu_detach_group); phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { - if (unlikely(domain->ops->iova_to_phys == NULL)) + if (domain->type == IOMMU_DOMAIN_IDENTITY) + return iova; + + if (domain->type == IOMMU_DOMAIN_BLOCKED) return 0; return domain->ops->iova_to_phys(domain, iova); @@ -1868,8 +2780,8 @@ static size_t iommu_pgsize(struct iommu_domain *domain, return pgsize; } -int iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot) +static int __iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { const struct iommu_ops *ops = domain->ops; unsigned long orig_iova = iova; @@ -1906,8 +2818,8 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", iova, &paddr, pgsize); + ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); - ret = ops->map(domain, iova, paddr, pgsize, prot); if (ret) break; @@ -1916,9 +2828,6 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, size -= pgsize; } - if (ops->iotlb_sync_map) - ops->iotlb_sync_map(domain); - /* unroll mapping in case something went wrong */ if (ret) iommu_unmap(domain, orig_iova, orig_size - size); @@ -1927,8 +2836,35 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, return ret; } + +static int _iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int 
prot, gfp_t gfp) +{ + const struct iommu_ops *ops = domain->ops; + int ret; + + ret = __iommu_map(domain, iova, paddr, size, prot, gfp); + if (ret == 0 && ops->iotlb_sync_map) + ops->iotlb_sync_map(domain, iova, size); + + return ret; +} + +int iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot) +{ + might_sleep(); + return _iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); +} EXPORT_SYMBOL_GPL(iommu_map); +int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot) +{ + return _iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC); +} +EXPORT_SYMBOL_GPL(iommu_map_atomic); + static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) @@ -1991,7 +2927,7 @@ size_t iommu_unmap(struct iommu_domain *domain, iommu_iotlb_gather_init(&iotlb_gather); ret = __iommu_unmap(domain, iova, size, &iotlb_gather); - iommu_tlb_sync(domain, &iotlb_gather); + iommu_iotlb_sync(domain, &iotlb_gather); return ret; } @@ -2005,9 +2941,11 @@ size_t iommu_unmap_fast(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_unmap_fast); -size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg, unsigned int nents, int prot) +static ssize_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot, + gfp_t gfp) { + const struct iommu_ops *ops = domain->ops; size_t len = 0, mapped = 0; phys_addr_t start; unsigned int i = 0; @@ -2017,7 +2955,9 @@ size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, phys_addr_t s_phys = sg_phys(sg); if (len && s_phys != start + len) { - ret = iommu_map(domain, iova + mapped, start, len, prot); + ret = __iommu_map(domain, iova + mapped, start, + len, prot, gfp); + if (ret) goto out_err; @@ -2036,17 +2976,31 @@ size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, sg = sg_next(sg); } + if (ops->iotlb_sync_map) + ops->iotlb_sync_map(domain, iova, mapped); return mapped; out_err: /* undo mappings already done */ iommu_unmap(domain, iova, mapped); - return 0; + return ret; +} +ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot) +{ + might_sleep(); + return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_KERNEL); } EXPORT_SYMBOL_GPL(iommu_map_sg); +ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot) +{ + return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC); +} + int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot) { @@ -2058,15 +3012,6 @@ int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, } EXPORT_SYMBOL_GPL(iommu_domain_window_enable); -void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr) -{ - if (unlikely(domain->ops->domain_window_disable == NULL)) - return; - - return domain->ops->domain_window_disable(domain, wnd_nr); -} -EXPORT_SYMBOL_GPL(iommu_domain_window_disable); - /** * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework * @domain: the iommu domain where the fault has happened @@ -2107,19 +3052,157 @@ int report_iommu_fault(struct iommu_domain *domain, struct device *dev, trace_io_page_fault(dev, iova, flags); return ret; } -EXPORT_SYMBOL_GPL(report_iommu_fault); 
+EXPORT_SYMBOL_GPL(report_iommu_fault); + +static int __init iommu_init(void) +{ + iommu_group_kset = kset_create_and_add("iommu_groups", + NULL, kernel_kobj); + BUG_ON(!iommu_group_kset); + + iommu_debugfs_setup(); + + return 0; +} +core_initcall(iommu_init); + +static int __iommu_merge_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size) +{ + const struct iommu_ops *ops = domain->ops; + unsigned int min_pagesz; + size_t pgsize; + int ret = 0; + + if (unlikely(!ops)) + return -ENODEV; + + if (unlikely(!ops->merge_pages)) { + pr_warn("don't support merge_pages\n"); + return ret; + } + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n", + iova, &paddr, size, min_pagesz); + return -EINVAL; + } + + while (size) { + pgsize = iommu_pgsize(domain, iova | paddr, size); + + ret = ops->merge_pages(domain, iova, paddr, pgsize); + if (ret) + break; + + pr_debug("merge handled: iova 0x%lx pa %pa size 0x%zx\n", + iova, &paddr, pgsize); + + iova += pgsize; + paddr += pgsize; + size -= pgsize; + } + + return ret; +} + +static int iommu_merge_pages(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + phys_addr_t phys; + dma_addr_t p, i; + size_t cont_size; + int ret = 0; + + while (size) { + phys = iommu_iova_to_phys(domain, iova); + cont_size = PAGE_SIZE; + p = phys + cont_size; + i = iova + cont_size; + + while (cont_size < size && p == iommu_iova_to_phys(domain, i)) { + p += PAGE_SIZE; + i += PAGE_SIZE; + cont_size += PAGE_SIZE; + } + + ret = __iommu_merge_pages(domain, iova, phys, cont_size); + if (ret) + break; + + iova += cont_size; + size -= cont_size; + } + iommu_flush_iotlb_all(domain); + + return ret; +} + +static int iommu_split_block(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + const struct iommu_ops *ops = domain->ops; + unsigned int min_pagesz; + size_t pgsize; + int ret = 0; + + if (unlikely(!ops)) + return -ENODEV; + + if (unlikely(!ops->split_block)) { + pr_warn("don't support split_block\n"); + return ret; + } + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", + iova, size, min_pagesz); + return -EINVAL; + } + + while (size) { + pgsize = iommu_pgsize(domain, iova, size); + + ret = ops->split_block(domain, iova, pgsize); + if (ret) + break; + + pr_debug("split handled: iova 0x%lx size 0x%zx\n", iova, pgsize); + + iova += pgsize; + size -= pgsize; + } + iommu_flush_iotlb_all(domain); + + return ret; +} -static int __init iommu_init(void) +int iommu_domain_set_hwdbm(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size) { - iommu_group_kset = kset_create_and_add("iommu_groups", - NULL, kernel_kobj); - BUG_ON(!iommu_group_kset); + const struct iommu_ops *ops = domain->ops; + int ret = 0; - iommu_debugfs_setup(); + if (!ops || !ops->set_hwdbm) { + pr_err_ratelimited("Don't support set_hwdbm\n"); + return -EINVAL; + } - return 0; + ret = ops->set_hwdbm(domain, enable, iova, size); + if (ret) + return ret; + + if (enable) + ret = iommu_split_block(domain, iova, size); + else + ret = iommu_merge_pages(domain, iova, size); + + return ret; } -core_initcall(iommu_init); +EXPORT_SYMBOL_GPL(iommu_domain_set_hwdbm); int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) @@ -2166,6 +3249,111 @@ int iommu_domain_set_attr(struct iommu_domain 
*domain, } EXPORT_SYMBOL_GPL(iommu_domain_set_attr); +int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova, + size_t size, unsigned long *bitmap, + unsigned long base_iova, unsigned long bitmap_pgshift) +{ + const struct iommu_ops *ops = domain->ops; + unsigned int min_pagesz; + size_t pgsize; + int ret; + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", + iova, size, min_pagesz); + return -EINVAL; + } + + if (!ops || !ops->sync_dirty_log) { + pr_err("don't support sync dirty log\n"); + return -ENODEV; + } + + while (size) { + pgsize = iommu_pgsize(domain, iova, size); + + ret = ops->sync_dirty_log(domain, iova, pgsize, + bitmap, base_iova, bitmap_pgshift); + if (ret) + break; + + pr_debug("dirty_log_sync: iova 0x%lx pagesz 0x%zx\n", iova, + pgsize); + + iova += pgsize; + size -= pgsize; + } + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_sync_dirty_log); + +static int __iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + const struct iommu_ops *ops = domain->ops; + size_t pgsize; + int ret = 0; + + if (!ops || !ops->clear_dirty_log) { + pr_err("don't support clear dirty log\n"); + return -ENODEV; + } + + while (size) { + pgsize = iommu_pgsize(domain, iova, size); + ret = ops->clear_dirty_log(domain, iova, pgsize, bitmap, + base_iova, bitmap_pgshift); + if (ret) + break; + + pr_debug("dirty_log_clear: iova 0x%lx pagesz 0x%zx\n", iova, + pgsize); + + iova += pgsize; + size -= pgsize; + } + + return ret; +} + +int iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift) +{ + unsigned long riova, rsize; + unsigned int min_pagesz; + int rs, re, start, end, ret = 0; + + min_pagesz = 1 << __ffs(domain->pgsize_bitmap); + + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx min_pagesz 0x%x\n", + iova, min_pagesz); + return -EINVAL; + } + + start = (iova - base_iova) >> bitmap_pgshift; + end = start + (size >> bitmap_pgshift); + bitmap_for_each_set_region(bitmap, rs, re, start, end) { + riova = iova + (rs << bitmap_pgshift); + rsize = (re - rs) << bitmap_pgshift; + ret = __iommu_clear_dirty_log(domain, riova, rsize, bitmap, + base_iova, bitmap_pgshift); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_clear_dirty_log); + void iommu_get_resv_regions(struct device *dev, struct list_head *list) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -2182,6 +3370,25 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list) ops->put_resv_regions(dev, list); } +/** + * generic_iommu_put_resv_regions - Reserved region driver helper + * @dev: device for which to free reserved regions + * @list: reserved region list for device + * + * IOMMU drivers can use this to implement their .put_resv_regions() callback + * for simple reservations. Memory allocated for each reserved region will be + * freed. If an IOMMU driver allocates additional resources per region, it is + * going to have to implement a custom callback. 
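+ * + * A driver whose regions all come from iommu_alloc_resv_region() can + * simply point .put_resv_regions at this helper in its iommu_ops.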
+ */ +void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list) +{ + struct iommu_resv_region *entry, *next; + + list_for_each_entry_safe(entry, next, list, list) + kfree(entry); +} +EXPORT_SYMBOL(generic_iommu_put_resv_regions); + struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, enum iommu_resv_type type) @@ -2199,71 +3406,7 @@ struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, region->type = type; return region; } - -static int -request_default_domain_for_dev(struct device *dev, unsigned long type) -{ - struct iommu_domain *domain; - struct iommu_group *group; - int ret; - - /* Device must already be in a group before calling this function */ - group = iommu_group_get(dev); - if (!group) - return -EINVAL; - - mutex_lock(&group->mutex); - - ret = 0; - if (group->default_domain && group->default_domain->type == type) - goto out; - - /* Don't change mappings of existing devices */ - ret = -EBUSY; - if (iommu_group_device_count(group) != 1) - goto out; - - ret = -ENOMEM; - domain = __iommu_domain_alloc(dev->bus, type); - if (!domain) - goto out; - - /* Attach the device to the domain */ - ret = __iommu_attach_group(domain, group); - if (ret) { - iommu_domain_free(domain); - goto out; - } - - /* Make the domain the default for this group */ - if (group->default_domain) - iommu_domain_free(group->default_domain); - group->default_domain = domain; - - iommu_group_create_direct_mappings(group, dev); - - dev_info(dev, "Using iommu %s mapping\n", - type == IOMMU_DOMAIN_DMA ? "dma" : "direct"); - - ret = 0; -out: - mutex_unlock(&group->mutex); - iommu_group_put(group); - - return ret; -} - -/* Request that a device is direct mapped by the IOMMU */ -int iommu_request_dm_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_IDENTITY); -} - -/* Request that a device can't be direct mapped by the IOMMU */ -int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_DMA); -} +EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); void iommu_set_default_passthrough(bool cmd_line) { @@ -2310,7 +3453,11 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, if (fwspec) return ops == fwspec->ops ? 0 : -EINVAL; - fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL); + if (!dev_iommu_get(dev)) + return -ENOMEM; + + /* Preallocate for the overwhelmingly common case of 1 ID */ + fwspec = kzalloc(struct_size(fwspec, ids, 1), GFP_KERNEL); if (!fwspec) return -ENOMEM; @@ -2337,15 +3484,15 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - size_t size; - int i; + int i, new_num; if (!fwspec) return -EINVAL; - size = offsetof(struct iommu_fwspec, ids[fwspec->num_ids + num_ids]); - if (size > sizeof(*fwspec)) { - fwspec = krealloc(fwspec, size, GFP_KERNEL); + new_num = fwspec->num_ids + num_ids; + if (new_num > 1) { + fwspec = krealloc(fwspec, struct_size(fwspec, ids, new_num), + GFP_KERNEL); if (!fwspec) return -ENOMEM; @@ -2355,7 +3502,7 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) for (i = 0; i < num_ids; i++) fwspec->ids[fwspec->num_ids + i] = ids[i]; - fwspec->num_ids += num_ids; + fwspec->num_ids = new_num; return 0; } EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); @@ -2363,23 +3510,14 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); /* * Per device IOMMU features. 
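+ * (i.e. enum iommu_dev_features such as IOMMU_DEV_FEAT_SVA, toggled on + * a single device rather than on a whole domain.)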
*/ -bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; - - if (ops && ops->dev_has_feat) - return ops->dev_has_feat(dev, feat); - - return false; -} -EXPORT_SYMBOL_GPL(iommu_dev_has_feature); - int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) { - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (dev->iommu && dev->iommu->iommu_dev) { + const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; - if (ops && ops->dev_enable_feat) - return ops->dev_enable_feat(dev, feat); + if (ops->dev_enable_feat) + return ops->dev_enable_feat(dev, feat); + } return -ENODEV; } @@ -2392,10 +3530,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_enable_feature); */ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) { - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (dev->iommu && dev->iommu->iommu_dev) { + const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; - if (ops && ops->dev_disable_feat) - return ops->dev_disable_feat(dev, feat); + if (ops->dev_disable_feat) + return ops->dev_disable_feat(dev, feat); + } return -EBUSY; } @@ -2403,10 +3543,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat) { - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (dev->iommu && dev->iommu->iommu_dev) { + const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; - if (ops && ops->dev_feat_enabled) - return ops->dev_feat_enabled(dev, feat); + if (ops->dev_feat_enabled) + return ops->dev_feat_enabled(dev, feat); + } return false; } @@ -2460,6 +3602,7 @@ EXPORT_SYMBOL_GPL(iommu_aux_get_pasid); * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device * @mm: the mm to bind, caller must hold a reference to it + * @drvdata: opaque data pointer to pass to bind callback * * Create a bond between device and address space, allowing the device to access * the mm using the returned PASID. If a bond already exists between @device and @@ -2538,18 +3681,7 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *sva_ops) -{ - if (handle->ops && handle->ops != sva_ops) - return -EEXIST; - - handle->ops = sva_ops; - return 0; -} -EXPORT_SYMBOL_GPL(iommu_sva_set_ops); - -int iommu_sva_get_pasid(struct iommu_sva *handle) +u32 iommu_sva_get_pasid(struct iommu_sva *handle) { const struct iommu_ops *ops = handle->dev->bus->iommu_ops; @@ -2559,3 +3691,236 @@ int iommu_sva_get_pasid(struct iommu_sva *handle) return ops->sva_get_pasid(handle); } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); + +/* + * Changes the default domain of an iommu group that has *only* one device + * + * @group: The group for which the default domain should be changed + * @prev_dev: The device in the group (this is used to make sure that the device + * hasn't changed after the caller has called this function) + * @type: The type of the new default domain that gets associated with the group + * + * Returns 0 on success and error code on failure + * + * Note: + * 1. Presently, this function is called only when user requests to change the + * group's default domain type through /sys/kernel/iommu_groups/<grp_id>/type + * Please take a closer look if intended to use for other purposes. 
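+ * 2. If a step after allocating the new domain fails, the group is rolled + * back to the previous default domain (see free_new_domain below).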
+ */ +static int iommu_change_dev_def_domain(struct iommu_group *group, + struct device *prev_dev, int type) +{ + struct iommu_domain *prev_dom; + struct group_device *grp_dev; + int ret, dev_def_dom; + struct device *dev; + + mutex_lock(&group->mutex); + + if (group->default_domain != group->domain) { + dev_err_ratelimited(prev_dev, "Group not assigned to default domain\n"); + ret = -EBUSY; + goto out; + } + + /* + * iommu group wasn't locked while acquiring device lock in + * iommu_group_store_type(). So, make sure that the device count hasn't + * changed while acquiring device lock. + * + * Changing default domain of an iommu group with two or more devices + * isn't supported because there could be a potential deadlock. Consider + * the following scenario. T1 is trying to acquire device locks of all + * the devices in the group and before it could acquire all of them, + * there could be another thread T2 (from different sub-system and use + * case) that has already acquired some of the device locks and might be + * waiting for T1 to release other device locks. + */ + if (iommu_group_device_count(group) != 1) { + dev_err_ratelimited(prev_dev, "Cannot change default domain: Group has more than one device\n"); + ret = -EINVAL; + goto out; + } + + /* Since group has only one device */ + grp_dev = list_first_entry(&group->devices, struct group_device, list); + dev = grp_dev->dev; + + if (prev_dev != dev) { + dev_err_ratelimited(prev_dev, "Cannot change default domain: Device has been changed\n"); + ret = -EBUSY; + goto out; + } + + prev_dom = group->default_domain; + if (!prev_dom) { + ret = -EINVAL; + goto out; + } + + dev_def_dom = iommu_get_def_domain_type(dev); + if (!type) { + /* + * If the user hasn't requested any specific type of domain and + * if the device supports both the domains, then default to the + * domain the device was booted with + */ + type = dev_def_dom ? : iommu_def_domain_type; + } else if (dev_def_dom && type != dev_def_dom) { + dev_err_ratelimited(prev_dev, "Device cannot be in %s domain\n", + iommu_domain_type_str(type)); + ret = -EINVAL; + goto out; + } + + /* + * Switch to a new domain only if the requested domain type is different + * from the existing default domain type + */ + if (prev_dom->type == type) { + ret = 0; + goto out; + } + + /* We can bring up a flush queue without tearing down the domain */ + if (type == IOMMU_DOMAIN_DMA_FQ && prev_dom->type == IOMMU_DOMAIN_DMA) { + ret = iommu_dma_init_fq(prev_dom); + if (!ret) + prev_dom->type = IOMMU_DOMAIN_DMA_FQ; + goto out; + } + + /* Sets group->default_domain to the newly allocated domain */ + ret = iommu_group_alloc_default_domain(dev->bus, group, type); + if (ret) + goto out; + + ret = iommu_create_device_direct_mappings(group, dev); + if (ret) + goto free_new_domain; + + ret = __iommu_attach_device(group->default_domain, dev); + if (ret) + goto free_new_domain; + + group->domain = group->default_domain; + + /* + * Release the mutex here because ops->probe_finalize() call-back of + * some vendor IOMMU drivers calls arm_iommu_attach_device() which + * in-turn might call back into IOMMU core code, where it tries to take + * group->mutex, resulting in a deadlock. 
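+ * (iommu_attach_device(), for example, takes group->mutex internally.)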
+ */ + mutex_unlock(&group->mutex); + + /* Make sure dma_ops is appropriately set */ + iommu_group_do_probe_finalize(dev, group->default_domain); + iommu_domain_free(prev_dom); + return 0; + +free_new_domain: + iommu_domain_free(group->default_domain); + group->default_domain = prev_dom; + group->domain = prev_dom; + +out: + mutex_unlock(&group->mutex); + + return ret; +} + +/* + * Changing the default domain through sysfs requires the users to unbind the + * drivers from the devices in the iommu group, except for a DMA -> DMA-FQ + * transition. Return failure if this isn't met. + * + * We need to consider the race between this and the device release path. + * device_lock(dev) is used here to guarantee that the device release path + * will not be entered at the same time. + */ +static ssize_t iommu_group_store_type(struct iommu_group *group, + const char *buf, size_t count) +{ + struct group_device *grp_dev; + struct device *dev; + int ret, req_type; + + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) + return -EACCES; + + if (WARN_ON(!group)) + return -EINVAL; + + if (sysfs_streq(buf, "identity")) + req_type = IOMMU_DOMAIN_IDENTITY; + else if (sysfs_streq(buf, "DMA")) + req_type = IOMMU_DOMAIN_DMA; + else if (sysfs_streq(buf, "DMA-FQ")) + req_type = IOMMU_DOMAIN_DMA_FQ; + else if (sysfs_streq(buf, "auto")) + req_type = 0; + else + return -EINVAL; + + /* + * Lock/Unlock the group mutex here before device lock to + * 1. Make sure that the iommu group has only one device (this is a + * prerequisite for step 2) + * 2. Get struct *dev which is needed to lock device + */ + mutex_lock(&group->mutex); + if (iommu_group_device_count(group) != 1) { + mutex_unlock(&group->mutex); + pr_err_ratelimited("Cannot change default domain: Group has more than one device\n"); + return -EINVAL; + } + + /* Since group has only one device */ + grp_dev = list_first_entry(&group->devices, struct group_device, list); + dev = grp_dev->dev; + get_device(dev); + + /* + * Don't hold the group mutex because taking group mutex first and then + * the device lock could potentially cause a deadlock as below. Assume + * two threads T1 and T2. T1 is trying to change default domain of an + * iommu group and T2 is trying to hot unplug a device or release [1] VF + * of a PCIe device which is in the same iommu group. T1 takes group + * mutex and before it could take device lock assume T2 has taken device + * lock and is yet to take group mutex. Now, both the threads will be + * waiting for the other thread to release lock. The suggested lock + * order is: 
+ * device_lock(dev); + * mutex_lock(&group->mutex); + * iommu_change_dev_def_domain(); + * mutex_unlock(&group->mutex); + * device_unlock(dev); + * + * [1] Typical device release path + * device_lock() from device/driver core code + * -> bus_notifier() + * -> iommu_bus_notifier() + * -> iommu_release_device() + * -> ops->release_device() vendor driver calls back iommu core code + * -> mutex_lock() from iommu core code + */ + mutex_unlock(&group->mutex); + + /* Check if the device in the group still has a driver bound to it */ + device_lock(dev); + if (device_is_bound(dev) && !(req_type == IOMMU_DOMAIN_DMA_FQ && + group->default_domain->type == IOMMU_DOMAIN_DMA)) { + pr_err_ratelimited("Device is still bound to driver\n"); + ret = -EBUSY; + goto out; + } + + ret = iommu_change_dev_def_domain(group, dev, req_type); + ret = ret ?: count; + +out: + device_unlock(dev); + put_device(dev); + + return ret; +} diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 612cbf668adf8667172d46784463cfdbfaea5409..8c0813fafd2bc99f15c96052ebaa8d5882a9b3e9 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -22,10 +22,28 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn); static void init_iova_rcaches(struct iova_domain *iovad); +static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); static void fq_destroy_all_entries(struct iova_domain *iovad); static void fq_flush_timeout(struct timer_list *t); +static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct iova_domain *iovad; + + iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead); + + free_cpu_cached_iovas(cpu, iovad); + return 0; +} + +static void free_global_cached_iovas(struct iova_domain *iovad); + +static struct iova *to_iova(struct rb_node *node) +{ + return rb_entry(node, struct iova, node); +} + void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn) @@ -50,11 +68,12 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR; rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node); rb_insert_color(&iovad->anchor.node, &iovad->rbroot); + cpuhp_state_add_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, &iovad->cpuhp_dead); init_iova_rcaches(iovad); } EXPORT_SYMBOL_GPL(init_iova_domain); -bool has_iova_flush_queue(struct iova_domain *iovad) +static bool has_iova_flush_queue(struct iova_domain *iovad) { return !!iovad->fq; } @@ -102,8 +121,6 @@ int init_iova_flush_queue(struct iova_domain *iovad, spin_lock_init(&fq->lock); } - smp_wmb(); - iovad->fq = queue; timer_setup(&iovad->fq_timer, fq_flush_timeout, 0); @@ -111,7 +128,6 @@ int init_iova_flush_queue(struct iova_domain *iovad, return 0; } -EXPORT_SYMBOL_GPL(init_iova_flush_queue); static struct rb_node * __get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn) @@ -136,7 +152,7 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) { struct iova *cached_iova; - cached_iova = rb_entry(iovad->cached32_node, struct iova, node); + cached_iova = to_iova(iovad->cached32_node); if (free == cached_iova || (free->pfn_hi < iovad->dma_32bit_pfn && free->pfn_lo >= cached_iova->pfn_lo)) { @@ -144,11 +160,48 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) iovad->max32_alloc_size = iovad->dma_32bit_pfn; } - cached_iova = 
rb_entry(iovad->cached_node, struct iova, node); + cached_iova = to_iova(iovad->cached_node); if (free->pfn_lo >= cached_iova->pfn_lo) iovad->cached_node = rb_next(&free->node); } +static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn) +{ + struct rb_node *node, *next; + /* + * Ideally what we'd like to judge here is whether limit_pfn is close + * enough to the highest-allocated IOVA that starting the allocation + * walk from the anchor node will be quicker than this initial work to + * find an exact starting point (especially if that ends up being the + * anchor node anyway). This is an incredibly crude approximation which + * only really helps the most likely case, but is at least trivially easy. + */ + if (limit_pfn > iovad->dma_32bit_pfn) + return &iovad->anchor.node; + + node = iovad->rbroot.rb_node; + while (to_iova(node)->pfn_hi < limit_pfn) + node = node->rb_right; + +search_left: + while (node->rb_left && to_iova(node->rb_left)->pfn_lo >= limit_pfn) + node = node->rb_left; + + if (!node->rb_left) + return node; + + next = node->rb_left; + while (next->rb_right) { + next = next->rb_right; + if (to_iova(next)->pfn_lo >= limit_pfn) { + node = next; + goto search_left; + } + } + + return node; +} + /* Insert the iova into domain rbtree by holding writer lock */ static void iova_insert_rbtree(struct rb_root *root, struct iova *iova, @@ -159,7 +212,7 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova, new = (start) ? &start : &(root->rb_node); /* Figure out where to put new node */ while (*new) { - struct iova *this = rb_entry(*new, struct iova, node); + struct iova *this = to_iova(*new); parent = *new; @@ -184,8 +237,9 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, struct rb_node *curr, *prev; struct iova *curr_iova; unsigned long flags; - unsigned long new_pfn; + unsigned long new_pfn, retry_pfn; unsigned long align_mask = ~0UL; + unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn; if (size_aligned) align_mask <<= fls_long(size - 1); @@ -197,16 +251,26 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, goto iova32_full; curr = __get_cached_rbnode(iovad, limit_pfn); - curr_iova = rb_entry(curr, struct iova, node); + curr_iova = to_iova(curr); + retry_pfn = curr_iova->pfn_hi + 1; + +retry: do { - limit_pfn = min(limit_pfn, curr_iova->pfn_lo); - new_pfn = (limit_pfn - size) & align_mask; + high_pfn = min(high_pfn, curr_iova->pfn_lo); + new_pfn = (high_pfn - size) & align_mask; prev = curr; curr = rb_prev(curr); - curr_iova = rb_entry(curr, struct iova, node); - } while (curr && new_pfn <= curr_iova->pfn_hi); - - if (limit_pfn < size || new_pfn < iovad->start_pfn) { + curr_iova = to_iova(curr); + } while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn); + + if (high_pfn < size || new_pfn < low_pfn) { + if (low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) { + high_pfn = limit_pfn; + low_pfn = retry_pfn; + curr = iova_find_limit(iovad, limit_pfn); + curr_iova = to_iova(curr); + goto retry; + } iovad->max32_alloc_size = size; goto iova32_full; } @@ -248,12 +312,23 @@ int iova_cache_get(void) { mutex_lock(&iova_cache_mutex); if (!iova_cache_users) { + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", NULL, + iova_cpuhp_dead); + if (ret) { + mutex_unlock(&iova_cache_mutex); + pr_err("Couldn't register cpuhp handler\n"); + return ret; + } + iova_cache = kmem_cache_create( "iommu_iova", sizeof(struct iova), 0, SLAB_HWCACHE_ALIGN, NULL); if 
(!iova_cache) { + cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); mutex_unlock(&iova_cache_mutex); - printk(KERN_ERR "Couldn't create iova cache\n"); + pr_err("Couldn't create iova cache\n"); return -ENOMEM; } } @@ -273,8 +348,10 @@ void iova_cache_put(void) return; } iova_cache_users--; - if (!iova_cache_users) + if (!iova_cache_users) { + cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); kmem_cache_destroy(iova_cache); + } mutex_unlock(&iova_cache_mutex); } EXPORT_SYMBOL_GPL(iova_cache_put); @@ -322,7 +399,7 @@ private_find_iova(struct iova_domain *iovad, unsigned long pfn) assert_spin_locked(&iovad->iova_rbtree_lock); while (node) { - struct iova *iova = rb_entry(node, struct iova, node); + struct iova *iova = to_iova(node); if (pfn < iova->pfn_lo) node = node->rb_left; @@ -335,12 +412,11 @@ private_find_iova(struct iova_domain *iovad, unsigned long pfn) return NULL; } -static void private_free_iova(struct iova_domain *iovad, struct iova *iova) +static void remove_iova(struct iova_domain *iovad, struct iova *iova) { assert_spin_locked(&iovad->iova_rbtree_lock); __cached_rbnode_delete_update(iovad, iova); rb_erase(&iova->node, &iovad->rbroot); - free_iova_mem(iova); } /** @@ -375,8 +451,9 @@ __free_iova(struct iova_domain *iovad, struct iova *iova) unsigned long flags; spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); - private_free_iova(iovad, iova); + remove_iova(iovad, iova); spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + free_iova_mem(iova); } EXPORT_SYMBOL_GPL(__free_iova); @@ -390,11 +467,18 @@ EXPORT_SYMBOL_GPL(__free_iova); void free_iova(struct iova_domain *iovad, unsigned long pfn) { - struct iova *iova = find_iova(iovad, pfn); - - if (iova) - __free_iova(iovad, iova); + unsigned long flags; + struct iova *iova; + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); + iova = private_find_iova(iovad, pfn); + if (!iova) { + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + return; + } + remove_iova(iovad, iova); + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); + free_iova_mem(iova); } EXPORT_SYMBOL_GPL(free_iova); @@ -431,12 +515,12 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size, flush_rcache = false; for_each_online_cpu(cpu) free_cpu_cached_iovas(cpu, iovad); + free_global_cached_iovas(iovad); goto retry; } return new_iova->pfn_lo; } -EXPORT_SYMBOL_GPL(alloc_iova_fast); /** * free_iova_fast - free iova pfn range into rcache @@ -454,7 +538,6 @@ free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size) free_iova(iovad, pfn); } -EXPORT_SYMBOL_GPL(free_iova_fast); #define fq_ring_for_each(i, fq) \ for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE) @@ -550,10 +633,20 @@ void queue_iova(struct iova_domain *iovad, unsigned long pfn, unsigned long pages, unsigned long data) { - struct iova_fq *fq = raw_cpu_ptr(iovad->fq); + struct iova_fq *fq; unsigned long flags; unsigned idx; + /* + * Order against the IOMMU driver's pagetable update from unmapping + * @pte, to guarantee that iova_domain_flush() observes that if called + * from a different CPU before we release the lock below. Full barrier + * so it also pairs with iommu_dma_init_fq() to avoid seeing partially + * written fq state here. + */ + smp_mb(); + + fq = raw_cpu_ptr(iovad->fq); spin_lock_irqsave(&fq->lock, flags); /* @@ -579,11 +672,10 @@ void queue_iova(struct iova_domain *iovad, /* Avoid false sharing as much as possible. 
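+ * The cheap atomic_read() screens out the common case where the timer + * is already armed, and atomic_xchg() then lets exactly one caller + * claim the job of arming it.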
*/ if (!atomic_read(&iovad->fq_timer_on) && - !atomic_cmpxchg(&iovad->fq_timer_on, 0, 1)) + !atomic_xchg(&iovad->fq_timer_on, 1)) mod_timer(&iovad->fq_timer, jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT)); } -EXPORT_SYMBOL_GPL(queue_iova); /** * put_iova_domain - destroys the iova doamin @@ -594,6 +686,9 @@ void put_iova_domain(struct iova_domain *iovad) { struct iova *iova, *tmp; + cpuhp_state_remove_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, + &iovad->cpuhp_dead); + free_iova_flush_queue(iovad); free_iova_rcaches(iovad); rbtree_postorder_for_each_entry_safe(iova, tmp, &iovad->rbroot, node) @@ -605,7 +700,7 @@ static int __is_range_overlap(struct rb_node *node, unsigned long pfn_lo, unsigned long pfn_hi) { - struct iova *iova = rb_entry(node, struct iova, node); + struct iova *iova = to_iova(node); if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo)) return 1; @@ -673,7 +768,7 @@ reserve_iova(struct iova_domain *iovad, spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) { if (__is_range_overlap(node, pfn_lo, pfn_hi)) { - iova = rb_entry(node, struct iova, node); + iova = to_iova(node); __adjust_overlap_range(iova, &pfn_lo, &pfn_hi); if ((pfn_lo >= iova->pfn_lo) && (pfn_hi <= iova->pfn_hi)) @@ -695,36 +790,6 @@ reserve_iova(struct iova_domain *iovad, } EXPORT_SYMBOL_GPL(reserve_iova); -/** - * copy_reserved_iova - copies the reserved between domains - * @from: - source doamin from where to copy - * @to: - destination domin where to copy - * This function copies reserved iova's from one doamin to - * other. - */ -void -copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) -{ - unsigned long flags; - struct rb_node *node; - - spin_lock_irqsave(&from->iova_rbtree_lock, flags); - for (node = rb_first(&from->rbroot); node; node = rb_next(node)) { - struct iova *iova = rb_entry(node, struct iova, node); - struct iova *new_iova; - - if (iova->pfn_lo == IOVA_ANCHOR) - continue; - - new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi); - if (!new_iova) - printk(KERN_ERR "Reserve iova range %lx@%lx failed\n", - iova->pfn_lo, iova->pfn_lo); - } - spin_unlock_irqrestore(&from->iova_rbtree_lock, flags); -} -EXPORT_SYMBOL_GPL(copy_reserved_iova); - struct iova * split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi) @@ -814,7 +879,8 @@ iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad) if (WARN_ON(!iova)) continue; - private_free_iova(iovad, iova); + remove_iova(iovad, iova); + free_iova_mem(iova); } spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); @@ -1029,7 +1095,7 @@ static void free_iova_rcaches(struct iova_domain *iovad) /* * free all the IOVA ranges cached by a cpu (used when cpu is unplugged) */ -void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) +static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) { struct iova_cpu_rcache *cpu_rcache; struct iova_rcache *rcache; @@ -1046,5 +1112,25 @@ void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad) } } +/* + * free all the IOVA ranges of global cache + */ +static void free_global_cached_iovas(struct iova_domain *iovad) +{ + struct iova_rcache *rcache; + unsigned long flags; + int i, j; + + for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { + rcache = &iovad->rcaches[i]; + spin_lock_irqsave(&rcache->lock, flags); + for (j = 0; j < rcache->depot_size; ++j) { + iova_magazine_free_pfns(rcache->depot[j], iovad); + 
iova_magazine_free(rcache->depot[j]); + } + rcache->depot_size = 0; + spin_unlock_irqrestore(&rcache->lock, flags); + } +} MODULE_AUTHOR("Anil S Keshavamurthy "); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 83f36f61416e5d3b11e099ba51eded8bf4c964bd..83314b9d8f38bfcf63652ddb63e5c4c05d4c7f93 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -158,38 +158,3 @@ void panic_if_irq_remap(const char *msg) if (irq_remapping_enabled) panic(msg); } - -/** - * irq_remapping_get_ir_irq_domain - Get the irqdomain associated with the IOMMU - * device serving request @info - * @info: interrupt allocation information, used to identify the IOMMU device - * - * It's used to get parent irqdomain for HPET and IOAPIC irqdomains. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain * -irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) -{ - if (!remap_ops || !remap_ops->get_ir_irq_domain) - return NULL; - - return remap_ops->get_ir_irq_domain(info); -} - -/** - * irq_remapping_get_irq_domain - Get the irqdomain serving the request @info - * @info: interrupt allocation information, used to identify the IOMMU device - * - * There will be one PCI MSI/MSIX irqdomain associated with each interrupt - * remapping device, so this interface is used to retrieve the PCI MSI/MSIX - * irqdomain serving request @info. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info) -{ - if (!remap_ops || !remap_ops->get_irq_domain) - return NULL; - - return remap_ops->get_irq_domain(info); -} diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 6a190d504eb619f572955f8d4c5b4293bb0a07d0..8c89cb947cdb29e072f8218afa26601c8ee3ef7c 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -42,12 +42,6 @@ struct irq_remap_ops { /* Enable fault handling */ int (*enable_faulting)(void); - - /* Get the irqdomain associated the IOMMU device */ - struct irq_domain *(*get_ir_irq_domain)(struct irq_alloc_info *); - - /* Get the MSI irqdomain associated with the IOMMU device */ - struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *); }; extern struct irq_remap_ops intel_irq_remap_ops; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 614a93aa5305a83d6bf6662f6c236aa7e25d3d68..e2b9d3e5675f1c0b94a6954ec321460b0aff6786 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -16,74 +16,6 @@ #define NO_IOMMU 1 -/** - * of_get_dma_window - Parse *dma-window property and returns 0 if found. - * - * @dn: device node - * @prefix: prefix for property name if any - * @index: index to start to parse - * @busno: Returns busno if supported. Otherwise pass NULL - * @addr: Returns address that DMA starts - * @size: Returns the range that DMA can handle - * - * This supports different formats flexibly. "prefix" can be - * configured if any. "busno" and "index" are optionally - * specified. Set 0(or NULL) if not used. 
- */ -int of_get_dma_window(struct device_node *dn, const char *prefix, int index, - unsigned long *busno, dma_addr_t *addr, size_t *size) -{ - const __be32 *dma_window, *end; - int bytes, cur_index = 0; - char propname[NAME_MAX], addrname[NAME_MAX], sizename[NAME_MAX]; - - if (!dn || !addr || !size) - return -EINVAL; - - if (!prefix) - prefix = ""; - - snprintf(propname, sizeof(propname), "%sdma-window", prefix); - snprintf(addrname, sizeof(addrname), "%s#dma-address-cells", prefix); - snprintf(sizename, sizeof(sizename), "%s#dma-size-cells", prefix); - - dma_window = of_get_property(dn, propname, &bytes); - if (!dma_window) - return -ENODEV; - end = dma_window + bytes / sizeof(*dma_window); - - while (dma_window < end) { - u32 cells; - const void *prop; - - /* busno is one cell if supported */ - if (busno) - *busno = be32_to_cpup(dma_window++); - - prop = of_get_property(dn, addrname, NULL); - if (!prop) - prop = of_get_property(dn, "#address-cells", NULL); - - cells = prop ? be32_to_cpup(prop) : of_n_addr_cells(dn); - if (!cells) - return -EINVAL; - *addr = of_read_number(dma_window, cells); - dma_window += cells; - - prop = of_get_property(dn, sizename, NULL); - cells = prop ? be32_to_cpup(prop) : of_n_size_cells(dn); - if (!cells) - return -EINVAL; - *size = of_read_number(dma_window, cells); - dma_window += cells; - - if (cur_index++ == index) - break; - } - return 0; -} -EXPORT_SYMBOL_GPL(of_get_dma_window); - static int of_iommu_xlate(struct device *dev, struct of_phandle_args *iommu_spec) { diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 09c6e1c680db980aeb67dae51986b2a84e88d99a..39c9d1eded70c898e26bf81bb515f23781b571cc 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1097,7 +1097,7 @@ static __maybe_unused int omap_iommu_runtime_resume(struct device *dev) } /** - * omap_iommu_suspend_prepare - prepare() dev_pm_ops implementation + * omap_iommu_prepare - prepare() dev_pm_ops implementation * @dev: iommu device * * This function performs the necessary checks to determine if the IOMMU diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 280de92b332edf5a5021f46475a5573891425e34..8bcab8d8ebda0d16d237bd82105c2b08c69bbe70 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -329,12 +328,6 @@ static struct iommu_domain *qcom_iommu_domain_alloc(unsigned type) if (!qcom_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&qcom_domain->domain)) { - kfree(qcom_domain); - return NULL; - } - mutex_init(&qcom_domain->init_mutex); spin_lock_init(&qcom_domain->pgtbl_lock); @@ -345,8 +338,6 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) { struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); - iommu_put_dma_cookie(domain); - if (qcom_domain->iommu) { /* * NOTE: unmap can be called after client device is powered diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index 3924f7c055440edd3ee64be0b02710102bd7efcd..a52ed43591ea17326a7f16e5ee072624e6e40fb7 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -273,7 +273,8 @@ static int gart_iommu_of_xlate(struct device *dev, return 0; } -static void gart_iommu_sync_map(struct iommu_domain *domain) +static void gart_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size) { FLUSH_GART_REGS(gart_handle); } @@ -281,7 +282,9 @@ static void 
gart_iommu_sync_map(struct iommu_domain *domain) static void gart_iommu_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { - gart_iommu_sync_map(domain); + size_t length = gather->end - gather->start + 1; + + gart_iommu_sync_map(domain, gather->start, length); } static const struct iommu_ops gart_iommu_ops = { diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 60e659a24f90bf1f9fa94cc449d088f3e526ce60..6b7cc71c004249f32ea4563607727a6ca84407f2 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -599,12 +599,6 @@ static struct iommu_domain *viommu_domain_alloc(unsigned type) spin_lock_init(&vdomain->mappings_lock); vdomain->mappings = RB_ROOT_CACHED; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&vdomain->domain)) { - kfree(vdomain); - return NULL; - } - return &vdomain->domain; } @@ -634,8 +628,6 @@ static void viommu_domain_free(struct iommu_domain *domain) { struct viommu_domain *vdomain = to_viommu_domain(domain); - iommu_put_dma_cookie(domain); - /* Free all remaining mappings (size 2^64) */ viommu_del_mappings(vdomain, 0, 0); @@ -715,7 +707,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int viommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot) + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { int ret; u32 flags; @@ -952,7 +944,16 @@ static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args) return iommu_fwspec_add_ids(dev, args->args, 1); } +static bool viommu_capable(enum iommu_cap cap) +{ + if (cap == IOMMU_CAP_VIOMMU_HINT) + return true; + + return false; +} + static struct iommu_ops viommu_ops = { + .capable = viommu_capable, .domain_alloc = viommu_domain_alloc, .domain_free = viommu_domain_free, .attach_dev = viommu_attach_dev, diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 6d04672975ef50778ed43d62622c6712d168378e..634d074fb290dd60281dc337c73dac0d27f4a7cb 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -504,4 +504,18 @@ config SIFIVE_PLIC If you don't know what to do here, say Y. 
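The irq-ims-msi.c driver added below exposes a single constructor, pci_ims_array_create_msi_irq_domain(). As a rough usage sketch (hypothetical driver code: the helper name, the header path, and placing the slot array at a BAR mapping are assumptions, not part of this patch), a device driver would hand it the location and size of its slot array:

	#include <linux/pci.h>
	#include <linux/irqdomain.h>
	#include <linux/irqchip/irq-ims-msi.h>	/* assumed home of struct ims_array_info */

	static struct irq_domain *
	mydev_create_ims_domain(struct pci_dev *pdev, struct ims_slot __iomem *slots,
				unsigned int nr_slots)
	{
		/* the constructor copies this, so a stack instance is fine */
		struct ims_array_info info = {
			.slots		= slots,	/* slot array in device memory */
			.max_slots	= nr_slots,
		};

		/* Returns NULL on failure; vectors are then allocated on top of
		 * the returned domain through the device MSI interfaces.
		 */
		return pci_ims_array_create_msi_irq_domain(pdev, &info);
	}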
+config IMS_MSI + depends on PCI + select DEVICE_MSI + bool + +config IMS_MSI_ARRAY + bool "IMS Interrupt Message Store MSI controller for device memory storage arrays" + depends on PCI + select IMS_MSI + select GENERIC_MSI_IRQ_DOMAIN + help + Support for IMS Interrupt Message Store MSI controller + with IMS slot storage in a slot array in device memory + endmenu diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 8903d3c34916c505c8210b9068a4f510baad4dbc..fec6f7e01917923a89cf14e52c42c4aa3446e068 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_CSKY_APB_INTC) += irq-csky-apb-intc.o obj-$(CONFIG_SIFIVE_PLIC) += irq-sifive-plic.o obj-$(CONFIG_IMX_IRQSTEER) += irq-imx-irqsteer.o obj-$(CONFIG_MADERA_IRQ) += irq-madera.o +obj-$(CONFIG_IMS_MSI) += irq-ims-msi.o obj-$(CONFIG_LS1X_IRQ) += irq-ls1x.o obj-$(CONFIG_TI_SCI_INTR_IRQCHIP) += irq-ti-sci-intr.o obj-$(CONFIG_TI_SCI_INTA_IRQCHIP) += irq-ti-sci-inta.o diff --git a/drivers/irqchip/irq-ims-msi.c b/drivers/irqchip/irq-ims-msi.c new file mode 100644 index 0000000000000000000000000000000000000000..7c9ef211dc4049d2a0ac6b48c4c73722b5affbe3 --- /dev/null +++ b/drivers/irqchip/irq-ims-msi.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0 +// (C) Copyright 2021 Thomas Gleixner +/* + * Shared interrupt chips and irq domains for IMS devices + */ +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_IMS_MSI_ARRAY + +struct ims_array_data { + struct ims_array_info info; + unsigned long map[0]; +}; + +static inline void iowrite32_and_flush(u32 value, void __iomem *addr) +{ + iowrite32(value, addr); + ioread32(addr); +} + +static void ims_array_mask_irq(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct ims_slot __iomem *slot = desc->device_msi.priv_iomem; + u32 __iomem *ctrl = &slot->ctrl; + + iowrite32_and_flush(ioread32(ctrl) | IMS_CTRL_VECTOR_MASKBIT, ctrl); +} + +static void ims_array_unmask_irq(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct ims_slot __iomem *slot = desc->device_msi.priv_iomem; + u32 __iomem *ctrl = &slot->ctrl; + + iowrite32_and_flush(ioread32(ctrl) & ~IMS_CTRL_VECTOR_MASKBIT, ctrl); +} + +static void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct ims_slot __iomem *slot = desc->device_msi.priv_iomem; + + iowrite32(msg->address_lo, &slot->address_lo); + iowrite32(msg->address_hi, &slot->address_hi); + iowrite32_and_flush(msg->data, &slot->data); +} + +static int ims_array_set_auxdata(struct irq_data *data, unsigned int which, + u64 auxval) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct ims_slot __iomem *slot = desc->device_msi.priv_iomem; + u32 val, __iomem *ctrl = &slot->ctrl; + + if (which != IMS_AUXDATA_CONTROL_WORD) + return -EINVAL; + if (auxval & ~(u64)IMS_CONTROL_WORD_AUXMASK) + return -EINVAL; + + val = ioread32(ctrl) & IMS_CONTROL_WORD_IRQMASK; + iowrite32_and_flush(val | (u32) auxval, ctrl); + return 0; +} + +static const struct irq_chip ims_array_msi_controller = { + .name = "IMS", + .irq_mask = ims_array_mask_irq, + .irq_unmask = ims_array_unmask_irq, + .irq_write_msi_msg = ims_array_write_msi_msg, + .irq_set_auxdata = ims_array_set_auxdata, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static void ims_array_reset_slot(struct ims_slot __iomem *slot) +{ + iowrite32(0, &slot->address_lo); + 
iowrite32(0, &slot->address_hi); + iowrite32(0, &slot->data); + iowrite32_and_flush(IMS_CTRL_VECTOR_MASKBIT, &slot->ctrl); +} + +static void ims_array_free_msi_store(struct irq_domain *domain, + struct device *dev) +{ + struct msi_domain_info *info = domain->host_data; + struct ims_array_data *ims = info->data; + struct msi_desc *entry; + + for_each_dev_msi_entry(entry, dev) { + if (entry->device_msi.priv_iomem) { + clear_bit(entry->device_msi.hwirq, ims->map); + ims_array_reset_slot(entry->device_msi.priv_iomem); + entry->device_msi.priv_iomem = NULL; + entry->device_msi.hwirq = 0; + } + } +} + +static void ims_array_free_msi_irq(struct irq_domain *domain, + struct device *dev, unsigned int irq) +{ + struct msi_domain_info *info = domain->host_data; + struct ims_array_data *ims = info->data; + struct msi_desc *entry; + + for_each_dev_msi_entry(entry, dev) { + if (entry->irq == irq && entry->device_msi.priv_iomem) { + clear_bit(entry->device_msi.hwirq, ims->map); + ims_array_reset_slot(entry->device_msi.priv_iomem); + entry->device_msi.priv_iomem = NULL; + entry->device_msi.hwirq = 0; + } + } +} + +static int ims_array_alloc_msi_store(struct irq_domain *domain, + struct device *dev, int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct ims_array_data *ims = info->data; + struct msi_desc *entry; + + for_each_new_dev_msi_entry(entry, dev) { + unsigned int idx; + + idx = find_first_zero_bit(ims->map, ims->info.max_slots); + if (idx >= ims->info.max_slots) + goto fail; + set_bit(idx, ims->map); + entry->device_msi.priv_iomem = &ims->info.slots[idx]; + ims_array_reset_slot(entry->device_msi.priv_iomem); + entry->device_msi.hwirq = idx; + } + return 0; + +fail: + ims_array_free_msi_store(domain, dev); + return -ENOSPC; +} + +struct ims_array_domain_template { + struct msi_domain_ops ops; + struct msi_domain_info info; +}; + +static void ims_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->desc = desc; + arg->hwirq = desc->device_msi.hwirq; +} + +static const struct ims_array_domain_template ims_array_domain_template = { + .ops = { + .msi_alloc_store = ims_array_alloc_msi_store, + .msi_free_store = ims_array_free_msi_store, + .msi_free_irq = ims_array_free_msi_irq, + .set_desc = ims_set_desc, + }, + .info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS, + .handler = handle_edge_irq, + .handler_name = "edge", + }, +}; + +struct irq_domain * +pci_ims_array_create_msi_irq_domain(struct pci_dev *pdev, + struct ims_array_info *ims_info) +{ + struct ims_array_domain_template *info; + struct ims_array_data *data; + struct irq_domain *domain; + struct irq_chip *chip; + unsigned int size; + + /* Allocate new domain storage */ + info = kmemdup(&ims_array_domain_template, + sizeof(ims_array_domain_template), GFP_KERNEL); + if (!info) + return NULL; + /* Link the ops */ + info->info.ops = &info->ops; + + /* Allocate ims_info along with the bitmap */ + size = sizeof(*data); + size += BITS_TO_LONGS(ims_info->max_slots) * sizeof(unsigned long); + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto err_info; + + data->info = *ims_info; + info->info.data = data; + + /* + * Allocate an interrupt chip because the core needs to be able to + * update it with default callbacks. 
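+ * (ims_array_msi_controller above is const; with MSI_FLAG_USE_DEF_CHIP_OPS
+ * set, the MSI core fills unset callbacks into the chip, so a writable
+ * copy is required.)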
+ */ + chip = kmemdup(&ims_array_msi_controller, + sizeof(ims_array_msi_controller), GFP_KERNEL); + if (!chip) + goto err_data; + info->info.chip = chip; + + domain = pci_subdevice_msi_create_irq_domain(pdev, &info->info); + if (!domain) + goto err_chip; + + return domain; + +err_chip: + kfree(chip); +err_data: + kfree(data); +err_info: + kfree(info); + return NULL; +} +EXPORT_SYMBOL_GPL(pci_ims_array_create_msi_irq_domain); + +#endif /* CONFIG_IMS_MSI_ARRAY */ diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index 64fff6abe60e8394782089931e59e374b68ee5bf..719f54c589200cd7955ba5536388a909a0e05e9b 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -848,7 +848,7 @@ static int jmb38x_ms_count_slots(struct pci_dev *pdev) { int cnt, rc = 0; - for (cnt = 0; cnt < PCI_ROM_RESOURCE; ++cnt) { + for (cnt = 0; cnt < PCI_STD_NUM_BARS; ++cnt) { if (!(IORESOURCE_MEM & pci_resource_flags(pdev, cnt))) break; diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 43169f25da1fd5cd48a9e52104bd3dea6c5051ea..f30f2d16504f442168f4c921286efef625b187ce 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -632,6 +632,16 @@ config MFD_INTEL_MSIC Passage) chip. This chip embeds audio, battery, GPIO, etc. devices used in Intel Medfield platforms. +config MFD_INTEL_PMT + tristate "Intel Platform Monitoring Technology (PMT) support" + depends on PCI + select MFD_CORE + help + The Intel Platform Monitoring Technology (PMT) is an interface that + provides access to hardware monitor registers. This driver supports + Telemetry, Watcher, and Crashlog PMT capabilities/devices for + platforms starting from Tiger Lake. + config MFD_IPAQ_MICRO bool "Atmel Micro ASIC (iPAQ h3100/h3600/h3700) Support" depends on SA1100_H3100 || SA1100_H3600 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index c1067ea4620464b8c085be6f13aa9056a6ea3c0a..597cb4fc96e1a5a6d9f210b693f9e20ab729d0e8 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -212,6 +212,7 @@ obj-$(CONFIG_MFD_INTEL_LPSS) += intel-lpss.o obj-$(CONFIG_MFD_INTEL_LPSS_PCI) += intel-lpss-pci.o obj-$(CONFIG_MFD_INTEL_LPSS_ACPI) += intel-lpss-acpi.o obj-$(CONFIG_MFD_INTEL_MSIC) += intel_msic.o +obj-$(CONFIG_MFD_INTEL_PMT) += intel_pmt.o obj-$(CONFIG_MFD_PALMAS) += palmas.o obj-$(CONFIG_MFD_VIPERBOARD) += viperboard.o obj-$(CONFIG_MFD_RC5T583) += rc5t583.o rc5t583-irq.o diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c new file mode 100644 index 0000000000000000000000000000000000000000..dd7eb614c28e47d56fd88fb80cf501e0e67fd12b --- /dev/null +++ b/drivers/mfd/intel_pmt.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology PMT driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: David E. 
Box + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Intel DVSEC capability vendor space offsets */ +#define INTEL_DVSEC_ENTRIES 0xA +#define INTEL_DVSEC_SIZE 0xB +#define INTEL_DVSEC_TABLE 0xC +#define INTEL_DVSEC_TABLE_BAR(x) ((x) & GENMASK(2, 0)) +#define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) +#define INTEL_DVSEC_ENTRY_SIZE 4 + +/* PMT capabilities */ +#define DVSEC_INTEL_ID_TELEMETRY 2 +#define DVSEC_INTEL_ID_WATCHER 3 +#define DVSEC_INTEL_ID_CRASHLOG 4 + +struct intel_dvsec_header { + u16 length; + u16 id; + u8 num_entries; + u8 entry_size; + u8 tbir; + u32 offset; +}; + +enum pmt_quirks { + /* Watcher capability not supported */ + PMT_QUIRK_NO_WATCHER = BIT(0), + + /* Crashlog capability not supported */ + PMT_QUIRK_NO_CRASHLOG = BIT(1), + + /* Use shift instead of mask to read discovery table offset */ + PMT_QUIRK_TABLE_SHIFT = BIT(2), + + /* DVSEC not present (provided in driver data) */ + PMT_QUIRK_NO_DVSEC = BIT(3), +}; + +struct pmt_platform_info { + unsigned long quirks; + struct intel_dvsec_header **capabilities; +}; + +static const struct pmt_platform_info tgl_info = { + .quirks = PMT_QUIRK_NO_WATCHER | PMT_QUIRK_NO_CRASHLOG | + PMT_QUIRK_TABLE_SHIFT, +}; + +/* DG1 Platform with DVSEC quirk*/ +static struct intel_dvsec_header dg1_telemetry = { + .length = 0x10, + .id = 2, + .num_entries = 1, + .entry_size = 3, + .tbir = 0, + .offset = 0x466000, +}; + +static struct intel_dvsec_header *dg1_capabilities[] = { + &dg1_telemetry, + NULL +}; + +static const struct pmt_platform_info dg1_info = { + .quirks = PMT_QUIRK_NO_DVSEC, + .capabilities = dg1_capabilities, +}; + +static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header, + unsigned long quirks) +{ + struct device *dev = &pdev->dev; + struct resource *res, *tmp; + struct mfd_cell *cell; + const char *name; + int count = header->num_entries; + int size = header->entry_size; + int id = header->id; + int i; + + switch (id) { + case DVSEC_INTEL_ID_TELEMETRY: + name = "pmt_telemetry"; + break; + case DVSEC_INTEL_ID_WATCHER: + if (quirks & PMT_QUIRK_NO_WATCHER) { + dev_info(dev, "Watcher not supported\n"); + return -EINVAL; + } + name = "pmt_watcher"; + break; + case DVSEC_INTEL_ID_CRASHLOG: + if (quirks & PMT_QUIRK_NO_CRASHLOG) { + dev_info(dev, "Crashlog not supported\n"); + return -EINVAL; + } + name = "pmt_crashlog"; + break; + default: + return -EINVAL; + } + + if (!header->num_entries || !header->entry_size) { + dev_err(dev, "Invalid count or size for %s header\n", name); + return -EINVAL; + } + + cell = devm_kzalloc(dev, sizeof(*cell), GFP_KERNEL); + if (!cell) + return -ENOMEM; + + res = devm_kcalloc(dev, count, sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + if (quirks & PMT_QUIRK_TABLE_SHIFT) + header->offset >>= 3; + + /* + * The PMT DVSEC contains the starting offset and count for a block of + * discovery tables, each providing access to monitoring facilities for + * a section of the device. Create a resource list of these tables to + * provide to the driver. 
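+ * (entry_size comes from INTEL_DVSEC_SIZE and is expressed in 32-bit
+ * words, hence the size << 2 byte arithmetic below.)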
+ */ + for (i = 0, tmp = res; i < count; i++, tmp++) { + tmp->start = pdev->resource[header->tbir].start + + header->offset + i * (size << 2); + tmp->end = tmp->start + (size << 2) - 1; + tmp->flags = IORESOURCE_MEM; + } + + cell->resources = res; + cell->num_resources = count; + cell->name = name; + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, cell, 1, NULL, 0, + NULL); +} + +static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct pmt_platform_info *info; + unsigned long quirks = 0; + bool found_devices = false; + int ret, pos = 0; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + info = (struct pmt_platform_info *)id->driver_data; + + if (info) + quirks = info->quirks; + + if (info && (info->quirks & PMT_QUIRK_NO_DVSEC)) { + struct intel_dvsec_header **header; + + header = info->capabilities; + while (*header) { + ret = pmt_add_dev(pdev, *header, quirks); + if (ret) + dev_warn(&pdev->dev, + "Failed to add device for DVSEC id %d\n", + (*header)->id); + else + found_devices = true; + + ++header; + } + } else { + do { + struct intel_dvsec_header header; + u32 table; + u16 vid; + + pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC); + if (!pos) + break; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid); + if (vid != PCI_VENDOR_ID_INTEL) + continue; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, + &header.id); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES, + &header.num_entries); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE, + &header.entry_size); + pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE, + &table); + + header.tbir = INTEL_DVSEC_TABLE_BAR(table); + header.offset = INTEL_DVSEC_TABLE_OFFSET(table); + + ret = pmt_add_dev(pdev, &header, quirks); + if (ret) + continue; + + found_devices = true; + } while (true); + } + + if (!found_devices) + return -ENODEV; + + pm_runtime_put(&pdev->dev); + pm_runtime_allow(&pdev->dev); + + return 0; +} + +static void pmt_pci_remove(struct pci_dev *pdev) +{ + pm_runtime_forbid(&pdev->dev); + pm_runtime_get_sync(&pdev->dev); +} + +#define PCI_DEVICE_ID_INTEL_PMT_ADL 0x467d +#define PCI_DEVICE_ID_INTEL_PMT_DG1 0x490e +#define PCI_DEVICE_ID_INTEL_PMT_OOBMSM 0x09a7 +#define PCI_DEVICE_ID_INTEL_PMT_TGL 0x9a0d +static const struct pci_device_id pmt_pci_ids[] = { + { PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) }, + { PCI_DEVICE_DATA(INTEL, PMT_DG1, &dg1_info) }, + { PCI_DEVICE_DATA(INTEL, PMT_OOBMSM, NULL) }, + { PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) }, + { } +}; +MODULE_DEVICE_TABLE(pci, pmt_pci_ids); + +static struct pci_driver pmt_pci_driver = { + .name = "intel-pmt", + .id_table = pmt_pci_ids, + .probe = pmt_pci_probe, + .remove = pmt_pci_remove, +}; +module_pci_driver(pmt_pci_driver); + +MODULE_AUTHOR("David E. 
Box "); +MODULE_DESCRIPTION("Intel Platform Monitoring Technology PMT driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/lkdtm/refcount.c b/drivers/misc/lkdtm/refcount.c index 0a146b32da132f9f38af747372214604b08836f9..de7c5ab528d9efa6ef1d633ef37de2fe61b44304 100644 --- a/drivers/misc/lkdtm/refcount.c +++ b/drivers/misc/lkdtm/refcount.c @@ -6,14 +6,6 @@ #include "lkdtm.h" #include -#ifdef CONFIG_REFCOUNT_FULL -#define REFCOUNT_MAX (UINT_MAX - 1) -#define REFCOUNT_SATURATED UINT_MAX -#else -#define REFCOUNT_MAX INT_MAX -#define REFCOUNT_SATURATED (INT_MIN / 2) -#endif - static void overflow_check(refcount_t *ref) { switch (refcount_read(ref)) { @@ -127,7 +119,7 @@ void lkdtm_REFCOUNT_DEC_ZERO(void) static void check_negative(refcount_t *ref, int start) { /* - * CONFIG_REFCOUNT_FULL refuses to move a refcount at all on an + * refcount_t refuses to move a refcount at all on an * over-sub, so we have to track our starting position instead of * looking only at zero-pinning. */ @@ -210,7 +202,6 @@ static void check_from_zero(refcount_t *ref) /* * A refcount_inc() from zero should pin to zero or saturate and may WARN. - * Only CONFIG_REFCOUNT_FULL provides this protection currently. */ void lkdtm_REFCOUNT_INC_ZERO(void) { diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 7310b476323c2272591e64761e37bb7fed514d76..133fa8cbb1c8f9da8209aa062946b931185a72d2 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -532,24 +532,6 @@ static long mei_ioctl(struct file *file, unsigned int cmd, unsigned long data) return rets; } -/** - * mei_compat_ioctl - the compat IOCTL function - * - * @file: pointer to file structure - * @cmd: ioctl command - * @data: pointer to mei message structure - * - * Return: 0 on success , <0 on error - */ -#ifdef CONFIG_COMPAT -static long mei_compat_ioctl(struct file *file, - unsigned int cmd, unsigned long data) -{ - return mei_ioctl(file, cmd, (unsigned long)compat_ptr(data)); -} -#endif - - /** * mei_poll - the poll function * @@ -898,9 +880,7 @@ static const struct file_operations mei_fops = { .owner = THIS_MODULE, .read = mei_read, .unlocked_ioctl = mei_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = mei_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = mei_open, .release = mei_release, .write = mei_write, diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c index 1154f0435b0ac00be133a9b836366308085bd11a..32e9f267d84f38381072b14d6ae631e7e7c9ec44 100644 --- a/drivers/misc/pci_endpoint_test.c +++ b/drivers/misc/pci_endpoint_test.c @@ -94,7 +94,7 @@ enum pci_barno { struct pci_endpoint_test { struct pci_dev *pdev; void __iomem *base; - void __iomem *bar[6]; + void __iomem *bar[PCI_STD_NUM_BARS]; struct completion irq_raised; int last_irq; int num_irqs; @@ -693,7 +693,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, if (!pci_endpoint_test_request_irq(test)) goto err_disable_irq; - for (bar = BAR_0; bar <= BAR_5; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) { base = pci_ioremap_bar(pdev, bar); if (!base) { @@ -746,7 +746,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, ida_simple_remove(&pci_endpoint_test_ida, id); err_iounmap: - for (bar = BAR_0; bar <= BAR_5; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (test->bar[bar]) pci_iounmap(pdev, test->bar[bar]); } @@ -777,7 +777,7 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev) misc_deregister(&test->miscdev); 
kfree(misc_device->name); ida_simple_remove(&pci_endpoint_test_ida, id); - for (bar = BAR_0; bar <= BAR_5; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (test->bar[bar]) pci_iounmap(pdev, test->bar[bar]); } diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 4b713a80b572639f98a3b9d0260ee4537ab8208a..f253bb6f2ae4b9fe3dd91c10ee1db0737a7be20e 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "gru.h" diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c index f7224f90f4131f68636faee891e1f3b6fba42208..1d75d5e540bc992b35bf0274692ef889fb14e7bb 100644 --- a/drivers/misc/sgi-gru/gruhandles.c +++ b/drivers/misc/sgi-gru/gruhandles.c @@ -16,6 +16,7 @@ #define GRU_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10) #define CLKS2NSEC(c) ((c) *1000000000 / local_cpu_data->itc_freq) #else +#include #include #define GRU_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) #define CLKS2NSEC(c) ((c) * 1000000 / tsc_khz) diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c index 0197441a1eae7c28dea29b98449d9c33ded13030..f6e600bfac5d23b9f8c8c28d747436227af6fd38 100644 --- a/drivers/misc/sgi-gru/grukservices.c +++ b/drivers/misc/sgi-gru/grukservices.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c index 426ad912b4416bd19692254fbd104e4c4c0703ed..d054e2842a5fa6f932528aa45e8205d2e5baa5c5 100644 --- a/drivers/misc/sram-exec.c +++ b/drivers/misc/sram-exec.c @@ -96,7 +96,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src, if (!part) return NULL; - if (!addr_in_gen_pool(pool, (unsigned long)dst, size)) + if (!gen_pool_has_addr(pool, (unsigned long)dst, size)) return NULL; base = (unsigned long)part->base; diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 1b77fff9f8920e9bb3f36ca1c4dfdf758b504b6a..cc9a28cf9d82725ae21405b6bf8123c30d653b89 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -1078,36 +1078,6 @@ static long ctrl_cdev_ioctl(struct file *file, unsigned int cmd, return err; } -#ifdef CONFIG_COMPAT -static long vol_cdev_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - unsigned long translated_arg = (unsigned long)compat_ptr(arg); - - return vol_cdev_ioctl(file, cmd, translated_arg); -} - -static long ubi_cdev_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - unsigned long translated_arg = (unsigned long)compat_ptr(arg); - - return ubi_cdev_ioctl(file, cmd, translated_arg); -} - -static long ctrl_cdev_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - unsigned long translated_arg = (unsigned long)compat_ptr(arg); - - return ctrl_cdev_ioctl(file, cmd, translated_arg); -} -#else -#define vol_cdev_compat_ioctl NULL -#define ubi_cdev_compat_ioctl NULL -#define ctrl_cdev_compat_ioctl NULL -#endif - /* UBI volume character device operations */ const struct file_operations ubi_vol_cdev_operations = { .owner = THIS_MODULE, @@ -1118,7 +1088,7 @@ const struct file_operations ubi_vol_cdev_operations = { .write = vol_cdev_write, .fsync = vol_cdev_fsync, .unlocked_ioctl = vol_cdev_ioctl, - .compat_ioctl = vol_cdev_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* UBI character device operations */ @@ -1126,13 +1096,13 @@ const struct file_operations ubi_cdev_operations 
= { .owner = THIS_MODULE, .llseek = no_llseek, .unlocked_ioctl = ubi_cdev_ioctl, - .compat_ioctl = ubi_cdev_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* UBI control character device operations */ const struct file_operations ubi_ctrl_cdev_operations = { .owner = THIS_MODULE, .unlocked_ioctl = ctrl_cdev_ioctl, - .compat_ioctl = ctrl_cdev_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = no_llseek, }; diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h index c40729b2c1844e2fd16c905e6c51426698ed6f80..7fad2f24dcad9938ae7b2b02a13f0bec549bd5bf 100644 --- a/drivers/net/ethernet/intel/e1000/e1000.h +++ b/drivers/net/ethernet/intel/e1000/e1000.h @@ -45,7 +45,6 @@ #define BAR_0 0 #define BAR_1 1 -#define BAR_5 5 #define INTEL_E1000_ETHERNET_DEVICE(device_id) {\ PCI_DEVICE(PCI_VENDOR_ID_INTEL, device_id)} diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index a2ee28e487a6fd20658a5e753f0d60bee78944f1..82a83de8384c1fe9eecf60cd57ded40d5acc6320 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -982,7 +982,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_ioremap; if (adapter->need_ioport) { - for (i = BAR_1; i <= BAR_5; i++) { + for (i = BAR_1; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; if (pci_resource_flags(pdev, i) & IORESOURCE_IO) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b67de06e5f1b5be646fa9ff2413da5348de4e112..c8a4038fc254cab6e0427f6773f69306cb84f6de 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16224,10 +16224,10 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) result = PCI_ERS_RESULT_DISCONNECT; } - err = pci_cleanup_aer_uncorrect_error_status(pdev); + err = pci_aer_clear_nonfatal_status(pdev); if (err) { dev_info(&pdev->dev, - "pci_cleanup_aer_uncorrect_error_status failed 0x%0x\n", + "pci_aer_clear_nonfatal_status failed 0x%0x\n", err); /* non-fatal, continue */ } diff --git a/drivers/net/ethernet/intel/i40e/kcompat.h b/drivers/net/ethernet/intel/i40e/kcompat.h index c3ee144db4e7a5ddd8506964bea8e7c0b9ab1166..0be8e017d1b8ce3a27bea3963821eec117c13cd9 100644 --- a/drivers/net/ethernet/intel/i40e/kcompat.h +++ b/drivers/net/ethernet/intel/i40e/kcompat.h @@ -2309,7 +2309,7 @@ static inline int pci_enable_pcie_error_reporting(struct pci_dev __always_unused return 0; } #define pci_disable_pcie_error_reporting(dev) do {} while (0) -#define pci_cleanup_aer_uncorrect_error_status(dev) do {} while (0) +#define pci_aer_clear_nonfatal_status(dev) do {} while (0) void *_kc_kmemdup(const void *src, size_t len, unsigned gfp); #define kmemdup(src, len, gfp) _kc_kmemdup(src, len, gfp) diff --git a/drivers/net/ethernet/intel/iavf/kcompat.h b/drivers/net/ethernet/intel/iavf/kcompat.h index b37a3f3ac77af99e44b9dd45d6ec451b4615a12f..8b988e9617cb76db98a2ea858eb5178805a6aaba 100644 --- a/drivers/net/ethernet/intel/iavf/kcompat.h +++ b/drivers/net/ethernet/intel/iavf/kcompat.h @@ -2294,7 +2294,7 @@ static inline int pci_enable_pcie_error_reporting(struct pci_dev __always_unused return 0; } #define pci_disable_pcie_error_reporting(dev) do {} while (0) -#define pci_cleanup_aer_uncorrect_error_status(dev) do {} while (0) +#define pci_aer_clear_nonfatal_status(dev) do {} while (0) void *_kc_kmemdup(const 
void *src, size_t len, unsigned gfp); #define kmemdup(src, len, gfp) _kc_kmemdup(src, len, gfp) diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index 9edde960b4f2b4b5c592780153f50cdc7d4c52b3..f45f573e671514c8d7484f82f3b11bd62187d51a 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -4,6 +4,9 @@ # # Makefile for the Intel(R) Ethernet Connection E800 Series Linux Driver # +ccflags-y += -I$(src) +subdir-ccflags-y += -I$(src) + obj-$(CONFIG_ICE) += ice.o @@ -13,9 +16,46 @@ ice-y := ice_main.o \ ice_nvm.o \ ice_switch.o \ ice_sched.o \ + ice_base.o \ ice_lib.o \ + ice_txrx_lib.o \ ice_txrx.o \ + ice_fltr.o \ + ice_pf_vsi_vlan_ops.o \ + ice_vsi_vlan_ops.o \ + ice_vsi_vlan_lib.o \ + ice_tc_lib.o \ + ice_fdir.o \ + ice_ethtool_fdir.o \ + ice_acl_main.o \ + ice_acl.o \ + ice_acl_ctrl.o \ + ice_vlan_mode.o \ ice_flex_pipe.o \ - ice_ethtool.o -ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o -ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_lib.o + ice_flow.o \ + ice_lag.o \ + ice_fwlog.o \ + ice_ethtool.o \ + kcompat.o + +ice-$(CONFIG_NET_DEVLINK:m=y) += ice_devlink.o ice_fw_update.o +ice-$(CONFIG_NET_DEVLINK:m=y) += ice_eswitch.o ice_repr.o +ice-$(CONFIG_MFD_CORE:m=y) += ice_idc.o +ice-$(CONFIG_DEBUG_FS) += ice_debugfs.o +ice-$(CONFIG_PCI_IOV) += ice_virtchnl_allowlist.o +ice-$(CONFIG_PCI_IOV) += ice_dcf.o +ice-$(CONFIG_PCI_IOV) += ice_virtchnl_fdir.o +ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o ice_vf_vsi_vlan_ops.o +ice-$(CONFIG_PTP_1588_CLOCK:m=y) += ice_ptp.o ice_ptp_hw.o +ice-$(CONFIG_PTP_1588_CLOCK:m=y) += ice_cgu_ops.o ice_cgu_util.o +ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o +ice-$(CONFIG_RFS_ACCEL) += ice_arfs.o +ice-$(CONFIG_XDP_SOCKETS) += ice_xsk.o +# Use kcompat pldmfw.c if kernel does not provide CONFIG_PLDMFW +ifndef CONFIG_PLDMFW +ice-y += kcompat_pldmfw.o +endif +# Use kcompat DIMLIB if kernel doesn't provide it +ifndef CONFIG_DIMLIB +ice-y += kcompat_dim.o kcompat_net_dim.o +endif diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 45e1006660493b1aaaf8043b8cf0954db86a1e12..204dc3ddfcc0966472086980789a01dc843a0481 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -1,9 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_H_ #define _ICE_H_ +#include "kcompat.h" #include #include #include @@ -16,9 +17,13 @@ #include #include #include +#ifdef HAVE_NETDEV_SB_DEV +#include +#endif /* HAVE_NETDEV_SB_DEV */ #include #include #include +#include #include #include #include @@ -29,35 +34,120 @@ #include #include #include +#include #include +#include #include -#include +#ifdef HAVE_XDP_SUPPORT +#include +#include +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#include +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ #include +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#include +#endif /* CONFIG_NET_DEVLINK */ +#if IS_ENABLED(CONFIG_DCB) +#include +#endif /* CONFIG_DCB */ +#ifdef HAVE_CONFIG_DIMLIB +#include +#else +#include "kcompat_dim.h" +#endif #include "ice_devids.h" #include "ice_type.h" #include "ice_txrx.h" #include "ice_dcb.h" #include "ice_switch.h" #include "ice_common.h" +#include "ice_flow.h" #include "ice_sched.h" +#include +#include +#include "ice_idc_int.h" +#include "virtchnl.h" #include "ice_virtchnl_pf.h" #include "ice_sriov.h" +#include "ice_ptp.h" +#include "ice_cgu.h" +#include "ice_cgu_ops.h" +#include "ice_cgu_util.h" +#include "ice_fdir.h" +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#include "ice_xsk.h" +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#ifdef HAVE_NETDEV_UPPER_INFO +#include "ice_lag.h" +#endif /* HAVE_NETDEV_UPPER_INFO */ +#include "ice_trace.h" + +#if defined(HAVE_VXLAN_RX_OFFLOAD) || defined(HAVE_VXLAN_TYPE) +#if IS_ENABLED(CONFIG_VXLAN) +#include +#endif +#endif /* HAVE_VXLAN_RX_OFFLOAD || HAVE_VXLAN_TYPE */ +#ifdef HAVE_GRE_ENCAP_OFFLOAD +#include +#endif /* HAVE_GRE_ENCAP_OFFLOAD */ +#if defined(HAVE_GENEVE_RX_OFFLOAD) || defined(HAVE_GENEVE_TYPE) +#if IS_ENABLED(CONFIG_GENEVE) +#include +#endif +#endif /* HAVE_GENEVE_RX_OFFLOAD || HAVE_GENEVE_TYPE */ +#ifdef HAVE_UDP_ENC_RX_OFFLOAD +#include +#endif +#ifdef NETIF_F_HW_TC +#include +#include +#include +#endif /* NETIF_F_HW_TC */ +#include +#include +#include +#include +#include "ice_arfs.h" +#include "ice_repr.h" +#include "ice_eswitch.h" +#include "ice_vsi_vlan_ops.h" extern const char ice_drv_ver[]; #define ICE_BAR0 0 +#define ICE_BAR3 3 +#ifdef CONFIG_DEBUG_FS +#define ICE_MAX_CSR_SPACE (8 * 1024 * 1024 - 64 * 1024) +#endif /* CONFIG_DEBUG_FS */ #define ICE_REQ_DESC_MULTIPLE 32 #define ICE_MIN_NUM_DESC 64 #define ICE_MAX_NUM_DESC 8160 #define ICE_DFLT_MIN_RX_DESC 512 -#define ICE_DFLT_NUM_TX_DESC 256 #define ICE_DFLT_NUM_RX_DESC 2048 +#define ICE_DFLT_NUM_TX_DESC 256 +#define ICE_DFLT_TXQ_VMDQ_VSI 1 +#define ICE_DFLT_RXQ_VMDQ_VSI 1 +#define ICE_DFLT_VEC_VMDQ_VSI 1 +#define ICE_MAX_NUM_VMDQ_VSI 16 +#define ICE_MAX_TXQ_VMDQ_VSI 4 +#define ICE_MAX_RXQ_VMDQ_VSI 4 +#ifdef HAVE_NETDEV_SB_DEV +#define ICE_MAX_MACVLANS 64 +#endif #define ICE_DFLT_TRAFFIC_CLASS BIT(0) #define ICE_INT_NAME_STR_LEN (IFNAMSIZ + 16) -#define ICE_AQ_LEN 64 +#define ICE_AQ_LEN 192 #define ICE_MBXSQ_LEN 64 -#define ICE_MBXRQ_LEN 512 -#define ICE_MIN_MSIX 2 +#define ICE_SBQ_LEN 64 +#define ICE_FDIR_MSIX 2 +#define ICE_ESWITCH_MSIX 1 +#define ICE_MIN_LAN_MSIX 1 +#define ICE_OICR_MSIX 1 +#define ICE_RDMA_NUM_AEQ_MSIX 4 +#define ICE_MIN_RDMA_MSIX 2 +#define ICE_MIN_MSIX (ICE_MIN_LAN_MSIX + ICE_OICR_MSIX) #define ICE_NO_VSI 0xffff #define ICE_VSI_MAP_CONTIG 0 #define ICE_VSI_MAP_SCATTER 1 @@ -66,20 +156,32 @@ extern const char ice_drv_ver[]; #define ICE_Q_WAIT_RETRY_LIMIT 10 #define ICE_Q_WAIT_MAX_RETRY (5 * ICE_Q_WAIT_RETRY_LIMIT) #define ICE_MAX_LG_RSS_QS 256 -#define ICE_MAX_SMALL_RSS_QS 8 +#define ICE_MAX_MEDIUM_RSS_QS 64 +#define ICE_MAX_SMALL_RSS_QS 16 #define 
ICE_RES_VALID_BIT 0x8000 #define ICE_RES_MISC_VEC_ID (ICE_RES_VALID_BIT - 1) +#define ICE_RES_RDMA_VEC_ID (ICE_RES_MISC_VEC_ID - 1) +/* All VF control VSIs share the same irq, so assign a unique ID for them */ +#define ICE_RES_VF_CTRL_VEC_ID (ICE_RES_RDMA_VEC_ID - 1) #define ICE_INVAL_Q_INDEX 0xffff #define ICE_INVAL_VFID 256 +#define ICE_MAX_RXQS_PER_TC 256 /* Used when setting VSI context per TC Rx queues */ +#define ICE_MAX_TXQS_PER_TC 8 +#define ICE_MAX_RDMA_QSET_PER_TC 1 + +#define ICE_CHNL_START_TC 1 +#define ICE_CHNL_MAX_TC 16 + #define ICE_MAX_RESET_WAIT 20 +#define ICE_VF_CHNL_START_TC 1 + #define ICE_VSIQF_HKEY_ARRAY_SIZE ((VSIQF_HKEY_MAX_INDEX + 1) * 4) #define ICE_DFLT_NETIF_M (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK) -#define ICE_MAX_MTU (ICE_AQ_SET_MAC_FRAME_SIZE_MAX - \ - (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2))) +#define ICE_MAX_MTU (ICE_AQ_SET_MAC_FRAME_SIZE_MAX - ICE_ETH_PKT_HDR_PAD) #define ICE_UP_TABLE_TRANSLATE(val, i) \ (((val) << ICE_AQ_VSI_UP_TABLE_UP##i##_S) & \ @@ -88,6 +190,21 @@ extern const char ice_drv_ver[]; #define ICE_TX_DESC(R, i) (&(((struct ice_tx_desc *)((R)->desc))[i])) #define ICE_RX_DESC(R, i) (&(((union ice_32b_rx_flex_desc *)((R)->desc))[i])) #define ICE_TX_CTX_DESC(R, i) (&(((struct ice_tx_ctx_desc *)((R)->desc))[i])) +#define ICE_TX_FDIRDESC(R, i) (&(((struct ice_fltr_desc *)((R)->desc))[i])) + +#define ICE_ACL_ENTIRE_SLICE 1 +#define ICE_ACL_HALF_SLICE 2 + +/* Minimum BW limit is 500 Kbps for any scheduler node */ +#define ICE_MIN_BW_LIMIT 500 +/* User can specify BW in either Kbit/Mbit/Gbit and OS converts it in bytes. + * use it to convert user specified BW limit into Kbps + */ +#define ICE_BW_KBPS_DIVISOR 125 + +#if defined(HAVE_TC_FLOWER_ENC) && defined(HAVE_TC_INDIR_BLOCK) +#define ICE_GTP_TNL_WELLKNOWN_PORT 2152 +#endif /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ /* Macro for each VSI in a PF */ #define ice_for_each_vsi(pf, i) \ @@ -110,13 +227,13 @@ extern const char ice_drv_ver[]; #define ice_for_each_q_vector(vsi, i) \ for ((i) = 0; (i) < (vsi)->num_q_vectors; (i)++) -#define ICE_UCAST_PROMISC_BITS (ICE_PROMISC_UCAST_TX | ICE_PROMISC_MCAST_TX | \ - ICE_PROMISC_UCAST_RX | ICE_PROMISC_MCAST_RX) +#define ice_for_each_chnl_tc(i) \ + for ((i) = ICE_CHNL_START_TC; (i) < ICE_CHNL_MAX_TC; (i)++) + +#define ICE_UCAST_PROMISC_BITS (ICE_PROMISC_UCAST_TX | ICE_PROMISC_UCAST_RX) #define ICE_UCAST_VLAN_PROMISC_BITS (ICE_PROMISC_UCAST_TX | \ - ICE_PROMISC_MCAST_TX | \ ICE_PROMISC_UCAST_RX | \ - ICE_PROMISC_MCAST_RX | \ ICE_PROMISC_VLAN_TX | \ ICE_PROMISC_VLAN_RX) @@ -127,6 +244,72 @@ extern const char ice_drv_ver[]; ICE_PROMISC_VLAN_TX | \ ICE_PROMISC_VLAN_RX) +#define ice_pf_to_dev(pf) (&((pf)->pdev->dev)) + +struct ice_fwlog_user_input { + unsigned long events; + u8 log_level; +}; + +enum ice_feature { + ICE_F_DSCP, + ICE_F_PTP_EXTTS, + ICE_F_MAX +}; + + +enum ice_channel_fltr_type { + ICE_CHNL_FLTR_TYPE_INVALID, + ICE_CHNL_FLTR_TYPE_SRC_PORT, + ICE_CHNL_FLTR_TYPE_DEST_PORT, + ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT, /* for future use cases */ + ICE_CHNL_FLTR_TYPE_TENANT_ID, + ICE_CHNL_FLTR_TYPE_SRC_IPV4, + ICE_CHNL_FLTR_TYPE_DEST_IPV4, + ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4, + ICE_CHNL_FLTR_TYPE_SRC_IPV6, + ICE_CHNL_FLTR_TYPE_DEST_IPV6, + ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6, + ICE_CHNL_FLTR_TYPE_LAST /* must be last */ +}; + +struct ice_channel { + struct list_head list; + u8 type; + u8 ch_type; /* NVMe over TCP, AF_XDP, UDP based, etc.. 
*/ + u16 sw_id; + u16 base_q; + u16 num_rxq; + u16 num_txq; + u16 vsi_num; + u8 ena_tc; + struct ice_aqc_vsi_props info; + u64 max_tx_rate; + u64 min_tx_rate; + atomic_t num_sb_fltr; + /* counter index when side-band FD is used */ + u32 fd_cnt_index; + /* queue used to setup inline-FD */ + atomic_t fd_queue; + /* packets services thru' inline-FD filter */ + u64 fd_pkt_cnt; + enum ice_channel_fltr_type fltr_type; + struct ice_vsi *ch_vsi; +}; + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +/* To convert BPS BW parameter into Mbps*/ +#define ICE_BW_MBIT_PS_DIVISOR 125000 /* rate / (1000000 / 8) Mbps */ +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +struct ice_txq_meta { + u32 q_teid; /* Tx-scheduler element identifier */ + u16 q_id; /* Entry in VSI's txq_map bitmap */ + u16 q_handle; /* Relative index of Tx queue within TC */ + u16 vsi_idx; /* VSI index that Tx queue belongs to */ + u8 tc; /* TC number that Tx queue belongs to */ +}; + struct ice_tc_info { u16 qoffset; u16 qcount_tx; @@ -136,14 +319,14 @@ struct ice_tc_info { struct ice_tc_cfg { u8 numtc; /* Total number of enabled TCs */ - u8 ena_tc; /* Tx map */ + u16 ena_tc; /* Tx map */ struct ice_tc_info tc_info[ICE_MAX_TRAFFIC_CLASS]; }; struct ice_res_tracker { u16 num_entries; u16 end; - u16 list[1]; + u16 list[]; }; struct ice_qs_cfg { @@ -161,50 +344,100 @@ struct ice_sw { struct ice_pf *pf; u16 sw_id; /* switch ID for this switch */ u16 bridge_mode; /* VEB/VEPA/Port Virtualizer */ + struct ice_vsi *dflt_vsi; /* default VSI for this switch */ + u8 dflt_vsi_ena:1; /* true if above dflt_vsi is enabled */ }; -enum ice_state { - __ICE_TESTING, - __ICE_DOWN, - __ICE_NEEDS_RESTART, - __ICE_PREPARED_FOR_RESET, /* set by driver when prepared */ - __ICE_RESET_OICR_RECV, /* set by driver after rcv reset OICR */ - __ICE_PFR_REQ, /* set by driver and peers */ - __ICE_CORER_REQ, /* set by driver and peers */ - __ICE_GLOBR_REQ, /* set by driver and peers */ - __ICE_CORER_RECV, /* set by OICR handler */ - __ICE_GLOBR_RECV, /* set by OICR handler */ - __ICE_EMPR_RECV, /* set by OICR handler */ - __ICE_SUSPENDED, /* set on module remove path */ - __ICE_RESET_FAILED, /* set by reset/rebuild */ +enum ice_pf_state { + ICE_TESTING, + ICE_DOWN, + ICE_NEEDS_RESTART, + ICE_PREPARED_FOR_RESET, /* set by driver when prepared */ + ICE_RESET_OICR_RECV, /* set by driver after rcv reset OICR */ + ICE_PFR_REQ, /* set by driver and peers */ + ICE_CORER_REQ, /* set by driver and peers */ + ICE_GLOBR_REQ, /* set by driver and peers */ + ICE_CORER_RECV, /* set by OICR handler */ + ICE_GLOBR_RECV, /* set by OICR handler */ + ICE_EMPR_RECV, /* set by OICR handler */ + ICE_SUSPENDED, /* set on module remove path */ + ICE_RESET_FAILED, /* set by reset/rebuild */ + ICE_RECOVERY_MODE, /* set when recovery mode is detected */ + ICE_PREPPED_RECOVERY_MODE, /* set on recovery mode transition */ /* When checking for the PF to be in a nominal operating state, the * bits that are grouped at the beginning of the list need to be - * checked. Bits occurring before __ICE_STATE_NOMINAL_CHECK_BITS will + * checked. Bits occurring before ICE_STATE_NOMINAL_CHECK_BITS will * be checked. If you need to add a bit into consideration for nominal * operating state, it must be added before - * __ICE_STATE_NOMINAL_CHECK_BITS. Do not move this entry's position + * ICE_STATE_NOMINAL_CHECK_BITS. Do not move this entry's position * without appropriate consideration. 
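/* (Sketch of how a nominal-state check can use this marker, assuming the
 * driver's helper follows this shape:
 *
 *	DECLARE_BITMAP(check_bits, ICE_STATE_NBITS) = { 0 };
 *
 *	bitmap_set(check_bits, 0, ICE_STATE_NOMINAL_CHECK_BITS);
 *	return !bitmap_intersects(pf->state, check_bits, ICE_STATE_NBITS);
 * )
 */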
- __ICE_STATE_NOMINAL_CHECK_BITS, - __ICE_ADMINQ_EVENT_PENDING, - __ICE_MAILBOXQ_EVENT_PENDING, - __ICE_MDD_EVENT_PENDING, - __ICE_VFLR_EVENT_PENDING, - __ICE_FLTR_OVERFLOW_PROMISC, - __ICE_VF_DIS, - __ICE_CFG_BUSY, - __ICE_SERVICE_SCHED, - __ICE_SERVICE_DIS, - __ICE_OICR_INTR_DIS, /* Global OICR interrupt disabled */ - __ICE_STATE_NBITS /* must be last */ + ICE_STATE_NOMINAL_CHECK_BITS, + ICE_ADMINQ_EVENT_PENDING, + ICE_MAILBOXQ_EVENT_PENDING, + ICE_SIDEBANDQ_EVENT_PENDING, + ICE_MDD_EVENT_PENDING, + ICE_VFLR_EVENT_PENDING, + ICE_FLTR_OVERFLOW_PROMISC, + ICE_VF_DIS, + ICE_CFG_BUSY, + ICE_SERVICE_SCHED, + ICE_PTP_TX_TS_READY, + ICE_PTP_EXT_TS_READY, + ICE_SERVICE_DIS, + ICE_FD_FLUSH_REQ, + ICE_OICR_INTR_DIS, /* Global OICR interrupt disabled */ + ICE_BAD_EEPROM, + ICE_MDD_VF_PRINT_PENDING, /* set while an MDD event is handled */ + ICE_VF_RESETS_DISABLED, /* disable resets during ice_remove */ + ICE_LINK_DEFAULT_OVERRIDE_PENDING, + ICE_PHY_INIT_COMPLETE, + ICE_FD_VF_FLUSH_CTX, /* set at FD Rx IRQ or timeout */ + ICE_STATE_NBITS /* must be last */ +}; + +enum ice_vsi_state { + ICE_VSI_DOWN, + ICE_VSI_NEEDS_RESTART, + ICE_VSI_NETDEV_ALLOCD, + ICE_VSI_NETDEV_REGISTERED, + ICE_VSI_UMAC_FLTR_CHANGED, + ICE_VSI_MMAC_FLTR_CHANGED, + ICE_VSI_VLAN_FLTR_CHANGED, + ICE_VSI_PROMISC_CHANGED, + ICE_VSI_STATE_NBITS /* must be last */ +}; + +enum ice_chnl_feature { + ICE_CHNL_FEATURE_FD_ENA, /* for side-band flow-director */ + ICE_CHNL_FEATURE_INLINE_FD_ENA, /* for inline flow-director */ + /* using the SO_MARK socket option will trigger skb->mark to be set. + * Whether the driver should act on skb->mark (to align the flow to its + * HW queue binding) is additionally controlled via an ethtool private + * flag; when that private flag is turned ON/OFF, this feature flag is + * set/reset. It determines whether the driver should act when + * skb->mark is set.
+ */ + ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, + /* for pkt based inspection optimization - related to SW triggered + * interrupt from napi_poll for channel enabled vector + */ + ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, + /* when set, allows cleaning of Rx queue(s) when napi_poll is invoked + * due to busy_poll_stop + */ + ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + ICE_CHNL_FEATURE_NBITS /* must be last */ }; -enum ice_vsi_flags { - ICE_VSI_FLAG_UMAC_FLTR_CHANGED, - ICE_VSI_FLAG_MMAC_FLTR_CHANGED, - ICE_VSI_FLAG_VLAN_FLTR_CHANGED, - ICE_VSI_FLAG_PROMISC_CHANGED, - ICE_VSI_FLAG_NBITS /* must be last */ +/* This is to be used only when channels are configured, to track state + * at PF level, whether it should use RSS or inline flow-director and this + * state gets set/reset appropriately as the HW flow-director table becomes + * full/not-full + */ +enum ice_advanced_state_t { + ICE_SWITCH_TO_RSS, + ICE_ADVANCED_STATE_LAST, /* this must be last */ }; /* struct that defines a VSI, associated with a dev */ @@ -215,20 +448,26 @@ struct ice_vsi { struct ice_port_info *port_info; /* back pointer to port_info */ struct ice_ring **rx_rings; /* Rx ring array */ struct ice_ring **tx_rings; /* Tx ring array */ +#ifdef HAVE_NETDEV_SB_DEV + /* Initial VSI tx_rings array when L2 offload is off */ + struct ice_ring **base_tx_rings; +#endif /* HAVE_NETDEV_SB_DEV */ struct ice_q_vector **q_vectors; /* q_vector array */ irqreturn_t (*irq_handler)(int irq, void *data); u64 tx_linearize; - DECLARE_BITMAP(state, __ICE_STATE_NBITS); - DECLARE_BITMAP(flags, ICE_VSI_FLAG_NBITS); + DECLARE_BITMAP(state, ICE_VSI_STATE_NBITS); unsigned int current_netdev_flags; u32 tx_restart; u32 tx_busy; u32 rx_buf_failed; u32 rx_page_failed; - int num_q_vectors; - int base_vector; /* IRQ base for OS reserved vectors */ +#ifdef ICE_ADD_PROBES + u32 rx_page_reuse; +#endif /* ICE_ADD_PROBES */ + u16 num_q_vectors; + u16 base_vector; /* IRQ base for OS reserved vectors */ enum ice_vsi_type type; u16 vsi_num; /* HW (absolute) index of this VSI */ u16 idx; /* software index in pf->vsi[] */ @@ -236,6 +475,10 @@ struct ice_vsi { s16 vf_id; /* VF ID for SR-IOV VSIs */ u16 ethtype; /* Ethernet protocol for pause frame */ + u16 num_gfltr; + u16 num_bfltr; + u16 cntr_gfltr; + u16 cntr_bfltr; /* RSS config */ u16 rss_table_size; /* HW RSS table size */ @@ -244,6 +487,15 @@ struct ice_vsi { u8 *rss_lut_user; /* User configured lookup table entries */ u8 rss_lut_type; /* used to configure Get/Set RSS LUT AQ call */ + /* aRFS members only allocated for the PF VSI */ +#define ICE_MAX_RFS_FILTERS 0xFFFF +#define ICE_MAX_ARFS_LIST 1024 +#define ICE_ARFS_LST_MASK (ICE_MAX_ARFS_LIST - 1) + struct hlist_head *arfs_fltr_list; + struct ice_arfs_active_fltr_cntrs *arfs_fltr_cntrs; + spinlock_t arfs_lock; /* protects aRFS hash table and filter state */ + atomic_t *arfs_last_fltr_id; + u16 max_frame; u16 rx_buf_len; @@ -260,7 +512,10 @@ struct ice_vsi { u8 irqs_ready:1; u8 current_isup:1; /* Sync 'link up' logging */ u8 stat_offsets_loaded:1; - u8 vlan_ena:1; + struct ice_vsi_vlan_ops inner_vlan_ops; + struct ice_vsi_vlan_ops outer_vlan_ops; + u16 num_vlan; + /* queue information */ u8 tx_mapping_mode; /* ICE_MAP_MODE_[CONTIG|SCATTER] */ @@ -271,11 +526,147 @@ struct ice_vsi { u16 num_txq; /* Used Tx queues */ u16 alloc_rxq; /* Allocated Rx queues */ u16 num_rxq; /* Used Rx queues */ + u16 req_txq; /* User requested Tx queues */ + u16 req_rxq; /* User requested Rx queues */ u16 num_rx_desc; u16 num_tx_desc; + u16 qset_handle[ICE_MAX_TRAFFIC_CLASS]; struct 
ice_tc_cfg tc_cfg; +#ifdef HAVE_XDP_SUPPORT + struct bpf_prog *xdp_prog; + struct ice_ring **xdp_rings; /* XDP ring array */ + u16 num_xdp_txq; /* Used XDP queues */ + u8 xdp_mapping_mode; /* ICE_MAP_MODE_[CONTIG|SCATTER] */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifndef HAVE_AF_XDP_NETDEV_UMEM + struct xdp_umem **xsk_umems; + u16 num_xsk_umems_used; + u16 num_xsk_umems; +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + struct tc_mqprio_qopt_offload mqprio_qopt;/* queue parameters */ +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + DECLARE_BITMAP(ptp_tx_idx, INDEX_PER_QUAD); + struct sk_buff *ptp_tx_skb[INDEX_PER_QUAD]; + u32 tx_hwtstamp_skipped; + u8 ptp_tx:1; + + /* Channel Specific Fields */ + struct ice_vsi *tc_map_vsi[ICE_CHNL_MAX_TC]; + u16 cnt_q_avail; + u16 next_base_q; /* next queue to be used for channel setup */ + struct list_head ch_list; + u16 num_chnl_rxq; + u16 num_chnl_txq; + u16 ch_rss_size; + u16 num_chnl_fltr; + /* store away rss size info before configuring ADQ channels so that, + * it can be used after tc-qdisc delete, to get back RSS setting as + * they were before + */ + u16 orig_rss_size; + u8 vf_adq_tc; /* traffic class number for VF ADQ VSI */ + /* track various feature bits for channel VSI */ + DECLARE_BITMAP(features, ICE_CHNL_FEATURE_NBITS); +#define ICE_TBL_FULL_TIMES 5 + /* how many times transitioned into inline flow-director from RSS */ + u64 cnt_inline_fd_transition; + /* how many times HW table is flushed */ + u64 cnt_table_flushed; + /* keeps track, how many times SW detected that HW table remain full + * once SW state is SWITCHED_TO_RSS + */ + int cnt_tbl_full; + + /* inline_fd_active_cnt is SW based counter which keeps track of active + * inline-FD filter entries in table + */ + atomic_t inline_fd_active_cnt; + DECLARE_BITMAP(adv_state, ICE_ADVANCED_STATE_LAST); + + /* this keeps tracks of all enabled TC with and without DCB + * and inclusive of ADQ, vsi->mqprio_opt keeps track of queue + * information + */ + u8 all_numtc; + u16 all_enatc; + + /* store away TC info, to be used for rebuild logic */ + u8 old_numtc; + u16 old_ena_tc; + + struct ice_channel *ch; + struct net_device **target_netdevs; + + /* setup back reference, to which aggregator node this VSI + * corresponds to + */ + struct ice_agg_node *agg_node; + u16 *global_lut_id; } ____cacheline_internodealigned_in_smp; +enum ice_chnl_vector_state { + ICE_CHNL_VECTOR_IN_BP, + ICE_CHNL_VECTOR_PREV_IN_BP, + ICE_CHNL_VECTOR_ONCE_IN_BP, + ICE_CHNL_VECTOR_PREV_DATA_PKT_RECV, + ICE_CHNL_VECTOR_WD_EQUALS_BP, + ICE_CHNL_VECTOR_NBITS, /* This must be last */ +}; + +#ifdef ADQ_PERF_COUNTERS +struct ice_q_vector_ch_stats { + /* following are used as part of managing driver internal + * state machine. Only to be used for perf debugging and + * it is controlled by module_param : debug_mask + */ + u64 in_bp; + u64 in_int; + u64 real_int_to_bp; + u64 real_bp_to_int; + u64 real_int_to_int; + u64 real_bp_to_bp; + + /* These counter is used to track real transition of vector from + * BUSY_POLL to INTERRUPT based on enhanced logic (using state + * machine and control packets). 
+ */ + u64 unlikely_cb_to_bp; + /* This is used to keep track of enabling the interrupt from napi_poll + * when the state machine condition indicates once_in_bp is false + */ + u64 once_bp_false; + u64 num_need_resched_bp_stop; + u64 num_timeout_bp_stop; + u64 num_l_c_data_pkt; + u64 num_l_c_data_pkt1; + u64 num_sw_intr_timeout; /* track SW INTR from napi_poll */ + u64 num_sw_intr_serv_task; /* track SW INTR from service_task */ + u64 cleaned_any_data_pkt; + /* Tracking "unlikely_cb_bp and once_in_bp is true" */ + u64 ucb_o_bp; + /* This keeps track of how many times we bail out when once_in_bp is + * set, unlikely_cb_to_bp is set, but pkt based interrupt optimization + * is OFF + */ + u64 num_no_sw_intr_opt_off; + /* tracks how many times WB_ON_ITR is set */ + u64 num_wb_on_itr_set; + + u64 pkt_bp_stop_napi_budget; + u64 pkt_bp_stop_bp_budget; + + u64 bp_wd_equals_budget64; + u64 bp_wd_equals_budget8; + + u64 keep_state_bp_budget64; + u64 keep_state_bp_budget8; +}; +#endif /* ADQ_PERF_COUNTERS */ + /* struct that defines an interrupt vector */ struct ice_q_vector { struct ice_vsi *vsi; @@ -284,7 +675,7 @@ struct ice_q_vector { u16 reg_idx; u8 num_ring_rx; /* total number of Rx rings in vector */ u8 num_ring_tx; /* total number of Tx rings in vector */ - u8 itr_countdown; /* when 0 should adjust adaptive ITR */ + u8 wb_on_itr:1; /* if true, WB on ITR is enabled */ /* in usecs, need to use ice_intrl_to_usecs_reg() before writing this * value to the device */ @@ -298,27 +689,185 @@ struct ice_q_vector { cpumask_t affinity_mask; struct irq_affinity_notify affinity_notify; + struct ice_channel *ch; + char name[ICE_INT_NAME_STR_LEN]; + + u16 total_events; /* net_dim(): number of interrupts processed */ + /* This tracks the current state of the vector, BUSY_POLL or INTR */ +#define ICE_CHNL_IN_BP BIT(ICE_CHNL_VECTOR_IN_BP) + /* This tracks the previous state of the vector, BUSY_POLL or INTR */ +#define ICE_CHNL_PREV_IN_BP BIT(ICE_CHNL_VECTOR_PREV_IN_BP) + /* This tracks whether the vector was ever in BUSY_POLL. This + * state goes to INTR if interrupts are enabled or SW interrupts + * are triggered from either service_task or napi_poll + */ +#define ICE_CHNL_ONCE_IN_BP BIT(ICE_CHNL_VECTOR_ONCE_IN_BP) + /* Tracks whether any data packets were previously received + * on this per-channel enabled vector + */ +#define ICE_CHNL_PREV_DATA_PKT_RECV BIT(ICE_CHNL_VECTOR_PREV_DATA_PKT_RECV) + /* tracks whether the number of Rx packets processed equals the budget. + * It is set from napi_poll and used from ice_refresh_bp_state + * to determine if the internal state of the vector should be kept in + * BUSY_POLL or not + */ +#define ICE_CHNL_WD_EQUALS_BP BIT(ICE_CHNL_VECTOR_WD_EQUALS_BP) + /* keeps track of the various states defined above; those states are + * used during ADQ performance optimization + */ + u8 state_flags; + /* Used in logic to determine if a SW interrupt is needed. + * This is used only for channel-enabled vectors + */ + u64 jiffy; + /* Primarily used in decision making w.r.t using inline flow-director */ + atomic_t inline_fd_cnt; + + /* This is applicable only for ADQ-enabled vectors and is used to avoid + * the OS triggering ksoftirqd. + * + * Usually busy_poll_stop is followed by napi_schedule:napi_poll if the + * driver returned "budget" while processing packets during + * busy_poll_stop. As long as the driver continues to return "budget", + * the OS keeps calling napi_schedule up to 10 times or for 2 msec and + * then arms ksoftirqd.
+	 *
+	 * As part of ADQ performance optimization, it is not preferable to
+	 * let ksoftirqd run when there have been enough packets processed.
+	 * To facilitate fairness to the consumer of those packets,
+	 * stop processing Rx queues after 8 iterations.
+	 */
+	/* the following variable keeps track of how many times Rx queues were
+	 * processed when napi_poll is invoked through napi_schedule (as a
+	 * result of returning "budget" from busy_poll_stop:napi_poll) and
+	 * work_done == budget.
+	 */
+	u8 process_rx_queues;
+
+	/* the following is controlled through a priv-flag; the value of
+	 * "max_limit_process_rx_queues" becomes 8 when the priv-flag is set,
+	 * otherwise it is set to 4 (default)
+	 */
+#define ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT	4
+#define ICE_MAX_LIMIT_PROCESS_RX_PKTS		8
+	u8 max_limit_process_rx_queues;
+
+#ifdef ADQ_PERF_COUNTERS
+	struct ice_q_vector_ch_stats ch_stats;
+#endif /* ADQ_PERF_COUNTERS */
 } ____cacheline_internodealigned_in_smp;
 
 enum ice_pf_flags {
 	ICE_FLAG_FLTR_SYNC,
+	ICE_FLAG_VMDQ_ENA,
+#ifdef HAVE_NETDEV_SB_DEV
+	ICE_FLAG_MACVLAN_ENA,
+#endif /* HAVE_NETDEV_SB_DEV */
+	ICE_FLAG_IWARP_ENA,
 	ICE_FLAG_RSS_ENA,
 	ICE_FLAG_SRIOV_ENA,
 	ICE_FLAG_SRIOV_CAPABLE,
 	ICE_FLAG_DCB_CAPABLE,
 	ICE_FLAG_DCB_ENA,
+	ICE_FLAG_FD_ENA,
+	ICE_FLAG_PTP_ENA, /* NVM PTP support */
+	ICE_FLAG_PTP, /* PTP successfully initialized */
+	ICE_FLAG_PEER_ENA,
 	ICE_FLAG_ADV_FEATURES,
+#ifdef NETIF_F_HW_TC
+	ICE_FLAG_TC_MQPRIO, /* support for Multi queue TC */
+	ICE_FLAG_CLS_FLOWER, /* support cls flower filters */
+#endif /* NETIF_F_HW_TC */
 	ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA,
+	ICE_FLAG_TOTAL_PORT_SHUTDOWN_ENA,
 	ICE_FLAG_NO_MEDIA,
+#ifndef ETHTOOL_GFECPARAM
+	ICE_FLAG_RS_FEC,
+	ICE_FLAG_BASE_R_FEC,
+#endif /* !ETHTOOL_GFECPARAM */
 	ICE_FLAG_FW_LLDP_AGENT,
+	ICE_FLAG_CHNL_INLINE_FD_ENA,
+	ICE_FLAG_CHNL_INLINE_FD_MARK_ENA,
+	ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA,
+	ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA,
+	ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_CFG,
+	ICE_FLAG_MOD_POWER_UNSUPPORTED,
 	ICE_FLAG_ETHTOOL_CTXT, /* set when ethtool holds RTNL lock */
+	ICE_FLAG_LEGACY_RX,
+	ICE_FLAG_VF_TRUE_PROMISC_ENA,
+	ICE_FLAG_MDD_AUTO_RESET_VF,
+	ICE_FLAG_VF_VLAN_PRUNE_DIS,
+	ICE_FLAG_LINK_LENIENT_MODE_ENA,
+	ICE_FLAG_ESWITCH_CAPABLE,
 	ICE_PF_FLAGS_NBITS /* must be last */
 };
 
+#ifdef HAVE_NETDEV_SB_DEV
+struct ice_macvlan {
+	struct list_head list;
+	int id;
+	struct net_device *vdev;
+	struct ice_vsi *parent_vsi;
+	struct ice_vsi *vsi;
+	u8 mac[ETH_ALEN];
+};
+#endif /* HAVE_NETDEV_SB_DEV */
+
+struct ice_switchdev_info {
+	struct ice_vsi *control_vsi;
+	struct ice_vsi *uplink_vsi;
+	bool is_running;
+};
+
+enum ice_tnl_state {
+	ICE_TNL_SET_TO_ADD,
+	ICE_TNL_ACTIVE,
+	ICE_TNL_SET_TO_DEL,
+	ICE_TNL_LAST = 0xFF, /* must be last */
+};
+
+struct ice_tnl_entry {
+	enum ice_tunnel_type type;
+	u16 port;
+#define ICE_TNL_STATE_TO_ADD	BIT(ICE_TNL_SET_TO_ADD)
+#define ICE_TNL_STATE_ACTIVE	BIT(ICE_TNL_ACTIVE)
+#define ICE_TNL_STATE_TO_DEL	BIT(ICE_TNL_SET_TO_DEL)
+	u8 state;
+	u8 ref_cnt;
+	struct list_head node;
+};
+
+struct ice_agg_node {
+	u32 agg_id;
+#define ICE_MAX_VSIS_IN_AGG_NODE	64
+	u32 num_vsis;
+	u8 valid;
+};
+
+enum ice_flash_update_preservation {
+	/* Preserve all settings and fields */
+	ICE_FLASH_UPDATE_PRESERVE_ALL = 0,
+	/* Preserve limited fields, such as VPD, PCI serial ID, MACs, etc */
+	ICE_FLASH_UPDATE_PRESERVE_LIMITED,
+	/* Return all fields to factory settings */
+	ICE_FLASH_UPDATE_PRESERVE_FACTORY_SETTINGS,
+	/* Do not perform any preservation */
+	ICE_FLASH_UPDATE_PRESERVE_NONE,
+};
+
 struct ice_pf {
 	struct pci_dev *pdev;
+#if
IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_REGIONS + struct devlink_region *nvm_region; + struct devlink_region *devcaps_region; +#endif /* HAVE_DEVLINK_REGIONS */ + /* devlink port data */ + struct devlink_port devlink_port; +#endif /* CONFIG_NET_DEVLINK */ + /* OS reserved IRQ details */ struct msix_entry *msix_entries; struct ice_res_tracker *irq_tracker; @@ -328,15 +877,25 @@ struct ice_pf { */ u16 sriov_base_vector; + u16 ctrl_vsi_idx; /* control VSI index in pf->vsi array */ + struct ice_vsi **vsi; /* VSIs created by the driver */ struct ice_sw *first_sw; /* first switch created by firmware */ + u16 eswitch_mode; /* current mode of eswitch */ +#ifdef CONFIG_DEBUG_FS + struct dentry *ice_debugfs_pf; +#endif /* CONFIG_DEBUG_FS */ /* Virtchnl/SR-IOV config info */ struct ice_vf *vf; - int num_alloc_vfs; /* actual number of VFs allocated */ + u16 num_alloc_vfs; /* actual number of VFs allocated */ u16 num_vfs_supported; /* num VFs supported for this PF */ - u16 num_vf_qps; /* num queue pairs per VF */ - u16 num_vf_msix; /* num vectors per VF */ - DECLARE_BITMAP(state, __ICE_STATE_NBITS); + u16 num_qps_per_vf; + u16 num_msix_per_vf; + /* used to ratelimit the MDD event logging */ + unsigned long last_printed_mdd_jiffies; + DECLARE_BITMAP(malvfs, ICE_MAX_VF_COUNT); + DECLARE_BITMAP(features, ICE_F_MAX); + DECLARE_BITMAP(state, ICE_STATE_NBITS); DECLARE_BITMAP(flags, ICE_PF_FLAGS_NBITS); unsigned long *avail_txqs; /* bitmap to track PF Tx queue usage */ unsigned long *avail_rxqs; /* bitmap to track PF Rx queue usage */ @@ -346,13 +905,34 @@ struct ice_pf { struct work_struct serv_task; struct mutex avail_q_mutex; /* protects access to avail_[rx|tx]qs */ struct mutex sw_mutex; /* lock for protecting VSI alloc flow */ + struct mutex tc_mutex; /* lock to protect TC changes */ u32 msg_enable; + struct ice_ptp ptp; + struct ice_cgu_info cgu_info; + u16 num_rdma_msix; /* Total MSIX vectors for RDMA driver */ + u16 rdma_base_vector; + struct ice_peer_obj *rdma_peer; +#ifdef HAVE_NETDEV_SB_DEV + /* MACVLAN specific variables */ + DECLARE_BITMAP(avail_macvlan, ICE_MAX_MACVLANS); + struct list_head macvlan_list; + u16 num_macvlan; + u16 max_num_macvlan; +#endif /* HAVE_NETDEV_SB_DEV */ + + /* spinlock to protect the AdminQ wait list */ + spinlock_t aq_wait_lock; + struct hlist_head aq_wait_list; + wait_queue_head_t aq_wait_queue; + + wait_queue_head_t reset_wait_queue; + u32 hw_csum_rx_error; - u32 oicr_idx; /* Other interrupt cause MSIX vector index */ - u32 num_avail_sw_msix; /* remaining MSIX SW vectors left unclaimed */ + u16 oicr_idx; /* Other interrupt cause MSIX vector index */ + u16 num_avail_sw_msix; /* remaining MSIX SW vectors left unclaimed */ u16 max_pf_txqs; /* Total Tx queues PF wide */ u16 max_pf_rxqs; /* Total Rx queues PF wide */ - u32 num_lan_msix; /* Total MSIX vectors for base driver */ + u16 num_lan_msix; /* Total MSIX vectors for base driver */ u16 num_lan_tx; /* num LAN Tx queues setup */ u16 num_lan_rx; /* num LAN Rx queues setup */ u16 next_vsi; /* Next free slot in pf->vsi[] - 0-based! 
 */
@@ -362,24 +942,242 @@ struct ice_pf {
 	u16 empr_count;		/* EMP reset count */
 	u16 pfr_count;		/* PF reset count */
 
+	u8 stat_prev_loaded : 1; /* has previous stats been loaded */
+	u8 wol_ena : 1;		/* software state of WoL */
+	u32 wakeup_reason;	/* last wakeup reason */
 	struct ice_hw_port_stats stats;
 	struct ice_hw_port_stats stats_prev;
 	struct ice_hw hw;
-	u8 stat_prev_loaded:1; /* has previous stats been loaded */
-#ifdef CONFIG_DCB
+#ifdef ICE_ADD_PROBES
+	u64 tcp_segs;
+	u64 udp_segs;
+	u64 tx_tcp_cso;
+	u64 tx_udp_cso;
+	u64 tx_sctp_cso;
+	u64 tx_ip4_cso;
+	u64 tx_l3_cso_err;
+	u64 tx_l4_cso_err;
+	u64 rx_tcp_cso;
+	u64 rx_udp_cso;
+	u64 rx_sctp_cso;
+	u64 rx_ip4_cso;
+	u64 rx_ip4_cso_err;
+	u64 rx_tcp_cso_err;
+	u64 rx_udp_cso_err;
+	u64 rx_sctp_cso_err;
+	u64 tx_q_vlano;
+	u64 rx_q_vlano;
+	u64 tx_ad_vlano;
+	u64 rx_ad_vlano;
+#endif
 	u16 dcbx_cap;
-#endif /* CONFIG_DCB */
 	u32 tx_timeout_count;
 	unsigned long tx_timeout_last_recovery;
 	u32 tx_timeout_recovery_level;
 	char int_name[ICE_INT_NAME_STR_LEN];
+	struct ice_peer_obj_int **peers;
+	int peer_idx;
 	u32 sw_int_count;
+#ifdef HAVE_TC_SETUP_CLSFLOWER
+	/* count of tc_flower filters specific to channel (aka where filter
+	 * action is "hw_tc ")
+	 */
+	u16 num_dmac_chnl_fltrs;
+	struct hlist_head tc_flower_fltr_list;
+#endif /* HAVE_TC_SETUP_CLSFLOWER */
+
+	struct ice_dcf dcf;
+	__le64 nvm_phy_type_lo; /* NVM PHY type low */
+	__le64 nvm_phy_type_hi; /* NVM PHY type high */
+	struct ice_link_default_override_tlv link_dflt_override;
+	u64 supported_rxdids; /* bitmap for supported RXDID */
+#ifdef HAVE_NETDEV_UPPER_INFO
+	struct ice_lag *lag; /* Link Aggregation information */
+#endif /* HAVE_NETDEV_UPPER_INFO */
+
+	/* protects accesses to the tunnel list; it is taken from
+	 * ice_udp_tunnel_add/del as well as from service_task
+	 */
+	spinlock_t tnl_lock;
+	struct list_head tnl_list;
+	struct ice_switchdev_info switchdev;
+
+#define ICE_INVALID_AGG_NODE_ID		0
+#define ICE_PF_AGG_NODE_ID_START	1
+#define ICE_MAX_PF_AGG_NODES		32
+	struct ice_agg_node pf_agg_node[ICE_MAX_PF_AGG_NODES];
+#ifdef HAVE_NETDEV_SB_DEV
+#define ICE_MACVLAN_AGG_NODE_ID_START	(ICE_PF_AGG_NODE_ID_START + \
+					 ICE_MAX_PF_AGG_NODES)
+#define ICE_MAX_MACVLAN_AGG_NODES	32
+	struct ice_agg_node macvlan_agg_node[ICE_MAX_MACVLAN_AGG_NODES];
+#endif
+#define ICE_VF_AGG_NODE_ID_START	65
+#define ICE_MAX_VF_AGG_NODES		32
+	struct ice_agg_node vf_agg_node[ICE_MAX_VF_AGG_NODES];
 };
 
 struct ice_netdev_priv {
 	struct ice_vsi *vsi;
+#ifdef HAVE_TC_INDIR_BLOCK
+	/* indirect block callbacks on registered higher level devices
+	 * (e.g. tunnel devices)
+	 *
+	 * tc_indr_block_cb_priv_list is used to look up the indirect
+	 * callback private data
+	 *
+	 * netdevice_nb is the netdev events notifier - used to register
+	 * tunnel devices for block events
+	 *
+	 */
+	struct list_head tc_indr_block_priv_list;
+#ifndef HAVE_TC_FLOW_INDIR_DEV
+	struct notifier_block netdevice_nb;
+#endif
+#endif /* HAVE_TC_INDIR_BLOCK */
+	struct ice_repr *repr;
 };
+extern struct ida ice_peer_index_ida;
+
+
+/**
+ * ice_vector_ch_enabled
+ * @qv: pointer to q_vector, can be NULL
+ *
+ * This function returns true if the vector is channel enabled, otherwise
+ * false
+ */
+static inline bool ice_vector_ch_enabled(struct ice_q_vector *qv)
+{
+	return !!qv->ch; /* Enable it to run with TC */
+}
+
+/**
+ * ice_vector_busypoll_intr
+ * @qv: pointer to q_vector
+ *
+ * This function returns true if the vector is transitioning from BUSY_POLL
+ * to INTERRUPT based on the current and previous state of the vector
+ */
+static inline bool ice_vector_busypoll_intr(struct ice_q_vector *qv)
+{
+	return (qv->state_flags & ICE_CHNL_PREV_IN_BP) &&
+	       !(qv->state_flags & ICE_CHNL_IN_BP);
+}
+
+/**
+ * ice_vector_ever_in_busypoll
+ * @qv: pointer to q_vector
+ *
+ * This function returns true if the vector's current OR previous state
+ * is BUSY_POLL
+ */
+static inline bool ice_vector_ever_in_busypoll(struct ice_q_vector *qv)
+{
+	return (qv->state_flags & ICE_CHNL_PREV_IN_BP) ||
+	       (qv->state_flags & ICE_CHNL_IN_BP);
+}
+
+/**
+ * ice_vector_state_curr_prev_intr
+ * @qv: pointer to q_vector
+ *
+ * This function returns true if the vector's current AND previous state
+ * is INTERRUPT
+ */
+static inline bool ice_vector_state_curr_prev_intr(struct ice_q_vector *qv)
+{
+	return !(qv->state_flags & ICE_CHNL_PREV_IN_BP) &&
+	       !(qv->state_flags & ICE_CHNL_IN_BP);
+}
+
+/**
+ * ice_vector_intr_busypoll
+ * @qv: pointer to q_vector
+ *
+ * This function returns true if the vector is transitioning from INTERRUPT
+ * to BUSY_POLL based on the current and previous state of the vector
+ */
+static inline bool ice_vector_intr_busypoll(struct ice_q_vector *qv)
+{
+	return !(qv->state_flags & ICE_CHNL_PREV_IN_BP) &&
+	       (qv->state_flags & ICE_CHNL_IN_BP);
+}
+
+/**
+ * ice_adq_trigger_sw_intr
+ * @hw: ptr to HW
+ * @q_vector: pointer to q_vector
+ *
+ * This function triggers a SW interrupt on the specified vector and
+ * re-enables the interrupt. This is for use with ADQ.
+ */
+static inline void
+ice_adq_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector)
+{
+	struct ice_ring_container *rx_rc = &q_vector->rx;
+
+	q_vector->state_flags &= ~ICE_CHNL_ONCE_IN_BP;
+
+	/* when entering into interrupt mode, use the current value of Rx ITR,
+	 * hence rx_rc->itr_setting. This is needed to honor the user setting
+	 * for Rx ITR
+	 */
+	wr32(hw,
+	     GLINT_DYN_CTL(q_vector->reg_idx),
+	     (rx_rc->itr_idx << GLINT_DYN_CTL_ITR_INDX_S) |
+	     (ITR_REG_ALIGN(rx_rc->itr_setting) >> ICE_ITR_GRAN_S) |
+	     GLINT_DYN_CTL_SWINT_TRIG_M |
+	     GLINT_DYN_CTL_INTENA_M);
+}
+
+#ifdef ADQ_PERF_COUNTERS
+/**
+ * ice_sw_intr_cntr
+ * @q_vector: pointer to q_vector
+ * @napi_codepath: codepath separator for stats purposes
+ *
+ * This function counts the trigger code path for sw_intr. The caller is
+ * expected to call ice_adq_trigger_sw_intr or ice_trigger_sw_intr to
+ * actually trigger the SW intr.
+ */ +static inline void +ice_sw_intr_cntr(struct ice_q_vector *q_vector, bool napi_codepath) +{ + if (napi_codepath) /* napi - detected timeout */ + q_vector->ch_stats.num_sw_intr_timeout++; + else + q_vector->ch_stats.num_sw_intr_serv_task++; +} +#endif /* ADQ_PERF_COUNTERS */ + +/** + * ice_force_wb - trigger force write-back by setting WB_ON_ITR bit + * @hw: ptr to HW + * @q_vector: pointer to q_vector + * + * This function is used to force write-backs by setting WB_ON_ITR bit + * in DYN_CTLN register. WB_ON_ITR and INTENA are mutually exclusive bits. + * Setting WB_ON_ITR bits means Tx and Rx descriptors are written back based + * on ITR expiration irrespective of INTENA setting + */ +static inline void +ice_force_wb(struct ice_hw *hw, struct ice_q_vector *q_vector) +{ + if (q_vector->num_ring_rx || q_vector->num_ring_tx) { +#ifdef ADQ_PERF_COUNTERS + q_vector->ch_stats.num_wb_on_itr_set++; +#endif /* ADQ_PERF_COUNTERS */ + wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx), + ICE_GLINT_DYN_CTL_WB_ON_ITR(0, ICE_RX_ITR)); + } + + /* needed to avoid triggering WB_ON_ITR again which typically + * happens from ice_set_wb_on_itr function + */ + q_vector->wb_on_itr = true; +} + /** * ice_irq_dynamic_ena - Enable default interrupt generation settings * @hw: pointer to HW struct @@ -401,7 +1199,7 @@ ice_irq_dynamic_ena(struct ice_hw *hw, struct ice_vsi *vsi, val = GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M | (itr << GLINT_DYN_CTL_ITR_INDX_S); if (vsi) - if (test_bit(__ICE_DOWN, vsi->state)) + if (test_bit(ICE_VSI_DOWN, vsi->state)) return; wr32(hw, GLINT_DYN_CTL(vector), val); } @@ -417,6 +1215,55 @@ static inline struct ice_pf *ice_netdev_to_pf(struct net_device *netdev) return np->vsi->back; } +#ifdef HAVE_XDP_SUPPORT +static inline bool ice_is_xdp_ena_vsi(struct ice_vsi *vsi) +{ + return !!vsi->xdp_prog; +} + +static inline void ice_set_ring_xdp(struct ice_ring *ring) +{ + ring->flags |= ICE_TX_FLAGS_RING_XDP; +} + +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT +/** + * ice_xsk_umem - get XDP UMEM bound to a ring + * @ring: ring to use + * + * Returns a pointer to xdp_umem structure if there is an UMEM present, + * NULL otherwise. + */ +#ifdef HAVE_NETDEV_BPF_XSK_POOL +static inline struct xsk_buff_pool *ice_xsk_umem(struct ice_ring *ring) +#else +static inline struct xdp_umem *ice_xsk_umem(struct ice_ring *ring) +#endif +{ +#ifndef HAVE_AF_XDP_NETDEV_UMEM + struct xdp_umem **umems = ring->vsi->xsk_umems; +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ + u16 qid = ring->q_index; + + if (ice_ring_is_xdp(ring)) + qid -= ring->vsi->num_xdp_txq; + +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (qid >= ring->vsi->num_xsk_umems || !umems || !umems[qid] || + !ice_is_xdp_ena_vsi(ring->vsi)) + return NULL; + + return umems[qid]; +#else + if (!ice_is_xdp_ena_vsi(ring->vsi)) + return NULL; + + return xsk_get_pool_from_qid(ring->vsi->netdev, qid); +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ +} +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + /** * ice_get_main_vsi - Get the PF VSI * @pf: PF instance @@ -431,27 +1278,381 @@ static inline struct ice_vsi *ice_get_main_vsi(struct ice_pf *pf) return NULL; } +/** + * ice_get_netdev_priv_vsi - return VSI associated with netdev priv. + * @np: private netdev structure + */ +static inline struct ice_vsi *ice_get_netdev_priv_vsi(struct ice_netdev_priv *np) +{ + /* In case of port representor return source port VSI. 
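+	 * Otherwise return the VSI that owns this netdev.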
+	 */
+	if (np->repr)
+		return np->repr->src_vsi;
+	else
+		return np->vsi;
+}
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+/**
+ * ice_is_switchdev_running - check if switchdev is configured
+ * @pf: pointer to PF structure
+ *
+ * Returns true if eswitch mode is set to DEVLINK_ESWITCH_MODE_SWITCHDEV
+ * and switchdev is configured, false otherwise.
+ */
+static inline bool ice_is_switchdev_running(struct ice_pf *pf)
+{
+	return pf->switchdev.is_running;
+}
+
+#else
+static inline bool
+ice_is_switchdev_running(struct ice_pf __always_unused *pf)
+{
+	return false;
+}
+#endif /* IS_ENABLED(CONFIG_NET_DEVLINK) */
+
+/**
+ * ice_get_ctrl_vsi - Get the control VSI
+ * @pf: PF instance
+ */
+static inline struct ice_vsi *ice_get_ctrl_vsi(struct ice_pf *pf)
+{
+	/* if pf->ctrl_vsi_idx is ICE_NO_VSI, the control VSI was not set up */
+	if (!pf->vsi || pf->ctrl_vsi_idx == ICE_NO_VSI)
+		return NULL;
+
+	return pf->vsi[pf->ctrl_vsi_idx];
+}
+
+/**
+ * ice_find_first_vsi_by_type - Find and return first VSI of a given type
+ * @pf: PF to search for VSI
+ * @vsi_type: VSI type we are looking for
+ */
+static inline struct ice_vsi *
+ice_find_first_vsi_by_type(struct ice_pf *pf, enum ice_vsi_type vsi_type)
+{
+	int i;
+
+	ice_for_each_vsi(pf, i) {
+		struct ice_vsi *vsi = pf->vsi[i];
+
+		if (vsi && vsi->type == vsi_type)
+			return vsi;
+	}
+
+	return NULL;
+}
+
+enum ice_fd_stat_idx {
+	ICE_FD_STAT_SB,
+	ICE_FD_STAT_CH,
+#ifdef ICE_ADD_PROBES
+	ICE_ARFS_STAT_TCPV4,
+	ICE_ARFS_STAT_TCPV6,
+	ICE_ARFS_STAT_UDPV4,
+	ICE_ARFS_STAT_UDPV6
+#endif /* ICE_ADD_PROBES */
+};
+
+#define ICE_FD_STAT_CTR_BLOCK_COUNT	256
+#define ICE_FD_STAT_PF_IDX(base_idx) \
+	((base_idx) * ICE_FD_STAT_CTR_BLOCK_COUNT)
+#define ICE_FD_SB_STAT_IDX(base_idx) \
+	(ICE_FD_STAT_PF_IDX(base_idx) + ICE_FD_STAT_SB)
+#ifdef ICE_ADD_PROBES
+#define ICE_ARFS_STAT_TCPV4_IDX(base_idx) \
+	(ICE_FD_STAT_PF_IDX(base_idx) + ICE_ARFS_STAT_TCPV4)
+#define ICE_ARFS_STAT_TCPV6_IDX(base_idx) \
+	(ICE_FD_STAT_PF_IDX(base_idx) + ICE_ARFS_STAT_TCPV6)
+#define ICE_ARFS_STAT_UDPV4_IDX(base_idx) \
+	(ICE_FD_STAT_PF_IDX(base_idx) + ICE_ARFS_STAT_UDPV4)
+#define ICE_ARFS_STAT_UDPV6_IDX(base_idx) \
+	(ICE_FD_STAT_PF_IDX(base_idx) + ICE_ARFS_STAT_UDPV6)
+#endif /* ICE_ADD_PROBES */
+
+#define ICE_FD_CH_STAT_IDX(base_idx) \
+	(ICE_FD_STAT_PF_IDX(base_idx) + ICE_FD_STAT_CH)
+
+/**
+ * ice_vsi_fd_ena
+ * @vsi: pointer to VSI
+ *
+ * This function returns true if the VSI is capable of using flow-director,
+ * otherwise it returns false
+ */
+static inline bool ice_vsi_fd_ena(struct ice_vsi *vsi)
+{
+	return !!test_bit(ICE_CHNL_FEATURE_FD_ENA, vsi->features);
+}
+
+/**
+ * ice_vsi_inline_fd_ena
+ * @vsi: pointer to VSI
+ *
+ * This function returns true if the VSI is enabled for usage of
+ * flow-director, otherwise it returns false.
+ * This is controlled through the ethtool priv-flag
+ * 'channel-inline-flow-director'
+ */
+static inline bool ice_vsi_inline_fd_ena(struct ice_vsi *vsi)
+{
+	return !!test_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA, vsi->features);
+}
+
+static inline bool ice_vsi_inline_fd_mark_ena(struct ice_vsi *vsi)
+{
+	return !!test_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, vsi->features);
+}
+
+/**
+ * ice_get_current_fd_cnt - Get total FD filters programmed for this VSI
+ * @vsi: ptr to VSI
+ */
+static inline u32 ice_get_current_fd_cnt(struct ice_vsi *vsi)
+{
+	u32 val;
+
+	val = rd32(&vsi->back->hw, VSIQF_FD_CNT(vsi->vsi_num));
+
+	return (val & VSIQF_FD_CNT_FD_GCNT_M) +
+	       ((val & VSIQF_FD_CNT_FD_BCNT_M) >>
+		VSIQF_FD_CNT_FD_BCNT_S);
+}
+
+/**
+ * ice_read_cntr - read counter value using counter_index
+ * @pf: ptr to PF
+ * @counter_index: index of counter to be read
+ */
+static inline u64 ice_read_cntr(struct ice_pf *pf, u32 counter_index)
+{
+	/* Read the HW counter based on counter_index */
+	return ((u64)rd32(&pf->hw, GLSTAT_FD_CNT0H(counter_index)) << 32) |
+	       rd32(&pf->hw, GLSTAT_FD_CNT0L(counter_index));
+}
+
+/**
+ * ice_clear_cntr - initialize counter to zero
+ * @pf: ptr to PF
+ * @counter_index: index of counter to be initialized
+ */
+static inline void ice_clear_cntr(struct ice_pf *pf, u32 counter_index)
+{
+	/* Clear the HW counter based on counter_index */
+	wr32(&pf->hw, GLSTAT_FD_CNT0H(counter_index), 0);
+	wr32(&pf->hw, GLSTAT_FD_CNT0L(counter_index), 0);
+}
+
+/**
+ * ice_is_vsi_fd_table_full - VSI specific FD table is full or not
+ * @vsi: ptr to VSI
+ * @cnt: fd count, specific to VSI
+ *
+ * Return true if the HW FD table specific to this VSI is full,
+ * otherwise false
+ */
+static inline bool ice_is_vsi_fd_table_full(struct ice_vsi *vsi, u32 cnt)
+{
+	u32 max_allowed_fltr_cnt;
+
+	if (!cnt)
+		return false;
+
+	if (!vsi->num_gfltr && !vsi->num_bfltr)
+		return false;
+	/* determine if 'cnt' reached max_allowed for the specified VSI;
+	 * if so, report the HW table as full for that specific VSI
+	 */
+	max_allowed_fltr_cnt = vsi->num_gfltr + vsi->num_bfltr - 1;
+
+	return cnt >= max_allowed_fltr_cnt;
+}
+
+#ifdef NETIF_F_HW_TC
+/**
+ * ice_is_adq_active - any active ADQs
+ * @pf: pointer to PF
+ *
+ * This function returns true if there are any ADQs configured (which is
+ * determined by looking at the VSI type (which should be VSI_PF), numtc,
+ * and the TC_MQPRIO flag), otherwise it returns false
+ */
+static inline bool ice_is_adq_active(struct ice_pf *pf)
+{
+	struct ice_vsi *vsi;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi)
+		return false;
+
+	/* is ADQ configured */
+	if (vsi->tc_cfg.numtc > ICE_CHNL_START_TC &&
+	    test_bit(ICE_FLAG_TC_MQPRIO, pf->flags))
+		return true;
+
+	return false;
+}
+#endif /* NETIF_F_HW_TC */
+/**
+ * ice_vsi_pkt_inspect_opt_ena - packet inspection based optimization is ON/OFF
+ * @vsi: pointer to VSI
+ *
+ * This function returns true if the VSI is enabled for optimization based on
+ * control/data packets. By default, the respective PF priv flag is ON (which
+ * the user can change using ethtool if needed), hence by default the VSI
+ * level feature flag is also ON. If the user changes the PF level priv flag
+ * after creating channel VSIs (aka ADQ VSIs), those changes are not
+ * reflected in the VSI level feature flag by design.
+ */ +static inline bool ice_vsi_pkt_inspect_opt_ena(struct ice_vsi *vsi) +{ + return !!test_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, vsi->features); +} + +/** + * ice_vsi_pkt_process_bp_stop_ena - packet process ON/OFF from bp stop + * @vsi: pointer to VSI + * + * This function returns true if VSI is enabled for optimization to allow + * Tx/Rx cleanup from busy_poll_stop code path. There is an associated + * priv flag to control this feature and applicable only for channel (aka ADQ) + * specific vectors + */ +static inline bool ice_vsi_pkt_process_bp_stop_ena(struct ice_vsi *vsi) +{ + return !!test_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + vsi->features); +} + +static inline bool ice_active_vmdqs(struct ice_pf *pf) +{ + return !!ice_find_first_vsi_by_type(pf, ICE_VSI_VMDQ2); +} + +#ifdef HAVE_NETDEV_SB_DEV +static inline bool ice_is_offloaded_macvlan_ena(struct ice_pf *pf) +{ + return test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags); +} +#endif /* HAVE_NETDEV_SB_DEV */ + +#ifdef CONFIG_DEBUG_FS +void ice_debugfs_pf_init(struct ice_pf *pf); +void ice_debugfs_pf_exit(struct ice_pf *pf); +void ice_debugfs_init(void); +void ice_debugfs_exit(void); +#else +static inline void ice_debugfs_pf_init(struct ice_pf *pf) { } +static inline void ice_debugfs_pf_exit(struct ice_pf *pf) { } +static inline void ice_debugfs_init(void) { } +static inline void ice_debugfs_exit(void) { } +#endif /* CONFIG_DEBUG_FS */ + +bool netif_is_ice(struct net_device *dev); int ice_vsi_setup_tx_rings(struct ice_vsi *vsi); int ice_vsi_setup_rx_rings(struct ice_vsi *vsi); +int ice_vsi_open_ctrl(struct ice_vsi *vsi); +int ice_vsi_open(struct ice_vsi *vsi); +void ice_set_ethtool_repr_ops(struct net_device *netdev); void ice_set_ethtool_ops(struct net_device *netdev); +void ice_set_ethtool_recovery_ops(struct net_device *netdev); void ice_set_ethtool_safe_mode_ops(struct net_device *netdev); u16 ice_get_avail_txq_count(struct ice_pf *pf); u16 ice_get_avail_rxq_count(struct ice_pf *pf); -void ice_update_vsi_stats(struct ice_vsi *vsi); +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx); void ice_update_pf_stats(struct ice_pf *pf); +void ice_update_vsi_stats(struct ice_vsi *vsi); int ice_up(struct ice_vsi *vsi); int ice_down(struct ice_vsi *vsi); int ice_vsi_cfg(struct ice_vsi *vsi); struct ice_vsi *ice_lb_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi); -int ice_set_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size); -int ice_get_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size); +#ifdef HAVE_NETDEV_SB_DEV +int ice_vsi_cfg_netdev_tc0(struct ice_vsi *vsi); +#endif /* HAVE_NETDEV_SB_DEV */ +#ifdef HAVE_XDP_SUPPORT +int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog); +int ice_destroy_xdp_rings(struct ice_vsi *vsi); +#ifndef NO_NDO_XDP_FLUSH +void ice_xdp_flush(struct net_device *dev); +#endif /* NO_NDO_XDP_FLUSH */ +#ifdef HAVE_XDP_FRAME_STRUCT +int +ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags); +#else +int ice_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp); +#endif /* HAVE_XDP_FRAME_STRUCT */ +#endif /* HAVE_XDP_SUPPORT */ +int ice_set_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size); +int ice_get_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size); +int ice_set_rss_key(struct ice_vsi *vsi, u8 *seed); +int ice_get_rss_key(struct ice_vsi *vsi, u8 *seed); void ice_fill_rss_lut(u8 *lut, u16 rss_table_size, u16 rss_size); +int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset); void ice_print_link_msg(struct ice_vsi 
*vsi, bool isup); -#ifdef CONFIG_DCB -int ice_pf_ena_all_vsi(struct ice_pf *pf, bool locked); -void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked); -#endif /* CONFIG_DCB */ +#if IS_ENABLED(CONFIG_MFD_CORE) +int ice_init_peer_devices(struct ice_pf *pf); +int +ice_for_each_peer(struct ice_pf *pf, void *data, + int (*fn)(struct ice_peer_obj_int *, void *)); +#ifdef CONFIG_PM +void ice_peer_refresh_msix(struct ice_pf *pf); +#endif /* CONFIG_PM */ +#else /* !CONFIG_MFD_CORE */ +static inline int ice_init_peer_devices(struct ice_pf *pf) { return 0; } + +static inline int +ice_for_each_peer(struct ice_pf *pf, void *data, + int (*fn)(struct ice_peer_obj_int *, void *)) +{ + return 0; +} + +#ifdef CONFIG_PM +static inline void ice_peer_refresh_msix(struct ice_pf *pf) { } +#endif /* CONFIG_PM */ +#endif /* !CONFIG_MFD_CORE */ +const char *ice_stat_str(enum ice_status stat_err); +const char *ice_aq_str(enum ice_aq_err aq_err); +bool ice_is_wol_supported(struct ice_hw *hw); +int ice_aq_wait_for_event(struct ice_pf *pf, u16 opcode, unsigned long timeout, + struct ice_rq_event_info *event); +int +ice_fdir_write_fltr(struct ice_pf *pf, struct ice_fdir_fltr *input, bool add, + bool is_tun); +void ice_vsi_manage_fdir(struct ice_vsi *vsi, bool ena); +int ice_add_ntuple_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd); +int ice_del_ntuple_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd); +int ice_get_ethtool_fdir_entry(struct ice_hw *hw, struct ethtool_rxnfc *cmd); +u32 ice_ntuple_get_max_fltr_cnt(struct ice_hw *hw); +int +ice_ntuple_set_input_set(struct ice_vsi *vsi, enum ice_block blk, + struct ethtool_rx_flow_spec *fsp, + struct ice_fdir_fltr *input); +int +ice_ntuple_l4_proto_to_port(enum ice_flow_seg_hdr l4_proto, + enum ice_flow_field *src_port, + enum ice_flow_field *dst_port); +int ice_ntuple_check_ip4_seg(struct ethtool_tcpip4_spec *tcp_ip4_spec); +int ice_ntuple_check_ip4_usr_seg(struct ethtool_usrip4_spec *usr_ip4_spec); +int +ice_get_fdir_fltr_ids(struct ice_hw *hw, struct ethtool_rxnfc *cmd, + u32 *rule_locs); +void ice_fdir_rem_adq_chnl(struct ice_hw *hw, u16 vsi_idx); +void ice_fdir_release_flows(struct ice_hw *hw); +void ice_fdir_replay_flows(struct ice_hw *hw); +void ice_fdir_replay_fltrs(struct ice_pf *pf); +int ice_fdir_create_dflt_rules(struct ice_pf *pf); +enum ice_fltr_ptype ice_ethtool_flow_to_fltr(int eth); +int +ice_ntuple_update_list_entry(struct ice_pf *pf, struct ice_fdir_fltr *input, + int fltr_idx); +void ice_update_ring_dest_vsi(struct ice_vsi *vsi, u16 *dest_vsi, u32 *ring); int ice_open(struct net_device *netdev); +int ice_open_internal(struct net_device *netdev); int ice_stop(struct net_device *netdev); - +void ice_service_task_schedule(struct ice_pf *pf); +int +ice_acl_add_rule_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd); +int ice_init_acl(struct ice_pf *pf); #endif /* _ICE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_acl.c b/drivers/net/ethernet/intel/ice/ice_acl.c new file mode 100644 index 0000000000000000000000000000000000000000..fcf0fea30e1793c98b6153bb3824b60f14b90e53 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_acl.c @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_acl.h" +#include "ice_adminq_cmd.h" + +/** + * ice_aq_alloc_acl_tbl - allocate ACL table + * @hw: pointer to the HW struct + * @tbl: pointer to ice_acl_alloc_tbl struct + * @cd: pointer to command details structure or NULL + * + * Allocate ACL table (indirect 0x0C10) + */ +enum ice_status +ice_aq_alloc_acl_tbl(struct ice_hw *hw, struct ice_acl_alloc_tbl *tbl, + struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_alloc_table *cmd; + struct ice_aq_desc desc; + + if (!tbl->act_pairs_per_entry) + return ICE_ERR_PARAM; + + if (tbl->act_pairs_per_entry > ICE_AQC_MAX_ACTION_MEMORIES) + return ICE_ERR_MAX_LIMIT; + + /* If this is concurrent table, then buffer shall be valid and + * contain DependentAllocIDs, 'num_dependent_alloc_ids' should be valid + * and within limit + */ + if (tbl->concurr) { + if (!tbl->num_dependent_alloc_ids) + return ICE_ERR_PARAM; + if (tbl->num_dependent_alloc_ids > + ICE_AQC_MAX_CONCURRENT_ACL_TBL) + return ICE_ERR_INVAL_SIZE; + } + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_alloc_acl_tbl); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd = &desc.params.alloc_table; + cmd->table_width = cpu_to_le16(tbl->width * BITS_PER_BYTE); + cmd->table_depth = cpu_to_le16(tbl->depth); + cmd->act_pairs_per_entry = tbl->act_pairs_per_entry; + if (tbl->concurr) + cmd->table_type = tbl->num_dependent_alloc_ids; + + return ice_aq_send_cmd(hw, &desc, &tbl->buf, sizeof(tbl->buf), cd); +} + +/** + * ice_aq_dealloc_acl_tbl - deallocate ACL table + * @hw: pointer to the HW struct + * @alloc_id: allocation ID of the table being released + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Deallocate ACL table (indirect 0x0C11) + * + * NOTE: This command has no buffer format for command itself but response + * format is 'struct ice_aqc_acl_generic', pass ptr to that struct + * as 'buf' and its size as 'buf_size' + */ +enum ice_status +ice_aq_dealloc_acl_tbl(struct ice_hw *hw, u16 alloc_id, + struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_tbl_actpair *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dealloc_acl_tbl); + cmd = &desc.params.tbl_actpair; + cmd->alloc_id = cpu_to_le16(alloc_id); + + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +static enum ice_status +ice_aq_acl_entry(struct ice_hw *hw, u16 opcode, u8 tcam_idx, u16 entry_idx, + struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_entry *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, opcode); + + if (opcode == ice_aqc_opc_program_acl_entry) + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd = &desc.params.program_query_entry; + cmd->tcam_index = tcam_idx; + cmd->entry_index = cpu_to_le16(entry_idx); + + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_aq_program_acl_entry - program ACL entry + * @hw: pointer to the HW struct + * @tcam_idx: Updated TCAM block index + * @entry_idx: updated entry index + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Program ACL entry (direct 0x0C20) + */ +enum ice_status +ice_aq_program_acl_entry(struct ice_hw *hw, u8 tcam_idx, u16 entry_idx, + struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd) +{ + return ice_aq_acl_entry(hw, ice_aqc_opc_program_acl_entry, tcam_idx, + entry_idx, buf, cd); +} + +/** + * ice_aq_query_acl_entry - query ACL entry + * @hw: pointer to the HW struct + * @tcam_idx: Updated TCAM block 
+ * index
+ * @entry_idx: updated entry index
+ * @buf: address of indirect data buffer
+ * @cd: pointer to command details structure or NULL
+ *
+ * Query ACL entry (direct 0x0C24)
+ *
+ * NOTE: The caller of this API must parse 'buf' appropriately since it
+ * contains the response (key and key invert)
+ */
+enum ice_status
+ice_aq_query_acl_entry(struct ice_hw *hw, u8 tcam_idx, u16 entry_idx,
+		       struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd)
+{
+	return ice_aq_acl_entry(hw, ice_aqc_opc_query_acl_entry, tcam_idx,
+				entry_idx, buf, cd);
+}
+
+/* Helper function to alloc/dealloc ACL action pair */
+static enum ice_status
+ice_aq_actpair_a_d(struct ice_hw *hw, u16 opcode, u16 alloc_id,
+		   struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_acl_tbl_actpair *cmd;
+	struct ice_aq_desc desc;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, opcode);
+	cmd = &desc.params.tbl_actpair;
+	cmd->alloc_id = cpu_to_le16(alloc_id);
+
+	return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd);
+}
+
+/**
+ * ice_aq_alloc_actpair - allocate actionpair for specified ACL table
+ * @hw: pointer to the HW struct
+ * @alloc_id: allocation ID of the table being associated with the actionpair
+ * @buf: address of indirect data buffer
+ * @cd: pointer to command details structure or NULL
+ *
+ * Allocate ACL actionpair (direct 0x0C12)
+ *
+ * This command doesn't need and doesn't have its own command buffer,
+ * but the response format is as specified in 'struct ice_aqc_acl_generic'
+ */
+enum ice_status
+ice_aq_alloc_actpair(struct ice_hw *hw, u16 alloc_id,
+		     struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd)
+{
+	return ice_aq_actpair_a_d(hw, ice_aqc_opc_alloc_acl_actpair, alloc_id,
+				  buf, cd);
+}
+
+/**
+ * ice_aq_dealloc_actpair - dealloc actionpair for specified ACL table
+ * @hw: pointer to the HW struct
+ * @alloc_id: allocation ID of the table being associated with the actionpair
+ * @buf: address of indirect data buffer
+ * @cd: pointer to command details structure or NULL
+ *
+ * Deallocate ACL actionpair (direct 0x0C13)
+ */
+enum ice_status
+ice_aq_dealloc_actpair(struct ice_hw *hw, u16 alloc_id,
+		       struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd)
+{
+	return ice_aq_actpair_a_d(hw, ice_aqc_opc_dealloc_acl_actpair, alloc_id,
+				  buf, cd);
+}
+
+/* Helper function to program/query ACL action pair */
+static enum ice_status
+ice_aq_actpair_p_q(struct ice_hw *hw, u16 opcode, u8 act_mem_idx,
+		   u16 act_entry_idx, struct ice_aqc_actpair *buf,
+		   struct ice_sq_cd *cd)
+{
+	struct ice_aqc_acl_actpair *cmd;
+	struct ice_aq_desc desc;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, opcode);
+
+	if (opcode == ice_aqc_opc_program_acl_actpair)
+		desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+
+	cmd = &desc.params.program_query_actpair;
+	cmd->act_mem_index = act_mem_idx;
+	cmd->act_entry_index = cpu_to_le16(act_entry_idx);
+
+	return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd);
+}
+
+/**
+ * ice_aq_program_actpair - program ACL actionpair
+ * @hw: pointer to the HW struct
+ * @act_mem_idx: action memory index to program/update/query
+ * @act_entry_idx: the entry index in action memory to be programmed/updated
+ * @buf: address of indirect data buffer
+ * @cd: pointer to command details structure or NULL
+ *
+ * Program action entries (indirect 0x0C1C)
+ */
+enum ice_status
+ice_aq_program_actpair(struct ice_hw *hw, u8 act_mem_idx, u16 act_entry_idx,
+		       struct ice_aqc_actpair *buf, struct ice_sq_cd *cd)
+{
+	return ice_aq_actpair_p_q(hw, ice_aqc_opc_program_acl_actpair,
+				  act_mem_idx, act_entry_idx, buf, cd);
+}
+
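For illustration only (not part of this patch): a plausible call sequence for the actionpair helpers above. The function name and the zero memory/entry indices below are hypothetical; the sketch assumes a valid table allocation ID and an already-populated struct ice_aqc_actpair.

	/* Hypothetical sketch: allocate an actionpair block for the ACL table
	 * identified by alloc_id, then program entry 0 of action memory bank 0
	 * with the contents of 'act'.
	 */
	static enum ice_status
	ice_acl_actpair_sketch(struct ice_hw *hw, u16 alloc_id,
			       struct ice_aqc_actpair *act)
	{
		struct ice_aqc_acl_generic resp = { 0 };
		enum ice_status status;

		status = ice_aq_alloc_actpair(hw, alloc_id, &resp, NULL);
		if (status)
			return status;

		return ice_aq_program_actpair(hw, 0 /* act_mem_idx */,
					      0 /* act_entry_idx */, act, NULL);
	}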
+/** + * ice_aq_query_actpair - query ACL actionpair + * @hw: pointer to the HW struct + * @act_mem_idx: action memory index to program/update/query + * @act_entry_idx: the entry index in action memory to be programmed/updated + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Query ACL actionpair (indirect 0x0C25) + */ +enum ice_status +ice_aq_query_actpair(struct ice_hw *hw, u8 act_mem_idx, u16 act_entry_idx, + struct ice_aqc_actpair *buf, struct ice_sq_cd *cd) +{ + return ice_aq_actpair_p_q(hw, ice_aqc_opc_query_acl_actpair, + act_mem_idx, act_entry_idx, buf, cd); +} + +/** + * ice_aq_dealloc_acl_res - deallocate ACL resources + * @hw: pointer to the HW struct + * @cd: pointer to command details structure or NULL + * + * De-allocate ACL resources (direct 0x0C1A). Used by SW to release all the + * resources allocated for it using a single command + */ +enum ice_status ice_aq_dealloc_acl_res(struct ice_hw *hw, struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dealloc_acl_res); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_acl_prof_aq_send - sending ACL profile AQ commands + * @hw: pointer to the HW struct + * @opc: command opcode + * @prof_id: profile ID + * @buf: ptr to buffer + * @cd: pointer to command details structure or NULL + * + * This function sends ACL profile commands + */ +static enum ice_status +ice_acl_prof_aq_send(struct ice_hw *hw, u16 opc, u8 prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, opc); + desc.params.profile.profile_id = prof_id; + if (opc == ice_aqc_opc_program_acl_prof_extraction || + opc == ice_aqc_opc_program_acl_prof_ranges) + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_prgm_acl_prof_xtrct - program ACL profile extraction sequence + * @hw: pointer to the HW struct + * @prof_id: profile ID + * @buf: ptr to buffer + * @cd: pointer to command details structure or NULL + * + * Program ACL profile extraction (indirect 0x0C1D) + */ +enum ice_status +ice_prgm_acl_prof_xtrct(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd) +{ + return ice_acl_prof_aq_send(hw, ice_aqc_opc_program_acl_prof_extraction, + prof_id, buf, cd); +} + +/** + * ice_query_acl_prof - query ACL profile + * @hw: pointer to the HW struct + * @prof_id: profile ID + * @buf: ptr to buffer (which will contain response of this command) + * @cd: pointer to command details structure or NULL + * + * Query ACL profile (indirect 0x0C21) + */ +enum ice_status +ice_query_acl_prof(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd) +{ + return ice_acl_prof_aq_send(hw, ice_aqc_opc_query_acl_prof, prof_id, + buf, cd); +} + +/** + * ice_aq_acl_cntrs_chk_params - Checks ACL counter parameters + * @cntrs: ptr to buffer describing input and output params + * + * This function checks the counter bank range for counter type and returns + * success or failure. 
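+ * (Per the range checks below, single-type counters may use banks 0 through
+ * ICE_AQC_ACL_MAX_CNT_SINGLE, and dual-type counters banks 0 through
+ * ICE_AQC_ACL_MAX_CNT_DUAL.)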
+ */ +static enum ice_status ice_aq_acl_cntrs_chk_params(struct ice_acl_cntrs *cntrs) +{ + enum ice_status status = 0; + + if (!cntrs || !cntrs->amount) + return ICE_ERR_PARAM; + + switch (cntrs->type) { + case ICE_AQC_ACL_CNT_TYPE_SINGLE: + /* Single counter type - configured to count either bytes + * or packets, the valid values for byte or packet counters + * shall be 0-3. + */ + if (cntrs->bank > ICE_AQC_ACL_MAX_CNT_SINGLE) + status = ICE_ERR_OUT_OF_RANGE; + break; + case ICE_AQC_ACL_CNT_TYPE_DUAL: + /* Pair counter type - counts number of bytes and packets + * The valid values for byte/packet counter duals shall be 0-1 + */ + if (cntrs->bank > ICE_AQC_ACL_MAX_CNT_DUAL) + status = ICE_ERR_OUT_OF_RANGE; + break; + default: + /* Unspecified counter type - Invalid or error */ + status = ICE_ERR_PARAM; + } + + return status; +} + +/** + * ice_aq_alloc_acl_cntrs - allocate ACL counters + * @hw: pointer to the HW struct + * @cntrs: ptr to buffer describing input and output params + * @cd: pointer to command details structure or NULL + * + * Allocate ACL counters (indirect 0x0C16). This function attempts to + * allocate a contiguous block of counters. In case of failures, caller can + * attempt to allocate a smaller chunk. The allocation is considered + * unsuccessful if returned counter value is invalid. In this case it returns + * an error otherwise success. + */ +enum ice_status +ice_aq_alloc_acl_cntrs(struct ice_hw *hw, struct ice_acl_cntrs *cntrs, + struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_alloc_counters *cmd; + u16 first_cntr, last_cntr; + struct ice_aq_desc desc; + enum ice_status status; + + /* check for invalid params */ + status = ice_aq_acl_cntrs_chk_params(cntrs); + if (status) + return status; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_alloc_acl_counters); + cmd = &desc.params.alloc_counters; + cmd->counter_amount = cntrs->amount; + cmd->counters_type = cntrs->type; + cmd->bank_alloc = cntrs->bank; + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (!status) { + first_cntr = le16_to_cpu(cmd->ops.resp.first_counter); + last_cntr = le16_to_cpu(cmd->ops.resp.last_counter); + if (first_cntr == ICE_AQC_ACL_ALLOC_CNT_INVAL || + last_cntr == ICE_AQC_ACL_ALLOC_CNT_INVAL) + return ICE_ERR_OUT_OF_RANGE; + cntrs->first_cntr = first_cntr; + cntrs->last_cntr = last_cntr; + } + return status; +} + +/** + * ice_aq_dealloc_acl_cntrs - deallocate ACL counters + * @hw: pointer to the HW struct + * @cntrs: ptr to buffer describing input and output params + * @cd: pointer to command details structure or NULL + * + * De-allocate ACL counters (direct 0x0C17) + */ +enum ice_status +ice_aq_dealloc_acl_cntrs(struct ice_hw *hw, struct ice_acl_cntrs *cntrs, + struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_dealloc_counters *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + /* check for invalid params */ + status = ice_aq_acl_cntrs_chk_params(cntrs); + if (status) + return status; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dealloc_acl_counters); + cmd = &desc.params.dealloc_counters; + cmd->first_counter = cpu_to_le16(cntrs->first_cntr); + cmd->last_counter = cpu_to_le16(cntrs->last_cntr); + cmd->counters_type = cntrs->type; + cmd->bank_alloc = cntrs->bank; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + + +/** + * ice_prog_acl_prof_ranges - program ACL profile ranges + * @hw: pointer to the HW struct + * @prof_id: programmed or updated profile ID + * @buf: pointer to input buffer + * @cd: pointer to command details structure or NULL + * + * Program ACL profile 
ranges (indirect 0x0C1E) + */ +enum ice_status +ice_prog_acl_prof_ranges(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_profile_ranges *buf, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_program_acl_prof_ranges); + desc.params.profile.profile_id = prof_id; + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_query_acl_prof_ranges - query ACL profile ranges + * @hw: pointer to the HW struct + * @prof_id: programmed or updated profile ID + * @buf: pointer to response buffer + * @cd: pointer to command details structure or NULL + * + * Query ACL profile ranges (indirect 0x0C22) + */ +enum ice_status +ice_query_acl_prof_ranges(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_profile_ranges *buf, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_query_acl_prof_ranges); + desc.params.profile.profile_id = prof_id; + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_aq_alloc_acl_scen - allocate ACL scenario + * @hw: pointer to the HW struct + * @scen_id: memory location to receive allocated scenario ID + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Allocate ACL scenario (indirect 0x0C14) + */ +enum ice_status +ice_aq_alloc_acl_scen(struct ice_hw *hw, u16 *scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_alloc_scen *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (!scen_id) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_alloc_acl_scen); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + cmd = &desc.params.alloc_scen; + + status = ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); + if (!status) + *scen_id = le16_to_cpu(cmd->ops.resp.scen_id); + + return status; +} + +/** + * ice_aq_dealloc_acl_scen - deallocate ACL scenario + * @hw: pointer to the HW struct + * @scen_id: scen_id to be deallocated (input and output field) + * @cd: pointer to command details structure or NULL + * + * Deallocate ACL scenario (direct 0x0C15) + */ +enum ice_status +ice_aq_dealloc_acl_scen(struct ice_hw *hw, u16 scen_id, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_dealloc_scen *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dealloc_acl_scen); + cmd = &desc.params.dealloc_scen; + cmd->scen_id = cpu_to_le16(scen_id); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_update_query_scen - update or query ACL scenario + * @hw: pointer to the HW struct + * @opcode: AQ command opcode for either query or update scenario + * @scen_id: scen_id to be updated or queried + * @buf: address of indirect data buffer + * @cd: pointer to command details structure or NULL + * + * Calls update or query ACL scenario + */ +static enum ice_status +ice_aq_update_query_scen(struct ice_hw *hw, u16 opcode, u16 scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd) +{ + struct ice_aqc_acl_update_query_scen *cmd; + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, opcode); + if (opcode == ice_aqc_opc_update_acl_scen) + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + cmd = &desc.params.update_query_scen; + cmd->scen_id = cpu_to_le16(scen_id); + + return ice_aq_send_cmd(hw, &desc, buf, sizeof(*buf), cd); +} + +/** + * ice_aq_update_acl_scen - update ACL scenario + * @hw: pointer to the HW struct + * @scen_id: 
+ * scen_id to be updated
+ * @buf: address of indirect data buffer
+ * @cd: pointer to command details structure or NULL
+ *
+ * Update ACL scenario (indirect 0x0C1B)
+ */
+enum ice_status
+ice_aq_update_acl_scen(struct ice_hw *hw, u16 scen_id,
+		       struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd)
+{
+	return ice_aq_update_query_scen(hw, ice_aqc_opc_update_acl_scen,
+					scen_id, buf, cd);
+}
+
+/**
+ * ice_aq_query_acl_scen - query ACL scenario
+ * @hw: pointer to the HW struct
+ * @scen_id: scen_id to be queried
+ * @buf: address of indirect data buffer
+ * @cd: pointer to command details structure or NULL
+ *
+ * Query ACL scenario (indirect 0x0C23)
+ */
+enum ice_status
+ice_aq_query_acl_scen(struct ice_hw *hw, u16 scen_id,
+		      struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd)
+{
+	return ice_aq_update_query_scen(hw, ice_aqc_opc_query_acl_scen,
+					scen_id, buf, cd);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_acl.h b/drivers/net/ethernet/intel/ice/ice_acl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdfe2681935c281be36fa94d21432f2cc9d8b10f
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_acl.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_ACL_H_
+#define _ICE_ACL_H_
+
+#include "ice_common.h"
+#include "ice_adminq_cmd.h"
+
+struct ice_acl_tbl_params {
+	u16 width;	/* Select/match bytes */
+	u16 depth;	/* Number of entries */
+
+#define ICE_ACL_TBL_MAX_DEP_TBLS	15
+	u16 dep_tbls[ICE_ACL_TBL_MAX_DEP_TBLS];
+
+	u8 entry_act_pairs;	/* Action pairs per entry */
+	u8 concurr;		/* Concurrent table lookup enable */
+};
+
+struct ice_acl_act_mem {
+	u8 act_mem;
+#define ICE_ACL_ACT_PAIR_MEM_INVAL	0xff
+	u8 member_of_tcam;
+};
+
+struct ice_acl_tbl {
+	/* TCAM configuration */
+	u8 first_tcam;	/* Index of the first TCAM block */
+	u8 last_tcam;	/* Index of the last TCAM block */
+	/* Index of the first entry in the first TCAM */
+	u16 first_entry;
+	/* Index of the last entry in the last TCAM */
+	u16 last_entry;
+
+	/* List of active scenarios */
+	struct list_head scens;
+
+	struct ice_acl_tbl_params info;
+	struct ice_acl_act_mem act_mems[ICE_AQC_MAX_ACTION_MEMORIES];
+
+	/* Keep track of available 64-entry chunks in TCAMs */
+	DECLARE_BITMAP(avail, ICE_AQC_ACL_ALLOC_UNITS);
+
+	u16 id;
+};
+
+#define ICE_MAX_ACL_TCAM_ENTRY (ICE_AQC_ACL_TCAM_DEPTH * ICE_AQC_ACL_SLICES)
+enum ice_acl_entry_prio {
+	ICE_ACL_PRIO_LOW = 0,
+	ICE_ACL_PRIO_NORMAL,
+	ICE_ACL_PRIO_HIGH,
+	ICE_ACL_MAX_PRIO
+};
+
+/* Scenario structure
+ * A scenario is a logical partition within an ACL table. It can span more
+ * than one TCAM in cascade mode to support select/mask key widths larger
+ * than the width of a TCAM. It can also span more than one TCAM in stacked
+ * mode to support a larger number of entries than what a TCAM can hold. It
+ * is used to select values from selection bases (field vectors holding
+ * extracted protocol header fields) to form lookup keys, and to associate
+ * action memory banks to the TCAMs used.
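+ * For example, a 2x2 arrangement (two cascaded TCAMs wide, two stacked
+ * TCAMs deep) doubles both the match key width and the number of entries
+ * available relative to a single TCAM.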
+ */
+struct ice_acl_scen {
+	struct list_head list_entry;
+	/* If the nth bit of act_mem_bitmap is set, then the nth action
+	 * memory will participate in this scenario
+	 */
+	DECLARE_BITMAP(act_mem_bitmap, ICE_AQC_MAX_ACTION_MEMORIES);
+
+	/* If the nth bit of entry_bitmap is set, then the nth entry will
+	 * be available in this scenario
+	 */
+	DECLARE_BITMAP(entry_bitmap, ICE_MAX_ACL_TCAM_ENTRY);
+	u16 first_idx[ICE_ACL_MAX_PRIO];
+	u16 last_idx[ICE_ACL_MAX_PRIO];
+
+	u16 id;
+	u16 start;	/* Number of entry from the start of the parent table */
+#define ICE_ACL_SCEN_MIN_WIDTH	0x3
+	u16 width;	/* Number of select/mask bytes */
+	u16 num_entry;	/* Number of scenario entries */
+	u16 end;	/* Last addressable entry from start of table */
+	u8 eff_width;	/* Available width in bytes to match */
+#define ICE_ACL_SCEN_PKT_DIR_IDX_IN_TCAM	0x2
+#define ICE_ACL_SCEN_PID_IDX_IN_TCAM		0x3
+#define ICE_ACL_SCEN_RNG_CHK_IDX_IN_TCAM	0x4
+	u8 pid_idx;	/* Byte index used to match profile ID */
+	u8 rng_chk_idx;	/* Byte index used to match range checkers result */
+	u8 pkt_dir_idx;	/* Byte index used to match packet direction */
+};
+
+/* This structure represents input fields needed to allocate ACL table */
+struct ice_acl_alloc_tbl {
+	/* Table's width in number of bytes matched */
+	u16 width;
+	/* Table's depth in number of entries. */
+	u16 depth;
+	u8 num_dependent_alloc_ids;	/* number of dependent alloc IDs */
+	u8 concurr;			/* true for concurrent table type */
+
+	/* Amount of action pairs per table entry. The minimum valid
+	 * value for this field is 1 (e.g. a single pair of actions)
+	 */
+	u8 act_pairs_per_entry;
+	union {
+		struct ice_aqc_acl_alloc_table_data data_buf;
+		struct ice_aqc_acl_generic resp_buf;
+	} buf;
+};
+
+/* This structure is used to communicate input and output params for
+ * [de]allocate_acl_counters
+ */
+struct ice_acl_cntrs {
+	u8 amount;
+	u8 type;
+	u8 bank;
+
+	/* The next 2 variables are used for output in the case of
+	 * alloc_acl_counters and input in the case of
+	 * deallocate_acl_counters
+	 */
+	u16 first_cntr;
+	u16 last_cntr;
+};
+
+enum ice_status
+ice_acl_create_tbl(struct ice_hw *hw, struct ice_acl_tbl_params *params);
+enum ice_status ice_acl_destroy_tbl(struct ice_hw *hw);
+enum ice_status
+ice_acl_create_scen(struct ice_hw *hw, u16 match_width, u16 num_entries,
+		    u16 *scen_id);
+enum ice_status
+ice_aq_alloc_acl_tbl(struct ice_hw *hw, struct ice_acl_alloc_tbl *tbl,
+		     struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_dealloc_acl_tbl(struct ice_hw *hw, u16 alloc_id,
+		       struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_program_acl_entry(struct ice_hw *hw, u8 tcam_idx, u16 entry_idx,
+			 struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_query_acl_entry(struct ice_hw *hw, u8 tcam_idx, u16 entry_idx,
+		       struct ice_aqc_acl_data *buf, struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_alloc_actpair(struct ice_hw *hw, u16 alloc_id,
+		     struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_dealloc_actpair(struct ice_hw *hw, u16 alloc_id,
+		       struct ice_aqc_acl_generic *buf, struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_program_actpair(struct ice_hw *hw, u8 act_mem_idx, u16 act_entry_idx,
+		       struct ice_aqc_actpair *buf, struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_query_actpair(struct ice_hw *hw, u8 act_mem_idx, u16 act_entry_idx,
+		     struct ice_aqc_actpair *buf, struct ice_sq_cd *cd);
+enum ice_status ice_aq_dealloc_acl_res(struct ice_hw *hw, struct ice_sq_cd *cd);
+enum ice_status
+ice_prgm_acl_prof_xtrct(struct ice_hw *hw, u8
prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd); +enum ice_status +ice_query_acl_prof(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_prof_generic_frmt *buf, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_alloc_acl_cntrs(struct ice_hw *hw, struct ice_acl_cntrs *cntrs, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_dealloc_acl_cntrs(struct ice_hw *hw, struct ice_acl_cntrs *cntrs, + struct ice_sq_cd *cd); +enum ice_status +ice_prog_acl_prof_ranges(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_profile_ranges *buf, + struct ice_sq_cd *cd); +enum ice_status +ice_query_acl_prof_ranges(struct ice_hw *hw, u8 prof_id, + struct ice_aqc_acl_profile_ranges *buf, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_alloc_acl_scen(struct ice_hw *hw, u16 *scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_dealloc_acl_scen(struct ice_hw *hw, u16 scen_id, struct ice_sq_cd *cd); +enum ice_status +ice_aq_update_acl_scen(struct ice_hw *hw, u16 scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd); +enum ice_status +ice_aq_query_acl_scen(struct ice_hw *hw, u16 scen_id, + struct ice_aqc_acl_scen *buf, struct ice_sq_cd *cd); +enum ice_status +ice_acl_add_entry(struct ice_hw *hw, struct ice_acl_scen *scen, + enum ice_acl_entry_prio prio, u8 *keys, u8 *inverts, + struct ice_acl_act_entry *acts, u8 acts_cnt, u16 *entry_idx); +enum ice_status +ice_acl_prog_act(struct ice_hw *hw, struct ice_acl_scen *scen, + struct ice_acl_act_entry *acts, u8 acts_cnt, u16 entry_idx); +enum ice_status +ice_acl_rem_entry(struct ice_hw *hw, struct ice_acl_scen *scen, u16 entry_idx); +bool ice_is_acl_empty(struct ice_hw *hw); +#endif /* _ICE_ACL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_acl_ctrl.c b/drivers/net/ethernet/intel/ice/ice_acl_ctrl.c new file mode 100644 index 0000000000000000000000000000000000000000..a777d215f1b9d74c3d85f94354deedd1e8276e5d --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_acl_ctrl.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_acl.h" +#include "ice_flow.h" + + +/* Determine the TCAM index of entry 'e' within the ACL table */ +#define ICE_ACL_TBL_TCAM_IDX(e) ((e) / ICE_AQC_ACL_TCAM_DEPTH) + +/* Determine the entry index within the TCAM */ +#define ICE_ACL_TBL_TCAM_ENTRY_IDX(e) ((e) % ICE_AQC_ACL_TCAM_DEPTH) + +#define ICE_ACL_SCEN_ENTRY_INVAL 0xFFFF + +/** + * ice_acl_init_entry + * @scen: pointer to the scenario struct + * + * Initialize the scenario control structure. 
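+ * Entries are partitioned by priority: high priority entries are assigned
+ * from the lowest indices upward, low priority entries from the highest
+ * indices downward, and normal priority entries from the middle downward
+ * (see the first_idx/last_idx math below).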
+ */
+static void ice_acl_init_entry(struct ice_acl_scen *scen)
+{
+	/* low priority: start from the highest index, 25% of total entries
+	 * normal priority: start from the highest index, 50% of total entries
+	 * high priority: start from the lowest index, 25% of total entries
+	 */
+	scen->first_idx[ICE_ACL_PRIO_LOW] = scen->num_entry - 1;
+	scen->first_idx[ICE_ACL_PRIO_NORMAL] = scen->num_entry -
+					       scen->num_entry / 4 - 1;
+	scen->first_idx[ICE_ACL_PRIO_HIGH] = 0;
+
+	scen->last_idx[ICE_ACL_PRIO_LOW] = scen->num_entry -
+					   scen->num_entry / 4;
+	scen->last_idx[ICE_ACL_PRIO_NORMAL] = scen->num_entry / 4;
+	scen->last_idx[ICE_ACL_PRIO_HIGH] = scen->num_entry / 4 - 1;
+}
+
+/**
+ * ice_acl_scen_assign_entry_idx
+ * @scen: pointer to the scenario struct
+ * @prio: the priority of the flow entry being allocated
+ *
+ * Find the index of an available entry in the scenario
+ *
+ * Returns ICE_ACL_SCEN_ENTRY_INVAL on failure
+ * Returns the index on success
+ */
+static u16
+ice_acl_scen_assign_entry_idx(struct ice_acl_scen *scen,
+			      enum ice_acl_entry_prio prio)
+{
+	u16 first_idx, last_idx, i;
+	s8 step;
+
+	if (prio >= ICE_ACL_MAX_PRIO)
+		return ICE_ACL_SCEN_ENTRY_INVAL;
+
+	first_idx = scen->first_idx[prio];
+	last_idx = scen->last_idx[prio];
+	step = first_idx <= last_idx ? 1 : -1;
+
+	for (i = first_idx; i != last_idx + step; i += step)
+		if (!test_and_set_bit(i, scen->entry_bitmap))
+			return i;
+
+	return ICE_ACL_SCEN_ENTRY_INVAL;
+}
+
+/**
+ * ice_acl_scen_free_entry_idx
+ * @scen: pointer to the scenario struct
+ * @idx: the index of the flow entry being de-allocated
+ *
+ * Mark an entry as available in the scenario
+ */
+static enum ice_status
+ice_acl_scen_free_entry_idx(struct ice_acl_scen *scen, u16 idx)
+{
+	if (idx >= scen->num_entry)
+		return ICE_ERR_MAX_LIMIT;
+
+	if (!test_and_clear_bit(idx, scen->entry_bitmap))
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	return 0;
+}
+
+/**
+ * ice_acl_tbl_calc_end_idx
+ * @start: start index of the TCAM entry of this partition
+ * @num_entries: number of entries in this partition
+ * @width: width of a partition in number of TCAMs
+ *
+ * Calculate the end entry index for a partition with starting entry index
+ * 'start', 'num_entries' entries, and width 'width'.
+ */
+static u16 ice_acl_tbl_calc_end_idx(u16 start, u16 num_entries, u16 width)
+{
+	u16 end_idx, add_entries = 0;
+
+	end_idx = start + (num_entries - 1);
+
+	/* In case our ACL partition requires cascading TCAMs */
+	if (width > 1) {
+		u16 num_stack_level;
+
+		/* Figure out the TCAM stacked level in this ACL scenario */
+		num_stack_level = (start % ICE_AQC_ACL_TCAM_DEPTH) +
+				  num_entries;
+		num_stack_level = DIV_ROUND_UP(num_stack_level,
+					       ICE_AQC_ACL_TCAM_DEPTH);
+
+		/* In this case, each entry in our ACL partition spans
+		 * multiple TCAMs. Thus, we will need to add
+		 * ((width - 1) * num_stack_level) TCAMs' worth of entries
+		 * to end_idx.
+		 *
+		 * For example: in our case, our scenario is 2x2:
+		 *	[TCAM 0] [TCAM 1]
+		 *	[TCAM 2] [TCAM 3]
+		 * Assume that a TCAM has 512 entries. If "start" is 500,
+		 * "num_entries" is 3 and "width" is 2, then end_idx should
+		 * be 1014.
+		 * Before reaching this if statement, end_idx has the value
+		 * 502 (start + num_entries - 1). If "width" were 1, the
+		 * final value of end_idx would be 502. However, in our case
+		 * width is 2 and num_stack_level is 1, so we need to add
+		 * (2 - 1) * 1 * 512. As a result, end_idx ends up as 1014.
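+		 * In general, this computes:
+		 *   end_idx = start + (num_entries - 1) +
+		 *             (width - 1) * num_stack_level *
+		 *             ICE_AQC_ACL_TCAM_DEPTH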
+		 */
+		add_entries = (width - 1) * num_stack_level *
+			ICE_AQC_ACL_TCAM_DEPTH;
+	}
+
+	return end_idx + add_entries;
+}
+
+/**
+ * ice_acl_init_tbl
+ * @hw: pointer to the hardware structure
+ *
+ * Initialize the ACL table by invalidating TCAM entries and action pairs.
+ */
+static enum ice_status ice_acl_init_tbl(struct ice_hw *hw)
+{
+	struct ice_aqc_actpair act_buf;
+	struct ice_aqc_acl_data buf;
+	enum ice_status status = 0;
+	struct ice_acl_tbl *tbl;
+	u8 tcam_idx, i;
+	u16 idx;
+
+	tbl = hw->acl_tbl;
+	if (!tbl)
+		return ICE_ERR_CFG;
+
+	memset(&buf, 0, sizeof(buf));
+	memset(&act_buf, 0, sizeof(act_buf));
+
+	tcam_idx = tbl->first_tcam;
+	idx = tbl->first_entry;
+	while (tcam_idx < tbl->last_tcam ||
+	       (tcam_idx == tbl->last_tcam && idx <= tbl->last_entry)) {
+		/* Use the same value for entry_key and entry_key_inv since
+		 * we are initializing the fields to 0
+		 */
+		status = ice_aq_program_acl_entry(hw, tcam_idx, idx, &buf,
+						  NULL);
+		if (status)
+			return status;
+
+		if (++idx > tbl->last_entry) {
+			tcam_idx++;
+			idx = tbl->first_entry;
+		}
+	}
+
+	for (i = 0; i < ICE_AQC_MAX_ACTION_MEMORIES; i++) {
+		u16 act_entry_idx, start, end;
+
+		if (tbl->act_mems[i].act_mem == ICE_ACL_ACT_PAIR_MEM_INVAL)
+			continue;
+
+		start = tbl->first_entry;
+		end = tbl->last_entry;
+
+		for (act_entry_idx = start; act_entry_idx <= end;
+		     act_entry_idx++) {
+			/* Invalidate all allocated action pairs */
+			status = ice_aq_program_actpair(hw, i, act_entry_idx,
+							&act_buf, NULL);
+			if (status)
+				return status;
+		}
+	}
+
+	return status;
+}
+
+/**
+ * ice_acl_assign_act_mems_to_tcam
+ * @tbl: pointer to ACL table structure
+ * @cur_tcam: Index of current TCAM. Value = 0 to (ICE_AQC_ACL_SLICES - 1)
+ * @cur_mem_idx: Index of current action memory bank. Value = 0 to
+ *		 (ICE_AQC_MAX_ACTION_MEMORIES - 1)
+ * @num_mem: Number of action memory banks for this TCAM
+ *
+ * Assign "num_mem" valid action memory banks from "cur_mem_idx" to
+ * "cur_tcam" TCAM.
+ */
+static void
+ice_acl_assign_act_mems_to_tcam(struct ice_acl_tbl *tbl, u8 cur_tcam,
+				u8 *cur_mem_idx, u8 num_mem)
+{
+	u8 mem_cnt;
+
+	for (mem_cnt = 0;
+	     *cur_mem_idx < ICE_AQC_MAX_ACTION_MEMORIES && mem_cnt < num_mem;
+	     (*cur_mem_idx)++) {
+		struct ice_acl_act_mem *p_mem = &tbl->act_mems[*cur_mem_idx];
+
+		if (p_mem->act_mem == ICE_ACL_ACT_PAIR_MEM_INVAL)
+			continue;
+
+		p_mem->member_of_tcam = cur_tcam;
+
+		mem_cnt++;
+	}
+}
+
+/**
+ * ice_acl_divide_act_mems_to_tcams
+ * @tbl: pointer to ACL table structure
+ *
+ * Figure out how to divide the given action memory banks to the given TCAMs.
+ * This division is for SW bookkeeping. At the time a scenario is created, an
+ * action memory bank can be used by a different TCAM.
+ *
+ * For example, given that we have a 2x2 ACL table where each table entry has
+ * 2 action memory pairs. As a result, we will have 4 TCAMs (T1,T2,T3,T4)
+ * and 4 action memory banks (A1,A2,A3,A4)
+ *	[T1 - T2] { A1 - A2 }
+ *	[T3 - T4] { A3 - A4 }
+ * When we later create a scenario, for example a 2x1 scenario, we will use
+ * [T3,T4] in a cascaded layout. It is a requirement that all action memory
+ * banks in a cascaded TCAM row be associated with the last TCAM of that row.
+ * Thus, we will associate action memory banks [A3] and [A4] with TCAM [T4].
+ * For SW bookkeeping purposes, we keep a theoretical mapping from TCAM
+ * [Tn] to action memory bank [An].
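+ *
+ * (In the 2x2 layout above, with two action pairs per table entry and two
+ * TCAMs per cascade row, min_act_mem below works out to 1 with no
+ * remainder, so each TCAM is theoretically assigned exactly one bank.)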
+ */
+static void ice_acl_divide_act_mems_to_tcams(struct ice_acl_tbl *tbl)
+{
+	u16 num_cscd, stack_level, stack_idx, min_act_mem;
+	u8 tcam_idx = tbl->first_tcam;
+	u16 max_idx_to_get_extra;
+	u8 mem_idx = 0;
+
+	/* Determine number of stacked TCAMs */
+	stack_level = DIV_ROUND_UP(tbl->info.depth, ICE_AQC_ACL_TCAM_DEPTH);
+
+	/* Determine number of cascaded TCAMs */
+	num_cscd = DIV_ROUND_UP(tbl->info.width, ICE_AQC_ACL_KEY_WIDTH_BYTES);
+
+	/* In a line of cascaded TCAMs, given the number of action memory
+	 * banks per ACL table entry, we want to fairly divide these action
+	 * memory banks between these TCAMs.
+	 *
+	 * For example, there are 3 TCAMs (TCAM 3,4,5) in a line of
+	 * cascaded TCAMs, and there are 7 act_mems for each ACL table entry.
+	 * The result is:
+	 *	[TCAM_3 will have 3 act_mems]
+	 *	[TCAM_4 will have 2 act_mems]
+	 *	[TCAM_5 will have 2 act_mems]
+	 */
+	min_act_mem = tbl->info.entry_act_pairs / num_cscd;
+	max_idx_to_get_extra = tbl->info.entry_act_pairs % num_cscd;
+
+	for (stack_idx = 0; stack_idx < stack_level; stack_idx++) {
+		u16 i;
+
+		for (i = 0; i < num_cscd; i++) {
+			u8 total_act_mem = min_act_mem;
+
+			if (i < max_idx_to_get_extra)
+				total_act_mem++;
+
+			ice_acl_assign_act_mems_to_tcam(tbl, tcam_idx,
+							&mem_idx,
+							total_act_mem);
+
+			tcam_idx++;
+		}
+	}
+}
+
+/**
+ * ice_acl_create_tbl
+ * @hw: pointer to the HW struct
+ * @params: parameters for the table to be created
+ *
+ * Create a LEM table for ACL usage. We are currently starting with some fixed
+ * values for the size of the table, but this will need to grow as more flow
+ * entries are added by the user level.
+ */
+enum ice_status
+ice_acl_create_tbl(struct ice_hw *hw, struct ice_acl_tbl_params *params)
+{
+	u16 width, depth, first_e, last_e, i;
+	struct ice_aqc_acl_generic *resp_buf;
+	struct ice_acl_alloc_tbl tbl_alloc;
+	struct ice_acl_tbl *tbl;
+	enum ice_status status;
+
+	if (hw->acl_tbl)
+		return ICE_ERR_ALREADY_EXISTS;
+
+	if (!params)
+		return ICE_ERR_PARAM;
+
+	/* round up the width to the next TCAM width boundary. */
+	width = roundup(params->width, (u16)ICE_AQC_ACL_KEY_WIDTH_BYTES);
+	/* depth should be provided in chunk (64 entry) increments */
+	depth = ALIGN(params->depth, ICE_ACL_ENTRY_ALLOC_UNIT);
+
+	if (params->entry_act_pairs < width / ICE_AQC_ACL_KEY_WIDTH_BYTES) {
+		params->entry_act_pairs = width / ICE_AQC_ACL_KEY_WIDTH_BYTES;
+
+		if (params->entry_act_pairs > ICE_AQC_TBL_MAX_ACTION_PAIRS)
+			params->entry_act_pairs = ICE_AQC_TBL_MAX_ACTION_PAIRS;
+	}
+
+	/* Validate that width*depth will not exceed the TCAM limit */
+	if ((DIV_ROUND_UP(depth, ICE_AQC_ACL_TCAM_DEPTH) *
+	     (width / ICE_AQC_ACL_KEY_WIDTH_BYTES)) > ICE_AQC_ACL_SLICES)
+		return ICE_ERR_MAX_LIMIT;
+
+	memset(&tbl_alloc, 0, sizeof(tbl_alloc));
+	tbl_alloc.width = width;
+	tbl_alloc.depth = depth;
+	tbl_alloc.act_pairs_per_entry = params->entry_act_pairs;
+	tbl_alloc.concurr = params->concurr;
+	/* Set dependent_alloc_id only for concurrent table type */
+	if (params->concurr) {
+		tbl_alloc.num_dependent_alloc_ids =
+			ICE_AQC_MAX_CONCURRENT_ACL_TBL;
+
+		for (i = 0; i < ICE_AQC_MAX_CONCURRENT_ACL_TBL; i++)
+			tbl_alloc.buf.data_buf.alloc_ids[i] =
+				cpu_to_le16(params->dep_tbls[i]);
+	}
+
+	/* call the AQ command to create the ACL table with these values */
+	status = ice_aq_alloc_acl_tbl(hw, &tbl_alloc, NULL);
+	if (status) {
+		if (le16_to_cpu(tbl_alloc.buf.resp_buf.alloc_id) <
+		    ICE_AQC_ALLOC_ID_LESS_THAN_4K)
+			ice_debug(hw, ICE_DBG_ACL, "Alloc ACL table failed. Unavailable resource.\n");
+		else
+			ice_debug(hw, ICE_DBG_ACL, "AQ allocation of ACL failed with error. status: %d\n",
+				  status);
+		return status;
+	}
+
+	tbl = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*tbl), GFP_KERNEL);
+	if (!tbl) {
+		status = ICE_ERR_NO_MEMORY;
+
+		goto out;
+	}
+
+	resp_buf = &tbl_alloc.buf.resp_buf;
+
+	/* Retrieve information of the allocated table */
+	tbl->id = le16_to_cpu(resp_buf->alloc_id);
+	tbl->first_tcam = resp_buf->ops.table.first_tcam;
+	tbl->last_tcam = resp_buf->ops.table.last_tcam;
+	tbl->first_entry = le16_to_cpu(resp_buf->first_entry);
+	tbl->last_entry = le16_to_cpu(resp_buf->last_entry);
+
+	tbl->info = *params;
+	tbl->info.width = width;
+	tbl->info.depth = depth;
+	hw->acl_tbl = tbl;
+
+	for (i = 0; i < ICE_AQC_MAX_ACTION_MEMORIES; i++)
+		tbl->act_mems[i].act_mem = resp_buf->act_mem[i];
+
+	/* Figure out which TCAMs these newly allocated action memories
+	 * belong to.
+	 */
+	ice_acl_divide_act_mems_to_tcams(tbl);
+
+	/* Initialize the resources allocated by invalidating all TCAM entries
+	 * and all the action pairs
+	 */
+	status = ice_acl_init_tbl(hw);
+	if (status) {
+		devm_kfree(ice_hw_to_dev(hw), tbl);
+		hw->acl_tbl = NULL;
+		ice_debug(hw, ICE_DBG_ACL, "Initialization of TCAM entries failed. status: %d\n",
+			  status);
+		goto out;
+	}
+
+	first_e = (tbl->first_tcam * ICE_AQC_MAX_TCAM_ALLOC_UNITS) +
+		(tbl->first_entry / ICE_ACL_ENTRY_ALLOC_UNIT);
+	last_e = (tbl->last_tcam * ICE_AQC_MAX_TCAM_ALLOC_UNITS) +
+		(tbl->last_entry / ICE_ACL_ENTRY_ALLOC_UNIT);
+
+	/* Indicate available entries in the table */
+	bitmap_set(tbl->avail, first_e, last_e - first_e + 1);
+
+	INIT_LIST_HEAD(&tbl->scens);
+out:
+	return status;
+}
+
+/**
+ * ice_acl_alloc_partition - Allocate a partition from the ACL table
+ * @hw: pointer to the hardware structure
+ * @req: info of partition being allocated
+ */
+static enum ice_status
+ice_acl_alloc_partition(struct ice_hw *hw, struct ice_acl_scen *req)
+{
+	u16 start = 0, cnt = 0, off = 0;
+	u16 width, r_entries, row;
+	bool done = false;
+	int dir;
+
+	/* Determine the number of TCAMs each entry overlaps */
+	width = DIV_ROUND_UP(req->width, ICE_AQC_ACL_KEY_WIDTH_BYTES);
+
+	/* Check if we have enough TCAMs to accommodate the width */
+	if (width > hw->acl_tbl->last_tcam - hw->acl_tbl->first_tcam + 1)
+		return ICE_ERR_MAX_LIMIT;
+
+	/* Number of entries must be multiple of ICE_ACL_ENTRY_ALLOC_UNIT's */
+	r_entries = ALIGN(req->num_entry, ICE_ACL_ENTRY_ALLOC_UNIT);
+
+	/* To look for an available partition that can accommodate the request,
+	 * the process first logically arranges available TCAMs in rows such
+	 * that each row produces entries with the requested width. It then
+	 * scans the TCAMs' available bitmap, one bit at a time, and
+	 * accumulates contiguous available 64-entry chunks until there are
+	 * enough of them or when all TCAM configurations have been checked.
+	 *
+	 * For a width of 1 TCAM, the scanning process starts from the topmost
+	 * TCAM, and goes downward. Available bitmaps are examined from LSB
+	 * to MSB.
+	 *
+	 * For a width of multiple TCAMs, the process starts from the
+	 * bottom-most row of TCAMs, and goes upward. Available bitmaps are
+	 * examined from the MSB to the LSB.
+	 *
+	 * To make sure that adjacent TCAMs can be logically arranged in the
+	 * same row, the scanning process may have multiple passes. In each
+	 * pass, the first TCAM of the bottom-most row is displaced by one
+	 * additional TCAM. The width of the row and the number of TCAMs
+	 * available determine the number of passes.
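+	 * (For example, with four TCAMs numbered 0-3 and a row width of two,
+	 * the first pass checks the rows starting at TCAM 2 and at TCAM 0;
+	 * the second pass, displaced by one TCAM, checks the row starting at
+	 * TCAM 1.)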
+	 * When the displacement reaches the width of a row, the TCAM row
+	 * configurations start to repeat, so the process terminates.
+	 *
+	 * Available partitions can span more than one row of TCAMs.
+	 */
+	if (width == 1) {
+		row = hw->acl_tbl->first_tcam;
+		dir = 1;
+	} else {
+		/* Start with the bottom-most row, and scan for available
+		 * entries upward
+		 */
+		row = hw->acl_tbl->last_tcam + 1 - width;
+		dir = -1;
+	}
+
+	do {
+		u16 i;
+
+		/* Scan all 64-entry chunks, one chunk at a time, in the
+		 * current TCAM row
+		 */
+		for (i = 0;
+		     i < ICE_AQC_MAX_TCAM_ALLOC_UNITS && cnt < r_entries;
+		     i++) {
+			bool avail = true;
+			u16 w, p;
+
+			/* Compute the cumulative available mask across the
+			 * TCAM row to determine if the current 64-entry chunk
+			 * is available.
+			 */
+			p = dir > 0 ? i : ICE_AQC_MAX_TCAM_ALLOC_UNITS - i - 1;
+			for (w = row; w < row + width && avail; w++) {
+				u16 b;
+
+				b = (w * ICE_AQC_MAX_TCAM_ALLOC_UNITS) + p;
+				avail &= test_bit(b, hw->acl_tbl->avail);
+			}
+
+			if (!avail) {
+				cnt = 0;
+			} else {
+				/* Compute the starting index of the newly
+				 * found partition. When 'dir' is negative, the
+				 * scan process is going upward. If so, the
+				 * starting index needs to be updated for every
+				 * available 64-entry chunk found.
+				 */
+				if (!cnt || dir < 0)
+					start = (row * ICE_AQC_ACL_TCAM_DEPTH) +
+						(p * ICE_ACL_ENTRY_ALLOC_UNIT);
+				cnt += ICE_ACL_ENTRY_ALLOC_UNIT;
+			}
+		}
+
+		if (cnt >= r_entries) {
+			req->start = start;
+			req->num_entry = r_entries;
+			req->end = ice_acl_tbl_calc_end_idx(start, r_entries,
+							    width);
+			break;
+		}
+
+		row = dir > 0 ? row + width : row - width;
+		if (row > hw->acl_tbl->last_tcam ||
+		    row < hw->acl_tbl->first_tcam) {
+			/* All rows have been checked. Increment 'off' that
+			 * will help yield a different TCAM configuration in
+			 * which adjacent TCAMs can alternatively be in the
+			 * same row.
+			 */
+			off++;
+
+			/* However, if the new 'off' value yields previously
+			 * checked configurations, then exit.
+			 */
+			if (off >= width)
+				done = true;
+			else
+				row = dir > 0 ? off :
+					hw->acl_tbl->last_tcam + 1 - off -
+					width;
+		}
+	} while (!done);
+
+	return cnt >= r_entries ? 0 : ICE_ERR_MAX_LIMIT;
+}
+
+/**
+ * ice_acl_fill_tcam_select
+ * @scen_buf: Pointer to the scenario buffer that needs to be populated
+ * @scen: Pointer to the available space for the scenario
+ * @tcam_idx: Index of the TCAM used for this scenario
+ * @tcam_idx_in_cascade: Local index of the TCAM in the cascade scenario
+ *
+ * For all TCAMs that participate in this scenario, fill out the tcam_select
+ * value.
+ */
+static void
+ice_acl_fill_tcam_select(struct ice_aqc_acl_scen *scen_buf,
+			 struct ice_acl_scen *scen, u16 tcam_idx,
+			 u16 tcam_idx_in_cascade)
+{
+	u16 cascade_cnt, idx;
+	u8 j;
+
+	idx = tcam_idx_in_cascade * ICE_AQC_ACL_KEY_WIDTH_BYTES;
+	cascade_cnt = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES);
+
+	/* For each scenario, we reserved the last three bytes of the scenario
+	 * width for the profile ID, range checker, and packet direction.
+	 * Thus, the last three bytes of the last cascaded TCAM will have the
+	 * value of the 1st, 31st and 32nd byte locations of the Byte
+	 * Selection Base.
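+	 * (These are the ICE_AQC_ACL_BYTE_SEL_BASE_PKT_DIR, _PID and
+	 * _RNG_CHK selector values programmed in the loop below.)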
+	 *
+	 * For other bytes in the TCAMs:
+	 * For a non-cascade mode (1 TCAM wide) scenario, TCAM[x]'s Select {0-1}
+	 * select indices 0-1 of the Byte Selection Base
+	 * For cascade mode, the leftmost TCAM of the first cascade row selects
+	 * indices 0-4 of the Byte Selection Base; the second TCAM in the
+	 * cascade row selects indices starting with 5-n
+	 */
+	for (j = 0; j < ICE_AQC_ACL_KEY_WIDTH_BYTES; j++) {
+		/* PKT DIR uses the 1st location of Byte Selection Base: + 1 */
+		u8 val = ICE_AQC_ACL_BYTE_SEL_BASE + 1 + idx;
+
+		if (tcam_idx_in_cascade == cascade_cnt - 1) {
+			if (j == ICE_ACL_SCEN_RNG_CHK_IDX_IN_TCAM)
+				val = ICE_AQC_ACL_BYTE_SEL_BASE_RNG_CHK;
+			else if (j == ICE_ACL_SCEN_PID_IDX_IN_TCAM)
+				val = ICE_AQC_ACL_BYTE_SEL_BASE_PID;
+			else if (j == ICE_ACL_SCEN_PKT_DIR_IDX_IN_TCAM)
+				val = ICE_AQC_ACL_BYTE_SEL_BASE_PKT_DIR;
+		}
+
+		/* In case the scenario's width is greater than the width of
+		 * the Byte Selection Base, we will not assign a value to
+		 * tcam_select[j]. As a result, tcam_select[j] will keep its
+		 * default value, which is zero.
+		 */
+		if (val > ICE_AQC_ACL_BYTE_SEL_BASE_RNG_CHK)
+			continue;
+
+		scen_buf->tcam_cfg[tcam_idx].tcam_select[j] = val;
+
+		idx++;
+	}
+}
+
+/**
+ * ice_acl_set_scen_chnk_msk
+ * @scen_buf: Pointer to the scenario buffer that needs to be populated
+ * @scen: pointer to the available space for the scenario
+ *
+ * Set the chunk mask for the entries that will be used by this scenario
+ */
+static void
+ice_acl_set_scen_chnk_msk(struct ice_aqc_acl_scen *scen_buf,
+			  struct ice_acl_scen *scen)
+{
+	u16 tcam_idx, num_cscd, units, cnt;
+	u8 chnk_offst;
+
+	/* Determine the starting TCAM index and offset of the start entry */
+	tcam_idx = ICE_ACL_TBL_TCAM_IDX(scen->start);
+	chnk_offst = (u8)((scen->start % ICE_AQC_ACL_TCAM_DEPTH) /
+			  ICE_ACL_ENTRY_ALLOC_UNIT);
+
+	/* Entries are allocated and tracked in multiples of 64 */
+	units = scen->num_entry / ICE_ACL_ENTRY_ALLOC_UNIT;
+
+	/* Determine number of cascaded TCAMs */
+	num_cscd = scen->width / ICE_AQC_ACL_KEY_WIDTH_BYTES;
+
+	for (cnt = 0; cnt < units; cnt++) {
+		u16 i;
+
+		/* Set the corresponding bit as each individual 64-entry
+		 * chunk spans across a cascade of 1 or more TCAMs.
+		 * For each TCAM, there will be (ICE_AQC_ACL_TCAM_DEPTH
+		 * / ICE_ACL_ENTRY_ALLOC_UNIT) or 8 chunks.
+		 */
+		for (i = tcam_idx; i < tcam_idx + num_cscd; i++)
+			scen_buf->tcam_cfg[i].chnk_msk |= BIT(chnk_offst);
+
+		chnk_offst = (chnk_offst + 1) % ICE_AQC_MAX_TCAM_ALLOC_UNITS;
+		if (!chnk_offst)
+			tcam_idx += num_cscd;
+	}
+}
+
+/**
+ * ice_acl_assign_act_mem_for_scen
+ * @tbl: pointer to ACL table structure
+ * @scen: pointer to the scenario struct
+ * @scen_buf: pointer to the available space for the scenario
+ * @current_tcam_idx: theoretical index of the TCAM that we associated those
+ *		      action memory banks with, at the table creation time.
+ * @target_tcam_idx: index of the TCAM that we want to associate those action
+ *		     memory banks with.
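+ *
+ * All action memory banks in a cascaded row must end up associated with the
+ * last TCAM of that row, so the banks that were theoretically mapped to
+ * "current_tcam_idx" at table-creation time are re-pointed here.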
+ */
+static void
+ice_acl_assign_act_mem_for_scen(struct ice_acl_tbl *tbl,
+				struct ice_acl_scen *scen,
+				struct ice_aqc_acl_scen *scen_buf,
+				u8 current_tcam_idx, u8 target_tcam_idx)
+{
+	u8 i;
+
+	for (i = 0; i < ICE_AQC_MAX_ACTION_MEMORIES; i++) {
+		struct ice_acl_act_mem *p_mem = &tbl->act_mems[i];
+
+		if (p_mem->act_mem == ICE_ACL_ACT_PAIR_MEM_INVAL ||
+		    p_mem->member_of_tcam != current_tcam_idx)
+			continue;
+
+		scen_buf->act_mem_cfg[i] = target_tcam_idx;
+		scen_buf->act_mem_cfg[i] |= ICE_AQC_ACL_SCE_ACT_MEM_EN;
+		set_bit(i, scen->act_mem_bitmap);
+	}
+}
+
+/**
+ * ice_acl_commit_partition - Indicate if the specified partition is active
+ * @hw: pointer to the hardware structure
+ * @scen: pointer to the scenario struct
+ * @commit: true if the partition is being committed
+ */
+static void
+ice_acl_commit_partition(struct ice_hw *hw, struct ice_acl_scen *scen,
+			 bool commit)
+{
+	u16 tcam_idx, off, num_cscd, units, cnt;
+
+	/* Determine the starting TCAM index and offset of the start entry */
+	tcam_idx = ICE_ACL_TBL_TCAM_IDX(scen->start);
+	off = (scen->start % ICE_AQC_ACL_TCAM_DEPTH) /
+		ICE_ACL_ENTRY_ALLOC_UNIT;
+
+	/* Entries are allocated and tracked in multiples of 64 */
+	units = scen->num_entry / ICE_ACL_ENTRY_ALLOC_UNIT;
+
+	/* Determine number of cascaded TCAMs */
+	num_cscd = scen->width / ICE_AQC_ACL_KEY_WIDTH_BYTES;
+
+	for (cnt = 0; cnt < units; cnt++) {
+		u16 w;
+
+		/* Set/clear the corresponding bit as each individual 64-entry
+		 * chunk spans across a row of 1 or more TCAMs
+		 */
+		for (w = 0; w < num_cscd; w++) {
+			u16 b;
+
+			b = ((tcam_idx + w) * ICE_AQC_MAX_TCAM_ALLOC_UNITS) +
+				off;
+			if (commit)
+				set_bit(b, hw->acl_tbl->avail);
+			else
+				clear_bit(b, hw->acl_tbl->avail);
+		}
+
+		off = (off + 1) % ICE_AQC_MAX_TCAM_ALLOC_UNITS;
+		if (!off)
+			tcam_idx += num_cscd;
+	}
+}
+
+/**
+ * ice_acl_create_scen
+ * @hw: pointer to the hardware structure
+ * @match_width: number of bytes to be matched in this scenario
+ * @num_entries: number of entries to be allocated for the scenario
+ * @scen_id: holds returned scenario ID if successful
+ */
+enum ice_status
+ice_acl_create_scen(struct ice_hw *hw, u16 match_width, u16 num_entries,
+		    u16 *scen_id)
+{
+	u8 cascade_cnt, first_tcam, last_tcam, i, k;
+	struct ice_aqc_acl_scen scen_buf;
+	struct ice_acl_scen *scen;
+	enum ice_status status;
+
+	if (!hw->acl_tbl)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	scen = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*scen), GFP_KERNEL);
+	if (!scen)
+		return ICE_ERR_NO_MEMORY;
+
+	scen->start = hw->acl_tbl->first_entry;
+	scen->width = ICE_AQC_ACL_KEY_WIDTH_BYTES *
+		DIV_ROUND_UP(match_width, ICE_AQC_ACL_KEY_WIDTH_BYTES);
+	scen->num_entry = num_entries;
+
+	status = ice_acl_alloc_partition(hw, scen);
+	if (status)
+		goto out;
+
+	memset(&scen_buf, 0, sizeof(scen_buf));
+
+	/* Determine the number of cascade TCAMs, given the scenario's width */
+	cascade_cnt = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES);
+	first_tcam = ICE_ACL_TBL_TCAM_IDX(scen->start);
+	last_tcam = ICE_ACL_TBL_TCAM_IDX(scen->end);
+
+	/* For each scenario, we reserved the last three bytes of the scenario
+	 * width for the packet direction flag, profile ID and range checker.
+	 * Thus, we return to the caller the eff_width, pkt_dir_idx,
+	 * rng_chk_idx and pid_idx.
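+	 * Each index below is offset by
+	 * (cascade_cnt - 1) * ICE_AQC_ACL_KEY_WIDTH_BYTES so that the three
+	 * reserved byte positions land in the last cascaded TCAM.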
+	 */
+	scen->eff_width = cascade_cnt * ICE_AQC_ACL_KEY_WIDTH_BYTES -
+		ICE_ACL_SCEN_MIN_WIDTH;
+	scen->rng_chk_idx = (cascade_cnt - 1) * ICE_AQC_ACL_KEY_WIDTH_BYTES +
+		ICE_ACL_SCEN_RNG_CHK_IDX_IN_TCAM;
+	scen->pid_idx = (cascade_cnt - 1) * ICE_AQC_ACL_KEY_WIDTH_BYTES +
+		ICE_ACL_SCEN_PID_IDX_IN_TCAM;
+	scen->pkt_dir_idx = (cascade_cnt - 1) * ICE_AQC_ACL_KEY_WIDTH_BYTES +
+		ICE_ACL_SCEN_PKT_DIR_IDX_IN_TCAM;
+
+	/* set the chunk mask for the TCAMs */
+	ice_acl_set_scen_chnk_msk(&scen_buf, scen);
+
+	/* set the TCAM select and start_cmp and start_set bits */
+	k = first_tcam;
+	/* set the START_SET bit at the beginning of the stack */
+	scen_buf.tcam_cfg[k].start_cmp_set |= ICE_AQC_ACL_ALLOC_SCE_START_SET;
+	while (k <= last_tcam) {
+		u8 last_tcam_idx_cascade = cascade_cnt + k - 1;
+
+		/* set start_cmp for the first cascaded TCAM */
+		scen_buf.tcam_cfg[k].start_cmp_set |=
+			ICE_AQC_ACL_ALLOC_SCE_START_CMP;
+
+		/* cascade TCAMs up to the width of the scenario */
+		for (i = k; i < cascade_cnt + k; i++) {
+			ice_acl_fill_tcam_select(&scen_buf, scen, i, i - k);
+			ice_acl_assign_act_mem_for_scen(hw->acl_tbl, scen,
+							&scen_buf,
+							i,
+							last_tcam_idx_cascade);
+		}
+
+		k = i;
+	}
+
+	/* We need to set the start_cmp bit for the unused TCAMs. */
+	i = 0;
+	while (i < first_tcam)
+		scen_buf.tcam_cfg[i++].start_cmp_set =
+			ICE_AQC_ACL_ALLOC_SCE_START_CMP;
+
+	i = last_tcam + 1;
+	while (i < ICE_AQC_ACL_SLICES)
+		scen_buf.tcam_cfg[i++].start_cmp_set =
+			ICE_AQC_ACL_ALLOC_SCE_START_CMP;
+
+	status = ice_aq_alloc_acl_scen(hw, scen_id, &scen_buf, NULL);
+	if (status) {
+		ice_debug(hw, ICE_DBG_ACL, "AQ allocation of ACL scenario failed. status: %d\n",
+			  status);
+		goto out;
+	}
+
+	scen->id = *scen_id;
+	ice_acl_commit_partition(hw, scen, false);
+	ice_acl_init_entry(scen);
+	list_add(&scen->list_entry, &hw->acl_tbl->scens);
+
+out:
+	if (status)
+		devm_kfree(ice_hw_to_dev(hw), scen);
+
+	return status;
+}
+
+/**
+ * ice_acl_destroy_scen - Destroy an ACL scenario
+ * @hw: pointer to the HW struct
+ * @scen_id: ID of the scenario to remove
+ */
+static enum ice_status ice_acl_destroy_scen(struct ice_hw *hw, u16 scen_id)
+{
+	struct ice_acl_scen *scen, *tmp_scen;
+	struct ice_flow_prof *p, *tmp;
+	enum ice_status status;
+
+	if (!hw->acl_tbl)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	/* Remove profiles that use "scen_id" scenario */
+	list_for_each_entry_safe(p, tmp, &hw->fl_profs[ICE_BLK_ACL], l_entry)
+		if (p->cfg.scen && p->cfg.scen->id == scen_id) {
+			status = ice_flow_rem_prof(hw, ICE_BLK_ACL, p->id);
+			if (status) {
+				ice_debug(hw, ICE_DBG_ACL, "ice_flow_rem_prof failed. status: %d\n",
+					  status);
+				return status;
+			}
+		}
+
+	/* Call the AQ command to destroy the targeted scenario */
+	status = ice_aq_dealloc_acl_scen(hw, scen_id, NULL);
+	if (status) {
+		ice_debug(hw, ICE_DBG_ACL, "AQ de-allocation of scenario failed. status: %d\n",
+			  status);
+		return status;
+	}
+
+	/* Remove scenario from hw->acl_tbl->scens */
+	list_for_each_entry_safe(scen, tmp_scen, &hw->acl_tbl->scens,
+				 list_entry)
+		if (scen->id == scen_id) {
+			list_del(&scen->list_entry);
+			devm_kfree(ice_hw_to_dev(hw), scen);
+		}
+
+	return 0;
+}
+
+/**
+ * ice_acl_destroy_tbl - Destroy a previously created LEM table for ACL
+ * @hw: pointer to the HW struct
+ */
+enum ice_status ice_acl_destroy_tbl(struct ice_hw *hw)
+{
+	struct ice_acl_scen *pos_scen, *tmp_scen;
+	struct ice_aqc_acl_generic resp_buf;
+	struct ice_aqc_acl_scen buf;
+	enum ice_status status;
+	u8 i;
+
+	if (!hw->acl_tbl)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	/* Mark all created scenarios' TCAMs to stop packet lookup and
+	 * delete them afterward
+	 */
+	list_for_each_entry_safe(pos_scen, tmp_scen, &hw->acl_tbl->scens,
+				 list_entry) {
+		status = ice_aq_query_acl_scen(hw, pos_scen->id, &buf, NULL);
+		if (status) {
+			ice_debug(hw, ICE_DBG_ACL, "ice_aq_query_acl_scen() failed. status: %d\n",
+				  status);
+			return status;
+		}
+
+		for (i = 0; i < ICE_AQC_ACL_SLICES; i++) {
+			buf.tcam_cfg[i].chnk_msk = 0;
+			buf.tcam_cfg[i].start_cmp_set =
+				ICE_AQC_ACL_ALLOC_SCE_START_CMP;
+		}
+
+		for (i = 0; i < ICE_AQC_MAX_ACTION_MEMORIES; i++)
+			buf.act_mem_cfg[i] = 0;
+
+		status = ice_aq_update_acl_scen(hw, pos_scen->id, &buf, NULL);
+		if (status) {
+			ice_debug(hw, ICE_DBG_ACL, "ice_aq_update_acl_scen() failed. status: %d\n",
+				  status);
+			return status;
+		}
+
+		status = ice_acl_destroy_scen(hw, pos_scen->id);
+		if (status) {
+			ice_debug(hw, ICE_DBG_ACL, "deletion of scenario failed. status: %d\n",
+				  status);
+			return status;
+		}
+	}
+
+	/* call the AQ command to destroy the ACL table */
+	status = ice_aq_dealloc_acl_tbl(hw, hw->acl_tbl->id, &resp_buf, NULL);
+	if (status) {
+		ice_debug(hw, ICE_DBG_ACL, "AQ de-allocation of ACL failed. status: %d\n",
+			  status);
+		return status;
+	}
+
+	devm_kfree(ice_hw_to_dev(hw), hw->acl_tbl);
+	hw->acl_tbl = NULL;
+
+	return 0;
+}
+
+/**
+ * ice_acl_add_entry - Add a flow entry to an ACL scenario
+ * @hw: pointer to the HW struct
+ * @scen: scenario to add the entry to
+ * @prio: priority level of the entry being added
+ * @keys: buffer of the value of the key to be programmed to the ACL entry
+ * @inverts: buffer of the value of the key inverts to be programmed
+ * @acts: pointer to a buffer containing formatted actions
+ * @acts_cnt: indicates the number of actions stored in "acts"
+ * @entry_idx: returned scenario relative index of the added flow entry
+ *
+ * Given an ACL table and a scenario, add the specified key and key invert
+ * to an available entry in the specified scenario.
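+ * The entry slot is picked by priority via ice_acl_scen_assign_entry_idx();
+ * if programming the key or the actions fails, the slot is released again
+ * before returning.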
+ * The "keys" and "inverts" buffers must be the same size as the
+ * scenario's width.
+ */
+enum ice_status
+ice_acl_add_entry(struct ice_hw *hw, struct ice_acl_scen *scen,
+		  enum ice_acl_entry_prio prio, u8 *keys, u8 *inverts,
+		  struct ice_acl_act_entry *acts, u8 acts_cnt, u16 *entry_idx)
+{
+	u8 i, entry_tcam, num_cscd, offset;
+	struct ice_aqc_acl_data buf;
+	enum ice_status status = 0;
+	u16 idx;
+
+	if (!scen)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	*entry_idx = ice_acl_scen_assign_entry_idx(scen, prio);
+	if (*entry_idx >= scen->num_entry) {
+		*entry_idx = 0;
+		return ICE_ERR_MAX_LIMIT;
+	}
+
+	/* Determine number of cascaded TCAMs */
+	num_cscd = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES);
+
+	entry_tcam = ICE_ACL_TBL_TCAM_IDX(scen->start);
+	idx = ICE_ACL_TBL_TCAM_ENTRY_IDX(scen->start + *entry_idx);
+
+	memset(&buf, 0, sizeof(buf));
+	for (i = 0; i < num_cscd; i++) {
+		/* If the key spans more than one TCAM in the case of cascaded
+		 * TCAMs, the key and key inverts need to be properly split
+		 * among the TCAMs, e.g. bytes 0 - 4 go to an index in the
+		 * first TCAM and bytes 5 - 9 go to the same index in the next
+		 * TCAM, etc.
+		 * If the entry spans more than one TCAM in a cascaded TCAM
+		 * mode, the programming of the entries in the TCAMs must be in
+		 * reversed order - the TCAM entry of the rightmost TCAM should
+		 * be programmed first; the TCAM entry of the leftmost TCAM
+		 * should be programmed last.
+		 */
+		offset = num_cscd - i - 1;
+		memcpy(&buf.entry_key.val,
+		       &keys[offset * sizeof(buf.entry_key.val)],
+		       sizeof(buf.entry_key.val));
+		memcpy(&buf.entry_key_invert.val,
+		       &inverts[offset * sizeof(buf.entry_key_invert.val)],
+		       sizeof(buf.entry_key_invert.val));
+		status = ice_aq_program_acl_entry(hw, entry_tcam + offset, idx,
+						  &buf, NULL);
+		if (status) {
+			ice_debug(hw, ICE_DBG_ACL, "aq program acl entry failed status: %d\n",
+				  status);
+			goto out;
+		}
+	}
+
+	/* Program the action memory */
+	status = ice_acl_prog_act(hw, scen, acts, acts_cnt, *entry_idx);
+
+out:
+	if (status) {
+		ice_acl_rem_entry(hw, scen, *entry_idx);
+		*entry_idx = 0;
+	}
+
+	return status;
+}
+
+/**
+ * ice_acl_prog_act - Program a scenario's action memory
+ * @hw: pointer to the HW struct
+ * @scen: scenario to add the entry to
+ * @acts: pointer to a buffer containing formatted actions
+ * @acts_cnt: indicates the number of actions stored in "acts"
+ * @entry_idx: scenario relative index of the added flow entry
+ *
+ * Program a scenario's action memory
+ */
+enum ice_status
+ice_acl_prog_act(struct ice_hw *hw, struct ice_acl_scen *scen,
+		 struct ice_acl_act_entry *acts, u8 acts_cnt,
+		 u16 entry_idx)
+{
+	u8 entry_tcam, num_cscd, i, actx_idx = 0;
+	struct ice_aqc_actpair act_buf;
+	enum ice_status status = 0;
+	u16 idx;
+
+	if (entry_idx >= scen->num_entry)
+		return ICE_ERR_MAX_LIMIT;
+
+	memset(&act_buf, 0, sizeof(act_buf));
+
+	/* Determine number of cascaded TCAMs */
+	num_cscd = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES);
+
+	entry_tcam = ICE_ACL_TBL_TCAM_IDX(scen->start);
+	idx = ICE_ACL_TBL_TCAM_ENTRY_IDX(scen->start + entry_idx);
+
+	for_each_set_bit(i, scen->act_mem_bitmap, ICE_AQC_MAX_ACTION_MEMORIES) {
+		struct ice_acl_act_mem *mem = &hw->acl_tbl->act_mems[i];
+
+		if (actx_idx >= acts_cnt)
+			break;
+		if (mem->member_of_tcam >= entry_tcam &&
+		    mem->member_of_tcam < entry_tcam + num_cscd) {
+			memcpy(&act_buf.act[0], &acts[actx_idx],
+			       sizeof(struct ice_acl_act_entry));
+
+			if (++actx_idx < acts_cnt) {
+				memcpy(&act_buf.act[1], &acts[actx_idx],
+				       sizeof(struct
ice_acl_act_entry)); + } + + status = ice_aq_program_actpair(hw, i, idx, &act_buf, + NULL); + if (status) { + ice_debug(hw, ICE_DBG_ACL, "program actpair failed status: %d\n", + status); + break; + } + actx_idx++; + } + } + + if (!status && actx_idx < acts_cnt) + status = ICE_ERR_MAX_LIMIT; + + return status; +} + +/** + * ice_acl_rem_entry - Remove a flow entry from an ACL scenario + * @hw: pointer to the HW struct + * @scen: scenario to remove the entry from + * @entry_idx: the scenario-relative index of the flow entry being removed + */ +enum ice_status +ice_acl_rem_entry(struct ice_hw *hw, struct ice_acl_scen *scen, u16 entry_idx) +{ + struct ice_aqc_actpair act_buf; + struct ice_aqc_acl_data buf; + u8 entry_tcam, num_cscd, i; + enum ice_status status = 0; + u16 idx; + + if (!scen) + return ICE_ERR_DOES_NOT_EXIST; + + if (entry_idx >= scen->num_entry) + return ICE_ERR_MAX_LIMIT; + + if (!test_bit(entry_idx, scen->entry_bitmap)) + return ICE_ERR_DOES_NOT_EXIST; + + /* Determine number of cascaded TCAMs */ + num_cscd = DIV_ROUND_UP(scen->width, ICE_AQC_ACL_KEY_WIDTH_BYTES); + + entry_tcam = ICE_ACL_TBL_TCAM_IDX(scen->start); + idx = ICE_ACL_TBL_TCAM_ENTRY_IDX(scen->start + entry_idx); + + /* invalidate the flow entry */ + memset(&buf, 0, sizeof(buf)); + for (i = 0; i < num_cscd; i++) { + status = ice_aq_program_acl_entry(hw, entry_tcam + i, idx, &buf, + NULL); + if (status) + ice_debug(hw, ICE_DBG_ACL, "AQ program ACL entry failed status: %d\n", + status); + } + + memset(&act_buf, 0, sizeof(act_buf)); + + for_each_set_bit(i, scen->act_mem_bitmap, ICE_AQC_MAX_ACTION_MEMORIES) { + struct ice_acl_act_mem *mem = &hw->acl_tbl->act_mems[i]; + + if (mem->member_of_tcam >= entry_tcam && + mem->member_of_tcam < entry_tcam + num_cscd) { + /* Invalidate allocated action pairs */ + status = ice_aq_program_actpair(hw, i, idx, &act_buf, + NULL); + if (status) + ice_debug(hw, ICE_DBG_ACL, "program actpair failed status: %d\n", + status); + } + } + + ice_acl_scen_free_entry_idx(scen, entry_idx); + + return status; +} + +/** + * ice_is_acl_empty - Check if any entry exists + * @hw: pointer to the HW struct + */ +bool ice_is_acl_empty(struct ice_hw *hw) +{ + struct ice_acl_scen *scen, *tmp_scen; + + if (!hw->acl_tbl) + return false; + + list_for_each_entry_safe(scen, tmp_scen, &hw->acl_tbl->scens, + list_entry) + if (!bitmap_empty(scen->entry_bitmap, ICE_MAX_ACL_TCAM_ENTRY)) + return false; + + return true; +} diff --git a/drivers/net/ethernet/intel/ice/ice_acl_main.c b/drivers/net/ethernet/intel/ice/ice_acl_main.c new file mode 100644 index 0000000000000000000000000000000000000000..81eb056f1b32d3e1e1959bee9252d2c8387cfdf0 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_acl_main.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/
+
+/* ACL support for ice */
+
+
+#include "ice.h"
+#include "ice_lib.h"
+#include "ice_flow.h"
+#include "ice_fdir.h"
+
+/* Default ACL Action priority */
+#define ICE_ACL_ACT_PRIO	3
+
+/* Number of actions */
+#define ICE_ACL_NUM_ACT		1
+
+/**
+ * ice_acl_set_ip4_addr_seg
+ * @seg: flow segment for programming
+ *
+ * Set the IPv4 source and destination address mask for the given flow segment
+ */
+static void ice_acl_set_ip4_addr_seg(struct ice_flow_seg_info *seg)
+{
+	u16 val_loc, mask_loc;
+
+	/* IP source address */
+	val_loc = offsetof(struct ice_fdir_fltr, ip.v4.src_ip);
+	mask_loc = offsetof(struct ice_fdir_fltr, mask.v4.src_ip);
+
+	ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_SA, val_loc,
+			 mask_loc, ICE_FLOW_FLD_OFF_INVAL, false);
+
+	/* IP destination address */
+	val_loc = offsetof(struct ice_fdir_fltr, ip.v4.dst_ip);
+	mask_loc = offsetof(struct ice_fdir_fltr, mask.v4.dst_ip);
+
+	ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_DA, val_loc,
+			 mask_loc, ICE_FLOW_FLD_OFF_INVAL, false);
+}
+
+/**
+ * ice_acl_set_ip4_port_seg
+ * @seg: flow segment for programming
+ * @l4_proto: Layer 4 protocol to program
+ *
+ * Set the source and destination port for the given flow segment based on the
+ * provided layer 4 protocol
+ */
+static int
+ice_acl_set_ip4_port_seg(struct ice_flow_seg_info *seg,
+			 enum ice_flow_seg_hdr l4_proto)
+{
+	enum ice_flow_field src_port, dst_port;
+	u16 val_loc, mask_loc;
+	int err;
+
+	err = ice_ntuple_l4_proto_to_port(l4_proto, &src_port, &dst_port);
+	if (err)
+		return err;
+
+	/* Layer 4 source port */
+	val_loc = offsetof(struct ice_fdir_fltr, ip.v4.src_port);
+	mask_loc = offsetof(struct ice_fdir_fltr, mask.v4.src_port);
+
+	ice_flow_set_fld(seg, src_port, val_loc, mask_loc,
+			 ICE_FLOW_FLD_OFF_INVAL, false);
+
+	/* Layer 4 destination port */
+	val_loc = offsetof(struct ice_fdir_fltr, ip.v4.dst_port);
+	mask_loc = offsetof(struct ice_fdir_fltr, mask.v4.dst_port);
+
+	ice_flow_set_fld(seg, dst_port, val_loc, mask_loc,
+			 ICE_FLOW_FLD_OFF_INVAL, false);
+
+	return 0;
+}
+
+/**
+ * ice_acl_set_ip4_seg
+ * @seg: flow segment for programming
+ * @tcp_ip4_spec: mask data from ethtool
+ * @l4_proto: Layer 4 protocol to program
+ *
+ * Set the mask data into the flow segment to be used to program the HW
+ * table, based on the provided L4 protocol, for IPv4
+ */
+static int
+ice_acl_set_ip4_seg(struct ice_flow_seg_info *seg,
+		    struct ethtool_tcpip4_spec *tcp_ip4_spec,
+		    enum ice_flow_seg_hdr l4_proto)
+{
+	int err;
+
+	err = ice_ntuple_check_ip4_seg(tcp_ip4_spec);
+	if (err)
+		return err;
+
+	ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4 | l4_proto);
+	ice_acl_set_ip4_addr_seg(seg);
+
+	return ice_acl_set_ip4_port_seg(seg, l4_proto);
+}
+
+/**
+ * ice_acl_set_ip4_usr_seg
+ * @seg: flow segment for programming
+ * @usr_ip4_spec: ethtool userdef packet offset
+ *
+ * Set the offset data into the flow segment to be used to program the HW
+ * table for IPv4
+ */
+static int
+ice_acl_set_ip4_usr_seg(struct ice_flow_seg_info *seg,
+			struct ethtool_usrip4_spec *usr_ip4_spec)
+{
+	int err;
+
+	err = ice_ntuple_check_ip4_usr_seg(usr_ip4_spec);
+	if (err)
+		return err;
+
+	ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4);
+	ice_acl_set_ip4_addr_seg(seg);
+
+	return 0;
+}
+
+
+/**
+ * ice_acl_check_input_set - Checks that a given ACL input set is valid
+ * @pf: ice PF structure
+ * @fsp: pointer to ethtool Rx flow specification
+ *
+ * Returns 0 on success and negative values for failure
+ */
+static int
+ice_acl_check_input_set(struct ice_pf *pf, struct ethtool_rx_flow_spec *fsp)
+{
+	struct ice_fd_hw_prof *hw_prof = NULL;
+	struct ice_flow_prof *prof = NULL;
+	struct ice_flow_seg_info *old_seg;
+	struct ice_flow_seg_info *seg;
+	enum ice_fltr_ptype fltr_type;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	struct device *dev;
+	int err;
+
+	dev = ice_pf_to_dev(pf);
+	seg = devm_kzalloc(dev, sizeof(*seg), GFP_KERNEL);
+	if (!seg)
+		return -ENOMEM;
+
+	switch (fsp->flow_type & ~FLOW_EXT) {
+	case TCP_V4_FLOW:
+		err = ice_acl_set_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec,
+					  ICE_FLOW_SEG_HDR_TCP);
+		break;
+	case UDP_V4_FLOW:
+		err = ice_acl_set_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec,
+					  ICE_FLOW_SEG_HDR_UDP);
+		break;
+	case SCTP_V4_FLOW:
+		err = ice_acl_set_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec,
+					  ICE_FLOW_SEG_HDR_SCTP);
+		break;
+	case IPV4_USER_FLOW:
+		err = ice_acl_set_ip4_usr_seg(seg, &fsp->m_u.usr_ip4_spec);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+	}
+	if (err)
+		goto err_exit;
+
+	fltr_type = ice_ethtool_flow_to_fltr(fsp->flow_type & ~FLOW_EXT);
+
+	if (!hw->acl_prof) {
+		hw->acl_prof = devm_kcalloc(dev, ICE_FLTR_PTYPE_MAX,
+					    sizeof(*hw->acl_prof), GFP_KERNEL);
+		if (!hw->acl_prof) {
+			err = -ENOMEM;
+			goto err_exit;
+		}
+	}
+	if (!hw->acl_prof[fltr_type]) {
+		hw->acl_prof[fltr_type] = devm_kzalloc(dev,
+						       sizeof(**hw->acl_prof),
+						       GFP_KERNEL);
+		if (!hw->acl_prof[fltr_type]) {
+			err = -ENOMEM;
+			goto err_acl_prof_exit;
+		}
+		hw->acl_prof[fltr_type]->cnt = 0;
+	}
+
+	hw_prof = hw->acl_prof[fltr_type];
+	old_seg = hw_prof->fdir_seg[0];
+	if (old_seg) {
+		/* This flow_type already has an input set.
+		 * If it matches the requested input set then we are
+		 * done. If it's different then it's an error.
+		 */
+		if (!memcmp(old_seg, seg, sizeof(*seg))) {
+			devm_kfree(dev, seg);
+			return 0;
+		}
+
+		err = -EINVAL;
+		goto err_acl_prof_flow_exit;
+	}
+
+	/* Add a profile for the given flow specification, with no actions
+	 * (NULL) and an action count of zero.
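+	 * The profile is cached in hw->acl_prof[fltr_type] via the segment
+	 * pointer stored below, so later rules of the same flow type only
+	 * need to match this input set.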
+ */ + status = ice_flow_add_prof(hw, ICE_BLK_ACL, ICE_FLOW_RX, fltr_type, + seg, 1, NULL, 0, &prof); + if (status) { + err = ice_status_to_errno(status); + goto err_exit; + } + + hw_prof->fdir_seg[0] = seg; + return 0; + +err_acl_prof_flow_exit: + devm_kfree(dev, hw->acl_prof[fltr_type]); +err_acl_prof_exit: + devm_kfree(dev, hw->acl_prof); +err_exit: + devm_kfree(dev, seg); + + return err; +} + +/** + * ice_acl_add_rule_ethtool - Adds an ACL rule + * @vsi: pointer to target VSI + * @cmd: command to add or delete ACL rule + * + * Returns 0 on success and negative values for failure + */ +int ice_acl_add_rule_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd) +{ + struct ice_flow_action acts[ICE_ACL_NUM_ACT]; + struct ethtool_rx_flow_spec *fsp; + struct ice_fd_hw_prof *hw_prof; + struct ice_fdir_fltr *input; + enum ice_fltr_ptype flow; + enum ice_status status; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + u64 entry_h = 0; + int act_cnt; + int ret; + + if (!vsi || !cmd) + return -EINVAL; + + pf = vsi->back; + hw = &pf->hw; + dev = ice_pf_to_dev(pf); + + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + ret = ice_acl_check_input_set(pf, fsp); + if (ret) + return ret; + + /* Add new rule */ + input = devm_kzalloc(dev, sizeof(*input), GFP_KERNEL); + if (!input) + return -ENOMEM; + + ret = ice_ntuple_set_input_set(vsi, ICE_BLK_ACL, fsp, input); + if (ret) + goto free_input; + + memset(&acts, 0, sizeof(acts)); + act_cnt = 1; + if (fsp->ring_cookie == RX_CLS_FLOW_DISC) { + acts[0].type = ICE_FLOW_ACT_DROP; + acts[0].data.acl_act.mdid = ICE_MDID_RX_PKT_DROP; + acts[0].data.acl_act.prio = ICE_ACL_ACT_PRIO; + acts[0].data.acl_act.value = cpu_to_le16(0x1); + } else { + acts[0].type = ICE_FLOW_ACT_FWD_QUEUE; + acts[0].data.acl_act.mdid = ICE_MDID_RX_DST_Q; + acts[0].data.acl_act.prio = ICE_ACL_ACT_PRIO; + acts[0].data.acl_act.value = cpu_to_le16(input->q_index); + } + + flow = ice_ethtool_flow_to_fltr(fsp->flow_type & ~FLOW_EXT); + hw_prof = hw->acl_prof[flow]; + + status = ice_flow_add_entry(hw, ICE_BLK_ACL, flow, fsp->location, + vsi->idx, ICE_FLOW_PRIO_NORMAL, input, acts, + act_cnt, &entry_h); + if (status) { + dev_err(dev, "Could not add flow entry %d\n", flow); + ret = ice_status_to_errno(status); + goto free_input; + } + + if (!hw_prof->cnt || vsi->idx != hw_prof->vsi_h[hw_prof->cnt - 1]) { + hw_prof->vsi_h[hw_prof->cnt] = vsi->idx; + hw_prof->entry_h[hw_prof->cnt++][0] = entry_h; + } + + input->acl_fltr = true; + /* input struct is added to the HW filter list */ + ice_ntuple_update_list_entry(pf, input, fsp->location); + + return 0; + +free_input: + devm_kfree(dev, input); + + return ret; +} diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 023e3d2fee5f9464456b2b911e5f35e61ad7329c..c4fc3ee344d5e628e507ce357b8ced3265cad3bd 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_ADMINQ_CMD_H_ #define _ICE_ADMINQ_CMD_H_ @@ -8,10 +8,12 @@ * descriptor format. It is shared between Firmware and Software. 
*/ + #define ICE_MAX_VSI 768 #define ICE_AQC_TOPO_MAX_LEVEL_NUM 0x9 #define ICE_AQ_SET_MAC_FRAME_SIZE_MAX 9728 + struct ice_aqc_generic { __le32 param0; __le32 param1; @@ -19,6 +21,7 @@ struct ice_aqc_generic { __le32 addr_low; }; + /* Get version (direct 0x0001) */ struct ice_aqc_get_ver { __le32 rom_ver; @@ -33,6 +36,7 @@ struct ice_aqc_get_ver { u8 api_patch; }; + /* Send driver version (indirect 0x0002) */ struct ice_aqc_driver_ver { u8 major_ver; @@ -44,6 +48,7 @@ struct ice_aqc_driver_ver { __le32 addr_low; }; + /* Queue Shutdown (direct 0x0003) */ struct ice_aqc_q_shutdown { u8 driver_unloading; @@ -51,6 +56,17 @@ struct ice_aqc_q_shutdown { u8 reserved[15]; }; + + +/* Get Expanded Error Code (0x0005, direct) */ +struct ice_aqc_get_exp_err { + __le32 reason; +#define ICE_AQC_EXPANDED_ERROR_NOT_PROVIDED 0xFFFFFFFF + __le32 identifier; + u8 rsvd[8]; +}; + + /* Request resource ownership (direct 0x0008) * Release resource ownership (direct 0x0009) */ @@ -83,6 +99,7 @@ struct ice_aqc_req_res { u8 reserved[2]; }; + /* Get function capabilities (indirect 0x000A) * Get device capabilities (indirect 0x000B) */ @@ -95,19 +112,54 @@ struct ice_aqc_list_caps { __le32 addr_low; }; + /* Device/Function buffer entry, repeated per reported capability */ struct ice_aqc_list_caps_elem { __le16 cap; +#define ICE_AQC_CAPS_SWITCHING_MODE 0x0001 +#define ICE_AQC_CAPS_MANAGEABILITY_MODE 0x0002 +#define ICE_AQC_CAPS_OS2BMC 0x0004 #define ICE_AQC_CAPS_VALID_FUNCTIONS 0x0005 +#define ICE_AQC_MAX_VALID_FUNCTIONS 0x8 +#define ICE_AQC_CAPS_ALTERNATE_RAM 0x0006 +#define ICE_AQC_CAPS_WOL_PROXY 0x0008 #define ICE_AQC_CAPS_SRIOV 0x0012 #define ICE_AQC_CAPS_VF 0x0013 +#define ICE_AQC_CAPS_VMDQ 0x0014 +#define ICE_AQC_CAPS_802_1QBG 0x0015 +#define ICE_AQC_CAPS_802_1BR 0x0016 #define ICE_AQC_CAPS_VSI 0x0017 #define ICE_AQC_CAPS_DCB 0x0018 +#define ICE_AQC_CAPS_RSVD 0x0021 +#define ICE_AQC_CAPS_ISCSI 0x0022 #define ICE_AQC_CAPS_RSS 0x0040 #define ICE_AQC_CAPS_RXQS 0x0041 #define ICE_AQC_CAPS_TXQS 0x0042 #define ICE_AQC_CAPS_MSIX 0x0043 +#define ICE_AQC_CAPS_FD 0x0045 +#define ICE_AQC_CAPS_1588 0x0046 #define ICE_AQC_CAPS_MAX_MTU 0x0047 +#define ICE_AQC_CAPS_NVM_VER 0x0048 +#define ICE_AQC_CAPS_PENDING_NVM_VER 0x0049 +#define ICE_AQC_CAPS_OROM_VER 0x004A +#define ICE_AQC_CAPS_PENDING_OROM_VER 0x004B +#define ICE_AQC_CAPS_NET_VER 0x004C +#define ICE_AQC_CAPS_PENDING_NET_VER 0x004D +#define ICE_AQC_CAPS_CEM 0x00F2 +#define ICE_AQC_CAPS_IWARP 0x0051 +#define ICE_AQC_CAPS_LED 0x0061 +#define ICE_AQC_CAPS_SDP 0x0062 +#define ICE_AQC_CAPS_WR_CSR_PROT 0x0064 +#define ICE_AQC_CAPS_LOGI_TO_PHYSI_PORT_MAP 0x0073 +#define ICE_AQC_CAPS_SKU 0x0074 +#define ICE_AQC_CAPS_PORT_MAP 0x0075 +#define ICE_AQC_CAPS_PCIE_RESET_AVOIDANCE 0x0076 +#define ICE_AQC_CAPS_POST_UPDATE_RESET_RESTRICT 0x0077 +#define ICE_AQC_CAPS_NVM_MGMT 0x0080 +#define ICE_AQC_CAPS_EXT_TOPO_DEV_IMG0 0x0081 +#define ICE_AQC_CAPS_EXT_TOPO_DEV_IMG1 0x0082 +#define ICE_AQC_CAPS_EXT_TOPO_DEV_IMG2 0x0083 +#define ICE_AQC_CAPS_EXT_TOPO_DEV_IMG3 0x0084 u8 major_ver; u8 minor_ver; @@ -121,6 +173,7 @@ struct ice_aqc_list_caps_elem { __le64 rsvd2; }; + /* Manage MAC address, read command - indirect (0x0107) * This struct is also used for the response */ @@ -130,6 +183,8 @@ struct ice_aqc_manage_mac_read { #define ICE_AQC_MAN_MAC_SAN_ADDR_VALID BIT(5) #define ICE_AQC_MAN_MAC_PORT_ADDR_VALID BIT(6) #define ICE_AQC_MAN_MAC_WOL_ADDR_VALID BIT(7) +#define ICE_AQC_MAN_MAC_MC_MAG_EN BIT(8) +#define ICE_AQC_MAN_MAC_WOL_PRESERVE_ON_PFR BIT(9) #define ICE_AQC_MAN_MAC_READ_S 4 #define 
ICE_AQC_MAN_MAC_READ_M (0xF << ICE_AQC_MAN_MAC_READ_S) u8 rsvd[2]; @@ -139,6 +194,7 @@ struct ice_aqc_manage_mac_read { __le32 addr_low; }; + /* Response buffer format for manage MAC read command */ struct ice_aqc_manage_mac_read_resp { u8 lport_num; @@ -148,6 +204,7 @@ struct ice_aqc_manage_mac_read_resp { u8 mac_addr[ETH_ALEN]; }; + /* Manage MAC address, write command - direct (0x0108) */ struct ice_aqc_manage_mac_write { u8 rsvd; @@ -155,17 +212,16 @@ struct ice_aqc_manage_mac_write { #define ICE_AQC_MAN_MAC_WR_MC_MAG_EN BIT(0) #define ICE_AQC_MAN_MAC_WR_WOL_LAA_PFR_KEEP BIT(1) #define ICE_AQC_MAN_MAC_WR_S 6 -#define ICE_AQC_MAN_MAC_WR_M (3 << ICE_AQC_MAN_MAC_WR_S) +#define ICE_AQC_MAN_MAC_WR_M ICE_M(3, ICE_AQC_MAN_MAC_WR_S) #define ICE_AQC_MAN_MAC_UPDATE_LAA 0 -#define ICE_AQC_MAN_MAC_UPDATE_LAA_WOL (BIT(0) << ICE_AQC_MAN_MAC_WR_S) - /* High 16 bits of MAC address in big endian order */ - __be16 sah; - /* Low 32 bits of MAC address in big endian order */ - __be32 sal; +#define ICE_AQC_MAN_MAC_UPDATE_LAA_WOL BIT(ICE_AQC_MAN_MAC_WR_S) + /* byte stream in network order */ + u8 mac_addr[ETH_ALEN]; __le32 addr_high; __le32 addr_low; }; + /* Clear PXE Command and response (direct 0x0110) */ struct ice_aqc_clear_pxe { u8 rx_cnt; @@ -173,6 +229,15 @@ struct ice_aqc_clear_pxe { u8 reserved[15]; }; + +/* Configure No-Drop Policy Command (direct 0x0112) */ +struct ice_aqc_config_no_drop_policy { + u8 opts; +#define ICE_AQC_FORCE_NO_DROP BIT(0) + u8 rsvd[15]; +}; + + /* Get switch configuration (0x0200) */ struct ice_aqc_get_sw_cfg { /* Reserved for command and copy of request flags for response */ @@ -190,6 +255,7 @@ struct ice_aqc_get_sw_cfg { __le32 addr_low; }; + /* Each entry in the response buffer is of the following type: */ struct ice_aqc_get_sw_cfg_resp_elem { /* VSI/Port Number */ @@ -216,13 +282,30 @@ struct ice_aqc_get_sw_cfg_resp_elem { #define ICE_AQC_GET_SW_CONF_RESP_IS_VF BIT(15) }; -/* The response buffer is as follows. Note that the length of the - * elements array varies with the length of the command response. 
- */ -struct ice_aqc_get_sw_cfg_resp { - struct ice_aqc_get_sw_cfg_resp_elem elements[1]; + + +/* Set Port parameters, (direct, 0x0203) */ +struct ice_aqc_set_port_params { + __le16 cmd_flags; +#define ICE_AQC_SET_P_PARAMS_SAVE_BAD_PACKETS BIT(0) +#define ICE_AQC_SET_P_PARAMS_PAD_SHORT_PACKETS BIT(1) +#define ICE_AQC_SET_P_PARAMS_DOUBLE_VLAN_ENA BIT(2) + __le16 bad_frame_vsi; +#define ICE_AQC_SET_P_PARAMS_VSI_S 0 +#define ICE_AQC_SET_P_PARAMS_VSI_M (0x3FF << ICE_AQC_SET_P_PARAMS_VSI_S) +#define ICE_AQC_SET_P_PARAMS_VSI_VALID BIT(15) + __le16 swid; +#define ICE_AQC_SET_P_PARAMS_SWID_S 0 +#define ICE_AQC_SET_P_PARAMS_SWID_M (0xFF << ICE_AQC_SET_P_PARAMS_SWID_S) +#define ICE_AQC_SET_P_PARAMS_LOGI_PORT_ID_S 8 +#define ICE_AQC_SET_P_PARAMS_LOGI_PORT_ID_M \ + (0x3F << ICE_AQC_SET_P_PARAMS_LOGI_PORT_ID_S) +#define ICE_AQC_SET_P_PARAMS_IS_LOGI_PORT BIT(14) +#define ICE_AQC_SET_P_PARAMS_SWID_VALID BIT(15) + u8 reserved[10]; }; + /* These resource type defines are used for all switch resource * commands where a resource type is required, such as: * Get Resource Allocation command (indirect 0x0204) @@ -230,8 +313,64 @@ struct ice_aqc_get_sw_cfg_resp { * Free Resources command (indirect 0x0209) * Get Allocated Resource Descriptors Command (indirect 0x020A) */ +#define ICE_AQC_RES_TYPE_VEB_COUNTER 0x00 +#define ICE_AQC_RES_TYPE_VLAN_COUNTER 0x01 +#define ICE_AQC_RES_TYPE_MIRROR_RULE 0x02 #define ICE_AQC_RES_TYPE_VSI_LIST_REP 0x03 #define ICE_AQC_RES_TYPE_VSI_LIST_PRUNE 0x04 +#define ICE_AQC_RES_TYPE_RECIPE 0x05 +#define ICE_AQC_RES_TYPE_PROFILE 0x06 +#define ICE_AQC_RES_TYPE_SWID 0x07 +#define ICE_AQC_RES_TYPE_VSI 0x08 +#define ICE_AQC_RES_TYPE_FLU 0x09 +#define ICE_AQC_RES_TYPE_WIDE_TABLE_1 0x0A +#define ICE_AQC_RES_TYPE_WIDE_TABLE_2 0x0B +#define ICE_AQC_RES_TYPE_WIDE_TABLE_4 0x0C +#define ICE_AQC_RES_TYPE_GLOBAL_RSS_HASH 0x20 +#define ICE_AQC_RES_TYPE_FDIR_COUNTER_BLOCK 0x21 +#define ICE_AQC_RES_TYPE_FDIR_GUARANTEED_ENTRIES 0x22 +#define ICE_AQC_RES_TYPE_FDIR_SHARED_ENTRIES 0x23 +#define ICE_AQC_RES_TYPE_FLEX_DESC_PROG 0x30 +#define ICE_AQC_RES_TYPE_SWITCH_PROF_BLDR_PROFID 0x48 +#define ICE_AQC_RES_TYPE_SWITCH_PROF_BLDR_TCAM 0x49 +#define ICE_AQC_RES_TYPE_ACL_PROF_BLDR_PROFID 0x50 +#define ICE_AQC_RES_TYPE_ACL_PROF_BLDR_TCAM 0x51 +#define ICE_AQC_RES_TYPE_FD_PROF_BLDR_PROFID 0x58 +#define ICE_AQC_RES_TYPE_FD_PROF_BLDR_TCAM 0x59 +#define ICE_AQC_RES_TYPE_HASH_PROF_BLDR_PROFID 0x60 +#define ICE_AQC_RES_TYPE_HASH_PROF_BLDR_TCAM 0x61 +/* Resource types 0x62-67 are reserved for Hash profile builder */ +#define ICE_AQC_RES_TYPE_QHASH_PROF_BLDR_PROFID 0x68 +#define ICE_AQC_RES_TYPE_QHASH_PROF_BLDR_TCAM 0x69 + +#define ICE_AQC_RES_TYPE_FLAG_SHARED BIT(7) +#define ICE_AQC_RES_TYPE_FLAG_SCAN_BOTTOM BIT(12) +#define ICE_AQC_RES_TYPE_FLAG_IGNORE_INDEX BIT(13) + +#define ICE_AQC_RES_TYPE_FLAG_DEDICATED 0x00 + +#define ICE_AQC_RES_TYPE_S 0 +#define ICE_AQC_RES_TYPE_M (0x07F << ICE_AQC_RES_TYPE_S) + +/* Get Resource Allocation command (indirect 0x0204) */ +struct ice_aqc_get_res_alloc { + __le16 resp_elem_num; /* Used in response, reserved in command */ + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Get Resource Allocation Response Buffer per response */ +struct ice_aqc_get_res_resp_elem { + __le16 res_type; /* Types defined above cmd 0x0204 */ + __le16 total_capacity; /* Resources available to all PF's */ + __le16 total_function; /* Resources allocated for a PF */ + __le16 total_shared; /* Resources allocated as shared */ + __le16 total_free; /* Resources un-allocated/not reserved by any PF 
*/ +}; + + /* Allocate Resources command (indirect 0x0208) * Free Resources command (indirect 0x0209) @@ -243,6 +382,7 @@ struct ice_aqc_alloc_free_res_cmd { __le32 addr_low; }; + /* Resource descriptor */ struct ice_aqc_res_elem { union { @@ -251,18 +391,75 @@ struct ice_aqc_res_elem { } e; }; + /* Buffer for Allocate/Free Resources commands */ struct ice_aqc_alloc_free_res_elem { __le16 res_type; /* Types defined above cmd 0x0204 */ -#define ICE_AQC_RES_TYPE_SHARED_S 7 -#define ICE_AQC_RES_TYPE_SHARED_M (0x1 << ICE_AQC_RES_TYPE_SHARED_S) #define ICE_AQC_RES_TYPE_VSI_PRUNE_LIST_S 8 #define ICE_AQC_RES_TYPE_VSI_PRUNE_LIST_M \ (0xF << ICE_AQC_RES_TYPE_VSI_PRUNE_LIST_S) __le16 num_elems; - struct ice_aqc_res_elem elem[1]; + struct ice_aqc_res_elem elem[]; +}; + + +/* Get Allocated Resource Descriptors Command (indirect 0x020A) */ +struct ice_aqc_get_allocd_res_desc { + union { + struct { + __le16 res; /* Types defined above cmd 0x0204 */ + __le16 first_desc; + __le32 reserved; + } cmd; + struct { + __le16 res; + __le16 next_desc; + __le16 num_desc; + __le16 reserved; + } resp; + } ops; + __le32 addr_high; + __le32 addr_low; +}; + + + +/* Request buffer for Set VLAN Mode AQ command (indirect 0x020C) */ +struct ice_aqc_set_vlan_mode { + u8 reserved; + u8 l2tag_prio_tagging; +#define ICE_AQ_VLAN_PRIO_TAG_S 0 +#define ICE_AQ_VLAN_PRIO_TAG_M (0x7 << ICE_AQ_VLAN_PRIO_TAG_S) +#define ICE_AQ_VLAN_PRIO_TAG_NOT_SUPPORTED 0x0 +#define ICE_AQ_VLAN_PRIO_TAG_STAG 0x1 +#define ICE_AQ_VLAN_PRIO_TAG_OUTER_CTAG 0x2 +#define ICE_AQ_VLAN_PRIO_TAG_OUTER_VLAN 0x3 +#define ICE_AQ_VLAN_PRIO_TAG_INNER_CTAG 0x4 +#define ICE_AQ_VLAN_PRIO_TAG_MAX 0x4 +#define ICE_AQ_VLAN_PRIO_TAG_ERROR 0x7 + u8 l2tag_reserved[64]; + u8 rdma_packet; +#define ICE_AQ_VLAN_RDMA_TAG_S 0 +#define ICE_AQ_VLAN_RDMA_TAG_M (0x3F << ICE_AQ_VLAN_RDMA_TAG_S) +#define ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING 0x10 +#define ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING 0x1A + u8 rdma_reserved[2]; + u8 mng_vlan_prot_id; +#define ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER 0x10 +#define ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER 0x11 + u8 prot_id_reserved[30]; +}; + + +/* Response buffer for Get VLAN Mode AQ command (indirect 0x020D) */ +struct ice_aqc_get_vlan_mode { + u8 vlan_mode; +#define ICE_AQ_VLAN_MODE_DVM_ENA BIT(0) + u8 l2tag_prio_tagging; + u8 reserved[98]; }; + /* Add VSI (indirect 0x0210) * Update VSI (indirect 0x0211) * Get VSI (indirect 0x0212) @@ -288,6 +485,7 @@ struct ice_aqc_add_get_update_free_vsi { __le32 addr_low; }; + /* Response descriptor for: * Add VSI (indirect 0x0210) * Update VSI (indirect 0x0211) @@ -302,6 +500,21 @@ struct ice_aqc_add_update_free_vsi_resp { __le32 addr_low; }; + +struct ice_aqc_get_vsi_resp { + __le16 vsi_num; + u8 vf_id; + /* The vsi_flags field uses the ICE_AQ_VSI_TYPE_* defines for values. + * These are found above in struct ice_aqc_add_get_update_free_vsi. 
+ */ + u8 vsi_flags; + __le16 vsi_used; + __le16 vsi_free; + __le32 addr_high; + __le32 addr_low; +}; + + struct ice_aqc_vsi_props { __le16 valid_sections; #define ICE_AQ_VSI_PROP_SW_VALID BIT(0) @@ -313,6 +526,7 @@ struct ice_aqc_vsi_props { #define ICE_AQ_VSI_PROP_RXQ_MAP_VALID BIT(6) #define ICE_AQ_VSI_PROP_Q_OPT_VALID BIT(7) #define ICE_AQ_VSI_PROP_OUTER_UP_VALID BIT(8) +#define ICE_AQ_VSI_PROP_ACL_VALID BIT(10) #define ICE_AQ_VSI_PROP_FLOW_DIR_VALID BIT(11) #define ICE_AQ_VSI_PROP_PASID_VALID BIT(12) /* switch section */ @@ -323,141 +537,328 @@ struct ice_aqc_vsi_props { #define ICE_AQ_VSI_SW_FLAG_SRC_PRUNE BIT(7) u8 sw_flags2; #define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S 0 -#define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M \ - (0xF << ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S) +#define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M (0xF << ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S) #define ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA BIT(0) #define ICE_AQ_VSI_SW_FLAG_LAN_ENA BIT(4) u8 veb_stat_id; #define ICE_AQ_VSI_SW_VEB_STAT_ID_S 0 -#define ICE_AQ_VSI_SW_VEB_STAT_ID_M (0x1F << ICE_AQ_VSI_SW_VEB_STAT_ID_S) +#define ICE_AQ_VSI_SW_VEB_STAT_ID_M (0x1F << ICE_AQ_VSI_SW_VEB_STAT_ID_S) #define ICE_AQ_VSI_SW_VEB_STAT_ID_VALID BIT(5) /* security section */ u8 sec_flags; #define ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD BIT(0) #define ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF BIT(2) -#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S 4 -#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M (0xF << ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S) +#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S 4 +#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M (0xF << ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S) #define ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA BIT(0) u8 sec_reserved; /* VLAN section */ - __le16 pvid; /* VLANS include priority bits */ - u8 pvlan_reserved[2]; - u8 vlan_flags; -#define ICE_AQ_VSI_VLAN_MODE_S 0 -#define ICE_AQ_VSI_VLAN_MODE_M (0x3 << ICE_AQ_VSI_VLAN_MODE_S) -#define ICE_AQ_VSI_VLAN_MODE_UNTAGGED 0x1 -#define ICE_AQ_VSI_VLAN_MODE_TAGGED 0x2 -#define ICE_AQ_VSI_VLAN_MODE_ALL 0x3 -#define ICE_AQ_VSI_PVLAN_INSERT_PVID BIT(2) -#define ICE_AQ_VSI_VLAN_EMOD_S 3 -#define ICE_AQ_VSI_VLAN_EMOD_M (0x3 << ICE_AQ_VSI_VLAN_EMOD_S) -#define ICE_AQ_VSI_VLAN_EMOD_STR_BOTH (0x0 << ICE_AQ_VSI_VLAN_EMOD_S) -#define ICE_AQ_VSI_VLAN_EMOD_STR_UP (0x1 << ICE_AQ_VSI_VLAN_EMOD_S) -#define ICE_AQ_VSI_VLAN_EMOD_STR (0x2 << ICE_AQ_VSI_VLAN_EMOD_S) -#define ICE_AQ_VSI_VLAN_EMOD_NOTHING (0x3 << ICE_AQ_VSI_VLAN_EMOD_S) - u8 pvlan_reserved2[3]; + __le16 port_based_inner_vlan; /* VLANS include priority bits */ + u8 inner_vlan_reserved[2]; + u8 inner_vlan_flags; +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_S 0 +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_M (0x3 << ICE_AQ_VSI_INNER_VLAN_TX_MODE_S) +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTUNTAGGED 0x1 +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTTAGGED 0x2 +#define ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL 0x3 +#define ICE_AQ_VSI_INNER_VLAN_INSERT_PVID BIT(2) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_S 3 +#define ICE_AQ_VSI_INNER_VLAN_EMODE_M (0x3 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR_BOTH (0x0 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR_UP (0x1 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_STR (0x2 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING (0x3 << ICE_AQ_VSI_INNER_VLAN_EMODE_S) +#define ICE_AQ_VSI_INNER_VLAN_BLOCK_TX_DESC BIT(5) + u8 inner_vlan_reserved2[3]; /* ingress egress up sections */ __le32 ingress_table; /* bitmap, 3 bits per up */ -#define ICE_AQ_VSI_UP_TABLE_UP0_S 0 -#define 
ICE_AQ_VSI_UP_TABLE_UP0_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP0_S) -#define ICE_AQ_VSI_UP_TABLE_UP1_S 3 -#define ICE_AQ_VSI_UP_TABLE_UP1_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP1_S) -#define ICE_AQ_VSI_UP_TABLE_UP2_S 6 -#define ICE_AQ_VSI_UP_TABLE_UP2_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP2_S) -#define ICE_AQ_VSI_UP_TABLE_UP3_S 9 -#define ICE_AQ_VSI_UP_TABLE_UP3_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP3_S) -#define ICE_AQ_VSI_UP_TABLE_UP4_S 12 -#define ICE_AQ_VSI_UP_TABLE_UP4_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP4_S) -#define ICE_AQ_VSI_UP_TABLE_UP5_S 15 -#define ICE_AQ_VSI_UP_TABLE_UP5_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP5_S) -#define ICE_AQ_VSI_UP_TABLE_UP6_S 18 -#define ICE_AQ_VSI_UP_TABLE_UP6_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP6_S) -#define ICE_AQ_VSI_UP_TABLE_UP7_S 21 -#define ICE_AQ_VSI_UP_TABLE_UP7_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP7_S) +#define ICE_AQ_VSI_UP_TABLE_UP0_S 0 +#define ICE_AQ_VSI_UP_TABLE_UP0_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP0_S) +#define ICE_AQ_VSI_UP_TABLE_UP1_S 3 +#define ICE_AQ_VSI_UP_TABLE_UP1_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP1_S) +#define ICE_AQ_VSI_UP_TABLE_UP2_S 6 +#define ICE_AQ_VSI_UP_TABLE_UP2_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP2_S) +#define ICE_AQ_VSI_UP_TABLE_UP3_S 9 +#define ICE_AQ_VSI_UP_TABLE_UP3_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP3_S) +#define ICE_AQ_VSI_UP_TABLE_UP4_S 12 +#define ICE_AQ_VSI_UP_TABLE_UP4_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP4_S) +#define ICE_AQ_VSI_UP_TABLE_UP5_S 15 +#define ICE_AQ_VSI_UP_TABLE_UP5_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP5_S) +#define ICE_AQ_VSI_UP_TABLE_UP6_S 18 +#define ICE_AQ_VSI_UP_TABLE_UP6_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP6_S) +#define ICE_AQ_VSI_UP_TABLE_UP7_S 21 +#define ICE_AQ_VSI_UP_TABLE_UP7_M (0x7 << ICE_AQ_VSI_UP_TABLE_UP7_S) __le32 egress_table; /* same defines as for ingress table */ /* outer tags section */ - __le16 outer_tag; - u8 outer_tag_flags; -#define ICE_AQ_VSI_OUTER_TAG_MODE_S 0 -#define ICE_AQ_VSI_OUTER_TAG_MODE_M (0x3 << ICE_AQ_VSI_OUTER_TAG_MODE_S) -#define ICE_AQ_VSI_OUTER_TAG_NOTHING 0x0 -#define ICE_AQ_VSI_OUTER_TAG_REMOVE 0x1 -#define ICE_AQ_VSI_OUTER_TAG_COPY 0x2 -#define ICE_AQ_VSI_OUTER_TAG_TYPE_S 2 -#define ICE_AQ_VSI_OUTER_TAG_TYPE_M (0x3 << ICE_AQ_VSI_OUTER_TAG_TYPE_S) -#define ICE_AQ_VSI_OUTER_TAG_NONE 0x0 -#define ICE_AQ_VSI_OUTER_TAG_STAG 0x1 -#define ICE_AQ_VSI_OUTER_TAG_VLAN_8100 0x2 -#define ICE_AQ_VSI_OUTER_TAG_VLAN_9100 0x3 -#define ICE_AQ_VSI_OUTER_TAG_INSERT BIT(4) -#define ICE_AQ_VSI_OUTER_TAG_ACCEPT_HOST BIT(6) - u8 outer_tag_reserved; + __le16 port_based_outer_vlan; + u8 outer_vlan_flags; +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_S 0 +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_M (0x3 << ICE_AQ_VSI_OUTER_VLAN_EMODE_S) +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH 0x0 +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_UP 0x1 +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW 0x2 +#define ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING 0x3 +#define ICE_AQ_VSI_OUTER_TAG_TYPE_S 2 +#define ICE_AQ_VSI_OUTER_TAG_TYPE_M (0x3 << ICE_AQ_VSI_OUTER_TAG_TYPE_S) +#define ICE_AQ_VSI_OUTER_TAG_NONE 0x0 +#define ICE_AQ_VSI_OUTER_TAG_STAG 0x1 +#define ICE_AQ_VSI_OUTER_TAG_VLAN_8100 0x2 +#define ICE_AQ_VSI_OUTER_TAG_VLAN_9100 0x3 +#define ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT BIT(4) +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S 5 +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M (0x3 << ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED 0x1 +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTTAGGED 0x2 +#define ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL 0x3 +#define ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC BIT(7) + u8 outer_vlan_reserved; /* queue mapping section */ __le16 mapping_flags; 
-#define ICE_AQ_VSI_Q_MAP_CONTIG 0x0 -#define ICE_AQ_VSI_Q_MAP_NONCONTIG BIT(0) +#define ICE_AQ_VSI_Q_MAP_CONTIG 0x0 +#define ICE_AQ_VSI_Q_MAP_NONCONTIG BIT(0) __le16 q_mapping[16]; -#define ICE_AQ_VSI_Q_S 0 -#define ICE_AQ_VSI_Q_M (0x7FF << ICE_AQ_VSI_Q_S) +#define ICE_AQ_VSI_Q_S 0 +#define ICE_AQ_VSI_Q_M (0x7FF << ICE_AQ_VSI_Q_S) __le16 tc_mapping[8]; -#define ICE_AQ_VSI_TC_Q_OFFSET_S 0 -#define ICE_AQ_VSI_TC_Q_OFFSET_M (0x7FF << ICE_AQ_VSI_TC_Q_OFFSET_S) -#define ICE_AQ_VSI_TC_Q_NUM_S 11 -#define ICE_AQ_VSI_TC_Q_NUM_M (0xF << ICE_AQ_VSI_TC_Q_NUM_S) +#define ICE_AQ_VSI_TC_Q_OFFSET_S 0 +#define ICE_AQ_VSI_TC_Q_OFFSET_M (0x7FF << ICE_AQ_VSI_TC_Q_OFFSET_S) +#define ICE_AQ_VSI_TC_Q_NUM_S 11 +#define ICE_AQ_VSI_TC_Q_NUM_M (0xF << ICE_AQ_VSI_TC_Q_NUM_S) /* queueing option section */ u8 q_opt_rss; -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_S 0 -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_M (0x3 << ICE_AQ_VSI_Q_OPT_RSS_LUT_S) -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI 0x0 -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_PF 0x2 -#define ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL 0x3 -#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S 2 -#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M (0xF << ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S) -#define ICE_AQ_VSI_Q_OPT_RSS_HASH_S 6 -#define ICE_AQ_VSI_Q_OPT_RSS_HASH_M (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) -#define ICE_AQ_VSI_Q_OPT_RSS_TPLZ (0x0 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) -#define ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ (0x1 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) -#define ICE_AQ_VSI_Q_OPT_RSS_XOR (0x2 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) -#define ICE_AQ_VSI_Q_OPT_RSS_JHASH (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_S 0 +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_M (0x3 << ICE_AQ_VSI_Q_OPT_RSS_LUT_S) +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI 0x0 +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_PF 0x2 +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL 0x3 +#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S 2 +#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M (0xF << ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S) +#define ICE_AQ_VSI_Q_OPT_RSS_HASH_S 6 +#define ICE_AQ_VSI_Q_OPT_RSS_HASH_M (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_TPLZ (0x0 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ (0x1 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_XOR (0x2 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) +#define ICE_AQ_VSI_Q_OPT_RSS_JHASH (0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) u8 q_opt_tc; -#define ICE_AQ_VSI_Q_OPT_TC_OVR_S 0 -#define ICE_AQ_VSI_Q_OPT_TC_OVR_M (0x1F << ICE_AQ_VSI_Q_OPT_TC_OVR_S) -#define ICE_AQ_VSI_Q_OPT_PROF_TC_OVR BIT(7) +#define ICE_AQ_VSI_Q_OPT_TC_OVR_S 0 +#define ICE_AQ_VSI_Q_OPT_TC_OVR_M (0x1F << ICE_AQ_VSI_Q_OPT_TC_OVR_S) +#define ICE_AQ_VSI_Q_OPT_PROF_TC_OVR BIT(7) u8 q_opt_flags; -#define ICE_AQ_VSI_Q_OPT_PE_FLTR_EN BIT(0) +#define ICE_AQ_VSI_Q_OPT_PE_FLTR_EN BIT(0) u8 q_opt_reserved[3]; /* outer up section */ __le32 outer_up_table; /* same structure and defines as ingress tbl */ - /* section 10 */ - __le16 sect_10_reserved; + /* ACL section */ + __le16 acl_def_act; +#define ICE_AQ_VSI_ACL_DEF_RX_PROF_S 0 +#define ICE_AQ_VSI_ACL_DEF_RX_PROF_M (0xF << ICE_AQ_VSI_ACL_DEF_RX_PROF_S) +#define ICE_AQ_VSI_ACL_DEF_RX_TABLE_S 4 +#define ICE_AQ_VSI_ACL_DEF_RX_TABLE_M (0xF << ICE_AQ_VSI_ACL_DEF_RX_TABLE_S) +#define ICE_AQ_VSI_ACL_DEF_TX_PROF_S 8 +#define ICE_AQ_VSI_ACL_DEF_TX_PROF_M (0xF << ICE_AQ_VSI_ACL_DEF_TX_PROF_S) +#define ICE_AQ_VSI_ACL_DEF_TX_TABLE_S 12 +#define ICE_AQ_VSI_ACL_DEF_TX_TABLE_M (0xF << ICE_AQ_VSI_ACL_DEF_TX_TABLE_S) /* flow director section */ __le16 fd_options; -#define ICE_AQ_VSI_FD_ENABLE BIT(0) -#define ICE_AQ_VSI_FD_TX_AUTO_ENABLE BIT(1) 
-#define ICE_AQ_VSI_FD_PROG_ENABLE BIT(3) +#define ICE_AQ_VSI_FD_ENABLE BIT(0) +#define ICE_AQ_VSI_FD_TX_AUTO_ENABLE BIT(1) +#define ICE_AQ_VSI_FD_PROG_ENABLE BIT(3) __le16 max_fd_fltr_dedicated; __le16 max_fd_fltr_shared; __le16 fd_def_q; -#define ICE_AQ_VSI_FD_DEF_Q_S 0 -#define ICE_AQ_VSI_FD_DEF_Q_M (0x7FF << ICE_AQ_VSI_FD_DEF_Q_S) -#define ICE_AQ_VSI_FD_DEF_GRP_S 12 -#define ICE_AQ_VSI_FD_DEF_GRP_M (0x7 << ICE_AQ_VSI_FD_DEF_GRP_S) +#define ICE_AQ_VSI_FD_DEF_Q_S 0 +#define ICE_AQ_VSI_FD_DEF_Q_M (0x7FF << ICE_AQ_VSI_FD_DEF_Q_S) +#define ICE_AQ_VSI_FD_DEF_GRP_S 12 +#define ICE_AQ_VSI_FD_DEF_GRP_M (0x7 << ICE_AQ_VSI_FD_DEF_GRP_S) __le16 fd_report_opt; -#define ICE_AQ_VSI_FD_REPORT_Q_S 0 -#define ICE_AQ_VSI_FD_REPORT_Q_M (0x7FF << ICE_AQ_VSI_FD_REPORT_Q_S) -#define ICE_AQ_VSI_FD_DEF_PRIORITY_S 12 -#define ICE_AQ_VSI_FD_DEF_PRIORITY_M (0x7 << ICE_AQ_VSI_FD_DEF_PRIORITY_S) -#define ICE_AQ_VSI_FD_DEF_DROP BIT(15) +#define ICE_AQ_VSI_FD_REPORT_Q_S 0 +#define ICE_AQ_VSI_FD_REPORT_Q_M (0x7FF << ICE_AQ_VSI_FD_REPORT_Q_S) +#define ICE_AQ_VSI_FD_DEF_PRIORITY_S 12 +#define ICE_AQ_VSI_FD_DEF_PRIORITY_M (0x7 << ICE_AQ_VSI_FD_DEF_PRIORITY_S) +#define ICE_AQ_VSI_FD_DEF_DROP BIT(15) /* PASID section */ __le32 pasid_id; -#define ICE_AQ_VSI_PASID_ID_S 0 -#define ICE_AQ_VSI_PASID_ID_M (0xFFFFF << ICE_AQ_VSI_PASID_ID_S) -#define ICE_AQ_VSI_PASID_ID_VALID BIT(31) +#define ICE_AQ_VSI_PASID_ID_S 0 +#define ICE_AQ_VSI_PASID_ID_M (0xFFFFF << ICE_AQ_VSI_PASID_ID_S) +#define ICE_AQ_VSI_PASID_ID_VALID BIT(31) u8 reserved[24]; }; + +/* Add/update mirror rule - direct (0x0260) */ +#define ICE_AQC_RULE_ID_VALID_S 7 +#define ICE_AQC_RULE_ID_VALID_M (0x1 << ICE_AQC_RULE_ID_VALID_S) +#define ICE_AQC_RULE_ID_S 0 +#define ICE_AQC_RULE_ID_M (0x3F << ICE_AQC_RULE_ID_S) + +/* Following defines to be used while processing caller specified mirror list + * of VSI indexes. + */ +/* Action: Byte.bit (1.7) + * 0 = Remove VSI from mirror rule + * 1 = Add VSI to mirror rule + */ +#define ICE_AQC_RULE_ACT_S 15 +#define ICE_AQC_RULE_ACT_M (0x1 << ICE_AQC_RULE_ACT_S) +/* Action: 1.2:0.0 = Mirrored VSI */ +#define ICE_AQC_RULE_MIRRORED_VSI_S 0 +#define ICE_AQC_RULE_MIRRORED_VSI_M (0x7FF << ICE_AQC_RULE_MIRRORED_VSI_S) + +/* This is to be used by add/update mirror rule Admin Queue command. + * In case of add mirror rule - if rule ID is specified as + * INVAL_MIRROR_RULE_ID, new rule ID is allocated from shared pool. + * If specified rule_id is valid, then it is used. If specified rule_id + * is in use then new mirroring rule is added. + */ +#define ICE_INVAL_MIRROR_RULE_ID 0xFFFF + +struct ice_aqc_add_update_mir_rule { + __le16 rule_id; + + __le16 rule_type; +#define ICE_AQC_RULE_TYPE_S 0 +#define ICE_AQC_RULE_TYPE_M (0x7 << ICE_AQC_RULE_TYPE_S) + /* VPORT ingress/egress */ +#define ICE_AQC_RULE_TYPE_VPORT_INGRESS 0x1 +#define ICE_AQC_RULE_TYPE_VPORT_EGRESS 0x2 + /* Physical port ingress mirroring. + * All traffic received by this port + */ +#define ICE_AQC_RULE_TYPE_PPORT_INGRESS 0x6 + /* Physical port egress mirroring. All traffic sent by this port */ +#define ICE_AQC_RULE_TYPE_PPORT_EGRESS 0x7 + + /* Number of mirrored entries. + * The values are in the command buffer + */ + __le16 num_entries; + + /* Destination VSI */ + __le16 dest; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Delete mirror rule - direct(0x0261) */ +struct ice_aqc_delete_mir_rule { + __le16 rule_id; + __le16 rsvd; + + /* Byte.bit: 20.0 = Keep allocation. 
If set VSI stays part of + * the PF allocated resources, otherwise it is returned to the + * shared pool + */ +#define ICE_AQC_FLAG_KEEP_ALLOCD_S 0 +#define ICE_AQC_FLAG_KEEP_ALLOCD_M (0x1 << ICE_AQC_FLAG_KEEP_ALLOCD_S) + __le16 flags; + + u8 reserved[10]; +}; + + +/* Set/Get storm config - (direct 0x0280, 0x0281) */ +/* This structure holds get storm configuration response and same structure + * is used to perform set_storm_cfg + */ +struct ice_aqc_storm_cfg { + __le32 bcast_thresh_size; + __le32 mcast_thresh_size; + /* Bit 18:0 - Traffic upper threshold size + * Bit 31:19 - Reserved + */ +#define ICE_AQ_THRESHOLD_S 0 +#define ICE_AQ_THRESHOLD_M (0x7FFFF << ICE_AQ_THRESHOLD_S) + + __le32 storm_ctrl_ctrl; + /* Bit 0: MDIPW - Drop Multicast packets in previous window + * Bit 1: MDICW - Drop multicast packets in current window + * Bit 2: BDIPW - Drop broadcast packets in previous window + * Bit 3: BDICW - Drop broadcast packets in current window + */ +#define ICE_AQ_STORM_CTRL_MDIPW_DROP_MULTICAST BIT(0) +#define ICE_AQ_STORM_CTRL_MDICW_DROP_MULTICAST BIT(1) +#define ICE_AQ_STORM_CTRL_BDIPW_DROP_MULTICAST BIT(2) +#define ICE_AQ_STORM_CTRL_BDICW_DROP_MULTICAST BIT(3) + /* Bit 7:5 : Reserved */ + /* Bit 27:8 : Interval - BSC/MSC Time-interval specification: The + * interval size for applying ingress broadcast or multicast storm + * control. + */ +#define ICE_AQ_STORM_BSC_MSC_TIME_INTERVAL_S 8 +#define ICE_AQ_STORM_BSC_MSC_TIME_INTERVAL_M \ + (0xFFFFF << ICE_AQ_STORM_BSC_MSC_TIME_INTERVAL_S) + __le32 reserved; +}; + + #define ICE_MAX_NUM_RECIPES 64 +/* Add/Get Recipe (indirect 0x0290/0x0292) */ +struct ice_aqc_add_get_recipe { + __le16 num_sub_recipes; /* Input in Add cmd, Output in Get cmd */ + __le16 return_index; /* Input, used for Get cmd only */ + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_recipe_content { + u8 rid; +#define ICE_AQ_RECIPE_ID_S 0 +#define ICE_AQ_RECIPE_ID_M (0x3F << ICE_AQ_RECIPE_ID_S) +#define ICE_AQ_RECIPE_ID_IS_ROOT BIT(7) +#define ICE_AQ_SW_ID_LKUP_IDX 0 + u8 lkup_indx[5]; +#define ICE_AQ_RECIPE_LKUP_DATA_S 0 +#define ICE_AQ_RECIPE_LKUP_DATA_M (0x3F << ICE_AQ_RECIPE_LKUP_DATA_S) +#define ICE_AQ_RECIPE_LKUP_IGNORE BIT(7) +#define ICE_AQ_SW_ID_LKUP_MASK 0x00FF + __le16 mask[5]; + u8 result_indx; +#define ICE_AQ_RECIPE_RESULT_DATA_S 0 +#define ICE_AQ_RECIPE_RESULT_DATA_M (0x3F << ICE_AQ_RECIPE_RESULT_DATA_S) +#define ICE_AQ_RECIPE_RESULT_EN BIT(7) + u8 rsvd0[3]; + u8 act_ctrl_join_priority; + u8 act_ctrl_fwd_priority; +#define ICE_AQ_RECIPE_FWD_PRIORITY_S 0 +#define ICE_AQ_RECIPE_FWD_PRIORITY_M (0xF << ICE_AQ_RECIPE_FWD_PRIORITY_S) + u8 act_ctrl; +#define ICE_AQ_RECIPE_ACT_NEED_PASS_L2 BIT(0) +#define ICE_AQ_RECIPE_ACT_ALLOW_PASS_L2 BIT(1) +#define ICE_AQ_RECIPE_ACT_INV_ACT BIT(2) +#define ICE_AQ_RECIPE_ACT_PRUNE_INDX_S 4 +#define ICE_AQ_RECIPE_ACT_PRUNE_INDX_M (0x3 << ICE_AQ_RECIPE_ACT_PRUNE_INDX_S) + u8 rsvd1; + __le32 dflt_act; +#define ICE_AQ_RECIPE_DFLT_ACT_S 0 +#define ICE_AQ_RECIPE_DFLT_ACT_M (0x7FFFF << ICE_AQ_RECIPE_DFLT_ACT_S) +#define ICE_AQ_RECIPE_DFLT_ACT_VALID BIT(31) +}; + + +struct ice_aqc_recipe_data_elem { + u8 recipe_indx; + u8 resp_bits; +#define ICE_AQ_RECIPE_WAS_UPDATED BIT(0) + u8 rsvd0[2]; + u8 recipe_bitmap[8]; + u8 rsvd1[4]; + struct ice_aqc_recipe_content content; + u8 rsvd2[20]; +}; + + +/* Set/Get Recipes to Profile Association (direct 0x0291/0x0293) */ +struct ice_aqc_recipe_to_profile { + __le16 profile_id; + u8 rsvd[6]; + DECLARE_BITMAP(recipe_assoc, ICE_MAX_NUM_RECIPES); +}; + + /* Add/Update/Remove/Get 
switch rules (indirect 0x02A0, 0x02A1, 0x02A2, 0x02A3) */ struct ice_aqc_sw_rules { @@ -472,6 +873,7 @@ struct ice_aqc_sw_rules { __le32 addr_low; }; + /* Add/Update/Get/Remove lookup Rx/Tx command/response entry * This structures describes the lookup rules and associated actions. "index" * is returned as part of a response to a successful Add command, and can be @@ -534,7 +936,7 @@ struct ice_sw_rule_lkup_rx_tx { #define ICE_SINGLE_ACT_OTHER_ACTS 0x3 #define ICE_SINGLE_OTHER_ACT_IDENTIFIER_S 17 #define ICE_SINGLE_OTHER_ACT_IDENTIFIER_M \ - (0x3 << \ ICE_SINGLE_OTHER_ACT_IDENTIFIER_S) + (0x3 << ICE_SINGLE_OTHER_ACT_IDENTIFIER_S) /* Bit 17:18 - Defines other actions */ /* Other action = 0 - Mirror VSI */ @@ -554,8 +956,9 @@ struct ice_sw_rule_lkup_rx_tx { * lookup-type */ __le16 hdr_len; - u8 hdr[1]; -} __packed; + u8 hdr[]; +}; + /* Add/Update/Remove large action command/response entry * "index" is returned as part of a response to a successful Add command, and @@ -564,7 +967,6 @@ struct ice_sw_rule_lkup_rx_tx { struct ice_sw_rule_lg_act { __le16 index; /* Index in large action table */ __le16 size; - __le32 act[1]; /* array of size for actions */ /* Max number of large actions */ #define ICE_MAX_LG_ACT 4 /* Bit 0:1 - Action type */ @@ -615,8 +1017,10 @@ struct ice_sw_rule_lg_act { #define ICE_LG_ACT_STAT_COUNT 0x7 #define ICE_LG_ACT_STAT_COUNT_S 3 #define ICE_LG_ACT_STAT_COUNT_M (0x7F << ICE_LG_ACT_STAT_COUNT_S) + __le32 act[]; /* array of size for actions */ }; + /* Add/Update/Remove VSI list command/response entry * "index" is returned as part of a response to a successful Add command, and * can be used to identify the VSI list for Update/Get/Remove commands. @@ -624,15 +1028,17 @@ struct ice_sw_rule_lg_act { struct ice_sw_rule_vsi_list { __le16 index; /* Index of VSI/Prune list */ __le16 number_vsi; - __le16 vsi[1]; /* Array of number_vsi VSI numbers */ + __le16 vsi[]; /* Array of number_vsi VSI numbers */ }; + /* Query VSI list command/response entry */ struct ice_sw_rule_vsi_list_query { __le16 index; DECLARE_BITMAP(vsi_list, ICE_MAX_VSI); } __packed; + /* Add switch rule response: * Content of return buffer is same as the input buffer. 
The status field and * LUT index are updated as part of the response @@ -655,6 +1061,47 @@ struct ice_aqc_sw_rules_elem { } __packed pdata; }; + + +/* PFC Ignore (direct 0x0301) + * The command and response use the same descriptor structure + */ +struct ice_aqc_pfc_ignore { + u8 tc_bitmap; + u8 cmd_flags; /* unused in response */ +#define ICE_AQC_PFC_IGNORE_SET BIT(7) +#define ICE_AQC_PFC_IGNORE_CLEAR 0 + u8 reserved[14]; +}; + + +/* Set PFC Mode (direct 0x0303) + * Query PFC Mode (direct 0x0302) + */ +struct ice_aqc_set_query_pfc_mode { + u8 pfc_mode; +/* For Set Command response, reserved in all other cases */ +#define ICE_AQC_PFC_NOT_CONFIGURED 0 +/* For Query Command response, reserved in all other cases */ +#define ICE_AQC_DCB_DIS 0 +#define ICE_AQC_PFC_VLAN_BASED_PFC 1 +#define ICE_AQC_PFC_DSCP_BASED_PFC 2 + u8 rsvd[15]; +}; + + +/* Set DCB Parameters (direct 0x0306) */ +struct ice_aqc_set_dcb_params { + u8 cmd_flags; /* unused in response */ +#define ICE_AQC_LINK_UP_DCB_CFG BIT(0) +#define ICE_AQC_PERSIST_DCB_CFG BIT(1) + u8 valid_flags; /* unused in response */ +#define ICE_AQC_LINK_UP_DCB_CFG_VALID BIT(0) +#define ICE_AQC_PERSIST_DCB_CFG_VALID BIT(1) + u8 rsvd[14]; +}; + + /* Get Default Topology (indirect 0x0400) */ struct ice_aqc_get_topo { u8 port_num; @@ -665,6 +1112,7 @@ struct ice_aqc_get_topo { __le32 addr_low; }; + /* Update TSE (indirect 0x0403) * Get TSE (indirect 0x0404) * Add TSE (indirect 0x0401) @@ -681,19 +1129,29 @@ struct ice_aqc_sched_elem_cmd { __le32 addr_low; }; -/* This is the buffer for: - * Suspend Nodes (indirect 0x0409) - * Resume Nodes (indirect 0x040A) - */ -struct ice_aqc_suspend_resume_elem { - __le32 teid[1]; + + +struct ice_aqc_txsched_move_grp_info_hdr { + __le32 src_parent_teid; + __le32 dest_parent_teid; + __le16 num_elems; + u8 flags; + u8 reserved; +}; + + +struct ice_aqc_move_elem { + struct ice_aqc_txsched_move_grp_info_hdr hdr; + __le32 teid[]; }; + struct ice_aqc_elem_info_bw { __le16 bw_profile_idx; __le16 bw_alloc; }; + struct ice_aqc_txsched_elem { u8 elem_type; /* Special field, reserved for some aq calls */ #define ICE_AQC_ELEM_TYPE_UNDEFINED 0x0 @@ -725,26 +1183,27 @@ struct ice_aqc_txsched_elem { __le16 reserved2; }; + struct ice_aqc_txsched_elem_data { __le32 parent_teid; __le32 node_teid; struct ice_aqc_txsched_elem data; }; + struct ice_aqc_txsched_topo_grp_info_hdr { __le32 parent_teid; __le16 num_elems; __le16 reserved2; }; + struct ice_aqc_add_elem { struct ice_aqc_txsched_topo_grp_info_hdr hdr; - struct ice_aqc_txsched_elem_data generic[1]; + struct ice_aqc_txsched_elem_data generic[]; }; -struct ice_aqc_get_elem { - struct ice_aqc_txsched_elem_data generic[1]; -}; + struct ice_aqc_get_topo_elem { struct ice_aqc_txsched_topo_grp_info_hdr hdr; @@ -752,11 +1211,13 @@ struct ice_aqc_get_topo_elem { generic[ICE_AQC_TOPO_MAX_LEVEL_NUM]; }; + struct ice_aqc_delete_elem { struct ice_aqc_txsched_topo_grp_info_hdr hdr; - __le32 teid[1]; + __le32 teid[]; }; + /* Query Port ETS (indirect 0x040E) * * This indirect command is used to query port TC node configuration. 
@@ -768,6 +1229,7 @@ struct ice_aqc_query_port_ets { __le32 addr_low; }; + struct ice_aqc_port_ets_elem { u8 tc_valid_bits; u8 reserved[3]; @@ -783,6 +1245,63 @@ struct ice_aqc_port_ets_elem { __le32 tc_node_teid[8]; /* Used for response, reserved in command */ }; + +/* Rate limiting profile for + * Add RL profile (indirect 0x0410) + * Query RL profile (indirect 0x0411) + * Remove RL profile (indirect 0x0415) + * These indirect commands acts on single or multiple + * RL profiles with specified data. + */ +struct ice_aqc_rl_profile { + __le16 num_profiles; + __le16 num_processed; /* Only for response. Reserved in Command. */ + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_rl_profile_elem { + u8 level; + u8 flags; +#define ICE_AQC_RL_PROFILE_TYPE_S 0x0 +#define ICE_AQC_RL_PROFILE_TYPE_M (0x3 << ICE_AQC_RL_PROFILE_TYPE_S) +#define ICE_AQC_RL_PROFILE_TYPE_CIR 0 +#define ICE_AQC_RL_PROFILE_TYPE_EIR 1 +#define ICE_AQC_RL_PROFILE_TYPE_SRL 2 +/* The following flag is used for Query RL Profile Data */ +#define ICE_AQC_RL_PROFILE_INVAL_S 0x7 +#define ICE_AQC_RL_PROFILE_INVAL_M (0x1 << ICE_AQC_RL_PROFILE_INVAL_S) + + __le16 profile_id; + __le16 max_burst_size; + __le16 rl_multiply; + __le16 wake_up_calc; + __le16 rl_encode; +}; + + + +/* Configure L2 Node CGD (indirect 0x0414) + * This indirect command allows configuring a congestion domain for given L2 + * node TEIDs in the scheduler topology. + */ +struct ice_aqc_cfg_l2_node_cgd { + __le16 num_l2_nodes; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_cfg_l2_node_cgd_elem { + __le32 node_teid; + u8 cgd; + u8 reserved[3]; +}; + + /* Query Scheduler Resource Allocation (indirect 0x0412) * This indirect command retrieves the scheduler resources allocated by * EMP Firmware to the given PF. @@ -793,6 +1312,7 @@ struct ice_aqc_query_txsched_res { __le32 addr_low; }; + struct ice_aqc_generic_sched_props { __le16 phys_levels; __le16 logical_levels; @@ -804,6 +1324,7 @@ struct ice_aqc_generic_sched_props { u8 rsvd1[22]; }; + struct ice_aqc_layer_props { u8 logical_layer; u8 chunk_size; @@ -817,11 +1338,24 @@ struct ice_aqc_layer_props { u8 rsvd1[14]; }; + struct ice_aqc_query_txsched_res_resp { struct ice_aqc_generic_sched_props sched_props; struct ice_aqc_layer_props layer_props[ICE_AQC_TOPO_MAX_LEVEL_NUM]; }; + +/* Query Node to Root Topology (indirect 0x0413) + * This command uses ice_aqc_get_elem as its data buffer. 
+ */ +struct ice_aqc_query_node_to_root { + __le32 teid; + __le32 num_nodes; /* Response only */ + __le32 addr_high; + __le32 addr_low; +}; + + /* Get PHY capabilities (indirect 0x0600) */ struct ice_aqc_get_phy_caps { u8 lport_num; @@ -829,21 +1363,24 @@ struct ice_aqc_get_phy_caps { __le16 param0; /* 18.0 - Report qualified modules */ #define ICE_AQC_GET_PHY_RQM BIT(0) - /* 18.1 - 18.2 : Report mode - * 00b - Report NVM capabilities - * 01b - Report topology capabilities - * 10b - Report SW configured + /* 18.1 - 18.3 : Report mode + * 000b - Report NVM capabilities + * 001b - Report topology capabilities + * 010b - Report SW configured + * 100b - Report default capabilities */ -#define ICE_AQC_REPORT_MODE_S 1 -#define ICE_AQC_REPORT_MODE_M (3 << ICE_AQC_REPORT_MODE_S) -#define ICE_AQC_REPORT_NVM_CAP 0 -#define ICE_AQC_REPORT_TOPO_CAP BIT(1) -#define ICE_AQC_REPORT_SW_CFG BIT(2) +#define ICE_AQC_REPORT_MODE_S 1 +#define ICE_AQC_REPORT_MODE_M (7 << ICE_AQC_REPORT_MODE_S) +#define ICE_AQC_REPORT_TOPO_CAP_NO_MEDIA 0 +#define ICE_AQC_REPORT_TOPO_CAP_MEDIA BIT(1) +#define ICE_AQC_REPORT_ACTIVE_CFG BIT(2) +#define ICE_AQC_REPORT_DFLT_CFG BIT(3) __le32 reserved1; __le32 addr_high; __le32 addr_low; }; + /* This is #define of PHY type (Extended): * The first set of defines is for phy_type_low. */ @@ -918,7 +1455,7 @@ struct ice_aqc_get_phy_caps { #define ICE_PHY_TYPE_HIGH_100G_CAUI2 BIT_ULL(2) #define ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC BIT_ULL(3) #define ICE_PHY_TYPE_HIGH_100G_AUI2 BIT_ULL(4) -#define ICE_PHY_TYPE_HIGH_MAX_INDEX 19 +#define ICE_PHY_TYPE_HIGH_MAX_INDEX 5 struct ice_aqc_get_phy_caps_data { __le64 phy_type_low; /* Use values from ICE_PHY_TYPE_LOW_* */ @@ -929,11 +1466,15 @@ struct ice_aqc_get_phy_caps_data { #define ICE_AQC_PHY_LOW_POWER_MODE BIT(2) #define ICE_AQC_PHY_EN_LINK BIT(3) #define ICE_AQC_PHY_AN_MODE BIT(4) -#define ICE_AQC_GET_PHY_EN_MOD_QUAL BIT(5) +#define ICE_AQC_PHY_EN_MOD_QUAL BIT(5) +#define ICE_AQC_PHY_EN_LESM BIT(6) #define ICE_AQC_PHY_EN_AUTO_FEC BIT(7) #define ICE_AQC_PHY_CAPS_MASK ICE_M(0xff, 0) - u8 low_power_ctrl; + u8 low_power_ctrl_an; #define ICE_AQC_PHY_EN_D3COLD_LOW_POWER_AUTONEG BIT(0) +#define ICE_AQC_PHY_AN_EN_CLAUSE28 BIT(1) +#define ICE_AQC_PHY_AN_EN_CLAUSE73 BIT(2) +#define ICE_AQC_PHY_AN_EN_CLAUSE37 BIT(3) __le16 eee_cap; #define ICE_AQC_PHY_EEE_EN_100BASE_TX BIT(0) #define ICE_AQC_PHY_EEE_EN_1000BASE_T BIT(1) @@ -942,6 +1483,10 @@ struct ice_aqc_get_phy_caps_data { #define ICE_AQC_PHY_EEE_EN_10GBASE_KR BIT(4) #define ICE_AQC_PHY_EEE_EN_25GBASE_KR BIT(5) #define ICE_AQC_PHY_EEE_EN_40GBASE_KR4 BIT(6) +#define ICE_AQC_PHY_EEE_EN_50GBASE_KR2 BIT(7) +#define ICE_AQC_PHY_EEE_EN_50GBASE_KR_PAM4 BIT(8) +#define ICE_AQC_PHY_EEE_EN_100GBASE_KR4 BIT(9) +#define ICE_AQC_PHY_EEE_EN_100GBASE_KR2_PAM4 BIT(10) __le16 eeer_value; u8 phy_id_oui[4]; /* PHY/Module ID connected on the port */ u8 phy_fw_ver[8]; @@ -954,12 +1499,14 @@ struct ice_aqc_get_phy_caps_data { #define ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN BIT(6) #define ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN BIT(7) #define ICE_AQC_PHY_FEC_MASK ICE_M(0xdf, 0) - u8 rsvd1; /* Byte 35 reserved */ + u8 module_compliance_enforcement; +#define ICE_AQC_MOD_ENFORCE_STRICT_MODE BIT(0) u8 extended_compliance_code; #define ICE_MODULE_TYPE_TOTAL_BYTE 3 u8 module_type[ICE_MODULE_TYPE_TOTAL_BYTE]; #define ICE_AQC_MOD_TYPE_BYTE0_SFP_PLUS 0xA0 #define ICE_AQC_MOD_TYPE_BYTE0_QSFP_PLUS 0x80 +#define ICE_AQC_MOD_TYPE_IDENT 1 #define ICE_AQC_MOD_TYPE_BYTE1_SFP_PLUS_CU_PASSIVE BIT(0) #define ICE_AQC_MOD_TYPE_BYTE1_SFP_PLUS_CU_ACTIVE 
BIT(1)
 #define ICE_AQC_MOD_TYPE_BYTE1_10G_BASE_SR BIT(4)
@@ -980,6 +1527,7 @@ struct ice_aqc_get_phy_caps_data {
 } qual_modules[ICE_AQC_QUAL_MOD_COUNT_MAX];
 };
+
 /* Set PHY capabilities (direct 0x0601)
 * NOTE: This command must be followed by setup link and restart auto-neg
 */
@@ -990,6 +1538,7 @@ struct ice_aqc_set_phy_cfg {
 __le32 addr_low;
 };
+
 /* Set PHY config command data structure */
 struct ice_aqc_set_phy_cfg_data {
 __le64 phy_type_low; /* Use values from ICE_PHY_TYPE_LOW_* */
@@ -1003,13 +1552,34 @@ struct ice_aqc_set_phy_cfg_data {
 #define ICE_AQ_PHY_ENA_AUTO_LINK_UPDT BIT(5)
 #define ICE_AQ_PHY_ENA_LESM BIT(6)
 #define ICE_AQ_PHY_ENA_AUTO_FEC BIT(7)
- u8 low_power_ctrl;
+ u8 low_power_ctrl_an;
 __le16 eee_cap; /* Value from ice_aqc_get_phy_caps */
 __le16 eeer_value;
 u8 link_fec_opt; /* Use defines from ice_aqc_get_phy_caps */
- u8 rsvd1;
+ u8 module_compliance_enforcement;
 };
+
+/* Set MAC Config command data structure (direct 0x0603) */
+struct ice_aqc_set_mac_cfg {
+ __le16 max_frame_size;
+ u8 params;
+#define ICE_AQ_SET_MAC_PACE_S 3
+#define ICE_AQ_SET_MAC_PACE_M (0xF << ICE_AQ_SET_MAC_PACE_S)
+#define ICE_AQ_SET_MAC_PACE_TYPE_M BIT(7)
+#define ICE_AQ_SET_MAC_PACE_TYPE_RATE 0
+#define ICE_AQ_SET_MAC_PACE_TYPE_FIXED ICE_AQ_SET_MAC_PACE_TYPE_M
+ u8 tx_tmr_priority;
+ __le16 tx_tmr_value;
+ __le16 fc_refresh_threshold;
+ u8 drop_opts;
+#define ICE_AQ_SET_MAC_AUTO_DROP_MASK BIT(0)
+#define ICE_AQ_SET_MAC_AUTO_DROP_NONE 0
+#define ICE_AQ_SET_MAC_AUTO_DROP_BLOCKING_PKTS BIT(0)
+ u8 reserved[7];
+};
+
+
 /* Restart AN command data structure (direct 0x0605)
 * Also used for response, with only the lport_num field present.
 */
@@ -1022,6 +1592,7 @@ struct ice_aqc_restart_an {
 u8 reserved2[13];
 };
+
 /* Get link status (indirect 0x0607), also used for Link Status Event */
 struct ice_aqc_get_link_status {
 u8 lport_num;
@@ -1038,13 +1609,25 @@ struct ice_aqc_get_link_status {
 __le32 addr_low;
 };
+
 /* Get link status response data structure, also used for Link Status Event */
 struct ice_aqc_get_link_status_data {
 u8 topo_media_conflict;
 #define ICE_AQ_LINK_TOPO_CONFLICT BIT(0)
 #define ICE_AQ_LINK_MEDIA_CONFLICT BIT(1)
 #define ICE_AQ_LINK_TOPO_CORRUPT BIT(2)
- u8 reserved1;
+#define ICE_AQ_LINK_TOPO_UNREACH_PRT BIT(4)
+#define ICE_AQ_LINK_TOPO_UNDRUTIL_PRT BIT(5)
+#define ICE_AQ_LINK_TOPO_UNDRUTIL_MEDIA BIT(6)
+#define ICE_AQ_LINK_TOPO_UNSUPP_MEDIA BIT(7)
+ u8 link_cfg_err;
+#define ICE_AQ_LINK_CFG_ERR BIT(0)
+#define ICE_AQ_LINK_ACT_PORT_OPT_INVAL BIT(2)
+#define ICE_AQ_LINK_FEAT_ID_OR_CONFIG_ID_INVAL BIT(3)
+#define ICE_AQ_LINK_TOPO_CRITICAL_SDP_ERR BIT(4)
+#define ICE_AQ_LINK_MODULE_POWER_UNSUPPORTED BIT(5)
+#define ICE_AQ_LINK_EXTERNAL_PHY_LOAD_FAILURE BIT(6)
+#define ICE_AQ_LINK_INVAL_MAX_POWER_LIMIT BIT(7)
 u8 link_info;
 #define ICE_AQ_LINK_UP BIT(0) /* Link Status */
 #define ICE_AQ_LINK_FAULT BIT(1)
@@ -1072,7 +1655,12 @@ struct ice_aqc_get_link_status_data {
 #define ICE_AQ_LINK_TX_ACTIVE 0
 #define ICE_AQ_LINK_TX_DRAINED 1
 #define ICE_AQ_LINK_TX_FLUSHED 3
- u8 reserved2;
+ u8 lb_status;
+#define ICE_AQ_LINK_LB_PHY_LCL BIT(0)
+#define ICE_AQ_LINK_LB_PHY_RMT BIT(1)
+#define ICE_AQ_LINK_LB_MAC_LCL BIT(2)
+#define ICE_AQ_LINK_LB_PHY_IDX_S 3
+#define ICE_AQ_LINK_LB_PHY_IDX_M (0x7 << ICE_AQ_LINK_LB_PHY_IDX_S)
 __le16 max_frame_size;
 u8 cfg;
 #define ICE_AQ_LINK_25G_KR_FEC_EN BIT(0)
@@ -1087,7 +1675,7 @@ struct ice_aqc_get_link_status_data {
 #define ICE_AQ_CFG_PACING_TYPE_FIXED ICE_AQ_CFG_PACING_TYPE_M
 /* External Device Power Ability */
 u8 power_desc;
-#define ICE_AQ_PWR_CLASS_M 0x3
+#define
ICE_AQ_PWR_CLASS_M 0x3F #define ICE_AQ_LINK_PWR_BASET_LOW_HIGH 0 #define ICE_AQ_LINK_PWR_BASET_HIGH 1 #define ICE_AQ_LINK_PWR_QSFP_CLASS_1 0 @@ -1095,6 +1683,7 @@ struct ice_aqc_get_link_status_data { #define ICE_AQ_LINK_PWR_QSFP_CLASS_3 2 #define ICE_AQ_LINK_PWR_QSFP_CLASS_4 3 __le16 link_speed; +#define ICE_AQ_LINK_SPEED_M 0x7FF #define ICE_AQ_LINK_SPEED_10MB BIT(0) #define ICE_AQ_LINK_SPEED_100MB BIT(1) #define ICE_AQ_LINK_SPEED_1000MB BIT(2) @@ -1112,6 +1701,7 @@ struct ice_aqc_get_link_status_data { __le64 phy_type_high; /* Use values from ICE_PHY_TYPE_HIGH_* */ }; + /* Set event mask command (direct 0x0613) */ struct ice_aqc_set_event_mask { u8 lport_num; @@ -1126,9 +1716,31 @@ struct ice_aqc_set_event_mask { #define ICE_AQ_LINK_EVENT_AN_COMPLETED BIT(7) #define ICE_AQ_LINK_EVENT_MODULE_QUAL_FAIL BIT(8) #define ICE_AQ_LINK_EVENT_PORT_TX_SUSPENDED BIT(9) +#define ICE_AQ_LINK_EVENT_TOPO_CONFLICT BIT(10) +#define ICE_AQ_LINK_EVENT_MEDIA_CONFLICT BIT(11) +#define ICE_AQ_LINK_EVENT_PHY_FW_LOAD_FAIL BIT(12) u8 reserved1[6]; }; + +/* Set PHY Loopback command (direct 0x0619) */ +struct ice_aqc_set_phy_lb { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQ_PHY_LB_PORT_NUM_VALID BIT(0) + u8 phy_index; + u8 lb_mode; +#define ICE_AQ_PHY_LB_EN BIT(0) +#define ICE_AQ_PHY_LB_TYPE_M BIT(1) +#define ICE_AQ_PHY_LB_TYPE_LOCAL 0 +#define ICE_AQ_PHY_LB_TYPE_REMOTE ICE_AQ_PHY_LB_TYPE_M +#define ICE_AQ_PHY_LB_LEVEL_M BIT(2) +#define ICE_AQ_PHY_LB_LEVEL_PMD 0 +#define ICE_AQ_PHY_LB_LEVEL_PCS ICE_AQ_PHY_LB_LEVEL_M + u8 reserved2[12]; +}; + + /* Set MAC Loopback command (direct 0x0620) */ struct ice_aqc_set_mac_lb { u8 lb_mode; @@ -1137,63 +1749,708 @@ struct ice_aqc_set_mac_lb { u8 reserved[15]; }; -/* Set Port Identification LED (direct, 0x06E9) */ -struct ice_aqc_set_port_id_led { - u8 lport_num; - u8 lport_num_valid; - u8 ident_mode; -#define ICE_AQC_PORT_IDENT_LED_BLINK BIT(0) -#define ICE_AQC_PORT_IDENT_LED_ORIG 0 - u8 rsvd[13]; -}; -/* NVM Read command (indirect 0x0701) - * NVM Erase commands (direct 0x0702) - * NVM Update commands (indirect 0x0703) + + +/* DNL Get Status command (indirect 0x0680) + * Structure used for the response, the command uses the generic + * ice_aqc_generic struct to pass a buffer address to the FW. 
*/ -struct ice_aqc_nvm { - __le16 offset_low; - u8 offset_high; - u8 cmd_flags; -#define ICE_AQC_NVM_LAST_CMD BIT(0) -#define ICE_AQC_NVM_PCIR_REQ BIT(0) /* Used by NVM Update reply */ -#define ICE_AQC_NVM_PRESERVATION_S 1 -#define ICE_AQC_NVM_PRESERVATION_M (3 << ICE_AQC_NVM_PRESERVATION_S) -#define ICE_AQC_NVM_NO_PRESERVATION (0 << ICE_AQC_NVM_PRESERVATION_S) -#define ICE_AQC_NVM_PRESERVE_ALL BIT(1) -#define ICE_AQC_NVM_PRESERVE_SELECTED (3 << ICE_AQC_NVM_PRESERVATION_S) -#define ICE_AQC_NVM_FLASH_ONLY BIT(7) - __le16 module_typeid; - __le16 length; -#define ICE_AQC_NVM_ERASE_LEN 0xFFFF +struct ice_aqc_dnl_get_status { + u8 ctx; + u8 status; +#define ICE_AQ_DNL_STATUS_IDLE 0x0 +#define ICE_AQ_DNL_STATUS_RESERVED 0x1 +#define ICE_AQ_DNL_STATUS_STOPPED 0x2 +#define ICE_AQ_DNL_STATUS_FATAL 0x3 /* Fatal DNL engine error */ +#define ICE_AQ_DNL_SRC_S 3 +#define ICE_AQ_DNL_SRC_M (0x3 << ICE_AQ_DNL_SRC_S) +#define ICE_AQ_DNL_SRC_NVM (0x0 << ICE_AQ_DNL_SRC_S) +#define ICE_AQ_DNL_SRC_NVM_SCRATCH (0x1 << ICE_AQ_DNL_SRC_S) + u8 stack_ptr; +#define ICE_AQ_DNL_ST_PTR_S 0x0 +#define ICE_AQ_DNL_ST_PTR_M (0x7 << ICE_AQ_DNL_ST_PTR_S) + u8 engine_flags; +#define ICE_AQ_DNL_FLAGS_ERROR BIT(2) +#define ICE_AQ_DNL_FLAGS_NEGATIVE BIT(3) +#define ICE_AQ_DNL_FLAGS_OVERFLOW BIT(4) +#define ICE_AQ_DNL_FLAGS_ZERO BIT(5) +#define ICE_AQ_DNL_FLAGS_CARRY BIT(6) +#define ICE_AQ_DNL_FLAGS_JUMP BIT(7) + __le16 pc; + __le16 activity_id; __le32 addr_high; __le32 addr_low; }; -/* NVM Checksum Command (direct, 0x0706) */ -struct ice_aqc_nvm_checksum { - u8 flags; -#define ICE_AQC_NVM_CHECKSUM_VERIFY BIT(0) -#define ICE_AQC_NVM_CHECKSUM_RECALC BIT(1) - u8 rsvd; - __le16 checksum; /* Used only by response */ -#define ICE_AQC_NVM_CHECKSUM_CORRECT 0xBABA - u8 rsvd2[12]; -}; -/** - * Send to PF command (indirect 0x0801) ID is only used by PF - * - * Send to VF command (indirect 0x0802) ID is only used by PF - * +struct ice_aqc_dnl_get_status_data { + __le16 activity_err_code; + __le16 act_err_code; +#define ICE_AQ_DNL_ACT_ERR_SUCCESS 0x0000 /* no error */ +#define ICE_AQ_DNL_ACT_ERR_PARSE 0x8001 /* NVM parse error */ +#define ICE_AQ_DNL_ACT_ERR_UNSUPPORTED 0x8002 /* unsupported action */ +#define ICE_AQ_DNL_ACT_ERR_NOT_FOUND 0x8003 /* activity not found */ +#define ICE_AQ_DNL_ACT_ERR_BAD_JUMP 0x8004 /* an illegal jump */ +#define ICE_AQ_DNL_ACT_ERR_PSTO_OVER 0x8005 /* persistent store overflow */ +#define ICE_AQ_DNL_ACT_ERR_ST_OVERFLOW 0x8006 /* stack overflow */ +#define ICE_AQ_DNL_ACT_ERR_TIMEOUT 0x8007 /* activity timeout */ +#define ICE_AQ_DNL_ACT_ERR_BREAK 0x0008 /* stopped at breakpoint */ +#define ICE_AQ_DNL_ACT_ERR_INVAL_ARG 0x0101 /* invalid action argument */ + __le32 execution_time; /* in nanoseconds */ + __le16 lib_ver; + u8 psto_local_sz; + u8 psto_global_sz; + u8 stack_sz; +#define ICE_AQ_DNL_STACK_SZ_S 0 +#define ICE_AQ_DNL_STACK_SZ_M (0xF << ICE_AQ_DNL_STACK_SZ_S) + u8 port_count; +#define ICE_AQ_DNL_PORT_CNT_S 0 +#define ICE_AQ_DNL_PORT_CNT_M (0x1F << ICE_AQ_DNL_PORT_CNT_S) + __le16 act_cache_cntr; + u32 i2c_clk_cntr; + u32 mdio_clk_cntr; + u32 sb_iosf_clk_cntr; +}; + + +/* DNL run command (direct 0x0681) */ +struct ice_aqc_dnl_run_command { + u8 reserved0; + u8 command; +#define ICE_AQ_DNL_CMD_S 0 +#define ICE_AQ_DNL_CMD_M (0x7 << ICE_AQ_DNL_CMD_S) +#define ICE_AQ_DNL_CMD_RESET 0x0 +#define ICE_AQ_DNL_CMD_RUN 0x1 +#define ICE_AQ_DNL_CMD_STEP 0x3 +#define ICE_AQ_DNL_CMD_ABORT 0x4 +#define ICE_AQ_DNL_CMD_SET_PC 0x7 +#define ICE_AQ_DNL_CMD_SRC_S 3 +#define ICE_AQ_DNL_CMD_SRC_M (0x3 << ICE_AQ_DNL_CMD_SRC_S) +#define 
ICE_AQ_DNL_CMD_SRC_DNL 0x0 +#define ICE_AQ_DNL_CMD_SRC_SCRATCH 0x1 + __le16 new_pc; + u8 reserved1[12]; +}; + + +/* DNL call command (indirect 0x0682) + * Struct is used for both command and response */ -struct ice_aqc_pf_vf_msg { - __le32 id; - u32 reserved; +struct ice_aqc_dnl_call_command { + u8 ctx; /* Used in command, reserved in response */ + u8 reserved; + __le16 activity_id; + __le32 reserved1; __le32 addr_high; __le32 addr_low; }; + +/* DNL call command/response buffer (indirect 0x0682) */ +struct ice_aqc_dnl_call { + __le32 stores[4]; +}; + + +/* Used for both commands: + * DNL read sto command (indirect 0x0683) + * DNL write sto command (indirect 0x0684) + */ +struct ice_aqc_dnl_read_write_command { + u8 ctx; + u8 sto_sel; /* STORE select */ +#define ICE_AQC_DNL_STORE_SELECT_STORE 0x0 +#define ICE_AQC_DNL_STORE_SELECT_PSTO 0x1 +#define ICE_AQC_DNL_STORE_SELECT_STACK 0x2 + __le16 offset; + __le32 data; /* Used for write sto only */ + __le32 addr_high; /* Used for read sto only */ + __le32 addr_low; /* Used for read sto only */ +}; + + +/* Used for both command responses: + * DNL read sto response (indirect 0x0683) + * DNL write sto response (indirect 0x0684) + */ +struct ice_aqc_dnl_read_write_response { + u8 reserved; + u8 status; /* Reserved for read command */ + __le16 size; /* Reserved for write command */ + __le32 data; /* Reserved for write command */ + __le32 addr_high; /* Reserved for write command */ + __le32 addr_low; /* Reserved for write command */ +}; + + +/* DNL set breakpoints command (indirect 0x0686) */ +struct ice_aqc_dnl_set_breakpoints_command { + __le32 reserved[2]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* DNL set breakpoints data buffer structure (indirect 0x0686) */ +struct ice_aqc_dnl_set_breakpoints { + u8 ctx; + u8 ena; /* 0- disabled, 1- enabled */ + __le16 offset; + __le16 activity_id; +}; + + +/* DNL read log data command(indirect 0x0687) */ +struct ice_aqc_dnl_read_log_command { + __le16 reserved0; + __le16 offset; + __le32 reserved1; + __le32 addr_high; + __le32 addr_low; + +}; + + +/* DNL read log data response(indirect 0x0687) */ +struct ice_aqc_dnl_read_log_response { + __le16 reserved; + __le16 size; + __le32 data; + __le32 addr_high; + __le32 addr_low; + +}; + + +struct ice_aqc_link_topo_params { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQC_LINK_TOPO_PORT_NUM_VALID BIT(0) + u8 node_type_ctx; +#define ICE_AQC_LINK_TOPO_NODE_TYPE_S 0 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_M (0xF << ICE_AQC_LINK_TOPO_NODE_TYPE_S) +#define ICE_AQC_LINK_TOPO_NODE_TYPE_PHY 0 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_GPIO_CTRL 1 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_MUX_CTRL 2 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_LED_CTRL 3 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_LED 4 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_THERMAL 5 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_CAGE 6 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_MEZZ 7 +#define ICE_AQC_LINK_TOPO_NODE_TYPE_ID_EEPROM 8 +#define ICE_AQC_LINK_TOPO_NODE_CTX_S 4 +#define ICE_AQC_LINK_TOPO_NODE_CTX_M \ + (0xF << ICE_AQC_LINK_TOPO_NODE_CTX_S) +#define ICE_AQC_LINK_TOPO_NODE_CTX_GLOBAL 0 +#define ICE_AQC_LINK_TOPO_NODE_CTX_BOARD 1 +#define ICE_AQC_LINK_TOPO_NODE_CTX_PORT 2 +#define ICE_AQC_LINK_TOPO_NODE_CTX_NODE 3 +#define ICE_AQC_LINK_TOPO_NODE_CTX_PROVIDED 4 +#define ICE_AQC_LINK_TOPO_NODE_CTX_OVERRIDE 5 + u8 index; +}; + + +struct ice_aqc_link_topo_addr { + struct ice_aqc_link_topo_params topo_params; + __le16 handle; +#define ICE_AQC_LINK_TOPO_HANDLE_S 0 +#define ICE_AQC_LINK_TOPO_HANDLE_M (0x3FF << ICE_AQC_LINK_TOPO_HANDLE_S) +/* 
Used to decode the handle field */
+#define ICE_AQC_LINK_TOPO_HANDLE_BRD_TYPE_M BIT(9)
+#define ICE_AQC_LINK_TOPO_HANDLE_BRD_TYPE_LOM BIT(9)
+#define ICE_AQC_LINK_TOPO_HANDLE_BRD_TYPE_MEZZ 0
+#define ICE_AQC_LINK_TOPO_HANDLE_NODE_S 0
+/* In case of a Mezzanine type */
+#define ICE_AQC_LINK_TOPO_HANDLE_MEZZ_NODE_M \
+	(0x3F << ICE_AQC_LINK_TOPO_HANDLE_NODE_S)
+#define ICE_AQC_LINK_TOPO_HANDLE_MEZZ_S 6
+#define ICE_AQC_LINK_TOPO_HANDLE_MEZZ_M (0x7 << ICE_AQC_LINK_TOPO_HANDLE_MEZZ_S)
+/* In case of a LOM type */
+#define ICE_AQC_LINK_TOPO_HANDLE_LOM_NODE_M \
+	(0x1FF << ICE_AQC_LINK_TOPO_HANDLE_NODE_S)
+};
+
+
+/* Get Link Topology Handle (direct, 0x06E0) */
+struct ice_aqc_get_link_topo {
+ struct ice_aqc_link_topo_addr addr;
+ u8 node_part_num;
+#define ICE_ACQ_GET_LINK_TOPO_NODE_NR_PCA9575 0x21
+ u8 rsvd[9];
+};
+
+
+/* Get Link Topology Pin (direct, 0x06E1) */
+struct ice_aqc_get_link_topo_pin {
+ struct ice_aqc_link_topo_addr addr;
+ u8 input_io_params;
+#define ICE_AQC_LINK_TOPO_INPUT_IO_FUNC_S 0
+#define ICE_AQC_LINK_TOPO_INPUT_IO_FUNC_M \
+	(0x1F << ICE_AQC_LINK_TOPO_INPUT_IO_FUNC_S)
+#define ICE_AQC_LINK_TOPO_IO_FUNC_GPIO 0
+#define ICE_AQC_LINK_TOPO_IO_FUNC_RESET_N 1
+#define ICE_AQC_LINK_TOPO_IO_FUNC_INT_N 2
+#define ICE_AQC_LINK_TOPO_IO_FUNC_PRESENT_N 3
+#define ICE_AQC_LINK_TOPO_IO_FUNC_TX_DIS 4
+#define ICE_AQC_LINK_TOPO_IO_FUNC_MODSEL_N 5
+#define ICE_AQC_LINK_TOPO_IO_FUNC_LPMODE 6
+#define ICE_AQC_LINK_TOPO_IO_FUNC_TX_FAULT 7
+#define ICE_AQC_LINK_TOPO_IO_FUNC_RX_LOSS 8
+#define ICE_AQC_LINK_TOPO_IO_FUNC_RS0 9
+#define ICE_AQC_LINK_TOPO_IO_FUNC_RS1 10
+#define ICE_AQC_LINK_TOPO_IO_FUNC_EEPROM_WP 11
+/* 12 repeats intentionally due to two different uses depending on context */
+#define ICE_AQC_LINK_TOPO_IO_FUNC_LED 12
+#define ICE_AQC_LINK_TOPO_IO_FUNC_RED_LED 12
+#define ICE_AQC_LINK_TOPO_IO_FUNC_GREEN_LED 13
+#define ICE_AQC_LINK_TOPO_IO_FUNC_BLUE_LED 14
+#define ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_S 5
+#define ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_M \
+	(0x7 << ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_S)
+#define ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_GPIO 3
+/* Use ICE_AQC_LINK_TOPO_NODE_TYPE_* for the type values */
+ u8 output_io_params;
+#define ICE_AQC_LINK_TOPO_OUTPUT_IO_FUNC_S 0
+#define ICE_AQC_LINK_TOPO_OUTPUT_IO_FUNC_M \
+	(0x1F << ICE_AQC_LINK_TOPO_OUTPUT_IO_FUNC_S)
+/* Use ICE_AQC_LINK_TOPO_IO_FUNC_* for the non-numerical options */
+#define ICE_AQC_LINK_TOPO_OUTPUT_IO_TYPE_S 5
+#define ICE_AQC_LINK_TOPO_OUTPUT_IO_TYPE_M \
+	(0x7 << ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_S)
+/* Use ICE_AQC_LINK_TOPO_NODE_TYPE_* for the type values */
+ u8 output_io_flags;
+#define ICE_AQC_LINK_TOPO_OUTPUT_SPEED_S 0
+#define ICE_AQC_LINK_TOPO_OUTPUT_SPEED_M \
+	(0x7 << ICE_AQC_LINK_TOPO_OUTPUT_SPEED_S)
+#define ICE_AQC_LINK_TOPO_OUTPUT_INT_S 3
+#define ICE_AQC_LINK_TOPO_OUTPUT_INT_M \
+	(0x3 << ICE_AQC_LINK_TOPO_OUTPUT_INT_S)
+#define ICE_AQC_LINK_TOPO_OUTPUT_POLARITY BIT(5)
+#define ICE_AQC_LINK_TOPO_OUTPUT_VALUE BIT(6)
+#define ICE_AQC_LINK_TOPO_OUTPUT_DRIVEN BIT(7)
+ u8 rsvd[7];
+};
+
+/* Read/Write I2C (direct, 0x06E2/0x06E3) */
+struct ice_aqc_i2c {
+ struct ice_aqc_link_topo_addr topo_addr;
+ __le16 i2c_addr;
+ u8 i2c_params;
+#define ICE_AQC_I2C_DATA_SIZE_S 0
+#define ICE_AQC_I2C_DATA_SIZE_M (0xF << ICE_AQC_I2C_DATA_SIZE_S)
+#define ICE_AQC_I2C_ADDR_TYPE_M BIT(4)
+#define ICE_AQC_I2C_ADDR_TYPE_7BIT 0
+#define ICE_AQC_I2C_ADDR_TYPE_10BIT ICE_AQC_I2C_ADDR_TYPE_M
+#define ICE_AQC_I2C_DATA_OFFSET_S 5
+#define ICE_AQC_I2C_DATA_OFFSET_M (0x3 << ICE_AQC_I2C_DATA_OFFSET_S)
+#define ICE_AQC_I2C_USE_REPEATED_START BIT(7)
+ u8 rsvd;
+ __le16 i2c_bus_addr;
+#define ICE_AQC_I2C_ADDR_7BIT_MASK 0x7F
+#define ICE_AQC_I2C_ADDR_10BIT_MASK 0x3FF
+ u8 i2c_data[4]; /* Used only by write command, reserved in read. */
+};
+
+
+/* Read I2C Response (direct, 0x06E2) */
+struct ice_aqc_read_i2c_resp {
+ u8 i2c_data[16];
+};
+
+
+/* Read/Write MDIO (direct, 0x06E4/0x06E5) */
+struct ice_aqc_mdio {
+ struct ice_aqc_link_topo_addr topo_addr;
+ u8 mdio_device_addr;
+#define ICE_AQC_MDIO_DEV_S 0
+#define ICE_AQC_MDIO_DEV_M (0x1F << ICE_AQC_MDIO_DEV_S)
+#define ICE_AQC_MDIO_CLAUSE_22 BIT(5)
+#define ICE_AQC_MDIO_CLAUSE_45 BIT(6)
+ u8 mdio_bus_address;
+#define ICE_AQC_MDIO_BUS_ADDR_S 0
+#define ICE_AQC_MDIO_BUS_ADDR_M (0x1F << ICE_AQC_MDIO_BUS_ADDR_S)
+ __le16 offset;
+ __le16 data; /* Input in write cmd, output in read cmd. */
+ u8 rsvd1[4];
+};
+
+
+/* Set/Get GPIO By Function (direct, 0x06E6/0x06E7) */
+struct ice_aqc_gpio_by_func {
+ struct ice_aqc_link_topo_addr topo_addr;
+ u8 io_func_num;
+#define ICE_AQC_GPIO_FUNC_S 0
+#define ICE_AQC_GPIO_FUNC_M (0x1F << ICE_AQC_GPIO_FUNC_S)
+ u8 io_value; /* Input in write cmd, output in read cmd. */
+#define ICE_AQC_GPIO_ON BIT(0)
+#define ICE_AQC_GPIO_OFF 0
+ u8 rsvd[8];
+};
+
+
+/* Set LED (direct, 0x06E8) */
+struct ice_aqc_set_led {
+ struct ice_aqc_link_topo_addr topo_addr;
+ u8 color_and_blink;
+#define ICE_AQC_LED_COLOR_S 0
+#define ICE_AQC_LED_COLOR_M (0x7 << ICE_AQC_LED_COLOR_S)
+#define ICE_AQC_LED_COLOR_SKIP 0
+#define ICE_AQC_LED_COLOR_RED 1
+#define ICE_AQC_LED_COLOR_ORANGE 2
+#define ICE_AQC_LED_COLOR_YELLOW 3
+#define ICE_AQC_LED_COLOR_GREEN 4
+#define ICE_AQC_LED_COLOR_BLUE 5
+#define ICE_AQC_LED_COLOR_PURPLE 6
+#define ICE_AQC_LED_BLINK_S 3
+#define ICE_AQC_LED_BLINK_M (0x7 << ICE_AQC_LED_BLINK_S)
+#define ICE_AQC_LED_BLINK_NONE 0
+#define ICE_AQC_LED_BLINK_SLOW 1
+#define ICE_AQC_LED_BLINK_SLOW_MAC 2
+#define ICE_AQC_LED_BLINK_SLOW_FLTR 3
+#define ICE_AQC_LED_BLINK_FAST 5
+#define ICE_AQC_LED_BLINK_FAST_MAC 6
+#define ICE_AQC_LED_BLINK_FAST_FLTR 7
+ u8 rsvd[9];
+};
+
+
+/* Set Port Identification LED (direct, 0x06E9) */
+struct ice_aqc_set_port_id_led {
+ u8 lport_num;
+ u8 lport_num_valid;
+#define ICE_AQC_PORT_ID_PORT_NUM_VALID BIT(0)
+ u8 ident_mode;
+#define ICE_AQC_PORT_IDENT_LED_BLINK BIT(0)
+#define ICE_AQC_PORT_IDENT_LED_ORIG 0
+ u8 rsvd[13];
+};
+
+
+/* Get Port Options (indirect, 0x06EA) */
+struct ice_aqc_get_port_options {
+ u8 lport_num;
+ u8 lport_num_valid;
+#define ICE_AQC_PORT_OPT_PORT_NUM_VALID BIT(0)
+ u8 port_options_count;
+#define ICE_AQC_PORT_OPT_COUNT_S 0
+#define ICE_AQC_PORT_OPT_COUNT_M (0xF << ICE_AQC_PORT_OPT_COUNT_S)
+ u8 innermost_phy_index;
+ u8 port_options;
+#define ICE_AQC_PORT_OPT_ACTIVE_S 0
+#define ICE_AQC_PORT_OPT_ACTIVE_M (0xF << ICE_AQC_PORT_OPT_ACTIVE_S)
+#define ICE_AQC_PORT_OPT_FORCED BIT(6)
+#define ICE_AQC_PORT_OPT_VALID BIT(7)
+ u8 pending_port_option_status;
+#define ICE_AQC_PENDING_PORT_OPT_IDX_S 0
+#define ICE_AQC_PENDING_PORT_OPT_IDX_M (0xF << ICE_AQC_PENDING_PORT_OPT_IDX_S)
+#define ICE_AQC_PENDING_PORT_OPT_VALID BIT(7)
+ u8 rsvd[2];
+ __le32 addr_high;
+ __le32 addr_low;
+};
+
+
+struct ice_aqc_get_port_options_elem {
+ u8 pmd;
+#define ICE_AQC_PORT_INV_PORT_OPT 4
+#define ICE_AQC_PORT_OPT_PMD_COUNT_S 0
+#define ICE_AQC_PORT_OPT_PMD_COUNT_M (0xF << ICE_AQC_PORT_OPT_PMD_COUNT_S)
+#define ICE_AQC_PORT_OPT_PMD_WIDTH_S 4
+#define ICE_AQC_PORT_OPT_PMD_WIDTH_M (0xF << ICE_AQC_PORT_OPT_PMD_WIDTH_S)
+ u8 max_lane_speed;
+#define ICE_AQC_PORT_OPT_MAX_LANE_S 0
+#define
ICE_AQC_PORT_OPT_MAX_LANE_M (0xF << ICE_AQC_PORT_OPT_MAX_LANE_S) +#define ICE_AQC_PORT_OPT_MAX_LANE_100M 0 +#define ICE_AQC_PORT_OPT_MAX_LANE_1G 1 +#define ICE_AQC_PORT_OPT_MAX_LANE_2500M 2 +#define ICE_AQC_PORT_OPT_MAX_LANE_5G 3 +#define ICE_AQC_PORT_OPT_MAX_LANE_10G 4 +#define ICE_AQC_PORT_OPT_MAX_LANE_25G 5 +#define ICE_AQC_PORT_OPT_MAX_LANE_50G 6 +#define ICE_AQC_PORT_OPT_MAX_LANE_100G 7 + u8 global_scid[2]; + u8 phy_scid[2]; + u8 pf2port_cid[2]; +}; + + +/* Set Port Option (direct, 0x06EB) */ +struct ice_aqc_set_port_option { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQC_SET_PORT_OPT_PORT_NUM_VALID BIT(0) + u8 selected_port_option; + u8 rsvd[13]; +}; + + +/* Set/Get GPIO (direct, 0x06EC/0x06ED) */ +struct ice_aqc_gpio { + __le16 gpio_ctrl_handle; +#define ICE_AQC_GPIO_HANDLE_S 0 +#define ICE_AQC_GPIO_HANDLE_M (0x3FF << ICE_AQC_GPIO_HANDLE_S) + u8 gpio_num; + u8 gpio_val; + u8 rsvd[12]; +}; + + +/* Read/Write SFF EEPROM command (indirect 0x06EE) */ +struct ice_aqc_sff_eeprom { + u8 lport_num; + u8 lport_num_valid; +#define ICE_AQC_SFF_PORT_NUM_VALID BIT(0) + __le16 i2c_bus_addr; +#define ICE_AQC_SFF_I2CBUS_7BIT_M 0x7F +#define ICE_AQC_SFF_I2CBUS_10BIT_M 0x3FF +#define ICE_AQC_SFF_I2CBUS_TYPE_M BIT(10) +#define ICE_AQC_SFF_I2CBUS_TYPE_7BIT 0 +#define ICE_AQC_SFF_I2CBUS_TYPE_10BIT ICE_AQC_SFF_I2CBUS_TYPE_M +#define ICE_AQC_SFF_SET_EEPROM_PAGE_S 11 +#define ICE_AQC_SFF_SET_EEPROM_PAGE_M (0x3 << ICE_AQC_SFF_SET_EEPROM_PAGE_S) +#define ICE_AQC_SFF_NO_PAGE_CHANGE 0 +#define ICE_AQC_SFF_SET_23_ON_MISMATCH 1 +#define ICE_AQC_SFF_SET_22_ON_MISMATCH 2 +#define ICE_AQC_SFF_IS_WRITE BIT(15) + __le16 i2c_mem_addr; + __le16 eeprom_page; +#define ICE_AQC_SFF_EEPROM_BANK_S 0 +#define ICE_AQC_SFF_EEPROM_BANK_M (0xFF << ICE_AQC_SFF_EEPROM_BANK_S) +#define ICE_AQC_SFF_EEPROM_PAGE_S 8 +#define ICE_AQC_SFF_EEPROM_PAGE_M (0xFF << ICE_AQC_SFF_EEPROM_PAGE_S) + __le32 addr_high; + __le32 addr_low; +}; + + +/* SW Set GPIO command (indirect 0x6EF) + * SW Get GPIO command (indirect 0x6F0) + */ +struct ice_aqc_sw_gpio { + __le16 gpio_ctrl_handle; +#define ICE_AQC_SW_GPIO_CONTROLLER_HANDLE_S 0 +#define ICE_AQC_SW_GPIO_CONTROLLER_HANDLE_M (0x3FF << ICE_AQC_SW_GPIO_CONTROLLER_HANDLE_S) + u8 gpio_num; +#define ICE_AQC_SW_GPIO_NUMBER_S 0 +#define ICE_AQC_SW_GPIO_NUMBER_M (0x1F << ICE_AQC_SW_GPIO_NUMBER_S) + u8 gpio_params; +#define ICE_AQC_SW_GPIO_PARAMS_DIRECTION BIT(1) +#define ICE_AQC_SW_GPIO_PARAMS_VALUE BIT(0) + u8 rsvd[12]; +}; + + +/* Program Topology Device NVM (direct, 0x06F2) */ +struct ice_aqc_prog_topo_dev_nvm { + struct ice_aqc_link_topo_params topo_params; + u8 rsvd[12]; +}; + + +/* Read Topology Device NVM (direct, 0x06F3) */ +struct ice_aqc_read_topo_dev_nvm { + struct ice_aqc_link_topo_params topo_params; + __le32 start_address; +#define ICE_AQC_READ_TOPO_DEV_NVM_DATA_READ_SIZE 8 + u8 data_read[ICE_AQC_READ_TOPO_DEV_NVM_DATA_READ_SIZE]; +}; + + +/* NVM Read command (indirect 0x0701) + * NVM Erase commands (direct 0x0702) + * NVM Write commands (indirect 0x0703) + * NVM Write Activate commands (direct 0x0707) + * NVM Shadow RAM Dump commands (direct 0x0707) + */ +struct ice_aqc_nvm { +#define ICE_AQC_NVM_MAX_OFFSET 0xFFFFFF + __le16 offset_low; + u8 offset_high; /* For Write Activate offset_high is used as flags2 */ + u8 cmd_flags; +#define ICE_AQC_NVM_LAST_CMD BIT(0) +#define ICE_AQC_NVM_PCIR_REQ BIT(0) /* Used by NVM Write reply */ +#define ICE_AQC_NVM_PRESERVATION_S 1 /* Used by NVM Write Activate only */ +#define ICE_AQC_NVM_PRESERVATION_M (3 << ICE_AQC_NVM_PRESERVATION_S) +#define 
ICE_AQC_NVM_NO_PRESERVATION (0 << ICE_AQC_NVM_PRESERVATION_S) +#define ICE_AQC_NVM_PRESERVE_ALL BIT(1) +#define ICE_AQC_NVM_FACTORY_DEFAULT (2 << ICE_AQC_NVM_PRESERVATION_S) +#define ICE_AQC_NVM_PRESERVE_SELECTED (3 << ICE_AQC_NVM_PRESERVATION_S) +#define ICE_AQC_NVM_ACTIV_SEL_NVM BIT(3) /* Write Activate/SR Dump only */ +#define ICE_AQC_NVM_ACTIV_SEL_OROM BIT(4) +#define ICE_AQC_NVM_ACTIV_SEL_NETLIST BIT(5) +#define ICE_AQC_NVM_SPECIAL_UPDATE BIT(6) +#define ICE_AQC_NVM_REVERT_LAST_ACTIV BIT(6) /* Write Activate only */ +#define ICE_AQC_NVM_ACTIV_SEL_MASK ICE_M(0x7, 3) +#define ICE_AQC_NVM_FLASH_ONLY BIT(7) +#define ICE_AQC_NVM_RESET_LVL_M ICE_M(0x3, 0) /* Write reply only */ +#define ICE_AQC_NVM_POR_FLAG 0 +#define ICE_AQC_NVM_PERST_FLAG 1 +#define ICE_AQC_NVM_EMPR_FLAG 2 +#define ICE_AQC_NVM_EMPR_ENA BIT(0) /* Write Activate reply only */ + __le16 module_typeid; + __le16 length; +#define ICE_AQC_NVM_ERASE_LEN 0xFFFF + __le32 addr_high; + __le32 addr_low; +}; + +/* NVM Module_Type ID, needed offset and read_len for struct ice_aqc_nvm. */ +#define ICE_AQC_NVM_SECTOR_UNIT 4096 /* In Bytes */ +#define ICE_AQC_NVM_WORD_UNIT 2 /* In Bytes */ + +#define ICE_AQC_NVM_START_POINT 0 +#define ICE_AQC_NVM_EMP_SR_PTR_OFFSET 0x90 +#define ICE_AQC_NVM_EMP_SR_PTR_RD_LEN 2 /* In Bytes */ +#define ICE_AQC_NVM_EMP_SR_PTR_M ICE_M(0x7FFF, 0) +#define ICE_AQC_NVM_EMP_SR_PTR_TYPE_S 15 +#define ICE_AQC_NVM_EMP_SR_PTR_TYPE_M BIT(15) +#define ICE_AQC_NVM_EMP_SR_PTR_TYPE_SECTOR 1 + +#define ICE_AQC_NVM_LLDP_CFG_PTR_OFFSET 0x46 +#define ICE_AQC_NVM_LLDP_CFG_HEADER_LEN 2 /* In Bytes */ +#define ICE_AQC_NVM_LLDP_CFG_PTR_RD_LEN 2 /* In Bytes */ + +#define ICE_AQC_NVM_LLDP_PRESERVED_MOD_ID 0x129 +#define ICE_AQC_NVM_CUR_LLDP_PERSIST_RD_OFFSET 2 /* In Bytes */ +#define ICE_AQC_NVM_LLDP_STATUS_M ICE_M(0xF, 0) +#define ICE_AQC_NVM_LLDP_STATUS_M_LEN 4 /* In Bits */ +#define ICE_AQC_NVM_LLDP_STATUS_RD_LEN 4 /* In Bytes */ + +#define ICE_AQC_NVM_MINSREV_MOD_ID 0x130 + + +/* Used for reading and writing MinSRev using 0x0701 and 0x0703. Note that the + * type field is excluded from the section when reading and writing from + * a module using the module_typeid field with these AQ commands. 
+ */ +struct ice_aqc_nvm_minsrev { + __le16 length; + __le16 validity; +#define ICE_AQC_NVM_MINSREV_NVM_VALID BIT(0) +#define ICE_AQC_NVM_MINSREV_OROM_VALID BIT(1) + __le16 nvm_minsrev_l; + __le16 nvm_minsrev_h; + __le16 orom_minsrev_l; + __le16 orom_minsrev_h; +}; + + +/* Used for 0x0704 as well as for 0x0705 commands */ +struct ice_aqc_nvm_cfg { + u8 cmd_flags; +#define ICE_AQC_ANVM_MULTIPLE_ELEMS BIT(0) +#define ICE_AQC_ANVM_IMMEDIATE_FIELD BIT(1) +#define ICE_AQC_ANVM_NEW_CFG BIT(2) + u8 reserved; + __le16 count; + __le16 id; + u8 reserved1[2]; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_nvm_cfg_data { + __le16 field_id; + __le16 field_options; + __le16 field_value; +}; + + +/* NVM Checksum Command (direct, 0x0706) */ +struct ice_aqc_nvm_checksum { + u8 flags; +#define ICE_AQC_NVM_CHECKSUM_VERIFY BIT(0) +#define ICE_AQC_NVM_CHECKSUM_RECALC BIT(1) + u8 rsvd; + __le16 checksum; /* Used only by response */ +#define ICE_AQC_NVM_CHECKSUM_CORRECT 0xBABA + u8 rsvd2[12]; +}; + + +/* Used for NVM Set Package Data command - 0x070A */ +struct ice_aqc_nvm_pkg_data { + u8 reserved[3]; + u8 cmd_flags; +#define ICE_AQC_NVM_PKG_DELETE BIT(0) /* used for command call */ +#define ICE_AQC_NVM_PKG_SKIPPED BIT(0) /* used for command response */ + + u32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Used for Pass Component Table command - 0x070B */ +struct ice_aqc_nvm_pass_comp_tbl { + u8 component_response; /* Response only */ +#define ICE_AQ_NVM_PASS_COMP_CAN_BE_UPDATED 0x0 +#define ICE_AQ_NVM_PASS_COMP_CAN_MAY_BE_UPDATEABLE 0x1 +#define ICE_AQ_NVM_PASS_COMP_CAN_NOT_BE_UPDATED 0x2 +#define ICE_AQ_NVM_PASS_COMP_PARTIAL_CHECK 0x3 + u8 component_response_code; /* Response only */ +#define ICE_AQ_NVM_PASS_COMP_CAN_BE_UPDATED_CODE 0x0 +#define ICE_AQ_NVM_PASS_COMP_STAMP_IDENTICAL_CODE 0x1 +#define ICE_AQ_NVM_PASS_COMP_STAMP_LOWER 0x2 +#define ICE_AQ_NVM_PASS_COMP_INVALID_STAMP_CODE 0x3 +#define ICE_AQ_NVM_PASS_COMP_CONFLICT_CODE 0x4 +#define ICE_AQ_NVM_PASS_COMP_PRE_REQ_NOT_MET_CODE 0x5 +#define ICE_AQ_NVM_PASS_COMP_NOT_SUPPORTED_CODE 0x6 +#define ICE_AQ_NVM_PASS_COMP_CANNOT_DOWNGRADE_CODE 0x7 +#define ICE_AQ_NVM_PASS_COMP_INCOMPLETE_IMAGE_CODE 0x8 +#define ICE_AQ_NVM_PASS_COMP_VER_STR_IDENTICAL_CODE 0xA +#define ICE_AQ_NVM_PASS_COMP_VER_STR_LOWER_CODE 0xB + u8 reserved; + u8 transfer_flag; +#define ICE_AQ_NVM_PASS_COMP_TBL_START 0x1 +#define ICE_AQ_NVM_PASS_COMP_TBL_MIDDLE 0x2 +#define ICE_AQ_NVM_PASS_COMP_TBL_END 0x4 +#define ICE_AQ_NVM_PASS_COMP_TBL_START_AND_END 0x5 + __le32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + + +struct ice_aqc_nvm_comp_tbl { + __le16 comp_class; +#define NVM_COMP_CLASS_ALL_FW 0x000A + + __le16 comp_id; +#define NVM_COMP_ID_OROM 0x5 +#define NVM_COMP_ID_NVM 0x6 +#define NVM_COMP_ID_NETLIST 0x8 + + u8 comp_class_idx; +#define FWU_COMP_CLASS_IDX_NOT_USE 0x0 + + __le32 comp_cmp_stamp; + u8 cvs_type; +#define NVM_CVS_TYPE_ASCII 0x1 + + u8 cvs_len; + u8 cvs[]; /* Component Version String */ +} __packed; + + +/* + * Send to PF command (indirect 0x0801) ID is only used by PF + * + * Send to VF command (indirect 0x0802) ID is only used by PF + * + */ +struct ice_aqc_pf_vf_msg { + __le32 id; + u32 reserved; + __le32 addr_high; + __le32 addr_low; +}; + + + /* Get LLDP MIB (indirect 0x0A00) * Note: This is also used by the LLDP MIB Change Event (0x0A01) * as the format is the same. 
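The ice_aqc_nvm descriptor above splits its 24-bit flash offset between offset_low (16 bits) and offset_high (8 bits), bounded by ICE_AQC_NVM_MAX_OFFSET; for Write Activate, offset_high is reused as flags2, as noted in the struct. A minimal sketch of the packing for the plain read/write case, with a hypothetical helper name:

#include <linux/errno.h>

/* Hypothetical helper: pack a flat byte offset into the split
 * offset_low/offset_high fields of an NVM read/write descriptor.
 * Offsets above the 24-bit limit cannot be expressed on the wire.
 */
static int ice_nvm_set_offset(struct ice_aqc_nvm *cmd, u32 offset)
{
	if (offset > ICE_AQC_NVM_MAX_OFFSET)
		return -EINVAL;

	cmd->offset_low = cpu_to_le16(offset & 0xFFFF);
	cmd->offset_high = (offset >> 16) & 0xFF;
	return 0;
}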
@@ -1227,6 +2484,7 @@ struct ice_aqc_lldp_get_mib { __le32 addr_low; }; + /* Configure LLDP MIB Change Event (direct 0x0A01) */ /* For MIB Change Event use ice_aqc_lldp_get_mib structure above */ struct ice_aqc_lldp_set_mib_change { @@ -1236,6 +2494,32 @@ struct ice_aqc_lldp_set_mib_change { u8 reserved[15]; }; + +/* Add LLDP TLV (indirect 0x0A02) + * Delete LLDP TLV (indirect 0x0A04) + */ +struct ice_aqc_lldp_add_delete_tlv { + u8 type; /* only nearest bridge and non-TPMR from 0x0A00 */ + u8 reserved1[1]; + __le16 len; + u8 reserved2[4]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Update LLDP TLV (indirect 0x0A03) */ +struct ice_aqc_lldp_update_tlv { + u8 type; /* only nearest bridge and non-TPMR from 0x0A00 */ + u8 reserved; + __le16 old_len; + __le16 new_offset; + __le16 new_len; + __le32 addr_high; + __le32 addr_low; +}; + + /* Stop LLDP (direct 0x0A05) */ struct ice_aqc_lldp_stop { u8 command; @@ -1246,6 +2530,7 @@ struct ice_aqc_lldp_stop { u8 reserved[15]; }; + /* Start LLDP (direct 0x0A06) */ struct ice_aqc_lldp_start { u8 command; @@ -1254,6 +2539,7 @@ struct ice_aqc_lldp_start { u8 reserved[15]; }; + /* Get CEE DCBX Oper Config (0x0A07) * The command uses the generic descriptor struct and * returns the struct below as an indirect response. @@ -1284,6 +2570,7 @@ struct ice_aqc_get_cee_dcb_cfg_resp { u8 reserved[12]; }; + /* Set Local LLDP MIB (indirect 0x0A08) * Used to replace the local MIB of a given LLDP agent. e.g. DCBX */ @@ -1301,76 +2588,571 @@ struct ice_aqc_lldp_set_local_mib { __le32 addr_low; }; -/* Stop/Start LLDP Agent (direct 0x0A09) - * Used for stopping/starting specific LLDP agent. e.g. DCBX. - * The same structure is used for the response, with the command field - * being used as the status field. + +struct ice_aqc_lldp_set_local_mib_resp { + u8 status; +#define SET_LOCAL_MIB_RESP_EVENT_M BIT(0) +#define SET_LOCAL_MIB_RESP_MIB_CHANGE_SILENT 0 +#define SET_LOCAL_MIB_RESP_MIB_CHANGE_EVENT SET_LOCAL_MIB_RESP_EVENT_M + u8 reserved[15]; +}; + + +/* Stop/Start LLDP Agent (direct 0x0A09) + * Used for stopping/starting specific LLDP agent. e.g. DCBX. + * The same structure is used for the response, with the command field + * being used as the status field. 
+ */ +struct ice_aqc_lldp_stop_start_specific_agent { + u8 command; +#define ICE_AQC_START_STOP_AGENT_M BIT(0) +#define ICE_AQC_START_STOP_AGENT_STOP_DCBX 0 +#define ICE_AQC_START_STOP_AGENT_START_DCBX ICE_AQC_START_STOP_AGENT_M + u8 reserved[15]; +}; + + +/* LLDP Filter Control (direct 0x0A0A) */ +struct ice_aqc_lldp_filter_ctrl { + u8 cmd_flags; +#define ICE_AQC_LLDP_FILTER_ACTION_M ICE_M(3, 0) +#define ICE_AQC_LLDP_FILTER_ACTION_ADD 0x0 +#define ICE_AQC_LLDP_FILTER_ACTION_DELETE 0x1 +#define ICE_AQC_LLDP_FILTER_ACTION_UPDATE 0x2 + u8 reserved1; + __le16 vsi_num; + u8 reserved2[12]; +}; + + +/* Get/Set RSS key (indirect 0x0B04/0x0B02) */ +struct ice_aqc_get_set_rss_key { +#define ICE_AQC_GSET_RSS_KEY_VSI_VALID BIT(15) +#define ICE_AQC_GSET_RSS_KEY_VSI_ID_S 0 +#define ICE_AQC_GSET_RSS_KEY_VSI_ID_M (0x3FF << ICE_AQC_GSET_RSS_KEY_VSI_ID_S) + __le16 vsi_id; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +#define ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE 0x28 +#define ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE 0xC +#define ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE \ + (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE + \ + ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE) + +/** + * struct ice_aqc_get_set_rss_keys - Get/Set RSS hash key command buffer + * @standard_rss_key: 40 most significant bytes of hash key + * @extended_hash_key: 12 least significant bytes of hash key + * + * Set/Get 40 byte hash key using standard_rss_key field, and set + * extended_hash_key field to zero. Set/Get 52 byte hash key using + * standard_rss_key field for 40 most significant bytes and the + * extended_hash_key field for the 12 least significant bytes of hash key. + */ +struct ice_aqc_get_set_rss_keys { + u8 standard_rss_key[ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE]; + u8 extended_hash_key[ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE]; +}; + + +/* Get/Set RSS LUT (indirect 0x0B05/0x0B03) */ +struct ice_aqc_get_set_rss_lut { +#define ICE_AQC_GSET_RSS_LUT_VSI_VALID BIT(15) +#define ICE_AQC_GSET_RSS_LUT_VSI_ID_S 0 +#define ICE_AQC_GSET_RSS_LUT_VSI_ID_M (0x3FF << ICE_AQC_GSET_RSS_LUT_VSI_ID_S) + __le16 vsi_id; +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S 0 +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_M \ + (0x3 << ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S) + +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI 0 +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF 1 +#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL 2 + +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S 2 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M \ + (0x3 << ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) + +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128 128 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128_FLAG 0 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512 512 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512_FLAG 1 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K 2048 +#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K_FLAG 2 + +#define ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S 4 +#define ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_M \ + (0xF << ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S) + + __le16 flags; + __le32 reserved; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Clear FD Table Command (direct, 0x0B06) */ +struct ice_aqc_clear_fd_table { + u8 clear_type; +#define CL_FD_VM_VF_TYPE_VSI_IDX 1 +#define CL_FD_VM_VF_TYPE_PF_IDX 2 + u8 rsvd; + __le16 vsi_index; + u8 reserved[12]; +}; + + +/* Sideband Control Interface Commands */ +/* Neighbor Device Request (indirect 0x0C00); also used for the response. 
*/ +struct ice_aqc_neigh_dev_req { + __le16 sb_data_len; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Allocate ACL table (indirect 0x0C10) */ +#define ICE_AQC_ACL_KEY_WIDTH 40 +#define ICE_AQC_ACL_KEY_WIDTH_BYTES 5 +#define ICE_AQC_ACL_TCAM_DEPTH 512 +#define ICE_ACL_ENTRY_ALLOC_UNIT 64 +#define ICE_AQC_MAX_CONCURRENT_ACL_TBL 15 +#define ICE_AQC_MAX_ACTION_MEMORIES 20 +#define ICE_AQC_MAX_ACTION_ENTRIES 512 +#define ICE_AQC_ACL_SLICES 16 +#define ICE_AQC_ALLOC_ID_LESS_THAN_4K 0x1000 +/* The ACL block supports up to 8 actions per a single output. */ +#define ICE_AQC_TBL_MAX_ACTION_PAIRS 4 + +#define ICE_AQC_MAX_TCAM_ALLOC_UNITS (ICE_AQC_ACL_TCAM_DEPTH / \ + ICE_ACL_ENTRY_ALLOC_UNIT) +#define ICE_AQC_ACL_ALLOC_UNITS (ICE_AQC_ACL_SLICES * \ + ICE_AQC_MAX_TCAM_ALLOC_UNITS) + +struct ice_aqc_acl_alloc_table { + __le16 table_width; + __le16 table_depth; + u8 act_pairs_per_entry; + /* For non-concurrent table allocation, this field needs + * to be set to zero(0) otherwise it shall specify the + * amount of concurrent tables whose AllocIDs are + * specified in buffer. Thus the newly allocated table + * is concurrent with table IDs specified in AllocIDs. + */ +#define ICE_AQC_ACL_ALLOC_TABLE_TYPE_NONCONCURR 0 + u8 table_type; + __le16 reserved; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Allocate ACL table command buffer format */ +struct ice_aqc_acl_alloc_table_data { + /* Dependent table AllocIDs. Each word in this 15 word array specifies + * a dependent table AllocID according to the amount specified in the + * "table_type" field. All unused words shall be set to 0xFFFF + */ +#define ICE_AQC_CONCURR_ID_INVALID 0xffff + __le16 alloc_ids[ICE_AQC_MAX_CONCURRENT_ACL_TBL]; +}; + + +/* Deallocate ACL table (indirect 0x0C11) + * Allocate ACL action-pair (indirect 0x0C12) + * Deallocate ACL action-pair (indirect 0x0C13) + */ + +/* Following structure is common and used in case of deallocation + * of ACL table and action-pair + */ +struct ice_aqc_acl_tbl_actpair { + /* Alloc ID of the table being released */ + __le16 alloc_id; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* This response structure is same in case of alloc/dealloc table, + * alloc/dealloc action-pair + */ +struct ice_aqc_acl_generic { + /* if alloc_id is below 0x1000 then alllocation failed due to + * unavailable resources, else this is set by FW to identify + * table allocation + */ + __le16 alloc_id; + + union { + /* to be used only in case of alloc/dealloc table */ + struct { + /* Index of the first TCAM block, otherwise set to 0xFF + * for a failed allocation + */ + u8 first_tcam; + /* Index of the last TCAM block. This index shall be + * set to the value of first_tcam for single TCAM block + * allocation, otherwise set to 0xFF for a failed + * allocation + */ + u8 last_tcam; + } table; + /* reserved in case of alloc/dealloc action-pair */ + struct { + __le16 reserved; + } act_pair; + } ops; + + /* index of first entry (in both TCAM and action memories), + * otherwise set to 0xFF for a failed allocation + */ + __le16 first_entry; + /* index of last entry (in both TCAM and action memories), + * otherwise set to 0xFF for a failed allocation + */ + __le16 last_entry; + + /* Each act_mem element specifies the order of the memory + * otherwise 0xFF + */ + u8 act_mem[ICE_AQC_MAX_ACTION_MEMORIES]; +}; + + +/* Allocate ACL scenario (indirect 0x0C14). 
+ * This command doesn't have a separate response buffer, since the
+ * original command buffer gets updated with 'scen_id' in case of
+ * success.
+ */
+struct ice_aqc_acl_alloc_scen {
+	union {
+		struct {
+			u8 reserved[8];
+		} cmd;
+		struct {
+			__le16 scen_id;
+			u8 reserved[6];
+		} resp;
+	} ops;
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+
+/* De-allocate ACL scenario (direct 0x0C15). This command doesn't need a
+ * separate response buffer, since nothing is returned as a response
+ * except status.
+ */
+struct ice_aqc_acl_dealloc_scen {
+	__le16 scen_id;
+	u8 reserved[14];
+};
+
+
+/* Update ACL scenario (direct 0x0C1B)
+ * Query ACL scenario (direct 0x0C23)
+ */
+struct ice_aqc_acl_update_query_scen {
+	__le16 scen_id;
+	u8 reserved[6];
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+
+/* Input buffer format in case of allocate/update ACL scenario; the same
+ * format is used for the response buffer in case of query ACL scenario.
+ * NOTE: de-allocate ACL scenario is a direct command and doesn't require
+ * a buffer, hence no buffer format.
+ */
+struct ice_aqc_acl_scen {
+	struct {
+		/* Byte [x] selection for the TCAM key. This value must be
+		 * set to 0x0 for an unused TCAM.
+		 * Only bits 6..0 are used in each byte; the MSB is reserved.
+		 */
+#define ICE_AQC_ACL_ALLOC_SCE_SELECT_M 0x7F
+#define ICE_AQC_ACL_BYTE_SEL_BASE 0x20
+#define ICE_AQC_ACL_BYTE_SEL_BASE_PID 0x3E
+#define ICE_AQC_ACL_BYTE_SEL_BASE_PKT_DIR ICE_AQC_ACL_BYTE_SEL_BASE
+#define ICE_AQC_ACL_BYTE_SEL_BASE_RNG_CHK 0x3F
+		u8 tcam_select[5];
+		/* TCAM block entry masking. This value should be set to 0x0
+		 * for an unused TCAM.
+		 */
+		u8 chnk_msk;
+		/* Bit 0 : masks TCAM entries 0-63
+		 * Bit 1 : masks TCAM entries 64-127
+		 * Bit 2 to 7 : follow the pattern of bits 0 and 1
+		 */
+#define ICE_AQC_ACL_ALLOC_SCE_START_CMP BIT(0)
+#define ICE_AQC_ACL_ALLOC_SCE_START_SET BIT(1)
+		u8 start_cmp_set;
+
+	} tcam_cfg[ICE_AQC_ACL_SLICES];
+
+	/* Each byte, bits 6..0: action memory association to a TCAM block;
+	 * otherwise it shall be set to 0x0 for a disabled memory action.
+	 * Bit 7 : action memory enable for this scenario
+	 */
+#define ICE_AQC_ACL_SCE_ACT_MEM_TCAM_ASSOC_M 0x7F
+#define ICE_AQC_ACL_SCE_ACT_MEM_EN BIT(7)
+	u8 act_mem_cfg[ICE_AQC_MAX_ACTION_MEMORIES];
+};
+
+
+/* Allocate ACL counters (indirect 0x0C16) */
+struct ice_aqc_acl_alloc_counters {
+	/* Number of contiguous counters requested. The minimum value is 1
+	 * and the maximum value is 255.
+	 */
+#define ICE_AQC_ACL_ALLOC_CNT_MIN_AMT 0x1
+#define ICE_AQC_ACL_ALLOC_CNT_MAX_AMT 0xFF
+	u8 counter_amount;
+
+	/* Counter type: 'single counter', which can be configured to count
+	 * either bytes or packets
+	 */
+#define ICE_AQC_ACL_CNT_TYPE_SINGLE 0x0
+
+	/* Counter type: 'counter pair', which counts the number of bytes and
+	 * the number of packets.
+	 */
+#define ICE_AQC_ACL_CNT_TYPE_DUAL 0x1
+	/* requested counter type, single/dual */
+	u8 counters_type;
+
+	/* counter bank allocation shall be 0-3 for 'byte or packet counter' */
+#define ICE_AQC_ACL_MAX_CNT_SINGLE 0x3
+/* counter bank allocation shall be 0-1 for 'byte and packet counter dual' */
+#define ICE_AQC_ACL_MAX_CNT_DUAL 0x1
+	/* requested counter bank allocation */
+	u8 bank_alloc;
+
+	u8 reserved;
+
+	union {
+		/* Applicable only in case of command */
+		struct {
+			u8 reserved[12];
+		} cmd;
+		/* Applicable only in case of response */
+#define ICE_AQC_ACL_ALLOC_CNT_INVAL 0xFFFF
+		struct {
+			/* Index of the first allocated counter. 0xFFFF in case
+			 * of unsuccessful allocation
+			 */
+			__le16 first_counter;
+			/* Index of the last allocated counter.
0xFFFF in case + * of unsuccessful allocation + */ + __le16 last_counter; + u8 rsvd[8]; + } resp; + } ops; +}; + + +/* De-allocate ACL counters (direct 0x0C17) */ +struct ice_aqc_acl_dealloc_counters { + /* first counter being released */ + __le16 first_counter; + /* last counter being released */ + __le16 last_counter; + /* requested counter type, single/dual */ + u8 counters_type; + /* requested counter bank allocation */ + u8 bank_alloc; + u8 reserved[10]; +}; + + +/* De-allocate ACL resources (direct 0x0C1A). Used by SW to release all the + * resources allocated for it using a single command + */ +struct ice_aqc_acl_dealloc_res { + u8 reserved[16]; +}; + + +/* Program ACL actionpair (indirect 0x0C1C) + * Query ACL actionpair (indirect 0x0C25) + */ +struct ice_aqc_acl_actpair { + /* action mem index to program/update */ + u8 act_mem_index; + u8 reserved; + /* The entry index in action memory to be programmed/updated */ + __le16 act_entry_index; + __le32 reserved2; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Input buffer format for program/query action-pair admin command */ +struct ice_acl_act_entry { + /* Action priority, values must be between 0..7 */ +#define ICE_AQC_ACT_PRIO_VALID_MAX 7 +#define ICE_AQC_ACT_PRIO_MSK ICE_M(0xff, 0) + u8 prio; + /* Action meta-data identifier. This field should be set to 0x0 + * for a NOP action + */ +#define ICE_AQC_ACT_MDID_S 8 +#define ICE_AQC_ACT_MDID_MSK ICE_M(0xff00, ICE_AQC_ACT_MDID_S) + u8 mdid; + /* Action value */ +#define ICE_AQC_ACT_VALUE_S 16 +#define ICE_AQC_ACT_VALUE_MSK ICE_M(0xffff0000, 16) + __le16 value; +}; + + +#define ICE_ACL_NUM_ACT_PER_ACT_PAIR 2 +struct ice_aqc_actpair { + struct ice_acl_act_entry act[ICE_ACL_NUM_ACT_PER_ACT_PAIR]; +}; + + +/* Generic format used to describe either input or response buffer + * for admin commands related to ACL profile + */ +struct ice_aqc_acl_prof_generic_frmt { + /* The first byte of the byte selection base is reserved to keep the + * first byte of the field vector where the packet direction info is + * available. Thus we should start at index 1 of the field vector to + * map its entries to the byte selection base. + */ +#define ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX 1 + /* In each byte: + * Bit 0..5 = Byte selection for the byte selection base from the + * extracted fields (expressed as byte offset in extracted fields). + * Applicable values are 0..63 + * Bit 6..7 = Reserved + */ +#define ICE_AQC_ACL_PROF_BYTE_SEL_ELEMS 30 + u8 byte_selection[ICE_AQC_ACL_PROF_BYTE_SEL_ELEMS]; + /* In each byte: + * Bit 0..4 = Word selection for the word selection base from the + * extracted fields (expressed as word offset in extracted fields). + * Applicable values are 0..31 + * Bit 5..7 = Reserved + */ +#define ICE_AQC_ACL_PROF_WORD_SEL_ELEMS 32 + u8 word_selection[ICE_AQC_ACL_PROF_WORD_SEL_ELEMS]; + /* In each byte: + * Bit 0..3 = Double word selection for the double-word selection base + * from the extracted fields (expressed as double-word offset in + * extracted fields). 
+ * Applicable values are 0..15 + * Bit 4..7 = Reserved + */ +#define ICE_AQC_ACL_PROF_DWORD_SEL_ELEMS 15 + u8 dword_selection[ICE_AQC_ACL_PROF_DWORD_SEL_ELEMS]; + /* Scenario numbers for individual Physical Function's */ +#define ICE_AQC_ACL_PROF_PF_SCEN_NUM_ELEMS 8 + u8 pf_scenario_num[ICE_AQC_ACL_PROF_PF_SCEN_NUM_ELEMS]; +}; + + +/* Program ACL profile extraction (indirect 0x0C1D) + * Program ACL profile ranges (indirect 0x0C1E) + * Query ACL profile (indirect 0x0C21) + * Query ACL profile ranges (indirect 0x0C22) + */ +struct ice_aqc_acl_profile { + u8 profile_id; /* Programmed/Updated profile ID */ + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Input buffer format for program profile extraction admin command and + * response buffer format for query profile admin command is as defined + * in struct ice_aqc_acl_prof_generic_frmt */ -struct ice_aqc_lldp_stop_start_specific_agent { - u8 command; -#define ICE_AQC_START_STOP_AGENT_M BIT(0) -#define ICE_AQC_START_STOP_AGENT_STOP_DCBX 0 -#define ICE_AQC_START_STOP_AGENT_START_DCBX ICE_AQC_START_STOP_AGENT_M - u8 reserved[15]; -}; -/* Get/Set RSS key (indirect 0x0B04/0x0B02) */ -struct ice_aqc_get_set_rss_key { -#define ICE_AQC_GSET_RSS_KEY_VSI_VALID BIT(15) -#define ICE_AQC_GSET_RSS_KEY_VSI_ID_S 0 -#define ICE_AQC_GSET_RSS_KEY_VSI_ID_M (0x3FF << ICE_AQC_GSET_RSS_KEY_VSI_ID_S) - __le16 vsi_id; - u8 reserved[6]; - __le32 addr_high; - __le32 addr_low; +/* Input buffer format for program profile ranges and query profile ranges + * admin commands. Same format is used for response buffer in case of query + * profile ranges command + */ +struct ice_acl_rng_data { + /* The range checker output shall be sent when the value + * related to this range checker is lower than low boundary + */ + __be16 low_boundary; + /* The range checker output shall be sent when the value + * related to this range checker is higher than high boundary + */ + __be16 high_boundary; + /* A value of '0' in bit shall clear the relevant bit input + * to the range checker + */ + __be16 mask; }; -#define ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE 0x28 -#define ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE 0xC -#define ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE \ - (ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE + \ - ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE) -struct ice_aqc_get_set_rss_keys { - u8 standard_rss_key[ICE_AQC_GET_SET_RSS_KEY_DATA_RSS_KEY_SIZE]; - u8 extended_hash_key[ICE_AQC_GET_SET_RSS_KEY_DATA_HASH_KEY_SIZE]; +struct ice_aqc_acl_profile_ranges { +#define ICE_AQC_ACL_PROF_RANGES_NUM_CFG 8 + struct ice_acl_rng_data checker_cfg[ICE_AQC_ACL_PROF_RANGES_NUM_CFG]; }; -/* Get/Set RSS LUT (indirect 0x0B05/0x0B03) */ -struct ice_aqc_get_set_rss_lut { -#define ICE_AQC_GSET_RSS_LUT_VSI_VALID BIT(15) -#define ICE_AQC_GSET_RSS_LUT_VSI_ID_S 0 -#define ICE_AQC_GSET_RSS_LUT_VSI_ID_M (0x1FF << ICE_AQC_GSET_RSS_LUT_VSI_ID_S) - __le16 vsi_id; -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S 0 -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_M \ - (0x3 << ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S) -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI 0 -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF 1 -#define ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL 2 +/* Program ACL entry (indirect 0x0C20) + * Query ACL entry (indirect 0x0C24) + */ +struct ice_aqc_acl_entry { + u8 tcam_index; /* Updated TCAM block index */ + u8 reserved; + __le16 entry_index; /* Updated entry index */ + __le32 reserved2; + __le32 addr_high; + __le32 addr_low; +}; -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S 2 -#define 
ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M \ - (0x3 << ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128 128 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128_FLAG 0 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512 512 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512_FLAG 1 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K 2048 -#define ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K_FLAG 2 +/* Input buffer format in case of program ACL entry and response buffer format + * in case of query ACL entry + */ +struct ice_aqc_acl_data { + /* Entry key and entry key invert are 40 bits wide. + * Byte 0..4 : entry key and Byte 5..7 are reserved + * Byte 8..12: entry key invert and Byte 13..15 are reserved + */ + struct { + u8 val[5]; + u8 reserved[3]; + } entry_key, entry_key_invert; +}; -#define ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S 4 -#define ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_M \ - (0xF << ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S) - __le16 flags; - __le32 reserved; - __le32 addr_high; - __le32 addr_low; +/* Query ACL counter (direct 0x0C27) */ +struct ice_aqc_acl_query_counter { + /* Queried counter index */ + __le16 counter_index; + /* Queried counter bank */ + u8 counter_bank; + union { + struct { + u8 reserved[13]; + } cmd; + struct { + /* Holds counter value/packet counter value */ + u8 val[5]; + u8 reserved[8]; + } resp; + } ops; }; + /* Add Tx LAN Queues (indirect 0x0C30) */ struct ice_aqc_add_txqs { u8 num_qgrps; @@ -1380,6 +3162,7 @@ struct ice_aqc_add_txqs { __le32 addr_low; }; + /* This is the descriptor of each queue entry for the Add Tx LAN Queues * command (0x0C30). Only used within struct ice_aqc_add_tx_qgrp. */ @@ -1392,6 +3175,7 @@ struct ice_aqc_add_txqs_perq { struct ice_aqc_txsched_elem info; }; + /* The format of the command buffer for Add Tx LAN Queues (0x0C30) * is an array of the following structs. Please note that the length of * each struct ice_aqc_add_tx_qgrp is variable due @@ -1401,9 +3185,10 @@ struct ice_aqc_add_tx_qgrp { __le32 parent_teid; u8 num_txqs; u8 rsvd[3]; - struct ice_aqc_add_txqs_perq txqs[1]; + struct ice_aqc_add_txqs_perq txqs[]; }; + /* Disable Tx LAN Queues (indirect 0x0C31) */ struct ice_aqc_dis_txqs { u8 cmd_type; @@ -1426,6 +3211,7 @@ struct ice_aqc_dis_txqs { __le32 addr_low; }; + /* The buffer for Disable Tx LAN Queues (indirect 0x0C31) * contains the following structures, arrayed one after the * other. @@ -1439,99 +3225,122 @@ struct ice_aqc_dis_txq_item { u8 num_qs; u8 rsvd; /* The length of the q_id array varies according to num_qs */ - __le16 q_id[1]; - /* This only applies from F8 onward */ #define ICE_AQC_Q_DIS_BUF_ELEM_TYPE_S 15 #define ICE_AQC_Q_DIS_BUF_ELEM_TYPE_LAN_Q \ (0 << ICE_AQC_Q_DIS_BUF_ELEM_TYPE_S) #define ICE_AQC_Q_DIS_BUF_ELEM_TYPE_RDMA_QSET \ (1 << ICE_AQC_Q_DIS_BUF_ELEM_TYPE_S) -}; + __le16 q_id[]; +} __packed; + -struct ice_aqc_dis_txq { - struct ice_aqc_dis_txq_item qgrps[1]; + +/* Tx LAN Queues Cleanup Event (0x0C31) */ +struct ice_aqc_txqs_cleanup { + __le16 caller_opc; + __le16 cmd_tag; + u8 reserved[12]; }; -/* Configure Firmware Logging Command (indirect 0xFF09) - * Logging Information Read Response (indirect 0xFF10) - * Note: The 0xFF10 command has no input parameters. 
- */ -struct ice_aqc_fw_logging { - u8 log_ctrl; -#define ICE_AQC_FW_LOG_AQ_EN BIT(0) -#define ICE_AQC_FW_LOG_UART_EN BIT(1) - u8 rsvd0; - u8 log_ctrl_valid; /* Not used by 0xFF10 Response */ -#define ICE_AQC_FW_LOG_AQ_VALID BIT(0) -#define ICE_AQC_FW_LOG_UART_VALID BIT(1) - u8 rsvd1[5]; + +/* Move / Reconfigure Tx Queues (indirect 0x0C32) */ +struct ice_aqc_move_txqs { + u8 cmd_type; +#define ICE_AQC_Q_CMD_TYPE_S 0 +#define ICE_AQC_Q_CMD_TYPE_M (0x3 << ICE_AQC_Q_CMD_TYPE_S) +#define ICE_AQC_Q_CMD_TYPE_MOVE 1 +#define ICE_AQC_Q_CMD_TYPE_TC_CHANGE 2 +#define ICE_AQC_Q_CMD_TYPE_MOVE_AND_TC 3 +#define ICE_AQC_Q_CMD_SUBSEQ_CALL BIT(2) +#define ICE_AQC_Q_CMD_FLUSH_PIPE BIT(3) + u8 num_qs; + u8 rsvd; + u8 timeout; +#define ICE_AQC_Q_CMD_TIMEOUT_S 2 +#define ICE_AQC_Q_CMD_TIMEOUT_M (0x3F << ICE_AQC_Q_CMD_TIMEOUT_S) + __le32 blocked_cgds; __le32 addr_high; __le32 addr_low; }; -enum ice_aqc_fw_logging_mod { - ICE_AQC_FW_LOG_ID_GENERAL = 0, - ICE_AQC_FW_LOG_ID_CTRL, - ICE_AQC_FW_LOG_ID_LINK, - ICE_AQC_FW_LOG_ID_LINK_TOPO, - ICE_AQC_FW_LOG_ID_DNL, - ICE_AQC_FW_LOG_ID_I2C, - ICE_AQC_FW_LOG_ID_SDP, - ICE_AQC_FW_LOG_ID_MDIO, - ICE_AQC_FW_LOG_ID_ADMINQ, - ICE_AQC_FW_LOG_ID_HDMA, - ICE_AQC_FW_LOG_ID_LLDP, - ICE_AQC_FW_LOG_ID_DCBX, - ICE_AQC_FW_LOG_ID_DCB, - ICE_AQC_FW_LOG_ID_NETPROXY, - ICE_AQC_FW_LOG_ID_NVM, - ICE_AQC_FW_LOG_ID_AUTH, - ICE_AQC_FW_LOG_ID_VPD, - ICE_AQC_FW_LOG_ID_IOSF, - ICE_AQC_FW_LOG_ID_PARSER, - ICE_AQC_FW_LOG_ID_SW, - ICE_AQC_FW_LOG_ID_SCHEDULER, - ICE_AQC_FW_LOG_ID_TXQ, - ICE_AQC_FW_LOG_ID_RSVD, - ICE_AQC_FW_LOG_ID_POST, - ICE_AQC_FW_LOG_ID_WATCHDOG, - ICE_AQC_FW_LOG_ID_TASK_DISPATCH, - ICE_AQC_FW_LOG_ID_MNG, - ICE_AQC_FW_LOG_ID_MAX, + +/* Per-queue data buffer for the Move Tx LAN Queues command/response */ +struct ice_aqc_move_txqs_elem { + __le16 txq_id; + u8 q_cgd; + u8 rsvd; + __le32 q_teid; +}; + + +/* Indirect data buffer for the Move Tx LAN Queues command/response */ +struct ice_aqc_move_txqs_data { + __le32 src_teid; + __le32 dest_teid; + struct ice_aqc_move_txqs_elem txqs[]; +}; + + +/* Add Tx RDMA Queue Set (indirect 0x0C33) */ +struct ice_aqc_add_rdma_qset { + u8 num_qset_grps; + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; }; -/* This is the buffer for both of the logging commands. - * The entry array size depends on the datalen parameter in the descriptor. - * There will be a total of datalen / 2 entries. + +/* This is the descriptor of each qset entry for the Add Tx RDMA Queue Set + * command (0x0C33). Only used within struct ice_aqc_add_rdma_qset. */ -struct ice_aqc_fw_logging_data { - __le16 entry[1]; -#define ICE_AQC_FW_LOG_ID_S 0 -#define ICE_AQC_FW_LOG_ID_M (0xFFF << ICE_AQC_FW_LOG_ID_S) +struct ice_aqc_add_tx_rdma_qset_entry { + __le16 tx_qset_id; + u8 rsvd[2]; + __le32 qset_teid; + struct ice_aqc_txsched_elem info; +}; -#define ICE_AQC_FW_LOG_CONF_SUCCESS 0 /* Used by response */ -#define ICE_AQC_FW_LOG_CONF_BAD_INDX BIT(12) /* Used by response */ -#define ICE_AQC_FW_LOG_EN_S 12 -#define ICE_AQC_FW_LOG_EN_M (0xF << ICE_AQC_FW_LOG_EN_S) -#define ICE_AQC_FW_LOG_INFO_EN BIT(12) /* Used by command */ -#define ICE_AQC_FW_LOG_INIT_EN BIT(13) /* Used by command */ -#define ICE_AQC_FW_LOG_FLOW_EN BIT(14) /* Used by command */ -#define ICE_AQC_FW_LOG_ERR_EN BIT(15) /* Used by command */ +/* The format of the command buffer for Add Tx RDMA Queue Set(0x0C33) + * is an array of the following structs. Please note that the length of + * each struct ice_aqc_add_rdma_qset is variable due to the variable + * number of queues in each group! 
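+ *
+ * (Because of the flexible rdma_qsets[] array in the struct below, callers
+ * size this buffer at runtime; a minimal sketch, assuming "n" qsets in one
+ * group and struct_size() from <linux/overflow.h>:
+ *
+ *	struct ice_aqc_add_rdma_qset_data *buf;
+ *
+ *	buf = kzalloc(struct_size(buf, rdma_qsets, n), GFP_KERNEL);
+ *	if (!buf)
+ *		return -ENOMEM;
+ *	buf->num_qsets = cpu_to_le16(n);)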
+ */ +struct ice_aqc_add_rdma_qset_data { + __le32 parent_teid; + __le16 num_qsets; + u8 rsvd[2]; + struct ice_aqc_add_tx_rdma_qset_entry rdma_qsets[]; }; -/* Get/Clear FW Log (indirect 0xFF11) */ -struct ice_aqc_get_clear_fw_log { + +/* Move RDMA Queue Set (indirect 0x0C34) */ +struct ice_aqc_move_rdma_qset_cmd { + u8 num_rdma_qset; /* Used by commands and response */ u8 flags; -#define ICE_AQC_FW_LOG_CLEAR BIT(0) -#define ICE_AQC_FW_LOG_MORE_DATA_AVAIL BIT(1) - u8 rsvd1[7]; + u8 reserved[6]; __le32 addr_high; __le32 addr_low; }; + +/* Buffer */ +struct ice_aqc_move_rdma_qset_buffer_desc { + __le16 tx_qset_id; + __le16 qset_teid; +}; + + +struct ice_aqc_move_rdma_qset_buffer { + __le32 src_parent_teid; + __le32 dest_parent_teid; + struct ice_aqc_move_rdma_qset_buffer_desc descs[]; +}; + + + /* Download Package (indirect 0x0C40) */ -/* Also used for Update Package (indirect 0x0C42) */ +/* Also used for Update Package (indirect 0x0C42 and 0x0C41) */ struct ice_aqc_download_pkg { u8 flags; #define ICE_AQC_DOWNLOAD_PKG_LAST_BUF 0x01 @@ -1541,6 +3350,7 @@ struct ice_aqc_download_pkg { __le32 addr_low; }; + struct ice_aqc_download_pkg_resp { __le32 error_offset; __le32 error_info; @@ -1548,6 +3358,7 @@ struct ice_aqc_download_pkg_resp { __le32 addr_low; }; + /* Get Package Info List (indirect 0x0C43) */ struct ice_aqc_get_pkg_info_list { __le32 reserved1; @@ -1556,6 +3367,7 @@ struct ice_aqc_get_pkg_info_list { __le32 addr_low; }; + /* Version format for packages */ struct ice_pkg_ver { u8 major; @@ -1564,30 +3376,238 @@ struct ice_pkg_ver { u8 draft; }; + #define ICE_PKG_NAME_SIZE 32 +#define ICE_SEG_ID_SIZE 28 +#define ICE_SEG_NAME_SIZE 28 struct ice_aqc_get_pkg_info { struct ice_pkg_ver ver; - char name[ICE_PKG_NAME_SIZE]; + char name[ICE_SEG_NAME_SIZE]; + __le32 track_id; u8 is_in_nvm; u8 is_active; u8 is_active_at_boot; u8 is_modified; }; + /* Get Package Info List response buffer format (0x0C43) */ struct ice_aqc_get_pkg_info_resp { __le32 count; - struct ice_aqc_get_pkg_info pkg_info[1]; + struct ice_aqc_get_pkg_info pkg_info[]; +}; + + + + +/* Driver Shared Parameters (direct, 0x0C90) */ +struct ice_aqc_driver_shared_params { + u8 set_or_get_op; +#define ICE_AQC_DRIVER_PARAM_OP_MASK BIT(0) +#define ICE_AQC_DRIVER_PARAM_SET 0 +#define ICE_AQC_DRIVER_PARAM_GET 1 + u8 param_indx; +#define ICE_AQC_DRIVER_PARAM_MAX_IDX 15 + u8 rsvd[2]; + __le32 param_val; + __le32 addr_high; + __le32 addr_low; +}; + +enum ice_aqc_driver_params { + /* OS clock index for PTP timer Domain 0 */ + ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR0 = 0, + /* OS clock index for PTP timer Domain 1 */ + ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR1, + + /* Add new parameters above */ + ICE_AQC_DRIVER_PARAM_MAX = 16, +}; + + +/* Lan Queue Overflow Event (direct, 0x1001) */ +struct ice_aqc_event_lan_overflow { + __le32 prtdcb_ruptq; + __le32 qtx_ctl; + u8 reserved[8]; +}; + + + + +enum ice_aqc_fw_logging_mod { + ICE_AQC_FW_LOG_ID_GENERAL = 0, + ICE_AQC_FW_LOG_ID_CTRL, + ICE_AQC_FW_LOG_ID_LINK, + ICE_AQC_FW_LOG_ID_LINK_TOPO, + ICE_AQC_FW_LOG_ID_DNL, + ICE_AQC_FW_LOG_ID_I2C, + ICE_AQC_FW_LOG_ID_SDP, + ICE_AQC_FW_LOG_ID_MDIO, + ICE_AQC_FW_LOG_ID_ADMINQ, + ICE_AQC_FW_LOG_ID_HDMA, + ICE_AQC_FW_LOG_ID_LLDP, + ICE_AQC_FW_LOG_ID_DCBX, + ICE_AQC_FW_LOG_ID_DCB, + ICE_AQC_FW_LOG_ID_XLR, + ICE_AQC_FW_LOG_ID_NVM, + ICE_AQC_FW_LOG_ID_AUTH, + ICE_AQC_FW_LOG_ID_VPD, + ICE_AQC_FW_LOG_ID_IOSF, + ICE_AQC_FW_LOG_ID_PARSER, + ICE_AQC_FW_LOG_ID_SW, + ICE_AQC_FW_LOG_ID_SCHEDULER, + ICE_AQC_FW_LOG_ID_TXQ, + ICE_AQC_FW_LOG_ID_ACL, + ICE_AQC_FW_LOG_ID_POST, + 
ICE_AQC_FW_LOG_ID_WATCHDOG, + ICE_AQC_FW_LOG_ID_TASK_DISPATCH, + ICE_AQC_FW_LOG_ID_MNG, + ICE_AQC_FW_LOG_ID_SYNCE, + ICE_AQC_FW_LOG_ID_HEALTH, + ICE_AQC_FW_LOG_ID_TSDRV, + ICE_AQC_FW_LOG_ID_PFREG, + ICE_AQC_FW_LOG_ID_MDLVER, + ICE_AQC_FW_LOG_ID_MAX, +}; + + + +/* Set Health Status (direct 0xFF20) */ +struct ice_aqc_set_health_status_config { + u8 event_source; +#define ICE_AQC_HEALTH_STATUS_SET_PF_SPECIFIC_MASK BIT(0) +#define ICE_AQC_HEALTH_STATUS_SET_ALL_PF_MASK BIT(1) +#define ICE_AQC_HEALTH_STATUS_SET_GLOBAL_MASK BIT(2) + u8 reserved[15]; +}; + + +#define ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_STRICT 0x101 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_TYPE 0x102 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_QUAL 0x103 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_COMM 0x104 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_CONFLICT 0x105 +#define ICE_AQC_HEALTH_STATUS_ERR_MOD_NOT_PRESENT 0x106 +#define ICE_AQC_HEALTH_STATUS_INFO_MOD_UNDERUTILIZED 0x107 +#define ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_LENIENT 0x108 +#define ICE_AQC_HEALTH_STATUS_ERR_INVALID_LINK_CFG 0x10B +#define ICE_AQC_HEALTH_STATUS_ERR_PORT_ACCESS 0x10C +#define ICE_AQC_HEALTH_STATUS_ERR_PORT_UNREACHABLE 0x10D +#define ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_MOD_LIMITED 0x10F +#define ICE_AQC_HEALTH_STATUS_ERR_PARALLEL_FAULT 0x110 +#define ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_PHY_LIMITED 0x111 +#define ICE_AQC_HEALTH_STATUS_ERR_NETLIST_TOPO 0x112 +#define ICE_AQC_HEALTH_STATUS_ERR_NETLIST 0x113 +#define ICE_AQC_HEALTH_STATUS_ERR_TOPO_CONFLICT 0x114 +#define ICE_AQC_HEALTH_STATUS_ERR_LINK_HW_ACCESS 0x115 +#define ICE_AQC_HEALTH_STATUS_ERR_LINK_RUNTIME 0x116 +#define ICE_AQC_HEALTH_STATUS_ERR_DNL_INIT 0x117 +#define ICE_AQC_HEALTH_STATUS_ERR_PHY_NVM_PROG 0x120 +#define ICE_AQC_HEALTH_STATUS_ERR_PHY_FW_LOAD 0x121 +#define ICE_AQC_HEALTH_STATUS_INFO_RECOVERY 0x500 +#define ICE_AQC_HEALTH_STATUS_ERR_FLASH_ACCESS 0x501 +#define ICE_AQC_HEALTH_STATUS_ERR_NVM_AUTH 0x502 +#define ICE_AQC_HEALTH_STATUS_ERR_OROM_AUTH 0x503 +#define ICE_AQC_HEALTH_STATUS_ERR_DDP_AUTH 0x504 +#define ICE_AQC_HEALTH_STATUS_ERR_NVM_COMPAT 0x505 +#define ICE_AQC_HEALTH_STATUS_ERR_OROM_COMPAT 0x506 +#define ICE_AQC_HEALTH_STATUS_ERR_DCB_MIB 0x509 + +/* Get Health Status codes (indirect 0xFF21) */ +struct ice_aqc_get_supported_health_status_codes { + __le16 health_code_count; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Get Health Status (indirect 0xFF22) */ +struct ice_aqc_get_health_status { + __le16 health_status_count; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Get Health Status event buffer entry, (0xFF22) + * repeated per reported health status + */ +struct ice_aqc_health_status_elem { + __le16 health_status_code; + __le16 event_source; +#define ICE_AQC_HEALTH_STATUS_PF (0x1) +#define ICE_AQC_HEALTH_STATUS_PORT (0x2) +#define ICE_AQC_HEALTH_STATUS_GLOBAL (0x3) + __le32 internal_data1; +#define ICE_AQC_HEALTH_STATUS_UNDEFINED_DATA (0xDEADBEEF) + __le32 internal_data2; +}; + + +/* Clear Health Status (direct 0xFF23) */ +struct ice_aqc_clear_health_status { + __le32 reserved[4]; +}; + + +/* Set FW Logging configuration (indirect 0xFF30) + * Register for FW Logging (indirect 0xFF31) + * Query FW Logging (indirect 0xFF32) + * FW Log Event (indirect 0xFF33) + * Get FW Log (indirect 0xFF34) + * Clear FW Log (indirect 0xFF35) + */ +struct ice_aqc_fw_log { + u8 cmd_flags; +#define ICE_AQC_FW_LOG_CONF_UART_EN BIT(0) +#define ICE_AQC_FW_LOG_CONF_AQ_EN BIT(1) +#define ICE_AQC_FW_LOG_QUERY_REGISTERED BIT(2) +#define 
ICE_AQC_FW_LOG_CONF_SET_VALID BIT(3) +#define ICE_AQC_FW_LOG_AQ_REGISTER BIT(0) +#define ICE_AQC_FW_LOG_AQ_QUERY BIT(2) +#define ICE_AQC_FW_LOG_PERSISTENT BIT(0) + u8 rsp_flag; +#define ICE_AQC_FW_LOG_MORE_DATA BIT(1) + __le16 fw_rt_msb; + union { + struct { + __le32 fw_rt_lsb; + } sync; + struct { + __le16 log_resolution; +#define ICE_AQC_FW_LOG_MIN_RESOLUTION (1) +#define ICE_AQC_FW_LOG_MAX_RESOLUTION (128) + __le16 mdl_cnt; + } cfg; + } ops; + __le32 addr_high; + __le32 addr_low; +}; + + +/* Response Buffer for: + * Set Firmware Logging Configuration (0xFF30) + * Query FW Logging (0xFF32) + */ +struct ice_aqc_fw_log_cfg_resp { + __le16 module_identifier; + u8 log_level; + u8 rsvd0; }; + + /** * struct ice_aq_desc - Admin Queue (AQ) descriptor * @flags: ICE_AQ_FLAG_* flags * @opcode: AQ command opcode * @datalen: length in bytes of indirect/external data buffer * @retval: return value from firmware - * @cookie_h: opaque data high-half - * @cookie_l: opaque data low-half + * @cookie_high: opaque data high-half + * @cookie_low: opaque data low-half * @params: command-specific parameters * * Descriptor format for commands the driver posts on the Admin Transmit Queue @@ -1610,76 +3630,184 @@ struct ice_aq_desc { struct ice_aqc_get_ver get_ver; struct ice_aqc_driver_ver driver_ver; struct ice_aqc_q_shutdown q_shutdown; + struct ice_aqc_get_exp_err exp_err; struct ice_aqc_req_res res_owner; struct ice_aqc_manage_mac_read mac_read; struct ice_aqc_manage_mac_write mac_write; struct ice_aqc_clear_pxe clear_pxe; + struct ice_aqc_config_no_drop_policy no_drop; + struct ice_aqc_add_update_mir_rule add_update_rule; + struct ice_aqc_delete_mir_rule del_rule; struct ice_aqc_list_caps get_cap; struct ice_aqc_get_phy_caps get_phy; struct ice_aqc_set_phy_cfg set_phy; struct ice_aqc_restart_an restart_an; + struct ice_aqc_dnl_get_status get_status; + struct ice_aqc_dnl_run_command dnl_run; + struct ice_aqc_dnl_call_command dnl_call; + struct ice_aqc_dnl_read_write_command dnl_read_write; + struct ice_aqc_dnl_read_write_response dnl_read_write_resp; + struct ice_aqc_dnl_set_breakpoints_command dnl_set_brk; + struct ice_aqc_dnl_read_log_command dnl_read_log; + struct ice_aqc_dnl_read_log_response dnl_read_log_resp; + struct ice_aqc_i2c read_write_i2c; + struct ice_aqc_read_i2c_resp read_i2c_resp; + struct ice_aqc_mdio read_write_mdio; + struct ice_aqc_gpio_by_func read_write_gpio_by_func; + struct ice_aqc_gpio read_write_gpio; + struct ice_aqc_set_led set_led; + struct ice_aqc_mdio read_mdio; + struct ice_aqc_mdio write_mdio; + struct ice_aqc_sff_eeprom read_write_sff_param; struct ice_aqc_set_port_id_led set_port_id_led; + struct ice_aqc_get_port_options get_port_options; + struct ice_aqc_set_port_option set_port_option; struct ice_aqc_get_sw_cfg get_sw_conf; + struct ice_aqc_set_port_params set_port_params; struct ice_aqc_sw_rules sw_rules; + struct ice_aqc_storm_cfg storm_conf; + struct ice_aqc_add_get_recipe add_get_recipe; + struct ice_aqc_recipe_to_profile recipe_to_profile; struct ice_aqc_get_topo get_topo; struct ice_aqc_sched_elem_cmd sched_elem_cmd; struct ice_aqc_query_txsched_res query_sched_res; + struct ice_aqc_query_node_to_root query_node_to_root; + struct ice_aqc_cfg_l2_node_cgd cfg_l2_node_cgd; struct ice_aqc_query_port_ets port_ets; + struct ice_aqc_rl_profile rl_profile; struct ice_aqc_nvm nvm; + struct ice_aqc_nvm_cfg nvm_cfg; struct ice_aqc_nvm_checksum nvm_checksum; + struct ice_aqc_nvm_pkg_data pkg_data; + struct ice_aqc_nvm_pass_comp_tbl pass_comp_tbl; struct ice_aqc_pf_vf_msg virt; 
+ struct ice_aqc_pfc_ignore pfc_ignore; + struct ice_aqc_set_query_pfc_mode set_query_pfc_mode; + struct ice_aqc_set_dcb_params set_dcb_params; struct ice_aqc_lldp_get_mib lldp_get_mib; struct ice_aqc_lldp_set_mib_change lldp_set_event; + struct ice_aqc_lldp_add_delete_tlv lldp_add_delete_tlv; + struct ice_aqc_lldp_update_tlv lldp_update_tlv; struct ice_aqc_lldp_stop lldp_stop; struct ice_aqc_lldp_start lldp_start; struct ice_aqc_lldp_set_local_mib lldp_set_mib; struct ice_aqc_lldp_stop_start_specific_agent lldp_agent_ctrl; + struct ice_aqc_lldp_filter_ctrl lldp_filter_ctrl; struct ice_aqc_get_set_rss_lut get_set_rss_lut; struct ice_aqc_get_set_rss_key get_set_rss_key; + struct ice_aqc_clear_fd_table clear_fd_table; + struct ice_aqc_neigh_dev_req neigh_dev; + struct ice_aqc_acl_alloc_table alloc_table; + struct ice_aqc_acl_tbl_actpair tbl_actpair; + struct ice_aqc_acl_alloc_scen alloc_scen; + struct ice_aqc_acl_dealloc_scen dealloc_scen; + struct ice_aqc_acl_update_query_scen update_query_scen; + struct ice_aqc_acl_alloc_counters alloc_counters; + struct ice_aqc_acl_dealloc_counters dealloc_counters; + struct ice_aqc_acl_dealloc_res dealloc_res; + struct ice_aqc_acl_entry program_query_entry; + struct ice_aqc_acl_actpair program_query_actpair; + struct ice_aqc_acl_profile profile; + struct ice_aqc_acl_query_counter query_counter; struct ice_aqc_add_txqs add_txqs; struct ice_aqc_dis_txqs dis_txqs; + struct ice_aqc_move_txqs move_txqs; + struct ice_aqc_add_rdma_qset add_rdma_qset; + struct ice_aqc_txqs_cleanup txqs_cleanup; struct ice_aqc_add_get_update_free_vsi vsi_cmd; struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res; - struct ice_aqc_fw_logging fw_logging; - struct ice_aqc_get_clear_fw_log get_clear_fw_log; + struct ice_aqc_get_vsi_resp get_vsi_resp; struct ice_aqc_download_pkg download_pkg; + struct ice_aqc_get_pkg_info_list get_pkg_info_list; + struct ice_aqc_driver_shared_params drv_shared_params; + struct ice_aqc_fw_log fw_log; struct ice_aqc_set_mac_lb set_mac_lb; struct ice_aqc_alloc_free_res_cmd sw_res_ctrl; + struct ice_aqc_get_res_alloc get_res; + struct ice_aqc_get_allocd_res_desc get_res_desc; + struct ice_aqc_set_mac_cfg set_mac_cfg; struct ice_aqc_set_event_mask set_event_mask; struct ice_aqc_get_link_status get_link_status; + struct ice_aqc_event_lan_overflow lan_overflow; + struct ice_aqc_get_link_topo get_link_topo; + struct ice_aqc_get_link_topo_pin get_link_topo_pin; + struct ice_aqc_set_health_status_config + set_health_status_config; + struct ice_aqc_get_supported_health_status_codes + get_supported_health_status_codes; + struct ice_aqc_get_health_status get_health_status; + struct ice_aqc_clear_health_status clear_health_status; + struct ice_aqc_prog_topo_dev_nvm prog_topo_dev_nvm; + struct ice_aqc_read_topo_dev_nvm read_topo_dev_nvm; } params; }; + /* FW defined boundary for a large buffer, 4k >= Large buffer > 512 bytes */ #define ICE_AQ_LG_BUF 512 +/* Flags sub-structure + * |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10 |11 |12 |13 |14 |15 | + * |DD |CMP|ERR|VFE| * * RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE | + */ + +/* command flags and offsets */ +#define ICE_AQ_FLAG_DD_S 0 +#define ICE_AQ_FLAG_CMP_S 1 #define ICE_AQ_FLAG_ERR_S 2 +#define ICE_AQ_FLAG_VFE_S 3 #define ICE_AQ_FLAG_LB_S 9 #define ICE_AQ_FLAG_RD_S 10 +#define ICE_AQ_FLAG_VFC_S 11 #define ICE_AQ_FLAG_BUF_S 12 #define ICE_AQ_FLAG_SI_S 13 +#define ICE_AQ_FLAG_EI_S 14 +#define ICE_AQ_FLAG_FE_S 15 +#define ICE_AQ_FLAG_DD BIT(ICE_AQ_FLAG_DD_S) /* 0x1 */ +#define ICE_AQ_FLAG_CMP BIT(ICE_AQ_FLAG_CMP_S) /* 
0x2 */ #define ICE_AQ_FLAG_ERR BIT(ICE_AQ_FLAG_ERR_S) /* 0x4 */ +#define ICE_AQ_FLAG_VFE BIT(ICE_AQ_FLAG_VFE_S) /* 0x8 */ #define ICE_AQ_FLAG_LB BIT(ICE_AQ_FLAG_LB_S) /* 0x200 */ #define ICE_AQ_FLAG_RD BIT(ICE_AQ_FLAG_RD_S) /* 0x400 */ +#define ICE_AQ_FLAG_VFC BIT(ICE_AQ_FLAG_VFC_S) /* 0x800 */ #define ICE_AQ_FLAG_BUF BIT(ICE_AQ_FLAG_BUF_S) /* 0x1000 */ #define ICE_AQ_FLAG_SI BIT(ICE_AQ_FLAG_SI_S) /* 0x2000 */ +#define ICE_AQ_FLAG_EI BIT(ICE_AQ_FLAG_EI_S) /* 0x4000 */ +#define ICE_AQ_FLAG_FE BIT(ICE_AQ_FLAG_FE_S) /* 0x8000 */ /* error codes */ enum ice_aq_err { ICE_AQ_RC_OK = 0, /* Success */ ICE_AQ_RC_EPERM = 1, /* Operation not permitted */ ICE_AQ_RC_ENOENT = 2, /* No such element */ + ICE_AQ_RC_ESRCH = 3, /* Bad opcode */ + ICE_AQ_RC_EINTR = 4, /* Operation interrupted */ + ICE_AQ_RC_EIO = 5, /* I/O error */ + ICE_AQ_RC_ENXIO = 6, /* No such resource */ + ICE_AQ_RC_E2BIG = 7, /* Arg too long */ + ICE_AQ_RC_EAGAIN = 8, /* Try again */ ICE_AQ_RC_ENOMEM = 9, /* Out of memory */ + ICE_AQ_RC_EACCES = 10, /* Permission denied */ + ICE_AQ_RC_EFAULT = 11, /* Bad address */ ICE_AQ_RC_EBUSY = 12, /* Device or resource busy */ ICE_AQ_RC_EEXIST = 13, /* Object already exists */ + ICE_AQ_RC_EINVAL = 14, /* Invalid argument */ + ICE_AQ_RC_ENOTTY = 15, /* Not a typewriter */ ICE_AQ_RC_ENOSPC = 16, /* No space left or allocation failure */ ICE_AQ_RC_ENOSYS = 17, /* Function not implemented */ + ICE_AQ_RC_ERANGE = 18, /* Parameter out of range */ + ICE_AQ_RC_EFLUSHED = 19, /* Cmd flushed due to prev cmd error */ + ICE_AQ_RC_BAD_ADDR = 20, /* Descriptor contains a bad pointer */ + ICE_AQ_RC_EMODE = 21, /* Op not allowed in current dev mode */ + ICE_AQ_RC_EFBIG = 22, /* File too big */ + ICE_AQ_RC_ESBCOMP = 23, /* SB-IOSF completion unsuccessful */ ICE_AQ_RC_ENOSEC = 24, /* Missing security manifest */ ICE_AQ_RC_EBADSIG = 25, /* Bad RSA signature */ ICE_AQ_RC_ESVN = 26, /* SVN number prohibits this package */ ICE_AQ_RC_EBADMAN = 27, /* Manifest hash mismatch */ ICE_AQ_RC_EBADBUF = 28, /* Buffer hash mismatches manifest */ + ICE_AQ_RC_EACCES_BMCU = 29, /* BMC Update in progress */ }; /* Admin Queue command opcodes */ @@ -1688,6 +3816,7 @@ enum ice_adminq_opc { ice_aqc_opc_get_ver = 0x0001, ice_aqc_opc_driver_ver = 0x0002, ice_aqc_opc_q_shutdown = 0x0003, + ice_aqc_opc_get_exp_err = 0x0005, /* resource ownership */ ice_aqc_opc_req_res = 0x0008, @@ -1704,77 +3833,200 @@ enum ice_adminq_opc { /* PXE */ ice_aqc_opc_clear_pxe_mode = 0x0110, + ice_aqc_opc_config_no_drop_policy = 0x0112, + /* internal switch commands */ ice_aqc_opc_get_sw_cfg = 0x0200, + ice_aqc_opc_set_port_params = 0x0203, /* Alloc/Free/Get Resources */ + ice_aqc_opc_get_res_alloc = 0x0204, ice_aqc_opc_alloc_res = 0x0208, ice_aqc_opc_free_res = 0x0209, + ice_aqc_opc_get_allocd_res_desc = 0x020A, + ice_aqc_opc_set_vlan_mode_parameters = 0x020C, + ice_aqc_opc_get_vlan_mode_parameters = 0x020D, /* VSI commands */ ice_aqc_opc_add_vsi = 0x0210, ice_aqc_opc_update_vsi = 0x0211, + ice_aqc_opc_get_vsi_params = 0x0212, ice_aqc_opc_free_vsi = 0x0213, + /* Mirroring rules - add/update, delete */ + ice_aqc_opc_add_update_mir_rule = 0x0260, + ice_aqc_opc_del_mir_rule = 0x0261, + + /* storm configuration */ + ice_aqc_opc_set_storm_cfg = 0x0280, + ice_aqc_opc_get_storm_cfg = 0x0281, + + /* recipe commands */ + ice_aqc_opc_add_recipe = 0x0290, + ice_aqc_opc_recipe_to_profile = 0x0291, + ice_aqc_opc_get_recipe = 0x0292, + ice_aqc_opc_get_recipe_to_profile = 0x0293, + /* switch rules population commands */ ice_aqc_opc_add_sw_rules = 0x02A0, 
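	/* (Aside on how these opcodes are consumed; a minimal sketch, using
	 * helpers that live in ice_common.c rather than in this header:
	 *
	 *	struct ice_aq_desc desc;
	 *
	 *	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_pf_cfg);
	 *	status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
	 *
	 * For an indirect command, a buffer and its length are passed instead
	 * of NULL/0, and the control queue code sets the BUF/RD descriptor
	 * flags defined above.)
	 */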
ice_aqc_opc_update_sw_rules = 0x02A1, ice_aqc_opc_remove_sw_rules = 0x02A2, - + ice_aqc_opc_get_sw_rules = 0x02A3, ice_aqc_opc_clear_pf_cfg = 0x02A4, + /* DCB commands */ + ice_aqc_opc_pfc_ignore = 0x0301, + ice_aqc_opc_query_pfc_mode = 0x0302, + ice_aqc_opc_set_pfc_mode = 0x0303, + ice_aqc_opc_set_dcb_params = 0x0306, + /* transmit scheduler commands */ ice_aqc_opc_get_dflt_topo = 0x0400, ice_aqc_opc_add_sched_elems = 0x0401, + ice_aqc_opc_cfg_sched_elems = 0x0403, ice_aqc_opc_get_sched_elems = 0x0404, + ice_aqc_opc_move_sched_elems = 0x0408, ice_aqc_opc_suspend_sched_elems = 0x0409, ice_aqc_opc_resume_sched_elems = 0x040A, ice_aqc_opc_query_port_ets = 0x040E, ice_aqc_opc_delete_sched_elems = 0x040F, + ice_aqc_opc_add_rl_profiles = 0x0410, + ice_aqc_opc_query_rl_profiles = 0x0411, ice_aqc_opc_query_sched_res = 0x0412, + ice_aqc_opc_query_node_to_root = 0x0413, + ice_aqc_opc_cfg_l2_node_cgd = 0x0414, + ice_aqc_opc_remove_rl_profiles = 0x0415, /* PHY commands */ ice_aqc_opc_get_phy_caps = 0x0600, ice_aqc_opc_set_phy_cfg = 0x0601, + ice_aqc_opc_set_mac_cfg = 0x0603, ice_aqc_opc_restart_an = 0x0605, ice_aqc_opc_get_link_status = 0x0607, ice_aqc_opc_set_event_mask = 0x0613, ice_aqc_opc_set_mac_lb = 0x0620, + ice_aqc_opc_dnl_get_status = 0x0680, + ice_aqc_opc_dnl_run = 0x0681, + ice_aqc_opc_dnl_call = 0x0682, + ice_aqc_opc_dnl_read_sto = 0x0683, + ice_aqc_opc_dnl_write_sto = 0x0684, + ice_aqc_opc_dnl_set_breakpoints = 0x0686, + ice_aqc_opc_dnl_read_log = 0x0687, + ice_aqc_opc_get_link_topo = 0x06E0, + ice_aqc_opc_get_link_topo_pin = 0x06E1, + ice_aqc_opc_read_i2c = 0x06E2, + ice_aqc_opc_write_i2c = 0x06E3, + ice_aqc_opc_read_mdio = 0x06E4, + ice_aqc_opc_write_mdio = 0x06E5, + ice_aqc_opc_set_gpio_by_func = 0x06E6, + ice_aqc_opc_get_gpio_by_func = 0x06E7, + ice_aqc_opc_set_led = 0x06E8, ice_aqc_opc_set_port_id_led = 0x06E9, + ice_aqc_opc_get_port_options = 0x06EA, + ice_aqc_opc_set_port_option = 0x06EB, + ice_aqc_opc_set_gpio = 0x06EC, + ice_aqc_opc_get_gpio = 0x06ED, + ice_aqc_opc_sff_eeprom = 0x06EE, + ice_aqc_opc_sw_set_gpio = 0x06EF, + ice_aqc_opc_sw_get_gpio = 0x06F0, + ice_aqc_opc_prog_topo_dev_nvm = 0x06F2, + ice_aqc_opc_read_topo_dev_nvm = 0x06F3, /* NVM commands */ ice_aqc_opc_nvm_read = 0x0701, + ice_aqc_opc_nvm_erase = 0x0702, + ice_aqc_opc_nvm_write = 0x0703, + ice_aqc_opc_nvm_cfg_read = 0x0704, + ice_aqc_opc_nvm_cfg_write = 0x0705, ice_aqc_opc_nvm_checksum = 0x0706, + ice_aqc_opc_nvm_write_activate = 0x0707, + ice_aqc_opc_nvm_sr_dump = 0x0707, + ice_aqc_opc_nvm_save_factory_settings = 0x0708, + ice_aqc_opc_nvm_update_empr = 0x0709, + ice_aqc_opc_nvm_pkg_data = 0x070A, + ice_aqc_opc_nvm_pass_component_tbl = 0x070B, /* PF/VF mailbox commands */ ice_mbx_opc_send_msg_to_pf = 0x0801, ice_mbx_opc_send_msg_to_vf = 0x0802, + /* Peer driver communication mailbox commands */ + ice_mbx_opc_send_to_peer_pf = 0x0803, + ice_mbx_opc_send_to_peer_drv = 0x0804, /* LLDP commands */ ice_aqc_opc_lldp_get_mib = 0x0A00, ice_aqc_opc_lldp_set_mib_change = 0x0A01, + ice_aqc_opc_lldp_add_tlv = 0x0A02, + ice_aqc_opc_lldp_update_tlv = 0x0A03, + ice_aqc_opc_lldp_delete_tlv = 0x0A04, ice_aqc_opc_lldp_stop = 0x0A05, ice_aqc_opc_lldp_start = 0x0A06, ice_aqc_opc_get_cee_dcb_cfg = 0x0A07, ice_aqc_opc_lldp_set_local_mib = 0x0A08, ice_aqc_opc_lldp_stop_start_specific_agent = 0x0A09, + ice_aqc_opc_lldp_filter_ctrl = 0x0A0A, /* RSS commands */ ice_aqc_opc_set_rss_key = 0x0B02, ice_aqc_opc_set_rss_lut = 0x0B03, ice_aqc_opc_get_rss_key = 0x0B04, ice_aqc_opc_get_rss_lut = 0x0B05, + ice_aqc_opc_clear_fd_table = 0x0B06, + /* 
Sideband Control Interface commands */
+	ice_aqc_opc_neighbour_device_request = 0x0C00,
+	/* ACL commands */
+	ice_aqc_opc_alloc_acl_tbl = 0x0C10,
+	ice_aqc_opc_dealloc_acl_tbl = 0x0C11,
+	ice_aqc_opc_alloc_acl_actpair = 0x0C12,
+	ice_aqc_opc_dealloc_acl_actpair = 0x0C13,
+	ice_aqc_opc_alloc_acl_scen = 0x0C14,
+	ice_aqc_opc_dealloc_acl_scen = 0x0C15,
+	ice_aqc_opc_alloc_acl_counters = 0x0C16,
+	ice_aqc_opc_dealloc_acl_counters = 0x0C17,
+	ice_aqc_opc_dealloc_acl_res = 0x0C1A,
+	ice_aqc_opc_update_acl_scen = 0x0C1B,
+	ice_aqc_opc_program_acl_actpair = 0x0C1C,
+	ice_aqc_opc_program_acl_prof_extraction = 0x0C1D,
+	ice_aqc_opc_program_acl_prof_ranges = 0x0C1E,
+	ice_aqc_opc_program_acl_entry = 0x0C20,
+	ice_aqc_opc_query_acl_prof = 0x0C21,
+	ice_aqc_opc_query_acl_prof_ranges = 0x0C22,
+	ice_aqc_opc_query_acl_scen = 0x0C23,
+	ice_aqc_opc_query_acl_entry = 0x0C24,
+	ice_aqc_opc_query_acl_actpair = 0x0C25,
+	ice_aqc_opc_query_acl_counter = 0x0C27,
 	/* Tx queue handling commands/events */
 	ice_aqc_opc_add_txqs = 0x0C30,
 	ice_aqc_opc_dis_txqs = 0x0C31,
+	ice_aqc_opc_txqs_cleanup = 0x0C31,
+	ice_aqc_opc_move_recfg_txqs = 0x0C32,
+	ice_aqc_opc_add_rdma_qset = 0x0C33,
+	ice_aqc_opc_move_rdma_qset = 0x0C34,
 	/* package commands */
 	ice_aqc_opc_download_pkg = 0x0C40,
+	ice_aqc_opc_upload_section = 0x0C41,
+	ice_aqc_opc_update_pkg = 0x0C42,
 	ice_aqc_opc_get_pkg_info_list = 0x0C43,
-	/* debug commands */
-	ice_aqc_opc_fw_logging = 0xFF09,
-	ice_aqc_opc_fw_logging_info = 0xFF10,
+
+	ice_aqc_opc_driver_shared_params = 0x0C90,
+
+	/* Standalone Commands/Events */
+	ice_aqc_opc_event_lan_overflow = 0x1001,
+	/* System Diagnostic commands */
+	ice_aqc_opc_set_health_status_config = 0xFF20,
+	ice_aqc_opc_get_supported_health_status_codes = 0xFF21,
+	ice_aqc_opc_get_health_status = 0xFF22,
+	ice_aqc_opc_clear_health_status = 0xFF23,
+
+	/* FW Logging Commands */
+	ice_aqc_opc_fw_logs_config = 0xFF30,
+	ice_aqc_opc_fw_logs_register = 0xFF31,
+	ice_aqc_opc_fw_logs_query = 0xFF32,
+	ice_aqc_opc_fw_logs_event = 0xFF33,
+	ice_aqc_opc_fw_logs_get = 0xFF34,
+	ice_aqc_opc_fw_logs_clear = 0xFF35
 };
 #endif /* _ICE_ADMINQ_CMD_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..b059cd91a7e06ccef9e4ab0ce483eaee073a6077
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_arfs.c
@@ -0,0 +1,701 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#include "ice.h"
+
+/**
+ * ice_is_arfs_active - helper to check if aRFS is active
+ * @vsi: VSI to check
+ */
+static bool ice_is_arfs_active(struct ice_vsi *vsi)
+{
+	return !!vsi->arfs_fltr_list;
+}
+
+/**
+ * ice_is_arfs_using_perfect_flow - check if aRFS has active perfect filters
+ * @hw: pointer to the HW structure
+ * @flow_type: flow type as Flow Director understands it
+ *
+ * Flow Director will query this function to see if aRFS is currently using
+ * the specified flow_type for perfect (4-tuple) filters.
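+ *
+ * A minimal sketch of a call site (hypothetical; the concrete caller lives
+ * in the Flow Director code, not in this file):
+ *
+ *	if (ice_is_arfs_using_perfect_flow(hw, ICE_FLTR_PTYPE_NONF_IPV4_TCP))
+ *		return -EBUSY;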
+ */
+bool
+ice_is_arfs_using_perfect_flow(struct ice_hw *hw, enum ice_fltr_ptype flow_type)
+{
+	struct ice_arfs_active_fltr_cntrs *arfs_fltr_cntrs;
+	struct ice_pf *pf = hw->back;
+	struct ice_vsi *vsi;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi)
+		return false;
+
+	arfs_fltr_cntrs = vsi->arfs_fltr_cntrs;
+
+	/* active counters can be updated by multiple CPUs */
+	smp_mb__before_atomic();
+	switch (flow_type) {
+	case ICE_FLTR_PTYPE_NONF_IPV4_UDP:
+		return atomic_read(&arfs_fltr_cntrs->active_udpv4_cnt) > 0;
+	case ICE_FLTR_PTYPE_NONF_IPV6_UDP:
+		return atomic_read(&arfs_fltr_cntrs->active_udpv6_cnt) > 0;
+	case ICE_FLTR_PTYPE_NONF_IPV4_TCP:
+		return atomic_read(&arfs_fltr_cntrs->active_tcpv4_cnt) > 0;
+	case ICE_FLTR_PTYPE_NONF_IPV6_TCP:
+		return atomic_read(&arfs_fltr_cntrs->active_tcpv6_cnt) > 0;
+	default:
+		return false;
+	}
+}
+
+/**
+ * ice_arfs_update_active_fltr_cntrs - update active filter counters for aRFS
+ * @vsi: VSI that aRFS is active on
+ * @entry: aRFS entry used to change counters
+ * @add: true to increment counter, false to decrement
+ */
+static void
+ice_arfs_update_active_fltr_cntrs(struct ice_vsi *vsi,
+				  struct ice_arfs_entry *entry, bool add)
+{
+	struct ice_arfs_active_fltr_cntrs *fltr_cntrs = vsi->arfs_fltr_cntrs;
+
+	switch (entry->fltr_info.flow_type) {
+	case ICE_FLTR_PTYPE_NONF_IPV4_TCP:
+		if (add)
+			atomic_inc(&fltr_cntrs->active_tcpv4_cnt);
+		else
+			atomic_dec(&fltr_cntrs->active_tcpv4_cnt);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_TCP:
+		if (add)
+			atomic_inc(&fltr_cntrs->active_tcpv6_cnt);
+		else
+			atomic_dec(&fltr_cntrs->active_tcpv6_cnt);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV4_UDP:
+		if (add)
+			atomic_inc(&fltr_cntrs->active_udpv4_cnt);
+		else
+			atomic_dec(&fltr_cntrs->active_udpv4_cnt);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_UDP:
+		if (add)
+			atomic_inc(&fltr_cntrs->active_udpv6_cnt);
+		else
+			atomic_dec(&fltr_cntrs->active_udpv6_cnt);
+		break;
+	default:
+		dev_err(ice_pf_to_dev(vsi->back), "aRFS: Failed to update filter counters, invalid filter type %d\n",
+			entry->fltr_info.flow_type);
+	}
+}
+
+/**
+ * ice_arfs_del_flow_rules - delete the rules passed in from HW
+ * @vsi: VSI for the flow rules that need to be deleted
+ * @del_list_head: head of the list of ice_arfs_entry(s) for rule deletion
+ *
+ * Loop through the delete list passed in and remove the rules from HW. After
+ * each rule is deleted, disconnect and free the ice_arfs_entry because it is no
+ * longer being referenced by the aRFS hash table.
+ */
+static void
+ice_arfs_del_flow_rules(struct ice_vsi *vsi, struct hlist_head *del_list_head)
+{
+	struct ice_arfs_entry *e;
+	struct hlist_node *n;
+	struct device *dev;
+
+	dev = ice_pf_to_dev(vsi->back);
+
+	hlist_for_each_entry_safe(e, n, del_list_head, list_entry) {
+		int result;
+
+		result = ice_fdir_write_fltr(vsi->back, &e->fltr_info, false,
+					     false);
+		if (!result)
+			ice_arfs_update_active_fltr_cntrs(vsi, e, false);
+		else
+			dev_dbg(dev, "Unable to delete aRFS entry, err %d fltr_state %d fltr_id %d flow_id %d Q %d\n",
+				result, e->fltr_state, e->fltr_info.fltr_id,
+				e->flow_id, e->fltr_info.q_index);
+
+		/* The aRFS hash table is no longer referencing this entry */
+		hlist_del(&e->list_entry);
+		devm_kfree(dev, e);
+	}
+}
+
+/**
+ * ice_arfs_add_flow_rules - add the rules passed in to HW
+ * @vsi: VSI for the flow rules that need to be added
+ * @add_list_head: head of the list of ice_arfs_entry_ptr(s) for rule addition
+ *
+ * Loop through the add list passed in and add the rules to HW.
After each + * rule is added, disconnect and free the ice_arfs_entry_ptr node. Don't free + * the ice_arfs_entry(s) because they are still being referenced in the aRFS + * hash table. + */ +static void +ice_arfs_add_flow_rules(struct ice_vsi *vsi, struct hlist_head *add_list_head) +{ + struct ice_arfs_entry_ptr *ep; + struct hlist_node *n; + struct device *dev; + + dev = ice_pf_to_dev(vsi->back); + + hlist_for_each_entry_safe(ep, n, add_list_head, list_entry) { + int result; + + result = ice_fdir_write_fltr(vsi->back, + &ep->arfs_entry->fltr_info, true, + false); + if (!result) + ice_arfs_update_active_fltr_cntrs(vsi, ep->arfs_entry, + true); + else + dev_dbg(dev, "Unable to add aRFS entry, err %d fltr_state %d fltr_id %d flow_id %d Q %d\n", + result, ep->arfs_entry->fltr_state, + ep->arfs_entry->fltr_info.fltr_id, + ep->arfs_entry->flow_id, + ep->arfs_entry->fltr_info.q_index); + + hlist_del(&ep->list_entry); + devm_kfree(dev, ep); + } +} + +/** + * ice_arfs_is_flow_expired - check if the aRFS entry has expired + * @vsi: VSI containing the aRFS entry + * @arfs_entry: aRFS entry that's being checked for expiration + * + * Return true if the flow has expired, else false. This function should be used + * to determine whether or not an aRFS entry should be removed from the hardware + * and software structures. + */ +static bool +ice_arfs_is_flow_expired(struct ice_vsi *vsi, struct ice_arfs_entry *arfs_entry) +{ +#define ICE_ARFS_TIME_DELTA_EXPIRATION msecs_to_jiffies(5000) + if (rps_may_expire_flow(vsi->netdev, arfs_entry->fltr_info.q_index, + arfs_entry->flow_id, + arfs_entry->fltr_info.fltr_id)) + return true; + + /* expiration timer only used for UDP filters */ + if (arfs_entry->fltr_info.flow_type != ICE_FLTR_PTYPE_NONF_IPV4_UDP && + arfs_entry->fltr_info.flow_type != ICE_FLTR_PTYPE_NONF_IPV6_UDP) + return false; + + return time_in_range64(arfs_entry->time_activated + + ICE_ARFS_TIME_DELTA_EXPIRATION, + arfs_entry->time_activated, get_jiffies_64()); +} + +/** + * ice_arfs_update_flow_rules - add/delete aRFS rules in HW + * @vsi: the VSI to be forwarded to + * @idx: index into the table of aRFS filter lists. Obtained from skb->hash + * @add_list: list to populate with filters to be added to Flow Director + * @del_list: list to populate with filters to be deleted from Flow Director + * + * Iterate over the hlist at the index given in the aRFS hash table and + * determine if there are any aRFS entries that need to be either added or + * deleted in the HW. If the aRFS entry is marked as ICE_ARFS_INACTIVE the + * filter needs to be added to HW, else if it's marked as ICE_ARFS_ACTIVE and + * the flow has expired delete the filter from HW. The caller of this function + * is expected to add/delete rules on the add_list/del_list respectively. 
+ */ +static void +ice_arfs_update_flow_rules(struct ice_vsi *vsi, u16 idx, + struct hlist_head *add_list, + struct hlist_head *del_list) +{ + struct ice_arfs_entry *e; + struct hlist_node *n; + struct device *dev; + + dev = ice_pf_to_dev(vsi->back); + + /* go through the aRFS hlist at this idx and check for needed updates */ + hlist_for_each_entry_safe(e, n, &vsi->arfs_fltr_list[idx], list_entry) + /* check if filter needs to be added to HW */ + if (e->fltr_state == ICE_ARFS_INACTIVE) { + enum ice_fltr_ptype flow_type = e->fltr_info.flow_type; + struct ice_arfs_entry_ptr *ep = + devm_kzalloc(dev, sizeof(*ep), GFP_ATOMIC); + + if (!ep) + continue; + INIT_HLIST_NODE(&ep->list_entry); + /* reference aRFS entry to add HW filter */ + ep->arfs_entry = e; + hlist_add_head(&ep->list_entry, add_list); + e->fltr_state = ICE_ARFS_ACTIVE; + /* expiration timer only used for UDP flows */ + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_UDP) + e->time_activated = get_jiffies_64(); + } else if (e->fltr_state == ICE_ARFS_ACTIVE) { + /* check if filter needs to be removed from HW */ + if (ice_arfs_is_flow_expired(vsi, e)) { + /* remove aRFS entry from hash table for delete + * and to prevent referencing it the next time + * through this hlist index + */ + hlist_del(&e->list_entry); + e->fltr_state = ICE_ARFS_TODEL; + /* save reference to aRFS entry for delete */ + hlist_add_head(&e->list_entry, del_list); + } + } +} + +/** + * ice_sync_arfs_fltrs - update all aRFS filters + * @pf: board private structure + */ +void ice_sync_arfs_fltrs(struct ice_pf *pf) +{ + HLIST_HEAD(tmp_del_list); + HLIST_HEAD(tmp_add_list); + struct ice_vsi *pf_vsi; + unsigned int i; + + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) + return; + + if (!ice_is_arfs_active(pf_vsi)) + return; + + spin_lock_bh(&pf_vsi->arfs_lock); + /* Once we process aRFS for the PF VSI get out */ + for (i = 0; i < ICE_MAX_ARFS_LIST; i++) + ice_arfs_update_flow_rules(pf_vsi, i, &tmp_add_list, + &tmp_del_list); + spin_unlock_bh(&pf_vsi->arfs_lock); + + /* use list of ice_arfs_entry(s) for delete */ + ice_arfs_del_flow_rules(pf_vsi, &tmp_del_list); + + /* use list of ice_arfs_entry_ptr(s) for add */ + ice_arfs_add_flow_rules(pf_vsi, &tmp_add_list); +} + +#ifdef ICE_ADD_PROBES +static u16 +ice_arfs_get_cnt_index(struct ice_pf *pf, struct ice_arfs_entry *entry) +{ + struct ice_hw *hw = &pf->hw; + + switch (entry->fltr_info.flow_type) { + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + return ICE_ARFS_STAT_TCPV4_IDX(hw->fd_ctr_base); + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + return ICE_ARFS_STAT_TCPV6_IDX(hw->fd_ctr_base); + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + return ICE_ARFS_STAT_UDPV4_IDX(hw->fd_ctr_base); + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + return ICE_ARFS_STAT_UDPV6_IDX(hw->fd_ctr_base); + default: + dev_err(ice_pf_to_dev(pf), "aRFS: Invalid flow type %d\n", + entry->fltr_info.flow_type); + return ICE_FD_SB_STAT_IDX(hw->fd_ctr_base); + } +} +#endif /* ICE_ADD_PROBES */ + +/** + * ice_arfs_build_entry - builds an aRFS entry based on input + * @vsi: destination VSI for this flow + * @fk: flow dissector keys for creating the tuple + * @rxq_idx: Rx queue to steer this flow to + * @flow_id: passed down from the stack and saved for flow expiration + * + * returns an aRFS entry on success and NULL on failure + */ +static struct ice_arfs_entry * +ice_arfs_build_entry(struct ice_vsi *vsi, const struct flow_keys *fk, + u16 rxq_idx, u32 flow_id) +{ + struct ice_arfs_entry *arfs_entry; + struct ice_fdir_fltr *fltr_info; + u8 ip_proto; + 
+ arfs_entry = devm_kzalloc(ice_pf_to_dev(vsi->back), + sizeof(*arfs_entry), + GFP_ATOMIC | __GFP_NOWARN); + if (!arfs_entry) + return NULL; + + fltr_info = &arfs_entry->fltr_info; + fltr_info->q_index = rxq_idx; + fltr_info->dest_ctl = ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QINDEX; + fltr_info->dest_vsi = vsi->idx; + fltr_info->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_THREE; + fltr_info->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL; + ip_proto = fk->basic.ip_proto; + + if (fk->basic.n_proto == htons(ETH_P_IP)) { + fltr_info->ip.v4.proto = ip_proto; + fltr_info->flow_type = (ip_proto == IPPROTO_TCP) ? + ICE_FLTR_PTYPE_NONF_IPV4_TCP : + ICE_FLTR_PTYPE_NONF_IPV4_UDP; + fltr_info->ip.v4.src_ip = fk->addrs.v4addrs.src; + fltr_info->ip.v4.dst_ip = fk->addrs.v4addrs.dst; + fltr_info->ip.v4.src_port = fk->ports.src; + fltr_info->ip.v4.dst_port = fk->ports.dst; + } else { /* ETH_P_IPV6 */ + fltr_info->ip.v6.proto = ip_proto; + fltr_info->flow_type = (ip_proto == IPPROTO_TCP) ? + ICE_FLTR_PTYPE_NONF_IPV6_TCP : + ICE_FLTR_PTYPE_NONF_IPV6_UDP; + memcpy(&fltr_info->ip.v6.src_ip, &fk->addrs.v6addrs.src, + sizeof(struct in6_addr)); + memcpy(&fltr_info->ip.v6.dst_ip, &fk->addrs.v6addrs.dst, + sizeof(struct in6_addr)); + fltr_info->ip.v6.src_port = fk->ports.src; + fltr_info->ip.v6.dst_port = fk->ports.dst; + } + +#ifdef ICE_ADD_PROBES + fltr_info->cnt_index = + ice_arfs_get_cnt_index(vsi->back, arfs_entry); + fltr_info->cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; +#endif /* ICE_ADD_PROBES */ + + arfs_entry->flow_id = flow_id; + fltr_info->fltr_id = + atomic_inc_return(vsi->arfs_last_fltr_id) % RPS_NO_FILTER; + + return arfs_entry; +} + +/** + * ice_arfs_is_perfect_flow_set - Check to see if perfect flow is set + * @hw: pointer to HW structure + * @l3_proto: ETH_P_IP or ETH_P_IPV6 in network order + * @l4_proto: IPPROTO_UDP or IPPROTO_TCP + * + * We only support perfect (4-tuple) filters for aRFS. This function allows aRFS + * to check if perfect (4-tuple) flow rules are currently in place by Flow + * Director. + */ +static bool +ice_arfs_is_perfect_flow_set(struct ice_hw *hw, __be16 l3_proto, u8 l4_proto) +{ + unsigned long *perfect_fltr = hw->fdir_perfect_fltr; + + /* advanced Flow Director disabled, perfect filters always supported */ + if (!perfect_fltr) + return true; + + if (l3_proto == htons(ETH_P_IP) && l4_proto == IPPROTO_UDP) + return test_bit(ICE_FLTR_PTYPE_NONF_IPV4_UDP, perfect_fltr); + else if (l3_proto == htons(ETH_P_IP) && l4_proto == IPPROTO_TCP) + return test_bit(ICE_FLTR_PTYPE_NONF_IPV4_TCP, perfect_fltr); + else if (l3_proto == htons(ETH_P_IPV6) && l4_proto == IPPROTO_UDP) + return test_bit(ICE_FLTR_PTYPE_NONF_IPV6_UDP, perfect_fltr); + else if (l3_proto == htons(ETH_P_IPV6) && l4_proto == IPPROTO_TCP) + return test_bit(ICE_FLTR_PTYPE_NONF_IPV6_TCP, perfect_fltr); + + return false; +} + +/** + * ice_rx_flow_steer - steer the Rx flow to where application is being run + * @netdev: ptr to the netdev being adjusted + * @skb: buffer with required header information + * @rxq_idx: queue to which the flow needs to move + * @flow_id: flow identifier provided by the netdev + * + * Based on the skb, rxq_idx, and flow_id passed in add/update an entry in the + * aRFS hash table. Iterate over one of the hlists in the aRFS hash table and + * if the flow_id already exists in the hash table but the rxq_idx has changed + * mark the entry as ICE_ARFS_INACTIVE so it can get updated in HW, else + * if the entry is marked as ICE_ARFS_TODEL delete it from the aRFS hash table. 
+ * If neither of the previous conditions are true then add a new entry in the + * aRFS hash table, which gets set to ICE_ARFS_INACTIVE by default so it can be + * added to HW. + */ +int +ice_rx_flow_steer(struct net_device *netdev, const struct sk_buff *skb, + u16 rxq_idx, u32 flow_id) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_arfs_entry *arfs_entry; + struct ice_vsi *vsi = np->vsi; + struct flow_keys fk; + struct ice_pf *pf; + __be16 n_proto; + u8 ip_proto; + u16 idx; + int ret; + + /* failed to allocate memory for aRFS so don't crash */ + if (unlikely(!vsi->arfs_fltr_list)) + return -ENODEV; + + pf = vsi->back; + +#ifdef NETIF_F_HW_TC +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* aRFS only supported on Rx queues belonging to PF VSI */ + if (vsi->type == ICE_VSI_PF && ice_is_adq_active(pf) && + rxq_idx >= vsi->mqprio_qopt.qopt.count[0]) + return -EOPNOTSUPP; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#endif /* NETIF_F_HW_TC */ + + if (skb->encapsulation) + return -EPROTONOSUPPORT; + + if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) + return -EPROTONOSUPPORT; + + n_proto = fk.basic.n_proto; + /* Support only IPV4 and IPV6 */ + if ((n_proto == htons(ETH_P_IP) && !ip_is_fragment(ip_hdr(skb))) || + n_proto == htons(ETH_P_IPV6)) + ip_proto = fk.basic.ip_proto; + else + return -EPROTONOSUPPORT; + + /* Support only TCP and UDP */ + if (ip_proto != IPPROTO_TCP && ip_proto != IPPROTO_UDP) + return -EPROTONOSUPPORT; + + /* only support 4-tuple filters for aRFS */ + if (!ice_arfs_is_perfect_flow_set(&pf->hw, n_proto, ip_proto)) + return -EOPNOTSUPP; + + /* choose the aRFS list bucket based on skb hash */ + idx = skb_get_hash_raw(skb) & ICE_ARFS_LST_MASK; + /* search for entry in the bucket */ + spin_lock_bh(&vsi->arfs_lock); + hlist_for_each_entry(arfs_entry, &vsi->arfs_fltr_list[idx], + list_entry) { + struct ice_fdir_fltr *fltr_info; + + /* keep searching for the already existing arfs_entry flow */ + if (arfs_entry->flow_id != flow_id) + continue; + + fltr_info = &arfs_entry->fltr_info; + ret = fltr_info->fltr_id; + + if (fltr_info->q_index == rxq_idx || + arfs_entry->fltr_state != ICE_ARFS_ACTIVE) + goto out; + + /* update the queue to forward to on an already existing flow */ + fltr_info->q_index = rxq_idx; + arfs_entry->fltr_state = ICE_ARFS_INACTIVE; + ice_arfs_update_active_fltr_cntrs(vsi, arfs_entry, false); + goto out_schedule_service_task; + } + + arfs_entry = ice_arfs_build_entry(vsi, &fk, rxq_idx, flow_id); + if (!arfs_entry) { + ret = -ENOMEM; + goto out; + } + + ret = arfs_entry->fltr_info.fltr_id; + INIT_HLIST_NODE(&arfs_entry->list_entry); + hlist_add_head(&arfs_entry->list_entry, &vsi->arfs_fltr_list[idx]); +out_schedule_service_task: + ice_service_task_schedule(pf); +out: + spin_unlock_bh(&vsi->arfs_lock); + return ret; +} + +/** + * ice_init_arfs_cntrs - initialize aRFS counter values + * @vsi: VSI that aRFS counters need to be initialized on + */ +static int ice_init_arfs_cntrs(struct ice_vsi *vsi) +{ + if (!vsi || vsi->type != ICE_VSI_PF) + return -EINVAL; + + vsi->arfs_fltr_cntrs = kzalloc(sizeof(*vsi->arfs_fltr_cntrs), + GFP_KERNEL); + if (!vsi->arfs_fltr_cntrs) + return -ENOMEM; + + vsi->arfs_last_fltr_id = kzalloc(sizeof(*vsi->arfs_last_fltr_id), + GFP_KERNEL); + if (!vsi->arfs_last_fltr_id) { + kfree(vsi->arfs_fltr_cntrs); + vsi->arfs_fltr_cntrs = NULL; + return -ENOMEM; + } + + return 0; +} + +/** + * ice_init_arfs - initialize aRFS resources + * @vsi: the VSI to be forwarded to + */ +void ice_init_arfs(struct ice_vsi *vsi) +{ + struct 
hlist_head *arfs_fltr_list; + unsigned int i; + + if (!vsi || vsi->type != ICE_VSI_PF) + return; + + arfs_fltr_list = kzalloc(sizeof(*arfs_fltr_list) * ICE_MAX_ARFS_LIST, + GFP_KERNEL); + if (!arfs_fltr_list) + return; + + if (ice_init_arfs_cntrs(vsi)) + goto free_arfs_fltr_list; + + for (i = 0; i < ICE_MAX_ARFS_LIST; i++) + INIT_HLIST_HEAD(&arfs_fltr_list[i]); + + spin_lock_init(&vsi->arfs_lock); + + vsi->arfs_fltr_list = arfs_fltr_list; + + return; + +free_arfs_fltr_list: + kfree(arfs_fltr_list); +} + +/** + * ice_clear_arfs - clear the aRFS hash table and any memory used for aRFS + * @vsi: the VSI to be forwarded to + */ +void ice_clear_arfs(struct ice_vsi *vsi) +{ + struct device *dev; + unsigned int i; + + if (!vsi || vsi->type != ICE_VSI_PF || !vsi->back || + !vsi->arfs_fltr_list) + return; + + dev = ice_pf_to_dev(vsi->back); + for (i = 0; i < ICE_MAX_ARFS_LIST; i++) { + struct ice_arfs_entry *r; + struct hlist_node *n; + + spin_lock_bh(&vsi->arfs_lock); + hlist_for_each_entry_safe(r, n, &vsi->arfs_fltr_list[i], + list_entry) { + hlist_del(&r->list_entry); + devm_kfree(dev, r); + } + spin_unlock_bh(&vsi->arfs_lock); + } + + kfree(vsi->arfs_fltr_list); + vsi->arfs_fltr_list = NULL; + kfree(vsi->arfs_last_fltr_id); + vsi->arfs_last_fltr_id = NULL; + kfree(vsi->arfs_fltr_cntrs); + vsi->arfs_fltr_cntrs = NULL; +} + +/** + * ice_free_cpu_rx_rmap - free setup cpu reverse map + * @vsi: the VSI to be forwarded to + */ +static void ice_free_cpu_rx_rmap(struct ice_vsi *vsi) +{ + struct net_device *netdev; + + if (!vsi || vsi->type != ICE_VSI_PF || !vsi->arfs_fltr_list) + return; + + netdev = vsi->netdev; + if (!netdev || !netdev->rx_cpu_rmap) + return; + + free_irq_cpu_rmap(netdev->rx_cpu_rmap); + netdev->rx_cpu_rmap = NULL; +} + +/** + * ice_set_cpu_rx_rmap - setup cpu reverse map for each queue + * @vsi: the VSI to be forwarded to + */ +int ice_set_cpu_rx_rmap(struct ice_vsi *vsi) +{ + struct net_device *netdev; + struct ice_pf *pf; + int base_idx, i; + + if (!vsi || vsi->type != ICE_VSI_PF) + return -EINVAL; + + pf = vsi->back; + netdev = vsi->netdev; + if (!pf || !netdev || !vsi->num_q_vectors) + return -EINVAL; + + netdev_dbg(netdev, "Setup CPU RMAP: vsi type 0x%x, ifname %s, q_vectors %d\n", + vsi->type, netdev->name, vsi->num_q_vectors); + + netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(vsi->num_q_vectors); + if (unlikely(!netdev->rx_cpu_rmap)) + return -EINVAL; + + base_idx = vsi->base_vector; + for (i = 0; i < vsi->num_q_vectors; i++) + if (irq_cpu_rmap_add(netdev->rx_cpu_rmap, + pf->msix_entries[base_idx + i].vector)) { + ice_free_cpu_rx_rmap(vsi); + return -EINVAL; + } + + return 0; +} + +/** + * ice_remove_arfs - remove/clear all aRFS resources + * @pf: device private structure + */ +void ice_remove_arfs(struct ice_pf *pf) +{ + struct ice_vsi *pf_vsi; + + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) + return; + + ice_free_cpu_rx_rmap(pf_vsi); + ice_clear_arfs(pf_vsi); +} + +/** + * ice_rebuild_arfs - remove/clear all aRFS resources and rebuild after reset + * @pf: device private structure + */ +void ice_rebuild_arfs(struct ice_pf *pf) +{ + struct ice_vsi *pf_vsi; + + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) + return; + + ice_remove_arfs(pf); + if (ice_set_cpu_rx_rmap(pf_vsi)) { + dev_err(ice_pf_to_dev(pf), "Failed to rebuild aRFS\n"); + return; + } + ice_init_arfs(pf_vsi); +} diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.h b/drivers/net/ethernet/intel/ice/ice_arfs.h new file mode 100644 index 0000000000000000000000000000000000000000..7b9346fa73c8d9c1914df4d6dacf1fa8c4725cd7 
--- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_arfs.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_ARFS_H_ +#define _ICE_ARFS_H_ + +#include "ice.h" + +enum ice_arfs_fltr_state { + ICE_ARFS_INACTIVE, + ICE_ARFS_ACTIVE, + ICE_ARFS_TODEL, +}; + +struct ice_arfs_entry { + struct ice_fdir_fltr fltr_info; + struct hlist_node list_entry; + u64 time_activated; /* only valid for UDP flows */ + u32 flow_id; + /* fltr_state = 0 - ICE_ARFS_INACTIVE: + * filter needs to be updated or programmed in HW. + * fltr_state = 1 - ICE_ARFS_ACTIVE: + * filter is active and programmed in HW. + * fltr_state = 2 - ICE_ARFS_TODEL: + * filter has been deleted from HW and needs to be removed from + * the aRFS hash table. + */ + u8 fltr_state; +}; + +struct ice_arfs_entry_ptr { + struct ice_arfs_entry *arfs_entry; + struct hlist_node list_entry; +}; + +struct ice_arfs_active_fltr_cntrs { + atomic_t active_tcpv4_cnt; + atomic_t active_tcpv6_cnt; + atomic_t active_udpv4_cnt; + atomic_t active_udpv6_cnt; +}; + +#ifdef CONFIG_RFS_ACCEL +int +ice_rx_flow_steer(struct net_device *netdev, const struct sk_buff *skb, + u16 rxq_idx, u32 flow_id); +void ice_clear_arfs(struct ice_vsi *vsi); +void ice_init_arfs(struct ice_vsi *vsi); +void ice_sync_arfs_fltrs(struct ice_pf *pf); +int ice_set_cpu_rx_rmap(struct ice_vsi *vsi); +void ice_remove_arfs(struct ice_pf *pf); +void ice_rebuild_arfs(struct ice_pf *pf); +bool +ice_is_arfs_using_perfect_flow(struct ice_hw *hw, + enum ice_fltr_ptype flow_type); +#else +static inline void ice_clear_arfs(struct ice_vsi *vsi) { } +static inline void ice_init_arfs(struct ice_vsi *vsi) { } +static inline void ice_sync_arfs_fltrs(struct ice_pf *pf) { } +static inline void ice_remove_arfs(struct ice_pf *pf) { } +static inline void ice_rebuild_arfs(struct ice_pf *pf) { } + +static inline int ice_set_cpu_rx_rmap(struct ice_vsi __always_unused *vsi) +{ + return 0; +} + +static inline int +ice_rx_flow_steer(struct net_device __always_unused *netdev, + const struct sk_buff __always_unused *skb, + u16 __always_unused rxq_idx, u32 __always_unused flow_id) +{ + return -EOPNOTSUPP; +} + +static inline bool +ice_is_arfs_using_perfect_flow(struct ice_hw __always_unused *hw, + enum ice_fltr_ptype __always_unused flow_type) +{ + return false; +} +#endif /* CONFIG_RFS_ACCEL */ +#endif /* _ICE_ARFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c new file mode 100644 index 0000000000000000000000000000000000000000..e2acab0c0777cdbfca5a602f4b5189506cf7b88a --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -0,0 +1,1045 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_base.h" +#include "ice_lib.h" +#include "ice_dcb_lib.h" +#include "ice_virtchnl_pf.h" + +/** + * __ice_vsi_get_qs_contig - Assign a contiguous chunk of queues to VSI + * @qs_cfg: gathered variables needed for PF->VSI queues assignment + * + * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap + */ +static int __ice_vsi_get_qs_contig(struct ice_qs_cfg *qs_cfg) +{ + unsigned int offset, i; + + mutex_lock(qs_cfg->qs_mutex); + offset = bitmap_find_next_zero_area(qs_cfg->pf_map, qs_cfg->pf_map_size, + 0, qs_cfg->q_count, 0); + if (offset >= qs_cfg->pf_map_size) { + mutex_unlock(qs_cfg->qs_mutex); + return -ENOMEM; + } + + bitmap_set(qs_cfg->pf_map, offset, qs_cfg->q_count); + for (i = 0; i < qs_cfg->q_count; i++) + qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = (u16)(i + offset); + mutex_unlock(qs_cfg->qs_mutex); + + return 0; +} + +/** + * __ice_vsi_get_qs_sc - Assign a scattered queues from PF to VSI + * @qs_cfg: gathered variables needed for pf->vsi queues assignment + * + * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap + */ +static int __ice_vsi_get_qs_sc(struct ice_qs_cfg *qs_cfg) +{ + unsigned int i, index = 0; + + mutex_lock(qs_cfg->qs_mutex); + for (i = 0; i < qs_cfg->q_count; i++) { + index = find_next_zero_bit(qs_cfg->pf_map, + qs_cfg->pf_map_size, index); + if (index >= qs_cfg->pf_map_size) + goto err_scatter; + set_bit(index, qs_cfg->pf_map); + qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = (u16)index; + } + mutex_unlock(qs_cfg->qs_mutex); + + return 0; +err_scatter: + for (index = 0; index < i; index++) { + clear_bit(qs_cfg->vsi_map[index], qs_cfg->pf_map); + qs_cfg->vsi_map[index + qs_cfg->vsi_map_offset] = 0; + } + mutex_unlock(qs_cfg->qs_mutex); + + return -ENOMEM; +} + +/** + * ice_pf_rxq_wait - Wait for a PF's Rx queue to be enabled or disabled + * @pf: the PF being configured + * @pf_q: the PF queue + * @ena: enable or disable state of the queue + * + * This routine will wait for the given Rx queue of the PF to reach the + * enabled or disabled state. + * Returns -ETIMEDOUT in case of failing to reach the requested state after + * multiple retries; else will return 0 in case of success. + */ +static int ice_pf_rxq_wait(struct ice_pf *pf, int pf_q, bool ena) +{ + int i; + + for (i = 0; i < ICE_Q_WAIT_MAX_RETRY; i++) { + if (ena == !!(rd32(&pf->hw, QRX_CTRL(pf_q)) & + QRX_CTRL_QENA_STAT_M)) + return 0; + + usleep_range(20, 40); + } + + return -ETIMEDOUT; +} + +/** + * ice_vsi_alloc_q_vector - Allocate memory for a single interrupt vector + * @vsi: the VSI being configured + * @v_idx: index of the vector in the VSI struct + * + * We allocate one q_vector and set default value for ITR setting associated + * with this q_vector. If allocation fails we return -ENOMEM. 
+ */ +static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, u16 v_idx) +{ + struct ice_pf *pf = vsi->back; + struct ice_q_vector *q_vector; + + /* allocate q_vector */ + q_vector = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*q_vector), + GFP_KERNEL); + if (!q_vector) + return -ENOMEM; + + q_vector->vsi = vsi; + q_vector->v_idx = v_idx; + q_vector->tx.itr_setting = ICE_DFLT_TX_ITR; + q_vector->rx.itr_setting = ICE_DFLT_RX_ITR; + q_vector->tx.itr_mode = ITR_DYNAMIC; + q_vector->rx.itr_mode = ITR_DYNAMIC; + + if (vsi->type == ICE_VSI_VF) + goto out; + /* only set affinity_mask if the CPU is online */ + if (cpu_online(v_idx)) + cpumask_set_cpu(v_idx, &q_vector->affinity_mask); + + /* This will not be called in the driver load path because the netdev + * will not be created yet. All other cases will register the NAPI + * handler here (i.e. resume, reset/rebuild, etc.) + */ + if (vsi->netdev) + netif_napi_add(vsi->netdev, &q_vector->napi, ice_napi_poll, + NAPI_POLL_WEIGHT); + +out: + /* tie q_vector and VSI together */ + vsi->q_vectors[v_idx] = q_vector; + + return 0; +} + +/** + * ice_free_q_vector - Free memory allocated for a specific interrupt vector + * @vsi: VSI having the memory freed + * @v_idx: index of the vector to be freed + */ +static void ice_free_q_vector(struct ice_vsi *vsi, int v_idx) +{ + struct ice_q_vector *q_vector; + struct ice_pf *pf = vsi->back; + struct ice_ring *ring; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (!vsi->q_vectors[v_idx]) { + dev_dbg(dev, "Queue vector at index %d not found\n", v_idx); + return; + } + q_vector = vsi->q_vectors[v_idx]; + + ice_for_each_ring(ring, q_vector->tx) + ring->q_vector = NULL; + ice_for_each_ring(ring, q_vector->rx) + ring->q_vector = NULL; + + /* only VSI with an associated netdev is set up with NAPI */ + if (vsi->netdev) + netif_napi_del(&q_vector->napi); + + devm_kfree(dev, q_vector); + vsi->q_vectors[v_idx] = NULL; +} + +/** + * ice_cfg_itr_gran - set the ITR granularity to 2 usecs if not already set + * @hw: board specific structure + */ +static void ice_cfg_itr_gran(struct ice_hw *hw) +{ + u32 regval = rd32(hw, GLINT_CTL); + + /* no need to update global register if ITR gran is already set */ + if (!(regval & GLINT_CTL_DIS_AUTOMASK_M) && + (((regval & GLINT_CTL_ITR_GRAN_200_M) >> + GLINT_CTL_ITR_GRAN_200_S) == ICE_ITR_GRAN_US) && + (((regval & GLINT_CTL_ITR_GRAN_100_M) >> + GLINT_CTL_ITR_GRAN_100_S) == ICE_ITR_GRAN_US) && + (((regval & GLINT_CTL_ITR_GRAN_50_M) >> + GLINT_CTL_ITR_GRAN_50_S) == ICE_ITR_GRAN_US) && + (((regval & GLINT_CTL_ITR_GRAN_25_M) >> + GLINT_CTL_ITR_GRAN_25_S) == ICE_ITR_GRAN_US)) + return; + + regval = ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_200_S) & + GLINT_CTL_ITR_GRAN_200_M) | + ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_100_S) & + GLINT_CTL_ITR_GRAN_100_M) | + ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_50_S) & + GLINT_CTL_ITR_GRAN_50_M) | + ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_25_S) & + GLINT_CTL_ITR_GRAN_25_M); + wr32(hw, GLINT_CTL, regval); +} + +/** + * ice_calc_q_handle - calculate the queue handle + * @vsi: VSI that ring belongs to + * @ring: ring to get the absolute queue index + * @tc: traffic class number + */ +static u16 ice_calc_q_handle(struct ice_vsi *vsi, struct ice_ring *ring, u8 tc) +{ +#ifdef HAVE_XDP_SUPPORT + WARN_ONCE(ice_ring_is_xdp(ring) && tc, "XDP ring can't belong to TC other than 0\n"); + +#endif /* HAVE_XDP_SUPPORT */ + if (ring->ch) + return ring->q_index - ring->ch->base_q; + + /* Idea here for the calculation is that we subtract the queue + * offset of the TC that 
the ring belongs to from the ring's absolute queue index, + * and as a result we get the queue's index within the TC. + */ + return ring->q_index - vsi->tc_cfg.tc_info[tc].qoffset; +} + +/** + * ice_eswitch_calc_q_handle + * @ring: pointer to the ring whose unique index is needed + * + * To work correctly with many netdevs, the ring->q_index of Tx rings on the + * switchdev VSI can repeat. Hardware ring setup requires a unique q_index, so + * calculate it here by finding the index of this ring in vsi->tx_rings. + * + * Return -1 when the index wasn't found. This should never happen, because the + * VSI is taken from ring->vsi, so the ring has to be present in this VSI. + */ +static u16 ice_eswitch_calc_q_handle(struct ice_ring *ring) +{ + struct ice_vsi *vsi = ring->vsi; + int i; + + ice_for_each_txq(vsi, i) { + if (vsi->tx_rings[i] == ring) + return i; + } + + return -1; +} + +/** + * ice_cfg_xps_tx_ring - Configure XPS for a Tx ring + * @ring: The Tx ring to configure + * + * This enables/disables XPS for a given Tx descriptor ring + * based on the TCs enabled for the VSI that the ring belongs to. + */ +static void ice_cfg_xps_tx_ring(struct ice_ring *ring) +{ +#ifndef HAVE_XPS_QOS_SUPPORT + struct ice_vsi *vsi = ring->vsi; + +#endif /* !HAVE_XPS_QOS_SUPPORT */ + if (!ring->q_vector || !ring->netdev) + return; + +#ifndef HAVE_XPS_QOS_SUPPORT + /* Single TC mode: enable XPS. + * If there is more than 1 TC, netdev_set_num_tc() resets XPS settings. + */ + if (vsi->tc_cfg.numtc > 1) + return; +#endif /* !HAVE_XPS_QOS_SUPPORT */ + + /* We only initialize XPS once, so as not to overwrite user settings */ + if (test_and_set_bit(ICE_TX_XPS_INIT_DONE, ring->xps_state)) + return; + + netif_set_xps_queue(ring->netdev, &ring->q_vector->affinity_mask, + ring->q_index); +} + +/** + * ice_setup_tx_ctx - setup a struct ice_tlan_ctx instance + * @ring: The Tx ring to configure + * @tlan_ctx: Pointer to the Tx LAN queue context structure to be initialized + * @pf_q: queue index in the PF space + * + * Configure the Tx descriptor ring in TLAN context. 
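+ * The context built here is later packed into the Add Tx Queues AQ buffer + * (struct ice_aqc_add_tx_qgrp) by ice_set_ctx() in ice_vsi_cfg_txq().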
+ */ +static void +ice_setup_tx_ctx(struct ice_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q) +{ + struct ice_vsi *vsi = ring->vsi; + struct ice_hw *hw = &vsi->back->hw; + + tlan_ctx->base = ring->dma >> ICE_TLAN_CTX_BASE_S; + + tlan_ctx->port_num = vsi->port_info->lport; + + /* Transmit Queue Length */ + tlan_ctx->qlen = ring->count; + + ice_set_cgd_num(tlan_ctx, ring); + + /* PF number */ + tlan_ctx->pf_num = hw->pf_id; + + /* queue belongs to a specific VSI type + * VF / VM index should be programmed per vmvf_type setting: + * for vmvf_type = VF, it is VF number between 0-256 + * for vmvf_type = VM, it is VM number between 0-767 + * for PF or EMP this field should be set to zero + */ + switch (vsi->type) { + case ICE_VSI_LB: + case ICE_VSI_CTRL: + case ICE_VSI_PF: + if (ring->ch) + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ; + else + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_PF; + break; + case ICE_VSI_VF: + /* Firmware expects vmvf_num to be absolute VF ID */ + tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_id; + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF; + break; + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ; + break; + default: + return; + } + + /* make sure the context is associated with the right VSI */ + if (ring->ch) + tlan_ctx->src_vsi = ring->ch->vsi_num; + else + tlan_ctx->src_vsi = ice_get_hw_vsi_num(hw, vsi->idx); + + /* Restrict Tx timestamps to the PF VSI */ + switch (vsi->type) { + case ICE_VSI_PF: + tlan_ctx->tsyn_ena = 1; + break; + default: + break; + } + + tlan_ctx->tso_ena = ICE_TX_LEGACY; + tlan_ctx->tso_qnum = pf_q; + + /* Legacy or Advanced Host Interface: + * 0: Advanced Host Interface + * 1: Legacy Host Interface + */ + tlan_ctx->legacy_int = ICE_TX_LEGACY; +} + +/** + * ice_setup_rx_ctx - Configure a receive ring context + * @ring: The Rx ring to configure + * + * Configure the Rx descriptor ring in RLAN context. + */ +static int ice_setup_rx_ctx(struct ice_ring *ring) +{ + int chain_len = ICE_MAX_CHAINED_RX_BUFS; + struct ice_vsi *vsi = ring->vsi; + u32 rxdid = ICE_RXDID_FLEX_NIC; + struct ice_rlan_ctx rlan_ctx; + struct ice_hw *hw; + u16 pf_q; + int err; + + hw = &vsi->back->hw; + + /* what is Rx queue number in global space of 2K Rx queues */ + pf_q = vsi->rxq_map[ring->q_index]; + + /* clear the context structure first */ + memset(&rlan_ctx, 0, sizeof(rlan_ctx)); + + /* Receive Queue Base Address. + * Indicates the starting address of the descriptor queue defined in + * 128 Byte units. + */ + rlan_ctx.base = ring->dma >> 7; + + rlan_ctx.qlen = ring->count; + + /* Receive Packet Data Buffer Size. + * The Packet Data Buffer Size is defined in 128 byte units. + */ + rlan_ctx.dbuf = ring->rx_buf_len >> ICE_RLAN_CTX_DBUF_S; + + /* use 32 byte descriptors */ + rlan_ctx.dsize = 1; + + /* Strip the Ethernet CRC bytes before the packet is posted to host + * memory. + */ + rlan_ctx.crcstrip = ring->rx_crc_strip_dis ? 0 : 1; + + /* L2TSEL flag defines the reported L2 Tags in the receive descriptor + * and it needs to remain 1 for non-DVM capable configurations to not + * break backward compatibility for VF drivers. Setting this field to 0 + * will cause the single/outer VLAN tag to be stripped to the L2TAG2_2ND + * field in the Rx descriptor. 
Setting it to 1 allows the VLAN tag to + * be stripped in L2TAG1 of the Rx descriptor, which is where VFs will + * check for the tag. + */ + if (ice_is_dvm_ena(hw)) + if (vsi->type == ICE_VSI_VF && + ice_vf_is_port_vlan_ena(&vsi->back->vf[vsi->vf_id])) + rlan_ctx.l2tsel = 1; + else + rlan_ctx.l2tsel = 0; + else + rlan_ctx.l2tsel = 1; + + rlan_ctx.dtype = ICE_RX_DTYPE_NO_SPLIT; + rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_NO_SPLIT; + rlan_ctx.hsplit_1 = ICE_RLAN_RX_HSPLIT_1_NO_SPLIT; + + /* This controls whether VLAN is stripped from inner headers. + * The VLAN in the inner L2 header is stripped to the receive + * descriptor if enabled by this flag. + */ + rlan_ctx.showiv = 0; + +#ifdef HAVE_AF_XDP_ZC_SUPPORT + /* For AF_XDP ZC, we disallow packets from spanning + * multiple buffers, thus letting us skip that + * handling in the fast path. + */ + if (ring->xsk_pool) + chain_len = 1; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + /* Max packet size for this queue - must not be set to a larger value + * than 5 x DBUF + */ + rlan_ctx.rxmax = min_t(u32, vsi->max_frame, + chain_len * ring->rx_buf_len); + + /* Rx queue threshold in units of 64 */ + rlan_ctx.lrxqthresh = 1; + + /* Enable Flexible Descriptors in the queue context, which + * allows this driver to select a specific receive descriptor format, + * increasing context priority to pick up the profile ID; the default + * is 0x01; setting it to 0x03 ensures the profile is programmed even + * if the previous context is of the same priority + */ + if (vsi->type != ICE_VSI_VF) + ice_write_qrxflxp_cntxt(hw, pf_q, rxdid, 0x3, true); + else + ice_write_qrxflxp_cntxt(hw, pf_q, ICE_RXDID_LEGACY_1, 0x3, + false); + + /* Absolute queue number out of 2K needs to be passed */ + err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q); + if (err) { + dev_err(ice_pf_to_dev(vsi->back), + "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", + pf_q, err); + return -EIO; + } + + if (vsi->type == ICE_VSI_VF) + return 0; + + /* configure Rx buffer alignment */ + if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) + ice_clear_ring_build_skb_ena(ring); + else + ice_set_ring_build_skb_ena(ring); + + /* init queue specific tail register */ + ring->tail = hw->hw_addr + QRX_TAIL(pf_q); + writel(0, ring->tail); + + return 0; +} + +/** + * ice_vsi_cfg_rxq - Configure an Rx queue + * @ring: the ring being configured + * + * Return 0 on success and a negative value on error. 
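+ * This registers XDP Rx queue info and a memory model where applicable, + * writes the Rx queue context via ice_setup_rx_ctx(), and pre-allocates + * receive buffers for the ring.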
+ */ +int ice_vsi_cfg_rxq(struct ice_ring *ring) +{ + struct device *dev = ice_pf_to_dev(ring->vsi->back); + u16 num_bufs = ICE_DESC_UNUSED(ring); + int err; + + ring->rx_buf_len = ring->vsi->rx_buf_len; + +#ifdef HAVE_AF_XDP_ZC_SUPPORT + if (ring->vsi->type == ICE_VSI_PF) { + if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) + /* coverity[check_return] */ + xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->q_index, ring->q_vector->napi.napi_id); + + ring->xsk_pool = ice_xsk_umem(ring); + if (ring->xsk_pool) { + xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + + ring->rx_buf_len = + xsk_pool_get_rx_frame_size(ring->xsk_pool); +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + ring->zca.free = ice_zca_free; + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_ZERO_COPY, + &ring->zca); + if (err) + return err; + + dev_info(dev, "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", + ring->q_index); +#else + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); + if (err) + return err; + xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq); + + dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", + ring->q_index); +#endif + } else { +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + ring->zca.free = NULL; +#endif + if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) + /* coverity[check_return] */ + xdp_rxq_info_reg(&ring->xdp_rxq, + ring->netdev, + ring->q_index, ring->q_vector->napi.napi_id); + + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL); + if (err) + return err; + } + } +#elif defined(HAVE_XDP_FRAME_STRUCT) + if (ring->vsi->type == ICE_VSI_PF) { + if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) + /* coverity[check_return] */ + xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->q_index, ring->q_vector->napi.napi_id); + + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err) + return err; + } +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + + err = ice_setup_rx_ctx(ring); + if (err) { + dev_err(dev, "ice_setup_rx_ctx failed for RxQ %d, err %d\n", + ring->q_index, err); + return err; + } + +#ifdef HAVE_AF_XDP_ZC_SUPPORT + if (ring->xsk_pool) { +#ifdef HAVE_XSK_UMEM_HAS_ADDRS + if (!xsk_umem_has_addrs_rq(ring->xsk_pool, num_bufs)) { + dev_warn(dev, "UMEM does not provide enough addresses to fill %d buffers on Rx ring %d\n", + num_bufs, ring->q_index); + dev_warn(dev, "Change Rx ring/fill queue size to avoid performance issues\n"); + + return 0; + } +#endif /* HAVE_XSK_UMEM_HAS_ADDRS */ + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + err = ice_alloc_rx_bufs_slow_zc(ring, num_bufs); +#else + err = ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring)); +#endif + if (err) { + u16 pf_q = ring->vsi->rxq_map[ring->q_index]; + + dev_info(dev, "Failed to allocate some buffers on UMEM enabled Rx ring %d (pf_q %d)\n", + ring->q_index, pf_q); + } + + return 0; + } +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + + ice_alloc_rx_bufs(ring, num_bufs); + + return 0; +} + +/** + * __ice_vsi_get_qs - helper function for assigning queues from PF to VSI + * @qs_cfg: gathered variables needed for pf->vsi queues assignment + * + * This function first tries to find contiguous space. If it is not successful, + * it tries with the scatter approach. 
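+ * e.g. a request for 8 queues first looks for 8 consecutive clear bits in the + * PF queue bitmap; if none exist, q_count is capped at scatter_count and the + * queues are claimed one free bit at a time.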
+ * + * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap + */ +int __ice_vsi_get_qs(struct ice_qs_cfg *qs_cfg) +{ + int ret = 0; + + ret = __ice_vsi_get_qs_contig(qs_cfg); + if (ret) { + /* contig failed, so try with scatter approach */ + qs_cfg->mapping_mode = ICE_VSI_MAP_SCATTER; + qs_cfg->q_count = min_t(unsigned int, qs_cfg->q_count, + qs_cfg->scatter_count); + ret = __ice_vsi_get_qs_sc(qs_cfg); + } + return ret; +} + +/** + * ice_vsi_ctrl_one_rx_ring - start/stop VSI's Rx ring with no busy wait + * @vsi: the VSI being configured + * @ena: start or stop the Rx ring + * @rxq_idx: 0-based Rx queue index for the VSI passed in + * @wait: wait or don't wait for configuration to finish in hardware + * + * Return 0 on success and negative on error. + */ +int +ice_vsi_ctrl_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx, bool wait) +{ + int pf_q = vsi->rxq_map[rxq_idx]; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + u32 rx_reg; + + rx_reg = rd32(hw, QRX_CTRL(pf_q)); + + /* Skip if the queue is already in the requested state */ + if (ena == !!(rx_reg & QRX_CTRL_QENA_STAT_M)) + return 0; + + /* turn on/off the queue */ + if (ena) + rx_reg |= QRX_CTRL_QENA_REQ_M; + else + rx_reg &= ~QRX_CTRL_QENA_REQ_M; + wr32(hw, QRX_CTRL(pf_q), rx_reg); + + if (!wait) + return 0; + + ice_flush(hw); + return ice_pf_rxq_wait(pf, pf_q, ena); +} + +/** + * ice_vsi_wait_one_rx_ring - wait for a VSI's Rx ring to be stopped/started + * @vsi: the VSI being configured + * @ena: true/false to verify Rx ring has been enabled/disabled respectively + * @rxq_idx: 0-based Rx queue index for the VSI passed in + * + * This routine will wait for the given Rx queue of the VSI to reach the + * enabled or disabled state. Returns -ETIMEDOUT in case of failing to reach + * the requested state after multiple retries; else will return 0 in case of + * success. + */ +int ice_vsi_wait_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx) +{ + int pf_q = vsi->rxq_map[rxq_idx]; + struct ice_pf *pf = vsi->back; + + return ice_pf_rxq_wait(pf, pf_q, ena); +} + +/** + * ice_vsi_alloc_q_vectors - Allocate memory for interrupt vectors + * @vsi: the VSI being configured + * + * We allocate one q_vector per queue interrupt. If allocation fails we + * return -ENOMEM. + */ +int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi) +{ + struct device *dev = ice_pf_to_dev(vsi->back); + u16 v_idx; + int err; + + if (vsi->q_vectors[0]) { + dev_dbg(dev, "VSI %d has existing q_vectors\n", vsi->vsi_num); + return -EEXIST; + } + + for (v_idx = 0; v_idx < vsi->num_q_vectors; v_idx++) { + err = ice_vsi_alloc_q_vector(vsi, v_idx); + if (err) + goto err_out; + } + + return 0; + +err_out: + while (v_idx--) + ice_free_q_vector(vsi, v_idx); + + dev_err(dev, "Failed to allocate %d q_vector for VSI %d, ret=%d\n", + vsi->num_q_vectors, vsi->vsi_num, err); + vsi->num_q_vectors = 0; + return err; +} + +/** + * ice_vsi_map_rings_to_vectors - Map VSI rings to interrupt vectors + * @vsi: the VSI being configured + * + * This function maps descriptor rings to the queue-specific vectors allotted + * through the MSI-X enabling code. On a constrained vector budget, we map Tx + * and Rx rings to the vector as "efficiently" as possible. 
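+ * e.g. 9 Tx rings spread across 4 vectors map as 3/2/2/2, since each vector + * takes DIV_ROUND_UP() of the rings still unassigned.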
+ */ +void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) +{ + int q_vectors = vsi->num_q_vectors; + u16 tx_rings_rem, rx_rings_rem; + int v_id; + + /* initially set the remaining ring counts to the VSI's queue counts */ + tx_rings_rem = vsi->num_txq; + rx_rings_rem = vsi->num_rxq; + + for (v_id = 0; v_id < q_vectors; v_id++) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_id]; + u8 tx_rings_per_v, rx_rings_per_v; + u16 q_id, q_base; + + /* Tx rings mapping to vector */ + tx_rings_per_v = (u8)DIV_ROUND_UP(tx_rings_rem, + q_vectors - v_id); + q_vector->num_ring_tx = tx_rings_per_v; + q_vector->tx.ring = NULL; + q_vector->tx.itr_idx = ICE_TX_ITR; + q_base = vsi->num_txq - tx_rings_rem; + + for (q_id = q_base; q_id < (q_base + tx_rings_per_v); q_id++) { + struct ice_ring *tx_ring = vsi->tx_rings[q_id]; + + tx_ring->q_vector = q_vector; + tx_ring->next = q_vector->tx.ring; + q_vector->tx.ring = tx_ring; + } + tx_rings_rem -= tx_rings_per_v; + + /* Rx rings mapping to vector */ + rx_rings_per_v = (u8)DIV_ROUND_UP(rx_rings_rem, + q_vectors - v_id); + q_vector->num_ring_rx = rx_rings_per_v; + q_vector->rx.ring = NULL; + q_vector->rx.itr_idx = ICE_RX_ITR; + q_base = vsi->num_rxq - rx_rings_rem; + + for (q_id = q_base; q_id < (q_base + rx_rings_per_v); q_id++) { + struct ice_ring *rx_ring = vsi->rx_rings[q_id]; + + rx_ring->q_vector = q_vector; + rx_ring->next = q_vector->rx.ring; + q_vector->rx.ring = rx_ring; + } + rx_rings_rem -= rx_rings_per_v; + } +} + +/** + * ice_vsi_free_q_vectors - Free memory allocated for interrupt vectors + * @vsi: the VSI having memory freed + */ +void ice_vsi_free_q_vectors(struct ice_vsi *vsi) +{ + int v_idx; + + ice_for_each_q_vector(vsi, v_idx) + ice_free_q_vector(vsi, v_idx); +} + +/** + * ice_vsi_cfg_txq - Configure a single Tx queue + * @vsi: the VSI that the queue belongs to + * @ring: Tx ring to be configured + * @qg_buf: queue group buffer + */ +int +ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, + struct ice_aqc_add_tx_qgrp *qg_buf) +{ + u8 buf_len = struct_size(qg_buf, txqs, 1); + struct ice_tlan_ctx tlan_ctx = { 0 }; + struct ice_aqc_add_txqs_perq *txq; + struct ice_channel *ch = ring->ch; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u16 pf_q; + u8 tc; + + /* Configure XPS */ + ice_cfg_xps_tx_ring(ring); + + pf_q = ring->reg_idx; + ice_setup_tx_ctx(ring, &tlan_ctx, pf_q); + /* copy context contents into the qg_buf */ + qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q); + ice_set_ctx(hw, (u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx, + ice_tlan_ctx_info); + + /* init queue specific tail reg. It is referred to as the + * transmit comm scheduler queue doorbell. + */ + ring->tail = hw->hw_addr + QTX_COMM_DBELL(pf_q); + + if (IS_ENABLED(CONFIG_DCB)) + tc = ring->dcb_tc; + else + tc = 0; + + /* Add unique software queue handle of the Tx queue per + * TC into the VSI Tx ring + */ + if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) + ring->q_handle = ice_eswitch_calc_q_handle(ring); + else + ring->q_handle = ice_calc_q_handle(vsi, ring, tc); + + status = (ch ? + ice_ena_vsi_txq(vsi->port_info, ch->ch_vsi->idx, 0, + ring->q_handle, 1, qg_buf, buf_len, NULL) : + ice_ena_vsi_txq(vsi->port_info, vsi->idx, tc, + ring->q_handle, 1, qg_buf, buf_len, NULL)); + if (status) { + dev_err(ice_pf_to_dev(pf), "Failed to set LAN Tx queue context, error: %s\n", + ice_stat_str(status)); + return -ENODEV; + } + + /* Add Tx Queue TEID into the VSI Tx ring from the + * response. This will complete configuring and + * enabling the queue. 
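+ * The TEID cached in ring->txq_teid is handed back to + * ice_dis_vsi_txq() via ice_fill_txq_meta() when the + * queue is later stopped.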
+ */ + txq = &qg_buf->txqs[0]; + if (pf_q == le16_to_cpu(txq->txq_id)) + ring->txq_teid = le32_to_cpu(txq->q_teid); + + return 0; +} + +/** + * ice_cfg_itr - configure the initial interrupt throttle values + * @hw: pointer to the HW structure + * @q_vector: interrupt vector that's being configured + * + * Configure interrupt throttling values for the ring containers that are + * associated with the interrupt vector passed in. + */ +void ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector) +{ + ice_cfg_itr_gran(hw); + + if (q_vector->num_ring_rx) + ice_write_itr(&q_vector->rx, q_vector->rx.itr_setting); + + if (q_vector->num_ring_tx) + ice_write_itr(&q_vector->tx, q_vector->tx.itr_setting); + + ice_write_intrl(q_vector, q_vector->intrl); +} + +/** + * ice_cfg_txq_interrupt - configure interrupt on Tx queue + * @vsi: the VSI being configured + * @txq: Tx queue being mapped to MSI-X vector + * @msix_idx: MSI-X vector index within the function + * @itr_idx: ITR index of the interrupt cause + * + * Configure interrupt on Tx queue by associating Tx queue to MSI-X vector + * within the function space. + */ +void +ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx) +{ + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + u32 val; + + itr_idx = (itr_idx << QINT_TQCTL_ITR_INDX_S) & QINT_TQCTL_ITR_INDX_M; + + val = QINT_TQCTL_CAUSE_ENA_M | itr_idx | + ((msix_idx << QINT_TQCTL_MSIX_INDX_S) & QINT_TQCTL_MSIX_INDX_M); + + wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), val); +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(vsi)) { + u32 xdp_txq = txq + vsi->num_xdp_txq; + + wr32(hw, QINT_TQCTL(vsi->txq_map[xdp_txq]), + val); + } + + ice_flush(hw); +#endif /* HAVE_XDP_SUPPORT */ +} + +/** + * ice_cfg_rxq_interrupt - configure interrupt on Rx queue + * @vsi: the VSI being configured + * @rxq: Rx queue being mapped to MSI-X vector + * @msix_idx: MSI-X vector index within the function + * @itr_idx: ITR index of the interrupt cause + * + * Configure interrupt on Rx queue by associating Rx queue to MSI-X vector + * within the function space. 
+ */ +void +ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx) +{ + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + u32 val; + + itr_idx = (itr_idx << QINT_RQCTL_ITR_INDX_S) & QINT_RQCTL_ITR_INDX_M; + + val = QINT_RQCTL_CAUSE_ENA_M | itr_idx | + ((msix_idx << QINT_RQCTL_MSIX_INDX_S) & QINT_RQCTL_MSIX_INDX_M); + + wr32(hw, QINT_RQCTL(vsi->rxq_map[rxq]), val); + + ice_flush(hw); +} + +/** + * ice_trigger_sw_intr - trigger a software interrupt + * @hw: pointer to the HW structure + * @q_vector: interrupt vector to trigger the software interrupt for + */ +void ice_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector) +{ + wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx), (ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) | + GLINT_DYN_CTL_SWINT_TRIG_M | GLINT_DYN_CTL_INTENA_M); +} + +/** + * ice_vsi_stop_tx_ring - Disable single Tx ring + * @vsi: the VSI being configured + * @rst_src: reset source + * @rel_vmvf_num: Relative ID of VF/VM + * @ring: Tx ring to be stopped + * @txq_meta: Meta data of Tx ring to be stopped + */ +int +ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, + u16 rel_vmvf_num, struct ice_ring *ring, + struct ice_txq_meta *txq_meta) +{ + struct ice_pf *pf = vsi->back; + struct ice_q_vector *q_vector; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + + /* clear cause_ena bit for disabled queues */ + val = rd32(hw, QINT_TQCTL(ring->reg_idx)); + val &= ~QINT_TQCTL_CAUSE_ENA_M; + wr32(hw, QINT_TQCTL(ring->reg_idx), val); + + /* software is expected to wait for 100 ns */ + ndelay(100); + + /* trigger a software interrupt for the vector + * associated to the queue to schedule NAPI handler + */ + q_vector = ring->q_vector; + if (q_vector) + ice_trigger_sw_intr(hw, q_vector); + + status = ice_dis_vsi_txq(vsi->port_info, txq_meta->vsi_idx, + txq_meta->tc, 1, &txq_meta->q_handle, + &txq_meta->q_id, &txq_meta->q_teid, rst_src, + rel_vmvf_num, NULL); + + /* if the disable queue command was exercised during an + * active reset flow, ICE_ERR_RESET_ONGOING is returned. + * This is not an error as the reset operation disables + * queues at the hardware level anyway. + */ + if (status == ICE_ERR_RESET_ONGOING) { + dev_dbg(ice_pf_to_dev(vsi->back), "Reset in progress. 
LAN Tx queues already disabled\n"); + } else if (status == ICE_ERR_DOES_NOT_EXIST) { + dev_dbg(ice_pf_to_dev(vsi->back), "LAN Tx queues do not exist, nothing to disable\n"); + } else if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to disable LAN Tx queues, error: %s\n", + ice_stat_str(status)); + return -ENODEV; + } + + return 0; +} + +/** + * ice_fill_txq_meta - Prepare the Tx queue's meta data + * @vsi: VSI that ring belongs to + * @ring: ring that txq_meta will be based on + * @txq_meta: a helper struct that wraps Tx queue's information + * + * Set up a helper struct that will contain all the necessary fields that + * are needed for stopping Tx queue + */ +void +ice_fill_txq_meta(struct ice_vsi *vsi, struct ice_ring *ring, + struct ice_txq_meta *txq_meta) +{ + struct ice_channel *ch = ring->ch; + u8 tc; + + if (IS_ENABLED(CONFIG_DCB)) + tc = ring->dcb_tc; + else + tc = 0; + txq_meta->q_id = ring->reg_idx; + txq_meta->q_teid = ring->txq_teid; + txq_meta->q_handle = ring->q_handle; + if (ch) { + txq_meta->vsi_idx = ch->ch_vsi->idx; + txq_meta->tc = 0; + } else { + txq_meta->vsi_idx = vsi->idx; + txq_meta->tc = tc; + } +} diff --git a/drivers/net/ethernet/intel/ice/ice_base.h b/drivers/net/ethernet/intel/ice/ice_base.h new file mode 100644 index 0000000000000000000000000000000000000000..5c83e555ef5cca610e33ac607334df57252f7186 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_base.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_BASE_H_ +#define _ICE_BASE_H_ + +#include "ice.h" + +int ice_vsi_cfg_rxq(struct ice_ring *ring); +int __ice_vsi_get_qs(struct ice_qs_cfg *qs_cfg); +int +ice_vsi_ctrl_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx, bool wait); +int ice_vsi_wait_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx); +int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi); +void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi); +void ice_vsi_free_q_vectors(struct ice_vsi *vsi); +int +ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, + struct ice_aqc_add_tx_qgrp *qg_buf); +void ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector); +void +ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx); +void +ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx); +void ice_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector); +int +ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, + u16 rel_vmvf_num, struct ice_ring *ring, + struct ice_txq_meta *txq_meta); +void +ice_fill_txq_meta(struct ice_vsi *vsi, struct ice_ring *ring, + struct ice_txq_meta *txq_meta); +#endif /* _ICE_BASE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_cgu.h b/drivers/net/ethernet/intel/ice/ice_cgu.h new file mode 100644 index 0000000000000000000000000000000000000000..a9c215a9df9228f131630077c7e0efe1c4167bb8 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_CGU_H_ +#define _ICE_CGU_H_ + +#include +#include "ice_cgu_regs.h" + +/* CGU mux identifier + * Specifies a mux within the CGU block. 
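+ * Each selector below corresponds to the NAC_CGU_DWORD10/DWORD11 register + * field named in its comment.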
+ */ +enum ice_cgu_mux_sel { + /* CGU reference clock source (DWORD10_SYNCE_S_REF_CLK) */ + ICE_CGU_MUX_SEL_REF_CLK, + /* CGU bypass clock source (DWORD11_SYNCE_S_BYP_CLK) */ + ICE_CGU_MUX_SEL_BYPASS_CLK, + /* CGU ETHCLKO pin source (DWORD10_SYNCE_ETHCLKO_SEL) */ + ICE_CGU_MUX_SEL_ETHCLKO, + /* CGU CLKO pin source (DWORD10_SYNCE_CLKO_SEL) */ + ICE_CGU_MUX_SEL_CLKO, + + NUM_ICE_CGU_MUX_SEL +}; + +/* CGU reference clock specification + * Specifies the source for the CGU reference/bypass clock. + */ +enum ice_cgu_clk_src { + /* network reference clock 0 */ + ICE_CGU_CLK_SRC_NET_REF_CLK0, + /* network reference clock 1 */ + ICE_CGU_CLK_SRC_NET_REF_CLK1, + /* 1588 recovered clock */ + ICE_CGU_CLK_SRC_1588_RECOVERED_CLK, + /* recovered clock from phys port 0 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_0, + /* recovered clock from phys port 1 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_1, + /* recovered clock from phys port 2 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_2, + /* recovered clock from phys port 3 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_3, + /* recovered clock from phys port 4 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_4, + /* recovered clock from phys port 5 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_5, + /* recovered clock from phys port 6 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_6, + /* recovered clock from phys port 7 */ + ICE_CGU_CLK_SRC_SYNCE_CLK_7, + NUM_ICE_CGU_CLK_SRC +}; + +/* Sources for ETHCLKO pin */ +enum ice_cgu_ethclko_sel { + /* DPLL reference clock 0 input divided by ETHDIV */ + ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP0_DIV, + /* DPLL reference clock 1 input divided by ETHDIV */ + ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP1_DIV, + /* DPLL output clock divided by ETHDIV */ + ICE_CGU_ETHCLKO_SEL_CLK_PLL_25000_DIV, + /* JAPLL output clock divided by ETHDIV */ + ICE_CGU_ETHCLKO_SEL_CLK_JAPLL_625000_DIV, + /* DPLL reference clock 0 input */ + ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP0, + /* DPLL reference clock 1 input */ + ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP1, + /* DPLL output clock */ + ICE_CGU_ETHCLKO_SEL_CLK_PLL_25000, + ICE_CGU_ETHCLKO_SEL_CLK_JAPLL_625000, + + NUM_ICE_CGU_ETHCLKO_SEL +}; + +#define ICE_CGU_ETHCLKO_SEL_NRCKI ICE_CGU_ETHCLKO_SEL_REF_CLK_BYP1 + +/* Sources for CLKO pin */ +enum ice_cgu_clko_sel { + /* DPLL reference clock 0 input divided by CLKODIV */ + ICE_CGU_CLKO_SEL_REF_CLK_BYP0_DIV, + /* DPLL reference clock 1 input divided by CLKODIV */ + ICE_CGU_CLKO_SEL_REF_CLK_BYP1_DIV, + /* DPLL core clock divided by CLKODIV */ + ICE_CGU_CLKO_SEL_CLK_SYS_DIV, + /* JAPLL output clock divided by CLKODIV */ + ICE_CGU_CLKO_SEL_CLK_JAPLL_625000_DIV, + /* DPLL reference clock 0 input */ + ICE_CGU_CLKO_SEL_REF_CLK_BYP0, + /* DPLL reference clock 1 input */ + ICE_CGU_CLKO_SEL_REF_CLK_BYP1, + + /* 1.544 MHz, NRCP divider output */ + ICE_CGU_CLKO_SEL_CLK_1544 = 8, + /* 2.048 MHz, NRCP divider output */ + ICE_CGU_CLKO_SEL_CLK_2048 = 9, + + NUM_ICE_CGU_CLKO_SEL +}; + +#define ICE_CGU_CLKO_SEL_NRCKI ICE_CGU_CLKO_SEL_REF_CLK_BYP1 + +/* TIME_REF source selection */ +enum ice_cgu_time_ref_sel { + ICE_CGU_TIME_REF_SEL_TCXO, /* Use TCXO source */ + ICE_CGU_TIME_REF_SEL_TIME_REF, /* Use TIME_REF source */ + + NUM_ICE_CGU_TIME_REF_SEL +}; + +/* Macro to convert an enum ice_time_ref_freq to a string for printing */ +#define ICE_TIME_REF_FREQ_TO_STR(__trf) \ + ({ \ + enum ice_time_ref_freq _trf = (__trf); \ + (_trf) == ICE_TIME_REF_FREQ_25_000 ? "25 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_122_880 ? "122.88 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_125_000 ? "125 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_153_600 ? "153.6 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_156_250 ? 
"156.25 MHz" : \ + (_trf) == ICE_TIME_REF_FREQ_245_760 ? "245.76 MHz" : \ + "invalid"; \ + }) + +/* Macro to convert an enum ice_cgu_time_ref_sel to a string for printing */ +#define ICE_TIME_REF_SEL_TO_STR(__trs) \ + ({ \ + enum ice_cgu_time_ref_sel _trs = (__trs); \ + (_trs) == ICE_CGU_TIME_REF_SEL_TCXO ? "TCXO" : \ + (_trs) == ICE_CGU_TIME_REF_SEL_TIME_REF ? "TIME_REF" : \ + "invalid"; \ + }) +/* Macro to convert an enum ice_src_tmr_mode to a string for printing */ +#define ICE_SRC_TMR_MODE_TO_STR(__mtm) \ + ({ \ + enum ice_src_tmr_mode _mtm = (__mtm); \ + (_mtm) == ICE_SRC_TMR_MODE_NANOSECONDS ? "nanoseconds" : \ + (_mtm) == ICE_SRC_TMR_MODE_LOCKED ? "locked" : \ + "invalid"; \ + }) + +/* DPLL select */ +enum ice_cgu_dpll_select { + /* DPLL (DPLL1) */ + ICE_CGU_DPLL_SELECT_TRANSPORT, + /* EEC DPLL (DPLL2), 0x098 Hz BW */ + ICE_CGU_DPLL_SELECT_EEC_RELAXED_BW, + + NUM_ICE_CGU_DPLL_SELECT +}; + +/* DPLL holdover mode */ +enum ice_cgu_dpll_holdover_mode { + /* previous acquired frequency */ + ICE_CGU_DPLL_HOLDOVER_MODE_ACQUIRED_FREQ, + /* local frequency (free run) */ + ICE_CGU_DPLL_HOLDOVER_MODE_LOCAL_FREQ, + + NUM_ICE_CGU_DPLL_HOLDOVER_MODE +}; + +/* DPLL configuration parameters */ +struct ice_cgu_dpll_cfg { + /* CGU reference clock frequency */ + enum ice_time_ref_freq ref_freq; + /* select DPLL */ + enum ice_cgu_dpll_select dpll_sel; + /* enable holdover feature support */ + u32 holdover_support; + /* select holdover mode */ + enum ice_cgu_dpll_holdover_mode holdover_mode; +}; + +enum ice_japll_ref_freq { + ICE_CGU_JAPLL_REF_FREQ_25_000, /* 25 MHz */ + ICE_CGU_JAPLL_REF_FREQ_156_250, /* 156.25 MHz */ + + NUM_ICE_CGU_JAPLL_REF_FREQ +}; + +/* Mux configuration parameters */ +struct ice_cgu_mux_cfg { + /* reference clock source select */ + enum ice_cgu_clk_src ref_clk_src; + /* bypass clock source select */ + enum ice_cgu_clk_src byp_clk_src; + /* ETHCLKO pin source select */ + enum ice_cgu_ethclko_sel eth_clk_out; + /* CLKO pin source select */ + enum ice_cgu_clko_sel clk_out; + /* CLKO programmable divider */ + __u8 clk_out_div; + /* ETHCLKO programmable divider */ + __u8 eth_clk_out_div; + /* bypass DPLL */ + u32 bypass; + /* tie refClk to ground (force holdover mode) */ + u32 ref_clk_gnd_ena; +}; + +/* CGU event was triggered by SyncE loss of lock */ +#define ICE_CGU_EVENT_ERR_SYNCE_LOCK_LOSS 0x1 + +/* CGU event was triggered by SyncE holdover change */ +#define ICE_CGU_EVENT_ERR_HOLDOVER_CHNG 0x2 + +/* CGU event was triggered by timestamp PLL loss of lock */ +#define ICE_CGU_EVENT_ERR_TIMESYNC_LOCK_LOSS 0x4 + + +struct ice_cgu_info { + struct ice_cgu_dpll_cfg dpll_cfg; + struct ice_cgu_mux_cfg mux_cfg; + enum ice_japll_ref_freq japll_ref_freq; + wait_queue_head_t wq_head; + + /* used to synchronize waiters (only one at a time) */ + struct mutex event_mutex; + + u32 event_occurred; + u8 err_type; + u8 unlock_event; + + /* current state of 1588 output to CGU */ + u8 out_1588_enabled; + enum ice_time_ref_freq out_1588_ref_freq; + + enum ice_time_ref_freq time_ref_freq; + enum ice_src_tmr_mode src_tmr_mode; +}; + +#endif /* _ICE_CGU_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_ops.c b/drivers/net/ethernet/intel/ice/ice_cgu_ops.c new file mode 100644 index 0000000000000000000000000000000000000000..cb7a3ce8605b1b8178434aa5a10523be096575b7 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_ops.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice.h" + +/** + * ice_cgu_cfg_ts_pll - Configure the TS PLL + * @pf: Board private structure + * @enable: True to enable TS PLL + * @time_ref_freq: primary timer frequency + * @time_ref_sel: Time source + * @src_tmr_mode: primary timer mode + */ +int ice_cgu_cfg_ts_pll(struct ice_pf *pf, bool enable, enum ice_time_ref_freq time_ref_freq, + enum ice_cgu_time_ref_sel time_ref_sel, enum ice_src_tmr_mode src_tmr_mode) +{ + struct ice_cgu_info *cgu_info = &pf->cgu_info; + union tspll_ro_bwm_lf bwm_lf; + union nac_cgu_dword19 dw19; + union nac_cgu_dword22 dw22; + union nac_cgu_dword24 dw24; + union nac_cgu_dword9 dw9; + int err; + + dev_dbg(ice_pf_to_dev(pf), "Requested %s, time_ref_freq %s, time_ref_sel %s, src_tmr_mode %s\n", + enable ? "enable" : "disable", + ICE_TIME_REF_FREQ_TO_STR(time_ref_freq), + ICE_TIME_REF_SEL_TO_STR(time_ref_sel), + ICE_SRC_TMR_MODE_TO_STR(src_tmr_mode)); + + if (time_ref_freq >= NUM_ICE_TIME_REF_FREQ) { + dev_err(ice_pf_to_dev(pf), "Invalid TIME_REF freq %u\n", time_ref_freq); + return -EIO; + } + + if (time_ref_sel >= NUM_ICE_CGU_TIME_REF_SEL) { + dev_err(ice_pf_to_dev(pf), "Invalid TIME_REF sel %u\n", time_ref_sel); + return -EIO; + } + + if (src_tmr_mode >= NUM_ICE_SRC_TMR_MODE) { + dev_err(ice_pf_to_dev(pf), "Invalid src_tmr_mode %u\n", src_tmr_mode); + return -EIO; + } + + if (time_ref_sel == ICE_CGU_TIME_REF_SEL_TCXO && + time_ref_freq != ICE_TIME_REF_FREQ_25_000) { + dev_err(ice_pf_to_dev(pf), + "TS PLL source specified as TCXO but specified frequency is not 25 MHz\n"); + return -EIO; + } + + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD9, &dw9.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD24, &dw24.val); + if (!err) + err = ice_cgu_reg_read(pf, TSPLL_RO_BWM_LF, &bwm_lf.val); + if (err) + return err; + + dev_dbg(ice_pf_to_dev(pf), + "Before change, %s, time_ref_freq %s, time_ref_sel %s, PLL %s\n", + dw24.field.ts_pll_enable ? "enabled" : "disabled", + ICE_TIME_REF_FREQ_TO_STR(dw9.field.time_ref_freq_sel), + ICE_TIME_REF_SEL_TO_STR(dw24.field.time_ref_sel), + bwm_lf.field.plllock_true_lock_cri ? 
"locked" : "unlocked"); + + if (!enable) { + if (dw24.field.ts_pll_enable) { + dw24.field.ts_pll_enable = 0; + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD24, dw24.val); + if (!err) + ice_cgu_usleep(1); + } + /* don't need to update the freq, sel, or mode; that'll happen + * when the PLL is re-enabled + */ + return err; + } + + /* TS PLL must be disabled before changing freq or src */ + if (dw24.field.ts_pll_enable && (dw9.field.time_ref_freq_sel != time_ref_freq || + dw24.field.time_ref_sel != time_ref_sel)) { + dev_err(ice_pf_to_dev(pf), + "Can't adjust time_ref_freq or time_ref_sel while TS PLL is enabled\n"); + return -EIO; + } + + /* set frequency, configure TS PLL params, and enable the TS PLL */ + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD19, &dw19.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD22, &dw22.val); + if (!err) { + dw9.field.time_ref_freq_sel = time_ref_freq; + dw19.field.tspll_fbdiv_intgr = tspll_per_rate_params[time_ref_freq].feedback_div; + dw19.field.tspll_ndivratio = 1; + dw22.field.time1588clk_div = tspll_per_rate_params[time_ref_freq].post_pll_div; + dw22.field.time1588clk_sel_div2 = 0; + dw24.field.ref1588_ck_div = tspll_per_rate_params[time_ref_freq].refclk_pre_div; + dw24.field.tspll_fbdiv_frac = tspll_per_rate_params[time_ref_freq].frac_n_div; + dw24.field.time_ref_sel = time_ref_sel; + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD9, dw9.val); + } + if (!err) + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD19, dw19.val); + if (!err) + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD22, dw22.val); + /* first write dw24 with updated values but still not enabled */ + if (!err) + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD24, dw24.val); + /* now enable the TS_PLL */ + if (!err) { + dw24.field.ts_pll_enable = 1; + err = ice_cgu_reg_write(pf, NAC_CGU_DWORD24, dw24.val); + } + + if (!err) { + cgu_info->time_ref_freq = time_ref_freq; + cgu_info->src_tmr_mode = src_tmr_mode; + err = ice_ptp_update_incval(pf, time_ref_freq, src_tmr_mode); + if (err) { + dev_err(ice_pf_to_dev(pf), "Failed to update INCVAL\n"); + return err; + } + } + + /* to check for lock, wait 1 ms; if it hasn't locked by then, it's not + * going to lock + */ + if (!err) { + ice_cgu_usleep(1000); + err = ice_cgu_reg_read(pf, TSPLL_RO_BWM_LF, &bwm_lf.val); + } + if (!err && bwm_lf.field.plllock_true_lock_cri) { + dev_dbg(ice_pf_to_dev(pf), + "TS PLL successfully locked, time_ref_freq %s, time_ref_sel %s\n", + ICE_TIME_REF_FREQ_TO_STR(time_ref_freq), + ICE_TIME_REF_SEL_TO_STR(time_ref_sel)); + + /* update state to indicate no unlock event since last lock */ + cgu_info->unlock_event = false; + } else { + dev_err(ice_pf_to_dev(pf), "TS PLL failed to lock\n"); + err = -EFAULT; + } + + return err; +} + +/** + * ice_cgu_init_state - Initialize CGU HW + * @pf: Board private structure + * + * Read CGU registers, initialize internal state, and lock the timestamp PLL using the parameters + * read from the soft straps. 
+ */ +void ice_cgu_init_state(struct ice_pf *pf) +{ + union tspll_cntr_bist_settings tspll_cntr_bist; + struct ice_cgu_info *cgu_info = &pf->cgu_info; + union nac_cgu_dword10 dw10; + union nac_cgu_dword11 dw11; + union nac_cgu_dword12 dw12; + union nac_cgu_dword14 dw14; + union nac_cgu_dword24 dw24; + union nac_cgu_dword9 dw9; + int err; + + init_waitqueue_head(&cgu_info->wq_head); + mutex_init(&cgu_info->event_mutex); + + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD9, &dw9.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD11, &dw11.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD12, &dw12.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD14, &dw14.val); + if (!err) + err = ice_cgu_reg_read(pf, NAC_CGU_DWORD24, &dw24.val); + if (!err) + err = ice_cgu_reg_read(pf, TSPLL_CNTR_BIST_SETTINGS, &tspll_cntr_bist.val); + if (err) + goto err; + + /* Note that the TIME_SYNC, TIME_REF, and ONE_PPS_OUT pins are enabled + * through soft straps. + */ + /* Mux config */ + cgu_info->mux_cfg.ref_clk_src = dw10.field.synce_s_ref_clk; + cgu_info->mux_cfg.byp_clk_src = dw11.field.synce_s_byp_clk; + cgu_info->mux_cfg.eth_clk_out = dw10.field.synce_ethclko_sel; + cgu_info->mux_cfg.clk_out = dw10.field.synce_clko_sel; + cgu_info->mux_cfg.clk_out_div = dw10.field.synce_clkodiv_m1; + cgu_info->mux_cfg.eth_clk_out_div = dw10.field.synce_ethdiv_m1; + cgu_info->mux_cfg.bypass = dw12.field.synce_dpll_byp; + cgu_info->mux_cfg.ref_clk_gnd_ena = dw10.field.synce_sel_gnd; + + /* Timestamp PLL config */ + /* Disable sticky lock detection so lock status reported is accurate */ + tspll_cntr_bist.field.i_plllock_sel_0 = 0; + tspll_cntr_bist.field.i_plllock_sel_1 = 0; + err = ice_cgu_reg_write(pf, TSPLL_CNTR_BIST_SETTINGS, tspll_cntr_bist.val); + + /* Assume the 1588 output to CGU isn't configured; require the app to reconfigure it before + * using it + */ + if (!err) + cgu_info->out_1588_enabled = false; + + /* first, try to lock the timestamp PLL with the parameters from the soft straps */ + /* disable first, then re-enable with correct parameters */ + err = ice_cgu_cfg_ts_pll(pf, false, dw9.field.time_ref_freq_sel, dw24.field.time_ref_sel, + ICE_SRC_TMR_MODE_NANOSECONDS); + if (err) + dev_err(ice_pf_to_dev(pf), "Failed to disable TS PLL\n"); + else + err = ice_cgu_cfg_ts_pll(pf, true, dw9.field.time_ref_freq_sel, + dw24.field.time_ref_sel, ICE_SRC_TMR_MODE_NANOSECONDS); + if (err) { + /* if that fails, try to lock the timestamp PLL with the TCXO + */ + dev_info(ice_pf_to_dev(pf), + "Unable to lock TS PLL with soft straps settings; trying TCXO\n"); + + /* disable first, then re-enable with correct parameters */ + err = ice_cgu_cfg_ts_pll(pf, false, ICE_TIME_REF_FREQ_25_000, + ICE_CGU_TIME_REF_SEL_TCXO, + ICE_SRC_TMR_MODE_NANOSECONDS); + if (err) + dev_err(ice_pf_to_dev(pf), "Failed to disable TS PLL with TCXO\n"); + else + err = ice_cgu_cfg_ts_pll(pf, true, ICE_TIME_REF_FREQ_25_000, + ICE_CGU_TIME_REF_SEL_TCXO, + ICE_SRC_TMR_MODE_NANOSECONDS); + if (err) { + dev_err(ice_pf_to_dev(pf), "Failed to lock TS PLL with TCXO\n"); + goto err; + } + } + + dev_info(ice_pf_to_dev(pf), "CGU init successful\n"); + return; +err: + dev_err(ice_pf_to_dev(pf), "CGU init failed, err=%d\n", err); +} diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_ops.h b/drivers/net/ethernet/intel/ice/ice_cgu_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9ba1ad7d939fd2b8af7262a2ddbc26a2a111e129 --- /dev/null +++ 
b/drivers/net/ethernet/intel/ice/ice_cgu_ops.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_CGU_OPS_H_ +#define _ICE_CGU_OPS_H_ + +#define ICE_CGU_LOCK_CHECK_DELAY_USEC 256000 /* 256 msec */ + +/* fast mode lock check settings */ +#define ICE_CGU_EDPLL_FAST_LOCK_DELAY_LOOPS 239 /* 60 seconds total */ +#define ICE_CGU_TDPLL_FAST_LOCK_DELAY_LOOPS 25 /* 5 seconds total */ + +/* normal mode lock check settings */ +#define ICE_CGU_EDPLL_NORMAL_LOCK_DELAY_LOOPS 52 /* 12 seconds total */ +#define ICE_CGU_TDPLL_NORMAL_LOCK_DELAY_LOOPS 13 /* 2 seconds total */ + +/* number of consecutive locks to declare DPLL lock */ +#define ICE_CGU_DPLL_LOCK_COUNT 5 + +#define ICE_CGU_CORE_CLOCK_MHZ 800 +#define ICE_CGU_DPLL_FREQ_MHZ 25 + +/* DPLL lock/unlock threshold */ +#define ICE_CGU_TRANSPORT_DPLL_LOCK_THRESHOLD_800MHZ 0x2D8 +#define ICE_CGU_TRANSPORT_DPLL_UNLOCK_THRESHOLD_800MHZ 0x3640 +#define ICE_CGU_ECC_DPLL_LOCK_THRESHOLD_800MHZ 0x5A +#define ICE_CGU_ECC_DPLL_UNLOCK_THRESHOLD_800MHZ 0x21E8 + +/* time to hold enable bits low to perform a JAPLL reset */ +#define ICE_CGU_JAPLL_RESET_TIME_USEC 1 + +/* LCPLL lock alone (FDPLL disabled) should take < 10 usec */ +#define ICE_CGU_LCPLL_LOCK_CHECK_DELAY_USEC 1 +#define ICE_CGU_LCPLL_LOCK_DELAY_LOOPS 10 + +/* FDPLL lock time in fast mode is around 500 msec; + * use poll interval of 100ms, max poll time 5 seconds + * (max poll time was originally 2 seconds, increased + * to 5 to avoid occasional poll timeouts.) + */ +#define ICE_CGU_FDPLL_LOCK_CHECK_DELAY_USEC 100000 +#define ICE_CGU_FDPLL_LOCK_DELAY_LOOPS 50 +#define ICE_CGU_FDPLL_ACQ_TOGGLE_LOOPS 2 + + +/* valid values for enum ice_cgu_clko_sel */ +#define ICE_CGU_CLKO_SEL_VALID_BITMAP \ + (BIT(ICE_CGU_CLKO_SEL_REF_CLK_BYP0_DIV) | \ + BIT(ICE_CGU_CLKO_SEL_REF_CLK_BYP1_DIV) | \ + BIT(ICE_CGU_CLKO_SEL_CLK_SYS_DIV) | \ + BIT(ICE_CGU_CLKO_SEL_CLK_JAPLL_625000_DIV) | \ + BIT(ICE_CGU_CLKO_SEL_REF_CLK_BYP0) | \ + BIT(ICE_CGU_CLKO_SEL_REF_CLK_BYP1) | \ + BIT(ICE_CGU_CLKO_SEL_CLK_1544) | \ + BIT(ICE_CGU_CLKO_SEL_CLK_2048)) + +/* Only FW can read NAC_CGU_DWORD8 where these are defined, so they are exposed + * to the driver stack via soft straps in the misc24 field of NAC_CGU_DWORD9. 
+ */ +#define MISC24_BIT_TCXO_FREQ_SEL_M BIT(0) +#define MISC24_BIT_TCXO_SEL_M BIT(4) + +/* internal structure definitions */ + +enum ice_cgu_sample_rate { + ICE_CGU_SAMPLE_RATE_8K = 0, /* 8 KHz sample rate */ + ICE_CGU_SAMPLE_RATE_10K, /* 10 KHz sample rate */ + ICE_CGU_SAMPLE_RATE_12K5, /* 12.5 KHz sample rate */ + + NUM_ICE_CGU_SAMPLE_RATE +}; + +struct ice_cgu_div_rat_m1 { + u32 ref_clk_rate; /* reference clock rate in kHz */ + u32 div_rat_m1; /* div_rat_m1 value */ +}; + +struct ice_cgu_dpll_params { + enum ice_cgu_dpll_select dpll_select; + enum ice_cgu_sample_rate sample_rate; + u32 mul_rat_m1; + u32 scale; + u32 gain; +}; + +struct ice_cgu_dpll_per_rate_params { + u32 rate_hz; + enum ice_cgu_sample_rate sample_rate; + u32 div_rat_m1; + u32 synce_rat_sel; +}; + +struct ice_cgu_lcpll_per_rate_params { + u32 refclk_pre_div; + u32 feedback_div; + u32 frac_n_div; + u32 post_pll_div; +}; + +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) +/* Function to init internal state */ +void ice_cgu_init_state(struct ice_pf *pf); +/* Function to configure TS PLL */ +int +ice_cgu_cfg_ts_pll(struct ice_pf *pf, bool enable, enum ice_time_ref_freq time_ref_freq, + enum ice_cgu_time_ref_sel time_ref_sel, + enum ice_src_tmr_mode src_tmr_mode); +#else /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */ +static inline void ice_cgu_init_state(struct ice_pf *pf) { } +#if IS_ENABLED(CONFIG_DEBUG_FS) +static inline int +ice_cgu_cfg_ts_pll(struct ice_pf __always_unused *pf, bool __always_unused enable, + enum ice_time_ref_freq __always_unused time_ref_freq, + enum ice_cgu_time_ref_sel __always_unused time_ref_sel, + enum ice_src_tmr_mode __always_unused src_tmr_mode) +{ + return 0; +} +#endif /* IS_ENABLED(CONFIG_DEBUG_FS) */ +#endif /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */ +#endif /* _ICE_CGU_OPS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_regs.h b/drivers/net/ethernet/intel/ice/ice_cgu_regs.h new file mode 100644 index 0000000000000000000000000000000000000000..a58fc697e6f5a2ca3b7e12a81ec56f11f80f0f86 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_regs.h @@ -0,0 +1,941 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_CGU_REGS_H_ +#define _ICE_CGU_REGS_H_ + +#include "ice_osdep.h" + +#define NAC_CGU_DWORD8 0x20 +#define NAC_CGU_DWORD8_TCXO_FREQ_SEL_S 0 +#define NAC_CGU_DWORD8_TCXO_FREQ_SEL_M BIT(0) +#define NAC_CGU_DWORD8_MISC8_S 1 +#define NAC_CGU_DWORD8_MISC8_M ICE_M(0x7, 1) +#define NAC_CGU_DWORD8_HLP_SWITCH_FREQ_SEL_S 4 +#define NAC_CGU_DWORD8_HLP_SWITCH_FREQ_SEL_M ICE_M(0xf, 4) +#define NAC_CGU_DWORD8_CGUPLL_NDIVRATIO_S 8 +#define NAC_CGU_DWORD8_CGUPLL_NDIVRATIO_M ICE_M(0xf, 8) +#define NAC_CGU_DWORD8_CGUPLL_IREF_NDIVRATIO_S 12 +#define NAC_CGU_DWORD8_CGUPLL_IREF_NDIVRATIO_M ICE_M(0x7, 12) +#define NAC_CGU_DWORD8_MISC28_S 15 +#define NAC_CGU_DWORD8_MISC28_M BIT(15) +#define NAC_CGU_DWORD8_HLPPLL_NDIVRATIO_S 16 +#define NAC_CGU_DWORD8_HLPPLL_NDIVRATIO_M ICE_M(0xf, 16) +#define NAC_CGU_DWORD8_HLPPLL_IREF_NDIVRATIO_S 20 +#define NAC_CGU_DWORD8_HLPPLL_IREF_NDIVRATIO_M ICE_M(0x7, 20) +#define NAC_CGU_DWORD8_MISC29_S 23 +#define NAC_CGU_DWORD8_MISC29_M BIT(23) +#define NAC_CGU_DWORD8_CLK_EREF1_EN_SELFBIAS_S 24 +#define NAC_CGU_DWORD8_CLK_EREF1_EN_SELFBIAS_M BIT(24) +#define NAC_CGU_DWORD8_CLK_EREF0_EN_SELFBIAS_S 25 +#define NAC_CGU_DWORD8_CLK_EREF0_EN_SELFBIAS_M BIT(25) +#define NAC_CGU_DWORD8_TIME_REF_EN_SELFBIAS_S 26 +#define NAC_CGU_DWORD8_TIME_REF_EN_SELFBIAS_M BIT(26) +#define NAC_CGU_DWORD8_TIME_SYNC_EN_SELFBIAS_S 27 +#define NAC_CGU_DWORD8_TIME_SYNC_EN_SELFBIAS_M BIT(27) +#define NAC_CGU_DWORD8_CLK_REF_SYNC_E_EN_SELFBIAS_S 28 +#define NAC_CGU_DWORD8_CLK_REF_SYNC_E_EN_SELFBIAS_M BIT(28) +#define NAC_CGU_DWORD8_NET_CLK_REF1_EN_SELFBIAS_S 29 +#define NAC_CGU_DWORD8_NET_CLK_REF1_EN_SELFBIAS_M BIT(29) +#define NAC_CGU_DWORD8_NET_CLK_REF0_EN_SELFBIAS_S 30 +#define NAC_CGU_DWORD8_NET_CLK_REF0_EN_SELFBIAS_M BIT(30) +#define NAC_CGU_DWORD8_TCXO_SEL_S 31 +#define NAC_CGU_DWORD8_TCXO_SEL_M BIT(31) + +union nac_cgu_dword8 { + struct { + u32 tcxo_freq_sel : 1; + u32 misc8 : 3; + u32 hlp_switch_freq_sel : 4; + u32 cgupll_ndivratio : 4; + u32 cgupll_iref_ndivratio : 3; + u32 misc28 : 1; + u32 hlppll_ndivratio : 4; + u32 hlppll_iref_ndivratio : 3; + u32 misc29 : 1; + u32 clk_eref1_en_selfbias : 1; + u32 clk_eref0_en_selfbias : 1; + u32 time_ref_en_selfbias : 1; + u32 time_sync_en_selfbias : 1; + u32 clk_ref_sync_e_en_selfbias : 1; + u32 net_clk_ref1_en_selfbias : 1; + u32 net_clk_ref0_en_selfbias : 1; + u32 tcxo_sel : 1; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD9 0x24 +#define NAC_CGU_DWORD9_TIME_REF_FREQ_SEL_S 0 +#define NAC_CGU_DWORD9_TIME_REF_FREQ_SEL_M ICE_M(0x7, 0) +#define NAC_CGU_DWORD9_CLK_EREF1_EN_S 3 +#define NAC_CGU_DWORD9_CLK_EREF1_EN_M BIT(3) +#define NAC_CGU_DWORD9_CLK_EREF0_EN_S 4 +#define NAC_CGU_DWORD9_CLK_EREF0_EN_M BIT(4) +#define NAC_CGU_DWORD9_TIME_REF_EN_S 5 +#define NAC_CGU_DWORD9_TIME_REF_EN_M BIT(5) +#define NAC_CGU_DWORD9_TIME_SYNC_EN_S 6 +#define NAC_CGU_DWORD9_TIME_SYNC_EN_M BIT(6) +#define NAC_CGU_DWORD9_ONE_PPS_OUT_EN_S 7 +#define NAC_CGU_DWORD9_ONE_PPS_OUT_EN_M BIT(7) +#define NAC_CGU_DWORD9_CLK_REF_SYNCE_EN_S 8 +#define NAC_CGU_DWORD9_CLK_REF_SYNCE_EN_M BIT(8) +#define NAC_CGU_DWORD9_CLK_SYNCE1_EN_S 9 +#define NAC_CGU_DWORD9_CLK_SYNCE1_EN_M BIT(9) +#define NAC_CGU_DWORD9_CLK_SYNCE0_EN_S 10 +#define NAC_CGU_DWORD9_CLK_SYNCE0_EN_M BIT(10) +#define NAC_CGU_DWORD9_NET_CLK_REF1_EN_S 11 +#define NAC_CGU_DWORD9_NET_CLK_REF1_EN_M BIT(11) +#define NAC_CGU_DWORD9_NET_CLK_REF0_EN_S 12 +#define NAC_CGU_DWORD9_NET_CLK_REF0_EN_M BIT(12) +#define NAC_CGU_DWORD9_CLK_SYNCE1_AMP_S 13 +#define NAC_CGU_DWORD9_CLK_SYNCE1_AMP_M ICE_M(0x3, 13) +#define NAC_CGU_DWORD9_MISC6_S 
15 +#define NAC_CGU_DWORD9_MISC6_M BIT(15) +#define NAC_CGU_DWORD9_CLK_SYNCE0_AMP_S 16 +#define NAC_CGU_DWORD9_CLK_SYNCE0_AMP_M ICE_M(0x3, 16) +#define NAC_CGU_DWORD9_ONE_PPS_OUT_AMP_S 18 +#define NAC_CGU_DWORD9_ONE_PPS_OUT_AMP_M ICE_M(0x3, 18) +#define NAC_CGU_DWORD9_MISC24_S 20 +#define NAC_CGU_DWORD9_MISC24_M ICE_M(0xfff, 20) + +union nac_cgu_dword9 { + struct { + u32 time_ref_freq_sel : 3; + u32 clk_eref1_en : 1; + u32 clk_eref0_en : 1; + u32 time_ref_en : 1; + u32 time_sync_en : 1; + u32 one_pps_out_en : 1; + u32 clk_ref_synce_en : 1; + u32 clk_synce1_en : 1; + u32 clk_synce0_en : 1; + u32 net_clk_ref1_en : 1; + u32 net_clk_ref0_en : 1; + u32 clk_synce1_amp : 2; + u32 misc6 : 1; + u32 clk_synce0_amp : 2; + u32 one_pps_out_amp : 2; + u32 misc24 : 12; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD10 0x28 +#define NAC_CGU_DWORD10_JA_PLL_ENABLE_S 0 +#define NAC_CGU_DWORD10_JA_PLL_ENABLE_M BIT(0) +#define NAC_CGU_DWORD10_MISC11_S 1 +#define NAC_CGU_DWORD10_MISC11_M BIT(1) +#define NAC_CGU_DWORD10_FDPLL_ENABLE_S 2 +#define NAC_CGU_DWORD10_FDPLL_ENABLE_M BIT(2) +#define NAC_CGU_DWORD10_FDPLL_SLOW_S 3 +#define NAC_CGU_DWORD10_FDPLL_SLOW_M BIT(3) +#define NAC_CGU_DWORD10_FDPLL_LOCK_INT_ENB_S 4 +#define NAC_CGU_DWORD10_FDPLL_LOCK_INT_ENB_M BIT(4) +#define NAC_CGU_DWORD10_SYNCE_CLKO_SEL_S 5 +#define NAC_CGU_DWORD10_SYNCE_CLKO_SEL_M ICE_M(0xf, 5) +#define NAC_CGU_DWORD10_SYNCE_CLKODIV_M1_S 9 +#define NAC_CGU_DWORD10_SYNCE_CLKODIV_M1_M ICE_M(0x1f, 9) +#define NAC_CGU_DWORD10_SYNCE_CLKODIV_LOAD_S 14 +#define NAC_CGU_DWORD10_SYNCE_CLKODIV_LOAD_M BIT(14) +#define NAC_CGU_DWORD10_SYNCE_DCK_RST_S 15 +#define NAC_CGU_DWORD10_SYNCE_DCK_RST_M BIT(15) +#define NAC_CGU_DWORD10_SYNCE_ETHCLKO_SEL_S 16 +#define NAC_CGU_DWORD10_SYNCE_ETHCLKO_SEL_M ICE_M(0x7, 16) +#define NAC_CGU_DWORD10_SYNCE_ETHDIV_M1_S 19 +#define NAC_CGU_DWORD10_SYNCE_ETHDIV_M1_M ICE_M(0x1f, 19) +#define NAC_CGU_DWORD10_SYNCE_ETHDIV_LOAD_S 24 +#define NAC_CGU_DWORD10_SYNCE_ETHDIV_LOAD_M BIT(24) +#define NAC_CGU_DWORD10_SYNCE_DCK2_RST_S 25 +#define NAC_CGU_DWORD10_SYNCE_DCK2_RST_M BIT(25) +#define NAC_CGU_DWORD10_SYNCE_SEL_GND_S 26 +#define NAC_CGU_DWORD10_SYNCE_SEL_GND_M BIT(26) +#define NAC_CGU_DWORD10_SYNCE_S_REF_CLK_S 27 +#define NAC_CGU_DWORD10_SYNCE_S_REF_CLK_M ICE_M(0x1f, 27) + +union nac_cgu_dword10 { + struct { + u32 ja_pll_enable : 1; + u32 misc11 : 1; + u32 fdpll_enable : 1; + u32 fdpll_slow : 1; + u32 fdpll_lock_int_enb : 1; + u32 synce_clko_sel : 4; + u32 synce_clkodiv_m1 : 5; + u32 synce_clkodiv_load : 1; + u32 synce_dck_rst : 1; + u32 synce_ethclko_sel : 3; + u32 synce_ethdiv_m1 : 5; + u32 synce_ethdiv_load : 1; + u32 synce_dck2_rst : 1; + u32 synce_sel_gnd : 1; + u32 synce_s_ref_clk : 5; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD11 0x2c +#define NAC_CGU_DWORD11_MISC25_S 0 +#define NAC_CGU_DWORD11_MISC25_M BIT(0) +#define NAC_CGU_DWORD11_SYNCE_S_BYP_CLK_S 1 +#define NAC_CGU_DWORD11_SYNCE_S_BYP_CLK_M ICE_M(0x3f, 1) +#define NAC_CGU_DWORD11_SYNCE_HDOV_MODE_S 7 +#define NAC_CGU_DWORD11_SYNCE_HDOV_MODE_M BIT(7) +#define NAC_CGU_DWORD11_SYNCE_RAT_SEL_S 8 +#define NAC_CGU_DWORD11_SYNCE_RAT_SEL_M ICE_M(0x3, 8) +#define NAC_CGU_DWORD11_SYNCE_LINK_ENABLE_S 10 +#define NAC_CGU_DWORD11_SYNCE_LINK_ENABLE_M ICE_M(0xfffff, 10) +#define NAC_CGU_DWORD11_SYNCE_MISCLK_EN_S 30 +#define NAC_CGU_DWORD11_SYNCE_MISCLK_EN_M BIT(30) +#define NAC_CGU_DWORD11_SYNCE_MISCLK_RAT_M1_S 31 +#define NAC_CGU_DWORD11_SYNCE_MISCLK_RAT_M1_M BIT(31) + +union nac_cgu_dword11 { + struct { + u32 misc25 : 1; + u32 synce_s_byp_clk : 6; + u32 synce_hdov_mode : 
1; + u32 synce_rat_sel : 2; + u32 synce_link_enable : 20; + u32 synce_misclk_en : 1; + u32 synce_misclk_rat_m1 : 1; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD12 0x30 +#define NAC_CGU_DWORD12_SYNCE_MISCLK_RAT_M1_S 0 +#define NAC_CGU_DWORD12_SYNCE_MISCLK_RAT_M1_M ICE_M(0x3ff, 0) +#define NAC_CGU_DWORD12_SYNCE_MCK_RST_S 10 +#define NAC_CGU_DWORD12_SYNCE_MCK_RST_M BIT(10) +#define NAC_CGU_DWORD12_SYNCE_DPLL_BYP_S 11 +#define NAC_CGU_DWORD12_SYNCE_DPLL_BYP_M BIT(11) +#define NAC_CGU_DWORD12_SYNCE_DV_RAT_M1_S 12 +#define NAC_CGU_DWORD12_SYNCE_DV_RAT_M1_M ICE_M(0x1fff, 12) +#define NAC_CGU_DWORD12_SYNCE_ML_RAT_M1_S 25 +#define NAC_CGU_DWORD12_SYNCE_ML_RAT_M1_M ICE_M(0x7f, 25) + +union nac_cgu_dword12 { + struct { + u32 synce_misclk_rat_m1 : 10; + u32 synce_mck_rst : 1; + u32 synce_dpll_byp : 1; + u32 synce_dv_rat_m1 : 13; + u32 synce_ml_rat_m1 : 7; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD13 0x34 +#define NAC_CGU_DWORD13_SYNCE_ML_RAT_M1_S 0 +#define NAC_CGU_DWORD13_SYNCE_ML_RAT_M1_M ICE_M(0x1f, 0) +#define NAC_CGU_DWORD13_SYNCE_HDOV_CHANGED_S 5 +#define NAC_CGU_DWORD13_SYNCE_HDOV_CHANGED_M BIT(5) +#define NAC_CGU_DWORD13_SYNCE_LOCK_CHANGED_S 6 +#define NAC_CGU_DWORD13_SYNCE_LOCK_CHANGED_M BIT(6) +#define NAC_CGU_DWORD13_SYNCE_HDOV_S 7 +#define NAC_CGU_DWORD13_SYNCE_HDOV_M BIT(7) +#define NAC_CGU_DWORD13_SYNCE_HDOV_INT_ENB_S 8 +#define NAC_CGU_DWORD13_SYNCE_HDOV_INT_ENB_M BIT(8) +#define NAC_CGU_DWORD13_SYNCE_LOCK_INT_ENB_S 9 +#define NAC_CGU_DWORD13_SYNCE_LOCK_INT_ENB_M BIT(9) +#define NAC_CGU_DWORD13_SYNCE_LOCKED_NC_S 10 +#define NAC_CGU_DWORD13_SYNCE_LOCKED_NC_M BIT(10) +#define NAC_CGU_DWORD13_FDPLL_LOCKED_NC_S 11 +#define NAC_CGU_DWORD13_FDPLL_LOCKED_NC_M BIT(11) +#define NAC_CGU_DWORD13_SYNCE_LOCKED_CLEAR_S 12 +#define NAC_CGU_DWORD13_SYNCE_LOCKED_CLEAR_M BIT(12) +#define NAC_CGU_DWORD13_SYNCE_HDOV_CLEAR_S 13 +#define NAC_CGU_DWORD13_SYNCE_HDOV_CLEAR_M BIT(13) +#define NAC_CGU_DWORD13_FDPLL_LOCKED_CLEAR_S 14 +#define NAC_CGU_DWORD13_FDPLL_LOCKED_CLEAR_M BIT(14) +#define NAC_CGU_DWORD13_FDPLL_LOCK_CHANGED_S 15 +#define NAC_CGU_DWORD13_FDPLL_LOCK_CHANGED_M BIT(15) +#define NAC_CGU_DWORD13_RMNRXCLK_SEL_S 16 +#define NAC_CGU_DWORD13_RMNRXCLK_SEL_M ICE_M(0x1f, 16) +#define NAC_CGU_DWORD13_ENABLE_ETH_COUNT_S 21 +#define NAC_CGU_DWORD13_ENABLE_ETH_COUNT_M BIT(21) +#define NAC_CGU_DWORD13_ETH_COUNT_FAST_MODE_S 22 +#define NAC_CGU_DWORD13_ETH_COUNT_FAST_MODE_M BIT(22) +#define NAC_CGU_DWORD13_MISC12_S 23 +#define NAC_CGU_DWORD13_MISC12_M ICE_M(0x1ff, 23) + +union nac_cgu_dword13 { + struct { + u32 synce_ml_rat_m1 : 5; + u32 synce_hdov_changed : 1; + u32 synce_lock_changed : 1; + u32 synce_hdov : 1; + u32 synce_hdov_int_enb : 1; + u32 synce_lock_int_enb : 1; + u32 synce_locked_nc : 1; + u32 fdpll_locked_nc : 1; + u32 synce_locked_clear : 1; + u32 synce_hdov_clear : 1; + u32 fdpll_locked_clear : 1; + u32 fdpll_lock_changed : 1; + u32 rmnrxclk_sel : 5; + u32 enable_eth_count : 1; + u32 eth_count_fast_mode : 1; + u32 misc12 : 9; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD14 0x38 +#define NAC_CGU_DWORD14_SYNCE_LNK_UP_MD_S 0 +#define NAC_CGU_DWORD14_SYNCE_LNK_UP_MD_M BIT(0) +#define NAC_CGU_DWORD14_SYNCE_LNK_DN_MD_S 1 +#define NAC_CGU_DWORD14_SYNCE_LNK_DN_MD_M BIT(1) +#define NAC_CGU_DWORD14_SYNCE_FAST_MODE_S 2 +#define NAC_CGU_DWORD14_SYNCE_FAST_MODE_M BIT(2) +#define NAC_CGU_DWORD14_SYNCE_EEC_MODE_S 3 +#define NAC_CGU_DWORD14_SYNCE_EEC_MODE_M BIT(3) +#define NAC_CGU_DWORD14_SYNCE_NGAIN_S 4 +#define NAC_CGU_DWORD14_SYNCE_NGAIN_M ICE_M(0xff, 4) +#define 
NAC_CGU_DWORD14_SYNCE_NSCALE_S 12 +#define NAC_CGU_DWORD14_SYNCE_NSCALE_M ICE_M(0x3f, 12) +#define NAC_CGU_DWORD14_SYNCE_UNLCK_THR_S 18 +#define NAC_CGU_DWORD14_SYNCE_UNLCK_THR_M ICE_M(0x3fff, 18) + +union nac_cgu_dword14 { + struct { + u32 synce_lnk_up_md : 1; + u32 synce_lnk_dn_md : 1; + u32 synce_fast_mode : 1; + u32 synce_eec_mode : 1; + u32 synce_ngain : 8; + u32 synce_nscale : 6; + u32 synce_unlck_thr : 14; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD15 0x3c +#define NAC_CGU_DWORD15_SYNCE_UNLCK_THR_S 0 +#define NAC_CGU_DWORD15_SYNCE_UNLCK_THR_M ICE_M(0x7, 0) +#define NAC_CGU_DWORD15_SYNCE_LOCK_THR_S 3 +#define NAC_CGU_DWORD15_SYNCE_LOCK_THR_M ICE_M(0x1ffff, 3) +#define NAC_CGU_DWORD15_SYNCE_QUO_M1_S 20 +#define NAC_CGU_DWORD15_SYNCE_QUO_M1_M ICE_M(0x3f, 20) +#define NAC_CGU_DWORD15_SYNCE_REMNDR_S 26 +#define NAC_CGU_DWORD15_SYNCE_REMNDR_M ICE_M(0x3f, 26) + +union nac_cgu_dword15 { + struct { + u32 synce_unlck_thr : 3; + u32 synce_lock_thr : 17; + u32 synce_quo_m1 : 6; + u32 synce_remndr : 6; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD16 0x40 +#define NAC_CGU_DWORD16_SYNCE_REMNDR_S 0 +#define NAC_CGU_DWORD16_SYNCE_REMNDR_M ICE_M(0x3f, 0) +#define NAC_CGU_DWORD16_SYNCE_PHLMT_EN_S 6 +#define NAC_CGU_DWORD16_SYNCE_PHLMT_EN_M BIT(6) +#define NAC_CGU_DWORD16_MISC13_S 7 +#define NAC_CGU_DWORD16_MISC13_M ICE_M(0x1ffffff, 7) + +union nac_cgu_dword16 { + struct { + u32 synce_remndr : 6; + u32 synce_phlmt_en : 1; + u32 misc13 : 25; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD17 0x44 +#define NAC_CGU_DWORD17_FDPLL_GAIN_S 0 +#define NAC_CGU_DWORD17_FDPLL_GAIN_M ICE_M(0xf, 0) +#define NAC_CGU_DWORD17_FDPLL_SCALE_S 4 +#define NAC_CGU_DWORD17_FDPLL_SCALE_M ICE_M(0xf, 4) +#define NAC_CGU_DWORD17_FDPLL_FGAIN_SHIFT_F_S 8 +#define NAC_CGU_DWORD17_FDPLL_FGAIN_SHIFT_F_M ICE_M(0x3f, 8) +#define NAC_CGU_DWORD17_FDPLL_CLR_PHERR_S 14 +#define NAC_CGU_DWORD17_FDPLL_CLR_PHERR_M BIT(14) +#define NAC_CGU_DWORD17_FDPLL_BB_EN_S 15 +#define NAC_CGU_DWORD17_FDPLL_BB_EN_M BIT(15) +#define NAC_CGU_DWORD17_FDPLL_FGAIN_SHIFT_S 16 +#define NAC_CGU_DWORD17_FDPLL_FGAIN_SHIFT_M ICE_M(0x3f, 16) +#define NAC_CGU_DWORD17_FDPLL_FSCALE_SHIFT_S 22 +#define NAC_CGU_DWORD17_FDPLL_FSCALE_SHIFT_M ICE_M(0x1f, 22) +#define NAC_CGU_DWORD17_FDPLL_FSCALE_SHIFT_F_S 27 +#define NAC_CGU_DWORD17_FDPLL_FSCALE_SHIFT_F_M ICE_M(0x1f, 27) + +union nac_cgu_dword17 { + struct { + u32 fdpll_gain : 4; + u32 fdpll_scale : 4; + u32 fdpll_fgain_shift_f : 6; + u32 fdpll_clr_pherr : 1; + u32 fdpll_bb_en : 1; + u32 fdpll_fgain_shift : 6; + u32 fdpll_fscale_shift : 5; + u32 fdpll_fscale_shift_f : 5; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD18 0x48 +#define NAC_CGU_DWORD18_FDPLL_BYPASS_S 0 +#define NAC_CGU_DWORD18_FDPLL_BYPASS_M BIT(0) +#define NAC_CGU_DWORD18_FDPLL_INP_NCO_S 1 +#define NAC_CGU_DWORD18_FDPLL_INP_NCO_M ICE_M(0xff, 1) +#define NAC_CGU_DWORD18_FDPLL_AUTO_EN_S 9 +#define NAC_CGU_DWORD18_FDPLL_AUTO_EN_M BIT(9) +#define NAC_CGU_DWORD18_FDPLL_SAMP_CNT_S 10 +#define NAC_CGU_DWORD18_FDPLL_SAMP_CNT_M ICE_M(0xfff, 10) +#define NAC_CGU_DWORD18_FDPLL_LOCKCNT_S 22 +#define NAC_CGU_DWORD18_FDPLL_LOCKCNT_M ICE_M(0x1f, 22) +#define NAC_CGU_DWORD18_FDPLL_LOCK_THR_S 27 +#define NAC_CGU_DWORD18_FDPLL_LOCK_THR_M ICE_M(0x1f, 27) + +union nac_cgu_dword18 { + struct { + u32 fdpll_bypass : 1; + u32 fdpll_inp_nco : 8; + u32 fdpll_auto_en : 1; + u32 fdpll_samp_cnt : 12; + u32 fdpll_lockcnt : 5; + u32 fdpll_lock_thr : 5; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD19 0x4c +#define NAC_CGU_DWORD19_TSPLL_FBDIV_INTGR_S 0 +#define 
NAC_CGU_DWORD19_TSPLL_FBDIV_INTGR_M ICE_M(0xff, 0) +#define NAC_CGU_DWORD19_FDPLL_ULCK_THR_S 8 +#define NAC_CGU_DWORD19_FDPLL_ULCK_THR_M ICE_M(0x1f, 8) +#define NAC_CGU_DWORD19_MISC15_S 13 +#define NAC_CGU_DWORD19_MISC15_M ICE_M(0x7, 13) +#define NAC_CGU_DWORD19_TSPLL_NDIVRATIO_S 16 +#define NAC_CGU_DWORD19_TSPLL_NDIVRATIO_M ICE_M(0xf, 16) +#define NAC_CGU_DWORD19_TSPLL_IREF_NDIVRATIO_S 20 +#define NAC_CGU_DWORD19_TSPLL_IREF_NDIVRATIO_M ICE_M(0x7, 20) +#define NAC_CGU_DWORD19_MISC19_S 23 +#define NAC_CGU_DWORD19_MISC19_M BIT(23) +#define NAC_CGU_DWORD19_JAPLL_NDIVRATIO_S 24 +#define NAC_CGU_DWORD19_JAPLL_NDIVRATIO_M ICE_M(0xf, 24) +#define NAC_CGU_DWORD19_JAPLL_IREF_NDIVRATIO_S 28 +#define NAC_CGU_DWORD19_JAPLL_IREF_NDIVRATIO_M ICE_M(0x7, 28) +#define NAC_CGU_DWORD19_MISC27_S 31 +#define NAC_CGU_DWORD19_MISC27_M BIT(31) + +union nac_cgu_dword19 { + struct { + u32 tspll_fbdiv_intgr : 8; + u32 fdpll_ulck_thr : 5; + u32 misc15 : 3; + u32 tspll_ndivratio : 4; + u32 tspll_iref_ndivratio : 3; + u32 misc19 : 1; + u32 japll_ndivratio : 4; + u32 japll_iref_ndivratio : 3; + u32 misc27 : 1; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD20 0x50 +#define NAC_CGU_DWORD20_JAPLL_INT_DIV_S 0 +#define NAC_CGU_DWORD20_JAPLL_INT_DIV_M ICE_M(0xff, 0) +#define NAC_CGU_DWORD20_JAPLL_FRAC_DIV_S 8 +#define NAC_CGU_DWORD20_JAPLL_FRAC_DIV_M ICE_M(0x3fffff, 8) +#define NAC_CGU_DWORD20_MISC16_S 30 +#define NAC_CGU_DWORD20_MISC16_M ICE_M(0x3, 30) + +union nac_cgu_dword20 { + struct { + u32 japll_int_div : 8; + u32 japll_frac_div : 22; + u32 misc16 : 2; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD21 0x54 +#define NAC_CGU_DWORD21_MISC17_S 0 +#define NAC_CGU_DWORD21_MISC17_M ICE_M(0xf, 0) +#define NAC_CGU_DWORD21_FDPLL_INT_DIV_OUT_NC_S 4 +#define NAC_CGU_DWORD21_FDPLL_INT_DIV_OUT_NC_M ICE_M(0xff, 4) +#define NAC_CGU_DWORD21_FDPLL_FRAC_DIV_OUT_NC_S 12 +#define NAC_CGU_DWORD21_FDPLL_FRAC_DIV_OUT_NC_M ICE_M(0xfffff, 12) + +union nac_cgu_dword21 { + struct { + u32 misc17 : 4; + u32 fdpll_int_div_out_nc : 8; + u32 fdpll_frac_div_out_nc : 20; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD22 0x58 +#define NAC_CGU_DWORD22_FDPLL_FRAC_DIV_OUT_NC_S 0 +#define NAC_CGU_DWORD22_FDPLL_FRAC_DIV_OUT_NC_M ICE_M(0x3, 0) +#define NAC_CGU_DWORD22_FDPLL_LOCK_INT_FOR_S 2 +#define NAC_CGU_DWORD22_FDPLL_LOCK_INT_FOR_M BIT(2) +#define NAC_CGU_DWORD22_SYNCE_HDOV_INT_FOR_S 3 +#define NAC_CGU_DWORD22_SYNCE_HDOV_INT_FOR_M BIT(3) +#define NAC_CGU_DWORD22_SYNCE_LOCK_INT_FOR_S 4 +#define NAC_CGU_DWORD22_SYNCE_LOCK_INT_FOR_M BIT(4) +#define NAC_CGU_DWORD22_FDPLL_PHLEAD_SLIP_NC_S 5 +#define NAC_CGU_DWORD22_FDPLL_PHLEAD_SLIP_NC_M BIT(5) +#define NAC_CGU_DWORD22_FDPLL_ACC1_OVFL_NC_S 6 +#define NAC_CGU_DWORD22_FDPLL_ACC1_OVFL_NC_M BIT(6) +#define NAC_CGU_DWORD22_FDPLL_ACC2_OVFL_NC_S 7 +#define NAC_CGU_DWORD22_FDPLL_ACC2_OVFL_NC_M BIT(7) +#define NAC_CGU_DWORD22_SYNCE_STATUS_NC_S 8 +#define NAC_CGU_DWORD22_SYNCE_STATUS_NC_M ICE_M(0x3f, 8) +#define NAC_CGU_DWORD22_FDPLL_ACC1F_OVFL_S 14 +#define NAC_CGU_DWORD22_FDPLL_ACC1F_OVFL_M BIT(14) +#define NAC_CGU_DWORD22_MISC18_S 15 +#define NAC_CGU_DWORD22_MISC18_M BIT(15) +#define NAC_CGU_DWORD22_FDPLLCLK_DIV_S 16 +#define NAC_CGU_DWORD22_FDPLLCLK_DIV_M ICE_M(0xf, 16) +#define NAC_CGU_DWORD22_TIME1588CLK_DIV_S 20 +#define NAC_CGU_DWORD22_TIME1588CLK_DIV_M ICE_M(0xf, 20) +#define NAC_CGU_DWORD22_SYNCECLK_DIV_S 24 +#define NAC_CGU_DWORD22_SYNCECLK_DIV_M ICE_M(0xf, 24) +#define NAC_CGU_DWORD22_SYNCECLK_SEL_DIV2_S 28 +#define NAC_CGU_DWORD22_SYNCECLK_SEL_DIV2_M BIT(28) +#define 
NAC_CGU_DWORD22_FDPLLCLK_SEL_DIV2_S 29 +#define NAC_CGU_DWORD22_FDPLLCLK_SEL_DIV2_M BIT(29) +#define NAC_CGU_DWORD22_TIME1588CLK_SEL_DIV2_S 30 +#define NAC_CGU_DWORD22_TIME1588CLK_SEL_DIV2_M BIT(30) +#define NAC_CGU_DWORD22_MISC3_S 31 +#define NAC_CGU_DWORD22_MISC3_M BIT(31) + +union nac_cgu_dword22 { + struct { + u32 fdpll_frac_div_out_nc : 2; + u32 fdpll_lock_int_for : 1; + u32 synce_hdov_int_for : 1; + u32 synce_lock_int_for : 1; + u32 fdpll_phlead_slip_nc : 1; + u32 fdpll_acc1_ovfl_nc : 1; + u32 fdpll_acc2_ovfl_nc : 1; + u32 synce_status_nc : 6; + u32 fdpll_acc1f_ovfl : 1; + u32 misc18 : 1; + u32 fdpllclk_div : 4; + u32 time1588clk_div : 4; + u32 synceclk_div : 4; + u32 synceclk_sel_div2 : 1; + u32 fdpllclk_sel_div2 : 1; + u32 time1588clk_sel_div2 : 1; + u32 misc3 : 1; + } field; + u32 val; +}; + +#define NAC_CGU_DWORD24 0x60 +#define NAC_CGU_DWORD24_TSPLL_FBDIV_FRAC_S 0 +#define NAC_CGU_DWORD24_TSPLL_FBDIV_FRAC_M ICE_M(0x3fffff, 0) +#define NAC_CGU_DWORD24_MISC20_S 22 +#define NAC_CGU_DWORD24_MISC20_M ICE_M(0x3, 22) +#define NAC_CGU_DWORD24_TS_PLL_ENABLE_S 24 +#define NAC_CGU_DWORD24_TS_PLL_ENABLE_M BIT(24) +#define NAC_CGU_DWORD24_TIME_SYNC_TSPLL_ALIGN_SEL_S 25 +#define NAC_CGU_DWORD24_TIME_SYNC_TSPLL_ALIGN_SEL_M BIT(25) +#define NAC_CGU_DWORD24_EXT_SYNCE_SEL_S 26 +#define NAC_CGU_DWORD24_EXT_SYNCE_SEL_M BIT(26) +#define NAC_CGU_DWORD24_REF1588_CK_DIV_S 27 +#define NAC_CGU_DWORD24_REF1588_CK_DIV_M ICE_M(0xf, 27) +#define NAC_CGU_DWORD24_TIME_REF_SEL_S 31 +#define NAC_CGU_DWORD24_TIME_REF_SEL_M BIT(31) + +union nac_cgu_dword24 { + struct { + u32 tspll_fbdiv_frac : 22; + u32 misc20 : 2; + u32 ts_pll_enable : 1; + u32 time_sync_tspll_align_sel : 1; + u32 ext_synce_sel : 1; + u32 ref1588_ck_div : 4; + u32 time_ref_sel : 1; + } field; + u32 val; +}; + +#define TSPLL_CNTR_BIST_SETTINGS 0x344 +#define TSPLL_CNTR_BIST_SETTINGS_I_IREFGEN_SETTLING_TIME_CNTR_7_0_S 0 +#define TSPLL_CNTR_BIST_SETTINGS_I_IREFGEN_SETTLING_TIME_CNTR_7_0_M \ + ICE_M(0xff, 0) +#define TSPLL_CNTR_BIST_SETTINGS_I_IREFGEN_SETTLING_TIME_RO_STANDBY_1_0_S 8 +#define TSPLL_CNTR_BIST_SETTINGS_I_IREFGEN_SETTLING_TIME_RO_STANDBY_1_0_M \ + ICE_M(0x3, 8) +#define TSPLL_CNTR_BIST_SETTINGS_RESERVED195_S 10 +#define TSPLL_CNTR_BIST_SETTINGS_RESERVED195_M ICE_M(0x1f, 10) +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_SEL_0_S 15 +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_SEL_0_M BIT(15) +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_SEL_1_S 16 +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_SEL_1_M BIT(16) +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_CNT_6_0_S 17 +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_CNT_6_0_M ICE_M(0x7f, 17) +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_CNT_10_7_S 24 +#define TSPLL_CNTR_BIST_SETTINGS_I_PLLLOCK_CNT_10_7_M ICE_M(0xf, 24) +#define TSPLL_CNTR_BIST_SETTINGS_RESERVED200_S 28 +#define TSPLL_CNTR_BIST_SETTINGS_RESERVED200_M ICE_M(0xf, 28) + +union tspll_cntr_bist_settings { + struct { + u32 i_irefgen_settling_time_cntr_7_0 : 8; + u32 i_irefgen_settling_time_ro_standby_1_0 : 2; + u32 reserved195 : 5; + u32 i_plllock_sel_0 : 1; + u32 i_plllock_sel_1 : 1; + u32 i_plllock_cnt_6_0 : 7; + u32 i_plllock_cnt_10_7 : 4; + u32 reserved200 : 4; + } field; + u32 val; +}; + +#define TSPLL_RO_BWM_LF 0x370 +#define TSPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_7_0_S 0 +#define TSPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_7_0_M ICE_M(0xff, 0) +#define TSPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_9_8_S 8 +#define TSPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_9_8_M ICE_M(0x3, 8) +#define TSPLL_RO_BWM_LF_BIASCALDONE_CRI_S 10 +#define TSPLL_RO_BWM_LF_BIASCALDONE_CRI_M BIT(10) 
+#define TSPLL_RO_BWM_LF_PLLLOCK_GAIN_TRAN_CRI_S 11 +#define TSPLL_RO_BWM_LF_PLLLOCK_GAIN_TRAN_CRI_M BIT(11) +#define TSPLL_RO_BWM_LF_PLLLOCK_TRUE_LOCK_CRI_S 12 +#define TSPLL_RO_BWM_LF_PLLLOCK_TRUE_LOCK_CRI_M BIT(12) +#define TSPLL_RO_BWM_LF_PLLUNLOCK_FLAG_CRI_S 13 +#define TSPLL_RO_BWM_LF_PLLUNLOCK_FLAG_CRI_M BIT(13) +#define TSPLL_RO_BWM_LF_AFCERR_CRI_S 14 +#define TSPLL_RO_BWM_LF_AFCERR_CRI_M BIT(14) +#define TSPLL_RO_BWM_LF_AFCDONE_CRI_S 15 +#define TSPLL_RO_BWM_LF_AFCDONE_CRI_M BIT(15) +#define TSPLL_RO_BWM_LF_FEEDFWRDGAIN_CAL_CRI_7_0_S 16 +#define TSPLL_RO_BWM_LF_FEEDFWRDGAIN_CAL_CRI_7_0_M ICE_M(0xff, 16) +#define TSPLL_RO_BWM_LF_M2FBDIVMOD_CRI_7_0_S 24 +#define TSPLL_RO_BWM_LF_M2FBDIVMOD_CRI_7_0_M ICE_M(0xff, 24) + +union tspll_ro_bwm_lf { + struct { + u32 bw_freqov_high_cri_7_0 : 8; + u32 bw_freqov_high_cri_9_8 : 2; + u32 biascaldone_cri : 1; + u32 plllock_gain_tran_cri : 1; + u32 plllock_true_lock_cri : 1; + u32 pllunlock_flag_cri : 1; + u32 afcerr_cri : 1; + u32 afcdone_cri : 1; + u32 feedfwrdgain_cal_cri_7_0 : 8; + u32 m2fbdivmod_cri_7_0 : 8; + } field; + u32 val; +}; + +#define JAPLL_DIV0 0x400 +#define JAPLL_DIV0_I_FBDIV_INTGR_7_0_S 0 +#define JAPLL_DIV0_I_FBDIV_INTGR_7_0_M ICE_M(0xff, 0) +#define JAPLL_DIV0_I_FBDIV_FRAC_7_0_S 8 +#define JAPLL_DIV0_I_FBDIV_FRAC_7_0_M ICE_M(0xff, 8) +#define JAPLL_DIV0_I_FBDIV_FRAC_15_8_S 16 +#define JAPLL_DIV0_I_FBDIV_FRAC_15_8_M ICE_M(0xff, 16) +#define JAPLL_DIV0_I_FBDIV_FRAC_21_16_S 24 +#define JAPLL_DIV0_I_FBDIV_FRAC_21_16_M ICE_M(0x3f, 24) +#define JAPLL_DIV0_I_FRACNEN_H_S 30 +#define JAPLL_DIV0_I_FRACNEN_H_M BIT(30) +#define JAPLL_DIV0_I_DIRECT_PIN_IF_EN_S 31 +#define JAPLL_DIV0_I_DIRECT_PIN_IF_EN_M BIT(31) + +union japll_div0 { + struct { + u32 i_fbdiv_intgr_7_0 : 8; + u32 i_fbdiv_frac_7_0 : 8; + u32 i_fbdiv_frac_15_8 : 8; + u32 i_fbdiv_frac_21_16 : 6; + u32 i_fracnen_h : 1; + u32 i_direct_pin_if_en : 1; + } field; + u32 val; +}; + +#define JAPLL_LF 0x408 +#define JAPLL_LF_I_PROP_COEFF_3_0_S 0 +#define JAPLL_LF_I_PROP_COEFF_3_0_M ICE_M(0xf, 0) +#define JAPLL_LF_I_FLL_INT_COEFF_3_0_S 4 +#define JAPLL_LF_I_FLL_INT_COEFF_3_0_M ICE_M(0xf, 4) +#define JAPLL_LF_I_INT_COEFF_4_0_S 8 +#define JAPLL_LF_I_INT_COEFF_4_0_M ICE_M(0x1f, 8) +#define JAPLL_LF_I_FLL_EN_H_S 13 +#define JAPLL_LF_I_FLL_EN_H_M BIT(13) +#define JAPLL_LF_I_TDC_FINE_RES_S 14 +#define JAPLL_LF_I_TDC_FINE_RES_M BIT(14) +#define JAPLL_LF_I_DCOFINE_RESOLUTION_S 15 +#define JAPLL_LF_I_DCOFINE_RESOLUTION_M BIT(15) +#define JAPLL_LF_I_GAINCTRL_2_0_S 16 +#define JAPLL_LF_I_GAINCTRL_2_0_M ICE_M(0x7, 16) +#define JAPLL_LF_I_AFC_DIVRATIO_S 19 +#define JAPLL_LF_I_AFC_DIVRATIO_M BIT(19) +#define JAPLL_LF_I_AFCCNTSEL_S 20 +#define JAPLL_LF_I_AFCCNTSEL_M BIT(20) +#define JAPLL_LF_I_AFC_STARTUP_1_0_S 21 +#define JAPLL_LF_I_AFC_STARTUP_1_0_M ICE_M(0x3, 21) +#define JAPLL_LF_RESERVED31_S 23 +#define JAPLL_LF_RESERVED31_M BIT(23) +#define JAPLL_LF_I_TDCTARGETCNT_7_0_S 24 +#define JAPLL_LF_I_TDCTARGETCNT_7_0_M ICE_M(0xff, 24) + +union japll_lf { + struct { + u32 i_prop_coeff_3_0 : 4; + u32 i_fll_int_coeff_3_0 : 4; + u32 i_int_coeff_4_0 : 5; + u32 i_fll_en_h : 1; + u32 i_tdc_fine_res : 1; + u32 i_dcofine_resolution : 1; + u32 i_gainctrl_2_0 : 3; + u32 i_afc_divratio : 1; + u32 i_afccntsel : 1; + u32 i_afc_startup_1_0 : 2; + u32 reserved31 : 1; + u32 i_tdctargetcnt_7_0 : 8; + } field; + u32 val; +}; + +#define JAPLL_FRAC_LOCK 0x40c +#define JAPLL_FRAC_LOCK_I_FEEDFWRDGAIN_7_0_S 0 +#define JAPLL_FRAC_LOCK_I_FEEDFWRDGAIN_7_0_M ICE_M(0xff, 0) +#define JAPLL_FRAC_LOCK_I_FEEDFWRDCAL_EN_H_S 8 +#define 
JAPLL_FRAC_LOCK_I_FEEDFWRDCAL_EN_H_M BIT(8) +#define JAPLL_FRAC_LOCK_I_FEEDFWRDCAL_PAUSE_H_S 9 +#define JAPLL_FRAC_LOCK_I_FEEDFWRDCAL_PAUSE_H_M BIT(9) +#define JAPLL_FRAC_LOCK_I_DCODITHEREN_H_S 10 +#define JAPLL_FRAC_LOCK_I_DCODITHEREN_H_M BIT(10) +#define JAPLL_FRAC_LOCK_I_LOCKTHRESH_3_0_S 11 +#define JAPLL_FRAC_LOCK_I_LOCKTHRESH_3_0_M ICE_M(0xf, 11) +#define JAPLL_FRAC_LOCK_I_DCODITHER_CONFIG_S 15 +#define JAPLL_FRAC_LOCK_I_DCODITHER_CONFIG_M BIT(15) +#define JAPLL_FRAC_LOCK_I_EARLYLOCK_CRITERIA_1_0_S 16 +#define JAPLL_FRAC_LOCK_I_EARLYLOCK_CRITERIA_1_0_M ICE_M(0x3, 16) +#define JAPLL_FRAC_LOCK_I_TRUELOCK_CRITERIA_1_0_S 18 +#define JAPLL_FRAC_LOCK_I_TRUELOCK_CRITERIA_1_0_M ICE_M(0x3, 18) +#define JAPLL_FRAC_LOCK_I_LF_HALF_CYC_EN_S 20 +#define JAPLL_FRAC_LOCK_I_LF_HALF_CYC_EN_M BIT(20) +#define JAPLL_FRAC_LOCK_I_DITHER_OVRD_S 21 +#define JAPLL_FRAC_LOCK_I_DITHER_OVRD_M BIT(21) +#define JAPLL_FRAC_LOCK_I_PLLLC_RESTORE_REG_S 22 +#define JAPLL_FRAC_LOCK_I_PLLLC_RESTORE_REG_M BIT(22) +#define JAPLL_FRAC_LOCK_I_PLLLC_RESTORE_MODE_CTRL_S 23 +#define JAPLL_FRAC_LOCK_I_PLLLC_RESTORE_MODE_CTRL_M BIT(23) +#define JAPLL_FRAC_LOCK_I_PLLRAMPEN_H_S 24 +#define JAPLL_FRAC_LOCK_I_PLLRAMPEN_H_M BIT(24) +#define JAPLL_FRAC_LOCK_I_FBDIV_STROBE_H_S 25 +#define JAPLL_FRAC_LOCK_I_FBDIV_STROBE_H_M BIT(25) +#define JAPLL_FRAC_LOCK_I_OVC_SNAPSHOT_H_S 26 +#define JAPLL_FRAC_LOCK_I_OVC_SNAPSHOT_H_M BIT(26) +#define JAPLL_FRAC_LOCK_I_DITHER_VALUE_4_0_S 27 +#define JAPLL_FRAC_LOCK_I_DITHER_VALUE_4_0_M ICE_M(0x1f, 27) + +union japll_frac_lock { + struct { + u32 i_feedfwrdgain_7_0 : 8; + u32 i_feedfwrdcal_en_h : 1; + u32 i_feedfwrdcal_pause_h : 1; + u32 i_dcoditheren_h : 1; + u32 i_lockthresh_3_0 : 4; + u32 i_dcodither_config : 1; + u32 i_earlylock_criteria_1_0 : 2; + u32 i_truelock_criteria_1_0 : 2; + u32 i_lf_half_cyc_en : 1; + u32 i_dither_ovrd : 1; + u32 i_plllc_restore_reg : 1; + u32 i_plllc_restore_mode_ctrl : 1; + u32 i_pllrampen_h : 1; + u32 i_fbdiv_strobe_h : 1; + u32 i_ovc_snapshot_h : 1; + u32 i_dither_value_4_0 : 5; + } field; + u32 val; +}; + +#define JAPLL_BIAS 0x414 +#define JAPLL_BIAS_I_IREFTRIM_4_0_S 0 +#define JAPLL_BIAS_I_IREFTRIM_4_0_M ICE_M(0x1f, 0) +#define JAPLL_BIAS_I_VREF_RDAC_2_0_S 5 +#define JAPLL_BIAS_I_VREF_RDAC_2_0_M ICE_M(0x7, 5) +#define JAPLL_BIAS_I_CTRIM_4_0_S 8 +#define JAPLL_BIAS_I_CTRIM_4_0_M ICE_M(0x1f, 8) +#define JAPLL_BIAS_I_IREF_REFCLK_MODE_1_0_S 13 +#define JAPLL_BIAS_I_IREF_REFCLK_MODE_1_0_M ICE_M(0x3, 13) +#define JAPLL_BIAS_I_BIASCAL_EN_H_S 15 +#define JAPLL_BIAS_I_BIASCAL_EN_H_M BIT(15) +#define JAPLL_BIAS_I_BIAS_BONUS_7_0_S 16 +#define JAPLL_BIAS_I_BIAS_BONUS_7_0_M ICE_M(0xff, 16) +#define JAPLL_BIAS_I_INIT_DCOAMP_5_0_S 24 +#define JAPLL_BIAS_I_INIT_DCOAMP_5_0_M ICE_M(0x3f, 24) +#define JAPLL_BIAS_I_BIAS_GB_SEL_1_0_S 30 +#define JAPLL_BIAS_I_BIAS_GB_SEL_1_0_M ICE_M(0x3, 30) + +union japll_bias { + struct { + u32 i_ireftrim_4_0 : 5; + u32 i_vref_rdac_2_0 : 3; + u32 i_ctrim_4_0 : 5; + u32 i_iref_refclk_mode_1_0 : 2; + u32 i_biascal_en_h : 1; + u32 i_bias_bonus_7_0 : 8; + u32 i_init_dcoamp_5_0 : 6; + u32 i_bias_gb_sel_1_0 : 2; + } field; + u32 val; +}; + +#define JAPLL_TDC_COLDST_BIAS 0x418 +#define JAPLL_TDC_COLDST_BIAS_I_TDCSEL_1_0_S 0 +#define JAPLL_TDC_COLDST_BIAS_I_TDCSEL_1_0_M ICE_M(0x3, 0) +#define JAPLL_TDC_COLDST_BIAS_I_TDCOVCCORR_EN_H_S 2 +#define JAPLL_TDC_COLDST_BIAS_I_TDCOVCCORR_EN_H_M BIT(2) +#define JAPLL_TDC_COLDST_BIAS_I_TDCDC_EN_H_S 3 +#define JAPLL_TDC_COLDST_BIAS_I_TDCDC_EN_H_M BIT(3) +#define JAPLL_TDC_COLDST_BIAS_I_TDC_OFFSET_LOCK_1_0_S 4 +#define 
JAPLL_TDC_COLDST_BIAS_I_TDC_OFFSET_LOCK_1_0_M ICE_M(0x3, 4) +#define JAPLL_TDC_COLDST_BIAS_I_SWCAP_IREFGEN_CLKMODE_1_0_S 6 +#define JAPLL_TDC_COLDST_BIAS_I_SWCAP_IREFGEN_CLKMODE_1_0_M ICE_M(0x3, 6) +#define JAPLL_TDC_COLDST_BIAS_I_BB_GAIN_2_0_S 8 +#define JAPLL_TDC_COLDST_BIAS_I_BB_GAIN_2_0_M ICE_M(0x7, 8) +#define JAPLL_TDC_COLDST_BIAS_I_BBTHRESH_3_0_S 11 +#define JAPLL_TDC_COLDST_BIAS_I_BBTHRESH_3_0_M ICE_M(0xf, 11) +#define JAPLL_TDC_COLDST_BIAS_I_BBINLOCK_H_S 15 +#define JAPLL_TDC_COLDST_BIAS_I_BBINLOCK_H_M BIT(15) +#define JAPLL_TDC_COLDST_BIAS_I_COLDSTART_S 16 +#define JAPLL_TDC_COLDST_BIAS_I_COLDSTART_M BIT(16) +#define JAPLL_TDC_COLDST_BIAS_I_IREFBIAS_STARTUP_PULSE_WIDTH_1_0_S 17 +#define JAPLL_TDC_COLDST_BIAS_I_IREFBIAS_STARTUP_PULSE_WIDTH_1_0_M \ + ICE_M(0x3, 17) +#define JAPLL_TDC_COLDST_BIAS_I_DCO_SETTLING_TIME_CNTR_3_0_S 19 +#define JAPLL_TDC_COLDST_BIAS_I_DCO_SETTLING_TIME_CNTR_3_0_M ICE_M(0xf, 19) +#define JAPLL_TDC_COLDST_BIAS_I_IREFBIAS_STARTUP_PULSE_BYPASS_S 23 +#define JAPLL_TDC_COLDST_BIAS_I_IREFBIAS_STARTUP_PULSE_BYPASS_M BIT(23) +#define JAPLL_TDC_COLDST_BIAS_I_BIAS_CALIB_STEPSIZE_1_0_S 24 +#define JAPLL_TDC_COLDST_BIAS_I_BIAS_CALIB_STEPSIZE_1_0_M ICE_M(0x3, 24) +#define JAPLL_TDC_COLDST_BIAS_RESERVED81_S 26 +#define JAPLL_TDC_COLDST_BIAS_RESERVED81_M BIT(26) +#define JAPLL_TDC_COLDST_BIAS_I_IREFINT_EN_S 27 +#define JAPLL_TDC_COLDST_BIAS_I_IREFINT_EN_M BIT(27) +#define JAPLL_TDC_COLDST_BIAS_I_VGSBUFEN_S 28 +#define JAPLL_TDC_COLDST_BIAS_I_VGSBUFEN_M BIT(28) +#define JAPLL_TDC_COLDST_BIAS_I_DIGDFTSWEP_S 29 +#define JAPLL_TDC_COLDST_BIAS_I_DIGDFTSWEP_M BIT(29) +#define JAPLL_TDC_COLDST_BIAS_I_IREFDIGDFTEN_S 30 +#define JAPLL_TDC_COLDST_BIAS_I_IREFDIGDFTEN_M BIT(30) +#define JAPLL_TDC_COLDST_BIAS_I_IREF_REFCLK_INV_EN_S 31 +#define JAPLL_TDC_COLDST_BIAS_I_IREF_REFCLK_INV_EN_M BIT(31) + +union japll_tdc_coldst_bias { + struct { + u32 i_tdcsel_1_0 : 2; + u32 i_tdcovccorr_en_h : 1; + u32 i_tdcdc_en_h : 1; + u32 i_tdc_offset_lock_1_0 : 2; + u32 i_swcap_irefgen_clkmode_1_0 : 2; + u32 i_bb_gain_2_0 : 3; + u32 i_bbthresh_3_0 : 4; + u32 i_bbinlock_h : 1; + u32 i_coldstart : 1; + u32 i_irefbias_startup_pulse_width_1_0 : 2; + u32 i_dco_settling_time_cntr_3_0 : 4; + u32 i_irefbias_startup_pulse_bypass : 1; + u32 i_bias_calib_stepsize_1_0 : 2; + u32 reserved81 : 1; + u32 i_irefint_en : 1; + u32 i_vgsbufen : 1; + u32 i_digdftswep : 1; + u32 i_irefdigdften : 1; + u32 i_iref_refclk_inv_en : 1; + } field; + u32 val; +}; + +#define JAPLL_DFX_DCO 0x424 +#define JAPLL_DFX_DCO_I_DCOFINEDFTSEL_1_0_S 0 +#define JAPLL_DFX_DCO_I_DCOFINEDFTSEL_1_0_M ICE_M(0x3, 0) +#define JAPLL_DFX_DCO_I_DCOCOARSE_OVRD_H_S 2 +#define JAPLL_DFX_DCO_I_DCOCOARSE_OVRD_H_M BIT(2) +#define JAPLL_DFX_DCO_I_BIAS_FILTER_EN_S 3 +#define JAPLL_DFX_DCO_I_BIAS_FILTER_EN_M BIT(3) +#define JAPLL_DFX_DCO_I_PLLPWRMODE_1_0_S 4 +#define JAPLL_DFX_DCO_I_PLLPWRMODE_1_0_M ICE_M(0x3, 4) +#define JAPLL_DFX_DCO_I_DCOAMP_STATICLEG_CFG_1_0_S 6 +#define JAPLL_DFX_DCO_I_DCOAMP_STATICLEG_CFG_1_0_M ICE_M(0x3, 6) +#define JAPLL_DFX_DCO_I_DCOFINE_7_0_S 8 +#define JAPLL_DFX_DCO_I_DCOFINE_7_0_M ICE_M(0xff, 8) +#define JAPLL_DFX_DCO_I_DCOFINE_9_8_S 16 +#define JAPLL_DFX_DCO_I_DCOFINE_9_8_M ICE_M(0x3, 16) +#define JAPLL_DFX_DCO_I_DCOAMPOVRDEN_H_S 18 +#define JAPLL_DFX_DCO_I_DCOAMPOVRDEN_H_M BIT(18) +#define JAPLL_DFX_DCO_I_DCOAMP_3_0_S 19 +#define JAPLL_DFX_DCO_I_DCOAMP_3_0_M ICE_M(0xf, 19) +#define JAPLL_DFX_DCO_I_BIASFILTER_EN_DELAY_S 23 +#define JAPLL_DFX_DCO_I_BIASFILTER_EN_DELAY_M BIT(23) +#define JAPLL_DFX_DCO_I_DCOCOARSE_7_0_S 24 +#define 
JAPLL_DFX_DCO_I_DCOCOARSE_7_0_M ICE_M(0xff, 24) + +union japll_dfx_dco { + struct { + u32 i_dcofinedftsel_1_0 : 2; + u32 i_dcocoarse_ovrd_h : 1; + u32 i_bias_filter_en : 1; + u32 i_pllpwrmode_1_0 : 2; + u32 i_dcoamp_staticleg_cfg_1_0 : 2; + u32 i_dcofine_7_0 : 8; + u32 i_dcofine_9_8 : 2; + u32 i_dcoampovrden_h : 1; + u32 i_dcoamp_3_0 : 4; + u32 i_biasfilter_en_delay : 1; + u32 i_dcocoarse_7_0 : 8; + } field; + u32 val; +}; + +#define JAPLL_RO_BWM_LF 0x470 +#define JAPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_7_0_S 0 +#define JAPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_7_0_M ICE_M(0xff, 0) +#define JAPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_9_8_S 8 +#define JAPLL_RO_BWM_LF_BW_FREQOV_HIGH_CRI_9_8_M ICE_M(0x3, 8) +#define JAPLL_RO_BWM_LF_BIASCALDONE_CRI_S 10 +#define JAPLL_RO_BWM_LF_BIASCALDONE_CRI_M BIT(10) +#define JAPLL_RO_BWM_LF_PLLLOCK_GAIN_TRAN_CRI_S 11 +#define JAPLL_RO_BWM_LF_PLLLOCK_GAIN_TRAN_CRI_M BIT(11) +#define JAPLL_RO_BWM_LF_PLLLOCK_TRUE_LOCK_CRI_S 12 +#define JAPLL_RO_BWM_LF_PLLLOCK_TRUE_LOCK_CRI_M BIT(12) +#define JAPLL_RO_BWM_LF_PLLUNLOCK_FLAG_CRI_S 13 +#define JAPLL_RO_BWM_LF_PLLUNLOCK_FLAG_CRI_M BIT(13) +#define JAPLL_RO_BWM_LF_AFCERR_CRI_S 14 +#define JAPLL_RO_BWM_LF_AFCERR_CRI_M BIT(14) +#define JAPLL_RO_BWM_LF_AFCDONE_CRI_S 15 +#define JAPLL_RO_BWM_LF_AFCDONE_CRI_M BIT(15) +#define JAPLL_RO_BWM_LF_FEEDFWRDGAIN_CAL_CRI_7_0_S 16 +#define JAPLL_RO_BWM_LF_FEEDFWRDGAIN_CAL_CRI_7_0_M ICE_M(0xff, 16) +#define JAPLL_RO_BWM_LF_M2FBDIVMOD_CRI_7_0_S 24 +#define JAPLL_RO_BWM_LF_M2FBDIVMOD_CRI_7_0_M ICE_M(0xff, 24) + +union japll_ro_bwm_lf { + struct { + u32 bw_freqov_high_cri_7_0 : 8; + u32 bw_freqov_high_cri_9_8 : 2; + u32 biascaldone_cri : 1; + u32 plllock_gain_tran_cri : 1; + u32 plllock_true_lock_cri : 1; + u32 pllunlock_flag_cri : 1; + u32 afcerr_cri : 1; + u32 afcdone_cri : 1; + u32 feedfwrdgain_cal_cri_7_0 : 8; + u32 m2fbdivmod_cri_7_0 : 8; + } field; + u32 val; +}; + +#endif /* _ICE_CGU_REGS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_util.c b/drivers/net/ethernet/intel/ice/ice_cgu_util.c new file mode 100644 index 0000000000000000000000000000000000000000..75561d5c26fd761cc9fc8329055cadb1269a655f --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_util.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice.h" + +/** + * ice_cgu_reg_read - Read a CGU register + * @pf: Board private structure + * @reg: Register to read from + * @val: Pointer to the value to read (out param) + */ +int ice_cgu_reg_read(struct ice_pf *pf, u32 reg, u32 *val) +{ + struct ice_sbq_msg_input cgu_msg; + int status; + + cgu_msg.opcode = ice_sbq_msg_rd; + cgu_msg.dest_dev = cgu; + cgu_msg.msg_addr_low = reg; + cgu_msg.msg_addr_high = 0x0; + + status = ice_sbq_rw_reg_lp(&pf->hw, &cgu_msg, true); + if (status) { + dev_dbg(ice_pf_to_dev(pf), "addr 0x%04x, val 0x%08x\n", reg, cgu_msg.data); + return -EIO; + } + + *val = cgu_msg.data; + + return 0; +} + +/** + * ice_cgu_reg_write - Write a CGU register + * @pf: Board private structure + * @reg: Register to write to + * @val: Value to write + */ +int ice_cgu_reg_write(struct ice_pf *pf, u32 reg, u32 val) +{ + struct ice_sbq_msg_input cgu_msg; + int status; + + cgu_msg.opcode = ice_sbq_msg_wr; + cgu_msg.dest_dev = cgu; + cgu_msg.msg_addr_low = reg; + cgu_msg.msg_addr_high = 0x0; + cgu_msg.data = val; + + dev_dbg(ice_pf_to_dev(pf), "addr 0x%04x, val 0x%08x\n", reg, val); + + status = ice_sbq_rw_reg_lp(&pf->hw, &cgu_msg, true); + if (status) + return -EIO; + + return 0; +} + +/** + * ice_cgu_set_gnd - Ground the refclk + * @pf: Board private structure + * @enable: True to ground the refclk + */ +int ice_cgu_set_gnd(struct ice_pf *pf, bool enable) +{ + int status = 0; + union nac_cgu_dword10 dw10; + int i; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + + if (enable) + dw10.field.synce_sel_gnd = 1; + else + dw10.field.synce_sel_gnd = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + + for (i = 0; i < 3; i++) + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_set_byp - Set the DPLL bypass + * @pf: Board private structure + * @enable: True to enable bypass + */ +int ice_cgu_set_byp(struct ice_pf *pf, bool enable) +{ + union nac_cgu_dword12 dw12; + int status = 0; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD12, &dw12.val); + if (status) + goto err; + + if (enable) + dw12.field.synce_dpll_byp = 1; + else + dw12.field.synce_dpll_byp = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD12, dw12.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_set_holdover_lock_irq - Enable or disable the holdover/lock interrupts + * @pf: Board private structure + * @enable: True to enable the interrupts, false to disable them + */ +int ice_cgu_set_holdover_lock_irq(struct ice_pf *pf, bool enable) +{ + union nac_cgu_dword13 dw13; + int status; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD13, &dw13.val); + if (status) + goto err; + + /* the *_int_enb bits are defined opposite of what one would expect.
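+ * They are really interrupt *disable* bits: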
+ * 0 = enabled, 1 = disabled + */ + if (enable) { + dw13.field.synce_hdov_int_enb = 0; + dw13.field.synce_lock_int_enb = 0; + } else { + dw13.field.synce_hdov_int_enb = 1; + dw13.field.synce_lock_int_enb = 1; + } + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD13, dw13.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_mux_sel_set_reg - Write to selected mux register + * @pf: Board private structure + * @mux_sel: Target mux + * @val: Value to write + */ +int ice_cgu_mux_sel_set_reg(struct ice_pf *pf, enum ice_cgu_mux_sel mux_sel, u32 val) +{ + union nac_cgu_dword10 dw10; + union nac_cgu_dword11 dw11; + int status; + + switch (mux_sel) { + case ICE_CGU_MUX_SEL_REF_CLK: + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + dw10.field.synce_s_ref_clk = val; + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + break; + + case ICE_CGU_MUX_SEL_BYPASS_CLK: + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD11, &dw11.val); + if (status) + goto err; + dw11.field.synce_s_byp_clk = val; + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD11, dw11.val); + if (status) + goto err; + break; + + case ICE_CGU_MUX_SEL_ETHCLKO: + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + dw10.field.synce_ethclko_sel = val; + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + break; + + case ICE_CGU_MUX_SEL_CLKO: + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + dw10.field.synce_clko_sel = val; + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + break; + + default: + dev_err(ice_pf_to_dev(pf), "internal error -- invalid mux!\n"); + return -EIO; + } + +err: + return status; +} + +/** + * ice_cgu_dck_rst_assert_release - Assert or release the dck reset + * @pf: Board private structure + * @assert: True to assert, false to release + */ +int ice_cgu_dck_rst_assert_release(struct ice_pf *pf, bool assert) +{ + union nac_cgu_dword10 dw10; + int status = 0; + int i; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + + if (assert) + dw10.field.synce_dck_rst = 1; + else + dw10.field.synce_dck_rst = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + + for (i = 0; i < 3; i++) + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_dck2_rst_assert_release - Assert or release the dck2 reset + * @pf: Board private structure + * @assert: True to assert, false to release + */ +int ice_cgu_dck2_rst_assert_release(struct ice_pf *pf, bool assert) +{ + union nac_cgu_dword10 dw10; + int status = 0; + int i; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + + if (assert) + dw10.field.synce_dck2_rst = 1; + else + dw10.field.synce_dck2_rst = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD10, dw10.val); + if (status) + goto err; + + for (i = 0; i < 3; i++) + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD10, &dw10.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_mck_rst_assert_release - Assert or release the mck reset + * @pf: Board private structure + * @assert: True to assert, false to release + */ +int ice_cgu_mck_rst_assert_release(struct ice_pf *pf, bool assert) +{ + union nac_cgu_dword12 dw12; + int status = 0; + int i; + + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD12, &dw12.val); + if (status) + goto err;
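+ + /* flip the MCK reset bit as requested and write it back */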
+ if (assert) + dw12.field.synce_mck_rst = 1; + else + dw12.field.synce_mck_rst = 0; + + status = ice_cgu_reg_write(pf, NAC_CGU_DWORD12, dw12.val); + if (status) + goto err; + + for (i = 0; i < 3; i++) + status = ice_cgu_reg_read(pf, NAC_CGU_DWORD12, &dw12.val); + if (status) + goto err; + +err: + return status; +} + +/** + * ice_cgu_usleep - Sleep for a specified period of time + * @usec: Time to sleep in microseconds + */ +void ice_cgu_usleep(u64 usec) +{ + if (usec <= 10) { + udelay(usec); + } else if (usec <= 20000) { + usleep_range(usec, usec + 10); + } else { + int msec; + + msec = (usec + 999) / 1000; + msleep_interruptible(msec); + } +} + +/** + * ice_cgu_poll - Poll the specified CGU register for the specified value + * @pf: Board private structure + * @offset: Offset of the register + * @mask: Bitmask for testing the value + * @value: Value to poll for + * @delay_time: Delay between the register reads + * @delay_loops: Number of read loops + */ +int ice_cgu_poll(struct ice_pf *pf, u64 offset, u32 mask, u32 value, u32 delay_time, + u32 delay_loops) +{ + int status; + u32 reg, i; + + for (i = 0; i < delay_loops; i++) { + status = ice_cgu_reg_read(pf, offset, &reg); + if (status) + goto err; + + if ((reg & mask) == value) + return 0; + + /* delay for a bit */ + ice_cgu_usleep(delay_time); + } + + return -EBUSY; + +err: + return status; +} + +/** + * ice_cgu_npoll - Poll the specified CGU register for the specified value occurring n times + * @pf: Board private structure + * @offset: Offset of the register + * @mask: Bitmask for testing the value + * @value: Value to poll for + * @delay_time: Delay between the register reads + * @delay_loops: Number of read loops + * @poll_count: Number of the value matches to poll for + * @count_delay_time: Additional delay after the value match + */ +int ice_cgu_npoll(struct ice_pf *pf, u32 offset, u32 mask, u32 value, u32 delay_time, + u32 delay_loops, u32 poll_count, u32 count_delay_time) +{ + u32 reg, i, my_count = 0, complete = 0; + int status; + + for (i = 0; i < delay_loops; i++) { + status = ice_cgu_reg_read(pf, offset, &reg); + if (status) + goto err; + + dev_dbg(ice_pf_to_dev(pf), "count=%u, reg=%08x\n", my_count, reg); + + if ((reg & mask) == value) { + my_count++; + if (my_count < poll_count) { + ice_cgu_usleep(count_delay_time); + } else { + complete = 1; + break; + } + } else { + my_count = 0; + ice_cgu_usleep(delay_time); + } + } + + if (complete) + return 0; + else + return -EBUSY; + +err: + return status; +} + +struct ice_cgu_dpll_params dpll_params_table[ICE_NUM_DPLL_PARAMS] = { + /* {dpll select, sample rate, mul_rat_m1, scale, gain} */ + { ICE_CGU_DPLL_SELECT_TRANSPORT, ICE_CGU_SAMPLE_RATE_8K, 3124, 16, 42 }, + { ICE_CGU_DPLL_SELECT_EEC_RELAXED_BW, ICE_CGU_SAMPLE_RATE_8K, 3124, 7, 3 }, + { ICE_CGU_DPLL_SELECT_TRANSPORT, ICE_CGU_SAMPLE_RATE_10K, 2499, 20, 66 }, + { ICE_CGU_DPLL_SELECT_EEC_RELAXED_BW, ICE_CGU_SAMPLE_RATE_10K, 2499, 8, 4 }, + { ICE_CGU_DPLL_SELECT_TRANSPORT, ICE_CGU_SAMPLE_RATE_12K5, 1999, 25, 103 }, + { ICE_CGU_DPLL_SELECT_EEC_RELAXED_BW, ICE_CGU_SAMPLE_RATE_12K5, 1999, 10, 6 } +}; + +struct ice_cgu_dpll_per_rate_params dpll_per_rate_params[NUM_ICE_TIME_REF_FREQ] = { + /* {rate_hz, sample_rate, div_rat_m1, synce_rat_sel} */ + { 25000000, ICE_CGU_SAMPLE_RATE_10K, 2499, 0 }, /* 25 MHz */ + { 122880000, ICE_CGU_SAMPLE_RATE_8K, 3071, 1 }, /* 122.88 MHz */ + { 125000000, ICE_CGU_SAMPLE_RATE_10K, 2499, 1 }, /* 125 MHz */ + { 153600000, ICE_CGU_SAMPLE_RATE_10K, 3071, 1 }, /* 153.6 MHz */ + { 156250000,
ICE_CGU_SAMPLE_RATE_10K, 3124, 1 }, /* 156.25 MHz */ +}; + +struct ice_cgu_lcpll_per_rate_params tspll_per_rate_params[NUM_ICE_TIME_REF_FREQ] = { + /* {refclk_pre_div, feedback_div, frac_n_div, post_pll_div} */ + { 1, 197, 2621440, 6 }, /* 25 MHz */ + { 5, 223, 524288, 7 }, /* 122.88 MHz */ + { 5, 223, 524288, 7 }, /* 125 MHz */ + { 5, 159, 1572864, 6 }, /* 153.6 MHz */ + { 5, 159, 1572864, 6 }, /* 156.25 MHz */ + { 10, 223, 524288, 7 }, /* 245.76 MHz */ +}; + +struct ice_cgu_lcpll_per_rate_params japll_per_rate_params[NUM_ICE_CGU_JAPLL_REF_FREQ] = { + /* {refclk_pre_div, feedback_div, frac_n_div, post_pll_div} */ + { 1, 150, 0, 6 }, /* 25 MHz */ + { 1, 120, 0, 6 }, /* 156.25 MHz */ +}; diff --git a/drivers/net/ethernet/intel/ice/ice_cgu_util.h b/drivers/net/ethernet/intel/ice/ice_cgu_util.h new file mode 100644 index 0000000000000000000000000000000000000000..6a1566324cb1d013823c3d307a7e071ce7ebb96e --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_cgu_util.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_CGU_UTIL_H_ +#define _ICE_CGU_UTIL_H_ + +/* offset of last valid CGU register */ +#define ICE_CGU_MAX_REG_OFFS 0x47c + +int ice_cgu_reg_read(struct ice_pf *pf, u32 reg, u32 *val); + +int ice_cgu_reg_write(struct ice_pf *pf, u32 reg, u32 val); + +int ice_cgu_set_gnd(struct ice_pf *pf, bool enable); + +int ice_cgu_set_byp(struct ice_pf *pf, bool enable); + +int ice_cgu_set_holdover_lock_irq(struct ice_pf *pf, bool enable); + +int ice_cgu_mux_sel_set_reg(struct ice_pf *pf, enum ice_cgu_mux_sel mux_sel, u32 val); + +int ice_cgu_dck_rst_assert_release(struct ice_pf *pf, bool assert); + +int ice_cgu_dck2_rst_assert_release(struct ice_pf *pf, bool assert); + +int ice_cgu_mck_rst_assert_release(struct ice_pf *pf, bool assert); + +void ice_cgu_usleep(u64 usec); + +int ice_cgu_poll(struct ice_pf *pf, u64 offset, u32 mask, u32 value, u32 delay_time, + u32 delay_loops); + +int ice_cgu_npoll(struct ice_pf *pf, u32 offset, u32 mask, u32 value, u32 delay_time, + u32 delay_loops, u32 poll_count, u32 count_delay_time); + +#define ICE_NUM_DPLL_PARAMS (NUM_ICE_CGU_SAMPLE_RATE * NUM_ICE_CGU_DPLL_SELECT) + +extern struct ice_cgu_dpll_params dpll_params_table[ICE_NUM_DPLL_PARAMS]; + +extern struct ice_cgu_dpll_per_rate_params dpll_per_rate_params[NUM_ICE_TIME_REF_FREQ]; + +extern struct ice_cgu_lcpll_per_rate_params tspll_per_rate_params[NUM_ICE_TIME_REF_FREQ]; + +extern struct ice_cgu_lcpll_per_rate_params japll_per_rate_params[NUM_ICE_CGU_JAPLL_REF_FREQ]; + +#endif /* _ICE_CGU_UTIL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index d68b8aa31b193fa82969a7c5d92a1442f4815b3b..32335405e484d85f874ae07cdf49b7fc9152e622 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1,30 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #include "ice_common.h" #include "ice_sched.h" #include "ice_adminq_cmd.h" -#define ICE_PF_RESET_WAIT_COUNT 200 +#include "ice_flow.h" +#include "ice_switch.h" -#define ICE_PROG_FLEX_ENTRY(hw, rxdid, mdid, idx) \ - wr32((hw), GLFLXP_RXDID_FLX_WRD_##idx(rxdid), \ - ((ICE_RX_OPC_MDID << \ - GLFLXP_RXDID_FLX_WRD_##idx##_RXDID_OPCODE_S) & \ - GLFLXP_RXDID_FLX_WRD_##idx##_RXDID_OPCODE_M) | \ - (((mdid) << GLFLXP_RXDID_FLX_WRD_##idx##_PROT_MDID_S) & \ - GLFLXP_RXDID_FLX_WRD_##idx##_PROT_MDID_M)) +#define ICE_PF_RESET_WAIT_COUNT 300 +#define ICE_SCHED_VALID_SEC_BITS 4 -#define ICE_PROG_FLG_ENTRY(hw, rxdid, flg_0, flg_1, flg_2, flg_3, idx) \ - wr32((hw), GLFLXP_RXDID_FLAGS(rxdid, idx), \ - (((flg_0) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M) | \ - (((flg_1) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_M) | \ - (((flg_2) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_M) | \ - (((flg_3) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_M)) /** * ice_set_mac_type - Sets MAC type @@ -38,24 +24,68 @@ static enum ice_status ice_set_mac_type(struct ice_hw *hw) if (hw->vendor_id != PCI_VENDOR_ID_INTEL) return ICE_ERR_DEVICE_NOT_SUPPORTED; - hw->mac_type = ICE_MAC_GENERIC; + switch (hw->device_id) { + case ICE_DEV_ID_E810C_BACKPLANE: + case ICE_DEV_ID_E810C_QSFP: + case ICE_DEV_ID_E810C_SFP: + case ICE_DEV_ID_E810_XXV_BACKPLANE: + case ICE_DEV_ID_E810_XXV_QSFP: + case ICE_DEV_ID_E810_XXV_SFP: + hw->mac_type = ICE_MAC_E810; + break; + case ICE_DEV_ID_E822C_10G_BASE_T: + case ICE_DEV_ID_E822C_BACKPLANE: + case ICE_DEV_ID_E822C_QSFP: + case ICE_DEV_ID_E822C_SFP: + case ICE_DEV_ID_E822C_SGMII: + case ICE_DEV_ID_E822L_10G_BASE_T: + case ICE_DEV_ID_E822L_BACKPLANE: + case ICE_DEV_ID_E822L_SFP: + case ICE_DEV_ID_E822L_SGMII: + case ICE_DEV_ID_E823L_10G_BASE_T: + case ICE_DEV_ID_E823L_1GBE: + case ICE_DEV_ID_E823L_BACKPLANE: + case ICE_DEV_ID_E823L_QSFP: + case ICE_DEV_ID_E823L_SFP: + case ICE_DEV_ID_E823C_10G_BASE_T: + case ICE_DEV_ID_E823C_BACKPLANE: + case ICE_DEV_ID_E823C_QSFP: + case ICE_DEV_ID_E823C_SFP: + case ICE_DEV_ID_E823C_SGMII: + hw->mac_type = ICE_MAC_GENERIC; + break; + default: + hw->mac_type = ICE_MAC_UNKNOWN; + break; + } + + ice_debug(hw, ICE_DBG_INIT, "mac_type: %d\n", hw->mac_type); return 0; } /** - * ice_dev_onetime_setup - Temporary HW/FW workarounds - * @hw: pointer to the HW structure + * ice_is_generic_mac + * @hw: pointer to the hardware structure * - * This function provides temporary workarounds for certain issues - * that are expected to be fixed in the HW/FW. + * returns true if mac_type is ICE_MAC_GENERIC, false if not */ -void ice_dev_onetime_setup(struct ice_hw *hw) +bool ice_is_generic_mac(struct ice_hw *hw) { -#define MBX_PF_VT_PFALLOC 0x00231E80 - /* set VFs per PF */ - wr32(hw, MBX_PF_VT_PFALLOC, rd32(hw, PF_VT_PFALLOC_HIF)); + return hw->mac_type == ICE_MAC_GENERIC; } +/** + * ice_is_e810 + * @hw: pointer to the hardware structure + * + * returns true if the device is E810 based, false if not. + */ +bool ice_is_e810(struct ice_hw *hw) +{ + return hw->mac_type == ICE_MAC_E810; +} + + /** * ice_clear_pf_cfg - Clear PF configuration * @hw: pointer to the hardware structure @@ -84,7 +114,8 @@ enum ice_status ice_clear_pf_cfg(struct ice_hw *hw) * is returned in user specified buffer. Please interpret user specified * buffer as "manage_mac_read" response. 
* Response such as various MAC addresses are stored in HW struct (port.mac) - * ice_aq_discover_caps is expected to be called before this function is called. + * ice_discover_dev_caps is expected to be called before this function is + * called. */ static enum ice_status ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size, @@ -108,7 +139,7 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size, if (status) return status; - resp = (struct ice_aqc_manage_mac_read_resp *)buf; + resp = buf; flags = le16_to_cpu(cmd->flags) & ICE_AQC_MAN_MAC_READ_M; if (!(flags & ICE_AQC_MAN_MAC_LAN_ADDR_VALID)) { @@ -125,7 +156,6 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size, resp[i].mac_addr); break; } - return 0; } @@ -148,11 +178,17 @@ ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode, u16 pcaps_size = sizeof(*pcaps); struct ice_aq_desc desc; enum ice_status status; + struct ice_hw *hw; cmd = &desc.params.get_phy; if (!pcaps || (report_mode & ~ICE_AQC_REPORT_MODE_M) || !pi) return ICE_ERR_PARAM; + hw = pi->hw; + + if (report_mode == ICE_AQC_REPORT_DFLT_CFG && + !ice_fw_supports_report_dflt_cfg(hw)) + return ICE_ERR_PARAM; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_phy_caps); @@ -160,16 +196,95 @@ ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode, cmd->param0 |= cpu_to_le16(ICE_AQC_GET_PHY_RQM); cmd->param0 |= cpu_to_le16(report_mode); - status = ice_aq_send_cmd(pi->hw, &desc, pcaps, pcaps_size, cd); - - if (!status && report_mode == ICE_AQC_REPORT_TOPO_CAP) { + status = ice_aq_send_cmd(hw, &desc, pcaps, pcaps_size, cd); + + ice_debug(hw, ICE_DBG_LINK, "get phy caps - report_mode = 0x%x\n", + report_mode); + ice_debug(hw, ICE_DBG_LINK, " phy_type_low = 0x%llx\n", + (unsigned long long)le64_to_cpu(pcaps->phy_type_low)); + ice_debug(hw, ICE_DBG_LINK, " phy_type_high = 0x%llx\n", + (unsigned long long)le64_to_cpu(pcaps->phy_type_high)); + ice_debug(hw, ICE_DBG_LINK, " caps = 0x%x\n", pcaps->caps); + ice_debug(hw, ICE_DBG_LINK, " low_power_ctrl_an = 0x%x\n", + pcaps->low_power_ctrl_an); + ice_debug(hw, ICE_DBG_LINK, " eee_cap = 0x%x\n", pcaps->eee_cap); + ice_debug(hw, ICE_DBG_LINK, " eeer_value = 0x%x\n", + pcaps->eeer_value); + ice_debug(hw, ICE_DBG_LINK, " link_fec_options = 0x%x\n", + pcaps->link_fec_options); + ice_debug(hw, ICE_DBG_LINK, " module_compliance_enforcement = 0x%x\n", + pcaps->module_compliance_enforcement); + ice_debug(hw, ICE_DBG_LINK, " extended_compliance_code = 0x%x\n", + pcaps->extended_compliance_code); + ice_debug(hw, ICE_DBG_LINK, " module_type[0] = 0x%x\n", + pcaps->module_type[0]); + ice_debug(hw, ICE_DBG_LINK, " module_type[1] = 0x%x\n", + pcaps->module_type[1]); + ice_debug(hw, ICE_DBG_LINK, " module_type[2] = 0x%x\n", + pcaps->module_type[2]); + + if (!status && report_mode == ICE_AQC_REPORT_TOPO_CAP_MEDIA) { pi->phy.phy_type_low = le64_to_cpu(pcaps->phy_type_low); pi->phy.phy_type_high = le64_to_cpu(pcaps->phy_type_high); + memcpy(pi->phy.link_info.module_type, &pcaps->module_type, + sizeof(pi->phy.link_info.module_type)); } return status; } +/** + * ice_aq_get_link_topo_handle - get link topology node return status + * @pi: port information structure + * @node_type: requested node type + * @cd: pointer to command details structure or NULL + * + * Get link topology node return status for specified node type (0x06E0) + * + * Node type cage can be used to determine if cage is present. If AQC + * returns error (ENOENT), then no cage present. 
If no cage present, then + * connection type is backplane or BASE-T. + */ +static enum ice_status +ice_aq_get_link_topo_handle(struct ice_port_info *pi, u8 node_type, + struct ice_sq_cd *cd) +{ + struct ice_aqc_get_link_topo *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.get_link_topo; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo); + + cmd->addr.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_CTX_PORT << + ICE_AQC_LINK_TOPO_NODE_CTX_S); + + /* set node type */ + cmd->addr.topo_params.node_type_ctx |= + (ICE_AQC_LINK_TOPO_NODE_TYPE_M & node_type); + + return ice_aq_send_cmd(pi->hw, &desc, NULL, 0, cd); +} + +/** + * ice_is_media_cage_present + * @pi: port information structure + * + * Returns true if media cage is present, else false. If no cage, then + * media type is backplane or BASE-T. + */ +static bool ice_is_media_cage_present(struct ice_port_info *pi) +{ + /* Node type cage can be used to determine if cage is present. If AQC + * returns error (ENOENT), then no cage present. If no cage present then + * connection type is backplane or BASE-T. + */ + return !ice_aq_get_link_topo_handle(pi, + ICE_AQC_LINK_TOPO_NODE_TYPE_CAGE, + NULL); +} + /** * ice_get_media_type - Gets media type * @pi: port information structure @@ -187,6 +302,18 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) return ICE_MEDIA_UNKNOWN; if (hw_link_info->phy_type_low) { + /* 1G SGMII is a special case where some DA cable PHYs + * may show this as an option when it really shouldn't + * be since SGMII is meant to be between a MAC and a PHY + * in a backplane. Try to detect this case and handle it + */ + if (hw_link_info->phy_type_low == ICE_PHY_TYPE_LOW_1G_SGMII && + (hw_link_info->module_type[ICE_AQC_MOD_TYPE_IDENT] == + ICE_AQC_MOD_TYPE_BYTE1_SFP_PLUS_CU_ACTIVE || + hw_link_info->module_type[ICE_AQC_MOD_TYPE_IDENT] == + ICE_AQC_MOD_TYPE_BYTE1_SFP_PLUS_CU_PASSIVE)) + return ICE_MEDIA_DA; + switch (hw_link_info->phy_type_low) { case ICE_PHY_TYPE_LOW_1000BASE_SX: case ICE_PHY_TYPE_LOW_1000BASE_LX: @@ -195,7 +322,6 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) case ICE_PHY_TYPE_LOW_10G_SFI_C2C: case ICE_PHY_TYPE_LOW_25GBASE_SR: case ICE_PHY_TYPE_LOW_25GBASE_LR: - case ICE_PHY_TYPE_LOW_25G_AUI_C2C: case ICE_PHY_TYPE_LOW_40GBASE_SR4: case ICE_PHY_TYPE_LOW_40GBASE_LR4: case ICE_PHY_TYPE_LOW_50GBASE_SR2: @@ -208,6 +334,15 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) case ICE_PHY_TYPE_LOW_100GBASE_SR2: case ICE_PHY_TYPE_LOW_100GBASE_DR: return ICE_MEDIA_FIBER; + case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: + case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: + case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: + case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: + case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: + return ICE_MEDIA_FIBER; case ICE_PHY_TYPE_LOW_100BASE_TX: case ICE_PHY_TYPE_LOW_1000BASE_T: case ICE_PHY_TYPE_LOW_2500BASE_T: @@ -226,6 +361,16 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: case ICE_PHY_TYPE_LOW_100GBASE_CP2: return ICE_MEDIA_DA; + case ICE_PHY_TYPE_LOW_25G_AUI_C2C: + case ICE_PHY_TYPE_LOW_40G_XLAUI: + case ICE_PHY_TYPE_LOW_50G_LAUI2: + case ICE_PHY_TYPE_LOW_50G_AUI2: + case ICE_PHY_TYPE_LOW_50G_AUI1: + case ICE_PHY_TYPE_LOW_100G_AUI4: + case ICE_PHY_TYPE_LOW_100G_CAUI4: + if (ice_is_media_cage_present(pi)) + return ICE_MEDIA_AUI; + 
/* fall-through */ case ICE_PHY_TYPE_LOW_1000BASE_KX: case ICE_PHY_TYPE_LOW_2500BASE_KX: case ICE_PHY_TYPE_LOW_2500BASE_X: @@ -243,13 +388,22 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi) } } else { switch (hw_link_info->phy_type_high) { + case ICE_PHY_TYPE_HIGH_100G_AUI2: + case ICE_PHY_TYPE_HIGH_100G_CAUI2: + if (ice_is_media_cage_present(pi)) + return ICE_MEDIA_AUI; + /* fall-through */ case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4: return ICE_MEDIA_BACKPLANE; + case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC: + case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC: + return ICE_MEDIA_FIBER; } } return ICE_MEDIA_UNKNOWN; } + /** * ice_aq_get_link_info * @pi: port information structure @@ -277,6 +431,8 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, if (!pi) return ICE_ERR_PARAM; hw = pi->hw; + + li_old = &pi->phy.link_info_old; hw_media_type = &pi->phy.media_type; li = &pi->phy.link_info; @@ -302,6 +458,7 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, li->phy_type_high = le64_to_cpu(link_data.phy_type_high); *hw_media_type = ice_get_media_type(pi); li->link_info = link_data.link_info; + li->link_cfg_err = link_data.link_cfg_err; li->an_info = link_data.an_info; li->ext_info = link_data.ext_info; li->max_frame_size = le16_to_cpu(link_data.max_frame_size); @@ -324,18 +481,22 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, li->lse_ena = !!(resp->cmd_flags & cpu_to_le16(ICE_AQ_LSE_IS_ENABLED)); - ice_debug(hw, ICE_DBG_LINK, "link_speed = 0x%x\n", li->link_speed); - ice_debug(hw, ICE_DBG_LINK, "phy_type_low = 0x%llx\n", + ice_debug(hw, ICE_DBG_LINK, "get link info\n"); + ice_debug(hw, ICE_DBG_LINK, " link_speed = 0x%x\n", li->link_speed); + ice_debug(hw, ICE_DBG_LINK, " phy_type_low = 0x%llx\n", (unsigned long long)li->phy_type_low); - ice_debug(hw, ICE_DBG_LINK, "phy_type_high = 0x%llx\n", + ice_debug(hw, ICE_DBG_LINK, " phy_type_high = 0x%llx\n", (unsigned long long)li->phy_type_high); - ice_debug(hw, ICE_DBG_LINK, "media_type = 0x%x\n", *hw_media_type); - ice_debug(hw, ICE_DBG_LINK, "link_info = 0x%x\n", li->link_info); - ice_debug(hw, ICE_DBG_LINK, "an_info = 0x%x\n", li->an_info); - ice_debug(hw, ICE_DBG_LINK, "ext_info = 0x%x\n", li->ext_info); - ice_debug(hw, ICE_DBG_LINK, "lse_ena = 0x%x\n", li->lse_ena); - ice_debug(hw, ICE_DBG_LINK, "max_frame = 0x%x\n", li->max_frame_size); - ice_debug(hw, ICE_DBG_LINK, "pacing = 0x%x\n", li->pacing); + ice_debug(hw, ICE_DBG_LINK, " media_type = 0x%x\n", *hw_media_type); + ice_debug(hw, ICE_DBG_LINK, " link_info = 0x%x\n", li->link_info); + ice_debug(hw, ICE_DBG_LINK, " link_cfg_err = 0x%x\n", li->link_cfg_err); + ice_debug(hw, ICE_DBG_LINK, " an_info = 0x%x\n", li->an_info); + ice_debug(hw, ICE_DBG_LINK, " ext_info = 0x%x\n", li->ext_info); + ice_debug(hw, ICE_DBG_LINK, " fec_info = 0x%x\n", li->fec_info); + ice_debug(hw, ICE_DBG_LINK, " lse_ena = 0x%x\n", li->lse_ena); + ice_debug(hw, ICE_DBG_LINK, " max_frame = 0x%x\n", + li->max_frame_size); + ice_debug(hw, ICE_DBG_LINK, " pacing = 0x%x\n", li->pacing); /* save link status information */ if (link) @@ -348,85 +509,68 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, } /** - * ice_init_flex_flags - * @hw: pointer to the hardware structure - * @prof_id: Rx Descriptor Builder profile ID + * ice_fill_tx_timer_and_fc_thresh + * @hw: pointer to the HW struct + * @cmd: pointer to MAC cfg structure * - * Function to initialize Rx flex flags + * Add Tx timer and FC refresh threshold info to Set MAC Config AQ command + * descriptor */ 
-static void ice_init_flex_flags(struct ice_hw *hw, enum ice_rxdid prof_id) +static void +ice_fill_tx_timer_and_fc_thresh(struct ice_hw *hw, + struct ice_aqc_set_mac_cfg *cmd) { - u8 idx = 0; + u16 fc_thres_val, tx_timer_val; + u32 val; - /* Flex-flag fields (0-2) are programmed with FLG64 bits with layout: - * flexiflags0[5:0] - TCP flags, is_packet_fragmented, is_packet_UDP_GRE - * flexiflags1[3:0] - Not used for flag programming - * flexiflags2[7:0] - Tunnel and VLAN types - * 2 invalid fields in last index - */ - switch (prof_id) { - /* Rx flex flags are currently programmed for the NIC profiles only. - * Different flag bit programming configurations can be added per - * profile as needed. + /* We read back the transmit timer and fc threshold value of + * LFC. Thus, we will use index = + * PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX. + * + * Also, because we are opearating on transmit timer and fc + * threshold of LFC, we don't turn on any bit in tx_tmr_priority */ - case ICE_RXDID_FLEX_NIC: - case ICE_RXDID_FLEX_NIC_2: - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_PKT_FRG, - ICE_FLG_UDP_GRE, ICE_FLG_PKT_DSI, - ICE_FLG_FIN, idx++); - /* flex flag 1 is not used for flexi-flag programming, skipping - * these four FLG64 bits. - */ - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_SYN, ICE_FLG_RST, - ICE_FLG_PKT_DSI, ICE_FLG_PKT_DSI, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_PKT_DSI, - ICE_FLG_PKT_DSI, ICE_FLG_EVLAN_x8100, - ICE_FLG_EVLAN_x9100, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_VLAN_x8100, - ICE_FLG_TNL_VLAN, ICE_FLG_TNL_MAC, - ICE_FLG_TNL0, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_TNL1, ICE_FLG_TNL2, - ICE_FLG_PKT_DSI, ICE_FLG_PKT_DSI, idx); - break; +#define IDX_OF_LFC PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX - default: - ice_debug(hw, ICE_DBG_INIT, - "Flag programming for profile ID %d not supported\n", - prof_id); - } + /* Retrieve the transmit timer */ + val = rd32(hw, PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA(IDX_OF_LFC)); + tx_timer_val = val & + PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_M; + cmd->tx_tmr_value = cpu_to_le16(tx_timer_val); + + /* Retrieve the fc threshold */ + val = rd32(hw, PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER(IDX_OF_LFC)); + fc_thres_val = val & PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_M; + + cmd->fc_refresh_threshold = cpu_to_le16(fc_thres_val); } /** - * ice_init_flex_flds - * @hw: pointer to the hardware structure - * @prof_id: Rx Descriptor Builder profile ID + * ice_aq_set_mac_cfg + * @hw: pointer to the HW struct + * @max_frame_size: Maximum Frame Size to be supported + * @cd: pointer to command details structure or NULL * - * Function to initialize flex descriptors + * Set MAC configuration (0x0603) */ -static void ice_init_flex_flds(struct ice_hw *hw, enum ice_rxdid prof_id) +enum ice_status +ice_aq_set_mac_cfg(struct ice_hw *hw, u16 max_frame_size, struct ice_sq_cd *cd) { - enum ice_flex_rx_mdid mdid; + struct ice_aqc_set_mac_cfg *cmd; + struct ice_aq_desc desc; - switch (prof_id) { - case ICE_RXDID_FLEX_NIC: - case ICE_RXDID_FLEX_NIC_2: - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_HASH_LOW, 0); - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_HASH_HIGH, 1); - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_FLOW_ID_LOWER, 2); + cmd = &desc.params.set_mac_cfg; - mdid = (prof_id == ICE_RXDID_FLEX_NIC_2) ? 
- ICE_RX_MDID_SRC_VSI : ICE_RX_MDID_FLOW_ID_HIGH; + if (max_frame_size == 0) + return ICE_ERR_PARAM; - ICE_PROG_FLEX_ENTRY(hw, prof_id, mdid, 3); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_mac_cfg); - ice_init_flex_flags(hw, prof_id); - break; + cmd->max_frame_size = cpu_to_le16(max_frame_size); - default: - ice_debug(hw, ICE_DBG_INIT, - "Field init for profile ID %d not supported\n", - prof_id); - } + ice_fill_tx_timer_and_fc_thresh(hw, cmd); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } /** @@ -440,14 +584,16 @@ static enum ice_status ice_init_fltr_mgmt_struct(struct ice_hw *hw) hw->switch_info = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*hw->switch_info), GFP_KERNEL); + sw = hw->switch_info; if (!sw) return ICE_ERR_NO_MEMORY; INIT_LIST_HEAD(&sw->vsi_list_map_head); + sw->prof_res_bm_init = 0; - status = ice_init_def_sw_recp(hw); + status = ice_init_def_sw_recp(hw, &hw->switch_info->recp_list); if (status) { devm_kfree(ice_hw_to_dev(hw), hw->switch_info); return status; @@ -456,262 +602,83 @@ static enum ice_status ice_init_fltr_mgmt_struct(struct ice_hw *hw) } /** - * ice_cleanup_fltr_mgmt_struct - cleanup filter management list and locks + * ice_cleanup_fltr_mgmt_single - clears single filter mngt struct * @hw: pointer to the HW struct + * @sw: pointer to switch info struct for which function clears filters */ -static void ice_cleanup_fltr_mgmt_struct(struct ice_hw *hw) +static void +ice_cleanup_fltr_mgmt_single(struct ice_hw *hw, struct ice_switch_info *sw) { - struct ice_switch_info *sw = hw->switch_info; struct ice_vsi_list_map_info *v_pos_map; struct ice_vsi_list_map_info *v_tmp_map; struct ice_sw_recipe *recps; u8 i; + if (!sw) + return; + list_for_each_entry_safe(v_pos_map, v_tmp_map, &sw->vsi_list_map_head, list_entry) { list_del(&v_pos_map->list_entry); devm_kfree(ice_hw_to_dev(hw), v_pos_map); } - recps = hw->switch_info->recp_list; - for (i = 0; i < ICE_SW_LKUP_LAST; i++) { - struct ice_fltr_mgmt_list_entry *lst_itr, *tmp_entry; + recps = sw->recp_list; + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { + struct ice_recp_grp_entry *rg_entry, *tmprg_entry; recps[i].root_rid = i; - mutex_destroy(&recps[i].filt_rule_lock); - list_for_each_entry_safe(lst_itr, tmp_entry, - &recps[i].filt_rules, list_entry) { - list_del(&lst_itr->list_entry); - devm_kfree(ice_hw_to_dev(hw), lst_itr); - } - } - ice_rm_all_sw_replay_rule_info(hw); - devm_kfree(ice_hw_to_dev(hw), sw->recp_list); - devm_kfree(ice_hw_to_dev(hw), sw); -} - -#define ICE_FW_LOG_DESC_SIZE(n) (sizeof(struct ice_aqc_fw_logging_data) + \ - (((n) - 1) * sizeof(((struct ice_aqc_fw_logging_data *)0)->entry))) -#define ICE_FW_LOG_DESC_SIZE_MAX \ - ICE_FW_LOG_DESC_SIZE(ICE_AQC_FW_LOG_ID_MAX) - -/** - * ice_get_fw_log_cfg - get FW logging configuration - * @hw: pointer to the HW struct - */ -static enum ice_status ice_get_fw_log_cfg(struct ice_hw *hw) -{ - struct ice_aqc_fw_logging_data *config; - struct ice_aq_desc desc; - enum ice_status status; - u16 size; - - size = ICE_FW_LOG_DESC_SIZE_MAX; - config = devm_kzalloc(ice_hw_to_dev(hw), size, GFP_KERNEL); - if (!config) - return ICE_ERR_NO_MEMORY; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logging_info); - - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_BUF); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - - status = ice_aq_send_cmd(hw, &desc, config, size, NULL); - if (!status) { - u16 i; - - /* Save FW logging information into the HW structure */ - for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) { - u16 v, m, flgs; - - v = le16_to_cpu(config->entry[i]); - m = (v & 
ICE_AQC_FW_LOG_ID_M) >> ICE_AQC_FW_LOG_ID_S; - flgs = (v & ICE_AQC_FW_LOG_EN_M) >> ICE_AQC_FW_LOG_EN_S; - - if (m < ICE_AQC_FW_LOG_ID_MAX) - hw->fw_log.evnts[m].cur = flgs; + list_for_each_entry_safe(rg_entry, tmprg_entry, + &recps[i].rg_list, l_entry) { + list_del(&rg_entry->l_entry); + devm_kfree(ice_hw_to_dev(hw), rg_entry); } - } - - devm_kfree(ice_hw_to_dev(hw), config); - - return status; -} - -/** - * ice_cfg_fw_log - configure FW logging - * @hw: pointer to the HW struct - * @enable: enable certain FW logging events if true, disable all if false - * - * This function enables/disables the FW logging via Rx CQ events and a UART - * port based on predetermined configurations. FW logging via the Rx CQ can be - * enabled/disabled for individual PF's. However, FW logging via the UART can - * only be enabled/disabled for all PFs on the same device. - * - * To enable overall FW logging, the "cq_en" and "uart_en" enable bits in - * hw->fw_log need to be set accordingly, e.g. based on user-provided input, - * before initializing the device. - * - * When re/configuring FW logging, callers need to update the "cfg" elements of - * the hw->fw_log.evnts array with the desired logging event configurations for - * modules of interest. When disabling FW logging completely, the callers can - * just pass false in the "enable" parameter. On completion, the function will - * update the "cur" element of the hw->fw_log.evnts array with the resulting - * logging event configurations of the modules that are being re/configured. FW - * logging modules that are not part of a reconfiguration operation retain their - * previous states. - * - * Before resetting the device, it is recommended that the driver disables FW - * logging before shutting down the control queue. When disabling FW logging - * ("enable" = false), the latest configurations of FW logging events stored in - * hw->fw_log.evnts[] are not overridden to allow them to be reconfigured after - * a device reset. - * - * When enabling FW logging to emit log messages via the Rx CQ during the - * device's initialization phase, a mechanism alternative to interrupt handlers - * needs to be used to extract FW log messages from the Rx CQ periodically and - * to prevent the Rx CQ from being full and stalling other types of control - * messages from FW to SW. Interrupts are typically disabled during the device's - * initialization phase. - */ -static enum ice_status ice_cfg_fw_log(struct ice_hw *hw, bool enable) -{ - struct ice_aqc_fw_logging_data *data = NULL; - struct ice_aqc_fw_logging *cmd; - enum ice_status status = 0; - u16 i, chgs = 0, len = 0; - struct ice_aq_desc desc; - u8 actv_evnts = 0; - void *buf = NULL; - - if (!hw->fw_log.cq_en && !hw->fw_log.uart_en) - return 0; - - /* Disable FW logging only when the control queue is still responsive */ - if (!enable && - (!hw->fw_log.actv_evnts || !ice_check_sq_alive(hw, &hw->adminq))) - return 0; - /* Get current FW log settings */ - status = ice_get_fw_log_cfg(hw); - if (status) - return status; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logging); - cmd = &desc.params.fw_logging; - - /* Indicate which controls are valid */ - if (hw->fw_log.cq_en) - cmd->log_ctrl_valid |= ICE_AQC_FW_LOG_AQ_VALID; - - if (hw->fw_log.uart_en) - cmd->log_ctrl_valid |= ICE_AQC_FW_LOG_UART_VALID; - - if (enable) { - /* Fill in an array of entries with FW logging modules and - * logging events being reconfigured. 
- */ - for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) { - u16 val; - - /* Keep track of enabled event types */ - actv_evnts |= hw->fw_log.evnts[i].cfg; - - if (hw->fw_log.evnts[i].cfg == hw->fw_log.evnts[i].cur) - continue; - - if (!data) { - data = devm_kzalloc(ice_hw_to_dev(hw), - ICE_FW_LOG_DESC_SIZE_MAX, - GFP_KERNEL); - if (!data) - return ICE_ERR_NO_MEMORY; + if (recps[i].adv_rule) { + struct ice_adv_fltr_mgmt_list_entry *tmp_entry; + struct ice_adv_fltr_mgmt_list_entry *lst_itr; + + mutex_destroy(&recps[i].filt_rule_lock); + list_for_each_entry_safe(lst_itr, tmp_entry, + &recps[i].filt_rules, + list_entry) { + list_del(&lst_itr->list_entry); + devm_kfree(ice_hw_to_dev(hw), lst_itr->lkups); + devm_kfree(ice_hw_to_dev(hw), lst_itr); } - - val = i << ICE_AQC_FW_LOG_ID_S; - val |= hw->fw_log.evnts[i].cfg << ICE_AQC_FW_LOG_EN_S; - data->entry[chgs++] = cpu_to_le16(val); - } - - /* Only enable FW logging if at least one module is specified. - * If FW logging is currently enabled but all modules are not - * enabled to emit log messages, disable FW logging altogether. - */ - if (actv_evnts) { - /* Leave if there is effectively no change */ - if (!chgs) - goto out; - - if (hw->fw_log.cq_en) - cmd->log_ctrl |= ICE_AQC_FW_LOG_AQ_EN; - - if (hw->fw_log.uart_en) - cmd->log_ctrl |= ICE_AQC_FW_LOG_UART_EN; - - buf = data; - len = ICE_FW_LOG_DESC_SIZE(chgs); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - } - } - - status = ice_aq_send_cmd(hw, &desc, buf, len, NULL); - if (!status) { - /* Update the current configuration to reflect events enabled. - * hw->fw_log.cq_en and hw->fw_log.uart_en indicate if the FW - * logging mode is enabled for the device. They do not reflect - * actual modules being enabled to emit log messages. So, their - * values remain unchanged even when all modules are disabled. - */ - u16 cnt = enable ? chgs : (u16)ICE_AQC_FW_LOG_ID_MAX; - - hw->fw_log.actv_evnts = actv_evnts; - for (i = 0; i < cnt; i++) { - u16 v, m; - - if (!enable) { - /* When disabling all FW logging events as part - * of device's de-initialization, the original - * configurations are retained, and can be used - * to reconfigure FW logging later if the device - * is re-initialized. - */ - hw->fw_log.evnts[i].cur = 0; - continue; + } else { + struct ice_fltr_mgmt_list_entry *lst_itr, *tmp_entry; + + mutex_destroy(&recps[i].filt_rule_lock); + list_for_each_entry_safe(lst_itr, tmp_entry, + &recps[i].filt_rules, + list_entry) { + list_del(&lst_itr->list_entry); + devm_kfree(ice_hw_to_dev(hw), lst_itr); } - - v = le16_to_cpu(data->entry[i]); - m = (v & ICE_AQC_FW_LOG_ID_M) >> ICE_AQC_FW_LOG_ID_S; - hw->fw_log.evnts[m].cur = hw->fw_log.evnts[m].cfg; } + if (recps[i].root_buf) + devm_kfree(ice_hw_to_dev(hw), recps[i].root_buf); } - -out: - if (data) - devm_kfree(ice_hw_to_dev(hw), data); - - return status; + ice_rm_sw_replay_rule_info(hw, sw); + devm_kfree(ice_hw_to_dev(hw), sw->recp_list); + devm_kfree(ice_hw_to_dev(hw), sw); } /** - * ice_output_fw_log + * ice_cleanup_fltr_mgmt_struct - cleanup filter management list and locks * @hw: pointer to the HW struct - * @desc: pointer to the AQ message descriptor - * @buf: pointer to the buffer accompanying the AQ message - * - * Formats a FW Log message and outputs it via the standard driver logs. 
*/ -void ice_output_fw_log(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf) +static void ice_cleanup_fltr_mgmt_struct(struct ice_hw *hw) { - ice_debug(hw, ICE_DBG_FW_LOG, "[ FW Log Msg Start ]\n"); - ice_debug_array(hw, ICE_DBG_FW_LOG, 16, 1, (u8 *)buf, - le16_to_cpu(desc->datalen)); - ice_debug(hw, ICE_DBG_FW_LOG, "[ FW Log Msg End ]\n"); + ice_cleanup_fltr_mgmt_single(hw, hw->switch_info); } + /** - * ice_get_itr_intrl_gran - determine int/intrl granularity + * ice_get_itr_intrl_gran * @hw: pointer to the HW struct * - * Determines the ITR/intrl granularities based on the maximum aggregate + * Determines the ITR/INTRL granularities based on the maximum aggregate * bandwidth according to the device's configuration during power-on. */ static void ice_get_itr_intrl_gran(struct ice_hw *hw) @@ -735,26 +702,36 @@ static void ice_get_itr_intrl_gran(struct ice_hw *hw) } /** - * ice_get_nvm_version - get cached NVM version data + * ice_print_rollback_msg - print FW rollback message * @hw: pointer to the hardware structure - * @oem_ver: 8 bit NVM version - * @oem_build: 16 bit NVM build number - * @oem_patch: 8 NVM patch number - * @ver_hi: high 16 bits of the NVM version - * @ver_lo: low 16 bits of the NVM version */ -void -ice_get_nvm_version(struct ice_hw *hw, u8 *oem_ver, u16 *oem_build, - u8 *oem_patch, u8 *ver_hi, u8 *ver_lo) +void ice_print_rollback_msg(struct ice_hw *hw) { - struct ice_nvm_info *nvm = &hw->nvm; + char nvm_str[ICE_NVM_VER_LEN] = { 0 }; + struct ice_orom_info *orom; + struct ice_nvm_info *nvm; + + orom = &hw->flash.orom; + nvm = &hw->flash.nvm; + + snprintf(nvm_str, sizeof(nvm_str), "%x.%02x 0x%x %d.%d.%d", + nvm->major, nvm->minor, nvm->eetrack, orom->major, + orom->build, orom->patch); + dev_warn(ice_hw_to_dev(hw), + "Firmware rollback mode detected. Current version is NVM: %s, FW: %d.%d. Device may exhibit limited functionality. Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware rollback mode\n", + nvm_str, hw->fw_maj_ver, hw->fw_min_ver); +} + - *oem_ver = (u8)((nvm->oem_ver & ICE_OEM_VER_MASK) >> ICE_OEM_VER_SHIFT); - *oem_patch = (u8)(nvm->oem_ver & ICE_OEM_VER_PATCH_MASK); - *oem_build = (u16)((nvm->oem_ver & ICE_OEM_VER_BUILD_MASK) >> - ICE_OEM_VER_BUILD_SHIFT); - *ver_hi = (nvm->ver & ICE_NVM_VER_HI_MASK) >> ICE_NVM_VER_HI_SHIFT; - *ver_lo = (nvm->ver & ICE_NVM_VER_LO_MASK) >> ICE_NVM_VER_LO_SHIFT; +/** + * ice_set_umac_shared + * @hw: pointer to the hw struct + * + * Set boolean flag to allow unicast MAC sharing + */ +void ice_set_umac_shared(struct ice_hw *hw) +{ + hw->umac_shared = true; } /** @@ -774,34 +751,56 @@ enum ice_status ice_init_hw(struct ice_hw *hw) return status; hw->pf_id = (u8)(rd32(hw, PF_FUNC_RID) & - PF_FUNC_RID_FUNC_NUM_M) >> - PF_FUNC_RID_FUNC_NUM_S; + PF_FUNC_RID_FUNCTION_NUMBER_M) >> + PF_FUNC_RID_FUNCTION_NUMBER_S; + status = ice_reset(hw, ICE_RESET_PFR); if (status) return status; - ice_get_itr_intrl_gran(hw); + status = ice_create_all_ctrlq(hw); if (status) goto err_unroll_cqinit; - /* Enable FW logging. Not fatal if this fails. 
*/ - status = ice_cfg_fw_log(hw, true); - if (status) - ice_debug(hw, ICE_DBG_INIT, "Failed to enable FW logging.\n"); + ice_fwlog_set_support_ena(hw); + status = ice_fwlog_set(hw, &hw->fwlog_cfg); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to enable FW logging, status %d.\n", + status); + } else { + if (hw->fwlog_cfg.options & ICE_FWLOG_OPTION_REGISTER_ON_INIT) { + status = ice_fwlog_register(hw); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to register for FW logging events, status %d.\n", + status); + } else { + status = ice_fwlog_unregister(hw); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to unregister for FW logging events, status %d.\n", + status); + } + } - status = ice_clear_pf_cfg(hw); + status = ice_init_nvm(hw); if (status) goto err_unroll_cqinit; - ice_clear_pxe_mode(hw); + if (ice_get_fw_mode(hw) == ICE_FW_MODE_ROLLBACK) + ice_print_rollback_msg(hw); - status = ice_init_nvm(hw); + status = ice_clear_pf_cfg(hw); if (status) goto err_unroll_cqinit; + /* Set bit to enable Flow Director filters */ + wr32(hw, PFQF_FD_ENA, PFQF_FD_ENA_FD_ENA_M); + INIT_LIST_HEAD(&hw->fdir_list_head); + + ice_clear_pxe_mode(hw); + status = ice_get_caps(hw); if (status) goto err_unroll_cqinit; @@ -822,20 +821,18 @@ enum ice_status ice_init_hw(struct ice_hw *hw) goto err_unroll_alloc; hw->evb_veb = true; - /* Query the allocated resources for Tx scheduler */ status = ice_sched_query_res_alloc(hw); if (status) { - ice_debug(hw, ICE_DBG_SCHED, - "Failed to get scheduler allocated resources\n"); + ice_debug(hw, ICE_DBG_SCHED, "Failed to get scheduler allocated resources\n"); goto err_unroll_alloc; } + ice_sched_get_psm_clk_freq(hw); /* Initialize port_info struct with scheduler data */ status = ice_sched_init_port(hw->port_info); if (status) goto err_unroll_sched; - pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), GFP_KERNEL); if (!pcaps) { status = ICE_ERR_NO_MEMORY; @@ -844,16 +841,17 @@ enum ice_status ice_init_hw(struct ice_hw *hw) /* Initialize port_info struct with PHY capabilities */ status = ice_aq_get_phy_caps(hw->port_info, false, - ICE_AQC_REPORT_TOPO_CAP, pcaps, NULL); + ICE_AQC_REPORT_TOPO_CAP_MEDIA, pcaps, NULL); devm_kfree(ice_hw_to_dev(hw), pcaps); if (status) - goto err_unroll_sched; + dev_warn(ice_hw_to_dev(hw), + "Get PHY capabilities failed status = %d, continuing anyway\n", + status); /* Initialize port_info struct with link information */ status = ice_aq_get_link_info(hw->port_info, false, NULL, NULL); if (status) goto err_unroll_sched; - /* need a valid SW entry point to build a Tx tree */ if (!hw->sw_entry_point_layer) { ice_debug(hw, ICE_DBG_SCHED, "invalid sw entry point\n"); @@ -861,14 +859,16 @@ enum ice_status ice_init_hw(struct ice_hw *hw) goto err_unroll_sched; } INIT_LIST_HEAD(&hw->agg_list); - + /* Initialize max burst size */ + if (!hw->max_burst_size) + ice_cfg_rl_burst_size(hw, ICE_SCHED_DFLT_BURST_SIZE); status = ice_init_fltr_mgmt_struct(hw); if (status) goto err_unroll_sched; - ice_dev_onetime_setup(hw); /* Get MAC information */ + /* A single port can report up to two (LAN and WoL) addresses */ mac_buf = devm_kcalloc(ice_hw_to_dev(hw), 2, sizeof(struct ice_aqc_manage_mac_read_resp), @@ -885,12 +885,15 @@ enum ice_status ice_init_hw(struct ice_hw *hw) if (status) goto err_unroll_fltr_mgmt_struct; - - ice_init_flex_flds(hw, ICE_RXDID_FLEX_NIC); - ice_init_flex_flds(hw, ICE_RXDID_FLEX_NIC_2); + /* Obtain counter base index which would be used by flow director */ + status = ice_alloc_fd_res_cntr(hw, &hw->fd_ctr_base); + if (status) + 
goto err_unroll_fltr_mgmt_struct; status = ice_init_hw_tbls(hw); if (status) goto err_unroll_fltr_mgmt_struct; + mutex_init(&hw->tnl_lock); + return 0; err_unroll_fltr_mgmt_struct: @@ -899,6 +902,7 @@ enum ice_status ice_init_hw(struct ice_hw *hw) ice_sched_cleanup_all(hw); err_unroll_alloc: devm_kfree(ice_hw_to_dev(hw), hw->port_info); + hw->port_info = NULL; err_unroll_cqinit: ice_destroy_all_ctrlq(hw); return status; @@ -914,20 +918,20 @@ enum ice_status ice_init_hw(struct ice_hw *hw) */ void ice_deinit_hw(struct ice_hw *hw) { + ice_free_fd_res_cntr(hw, hw->fd_ctr_base); ice_cleanup_fltr_mgmt_struct(hw); ice_sched_cleanup_all(hw); ice_sched_clear_agg(hw); ice_free_seg(hw); ice_free_hw_tbls(hw); + mutex_destroy(&hw->tnl_lock); if (hw->port_info) { devm_kfree(ice_hw_to_dev(hw), hw->port_info); hw->port_info = NULL; } - /* Attempt to disable FW logging before shutting down control queues */ - ice_cfg_fw_log(hw, false); ice_destroy_all_ctrlq(hw); /* Clear VSI contexts if not already cleared */ @@ -940,25 +944,24 @@ void ice_deinit_hw(struct ice_hw *hw) */ enum ice_status ice_check_reset(struct ice_hw *hw) { - u32 cnt, reg = 0, grst_delay, uld_mask; + u32 cnt, reg = 0, grst_timeout, uld_mask; /* Poll for Device Active state in case a recent CORER, GLOBR, * or EMPR has occurred. The grst delay value is in 100ms units. * Add 1sec for outstanding AQ commands that can take a long time. */ - grst_delay = ((rd32(hw, GLGEN_RSTCTL) & GLGEN_RSTCTL_GRSTDEL_M) >> - GLGEN_RSTCTL_GRSTDEL_S) + 10; + grst_timeout = ((rd32(hw, GLGEN_RSTCTL) & GLGEN_RSTCTL_GRSTDEL_M) >> + GLGEN_RSTCTL_GRSTDEL_S) + 10; - for (cnt = 0; cnt < grst_delay; cnt++) { - mdelay(100); + for (cnt = 0; cnt < grst_timeout; cnt++) { + msleep(100); reg = rd32(hw, GLGEN_RSTAT); if (!(reg & GLGEN_RSTAT_DEVSTATE_M)) break; } - if (cnt == grst_delay) { - ice_debug(hw, ICE_DBG_INIT, - "Global reset polling failed to complete.\n"); + if (cnt == grst_timeout) { + ice_debug(hw, ICE_DBG_INIT, "Global reset polling failed to complete.\n"); return ICE_ERR_RESET_FAILED; } @@ -970,22 +973,21 @@ enum ice_status ice_check_reset(struct ice_hw *hw) GLNVM_ULD_POR_DONE_1_M |\ GLNVM_ULD_PCIER_DONE_2_M) - uld_mask = ICE_RESET_DONE_MASK; + uld_mask = ICE_RESET_DONE_MASK | (hw->func_caps.common_cap.iwarp ? + GLNVM_ULD_PE_DONE_M : 0); /* Device is Active; check Global Reset processes are done */ for (cnt = 0; cnt < ICE_PF_RESET_WAIT_COUNT; cnt++) { reg = rd32(hw, GLNVM_ULD) & uld_mask; if (reg == uld_mask) { - ice_debug(hw, ICE_DBG_INIT, - "Global reset processes done. %d\n", cnt); + ice_debug(hw, ICE_DBG_INIT, "Global reset processes done. %d\n", cnt); break; } - mdelay(10); + msleep(10); } if (cnt == ICE_PF_RESET_WAIT_COUNT) { - ice_debug(hw, ICE_DBG_INIT, - "Wait for Reset Done timed out. GLNVM_ULD = 0x%x\n", + ice_debug(hw, ICE_DBG_INIT, "Wait for Reset Done timed out. GLNVM_ULD = 0x%x\n", reg); return ICE_ERR_RESET_FAILED; } @@ -1023,17 +1025,21 @@ static enum ice_status ice_pf_reset(struct ice_hw *hw) wr32(hw, PFGEN_CTRL, (reg | PFGEN_CTRL_PFSWR_M)); - for (cnt = 0; cnt < ICE_PF_RESET_WAIT_COUNT; cnt++) { + /* Wait for the PFR to complete. The wait time is the global config lock + * timeout plus the PFR timeout which will account for a possible reset + * that is occurring during a download package operation. 
+ */ + for (cnt = 0; cnt < ICE_GLOBAL_CFG_LOCK_TIMEOUT + + ICE_PF_RESET_WAIT_COUNT; cnt++) { reg = rd32(hw, PFGEN_CTRL); if (!(reg & PFGEN_CTRL_PFSWR_M)) break; - mdelay(1); + msleep(1); } if (cnt == ICE_PF_RESET_WAIT_COUNT) { - ice_debug(hw, ICE_DBG_INIT, - "PF reset polling failed to complete.\n"); + ice_debug(hw, ICE_DBG_INIT, "PF reset polling failed to complete.\n"); return ICE_ERR_RESET_FAILED; } @@ -1075,10 +1081,12 @@ enum ice_status ice_reset(struct ice_hw *hw, enum ice_reset_req req) wr32(hw, GLGEN_RTRIG, val); ice_flush(hw); + /* wait for the FW to be ready */ return ice_check_reset(hw); } + /** * ice_copy_rxq_ctx_to_hw * @hw: pointer to the hardware structure @@ -1157,10 +1165,31 @@ ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, rlan_ctx->prefena = 1; - ice_set_ctx((u8 *)rlan_ctx, ctx_buf, ice_rlan_ctx_info); + ice_set_ctx(hw, (u8 *)rlan_ctx, ctx_buf, ice_rlan_ctx_info); return ice_copy_rxq_ctx_to_hw(hw, ctx_buf, rxq_index); } +/** + * ice_clear_rxq_ctx + * @hw: pointer to the hardware structure + * @rxq_index: the index of the Rx queue to clear + * + * Clears rxq context in HW register space + */ +enum ice_status ice_clear_rxq_ctx(struct ice_hw *hw, u32 rxq_index) +{ + u8 i; + + if (rxq_index > QRX_CTRL_MAX_INDEX) + return ICE_ERR_PARAM; + + /* Clear each dword register separately */ + for (i = 0; i < ICE_RXQ_CTX_SIZE_DWORDS; i++) + wr32(hw, QRX_CONTEXT(i, rxq_index), 0); + + return 0; +} + /* LAN Tx Queue Context */ const struct ice_ctx_ele ice_tlan_ctx_info[] = { /* Field Width LSB */ @@ -1196,93 +1225,445 @@ const struct ice_ctx_ele ice_tlan_ctx_info[] = { }; /** - * ice_debug_cq + * ice_copy_tx_cmpltnq_ctx_to_hw * @hw: pointer to the hardware structure - * @mask: debug mask - * @desc: pointer to control queue descriptor - * @buf: pointer to command buffer - * @buf_len: max length of buf + * @ice_tx_cmpltnq_ctx: pointer to the Tx completion queue context + * @tx_cmpltnq_index: the index of the completion queue * - * Dumps debug log about control command with descriptor contents. 
+ * Copies Tx completion queue context from dense structure to HW register space */ -void -ice_debug_cq(struct ice_hw *hw, u32 __maybe_unused mask, void *desc, void *buf, - u16 buf_len) +static enum ice_status +ice_copy_tx_cmpltnq_ctx_to_hw(struct ice_hw *hw, u8 *ice_tx_cmpltnq_ctx, + u32 tx_cmpltnq_index) { - struct ice_aq_desc *cq_desc = (struct ice_aq_desc *)desc; - u16 len; - -#ifndef CONFIG_DYNAMIC_DEBUG - if (!(mask & hw->debug_mask)) - return; -#endif + u8 i; - if (!desc) - return; + if (!ice_tx_cmpltnq_ctx) + return ICE_ERR_BAD_PTR; - len = le16_to_cpu(cq_desc->datalen); + if (tx_cmpltnq_index > GLTCLAN_CQ_CNTX0_MAX_INDEX) + return ICE_ERR_PARAM; - ice_debug(hw, mask, - "CQ CMD: opcode 0x%04X, flags 0x%04X, datalen 0x%04X, retval 0x%04X\n", - le16_to_cpu(cq_desc->opcode), - le16_to_cpu(cq_desc->flags), - le16_to_cpu(cq_desc->datalen), le16_to_cpu(cq_desc->retval)); - ice_debug(hw, mask, "\tcookie (h,l) 0x%08X 0x%08X\n", - le32_to_cpu(cq_desc->cookie_high), - le32_to_cpu(cq_desc->cookie_low)); - ice_debug(hw, mask, "\tparam (0,1) 0x%08X 0x%08X\n", - le32_to_cpu(cq_desc->params.generic.param0), - le32_to_cpu(cq_desc->params.generic.param1)); - ice_debug(hw, mask, "\taddr (h,l) 0x%08X 0x%08X\n", - le32_to_cpu(cq_desc->params.generic.addr_high), - le32_to_cpu(cq_desc->params.generic.addr_low)); - if (buf && cq_desc->datalen != 0) { - ice_debug(hw, mask, "Buffer:\n"); - if (buf_len < len) - len = buf_len; + /* Copy each dword separately to HW */ + for (i = 0; i < ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS; i++) { + wr32(hw, GLTCLAN_CQ_CNTX(i, tx_cmpltnq_index), + *((u32 *)(ice_tx_cmpltnq_ctx + (i * sizeof(u32))))); - ice_debug_array(hw, mask, 16, 1, (u8 *)buf, len); + ice_debug(hw, ICE_DBG_QCTX, "cmpltnqdata[%d]: %08X\n", i, + *((u32 *)(ice_tx_cmpltnq_ctx + (i * sizeof(u32))))); } + + return 0; } -/* FW Admin Queue command wrappers */ +/* LAN Tx Completion Queue Context */ +static const struct ice_ctx_ele ice_tx_cmpltnq_ctx_info[] = { + /* Field Width LSB */ + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, base, 57, 0), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, q_len, 18, 64), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, generation, 1, 96), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, wrt_ptr, 22, 97), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, pf_num, 3, 128), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, vmvf_num, 10, 131), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, vmvf_type, 2, 141), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, tph_desc_wr, 1, 160), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, cpuid, 8, 161), + ICE_CTX_STORE(ice_tx_cmpltnq_ctx, cmpltn_cache, 512, 192), + { 0 } +}; -/* Software lock/mutex that is meant to be held while the Global Config Lock - * in firmware is acquired by the software to prevent most (but not all) types - * of AQ commands from being sent to FW +/** + * ice_write_tx_cmpltnq_ctx + * @hw: pointer to the hardware structure + * @tx_cmpltnq_ctx: pointer to the completion queue context + * @tx_cmpltnq_index: the index of the completion queue + * + * Converts completion queue context from sparse to dense structure and then + * writes it to HW register space */ -DEFINE_MUTEX(ice_global_cfg_lock_sw); +enum ice_status +ice_write_tx_cmpltnq_ctx(struct ice_hw *hw, + struct ice_tx_cmpltnq_ctx *tx_cmpltnq_ctx, + u32 tx_cmpltnq_index) +{ + u8 ctx_buf[ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS * sizeof(u32)] = { 0 }; + + ice_set_ctx(hw, (u8 *)tx_cmpltnq_ctx, ctx_buf, ice_tx_cmpltnq_ctx_info); + return ice_copy_tx_cmpltnq_ctx_to_hw(hw, ctx_buf, tx_cmpltnq_index); +} /** - * ice_aq_send_cmd - send FW Admin Queue command to FW Admin Queue - * @hw: pointer to the HW 
struct - * @desc: descriptor describing the command - * @buf: buffer to use for indirect commands (NULL for direct commands) - * @buf_size: size of buffer for indirect commands (0 for direct commands) - * @cd: pointer to command details structure + * ice_clear_tx_cmpltnq_ctx + * @hw: pointer to the hardware structure + * @tx_cmpltnq_index: the index of the completion queue to clear * - * Helper function to send FW Admin Queue commands to the FW Admin Queue. + * Clears Tx completion queue context in HW register space */ enum ice_status -ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf, - u16 buf_size, struct ice_sq_cd *cd) +ice_clear_tx_cmpltnq_ctx(struct ice_hw *hw, u32 tx_cmpltnq_index) { - struct ice_aqc_req_res *cmd = &desc->params.res_owner; - bool lock_acquired = false; - enum ice_status status; + u8 i; - /* When a package download is in process (i.e. when the firmware's + if (tx_cmpltnq_index > GLTCLAN_CQ_CNTX0_MAX_INDEX) + return ICE_ERR_PARAM; + + /* Clear each dword register separately */ + for (i = 0; i < ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS; i++) + wr32(hw, GLTCLAN_CQ_CNTX(i, tx_cmpltnq_index), 0); + + return 0; +} + +/** + * ice_copy_tx_drbell_q_ctx_to_hw + * @hw: pointer to the hardware structure + * @ice_tx_drbell_q_ctx: pointer to the doorbell queue context + * @tx_drbell_q_index: the index of the doorbell queue + * + * Copies doorbell queue context from dense structure to HW register space + */ +static enum ice_status +ice_copy_tx_drbell_q_ctx_to_hw(struct ice_hw *hw, u8 *ice_tx_drbell_q_ctx, + u32 tx_drbell_q_index) +{ + u8 i; + + if (!ice_tx_drbell_q_ctx) + return ICE_ERR_BAD_PTR; + + if (tx_drbell_q_index > QTX_COMM_DBLQ_DBELL_MAX_INDEX) + return ICE_ERR_PARAM; + + /* Copy each dword separately to HW */ + for (i = 0; i < ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS; i++) { + wr32(hw, QTX_COMM_DBLQ_CNTX(i, tx_drbell_q_index), + *((u32 *)(ice_tx_drbell_q_ctx + (i * sizeof(u32))))); + + ice_debug(hw, ICE_DBG_QCTX, "tx_drbell_qdata[%d]: %08X\n", i, + *((u32 *)(ice_tx_drbell_q_ctx + (i * sizeof(u32))))); + } + + return 0; +} + +/* LAN Tx Doorbell Queue Context info */ +static const struct ice_ctx_ele ice_tx_drbell_q_ctx_info[] = { + /* Field Width LSB */ + ICE_CTX_STORE(ice_tx_drbell_q_ctx, base, 57, 0), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, ring_len, 13, 64), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, pf_num, 3, 80), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, vf_num, 8, 84), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, vmvf_type, 2, 94), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, cpuid, 8, 96), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, tph_desc_rd, 1, 104), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, tph_desc_wr, 1, 108), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, db_q_en, 1, 112), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, rd_head, 13, 128), + ICE_CTX_STORE(ice_tx_drbell_q_ctx, rd_tail, 13, 144), + { 0 } +}; + +/** + * ice_write_tx_drbell_q_ctx + * @hw: pointer to the hardware structure + * @tx_drbell_q_ctx: pointer to the doorbell queue context + * @tx_drbell_q_index: the index of the doorbell queue + * + * Converts doorbell queue context from sparse to dense structure and then + * writes it to HW register space + */ +enum ice_status +ice_write_tx_drbell_q_ctx(struct ice_hw *hw, + struct ice_tx_drbell_q_ctx *tx_drbell_q_ctx, + u32 tx_drbell_q_index) +{ + u8 ctx_buf[ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS * sizeof(u32)] = { 0 }; + + ice_set_ctx(hw, (u8 *)tx_drbell_q_ctx, ctx_buf, + ice_tx_drbell_q_ctx_info); + return ice_copy_tx_drbell_q_ctx_to_hw(hw, ctx_buf, tx_drbell_q_index); +} + +/** + * ice_clear_tx_drbell_q_ctx + * @hw: 
pointer to the hardware structure + * @tx_drbell_q_index: the index of the doorbell queue to clear + * + * Clears doorbell queue context in HW register space + */ +enum ice_status +ice_clear_tx_drbell_q_ctx(struct ice_hw *hw, u32 tx_drbell_q_index) +{ + u8 i; + + if (tx_drbell_q_index > QTX_COMM_DBLQ_DBELL_MAX_INDEX) + return ICE_ERR_PARAM; + + /* Clear each dword register separately */ + for (i = 0; i < ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS; i++) + wr32(hw, QTX_COMM_DBLQ_CNTX(i, tx_drbell_q_index), 0); + + return 0; +} + +/* Sideband Queue command wrappers */ + +/** + * ice_get_sbq - returns the right control queue to use for sideband + * @hw: pointer to the hardware structure + */ +static struct ice_ctl_q_info *ice_get_sbq(struct ice_hw *hw) +{ + if (!ice_is_generic_mac(hw)) + return &hw->adminq; + return &hw->sbq; +} + +/** + * ice_sbq_send_cmd - send Sideband Queue command to Sideband Queue + * @hw: pointer to the HW struct + * @desc: descriptor describing the command + * @buf: buffer to use for indirect commands (NULL for direct commands) + * @buf_size: size of buffer for indirect commands (0 for direct commands) + * @cd: pointer to command details structure + */ +static enum ice_status +ice_sbq_send_cmd(struct ice_hw *hw, struct ice_sbq_cmd_desc *desc, + void *buf, u16 buf_size, struct ice_sq_cd *cd) +{ + return ice_sq_send_cmd(hw, ice_get_sbq(hw), (struct ice_aq_desc *)desc, + buf, buf_size, cd); +} + +/** + * ice_sbq_send_cmd_nolock - send Sideband Queue command to Sideband Queue + * but do not lock sq_lock + * @hw: pointer to the HW struct + * @desc: descriptor describing the command + * @buf: buffer to use for indirect commands (NULL for direct commands) + * @buf_size: size of buffer for indirect commands (0 for direct commands) + * @cd: pointer to command details structure + */ +static enum ice_status +ice_sbq_send_cmd_nolock(struct ice_hw *hw, struct ice_sbq_cmd_desc *desc, + void *buf, u16 buf_size, struct ice_sq_cd *cd) +{ + return ice_sq_send_cmd_nolock(hw, ice_get_sbq(hw), + (struct ice_aq_desc *)desc, buf, + buf_size, cd); +} + +/** + * ice_sbq_rw_reg_lp - Fill Sideband Queue command, with lock parameter + * @hw: pointer to the HW struct + * @in: message info to be filled in descriptor + * @lock: true to lock the sq_lock (the usual case); false if the sq_lock has + * already been locked at a higher level + */ +enum ice_status ice_sbq_rw_reg_lp(struct ice_hw *hw, + struct ice_sbq_msg_input *in, bool lock) +{ + struct ice_sbq_cmd_desc desc = {0}; + struct ice_sbq_msg_req msg = {0}; + enum ice_status status; + u16 msg_len; + + msg_len = sizeof(msg); + + msg.dest_dev = in->dest_dev; + msg.opcode = in->opcode; + msg.flags = ICE_SBQ_MSG_FLAGS; + msg.sbe_fbe = ICE_SBQ_MSG_SBE_FBE; + msg.msg_addr_low = cpu_to_le16(in->msg_addr_low); + msg.msg_addr_high = cpu_to_le32(in->msg_addr_high); + + if (in->opcode) + msg.data = cpu_to_le32(in->data); + else + /* data read comes back in completion, so shorten the struct by + * sizeof(msg.data) + */ + msg_len -= sizeof(msg.data); + + desc.flags = cpu_to_le16(ICE_AQ_FLAG_RD); + desc.opcode = cpu_to_le16(ice_sbq_opc_neigh_dev_req); + desc.param0.cmd_len = cpu_to_le16(msg_len); + if (lock) + status = ice_sbq_send_cmd(hw, &desc, &msg, msg_len, NULL); + else + status = ice_sbq_send_cmd_nolock(hw, &desc, &msg, msg_len, + NULL); + if (!status && !in->opcode) + in->data = le32_to_cpu + (((struct ice_sbq_msg_cmpl *)&msg)->data); + return status; +} + +/** + * ice_sbq_rw_reg - Fill Sideband Queue command + * @hw: pointer to the HW struct + * @in: message 
info to be filled in descriptor + */ +enum ice_status ice_sbq_rw_reg(struct ice_hw *hw, struct ice_sbq_msg_input *in) +{ + return ice_sbq_rw_reg_lp(hw, in, true); +} + +/** + * ice_sbq_lock - Lock the sideband queue's sq_lock + * @hw: pointer to the HW struct + */ +void ice_sbq_lock(struct ice_hw *hw) +{ + mutex_lock(&ice_get_sbq(hw)->sq_lock); +} + +/** + * ice_sbq_unlock - Unlock the sideband queue's sq_lock + * @hw: pointer to the HW struct + */ +void ice_sbq_unlock(struct ice_hw *hw) +{ + mutex_unlock(&ice_get_sbq(hw)->sq_lock); +} + +/* FW Admin Queue command wrappers */ + +/* Software lock/mutex that is meant to be held while the Global Config Lock + * in firmware is acquired by the software to prevent most (but not all) types + * of AQ commands from being sent to FW + */ +DEFINE_MUTEX(ice_global_cfg_lock_sw); + +/** + * ice_should_retry_sq_send_cmd + * @opcode: AQ opcode + * + * Decide if we should retry the send command routine for the ATQ, depending + * on the opcode. + */ +static bool ice_should_retry_sq_send_cmd(u16 opcode) +{ + switch (opcode) { + case ice_aqc_opc_dnl_get_status: + case ice_aqc_opc_dnl_run: + case ice_aqc_opc_dnl_call: + case ice_aqc_opc_dnl_read_sto: + case ice_aqc_opc_dnl_write_sto: + case ice_aqc_opc_dnl_set_breakpoints: + case ice_aqc_opc_dnl_read_log: + case ice_aqc_opc_get_link_topo: + case ice_aqc_opc_lldp_stop: + case ice_aqc_opc_lldp_start: + case ice_aqc_opc_lldp_filter_ctrl: + return true; + } + + return false; +} + +/** + * ice_sq_send_cmd_retry - send command to Control Queue (ATQ) + * @hw: pointer to the HW struct + * @cq: pointer to the specific Control queue + * @desc: prefilled descriptor describing the command + * @buf: buffer to use for indirect commands (or NULL for direct commands) + * @buf_size: size of buffer for indirect commands (or 0 for direct commands) + * @cd: pointer to command details structure + * + * Retry sending the FW Admin Queue command, multiple times, to the FW Admin + * Queue if the EBUSY AQ error is returned. + */ +static enum ice_status +ice_sq_send_cmd_retry(struct ice_hw *hw, struct ice_ctl_q_info *cq, + struct ice_aq_desc *desc, void *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc_cpy; + enum ice_status status; + bool is_cmd_for_retry; + u8 *buf_cpy = NULL; + u8 idx = 0; + u16 opcode; + + opcode = le16_to_cpu(desc->opcode); + is_cmd_for_retry = ice_should_retry_sq_send_cmd(opcode); + memset(&desc_cpy, 0, sizeof(desc_cpy)); + + if (is_cmd_for_retry) { + if (buf) { + buf_cpy = devm_kzalloc(ice_hw_to_dev(hw), buf_size, + GFP_KERNEL); + if (!buf_cpy) + return ICE_ERR_NO_MEMORY; + } + + memcpy(&desc_cpy, desc, sizeof(desc_cpy)); + } + + do { + status = ice_sq_send_cmd(hw, cq, desc, buf, buf_size, cd); + + if (!is_cmd_for_retry || !status || + hw->adminq.sq_last_status != ICE_AQ_RC_EBUSY) + break; + + if (buf_cpy) + memcpy(buf, buf_cpy, buf_size); + + memcpy(desc, &desc_cpy, sizeof(desc_cpy)); + + mdelay(ICE_SQ_SEND_DELAY_TIME_MS); + + } while (++idx < ICE_SQ_SEND_MAX_EXECUTE); + + if (buf_cpy) + devm_kfree(ice_hw_to_dev(hw), buf_cpy); + + return status; +} + +/** + * ice_aq_send_cmd - send FW Admin Queue command to FW Admin Queue + * @hw: pointer to the HW struct + * @desc: descriptor describing the command + * @buf: buffer to use for indirect commands (NULL for direct commands) + * @buf_size: size of buffer for indirect commands (0 for direct commands) + * @cd: pointer to command details structure + * + * Helper function to send FW Admin Queue commands to the FW Admin Queue. 
+ */ +enum ice_status +ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf, + u16 buf_size, struct ice_sq_cd *cd) +{ + struct ice_aqc_req_res *cmd = &desc->params.res_owner; + bool lock_acquired = false; + enum ice_status status; + + /* When a package download is in process (i.e. when the firmware's * Global Configuration Lock resource is held), only the Download - * Package, Get Version, Get Package Info List and Release Resource - * (with resource ID set to Global Config Lock) AdminQ commands are - * allowed; all others must block until the package download completes - * and the Global Config Lock is released. See also - * ice_acquire_global_cfg_lock(). + * Package, Get Version, Get Package Info List, Upload Section, + * Update Package, Set Port Parameters, Get/Set VLAN Mode Parameters, + * Add Recipe, Set Recipes to Profile Association, Get Recipe, and Get + * Recipes to Profile Association, and Release Resource (with resource + * ID set to Global Config Lock) AdminQ commands are allowed; all others + * must block until the package download completes and the Global Config + * Lock is released. See also ice_acquire_global_cfg_lock(). */ switch (le16_to_cpu(desc->opcode)) { case ice_aqc_opc_download_pkg: case ice_aqc_opc_get_pkg_info_list: case ice_aqc_opc_get_ver: + case ice_aqc_opc_upload_section: + case ice_aqc_opc_update_pkg: + case ice_aqc_opc_set_port_params: + case ice_aqc_opc_get_vlan_mode_parameters: + case ice_aqc_opc_set_vlan_mode_parameters: + case ice_aqc_opc_add_recipe: + case ice_aqc_opc_recipe_to_profile: + case ice_aqc_opc_get_recipe: + case ice_aqc_opc_get_recipe_to_profile: break; case ice_aqc_opc_release_res: if (le16_to_cpu(cmd->res_id) == ICE_AQC_RES_ID_GLBL_LOCK) @@ -1294,7 +1675,7 @@ ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf, break; } - status = ice_sq_send_cmd(hw, &hw->adminq, desc, buf, buf_size, cd); + status = ice_sq_send_cmd_retry(hw, &hw->adminq, desc, buf, buf_size, cd); if (lock_acquired) mutex_unlock(&ice_global_cfg_lock_sw); @@ -1536,13 +1917,12 @@ ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res, goto ice_acquire_res_exit; if (status) - ice_debug(hw, ICE_DBG_RES, - "resource %d acquire type %d failed.\n", res, access); + ice_debug(hw, ICE_DBG_RES, "resource %d acquire type %d failed.\n", res, access); /* If necessary, poll until the current lock owner timeouts */ timeout = time_left; while (status && timeout && time_left) { - mdelay(delay); + msleep(delay); timeout = (timeout > delay) ? 
timeout - delay : 0; status = ice_aq_req_res(hw, res, access, 0, &time_left, NULL); @@ -1560,11 +1940,9 @@ ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res, ice_acquire_res_exit: if (status == ICE_ERR_AQ_NO_WORK) { if (access == ICE_RES_WRITE) - ice_debug(hw, ICE_DBG_RES, - "resource indicates no work to do.\n"); + ice_debug(hw, ICE_DBG_RES, "resource indicates no work to do.\n"); else - ice_debug(hw, ICE_DBG_RES, - "Warning: ICE_ERR_AQ_NO_WORK not expected\n"); + ice_debug(hw, ICE_DBG_RES, "Warning: ICE_ERR_AQ_NO_WORK not expected\n"); } return status; } @@ -1588,1977 +1966,4503 @@ void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res) */ while ((status == ICE_ERR_AQ_TIMEOUT) && (total_delay < hw->adminq.sq_cmd_timeout)) { - mdelay(1); + msleep(1); status = ice_aq_release_res(hw, res, 0, NULL); total_delay++; } } /** - * ice_get_num_per_func - determine number of resources per PF - * @hw: pointer to the HW structure - * @max: value to be evenly split between each PF + * ice_aq_alloc_free_res - command to allocate/free resources + * @hw: pointer to the HW struct + * @num_entries: number of resource entries in buffer + * @buf: Indirect buffer to hold data parameters and response + * @buf_size: size of buffer for indirect commands + * @opc: pass in the command opcode + * @cd: pointer to command details structure or NULL * - * Determine the number of valid functions by going through the bitmap returned - * from parsing capabilities and use this to calculate the number of resources - * per PF based on the max value passed in. + * Helper function to allocate/free resources using the admin queue commands */ -static u32 ice_get_num_per_func(struct ice_hw *hw, u32 max) +enum ice_status +ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, + enum ice_adminq_opc opc, struct ice_sq_cd *cd) { - u8 funcs; + struct ice_aqc_alloc_free_res_cmd *cmd; + struct ice_aq_desc desc; -#define ICE_CAPS_VALID_FUNCS_M 0xFF - funcs = hweight8(hw->dev_caps.common_cap.valid_functions & - ICE_CAPS_VALID_FUNCS_M); + cmd = &desc.params.sw_res_ctrl; - if (!funcs) - return 0; + if (!buf) + return ICE_ERR_PARAM; - return max / funcs; + if (buf_size < flex_array_size(buf, elem, num_entries)) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, opc); + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_entries = cpu_to_le16(num_entries); + + return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); } /** - * ice_parse_caps - parse function/device capabilities + * ice_alloc_hw_res - allocate resource * @hw: pointer to the HW struct - * @buf: pointer to a buffer containing function/device capability records - * @cap_count: number of capability records in the list - * @opc: type of capabilities list to parse - * - * Helper function to parse function(0x000a)/device(0x000b) capabilities list. 
+ * @type: type of resource + * @num: number of resources to allocate + * @btm: allocate from bottom + * @res: pointer to array that will receive the resources */ -static void -ice_parse_caps(struct ice_hw *hw, void *buf, u32 cap_count, - enum ice_adminq_opc opc) +enum ice_status +ice_alloc_hw_res(struct ice_hw *hw, u16 type, u16 num, bool btm, u16 *res) { - struct ice_aqc_list_caps_elem *cap_resp; - struct ice_hw_func_caps *func_p = NULL; - struct ice_hw_dev_caps *dev_p = NULL; - struct ice_hw_common_caps *caps; - char const *prefix; - u32 i; + struct ice_aqc_alloc_free_res_elem *buf; + enum ice_status status; + u16 buf_len; + buf_len = struct_size(buf, elem, num); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); if (!buf) - return; + return ICE_ERR_NO_MEMORY; - cap_resp = (struct ice_aqc_list_caps_elem *)buf; + /* Prepare buffer to allocate resource. */ + buf->num_elems = cpu_to_le16(num); + buf->res_type = cpu_to_le16(type | ICE_AQC_RES_TYPE_FLAG_DEDICATED | + ICE_AQC_RES_TYPE_FLAG_IGNORE_INDEX); + if (btm) + buf->res_type |= cpu_to_le16(ICE_AQC_RES_TYPE_FLAG_SCAN_BOTTOM); - if (opc == ice_aqc_opc_list_dev_caps) { - dev_p = &hw->dev_caps; - caps = &dev_p->common_cap; - prefix = "dev cap"; - } else if (opc == ice_aqc_opc_list_func_caps) { - func_p = &hw->func_caps; - caps = &func_p->common_cap; - prefix = "func cap"; - } else { - ice_debug(hw, ICE_DBG_INIT, "wrong opcode\n"); - return; - } + status = ice_aq_alloc_free_res(hw, 1, buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (status) + goto ice_alloc_res_exit; - for (i = 0; caps && i < cap_count; i++, cap_resp++) { - u32 logical_id = le32_to_cpu(cap_resp->logical_id); - u32 phys_id = le32_to_cpu(cap_resp->phys_id); - u32 number = le32_to_cpu(cap_resp->number); - u16 cap = le16_to_cpu(cap_resp->cap); + memcpy(res, buf->elem, sizeof(*buf->elem) * num); - switch (cap) { - case ICE_AQC_CAPS_VALID_FUNCTIONS: - caps->valid_functions = number; - ice_debug(hw, ICE_DBG_INIT, - "%s: valid_functions (bitmap) = %d\n", prefix, - caps->valid_functions); - break; - case ICE_AQC_CAPS_SRIOV: - caps->sr_iov_1_1 = (number == 1); - ice_debug(hw, ICE_DBG_INIT, - "%s: sr_iov_1_1 = %d\n", prefix, - caps->sr_iov_1_1); - break; - case ICE_AQC_CAPS_VF: - if (dev_p) { - dev_p->num_vfs_exposed = number; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_vfs_exposed = %d\n", prefix, - dev_p->num_vfs_exposed); - } else if (func_p) { - func_p->num_allocd_vfs = number; - func_p->vf_base_id = logical_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_allocd_vfs = %d\n", prefix, - func_p->num_allocd_vfs); - ice_debug(hw, ICE_DBG_INIT, - "%s: vf_base_id = %d\n", prefix, - func_p->vf_base_id); - } - break; - case ICE_AQC_CAPS_VSI: - if (dev_p) { - dev_p->num_vsi_allocd_to_host = number; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_vsi_allocd_to_host = %d\n", - prefix, - dev_p->num_vsi_allocd_to_host); - } else if (func_p) { - func_p->guar_num_vsi = - ice_get_num_per_func(hw, ICE_MAX_VSI); - ice_debug(hw, ICE_DBG_INIT, - "%s: guar_num_vsi (fw) = %d\n", - prefix, number); - ice_debug(hw, ICE_DBG_INIT, - "%s: guar_num_vsi = %d\n", - prefix, func_p->guar_num_vsi); - } - break; - case ICE_AQC_CAPS_DCB: - caps->dcb = (number == 1); - caps->active_tc_bitmap = logical_id; - caps->maxtc = phys_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: dcb = %d\n", prefix, caps->dcb); - ice_debug(hw, ICE_DBG_INIT, - "%s: active_tc_bitmap = %d\n", prefix, - caps->active_tc_bitmap); - ice_debug(hw, ICE_DBG_INIT, - "%s: maxtc = %d\n", prefix, caps->maxtc); - break; - case ICE_AQC_CAPS_RSS: - 
caps->rss_table_size = number; - caps->rss_table_entry_width = logical_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: rss_table_size = %d\n", prefix, - caps->rss_table_size); - ice_debug(hw, ICE_DBG_INIT, - "%s: rss_table_entry_width = %d\n", prefix, - caps->rss_table_entry_width); - break; - case ICE_AQC_CAPS_RXQS: - caps->num_rxq = number; - caps->rxq_first_id = phys_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_rxq = %d\n", prefix, - caps->num_rxq); - ice_debug(hw, ICE_DBG_INIT, - "%s: rxq_first_id = %d\n", prefix, - caps->rxq_first_id); - break; - case ICE_AQC_CAPS_TXQS: - caps->num_txq = number; - caps->txq_first_id = phys_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_txq = %d\n", prefix, - caps->num_txq); - ice_debug(hw, ICE_DBG_INIT, - "%s: txq_first_id = %d\n", prefix, - caps->txq_first_id); - break; - case ICE_AQC_CAPS_MSIX: - caps->num_msix_vectors = number; - caps->msix_vector_first_id = phys_id; - ice_debug(hw, ICE_DBG_INIT, - "%s: num_msix_vectors = %d\n", prefix, - caps->num_msix_vectors); - ice_debug(hw, ICE_DBG_INIT, - "%s: msix_vector_first_id = %d\n", prefix, - caps->msix_vector_first_id); - break; - case ICE_AQC_CAPS_MAX_MTU: - caps->max_mtu = number; - ice_debug(hw, ICE_DBG_INIT, "%s: max_mtu = %d\n", - prefix, caps->max_mtu); - break; - default: - ice_debug(hw, ICE_DBG_INIT, - "%s: unknown capability[%d]: 0x%x\n", prefix, - i, cap); - break; - } - } +ice_alloc_res_exit: + devm_kfree(ice_hw_to_dev(hw), buf); + return status; } /** - * ice_aq_discover_caps - query function/device capabilities + * ice_free_hw_res - free allocated HW resource * @hw: pointer to the HW struct - * @buf: a virtual buffer to hold the capabilities - * @buf_size: Size of the virtual buffer - * @cap_count: cap count needed if AQ err==ENOMEM - * @opc: capabilities type to discover - pass in the command opcode - * @cd: pointer to command details structure or NULL - * - * Get the function(0x000a)/device(0x000b) capabilities description from - * the firmware. + * @type: type of resource to free + * @num: number of resources + * @res: pointer to array that contains the resources to free */ -static enum ice_status -ice_aq_discover_caps(struct ice_hw *hw, void *buf, u16 buf_size, u32 *cap_count, - enum ice_adminq_opc opc, struct ice_sq_cd *cd) +enum ice_status ice_free_hw_res(struct ice_hw *hw, u16 type, u16 num, u16 *res) { - struct ice_aqc_list_caps *cmd; - struct ice_aq_desc desc; + struct ice_aqc_alloc_free_res_elem *buf; enum ice_status status; + u16 buf_len; - cmd = &desc.params.get_cap; + buf_len = struct_size(buf, elem, num); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; - if (opc != ice_aqc_opc_list_func_caps && - opc != ice_aqc_opc_list_dev_caps) - return ICE_ERR_PARAM; + /* Prepare buffer to free resource. 
*/ + buf->num_elems = cpu_to_le16(num); + buf->res_type = cpu_to_le16(type); + memcpy(buf->elem, res, sizeof(*buf->elem) * num); - ice_fill_dflt_direct_cmd_desc(&desc, opc); + status = ice_aq_alloc_free_res(hw, num, buf, buf_len, + ice_aqc_opc_free_res, NULL); + if (status) + ice_debug(hw, ICE_DBG_SW, "CQ CMD Buffer:\n"); - status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); - if (!status) - ice_parse_caps(hw, buf, le32_to_cpu(cmd->count), opc); - else if (hw->adminq.sq_last_status == ICE_AQ_RC_ENOMEM) - *cap_count = le32_to_cpu(cmd->count); + devm_kfree(ice_hw_to_dev(hw), buf); return status; } /** - * ice_discover_caps - get info about the HW - * @hw: pointer to the hardware structure - * @opc: capabilities type to discover - pass in the command opcode - */ -static enum ice_status -ice_discover_caps(struct ice_hw *hw, enum ice_adminq_opc opc) -{ - enum ice_status status; - u32 cap_count; - u16 cbuf_len; - u8 retries; - - /* The driver doesn't know how many capabilities the device will return - * so the buffer size required isn't known ahead of time. The driver - * starts with cbuf_len and if this turns out to be insufficient, the - * device returns ICE_AQ_RC_ENOMEM and also the cap_count it needs. - * The driver then allocates the buffer based on the count and retries - * the operation. So it follows that the retry count is 2. - */ -#define ICE_GET_CAP_BUF_COUNT 40 -#define ICE_GET_CAP_RETRY_COUNT 2 - - cap_count = ICE_GET_CAP_BUF_COUNT; - retries = ICE_GET_CAP_RETRY_COUNT; + * ice_get_num_per_func - determine number of resources per PF + * @hw: pointer to the HW structure + * @max: value to be evenly split between each PF + * + * Determine the number of valid functions by going through the bitmap returned + * from parsing capabilities and use this to calculate the number of resources + * per PF based on the max value passed in. 
+ */ +static u32 ice_get_num_per_func(struct ice_hw *hw, u32 max) +{ + u8 funcs; - do { - void *cbuf; +#define ICE_CAPS_VALID_FUNCS_M 0xFF + funcs = hweight8(hw->dev_caps.common_cap.valid_functions & ICE_CAPS_VALID_FUNCS_M); - cbuf_len = (u16)(cap_count * - sizeof(struct ice_aqc_list_caps_elem)); - cbuf = devm_kzalloc(ice_hw_to_dev(hw), cbuf_len, GFP_KERNEL); - if (!cbuf) - return ICE_ERR_NO_MEMORY; + if (!funcs) + return 0; - status = ice_aq_discover_caps(hw, cbuf, cbuf_len, &cap_count, - opc, NULL); - devm_kfree(ice_hw_to_dev(hw), cbuf); + return max / funcs; +} - if (!status || hw->adminq.sq_last_status != ICE_AQ_RC_ENOMEM) - break; +/** + * ice_print_led_caps - print LED capabilities + * @hw: pointer to the ice_hw instance + * @caps: pointer to common caps instance + * @prefix: string to prefix when printing + * @dbg: set to indicate debug print + */ +static void +ice_print_led_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, + char const *prefix, bool dbg) +{ + u8 i; - /* If ENOMEM is returned, try again with bigger buffer */ - } while (--retries); + if (dbg) + ice_debug(hw, ICE_DBG_INIT, "%s: led_pin_num = %d\n", prefix, + caps->led_pin_num); + else + dev_info(ice_hw_to_dev(hw), "%s: led_pin_num = %d\n", prefix, + caps->led_pin_num); - return status; + for (i = 0; i < ICE_MAX_SUPPORTED_GPIO_LED; i++) { + if (!caps->led[i]) + continue; + + if (dbg) + ice_debug(hw, ICE_DBG_INIT, "%s: led[%d] = %d\n", + prefix, i, caps->led[i]); + else + dev_info(ice_hw_to_dev(hw), "%s: led[%d] = %d\n", + prefix, i, caps->led[i]); + } } /** - * ice_set_safe_mode_caps - Override dev/func capabilities when in safe mode - * @hw: pointer to the hardware structure + * ice_print_sdp_caps - print SDP capabilities + * @hw: pointer to the ice_hw instance + * @caps: pointer to common caps instance + * @prefix: string to prefix when printing + * @dbg: set to indicate debug print */ -void ice_set_safe_mode_caps(struct ice_hw *hw) +static void +ice_print_sdp_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, + char const *prefix, bool dbg) { - struct ice_hw_func_caps *func_caps = &hw->func_caps; - struct ice_hw_dev_caps *dev_caps = &hw->dev_caps; - u32 valid_func, rxq_first_id, txq_first_id; - u32 msix_vector_first_id, max_mtu; - u32 num_func = 0; u8 i; - /* cache some func_caps values that should be restored after memset */ - valid_func = func_caps->common_cap.valid_functions; - txq_first_id = func_caps->common_cap.txq_first_id; - rxq_first_id = func_caps->common_cap.rxq_first_id; - msix_vector_first_id = func_caps->common_cap.msix_vector_first_id; - max_mtu = func_caps->common_cap.max_mtu; + if (dbg) + ice_debug(hw, ICE_DBG_INIT, "%s: sdp_pin_num = %d\n", prefix, + caps->sdp_pin_num); + else + dev_info(ice_hw_to_dev(hw), "%s: sdp_pin_num = %d\n", prefix, + caps->sdp_pin_num); - /* unset func capabilities */ - memset(func_caps, 0, sizeof(*func_caps)); + for (i = 0; i < ICE_MAX_SUPPORTED_GPIO_SDP; i++) { + if (!caps->sdp[i]) + continue; - /* restore cached values */ - func_caps->common_cap.valid_functions = valid_func; - func_caps->common_cap.txq_first_id = txq_first_id; - func_caps->common_cap.rxq_first_id = rxq_first_id; - func_caps->common_cap.msix_vector_first_id = msix_vector_first_id; - func_caps->common_cap.max_mtu = max_mtu; + if (dbg) + ice_debug(hw, ICE_DBG_INIT, "%s: sdp[%d] = %d\n", + prefix, i, caps->sdp[i]); + else + dev_info(ice_hw_to_dev(hw), "%s: sdp[%d] = %d\n", + prefix, i, caps->sdp[i]); + } +} - /* one Tx and one Rx queue in safe mode */ - func_caps->common_cap.num_rxq = 1; - 
func_caps->common_cap.num_txq = 1; +/** + * ice_parse_common_caps - parse common device/function capabilities + * @hw: pointer to the HW struct + * @caps: pointer to common capabilities structure + * @elem: the capability element to parse + * @prefix: message prefix for tracing capabilities + * + * Given a capability element, extract relevant details into the common + * capability structure. + * + * Returns: true if the capability matches one of the common capability ids, + * false otherwise. + */ +static bool +ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, + struct ice_aqc_list_caps_elem *elem, const char *prefix) +{ + u32 logical_id = le32_to_cpu(elem->logical_id); + u32 phys_id = le32_to_cpu(elem->phys_id); + u32 number = le32_to_cpu(elem->number); + u16 cap = le16_to_cpu(elem->cap); + bool found = true; + + switch (cap) { + case ICE_AQC_CAPS_SWITCHING_MODE: + caps->switching_mode = number; + ice_debug(hw, ICE_DBG_INIT, "%s: switching_mode = %d\n", prefix, + caps->switching_mode); + break; + case ICE_AQC_CAPS_MANAGEABILITY_MODE: + caps->mgmt_mode = number; + caps->mgmt_protocols_mctp = logical_id; + ice_debug(hw, ICE_DBG_INIT, "%s: mgmt_mode = %d\n", prefix, + caps->mgmt_mode); + ice_debug(hw, ICE_DBG_INIT, "%s: mgmt_protocols_mctp = %d\n", prefix, + caps->mgmt_protocols_mctp); + break; + case ICE_AQC_CAPS_OS2BMC: + caps->os2bmc = number; + ice_debug(hw, ICE_DBG_INIT, "%s: os2bmc = %d\n", prefix, caps->os2bmc); + break; + case ICE_AQC_CAPS_VALID_FUNCTIONS: + caps->valid_functions = number; + ice_debug(hw, ICE_DBG_INIT, "%s: valid_functions (bitmap) = %d\n", prefix, + caps->valid_functions); + break; + case ICE_AQC_CAPS_SRIOV: + caps->sr_iov_1_1 = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: sr_iov_1_1 = %d\n", prefix, + caps->sr_iov_1_1); + break; + case ICE_AQC_CAPS_VMDQ: + caps->vmdq = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: vmdq = %d\n", prefix, caps->vmdq); + break; + case ICE_AQC_CAPS_802_1QBG: + caps->evb_802_1_qbg = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: evb_802_1_qbg = %d\n", prefix, number); + break; + case ICE_AQC_CAPS_802_1BR: + caps->evb_802_1_qbh = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: evb_802_1_qbh = %d\n", prefix, number); + break; + case ICE_AQC_CAPS_DCB: + caps->dcb = (number == 1); + caps->active_tc_bitmap = logical_id; + caps->maxtc = phys_id; + ice_debug(hw, ICE_DBG_INIT, "%s: dcb = %d\n", prefix, caps->dcb); + ice_debug(hw, ICE_DBG_INIT, "%s: active_tc_bitmap = %d\n", prefix, + caps->active_tc_bitmap); + ice_debug(hw, ICE_DBG_INIT, "%s: maxtc = %d\n", prefix, caps->maxtc); + break; + case ICE_AQC_CAPS_ISCSI: + caps->iscsi = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: iscsi = %d\n", prefix, caps->iscsi); + break; + case ICE_AQC_CAPS_RSS: + caps->rss_table_size = number; + caps->rss_table_entry_width = logical_id; + ice_debug(hw, ICE_DBG_INIT, "%s: rss_table_size = %d\n", prefix, + caps->rss_table_size); + ice_debug(hw, ICE_DBG_INIT, "%s: rss_table_entry_width = %d\n", prefix, + caps->rss_table_entry_width); + break; + case ICE_AQC_CAPS_RXQS: + caps->num_rxq = number; + caps->rxq_first_id = phys_id; + ice_debug(hw, ICE_DBG_INIT, "%s: num_rxq = %d\n", prefix, + caps->num_rxq); + ice_debug(hw, ICE_DBG_INIT, "%s: rxq_first_id = %d\n", prefix, + caps->rxq_first_id); + break; + case ICE_AQC_CAPS_TXQS: + caps->num_txq = number; + caps->txq_first_id = phys_id; + ice_debug(hw, ICE_DBG_INIT, "%s: num_txq = %d\n", prefix, + caps->num_txq); + ice_debug(hw, ICE_DBG_INIT, "%s: txq_first_id = %d\n", prefix, + 
caps->txq_first_id); + break; + case ICE_AQC_CAPS_MSIX: + caps->num_msix_vectors = number; + caps->msix_vector_first_id = phys_id; + ice_debug(hw, ICE_DBG_INIT, "%s: num_msix_vectors = %d\n", prefix, + caps->num_msix_vectors); + ice_debug(hw, ICE_DBG_INIT, "%s: msix_vector_first_id = %d\n", prefix, + caps->msix_vector_first_id); + break; + case ICE_AQC_CAPS_NVM_VER: + break; + case ICE_AQC_CAPS_PENDING_NVM_VER: + caps->nvm_update_pending_nvm = true; + ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_nvm\n", prefix); + break; + case ICE_AQC_CAPS_PENDING_OROM_VER: + caps->nvm_update_pending_orom = true; + ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_orom\n", prefix); + break; + case ICE_AQC_CAPS_PENDING_NET_VER: + caps->nvm_update_pending_netlist = true; + ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_netlist\n", prefix); + break; + case ICE_AQC_CAPS_NVM_MGMT: + caps->sec_rev_disabled = + (number & ICE_NVM_MGMT_SEC_REV_DISABLED) ? + true : false; + ice_debug(hw, ICE_DBG_INIT, "%s: sec_rev_disabled = %d\n", prefix, + caps->sec_rev_disabled); + caps->update_disabled = + (number & ICE_NVM_MGMT_UPDATE_DISABLED) ? + true : false; + ice_debug(hw, ICE_DBG_INIT, "%s: update_disabled = %d\n", prefix, + caps->update_disabled); + caps->nvm_unified_update = + (number & ICE_NVM_MGMT_UNIFIED_UPD_SUPPORT) ? + true : false; + ice_debug(hw, ICE_DBG_INIT, "%s: nvm_unified_update = %d\n", prefix, + caps->nvm_unified_update); + break; + case ICE_AQC_CAPS_CEM: + caps->mgmt_cem = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: mgmt_cem = %d\n", prefix, + caps->mgmt_cem); + break; + case ICE_AQC_CAPS_IWARP: + caps->iwarp = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: iwarp = %d\n", prefix, caps->iwarp); + break; + case ICE_AQC_CAPS_LED: + if (phys_id < ICE_MAX_SUPPORTED_GPIO_LED) { + caps->led[phys_id] = true; + caps->led_pin_num++; + ice_debug(hw, ICE_DBG_INIT, "%s: led[%d] = 1\n", prefix, phys_id); + } + break; + case ICE_AQC_CAPS_SDP: + if (phys_id < ICE_MAX_SUPPORTED_GPIO_SDP) { + caps->sdp[phys_id] = true; + caps->sdp_pin_num++; + ice_debug(hw, ICE_DBG_INIT, "%s: sdp[%d] = 1\n", prefix, phys_id); + } + break; + case ICE_AQC_CAPS_WR_CSR_PROT: + caps->wr_csr_prot = number; + caps->wr_csr_prot |= (u64)logical_id << 32; + ice_debug(hw, ICE_DBG_INIT, "%s: wr_csr_prot = 0x%llX\n", prefix, + (unsigned long long)caps->wr_csr_prot); + break; + case ICE_AQC_CAPS_WOL_PROXY: + caps->num_wol_proxy_fltr = number; + caps->wol_proxy_vsi_seid = logical_id; + caps->apm_wol_support = !!(phys_id & ICE_WOL_SUPPORT_M); + caps->acpi_prog_mthd = !!(phys_id & + ICE_ACPI_PROG_MTHD_M); + caps->proxy_support = !!(phys_id & ICE_PROXY_SUPPORT_M); + ice_debug(hw, ICE_DBG_INIT, "%s: num_wol_proxy_fltr = %d\n", prefix, + caps->num_wol_proxy_fltr); + ice_debug(hw, ICE_DBG_INIT, "%s: wol_proxy_vsi_seid = %d\n", prefix, + caps->wol_proxy_vsi_seid); + break; + case ICE_AQC_CAPS_MAX_MTU: + caps->max_mtu = number; + ice_debug(hw, ICE_DBG_INIT, "%s: max_mtu = %d\n", + prefix, caps->max_mtu); + break; + case ICE_AQC_CAPS_EXT_TOPO_DEV_IMG0: + case ICE_AQC_CAPS_EXT_TOPO_DEV_IMG1: + case ICE_AQC_CAPS_EXT_TOPO_DEV_IMG2: + case ICE_AQC_CAPS_EXT_TOPO_DEV_IMG3: + { + u8 index = cap - ICE_AQC_CAPS_EXT_TOPO_DEV_IMG0; + + if (index >= ICE_EXT_TOPO_DEV_IMG_COUNT) + break; - /* two MSIX vectors, one for traffic and one for misc causes */ - func_caps->common_cap.num_msix_vectors = 2; - func_caps->guar_num_vsi = 1; + caps->ext_topo_dev_img_ver_high[index] = number; + caps->ext_topo_dev_img_ver_low[index] = logical_id; + 
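/* phys_id also packs the image part number and the load/program
+		 * enable flags decoded below
+		 */
+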
caps->ext_topo_dev_img_part_num[index] = + (phys_id & ICE_EXT_TOPO_DEV_IMG_PART_NUM_M) >> + ICE_EXT_TOPO_DEV_IMG_PART_NUM_S; + caps->ext_topo_dev_img_load_en[index] = + (phys_id & ICE_EXT_TOPO_DEV_IMG_LOAD_EN) != 0; + caps->ext_topo_dev_img_prog_en[index] = + (phys_id & ICE_EXT_TOPO_DEV_IMG_PROG_EN) != 0; + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_ver_high[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_ver_high[index]); + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_ver_low[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_ver_low[index]); + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_part_num[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_part_num[index]); + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_load_en[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_load_en[index]); + ice_debug(hw, ICE_DBG_INIT, + "%s: ext_topo_dev_img_prog_en[%d] = %d\n", + prefix, index, + caps->ext_topo_dev_img_prog_en[index]); + break; + } + default: + /* Not one of the recognized common capabilities */ + found = false; + } - /* cache some dev_caps values that should be restored after memset */ - valid_func = dev_caps->common_cap.valid_functions; - txq_first_id = dev_caps->common_cap.txq_first_id; - rxq_first_id = dev_caps->common_cap.rxq_first_id; - msix_vector_first_id = dev_caps->common_cap.msix_vector_first_id; - max_mtu = dev_caps->common_cap.max_mtu; + return found; +} - /* unset dev capabilities */ - memset(dev_caps, 0, sizeof(*dev_caps)); +/** + * ice_recalc_port_limited_caps - Recalculate port limited capabilities + * @hw: pointer to the HW structure + * @caps: pointer to capabilities structure to fix + * + * Re-calculate the capabilities that are dependent on the number of physical + * ports; i.e. some features are not supported or function differently on + * devices with more than 4 ports. + */ +static void +ice_recalc_port_limited_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps) +{ + /* This assumes device capabilities are always scanned before function + * capabilities during the initialization flow. + */ + if (hw->dev_caps.num_funcs > 4) { + /* Max 4 TCs per port */ + caps->maxtc = 4; + ice_debug(hw, ICE_DBG_INIT, "reducing maxtc to %d (based on #ports)\n", + caps->maxtc); + if (caps->iwarp) { + ice_debug(hw, ICE_DBG_INIT, "forcing RDMA off\n"); + caps->iwarp = 0; + } - /* restore cached values */ - dev_caps->common_cap.valid_functions = valid_func; - dev_caps->common_cap.txq_first_id = txq_first_id; - dev_caps->common_cap.rxq_first_id = rxq_first_id; - dev_caps->common_cap.msix_vector_first_id = msix_vector_first_id; - dev_caps->common_cap.max_mtu = max_mtu; - - /* valid_func is a bitmap. get number of functions */ -#define ICE_MAX_FUNCS 8 - for (i = 0; i < ICE_MAX_FUNCS; i++) - if (valid_func & BIT(i)) - num_func++; + /* print message only when processing device capabilities + * during initialization. + */ + if (caps == &hw->dev_caps.common_cap) + dev_info(ice_hw_to_dev(hw), + "RDMA functionality is not available with the current device configuration.\n"); + } +} - /* one Tx and one Rx queue per function in safe mode */ - dev_caps->common_cap.num_rxq = num_func; - dev_caps->common_cap.num_txq = num_func; +/** + * ice_parse_vf_func_caps - Parse ICE_AQC_CAPS_VF function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for ICE_AQC_CAPS_VF. 
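+ *
+ * For this element, number holds the count of VFs allocated to the PF
+ * and logical_id holds the first (base) VF ID.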
+ */ +static void +ice_parse_vf_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p, + struct ice_aqc_list_caps_elem *cap) +{ + u32 number = le32_to_cpu(cap->number); + u32 logical_id = le32_to_cpu(cap->logical_id); + + func_p->num_allocd_vfs = number; + func_p->vf_base_id = logical_id; + ice_debug(hw, ICE_DBG_INIT, "func caps: num_allocd_vfs = %d\n", + func_p->num_allocd_vfs); + ice_debug(hw, ICE_DBG_INIT, "func caps: vf_base_id = %d\n", + func_p->vf_base_id); +} - /* two MSIX vectors per function */ - dev_caps->common_cap.num_msix_vectors = 2 * num_func; +/** + * ice_parse_vsi_func_caps - Parse ICE_AQC_CAPS_VSI function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for ICE_AQC_CAPS_VSI. + */ +static void +ice_parse_vsi_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p, + struct ice_aqc_list_caps_elem *cap) +{ + func_p->guar_num_vsi = ice_get_num_per_func(hw, ICE_MAX_VSI); + ice_debug(hw, ICE_DBG_INIT, "func caps: guar_num_vsi (fw) = %d\n", + le32_to_cpu(cap->number)); + ice_debug(hw, ICE_DBG_INIT, "func caps: guar_num_vsi = %d\n", + func_p->guar_num_vsi); } /** - * ice_get_caps - get info about the HW - * @hw: pointer to the hardware structure + * ice_parse_1588_func_caps - Parse ICE_AQC_CAPS_1588 function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for ICE_AQC_CAPS_1588. */ -enum ice_status ice_get_caps(struct ice_hw *hw) +static void +ice_parse_1588_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p, + struct ice_aqc_list_caps_elem *cap) { - enum ice_status status; + struct ice_ts_func_info *info = &func_p->ts_func_info; + u32 number = le32_to_cpu(cap->number); - status = ice_discover_caps(hw, ice_aqc_opc_list_dev_caps); - if (!status) - status = ice_discover_caps(hw, ice_aqc_opc_list_func_caps); + info->ena = ((number & ICE_TS_FUNC_ENA_M) != 0); + func_p->common_cap.ieee_1588 = info->ena; - return status; + info->src_tmr_owned = ((number & ICE_TS_SRC_TMR_OWND_M) != 0); + info->tmr_ena = ((number & ICE_TS_TMR_ENA_M) != 0); + info->tmr_index_owned = ((number & ICE_TS_TMR_IDX_OWND_M) != 0); + info->tmr_index_assoc = ((number & ICE_TS_TMR_IDX_ASSOC_M) != 0); + + info->clk_freq = (number & ICE_TS_CLK_FREQ_M) >> ICE_TS_CLK_FREQ_S; + info->clk_src = ((number & ICE_TS_CLK_SRC_M) != 0); + + if (info->clk_freq < NUM_ICE_TIME_REF_FREQ) { + info->time_ref = (enum ice_time_ref_freq)info->clk_freq; + } else { + /* Unknown clock frequency, so assume a (probably incorrect) + * default to avoid out-of-bounds look ups of frequency + * related information. 
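+		 * ICE_TIME_REF_FREQ_25_000 below is that assumed fallback.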
+	 */
+		ice_debug(hw, ICE_DBG_INIT, "1588 func caps: unknown clock frequency %u\n",
+			  info->clk_freq);
+		info->time_ref = ICE_TIME_REF_FREQ_25_000;
+	}
+
+	ice_debug(hw, ICE_DBG_INIT, "func caps: ieee_1588 = %u\n",
+		  func_p->common_cap.ieee_1588);
+	ice_debug(hw, ICE_DBG_INIT, "func caps: src_tmr_owned = %u\n",
+		  info->src_tmr_owned);
+	ice_debug(hw, ICE_DBG_INIT, "func caps: tmr_ena = %u\n",
+		  info->tmr_ena);
+	ice_debug(hw, ICE_DBG_INIT, "func caps: tmr_index_owned = %u\n",
+		  info->tmr_index_owned);
+	ice_debug(hw, ICE_DBG_INIT, "func caps: tmr_index_assoc = %u\n",
+		  info->tmr_index_assoc);
+	ice_debug(hw, ICE_DBG_INIT, "func caps: clk_freq = %u\n",
+		  info->clk_freq);
+	ice_debug(hw, ICE_DBG_INIT, "func caps: clk_src = %u\n",
+		  info->clk_src);
 }

 /**
- * ice_aq_manage_mac_write - manage MAC address write command
+ * ice_parse_fdir_func_caps - Parse ICE_AQC_CAPS_FD function caps
  * @hw: pointer to the HW struct
- * @mac_addr: MAC address to be written as LAA/LAA+WoL/Port address
- * @flags: flags to control write behavior
- * @cd: pointer to command details structure or NULL
+ * @func_p: pointer to function capabilities structure
  *
- * This function is used to write MAC address to the NVM (0x0108).
+ * Extract function capabilities for ICE_AQC_CAPS_FD.
  */
-enum ice_status
-ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags,
-			struct ice_sq_cd *cd)
+static void
+ice_parse_fdir_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p)
 {
-	struct ice_aqc_manage_mac_write *cmd;
-	struct ice_aq_desc desc;
+	u32 reg_val, val;
+
+	reg_val = rd32(hw, GLQF_FD_SIZE);
+	val = (reg_val & GLQF_FD_SIZE_FD_GSIZE_M) >>
+	      GLQF_FD_SIZE_FD_GSIZE_S;
+	func_p->fd_fltr_guar =
+		ice_get_num_per_func(hw, val);
+	val = (reg_val & GLQF_FD_SIZE_FD_BSIZE_M) >>
+	      GLQF_FD_SIZE_FD_BSIZE_S;
+	func_p->fd_fltr_best_effort = val;
+
+	ice_debug(hw, ICE_DBG_INIT, "func caps: fd_fltr_guar = %d\n",
+		  func_p->fd_fltr_guar);
+	ice_debug(hw, ICE_DBG_INIT, "func caps: fd_fltr_best_effort = %d\n",
+		  func_p->fd_fltr_best_effort);
+}

-	cmd = &desc.params.mac_write;
-	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_manage_mac_write);
-	cmd->flags = flags;
+/**
+ * ice_parse_func_caps - Parse function capabilities
+ * @hw: pointer to the HW struct
+ * @func_p: pointer to function capabilities structure
+ * @buf: buffer containing the function capability records
+ * @cap_count: the number of capabilities
+ *
+ * Helper function to parse function (0x000A) capabilities list. For
+ * capabilities shared between device and function, this relies on
+ * ice_parse_common_caps.
+ *
+ * Loop through the list of provided capabilities and extract the relevant
+ * data into the function capabilities structure.
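+ *
+ * Capability IDs shared with the device level are handled by
+ * ice_parse_common_caps(); only the function-specific IDs (VF, VSI,
+ * 1588, FD) are dispatched to the helpers above.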
+ */ +static void +ice_parse_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p, + void *buf, u32 cap_count) +{ + struct ice_aqc_list_caps_elem *cap_resp; + u32 i; - /* Prep values for flags, sah, sal */ - cmd->sah = htons(*((const u16 *)mac_addr)); - cmd->sal = htonl(*((const u32 *)(mac_addr + 2))); + cap_resp = buf; - return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + memset(func_p, 0, sizeof(*func_p)); + + for (i = 0; i < cap_count; i++) { + u16 cap = le16_to_cpu(cap_resp[i].cap); + bool found; + + found = ice_parse_common_caps(hw, &func_p->common_cap, + &cap_resp[i], "func caps"); + + switch (cap) { + case ICE_AQC_CAPS_VF: + ice_parse_vf_func_caps(hw, func_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_VSI: + ice_parse_vsi_func_caps(hw, func_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_1588: + ice_parse_1588_func_caps(hw, func_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_FD: + ice_parse_fdir_func_caps(hw, func_p); + break; + default: + /* Don't list common capabilities as unknown */ + if (!found) + ice_debug(hw, ICE_DBG_INIT, "func caps: unknown capability[%d]: 0x%x\n", + i, cap); + break; + } + } + + ice_print_led_caps(hw, &func_p->common_cap, "func caps", true); + ice_print_sdp_caps(hw, &func_p->common_cap, "func caps", true); + + ice_recalc_port_limited_caps(hw, &func_p->common_cap); } /** - * ice_aq_clear_pxe_mode + * ice_parse_valid_functions_cap - Parse ICE_AQC_CAPS_VALID_FUNCTIONS caps * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse * - * Tell the firmware that the driver is taking over from PXE (0x0110). + * Parse ICE_AQC_CAPS_VALID_FUNCTIONS for device capabilities. */ -static enum ice_status ice_aq_clear_pxe_mode(struct ice_hw *hw) +static void +ice_parse_valid_functions_cap(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) { - struct ice_aq_desc desc; + u32 number = le32_to_cpu(cap->number); - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_pxe_mode); - desc.params.clear_pxe.rx_cnt = ICE_AQC_CLEAR_PXE_RX_CNT; + dev_p->num_funcs = hweight32(number); + ice_debug(hw, ICE_DBG_INIT, "dev caps: num_funcs = %d\n", + dev_p->num_funcs); +} - return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +/** + * ice_parse_vf_dev_caps - Parse ICE_AQC_CAPS_VF device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse ICE_AQC_CAPS_VF for device capabilities. + */ +static void +ice_parse_vf_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) +{ + u32 number = le32_to_cpu(cap->number); + + dev_p->num_vfs_exposed = number; + ice_debug(hw, ICE_DBG_INIT, "dev_caps: num_vfs_exposed = %d\n", + dev_p->num_vfs_exposed); } /** - * ice_clear_pxe_mode - clear pxe operations mode + * ice_parse_vsi_dev_caps - Parse ICE_AQC_CAPS_VSI device caps * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse * - * Make sure all PXE mode settings are cleared, including things - * like descriptor fetch/write-back mode. + * Parse ICE_AQC_CAPS_VSI for device capabilities. 
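+ * Here the number field reports how many VSIs firmware has allocated
+ * to the host.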
*/ -void ice_clear_pxe_mode(struct ice_hw *hw) +static void +ice_parse_vsi_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) { - if (ice_check_sq_alive(hw, &hw->adminq)) - ice_aq_clear_pxe_mode(hw); + u32 number = le32_to_cpu(cap->number); + + dev_p->num_vsi_allocd_to_host = number; + ice_debug(hw, ICE_DBG_INIT, "dev caps: num_vsi_allocd_to_host = %d\n", + dev_p->num_vsi_allocd_to_host); } /** - * ice_get_link_speed_based_on_phy_type - returns link speed - * @phy_type_low: lower part of phy_type - * @phy_type_high: higher part of phy_type + * ice_parse_1588_dev_caps - Parse ICE_AQC_CAPS_1588 device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse * - * This helper function will convert an entry in PHY type structure - * [phy_type_low, phy_type_high] to its corresponding link speed. - * Note: In the structure of [phy_type_low, phy_type_high], there should - * be one bit set, as this function will convert one PHY type to its - * speed. - * If no bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned - * If more than one bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned + * Parse ICE_AQC_CAPS_1588 for device capabilities. */ -static u16 -ice_get_link_speed_based_on_phy_type(u64 phy_type_low, u64 phy_type_high) +static void +ice_parse_1588_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + struct ice_aqc_list_caps_elem *cap) { - u16 speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; - u16 speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN; + struct ice_ts_dev_info *info = &dev_p->ts_dev_info; + u32 logical_id = le32_to_cpu(cap->logical_id); + u32 phys_id = le32_to_cpu(cap->phys_id); + u32 number = le32_to_cpu(cap->number); + + info->ena = ((number & ICE_TS_DEV_ENA_M) != 0); + dev_p->common_cap.ieee_1588 = info->ena; + + info->tmr0_owner = number & ICE_TS_TMR0_OWNR_M; + info->tmr0_owned = ((number & ICE_TS_TMR0_OWND_M) != 0); + info->tmr0_ena = ((number & ICE_TS_TMR0_ENA_M) != 0); + + info->tmr1_owner = (number & ICE_TS_TMR1_OWNR_M) >> ICE_TS_TMR1_OWNR_S; + info->tmr1_owned = ((number & ICE_TS_TMR1_OWND_M) != 0); + info->tmr1_ena = ((number & ICE_TS_TMR1_ENA_M) != 0); + + info->ena_ports = logical_id; + info->tmr_own_map = phys_id; + + ice_debug(hw, ICE_DBG_INIT, "dev caps: ieee_1588 = %u\n", + dev_p->common_cap.ieee_1588); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr0_owner = %u\n", + info->tmr0_owner); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr0_owned = %u\n", + info->tmr0_owned); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr0_ena = %u\n", + info->tmr0_ena); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr1_owner = %u\n", + info->tmr1_owner); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr1_owned = %u\n", + info->tmr1_owned); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr1_ena = %u\n", + info->tmr1_ena); + ice_debug(hw, ICE_DBG_INIT, "dev caps: ieee_1588 ena_ports = %u\n", + info->ena_ports); + ice_debug(hw, ICE_DBG_INIT, "dev caps: tmr_own_map = %u\n", + info->tmr_own_map); +} - switch (phy_type_low) { - case ICE_PHY_TYPE_LOW_100BASE_TX: - case ICE_PHY_TYPE_LOW_100M_SGMII: - speed_phy_type_low = ICE_AQ_LINK_SPEED_100MB; - break; - case ICE_PHY_TYPE_LOW_1000BASE_T: - case ICE_PHY_TYPE_LOW_1000BASE_SX: - case ICE_PHY_TYPE_LOW_1000BASE_LX: - case ICE_PHY_TYPE_LOW_1000BASE_KX: - case ICE_PHY_TYPE_LOW_1G_SGMII: - speed_phy_type_low = ICE_AQ_LINK_SPEED_1000MB; - break; - case ICE_PHY_TYPE_LOW_2500BASE_T: - case ICE_PHY_TYPE_LOW_2500BASE_X: - case 
ICE_PHY_TYPE_LOW_2500BASE_KX: - speed_phy_type_low = ICE_AQ_LINK_SPEED_2500MB; - break; - case ICE_PHY_TYPE_LOW_5GBASE_T: - case ICE_PHY_TYPE_LOW_5GBASE_KR: - speed_phy_type_low = ICE_AQ_LINK_SPEED_5GB; - break; - case ICE_PHY_TYPE_LOW_10GBASE_T: - case ICE_PHY_TYPE_LOW_10G_SFI_DA: - case ICE_PHY_TYPE_LOW_10GBASE_SR: - case ICE_PHY_TYPE_LOW_10GBASE_LR: - case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1: - case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: - case ICE_PHY_TYPE_LOW_10G_SFI_C2C: - speed_phy_type_low = ICE_AQ_LINK_SPEED_10GB; - break; - case ICE_PHY_TYPE_LOW_25GBASE_T: - case ICE_PHY_TYPE_LOW_25GBASE_CR: - case ICE_PHY_TYPE_LOW_25GBASE_CR_S: - case ICE_PHY_TYPE_LOW_25GBASE_CR1: - case ICE_PHY_TYPE_LOW_25GBASE_SR: - case ICE_PHY_TYPE_LOW_25GBASE_LR: - case ICE_PHY_TYPE_LOW_25GBASE_KR: - case ICE_PHY_TYPE_LOW_25GBASE_KR_S: - case ICE_PHY_TYPE_LOW_25GBASE_KR1: - case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_25G_AUI_C2C: - speed_phy_type_low = ICE_AQ_LINK_SPEED_25GB; - break; - case ICE_PHY_TYPE_LOW_40GBASE_CR4: - case ICE_PHY_TYPE_LOW_40GBASE_SR4: - case ICE_PHY_TYPE_LOW_40GBASE_LR4: - case ICE_PHY_TYPE_LOW_40GBASE_KR4: - case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_40G_XLAUI: - speed_phy_type_low = ICE_AQ_LINK_SPEED_40GB; - break; - case ICE_PHY_TYPE_LOW_50GBASE_CR2: - case ICE_PHY_TYPE_LOW_50GBASE_SR2: - case ICE_PHY_TYPE_LOW_50GBASE_LR2: - case ICE_PHY_TYPE_LOW_50GBASE_KR2: - case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_LAUI2: - case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI2: - case ICE_PHY_TYPE_LOW_50GBASE_CP: - case ICE_PHY_TYPE_LOW_50GBASE_SR: - case ICE_PHY_TYPE_LOW_50GBASE_FR: - case ICE_PHY_TYPE_LOW_50GBASE_LR: - case ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4: - case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI1: - speed_phy_type_low = ICE_AQ_LINK_SPEED_50GB; - break; - case ICE_PHY_TYPE_LOW_100GBASE_CR4: - case ICE_PHY_TYPE_LOW_100GBASE_SR4: - case ICE_PHY_TYPE_LOW_100GBASE_LR4: - case ICE_PHY_TYPE_LOW_100GBASE_KR4: - case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_CAUI4: - case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_AUI4: - case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: - case ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4: - case ICE_PHY_TYPE_LOW_100GBASE_CP2: - case ICE_PHY_TYPE_LOW_100GBASE_SR2: - case ICE_PHY_TYPE_LOW_100GBASE_DR: - speed_phy_type_low = ICE_AQ_LINK_SPEED_100GB; - break; - default: - speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN; - break; - } - - switch (phy_type_high) { - case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4: - case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC: - case ICE_PHY_TYPE_HIGH_100G_CAUI2: - case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_HIGH_100G_AUI2: - speed_phy_type_high = ICE_AQ_LINK_SPEED_100GB; - break; - default: - speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; - break; - } +/** + * ice_parse_fdir_dev_caps - Parse ICE_AQC_CAPS_FD device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse ICE_AQC_CAPS_FD for device capabilities. 
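+ * The number field carries the device-wide count of Flow Director
+ * filter entries.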
+ */
+static void
+ice_parse_fdir_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
+			struct ice_aqc_list_caps_elem *cap)
+{
+	u32 number = le32_to_cpu(cap->number);

-	if (speed_phy_type_low == ICE_AQ_LINK_SPEED_UNKNOWN &&
-	    speed_phy_type_high == ICE_AQ_LINK_SPEED_UNKNOWN)
-		return ICE_AQ_LINK_SPEED_UNKNOWN;
-	else if (speed_phy_type_low != ICE_AQ_LINK_SPEED_UNKNOWN &&
-		 speed_phy_type_high != ICE_AQ_LINK_SPEED_UNKNOWN)
-		return ICE_AQ_LINK_SPEED_UNKNOWN;
-	else if (speed_phy_type_low != ICE_AQ_LINK_SPEED_UNKNOWN &&
-		 speed_phy_type_high == ICE_AQ_LINK_SPEED_UNKNOWN)
-		return speed_phy_type_low;
-	else
-		return speed_phy_type_high;
+	dev_p->num_flow_director_fltr = number;
+	ice_debug(hw, ICE_DBG_INIT, "dev caps: num_flow_director_fltr = %d\n",
+		  dev_p->num_flow_director_fltr);
 }

+
 /**
- * ice_update_phy_type
- * @phy_type_low: pointer to the lower part of phy_type
- * @phy_type_high: pointer to the higher part of phy_type
- * @link_speeds_bitmap: targeted link speeds bitmap
+ * ice_parse_dev_caps - Parse device capabilities
+ * @hw: pointer to the HW struct
+ * @dev_p: pointer to device capabilities structure
+ * @buf: buffer containing the device capability records
+ * @cap_count: the number of capabilities
  *
- * Note: For the link_speeds_bitmap structure, you can check it at
- * [ice_aqc_get_link_status->link_speed]. Caller can pass in
- * link_speeds_bitmap include multiple speeds.
+ * Helper function to parse device (0x000B) capabilities list. For
+ * capabilities shared between device and function, this relies on
+ * ice_parse_common_caps.
  *
- * Each entry in this [phy_type_low, phy_type_high] structure will
- * present a certain link speed. This helper function will turn on bits
- * in [phy_type_low, phy_type_high] structure based on the value of
- * link_speeds_bitmap input parameter.
+ * Loop through the list of provided capabilities and extract the relevant
+ * data into the device capabilities structure.
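+ *
+ * Device capabilities are parsed before function capabilities (see
+ * ice_get_caps()), since ice_recalc_port_limited_caps() relies on
+ * dev_caps.num_funcs being populated.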
*/ -void -ice_update_phy_type(u64 *phy_type_low, u64 *phy_type_high, - u16 link_speeds_bitmap) +static void +ice_parse_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p, + void *buf, u32 cap_count) { - u64 pt_high; - u64 pt_low; - int index; - u16 speed; + struct ice_aqc_list_caps_elem *cap_resp; + u32 i; - /* We first check with low part of phy_type */ - for (index = 0; index <= ICE_PHY_TYPE_LOW_MAX_INDEX; index++) { - pt_low = BIT_ULL(index); - speed = ice_get_link_speed_based_on_phy_type(pt_low, 0); + cap_resp = buf; - if (link_speeds_bitmap & speed) - *phy_type_low |= BIT_ULL(index); - } + memset(dev_p, 0, sizeof(*dev_p)); - /* We then check with high part of phy_type */ - for (index = 0; index <= ICE_PHY_TYPE_HIGH_MAX_INDEX; index++) { - pt_high = BIT_ULL(index); - speed = ice_get_link_speed_based_on_phy_type(0, pt_high); + for (i = 0; i < cap_count; i++) { + u16 cap = le16_to_cpu(cap_resp[i].cap); + bool found; - if (link_speeds_bitmap & speed) - *phy_type_high |= BIT_ULL(index); + found = ice_parse_common_caps(hw, &dev_p->common_cap, + &cap_resp[i], "dev caps"); + + switch (cap) { + case ICE_AQC_CAPS_VALID_FUNCTIONS: + ice_parse_valid_functions_cap(hw, dev_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_VF: + ice_parse_vf_dev_caps(hw, dev_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_VSI: + ice_parse_vsi_dev_caps(hw, dev_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_1588: + ice_parse_1588_dev_caps(hw, dev_p, &cap_resp[i]); + break; + case ICE_AQC_CAPS_FD: + ice_parse_fdir_dev_caps(hw, dev_p, &cap_resp[i]); + break; + default: + /* Don't list common capabilities as unknown */ + if (!found) + ice_debug(hw, ICE_DBG_INIT, "dev caps: unknown capability[%d]: 0x%x\n", + i, cap); + break; + } } + + ice_print_led_caps(hw, &dev_p->common_cap, "dev caps", true); + ice_print_sdp_caps(hw, &dev_p->common_cap, "dev caps", true); + + ice_recalc_port_limited_caps(hw, &dev_p->common_cap); } /** - * ice_aq_set_phy_cfg + * ice_aq_list_caps - query function/device capabilities * @hw: pointer to the HW struct - * @lport: logical port number - * @cfg: structure with PHY configuration data to be set + * @buf: a buffer to hold the capabilities + * @buf_size: size of the buffer + * @cap_count: if not NULL, set to the number of capabilities reported + * @opc: capabilities type to discover, device or function * @cd: pointer to command details structure or NULL * - * Set the various PHY configuration parameters supported on the Port. - * One or more of the Set PHY config parameters may be ignored in an MFP - * mode as the PF may not have the privilege to set some of the PHY Config - * parameters. This status will be indicated by the command response (0x0601). + * Get the function (0x000A) or device (0x000B) capabilities description from + * firmware and store it in the buffer. + * + * If the cap_count pointer is not NULL, then it is set to the number of + * capabilities firmware will report. Note that if the buffer size is too + * small, it is possible the command will return ICE_AQ_ERR_ENOMEM. The + * cap_count will still be updated in this case. It is recommended that the + * buffer size be set to ICE_AQ_MAX_BUF_LEN (the largest possible buffer that + * firmware could return) to avoid this. 
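+ *
+ * A minimal calling sketch, mirroring the discover helpers below:
+ *
+ *	buf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL);
+ *	status = ice_aq_list_caps(hw, buf, ICE_AQ_MAX_BUF_LEN, &cap_count,
+ *				  ice_aqc_opc_list_dev_caps, NULL);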
*/ enum ice_status -ice_aq_set_phy_cfg(struct ice_hw *hw, u8 lport, - struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd) +ice_aq_list_caps(struct ice_hw *hw, void *buf, u16 buf_size, u32 *cap_count, + enum ice_adminq_opc opc, struct ice_sq_cd *cd) { + struct ice_aqc_list_caps *cmd; struct ice_aq_desc desc; + enum ice_status status; - if (!cfg) - return ICE_ERR_PARAM; - - /* Ensure that only valid bits of cfg->caps can be turned on. */ - if (cfg->caps & ~ICE_AQ_PHY_ENA_VALID_MASK) { - ice_debug(hw, ICE_DBG_PHY, - "Invalid bit is set in ice_aqc_set_phy_cfg_data->caps : 0x%x\n", - cfg->caps); + cmd = &desc.params.get_cap; - cfg->caps &= ICE_AQ_PHY_ENA_VALID_MASK; - } + if (opc != ice_aqc_opc_list_func_caps && + opc != ice_aqc_opc_list_dev_caps) + return ICE_ERR_PARAM; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_phy_cfg); - desc.params.set_phy.lport_num = lport; - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + ice_fill_dflt_direct_cmd_desc(&desc, opc); + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); - ice_debug(hw, ICE_DBG_LINK, "phy_type_low = 0x%llx\n", - (unsigned long long)le64_to_cpu(cfg->phy_type_low)); - ice_debug(hw, ICE_DBG_LINK, "phy_type_high = 0x%llx\n", - (unsigned long long)le64_to_cpu(cfg->phy_type_high)); - ice_debug(hw, ICE_DBG_LINK, "caps = 0x%x\n", cfg->caps); - ice_debug(hw, ICE_DBG_LINK, "low_power_ctrl = 0x%x\n", - cfg->low_power_ctrl); - ice_debug(hw, ICE_DBG_LINK, "eee_cap = 0x%x\n", cfg->eee_cap); - ice_debug(hw, ICE_DBG_LINK, "eeer_value = 0x%x\n", cfg->eeer_value); - ice_debug(hw, ICE_DBG_LINK, "link_fec_opt = 0x%x\n", cfg->link_fec_opt); + if (cap_count) + *cap_count = le32_to_cpu(cmd->count); - return ice_aq_send_cmd(hw, &desc, cfg, sizeof(*cfg), cd); + return status; } /** - * ice_update_link_info - update status of the HW network link - * @pi: port info structure of the interested logical port + * ice_discover_dev_caps - Read and extract device capabilities + * @hw: pointer to the hardware structure + * @dev_caps: pointer to device capabilities structure + * + * Read the device capabilities and extract them into the dev_caps structure + * for later use. */ -enum ice_status ice_update_link_info(struct ice_port_info *pi) +enum ice_status +ice_discover_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_caps) { - struct ice_link_status *li; enum ice_status status; + u32 cap_count = 0; + void *cbuf; - if (!pi) - return ICE_ERR_PARAM; - - li = &pi->phy.link_info; - - status = ice_aq_get_link_info(pi, true, NULL, NULL); - if (status) - return status; - - if (li->link_info & ICE_AQ_MEDIA_AVAILABLE) { - struct ice_aqc_get_phy_caps_data *pcaps; - struct ice_hw *hw; - - hw = pi->hw; - pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), - GFP_KERNEL); - if (!pcaps) - return ICE_ERR_NO_MEMORY; + cbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); + if (!cbuf) + return ICE_ERR_NO_MEMORY; - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP, - pcaps, NULL); - if (!status) - memcpy(li->module_type, &pcaps->module_type, - sizeof(li->module_type)); + /* Although the driver doesn't know the number of capabilities the + * device will return, we can simply send a 4KB buffer, the maximum + * possible size that firmware can return. 
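+	 * Assuming 32-byte ice_aqc_list_caps_elem entries, that leaves room
+	 * for 4096 / 32 = 128 capability records.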
+ */ + cap_count = ICE_AQ_MAX_BUF_LEN / sizeof(struct ice_aqc_list_caps_elem); - devm_kfree(ice_hw_to_dev(hw), pcaps); - } + status = ice_aq_list_caps(hw, cbuf, ICE_AQ_MAX_BUF_LEN, &cap_count, + ice_aqc_opc_list_dev_caps, NULL); + if (!status) + ice_parse_dev_caps(hw, dev_caps, cbuf, cap_count); + devm_kfree(ice_hw_to_dev(hw), cbuf); return status; } /** - * ice_set_fc - * @pi: port information structure - * @aq_failures: pointer to status code, specific to ice_set_fc routine - * @ena_auto_link_update: enable automatic link update + * ice_discover_func_caps - Read and extract function capabilities + * @hw: pointer to the hardware structure + * @func_caps: pointer to function capabilities structure * - * Set the requested flow control mode. + * Read the function capabilities and extract them into the func_caps structure + * for later use. */ -enum ice_status -ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update) +static enum ice_status +ice_discover_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_caps) { - struct ice_aqc_set_phy_cfg_data cfg = { 0 }; - struct ice_aqc_get_phy_caps_data *pcaps; enum ice_status status; - u8 pause_mask = 0x0; - struct ice_hw *hw; + u32 cap_count = 0; + void *cbuf; - if (!pi) - return ICE_ERR_PARAM; - hw = pi->hw; - *aq_failures = ICE_SET_FC_AQ_FAIL_NONE; + cbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); + if (!cbuf) + return ICE_ERR_NO_MEMORY; - switch (pi->fc.req_mode) { - case ICE_FC_FULL: - pause_mask |= ICE_AQC_PHY_EN_TX_LINK_PAUSE; - pause_mask |= ICE_AQC_PHY_EN_RX_LINK_PAUSE; - break; - case ICE_FC_RX_PAUSE: - pause_mask |= ICE_AQC_PHY_EN_RX_LINK_PAUSE; - break; - case ICE_FC_TX_PAUSE: - pause_mask |= ICE_AQC_PHY_EN_TX_LINK_PAUSE; - break; - default: - break; - } + /* Although the driver doesn't know the number of capabilities the + * device will return, we can simply send a 4KB buffer, the maximum + * possible size that firmware can return. 
+ */ + cap_count = ICE_AQ_MAX_BUF_LEN / sizeof(struct ice_aqc_list_caps_elem); - pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), GFP_KERNEL); - if (!pcaps) - return ICE_ERR_NO_MEMORY; + status = ice_aq_list_caps(hw, cbuf, ICE_AQ_MAX_BUF_LEN, &cap_count, + ice_aqc_opc_list_func_caps, NULL); + if (!status) + ice_parse_func_caps(hw, func_caps, cbuf, cap_count); + devm_kfree(ice_hw_to_dev(hw), cbuf); - /* Get the current PHY config */ - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps, - NULL); - if (status) { - *aq_failures = ICE_SET_FC_AQ_FAIL_GET; - goto out; - } + return status; +} - /* clear the old pause settings */ - cfg.caps = pcaps->caps & ~(ICE_AQC_PHY_EN_TX_LINK_PAUSE | - ICE_AQC_PHY_EN_RX_LINK_PAUSE); +/** + * ice_set_safe_mode_caps - Override dev/func capabilities when in safe mode + * @hw: pointer to the hardware structure + */ +void ice_set_safe_mode_caps(struct ice_hw *hw) +{ + struct ice_hw_func_caps *func_caps = &hw->func_caps; + struct ice_hw_dev_caps *dev_caps = &hw->dev_caps; + struct ice_hw_common_caps cached_caps; + u32 num_funcs; - /* set the new capabilities */ - cfg.caps |= pause_mask; + /* cache some func_caps values that should be restored after memset */ + cached_caps = func_caps->common_cap; - /* If the capabilities have changed, then set the new config */ - if (cfg.caps != pcaps->caps) { - int retry_count, retry_max = 10; + /* unset func capabilities */ + memset(func_caps, 0, sizeof(*func_caps)); - /* Auto restart link so settings take effect */ - if (ena_auto_link_update) - cfg.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; - /* Copy over all the old settings */ - cfg.phy_type_high = pcaps->phy_type_high; - cfg.phy_type_low = pcaps->phy_type_low; - cfg.low_power_ctrl = pcaps->low_power_ctrl; - cfg.eee_cap = pcaps->eee_cap; - cfg.eeer_value = pcaps->eeer_value; - cfg.link_fec_opt = pcaps->link_fec_options; - - status = ice_aq_set_phy_cfg(hw, pi->lport, &cfg, NULL); - if (status) { - *aq_failures = ICE_SET_FC_AQ_FAIL_SET; - goto out; - } +#define ICE_RESTORE_FUNC_CAP(name) \ + func_caps->common_cap.name = cached_caps.name - /* Update the link info - * It sometimes takes a really long time for link to - * come back from the atomic reset. Thus, we wait a - * little bit. 
- */ - for (retry_count = 0; retry_count < retry_max; retry_count++) { - status = ice_update_link_info(pi); + /* restore cached values */ + ICE_RESTORE_FUNC_CAP(valid_functions); + ICE_RESTORE_FUNC_CAP(txq_first_id); + ICE_RESTORE_FUNC_CAP(rxq_first_id); + ICE_RESTORE_FUNC_CAP(msix_vector_first_id); + ICE_RESTORE_FUNC_CAP(max_mtu); + ICE_RESTORE_FUNC_CAP(nvm_unified_update); + ICE_RESTORE_FUNC_CAP(nvm_update_pending_nvm); + ICE_RESTORE_FUNC_CAP(nvm_update_pending_orom); + ICE_RESTORE_FUNC_CAP(nvm_update_pending_netlist); - if (!status) - break; + /* one Tx and one Rx queue in safe mode */ + func_caps->common_cap.num_rxq = 1; + func_caps->common_cap.num_txq = 1; - mdelay(100); - } + /* two MSIX vectors, one for traffic and one for misc causes */ + func_caps->common_cap.num_msix_vectors = 2; + func_caps->guar_num_vsi = 1; - if (status) - *aq_failures = ICE_SET_FC_AQ_FAIL_UPDATE; - } + /* cache some dev_caps values that should be restored after memset */ + cached_caps = dev_caps->common_cap; + num_funcs = dev_caps->num_funcs; -out: - devm_kfree(ice_hw_to_dev(hw), pcaps); - return status; -} + /* unset dev capabilities */ + memset(dev_caps, 0, sizeof(*dev_caps)); -/** - * ice_copy_phy_caps_to_cfg - Copy PHY ability data to configuration data - * @caps: PHY ability structure to copy date from - * @cfg: PHY configuration structure to copy data to - * - * Helper function to copy AQC PHY get ability data to PHY set configuration - * data structure - */ -void -ice_copy_phy_caps_to_cfg(struct ice_aqc_get_phy_caps_data *caps, - struct ice_aqc_set_phy_cfg_data *cfg) -{ - if (!caps || !cfg) - return; +#define ICE_RESTORE_DEV_CAP(name) \ + dev_caps->common_cap.name = cached_caps.name - cfg->phy_type_low = caps->phy_type_low; - cfg->phy_type_high = caps->phy_type_high; - cfg->caps = caps->caps; - cfg->low_power_ctrl = caps->low_power_ctrl; - cfg->eee_cap = caps->eee_cap; - cfg->eeer_value = caps->eeer_value; - cfg->link_fec_opt = caps->link_fec_options; -} + /* restore cached values */ + ICE_RESTORE_DEV_CAP(valid_functions); + ICE_RESTORE_DEV_CAP(txq_first_id); + ICE_RESTORE_DEV_CAP(rxq_first_id); + ICE_RESTORE_DEV_CAP(msix_vector_first_id); + ICE_RESTORE_DEV_CAP(max_mtu); + ICE_RESTORE_DEV_CAP(nvm_unified_update); + ICE_RESTORE_DEV_CAP(nvm_update_pending_nvm); + ICE_RESTORE_DEV_CAP(nvm_update_pending_orom); + ICE_RESTORE_DEV_CAP(nvm_update_pending_netlist); + dev_caps->num_funcs = num_funcs; -/** - * ice_cfg_phy_fec - Configure PHY FEC data based on FEC mode - * @cfg: PHY configuration data to set FEC mode - * @fec: FEC mode to configure - * - * Caller should copy ice_aqc_get_phy_caps_data.caps ICE_AQC_PHY_EN_AUTO_FEC - * (bit 7) and ice_aqc_get_phy_caps_data.link_fec_options to cfg.caps - * ICE_AQ_PHY_ENA_AUTO_FEC (bit 7) and cfg.link_fec_options before calling. - */ -void -ice_cfg_phy_fec(struct ice_aqc_set_phy_cfg_data *cfg, enum ice_fec_mode fec) -{ - switch (fec) { - case ICE_FEC_BASER: - /* Clear RS bits, and AND BASE-R ability - * bits and OR request bits. - */ - cfg->link_fec_opt &= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN | - ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN; - cfg->link_fec_opt |= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ | - ICE_AQC_PHY_FEC_25G_KR_REQ; - break; - case ICE_FEC_RS: - /* Clear BASE-R bits, and AND RS ability - * bits and OR request bits. - */ - cfg->link_fec_opt &= ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN; - cfg->link_fec_opt |= ICE_AQC_PHY_FEC_25G_RS_528_REQ | - ICE_AQC_PHY_FEC_25G_RS_544_REQ; - break; - case ICE_FEC_NONE: - /* Clear all FEC option bits. 
*/ - cfg->link_fec_opt &= ~ICE_AQC_PHY_FEC_MASK; - break; - case ICE_FEC_AUTO: - /* AND auto FEC bit, and all caps bits. */ - cfg->caps &= ICE_AQC_PHY_CAPS_MASK; - break; - } + /* one Tx and one Rx queue per function in safe mode */ + dev_caps->common_cap.num_rxq = num_funcs; + dev_caps->common_cap.num_txq = num_funcs; + + /* two MSIX vectors per function */ + dev_caps->common_cap.num_msix_vectors = 2 * num_funcs; } /** - * ice_get_link_status - get status of the HW network link - * @pi: port information structure - * @link_up: pointer to bool (true/false = linkup/linkdown) - * - * Variable link_up is true if link is up, false if link is down. - * The variable link_up is invalid if status is non zero. As a - * result of this call, link status reporting becomes enabled + * ice_get_caps - get info about the HW + * @hw: pointer to the hardware structure */ -enum ice_status ice_get_link_status(struct ice_port_info *pi, bool *link_up) +enum ice_status ice_get_caps(struct ice_hw *hw) { - struct ice_phy_info *phy_info; - enum ice_status status = 0; - - if (!pi || !link_up) - return ICE_ERR_PARAM; - - phy_info = &pi->phy; - - if (phy_info->get_link_info) { - status = ice_update_link_info(pi); - - if (status) - ice_debug(pi->hw, ICE_DBG_LINK, - "get link status error, status = %d\n", - status); - } + enum ice_status status; - *link_up = phy_info->link_info.link_info & ICE_AQ_LINK_UP; + status = ice_discover_dev_caps(hw, &hw->dev_caps); + if (status) + return status; - return status; + return ice_discover_func_caps(hw, &hw->func_caps); } /** - * ice_aq_set_link_restart_an - * @pi: pointer to the port information structure - * @ena_link: if true: enable link, if false: disable link + * ice_aq_manage_mac_write - manage MAC address write command + * @hw: pointer to the HW struct + * @mac_addr: MAC address to be written as LAA/LAA+WoL/Port address + * @flags: flags to control write behavior * @cd: pointer to command details structure or NULL * - * Sets up the link and restarts the Auto-Negotiation over the link. + * This function is used to write MAC address to the NVM (0x0108). */ enum ice_status -ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link, - struct ice_sq_cd *cd) +ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags, + struct ice_sq_cd *cd) { - struct ice_aqc_restart_an *cmd; + struct ice_aqc_manage_mac_write *cmd; struct ice_aq_desc desc; - cmd = &desc.params.restart_an; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_restart_an); + cmd = &desc.params.mac_write; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_manage_mac_write); - cmd->cmd_flags = ICE_AQC_RESTART_AN_LINK_RESTART; - cmd->lport_num = pi->lport; - if (ena_link) - cmd->cmd_flags |= ICE_AQC_RESTART_AN_LINK_ENABLE; - else - cmd->cmd_flags &= ~ICE_AQC_RESTART_AN_LINK_ENABLE; + cmd->flags = flags; + ether_addr_copy(cmd->mac_addr, mac_addr); - return ice_aq_send_cmd(pi->hw, &desc, NULL, 0, cd); + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } /** - * ice_aq_set_event_mask + * ice_aq_clear_pxe_mode * @hw: pointer to the HW struct - * @port_num: port number of the physical function - * @mask: event mask to be set - * @cd: pointer to command details structure or NULL * - * Set event mask (0x0613) + * Tell the firmware that the driver is taking over from PXE (0x0110). 
*/ -enum ice_status -ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask, - struct ice_sq_cd *cd) +static enum ice_status ice_aq_clear_pxe_mode(struct ice_hw *hw) { - struct ice_aqc_set_event_mask *cmd; struct ice_aq_desc desc; - cmd = &desc.params.set_event_mask; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_event_mask); - - cmd->lport_num = port_num; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_pxe_mode); + desc.params.clear_pxe.rx_cnt = ICE_AQC_CLEAR_PXE_RX_CNT; - cmd->event_mask = cpu_to_le16(mask); - return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); } /** - * ice_aq_set_mac_loopback + * ice_clear_pxe_mode - clear pxe operations mode * @hw: pointer to the HW struct - * @ena_lpbk: Enable or Disable loopback - * @cd: pointer to command details structure or NULL * - * Enable/disable loopback on a given port + * Make sure all PXE mode settings are cleared, including things + * like descriptor fetch/write-back mode. */ -enum ice_status -ice_aq_set_mac_loopback(struct ice_hw *hw, bool ena_lpbk, struct ice_sq_cd *cd) +void ice_clear_pxe_mode(struct ice_hw *hw) { - struct ice_aqc_set_mac_lb *cmd; - struct ice_aq_desc desc; - - cmd = &desc.params.set_mac_lb; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_mac_lb); - if (ena_lpbk) - cmd->lb_mode = ICE_AQ_MAC_LB_EN; - - return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (ice_check_sq_alive(hw, &hw->adminq)) + ice_aq_clear_pxe_mode(hw); } /** - * ice_aq_set_port_id_led - * @pi: pointer to the port information - * @is_orig_mode: is this LED set to original mode (by the net-list) + * ice_aq_set_port_params - set physical port parameters. + * @pi: pointer to the port info struct + * @bad_frame_vsi: defines the VSI to which bad frames are forwarded + * @save_bad_pac: if set packets with errors are forwarded to the bad frames VSI + * @pad_short_pac: if set transmit packets smaller than 60 bytes are padded + * @double_vlan: if set double VLAN is enabled * @cd: pointer to command details structure or NULL * - * Set LED value for the given port (0x06e9) + * Set Physical port parameters (0x0203) */ enum ice_status -ice_aq_set_port_id_led(struct ice_port_info *pi, bool is_orig_mode, +ice_aq_set_port_params(struct ice_port_info *pi, u16 bad_frame_vsi, + bool save_bad_pac, bool pad_short_pac, bool double_vlan, struct ice_sq_cd *cd) + { - struct ice_aqc_set_port_id_led *cmd; + struct ice_aqc_set_port_params *cmd; struct ice_hw *hw = pi->hw; struct ice_aq_desc desc; + u16 cmd_flags = 0; - cmd = &desc.params.set_port_id_led; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_id_led); + cmd = &desc.params.set_port_params; - if (is_orig_mode) - cmd->ident_mode = ICE_AQC_PORT_IDENT_LED_ORIG; - else - cmd->ident_mode = ICE_AQC_PORT_IDENT_LED_BLINK; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_params); + cmd->bad_frame_vsi = cpu_to_le16(bad_frame_vsi); + if (save_bad_pac) + cmd_flags |= ICE_AQC_SET_P_PARAMS_SAVE_BAD_PACKETS; + if (pad_short_pac) + cmd_flags |= ICE_AQC_SET_P_PARAMS_PAD_SHORT_PACKETS; + if (double_vlan) + cmd_flags |= ICE_AQC_SET_P_PARAMS_DOUBLE_VLAN_ENA; + cmd->cmd_flags = cpu_to_le16(cmd_flags); return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } /** - * __ice_aq_get_set_rss_lut - * @hw: pointer to the hardware structure - * @vsi_id: VSI FW index - * @lut_type: LUT table type - * @lut: pointer to the LUT buffer provided by the caller - * @lut_size: size of the LUT buffer - * @glob_lut_idx: global LUT index - * @set: set true to set 
the table, false to get the table + * ice_get_link_speed_based_on_phy_type - returns link speed + * @phy_type_low: lower part of phy_type + * @phy_type_high: higher part of phy_type * - * Internal function to get (0x0B05) or set (0x0B03) RSS look up table + * This helper function will convert an entry in PHY type structure + * [phy_type_low, phy_type_high] to its corresponding link speed. + * Note: In the structure of [phy_type_low, phy_type_high], there should + * be one bit set, as this function will convert one PHY type to its + * speed. + * If no bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned + * If more than one bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned */ -static enum ice_status -__ice_aq_get_set_rss_lut(struct ice_hw *hw, u16 vsi_id, u8 lut_type, u8 *lut, - u16 lut_size, u8 glob_lut_idx, bool set) +static u16 +ice_get_link_speed_based_on_phy_type(u64 phy_type_low, u64 phy_type_high) { - struct ice_aqc_get_set_rss_lut *cmd_resp; - struct ice_aq_desc desc; - enum ice_status status; - u16 flags = 0; - - cmd_resp = &desc.params.get_set_rss_lut; - - if (set) { - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_lut); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - } else { - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_lut); - } - - cmd_resp->vsi_id = cpu_to_le16(((vsi_id << - ICE_AQC_GSET_RSS_LUT_VSI_ID_S) & - ICE_AQC_GSET_RSS_LUT_VSI_ID_M) | - ICE_AQC_GSET_RSS_LUT_VSI_VALID); + u16 speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; + u16 speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN; - switch (lut_type) { - case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI: - case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF: - case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL: - flags |= ((lut_type << ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S) & - ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_M); + switch (phy_type_low) { + case ICE_PHY_TYPE_LOW_100BASE_TX: + case ICE_PHY_TYPE_LOW_100M_SGMII: + speed_phy_type_low = ICE_AQ_LINK_SPEED_100MB; break; - default: - status = ICE_ERR_PARAM; - goto ice_aq_get_set_rss_lut_exit; + case ICE_PHY_TYPE_LOW_1000BASE_T: + case ICE_PHY_TYPE_LOW_1000BASE_SX: + case ICE_PHY_TYPE_LOW_1000BASE_LX: + case ICE_PHY_TYPE_LOW_1000BASE_KX: + case ICE_PHY_TYPE_LOW_1G_SGMII: + speed_phy_type_low = ICE_AQ_LINK_SPEED_1000MB; + break; + case ICE_PHY_TYPE_LOW_2500BASE_T: + case ICE_PHY_TYPE_LOW_2500BASE_X: + case ICE_PHY_TYPE_LOW_2500BASE_KX: + speed_phy_type_low = ICE_AQ_LINK_SPEED_2500MB; + break; + case ICE_PHY_TYPE_LOW_5GBASE_T: + case ICE_PHY_TYPE_LOW_5GBASE_KR: + speed_phy_type_low = ICE_AQ_LINK_SPEED_5GB; + break; + case ICE_PHY_TYPE_LOW_10GBASE_T: + case ICE_PHY_TYPE_LOW_10G_SFI_DA: + case ICE_PHY_TYPE_LOW_10GBASE_SR: + case ICE_PHY_TYPE_LOW_10GBASE_LR: + case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1: + case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: + case ICE_PHY_TYPE_LOW_10G_SFI_C2C: + speed_phy_type_low = ICE_AQ_LINK_SPEED_10GB; + break; + case ICE_PHY_TYPE_LOW_25GBASE_T: + case ICE_PHY_TYPE_LOW_25GBASE_CR: + case ICE_PHY_TYPE_LOW_25GBASE_CR_S: + case ICE_PHY_TYPE_LOW_25GBASE_CR1: + case ICE_PHY_TYPE_LOW_25GBASE_SR: + case ICE_PHY_TYPE_LOW_25GBASE_LR: + case ICE_PHY_TYPE_LOW_25GBASE_KR: + case ICE_PHY_TYPE_LOW_25GBASE_KR_S: + case ICE_PHY_TYPE_LOW_25GBASE_KR1: + case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: + case ICE_PHY_TYPE_LOW_25G_AUI_C2C: + speed_phy_type_low = ICE_AQ_LINK_SPEED_25GB; + break; + case ICE_PHY_TYPE_LOW_40GBASE_CR4: + case ICE_PHY_TYPE_LOW_40GBASE_SR4: + case ICE_PHY_TYPE_LOW_40GBASE_LR4: + case ICE_PHY_TYPE_LOW_40GBASE_KR4: + case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: + case 
ICE_PHY_TYPE_LOW_40G_XLAUI: + speed_phy_type_low = ICE_AQ_LINK_SPEED_40GB; + break; + case ICE_PHY_TYPE_LOW_50GBASE_CR2: + case ICE_PHY_TYPE_LOW_50GBASE_SR2: + case ICE_PHY_TYPE_LOW_50GBASE_LR2: + case ICE_PHY_TYPE_LOW_50GBASE_KR2: + case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_LAUI2: + case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_AUI2: + case ICE_PHY_TYPE_LOW_50GBASE_CP: + case ICE_PHY_TYPE_LOW_50GBASE_SR: + case ICE_PHY_TYPE_LOW_50GBASE_FR: + case ICE_PHY_TYPE_LOW_50GBASE_LR: + case ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4: + case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: + case ICE_PHY_TYPE_LOW_50G_AUI1: + speed_phy_type_low = ICE_AQ_LINK_SPEED_50GB; + break; + case ICE_PHY_TYPE_LOW_100GBASE_CR4: + case ICE_PHY_TYPE_LOW_100GBASE_SR4: + case ICE_PHY_TYPE_LOW_100GBASE_LR4: + case ICE_PHY_TYPE_LOW_100GBASE_KR4: + case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: + case ICE_PHY_TYPE_LOW_100G_CAUI4: + case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: + case ICE_PHY_TYPE_LOW_100G_AUI4: + case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: + case ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4: + case ICE_PHY_TYPE_LOW_100GBASE_CP2: + case ICE_PHY_TYPE_LOW_100GBASE_SR2: + case ICE_PHY_TYPE_LOW_100GBASE_DR: + speed_phy_type_low = ICE_AQ_LINK_SPEED_100GB; + break; + default: + speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN; + break; + } + + switch (phy_type_high) { + case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4: + case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC: + case ICE_PHY_TYPE_HIGH_100G_CAUI2: + case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC: + case ICE_PHY_TYPE_HIGH_100G_AUI2: + speed_phy_type_high = ICE_AQ_LINK_SPEED_100GB; + break; + default: + speed_phy_type_high = ICE_AQ_LINK_SPEED_UNKNOWN; + break; + } + + if (speed_phy_type_low == ICE_AQ_LINK_SPEED_UNKNOWN && + speed_phy_type_high == ICE_AQ_LINK_SPEED_UNKNOWN) + return ICE_AQ_LINK_SPEED_UNKNOWN; + else if (speed_phy_type_low != ICE_AQ_LINK_SPEED_UNKNOWN && + speed_phy_type_high != ICE_AQ_LINK_SPEED_UNKNOWN) + return ICE_AQ_LINK_SPEED_UNKNOWN; + else if (speed_phy_type_low != ICE_AQ_LINK_SPEED_UNKNOWN && + speed_phy_type_high == ICE_AQ_LINK_SPEED_UNKNOWN) + return speed_phy_type_low; + else + return speed_phy_type_high; +} + +/** + * ice_update_phy_type + * @phy_type_low: pointer to the lower part of phy_type + * @phy_type_high: pointer to the higher part of phy_type + * @link_speeds_bitmap: targeted link speeds bitmap + * + * Note: For the link_speeds_bitmap structure, you can check it at + * [ice_aqc_get_link_status->link_speed]. Caller can pass in + * link_speeds_bitmap include multiple speeds. + * + * Each entry in this [phy_type_low, phy_type_high] structure will + * present a certain link speed. This helper function will turn on bits + * in [phy_type_low, phy_type_high] structure based on the value of + * link_speeds_bitmap input parameter. 
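+ *
+ * For example, when link_speeds_bitmap includes ICE_AQ_LINK_SPEED_10GB,
+ * every bit in phy_type_low whose PHY type resolves to 10G (10GBASE-T,
+ * 10G_SFI_DA, 10GBASE-SR, ...) is turned on.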
+ */ +void +ice_update_phy_type(u64 *phy_type_low, u64 *phy_type_high, + u16 link_speeds_bitmap) +{ + u64 pt_high; + u64 pt_low; + int index; + u16 speed; + + /* We first check with low part of phy_type */ + for (index = 0; index <= ICE_PHY_TYPE_LOW_MAX_INDEX; index++) { + pt_low = BIT_ULL(index); + speed = ice_get_link_speed_based_on_phy_type(pt_low, 0); + + if (link_speeds_bitmap & speed) + *phy_type_low |= BIT_ULL(index); + } + + /* We then check with high part of phy_type */ + for (index = 0; index <= ICE_PHY_TYPE_HIGH_MAX_INDEX; index++) { + pt_high = BIT_ULL(index); + speed = ice_get_link_speed_based_on_phy_type(0, pt_high); + + if (link_speeds_bitmap & speed) + *phy_type_high |= BIT_ULL(index); + } +} + +/** + * ice_aq_set_phy_cfg + * @hw: pointer to the HW struct + * @pi: port info structure of the interested logical port + * @cfg: structure with PHY configuration data to be set + * @cd: pointer to command details structure or NULL + * + * Set the various PHY configuration parameters supported on the Port. + * One or more of the Set PHY config parameters may be ignored in an MFP + * mode as the PF may not have the privilege to set some of the PHY Config + * parameters. This status will be indicated by the command response (0x0601). + */ +enum ice_status +ice_aq_set_phy_cfg(struct ice_hw *hw, struct ice_port_info *pi, + struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + enum ice_status status; + + if (!cfg) + return ICE_ERR_PARAM; + + /* Ensure that only valid bits of cfg->caps can be turned on. */ + if (cfg->caps & ~ICE_AQ_PHY_ENA_VALID_MASK) { + ice_debug(hw, ICE_DBG_PHY, "Invalid bit is set in ice_aqc_set_phy_cfg_data->caps : 0x%x\n", + cfg->caps); + + cfg->caps &= ICE_AQ_PHY_ENA_VALID_MASK; + } + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_phy_cfg); + desc.params.set_phy.lport_num = pi->lport; + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + ice_debug(hw, ICE_DBG_LINK, "set phy cfg\n"); + ice_debug(hw, ICE_DBG_LINK, " phy_type_low = 0x%llx\n", + (unsigned long long)le64_to_cpu(cfg->phy_type_low)); + ice_debug(hw, ICE_DBG_LINK, " phy_type_high = 0x%llx\n", + (unsigned long long)le64_to_cpu(cfg->phy_type_high)); + ice_debug(hw, ICE_DBG_LINK, " caps = 0x%x\n", cfg->caps); + ice_debug(hw, ICE_DBG_LINK, " low_power_ctrl_an = 0x%x\n", + cfg->low_power_ctrl_an); + ice_debug(hw, ICE_DBG_LINK, " eee_cap = 0x%x\n", cfg->eee_cap); + ice_debug(hw, ICE_DBG_LINK, " eeer_value = 0x%x\n", cfg->eeer_value); + ice_debug(hw, ICE_DBG_LINK, " link_fec_opt = 0x%x\n", + cfg->link_fec_opt); + + status = ice_aq_send_cmd(hw, &desc, cfg, sizeof(*cfg), cd); + + if (hw->adminq.sq_last_status == ICE_AQ_RC_EMODE) + status = 0; + + if (!status) + pi->phy.curr_user_phy_cfg = *cfg; + + return status; +} + +/** + * ice_update_link_info - update status of the HW network link + * @pi: port info structure of the interested logical port + */ +enum ice_status ice_update_link_info(struct ice_port_info *pi) +{ + struct ice_link_status *li; + enum ice_status status; + + if (!pi) + return ICE_ERR_PARAM; + + li = &pi->phy.link_info; + + status = ice_aq_get_link_info(pi, true, NULL, NULL); + if (status) + return status; + + if (li->link_info & ICE_AQ_MEDIA_AVAILABLE) { + struct ice_aqc_get_phy_caps_data *pcaps; + struct ice_hw *hw; + + hw = pi->hw; + pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), + GFP_KERNEL); + if (!pcaps) + return ICE_ERR_NO_MEMORY; + + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA, + pcaps, NULL); + + if (!status) + 
memcpy(li->module_type, &pcaps->module_type, + sizeof(li->module_type)); + + devm_kfree(ice_hw_to_dev(hw), pcaps); + } + + return status; +} + +/** + * ice_cache_phy_user_req + * @pi: port information structure + * @cache_data: PHY logging data + * @cache_mode: PHY logging mode + * + * Log the user request on (FC, FEC, SPEED) for later use. + */ +static void +ice_cache_phy_user_req(struct ice_port_info *pi, + struct ice_phy_cache_mode_data cache_data, + enum ice_phy_cache_mode cache_mode) +{ + if (!pi) + return; + + switch (cache_mode) { + case ICE_FC_MODE: + pi->phy.curr_user_fc_req = cache_data.data.curr_user_fc_req; + break; + case ICE_SPEED_MODE: + pi->phy.curr_user_speed_req = + cache_data.data.curr_user_speed_req; + break; + case ICE_FEC_MODE: + pi->phy.curr_user_fec_req = cache_data.data.curr_user_fec_req; + break; + default: + break; + } +} + +/** + * ice_caps_to_fc_mode + * @caps: PHY capabilities + * + * Convert PHY FC capabilities to ice FC mode + */ +enum ice_fc_mode ice_caps_to_fc_mode(u8 caps) +{ + if (caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE && + caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) + return ICE_FC_FULL; + + if (caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) + return ICE_FC_TX_PAUSE; + + if (caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) + return ICE_FC_RX_PAUSE; + + return ICE_FC_NONE; +} + +/** + * ice_caps_to_fec_mode + * @caps: PHY capabilities + * @fec_options: Link FEC options + * + * Convert PHY FEC capabilities to ice FEC mode + */ +enum ice_fec_mode ice_caps_to_fec_mode(u8 caps, u8 fec_options) +{ + if (caps & ICE_AQC_PHY_EN_AUTO_FEC) + return ICE_FEC_AUTO; + + if (fec_options & (ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN | + ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ | + ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN | + ICE_AQC_PHY_FEC_25G_KR_REQ)) + return ICE_FEC_BASER; + + if (fec_options & (ICE_AQC_PHY_FEC_25G_RS_528_REQ | + ICE_AQC_PHY_FEC_25G_RS_544_REQ | + ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN)) + return ICE_FEC_RS; + + return ICE_FEC_NONE; +} + +/** + * ice_cfg_phy_fc - Configure PHY FC data based on FC mode + * @pi: port information structure + * @cfg: PHY configuration data to set FC mode + * @req_mode: FC mode to configure + */ +enum ice_status +ice_cfg_phy_fc(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, + enum ice_fc_mode req_mode) +{ + struct ice_phy_cache_mode_data cache_data; + u8 pause_mask = 0x0; + + if (!pi || !cfg) + return ICE_ERR_BAD_PTR; + switch (req_mode) { + case ICE_FC_FULL: + pause_mask |= ICE_AQC_PHY_EN_TX_LINK_PAUSE; + pause_mask |= ICE_AQC_PHY_EN_RX_LINK_PAUSE; + break; + case ICE_FC_RX_PAUSE: + pause_mask |= ICE_AQC_PHY_EN_RX_LINK_PAUSE; + break; + case ICE_FC_TX_PAUSE: + pause_mask |= ICE_AQC_PHY_EN_TX_LINK_PAUSE; + break; + default: + break; + } + + /* clear the old pause settings */ + cfg->caps &= ~(ICE_AQC_PHY_EN_TX_LINK_PAUSE | + ICE_AQC_PHY_EN_RX_LINK_PAUSE); + + /* set the new capabilities */ + cfg->caps |= pause_mask; + + /* Cache user FC request */ + cache_data.data.curr_user_fc_req = req_mode; + ice_cache_phy_user_req(pi, cache_data, ICE_FC_MODE); + + return 0; +} + +/** + * ice_set_fc + * @pi: port information structure + * @aq_failures: pointer to status code, specific to ice_set_fc routine + * @ena_auto_link_update: enable automatic link update + * + * Set the requested flow control mode. 
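+ *
+ * A hypothetical caller sketch (not part of this patch); ice_set_fc
+ * reads the requested mode from pi->fc.req_mode:
+ *
+ *	u8 aq_failures;
+ *
+ *	pi->fc.req_mode = ICE_FC_FULL;
+ *	status = ice_set_fc(pi, &aq_failures, true);
+ *	if (status && aq_failures == ICE_SET_FC_AQ_FAIL_SET)
+ *		(inspect hw->adminq.sq_last_status for the AQ error)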
+ */ +enum ice_status +ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update) +{ + struct ice_aqc_set_phy_cfg_data cfg = { 0 }; + struct ice_aqc_get_phy_caps_data *pcaps; + enum ice_status status; + struct ice_hw *hw; + + if (!pi || !aq_failures) + return ICE_ERR_BAD_PTR; + + *aq_failures = 0; + hw = pi->hw; + + pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), GFP_KERNEL); + if (!pcaps) + return ICE_ERR_NO_MEMORY; + + /* Get the current PHY config */ + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, + pcaps, NULL); + + if (status) { + *aq_failures = ICE_SET_FC_AQ_FAIL_GET; + goto out; + } + + ice_copy_phy_caps_to_cfg(pi, pcaps, &cfg); + + /* Configure the set PHY data */ + status = ice_cfg_phy_fc(pi, &cfg, pi->fc.req_mode); + if (status) + goto out; + + /* If the capabilities have changed, then set the new config */ + if (cfg.caps != pcaps->caps) { + int retry_count, retry_max = 10; + + /* Auto restart link so settings take effect */ + if (ena_auto_link_update) + cfg.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + + status = ice_aq_set_phy_cfg(hw, pi, &cfg, NULL); + if (status) { + *aq_failures = ICE_SET_FC_AQ_FAIL_SET; + goto out; + } + + /* Update the link info + * It sometimes takes a really long time for link to + * come back from the atomic reset. Thus, we wait a + * little bit. + */ + for (retry_count = 0; retry_count < retry_max; retry_count++) { + status = ice_update_link_info(pi); + + if (!status) + break; + + msleep(100); + } + + if (status) + *aq_failures = ICE_SET_FC_AQ_FAIL_UPDATE; + } + +out: + devm_kfree(ice_hw_to_dev(hw), pcaps); + return status; +} + +/** + * ice_phy_caps_equals_cfg + * @phy_caps: PHY capabilities + * @phy_cfg: PHY configuration + * + * Helper function to determine if PHY capabilities match PHY + * configuration + */ +bool +ice_phy_caps_equals_cfg(struct ice_aqc_get_phy_caps_data *phy_caps, + struct ice_aqc_set_phy_cfg_data *phy_cfg) +{ + u8 caps_mask, cfg_mask; + + if (!phy_caps || !phy_cfg) + return false; + + /* These bits are not common between capabilities and configuration. + * Do not use them to determine equality. 
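+ * On the capabilities side those are the AN mode and module
+ * qualification bits; on the configuration side it is the auto link
+ * update bit, which is only meaningful when setting the config.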
+ */ + caps_mask = ICE_AQC_PHY_CAPS_MASK & ~(ICE_AQC_PHY_AN_MODE | + ICE_AQC_PHY_EN_MOD_QUAL); + cfg_mask = ICE_AQ_PHY_ENA_VALID_MASK & ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + + if (phy_caps->phy_type_low != phy_cfg->phy_type_low || + phy_caps->phy_type_high != phy_cfg->phy_type_high || + ((phy_caps->caps & caps_mask) != (phy_cfg->caps & cfg_mask)) || + phy_caps->low_power_ctrl_an != phy_cfg->low_power_ctrl_an || + phy_caps->eee_cap != phy_cfg->eee_cap || + phy_caps->eeer_value != phy_cfg->eeer_value || + phy_caps->link_fec_options != phy_cfg->link_fec_opt) + return false; + + return true; +} + +/** + * ice_copy_phy_caps_to_cfg - Copy PHY ability data to configuration data + * @pi: port information structure + * @caps: PHY ability structure to copy data from + * @cfg: PHY configuration structure to copy data to + * + * Helper function to copy AQC PHY get ability data to PHY set configuration + * data structure + */ +void +ice_copy_phy_caps_to_cfg(struct ice_port_info *pi, + struct ice_aqc_get_phy_caps_data *caps, + struct ice_aqc_set_phy_cfg_data *cfg) +{ + if (!pi || !caps || !cfg) + return; + + memset(cfg, 0, sizeof(*cfg)); + cfg->phy_type_low = caps->phy_type_low; + cfg->phy_type_high = caps->phy_type_high; + cfg->caps = caps->caps; + cfg->low_power_ctrl_an = caps->low_power_ctrl_an; + cfg->eee_cap = caps->eee_cap; + cfg->eeer_value = caps->eeer_value; + cfg->link_fec_opt = caps->link_fec_options; + cfg->module_compliance_enforcement = + caps->module_compliance_enforcement; +} + +/** + * ice_cfg_phy_fec - Configure PHY FEC data based on FEC mode + * @pi: port information structure + * @cfg: PHY configuration data to set FEC mode + * @fec: FEC mode to configure + */ +enum ice_status +ice_cfg_phy_fec(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, + enum ice_fec_mode fec) +{ + struct ice_aqc_get_phy_caps_data *pcaps; + enum ice_status status = 0; + struct ice_hw *hw; + + if (!pi || !cfg) + return ICE_ERR_BAD_PTR; + + hw = pi->hw; + + pcaps = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*pcaps), GFP_KERNEL); + if (!pcaps) + return ICE_ERR_NO_MEMORY; + + status = ice_aq_get_phy_caps(pi, false, + (ice_fw_supports_report_dflt_cfg(hw) ? + ICE_AQC_REPORT_DFLT_CFG : + ICE_AQC_REPORT_TOPO_CAP_MEDIA), pcaps, NULL); + + if (status) + goto out; + + cfg->caps |= (pcaps->caps & ICE_AQC_PHY_EN_AUTO_FEC); + cfg->link_fec_opt = pcaps->link_fec_options; + + switch (fec) { + case ICE_FEC_BASER: + /* Clear RS bits, and AND BASE-R ability + * bits and OR request bits. + */ + cfg->link_fec_opt &= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN | + ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN; + cfg->link_fec_opt |= ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ | + ICE_AQC_PHY_FEC_25G_KR_REQ; + break; + case ICE_FEC_RS: + /* Clear BASE-R bits, and AND RS ability + * bits and OR request bits. + */ + cfg->link_fec_opt &= ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN; + cfg->link_fec_opt |= ICE_AQC_PHY_FEC_25G_RS_528_REQ | + ICE_AQC_PHY_FEC_25G_RS_544_REQ; + break; + case ICE_FEC_NONE: + /* Clear all FEC option bits. */ + cfg->link_fec_opt &= ~ICE_AQC_PHY_FEC_MASK; + break; + case ICE_FEC_AUTO: + /* AND auto FEC bit, and all caps bits. 
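+ * This keeps only the valid capability bits and re-advertises
+ * every FEC ability the PHY reported, so firmware can pick the
+ * FEC mode during negotiation.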
*/ + cfg->caps &= ICE_AQC_PHY_CAPS_MASK; + cfg->link_fec_opt |= pcaps->link_fec_options; + break; + default: + status = ICE_ERR_PARAM; + break; + } + + if (fec == ICE_FEC_AUTO && ice_fw_supports_link_override(pi->hw) && + !ice_fw_supports_report_dflt_cfg(pi->hw)) { + struct ice_link_default_override_tlv tlv; + + if (ice_get_link_default_override(&tlv, pi)) + goto out; + + if (!(tlv.options & ICE_LINK_OVERRIDE_STRICT_MODE) && + (tlv.options & ICE_LINK_OVERRIDE_EN)) + cfg->link_fec_opt = tlv.fec_options; + } + +out: + devm_kfree(ice_hw_to_dev(hw), pcaps); + + return status; +} + +/** + * ice_get_link_status - get status of the HW network link + * @pi: port information structure + * @link_up: pointer to bool (true/false = linkup/linkdown) + * + * Variable link_up is true if link is up, false if link is down. + * The variable link_up is invalid if status is non zero. As a + * result of this call, link status reporting becomes enabled + */ +enum ice_status ice_get_link_status(struct ice_port_info *pi, bool *link_up) +{ + struct ice_phy_info *phy_info; + enum ice_status status = 0; + + if (!pi || !link_up) + return ICE_ERR_PARAM; + + phy_info = &pi->phy; + + if (phy_info->get_link_info) { + status = ice_update_link_info(pi); + + if (status) + ice_debug(pi->hw, ICE_DBG_LINK, "get link status error, status = %d\n", + status); + } + + *link_up = phy_info->link_info.link_info & ICE_AQ_LINK_UP; + + return status; +} + +/** + * ice_aq_set_link_restart_an + * @pi: pointer to the port information structure + * @ena_link: if true: enable link, if false: disable link + * @cd: pointer to command details structure or NULL + * + * Sets up the link and restarts the Auto-Negotiation over the link. + */ +enum ice_status +ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link, + struct ice_sq_cd *cd) +{ + struct ice_aqc_restart_an *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.restart_an; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_restart_an); + + cmd->cmd_flags = ICE_AQC_RESTART_AN_LINK_RESTART; + cmd->lport_num = pi->lport; + if (ena_link) + cmd->cmd_flags |= ICE_AQC_RESTART_AN_LINK_ENABLE; + else + cmd->cmd_flags &= ~ICE_AQC_RESTART_AN_LINK_ENABLE; + + return ice_aq_send_cmd(pi->hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_set_event_mask + * @hw: pointer to the HW struct + * @port_num: port number of the physical function + * @mask: event mask to be set + * @cd: pointer to command details structure or NULL + * + * Set event mask (0x0613) + */ +enum ice_status +ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask, + struct ice_sq_cd *cd) +{ + struct ice_aqc_set_event_mask *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.set_event_mask; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_event_mask); + + cmd->lport_num = port_num; + + cmd->event_mask = cpu_to_le16(mask); + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_set_mac_loopback + * @hw: pointer to the HW struct + * @ena_lpbk: Enable or Disable loopback + * @cd: pointer to command details structure or NULL + * + * Enable/disable loopback on a given port + */ +enum ice_status +ice_aq_set_mac_loopback(struct ice_hw *hw, bool ena_lpbk, struct ice_sq_cd *cd) +{ + struct ice_aqc_set_mac_lb *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.set_mac_lb; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_mac_lb); + if (ena_lpbk) + cmd->lb_mode = ICE_AQ_MAC_LB_EN; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + + +/** + * ice_aq_set_port_id_led + * @pi: pointer to the 
port information + * @is_orig_mode: is this LED set to original mode (by the net-list) + * @cd: pointer to command details structure or NULL + * + * Set LED value for the given port (0x06e9) + */ +enum ice_status +ice_aq_set_port_id_led(struct ice_port_info *pi, bool is_orig_mode, + struct ice_sq_cd *cd) +{ + struct ice_aqc_set_port_id_led *cmd; + struct ice_hw *hw = pi->hw; + struct ice_aq_desc desc; + + cmd = &desc.params.set_port_id_led; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_id_led); + + + if (is_orig_mode) + cmd->ident_mode = ICE_AQC_PORT_IDENT_LED_ORIG; + else + cmd->ident_mode = ICE_AQC_PORT_IDENT_LED_BLINK; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_sff_eeprom + * @hw: pointer to the HW struct + * @lport: bits [7:0] = logical port, bit [8] = logical port valid + * @bus_addr: I2C bus address of the eeprom (typically 0xA0, 0=topo default) + * @mem_addr: I2C offset. lower 8 bits for address, 8 upper bits zero padding. + * @page: QSFP page + * @set_page: set or ignore the page + * @data: pointer to data buffer to be read/written to the I2C device. + * @length: 1-16 for read, 1 for write. + * @write: 0 read, 1 for write. + * @cd: pointer to command details structure or NULL + * + * Read/Write SFF EEPROM (0x06EE) + */ +enum ice_status +ice_aq_sff_eeprom(struct ice_hw *hw, u16 lport, u8 bus_addr, + u16 mem_addr, u8 page, u8 set_page, u8 *data, u8 length, + bool write, struct ice_sq_cd *cd) +{ + struct ice_aqc_sff_eeprom *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (!data || (mem_addr & 0xff00)) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_sff_eeprom); + cmd = &desc.params.read_write_sff_param; + desc.flags = cpu_to_le16(ICE_AQ_FLAG_RD); + cmd->lport_num = (u8)(lport & 0xff); + cmd->lport_num_valid = (u8)((lport >> 8) & 0x01); + cmd->i2c_bus_addr = cpu_to_le16(((bus_addr >> 1) & + ICE_AQC_SFF_I2CBUS_7BIT_M) | + ((set_page << + ICE_AQC_SFF_SET_EEPROM_PAGE_S) & + ICE_AQC_SFF_SET_EEPROM_PAGE_M)); + cmd->i2c_mem_addr = cpu_to_le16(mem_addr & 0xff); + cmd->eeprom_page = cpu_to_le16((u16)page << ICE_AQC_SFF_EEPROM_PAGE_S); + if (write) + cmd->i2c_bus_addr |= cpu_to_le16(ICE_AQC_SFF_IS_WRITE); + + status = ice_aq_send_cmd(hw, &desc, data, length, cd); + return status; +} + +/** + * ice_aq_prog_topo_dev_nvm + * @hw: pointer to the hardware structure + * @topo_params: pointer to structure storing topology parameters for a device + * @cd: pointer to command details structure or NULL + * + * Program Topology Device NVM (0x06F2) + * + */ +enum ice_status +ice_aq_prog_topo_dev_nvm(struct ice_hw *hw, + struct ice_aqc_link_topo_params *topo_params, + struct ice_sq_cd *cd) +{ + struct ice_aqc_prog_topo_dev_nvm *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.prog_topo_dev_nvm; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_prog_topo_dev_nvm); + + memcpy(&cmd->topo_params, topo_params, sizeof(*topo_params)); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_read_topo_dev_nvm + * @hw: pointer to the hardware structure + * @topo_params: pointer to structure storing topology parameters for a device + * @start_address: byte offset in the topology device NVM + * @data: pointer to data buffer + * @data_size: number of bytes to be read from the topology device NVM + * @cd: pointer to command details structure or NULL + * Read Topology Device NVM (0x06F3) + * + */ +enum ice_status +ice_aq_read_topo_dev_nvm(struct ice_hw *hw, + struct ice_aqc_link_topo_params *topo_params, + 
u32 start_address, u8 *data, u8 data_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_read_topo_dev_nvm *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (!data || data_size == 0 || + data_size > ICE_AQC_READ_TOPO_DEV_NVM_DATA_READ_SIZE) + return ICE_ERR_PARAM; + + cmd = &desc.params.read_topo_dev_nvm; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_read_topo_dev_nvm); + + desc.datalen = data_size; + memcpy(&cmd->topo_params, topo_params, sizeof(*topo_params)); + cmd->start_address = cpu_to_le32(start_address); + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (status) + return status; + + memcpy(data, cmd->data_read, data_size); + + return 0; +} + +/** + * __ice_aq_get_set_rss_lut + * @hw: pointer to the hardware structure + * @params: RSS LUT parameters + * @set: set true to set the table, false to get the table + * + * Internal function to get (0x0B05) or set (0x0B03) RSS look up table + */ +static enum ice_status +__ice_aq_get_set_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *params, bool set) +{ + u16 flags = 0, vsi_id, lut_type, lut_size, glob_lut_idx, vsi_handle; + struct ice_aqc_get_set_rss_lut *cmd_resp; + struct ice_aq_desc desc; + enum ice_status status; + u8 *lut; + + if (!params) + return ICE_ERR_PARAM; + + vsi_handle = params->vsi_handle; + lut = params->lut; + + if (!ice_is_vsi_valid(hw, vsi_handle) || !lut) + return ICE_ERR_PARAM; + + lut_size = params->lut_size; + lut_type = params->lut_type; + glob_lut_idx = params->global_lut_id; + vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + + cmd_resp = &desc.params.get_set_rss_lut; + + if (set) { + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_lut); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + } else { + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_lut); + } + + cmd_resp->vsi_id = cpu_to_le16(((vsi_id << + ICE_AQC_GSET_RSS_LUT_VSI_ID_S) & + ICE_AQC_GSET_RSS_LUT_VSI_ID_M) | + ICE_AQC_GSET_RSS_LUT_VSI_VALID); + + switch (lut_type) { + case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI: + case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF: + case ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL: + flags |= ((lut_type << ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_S) & + ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_M); + break; + default: + status = ICE_ERR_PARAM; + goto ice_aq_get_set_rss_lut_exit; + } + + if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL) { + flags |= ((glob_lut_idx << ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S) & + ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_M); + + if (!set) + goto ice_aq_get_set_rss_lut_send; + } else if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF) { + if (!set) + goto ice_aq_get_set_rss_lut_send; + } else { + goto ice_aq_get_set_rss_lut_send; + } + + /* LUT size is only valid for Global and PF table types */ + switch (lut_size) { + case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128: + break; + case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512: + flags |= (ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512_FLAG << + ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) & + ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M; + break; + case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K: + if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF) { + flags |= (ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K_FLAG << + ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) & + ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M; + break; + } + /* fall-through */ + default: + status = ICE_ERR_PARAM; + goto ice_aq_get_set_rss_lut_exit; + } + +ice_aq_get_set_rss_lut_send: + cmd_resp->flags = cpu_to_le16(flags); + status = ice_aq_send_cmd(hw, &desc, lut, lut_size, NULL); + +ice_aq_get_set_rss_lut_exit: + return status; +} + +/** + * 
ice_aq_get_rss_lut + * @hw: pointer to the hardware structure + * @get_params: RSS LUT parameters used to specify which RSS LUT to get + * + * get the RSS lookup table, PF or VSI type + */ +enum ice_status +ice_aq_get_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *get_params) +{ + return __ice_aq_get_set_rss_lut(hw, get_params, false); +} + +/** + * ice_aq_set_rss_lut + * @hw: pointer to the hardware structure + * @set_params: RSS LUT parameters used to specify how to set the RSS LUT + * + * set the RSS lookup table, PF or VSI type + */ +enum ice_status +ice_aq_set_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *set_params) +{ + return __ice_aq_get_set_rss_lut(hw, set_params, true); +} + +/** + * __ice_aq_get_set_rss_key + * @hw: pointer to the HW struct + * @vsi_id: VSI FW index + * @key: pointer to key info struct + * @set: set true to set the key, false to get the key + * + * get (0x0B04) or set (0x0B02) the RSS key per VSI + */ +static enum +ice_status __ice_aq_get_set_rss_key(struct ice_hw *hw, u16 vsi_id, + struct ice_aqc_get_set_rss_keys *key, + bool set) +{ + struct ice_aqc_get_set_rss_key *cmd_resp; + u16 key_size = sizeof(*key); + struct ice_aq_desc desc; + + cmd_resp = &desc.params.get_set_rss_key; + + if (set) { + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_key); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + } else { + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_key); + } + + cmd_resp->vsi_id = cpu_to_le16(((vsi_id << + ICE_AQC_GSET_RSS_KEY_VSI_ID_S) & + ICE_AQC_GSET_RSS_KEY_VSI_ID_M) | + ICE_AQC_GSET_RSS_KEY_VSI_VALID); + + return ice_aq_send_cmd(hw, &desc, key, key_size, NULL); +} + +/** + * ice_aq_get_rss_key + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * @key: pointer to key info struct + * + * get the RSS key per VSI + */ +enum ice_status +ice_aq_get_rss_key(struct ice_hw *hw, u16 vsi_handle, + struct ice_aqc_get_set_rss_keys *key) +{ + if (!ice_is_vsi_valid(hw, vsi_handle) || !key) + return ICE_ERR_PARAM; + + return __ice_aq_get_set_rss_key(hw, ice_get_hw_vsi_num(hw, vsi_handle), + key, false); +} + +/** + * ice_aq_set_rss_key + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * @keys: pointer to key info struct + * + * set the RSS key per VSI + */ +enum ice_status +ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle, + struct ice_aqc_get_set_rss_keys *keys) +{ + if (!ice_is_vsi_valid(hw, vsi_handle) || !keys) + return ICE_ERR_PARAM; + + return __ice_aq_get_set_rss_key(hw, ice_get_hw_vsi_num(hw, vsi_handle), + keys, true); +} + +/** + * ice_aq_add_lan_txq + * @hw: pointer to the hardware structure + * @num_qgrps: Number of added queue groups + * @qg_list: list of queue groups to be added + * @buf_size: size of buffer for indirect command + * @cd: pointer to command details structure or NULL + * + * Add Tx LAN queue (0x0C30) + * + * NOTE: + * Prior to calling add Tx LAN queue: + * Initialize the following as part of the Tx queue context: + * Completion queue ID if the queue uses Completion queue, Quanta profile, + * Cache profile and Packet shaper profile. + * + * After add Tx LAN queue AQ command is completed: + * Interrupts should be associated with specific queues, + * Association of Tx queue to Doorbell queue is not part of Add LAN Tx queue + * flow. 
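+ *
+ * Buffer sizing sketch (illustrative): for a single queue group that
+ * carries a single queue, the expected buffer size is
+ *
+ *	buf_size = struct_size(qg_list, txqs, 1);
+ *
+ * i.e. the group header plus one txqs[] element, which is exactly what
+ * the validation loop in this function recomputes and compares against.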
+ */ +static enum ice_status +ice_aq_add_lan_txq(struct ice_hw *hw, u8 num_qgrps, + struct ice_aqc_add_tx_qgrp *qg_list, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_add_tx_qgrp *list; + struct ice_aqc_add_txqs *cmd; + struct ice_aq_desc desc; + u16 i, sum_size = 0; + + cmd = &desc.params.add_txqs; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_txqs); + + if (!qg_list) + return ICE_ERR_PARAM; + + if (num_qgrps > ICE_LAN_TXQ_MAX_QGRPS) + return ICE_ERR_PARAM; + + for (i = 0, list = qg_list; i < num_qgrps; i++) { + sum_size += struct_size(list, txqs, list->num_txqs); + list = (struct ice_aqc_add_tx_qgrp *)(list->txqs + + list->num_txqs); + } + + if (buf_size != sum_size) + return ICE_ERR_PARAM; + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_qgrps = num_qgrps; + + return ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd); +} + +/** + * ice_aq_dis_lan_txq + * @hw: pointer to the hardware structure + * @num_qgrps: number of groups in the list + * @qg_list: the list of groups to disable + * @buf_size: the total size of the qg_list buffer in bytes + * @rst_src: if called due to reset, specifies the reset source + * @vmvf_num: the relative VM or VF number that is undergoing the reset + * @cd: pointer to command details structure or NULL + * + * Disable LAN Tx queue (0x0C31) + */ +static enum ice_status +ice_aq_dis_lan_txq(struct ice_hw *hw, u8 num_qgrps, + struct ice_aqc_dis_txq_item *qg_list, u16 buf_size, + enum ice_disq_rst_src rst_src, u16 vmvf_num, + struct ice_sq_cd *cd) +{ + struct ice_aqc_dis_txq_item *item; + struct ice_aqc_dis_txqs *cmd; + struct ice_aq_desc desc; + enum ice_status status; + u16 i, sz = 0; + + cmd = &desc.params.dis_txqs; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dis_txqs); + + /* qg_list can be NULL only in VM/VF reset flow */ + if (!qg_list && !rst_src) + return ICE_ERR_PARAM; + + if (num_qgrps > ICE_LAN_TXQ_MAX_QGRPS) + return ICE_ERR_PARAM; + + cmd->num_entries = num_qgrps; + + cmd->vmvf_and_timeout = cpu_to_le16((5 << ICE_AQC_Q_DIS_TIMEOUT_S) & + ICE_AQC_Q_DIS_TIMEOUT_M); + + switch (rst_src) { + case ICE_VM_RESET: + cmd->cmd_type = ICE_AQC_Q_DIS_CMD_VM_RESET; + cmd->vmvf_and_timeout |= + cpu_to_le16(vmvf_num & ICE_AQC_Q_DIS_VMVF_NUM_M); + break; + case ICE_VF_RESET: + cmd->cmd_type = ICE_AQC_Q_DIS_CMD_VF_RESET; + /* In this case, FW expects vmvf_num to be absolute VF ID */ + cmd->vmvf_and_timeout |= + cpu_to_le16((vmvf_num + hw->func_caps.vf_base_id) & + ICE_AQC_Q_DIS_VMVF_NUM_M); + break; + case ICE_NO_RESET: + default: + break; + } + + /* flush pipe on time out */ + cmd->cmd_type |= ICE_AQC_Q_DIS_CMD_FLUSH_PIPE; + /* If no queue group info, we are in a reset flow. 
Issue the AQ */ + if (!qg_list) + goto do_aq; + + /* set RD bit to indicate that command buffer is provided by the driver + * and it needs to be read by the firmware + */ + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + for (i = 0, item = qg_list; i < num_qgrps; i++) { + u16 item_size = struct_size(item, q_id, item->num_qs); + + /* If the num of queues is even, add 2 bytes of padding */ + if ((item->num_qs % 2) == 0) + item_size += 2; + + sz += item_size; + + item = (struct ice_aqc_dis_txq_item *)((u8 *)item + item_size); + } + + if (buf_size != sz) + return ICE_ERR_PARAM; + +do_aq: + status = ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd); + if (status) { + if (!qg_list) + ice_debug(hw, ICE_DBG_SCHED, "VM%d disable failed %d\n", + vmvf_num, hw->adminq.sq_last_status); + else + ice_debug(hw, ICE_DBG_SCHED, "disable queue %d failed %d\n", + le16_to_cpu(qg_list[0].q_id[0]), + hw->adminq.sq_last_status); + } + return status; +} + +/** + * ice_aq_move_recfg_lan_txq + * @hw: pointer to the hardware structure + * @num_qs: number of queues to move/reconfigure + * @is_move: true if this operation involves node movement + * @is_tc_change: true if this operation involves a TC change + * @subseq_call: true if this operation is a subsequent call + * @flush_pipe: on timeout, true to flush pipe, false to return EAGAIN + * @timeout: timeout in units of 100 usec (valid values 0-50) + * @blocked_cgds: out param, bitmap of CGDs that timed out if returning EAGAIN + * @buf: struct containing src/dest TEID and per-queue info + * @buf_size: size of buffer for indirect command + * @txqs_moved: out param, number of queues successfully moved + * @cd: pointer to command details structure or NULL + * + * Move / Reconfigure Tx LAN queues (0x0C32) + */ +enum ice_status +ice_aq_move_recfg_lan_txq(struct ice_hw *hw, u8 num_qs, bool is_move, + bool is_tc_change, bool subseq_call, bool flush_pipe, + u8 timeout, u32 *blocked_cgds, + struct ice_aqc_move_txqs_data *buf, u16 buf_size, + u8 *txqs_moved, struct ice_sq_cd *cd) +{ + struct ice_aqc_move_txqs *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.move_txqs; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_move_recfg_txqs); + +#define ICE_LAN_TXQ_MOVE_TIMEOUT_MAX 50 + if (timeout > ICE_LAN_TXQ_MOVE_TIMEOUT_MAX) + return ICE_ERR_PARAM; + + if (is_tc_change && !flush_pipe && !blocked_cgds) + return ICE_ERR_PARAM; + + if (!is_move && !is_tc_change) + return ICE_ERR_PARAM; + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + if (is_move) + cmd->cmd_type |= ICE_AQC_Q_CMD_TYPE_MOVE; + + if (is_tc_change) + cmd->cmd_type |= ICE_AQC_Q_CMD_TYPE_TC_CHANGE; + + if (subseq_call) + cmd->cmd_type |= ICE_AQC_Q_CMD_SUBSEQ_CALL; + + if (flush_pipe) + cmd->cmd_type |= ICE_AQC_Q_CMD_FLUSH_PIPE; + + cmd->num_qs = num_qs; + cmd->timeout = ((timeout << ICE_AQC_Q_CMD_TIMEOUT_S) & + ICE_AQC_Q_CMD_TIMEOUT_M); + + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + + if (!status && txqs_moved) + *txqs_moved = cmd->num_qs; + + if (hw->adminq.sq_last_status == ICE_AQ_RC_EAGAIN && + is_tc_change && !flush_pipe) + *blocked_cgds = le32_to_cpu(cmd->blocked_cgds); + + return status; +} + +/** + * ice_aq_add_rdma_qsets + * @hw: pointer to the hardware structure + * @num_qset_grps: Number of RDMA Qset groups + * @qset_list: list of qset groups to be added + * @buf_size: size of buffer for indirect command + * @cd: pointer to command details structure or NULL + * + * Add Tx RDMA Qsets (0x0C33) + */ +static enum ice_status +ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 
num_qset_grps, + struct ice_aqc_add_rdma_qset_data *qset_list, + u16 buf_size, struct ice_sq_cd *cd) +{ + struct ice_aqc_add_rdma_qset_data *list; + struct ice_aqc_add_rdma_qset *cmd; + struct ice_aq_desc desc; + u16 i, sum_size = 0; + + cmd = &desc.params.add_rdma_qset; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_rdma_qset); + + if (!qset_list) + return ICE_ERR_PARAM; + + if (num_qset_grps > ICE_LAN_TXQ_MAX_QGRPS) + return ICE_ERR_PARAM; + + for (i = 0, list = qset_list; i < num_qset_grps; i++) { + u16 num_qsets = le16_to_cpu(list->num_qsets); + + sum_size += struct_size(list, rdma_qsets, num_qsets); + list = (struct ice_aqc_add_rdma_qset_data *)(list->rdma_qsets + + num_qsets); + } + + if (buf_size != sum_size) + return ICE_ERR_PARAM; + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_qset_grps = num_qset_grps; + + return ice_aq_send_cmd(hw, &desc, qset_list, buf_size, cd); +} + +/* End of FW Admin Queue command wrappers */ + +/** + * ice_write_byte - write a byte to a packed context structure + * @src_ctx: the context structure to read from + * @dest_ctx: the context to be written to + * @ce_info: a description of the struct to be filled + */ +static void +ice_write_byte(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +{ + u8 src_byte, dest_byte, mask; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src_ctx + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = (u8)(BIT(ce_info->width) - 1); + + src_byte = *from; + src_byte &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_byte <<= shift_width; + + /* get the current bits from the target bit string */ + dest = dest_ctx + (ce_info->lsb / 8); + + memcpy(&dest_byte, dest, sizeof(dest_byte)); + + dest_byte &= ~mask; /* get the bits not changing */ + dest_byte |= src_byte; /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_byte, sizeof(dest_byte)); +} + +/** + * ice_write_word - write a word to a packed context structure + * @src_ctx: the context structure to read from + * @dest_ctx: the context to be written to + * @ce_info: a description of the struct to be filled + */ +static void +ice_write_word(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +{ + u16 src_word, mask; + __le16 dest_word; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src_ctx + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = BIT(ce_info->width) - 1; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_word = *(u16 *)from; + src_word &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_word <<= shift_width; + + /* get the current bits from the target bit string */ + dest = dest_ctx + (ce_info->lsb / 8); + + memcpy(&dest_word, dest, sizeof(dest_word)); + + dest_word &= ~(cpu_to_le16(mask)); /* get the bits not changing */ + dest_word |= cpu_to_le16(src_word); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_word, sizeof(dest_word)); +} + +/** + * ice_write_dword - write a dword to a packed context structure + * @src_ctx: the context structure to read from + * @dest_ctx: the context to be written to + * @ce_info: a description of the struct to be filled + */ +static void +ice_write_dword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +{ + u32 src_dword, 
mask; + __le32 dest_dword; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src_ctx + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 32 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 5 bits so the shift will do nothing + */ + if (ce_info->width < 32) + mask = BIT(ce_info->width) - 1; + else + mask = (u32)~0; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_dword = *(u32 *)from; + src_dword &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_dword <<= shift_width; + + /* get the current bits from the target bit string */ + dest = dest_ctx + (ce_info->lsb / 8); + + memcpy(&dest_dword, dest, sizeof(dest_dword)); + + dest_dword &= ~(cpu_to_le32(mask)); /* get the bits not changing */ + dest_dword |= cpu_to_le32(src_dword); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_dword, sizeof(dest_dword)); +} + +/** + * ice_write_qword - write a qword to a packed context structure + * @src_ctx: the context structure to read from + * @dest_ctx: the context to be written to + * @ce_info: a description of the struct to be filled + */ +static void +ice_write_qword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +{ + u64 src_qword, mask; + __le64 dest_qword; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src_ctx + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 64 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 6 bits so the shift will do nothing + */ + if (ce_info->width < 64) + mask = BIT_ULL(ce_info->width) - 1; + else + mask = (u64)~0; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_qword = *(u64 *)from; + src_qword &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_qword <<= shift_width; + + /* get the current bits from the target bit string */ + dest = dest_ctx + (ce_info->lsb / 8); + + memcpy(&dest_qword, dest, sizeof(dest_qword)); + + dest_qword &= ~(cpu_to_le64(mask)); /* get the bits not changing */ + dest_qword |= cpu_to_le64(src_qword); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_qword, sizeof(dest_qword)); +} + +/** + * ice_set_ctx - set context bits in packed structure + * @hw: pointer to the hardware structure + * @src_ctx: pointer to a generic non-packed context structure + * @dest_ctx: pointer to memory for the packed structure + * @ce_info: a description of the structure to be transformed + */ +enum ice_status +ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, + const struct ice_ctx_ele *ce_info) +{ + int f; + + for (f = 0; ce_info[f].width; f++) { + /* We have to deal with each element of the FW response + * using the correct size so that we are correct regardless + * of the endianness of the machine. + */ + if (ce_info[f].width > (ce_info[f].size_of * BITS_PER_BYTE)) { + ice_debug(hw, ICE_DBG_QCTX, "Field %d width of %d bits larger than size of %d byte(s) ... 
skipping write\n", + f, ce_info[f].width, ce_info[f].size_of); + continue; + } + switch (ce_info[f].size_of) { + case sizeof(u8): + ice_write_byte(src_ctx, dest_ctx, &ce_info[f]); + break; + case sizeof(u16): + ice_write_word(src_ctx, dest_ctx, &ce_info[f]); + break; + case sizeof(u32): + ice_write_dword(src_ctx, dest_ctx, &ce_info[f]); + break; + case sizeof(u64): + ice_write_qword(src_ctx, dest_ctx, &ce_info[f]); + break; + default: + return ICE_ERR_INVAL_SIZE; + } + } + + return 0; +} + + +/** + * ice_print_sched_elem - parse through an element struct in a branch + * @hw: ice hardware struct + * @elem: element number in a branch + * @data: corresponding element info struct to extract data from + */ +static void +ice_print_sched_elem(struct ice_hw *hw, int elem, + struct ice_aqc_txsched_elem_data *data) +{ + struct ice_aqc_txsched_elem *d = &data->data; + unsigned long valid_sec = d->valid_sections; + char str[128]; + int i; + + dev_info(ice_hw_to_dev(hw), "\t\telement %d\n", elem); + dev_info(ice_hw_to_dev(hw), "\t\t\tparent TEID %d\n", + le32_to_cpu(data->parent_teid)); + dev_info(ice_hw_to_dev(hw), "\t\t\tnode TEID %d\n", + le32_to_cpu(data->node_teid)); + + switch (d->elem_type) { + case ICE_AQC_ELEM_TYPE_UNDEFINED: + snprintf(str, sizeof(str), "undefined"); + break; + case ICE_AQC_ELEM_TYPE_ROOT_PORT: + snprintf(str, sizeof(str), "root port"); + break; + case ICE_AQC_ELEM_TYPE_TC: + snprintf(str, sizeof(str), "tc"); + break; + case ICE_AQC_ELEM_TYPE_SE_GENERIC: + snprintf(str, sizeof(str), "se generic"); + break; + case ICE_AQC_ELEM_TYPE_ENTRY_POINT: + snprintf(str, sizeof(str), "sw entry point se"); + break; + case ICE_AQC_ELEM_TYPE_LEAF: + snprintf(str, sizeof(str), "leaf"); + break; + case ICE_AQC_ELEM_TYPE_SE_PADDED: + snprintf(str, sizeof(str), "se padded"); + break; + default: + snprintf(str, sizeof(str), "unknown"); + break; + } + dev_info(ice_hw_to_dev(hw), "\t\t\telement type %s\n", str); + + dev_info(ice_hw_to_dev(hw), "\t\t\tvalid sections\n"); + /* iterate through valid sections */ + for_each_set_bit(i, &valid_sec, ICE_SCHED_VALID_SEC_BITS) { + switch (BIT(i)) { + case ICE_AQC_ELEM_VALID_GENERIC: + snprintf(str, sizeof(str), "generic section"); + break; + case ICE_AQC_ELEM_VALID_CIR: + snprintf(str, sizeof(str), + "cir bw:profile id %d, weight %d", + le16_to_cpu(d->cir_bw.bw_profile_idx), + le16_to_cpu(d->cir_bw.bw_alloc)); + break; + case ICE_AQC_ELEM_VALID_EIR: + snprintf(str, sizeof(str), + "eir bw:profile id %d, weight %d", + le16_to_cpu(d->eir_bw.bw_profile_idx), + le16_to_cpu(d->eir_bw.bw_alloc)); + break; + case ICE_AQC_ELEM_VALID_SHARED: + snprintf(str, sizeof(str), + "shared bw: rl profile id %d", + le16_to_cpu(d->srl_id)); + break; + default: + snprintf(str, sizeof(str), "unknown"); + break; + } + dev_info(ice_hw_to_dev(hw), "\t\t\t\t%s\n", str); + } + + dev_info(ice_hw_to_dev(hw), "\t\t\tgeneric\n"); + snprintf(str, sizeof(str), "%s", + (d->generic & ICE_AQC_ELEM_GENERIC_MODE_M) ? 
"pss" : "bps"); + dev_info(ice_hw_to_dev(hw), "\t\t\t\tscheduling mode %s\n", str); + + dev_info(ice_hw_to_dev(hw), "\t\t\t\tpriority %d\n", + ((d->generic & ICE_AQC_ELEM_GENERIC_PRIO_M) >> ICE_AQC_ELEM_GENERIC_PRIO_S)); + + dev_info(ice_hw_to_dev(hw), "\t\t\t\tadjustment value %d\n", + (d->generic & ICE_AQC_ELEM_GENERIC_ADJUST_VAL_M) >> ICE_AQC_ELEM_GENERIC_ADJUST_VAL_S); +} + +/** + * ice_dump_port_dflt_topo - print scheduler tree topology for a port + * @pi: pointer to the port_info structure + */ +enum ice_status ice_dump_port_dflt_topo(struct ice_port_info *pi) +{ + struct ice_aqc_get_topo_elem *buf; + struct ice_hw *hw = pi->hw; + u16 j, buf_size, num_elem; + enum ice_status ret; + u8 i, num_branches; + + /* allocate memory for response buffer */ + buf_size = sizeof(*buf) * ICE_TXSCHED_MAX_BRANCHES; + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + ret = ice_aq_get_dflt_topo(hw, pi->lport, buf, buf_size, &num_branches, + NULL); + if (ret) { + ret = ICE_ERR_CFG; + goto err_exit; + } + + if (num_branches < 2 || num_branches > ICE_TXSCHED_MAX_BRANCHES) + dev_info(ice_hw_to_dev(hw), + "CHECK: num_branches unexpected %d\n", num_branches); + + dev_info(ice_hw_to_dev(hw), "scheduler tree topology for port %d\n", + pi->lport); + + /* iterate through all branches */ + for (i = 0; i < num_branches; i++) { + dev_info(ice_hw_to_dev(hw), "\tbranch %d\n", i); + num_elem = le16_to_cpu(buf[i].hdr.num_elems); + + if (num_elem < 2 || num_elem > ICE_AQC_TOPO_MAX_LEVEL_NUM) + dev_info(ice_hw_to_dev(hw), + "CHECK: num_elems unexpected %d\n", num_elem); + /* iterate through all elements in a branch */ + for (j = 0; j < num_elem; j++) + ice_print_sched_elem(hw, j, &buf[i].generic[j]); + } + +err_exit: + devm_kfree(ice_hw_to_dev(hw), buf); + return ret; +} + +/** + * ice_sched_print_tree - prints the node information + * @hw: pointer to the HW struct + * @node: pointer to the node + * + * This function prints the node and all its children information + */ +static void ice_sched_print_tree(struct ice_hw *hw, struct ice_sched_node *node) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + enum ice_status status; + u8 i; + + if (!node) + return; + for (i = 0; i < node->num_children; i++) + ice_sched_print_tree(hw, node->children[i]); + dev_info(ice_hw_to_dev(hw), + "Node Layer: 0x%x Node TEID: 0x%x Parent TEID: 0x%x num_children: %d tc_num :0x%x\n", + node->tx_sched_layer, le32_to_cpu(node->info.node_teid), + le32_to_cpu(node->info.parent_teid), node->num_children, + node->tc_num); + /* print the current RL values and suspend state */ + status = ice_sched_query_elem(hw, le32_to_cpu(node->info.node_teid), + &buf); + if (status) + return; + data = &buf.data; + dev_info(ice_hw_to_dev(hw), "elem type 0x%x valid sec 0x%x\n", + data->elem_type, data->valid_sections); + dev_info(ice_hw_to_dev(hw), "generic 0x%x Flags 0x%x\n", + data->generic, data->flags); + dev_info(ice_hw_to_dev(hw), + "BW Profile: CIR id 0x%x alloc 0x%x EIR id 0x%x alloc 0x%x SRL id alloc 0x%x\n", + le16_to_cpu(data->cir_bw.bw_profile_idx), + le16_to_cpu(data->cir_bw.bw_alloc), + le16_to_cpu(data->eir_bw.bw_profile_idx), + le16_to_cpu(data->eir_bw.bw_alloc), + le16_to_cpu(data->srl_id)); +} + +/** + * ice_dump_port_topo - prints the tree topology of the port + * @pi: port information structure + * + * This function prints the tree topology of the port + */ +void ice_dump_port_topo(struct ice_port_info *pi) +{ + if (!pi || pi->port_state != 
ICE_SCHED_PORT_STATE_READY) + return; + + mutex_lock(&pi->sched_lock); + ice_sched_print_tree(pi->hw, pi->root); + mutex_unlock(&pi->sched_lock); +} + +/** + * ice_dump_common_caps - print struct ice_hw_common_caps fields + * @hw: pointer to the ice_hw instance + * @caps: pointer to common caps instance + * @prefix: string to prefix when printing + */ +static void +ice_dump_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, + char const *prefix) +{ + dev_info(ice_hw_to_dev(hw), "%s: switching_mode = %d\n", prefix, + caps->switching_mode); + dev_info(ice_hw_to_dev(hw), "%s: mgmt_mode = %d\n", prefix, + caps->mgmt_mode); + dev_info(ice_hw_to_dev(hw), "%s: mgmt_protocols_mctp = %d\n", prefix, + caps->mgmt_protocols_mctp); + dev_info(ice_hw_to_dev(hw), "%s: os2bmc = %d\n", prefix, caps->os2bmc); + dev_info(ice_hw_to_dev(hw), "%s: valid_functions (bitmap) = %d\n", + prefix, caps->valid_functions); + dev_info(ice_hw_to_dev(hw), "%s: sr_iov_1_1 = %d\n", prefix, + caps->sr_iov_1_1); + dev_info(ice_hw_to_dev(hw), "%s: vmdq = %d\n", prefix, caps->vmdq); + dev_info(ice_hw_to_dev(hw), "%s: evb_802_1_qbg = %d\n", prefix, + caps->evb_802_1_qbg); + dev_info(ice_hw_to_dev(hw), "%s: evb_802_1_qbh = %d\n", prefix, + caps->evb_802_1_qbh); + dev_info(ice_hw_to_dev(hw), "%s: dcb = %d\n", prefix, caps->dcb); + dev_info(ice_hw_to_dev(hw), "%s: active_tc_bitmap = %d\n", prefix, + caps->active_tc_bitmap); + dev_info(ice_hw_to_dev(hw), "%s: maxtc = %d\n", prefix, caps->maxtc); + dev_info(ice_hw_to_dev(hw), "%s: iscsi = %d\n", prefix, caps->iscsi); + dev_info(ice_hw_to_dev(hw), "%s: rss_table_size = %d\n", prefix, + caps->rss_table_size); + dev_info(ice_hw_to_dev(hw), "%s: rss_table_entry_width = %d\n", + prefix, caps->rss_table_entry_width); + dev_info(ice_hw_to_dev(hw), "%s: num_rxq = %d\n", prefix, + caps->num_rxq); + dev_info(ice_hw_to_dev(hw), "%s: rxq_first_id = %d\n", prefix, + caps->rxq_first_id); + dev_info(ice_hw_to_dev(hw), "%s: num_txq = %d\n", prefix, + caps->num_txq); + dev_info(ice_hw_to_dev(hw), "%s: txq_first_id = %d\n", prefix, + caps->txq_first_id); + dev_info(ice_hw_to_dev(hw), "%s: num_msix_vectors = %d\n", prefix, + caps->num_msix_vectors); + dev_info(ice_hw_to_dev(hw), "%s: msix_vector_first_id = %d\n", prefix, + caps->msix_vector_first_id); + dev_info(ice_hw_to_dev(hw), "%s: ieee_1588 = %d\n", prefix, + caps->ieee_1588); + dev_info(ice_hw_to_dev(hw), "%s: mgmt_cem = %d\n", prefix, + caps->mgmt_cem); + dev_info(ice_hw_to_dev(hw), "%s: iwarp = %d\n", prefix, caps->iwarp); + dev_info(ice_hw_to_dev(hw), "%s: wr_csr_prot = 0x%llX\n", prefix, + (unsigned long long)caps->wr_csr_prot); + dev_info(ice_hw_to_dev(hw), "%s: num_wol_proxy_fltr = %d\n", prefix, + caps->num_wol_proxy_fltr); + dev_info(ice_hw_to_dev(hw), "%s: wol_proxy_vsi_seid = %d\n", prefix, + caps->wol_proxy_vsi_seid); + dev_info(ice_hw_to_dev(hw), "%s: max_mtu = %d\n", prefix, + caps->max_mtu); + ice_print_led_caps(hw, caps, prefix, false); + ice_print_sdp_caps(hw, caps, prefix, false); +} + +/** + * ice_dump_func_caps - Dump function capabilities + * @hw: pointer to the ice_hw instance + * @func_caps: pointer to function capabilities struct + */ +static void +ice_dump_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_caps) +{ + char const *prefix = "func cap"; + + ice_dump_common_caps(hw, &func_caps->common_cap, prefix); + dev_info(ice_hw_to_dev(hw), "%s: num_allocd_vfs = %d\n", prefix, + func_caps->num_allocd_vfs); + dev_info(ice_hw_to_dev(hw), "%s: vf_base_id = %d\n", prefix, + func_caps->vf_base_id); + 
dev_info(ice_hw_to_dev(hw), "%s: guar_num_vsi = %d\n", prefix, + func_caps->guar_num_vsi); + dev_info(ice_hw_to_dev(hw), "%s: fd_fltr_guar = %d\n", prefix, + func_caps->fd_fltr_guar); + dev_info(ice_hw_to_dev(hw), "%s: fd_fltr_best_effort = %d\n", prefix, + func_caps->fd_fltr_best_effort); +} + +/** + * ice_dump_dev_caps - Dump device capabilities + * @hw: pointer to the ice_hw instance + * @dev_caps: pointer to device capabilities struct + */ +static void +ice_dump_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_caps) +{ + char const *prefix = "dev cap"; + + ice_dump_common_caps(hw, &dev_caps->common_cap, prefix); + dev_info(ice_hw_to_dev(hw), "%s: num_vfs_exposed = %d\n", prefix, + dev_caps->num_vfs_exposed); + dev_info(ice_hw_to_dev(hw), "%s: num_vsi_allocd_to_host = %d\n", + prefix, dev_caps->num_vsi_allocd_to_host); + dev_info(ice_hw_to_dev(hw), "%s: num_flow_director_fltr = %d\n", + prefix, dev_caps->num_flow_director_fltr); +} + +/** + * ice_dump_ptp_func_caps - Dump function PTP capabilities + * @hw: pointer to the ice_hw instance + */ +void ice_dump_ptp_func_caps(struct ice_hw *hw) +{ + struct ice_ts_func_info *ptpfunc = &hw->func_caps.ts_func_info; + + dev_info(ice_hw_to_dev(hw), "PTP func cap: ena = %d\n", ptpfunc->ena); + dev_info(ice_hw_to_dev(hw), "PTP func cap: src_tmr_owned = %d\n", + ptpfunc->src_tmr_owned); + dev_info(ice_hw_to_dev(hw), "PTP func cap: tmr_ena = %d\n", + ptpfunc->tmr_ena); + dev_info(ice_hw_to_dev(hw), "PTP func cap: tmr_index_owned = %d\n", + ptpfunc->tmr_index_owned); + dev_info(ice_hw_to_dev(hw), "PTP func cap: clk_freq = %d\n", + ptpfunc->clk_freq); + dev_info(ice_hw_to_dev(hw), "PTP func cap: clk_src = %d\n", + ptpfunc->clk_src); + dev_info(ice_hw_to_dev(hw), "PTP func cap: tmr_index_assoc = %d\n", + ptpfunc->tmr_index_assoc); +} + +/** + * ice_dump_ptp_dev_caps - Dump device PTP capabilities + * @hw: pointer to the ice_hw instance + */ +void ice_dump_ptp_dev_caps(struct ice_hw *hw) +{ + struct ice_ts_dev_info *ptpdev = &hw->dev_caps.ts_dev_info; + + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr0_owner = %d\n", + ptpdev->tmr0_owner); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr0_owned = %d\n", + ptpdev->tmr0_owned); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr1_owner = %d\n", + ptpdev->tmr1_owner); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr1_owned = %d\n", + ptpdev->tmr1_owned); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: ena = %d\n", ptpdev->ena); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr0_ena = %d\n", + ptpdev->tmr0_ena); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr1_ena = %d\n", + ptpdev->tmr1_ena); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: ena_ports(bitmap) = %d\n", + ptpdev->ena_ports); + dev_info(ice_hw_to_dev(hw), "PTP dev cap: tmr_own_map = %d\n", + ptpdev->tmr_own_map); +} + +/** + * ice_dump_caps - Dump a list of capabilities + * @hw: pointer to the ice_hw instance + */ +void ice_dump_caps(struct ice_hw *hw) +{ + ice_dump_dev_caps(hw, &hw->dev_caps); + ice_dump_func_caps(hw, &hw->func_caps); +} + +/** + * ice_dump_port_info - Dump data from the port_info array + * @pi: pointer to the port_info structure + */ +void ice_dump_port_info(struct ice_port_info *pi) +{ + dev_info(ice_hw_to_dev(pi->hw), "\tvirt_port = %d\n", pi->lport); + + dev_info(ice_hw_to_dev(pi->hw), "\tswid = %d\n", pi->sw_id); + dev_info(ice_hw_to_dev(pi->hw), "\tdflt_tx_vsi = %d\n", + pi->dflt_tx_vsi_num); + dev_info(ice_hw_to_dev(pi->hw), "\tdflt_rx_vsi = %d\n", + pi->dflt_rx_vsi_num); + dev_info(ice_hw_to_dev(pi->hw), "\t%s_num = %d\n", + 
(pi->is_vf ? "vf" : "pf"), pi->pf_vf_num); + dev_info(ice_hw_to_dev(pi->hw), "\tlast_node_teid = %d\n", + pi->last_node_teid); + dev_info(ice_hw_to_dev(pi->hw), "\tmedia_type = %d\n", + pi->phy.media_type); + + dev_info(ice_hw_to_dev(pi->hw), "\tmac_addr: %pM\n", pi->mac.lan_addr); +} + + + +/** + * ice_get_lan_q_ctx - get the LAN queue context for the given VSI and TC + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * @tc: TC number + * @q_handle: software queue handle + */ +struct ice_q_ctx * +ice_get_lan_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 q_handle) +{ + struct ice_vsi_ctx *vsi; + struct ice_q_ctx *q_ctx; + + vsi = ice_get_vsi_ctx(hw, vsi_handle); + if (!vsi) + return NULL; + if (q_handle >= vsi->num_lan_q_entries[tc]) + return NULL; + if (!vsi->lan_q_ctx[tc]) + return NULL; + q_ctx = vsi->lan_q_ctx[tc]; + return &q_ctx[q_handle]; +} + +/** + * ice_ena_vsi_txq + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: TC number + * @q_handle: software queue handle + * @num_qgrps: Number of added queue groups + * @buf: list of queue groups to be added + * @buf_size: size of buffer for indirect command + * @cd: pointer to command details structure or NULL + * + * This function adds one LAN queue + */ +enum ice_status +ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, + u8 num_qgrps, struct ice_aqc_add_tx_qgrp *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_txsched_elem_data node = { 0 }; + struct ice_sched_node *parent; + struct ice_q_ctx *q_ctx; + enum ice_status status; + struct ice_hw *hw; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + if (num_qgrps > 1 || buf->num_txqs > 1) + return ICE_ERR_MAX_LIMIT; + + hw = pi->hw; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + + q_ctx = ice_get_lan_q_ctx(hw, vsi_handle, tc, q_handle); + if (!q_ctx) { + ice_debug(hw, ICE_DBG_SCHED, "Enaq: invalid queue handle %d\n", + q_handle); + status = ICE_ERR_PARAM; + goto ena_txq_exit; + } + + /* find a parent node */ + parent = ice_sched_get_free_qparent(pi, vsi_handle, tc, + ICE_SCHED_NODE_OWNER_LAN); + if (!parent) { + status = ICE_ERR_PARAM; + goto ena_txq_exit; + } + + buf->parent_teid = parent->info.node_teid; + node.parent_teid = parent->info.node_teid; + /* Mark the values in the "generic" section as valid. The default + * value in the "generic" section is zero. This means that: + * - Scheduling mode is Bytes Per Second (BPS), indicated by Bit 0. + * - 0 priority among siblings, indicated by Bit 1-3. + * - WFQ, indicated by Bit 4. + * - 0 Adjustment value is used in PSM credit update flow, indicated by + * Bit 5-6. + * - Bit 7 is reserved. + * Without setting the generic section as valid in valid_sections, the + * Admin queue command will fail with error code ICE_AQ_RC_EINVAL. 
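+ *
+ * As a compact restatement of the same byte (bit 0 is the LSB):
+ *	bit 0      scheduling mode (0 = BPS)
+ *	bits 1-3   priority among siblings (0)
+ *	bit 4      WFQ (0)
+ *	bits 5-6   PSM credit update adjustment value (0)
+ *	bit 7      reserved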
+ */ + buf->txqs[0].info.valid_sections = + ICE_AQC_ELEM_VALID_GENERIC | ICE_AQC_ELEM_VALID_CIR | + ICE_AQC_ELEM_VALID_EIR; + buf->txqs[0].info.generic = 0; + buf->txqs[0].info.cir_bw.bw_profile_idx = + cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->txqs[0].info.cir_bw.bw_alloc = + cpu_to_le16(ICE_SCHED_DFLT_BW_WT); + buf->txqs[0].info.eir_bw.bw_profile_idx = + cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->txqs[0].info.eir_bw.bw_alloc = + cpu_to_le16(ICE_SCHED_DFLT_BW_WT); + + /* add the LAN queue */ + status = ice_aq_add_lan_txq(hw, num_qgrps, buf, buf_size, cd); + if (status) { + ice_debug(hw, ICE_DBG_SCHED, "enable queue %d failed %d\n", + le16_to_cpu(buf->txqs[0].txq_id), + hw->adminq.sq_last_status); + goto ena_txq_exit; } - if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL) { - flags |= ((glob_lut_idx << ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_S) & - ICE_AQC_GSET_RSS_LUT_GLOBAL_IDX_M); + node.node_teid = buf->txqs[0].q_teid; + node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF; + q_ctx->q_handle = q_handle; + q_ctx->q_teid = le32_to_cpu(node.node_teid); + + /* add a leaf node into scheduler tree queue layer */ + status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1, &node); + if (!status) + status = ice_sched_replay_q_bw(pi, q_ctx); + +ena_txq_exit: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_dis_vsi_txq + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: TC number + * @num_queues: number of queues + * @q_handles: pointer to software queue handle array + * @q_ids: pointer to the q_id array + * @q_teids: pointer to queue node teids + * @rst_src: if called due to reset, specifies the reset source + * @vmvf_num: the relative VM or VF number that is undergoing the reset + * @cd: pointer to command details structure or NULL + * + * This function removes queues and their corresponding nodes in SW DB + */ +enum ice_status +ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues, + u16 *q_handles, u16 *q_ids, u32 *q_teids, + enum ice_disq_rst_src rst_src, u16 vmvf_num, + struct ice_sq_cd *cd) +{ + enum ice_status status = ICE_ERR_DOES_NOT_EXIST; + struct ice_aqc_dis_txq_item *qg_list; + struct ice_q_ctx *q_ctx; + struct ice_hw *hw; + u16 i, buf_size; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + hw = pi->hw; + + if (!num_queues) { + /* if queue is disabled already yet the disable queue command + * has to be sent to complete the VF reset, then call + * ice_aq_dis_lan_txq without any queue information + */ + if (rst_src) + return ice_aq_dis_lan_txq(hw, 0, NULL, 0, rst_src, + vmvf_num, NULL); + return ICE_ERR_CFG; + } + + buf_size = struct_size(qg_list, q_id, 1); + qg_list = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); + if (!qg_list) + return ICE_ERR_NO_MEMORY; + + mutex_lock(&pi->sched_lock); + + for (i = 0; i < num_queues; i++) { + struct ice_sched_node *node; + + node = ice_sched_find_node_by_teid(pi->root, q_teids[i]); + if (!node) + continue; + q_ctx = ice_get_lan_q_ctx(hw, vsi_handle, tc, q_handles[i]); + if (!q_ctx) { + ice_debug(hw, ICE_DBG_SCHED, "invalid queue handle%d\n", + q_handles[i]); + continue; + } + if (q_ctx->q_handle != q_handles[i]) { + ice_debug(hw, ICE_DBG_SCHED, "Err:handles %d %d\n", + q_ctx->q_handle, q_handles[i]); + continue; + } + qg_list->parent_teid = node->info.parent_teid; + qg_list->num_qs = 1; + qg_list->q_id[0] = cpu_to_le16(q_ids[i]); + status = ice_aq_dis_lan_txq(hw, 1, qg_list, buf_size, rst_src, + vmvf_num, cd); + + if (status) + 
break; + ice_free_sched_node(pi, node); + q_ctx->q_handle = ICE_INVAL_Q_HANDLE; + } + mutex_unlock(&pi->sched_lock); + devm_kfree(ice_hw_to_dev(hw), qg_list); + return status; +} + +/** + * ice_cfg_vsi_qs - configure the new/existing VSI queues + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap + * @maxqs: max queues array per TC + * @owner: LAN or RDMA + * + * This function adds/updates the VSI queues per TC. + */ +static enum ice_status +ice_cfg_vsi_qs(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *maxqs, u8 owner) +{ + enum ice_status status = 0; + u8 i; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + + ice_for_each_traffic_class(i) { + /* configuration is possible only if TC node is present */ + if (!ice_sched_get_tc_node(pi, i)) + continue; + + status = ice_sched_cfg_vsi(pi, vsi_handle, i, maxqs[i], owner, + ice_is_tc_ena(tc_bitmap, i)); + if (status) + break; + } + + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_vsi_lan - configure VSI LAN queues + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap + * @max_lanqs: max LAN queues array per TC + * + * This function adds/updates the VSI LAN queues per TC. + */ +enum ice_status +ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *max_lanqs) +{ + return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_lanqs, + ICE_SCHED_NODE_OWNER_LAN); +} + +/** + * ice_cfg_vsi_rdma - configure the VSI RDMA queues + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap + * @max_rdmaqs: max RDMA queues array per TC + * + * This function adds/updates the VSI RDMA queues per TC. 
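A minimal caller sketch for the per-TC LAN wrapper above, assuming a ready port_info and a valid software VSI handle (my_pi, my_vsi_handle and the queue count are illustrative, not from the patch):

	u16 max_lanqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
	enum ice_status status;

	max_lanqs[0] = 16;	/* 16 LAN queues on TC 0 */
	status = ice_cfg_vsi_lan(my_pi, my_vsi_handle, BIT(0), max_lanqs);
	if (status)
		return status;	/* scheduler tree could not be updated */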
+ */ +enum ice_status +ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *max_rdmaqs) +{ + return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_rdmaqs, + ICE_SCHED_NODE_OWNER_RDMA); +} - if (!set) - goto ice_aq_get_set_rss_lut_send; - } else if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF) { - if (!set) - goto ice_aq_get_set_rss_lut_send; - } else { - goto ice_aq_get_set_rss_lut_send; +/** + * ice_ena_vsi_rdma_qset + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: TC number + * @rdma_qset: pointer to RDMA qset + * @num_qsets: number of RDMA qsets + * @qset_teid: pointer to qset node teids + * + * This function adds RDMA qset + */ +enum ice_status +ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 *rdma_qset, u16 num_qsets, u32 *qset_teid) +{ + struct ice_aqc_txsched_elem_data node = { 0 }; + struct ice_aqc_add_rdma_qset_data *buf; + struct ice_sched_node *parent; + enum ice_status status; + struct ice_hw *hw; + u16 i, buf_size; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + hw = pi->hw; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + buf_size = struct_size(buf, rdma_qsets, num_qsets); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + mutex_lock(&pi->sched_lock); + + parent = ice_sched_get_free_qparent(pi, vsi_handle, tc, + ICE_SCHED_NODE_OWNER_RDMA); + if (!parent) { + status = ICE_ERR_PARAM; + goto rdma_error_exit; } + buf->parent_teid = parent->info.node_teid; + node.parent_teid = parent->info.node_teid; - /* LUT size is only valid for Global and PF table types */ - switch (lut_size) { - case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_128: - break; - case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512: - flags |= (ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512_FLAG << - ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) & - ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M; - break; - case ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K: - if (lut_type == ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF) { - flags |= (ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_2K_FLAG << - ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_S) & - ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_M; + buf->num_qsets = cpu_to_le16(num_qsets); + for (i = 0; i < num_qsets; i++) { + buf->rdma_qsets[i].tx_qset_id = cpu_to_le16(rdma_qset[i]); + buf->rdma_qsets[i].info.valid_sections = + ICE_AQC_ELEM_VALID_GENERIC | ICE_AQC_ELEM_VALID_CIR | + ICE_AQC_ELEM_VALID_EIR; + buf->rdma_qsets[i].info.generic = 0; + buf->rdma_qsets[i].info.cir_bw.bw_profile_idx = + cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->rdma_qsets[i].info.cir_bw.bw_alloc = + cpu_to_le16(ICE_SCHED_DFLT_BW_WT); + buf->rdma_qsets[i].info.eir_bw.bw_profile_idx = + cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->rdma_qsets[i].info.eir_bw.bw_alloc = + cpu_to_le16(ICE_SCHED_DFLT_BW_WT); + } + status = ice_aq_add_rdma_qsets(hw, 1, buf, buf_size, NULL); + if (status) { + ice_debug(hw, ICE_DBG_RDMA, "add RDMA qset failed\n"); + goto rdma_error_exit; + } + node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF; + for (i = 0; i < num_qsets; i++) { + node.node_teid = buf->rdma_qsets[i].qset_teid; + status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1, + &node); + if (status) break; - } - /* fall-through */ - default: - status = ICE_ERR_PARAM; - goto ice_aq_get_set_rss_lut_exit; + qset_teid[i] = le32_to_cpu(node.node_teid); } +rdma_error_exit: + mutex_unlock(&pi->sched_lock); + devm_kfree(ice_hw_to_dev(hw), buf); + return status; +} -ice_aq_get_set_rss_lut_send: - cmd_resp->flags = 
cpu_to_le16(flags); - status = ice_aq_send_cmd(hw, &desc, lut, lut_size, NULL); +/** + * ice_dis_vsi_rdma_qset - free RDMA resources + * @pi: port_info struct + * @count: number of RDMA qsets to free + * @qset_teid: TEID of qset node + * @q_id: list of queue IDs being disabled + */ +enum ice_status +ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid, + u16 *q_id) +{ + struct ice_aqc_dis_txq_item *qg_list; + enum ice_status status = 0; + struct ice_hw *hw; + u16 qg_size; + int i; -ice_aq_get_set_rss_lut_exit: + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + hw = pi->hw; + + qg_size = struct_size(qg_list, q_id, 1); + qg_list = devm_kzalloc(ice_hw_to_dev(hw), qg_size, GFP_KERNEL); + if (!qg_list) + return ICE_ERR_NO_MEMORY; + + mutex_lock(&pi->sched_lock); + + for (i = 0; i < count; i++) { + struct ice_sched_node *node; + + node = ice_sched_find_node_by_teid(pi->root, qset_teid[i]); + if (!node) + continue; + + qg_list->parent_teid = node->info.parent_teid; + qg_list->num_qs = 1; + qg_list->q_id[0] = + cpu_to_le16(q_id[i] | + ICE_AQC_Q_DIS_BUF_ELEM_TYPE_RDMA_QSET); + + status = ice_aq_dis_lan_txq(hw, 1, qg_list, qg_size, + ICE_NO_RESET, 0, NULL); + if (status) + break; + + ice_free_sched_node(pi, node); + } + + mutex_unlock(&pi->sched_lock); + devm_kfree(ice_hw_to_dev(hw), qg_list); return status; } + /** - * ice_aq_get_rss_lut - * @hw: pointer to the hardware structure - * @vsi_handle: software VSI handle - * @lut_type: LUT table type - * @lut: pointer to the LUT buffer provided by the caller - * @lut_size: size of the LUT buffer + * ice_is_main_vsi - checks whether the VSI is main VSI + * @hw: pointer to the HW struct + * @vsi_handle: VSI handle * - * get the RSS lookup table, PF or VSI type + * Checks whether the VSI is the main VSI (the first PF VSI created on + * the given PF). */ -enum ice_status -ice_aq_get_rss_lut(struct ice_hw *hw, u16 vsi_handle, u8 lut_type, - u8 *lut, u16 lut_size) +static bool ice_is_main_vsi(struct ice_hw *hw, u16 vsi_handle) { - if (!ice_is_vsi_valid(hw, vsi_handle) || !lut) - return ICE_ERR_PARAM; + return vsi_handle == ICE_MAIN_VSI_HANDLE && hw->vsi_ctx[vsi_handle]; +} + + +/** + * ice_replay_pre_init - replay pre initialization + * @hw: pointer to the HW struct + * @sw: pointer to switch info struct for which function initializes filters + * + * Initializes required config data for VSI, FD, ACL, and RSS before replay. + */ +static enum ice_status +ice_replay_pre_init(struct ice_hw *hw, struct ice_switch_info *sw) +{ + enum ice_status status; + u8 i; + + /* Delete old entries from replay filter list head if there is any */ + ice_rm_sw_replay_rule_info(hw, sw); + /* At the start of replay, move entries into the replay_rules list; + * this allows adding rule entries back to the filt_rules list, + * which is the operational list.
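A hedged sketch of the qset lifecycle built from the two wrappers above, adding one RDMA qset on TC 0 and tearing it down again; my_pi, my_vsi_handle and qset_id are illustrative:

	u16 qset_id = 0;
	u32 qset_teid;
	enum ice_status status;

	status = ice_ena_vsi_rdma_qset(my_pi, my_vsi_handle, 0, &qset_id, 1,
				       &qset_teid);
	if (!status)
		status = ice_dis_vsi_rdma_qset(my_pi, 1, &qset_teid, &qset_id);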
+ */ + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) + list_replace_init(&sw->recp_list[i].filt_rules, + &sw->recp_list[i].filt_replay_rules); + ice_sched_replay_agg_vsi_preinit(hw); + + status = ice_sched_replay_root_node_bw(hw->port_info); + if (status) + return status; - return __ice_aq_get_set_rss_lut(hw, ice_get_hw_vsi_num(hw, vsi_handle), - lut_type, lut, lut_size, 0, false); + return ice_sched_replay_tc_node_bw(hw->port_info); } /** - * ice_aq_set_rss_lut - * @hw: pointer to the hardware structure - * @vsi_handle: software VSI handle - * @lut_type: LUT table type - * @lut: pointer to the LUT buffer provided by the caller - * @lut_size: size of the LUT buffer + * ice_replay_vsi - replay VSI configuration + * @hw: pointer to the HW struct + * @vsi_handle: driver VSI handle * - * set the RSS lookup table, PF or VSI type + * Restore all VSI configuration after reset. It is required to call this + * function with main VSI first. */ -enum ice_status -ice_aq_set_rss_lut(struct ice_hw *hw, u16 vsi_handle, u8 lut_type, - u8 *lut, u16 lut_size) +enum ice_status ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle) { - if (!ice_is_vsi_valid(hw, vsi_handle) || !lut) + struct ice_switch_info *sw = hw->switch_info; + struct ice_port_info *pi = hw->port_info; + enum ice_status status; + + if (!ice_is_vsi_valid(hw, vsi_handle)) return ICE_ERR_PARAM; - return __ice_aq_get_set_rss_lut(hw, ice_get_hw_vsi_num(hw, vsi_handle), - lut_type, lut, lut_size, 0, true); + /* Replay pre-initialization if there is any */ + if (ice_is_main_vsi(hw, vsi_handle)) { + status = ice_replay_pre_init(hw, sw); + if (status) + return status; + } + /* Replay per VSI all RSS configurations */ + status = ice_replay_rss_cfg(hw, vsi_handle); + if (status) + return status; + /* Replay per VSI all filters */ + status = ice_replay_vsi_all_fltr(hw, pi, vsi_handle); + if (!status) + status = ice_replay_vsi_agg(hw, vsi_handle); + return status; } /** - * __ice_aq_get_set_rss_key + * ice_replay_post - post replay configuration cleanup * @hw: pointer to the HW struct - * @vsi_id: VSI FW index - * @key: pointer to key info struct - * @set: set true to set the key, false to get the key * - * get (0x0B04) or set (0x0B02) the RSS key per VSI + * Post replay cleanup. */ -static enum -ice_status __ice_aq_get_set_rss_key(struct ice_hw *hw, u16 vsi_id, - struct ice_aqc_get_set_rss_keys *key, - bool set) +void ice_replay_post(struct ice_hw *hw) { - struct ice_aqc_get_set_rss_key *cmd_resp; - u16 key_size = sizeof(*key); - struct ice_aq_desc desc; + /* Delete old entries from replay filter list head */ + ice_rm_all_sw_replay_rule_info(hw); + ice_sched_replay_agg(hw); +} - cmd_resp = &desc.params.get_set_rss_key; +/** + * ice_stat_update40 - read 40 bit stat from the chip and update stat values + * @hw: ptr to the hardware info + * @reg: offset of 64 bit HW register to read from + * @prev_stat_loaded: bool to specify if previous stats are loaded + * @prev_stat: ptr to previous loaded stat value + * @cur_stat: ptr to current stat value + */ +void +ice_stat_update40(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, + u64 *prev_stat, u64 *cur_stat) +{ + u64 new_data = rd64(hw, reg) & (BIT_ULL(40) - 1); - if (set) { - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_key); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - } else { - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_key); + /* device stats are not reset at PFR, they likely will not be zeroed + * when the driver starts. 
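Because ice_replay_pre_init() only runs for the main VSI, the post-reset call order matters; a hedged sketch of that ordering (handle and num_vsis are illustrative loop bookkeeping, not driver fields):

	/* main VSI first, then every other valid VSI, then cleanup */
	status = ice_replay_vsi(hw, ICE_MAIN_VSI_HANDLE);
	for (handle = 0; !status && handle < num_vsis; handle++)
		if (handle != ICE_MAIN_VSI_HANDLE &&
		    ice_is_vsi_valid(hw, handle))
			status = ice_replay_vsi(hw, handle);
	ice_replay_post(hw);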
Thus, save the value from the first read + * without adding to the statistic value so that we report stats which + * count up from zero. + */ + if (!prev_stat_loaded) { + *prev_stat = new_data; + return; + } + + /* Calculate the difference between the new and old values, and then + * add it to the software stat value. + */ + if (new_data >= *prev_stat) + *cur_stat += new_data - *prev_stat; + else + /* to manage the potential roll-over */ + *cur_stat += (new_data + BIT_ULL(40)) - *prev_stat; + + /* Update the previously stored value to prepare for next read */ + *prev_stat = new_data; +} + +/** + * ice_stat_update32 - read 32 bit stat from the chip and update stat values + * @hw: ptr to the hardware info + * @reg: offset of HW register to read from + * @prev_stat_loaded: bool to specify if previous stats are loaded + * @prev_stat: ptr to previous loaded stat value + * @cur_stat: ptr to current stat value + */ +void +ice_stat_update32(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, + u64 *prev_stat, u64 *cur_stat) +{ + u32 new_data; + + new_data = rd32(hw, reg); + + /* device stats are not reset at PFR, they likely will not be zeroed + * when the driver starts. Thus, save the value from the first read + * without adding to the statistic value so that we report stats which + * count up from zero. + */ + if (!prev_stat_loaded) { + *prev_stat = new_data; + return; } - cmd_resp->vsi_id = cpu_to_le16(((vsi_id << - ICE_AQC_GSET_RSS_KEY_VSI_ID_S) & - ICE_AQC_GSET_RSS_KEY_VSI_ID_M) | - ICE_AQC_GSET_RSS_KEY_VSI_VALID); + /* Calculate the difference between the new and old values, and then + * add it to the software stat value. + */ + if (new_data >= *prev_stat) + *cur_stat += new_data - *prev_stat; + else + /* to manage the potential roll-over */ + *cur_stat += (new_data + BIT_ULL(32)) - *prev_stat; - return ice_aq_send_cmd(hw, &desc, key, key_size, NULL); + /* Update the previously stored value to prepare for next read */ + *prev_stat = new_data; } + + /** - * ice_aq_get_rss_key + * ice_sched_query_elem - query element information from HW * @hw: pointer to the HW struct - * @vsi_handle: software VSI handle - * @key: pointer to key info struct + * @node_teid: node TEID to be queried + * @buf: buffer to element information * - * get the RSS key per VSI + * This function queries HW element information */ enum ice_status -ice_aq_get_rss_key(struct ice_hw *hw, u16 vsi_handle, - struct ice_aqc_get_set_rss_keys *key) +ice_sched_query_elem(struct ice_hw *hw, u32 node_teid, + struct ice_aqc_txsched_elem_data *buf) { - if (!ice_is_vsi_valid(hw, vsi_handle) || !key) - return ICE_ERR_PARAM; + u16 buf_size, num_elem_ret = 0; + enum ice_status status; - return __ice_aq_get_set_rss_key(hw, ice_get_hw_vsi_num(hw, vsi_handle), - key, false); + buf_size = sizeof(*buf); + memset(buf, 0, buf_size); + buf->node_teid = cpu_to_le32(node_teid); + status = ice_aq_query_sched_elems(hw, 1, buf, buf_size, &num_elem_ret, + NULL); + if (status || num_elem_ret != 1) + ice_debug(hw, ICE_DBG_SCHED, "query element failed\n"); + return status; } + /** - * ice_aq_set_rss_key + * ice_get_fw_mode - returns FW mode * @hw: pointer to the HW struct - * @vsi_handle: software VSI handle - * @keys: pointer to key info struct - * - * set the RSS key per VSI */ -enum ice_status -ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle, - struct ice_aqc_get_set_rss_keys *keys) +enum ice_fw_modes ice_get_fw_mode(struct ice_hw *hw) { - if (!ice_is_vsi_valid(hw, vsi_handle) || !keys) - return ICE_ERR_PARAM; - - return __ice_aq_get_set_rss_key(hw, 
ice_get_hw_vsi_num(hw, vsi_handle), - keys, true); +#define ICE_FW_MODE_DBG_M BIT(0) +#define ICE_FW_MODE_REC_M BIT(1) +#define ICE_FW_MODE_ROLLBACK_M BIT(2) + u32 fw_mode; + + /* check the current FW mode */ + fw_mode = rd32(hw, GL_MNG_FWSM) & GL_MNG_FWSM_FW_MODES_M; + + if (fw_mode & ICE_FW_MODE_DBG_M) + return ICE_FW_MODE_DBG; + else if (fw_mode & ICE_FW_MODE_REC_M) + return ICE_FW_MODE_REC; + else if (fw_mode & ICE_FW_MODE_ROLLBACK_M) + return ICE_FW_MODE_ROLLBACK; + else + return ICE_FW_MODE_NORMAL; } + /** - * ice_aq_add_lan_txq - * @hw: pointer to the hardware structure - * @num_qgrps: Number of added queue groups - * @qg_list: list of queue groups to be added - * @buf_size: size of buffer for indirect command + * ice_aq_read_i2c + * @hw: pointer to the hw struct + * @topo_addr: topology address for a device to communicate with + * @bus_addr: 7-bit I2C bus address + * @addr: I2C memory address (I2C offset) with up to 16 bits + * @params: I2C parameters: bit [7] - Repeated start, bits [6:5] data offset size, + * bit [4] - I2C address type, bits [3:0] - data size to read (0-16 bytes) + * @data: pointer to data (0 to 16 bytes) to be read from the I2C device * @cd: pointer to command details structure or NULL * - * Add Tx LAN queue (0x0C30) - * - * NOTE: - * Prior to calling add Tx LAN queue: - * Initialize the following as part of the Tx queue context: - * Completion queue ID if the queue uses Completion queue, Quanta profile, - * Cache profile and Packet shaper profile. - * - * After add Tx LAN queue AQ command is completed: - * Interrupts should be associated with specific queues, - * Association of Tx queue to Doorbell queue is not part of Add LAN Tx queue - * flow. + * Read I2C (0x06E2) */ -static enum ice_status -ice_aq_add_lan_txq(struct ice_hw *hw, u8 num_qgrps, - struct ice_aqc_add_tx_qgrp *qg_list, u16 buf_size, - struct ice_sq_cd *cd) +enum ice_status +ice_aq_read_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, + u16 bus_addr, __le16 addr, u8 params, u8 *data, + struct ice_sq_cd *cd) { - u16 i, sum_header_size, sum_q_size = 0; - struct ice_aqc_add_tx_qgrp *list; - struct ice_aqc_add_txqs *cmd; - struct ice_aq_desc desc; - - cmd = &desc.params.add_txqs; + struct ice_aq_desc desc = { 0 }; + struct ice_aqc_i2c *cmd; + enum ice_status status; + u8 data_size; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_txqs); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_read_i2c); + cmd = &desc.params.read_write_i2c; - if (!qg_list) + if (!data) return ICE_ERR_PARAM; - if (num_qgrps > ICE_LAN_TXQ_MAX_QGRPS) - return ICE_ERR_PARAM; + data_size = (params & ICE_AQC_I2C_DATA_SIZE_M) >> ICE_AQC_I2C_DATA_SIZE_S; - sum_header_size = num_qgrps * - (sizeof(*qg_list) - sizeof(*qg_list->txqs)); + cmd->i2c_bus_addr = cpu_to_le16(bus_addr); + cmd->topo_addr = topo_addr; + cmd->i2c_params = params; + cmd->i2c_addr = addr; - list = qg_list; - for (i = 0; i < num_qgrps; i++) { - struct ice_aqc_add_txqs_perq *q = list->txqs; + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (!status) { + struct ice_aqc_read_i2c_resp *resp; + u8 i; - sum_q_size += list->num_txqs * sizeof(*q); - list = (struct ice_aqc_add_tx_qgrp *)(q + list->num_txqs); + resp = &desc.params.read_i2c_resp; + for (i = 0; i < data_size; i++) { + *data = resp->i2c_data[i]; + data++; + } } - if (buf_size != (sum_header_size + sum_q_size)) - return ICE_ERR_PARAM; - - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - - cmd->num_qgrps = num_qgrps; - - return ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd); + return 
status; } /** - * ice_aq_dis_lan_txq - * @hw: pointer to the hardware structure - * @num_qgrps: number of groups in the list - * @qg_list: the list of groups to disable - * @buf_size: the total size of the qg_list buffer in bytes - * @rst_src: if called due to reset, specifies the reset source - * @vmvf_num: the relative VM or VF number that is undergoing the reset + * ice_aq_write_i2c + * @hw: pointer to the hw struct + * @topo_addr: topology address for a device to communicate with + * @bus_addr: 7-bit I2C bus address + * @addr: I2C memory address (I2C offset) with up to 16 bits + * @params: I2C parameters: bit [4] - I2C address type, bits [3:0] - data size to write (0-7 bytes) + * @data: pointer to data (0 to 4 bytes) to be written to the I2C device * @cd: pointer to command details structure or NULL * - * Disable LAN Tx queue (0x0C31) + * Write I2C (0x06E3) */ -static enum ice_status -ice_aq_dis_lan_txq(struct ice_hw *hw, u8 num_qgrps, - struct ice_aqc_dis_txq_item *qg_list, u16 buf_size, - enum ice_disq_rst_src rst_src, u16 vmvf_num, - struct ice_sq_cd *cd) +enum ice_status +ice_aq_write_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, + u16 bus_addr, __le16 addr, u8 params, u8 *data, + struct ice_sq_cd *cd) { - struct ice_aqc_dis_txqs *cmd; - struct ice_aq_desc desc; - enum ice_status status; - u16 i, sz = 0; + struct ice_aq_desc desc = { 0 }; + struct ice_aqc_i2c *cmd; + u8 i, data_size; - cmd = &desc.params.dis_txqs; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dis_txqs); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_write_i2c); + cmd = &desc.params.read_write_i2c; - /* qg_list can be NULL only in VM/VF reset flow */ - if (!qg_list && !rst_src) - return ICE_ERR_PARAM; + data_size = (params & ICE_AQC_I2C_DATA_SIZE_M) >> ICE_AQC_I2C_DATA_SIZE_S; - if (num_qgrps > ICE_LAN_TXQ_MAX_QGRPS) + /* data_size limited to 4 */ + if (data_size > 4) return ICE_ERR_PARAM; - cmd->num_entries = num_qgrps; - - cmd->vmvf_and_timeout = cpu_to_le16((5 << ICE_AQC_Q_DIS_TIMEOUT_S) & - ICE_AQC_Q_DIS_TIMEOUT_M); - - switch (rst_src) { - case ICE_VM_RESET: - cmd->cmd_type = ICE_AQC_Q_DIS_CMD_VM_RESET; - cmd->vmvf_and_timeout |= - cpu_to_le16(vmvf_num & ICE_AQC_Q_DIS_VMVF_NUM_M); - break; - case ICE_VF_RESET: - cmd->cmd_type = ICE_AQC_Q_DIS_CMD_VF_RESET; - /* In this case, FW expects vmvf_num to be absolute VF ID */ - cmd->vmvf_and_timeout |= - cpu_to_le16((vmvf_num + hw->func_caps.vf_base_id) & - ICE_AQC_Q_DIS_VMVF_NUM_M); - break; - case ICE_NO_RESET: - default: - break; - } - - /* flush pipe on time out */ - cmd->cmd_type |= ICE_AQC_Q_DIS_CMD_FLUSH_PIPE; - /* If no queue group info, we are in a reset flow. 
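A hedged usage sketch for the Read I2C wrapper completed above: the data size travels in bits [3:0] of the params byte, so a two-byte read is encoded as shown; link_topo is assumed to be an already-populated struct ice_aqc_link_topo_addr, and the bus address and device offset are illustrative:

	u8 bytes[2];
	u8 params = (2 << ICE_AQC_I2C_DATA_SIZE_S) & ICE_AQC_I2C_DATA_SIZE_M;
	enum ice_status status;

	status = ice_aq_read_i2c(hw, link_topo, 0x50, cpu_to_le16(0x20),
				 params, bytes, NULL);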
Issue the AQ */ - if (!qg_list) - goto do_aq; - - /* set RD bit to indicate that command buffer is provided by the driver - * and it needs to be read by the firmware - */ - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - - for (i = 0; i < num_qgrps; ++i) { - /* Calculate the size taken up by the queue IDs in this group */ - sz += qg_list[i].num_qs * sizeof(qg_list[i].q_id); - - /* Add the size of the group header */ - sz += sizeof(qg_list[i]) - sizeof(qg_list[i].q_id); + cmd->i2c_bus_addr = cpu_to_le16(bus_addr); + cmd->topo_addr = topo_addr; + cmd->i2c_params = params; + cmd->i2c_addr = addr; - /* If the num of queues is even, add 2 bytes of padding */ - if ((qg_list[i].num_qs % 2) == 0) - sz += 2; + for (i = 0; i < data_size; i++) { + cmd->i2c_data[i] = *data; + data++; } - if (buf_size != sz) - return ICE_ERR_PARAM; - -do_aq: - status = ice_aq_send_cmd(hw, &desc, qg_list, buf_size, cd); - if (status) { - if (!qg_list) - ice_debug(hw, ICE_DBG_SCHED, "VM%d disable failed %d\n", - vmvf_num, hw->adminq.sq_last_status); - else - ice_debug(hw, ICE_DBG_SCHED, "disable queue %d failed %d\n", - le16_to_cpu(qg_list[0].q_id[0]), - hw->adminq.sq_last_status); - } - return status; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } -/* End of FW Admin Queue command wrappers */ - /** - * ice_write_byte - write a byte to a packed context structure - * @src_ctx: the context structure to read from - * @dest_ctx: the context to be written to - * @ce_info: a description of the struct to be filled + * ice_aq_set_driver_param - Set driver parameter to share via firmware + * @hw: pointer to the HW struct + * @idx: parameter index to set + * @value: the value to set the parameter to + * @cd: pointer to command details structure or NULL + * + * Set the value of one of the software defined parameters. All PFs connected + * to this device can read the value using ice_aq_get_driver_param. + * + * Note that firmware provides no synchronization or locking, and will not + * save the parameter value during a device reset. It is expected that + * a single PF will write the parameter value, while all other PFs will only + * read it. 
 */ -static void -ice_write_byte(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +enum ice_status +ice_aq_set_driver_param(struct ice_hw *hw, enum ice_aqc_driver_params idx, + u32 value, struct ice_sq_cd *cd) { - u8 src_byte, dest_byte, mask; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = (u8)(BIT(ce_info->width) - 1); - - src_byte = *from; - src_byte &= mask; + struct ice_aqc_driver_shared_params *cmd; + struct ice_aq_desc desc; - /* shift to correct alignment */ - mask <<= shift_width; - src_byte <<= shift_width; + if (idx >= ICE_AQC_DRIVER_PARAM_MAX) + return ICE_ERR_OUT_OF_RANGE; - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); + cmd = &desc.params.drv_shared_params; - memcpy(&dest_byte, dest, sizeof(dest_byte)); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_driver_shared_params); - dest_byte &= ~mask; /* get the bits not changing */ - dest_byte |= src_byte; /* add in the new bits */ + cmd->set_or_get_op = ICE_AQC_DRIVER_PARAM_SET; + cmd->param_indx = idx; + cmd->param_val = cpu_to_le32(value); - /* put it all back */ - memcpy(dest, &dest_byte, sizeof(dest_byte)); + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } /** - * ice_write_word - write a word to a packed context structure - * @src_ctx: the context structure to read from - * @dest_ctx: the context to be written to - * @ce_info: a description of the struct to be filled + * ice_aq_get_driver_param - Get driver parameter shared via firmware + * @hw: pointer to the HW struct + * @idx: parameter index to get + * @value: storage to return the shared parameter + * @cd: pointer to command details structure or NULL + * + * Get the value of one of the software defined parameters. + * + * Note that firmware provides no synchronization or locking. It is expected + * that only a single PF will write a given parameter.
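A hedged sketch of the intended single-writer pattern: one PF publishes a value and every PF can read it back; MY_PARAM_IDX is an illustrative name, any index below ICE_AQC_DRIVER_PARAM_MAX behaves the same:

	u32 val;

	/* on the owning PF */
	ice_aq_set_driver_param(hw, MY_PARAM_IDX, 42, NULL);

	/* on any PF, possibly a different one */
	if (!ice_aq_get_driver_param(hw, MY_PARAM_IDX, &val, NULL))
		ice_debug(hw, ICE_DBG_INIT, "shared param: %u\n", val);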
 */ -static void -ice_write_word(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +enum ice_status +ice_aq_get_driver_param(struct ice_hw *hw, enum ice_aqc_driver_params idx, + u32 *value, struct ice_sq_cd *cd) { - u16 src_word, mask; - __le16 dest_word; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; + struct ice_aqc_driver_shared_params *cmd; + struct ice_aq_desc desc; + enum ice_status status; - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = BIT(ce_info->width) - 1; + if (idx >= ICE_AQC_DRIVER_PARAM_MAX) + return ICE_ERR_OUT_OF_RANGE; - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_word = *(u16 *)from; - src_word &= mask; + cmd = &desc.params.drv_shared_params; - /* shift to correct alignment */ - mask <<= shift_width; - src_word <<= shift_width; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_driver_shared_params); - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); + cmd->set_or_get_op = ICE_AQC_DRIVER_PARAM_GET; + cmd->param_indx = idx; - memcpy(&dest_word, dest, sizeof(dest_word)); + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (status) + return status; - dest_word &= ~(cpu_to_le16(mask)); /* get the bits not changing */ - dest_word |= cpu_to_le16(src_word); /* add in the new bits */ + *value = le32_to_cpu(cmd->param_val); - /* put it all back */ - memcpy(dest, &dest_word, sizeof(dest_word)); + return 0; } /** - * ice_write_dword - write a dword to a packed context structure - * @src_ctx: the context structure to read from - * @dest_ctx: the context to be written to - * @ce_info: a description of the struct to be filled + * ice_aq_set_gpio + * @hw: pointer to the hw struct + * @gpio_ctrl_handle: GPIO controller node handle + * @pin_idx: IO Number of the GPIO that needs to be set + * @value: SW provided IO value to set in the LSB + * @cd: pointer to command details structure or NULL + * + * Sends 0x06EC AQ command to set the GPIO pin state that's part of the topology */ -static void -ice_write_dword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +enum ice_status +ice_aq_set_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, bool value, + struct ice_sq_cd *cd) { - u32 src_dword, mask; - __le32 dest_dword; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - - /* if the field width is exactly 32 on an x86 machine, then the shift - * operation will not work because the SHL instructions count is masked - * to 5 bits so the shift will do nothing - */ - if (ce_info->width < 32) - mask = BIT(ce_info->width) - 1; - else - mask = (u32)~0; + struct ice_aqc_gpio *cmd; + struct ice_aq_desc desc; - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_dword = *(u32 *)from; - src_dword &= mask; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_gpio); + cmd = &desc.params.read_write_gpio; + cmd->gpio_ctrl_handle = gpio_ctrl_handle; + cmd->gpio_num = pin_idx; + cmd->gpio_val = value ?
1 : 0; - /* shift to correct alignment */ - mask <<= shift_width; - src_dword <<= shift_width; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); +/** + * ice_aq_get_gpio + * @hw: pointer to the hw struct + * @gpio_ctrl_handle: GPIO controller node handle + * @pin_idx: IO Number of the GPIO that needs to be read + * @value: IO value read + * @cd: pointer to command details structure or NULL + * + * Sends 0x06ED AQ command to get the value of a GPIO signal which is part of + * the topology + */ +enum ice_status +ice_aq_get_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, + bool *value, struct ice_sq_cd *cd) +{ + struct ice_aqc_gpio *cmd; + struct ice_aq_desc desc; + enum ice_status status; - memcpy(&dest_dword, dest, sizeof(dest_dword)); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_gpio); + cmd = &desc.params.read_write_gpio; + cmd->gpio_ctrl_handle = gpio_ctrl_handle; + cmd->gpio_num = pin_idx; - dest_dword &= ~(cpu_to_le32(mask)); /* get the bits not changing */ - dest_dword |= cpu_to_le32(src_dword); /* add in the new bits */ + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (status) + return status; - /* put it all back */ - memcpy(dest, &dest_dword, sizeof(dest_dword)); + *value = !!cmd->gpio_val; + return 0; } /** - * ice_write_qword - write a qword to a packed context structure - * @src_ctx: the context structure to read from - * @dest_ctx: the context to be written to - * @ce_info: a description of the struct to be filled + * ice_fw_supports_link_override + * @hw: pointer to the hardware structure + * + * Checks if the firmware supports link override */ -static void -ice_write_qword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +bool ice_fw_supports_link_override(struct ice_hw *hw) { - u64 src_qword, mask; - __le64 dest_qword; - u8 *from, *dest; - u16 shift_width; + if (hw->api_maj_ver == ICE_FW_API_LINK_OVERRIDE_MAJ) { + if (hw->api_min_ver > ICE_FW_API_LINK_OVERRIDE_MIN) + return true; + if (hw->api_min_ver == ICE_FW_API_LINK_OVERRIDE_MIN && + hw->api_patch >= ICE_FW_API_LINK_OVERRIDE_PATCH) + return true; + } else if (hw->api_maj_ver > ICE_FW_API_LINK_OVERRIDE_MAJ) { + return true; + } - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; + return false; +} - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; +/** + * ice_get_link_default_override + * @ldo: pointer to the link default override struct + * @pi: pointer to the port info struct + * + * Gets the link default override for a port + */ +enum ice_status +ice_get_link_default_override(struct ice_link_default_override_tlv *ldo, + struct ice_port_info *pi) +{ + u16 i, tlv, tlv_len, tlv_start, buf, offset; + struct ice_hw *hw = pi->hw; + enum ice_status status; - /* if the field width is exactly 64 on an x86 machine, then the shift - * operation will not work because the SHL instructions count is masked - * to 6 bits so the shift will do nothing - */ - if (ce_info->width < 64) - mask = BIT_ULL(ce_info->width) - 1; - else - mask = (u64)~0; + status = ice_get_pfa_module_tlv(hw, &tlv, &tlv_len, + ICE_SR_LINK_DEFAULT_OVERRIDE_PTR); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read link override TLV.\n"); + return status; + } - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_qword = *(u64 *)from; - src_qword &= mask; + /* Each port has its own config;
calculate for our port */ + tlv_start = tlv + pi->lport * ICE_SR_PFA_LINK_OVERRIDE_WORDS + + ICE_SR_PFA_LINK_OVERRIDE_OFFSET; - /* shift to correct alignment */ - mask <<= shift_width; - src_qword <<= shift_width; + /* link options first */ + status = ice_read_sr_word(hw, tlv_start, &buf); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read override link options.\n"); + return status; + } + ldo->options = buf & ICE_LINK_OVERRIDE_OPT_M; + ldo->phy_config = (buf & ICE_LINK_OVERRIDE_PHY_CFG_M) >> + ICE_LINK_OVERRIDE_PHY_CFG_S; - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); + /* link PHY config */ + offset = tlv_start + ICE_SR_PFA_LINK_OVERRIDE_FEC_OFFSET; + status = ice_read_sr_word(hw, offset, &buf); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read override phy config.\n"); + return status; + } + ldo->fec_options = buf & ICE_LINK_OVERRIDE_FEC_OPT_M; - memcpy(&dest_qword, dest, sizeof(dest_qword)); + /* PHY types low */ + offset = tlv_start + ICE_SR_PFA_LINK_OVERRIDE_PHY_OFFSET; + for (i = 0; i < ICE_SR_PFA_LINK_OVERRIDE_PHY_WORDS; i++) { + status = ice_read_sr_word(hw, (offset + i), &buf); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read override link options.\n"); + return status; + } + /* shift 16 bits at a time to fill 64 bits */ + ldo->phy_type_low |= ((u64)buf << (i * 16)); + } - dest_qword &= ~(cpu_to_le64(mask)); /* get the bits not changing */ - dest_qword |= cpu_to_le64(src_qword); /* add in the new bits */ + /* PHY types high */ + offset = tlv_start + ICE_SR_PFA_LINK_OVERRIDE_PHY_OFFSET + + ICE_SR_PFA_LINK_OVERRIDE_PHY_WORDS; + for (i = 0; i < ICE_SR_PFA_LINK_OVERRIDE_PHY_WORDS; i++) { + status = ice_read_sr_word(hw, (offset + i), &buf); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read override link options.\n"); + return status; + } + /* shift 16 bits at a time to fill 64 bits */ + ldo->phy_type_high |= ((u64)buf << (i * 16)); + } - /* put it all back */ - memcpy(dest, &dest_qword, sizeof(dest_qword)); + return status; } /** - * ice_set_ctx - set context bits in packed structure - * @src_ctx: pointer to a generic non-packed context structure - * @dest_ctx: pointer to memory for the packed structure - * @ce_info: a description of the structure to be transformed + * ice_is_phy_caps_an_enabled - check if PHY capabilities autoneg is enabled + * @caps: get PHY capability data */ -enum ice_status -ice_set_ctx(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +bool ice_is_phy_caps_an_enabled(struct ice_aqc_get_phy_caps_data *caps) { - int f; - - for (f = 0; ce_info[f].width; f++) { - /* We have to deal with each element of the FW response - * using the correct size so that we are correct regardless - * of the endianness of the machine. 
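The two loops above reassemble 64-bit PHY type masks from 16-bit NVM words, least-significant word first; a standalone illustration of that reassembly (hypothetical helper, not in the driver):

	static u64 ice_words_to_u64(const u16 words[4])
	{
		u64 val = 0;
		int i;

		/* {0x1111, 0x2222, 0x3333, 0x4444} -> 0x4444333322221111 */
		for (i = 0; i < 4; i++)
			val |= (u64)words[i] << (i * 16);
		return val;
	}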
- */ - switch (ce_info[f].size_of) { - case sizeof(u8): - ice_write_byte(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u16): - ice_write_word(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u32): - ice_write_dword(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u64): - ice_write_qword(src_ctx, dest_ctx, &ce_info[f]); - break; - default: - return ICE_ERR_INVAL_SIZE; - } - } + if (caps->caps & ICE_AQC_PHY_AN_MODE || + caps->low_power_ctrl_an & (ICE_AQC_PHY_AN_EN_CLAUSE28 | + ICE_AQC_PHY_AN_EN_CLAUSE73 | + ICE_AQC_PHY_AN_EN_CLAUSE37)) + return true; - return 0; + return false; } /** - * ice_get_lan_q_ctx - get the LAN queue context for the given VSI and TC - * @hw: pointer to the HW struct - * @vsi_handle: software VSI handle - * @tc: TC number - * @q_handle: software queue handle + * ice_is_fw_health_report_supported + * @hw: pointer to the hardware structure + * + * Return true if firmware supports health status reports, + * false otherwise */ -static struct ice_q_ctx * -ice_get_lan_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 q_handle) +bool ice_is_fw_health_report_supported(struct ice_hw *hw) { - struct ice_vsi_ctx *vsi; - struct ice_q_ctx *q_ctx; + if (hw->api_maj_ver > ICE_FW_API_HEALTH_REPORT_MAJ) + return true; + + if (hw->api_maj_ver == ICE_FW_API_HEALTH_REPORT_MAJ) { + if (hw->api_min_ver > ICE_FW_API_HEALTH_REPORT_MIN) + return true; + if (hw->api_min_ver == ICE_FW_API_HEALTH_REPORT_MIN && + hw->api_patch >= ICE_FW_API_HEALTH_REPORT_PATCH) + return true; + } - vsi = ice_get_vsi_ctx(hw, vsi_handle); - if (!vsi) - return NULL; - if (q_handle >= vsi->num_lan_q_entries[tc]) - return NULL; - if (!vsi->lan_q_ctx[tc]) - return NULL; - q_ctx = vsi->lan_q_ctx[tc]; - return &q_ctx[q_handle]; + return false; } /** - * ice_ena_vsi_txq - * @pi: port information structure - * @vsi_handle: software VSI handle - * @tc: TC number - * @q_handle: software queue handle - * @num_qgrps: Number of added queue groups - * @buf: list of queue groups to be added - * @buf_size: size of buffer for indirect command + * ice_aq_set_health_status_config - Configure FW health events + * @hw: pointer to the HW struct + * @event_source: type of diagnostic events to enable * @cd: pointer to command details structure or NULL * - * This function adds one LAN queue + * Configure the health status event types that the firmware will send to this + * PF. 
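A hedged sketch of gating the configuration on firmware support; the event-source mask name below is illustrative shorthand for bit 0 of the event_source field, not a define from the patch:

#define MY_HEALTH_EVT_PF_SPECIFIC BIT(0)	/* illustrative */

	if (ice_is_fw_health_report_supported(hw))
		ice_aq_set_health_status_config(hw, MY_HEALTH_EVT_PF_SPECIFIC,
						NULL);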
The supported event types are: PF-specific, all PFs, and global */ enum ice_status -ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, - u8 num_qgrps, struct ice_aqc_add_tx_qgrp *buf, u16 buf_size, - struct ice_sq_cd *cd) +ice_aq_set_health_status_config(struct ice_hw *hw, u8 event_source, + struct ice_sq_cd *cd) { - struct ice_aqc_txsched_elem_data node = { 0 }; - struct ice_sched_node *parent; - struct ice_q_ctx *q_ctx; - enum ice_status status; - struct ice_hw *hw; + struct ice_aqc_set_health_status_config *cmd; + struct ice_aq_desc desc; - if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) - return ICE_ERR_CFG; + cmd = &desc.params.set_health_status_config; - if (num_qgrps > 1 || buf->num_txqs > 1) - return ICE_ERR_MAX_LIMIT; + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_set_health_status_config); - hw = pi->hw; + cmd->event_source = event_source; - if (!ice_is_vsi_valid(hw, vsi_handle)) - return ICE_ERR_PARAM; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} - mutex_lock(&pi->sched_lock); - q_ctx = ice_get_lan_q_ctx(hw, vsi_handle, tc, q_handle); - if (!q_ctx) { - ice_debug(hw, ICE_DBG_SCHED, "Enaq: invalid queue handle %d\n", - q_handle); - status = ICE_ERR_PARAM; - goto ena_txq_exit; - } +/** + * ice_aq_get_port_options + * @hw: pointer to the hw struct + * @options: buffer for the resultant port options + * @option_count: input - size of the buffer in port options structures, + * output - number of returned port options + * @lport: logical port to call the command with (optional) + * @lport_valid: when false, FW uses port owned by the PF instead of lport, + * when PF owns more than 1 port it must be true + * @active_option_idx: index of active port option in returned buffer + * @active_option_valid: active option in returned buffer is valid + * + * Calls Get Port Options AQC (0x06ea) and verifies result. + */ +enum ice_status +ice_aq_get_port_options(struct ice_hw *hw, + struct ice_aqc_get_port_options_elem *options, + u8 *option_count, u8 lport, bool lport_valid, + u8 *active_option_idx, bool *active_option_valid) +{ + struct ice_aqc_get_port_options *cmd; + struct ice_aq_desc desc; + enum ice_status status; + u8 pmd_count; + u8 max_speed; + u8 i; - /* find a parent node */ - parent = ice_sched_get_free_qparent(pi, vsi_handle, tc, - ICE_SCHED_NODE_OWNER_LAN); - if (!parent) { - status = ICE_ERR_PARAM; - goto ena_txq_exit; - } + /* options buffer shall be able to hold max returned options */ + if (*option_count < ICE_AQC_PORT_OPT_COUNT_M) + return ICE_ERR_PARAM; - buf->parent_teid = parent->info.node_teid; - node.parent_teid = parent->info.node_teid; - /* Mark that the values in the "generic" section as valid. The default - * value in the "generic" section is zero. This means that : - * - Scheduling mode is Bytes Per Second (BPS), indicated by Bit 0. - * - 0 priority among siblings, indicated by Bit 1-3. - * - WFQ, indicated by Bit 4. - * - 0 Adjustment value is used in PSM credit update flow, indicated by - * Bit 5-6. - * - Bit 7 is reserved. - * Without setting the generic section as valid in valid_sections, the - * Admin queue command will fail with error code ICE_AQ_RC_EINVAL. 
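A hedged caller sketch for the Get Port Options wrapper defined here: the buffer must hold the maximum number of options the firmware can return, per the length check in the wrapper; this queries the PF-owned port (lport_valid = false):

	struct ice_aqc_get_port_options_elem opts[ICE_AQC_PORT_OPT_COUNT_M];
	u8 count = ICE_AQC_PORT_OPT_COUNT_M;
	u8 active_idx;
	bool active_valid;

	if (!ice_aq_get_port_options(hw, opts, &count, 0, false,
				     &active_idx, &active_valid) &&
	    active_valid)
		ice_debug(hw, ICE_DBG_PHY, "active option %u of %u\n",
			  active_idx, count);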
- */ - buf->txqs[0].info.valid_sections = ICE_AQC_ELEM_VALID_GENERIC; + cmd = &desc.params.get_port_options; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_port_options); - /* add the LAN queue */ - status = ice_aq_add_lan_txq(hw, num_qgrps, buf, buf_size, cd); - if (status) { - ice_debug(hw, ICE_DBG_SCHED, "enable queue %d failed %d\n", - le16_to_cpu(buf->txqs[0].txq_id), - hw->adminq.sq_last_status); - goto ena_txq_exit; - } + if (lport_valid) + cmd->lport_num = lport; + cmd->lport_num_valid = lport_valid; - node.node_teid = buf->txqs[0].q_teid; - node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF; - q_ctx->q_handle = q_handle; + status = ice_aq_send_cmd(hw, &desc, options, + *option_count * sizeof(*options), NULL); + if (status) + return status; - /* add a leaf node into schduler tree queue layer */ - status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1, &node); + /* verify direct FW response & set output parameters */ + *option_count = cmd->port_options_count & ICE_AQC_PORT_OPT_COUNT_M; + ice_debug(hw, ICE_DBG_PHY, "options: %x\n", *option_count); + *active_option_valid = cmd->port_options & ICE_AQC_PORT_OPT_VALID; + if (*active_option_valid) { + *active_option_idx = cmd->port_options & + ICE_AQC_PORT_OPT_ACTIVE_M; + if (*active_option_idx > (*option_count - 1)) + return ICE_ERR_OUT_OF_RANGE; + ice_debug(hw, ICE_DBG_PHY, "active idx: %x\n", + *active_option_idx); + } -ena_txq_exit: - mutex_unlock(&pi->sched_lock); - return status; + /* verify indirect FW response & mask output options fields */ + for (i = 0; i < *option_count; i++) { + options[i].pmd &= ICE_AQC_PORT_OPT_PMD_COUNT_M; + options[i].max_lane_speed &= ICE_AQC_PORT_OPT_MAX_LANE_M; + pmd_count = options[i].pmd; + max_speed = options[i].max_lane_speed; + ice_debug(hw, ICE_DBG_PHY, "pmds: %x max speed: %x\n", + pmd_count, max_speed); + + /* check only entries containing valid max pmd speed values, + * other reserved values may be returned, when logical port + * used is unrelated to specific option + */ + if (max_speed <= ICE_AQC_PORT_OPT_MAX_LANE_100G) { + if (pmd_count > ICE_MAX_PORT_PER_PCI_DEV) + return ICE_ERR_OUT_OF_RANGE; + if (pmd_count > 2 && + max_speed > ICE_AQC_PORT_OPT_MAX_LANE_25G) + return ICE_ERR_CFG; + if (pmd_count > 7 && + max_speed > ICE_AQC_PORT_OPT_MAX_LANE_10G) + return ICE_ERR_CFG; + } + } + + return 0; } /** - * ice_dis_vsi_txq - * @pi: port information structure - * @vsi_handle: software VSI handle - * @tc: TC number - * @num_queues: number of queues - * @q_handles: pointer to software queue handle array - * @q_ids: pointer to the q_id array - * @q_teids: pointer to queue node teids - * @rst_src: if called due to reset, specifies the reset source - * @vmvf_num: the relative VM or VF number that is undergoing the reset + * ice_aq_set_lldp_mib - Set the LLDP MIB + * @hw: pointer to the HW struct + * @mib_type: Local, Remote or both Local and Remote MIBs + * @buf: pointer to the caller-supplied buffer to store the MIB block + * @buf_size: size of the buffer (in bytes) * @cd: pointer to command details structure or NULL * - * This function removes queues and their corresponding nodes in SW DB + * Set the LLDP MIB. 
(0x0A08) */ enum ice_status -ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues, - u16 *q_handles, u16 *q_ids, u32 *q_teids, - enum ice_disq_rst_src rst_src, u16 vmvf_num, - struct ice_sq_cd *cd) +ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size, + struct ice_sq_cd *cd) { - enum ice_status status = ICE_ERR_DOES_NOT_EXIST; - struct ice_aqc_dis_txq_item qg_list; - struct ice_q_ctx *q_ctx; - u16 i; + struct ice_aqc_lldp_set_local_mib *cmd; + struct ice_aq_desc desc; - if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) - return ICE_ERR_CFG; + cmd = &desc.params.lldp_set_mib; - if (!num_queues) { - /* if queue is disabled already yet the disable queue command - * has to be sent to complete the VF reset, then call - * ice_aq_dis_lan_txq without any queue information - */ - if (rst_src) - return ice_aq_dis_lan_txq(pi->hw, 0, NULL, 0, rst_src, - vmvf_num, NULL); - return ICE_ERR_CFG; - } + if (buf_size == 0 || !buf) + return ICE_ERR_PARAM; - mutex_lock(&pi->sched_lock); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_set_local_mib); - for (i = 0; i < num_queues; i++) { - struct ice_sched_node *node; + desc.flags |= cpu_to_le16((u16)ICE_AQ_FLAG_RD); + desc.datalen = cpu_to_le16(buf_size); - node = ice_sched_find_node_by_teid(pi->root, q_teids[i]); - if (!node) - continue; - q_ctx = ice_get_lan_q_ctx(pi->hw, vsi_handle, tc, q_handles[i]); - if (!q_ctx) { - ice_debug(pi->hw, ICE_DBG_SCHED, "invalid queue handle%d\n", - q_handles[i]); - continue; - } - if (q_ctx->q_handle != q_handles[i]) { - ice_debug(pi->hw, ICE_DBG_SCHED, "Err:handles %d %d\n", - q_ctx->q_handle, q_handles[i]); - continue; - } - qg_list.parent_teid = node->info.parent_teid; - qg_list.num_qs = 1; - qg_list.q_id[0] = cpu_to_le16(q_ids[i]); - status = ice_aq_dis_lan_txq(pi->hw, 1, &qg_list, - sizeof(qg_list), rst_src, vmvf_num, - cd); + cmd->type = mib_type; + cmd->length = cpu_to_le16(buf_size); - if (status) - break; - ice_free_sched_node(pi, node); - q_ctx->q_handle = ICE_INVAL_Q_HANDLE; - } - mutex_unlock(&pi->sched_lock); - return status; + return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); } /** - * ice_cfg_vsi_qs - configure the new/existing VSI queues - * @pi: port information structure - * @vsi_handle: software VSI handle - * @tc_bitmap: TC bitmap - * @maxqs: max queues array per TC - * @owner: LAN or RDMA - * - * This function adds/updates the VSI queues per TC. 
+ * ice_fw_supports_lldp_fltr_ctrl - check NVM version supports lldp_fltr_ctrl + * @hw: pointer to HW struct */ -static enum ice_status -ice_cfg_vsi_qs(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap, - u16 *maxqs, u8 owner) +bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw) { - enum ice_status status = 0; - u8 i; + if (hw->mac_type != ICE_MAC_E810) + return false; + + if (hw->api_maj_ver == ICE_FW_API_LLDP_FLTR_MAJ) { + if (hw->api_min_ver > ICE_FW_API_LLDP_FLTR_MIN) + return true; + if (hw->api_min_ver == ICE_FW_API_LLDP_FLTR_MIN && + hw->api_patch >= ICE_FW_API_LLDP_FLTR_PATCH) + return true; + } else if (hw->api_maj_ver > ICE_FW_API_LLDP_FLTR_MAJ) { + return true; + } + return false; +} - if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) - return ICE_ERR_CFG; +/** + * ice_lldp_fltr_add_remove - add or remove a LLDP Rx switch filter + * @hw: pointer to HW struct + * @vsi_num: absolute HW index for VSI + * @add: boolean for if adding or removing a filter + */ +enum ice_status +ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add) +{ + struct ice_aqc_lldp_filter_ctrl *cmd; + struct ice_aq_desc desc; - if (!ice_is_vsi_valid(pi->hw, vsi_handle)) - return ICE_ERR_PARAM; + cmd = &desc.params.lldp_filter_ctrl; - mutex_lock(&pi->sched_lock); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_filter_ctrl); - ice_for_each_traffic_class(i) { - /* configuration is possible only if TC node is present */ - if (!ice_sched_get_tc_node(pi, i)) - continue; + if (add) + cmd->cmd_flags = ICE_AQC_LLDP_FILTER_ACTION_ADD; + else + cmd->cmd_flags = ICE_AQC_LLDP_FILTER_ACTION_DELETE; - status = ice_sched_cfg_vsi(pi, vsi_handle, i, maxqs[i], owner, - ice_is_tc_ena(tc_bitmap, i)); - if (status) - break; - } + cmd->vsi_num = cpu_to_le16(vsi_num); - mutex_unlock(&pi->sched_lock); - return status; + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); } /** - * ice_cfg_vsi_lan - configure VSI LAN queues - * @pi: port information structure - * @vsi_handle: software VSI handle - * @tc_bitmap: TC bitmap - * @max_lanqs: max LAN queues array per TC + * ice_fw_supports_report_dflt_cfg + * @hw: pointer to the hardware structure * - * This function adds/updates the VSI LAN queues per TC. + * Checks if the firmware supports report default configuration */ -enum ice_status -ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap, - u16 *max_lanqs) +bool ice_fw_supports_report_dflt_cfg(struct ice_hw *hw) { - return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_lanqs, - ICE_SCHED_NODE_OWNER_LAN); + if (hw->api_maj_ver == ICE_FW_API_REPORT_DFLT_CFG_MAJ) { + if (hw->api_min_ver > ICE_FW_API_REPORT_DFLT_CFG_MIN) + return true; + if (hw->api_min_ver == ICE_FW_API_REPORT_DFLT_CFG_MIN && + hw->api_patch >= ICE_FW_API_REPORT_DFLT_CFG_PATCH) + return true; + } else if (hw->api_maj_ver > ICE_FW_API_REPORT_DFLT_CFG_MAJ) { + return true; + } + return false; } /** - * ice_replay_pre_init - replay pre initialization - * @hw: pointer to the HW struct + * ice_is_pca9575_sw_handle + * @hw: pointer to the hw struct + * @handle: GPIO controller's handle * - * Initializes required config data for VSI, FD, ACL, and RSS before replay. + * This command will check if the reset pin is present in the netlist for + * a given netlist handle. The SW controlled IO expander does not have this pin + * populated in the netlist. 
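A hedged sketch combining the two LLDP helpers above: add the Rx filter only when the running firmware supports the command, converting the software VSI handle (assumed valid here) to the absolute HW VSI number the AQ expects:

	if (ice_fw_supports_lldp_fltr_ctrl(hw)) {
		u16 vsi_num = ice_get_hw_vsi_num(hw, vsi_handle);

		status = ice_lldp_fltr_add_remove(hw, vsi_num, true);
	}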
 */ -static enum ice_status ice_replay_pre_init(struct ice_hw *hw) +static bool +ice_is_pca9575_sw_handle(struct ice_hw *hw, u16 handle) { - struct ice_switch_info *sw = hw->switch_info; - u8 i; + struct ice_aqc_get_link_topo_pin *cmd; + struct ice_aq_desc desc; - /* Delete old entries from replay filter list head if there is any */ - ice_rm_all_sw_replay_rule_info(hw); - /* In start of replay, move entries into replay_rules list, it - * will allow adding rules entries back to filt_rules list, - * which is operational list. + cmd = &desc.params.get_link_topo_pin; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo_pin); + + /* set node context to the given GPIO controller */ + cmd->addr.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_CTX_PROVIDED << + ICE_AQC_LINK_TOPO_NODE_CTX_S); + cmd->addr.handle = handle; + + /* Try finding the reset pin in the GPIO context */ + cmd->input_io_params = (ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_GPIO << + ICE_AQC_LINK_TOPO_INPUT_IO_TYPE_S) | + ICE_AQC_LINK_TOPO_IO_FUNC_RESET_N; + + /* If the expander is controlled by software the following command + * should return error ICE_AQ_RC_ENXIO */ - for (i = 0; i < ICE_SW_LKUP_LAST; i++) - list_replace_init(&sw->recp_list[i].filt_rules, - &sw->recp_list[i].filt_replay_rules); + if (ice_aq_send_cmd(hw, &desc, NULL, 0, NULL) && + hw->adminq.sq_last_status == ICE_AQ_RC_ENXIO) + return true; - return 0; + return false; } /** - * ice_replay_vsi - replay VSI configuration - * @hw: pointer to the HW struct - * @vsi_handle: driver VSI handle + * ice_get_pca9575_handle + * @hw: pointer to the hw struct + * @pca9575_handle: GPIO controller's handle * - * Restore all VSI configuration after reset. It is required to call this - * function with main VSI first. + * Find and return the GPIO controller's handle in the netlist. + * When found, the value is cached in the hw structure and subsequent calls + * return the cached value */ -enum ice_status ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle) +static enum ice_status +ice_get_pca9575_handle(struct ice_hw *hw, __le16 *pca9575_handle) { + struct ice_aqc_get_link_topo *cmd; + struct ice_aq_desc desc; enum ice_status status; + __le16 handle; + u8 idx; - if (!ice_is_vsi_valid(hw, vsi_handle)) + if (!hw || !pca9575_handle) return ICE_ERR_PARAM; - /* Replay pre-initialization if there is any */ - if (vsi_handle == ICE_MAIN_VSI_HANDLE) { - status = ice_replay_pre_init(hw); + /* If handle was read previously return cached value */ + if (hw->io_expander_handle) { + *pca9575_handle = hw->io_expander_handle; + return 0; + } + + /* If handle was not detected read it from the netlist */ + cmd = &desc.params.get_link_topo; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo); + + /* Set node type to GPIO controller */ + cmd->addr.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_TYPE_M & + ICE_AQC_LINK_TOPO_NODE_TYPE_GPIO_CTRL); + +#define SW_PCA9575_MAX_TOPO_IDX 2 + + /* SW IO expander is usually the last one in the netlist. Scan the + * netlist backward and see if we find it. Index 0 is assigned to + * the IO widget so we skip it.
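A hedged sketch of the detection flow these helpers build up to; ice_e810t_is_pca9575_present() and the register accessor appear later in this hunk. Probe for the SW-controlled expander, then read its port 0 input register:

	u8 data;

	if (ice_e810t_is_pca9575_present(hw) &&
	    !ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P0_IN, &data))
		ice_debug(hw, ICE_DBG_PHY, "PCA9575 P0 input: 0x%02x\n", data);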
+ */ + for (idx = SW_PCA9575_MAX_TOPO_IDX; idx > 0; idx--) { + cmd->addr.topo_params.index = idx; + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); if (status) - return status; + continue; + + handle = desc.params.get_link_topo.addr.handle; + + /* Verify if we found the right IO expander type */ + if (desc.params.get_link_topo.node_part_num == + ICE_ACQ_GET_LINK_TOPO_NODE_NR_PCA9575 && + ice_is_pca9575_sw_handle(hw, handle)) + break; } - /* Replay per VSI all filters */ - status = ice_replay_vsi_all_fltr(hw, vsi_handle); - return status; + /* Expander not found */ + if (!cmd->addr.topo_params.index) + return ICE_ERR_NOT_SUPPORTED; + + /* If present save the handle and return it */ + hw->io_expander_handle = desc.params.get_link_topo.addr.handle; + *pca9575_handle = hw->io_expander_handle; + + return 0; } /** - * ice_replay_post - post replay configuration cleanup - * @hw: pointer to the HW struct + * ice_read_e810t_pca9575_reg + * @hw: pointer to the hw struct + * @offset: GPIO controller register offset + * @data: pointer to data to be read from the GPIO controller * - * Post replay cleanup. + * Read the register from the GPIO controller */ -void ice_replay_post(struct ice_hw *hw) +enum ice_status +ice_read_e810t_pca9575_reg(struct ice_hw *hw, u8 offset, u8 *data) { - /* Delete old entries from replay filter list head */ - ice_rm_all_sw_replay_rule_info(hw); -} + struct ice_aqc_link_topo_addr link_topo; + enum ice_status status; + __le16 addr; -/** - * ice_stat_update40 - read 40 bit stat from the chip and update stat values - * @hw: ptr to the hardware info - * @reg: offset of 64 bit HW register to read from - * @prev_stat_loaded: bool to specify if previous stats are loaded - * @prev_stat: ptr to previous loaded stat value - * @cur_stat: ptr to current stat value - */ -void -ice_stat_update40(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, - u64 *prev_stat, u64 *cur_stat) -{ - u64 new_data = rd64(hw, reg) & (BIT_ULL(40) - 1); + memset(&link_topo, 0, sizeof(link_topo)); - /* device stats are not reset at PFR, they likely will not be zeroed - * when the driver starts. Thus, save the value from the first read - * without adding to the statistic value so that we report stats which - * count up from zero. - */ - if (!prev_stat_loaded) { - *prev_stat = new_data; - return; - } + status = ice_get_pca9575_handle(hw, &link_topo.handle); + if (status) + return status; - /* Calculate the difference between the new and old values, and then - * add it to the software stat value. 
- */ - if (new_data >= *prev_stat) - *cur_stat += new_data - *prev_stat; - else - /* to manage the potential roll-over */ - *cur_stat += (new_data + BIT_ULL(40)) - *prev_stat; + link_topo.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_CTX_PROVIDED << + ICE_AQC_LINK_TOPO_NODE_CTX_S); - /* Update the previously stored value to prepare for next read */ - *prev_stat = new_data; + addr = cpu_to_le16((u16)offset); + + return ice_aq_read_i2c(hw, link_topo, 0, addr, 1, data, NULL); } /** - * ice_stat_update32 - read 32 bit stat from the chip and update stat values - * @hw: ptr to the hardware info - * @reg: offset of HW register to read from - * @prev_stat_loaded: bool to specify if previous stats are loaded - * @prev_stat: ptr to previous loaded stat value - * @cur_stat: ptr to current stat value + * ice_write_e810t_pca9575_reg + * @hw: pointer to the hw struct + * @offset: GPIO controller register offset + * @data: data to be written to the GPIO controller + * + * Write the data to the GPIO controller register */ -void -ice_stat_update32(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, - u64 *prev_stat, u64 *cur_stat) +enum ice_status +ice_write_e810t_pca9575_reg(struct ice_hw *hw, u8 offset, u8 data) { - u32 new_data; + struct ice_aqc_link_topo_addr link_topo; + enum ice_status status; + __le16 addr; - new_data = rd32(hw, reg); + memset(&link_topo, 0, sizeof(link_topo)); - /* device stats are not reset at PFR, they likely will not be zeroed - * when the driver starts. Thus, save the value from the first read - * without adding to the statistic value so that we report stats which - * count up from zero. - */ - if (!prev_stat_loaded) { - *prev_stat = new_data; - return; - } + status = ice_get_pca9575_handle(hw, &link_topo.handle); + if (status) + return status; - /* Calculate the difference between the new and old values, and then - * add it to the software stat value. 
- */ - if (new_data >= *prev_stat) - *cur_stat += new_data - *prev_stat; - else - /* to manage the potential roll-over */ - *cur_stat += (new_data + BIT_ULL(32)) - *prev_stat; + link_topo.topo_params.node_type_ctx = + (ICE_AQC_LINK_TOPO_NODE_CTX_PROVIDED << + ICE_AQC_LINK_TOPO_NODE_CTX_S); - /* Update the previously stored value to prepare for next read */ - *prev_stat = new_data; + addr = cpu_to_le16((u16)offset); + + return ice_aq_write_i2c(hw, link_topo, 0, addr, 1, &data, NULL); } /** - * ice_sched_query_elem - query element information from HW - * @hw: pointer to the HW struct - * @node_teid: node TEID to be queried - * @buf: buffer to element information + * ice_e810t_is_pca9575_present + * @hw: pointer to the hw struct * - * This function queries HW element information + * Check if the SW IO expander is present on the board */ -enum ice_status -ice_sched_query_elem(struct ice_hw *hw, u32 node_teid, - struct ice_aqc_get_elem *buf) +bool ice_e810t_is_pca9575_present(struct ice_hw *hw) { - u16 buf_size, num_elem_ret = 0; enum ice_status status; + u8 data; - buf_size = sizeof(*buf); - memset(buf, 0, buf_size); - buf->generic[0].node_teid = cpu_to_le32(node_teid); - status = ice_aq_query_sched_elems(hw, 1, buf, buf_size, &num_elem_ret, - NULL); - if (status || num_elem_ret != 1) - ice_debug(hw, ICE_DBG_SCHED, "query element failed\n"); - return status; + status = ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P0_IN, &data); + + if (status) + return false; + + return true; } diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index c3df92f57777b19b53055c2fa55b6d25bff0299b..e328501a233023f2edead2bd7f8bf53a20096e47 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -1,23 +1,34 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_COMMON_H_ #define _ICE_COMMON_H_ #include "ice.h" #include "ice_type.h" +#include "ice_nvm.h" #include "ice_flex_pipe.h" +#include "virtchnl.h" #include "ice_switch.h" -#include +#include "ice_fdir.h" -enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw); +#define ICE_SQ_SEND_DELAY_TIME_MS 10 +#define ICE_SQ_SEND_MAX_EXECUTE 3 -void -ice_debug_cq(struct ice_hw *hw, u32 mask, void *desc, void *buf, u16 buf_len); +enum ice_fw_modes { + ICE_FW_MODE_NORMAL, + ICE_FW_MODE_DBG, + ICE_FW_MODE_REC, + ICE_FW_MODE_ROLLBACK +}; + + +void ice_set_umac_shared(struct ice_hw *hw); enum ice_status ice_init_hw(struct ice_hw *hw); void ice_deinit_hw(struct ice_hw *hw); enum ice_status ice_check_reset(struct ice_hw *hw); enum ice_status ice_reset(struct ice_hw *hw, enum ice_reset_req req); + enum ice_status ice_create_all_ctrlq(struct ice_hw *hw); enum ice_status ice_init_all_ctrlq(struct ice_hw *hw); void ice_shutdown_all_ctrlq(struct ice_hw *hw); @@ -32,43 +43,73 @@ enum ice_status ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res, enum ice_aq_res_access_type access, u32 timeout); void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res); -enum ice_status ice_init_nvm(struct ice_hw *hw); enum ice_status -ice_read_sr_buf(struct ice_hw *hw, u16 offset, u16 *words, u16 *data); +ice_alloc_hw_res(struct ice_hw *hw, u16 type, u16 num, bool btm, u16 *res); +enum ice_status +ice_free_hw_res(struct ice_hw *hw, u16 type, u16 num, u16 *res); +enum ice_status +ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, + enum ice_adminq_opc opc, struct ice_sq_cd *cd); +enum ice_status +ice_sq_send_cmd_nolock(struct ice_hw *hw, struct ice_ctl_q_info *cq, + struct ice_aq_desc *desc, void *buf, u16 buf_size, + struct ice_sq_cd *cd); enum ice_status ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, struct ice_aq_desc *desc, void *buf, u16 buf_size, struct ice_sq_cd *cd); void ice_clear_pxe_mode(struct ice_hw *hw); + enum ice_status ice_get_caps(struct ice_hw *hw); void ice_set_safe_mode_caps(struct ice_hw *hw); -void ice_dev_onetime_setup(struct ice_hw *hw); + + + enum ice_status ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, u32 rxq_index); +enum ice_status ice_clear_rxq_ctx(struct ice_hw *hw, u32 rxq_index); +enum ice_status +ice_clear_tx_cmpltnq_ctx(struct ice_hw *hw, u32 tx_cmpltnq_index); +enum ice_status +ice_write_tx_cmpltnq_ctx(struct ice_hw *hw, + struct ice_tx_cmpltnq_ctx *tx_cmpltnq_ctx, + u32 tx_cmpltnq_index); +enum ice_status +ice_clear_tx_drbell_q_ctx(struct ice_hw *hw, u32 tx_drbell_q_index); +enum ice_status +ice_write_tx_drbell_q_ctx(struct ice_hw *hw, + struct ice_tx_drbell_q_ctx *tx_drbell_q_ctx, + u32 tx_drbell_q_index); enum ice_status -ice_aq_get_rss_lut(struct ice_hw *hw, u16 vsi_handle, u8 lut_type, u8 *lut, - u16 lut_size); +ice_aq_get_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *get_params); enum ice_status -ice_aq_set_rss_lut(struct ice_hw *hw, u16 vsi_handle, u8 lut_type, u8 *lut, - u16 lut_size); +ice_aq_set_rss_lut(struct ice_hw *hw, struct ice_aq_get_set_rss_lut_params *set_params); enum ice_status ice_aq_get_rss_key(struct ice_hw *hw, u16 vsi_handle, struct ice_aqc_get_set_rss_keys *keys); enum ice_status ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle, struct ice_aqc_get_set_rss_keys *keys); +enum ice_status +ice_aq_move_recfg_lan_txq(struct ice_hw *hw, u8 num_qs, bool is_move, + bool is_tc_change, bool subseq_call, bool flush_pipe, + u8 
timeout, u32 *blocked_cgds, + struct ice_aqc_move_txqs_data *buf, u16 buf_size, + u8 *txqs_moved, struct ice_sq_cd *cd); bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq); enum ice_status ice_aq_q_shutdown(struct ice_hw *hw, bool unloading); void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode); extern const struct ice_ctx_ele ice_tlan_ctx_info[]; enum ice_status -ice_set_ctx(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info); +ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, + const struct ice_ctx_ele *ce_info); extern struct mutex ice_global_cfg_lock_sw; @@ -81,31 +122,59 @@ enum ice_status ice_aq_send_driver_ver(struct ice_hw *hw, struct ice_driver_ver *dv, struct ice_sq_cd *cd); enum ice_status +ice_aq_set_port_params(struct ice_port_info *pi, u16 bad_frame_vsi, + bool save_bad_pac, bool pad_short_pac, bool double_vlan, + struct ice_sq_cd *cd); +enum ice_status ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode, struct ice_aqc_get_phy_caps_data *caps, struct ice_sq_cd *cd); +enum ice_status +ice_aq_list_caps(struct ice_hw *hw, void *buf, u16 buf_size, u32 *cap_count, + enum ice_adminq_opc opc, struct ice_sq_cd *cd); +enum ice_status +ice_discover_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_caps); void ice_update_phy_type(u64 *phy_type_low, u64 *phy_type_high, u16 link_speeds_bitmap); enum ice_status ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags, struct ice_sq_cd *cd); + enum ice_status ice_clear_pf_cfg(struct ice_hw *hw); enum ice_status -ice_aq_set_phy_cfg(struct ice_hw *hw, u8 lport, +ice_aq_set_phy_cfg(struct ice_hw *hw, struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd); +bool ice_fw_supports_link_override(struct ice_hw *hw); +enum ice_status +ice_get_link_default_override(struct ice_link_default_override_tlv *ldo, + struct ice_port_info *pi); +bool ice_is_phy_caps_an_enabled(struct ice_aqc_get_phy_caps_data *caps); + +enum ice_fc_mode ice_caps_to_fc_mode(u8 caps); +enum ice_fec_mode ice_caps_to_fec_mode(u8 caps, u8 fec_options); enum ice_status ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update); +enum ice_status +ice_cfg_phy_fc(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, + enum ice_fc_mode req_mode); +bool +ice_phy_caps_equals_cfg(struct ice_aqc_get_phy_caps_data *caps, + struct ice_aqc_set_phy_cfg_data *cfg); void -ice_cfg_phy_fec(struct ice_aqc_set_phy_cfg_data *cfg, enum ice_fec_mode fec); -void -ice_copy_phy_caps_to_cfg(struct ice_aqc_get_phy_caps_data *caps, +ice_copy_phy_caps_to_cfg(struct ice_port_info *pi, + struct ice_aqc_get_phy_caps_data *caps, struct ice_aqc_set_phy_cfg_data *cfg); enum ice_status +ice_cfg_phy_fec(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, + enum ice_fec_mode fec); +enum ice_status ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link, struct ice_sq_cd *cd); enum ice_status +ice_aq_set_mac_cfg(struct ice_hw *hw, u16 max_frame_size, struct ice_sq_cd *cd); +enum ice_status ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, struct ice_link_status *link, struct ice_sq_cd *cd); enum ice_status @@ -114,17 +183,54 @@ ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask, enum ice_status ice_aq_set_mac_loopback(struct ice_hw *hw, bool ena_lpbk, struct ice_sq_cd *cd); + enum ice_status ice_aq_set_port_id_led(struct ice_port_info *pi, bool is_orig_mode, struct ice_sq_cd *cd); +enum ice_status 
+ice_aq_sff_eeprom(struct ice_hw *hw, u16 lport, u8 bus_addr, + u16 mem_addr, u8 page, u8 set_page, u8 *data, u8 length, + bool write, struct ice_sq_cd *cd); + +enum ice_status +ice_aq_prog_topo_dev_nvm(struct ice_hw *hw, + struct ice_aqc_link_topo_params *topo_params, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_read_topo_dev_nvm(struct ice_hw *hw, + struct ice_aqc_link_topo_params *topo_params, + u32 start_address, u8 *buf, u8 buf_size, + struct ice_sq_cd *cd); + +void ice_dump_port_info(struct ice_port_info *pi); +void ice_dump_caps(struct ice_hw *hw); +void ice_dump_ptp_dev_caps(struct ice_hw *hw); +void ice_dump_ptp_func_caps(struct ice_hw *hw); +enum ice_status ice_dump_port_dflt_topo(struct ice_port_info *pi); +void ice_dump_port_topo(struct ice_port_info *pi); + +enum ice_status +ice_aq_get_port_options(struct ice_hw *hw, + struct ice_aqc_get_port_options_elem *options, + u8 *option_count, u8 lport, bool lport_valid, + u8 *active_option_idx, bool *active_option_valid); +enum ice_status +ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *max_rdmaqs); +enum ice_status +ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 *rdma_qset, u16 num_qsets, u32 *qset_teid); +enum ice_status +ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid, + u16 *q_id); enum ice_status ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues, u16 *q_handle, u16 *q_ids, u32 *q_teids, enum ice_disq_rst_src rst_src, u16 vmvf_num, struct ice_sq_cd *cd); enum ice_status -ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap, +ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, u16 *max_lanqs); enum ice_status ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, @@ -132,17 +238,77 @@ ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, struct ice_sq_cd *cd); enum ice_status ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle); void ice_replay_post(struct ice_hw *hw); -void ice_output_fw_log(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf); +struct ice_q_ctx * +ice_get_lan_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 q_handle); +enum ice_status ice_sbq_rw_reg_lp(struct ice_hw *hw, + struct ice_sbq_msg_input *in, bool lock); +void ice_sbq_lock(struct ice_hw *hw); +void ice_sbq_unlock(struct ice_hw *hw); +enum ice_status ice_sbq_rw_reg(struct ice_hw *hw, struct ice_sbq_msg_input *in); void ice_stat_update40(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, u64 *prev_stat, u64 *cur_stat); void ice_stat_update32(struct ice_hw *hw, u32 reg, bool prev_stat_loaded, u64 *prev_stat, u64 *cur_stat); -void -ice_get_nvm_version(struct ice_hw *hw, u8 *oem_ver, u16 *oem_build, - u8 *oem_patch, u8 *ver_hi, u8 *ver_lo); +enum ice_fw_modes ice_get_fw_mode(struct ice_hw *hw); +void ice_print_rollback_msg(struct ice_hw *hw); +bool ice_is_generic_mac(struct ice_hw *hw); +bool ice_is_e810(struct ice_hw *hw); enum ice_status ice_sched_query_elem(struct ice_hw *hw, u32 node_teid, - struct ice_aqc_get_elem *buf); + struct ice_aqc_txsched_elem_data *buf); +enum ice_status +ice_aq_set_driver_param(struct ice_hw *hw, enum ice_aqc_driver_params idx, + u32 value, struct ice_sq_cd *cd); +enum ice_status +ice_aq_get_driver_param(struct ice_hw *hw, enum ice_aqc_driver_params idx, + u32 *value, struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, bool value, + struct ice_sq_cd *cd); +enum 
ice_status +ice_aq_get_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, + bool *value, struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size, + struct ice_sq_cd *cd); +bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw); +enum ice_status +ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add); +enum ice_status +ice_aq_read_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, + u16 bus_addr, __le16 addr, u8 params, u8 *data, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_write_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, + u16 bus_addr, __le16 addr, u8 params, u8 *data, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_health_status_config(struct ice_hw *hw, u8 event_source, + struct ice_sq_cd *cd); +bool ice_is_fw_health_report_supported(struct ice_hw *hw); +bool ice_fw_supports_report_dflt_cfg(struct ice_hw *hw); + +/* E810T PCA9575 IO controller registers */ +#define ICE_PCA9575_P0_IN 0x0 +#define ICE_PCA9575_P1_IN 0x1 +#define ICE_PCA9575_P0_CFG 0x8 +#define ICE_PCA9575_P1_CFG 0x9 +#define ICE_PCA9575_P0_OUT 0xA +#define ICE_PCA9575_P1_OUT 0xB + +/* E810T PCA9575 IO controller pin control */ +#define ICE_E810T_P0_GNSS_PRSNT_N BIT(4) +#define ICE_E810T_P1_SMA1_DIR_EN BIT(4) +#define ICE_E810T_P1_SMA1_TX_EN BIT(5) +#define ICE_E810T_P1_SMA2_UFL2_RX_DIS BIT(3) +#define ICE_E810T_P1_SMA2_DIR_EN BIT(6) +#define ICE_E810T_P1_SMA2_TX_EN BIT(7) + +enum ice_status +ice_read_e810t_pca9575_reg(struct ice_hw *hw, u8 offset, u8 *data); +enum ice_status +ice_write_e810t_pca9575_reg(struct ice_hw *hw, u8 offset, u8 data); +bool ice_e810t_is_pca9575_present(struct ice_hw *hw); #endif /* _ICE_COMMON_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c index 2e9c97bad3c37c16278ecb4e9bafec622d9944ec..345ca841b42966640b471ec8543dc412a9f3bf9a 100644 --- a/drivers/net/ethernet/intel/ice/ice_controlq.c +++ b/drivers/net/ethernet/intel/ice/ice_controlq.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #include "ice_common.h" + #define ICE_CQ_INIT_REGS(qinfo, prefix) \ do { \ (qinfo)->sq.head = prefix##_ATQH; \ @@ -12,6 +13,7 @@ do { \ (qinfo)->sq.bal = prefix##_ATQBAL; \ (qinfo)->sq.len_mask = prefix##_ATQLEN_ATQLEN_M; \ (qinfo)->sq.len_ena_mask = prefix##_ATQLEN_ATQENABLE_M; \ + (qinfo)->sq.len_crit_mask = prefix##_ATQLEN_ATQCRIT_M; \ (qinfo)->sq.head_mask = prefix##_ATQH_ATQH_M; \ (qinfo)->rq.head = prefix##_ARQH; \ (qinfo)->rq.tail = prefix##_ARQT; \ @@ -20,9 +22,11 @@ do { \ (qinfo)->rq.bal = prefix##_ARQBAL; \ (qinfo)->rq.len_mask = prefix##_ARQLEN_ARQLEN_M; \ (qinfo)->rq.len_ena_mask = prefix##_ARQLEN_ARQENABLE_M; \ + (qinfo)->rq.len_crit_mask = prefix##_ARQLEN_ARQCRIT_M; \ (qinfo)->rq.head_mask = prefix##_ARQH_ARQH_M; \ } while (0) + /** * ice_adminq_init_regs - Initialize AdminQ registers * @hw: pointer to the hardware structure @@ -36,6 +40,7 @@ static void ice_adminq_init_regs(struct ice_hw *hw) ICE_CQ_INIT_REGS(cq, PF_FW); } + /** * ice_mailbox_init_regs - Initialize Mailbox registers * @hw: pointer to the hardware structure @@ -49,6 +54,19 @@ static void ice_mailbox_init_regs(struct ice_hw *hw) ICE_CQ_INIT_REGS(cq, PF_MBX); } +/** + * ice_sb_init_regs - Initialize Sideband registers + * @hw: pointer to the hardware structure + * + * This assumes the alloc_sq and alloc_rq functions have already been called + */ +static void ice_sb_init_regs(struct ice_hw *hw) +{ + struct ice_ctl_q_info *cq = &hw->sbq; + + ICE_CQ_INIT_REGS(cq, PF_SB); +} + /** * ice_check_sq_alive * @hw: pointer to the HW struct @@ -310,9 +328,10 @@ ice_cfg_rq_regs(struct ice_hw *hw, struct ice_ctl_q_info *cq) #define ICE_FREE_CQ_BUFS(hw, qi, ring) \ do { \ - int i; \ /* free descriptors */ \ - if ((qi)->ring.r.ring##_bi) \ + if ((qi)->ring.r.ring##_bi) { \ + int i; \ + \ for (i = 0; i < (qi)->num_##ring##_entries; i++) \ if ((qi)->ring.r.ring##_bi[i].pa) { \ dmam_free_coherent(ice_hw_to_dev(hw), \ @@ -323,6 +342,7 @@ do { \ (qi)->ring.r.ring##_bi[i].pa = 0;\ (qi)->ring.r.ring##_bi[i].size = 0;\ } \ + } \ /* free the buffer info list */ \ if ((qi)->ring.cmd_buf) \ devm_kfree(ice_hw_to_dev(hw), (qi)->ring.cmd_buf); \ @@ -555,6 +575,7 @@ ice_shutdown_rq(struct ice_hw *hw, struct ice_ctl_q_info *cq) return ret_code; } + /** * ice_init_check_adminq - Check version for Admin Queue to know if its alive * @hw: pointer to the hardware structure @@ -568,6 +589,7 @@ static enum ice_status ice_init_check_adminq(struct ice_hw *hw) if (status) goto init_ctrlq_free_rq; + if (!ice_aq_ver_check(hw)) { status = ICE_ERR_FW_API_VER; goto init_ctrlq_free_rq; @@ -605,6 +627,10 @@ static enum ice_status ice_init_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) ice_adminq_init_regs(hw); cq = &hw->adminq; break; + case ICE_CTL_Q_SB: + ice_sb_init_regs(hw); + cq = &hw->sbq; + break; case ICE_CTL_Q_MAILBOX: ice_mailbox_init_regs(hw); cq = &hw->mailboxq; @@ -641,6 +667,68 @@ static enum ice_status ice_init_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) return ret_code; } +/** + * ice_is_sbq_supported - is the sideband queue supported + * @hw: pointer to the hardware structure + * + * Returns true if the sideband control queue interface is + * supported for the device, false otherwise + */ +static bool ice_is_sbq_supported(struct ice_hw *hw) +{ + return ice_is_generic_mac(hw); +} + +/** + * ice_shutdown_ctrlq - shutdown routine for any control queue + * @hw: pointer to the hardware structure + * @q_type: specific Control queue type + * + * NOTE: this function does not destroy the control queue locks. 
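ice_is_sbq_supported() keys the new sideband queue off ice_is_generic_mac(), and the init path below falls back to the admin queue interface when the SBQ is absent. A sketch of a queue selector built on the same rule; the helper name is an assumption, not something this hunk adds:

	static struct ice_ctl_q_info *ice_example_get_sbq(struct ice_hw *hw)
	{
		/* sideband traffic rides the admin queue on devices without SBQ */
		return ice_is_sbq_supported(hw) ? &hw->sbq : &hw->adminq;
	}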
+ */ +static void ice_shutdown_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) +{ + struct ice_ctl_q_info *cq; + + switch (q_type) { + case ICE_CTL_Q_ADMIN: + cq = &hw->adminq; + if (ice_check_sq_alive(hw, cq)) + ice_aq_q_shutdown(hw, true); + break; + case ICE_CTL_Q_SB: + cq = &hw->sbq; + break; + case ICE_CTL_Q_MAILBOX: + cq = &hw->mailboxq; + break; + default: + return; + } + + ice_shutdown_sq(hw, cq); + ice_shutdown_rq(hw, cq); +} + +/** + * ice_shutdown_all_ctrlq - shutdown routine for all control queues + * @hw: pointer to the hardware structure + * + * NOTE: this function does not destroy the control queue locks. The driver + * may call this at runtime to shutdown and later restart control queues, such + * as in response to a reset event. + */ +void ice_shutdown_all_ctrlq(struct ice_hw *hw) +{ + /* Shutdown FW admin queue */ + ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); + /* Shutdown PHY Sideband */ + if (ice_is_sbq_supported(hw)) + ice_shutdown_ctrlq(hw, ICE_CTL_Q_SB); + /* Shutdown PF-VF Mailbox */ + ice_shutdown_ctrlq(hw, ICE_CTL_Q_MAILBOX); +} + /** * ice_init_all_ctrlq - main initialization routine for all control queues * @hw: pointer to the hardware structure @@ -656,17 +744,35 @@ static enum ice_status ice_init_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) */ enum ice_status ice_init_all_ctrlq(struct ice_hw *hw) { - enum ice_status ret_code; + enum ice_status status; + u32 retry = 0; /* Init FW admin queue */ - ret_code = ice_init_ctrlq(hw, ICE_CTL_Q_ADMIN); - if (ret_code) - return ret_code; + do { + status = ice_init_ctrlq(hw, ICE_CTL_Q_ADMIN); + if (status) + return status; - ret_code = ice_init_check_adminq(hw); - if (ret_code) - return ret_code; + status = ice_init_check_adminq(hw); + if (status != ICE_ERR_AQ_FW_CRITICAL) + break; + + ice_debug(hw, ICE_DBG_AQ_MSG, "Retry Admin Queue init due to FW critical error\n"); + ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); + msleep(ICE_CTL_Q_ADMIN_INIT_MSEC); + } while (retry++ < ICE_CTL_Q_ADMIN_INIT_TIMEOUT); + if (status) + return status; + /* sideband control queue (SBQ) interface is not supported on some + * devices. Initialize if supported, else fallback to the admin queue + * interface + */ + if (ice_is_sbq_supported(hw)) { + status = ice_init_ctrlq(hw, ICE_CTL_Q_SB); + if (status) + return status; + } /* Init Mailbox queue */ return ice_init_ctrlq(hw, ICE_CTL_Q_MAILBOX); } @@ -702,63 +808,20 @@ static void ice_init_ctrlq_locks(struct ice_ctl_q_info *cq) enum ice_status ice_create_all_ctrlq(struct ice_hw *hw) { ice_init_ctrlq_locks(&hw->adminq); + if (ice_is_sbq_supported(hw)) + ice_init_ctrlq_locks(&hw->sbq); ice_init_ctrlq_locks(&hw->mailboxq); return ice_init_all_ctrlq(hw); } -/** - * ice_shutdown_ctrlq - shutdown routine for any control queue - * @hw: pointer to the hardware structure - * @q_type: specific Control queue type - * - * NOTE: this function does not destroy the control queue locks. - */ -static void ice_shutdown_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) -{ - struct ice_ctl_q_info *cq; - - switch (q_type) { - case ICE_CTL_Q_ADMIN: - cq = &hw->adminq; - if (ice_check_sq_alive(hw, cq)) - ice_aq_q_shutdown(hw, true); - break; - case ICE_CTL_Q_MAILBOX: - cq = &hw->mailboxq; - break; - default: - return; - } - - ice_shutdown_sq(hw, cq); - ice_shutdown_rq(hw, cq); -} - -/** - * ice_shutdown_all_ctrlq - shutdown routine for all control queues - * @hw: pointer to the hardware structure - * - * NOTE: this function does not destroy the control queue locks. 
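Because ice_shutdown_all_ctrlq() leaves the mutexes alive, a reset path can bounce the queues without re-running ice_create_all_ctrlq(); the init side then retries a critical FW failure up to ICE_CTL_Q_ADMIN_INIT_TIMEOUT times with ICE_CTL_Q_ADMIN_INIT_MSEC sleeps, roughly one extra second worst case. A minimal sketch of the pairing, error handling elided:

	/* illustrative reset recovery: locks persist across the bounce */
	ice_shutdown_all_ctrlq(hw);
	status = ice_init_all_ctrlq(hw);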
The driver - * may call this at runtime to shutdown and later restart control queues, such - * as in response to a reset event. - */ -void ice_shutdown_all_ctrlq(struct ice_hw *hw) -{ - /* Shutdown FW admin queue */ - ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); - /* Shutdown PF-VF Mailbox */ - ice_shutdown_ctrlq(hw, ICE_CTL_Q_MAILBOX); -} - /** * ice_destroy_ctrlq_locks - Destroy locks for a control queue * @cq: pointer to the control queue * * Destroys the send and receive queue locks for a given control queue. */ -static void -ice_destroy_ctrlq_locks(struct ice_ctl_q_info *cq) +static void ice_destroy_ctrlq_locks(struct ice_ctl_q_info *cq) { mutex_destroy(&cq->sq_lock); mutex_destroy(&cq->rq_lock); @@ -779,6 +842,8 @@ void ice_destroy_all_ctrlq(struct ice_hw *hw) ice_shutdown_all_ctrlq(hw); ice_destroy_ctrlq_locks(&hw->adminq); + if (ice_is_sbq_supported(hw)) + ice_destroy_ctrlq_locks(&hw->sbq); ice_destroy_ctrlq_locks(&hw->mailboxq); } @@ -800,8 +865,7 @@ static u16 ice_clean_sq(struct ice_hw *hw, struct ice_ctl_q_info *cq) details = ICE_CTL_Q_DETAILS(*sq, ntc); while (rd32(hw, cq->sq.head) != ntc) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "ntc %d head %d.\n", ntc, rd32(hw, cq->sq.head)); + ice_debug(hw, ICE_DBG_AQ_MSG, "ntc %d head %d.\n", ntc, rd32(hw, cq->sq.head)); memset(desc, 0, sizeof(*desc)); memset(details, 0, sizeof(*details)); ntc++; @@ -816,6 +880,54 @@ static u16 ice_clean_sq(struct ice_hw *hw, struct ice_ctl_q_info *cq) return ICE_CTL_Q_DESC_UNUSED(sq); } +/** + * ice_debug_cq + * @hw: pointer to the hardware structure + * @desc: pointer to control queue descriptor + * @buf: pointer to command buffer + * @buf_len: max length of buf + * + * Dumps debug log about control command with descriptor contents. + */ +static void ice_debug_cq(struct ice_hw *hw, void *desc, void *buf, u16 buf_len) +{ + struct ice_aq_desc *cq_desc = desc; + u16 datalen, flags; + + if (!IS_ENABLED(CONFIG_DYNAMIC_DEBUG) && + !((ICE_DBG_AQ_DESC | ICE_DBG_AQ_DESC_BUF) & hw->debug_mask)) + return; + + if (!desc) + return; + + datalen = le16_to_cpu(cq_desc->datalen); + flags = le16_to_cpu(cq_desc->flags); + + ice_debug(hw, ICE_DBG_AQ_DESC, "CQ CMD: opcode 0x%04X, flags 0x%04X, datalen 0x%04X, retval 0x%04X\n", + le16_to_cpu(cq_desc->opcode), flags, datalen, + le16_to_cpu(cq_desc->retval)); + ice_debug(hw, ICE_DBG_AQ_DESC, "\tcookie (h,l) 0x%08X 0x%08X\n", + le32_to_cpu(cq_desc->cookie_high), + le32_to_cpu(cq_desc->cookie_low)); + ice_debug(hw, ICE_DBG_AQ_DESC, "\tparam (0,1) 0x%08X 0x%08X\n", + le32_to_cpu(cq_desc->params.generic.param0), + le32_to_cpu(cq_desc->params.generic.param1)); + ice_debug(hw, ICE_DBG_AQ_DESC, "\taddr (h,l) 0x%08X 0x%08X\n", + le32_to_cpu(cq_desc->params.generic.addr_high), + le32_to_cpu(cq_desc->params.generic.addr_low)); + /* Dump buffer iff 1) one exists and 2) is either a response indicated + * by the DD and/or CMP flag set or a command with the RD flag set. 
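With dynamic debug compiled out, the relocated ice_debug_cq() below prints nothing unless the descriptor bits are set in hw->debug_mask, so a bring-up build that wants the dumps can simply enable them before issuing commands:

	/* sketch: enable descriptor and buffer dumps for control queues */
	hw->debug_mask |= ICE_DBG_AQ_DESC | ICE_DBG_AQ_DESC_BUF;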
+ */ + if (buf && cq_desc->datalen != 0 && + (flags & (ICE_AQ_FLAG_DD | ICE_AQ_FLAG_CMP) || + flags & ICE_AQ_FLAG_RD)) { + ice_debug(hw, ICE_DBG_AQ_DESC_BUF, "Buffer:\n"); + ice_debug_array(hw, ICE_DBG_AQ_DESC_BUF, 16, 1, buf, + min_t(u16, buf_len, datalen)); + } +} + /** * ice_sq_done - check if FW has processed the Admin Send Queue (ATQ) * @hw: pointer to the HW struct @@ -833,7 +945,7 @@ static bool ice_sq_done(struct ice_hw *hw, struct ice_ctl_q_info *cq) } /** - * ice_sq_send_cmd - send command to Control Queue (ATQ) + * ice_sq_send_cmd_nolock - send command to Control Queue (ATQ) * @hw: pointer to the HW struct * @cq: pointer to the specific Control queue * @desc: prefilled descriptor describing the command (non DMA mem) @@ -845,9 +957,9 @@ static bool ice_sq_done(struct ice_hw *hw, struct ice_ctl_q_info *cq) * cleans the queue, etc. */ enum ice_status -ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, - struct ice_aq_desc *desc, void *buf, u16 buf_size, - struct ice_sq_cd *cd) +ice_sq_send_cmd_nolock(struct ice_hw *hw, struct ice_ctl_q_info *cq, + struct ice_aq_desc *desc, void *buf, u16 buf_size, + struct ice_sq_cd *cd) { struct ice_dma_mem *dma_buf = NULL; struct ice_aq_desc *desc_on_ring; @@ -861,13 +973,11 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, /* if reset is in progress return a soft error */ if (hw->reset_ongoing) return ICE_ERR_RESET_ONGOING; - mutex_lock(&cq->sq_lock); cq->sq_last_status = ICE_AQ_RC_OK; if (!cq->sq.count) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Send queue not initialized.\n"); + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Send queue not initialized.\n"); status = ICE_ERR_AQ_EMPTY; goto sq_send_command_error; } @@ -879,8 +989,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, if (buf) { if (buf_size > cq->sq_buf_size) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Invalid buffer size for Control Send queue: %d.\n", + ice_debug(hw, ICE_DBG_AQ_MSG, "Invalid buffer size for Control Send queue: %d.\n", buf_size); status = ICE_ERR_INVAL_SIZE; goto sq_send_command_error; @@ -893,8 +1002,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, val = rd32(hw, cq->sq.head); if (val >= cq->num_sq_entries) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "head overrun at %d in the Control Send Queue ring\n", + ice_debug(hw, ICE_DBG_AQ_MSG, "head overrun at %d in the Control Send Queue ring\n", val); status = ICE_ERR_AQ_EMPTY; goto sq_send_command_error; @@ -912,8 +1020,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, * called in a separate thread in case of asynchronous completions. 
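ice_sq_done() reports completion once the head register catches up with next_to_use; the send path busy-waits on it outside this hunk, so the loop below is only an approximation assembled from the ICE_CTL_Q_SQ_CMD_* constants in ice_controlq.h (10000 checks x 100 us, about 1 s), with total_delay a local u32 starting at zero:

	do {
		if (ice_sq_done(hw, cq))
			break;

		udelay(ICE_CTL_Q_SQ_CMD_USEC);
	} while (++total_delay < cq->sq_cmd_timeout);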
*/ if (ice_clean_sq(hw, cq) == 0) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Error: Control Send Queue is full.\n"); + ice_debug(hw, ICE_DBG_AQ_MSG, "Error: Control Send Queue is full.\n"); status = ICE_ERR_AQ_FULL; goto sq_send_command_error; } @@ -941,10 +1048,9 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, } /* Debug desc and buffer */ - ice_debug(hw, ICE_DBG_AQ_MSG, - "ATQ: Control Send queue desc and buffer:\n"); + ice_debug(hw, ICE_DBG_AQ_DESC, "ATQ: Control Send queue desc and buffer:\n"); - ice_debug_cq(hw, ICE_DBG_AQ_CMD, (void *)desc_on_ring, buf, buf_size); + ice_debug_cq(hw, (void *)desc_on_ring, buf, buf_size); (cq->sq.next_to_use)++; if (cq->sq.next_to_use == cq->sq.count) @@ -967,8 +1073,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, u16 copy_size = le16_to_cpu(desc->datalen); if (copy_size > buf_size) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Return len %d > than buf len %d\n", + ice_debug(hw, ICE_DBG_AQ_MSG, "Return len %d > than buf len %d\n", copy_size, buf_size); status = ICE_ERR_AQ_ERROR; } else { @@ -977,8 +1082,8 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, } retval = le16_to_cpu(desc->retval); if (retval) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Send Queue command completed with error 0x%x\n", + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Send Queue command 0x%04X completed with error 0x%X\n", + le16_to_cpu(desc->opcode), retval); /* strip off FW internal code */ @@ -990,10 +1095,9 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, cq->sq_last_status = (enum ice_aq_err)retval; } - ice_debug(hw, ICE_DBG_AQ_MSG, - "ATQ: desc and buffer writeback:\n"); + ice_debug(hw, ICE_DBG_AQ_MSG, "ATQ: desc and buffer writeback:\n"); - ice_debug_cq(hw, ICE_DBG_AQ_CMD, (void *)desc, buf, buf_size); + ice_debug_cq(hw, (void *)desc, buf, buf_size); /* save writeback AQ if requested */ if (details->wb_desc) @@ -1002,13 +1106,47 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, /* update the error if time out occurred */ if (!cmd_completed) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Send Queue Writeback timeout.\n"); - status = ICE_ERR_AQ_TIMEOUT; + if (rd32(hw, cq->rq.len) & cq->rq.len_crit_mask || + rd32(hw, cq->sq.len) & cq->sq.len_crit_mask) { + ice_debug(hw, ICE_DBG_AQ_MSG, "Critical FW error.\n"); + status = ICE_ERR_AQ_FW_CRITICAL; + } else { + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Send Queue Writeback timeout.\n"); + status = ICE_ERR_AQ_TIMEOUT; + } } sq_send_command_error: + return status; +} + +/** + * ice_sq_send_cmd - send command to Control Queue (ATQ) + * @hw: pointer to the HW struct + * @cq: pointer to the specific Control queue + * @desc: prefilled descriptor describing the command + * @buf: buffer to use for indirect commands (or NULL for direct commands) + * @buf_size: size of buffer for indirect commands (or 0 for direct commands) + * @cd: pointer to command details structure + * + * This is the main send command routine for the ATQ. It runs the queue, + * cleans the queue, etc. 
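The new len_crit_mask test below lets callers tell a latched critical FW error (ATQCRIT/ARQCRIT) apart from a plain writeback timeout. A hedged caller-side sketch; ice_example_recover() stands in for whatever reset path the driver actually uses:

	status = ice_sq_send_cmd(hw, &hw->adminq, &desc, NULL, 0, NULL);
	if (status == ICE_ERR_AQ_FW_CRITICAL)
		ice_example_recover(hw);	/* hypothetical recovery hook */
	else if (status == ICE_ERR_AQ_TIMEOUT)
		retry = true;	/* FW alive but slow; a resend may suffice */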
+ */ +enum ice_status +ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, + struct ice_aq_desc *desc, void *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + enum ice_status status = 0; + + /* if reset is in progress return a soft error */ + if (hw->reset_ongoing) + return ICE_ERR_RESET_ONGOING; + + mutex_lock(&cq->sq_lock); + status = ice_sq_send_cmd_nolock(hw, cq, desc, buf, buf_size, cd); mutex_unlock(&cq->sq_lock); + return status; } @@ -1043,6 +1181,7 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq, struct ice_rq_event_info *e, u16 *pending) { u16 ntc = cq->rq.next_to_clean; + enum ice_aq_err rq_last_status; enum ice_status ret_code = 0; struct ice_aq_desc *desc; struct ice_dma_mem *bi; @@ -1058,8 +1197,7 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq, mutex_lock(&cq->rq_lock); if (!cq->rq.count) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Receive queue not initialized.\n"); + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Receive queue not initialized.\n"); ret_code = ICE_ERR_AQ_EMPTY; goto clean_rq_elem_err; } @@ -1077,24 +1215,23 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq, desc = ICE_CTL_Q_DESC(cq->rq, ntc); desc_idx = ntc; - cq->rq_last_status = (enum ice_aq_err)le16_to_cpu(desc->retval); + rq_last_status = (enum ice_aq_err)le16_to_cpu(desc->retval); flags = le16_to_cpu(desc->flags); if (flags & ICE_AQ_FLAG_ERR) { ret_code = ICE_ERR_AQ_ERROR; - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Receive Queue Event received with error 0x%x\n", - cq->rq_last_status); + ice_debug(hw, ICE_DBG_AQ_MSG, "Control Receive Queue Event 0x%04X received with error 0x%X\n", + le16_to_cpu(desc->opcode), rq_last_status); } memcpy(&e->desc, desc, sizeof(e->desc)); datalen = le16_to_cpu(desc->datalen); - e->msg_len = min(datalen, e->buf_len); + e->msg_len = min_t(u16, datalen, e->buf_len); if (e->msg_buf && e->msg_len) memcpy(e->msg_buf, cq->rq.r.rq_bi[desc_idx].va, e->msg_len); - ice_debug(hw, ICE_DBG_AQ_MSG, "ARQ: desc and buffer:\n"); + ice_debug(hw, ICE_DBG_AQ_DESC, "ARQ: desc and buffer:\n"); + + ice_debug_cq(hw, (void *)desc, e->msg_buf, cq->rq_buf_size); - ice_debug_cq(hw, ICE_DBG_AQ_CMD, (void *)desc, e->msg_buf, - cq->rq_buf_size); /* Restore the original datalen and buffer address in the desc, * FW updates datalen to indicate the event message size diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.h b/drivers/net/ethernet/intel/ice/ice_controlq.h index 3b1d35365ef0f0c2506e600391008b4a3b9bb7ca..4ab82a4cf43c46071507aaa37ff3549abb8194a6 100644 --- a/drivers/net/ethernet/intel/ice/ice_controlq.h +++ b/drivers/net/ethernet/intel/ice/ice_controlq.h @@ -1,39 +1,44 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_CONTROLQ_H_ #define _ICE_CONTROLQ_H_ #include "ice_adminq_cmd.h" + /* Maximum buffer lengths for all control queue types */ #define ICE_AQ_MAX_BUF_LEN 4096 #define ICE_MBXQ_MAX_BUF_LEN 4096 +#define ICE_SBQ_MAX_BUF_LEN 512 #define ICE_CTL_Q_DESC(R, i) \ (&(((struct ice_aq_desc *)((R).desc_buf.va))[i])) #define ICE_CTL_Q_DESC_UNUSED(R) \ - (u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ - (R)->next_to_clean - (R)->next_to_use - 1) + ((u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ + (R)->next_to_clean - (R)->next_to_use - 1)) /* Defines that help manage the driver vs FW API checks. * Take a look at ice_aq_ver_check in ice_controlq.c for actual usage. 
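The split into a locked wrapper and ice_sq_send_cmd_nolock() lets a caller hold sq_lock across dependent commands; the exported ice_sbq_lock()/ice_sbq_unlock() in ice_common.h serve the same purpose for the sideband queue. A sketch, with descriptor setup elided:

	mutex_lock(&cq->sq_lock);
	status = ice_sq_send_cmd_nolock(hw, cq, &first, NULL, 0, NULL);
	if (!status)
		status = ice_sq_send_cmd_nolock(hw, cq, &second, NULL, 0, NULL);
	mutex_unlock(&cq->sq_lock);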
*/
#define EXP_FW_API_VER_BRANCH		0x00
#define EXP_FW_API_VER_MAJOR		0x01
-#define EXP_FW_API_VER_MINOR		0x03
+#define EXP_FW_API_VER_MINOR		0x05

/* Different control queue types: These are mainly for SW consumption. */
enum ice_ctl_q {
	ICE_CTL_Q_UNKNOWN = 0,
	ICE_CTL_Q_ADMIN,
	ICE_CTL_Q_MAILBOX,
+	ICE_CTL_Q_SB,
};

/* Control Queue timeout settings - max delay 1s */
#define ICE_CTL_Q_SQ_CMD_TIMEOUT	10000 /* Count 10000 times */
#define ICE_CTL_Q_SQ_CMD_USEC		100   /* Check every 100usec */
+#define ICE_CTL_Q_ADMIN_INIT_TIMEOUT	10    /* Count 10 times */
+#define ICE_CTL_Q_ADMIN_INIT_MSEC	100   /* Check every 100msec */

struct ice_ctl_q_ring {
	void *dma_head;	/* Virtual address to DMA head */
@@ -59,6 +64,7 @@ struct ice_ctl_q_ring {
	u32 bal;
	u32 len_mask;
	u32 len_ena_mask;
+	u32 len_crit_mask;
	u32 head_mask;
};

@@ -80,7 +86,6 @@ struct ice_rq_event_info {
/* Control Queue information */
struct ice_ctl_q_info {
	enum ice_ctl_q qtype;
-	enum ice_aq_err rq_last_status;	/* last status on receive queue */
	struct ice_ctl_q_ring rq;	/* receive queue */
	struct ice_ctl_q_ring sq;	/* send queue */
	u32 sq_cmd_timeout;		/* send queue cmd write back timeout */
diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c
index dd7efff121bd60b3e02499624bad34dc94246a88..35b2b6c60262a5739b53d0580e680de6247f0378 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2019, Intel Corporation. */
+/* Copyright (C) 2018-2021, Intel Corporation. */

#include "ice_common.h"
#include "ice_sched.h"
@@ -18,7 +18,7 @@
 *
 * Requests the complete LLDP MIB (entire packet). (0x0A00)
 */
-static enum ice_status
+enum ice_status
ice_aq_get_lldp_mib(struct ice_hw *hw, u8 bridge_type, u8 mib_type, void *buf,
		    u16 buf_size, u16 *local_len, u16 *remote_len,
		    struct ice_sq_cd *cd)
@@ -77,6 +77,108 @@ ice_aq_cfg_lldp_mib_change(struct ice_hw *hw, bool ena_update,
	return ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
}

+/**
+ * ice_aq_add_delete_lldp_tlv
+ * @hw: pointer to the HW struct
+ * @bridge_type: type of bridge
+ * @add_lldp_tlv: add (true) or delete (false) TLV
+ * @buf: buffer with TLV to add or delete
+ * @buf_size: length of the buffer
+ * @tlv_len: length of the TLV to be added/deleted
+ * @mib_len: length of the LLDP MIB returned in response
+ * @cd: pointer to command details structure or NULL
+ *
+ * (Add TLV)
+ * Add the specified TLV to LLDP Local MIB for the given bridge type;
+ * it is the responsibility of the caller to make sure that the TLV is not
+ * already present in the LLDPDU.
+ * In return, the firmware will write the complete LLDP MIB with the newly
+ * added TLV in the response buffer. (0x0A02)
+ *
+ * (Delete TLV)
+ * Delete the specified TLV from LLDP Local MIB for the given bridge type.
+ * The firmware places the entire LLDP MIB in the response buffer.
(0x0A04) + */ +enum ice_status +ice_aq_add_delete_lldp_tlv(struct ice_hw *hw, u8 bridge_type, bool add_lldp_tlv, + void *buf, u16 buf_size, u16 tlv_len, u16 *mib_len, + struct ice_sq_cd *cd) +{ + struct ice_aqc_lldp_add_delete_tlv *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (tlv_len == 0) + return ICE_ERR_PARAM; + + cmd = &desc.params.lldp_add_delete_tlv; + + if (add_lldp_tlv) + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_add_tlv); + else + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_lldp_delete_tlv); + + desc.flags |= cpu_to_le16((u16)(ICE_AQ_FLAG_RD)); + + cmd->type = ((bridge_type << ICE_AQ_LLDP_BRID_TYPE_S) & + ICE_AQ_LLDP_BRID_TYPE_M); + cmd->len = cpu_to_le16(tlv_len); + + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + if (!status && mib_len) + *mib_len = le16_to_cpu(desc.datalen); + + return status; +} + +/** + * ice_aq_update_lldp_tlv + * @hw: pointer to the HW struct + * @bridge_type: type of bridge + * @buf: buffer with TLV to update + * @buf_size: size of the buffer holding original and updated TLVs + * @old_len: Length of the Original TLV + * @new_len: Length of the Updated TLV + * @offset: offset of the updated TLV in the buff + * @mib_len: length of the returned LLDP MIB + * @cd: pointer to command details structure or NULL + * + * Update the specified TLV to the LLDP Local MIB for the given bridge type. + * Firmware will place the complete LLDP MIB in response buffer with the + * updated TLV. (0x0A03) + */ +enum ice_status +ice_aq_update_lldp_tlv(struct ice_hw *hw, u8 bridge_type, void *buf, + u16 buf_size, u16 old_len, u16 new_len, u16 offset, + u16 *mib_len, struct ice_sq_cd *cd) +{ + struct ice_aqc_lldp_update_tlv *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.lldp_update_tlv; + + if (offset == 0 || old_len == 0 || new_len == 0) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_update_tlv); + + desc.flags |= cpu_to_le16((u16)(ICE_AQ_FLAG_RD)); + + cmd->type = ((bridge_type << ICE_AQ_LLDP_BRID_TYPE_S) & + ICE_AQ_LLDP_BRID_TYPE_M); + cmd->old_len = cpu_to_le16(old_len); + cmd->new_offset = cpu_to_le16(offset); + cmd->new_len = cpu_to_le16(new_len); + + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + if (!status && mib_len) + *mib_len = le16_to_cpu(desc.datalen); + + return status; +} + /** * ice_aq_stop_lldp * @hw: pointer to the HW struct @@ -134,39 +236,6 @@ ice_aq_start_lldp(struct ice_hw *hw, bool persist, struct ice_sq_cd *cd) return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } -/** - * ice_aq_set_lldp_mib - Set the LLDP MIB - * @hw: pointer to the HW struct - * @mib_type: Local, Remote or both Local and Remote MIBs - * @buf: pointer to the caller-supplied buffer to store the MIB block - * @buf_size: size of the buffer (in bytes) - * @cd: pointer to command details structure or NULL - * - * Set the LLDP MIB. 
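A usage sketch for the helpers just added, assuming an lldpmib buffer of ICE_LLDPDU_SIZE bytes and a tlv_len the caller computed from the TLV header; variable names follow the conventions used later in this file:

	/* delete the TLV held in lldpmib from the local MIB */
	status = ice_aq_add_delete_lldp_tlv(hw,
					    ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID,
					    false, lldpmib, ICE_LLDPDU_SIZE,
					    tlv_len, &mib_len, NULL);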
(0x0A08) - */ -static enum ice_status -ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size, - struct ice_sq_cd *cd) -{ - struct ice_aqc_lldp_set_local_mib *cmd; - struct ice_aq_desc desc; - - cmd = &desc.params.lldp_set_mib; - - if (buf_size == 0 || !buf) - return ICE_ERR_PARAM; - - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_set_local_mib); - - desc.flags |= cpu_to_le16((u16)ICE_AQ_FLAG_RD); - desc.datalen = cpu_to_le16(buf_size); - - cmd->type = mib_type; - cmd->length = cpu_to_le16(buf_size); - - return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); -} - /** * ice_get_dcbx_status * @hw: pointer to the HW struct @@ -705,6 +774,42 @@ ice_aq_get_dcb_cfg(struct ice_hw *hw, u8 mib_type, u8 bridgetype, return ret; } +/** + * ice_aq_dcb_ignore_pfc - Ignore PFC for given TCs + * @hw: pointer to the HW struct + * @tcmap: TC map for request/release any ignore PFC condition + * @request: request (true) or release (false) ignore PFC condition + * @tcmap_ret: return TCs for which PFC is currently ignored + * @cd: pointer to command details structure or NULL + * + * This sends out request/release to ignore PFC condition for a TC. + * It will return the TCs for which PFC is currently ignored. (0x0301) + */ +enum ice_status +ice_aq_dcb_ignore_pfc(struct ice_hw *hw, u8 tcmap, bool request, u8 *tcmap_ret, + struct ice_sq_cd *cd) +{ + struct ice_aqc_pfc_ignore *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.pfc_ignore; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_pfc_ignore); + + if (request) + cmd->cmd_flags = ICE_AQC_PFC_IGNORE_SET; + + cmd->tc_bitmap = tcmap; + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + + if (!status && tcmap_ret) + *tcmap_ret = cmd->tc_bitmap; + + return status; +} + /** * ice_aq_start_stop_dcbx - Start/Stop DCBX service in FW * @hw: pointer to the HW struct @@ -768,25 +873,126 @@ ice_aq_get_cee_dcb_cfg(struct ice_hw *hw, return ice_aq_send_cmd(hw, &desc, (void *)buff, sizeof(*buff), cd); } +/** + * ice_aq_query_pfc_mode - Query PFC mode + * @hw: pointer to the HW struct + * @pfcmode_ret: Return PFC mode + * @cd: pointer to command details structure or NULL + * + * This will return an indication if DSCP-based PFC or VLAN-based PFC + * is enabled. 
(0x0302)
+ */
+enum ice_status
+ice_aq_query_pfc_mode(struct ice_hw *hw, u8 *pfcmode_ret, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_set_query_pfc_mode *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+
+	cmd = &desc.params.set_query_pfc_mode;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_query_pfc_mode);
+
+	status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+
+	if (!status)
+		*pfcmode_ret = cmd->pfc_mode;
+
+	return status;
+}
+
+/**
+ * ice_aq_set_pfc_mode - Set PFC mode
+ * @hw: pointer to the HW struct
+ * @pfc_mode: value of PFC mode to set
+ * @cd: pointer to command details structure or NULL
+ *
+ * This AQ call configures the PFC mode to DSCP-based PFC mode or
+ * VLAN-based PFC (0x0303)
+ */
+enum ice_status
+ice_aq_set_pfc_mode(struct ice_hw *hw, u8 pfc_mode, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_set_query_pfc_mode *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+
+	if (pfc_mode > ICE_AQC_PFC_DSCP_BASED_PFC)
+		return ICE_ERR_PARAM;
+
+	cmd = &desc.params.set_query_pfc_mode;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_pfc_mode);
+
+	cmd->pfc_mode = pfc_mode;
+
+	status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+	if (status)
+		return status;
+
+	/* The spec isn't clear about whether the FW will return an error code
+	 * if the PFC mode requested by the driver was not set. The spec just
+	 * says that the FW will write the PFC mode set back into cmd->pfc_mode,
+	 * so after the AQ has been executed, check if cmd->pfc_mode is what was
+	 * requested.
+	 */
+	if (cmd->pfc_mode != pfc_mode)
+		return ICE_ERR_NOT_SUPPORTED;
+
+	return 0;
+}
+
+/**
+ * ice_aq_set_dcb_parameters - Set DCB parameters
+ * @hw: pointer to the HW struct
+ * @dcb_enable: True if DCB configuration needs to be applied
+ * @cd: pointer to command details structure or NULL
+ *
+ * This AQ command tells the FW whether or not to apply the default DCB
+ * configuration when the link comes up (0x0306).
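Together the two helpers support a set-then-verify flow; ice_aq_set_pfc_mode() already checks the mode echoed back in the descriptor, so the follow-up query in this sketch is a cold-read confirmation rather than a requirement:

	status = ice_aq_set_pfc_mode(hw, ICE_AQC_PFC_DSCP_BASED_PFC, NULL);
	if (!status)
		status = ice_aq_query_pfc_mode(hw, &pfc_mode, NULL);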
+ */ +enum ice_status +ice_aq_set_dcb_parameters(struct ice_hw *hw, bool dcb_enable, + struct ice_sq_cd *cd) +{ + struct ice_aqc_set_dcb_params *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.set_dcb_params; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_dcb_params); + + cmd->valid_flags = ICE_AQC_LINK_UP_DCB_CFG_VALID; + if (dcb_enable) + cmd->cmd_flags = ICE_AQC_LINK_UP_DCB_CFG; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + /** * ice_cee_to_dcb_cfg * @cee_cfg: pointer to CEE configuration struct - * @dcbcfg: DCB configuration struct + * @pi: port information structure * * Convert CEE configuration from firmware to DCB configuration */ static void ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, - struct ice_dcbx_cfg *dcbcfg) + struct ice_port_info *pi) { u32 status, tlv_status = le32_to_cpu(cee_cfg->tlv_status); u32 ice_aqc_cee_status_mask, ice_aqc_cee_status_shift; + u8 i, j, err, sync, oper, app_index, ice_app_sel_type; u16 app_prio = le16_to_cpu(cee_cfg->oper_app_prio); - u8 i, err, sync, oper, app_index, ice_app_sel_type; u16 ice_aqc_cee_app_mask, ice_aqc_cee_app_shift; + struct ice_dcbx_cfg *cmp_dcbcfg, *dcbcfg; u16 ice_app_prot_id_type; - /* CEE PG data to ETS config */ + dcbcfg = &pi->qos_cfg.local_dcbx_cfg; + dcbcfg->dcbx_mode = ICE_DCBX_MODE_CEE; + dcbcfg->tlv_status = tlv_status; + + /* CEE PG data */ dcbcfg->etscfg.maxtcs = cee_cfg->oper_num_tc; /* Note that the FW creates the oper_prio_tc nibbles reversed @@ -813,10 +1019,16 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, } } - /* CEE PFC data to ETS config */ + /* CEE PFC data */ dcbcfg->pfc.pfcena = cee_cfg->oper_pfc_en; dcbcfg->pfc.pfccap = ICE_MAX_TRAFFIC_CLASS; + /* CEE APP TLV data */ + if (dcbcfg->app_mode == ICE_DCBX_APPS_NON_WILLING) + cmp_dcbcfg = &pi->qos_cfg.desired_dcbx_cfg; + else + cmp_dcbcfg = &pi->qos_cfg.remote_dcbx_cfg; + app_index = 0; for (i = 0; i < 3; i++) { if (i == 0) { @@ -826,7 +1038,7 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, ice_aqc_cee_app_mask = ICE_AQC_CEE_APP_FCOE_M; ice_aqc_cee_app_shift = ICE_AQC_CEE_APP_FCOE_S; ice_app_sel_type = ICE_APP_SEL_ETHTYPE; - ice_app_prot_id_type = ICE_APP_PROT_ID_FCOE; + ice_app_prot_id_type = ETH_P_FCOE; } else if (i == 1) { /* iSCSI APP */ ice_aqc_cee_status_mask = ICE_AQC_CEE_ISCSI_STATUS_M; @@ -834,7 +1046,19 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, ice_aqc_cee_app_mask = ICE_AQC_CEE_APP_ISCSI_M; ice_aqc_cee_app_shift = ICE_AQC_CEE_APP_ISCSI_S; ice_app_sel_type = ICE_APP_SEL_TCPIP; - ice_app_prot_id_type = ICE_APP_PROT_ID_ISCSI; + ice_app_prot_id_type = ISCSI_LISTEN_PORT; + + for (j = 0; j < cmp_dcbcfg->numapps; j++) { + u16 prot_id = cmp_dcbcfg->app[j].prot_id; + u8 sel = cmp_dcbcfg->app[j].selector; + + if (sel == ICE_APP_SEL_TCPIP && + (prot_id == ISCSI_LISTEN_PORT || + prot_id == ICE_APP_PROT_ID_ISCSI_860)) { + ice_app_prot_id_type = prot_id; + break; + } + } } else { /* FIP APP */ ice_aqc_cee_status_mask = ICE_AQC_CEE_FIP_STATUS_M; @@ -842,7 +1066,7 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, ice_aqc_cee_app_mask = ICE_AQC_CEE_APP_FIP_M; ice_aqc_cee_app_shift = ICE_AQC_CEE_APP_FIP_S; ice_app_sel_type = ICE_APP_SEL_ETHTYPE; - ice_app_prot_id_type = ICE_APP_PROT_ID_FIP; + ice_app_prot_id_type = ETH_P_FIP; } status = (tlv_status & ice_aqc_cee_status_mask) >> @@ -867,7 +1091,7 @@ ice_cee_to_dcb_cfg(struct ice_aqc_get_cee_dcb_cfg_resp *cee_cfg, } /** - * ice_get_ieee_dcb_cfg + * ice_get_ieee_or_cee_dcb_cfg * @pi: port 
information structure * @dcbx_mode: mode of DCBX (IEEE or CEE) * @@ -883,9 +1107,9 @@ ice_get_ieee_or_cee_dcb_cfg(struct ice_port_info *pi, u8 dcbx_mode) return ICE_ERR_PARAM; if (dcbx_mode == ICE_DCBX_MODE_IEEE) - dcbx_cfg = &pi->local_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; else if (dcbx_mode == ICE_DCBX_MODE_CEE) - dcbx_cfg = &pi->desired_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.desired_dcbx_cfg; /* Get Local DCB Config in case of ICE_DCBX_MODE_IEEE * or get CEE DCB Desired Config in case of ICE_DCBX_MODE_CEE @@ -896,7 +1120,7 @@ ice_get_ieee_or_cee_dcb_cfg(struct ice_port_info *pi, u8 dcbx_mode) goto out; /* Get Remote DCB Config */ - dcbx_cfg = &pi->remote_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.remote_dcbx_cfg; ret = ice_aq_get_dcb_cfg(pi->hw, ICE_AQ_LLDP_MIB_REMOTE, ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID, dcbx_cfg); /* Don't treat ENOENT as an error for Remote MIBs */ @@ -925,14 +1149,11 @@ enum ice_status ice_get_dcb_cfg(struct ice_port_info *pi) ret = ice_aq_get_cee_dcb_cfg(pi->hw, &cee_cfg, NULL); if (!ret) { /* CEE mode */ - dcbx_cfg = &pi->local_dcbx_cfg; - dcbx_cfg->dcbx_mode = ICE_DCBX_MODE_CEE; - dcbx_cfg->tlv_status = le32_to_cpu(cee_cfg.tlv_status); - ice_cee_to_dcb_cfg(&cee_cfg, dcbx_cfg); ret = ice_get_ieee_or_cee_dcb_cfg(pi, ICE_DCBX_MODE_CEE); + ice_cee_to_dcb_cfg(&cee_cfg, pi); } else if (pi->hw->adminq.sq_last_status == ICE_AQ_RC_ENOENT) { /* CEE mode not enabled try querying IEEE data */ - dcbx_cfg = &pi->local_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; dcbx_cfg->dcbx_mode = ICE_DCBX_MODE_IEEE; ret = ice_get_ieee_or_cee_dcb_cfg(pi, ICE_DCBX_MODE_IEEE); } @@ -949,34 +1170,34 @@ enum ice_status ice_get_dcb_cfg(struct ice_port_info *pi) */ enum ice_status ice_init_dcb(struct ice_hw *hw, bool enable_mib_change) { - struct ice_port_info *pi = hw->port_info; + struct ice_qos_cfg *qos_cfg = &hw->port_info->qos_cfg; enum ice_status ret = 0; if (!hw->func_caps.common_cap.dcb) return ICE_ERR_NOT_SUPPORTED; - pi->is_sw_lldp = true; + qos_cfg->is_sw_lldp = true; /* Get DCBX status */ - pi->dcbx_status = ice_get_dcbx_status(hw); + qos_cfg->dcbx_status = ice_get_dcbx_status(hw); - if (pi->dcbx_status == ICE_DCBX_STATUS_DONE || - pi->dcbx_status == ICE_DCBX_STATUS_IN_PROGRESS || - pi->dcbx_status == ICE_DCBX_STATUS_NOT_STARTED) { + if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DONE || + qos_cfg->dcbx_status == ICE_DCBX_STATUS_IN_PROGRESS || + qos_cfg->dcbx_status == ICE_DCBX_STATUS_NOT_STARTED) { /* Get current DCBX configuration */ - ret = ice_get_dcb_cfg(pi); - pi->is_sw_lldp = (hw->adminq.sq_last_status == ICE_AQ_RC_EPERM); + ret = ice_get_dcb_cfg(hw->port_info); if (ret) return ret; - } else if (pi->dcbx_status == ICE_DCBX_STATUS_DIS) { + qos_cfg->is_sw_lldp = false; + } else if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DIS) { return ICE_ERR_NOT_READY; } /* Configure the LLDP MIB change event */ if (enable_mib_change) { ret = ice_aq_cfg_lldp_mib_change(hw, true, NULL); - if (!ret) - pi->is_sw_lldp = false; + if (ret) + qos_cfg->is_sw_lldp = true; } return ret; @@ -991,21 +1212,21 @@ enum ice_status ice_init_dcb(struct ice_hw *hw, bool enable_mib_change) */ enum ice_status ice_cfg_lldp_mib_change(struct ice_hw *hw, bool ena_mib) { - struct ice_port_info *pi = hw->port_info; + struct ice_qos_cfg *qos_cfg = &hw->port_info->qos_cfg; enum ice_status ret; if (!hw->func_caps.common_cap.dcb) return ICE_ERR_NOT_SUPPORTED; /* Get DCBX status */ - pi->dcbx_status = ice_get_dcbx_status(hw); + qos_cfg->dcbx_status = ice_get_dcbx_status(hw); - if (pi->dcbx_status == 
ICE_DCBX_STATUS_DIS)
+	if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DIS)
		return ICE_ERR_NOT_READY;

	ret = ice_aq_cfg_lldp_mib_change(hw, ena_mib, NULL);
	if (!ret)
-		pi->is_sw_lldp = !ena_mib;
+		qos_cfg->is_sw_lldp = !ena_mib;

	return ret;
}
@@ -1220,7 +1441,140 @@ ice_add_ieee_app_pri_tlv(struct ice_lldp_org_tlv *tlv,
}

/**
- * ice_add_dcb_tlv - Add all IEEE TLVs
+ * ice_add_dscp_up_tlv - Prepare DSCP to UP TLV
+ * @tlv: location to build the TLV data
+ * @dcbcfg: location of data to convert to TLV
+ */
+static void
+ice_add_dscp_up_tlv(struct ice_lldp_org_tlv *tlv, struct ice_dcbx_cfg *dcbcfg)
+{
+	u8 *buf = tlv->tlvinfo;
+	u32 ouisubtype;
+	u16 typelen;
+	int i;
+
+	typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) |
+		   ICE_DSCP_UP_TLV_LEN);
+	tlv->typelen = htons(typelen);
+
+	ouisubtype = (u32)((ICE_DSCP_OUI << ICE_LLDP_TLV_OUI_S) |
+			   ICE_DSCP_SUBTYPE_DSCP2UP);
+	tlv->ouisubtype = htonl(ouisubtype);
+
+	/* bytes 0 - 63 - IPv4 DSCP2UP LUT */
+	for (i = 0; i < ICE_DSCP_NUM_VAL; i++) {
+		/* IPv4 mapping */
+		buf[i] = dcbcfg->dscp_map[i];
+		/* IPv6 mapping */
+		buf[i + ICE_DSCP_IPV6_OFFSET] = dcbcfg->dscp_map[i];
+	}
+
+	/* byte 64 - IPv4 untagged traffic */
+	buf[i] = 0;
+
+	/* byte 144 - IPv6 untagged traffic */
+	buf[i + ICE_DSCP_IPV6_OFFSET] = 0;
+}
+
+#define ICE_BYTES_PER_TC	8
+/**
+ * ice_add_dscp_enf_tlv - Prepare DSCP Enforcement TLV
+ * @tlv: location to build the TLV data
+ */
+static void
+ice_add_dscp_enf_tlv(struct ice_lldp_org_tlv *tlv)
+{
+	u8 *buf = tlv->tlvinfo;
+	u32 ouisubtype;
+	u16 typelen;
+
+	typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) |
+		   ICE_DSCP_ENF_TLV_LEN);
+	tlv->typelen = htons(typelen);
+
+	ouisubtype = (u32)((ICE_DSCP_OUI << ICE_LLDP_TLV_OUI_S) |
+			   ICE_DSCP_SUBTYPE_ENFORCE);
+	tlv->ouisubtype = htonl(ouisubtype);
+
+	/* Allow all DSCP values to be valid for all TCs (IPv4 and IPv6) */
+	memset(buf, 0, 2 * (ICE_MAX_TRAFFIC_CLASS * ICE_BYTES_PER_TC));
+}
+
+/**
+ * ice_add_dscp_tc_bw_tlv - Prepare DSCP BW for TC TLV
+ * @tlv: location to build the TLV data
+ * @dcbcfg: location of the data to convert to TLV
+ */
+static void
+ice_add_dscp_tc_bw_tlv(struct ice_lldp_org_tlv *tlv,
+		       struct ice_dcbx_cfg *dcbcfg)
+{
+	struct ice_dcb_ets_cfg *etscfg;
+	u8 *buf = tlv->tlvinfo;
+	u32 ouisubtype;
+	u8 offset = 0;
+	u16 typelen;
+	int i;
+
+	typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) |
+		   ICE_DSCP_TC_BW_TLV_LEN);
+	tlv->typelen = htons(typelen);
+
+	ouisubtype = (u32)((ICE_DSCP_OUI << ICE_LLDP_TLV_OUI_S) |
+			   ICE_DSCP_SUBTYPE_TCBW);
+	tlv->ouisubtype = htonl(ouisubtype);
+
+	/* First Octet after subtype
+	 * ----------------------------
+	 * | RSV | CBS | RSV | Max TCs |
+	 * | 1b  | 1b  | 3b  | 3b      |
+	 * ----------------------------
+	 */
+	etscfg = &dcbcfg->etscfg;
+	buf[0] = etscfg->maxtcs & ICE_IEEE_ETS_MAXTC_M;
+
+	/* bytes 1 - 4 reserved */
+	offset = 5;
+
+	/* TC BW table
+	 * bytes 0 - 7 for TC 0 - 7
+	 *
+	 * TSA Assignment table
+	 * bytes 8 - 15 for TC 0 - 7
+	 */
+	for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) {
+		buf[offset] = etscfg->tcbwtable[i];
+		buf[offset + ICE_MAX_TRAFFIC_CLASS] = etscfg->tsatable[i];
+		offset++;
+	}
+}
+
+/**
+ * ice_add_dscp_pfc_tlv - Prepare DSCP PFC TLV
+ * @tlv: location to build the TLV data (IEEE PFC TLV layout)
+ * @dcbcfg: Local store which holds the PFC CFG data
+ */
+static void
+ice_add_dscp_pfc_tlv(struct ice_lldp_org_tlv *tlv, struct ice_dcbx_cfg *dcbcfg)
+{
+	u8 *buf = tlv->tlvinfo;
+	u32 ouisubtype;
+	u16 typelen;
+
+	typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) |
+		   ICE_DSCP_PFC_TLV_LEN);
+	tlv->typelen = htons(typelen);
+
+
ouisubtype = (u32)((ICE_DSCP_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_DSCP_SUBTYPE_PFC); + tlv->ouisubtype = htonl(ouisubtype); + + buf[0] = dcbcfg->pfc.pfccap & 0xF; + buf[1] = dcbcfg->pfc.pfcena & 0xF; +} + +/** + * ice_add_dcb_tlv - Add all IEEE or DSCP TLVs * @tlv: Fill TLV data in IEEE format * @dcbcfg: Local store which holds the DCB Config * @tlvid: Type of IEEE TLV @@ -1231,21 +1585,41 @@ static void ice_add_dcb_tlv(struct ice_lldp_org_tlv *tlv, struct ice_dcbx_cfg *dcbcfg, u16 tlvid) { - switch (tlvid) { - case ICE_IEEE_TLV_ID_ETS_CFG: - ice_add_ieee_ets_tlv(tlv, dcbcfg); - break; - case ICE_IEEE_TLV_ID_ETS_REC: - ice_add_ieee_etsrec_tlv(tlv, dcbcfg); - break; - case ICE_IEEE_TLV_ID_PFC_CFG: - ice_add_ieee_pfc_tlv(tlv, dcbcfg); - break; - case ICE_IEEE_TLV_ID_APP_PRI: - ice_add_ieee_app_pri_tlv(tlv, dcbcfg); - break; - default: - break; + if (dcbcfg->pfc_mode == ICE_QOS_MODE_VLAN) { + switch (tlvid) { + case ICE_IEEE_TLV_ID_ETS_CFG: + ice_add_ieee_ets_tlv(tlv, dcbcfg); + break; + case ICE_IEEE_TLV_ID_ETS_REC: + ice_add_ieee_etsrec_tlv(tlv, dcbcfg); + break; + case ICE_IEEE_TLV_ID_PFC_CFG: + ice_add_ieee_pfc_tlv(tlv, dcbcfg); + break; + case ICE_IEEE_TLV_ID_APP_PRI: + ice_add_ieee_app_pri_tlv(tlv, dcbcfg); + break; + default: + break; + } + } else { + /* pfc_mode == ICE_QOS_MODE_DSCP */ + switch (tlvid) { + case ICE_TLV_ID_DSCP_UP: + ice_add_dscp_up_tlv(tlv, dcbcfg); + break; + case ICE_TLV_ID_DSCP_ENF: + ice_add_dscp_enf_tlv(tlv); + break; + case ICE_TLV_ID_DSCP_TC_BW: + ice_add_dscp_tc_bw_tlv(tlv, dcbcfg); + break; + case ICE_TLV_ID_DSCP_TO_PFC: + ice_add_dscp_pfc_tlv(tlv, dcbcfg); + break; + default: + break; + } } } @@ -1303,7 +1677,7 @@ enum ice_status ice_set_dcb_cfg(struct ice_port_info *pi) hw = pi->hw; /* update the HW local config */ - dcbcfg = &pi->local_dcbx_cfg; + dcbcfg = &pi->qos_cfg.local_dcbx_cfg; /* Allocate the LLDPDU */ lldpmib = devm_kzalloc(ice_hw_to_dev(hw), ICE_LLDPDU_SIZE, GFP_KERNEL); if (!lldpmib) @@ -1323,13 +1697,13 @@ enum ice_status ice_set_dcb_cfg(struct ice_port_info *pi) } /** - * ice_aq_query_port_ets - query port ets configuration + * ice_aq_query_port_ets - query port ETS configuration * @pi: port information structure * @buf: pointer to buffer * @buf_size: buffer size in bytes * @cd: pointer to command details structure or NULL * - * query current port ets configuration + * query current port ETS configuration */ static enum ice_status ice_aq_query_port_ets(struct ice_port_info *pi, @@ -1362,7 +1736,7 @@ ice_update_port_tc_tree_cfg(struct ice_port_info *pi, struct ice_aqc_port_ets_elem *buf) { struct ice_sched_node *node, *tc_node; - struct ice_aqc_get_elem elem; + struct ice_aqc_txsched_elem_data elem; enum ice_status status = 0; u32 teid1, teid2; u8 i, j; @@ -1404,7 +1778,7 @@ ice_update_port_tc_tree_cfg(struct ice_port_info *pi, /* new TC */ status = ice_sched_query_elem(pi->hw, teid2, &elem); if (!status) - status = ice_sched_add_node(pi, 1, &elem.generic[0]); + status = ice_sched_add_node(pi, 1, &elem); if (status) break; /* update the TC number */ @@ -1416,13 +1790,13 @@ ice_update_port_tc_tree_cfg(struct ice_port_info *pi, } /** - * ice_query_port_ets - query port ets configuration + * ice_query_port_ets - query port ETS configuration * @pi: port information structure * @buf: pointer to buffer * @buf_size: buffer size in bytes * @cd: pointer to command details structure or NULL * - * query current port ets configuration and update the + * query current port ETS configuration and update the * SW DB with the TC changes */ enum ice_status diff 
--git a/drivers/net/ethernet/intel/ice/ice_dcb.h b/drivers/net/ethernet/intel/ice/ice_dcb.h index ee138f9bdc7cd0af22debb35cfafdc09b6ad96d6..370f673c5746f1c315039a12a3d2b0c27aef72f9 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb.h @@ -1,14 +1,16 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_DCB_H_ #define _ICE_DCB_H_ #include "ice_type.h" +#include "ice_common.h" #define ICE_DCBX_STATUS_NOT_STARTED 0 #define ICE_DCBX_STATUS_IN_PROGRESS 1 #define ICE_DCBX_STATUS_DONE 2 +#define ICE_DCBX_STATUS_MULTIPLE_PEERS 3 #define ICE_DCBX_STATUS_DIS 7 #define ICE_TLV_TYPE_END 0 @@ -22,9 +24,19 @@ #define ICE_CEE_DCBX_OUI 0x001B21 #define ICE_CEE_DCBX_TYPE 2 + +#define ICE_DSCP_OUI 0xFFFFFF +#define ICE_DSCP_SUBTYPE_DSCP2UP 0x41 +#define ICE_DSCP_SUBTYPE_ENFORCE 0x42 +#define ICE_DSCP_SUBTYPE_TCBW 0x43 +#define ICE_DSCP_SUBTYPE_PFC 0x44 +#define ICE_DSCP_IPV6_OFFSET 80 + +#define ICE_CEE_SUBTYPE_CTRL 1 #define ICE_CEE_SUBTYPE_PG_CFG 2 #define ICE_CEE_SUBTYPE_PFC_CFG 3 #define ICE_CEE_SUBTYPE_APP_PRI 4 + #define ICE_CEE_MAX_FEAT_TYPE 3 /* Defines for LLDP TLV header */ #define ICE_LLDP_TLV_LEN_S 0 @@ -72,22 +84,34 @@ #define ICE_IEEE_APP_PRIO_M (0x7 << ICE_IEEE_APP_PRIO_S) /* TLV definitions for preparing MIB */ +#define ICE_TLV_ID_CHASSIS_ID 0 +#define ICE_TLV_ID_PORT_ID 1 +#define ICE_TLV_ID_TIME_TO_LIVE 2 #define ICE_IEEE_TLV_ID_ETS_CFG 3 #define ICE_IEEE_TLV_ID_ETS_REC 4 #define ICE_IEEE_TLV_ID_PFC_CFG 5 #define ICE_IEEE_TLV_ID_APP_PRI 6 #define ICE_TLV_ID_END_OF_LLDPPDU 7 #define ICE_TLV_ID_START ICE_IEEE_TLV_ID_ETS_CFG +#define ICE_TLV_ID_DSCP_UP 3 +#define ICE_TLV_ID_DSCP_ENF 4 +#define ICE_TLV_ID_DSCP_TC_BW 5 +#define ICE_TLV_ID_DSCP_TO_PFC 6 #define ICE_IEEE_ETS_TLV_LEN 25 #define ICE_IEEE_PFC_TLV_LEN 6 #define ICE_IEEE_APP_TLV_LEN 11 +#define ICE_DSCP_UP_TLV_LEN 148 +#define ICE_DSCP_ENF_TLV_LEN 132 +#define ICE_DSCP_TC_BW_TLV_LEN 25 +#define ICE_DSCP_PFC_TLV_LEN 6 + /* IEEE 802.1AB LLDP Organization specific TLV */ struct ice_lldp_org_tlv { __be16 typelen; __be32 ouisubtype; - u8 tlvinfo[1]; + u8 tlvinfo[]; } __packed; struct ice_cee_tlv_hdr { @@ -109,7 +133,7 @@ struct ice_cee_feat_tlv { #define ICE_CEE_FEAT_TLV_WILLING_M 0x40 #define ICE_CEE_FEAT_TLV_ERR_M 0x20 u8 subtype; - u8 tlvinfo[1]; + u8 tlvinfo[]; }; struct ice_cee_app_prio { @@ -120,6 +144,29 @@ struct ice_cee_app_prio { u8 prio_map; } __packed; + +enum ice_status +ice_aq_get_lldp_mib(struct ice_hw *hw, u8 bridge_type, u8 mib_type, void *buf, + u16 buf_size, u16 *local_len, u16 *remote_len, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_add_delete_lldp_tlv(struct ice_hw *hw, u8 bridge_type, bool add_lldp_tlv, + void *buf, u16 buf_size, u16 tlv_len, u16 *mib_len, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_update_lldp_tlv(struct ice_hw *hw, u8 bridge_type, void *buf, + u16 buf_size, u16 old_len, u16 new_len, u16 offset, + u16 *mib_len, struct ice_sq_cd *cd); +enum ice_status +ice_aq_dcb_ignore_pfc(struct ice_hw *hw, u8 tcmap, bool request, u8 *tcmap_ret, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_query_pfc_mode(struct ice_hw *hw, u8 *pfcmode_ret, struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_dcb_parameters(struct ice_hw *hw, bool dcb_enable, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_set_pfc_mode(struct ice_hw *hw, u8 pfc_mode, struct ice_sq_cd *cd); enum ice_status ice_aq_get_dcb_cfg(struct ice_hw *hw, u8 mib_type, u8 bridgetype, struct ice_dcbx_cfg 
*dcbcfg); diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index dd47869c4ad497d808bdc415d6d2f4a89cba6caa..8475e0a52925840da8687f0e7769698c3afc2933 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -1,71 +1,92 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_dcb_lib.h" +#include "ice_dcb_nl.h" /** - * ice_vsi_cfg_netdev_tc - Setup the netdev TC configuration - * @vsi: the VSI being configured - * @ena_tc: TC map to be enabled + * ice_is_pfc_causing_hung_q + * @pf: pointer to PF structure + * @txqueue: Tx queue which is supposedly hung queue + * + * find if PFC is causing the hung queue, if yes return true else false */ -void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc) +bool ice_is_pfc_causing_hung_q(struct ice_pf *pf, unsigned int txqueue) { - struct net_device *netdev = vsi->netdev; - struct ice_pf *pf = vsi->back; - struct ice_dcbx_cfg *dcbcfg; - u8 netdev_tc; - int i; - - if (!netdev) - return; + u8 num_tcs = 0, i, tc, up_mapped_tc, up_in_tc = 0; + u64 ref_prio_xoff[ICE_MAX_UP]; + struct ice_vsi *vsi; + u32 up2tc; - if (!ena_tc) { - netdev_reset_tc(netdev); - return; - } - - if (netdev_set_num_tc(netdev, vsi->tc_cfg.numtc)) - return; - - dcbcfg = &pf->hw.port_info->local_dcbx_cfg; + vsi = ice_get_main_vsi(pf); + if (!vsi) + return false; ice_for_each_traffic_class(i) if (vsi->tc_cfg.ena_tc & BIT(i)) - netdev_set_tc_queue(netdev, - vsi->tc_cfg.tc_info[i].netdev_tc, - vsi->tc_cfg.tc_info[i].qcount_tx, - vsi->tc_cfg.tc_info[i].qoffset); + num_tcs++; - for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { - u8 ets_tc = dcbcfg->etscfg.prio_table[i]; + /* first find out the TC to which the hung queue belongs to */ + for (tc = 0; tc < num_tcs - 1; tc++) + if (ice_find_q_in_range(vsi->tc_cfg.tc_info[tc].qoffset, + vsi->tc_cfg.tc_info[tc + 1].qoffset, + txqueue)) + break; - /* Get the mapped netdev TC# for the UP */ - netdev_tc = vsi->tc_cfg.tc_info[ets_tc].netdev_tc; - netdev_set_prio_tc_map(netdev, i, netdev_tc); + /* Build a bit map of all UPs associated to the suspect hung queue TC, + * so that we check for its counter increment. + */ + up2tc = rd32(&pf->hw, PRTDCB_TUP2TC); + for (i = 0; i < ICE_MAX_UP; i++) { + up_mapped_tc = (up2tc >> (i * 3)) & 0x7; + if (up_mapped_tc == tc) + up_in_tc |= BIT(i); } + + /* Now that we figured out that hung queue is PFC enabled, still the + * Tx timeout can be legitimate. So to make sure Tx timeout is + * absolutely caused by PFC storm, check if the counters are + * incrementing. 
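The loop above recovers the UP-to-TC mapping from PRTDCB_TUP2TC, which packs one 3-bit TC number per user priority into a single 32-bit register. A standalone sketch of the same decode, using a made-up register value rather than a hardware read:

#include <stdint.h>
#include <stdio.h>

#define ICE_MAX_UP 8

int main(void)
{
	/* example encoding: UP0/1 -> TC0, UP2/3 -> TC1, UP4/5 -> TC2,
	 * UP6/7 -> TC3; 0x006D2240 is that mapping at 3 bits per UP
	 */
	uint32_t up2tc = 0x006D2240;
	uint8_t up_in_tc = 0;
	int tc = 1;	/* TC that owns the suspect hung queue */
	int i;

	for (i = 0; i < ICE_MAX_UP; i++) {
		int up_mapped_tc = (up2tc >> (i * 3)) & 0x7;

		printf("UP%d -> TC%d\n", i, up_mapped_tc);
		if (up_mapped_tc == tc)
			up_in_tc |= 1 << i;
	}
	printf("UPs feeding TC%d: 0x%02x\n", tc, up_in_tc);	/* 0x0c */
	return 0;
}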
+ */ + for (i = 0; i < ICE_MAX_UP; i++) + if (up_in_tc & BIT(i)) + ref_prio_xoff[i] = pf->stats.priority_xoff_rx[i]; + + ice_update_dcb_stats(pf); + + for (i = 0; i < ICE_MAX_UP; i++) + if (up_in_tc & BIT(i)) + if (pf->stats.priority_xoff_rx[i] > ref_prio_xoff[i]) + return true; + + return false; } /** - * ice_dcb_get_ena_tc - return bitmap of enabled TCs - * @dcbcfg: DCB config to evaluate for enabled TCs + * ice_dcb_get_mode - gets the DCB mode + * @port_info: pointer to port info structure + * @host: if set it's HOST if not it's MANAGED */ -u8 ice_dcb_get_ena_tc(struct ice_dcbx_cfg *dcbcfg) +static u8 ice_dcb_get_mode(struct ice_port_info *port_info, bool host) { - u8 i, num_tc, ena_tc = 1; - - num_tc = ice_dcb_get_num_tc(dcbcfg); + u8 mode; - for (i = 0; i < num_tc; i++) - ena_tc |= BIT(i); + if (host) + mode = DCB_CAP_DCBX_HOST; + else + mode = DCB_CAP_DCBX_LLD_MANAGED; - return ena_tc; + if (port_info->qos_cfg.local_dcbx_cfg.dcbx_mode & ICE_DCBX_MODE_CEE) + return mode | DCB_CAP_DCBX_VER_CEE; + else + return mode | DCB_CAP_DCBX_VER_IEEE; } /** * ice_dcb_get_num_tc - Get the number of TCs from DCBX config * @dcbcfg: config to retrieve number of TCs from */ -u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg *dcbcfg) +static u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg *dcbcfg) { bool tc_unused = false; u8 num_tc = 0; @@ -99,26 +120,120 @@ u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg *dcbcfg) return ret; } +/** + * ice_dcb_get_ena_tc - return bitmap of enabled TCs + * @dcbcfg: DCB config to evaluate for enabled TCs + */ +static u8 ice_dcb_get_ena_tc(struct ice_dcbx_cfg *dcbcfg) +{ + u8 i, num_tc, ena_tc = 1; + + num_tc = ice_dcb_get_num_tc(dcbcfg); + + for (i = 0; i < num_tc; i++) + ena_tc |= BIT(i); + + return ena_tc; +} + +/** + * ice_get_first_droptc - returns number of first droptc + * @vsi: used to find the first droptc + * + * This function returns the value of first_droptc. + * When DCB is enabled, first droptc information is derived from enabled_tc + * and PFC enabled bits. 
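The reference-counter sequence above is what separates a genuine PFC storm from an ordinary Tx timeout: snapshot priority_xoff_rx for the UPs feeding the suspect TC, refresh the statistics, and only report a storm when a counter actually moved. A compact userspace analogue of the pattern, with update_stats() standing in for ice_update_dcb_stats():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_UP 8

static uint64_t xoff_rx[NUM_UP];	/* stands in for pf->stats.priority_xoff_rx */

/* stands in for ice_update_dcb_stats(): pretend UP3 keeps taking XOFF */
static void update_stats(void)
{
	xoff_rx[3] += 42;
}

static bool pfc_storm(uint8_t up_in_tc)
{
	uint64_t ref[NUM_UP] = { 0 };
	int i;

	/* snapshot the counters for every UP feeding the suspect TC */
	for (i = 0; i < NUM_UP; i++)
		if (up_in_tc & (1 << i))
			ref[i] = xoff_rx[i];

	update_stats();

	/* a counter that moved means the queue is being paused by PFC */
	for (i = 0; i < NUM_UP; i++)
		if ((up_in_tc & (1 << i)) && xoff_rx[i] > ref[i])
			return true;
	return false;
}

int main(void)
{
	printf("TC with UP3: storm=%d\n", pfc_storm(1 << 3));	/* 1 */
	printf("TC with UP0: storm=%d\n", pfc_storm(1 << 0));	/* 0 */
	return 0;
}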
otherwise this function returns 0 as there is one + * TC without DCB (tc0) + */ +static u8 ice_get_first_droptc(struct ice_vsi *vsi) +{ + struct ice_dcbx_cfg *cfg = &vsi->port_info->qos_cfg.local_dcbx_cfg; + struct device *dev = ice_pf_to_dev(vsi->back); + u8 num_tc, ena_tc_map, pfc_ena_map; + u8 i; + + num_tc = ice_dcb_get_num_tc(cfg); + + /* get bitmap of enabled TCs */ + ena_tc_map = ice_dcb_get_ena_tc(cfg); + + /* get bitmap of PFC enabled TCs */ + pfc_ena_map = cfg->pfc.pfcena; + + /* get first TC that is not PFC enabled */ + for (i = 0; i < num_tc; i++) { + if ((ena_tc_map & BIT(i)) && (!(pfc_ena_map & BIT(i)))) { + dev_dbg(dev, "first drop tc = %d\n", i); + return i; + } + } + + dev_dbg(dev, "first drop tc = 0\n"); + return 0; +} + +/** + * ice_vsi_set_dcb_tc_cfg - Set VSI's TC based on DCB configuration + * @vsi: pointer to the VSI instance + */ +void ice_vsi_set_dcb_tc_cfg(struct ice_vsi *vsi) +{ + struct ice_dcbx_cfg *cfg = &vsi->port_info->qos_cfg.local_dcbx_cfg; + + switch (vsi->type) { + case ICE_VSI_PF: + vsi->tc_cfg.ena_tc = ice_dcb_get_ena_tc(cfg); + vsi->tc_cfg.numtc = ice_dcb_get_num_tc(cfg); + break; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + case ICE_VSI_CHNL: +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#ifdef HAVE_NETDEV_SB_DEV + case ICE_VSI_OFFLOAD_MACVLAN: +#endif /* HAVE_NETDEV_SB_DEV */ + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + vsi->tc_cfg.ena_tc = BIT(ice_get_first_droptc(vsi)); + vsi->tc_cfg.numtc = 1; + break; + case ICE_VSI_CTRL: + case ICE_VSI_LB: + default: + vsi->tc_cfg.ena_tc = ICE_DFLT_TRAFFIC_CLASS; + vsi->tc_cfg.numtc = 1; + } +} + +#ifdef HAVE_NDO_SET_TX_MAXRATE +/** + * ice_dcb_get_tc - Get the TC associated with the queue + * @vsi: ptr to the VSI + * @queue_index: queue number associated with VSI + */ +u8 ice_dcb_get_tc(struct ice_vsi *vsi, int queue_index) +{ + return vsi->tx_rings[queue_index]->dcb_tc; +} +#endif /* HAVE_NDO_SET_TX_MAXRATE */ + /** * ice_vsi_cfg_dcb_rings - Update rings to reflect DCB TC * @vsi: VSI owner of rings being updated */ void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) { - struct ice_ring *tx_ring, *rx_ring; - u16 qoffset, qcount; + u16 qoffset; + u16 qcount; int i, n; if (!test_bit(ICE_FLAG_DCB_ENA, vsi->back->flags)) { /* Reset the TC information */ - for (i = 0; i < vsi->num_txq; i++) { - tx_ring = vsi->tx_rings[i]; - tx_ring->dcb_tc = 0; - } - for (i = 0; i < vsi->num_rxq; i++) { - rx_ring = vsi->rx_rings[i]; - rx_ring->dcb_tc = 0; - } + for (i = 0; i < vsi->num_txq; i++) + vsi->tx_rings[i]->dcb_tc = 0; + + for (i = 0; i < vsi->num_rxq; i++) + vsi->rx_rings[i]->dcb_tc = 0; + return; } @@ -127,50 +242,150 @@ void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) break; qoffset = vsi->tc_cfg.tc_info[n].qoffset; + qcount = vsi->tc_cfg.tc_info[n].qcount_tx; - for (i = qoffset; i < (qoffset + qcount); i++) { - tx_ring = vsi->tx_rings[i]; - rx_ring = vsi->rx_rings[i]; - tx_ring->dcb_tc = n; - rx_ring->dcb_tc = n; + for (i = qoffset; i < (qoffset + qcount); i++) + vsi->tx_rings[i]->dcb_tc = n; + + qcount = vsi->tc_cfg.tc_info[n].qcount_rx; + for (i = qoffset; i < (qoffset + qcount); i++) + vsi->rx_rings[i]->dcb_tc = n; + } + +#ifdef HAVE_NETDEV_SB_DEV + /* when DCB is configured TC for MACVLAN queues should be + * the first drop TC of the main VSI + */ + if (vsi->type == ICE_VSI_OFFLOAD_MACVLAN) { + u8 first_droptc = ice_get_first_droptc(vsi); + + ice_for_each_alloc_txq(vsi, i) + vsi->tx_rings[i]->dcb_tc = first_droptc; + ice_for_each_alloc_rxq(vsi, i) + vsi->rx_rings[i]->dcb_tc = first_droptc; + } +#endif /* 
HAVE_NETDEV_SB_DEV */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (vsi->type == ICE_VSI_PF) { + u8 first_droptc = ice_get_first_droptc(vsi); + + /* When DCB is configured, TC for ADQ queues (which are really + * PF queues) should be the first drop TC of the main VSI + */ + ice_for_each_chnl_tc(n) { + if (!(vsi->all_enatc & BIT(n))) + break; + + qoffset = vsi->mqprio_qopt.qopt.offset[n]; + qcount = vsi->mqprio_qopt.qopt.count[n]; + for (i = qoffset; i < (qoffset + qcount); i++) { + vsi->tx_rings[i]->dcb_tc = first_droptc; + vsi->rx_rings[i]->dcb_tc = first_droptc; + } } } +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ } /** - * ice_pf_dcb_recfg - Reconfigure all VEBs and VSIs - * @pf: pointer to the PF struct + * ice_peer_prep_tc_change - Pre-notify RDMA Peer in blocking call of TC change + * @peer_obj_int: ptr to peer device internal struct + * @data: ptr to opaque data + */ +static int +ice_peer_prep_tc_change(struct ice_peer_obj_int *peer_obj_int, + void __always_unused *data) +{ + struct ice_peer_obj *peer_obj; + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!ice_validate_peer_obj(peer_obj)) + return 0; + + if (!test_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj_int->state)) + return 0; + + if (peer_obj->peer_ops && peer_obj->peer_ops->prep_tc_change) + peer_obj->peer_ops->prep_tc_change(peer_obj); + + return 0; +} + +/** + * ice_dcb_ena_dis_vsi - disable certain VSIs for DCB config/reconfig + * @pf: pointer to the PF instance + * @ena: true to enable VSIs, false to disable + * @locked: true if caller holds RTNL lock, false otherwise * - * Assumed caller has already disabled all VSIs before - * calling this function. Reconfiguring DCB based on - * local_dcbx_cfg. + * Before a new DCB configuration can be applied, VSIs of type PF, MACVLAN, + * and CHNL need to be brought down. Following completion of DCB configuration + * the VSIs that were downed need to be brought up again. This helper function + * does both. 
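ice_get_first_droptc(), used above to pick the TC for MACVLAN and ADQ queues, reduces to a bitmap walk: the first TC that is enabled but has no PFC bit set is the drop TC, with TC0 as the fallback. A sketch of that selection, assuming the same bit-per-TC encoding:

#include <stdint.h>
#include <stdio.h>

/* first TC that is enabled but not PFC-protected; 0 when every enabled
 * TC runs PFC, mirroring the fallback in the driver
 */
static uint8_t first_droptc(uint8_t ena_tc_map, uint8_t pfc_ena_map, int num_tc)
{
	int i;

	for (i = 0; i < num_tc; i++)
		if ((ena_tc_map & (1 << i)) && !(pfc_ena_map & (1 << i)))
			return (uint8_t)i;
	return 0;
}

int main(void)
{
	/* TC0-TC3 enabled, PFC on TC0 and TC1 -> traffic drops on TC2 */
	printf("first drop TC = %u\n", first_droptc(0x0f, 0x03, 4));
	return 0;
}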
*/
-static void ice_pf_dcb_recfg(struct ice_pf *pf)
+static void ice_dcb_ena_dis_vsi(struct ice_pf *pf, bool ena, bool locked)
 {
- struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->local_dcbx_cfg;
- u8 tc_map = 0;
- int v, ret;
+ int i;
- /* Update each VSI */
- ice_for_each_vsi(pf, v) {
- if (!pf->vsi[v])
- continue;
+ ice_for_each_vsi(pf, i) {
+ struct ice_vsi *vsi = pf->vsi[i];
- if (pf->vsi[v]->type == ICE_VSI_PF)
- tc_map = ice_dcb_get_ena_tc(dcbcfg);
- else
- tc_map = ICE_DFLT_TRAFFIC_CLASS;
+ if (!vsi)
+ continue;
- ret = ice_vsi_cfg_tc(pf->vsi[v], tc_map);
- if (ret) {
- dev_err(&pf->pdev->dev,
- "Failed to config TC for VSI index: %d\n",
- pf->vsi[v]->idx);
+ switch (vsi->type) {
+ case ICE_VSI_CHNL:
+ case ICE_VSI_OFFLOAD_MACVLAN:
+ case ICE_VSI_VMDQ2:
+ case ICE_VSI_SWITCHDEV_CTRL:
+ case ICE_VSI_PF:
+ if (ena)
+ ice_ena_vsi(vsi, locked);
+ else
+ ice_dis_vsi(vsi, locked);
+ break;
+ default:
 continue;
 }
+ }
+}
+
+/**
+ * ice_dcb_bwchk - check if ETS bandwidth input parameters are correct
+ * @pf: pointer to PF struct
+ * @dcbcfg: pointer to DCB config structure
+ */
+int ice_dcb_bwchk(struct ice_pf *pf, struct ice_dcbx_cfg *dcbcfg)
+{
+ struct ice_dcb_ets_cfg *etscfg = &dcbcfg->etscfg;
+ u8 num_tc, total_bw = 0;
+ int i;
+
+ /* returns number of contiguous TCs and 1 TC for non-contiguous TCs,
+ * since at least 1 TC has to be configured
+ */
+ num_tc = ice_dcb_get_num_tc(dcbcfg);
+
+ /* no bandwidth checks required if there's only one TC, so assign
+ * all bandwidth to TC0 and return
+ */
+ if (num_tc == 1) {
+ etscfg->tcbwtable[0] = ICE_TC_MAX_BW;
+ return 0;
+ }
- ice_vsi_map_rings_to_vectors(pf->vsi[v])
+ for (i = 0; i < num_tc; i++)
+ total_bw += etscfg->tcbwtable[i];
+
+ if (!total_bw) {
+ etscfg->tcbwtable[0] = ICE_TC_MAX_BW;
+ } else if (total_bw != ICE_TC_MAX_BW) {
+ dev_err(ice_pf_to_dev(pf),
+ "Invalid config, total bandwidth must equal 100\n");
+ return -EINVAL;
 }
+
+ return 0;
 }
 /**
@@ -179,50 +394,65 @@ static void ice_pf_dcb_recfg(struct ice_pf *pf)
 * @new_cfg: DCBX config to apply
 * @locked: is the RTNL held
 */
-static int
+int
 ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked)
 {
- struct ice_dcbx_cfg *old_cfg, *curr_cfg;
 struct ice_aqc_port_ets_elem buf = { 0 };
- int ret = 0;
+ struct ice_dcbx_cfg *old_cfg, *curr_cfg;
+ struct device *dev = ice_pf_to_dev(pf);
+ int ret = ICE_DCB_NO_HW_CHG;
- curr_cfg = &pf->hw.port_info->local_dcbx_cfg;
+ curr_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
+
+ /* FW does not care if change happened */
+ if (!pf->hw.port_info->qos_cfg.is_sw_lldp)
+ ret = ICE_DCB_HW_CHG_RST;
 /* Enable DCB tagging only when more than one TC */
 if (ice_dcb_get_num_tc(new_cfg) > 1) {
- dev_dbg(&pf->pdev->dev, "DCB tagging enabled (num TC > 1)\n");
+ dev_dbg(dev, "DCB tagging enabled (num TC > 1)\n");
 set_bit(ICE_FLAG_DCB_ENA, pf->flags);
 } else {
- dev_dbg(&pf->pdev->dev, "DCB tagging disabled (num TC = 1)\n");
+ dev_dbg(dev, "DCB tagging disabled (num TC = 1)\n");
 clear_bit(ICE_FLAG_DCB_ENA, pf->flags);
 }
 if (!memcmp(new_cfg, curr_cfg, sizeof(*new_cfg))) {
- dev_dbg(&pf->pdev->dev, "No change in DCB config required\n");
+ dev_dbg(dev, "No change in DCB config required\n");
 return ret;
 }
+ if (ice_dcb_bwchk(pf, new_cfg))
+ return -EINVAL;
+
 /* Store old config in case FW config fails */
- old_cfg = devm_kzalloc(&pf->pdev->dev, sizeof(*old_cfg), GFP_KERNEL);
- memcpy(old_cfg, curr_cfg, sizeof(*old_cfg));
+ old_cfg = kmemdup(curr_cfg, sizeof(*old_cfg), GFP_KERNEL);
+ if (!old_cfg)
+ return -ENOMEM;
+
+ dev_info(dev, "Commit DCB
Configuration to the hardware\n"); + /* Notify capable peers about impending change to TCs */ + ice_for_each_peer(pf, NULL, ice_peer_prep_tc_change); /* avoid race conditions by holding the lock while disabling and * re-enabling the VSI */ if (!locked) rtnl_lock(); - ice_pf_dis_all_vsi(pf, true); + + /* disable VSIs affected by DCB changes */ + ice_dcb_ena_dis_vsi(pf, false, true); memcpy(curr_cfg, new_cfg, sizeof(*curr_cfg)); memcpy(&curr_cfg->etsrec, &curr_cfg->etscfg, sizeof(curr_cfg->etsrec)); + memcpy(&new_cfg->etsrec, &curr_cfg->etscfg, sizeof(curr_cfg->etsrec)); /* Only send new config to HW if we are in SW LLDP mode. Otherwise, * the new config came from the HW in the first place. */ - if (pf->hw.port_info->is_sw_lldp) { + if (pf->hw.port_info->qos_cfg.is_sw_lldp) { ret = ice_set_dcb_cfg(pf->hw.port_info); if (ret) { - dev_err(&pf->pdev->dev, "Set DCB Config failed\n"); + dev_err(dev, "Set DCB Config failed\n"); /* Restore previous settings to local config */ memcpy(curr_cfg, old_cfg, sizeof(*curr_cfg)); goto out; @@ -231,17 +461,18 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { - dev_err(&pf->pdev->dev, "Query Port ETS failed\n"); + dev_err(dev, "Query Port ETS failed\n"); goto out; } ice_pf_dcb_recfg(pf); out: - ice_pf_ena_all_vsi(pf, true); + /* enable previously downed VSIs */ + ice_dcb_ena_dis_vsi(pf, true, true); if (!locked) rtnl_unlock(); - devm_kfree(&pf->pdev->dev, old_cfg); + kfree(old_cfg); return ret; } @@ -251,7 +482,7 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) */ static void ice_cfg_etsrec_defaults(struct ice_port_info *pi) { - struct ice_dcbx_cfg *dcbcfg = &pi->local_dcbx_cfg; + struct ice_dcbx_cfg *dcbcfg = &pi->qos_cfg.local_dcbx_cfg; u8 i; /* Ensure ETS recommended DCB configuration is not already set */ @@ -277,6 +508,7 @@ static bool ice_dcb_need_recfg(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, struct ice_dcbx_cfg *new_cfg) { + struct device *dev = ice_pf_to_dev(pf); bool need_reconfig = false; /* Check if ETS configuration has changed */ @@ -287,33 +519,33 @@ ice_dcb_need_recfg(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, &old_cfg->etscfg.prio_table, sizeof(new_cfg->etscfg.prio_table))) { need_reconfig = true; - dev_dbg(&pf->pdev->dev, "ETS UP2TC changed.\n"); + dev_dbg(dev, "ETS UP2TC changed.\n"); } if (memcmp(&new_cfg->etscfg.tcbwtable, &old_cfg->etscfg.tcbwtable, sizeof(new_cfg->etscfg.tcbwtable))) - dev_dbg(&pf->pdev->dev, "ETS TC BW Table changed.\n"); + dev_dbg(dev, "ETS TC BW Table changed.\n"); if (memcmp(&new_cfg->etscfg.tsatable, &old_cfg->etscfg.tsatable, sizeof(new_cfg->etscfg.tsatable))) - dev_dbg(&pf->pdev->dev, "ETS TSA Table changed.\n"); + dev_dbg(dev, "ETS TSA Table changed.\n"); } /* Check if PFC configuration has changed */ if (memcmp(&new_cfg->pfc, &old_cfg->pfc, sizeof(new_cfg->pfc))) { need_reconfig = true; - dev_dbg(&pf->pdev->dev, "PFC config change detected.\n"); + dev_dbg(dev, "PFC config change detected.\n"); } /* Check if APP Table has changed */ if (memcmp(&new_cfg->app, &old_cfg->app, sizeof(new_cfg->app))) { need_reconfig = true; - dev_dbg(&pf->pdev->dev, "APP Table change detected.\n"); + dev_dbg(dev, "APP Table change detected.\n"); } - dev_dbg(&pf->pdev->dev, "dcb need_reconfig=%d\n", need_reconfig); + dev_dbg(dev, "dcb need_reconfig=%d\n", need_reconfig); return need_reconfig; } @@ -323,87 +555,69 @@ ice_dcb_need_recfg(struct ice_pf *pf, struct ice_dcbx_cfg 
*old_cfg, */ void ice_dcb_rebuild(struct ice_pf *pf) { - struct ice_dcbx_cfg *local_dcbx_cfg, *desired_dcbx_cfg, *prev_cfg; struct ice_aqc_port_ets_elem buf = { 0 }; + struct device *dev = ice_pf_to_dev(pf); + struct ice_dcbx_cfg *err_cfg; enum ice_status ret; ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { - dev_err(&pf->pdev->dev, "Query Port ETS failed\n"); + dev_err(dev, "Query Port ETS failed\n"); goto dcb_error; } - /* If DCB was not enabled previously, we are done */ - if (!test_bit(ICE_FLAG_DCB_ENA, pf->flags)) - return; - - local_dcbx_cfg = &pf->hw.port_info->local_dcbx_cfg; - desired_dcbx_cfg = &pf->hw.port_info->desired_dcbx_cfg; + mutex_lock(&pf->tc_mutex); - /* Save current willing state and force FW to unwilling */ - local_dcbx_cfg->etscfg.willing = 0x0; - local_dcbx_cfg->pfc.willing = 0x0; - local_dcbx_cfg->app_mode = ICE_DCBX_APPS_NON_WILLING; + if (!pf->hw.port_info->qos_cfg.is_sw_lldp) + ice_cfg_etsrec_defaults(pf->hw.port_info); - ice_cfg_etsrec_defaults(pf->hw.port_info); ret = ice_set_dcb_cfg(pf->hw.port_info); if (ret) { - dev_err(&pf->pdev->dev, "Failed to set DCB to unwilling\n"); + dev_err(dev, "Failed to set DCB config in rebuild\n"); goto dcb_error; } - /* Retrieve DCB config and ensure same as current in SW */ - prev_cfg = devm_kmemdup(&pf->pdev->dev, local_dcbx_cfg, - sizeof(*prev_cfg), GFP_KERNEL); - if (!prev_cfg) { - dev_err(&pf->pdev->dev, "Failed to alloc space for DCB cfg\n"); - goto dcb_error; - } - - ice_init_dcb(&pf->hw, true); - if (pf->hw.port_info->dcbx_status == ICE_DCBX_STATUS_DIS) - pf->hw.port_info->is_sw_lldp = true; - else - pf->hw.port_info->is_sw_lldp = false; - - if (ice_dcb_need_recfg(pf, prev_cfg, local_dcbx_cfg)) { - /* difference in cfg detected - disable DCB till next MIB */ - dev_err(&pf->pdev->dev, "Set local MIB not accurate\n"); - goto dcb_error; + if (!pf->hw.port_info->qos_cfg.is_sw_lldp) { + ret = ice_cfg_lldp_mib_change(&pf->hw, true); + if (ret && !pf->hw.port_info->qos_cfg.is_sw_lldp) { + dev_err(dev, "Failed to register for MIB changes\n"); + goto dcb_error; + } } - /* fetched config congruent to previous configuration */ - devm_kfree(&pf->pdev->dev, prev_cfg); - - /* Set the local desired config */ - if (local_dcbx_cfg->dcbx_mode == ICE_DCBX_MODE_CEE) - memcpy(local_dcbx_cfg, desired_dcbx_cfg, - sizeof(*local_dcbx_cfg)); - - ice_cfg_etsrec_defaults(pf->hw.port_info); - ret = ice_set_dcb_cfg(pf->hw.port_info); - if (ret) { - dev_err(&pf->pdev->dev, "Failed to set desired config\n"); - goto dcb_error; - } - dev_info(&pf->pdev->dev, "DCB restored after reset\n"); + dev_info(dev, "DCB info restored\n"); ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { - dev_err(&pf->pdev->dev, "Query Port ETS failed\n"); + dev_err(dev, "Query Port ETS failed\n"); goto dcb_error; } + mutex_unlock(&pf->tc_mutex); + return; dcb_error: - dev_err(&pf->pdev->dev, "Disabling DCB until new settings occur\n"); - prev_cfg = devm_kzalloc(&pf->pdev->dev, sizeof(*prev_cfg), GFP_KERNEL); - prev_cfg->etscfg.willing = true; - prev_cfg->etscfg.tcbwtable[0] = ICE_TC_MAX_BW; - prev_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; - memcpy(&prev_cfg->etsrec, &prev_cfg->etscfg, sizeof(prev_cfg->etsrec)); - ice_pf_dcb_cfg(pf, prev_cfg, false); - devm_kfree(&pf->pdev->dev, prev_cfg); + dev_err(dev, "Disabling DCB until new settings occur\n"); + err_cfg = kzalloc(sizeof(*err_cfg), GFP_KERNEL); + if (!err_cfg) { + mutex_unlock(&pf->tc_mutex); + return; + } + + err_cfg->etscfg.willing = true; + 
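ice_pf_dcb_cfg() earlier in this file commits the new configuration to the software state first and keeps a kmemdup()'d copy so it can roll back when the firmware call fails. A userspace analogue of that save/apply/restore pattern; apply_cfg() is a stand-in that always fails so the rollback path runs:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dcb_cfg { int tcbw[8]; };

/* stand-in for the firmware commit; always reject the new config */
static bool apply_cfg(const struct dcb_cfg *cfg)
{
	(void)cfg;
	return false;
}

static int set_cfg(struct dcb_cfg *cur, const struct dcb_cfg *new_cfg)
{
	struct dcb_cfg *old;

	old = malloc(sizeof(*old));		/* like kmemdup(curr_cfg, ...) */
	if (!old)
		return -1;
	memcpy(old, cur, sizeof(*old));

	memcpy(cur, new_cfg, sizeof(*cur));	/* commit to SW state first */
	if (!apply_cfg(cur))
		memcpy(cur, old, sizeof(*cur));	/* FW failed: roll back */

	free(old);
	return 0;
}

int main(void)
{
	struct dcb_cfg cur = { .tcbw = { 100 } };
	struct dcb_cfg next = { .tcbw = { 50, 50 } };

	set_cfg(&cur, &next);
	/* prints the rolled-back values: tcbw[0]=100 tcbw[1]=0 */
	printf("tcbw[0]=%d tcbw[1]=%d\n", cur.tcbw[0], cur.tcbw[1]);
	return 0;
}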
err_cfg->etscfg.tcbwtable[0] = ICE_TC_MAX_BW; + err_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; + memcpy(&err_cfg->etsrec, &err_cfg->etscfg, sizeof(err_cfg->etsrec)); + /* Coverity warns the return code of ice_pf_dcb_cfg() is not checked + * here as is done for other calls to that function. That check is + * not necessary since this is in this function's error cleanup path. + * Suppress the Coverity warning with the following comment... + */ + /* coverity[check_return] */ + ice_pf_dcb_cfg(pf, err_cfg, false); + kfree(err_cfg); + + mutex_unlock(&pf->tc_mutex); } /** @@ -418,28 +632,29 @@ static int ice_dcb_init_cfg(struct ice_pf *pf, bool locked) int ret = 0; pi = pf->hw.port_info; - newcfg = devm_kzalloc(&pf->pdev->dev, sizeof(*newcfg), GFP_KERNEL); + newcfg = kmemdup(&pi->qos_cfg.local_dcbx_cfg, sizeof(*newcfg), + GFP_KERNEL); if (!newcfg) return -ENOMEM; - memcpy(newcfg, &pi->local_dcbx_cfg, sizeof(*newcfg)); - memset(&pi->local_dcbx_cfg, 0, sizeof(*newcfg)); + memset(&pi->qos_cfg.local_dcbx_cfg, 0, sizeof(*newcfg)); - dev_info(&pf->pdev->dev, "Configuring initial DCB values\n"); + dev_info(ice_pf_to_dev(pf), "Configuring initial DCB values\n"); if (ice_pf_dcb_cfg(pf, newcfg, locked)) ret = -EINVAL; - devm_kfree(&pf->pdev->dev, newcfg); + kfree(newcfg); return ret; } /** - * ice_dcb_sw_default_config - Apply a default DCB config + * ice_dcb_sw_dflt_cfg - Apply a default DCB config * @pf: PF to apply config to + * @ets_willing: configure ETS willing * @locked: was this function called with RTNL held */ -static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool locked) +int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool ets_willing, bool locked) { struct ice_aqc_port_ets_elem buf = { 0 }; struct ice_dcbx_cfg *dcbcfg; @@ -449,12 +664,13 @@ static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool locked) hw = &pf->hw; pi = hw->port_info; - dcbcfg = devm_kzalloc(&pf->pdev->dev, sizeof(*dcbcfg), GFP_KERNEL); + dcbcfg = kzalloc(sizeof(*dcbcfg), GFP_KERNEL); + if (!dcbcfg) + return -ENOMEM; - memset(dcbcfg, 0, sizeof(*dcbcfg)); - memset(&pi->local_dcbx_cfg, 0, sizeof(*dcbcfg)); + memset(&pi->qos_cfg.local_dcbx_cfg, 0, sizeof(*dcbcfg)); - dcbcfg->etscfg.willing = 1; + dcbcfg->etscfg.willing = ets_willing ? 
1 : 0; dcbcfg->etscfg.maxtcs = hw->func_caps.common_cap.maxtc; dcbcfg->etscfg.tcbwtable[0] = 100; dcbcfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; @@ -469,16 +685,135 @@ static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool locked) dcbcfg->numapps = 1; dcbcfg->app[0].selector = ICE_APP_SEL_ETHTYPE; dcbcfg->app[0].priority = 3; - dcbcfg->app[0].prot_id = ICE_APP_PROT_ID_FCOE; + dcbcfg->app[0].prot_id = ETH_P_FCOE; ret = ice_pf_dcb_cfg(pf, dcbcfg, locked); - devm_kfree(&pf->pdev->dev, dcbcfg); + kfree(dcbcfg); if (ret) return ret; return ice_query_port_ets(pi, &buf, sizeof(buf), NULL); } +/** + * ice_dcb_tc_contig - Check that TCs are contiguous + * @prio_table: pointer to priority table + * + * Check if TCs begin with TC0 and are contiguous + */ +static bool ice_dcb_tc_contig(u8 *prio_table) +{ + bool found_empty = false; + u8 used_tc = 0; + int i; + + /* Create a bitmap of used TCs */ + for (i = 0; i < CEE_DCBX_MAX_PRIO; i++) + used_tc |= BIT(prio_table[i]); + + for (i = 0; i < CEE_DCBX_MAX_PRIO; i++) { + if (used_tc & BIT(i)) { + if (found_empty) + return false; + } else { + found_empty = true; + } + } + + return true; +} + +/** + * ice_dcb_noncontig_cfg - Configure DCB for non-contiguous TCs + * @pf: pointer to the PF struct + * + * If non-contiguous TCs, then configure SW DCB with TC0 and ETS non-willing + */ +static int ice_dcb_noncontig_cfg(struct ice_pf *pf) +{ + struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + struct device *dev = ice_pf_to_dev(pf); + int ret; + + /* Configure SW DCB default with ETS non-willing */ + ret = ice_dcb_sw_dflt_cfg(pf, false, true); + if (ret) { + dev_err(dev, "Failed to set local DCB config %d\n", ret); + return ret; + } + + /* Reconfigure with ETS willing so that FW will send LLDP MIB event */ + dcbcfg->etscfg.willing = 1; + ret = ice_set_dcb_cfg(pf->hw.port_info); + if (ret) + dev_err(dev, "Failed to set DCB to unwilling\n"); + + return ret; +} + +/** + * ice_pf_dcb_recfg - Reconfigure all VEBs and VSIs + * @pf: pointer to the PF struct + * + * Assumed caller has already disabled all VSIs before + * calling this function. Reconfiguring DCB based on + * local_dcbx_cfg. + */ +void ice_pf_dcb_recfg(struct ice_pf *pf) +{ + struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + u8 tc_map = 0; + int v, ret; + + + /* Update each VSI */ + ice_for_each_vsi(pf, v) { + struct ice_vsi *vsi = pf->vsi[v]; + + if (!vsi) + continue; + + if (vsi->type == ICE_VSI_PF) { + tc_map = ice_dcb_get_ena_tc(dcbcfg); + + /* If DCBX request non-contiguous TC, then configure + * default TC + */ + if (!ice_dcb_tc_contig(dcbcfg->etscfg.prio_table)) { + tc_map = ICE_DFLT_TRAFFIC_CLASS; + ice_dcb_noncontig_cfg(pf); + } +#if defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO) && defined(HAVE_NETDEV_SB_DEV) + } else if (vsi->type == ICE_VSI_CHNL || + vsi->type == ICE_VSI_OFFLOAD_MACVLAN) { + tc_map = BIT(ice_get_first_droptc(vsi)); +# endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO && HAVE_NETDEV_SB_DEV */ + } else { + tc_map = ICE_DFLT_TRAFFIC_CLASS; + } + + ret = ice_vsi_cfg_tc(vsi, tc_map); + if (ret) { + dev_err(ice_pf_to_dev(pf), "Failed to config TC for VSI index: %d\n", + vsi->idx); + continue; + } + /* no need to proceed with remaining cfg if it is CHNL VSI */ + if (vsi->type == ICE_VSI_CHNL) + continue; + + ice_vsi_map_rings_to_vectors(vsi); + if (vsi->type == ICE_VSI_PF) + ice_dcbnl_set_all(vsi); + } + /* If the RDMA peer is registered, update that peer's initial_qos_info struct. 
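ice_dcb_tc_contig() above builds a bitmap of every TC named in the CEE priority table and rejects a set of TCs that does not start at TC0 and run without holes. The same walk as a standalone sketch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CEE_DCBX_MAX_PRIO 8

/* TCs named in prio_table must begin at TC0 and be contiguous */
static bool tc_contig(const uint8_t *prio_table)
{
	bool found_empty = false;
	uint8_t used_tc = 0;
	int i;

	for (i = 0; i < CEE_DCBX_MAX_PRIO; i++)
		used_tc |= 1 << prio_table[i];

	for (i = 0; i < CEE_DCBX_MAX_PRIO; i++) {
		if (used_tc & (1 << i)) {
			if (found_empty)
				return false;	/* hole before this TC */
		} else {
			found_empty = true;
		}
	}
	return true;
}

int main(void)
{
	uint8_t good[CEE_DCBX_MAX_PRIO] = { 0, 0, 1, 1, 2, 2, 2, 2 };
	uint8_t bad[CEE_DCBX_MAX_PRIO]  = { 0, 0, 2, 2, 2, 2, 2, 2 }; /* TC1 unused */

	printf("good contiguous: %d\n", tc_contig(good));	/* 1 */
	printf("bad contiguous:  %d\n", tc_contig(bad));	/* 0 */
	return 0;
}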
+ * The peer is closed during this process, so when it is opened, it will access + * the initial_qos_info element to configure itself. + */ + if (pf->rdma_peer) + ice_setup_dcb_qos_info(pf, &pf->rdma_peer->initial_qos_info); +} + /** * ice_init_pf_dcb - initialize DCB for a PF * @pf: PF to initialize DCB for @@ -486,43 +821,63 @@ static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool locked) */ int ice_init_pf_dcb(struct ice_pf *pf, bool locked) { - struct device *dev = &pf->pdev->dev; + struct device *dev = ice_pf_to_dev(pf); struct ice_port_info *port_info; struct ice_hw *hw = &pf->hw; + enum ice_status status; int err; port_info = hw->port_info; - err = ice_init_dcb(hw, false); - if (err && !port_info->is_sw_lldp) { - dev_err(&pf->pdev->dev, "Error initializing DCB %d\n", err); + status = ice_init_dcb(hw, false); + if (status && !port_info->qos_cfg.is_sw_lldp) { + dev_err(dev, "Error initializing DCB %s\n", + ice_stat_str(status)); + err = ice_status_to_errno(status); goto dcb_init_err; } - dev_info(&pf->pdev->dev, - "DCB is enabled in the hardware, max number of TCs supported on this port are %d\n", + dev_info(dev, "DCB is enabled in the hardware, max number of TCs supported on this port are %d\n", pf->hw.func_caps.common_cap.maxtc); - if (err) { + if (port_info->qos_cfg.is_sw_lldp) { + struct ice_vsi *pf_vsi; + /* FW LLDP is disabled, activate SW DCBX/LLDP mode */ - dev_info(&pf->pdev->dev, - "FW LLDP is disabled, DCBx/LLDP in SW mode.\n"); + dev_info(dev, "FW LLDP is disabled, DCBx/LLDP in SW mode.\n"); clear_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags); - err = ice_dcb_sw_dflt_cfg(pf, locked); + err = ice_aq_set_pfc_mode(&pf->hw, ICE_AQC_PFC_VLAN_BASED_PFC, + NULL); + if (err) + dev_info(dev, "Fail to set VLAN PFC mode\n"); + + err = ice_dcb_sw_dflt_cfg(pf, true, locked); if (err) { - dev_err(&pf->pdev->dev, - "Failed to set local DCB config %d\n", err); + dev_err(dev, "Failed to set local DCB config %d\n", + err); + err = -EIO; + goto dcb_init_err; + } + + /* If the FW DCBX engine is not running then Rx LLDP packets + * need to be redirected up the stack. 
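The bandwidth rule enforced by ice_dcb_bwchk() earlier in this file is easy to trip from configuration tools: with more than one TC the ETS percentages must total exactly 100, and an all-zero table means "give everything to TC0". A minimal sketch of that validation, with -1 standing in for -EINVAL:

#include <stdint.h>
#include <stdio.h>

#define ICE_TC_MAX_BW 100

static int bwchk(uint8_t *tcbw, int num_tc)
{
	int i, total = 0;

	/* a single TC needs no check; it simply owns all the bandwidth */
	if (num_tc == 1) {
		tcbw[0] = ICE_TC_MAX_BW;
		return 0;
	}

	for (i = 0; i < num_tc; i++)
		total += tcbw[i];

	if (!total)
		tcbw[0] = ICE_TC_MAX_BW;	/* unconfigured: all to TC0 */
	else if (total != ICE_TC_MAX_BW)
		return -1;			/* must sum to exactly 100 */

	return 0;
}

int main(void)
{
	uint8_t ok[4] = { 40, 30, 20, 10 };
	uint8_t bad[4] = { 50, 30, 20, 10 };

	printf("40/30/20/10 -> %d\n", bwchk(ok, 4));	/* 0 */
	printf("50/30/20/10 -> %d\n", bwchk(bad, 4));	/* -1 */
	return 0;
}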
+ */ + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) { + dev_err(dev, "Failed to set local DCB config\n"); err = -EIO; goto dcb_init_err; } - pf->dcbx_cap = DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE; + ice_cfg_sw_lldp(pf_vsi, false, true); + + pf->dcbx_cap = ice_dcb_get_mode(port_info, true); return 0; } set_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags); - /* DCBX in FW and LLDP enabled in FW */ - pf->dcbx_cap = DCB_CAP_DCBX_LLD_MANAGED | DCB_CAP_DCBX_VER_IEEE; + /* DCBX/LLDP enabled in FW, set DCBNL mode advertisement */ + pf->dcbx_cap = ice_dcb_get_mode(port_info, false); err = ice_dcb_init_cfg(pf, locked); if (err) @@ -578,39 +933,70 @@ void ice_update_dcb_stats(struct ice_pf *pf) * ice_tx_prepare_vlan_flags_dcb - prepare VLAN tagging for DCB * @tx_ring: ring to send buffer on * @first: pointer to struct ice_tx_buf + * + * This should not be called if the outer VLAN is software offloaded as the VLAN + * tag will already be configured with the correct ID and priority bits */ -int +void ice_tx_prepare_vlan_flags_dcb(struct ice_ring *tx_ring, struct ice_tx_buf *first) { struct sk_buff *skb = first->skb; if (!test_bit(ICE_FLAG_DCB_ENA, tx_ring->vsi->back->flags)) - return 0; + return; /* Insert 802.1p priority into VLAN header */ - if ((first->tx_flags & (ICE_TX_FLAGS_HW_VLAN | ICE_TX_FLAGS_SW_VLAN)) || + if ((first->tx_flags & ICE_TX_FLAGS_HW_VLAN || + first->tx_flags & ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN) || skb->priority != TC_PRIO_CONTROL) { first->tx_flags &= ~ICE_TX_FLAGS_VLAN_PR_M; /* Mask the lower 3 bits to set the 802.1p priority */ first->tx_flags |= (skb->priority & 0x7) << ICE_TX_FLAGS_VLAN_PR_S; - if (first->tx_flags & ICE_TX_FLAGS_SW_VLAN) { - struct vlan_ethhdr *vhdr; - int rc; - - rc = skb_cow_head(skb, 0); - if (rc < 0) - return rc; - vhdr = (struct vlan_ethhdr *)skb->data; - vhdr->h_vlan_TCI = htons(first->tx_flags >> - ICE_TX_FLAGS_VLAN_S); - } else { + /* if this is not already set it means a VLAN 0 + priority needs + * to be offloaded + */ + if (tx_ring->flags & ICE_TX_FLAGS_VLAN_TAG_LOC_L2TAG2) + first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN; + else first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; - } } +} - return 0; +/** + * ice_setup_dcb_qos_info - Setup DCB QoS information + * @pf: ptr to ice_pf + * @qos_info: QoS param instance + */ +void ice_setup_dcb_qos_info(struct ice_pf *pf, struct ice_qos_params *qos_info) +{ + struct ice_dcbx_cfg *dcbx_cfg; + unsigned int i; + u32 up2tc; + + dcbx_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + up2tc = rd32(&pf->hw, PRTDCB_TUP2TC); + qos_info->num_apps = dcbx_cfg->numapps; + + qos_info->num_tc = ice_dcb_get_num_tc(dcbx_cfg); + + for (i = 0; i < ICE_IDC_MAX_USER_PRIORITY; i++) + qos_info->up2tc[i] = (up2tc >> (i * 3)) & 0x7; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + qos_info->tc_info[i].rel_bw = + dcbx_cfg->etscfg.tcbwtable[i]; + + for (i = 0; i < qos_info->num_apps; i++) { + qos_info->apps[i].priority = dcbx_cfg->app[i].priority; + qos_info->apps[i].prot_id = dcbx_cfg->app[i].prot_id; + qos_info->apps[i].selector = dcbx_cfg->app[i].selector; + } + + qos_info->pfc_mode = dcbx_cfg->pfc_mode; + for (i = 0; i < ICE_IDC_DSCP_NUM_VAL; i++) + qos_info->dscp_map[i] = dcbx_cfg->dscp_map[i]; } /** @@ -623,11 +1009,12 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event) { struct ice_aqc_port_ets_elem buf = { 0 }; + struct device *dev = ice_pf_to_dev(pf); struct ice_aqc_lldp_get_mib *mib; struct ice_dcbx_cfg tmp_dcbx_cfg; bool need_reconfig = false; struct ice_port_info *pi; - u8 type; + u8 
mib_type; int ret; /* Not DCB capable or capability disabled */ @@ -635,82 +1022,90 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, return; if (pf->dcbx_cap & DCB_CAP_DCBX_HOST) { - dev_dbg(&pf->pdev->dev, - "MIB Change Event in HOST mode\n"); + dev_dbg(dev, "MIB Change Event in HOST mode\n"); return; } pi = pf->hw.port_info; mib = (struct ice_aqc_lldp_get_mib *)&event->desc.params.raw; /* Ignore if event is not for Nearest Bridge */ - type = ((mib->type >> ICE_AQ_LLDP_BRID_TYPE_S) & - ICE_AQ_LLDP_BRID_TYPE_M); - dev_dbg(&pf->pdev->dev, "LLDP event MIB bridge type 0x%x\n", type); - if (type != ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID) + mib_type = ((mib->type >> ICE_AQ_LLDP_BRID_TYPE_S) & + ICE_AQ_LLDP_BRID_TYPE_M); + dev_dbg(dev, "LLDP event MIB bridge type 0x%x\n", mib_type); + if (mib_type != ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID) return; /* Check MIB Type and return if event for Remote MIB update */ - type = mib->type & ICE_AQ_LLDP_MIB_TYPE_M; - dev_dbg(&pf->pdev->dev, - "LLDP event mib type %s\n", type ? "remote" : "local"); - if (type == ICE_AQ_LLDP_MIB_REMOTE) { + mib_type = mib->type & ICE_AQ_LLDP_MIB_TYPE_M; + dev_dbg(dev, "LLDP event mib type %s\n", mib_type ? "remote" : "local"); + if (mib_type == ICE_AQ_LLDP_MIB_REMOTE) { /* Update the remote cached instance and return */ ret = ice_aq_get_dcb_cfg(pi->hw, ICE_AQ_LLDP_MIB_REMOTE, ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID, - &pi->remote_dcbx_cfg); + &pi->qos_cfg.remote_dcbx_cfg); if (ret) { - dev_err(&pf->pdev->dev, "Failed to get remote DCB config\n"); + dev_err(dev, "Failed to get remote DCB config\n"); return; } } + mutex_lock(&pf->tc_mutex); + /* store the old configuration */ - tmp_dcbx_cfg = pf->hw.port_info->local_dcbx_cfg; + tmp_dcbx_cfg = pf->hw.port_info->qos_cfg.local_dcbx_cfg; /* Reset the old DCBX configuration data */ - memset(&pi->local_dcbx_cfg, 0, sizeof(pi->local_dcbx_cfg)); + memset(&pi->qos_cfg.local_dcbx_cfg, 0, + sizeof(pi->qos_cfg.local_dcbx_cfg)); /* Get updated DCBX data from firmware */ ret = ice_get_dcb_cfg(pf->hw.port_info); if (ret) { - dev_err(&pf->pdev->dev, "Failed to get DCB config\n"); - return; + dev_err(dev, "Failed to get DCB config\n"); + goto out; } /* No change detected in DCBX configs */ - if (!memcmp(&tmp_dcbx_cfg, &pi->local_dcbx_cfg, sizeof(tmp_dcbx_cfg))) { - dev_dbg(&pf->pdev->dev, - "No change detected in DCBX configuration.\n"); - return; + if (!memcmp(&tmp_dcbx_cfg, &pi->qos_cfg.local_dcbx_cfg, + sizeof(tmp_dcbx_cfg))) { + dev_dbg(dev, "No change detected in DCBX configuration.\n"); + goto out; } + pf->dcbx_cap = ice_dcb_get_mode(pi, false); + need_reconfig = ice_dcb_need_recfg(pf, &tmp_dcbx_cfg, - &pi->local_dcbx_cfg); + &pi->qos_cfg.local_dcbx_cfg); + ice_dcbnl_flush_apps(pf, &tmp_dcbx_cfg, &pi->qos_cfg.local_dcbx_cfg); if (!need_reconfig) - return; + goto out; /* Enable DCB tagging only when more than one TC */ - if (ice_dcb_get_num_tc(&pi->local_dcbx_cfg) > 1) { - dev_dbg(&pf->pdev->dev, "DCB tagging enabled (num TC > 1)\n"); + if (ice_dcb_get_num_tc(&pi->qos_cfg.local_dcbx_cfg) > 1) { + dev_dbg(dev, "DCB tagging enabled (num TC > 1)\n"); set_bit(ICE_FLAG_DCB_ENA, pf->flags); } else { - dev_dbg(&pf->pdev->dev, "DCB tagging disabled (num TC = 1)\n"); + dev_dbg(dev, "DCB tagging disabled (num TC = 1)\n"); clear_bit(ICE_FLAG_DCB_ENA, pf->flags); } rtnl_lock(); - ice_pf_dis_all_vsi(pf, true); + /* disable VSIs affected by DCB changes */ + ice_dcb_ena_dis_vsi(pf, false, true); ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { - dev_err(&pf->pdev->dev, "Query 
Port ETS failed\n"); - rtnl_unlock(); - return; + dev_err(dev, "Query Port ETS failed\n"); + goto unlock_rtnl; } /* changes in configuration update VSI */ ice_pf_dcb_recfg(pf); - ice_pf_ena_all_vsi(pf, true); + /* enable previously downed VSIs */ + ice_dcb_ena_dis_vsi(pf, true, true); +unlock_rtnl: rtnl_unlock(); +out: + mutex_unlock(&pf->tc_mutex); } diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h index 661a6f7bca6494e196bdf5ccea33c08b468dcb25..971a8561415a084ff3937cca8c44e89feb64bd6a 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h @@ -1,50 +1,105 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_DCB_LIB_H_ #define _ICE_DCB_LIB_H_ #include "ice.h" +#include "ice_base.h" #include "ice_lib.h" #ifdef CONFIG_DCB -#define ICE_TC_MAX_BW 100 /* Default Max BW percentage */ +#define ICE_TC_MAX_BW 100 /* Default Max BW percentage */ +#define ICE_DCB_HW_CHG_RST 0 /* DCB configuration changed with reset */ +#define ICE_DCB_NO_HW_CHG 1 /* DCB configuration did not change */ +#define ICE_DCB_HW_CHG 2 /* DCB configuration changed, no reset */ void ice_dcb_rebuild(struct ice_pf *pf); -u8 ice_dcb_get_ena_tc(struct ice_dcbx_cfg *dcbcfg); -u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg *dcbcfg); +int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool ets_willing, bool locked); +void ice_vsi_set_dcb_tc_cfg(struct ice_vsi *vsi); +bool ice_is_pfc_causing_hung_q(struct ice_pf *pf, unsigned int txqueue); +#ifdef HAVE_NDO_SET_TX_MAXRATE +u8 ice_dcb_get_tc(struct ice_vsi *vsi, int queue_index); +#endif /* HAVE_NDO_SET_TX_MAXRATE */ +int +ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked); +int ice_dcb_bwchk(struct ice_pf *pf, struct ice_dcbx_cfg *dcbcfg); +void ice_pf_dcb_recfg(struct ice_pf *pf); void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi); int ice_init_pf_dcb(struct ice_pf *pf, bool locked); void ice_update_dcb_stats(struct ice_pf *pf); -int +void ice_tx_prepare_vlan_flags_dcb(struct ice_ring *tx_ring, struct ice_tx_buf *first); +void ice_setup_dcb_qos_info(struct ice_pf *pf, struct ice_qos_params *qos_info); void ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event); -void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc); + +/** + * ice_find_q_in_range + * @low: start of queue range for a TC i.e. 
offset of TC + * @high: start of queue for next TC + * @tx_q: hung_queue/tx_queue + * + * finds if queue 'tx_q' falls between the two offsets of any given TC + */ +static inline bool ice_find_q_in_range(u16 low, u16 high, unsigned int tx_q) +{ + return (tx_q >= low) && (tx_q < high); +} + static inline void ice_set_cgd_num(struct ice_tlan_ctx *tlan_ctx, struct ice_ring *ring) { tlan_ctx->cgd_num = ring->dcb_tc; } + +static inline bool ice_is_dcb_active(struct ice_pf *pf) +{ + return (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags) || + test_bit(ICE_FLAG_DCB_ENA, pf->flags)); +} + +static inline u8 ice_get_pfc_mode(struct ice_pf *pf) +{ + return pf->hw.port_info->qos_cfg.local_dcbx_cfg.pfc_mode; +} + #else -#define ice_dcb_rebuild(pf) do {} while (0) +static inline void ice_dcb_rebuild(struct ice_pf *pf) { } +static inline void ice_vsi_set_dcb_tc_cfg(struct ice_vsi *vsi) +{ + vsi->tc_cfg.ena_tc = ICE_DFLT_TRAFFIC_CLASS; + vsi->tc_cfg.numtc = 1; +} -static inline u8 ice_dcb_get_ena_tc(struct ice_dcbx_cfg __always_unused *dcbcfg) +static inline u8 ice_get_first_droptc(struct ice_vsi __always_unused *vsi) { - return ICE_DFLT_TRAFFIC_CLASS; + return 0; } -static inline u8 ice_dcb_get_num_tc(struct ice_dcbx_cfg __always_unused *dcbcfg) +#ifdef HAVE_NDO_SET_TX_MAXRATE +static inline u8 +ice_dcb_get_tc(struct ice_vsi __always_unused *vsi, + int __always_unused queue_index) { - return 1; + return 0; } +#endif /* HAVE_NDO_SET_TX_MAXRATE */ static inline int ice_init_pf_dcb(struct ice_pf *pf, bool __always_unused locked) { - dev_dbg(&pf->pdev->dev, "DCB not supported\n"); + dev_dbg(ice_pf_to_dev(pf), "DCB not supported\n"); + return -EOPNOTSUPP; +} + +static inline int +ice_pf_dcb_cfg(struct ice_pf __always_unused *pf, + struct ice_dcbx_cfg __always_unused *new_cfg, + bool __always_unused locked) +{ return -EOPNOTSUPP; } @@ -55,10 +110,30 @@ ice_tx_prepare_vlan_flags_dcb(struct ice_ring __always_unused *tx_ring, return 0; } -#define ice_update_dcb_stats(pf) do {} while (0) -#define ice_vsi_cfg_dcb_rings(vsi) do {} while (0) -#define ice_dcb_process_lldp_set_mib_change(pf, event) do {} while (0) -#define ice_set_cgd_num(tlan_ctx, ring) do {} while (0) -#define ice_vsi_cfg_netdev_tc(vsi, ena_tc) do {} while (0) +static inline bool ice_is_dcb_active(struct ice_pf __always_unused *pf) +{ + return false; +} + +static inline bool +ice_is_pfc_causing_hung_q(struct ice_pf __always_unused *pf, + unsigned int __always_unused txqueue) +{ + return false; +} + +static inline u8 ice_get_pfc_mode(struct ice_pf *pf) +{ + return -EOPNOTSUPP; +} + +static inline void ice_pf_dcb_recfg(struct ice_pf *pf) { } +static inline void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) { } +static inline void ice_update_dcb_stats(struct ice_pf *pf) { } +static inline void ice_setup_dcb_qos_info(struct ice_pf *pf, struct ice_qos_params *qos_info) { } +static inline +void ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event) { } +static inline void ice_set_cgd_num(struct ice_tlan_ctx *tlan_ctx, struct ice_ring *ring) { } #endif /* CONFIG_DCB */ + #endif /* _ICE_DCB_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c new file mode 100644 index 0000000000000000000000000000000000000000..4e7acd02f446794c88053b9d60340a3e4bab2c27 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -0,0 +1,1139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/
+
+#include "ice.h"
+#include "ice_dcb.h"
+#include "ice_dcb_lib.h"
+#include "ice_dcb_nl.h"
+#include <net/dcbnl.h>
+
+/**
+ * ice_dcbnl_devreset - perform enough of an ifdown/ifup to sync DCBNL info
+ * @netdev: device associated with interface that needs reset
+ */
+static void ice_dcbnl_devreset(struct net_device *netdev)
+{
+ struct ice_pf *pf = ice_netdev_to_pf(netdev);
+
+ while (ice_is_reset_in_progress(pf->state))
+ usleep_range(1000, 2000);
+
+ dev_close(netdev);
+ netdev_state_change(netdev);
+ dev_open(netdev, NULL);
+ netdev_state_change(netdev);
+}
+
+/**
+ * ice_dcbnl_getets - retrieve local ETS configuration
+ * @netdev: the relevant netdev
+ * @ets: struct to hold ETS configuration
+ */
+static int ice_dcbnl_getets(struct net_device *netdev, struct ieee_ets *ets)
+{
+ struct ice_dcbx_cfg *dcbxcfg;
+ struct ice_pf *pf;
+
+ pf = ice_netdev_to_pf(netdev);
+ dcbxcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
+
+ ets->willing = dcbxcfg->etscfg.willing;
+ ets->ets_cap = dcbxcfg->etscfg.maxtcs;
+ ets->cbs = dcbxcfg->etscfg.cbs;
+ memcpy(ets->tc_tx_bw, dcbxcfg->etscfg.tcbwtable, sizeof(ets->tc_tx_bw));
+ memcpy(ets->tc_rx_bw, dcbxcfg->etscfg.tcbwtable, sizeof(ets->tc_rx_bw));
+ memcpy(ets->tc_tsa, dcbxcfg->etscfg.tsatable, sizeof(ets->tc_tsa));
+ memcpy(ets->prio_tc, dcbxcfg->etscfg.prio_table, sizeof(ets->prio_tc));
+ memcpy(ets->tc_reco_bw, dcbxcfg->etsrec.tcbwtable,
+ sizeof(ets->tc_reco_bw));
+ memcpy(ets->tc_reco_tsa, dcbxcfg->etsrec.tsatable,
+ sizeof(ets->tc_reco_tsa));
+ memcpy(ets->reco_prio_tc, dcbxcfg->etscfg.prio_table,
+ sizeof(ets->reco_prio_tc));
+
+ return 0;
+}
+
+/**
+ * ice_dcbnl_setets - set IEEE ETS configuration
+ * @netdev: pointer to relevant netdev
+ * @ets: struct to hold ETS configuration
+ */
+static int ice_dcbnl_setets(struct net_device *netdev, struct ieee_ets *ets)
+{
+ struct ice_pf *pf = ice_netdev_to_pf(netdev);
+ struct ice_dcbx_cfg *new_cfg;
+ int bwcfg = 0, bwrec = 0;
+ int err, i;
+
+ if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) ||
+ !(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE))
+ return -EINVAL;
+
+ new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
+
+ mutex_lock(&pf->tc_mutex);
+#ifdef NETIF_F_HW_TC
+ if (ice_is_adq_active(pf)) {
+ netdev_err(netdev, "can't set DCB configuration when ADQ is active\n");
+ err = ICE_DCB_NO_HW_CHG;
+ goto ets_out;
+ }
+#endif /* NETIF_F_HW_TC */
+
+ new_cfg->etscfg.willing = ets->willing;
+ new_cfg->etscfg.cbs = ets->cbs;
+ ice_for_each_traffic_class(i) {
+ new_cfg->etscfg.tcbwtable[i] = ets->tc_tx_bw[i];
+ bwcfg += ets->tc_tx_bw[i];
+ new_cfg->etscfg.tsatable[i] = ets->tc_tsa[i];
+ if (new_cfg->pfc_mode == ICE_QOS_MODE_VLAN) {
+ /* in DSCP mode up->tc mapping cannot change */
+ new_cfg->etscfg.prio_table[i] = ets->prio_tc[i];
+ new_cfg->etsrec.prio_table[i] = ets->reco_prio_tc[i];
+ }
+ new_cfg->etsrec.tcbwtable[i] = ets->tc_reco_bw[i];
+ bwrec += ets->tc_reco_bw[i];
+ new_cfg->etsrec.tsatable[i] = ets->tc_reco_tsa[i];
+ }
+
+ if (ice_dcb_bwchk(pf, new_cfg)) {
+ err = -EINVAL;
+ goto ets_out;
+ }
+
+ new_cfg->etscfg.maxtcs = pf->hw.func_caps.common_cap.maxtc;
+
+ if (!bwrec)
+ new_cfg->etsrec.tcbwtable[0] = 100;
+
+ err = ice_pf_dcb_cfg(pf, new_cfg, true);
+ /* return of zero indicates new cfg applied */
+ if (err == ICE_DCB_HW_CHG_RST)
+ ice_dcbnl_devreset(netdev);
+ if (err == ICE_DCB_NO_HW_CHG)
+ err = ICE_DCB_HW_CHG_RST;
+
+ets_out:
+ mutex_unlock(&pf->tc_mutex);
+ return err;
+}
+
+/**
+ * ice_dcbnl_getnumtcs - Get max number of traffic classes supported
+ * @dev: pointer to netdev struct
+ * @tcid: TC ID
+ *
@num: total number of TCs supported by the adapter + * + * Return the total number of TCs supported + */ +static int +ice_dcbnl_getnumtcs(struct net_device *dev, int __always_unused tcid, u8 *num) +{ + struct ice_pf *pf = ice_netdev_to_pf(dev); + + if (!test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags)) + return -EINVAL; + + *num = pf->hw.func_caps.common_cap.maxtc; + return 0; +} + +/** + * ice_dcbnl_getdcbx - retrieve current DCBX capability + * @netdev: pointer to the netdev struct + */ +static u8 ice_dcbnl_getdcbx(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + return pf->dcbx_cap; +} + +/** + * ice_dcbnl_setdcbx - set required DCBX capability + * @netdev: the corresponding netdev + * @mode: required mode + */ +static u8 ice_dcbnl_setdcbx(struct net_device *netdev, u8 mode) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_qos_cfg *qos_cfg; + + /* if FW LLDP agent is running, DCBNL not allowed to change mode */ + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) + return ICE_DCB_NO_HW_CHG; + + /* No support for LLD_MANAGED modes or CEE+IEEE */ + if ((mode & DCB_CAP_DCBX_LLD_MANAGED) || + ((mode & DCB_CAP_DCBX_VER_IEEE) && (mode & DCB_CAP_DCBX_VER_CEE)) || + !(mode & DCB_CAP_DCBX_HOST)) + return ICE_DCB_NO_HW_CHG; + + /* Already set to the given mode no change */ + if (mode == pf->dcbx_cap) + return ICE_DCB_NO_HW_CHG; + + pf->dcbx_cap = mode; + qos_cfg = &pf->hw.port_info->qos_cfg; + if (mode & DCB_CAP_DCBX_VER_CEE) { + if (qos_cfg->local_dcbx_cfg.pfc_mode == ICE_QOS_MODE_DSCP) + return ICE_DCB_NO_HW_CHG; + qos_cfg->local_dcbx_cfg.dcbx_mode = ICE_DCBX_MODE_CEE; + } else { + qos_cfg->local_dcbx_cfg.dcbx_mode = ICE_DCBX_MODE_IEEE; + } + + dev_info(ice_pf_to_dev(pf), "DCBx mode = 0x%x\n", mode); + return ICE_DCB_HW_CHG_RST; +} + +/** + * ice_dcbnl_get_perm_hw_addr - MAC address used by DCBX + * @netdev: pointer to netdev struct + * @perm_addr: buffer to return permanent MAC address + */ +static void ice_dcbnl_get_perm_hw_addr(struct net_device *netdev, u8 *perm_addr) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + int i, j; + + memset(perm_addr, 0xff, MAX_ADDR_LEN); + + for (i = 0; i < netdev->addr_len; i++) + perm_addr[i] = pi->mac.perm_addr[i]; + + for (j = 0; j < netdev->addr_len; j++, i++) + perm_addr[i] = pi->mac.perm_addr[j]; +} + +/** + * ice_get_pfc_delay - Retrieve PFC Link Delay + * @hw: pointer to HW struct + * @delay: holds the PFC Link Delay value + */ +static void ice_get_pfc_delay(struct ice_hw *hw, u16 *delay) +{ + u32 val; + + val = rd32(hw, PRTDCB_GENC); + *delay = (u16)((val & PRTDCB_GENC_PFCLDA_M) >> PRTDCB_GENC_PFCLDA_S); +} + +/** + * ice_dcbnl_getpfc - retrieve local IEEE PFC config + * @netdev: pointer to netdev struct + * @pfc: struct to hold PFC info + */ +static int ice_dcbnl_getpfc(struct net_device *netdev, struct ieee_pfc *pfc) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + struct ice_dcbx_cfg *dcbxcfg; + int i; + + dcbxcfg = &pi->qos_cfg.local_dcbx_cfg; + pfc->pfc_cap = dcbxcfg->pfc.pfccap; + pfc->pfc_en = dcbxcfg->pfc.pfcena; + pfc->mbc = dcbxcfg->pfc.mbc; + ice_get_pfc_delay(&pf->hw, &pfc->delay); + + ice_for_each_traffic_class(i) { + pfc->requests[i] = pf->stats.priority_xoff_tx[i]; + pfc->indications[i] = pf->stats.priority_xoff_rx[i]; + } + + return 0; +} + +/** + * ice_dcbnl_setpfc - set local IEEE PFC config + * @netdev: pointer to relevant netdev + * @pfc: pointer to struct holding PFC config + */ +static int 
ice_dcbnl_setpfc(struct net_device *netdev, struct ieee_pfc *pfc) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + int err; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + mutex_lock(&pf->tc_mutex); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, "can't set DCB configuration when ADQ is active\n"); + err = ICE_DCB_NO_HW_CHG; + goto pfc_out; + } +#endif /* NETIF_F_HW_TC */ + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + if (pfc->pfc_cap) + new_cfg->pfc.pfccap = pfc->pfc_cap; + else + new_cfg->pfc.pfccap = pf->hw.func_caps.common_cap.maxtc; + + new_cfg->pfc.pfcena = pfc->pfc_en; + + err = ice_pf_dcb_cfg(pf, new_cfg, true); + if (err == ICE_DCB_HW_CHG_RST) + ice_dcbnl_devreset(netdev); + if (err == ICE_DCB_NO_HW_CHG) + err = ICE_DCB_HW_CHG_RST; +#ifdef NETIF_F_HW_TC +pfc_out: +#endif /* NETIF_F_HW_TC */ + mutex_unlock(&pf->tc_mutex); + return err; +} + +/** + * ice_dcbnl_get_pfc_cfg - Get CEE PFC config + * @netdev: pointer to netdev struct + * @prio: corresponding user priority + * @setting: the PFC setting for given priority + */ +static void +ice_dcbnl_get_pfc_cfg(struct net_device *netdev, int prio, u8 *setting) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (prio >= ICE_MAX_USER_PRIORITY) + return; + + *setting = (pi->qos_cfg.local_dcbx_cfg.pfc.pfcena >> prio) & 0x1; + dev_dbg(ice_pf_to_dev(pf), "Get PFC Config up=%d, setting=%d, pfcenable=0x%x\n", + prio, *setting, pi->qos_cfg.local_dcbx_cfg.pfc.pfcena); +} + +/** + * ice_dcbnl_set_pfc_cfg - Set CEE PFC config + * @netdev: the corresponding netdev + * @prio: User Priority + * @set: PFC setting to apply + */ +static void ice_dcbnl_set_pfc_cfg(struct net_device *netdev, int prio, u8 set) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (prio >= ICE_MAX_USER_PRIORITY) + return; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + new_cfg->pfc.pfccap = pf->hw.func_caps.common_cap.maxtc; + if (set) + new_cfg->pfc.pfcena |= BIT(prio); + else + new_cfg->pfc.pfcena &= ~BIT(prio); + + dev_dbg(ice_pf_to_dev(pf), "Set PFC config UP:%d set:%d pfcena:0x%x\n", + prio, set, new_cfg->pfc.pfcena); +} + +/** + * ice_dcbnl_getpfcstate - get CEE PFC mode + * @netdev: pointer to netdev struct + */ +static u8 ice_dcbnl_getpfcstate(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + + /* Return enabled if any UP enabled for PFC */ + if (pi->qos_cfg.local_dcbx_cfg.pfc.pfcena) + return 1; + + return 0; +} + +/** + * ice_dcbnl_getstate - get DCB enabled state + * @netdev: pointer to netdev struct + */ +static u8 ice_dcbnl_getstate(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + u8 state = 0; + + state = test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); + + dev_dbg(ice_pf_to_dev(pf), "DCB enabled state = %d\n", state); + return state; +} + +/** + * ice_dcbnl_setstate - Set CEE DCB state + * @netdev: pointer to relevant netdev + * @state: state value to set + */ +static u8 ice_dcbnl_setstate(struct net_device *netdev, u8 state) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + if ((pf->dcbx_cap & 
DCB_CAP_DCBX_LLD_MANAGED) ||
+ !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
+ return ICE_DCB_NO_HW_CHG;
+
+ /* Nothing to do */
+ if (!!state == test_bit(ICE_FLAG_DCB_ENA, pf->flags))
+ return ICE_DCB_NO_HW_CHG;
+
+ if (state) {
+ set_bit(ICE_FLAG_DCB_ENA, pf->flags);
+ memcpy(&pf->hw.port_info->qos_cfg.desired_dcbx_cfg,
+ &pf->hw.port_info->qos_cfg.local_dcbx_cfg,
+ sizeof(struct ice_dcbx_cfg));
+ } else {
+ clear_bit(ICE_FLAG_DCB_ENA, pf->flags);
+ }
+
+ return ICE_DCB_HW_CHG;
+}
+
+/**
+ * ice_dcbnl_get_pg_tc_cfg_tx - get CEE PG Tx config
+ * @netdev: pointer to netdev struct
+ * @prio: the corresponding user priority
+ * @prio_type: traffic priority type
+ * @pgid: the BW group ID the traffic class belongs to
+ * @bw_pct: BW percentage for the corresponding BWG
+ * @up_map: prio mapped to corresponding TC
+ */
+static void
+ice_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int prio,
+ u8 __always_unused *prio_type, u8 *pgid,
+ u8 __always_unused *bw_pct,
+ u8 __always_unused *up_map)
+{
+ struct ice_pf *pf = ice_netdev_to_pf(netdev);
+ struct ice_port_info *pi = pf->hw.port_info;
+
+ if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) ||
+ !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
+ return;
+
+ if (prio >= ICE_MAX_USER_PRIORITY)
+ return;
+
+ *pgid = pi->qos_cfg.local_dcbx_cfg.etscfg.prio_table[prio];
+ dev_dbg(ice_pf_to_dev(pf), "Get PG config prio=%d tc=%d\n", prio,
+ *pgid);
+}
+
+/**
+ * ice_dcbnl_set_pg_tc_cfg_tx - set CEE PG Tx config
+ * @netdev: pointer to relevant netdev
+ * @tc: the corresponding traffic class
+ * @prio_type: the traffic priority type
+ * @bwg_id: the BW group ID the TC belongs to
+ * @bw_pct: the BW percentage for the BWG
+ * @up_map: prio mapped to corresponding TC
+ */
+static void
+ice_dcbnl_set_pg_tc_cfg_tx(struct net_device *netdev, int tc,
+ u8 __always_unused prio_type,
+ u8 __always_unused bwg_id,
+ u8 __always_unused bw_pct, u8 up_map)
+{
+ struct ice_pf *pf = ice_netdev_to_pf(netdev);
+ struct ice_dcbx_cfg *new_cfg;
+ int i;
+
+ if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) ||
+ !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
+ return;
+
+ if (tc >= ICE_MAX_TRAFFIC_CLASS)
+ return;
+
+ new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
+
+ /* prio_type, bwg_id and bw_pct per UP are not supported */
+
+ ice_for_each_traffic_class(i) {
+ if (up_map & BIT(i))
+ new_cfg->etscfg.prio_table[i] = tc;
+ }
+ new_cfg->etscfg.tsatable[tc] = ICE_IEEE_TSA_ETS;
+}
+
+/**
+ * ice_dcbnl_get_pg_bwg_cfg_tx - Get CEE PGBW config
+ * @netdev: pointer to the netdev struct
+ * @pgid: corresponding traffic class
+ * @bw_pct: the BW percentage for the corresponding TC
+ */
+static void
+ice_dcbnl_get_pg_bwg_cfg_tx(struct net_device *netdev, int pgid, u8 *bw_pct)
+{
+ struct ice_pf *pf = ice_netdev_to_pf(netdev);
+ struct ice_port_info *pi = pf->hw.port_info;
+
+ if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) ||
+ !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
+ return;
+
+ if (pgid >= ICE_MAX_TRAFFIC_CLASS)
+ return;
+
+ *bw_pct = pi->qos_cfg.local_dcbx_cfg.etscfg.tcbwtable[pgid];
+ dev_dbg(ice_pf_to_dev(pf), "Get PG BW config tc=%d bw_pct=%d\n",
+ pgid, *bw_pct);
+}
+
+/**
+ * ice_dcbnl_set_pg_bwg_cfg_tx - set CEE PG Tx BW config
+ * @netdev: the corresponding netdev
+ * @pgid: Corresponding traffic class
+ * @bw_pct: the BW percentage for the specified TC
+ */
+static void
+ice_dcbnl_set_pg_bwg_cfg_tx(struct net_device *netdev, int pgid, u8 bw_pct)
+{
+ struct ice_pf *pf = ice_netdev_to_pf(netdev);
+ struct ice_dcbx_cfg *new_cfg;
+
+ if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) ||
+
!(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (pgid >= ICE_MAX_TRAFFIC_CLASS) + return; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + new_cfg->etscfg.tcbwtable[pgid] = bw_pct; +} + +/** + * ice_dcbnl_get_pg_tc_cfg_rx - Get CEE PG Rx config + * @netdev: pointer to netdev struct + * @prio: the corresponding user priority + * @prio_type: the traffic priority type + * @pgid: the PG ID + * @bw_pct: the BW percentage for the corresponding BWG + * @up_map: prio mapped to corresponding TC + */ +static void +ice_dcbnl_get_pg_tc_cfg_rx(struct net_device *netdev, int prio, + u8 __always_unused *prio_type, u8 *pgid, + u8 __always_unused *bw_pct, + u8 __always_unused *up_map) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_port_info *pi = pf->hw.port_info; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + if (prio >= ICE_MAX_USER_PRIORITY) + return; + + *pgid = pi->qos_cfg.local_dcbx_cfg.etscfg.prio_table[prio]; +} + +/** + * ice_dcbnl_set_pg_tc_cfg_rx + * @netdev: relevant netdev struct + * @prio: corresponding user priority + * @prio_type: the traffic priority type + * @pgid: the PG ID + * @bw_pct: BW percentage for corresponding BWG + * @up_map: prio mapped to corresponding TC + * + * lldpad requires this function pointer to be non-NULL to complete CEE config. + */ +static void +ice_dcbnl_set_pg_tc_cfg_rx(struct net_device *netdev, + int __always_unused prio, + u8 __always_unused prio_type, + u8 __always_unused pgid, + u8 __always_unused bw_pct, + u8 __always_unused up_map) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + dev_dbg(ice_pf_to_dev(pf), "Rx TC PG Config Not Supported.\n"); +} + +/** + * ice_dcbnl_get_pg_bwg_cfg_rx - Get CEE PG BW Rx config + * @netdev: pointer to netdev struct + * @pgid: the corresponding traffic class + * @bw_pct: the BW percentage for the corresponding TC + */ +static void +ice_dcbnl_get_pg_bwg_cfg_rx(struct net_device *netdev, int __always_unused pgid, + u8 *bw_pct) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return; + + *bw_pct = 0; +} + +/** + * ice_dcbnl_set_pg_bwg_cfg_rx + * @netdev: the corresponding netdev + * @pgid: corresponding TC + * @bw_pct: BW percentage for given TC + * + * lldpad requires this function pointer to be non-NULL to complete CEE config. 
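+ * Rx bandwidth-group configuration is not supported by the hardware, so + * this handler only emits a debug message.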
+ */ +static void +ice_dcbnl_set_pg_bwg_cfg_rx(struct net_device *netdev, int __always_unused pgid, + u8 __always_unused bw_pct) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + dev_dbg(ice_pf_to_dev(pf), "Rx BWG PG Config Not Supported.\n"); +} + +/** + * ice_dcbnl_get_cap - Get DCBX capabilities of adapter + * @netdev: pointer to netdev struct + * @capid: the capability type + * @cap: the capability value + */ +static u8 ice_dcbnl_get_cap(struct net_device *netdev, int capid, u8 *cap) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + + if (!(test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags))) + return ICE_DCB_NO_HW_CHG; + + switch (capid) { + case DCB_CAP_ATTR_PG: + *cap = true; + break; + case DCB_CAP_ATTR_PFC: + *cap = true; + break; + case DCB_CAP_ATTR_UP2TC: + *cap = false; + break; + case DCB_CAP_ATTR_PG_TCS: + *cap = 0x80; + break; + case DCB_CAP_ATTR_PFC_TCS: + *cap = 0x80; + break; + case DCB_CAP_ATTR_GSP: + *cap = false; + break; + case DCB_CAP_ATTR_BCN: + *cap = false; + break; + case DCB_CAP_ATTR_DCBX: + *cap = pf->dcbx_cap; + break; + default: + *cap = false; + break; + } + + dev_dbg(ice_pf_to_dev(pf), "DCBX Get Capability cap=%d capval=0x%x\n", + capid, *cap); + return 0; +} + +/** + * ice_dcbnl_getapp - get CEE APP + * @netdev: pointer to netdev struct + * @idtype: the App selector + * @id: the App ethtype or port number + */ +#ifdef HAVE_DCBNL_OPS_SETAPP_RETURN_INT +static int ice_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id) +#else +static u8 ice_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id) +#endif /* HAVE_DCBNL_OPS_SETAPP_RETURN_INT */ +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct dcb_app app = { + .selector = idtype, + .protocol = id, + }; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return -EINVAL; + + return dcb_getapp(netdev, &app); +} + +/** + * ice_dcbnl_find_app - Search for APP in given DCB config + * @cfg: struct to hold DCBX config + * @app: struct to hold app data to look for + */ +static bool +ice_dcbnl_find_app(struct ice_dcbx_cfg *cfg, + struct ice_dcb_app_priority_table *app) +{ + unsigned int i; + + for (i = 0; i < cfg->numapps; i++) { + if (app->selector == cfg->app[i].selector && + app->prot_id == cfg->app[i].prot_id && + app->priority == cfg->app[i].priority) + return true; + } + + return false; +} + +#define ICE_BYTES_PER_DSCP_VAL 8 + +/** + * ice_dcbnl_setapp - set local IEEE App config + * @netdev: relevant netdev struct + * @app: struct to hold app config info + */ +static int ice_dcbnl_setapp(struct net_device *netdev, struct dcb_app *app) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcb_app_priority_table new_app; + struct ice_dcbx_cfg *old_cfg, *new_cfg; + u8 max_tc; + int ret; + + /* ONLY DSCP APP TLVs have operational significance */ + if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP) + return -EINVAL; + + /* only allow APP TLVs in SW Mode */ + if (pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) { + netdev_err(netdev, "can't do DSCP QoS when FW DCB agent active\n"); + return -EINVAL; + } + + if (!(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + if (!ice_is_feature_supported(pf, ICE_F_DSCP)) + return -EOPNOTSUPP; + +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, + "can't set DCB configuration when ADQ is active\n"); + return ICE_DCB_NO_HW_CHG; + } +#endif /* NETIF_F_HW_TC */ + if (app->protocol >= ICE_DSCP_NUM_VAL) { + netdev_err(netdev, "DSCP value 0x%04X out of range\n", + app->protocol); + return 
-EINVAL; + } + + max_tc = pf->hw.func_caps.common_cap.maxtc; + if (app->priority >= max_tc) { + netdev_err(netdev, "TC %d out of range, max TC %d\n", + app->priority, max_tc); + return -EINVAL; + } + + /* grab TC mutex */ + mutex_lock(&pf->tc_mutex); + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + old_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + + ret = dcb_ieee_setapp(netdev, app); + if (ret) + goto setapp_out; + + if (test_and_set_bit(app->protocol, new_cfg->dscp_mapped)) { + netdev_err(netdev, "DSCP value 0x%04X already user mapped\n", + app->protocol); + ret = dcb_ieee_delapp(netdev, app); + if (ret) + netdev_err(netdev, "Failed to delete re-mapping TLV\n"); + ret = -EINVAL; + goto setapp_out; + } + + new_app.selector = app->selector; + new_app.prot_id = app->protocol; + new_app.priority = app->priority; + + /* If port is not in DSCP mode, need to set */ + if (old_cfg->pfc_mode == ICE_QOS_MODE_VLAN) { + int i, j; + + /* set DSCP mode */ + ret = ice_aq_set_pfc_mode(&pf->hw, ICE_AQC_PFC_DSCP_BASED_PFC, + NULL); + if (ret) { + netdev_err(netdev, "Failed to set DSCP PFC mode %d\n", + ret); + goto setapp_out; + } + netdev_info(netdev, "Switched QoS to L3 DSCP mode\n"); + + new_cfg->pfc_mode = ICE_QOS_MODE_DSCP; + + /* set default DSCP QoS values */ + new_cfg->etscfg.willing = 0; + new_cfg->pfc.pfccap = max_tc; + new_cfg->pfc.willing = 0; + + for (i = 0; i < max_tc; i++) + for (j = 0; j < ICE_BYTES_PER_DSCP_VAL; j++) { + int dscp, offset; + + dscp = (i * max_tc) + j; + offset = max_tc * ICE_BYTES_PER_DSCP_VAL; + + new_cfg->dscp_map[dscp] = i; + /* if fewer than 8 TCs are supported */ + if (max_tc < ICE_MAX_TRAFFIC_CLASS) + new_cfg->dscp_map[dscp + offset] = i; + } + + new_cfg->etscfg.tcbwtable[0] = 100; + new_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; + new_cfg->etscfg.prio_table[0] = 0; + + for (i = 1; i < max_tc; i++) { + new_cfg->etscfg.tcbwtable[i] = 0; + new_cfg->etscfg.tsatable[i] = ICE_IEEE_TSA_ETS; + new_cfg->etscfg.prio_table[i] = i; + } + } /* end of switching to DSCP mode */ + + /* apply new mapping for this DSCP value */ + new_cfg->dscp_map[app->protocol] = app->priority; + new_cfg->app[new_cfg->numapps++] = new_app; + + ret = ice_pf_dcb_cfg(pf, new_cfg, true); + /* return of ICE_DCB_HW_CHG_RST indicates new cfg applied */ + if (ret == ICE_DCB_HW_CHG_RST) + ice_dcbnl_devreset(netdev); + else + ret = ICE_DCB_NO_HW_CHG; + +setapp_out: + mutex_unlock(&pf->tc_mutex); + return ret; +} + +/** + * ice_dcbnl_delapp - Delete local IEEE App config + * @netdev: relevant netdev + * @app: struct to hold app to delete + * + * Will not delete the first application required by the FW + */ +static int ice_dcbnl_delapp(struct net_device *netdev, struct dcb_app *app) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *old_cfg, *new_cfg; + unsigned int i, j; + int ret = 0; + + if (pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) { + netdev_err(netdev, "can't delete DSCP netlink app when FW DCB agent is active\n"); + return -EINVAL; + } + + mutex_lock(&pf->tc_mutex); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, "can't set DCB configuration when ADQ is active\n"); + ret = ICE_DCB_NO_HW_CHG; + goto delapp_out; + } +#endif /* NETIF_F_HW_TC */ + old_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + + ret = dcb_ieee_delapp(netdev, app); + if (ret) + goto delapp_out; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + for (i = 0; i < new_cfg->numapps; i++) { + if (app->selector == new_cfg->app[i].selector && + app->protocol == new_cfg->app[i].prot_id
&& + app->priority == new_cfg->app[i].priority) { + new_cfg->app[i].selector = 0; + new_cfg->app[i].prot_id = 0; + new_cfg->app[i].priority = 0; + break; + } + } + + /* Did not find DCB App */ + if (i == new_cfg->numapps) { + ret = -EINVAL; + goto delapp_out; + } + + new_cfg->numapps--; + + for (j = i; j < new_cfg->numapps; j++) { + new_cfg->app[j].selector = old_cfg->app[j + 1].selector; + new_cfg->app[j].prot_id = old_cfg->app[j + 1].prot_id; + new_cfg->app[j].priority = old_cfg->app[j + 1].priority; + } + + /* if not a DSCP APP TLV or DSCP is not supported, we are done */ + if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP || + !ice_is_feature_supported(pf, ICE_F_DSCP)) { + ret = ICE_DCB_HW_CHG; + goto delapp_out; + } + + /* if DSCP TLV, then need to address change in mapping */ + clear_bit(app->protocol, new_cfg->dscp_mapped); + /* remap this DSCP value to default value */ + new_cfg->dscp_map[app->protocol] = app->protocol % + ICE_BYTES_PER_DSCP_VAL; + + /* if the last DSCP mapping just got deleted, need to switch + * to L2 VLAN QoS mode + */ + if (bitmap_empty(new_cfg->dscp_mapped, ICE_DSCP_NUM_VAL) && + new_cfg->pfc_mode == ICE_QOS_MODE_DSCP) { + ret = ice_aq_set_pfc_mode(&pf->hw, + ICE_AQC_PFC_VLAN_BASED_PFC, + NULL); + if (ret) { + netdev_info(netdev, "Failed to set VLAN PFC mode %d\n", + ret); + goto delapp_out; + } + netdev_info(netdev, "Switched QoS to L2 VLAN mode\n"); + + new_cfg->pfc_mode = ICE_QOS_MODE_VLAN; + + ret = ice_dcb_sw_dflt_cfg(pf, true, true); + } else { + ret = ice_pf_dcb_cfg(pf, new_cfg, true); + } + + /* return of ICE_DCB_HW_CHG_RST indicates new cfg applied + * and reset needs to be performed + */ + if (ret == ICE_DCB_HW_CHG_RST) + ice_dcbnl_devreset(netdev); + + /* if the change was not significant enough to actually call + * the reconfiguration flow, we still need to tell the caller that + * their request was successfully handled + */ + if (ret == ICE_DCB_NO_HW_CHG) + ret = ICE_DCB_HW_CHG; + +delapp_out: + mutex_unlock(&pf->tc_mutex); + return ret; +} + +/** + * ice_dcbnl_cee_set_all - Commit CEE DCB settings to HW + * @netdev: the corresponding netdev + */ +static u8 ice_dcbnl_cee_set_all(struct net_device *netdev) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_dcbx_cfg *new_cfg; + int err; + + if ((pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) || + !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return ICE_DCB_NO_HW_CHG; + + new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg; + + mutex_lock(&pf->tc_mutex); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, "can't set DCB configuration when ADQ is active\n"); + err = ICE_DCB_NO_HW_CHG; + goto out; + } +#endif /* NETIF_F_HW_TC */ + + err = ice_pf_dcb_cfg(pf, new_cfg, true); + +#ifdef NETIF_F_HW_TC +out: +#endif /* NETIF_F_HW_TC */ + mutex_unlock(&pf->tc_mutex); + return (err != ICE_DCB_HW_CHG_RST) ?
ICE_DCB_NO_HW_CHG : err; +} + +static const struct dcbnl_rtnl_ops dcbnl_ops = { + /* IEEE 802.1Qaz std */ + .ieee_getets = ice_dcbnl_getets, + .ieee_setets = ice_dcbnl_setets, + .ieee_getpfc = ice_dcbnl_getpfc, + .ieee_setpfc = ice_dcbnl_setpfc, + .ieee_setapp = ice_dcbnl_setapp, + .ieee_delapp = ice_dcbnl_delapp, + + /* CEE std */ + .getstate = ice_dcbnl_getstate, + .setstate = ice_dcbnl_setstate, + .getpermhwaddr = ice_dcbnl_get_perm_hw_addr, + .setpgtccfgtx = ice_dcbnl_set_pg_tc_cfg_tx, + .setpgbwgcfgtx = ice_dcbnl_set_pg_bwg_cfg_tx, + .setpgtccfgrx = ice_dcbnl_set_pg_tc_cfg_rx, + .setpgbwgcfgrx = ice_dcbnl_set_pg_bwg_cfg_rx, + .getpgtccfgtx = ice_dcbnl_get_pg_tc_cfg_tx, + .getpgbwgcfgtx = ice_dcbnl_get_pg_bwg_cfg_tx, + .getpgtccfgrx = ice_dcbnl_get_pg_tc_cfg_rx, + .getpgbwgcfgrx = ice_dcbnl_get_pg_bwg_cfg_rx, + .setpfccfg = ice_dcbnl_set_pfc_cfg, + .getpfccfg = ice_dcbnl_get_pfc_cfg, + .setall = ice_dcbnl_cee_set_all, + .getcap = ice_dcbnl_get_cap, + .getnumtcs = ice_dcbnl_getnumtcs, + .getpfcstate = ice_dcbnl_getpfcstate, + .getapp = ice_dcbnl_getapp, + + /* DCBX configuration */ + .getdcbx = ice_dcbnl_getdcbx, + .setdcbx = ice_dcbnl_setdcbx, +}; + +/** + * ice_dcbnl_set_all - set all the apps and ieee data from DCBX config + * @vsi: pointer to VSI struct + */ +void ice_dcbnl_set_all(struct ice_vsi *vsi) +{ + struct net_device *netdev = vsi->netdev; + struct ice_dcbx_cfg *dcbxcfg; + struct ice_port_info *pi; + struct dcb_app sapp; + struct ice_pf *pf; + unsigned int i; + + if (!netdev) + return; + + pf = ice_netdev_to_pf(netdev); + pi = pf->hw.port_info; + + /* SW DCB taken care of by SW Default Config */ + if (pf->dcbx_cap & DCB_CAP_DCBX_HOST) + return; + + /* DCB not enabled */ + if (!test_bit(ICE_FLAG_DCB_ENA, pf->flags)) + return; + + dcbxcfg = &pi->qos_cfg.local_dcbx_cfg; + + for (i = 0; i < dcbxcfg->numapps; i++) { + u8 prio, tc_map; + + prio = dcbxcfg->app[i].priority; + tc_map = BIT(dcbxcfg->etscfg.prio_table[prio]); + + /* Add APP only if the TC is enabled for this VSI */ + if (tc_map & vsi->tc_cfg.ena_tc) { + sapp.selector = dcbxcfg->app[i].selector; + sapp.protocol = dcbxcfg->app[i].prot_id; + sapp.priority = prio; + dcb_ieee_setapp(netdev, &sapp); + } + } +#ifdef HAVE_DCBNL_IEEE_DELAPP + /* Notify user-space of the changes */ + dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, 0, 0); +#endif /* HAVE_DCBNL_IEEE_DELAPP */ +} + +/** + * ice_dcbnl_vsi_del_app - Delete APP on all VSIs + * @vsi: pointer to the main VSI + * @app: APP to delete + * + * Delete given APP from all the VSIs for given PF + */ +static void +ice_dcbnl_vsi_del_app(struct ice_vsi *vsi, + struct ice_dcb_app_priority_table *app) +{ + struct dcb_app sapp; + int err; + + sapp.selector = app->selector; + sapp.protocol = app->prot_id; + sapp.priority = app->priority; + err = ice_dcbnl_delapp(vsi->netdev, &sapp); + dev_dbg(ice_pf_to_dev(vsi->back), "Deleting app for VSI idx=%d err=%d sel=%d proto=0x%x, prio=%d\n", + vsi->idx, err, app->selector, app->prot_id, app->priority); +} + +/** + * ice_dcbnl_flush_apps - Delete all removed APPs + * @pf: the corresponding PF + * @old_cfg: old DCBX configuration data + * @new_cfg: new DCBX configuration data + * + * Find and delete all APPS that are not present in the passed + * DCB configuration + */ +void +ice_dcbnl_flush_apps(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, + struct ice_dcbx_cfg *new_cfg) +{ + struct ice_vsi *main_vsi = ice_get_main_vsi(pf); + unsigned int i; + + if (!main_vsi) + return; + + for (i = 0; i < old_cfg->numapps; i++) { + struct 
ice_dcb_app_priority_table app = old_cfg->app[i]; + + /* The APP is not available anymore, delete it */ + if (!ice_dcbnl_find_app(new_cfg, &app)) + ice_dcbnl_vsi_del_app(main_vsi, &app); + } +} + +/** + * ice_dcbnl_setup - setup DCBNL + * @vsi: VSI to get associated netdev from + */ +void ice_dcbnl_setup(struct ice_vsi *vsi) +{ + struct net_device *netdev = vsi->netdev; + struct ice_pf *pf; + + pf = ice_netdev_to_pf(netdev); + if (!test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags)) + return; + + netdev->dcbnl_ops = &dcbnl_ops; + ice_dcbnl_set_all(vsi); +} diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.h b/drivers/net/ethernet/intel/ice/ice_dcb_nl.h new file mode 100644 index 0000000000000000000000000000000000000000..69bea270e372cc9271774f0b3c610d53761eccd5 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_DCB_NL_H_ +#define _ICE_DCB_NL_H_ + +#ifdef CONFIG_DCB +void ice_dcbnl_setup(struct ice_vsi *vsi); +void ice_dcbnl_set_all(struct ice_vsi *vsi); +void +ice_dcbnl_flush_apps(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, + struct ice_dcbx_cfg *new_cfg); +#else +static inline void ice_dcbnl_setup(struct ice_vsi *vsi) { } +static inline void ice_dcbnl_set_all(struct ice_vsi *vsi) { } +static inline void ice_dcbnl_flush_apps(struct ice_pf *pf, + struct ice_dcbx_cfg *old_cfg, + struct ice_dcbx_cfg *new_cfg) { } +#endif /* CONFIG_DCB */ +#endif /* _ICE_DCB_NL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcf.c b/drivers/net/ethernet/intel/ice/ice_dcf.c new file mode 100644 index 0000000000000000000000000000000000000000..7810116d9d3b2eaa12b9e3b9f33c35ee317976d1 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_dcf.c @@ -0,0 +1,1137 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation.
*/ + +#include "ice.h" + +static const enum ice_adminq_opc aqc_permitted_tbl[] = { + /* Generic Firmware Admin commands */ + ice_aqc_opc_get_ver, + ice_aqc_opc_req_res, + ice_aqc_opc_release_res, + ice_aqc_opc_list_func_caps, + ice_aqc_opc_list_dev_caps, + + ice_aqc_opc_get_vlan_mode_parameters, + + /* Package Configuration Admin Commands */ + ice_aqc_opc_update_pkg, + ice_aqc_opc_get_pkg_info_list, + + /* PHY commands */ + ice_aqc_opc_get_phy_caps, + ice_aqc_opc_get_link_status, + + /* Switch Block */ + ice_aqc_opc_get_sw_cfg, + ice_aqc_opc_alloc_res, + ice_aqc_opc_free_res, + ice_aqc_opc_add_recipe, + ice_aqc_opc_recipe_to_profile, + ice_aqc_opc_get_recipe, + ice_aqc_opc_get_recipe_to_profile, + ice_aqc_opc_add_sw_rules, + ice_aqc_opc_update_sw_rules, + ice_aqc_opc_remove_sw_rules, + + /* ACL commands */ + ice_aqc_opc_alloc_acl_tbl, + ice_aqc_opc_dealloc_acl_tbl, + ice_aqc_opc_alloc_acl_actpair, + ice_aqc_opc_dealloc_acl_actpair, + ice_aqc_opc_alloc_acl_scen, + ice_aqc_opc_dealloc_acl_scen, + ice_aqc_opc_alloc_acl_counters, + ice_aqc_opc_dealloc_acl_counters, + ice_aqc_opc_dealloc_acl_res, + ice_aqc_opc_update_acl_scen, + ice_aqc_opc_program_acl_actpair, + ice_aqc_opc_program_acl_prof_extraction, + ice_aqc_opc_program_acl_prof_ranges, + ice_aqc_opc_program_acl_entry, + ice_aqc_opc_query_acl_prof, + ice_aqc_opc_query_acl_prof_ranges, + ice_aqc_opc_query_acl_scen, + ice_aqc_opc_query_acl_entry, + ice_aqc_opc_query_acl_actpair, + ice_aqc_opc_query_acl_counter, +}; + +/** + * ice_dcf_aq_cmd_permitted - validate the AdminQ command permitted or not + * @desc: descriptor describing the command + */ +bool ice_dcf_aq_cmd_permitted(struct ice_aq_desc *desc) +{ + u16 opc = le16_to_cpu(desc->opcode); + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(aqc_permitted_tbl); i++) + if (opc == aqc_permitted_tbl[i]) + return true; + + return false; +} + +/** + * ice_dcf_is_acl_aq_cmd - check if the AdminQ command is ACL command + * @desc: descriptor describing the command + */ +bool ice_dcf_is_acl_aq_cmd(struct ice_aq_desc *desc) +{ + u16 opc = le16_to_cpu(desc->opcode); + + if (opc >= ice_aqc_opc_alloc_acl_tbl && + opc <= ice_aqc_opc_query_acl_counter) + return true; + + return false; +} + +/** + * ice_dcf_is_udp_tunnel_aq_cmd - check if the AdminQ command is UDP tunnel + * command + * @desc: descriptor describing the command + * @aq_buf: AdminQ buffer + */ +bool ice_dcf_is_udp_tunnel_aq_cmd(struct ice_aq_desc *desc, u8 *aq_buf) +{ + struct ice_buf_hdr *pkg_buf; + + if (!aq_buf) + return false; + + if (le16_to_cpu(desc->opcode) != ice_aqc_opc_update_pkg) + return false; + + pkg_buf = (struct ice_buf_hdr *)aq_buf; + /* section count for udp tunnel command is always 2 */ + if (le16_to_cpu(pkg_buf->section_count) != 2) + return false; + + if (le32_to_cpu(pkg_buf->section_entry[0].type) == + ICE_SID_RXPARSER_BOOST_TCAM || + le32_to_cpu(pkg_buf->section_entry[0].type) == + ICE_SID_TXPARSER_BOOST_TCAM) + return true; + + return false; +} + +/** + * ice_check_dcf_allowed - check if DCF is allowed based on various checks + * @vf: pointer to the VF to check + */ +bool ice_check_dcf_allowed(struct ice_vf *vf) +{ + struct ice_switch_info *sw; + struct ice_pf *pf = vf->pf; + struct device *dev; + u16 i; + + dev = ice_pf_to_dev(pf); + + if (vf->vf_id != ICE_DCF_VFID0 && vf->vf_id != ICE_DCF_VFID1) { + dev_err(dev, "VF %d requested DCF capability, but only VF %d and %d are allowed to request DCF capability\n", + vf->vf_id, ICE_DCF_VFID0, ICE_DCF_VFID1); + return false; + } + + if (!vf->trusted) { +#ifdef 
HAVE_NDO_SET_VF_TRUST + dev_err(dev, "VF needs to be trusted to configure DCF capability\n"); + return false; +#else + + int ret; + ret = ice_set_vf_trust(ice_get_main_vsi(pf)->netdev, vf->vf_id, true); + if (ret) { + dev_err(dev, "Failed to set trusted VF to configure DCF capability.\n"); + return false; + } +#endif /* HAVE_NDO_SET_VF_TRUST */ + } + + /* DCF and ADQ are mutually exclusive. */ +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + dev_err(dev, "ADQ on PF is currently enabled. Device Control Functionality cannot be enabled.\n"); + return false; + } +#endif /* NETIF_F_HW_TC */ + ice_for_each_vf(pf, i) { + if (pf->vf[i].adq_enabled) { + dev_err(dev, "ADQ on VF %d is currently enabled. Device Control Functionality cannot be enabled.\n", + pf->vf[i].vf_id); + return false; + } + } + +#ifdef HAVE_TC_SETUP_CLSFLOWER + if (!hlist_empty(&pf->tc_flower_fltr_list)) { + dev_err(dev, "TC filters on PF are currently in use. Device Control Functionality cannot be enabled.\n"); + return false; + } +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + ice_for_each_vf(pf, i) { + if (pf->vf[i].num_dmac_chnl_fltrs) { + dev_err(dev, "TC filters on VF %d are currently in use. Device Control Functionality cannot be enabled.\n", + pf->vf[i].vf_id); + return false; + } + } + +#ifdef HAVE_NETDEV_SB_DEV + if (ice_is_offloaded_macvlan_ena(pf)) { + dev_err(dev, "L2 Forwarding Offload is currently enabled. Device Control Functionality cannot be enabled.\n"); + return false; + } +#endif /* HAVE_NETDEV_SB_DEV */ + + sw = pf->hw.switch_info; + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { + if (sw->recp_list[i].adv_rule) { + dev_err(dev, "Advanced switch filters are currently in use. Device Control Functionality cannot be enabled.\n"); + return false; + } + } + return true; +} + +/** + * ice_is_dcf_enabled - Check the DCF enabled status of the associated PF + * @pf: PF instance + */ +bool ice_is_dcf_enabled(struct ice_pf *pf) +{ + return !!pf->dcf.vf; +} + +/** + * ice_is_vf_dcf - helper to check if the assigned VF is a DCF + * @vf: the assigned VF to be checked + */ +bool ice_is_vf_dcf(struct ice_vf *vf) +{ + return vf == vf->pf->dcf.vf; +} + +/** + * ice_dcf_get_state - Get DCF state of the associated PF + * @pf: PF instance + */ +enum ice_dcf_state ice_dcf_get_state(struct ice_pf *pf) +{ + return pf->dcf.vf ? 
pf->dcf.state : ICE_DCF_STATE_OFF; +} + +/** + * ice_dcf_state_str - convert DCF state code to a string + * @state: the DCF state code to convert + */ +static const char *ice_dcf_state_str(enum ice_dcf_state state) +{ + switch (state) { + case ICE_DCF_STATE_OFF: + return "ICE_DCF_STATE_OFF"; + case ICE_DCF_STATE_ON: + return "ICE_DCF_STATE_ON"; + case ICE_DCF_STATE_BUSY: + return "ICE_DCF_STATE_BUSY"; + case ICE_DCF_STATE_PAUSE: + return "ICE_DCF_STATE_PAUSE"; + } + + return "ICE_DCF_STATE_UNKNOWN"; +} + +/** + * ice_dcf_set_state - Set DCF state for the associated PF + * @pf: PF instance + * @state: new DCF state + */ +void ice_dcf_set_state(struct ice_pf *pf, enum ice_dcf_state state) +{ + dev_dbg(ice_pf_to_dev(pf), "DCF state is changing from %s to %s\n", + ice_dcf_state_str(pf->dcf.state), + ice_dcf_state_str(state)); + + pf->dcf.state = state; +} + +/** + * ice_dcf_rm_sw_rule_to_vsi - remove switch rule of "forward to VSI" + * @pf: pointer to the PF struct + * @s_entry: pointer to switch rule entry to remove + */ +static int +ice_dcf_rm_sw_rule_to_vsi(struct ice_pf *pf, + struct ice_dcf_sw_rule_entry *s_entry) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + + s_rule = kzalloc(ICE_SW_RULE_RX_TX_NO_HDR_SIZE, GFP_KERNEL); + if (!s_rule) + return -ENOMEM; + + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); + s_rule->pdata.lkup_tx_rx.act = 0; + s_rule->pdata.lkup_tx_rx.hdr_len = 0; + s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(s_entry->rule_id); + status = ice_aq_sw_rules(&pf->hw, s_rule, ICE_SW_RULE_RX_TX_NO_HDR_SIZE, + 1, ice_aqc_opc_remove_sw_rules, NULL); + kfree(s_rule); + if (status) + return -EIO; + + list_del(&s_entry->list_entry); + kfree(s_entry); + return 0; +} + +/** + * ice_dcf_rm_sw_rule_to_vsi_list - remove switch rule of "forward to VSI list" + * @pf: pointer to the PF struct + * @s_entry: pointer to switch rule entry to remove + */ +static int +ice_dcf_rm_sw_rule_to_vsi_list(struct ice_pf *pf, + struct ice_dcf_sw_rule_entry *s_entry) +{ + struct ice_dcf_vsi_list_info *vsi_list_info = s_entry->vsi_list_info; + struct ice_aqc_alloc_free_res_elem *res_buf; + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + u16 rule_sz; + u16 vsi_id; + int i = 0; + + if (!vsi_list_info) + return -EINVAL; + + /* The VSI list is empty, it can be freed immediately */ + if (!vsi_list_info->vsi_count) + goto free_vsi_list; + + rule_sz = ICE_SW_RULE_VSI_LIST_SIZE(vsi_list_info->vsi_count); + s_rule = kzalloc(rule_sz, GFP_KERNEL); + if (!s_rule) + return -ENOMEM; + + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR); + s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_info->list_id); + s_rule->pdata.vsi_list.number_vsi = + cpu_to_le16(vsi_list_info->vsi_count); + for_each_set_bit(vsi_id, vsi_list_info->hw_vsi_map, ICE_HW_VSI_ID_MAX) + s_rule->pdata.vsi_list.vsi[i++] = cpu_to_le16(vsi_id); + + bitmap_zero(vsi_list_info->hw_vsi_map, ICE_HW_VSI_ID_MAX); + vsi_list_info->vsi_count = 0; + + status = ice_aq_sw_rules(&pf->hw, s_rule, rule_sz, 1, + ice_aqc_opc_update_sw_rules, NULL); + kfree(s_rule); + if (status) + return -EIO; + +free_vsi_list: + res_buf = kzalloc(sizeof(*res_buf), GFP_KERNEL); + if (!res_buf) + return -ENOMEM; + + res_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_REP); + res_buf->num_elems = cpu_to_le16(1); + res_buf->elem[0].e.sw_resp = cpu_to_le16(vsi_list_info->list_id); + status = ice_aq_alloc_free_res(&pf->hw, 1, res_buf, sizeof(*res_buf), + ice_aqc_opc_free_res, NULL); + kfree(res_buf); + if (status) + return 
-EIO; + + list_del(&vsi_list_info->list_entry); + kfree(vsi_list_info); + s_entry->vsi_list_info = NULL; + + return ice_dcf_rm_sw_rule_to_vsi(pf, s_entry); +} + +/** + * ice_dcf_rm_vsi_from_list - remove VSI from switch rule forward VSI list + * @pf: pointer to the PF struct + * @vsi_list_info: pointer to the VSI list info + * @hw_vsi_id: the hardware VSI number + */ +static int +ice_dcf_rm_vsi_from_list(struct ice_pf *pf, + struct ice_dcf_vsi_list_info *vsi_list_info, + u16 hw_vsi_id) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + + if (!vsi_list_info || !vsi_list_info->vsi_count || + !test_bit(hw_vsi_id, vsi_list_info->hw_vsi_map)) + return -ENOENT; + + s_rule = kzalloc(ICE_SW_RULE_VSI_LIST_SIZE(1), GFP_KERNEL); + if (!s_rule) + return -ENOMEM; + + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR); + s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_info->list_id); + s_rule->pdata.vsi_list.number_vsi = cpu_to_le16(1); + s_rule->pdata.vsi_list.vsi[0] = cpu_to_le16(hw_vsi_id); + status = ice_aq_sw_rules(&pf->hw, s_rule, + ICE_SW_RULE_VSI_LIST_SIZE(1), 1, + ice_aqc_opc_update_sw_rules, NULL); + kfree(s_rule); + if (status) + return -EIO; + + /* When the VF resets gracefully, it should keep the VSI list and its + * rule and just clear the VSI from the list, so that the DCF can replay + * the rule by updating this VF to the list successfully. + */ + vsi_list_info->vsi_count--; + clear_bit(hw_vsi_id, vsi_list_info->hw_vsi_map); + + return 0; +} + +/** + * ice_rm_dcf_sw_vsi_rule - remove switch rules added by DCF to VSI + * @pf: pointer to the PF struct + * @hw_vsi_id: hardware VSI ID of the VF + */ +void ice_rm_dcf_sw_vsi_rule(struct ice_pf *pf, u16 hw_vsi_id) +{ + struct ice_dcf_sw_rule_entry *s_entry, *tmp; + int ret; + + list_for_each_entry_safe(s_entry, tmp, &pf->dcf.sw_rule_head, + list_entry) + if (s_entry->fltr_act == ICE_FWD_TO_VSI_LIST) { + ret = ice_dcf_rm_vsi_from_list(pf, + s_entry->vsi_list_info, + hw_vsi_id); + if (ret && ret != -ENOENT) + dev_err(ice_pf_to_dev(pf), + "Failed to remove VSI %u from VSI list : %d\n", + hw_vsi_id, ret); + } else if (s_entry->fwd_id.hw_vsi_id == hw_vsi_id) { + ret = ice_dcf_rm_sw_rule_to_vsi(pf, s_entry); + if (ret) + dev_err(ice_pf_to_dev(pf), + "Failed to remove VSI %u switch rule : %d\n", + hw_vsi_id, ret); + } +} + +/** + * ice_dcf_init_sw_rule_mgmt - initializes DCF rule filter management struct + * @pf: pointer to the PF struct + */ +void ice_dcf_init_sw_rule_mgmt(struct ice_pf *pf) +{ + INIT_LIST_HEAD(&pf->dcf.sw_rule_head); + INIT_LIST_HEAD(&pf->dcf.vsi_list_info_head); +} + +/** + * ice_rm_all_dcf_sw_rules - remove switch rules configured by DCF + * @pf: pointer to the PF struct + */ +void ice_rm_all_dcf_sw_rules(struct ice_pf *pf) +{ + struct ice_dcf_vsi_list_info *vsi_list_info, *list_info_tmp; + struct ice_dcf_sw_rule_entry *sw_rule, *rule_tmp; + u16 rule_id, list_id; + int ret; + + list_for_each_entry_safe(sw_rule, rule_tmp, &pf->dcf.sw_rule_head, + list_entry) + if (sw_rule->fltr_act == ICE_FWD_TO_VSI_LIST) { + list_id = sw_rule->fwd_id.vsi_list_id; + rule_id = sw_rule->rule_id; + ret = ice_dcf_rm_sw_rule_to_vsi_list(pf, sw_rule); + if (ret) + dev_err(ice_pf_to_dev(pf), + "Failed to remove switch rule 0x%04x with list id %u : %d\n", + rule_id, list_id, ret); + } else { + rule_id = sw_rule->rule_id; + ret = ice_dcf_rm_sw_rule_to_vsi(pf, sw_rule); + if (ret) + dev_err(ice_pf_to_dev(pf), + "Failed to remove switch rule 0x%04x : %d\n", + rule_id, ret); + } + + /* clear the rule filter management data even if an AdminQ command returned an
error */ + list_for_each_entry_safe(vsi_list_info, list_info_tmp, + &pf->dcf.vsi_list_info_head, + list_entry) { + list_del(&vsi_list_info->list_entry); + kfree(vsi_list_info); + } + + list_for_each_entry_safe(sw_rule, rule_tmp, &pf->dcf.sw_rule_head, + list_entry) { + list_del(&sw_rule->list_entry); + kfree(sw_rule); + } +} + +/** + * ice_clear_dcf_acl_cfg - clear DCF ACL configuration for the PF + * @pf: pointer to the PF info + */ +void ice_clear_dcf_acl_cfg(struct ice_pf *pf) +{ + if (pf->hw.dcf_caps & DCF_ACL_CAP) { + ice_acl_destroy_tbl(&pf->hw); + ice_init_acl(pf); + } +} + +/** + * ice_dcf_is_acl_capable - check if DCF ACL capability enabled + * @hw: pointer to the hardware info + */ +bool ice_dcf_is_acl_capable(struct ice_hw *hw) +{ + return hw->dcf_caps & DCF_ACL_CAP; +} + +/** + * ice_clear_dcf_udp_tunnel_cfg - clear DCF UDP tunnel configuration for the PF + * @pf: pointer to the PF info + */ +void ice_clear_dcf_udp_tunnel_cfg(struct ice_pf *pf) +{ + if (pf->hw.dcf_caps & DCF_UDP_TUNNEL_CAP) + ice_destroy_tunnel(&pf->hw, 0, true); +} + +/** + * ice_dcf_is_udp_tunnel_capable - check if DCF UDP tunnel capability enabled + * @hw: pointer to the hardware info + */ +bool ice_dcf_is_udp_tunnel_capable(struct ice_hw *hw) +{ + return hw->dcf_caps & DCF_UDP_TUNNEL_CAP; +} + +/** + * ice_dcf_find_vsi_list_info - find the VSI list by ID. + * @pf: pointer to the PF info + * @vsi_list_id: VSI list ID + */ +static struct ice_dcf_vsi_list_info * +ice_dcf_find_vsi_list_info(struct ice_pf *pf, u16 vsi_list_id) +{ + struct ice_dcf_vsi_list_info *list_info; + + list_for_each_entry(list_info, &pf->dcf.vsi_list_info_head, list_entry) + if (list_info->list_id == vsi_list_id) + return list_info; + + return NULL; +} + +/** + * ice_dcf_add_vsi_id - add new VSI ID into list. + * @vsi_list_info: pointer to the VSI list info + * @hw_vsi_id: the VSI ID + */ +static void +ice_dcf_add_vsi_id(struct ice_dcf_vsi_list_info *vsi_list_info, u16 hw_vsi_id) +{ + if (!test_and_set_bit(hw_vsi_id, vsi_list_info->hw_vsi_map)) + vsi_list_info->vsi_count++; +} + +/** + * ice_dcf_del_vsi_id - delete the VSI ID from list. 
+ * @vsi_list_info: pointer to the VSI list info + * @hw_vsi_id: the VSI ID + */ +static void +ice_dcf_del_vsi_id(struct ice_dcf_vsi_list_info *vsi_list_info, u16 hw_vsi_id) +{ + if (test_and_clear_bit(hw_vsi_id, vsi_list_info->hw_vsi_map)) + vsi_list_info->vsi_count--; +} + +/** + * ice_dcf_parse_alloc_vsi_list_res - parse the allocate VSI list resource + * @pf: pointer to the PF info + * @res: pointer to the VSI list resource + */ +static enum virtchnl_status_code +ice_dcf_parse_alloc_vsi_list_res(struct ice_pf *pf, + struct ice_aqc_res_elem *res) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + u16 list_id = le16_to_cpu(res->e.sw_resp); + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, list_id); + if (vsi_list_info) + return VIRTCHNL_STATUS_SUCCESS; + + vsi_list_info = kzalloc(sizeof(*vsi_list_info), GFP_KERNEL); + if (!vsi_list_info) + return VIRTCHNL_STATUS_ERR_NO_MEMORY; + + vsi_list_info->list_id = list_id; + list_add(&vsi_list_info->list_entry, &pf->dcf.vsi_list_info_head); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_parse_free_vsi_list_res - parse the free VSI list resource + * @pf: pointer to the PF info + * @res: pointer to the VSI list resource + */ +static enum virtchnl_status_code +ice_dcf_parse_free_vsi_list_res(struct ice_pf *pf, + struct ice_aqc_res_elem *res) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + u16 list_id = le16_to_cpu(res->e.sw_resp); + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, list_id); + if (!vsi_list_info) + return VIRTCHNL_STATUS_ERR_PARAM; + + if (vsi_list_info->vsi_count) + dev_warn(ice_pf_to_dev(pf), + "VSI list %u still has %u VSIs to be removed!\n", + list_id, vsi_list_info->vsi_count); + + if (vsi_list_info->sw_rule) + vsi_list_info->sw_rule->vsi_list_info = NULL; + + list_del(&vsi_list_info->list_entry); + kfree(vsi_list_info); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_set_vsi_list - set the VSI to VSI list + * @pf: pointer to the PF info + * @vsi_list: pointer to the VSI ID list to be set + */ +static enum virtchnl_status_code +ice_dcf_set_vsi_list(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *vsi_list) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + int i; + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, + le16_to_cpu(vsi_list->pdata.vsi_list.index)); + if (!vsi_list_info) + return VIRTCHNL_STATUS_ERR_PARAM; + + for (i = 0; i < le16_to_cpu(vsi_list->pdata.vsi_list.number_vsi); i++) + ice_dcf_add_vsi_id(vsi_list_info, + le16_to_cpu(vsi_list->pdata.vsi_list.vsi[i])); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_clear_vsi_list - clear the VSI from VSI list + * @pf: pointer to the PF info + * @vsi_list: pointer to the VSI ID list to be cleared + */ +static enum virtchnl_status_code +ice_dcf_clear_vsi_list(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *vsi_list) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + int i; + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, + le16_to_cpu(vsi_list->pdata.vsi_list.index)); + if (!vsi_list_info) + return VIRTCHNL_STATUS_ERR_PARAM; + + for (i = 0; i < le16_to_cpu(vsi_list->pdata.vsi_list.number_vsi); i++) + ice_dcf_del_vsi_id(vsi_list_info, + le16_to_cpu(vsi_list->pdata.vsi_list.vsi[i])); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_find_sw_rule - find the switch rule by ID. 
+ * @pf: pointer to the PF info + * @rule_id: switch rule ID + */ +static struct ice_dcf_sw_rule_entry * +ice_dcf_find_sw_rule(struct ice_pf *pf, u16 rule_id) +{ + struct ice_dcf_sw_rule_entry *sw_rule; + + list_for_each_entry(sw_rule, &pf->dcf.sw_rule_head, list_entry) + if (sw_rule->rule_id == rule_id) + return sw_rule; + + return NULL; +} + +/** + * ice_dcf_parse_add_sw_rule_data - parse the add switch rule data + * @pf: pointer to the PF info + * @lkup: pointer to the add switch rule data + */ +static enum virtchnl_status_code +ice_dcf_parse_add_sw_rule_data(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *lkup) +{ + struct ice_dcf_sw_rule_entry *sw_rule; + u32 act; + + sw_rule = kzalloc(sizeof(*sw_rule), GFP_KERNEL); + if (!sw_rule) + return VIRTCHNL_STATUS_ERR_NO_MEMORY; + + act = le32_to_cpu(lkup->pdata.lkup_tx_rx.act); + sw_rule->fltr_act = ICE_FWD_TO_VSI; + sw_rule->fwd_id.hw_vsi_id = (act & ICE_SINGLE_ACT_VSI_ID_M) >> + ICE_SINGLE_ACT_VSI_ID_S; + sw_rule->rule_id = le16_to_cpu(lkup->pdata.lkup_tx_rx.index); + + list_add(&sw_rule->list_entry, &pf->dcf.sw_rule_head); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_parse_updt_sw_rule_data - parse the update switch rule data + * @pf: pointer to the PF info + * @lkup: pointer to the update switch rule data + */ +static enum virtchnl_status_code +ice_dcf_parse_updt_sw_rule_data(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *lkup) +{ + struct ice_dcf_vsi_list_info *vsi_list_info; + struct ice_dcf_sw_rule_entry *sw_rule; + u16 vsi_list_id, rule_id; + u32 act; + + rule_id = le16_to_cpu(lkup->pdata.lkup_tx_rx.index); + sw_rule = ice_dcf_find_sw_rule(pf, rule_id); + if (!sw_rule) + return VIRTCHNL_STATUS_ERR_PARAM; + + act = le32_to_cpu(lkup->pdata.lkup_tx_rx.act); + if (!(act & ICE_SINGLE_ACT_VSI_LIST)) { + u16 vsi_hw_id = (act & ICE_SINGLE_ACT_VSI_ID_M) >> + ICE_SINGLE_ACT_VSI_ID_S; + + sw_rule->fltr_act = ICE_FWD_TO_VSI; + sw_rule->fwd_id.hw_vsi_id = vsi_hw_id; + + return VIRTCHNL_STATUS_SUCCESS; + } + + vsi_list_id = (act & ICE_SINGLE_ACT_VSI_LIST_ID_M) >> + ICE_SINGLE_ACT_VSI_LIST_ID_S; + if (sw_rule->vsi_list_info) { + if (sw_rule->vsi_list_info->list_id == vsi_list_id) + return VIRTCHNL_STATUS_SUCCESS; + + dev_err(ice_pf_to_dev(pf), + "The switch rule 0x%04x is running on VSI list %u\n", + rule_id, sw_rule->vsi_list_info->list_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + vsi_list_info = ice_dcf_find_vsi_list_info(pf, vsi_list_id); + if (!vsi_list_info) { + dev_err(ice_pf_to_dev(pf), + "No VSI list %u found to bind the switch rule 0x%04x\n", + vsi_list_id, rule_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + if (vsi_list_info->sw_rule) { + if (vsi_list_info->sw_rule->rule_id == rule_id) + return VIRTCHNL_STATUS_SUCCESS; + + dev_err(ice_pf_to_dev(pf), + "The VSI list %u is running on switch rule 0x%04x\n", + vsi_list_id, vsi_list_info->sw_rule->rule_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + vsi_list_info->sw_rule = sw_rule; + + sw_rule->fltr_act = ICE_FWD_TO_VSI_LIST; + sw_rule->fwd_id.vsi_list_id = vsi_list_id; + sw_rule->vsi_list_info = vsi_list_info; + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_parse_rm_sw_rule_data - parse the remove switch rule data + * @pf: pointer to the PF info + * @lkup: pointer to the remove switch rule data + */ +static enum virtchnl_status_code +ice_dcf_parse_rm_sw_rule_data(struct ice_pf *pf, struct ice_aqc_sw_rules_elem *lkup) +{ + u16 rule_id = le16_to_cpu(lkup->pdata.lkup_tx_rx.index); + struct ice_dcf_sw_rule_entry *sw_rule, *tmp; + + 
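/* Matching entries are unlinked and freed during the walk, so the + * safe list iterator is required here. + */ +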
list_for_each_entry_safe(sw_rule, tmp, &pf->dcf.sw_rule_head, + list_entry) + if (sw_rule->rule_id == rule_id) { + list_del(&sw_rule->list_entry); + kfree(sw_rule); + } + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_handle_add_sw_rule_rsp - handle the add switch rule response + * @pf: pointer to the PF info + * @aq_buf: pointer to the add switch rule command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_add_sw_rule_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_sw_rules_elem *em = + (struct ice_aqc_sw_rules_elem *)aq_buf; + u16 type = le16_to_cpu(em->type); + + if (type == ICE_AQC_SW_RULES_T_VSI_LIST_SET) + status = ice_dcf_set_vsi_list(pf, em); + else if (type == ICE_AQC_SW_RULES_T_LKUP_RX) + status = ice_dcf_parse_add_sw_rule_data(pf, em); + + return status; +} + +/** + * ice_dcf_handle_updt_sw_rule_rsp - handle the update switch rule response + * @pf: pointer to the PF info + * @aq_buf: pointer to the update switch rule command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_updt_sw_rule_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_sw_rules_elem *em = + (struct ice_aqc_sw_rules_elem *)aq_buf; + u16 type = le16_to_cpu(em->type); + + if (type == ICE_AQC_SW_RULES_T_VSI_LIST_SET) + status = ice_dcf_set_vsi_list(pf, em); + else if (type == ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR) + status = ice_dcf_clear_vsi_list(pf, em); + else if (type == ICE_AQC_SW_RULES_T_LKUP_RX) + status = ice_dcf_parse_updt_sw_rule_data(pf, em); + + return status; +} + +/** + * ice_dcf_handle_rm_sw_rule_rsp - handle the remove switch rule response + * @pf: pointer to the PF info + * @aq_buf: pointer to the remove switch rule command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_rm_sw_rule_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_sw_rules_elem *em = + (struct ice_aqc_sw_rules_elem *)aq_buf; + u16 type = le16_to_cpu(em->type); + + if (type == ICE_AQC_SW_RULES_T_LKUP_RX) + status = ice_dcf_parse_rm_sw_rule_data(pf, em); + + return status; +} + +/** + * ice_dcf_handle_alloc_res_rsp - handle the allocate resource response + * @pf: pointer to the PF info + * @aq_buf: pointer to the allocate resource command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_alloc_res_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_alloc_free_res_elem *res_buf = + (struct ice_aqc_alloc_free_res_elem *)aq_buf; + u16 type = (le16_to_cpu(res_buf->res_type) & + ICE_AQC_RES_TYPE_M) >> ICE_AQC_RES_TYPE_S; + + if (type == ICE_AQC_RES_TYPE_VSI_LIST_REP) + status = ice_dcf_parse_alloc_vsi_list_res(pf, + &res_buf->elem[0]); + + return status; +} + +/** + * ice_dcf_handle_free_res_rsp - handle the free resource response + * @pf: pointer to the PF info + * @aq_buf: pointer to the free resource command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_free_res_rsp(struct ice_pf *pf, u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + struct ice_aqc_alloc_free_res_elem *res_buf = + (struct ice_aqc_alloc_free_res_elem *)aq_buf; + u16 type = (le16_to_cpu(res_buf->res_type) & + ICE_AQC_RES_TYPE_M) >> ICE_AQC_RES_TYPE_S; + + if (type == ICE_AQC_RES_TYPE_VSI_LIST_REP) + status = ice_dcf_parse_free_vsi_list_res(pf, + &res_buf->elem[0]); + + return status; +} + +/** + * 
ice_dcf_handle_udp_tunnel_rsp - handle the update package response + * @pf: pointer to the PF info + * @aq_desc: descriptor describing the command + * @aq_buf: pointer to the package update command buffer + */ +static enum virtchnl_status_code +ice_dcf_handle_udp_tunnel_rsp(struct ice_pf *pf, struct ice_aq_desc *aq_desc, + u8 *aq_buf) +{ + struct ice_boost_tcam_section *sect; + struct ice_buf_hdr *pkg_buf; + struct ice_hw *hw = &pf->hw; + u16 port_key, inv_port_key; + u16 offset; + u16 addr; + u8 count; + u16 i, j; + + mutex_lock(&hw->tnl_lock); + pkg_buf = (struct ice_buf_hdr *)aq_buf; + offset = le16_to_cpu(pkg_buf->section_entry[0].offset); + sect = (struct ice_boost_tcam_section *)(((u8 *)pkg_buf) + offset); + count = le16_to_cpu(sect->count); + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + for (j = 0; j < count; j++) { + addr = le16_to_cpu(sect->tcam[j].addr); + inv_port_key = + le16_to_cpu(sect->tcam[j].key.key.hv_dst_port_key); + port_key = + le16_to_cpu(sect->tcam[j].key.key2.hv_dst_port_key); + if (hw->tnl.tbl[i].valid && + hw->tnl.tbl[i].boost_addr == addr) { + /* It's a tunnel destroy command if the key and + * inverse key are the same. + */ + if (port_key == inv_port_key) { + hw->tnl.tbl[i].in_use = false; + hw->tnl.tbl[i].port = 0; + hw->tnl.tbl[i].ref = 0; + } else { + hw->tnl.tbl[i].port = port_key; + hw->tnl.tbl[i].in_use = true; + hw->tnl.tbl[i].ref = 1; + } + } + } + + if (ice_is_tunnel_empty(&pf->hw)) + hw->dcf_caps &= ~DCF_UDP_TUNNEL_CAP; + else + hw->dcf_caps |= DCF_UDP_TUNNEL_CAP; + mutex_unlock(&hw->tnl_lock); + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_post_aq_send_cmd - get the data from firmware successful response + * @pf: pointer to the PF info + * @aq_desc: descriptor describing the command + * @aq_buf: the AdminQ command buffer + */ +enum virtchnl_status_code +ice_dcf_post_aq_send_cmd(struct ice_pf *pf, struct ice_aq_desc *aq_desc, + u8 *aq_buf) +{ + enum virtchnl_status_code status = VIRTCHNL_STATUS_SUCCESS; + u16 opc = le16_to_cpu(aq_desc->opcode); + + if (!aq_buf) + return VIRTCHNL_STATUS_SUCCESS; + + switch (opc) { + case ice_aqc_opc_add_sw_rules: + status = ice_dcf_handle_add_sw_rule_rsp(pf, aq_buf); + break; + case ice_aqc_opc_update_sw_rules: + status = ice_dcf_handle_updt_sw_rule_rsp(pf, aq_buf); + break; + case ice_aqc_opc_remove_sw_rules: + status = ice_dcf_handle_rm_sw_rule_rsp(pf, aq_buf); + break; + case ice_aqc_opc_alloc_res: + status = ice_dcf_handle_alloc_res_rsp(pf, aq_buf); + break; + case ice_aqc_opc_free_res: + status = ice_dcf_handle_free_res_rsp(pf, aq_buf); + break; + case ice_aqc_opc_update_pkg: + if (ice_dcf_is_udp_tunnel_aq_cmd(aq_desc, aq_buf)) + status = ice_dcf_handle_udp_tunnel_rsp(pf, aq_desc, + aq_buf); + break; + } + + return status; +} + +/** + * ice_dcf_update_acl_rule_info - update DCF ACL rule info + * @pf: pointer to the PF info + * @desc: descriptor describing the command + * @aq_buf: the AdminQ command buffer + */ +enum virtchnl_status_code +ice_dcf_update_acl_rule_info(struct ice_pf *pf, struct ice_aq_desc *desc, + u8 *aq_buf) +{ + struct ice_acl_scen *scen, *tmp; + struct ice_acl_tbl *tbl; + u16 scen_id; + + switch (le16_to_cpu(desc->opcode)) { + case ice_aqc_opc_alloc_acl_tbl: + if (pf->hw.acl_tbl) + return VIRTCHNL_STATUS_ERR_PARAM; + tbl = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*tbl), + GFP_ATOMIC); + if (!tbl) + return VIRTCHNL_STATUS_ERR_PARAM; + tbl->id = le16_to_cpu(((struct ice_aqc_acl_generic *) + aq_buf)->alloc_id); + INIT_LIST_HEAD(&tbl->scens); + pf->hw.acl_tbl = tbl; +
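/* Keep the host-side shadow of the DCF-allocated table until a + * matching dealloc command is seen. + */ +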
break; + case ice_aqc_opc_dealloc_acl_tbl: + list_for_each_entry_safe(scen, tmp, &pf->hw.acl_tbl->scens, + list_entry) { + list_del(&scen->list_entry); + devm_kfree(ice_pf_to_dev(pf), scen); + } + devm_kfree(ice_pf_to_dev(pf), pf->hw.acl_tbl); + pf->hw.acl_tbl = NULL; + break; + case ice_aqc_opc_alloc_acl_scen: + scen = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*scen), + GFP_ATOMIC); + if (!scen) + return VIRTCHNL_STATUS_ERR_PARAM; + INIT_LIST_HEAD(&scen->list_entry); + scen_id = le16_to_cpu(desc->params.alloc_scen.ops.resp.scen_id); + scen->id = scen_id; + list_add(&scen->list_entry, &pf->hw.acl_tbl->scens); + break; + case ice_aqc_opc_dealloc_acl_scen: + list_for_each_entry_safe(scen, tmp, &pf->hw.acl_tbl->scens, + list_entry) { + if (le16_to_cpu(desc->params.dealloc_scen.scen_id) == + scen->id) { + list_del(&scen->list_entry); + devm_kfree(ice_pf_to_dev(pf), scen); + } + } + break; + } + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_dcf_pre_aq_send_cmd - check if it needs to send the command to firmware + * @vf: pointer to the VF info + * @aq_desc: descriptor describing the command + * @aq_buf: the AdminQ command buffer + * @aq_buf_size: the AdminQ command buffer size + */ +bool +ice_dcf_pre_aq_send_cmd(struct ice_vf *vf, struct ice_aq_desc *aq_desc, + u8 *aq_buf, u16 aq_buf_size) +{ + struct ice_pf *pf = vf->pf; + + switch (le16_to_cpu(aq_desc->opcode)) { + case ice_aqc_opc_update_sw_rules: + { + struct ice_dcf_vsi_list_info *vsi_list_info; + struct ice_aqc_sw_rules_elem *s_rule; + u16 list_id, vsi_id; + + if (aq_buf_size < ICE_SW_RULE_VSI_LIST_SIZE(1)) + break; + + s_rule = (struct ice_aqc_sw_rules_elem *)aq_buf; + if (le16_to_cpu(s_rule->type) != + ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR || + le16_to_cpu(s_rule->pdata.vsi_list.number_vsi) != 1) + break; + + list_id = le16_to_cpu(s_rule->pdata.vsi_list.index); + vsi_list_info = ice_dcf_find_vsi_list_info(pf, list_id); + if (!vsi_list_info) + break; + + vsi_id = le16_to_cpu(s_rule->pdata.vsi_list.vsi[0]); + if (vsi_id >= ICE_HW_VSI_ID_MAX || + test_bit(vsi_id, vsi_list_info->hw_vsi_map)) + break; + + /* The VSI is removed from list already, no need to send the + * command to firmware. + */ + return true; + } + case ice_aqc_opc_remove_sw_rules: + { + struct ice_aqc_sw_rules_elem *s_rule; + u16 rule_id; + + if (aq_buf_size < ICE_SW_RULE_RX_TX_NO_HDR_SIZE) + break; + + s_rule = (struct ice_aqc_sw_rules_elem *)aq_buf; + if (le16_to_cpu(s_rule->type) != ICE_AQC_SW_RULES_T_LKUP_RX) + break; + + rule_id = le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + if (ice_dcf_find_sw_rule(pf, rule_id)) + break; + + /* The switch rule is removed already, no need to send the + * command to firmware. + */ + return true; + } + + default: + break; + } + + return false; +} diff --git a/drivers/net/ethernet/intel/ice/ice_dcf.h b/drivers/net/ethernet/intel/ice/ice_dcf.h new file mode 100644 index 0000000000000000000000000000000000000000..7d95ec56f5b3a471fcfc49ab3b0f66bef631f965 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_dcf.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_DCF_H_ +#define _ICE_DCF_H_ + +struct ice_vf; +struct ice_pf; + +#define ICE_DCF_VFID0 0 +#define ICE_DCF_VFID1 1 + +/* DCF mode states */ +enum ice_dcf_state { + /* DCF mode is fully off */ + ICE_DCF_STATE_OFF = 0, + /* Process is live, acquired capability to send DCF CMD */ + ICE_DCF_STATE_ON, + /* Kernel is busy, deny DCF CMD */ + ICE_DCF_STATE_BUSY, + /* Kernel is ready for Process to Re-establish, deny DCF CMD */ + ICE_DCF_STATE_PAUSE, +}; + +struct ice_dcf_sw_rule_entry; + +#define ICE_HW_VSI_ID_MAX BIT(10) /* The AQ VSI number uses 10 bits */ + +struct ice_dcf_vsi_list_info { + struct list_head list_entry; + struct ice_dcf_sw_rule_entry *sw_rule; + u16 list_id; + + u16 vsi_count; + DECLARE_BITMAP(hw_vsi_map, ICE_HW_VSI_ID_MAX); +}; + +struct ice_dcf_sw_rule_entry { + struct list_head list_entry; + u16 rule_id; + + /* Only support ICE_FWD_TO_VSI and ICE_FWD_TO_VSI_LIST */ + enum ice_sw_fwd_act_type fltr_act; + /* Depending on filter action */ + union { + u16 hw_vsi_id:10; + u16 vsi_list_id:10; + } fwd_id; + + struct ice_dcf_vsi_list_info *vsi_list_info; +}; + +struct ice_dcf { + struct ice_vf *vf; + enum ice_dcf_state state; + + /* Trace the switch rules added/removed by DCF */ + struct list_head sw_rule_head; + struct list_head vsi_list_info_head; + + /* Handle the AdminQ command between the DCF (Device Config Function) + * and the firmware. + */ +#define ICE_DCF_AQ_DESC_TIMEOUT (HZ / 10) + struct ice_aq_desc aq_desc; + u8 aq_desc_received; + unsigned long aq_desc_expires; + + /* Save the current Device Serial Number when searching the package + * path for later query. + */ +#define ICE_DSN_NUM_LEN 8 + u8 dsn[ICE_DSN_NUM_LEN]; +}; + +#ifdef CONFIG_PCI_IOV +bool ice_dcf_aq_cmd_permitted(struct ice_aq_desc *desc); +bool ice_check_dcf_allowed(struct ice_vf *vf); +bool ice_is_dcf_enabled(struct ice_pf *pf); +bool ice_is_vf_dcf(struct ice_vf *vf); +enum ice_dcf_state ice_dcf_get_state(struct ice_pf *pf); +void ice_dcf_set_state(struct ice_pf *pf, enum ice_dcf_state state); +void ice_dcf_init_sw_rule_mgmt(struct ice_pf *pf); +void ice_rm_all_dcf_sw_rules(struct ice_pf *pf); +void ice_rm_dcf_sw_vsi_rule(struct ice_pf *pf, u16 hw_vsi_id); +bool +ice_dcf_pre_aq_send_cmd(struct ice_vf *vf, struct ice_aq_desc *aq_desc, + u8 *aq_buf, u16 aq_buf_size); +enum virtchnl_status_code +ice_dcf_post_aq_send_cmd(struct ice_pf *pf, struct ice_aq_desc *aq_desc, + u8 *aq_buf); +bool ice_dcf_is_acl_aq_cmd(struct ice_aq_desc *desc); +bool ice_dcf_is_udp_tunnel_aq_cmd(struct ice_aq_desc *desc, u8 *aq_buf); +void ice_clear_dcf_acl_cfg(struct ice_pf *pf); +bool ice_dcf_is_acl_capable(struct ice_hw *hw); +void ice_clear_dcf_udp_tunnel_cfg(struct ice_pf *pf); +bool ice_dcf_is_udp_tunnel_capable(struct ice_hw *hw); +enum virtchnl_status_code +ice_dcf_update_acl_rule_info(struct ice_pf *pf, struct ice_aq_desc *desc, + u8 *aq_buf); +#else +static inline bool ice_is_dcf_enabled(struct ice_pf __always_unused *pf) +{ + return false; +} + +static inline bool +ice_dcf_is_udp_tunnel_capable(struct ice_hw __always_unused *hw) +{ + return false; +} + +static inline bool ice_dcf_is_acl_capable(struct ice_hw __always_unused *hw) +{ + return false; +} +#endif /* CONFIG_PCI_IOV */ +#endif /* _ICE_DCF_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_debugfs.c b/drivers/net/ethernet/intel/ice/ice_debugfs.c new file mode 100644 index 0000000000000000000000000000000000000000..09b403d5ff48063de91e45ae1e843b4f71da9e39 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_debugfs.c @@ -0,0 +1,672 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/random.h> +#include "ice.h" +#include "ice_lib.h" +#include "ice_fltr.h" + + +static struct dentry *ice_debugfs_root; + + +static void ice_dump_pf(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + + dev_info(dev, "pf struct:\n"); + dev_info(dev, "\tmax_pf_txqs = %d\n", pf->max_pf_txqs); + dev_info(dev, "\tmax_pf_rxqs = %d\n", pf->max_pf_rxqs); + dev_info(dev, "\tnum_alloc_vsi = %d\n", pf->num_alloc_vsi); + dev_info(dev, "\tnum_lan_tx = %d\n", pf->num_lan_tx); + dev_info(dev, "\tnum_lan_rx = %d\n", pf->num_lan_rx); + dev_info(dev, "\tnum_avail_tx = %d\n", ice_get_avail_txq_count(pf)); + dev_info(dev, "\tnum_avail_rx = %d\n", ice_get_avail_rxq_count(pf)); + dev_info(dev, "\tnum_lan_msix = %d\n", pf->num_lan_msix); + dev_info(dev, "\tnum_rdma_msix = %d\n", pf->num_rdma_msix); + dev_info(dev, "\trdma_base_vector = %d\n", pf->rdma_base_vector); +#ifdef HAVE_NETDEV_SB_DEV + dev_info(dev, "\tnum_macvlan = %d\n", pf->num_macvlan); + dev_info(dev, "\tmax_num_macvlan = %d\n", pf->max_num_macvlan); +#endif /* HAVE_NETDEV_SB_DEV */ + dev_info(dev, "\tirq_tracker->num_entries = %d\n", + pf->irq_tracker->num_entries); + dev_info(dev, "\tirq_tracker->end = %d\n", pf->irq_tracker->end); + dev_info(dev, "\tirq_tracker valid count = %d\n", + ice_get_valid_res_count(pf->irq_tracker)); + dev_info(dev, "\tnum_avail_sw_msix = %d\n", pf->num_avail_sw_msix); + dev_info(dev, "\tsriov_base_vector = %d\n", pf->sriov_base_vector); + dev_info(dev, "\tnum_alloc_vfs = %d\n", pf->num_alloc_vfs); + dev_info(dev, "\tnum_qps_per_vf = %d\n", pf->num_qps_per_vf); + dev_info(dev, "\tnum_msix_per_vf = %d\n", pf->num_msix_per_vf); +} + +static void ice_dump_pf_vsi_list(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + u16 i; + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (!vsi) + continue; + + dev_info(dev, "vsi[%d]:\n", i); + dev_info(dev, "\tvsi = %pK\n", vsi); + dev_info(dev, "\tvsi_num = %d\n", vsi->vsi_num); + dev_info(dev, "\ttype = %s\n", ice_vsi_type_str(vsi->type)); + if (vsi->type == ICE_VSI_VF) + dev_info(dev, "\tvf_id = %d\n", vsi->vf_id); + dev_info(dev, "\tback = %pK\n", vsi->back); + dev_info(dev, "\tnetdev = %pK\n", vsi->netdev); + dev_info(dev, "\tmax_frame = %d\n", vsi->max_frame); + dev_info(dev, "\trx_buf_len = %d\n", vsi->rx_buf_len); + dev_info(dev, "\tnum_txq = %d\n", vsi->num_txq); + dev_info(dev, "\tnum_rxq = %d\n", vsi->num_rxq); + dev_info(dev, "\treq_txq = %d\n", vsi->req_txq); + dev_info(dev, "\treq_rxq = %d\n", vsi->req_rxq); + dev_info(dev, "\talloc_txq = %d\n", vsi->alloc_txq); + dev_info(dev, "\talloc_rxq = %d\n", vsi->alloc_rxq); + dev_info(dev, "\tnum_rx_desc = %d\n", vsi->num_rx_desc); + dev_info(dev, "\tnum_tx_desc = %d\n", vsi->num_tx_desc); + dev_info(dev, "\tnum_vlan = %d\n", vsi->num_vlan); + } +} + +/** + * ice_dump_pf_fdir - output Flow Director stats to dmesg log + * @pf: pointer to PF to get Flow Director HW stats for.
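+ * + * Reads the PFQF_FD_CNT, GLQF_FD_CNT and GLQF_FD_SIZE registers and prints + * the guaranteed and best-effort filter usage for the PF and the device.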
+ */ +static void ice_dump_pf_fdir(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + u16 pf_guar_pool = 0; + u32 dev_fltr_size; + u32 dev_fltr_cnt; + u32 pf_fltr_cnt; + u16 i; + + pf_fltr_cnt = rd32(hw, PFQF_FD_CNT); + dev_fltr_cnt = rd32(hw, GLQF_FD_CNT); + dev_fltr_size = rd32(hw, GLQF_FD_SIZE); + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (!vsi) + continue; + + pf_guar_pool += vsi->num_gfltr; + } + + dev_info(dev, "Flow Director filter usage:\n"); + dev_info(dev, "\tPF guaranteed used = %d\n", + (pf_fltr_cnt & PFQF_FD_CNT_FD_GCNT_M) >> + PFQF_FD_CNT_FD_GCNT_S); + dev_info(dev, "\tPF best_effort used = %d\n", + (pf_fltr_cnt & PFQF_FD_CNT_FD_BCNT_M) >> + PFQF_FD_CNT_FD_BCNT_S); + dev_info(dev, "\tdevice guaranteed used = %d\n", + (dev_fltr_cnt & GLQF_FD_CNT_FD_GCNT_M) >> + GLQF_FD_CNT_FD_GCNT_S); + dev_info(dev, "\tdevice best_effort used = %d\n", + (dev_fltr_cnt & GLQF_FD_CNT_FD_BCNT_M) >> + GLQF_FD_CNT_FD_BCNT_S); + dev_info(dev, "\tPF guaranteed pool = %d\n", pf_guar_pool); + dev_info(dev, "\tdevice guaranteed pool = %d\n", + (dev_fltr_size & GLQF_FD_SIZE_FD_GSIZE_M) >> + GLQF_FD_SIZE_FD_GSIZE_S); + dev_info(dev, "\tdevice best_effort pool = %d\n", + hw->func_caps.fd_fltr_best_effort); +} + +/** + * ice_vsi_dump_ctxt - print the passed in VSI context structure + * @dev: Device used for dev_info prints + * @ctxt: VSI context structure to print + */ +static void ice_vsi_dump_ctxt(struct device *dev, struct ice_vsi_ctx *ctxt) +{ + struct ice_aqc_vsi_props *info; + + if (!ctxt) + return; + + info = &ctxt->info; + dev_info(dev, "Get VSI Parameters:\n"); + dev_info(dev, "\tVSI Number: %d Valid sections: 0x%04x\n", + ctxt->vsi_num, le16_to_cpu(info->valid_sections)); + + dev_info(dev, "========================\n"); + dev_info(dev, "| Category - Switching |\n"); + dev_info(dev, "========================\n"); + dev_info(dev, "\tSwitch ID: %u\n", info->sw_id); + dev_info(dev, "\tAllow Loopback: %s\n", (info->sw_flags & + ICE_AQ_VSI_SW_FLAG_ALLOW_LB) ? "enabled" : "disabled"); + dev_info(dev, "\tAllow Local Loopback: %s\n", (info->sw_flags & + ICE_AQ_VSI_SW_FLAG_LOCAL_LB) ? "enabled" : "disabled"); + dev_info(dev, "\tApply source VSI pruning: %s\n", (info->sw_flags & + ICE_AQ_VSI_SW_FLAG_SRC_PRUNE) ? "enabled" : "disabled"); + dev_info(dev, "\tEgress (Rx VLAN) pruning: %s\n", + (info->sw_flags2 & ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M) ? + "enabled" : "disabled"); + dev_info(dev, "\tLAN enable: %s\n", (info->sw_flags2 & + ICE_AQ_VSI_SW_FLAG_LAN_ENA) ? "enabled" : "disabled"); + dev_info(dev, "\tVEB statistic block ID: %u\n", info->veb_stat_id & + ICE_AQ_VSI_SW_VEB_STAT_ID_M); + dev_info(dev, "\tVEB statistic block ID valid: %d\n", + (info->veb_stat_id & ICE_AQ_VSI_SW_VEB_STAT_ID_VALID) ? 1 : 0); + + dev_info(dev, "=======================\n"); + dev_info(dev, "| Category - Security |\n"); + dev_info(dev, "=======================\n"); + dev_info(dev, "\tAllow destination override: %s\n", (info->sec_flags & + ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD) ? "enabled" : "disabled"); + dev_info(dev, "\tEnable MAC anti-spoof: %s\n", (info->sec_flags & + ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF) ? "enabled" : "disabled"); + dev_info(dev, "\tIngress (Tx VLAN) pruning enables: %s\n", + (info->sec_flags & ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M) ?
+ "enabled" : "disabled"); + + dev_info(dev, "=================================\n"); + dev_info(dev, "| Category: Inner VLAN Handling |\n"); + dev_info(dev, "=================================\n"); + dev_info(dev, "\tPort Based Inner VLAN Insertion: PVLAN ID: %d PRIO: %d\n", + le16_to_cpu(info->port_based_inner_vlan) & VLAN_VID_MASK, + (le16_to_cpu(info->port_based_inner_vlan) & VLAN_PRIO_MASK) >> + VLAN_PRIO_SHIFT); + dev_info(dev, "\tInner VLAN TX Mode: 0x%02x\n", + (info->inner_vlan_flags & ICE_AQ_VSI_INNER_VLAN_TX_MODE_M) >> + ICE_AQ_VSI_INNER_VLAN_TX_MODE_S); + dev_info(dev, "\tInsert PVID: %s\n", (info->inner_vlan_flags & + ICE_AQ_VSI_INNER_VLAN_INSERT_PVID) ? "enabled" : "disabled"); + dev_info(dev, "\tInner VLAN and UP expose mode (RX): 0x%02x\n", + (info->inner_vlan_flags & ICE_AQ_VSI_INNER_VLAN_EMODE_M) >> + ICE_AQ_VSI_INNER_VLAN_EMODE_S); + dev_info(dev, "\tBlock Inner VLAN from TX Descriptor: %s\n", + (info->inner_vlan_flags & ICE_AQ_VSI_INNER_VLAN_BLOCK_TX_DESC) ? + "enabled" : "disabled"); + + dev_info(dev, "=================================\n"); + dev_info(dev, "| Category: Outer VLAN Handling |\n"); + dev_info(dev, "=================================\n"); + dev_info(dev, "\tPort Based Outer VLAN Insertion: PVID: %d PRIO: %d\n", + le16_to_cpu(info->port_based_outer_vlan) & VLAN_VID_MASK, + (le16_to_cpu(info->port_based_outer_vlan) & VLAN_PRIO_MASK) >> + VLAN_PRIO_SHIFT); + dev_info(dev, "\tOuter VLAN and UP expose mode (RX): 0x%02x\n", + (info->outer_vlan_flags & ICE_AQ_VSI_OUTER_VLAN_EMODE_M) >> + ICE_AQ_VSI_OUTER_VLAN_EMODE_S); + dev_info(dev, "\tOuter Tag type (Tx and Rx): 0x%02x\n", + (info->outer_vlan_flags & ICE_AQ_VSI_OUTER_TAG_TYPE_M) >> + ICE_AQ_VSI_OUTER_TAG_TYPE_S); + dev_info(dev, "\tPort Based Outer VLAN Insert Enable: %s\n", + (info->outer_vlan_flags & + ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT) ? + "enabled" : "disabled"); + dev_info(dev, "\tOuter VLAN TX Mode: 0x%02x\n", + (info->outer_vlan_flags & ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M) >> + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S); + dev_info(dev, "\tBlock Outer VLAN from TX Descriptor: %s\n", + (info->outer_vlan_flags & ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC) ? 
+ "enabled" : "disabled"); +} + +static const char *module_id_to_name(u16 module_id) +{ + switch (module_id) { + case ICE_AQC_FW_LOG_ID_GENERAL: + return "General"; + case ICE_AQC_FW_LOG_ID_CTRL: + return "Control (Resets + Autoload)"; + case ICE_AQC_FW_LOG_ID_LINK: + return "Link Management"; + case ICE_AQC_FW_LOG_ID_LINK_TOPO: + return "Link Topology Detection"; + case ICE_AQC_FW_LOG_ID_DNL: + return "DNL"; + case ICE_AQC_FW_LOG_ID_I2C: + return "I2C"; + case ICE_AQC_FW_LOG_ID_SDP: + return "SDP"; + case ICE_AQC_FW_LOG_ID_MDIO: + return "MDIO"; + case ICE_AQC_FW_LOG_ID_ADMINQ: + return "Admin Queue"; + case ICE_AQC_FW_LOG_ID_HDMA: + return "HDMA"; + case ICE_AQC_FW_LOG_ID_LLDP: + return "LLDP"; + case ICE_AQC_FW_LOG_ID_DCBX: + return "DCBX"; + case ICE_AQC_FW_LOG_ID_DCB: + return "DCB"; + case ICE_AQC_FW_LOG_ID_XLR: + return "XLR"; + case ICE_AQC_FW_LOG_ID_NVM: + return "NVM"; + case ICE_AQC_FW_LOG_ID_AUTH: + return "Authentication"; + case ICE_AQC_FW_LOG_ID_VPD: + return "VPD"; + case ICE_AQC_FW_LOG_ID_IOSF: + return "IOSF"; + case ICE_AQC_FW_LOG_ID_PARSER: + return "Parser"; + case ICE_AQC_FW_LOG_ID_SW: + return "Switch"; + case ICE_AQC_FW_LOG_ID_SCHEDULER: + return "Scheduler"; + case ICE_AQC_FW_LOG_ID_TXQ: + return "Tx Queue Management"; + case ICE_AQC_FW_LOG_ID_ACL: + return "ACL"; + case ICE_AQC_FW_LOG_ID_POST: + return "Post"; + case ICE_AQC_FW_LOG_ID_WATCHDOG: + return "Watchdog"; + case ICE_AQC_FW_LOG_ID_TASK_DISPATCH: + return "Task Dispatcher"; + case ICE_AQC_FW_LOG_ID_MNG: + return "Manageability"; + case ICE_AQC_FW_LOG_ID_SYNCE: + return "Synce"; + case ICE_AQC_FW_LOG_ID_HEALTH: + return "Health"; + case ICE_AQC_FW_LOG_ID_TSDRV: + return "Time Sync"; + case ICE_AQC_FW_LOG_ID_PFREG: + return "PF Registration"; + case ICE_AQC_FW_LOG_ID_MDLVER: + return "Module Version"; + default: + return "Unsupported"; + } +} + +static const char *log_level_to_name(u8 log_level) +{ + switch (log_level) { + case ICE_FWLOG_LEVEL_NONE: + return "None"; + case ICE_FWLOG_LEVEL_ERROR: + return "Error"; + case ICE_FWLOG_LEVEL_WARNING: + return "Warning"; + case ICE_FWLOG_LEVEL_NORMAL: + return "Normal"; + case ICE_FWLOG_LEVEL_VERBOSE: + return "Verbose"; + default: + return "Unsupported"; + } +} + +/** + * ice_fwlog_dump_cfg - Dump current FW logging configuration + * @hw: pointer to the HW structure + */ +static void ice_fwlog_dump_cfg(struct ice_hw *hw) +{ + struct device *dev = ice_pf_to_dev((struct ice_pf *)(hw->back)); + struct ice_fwlog_cfg *cfg; + enum ice_status status; + u16 i; + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return; + + status = ice_fwlog_get(hw, cfg); + if (status) { + kfree(cfg); + return; + } + + dev_info(dev, "FWLOG Configuration:\n"); + dev_info(dev, "Options: 0x%04x\n", cfg->options); + dev_info(dev, "\tarq_ena: %s\n", + (cfg->options & + ICE_FWLOG_OPTION_ARQ_ENA) ? "true" : "false"); + dev_info(dev, "\tuart_ena: %s\n", + (cfg->options & + ICE_FWLOG_OPTION_UART_ENA) ? "true" : "false"); + dev_info(dev, "\tPF registered: %s\n", + (cfg->options & + ICE_FWLOG_OPTION_IS_REGISTERED) ? 
"true" : "false"); + + dev_info(dev, "Module Entries:\n"); + for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) { + struct ice_fwlog_module_entry *entry = + &cfg->module_entries[i]; + + dev_info(dev, "\tModule ID %d (%s) Log Level %d (%s)\n", + entry->module_id, module_id_to_name(entry->module_id), + entry->log_level, log_level_to_name(entry->log_level)); + } + + kfree(cfg); +} + +/** + * ice_debugfs_command_write - write into command datum + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +ice_debugfs_command_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ice_pf *pf = filp->private_data; + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + char *cmd_buf, *cmd_buf_tmp; + ssize_t ret; + char **argv; + int argc; + + /* don't allow partial writes and writes when reset is in progress*/ + if (*ppos != 0 || ice_is_reset_in_progress(pf->state)) + return 0; + + cmd_buf = memdup_user(buf, count + 1); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + cmd_buf[count] = '\0'; + + cmd_buf_tmp = strchr(cmd_buf, '\n'); + if (cmd_buf_tmp) { + *cmd_buf_tmp = '\0'; + count = (size_t)cmd_buf_tmp - (size_t)cmd_buf + 1; + } + + argv = argv_split(GFP_KERNEL, cmd_buf, &argc); + if (!argv) { + ret = -ENOMEM; + goto err_copy_from_user; + } + + if (argc > 1 && !strncmp(argv[1], "vsi", 3)) { + if (argc == 3 && !strncmp(argv[0], "get", 3)) { + struct ice_vsi_ctx *vsi_ctx; + + vsi_ctx = devm_kzalloc(dev, sizeof(*vsi_ctx), + GFP_KERNEL); + if (!vsi_ctx) { + ret = -ENOMEM; + goto command_write_done; + } + ret = kstrtou16(argv[2], 0, &vsi_ctx->vsi_num); + if (ret) { + devm_kfree(dev, vsi_ctx); + goto command_help; + } + ret = ice_aq_get_vsi_params(hw, vsi_ctx, NULL); + if (ret) { + ret = -EINVAL; + devm_kfree(dev, vsi_ctx); + goto command_help; + } + + ice_vsi_dump_ctxt(dev, vsi_ctx); + devm_kfree(dev, vsi_ctx); + } else { + goto command_help; + } + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "switch", 6)) { + ret = ice_dump_sw_cfg(hw); + if (ret) { + ret = -EINVAL; + dev_err(dev, "dump switch failed\n"); + goto command_write_done; + } + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "capabilities", 11)) { + ice_dump_caps(hw); + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "fwlog_cfg", 9)) { + ice_fwlog_dump_cfg(&pf->hw); + } else if (argc == 4 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "ptp", 3) && + !strncmp(argv[2], "func", 4) && + !strncmp(argv[3], "capabilities", 11)) { + ice_dump_ptp_func_caps(hw); + } else if (argc == 4 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "ptp", 3) && + !strncmp(argv[2], "dev", 3) && + !strncmp(argv[3], "capabilities", 11)) { + ice_dump_ptp_dev_caps(hw); + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "ports", 5)) { + dev_info(dev, "port_info:\n"); + ice_dump_port_info(hw->port_info); +#ifdef ICE_ADD_PROBES + } else if (argc == 2 && !strncmp(argv[0], "dump", 4) && + !strncmp(argv[1], "arfs_stats", 10)) { + struct ice_vsi *vsi = ice_get_main_vsi(pf); + + if (!vsi) { + dev_err(dev, "Failed to find PF VSI\n"); + } else if (vsi->netdev->features & NETIF_F_NTUPLE) { + struct ice_arfs_active_fltr_cntrs *fltr_cntrs; + + fltr_cntrs = vsi->arfs_fltr_cntrs; + + /* active counters can be updated by multiple CPUs */ + smp_mb__before_atomic(); + dev_info(dev, "arfs_active_tcpv4_filters: 
%d\n", + atomic_read(&fltr_cntrs->active_tcpv4_cnt)); + dev_info(dev, "arfs_active_tcpv6_filters: %d\n", + atomic_read(&fltr_cntrs->active_tcpv6_cnt)); + dev_info(dev, "arfs_active_udpv4_filters: %d\n", + atomic_read(&fltr_cntrs->active_udpv4_cnt)); + dev_info(dev, "arfs_active_udpv6_filters: %d\n", + atomic_read(&fltr_cntrs->active_udpv6_cnt)); + } +#endif /* ICE_ADD_PROBES */ + } else if (argc == 2 && !strncmp(argv[0], "dump", 4)) { + if (!strncmp(argv[1], "mmcast", 6)) { + ice_dump_sw_rules(hw, ICE_SW_LKUP_MAC); + } else if (!strncmp(argv[1], "vlan", 4)) { + ice_dump_sw_rules(hw, ICE_SW_LKUP_VLAN); + } else if (!strncmp(argv[1], "eth", 3)) { + ice_dump_sw_rules(hw, ICE_SW_LKUP_ETHERTYPE); + } else if (!strncmp(argv[1], "pf_vsi", 6)) { + ice_dump_pf_vsi_list(pf); + } else if (!strncmp(argv[1], "pf_port_num", 11)) { + dev_info(dev, "pf_id = %d, port_num = %d\n", + hw->pf_id, hw->port_info->lport); + } else if (!strncmp(argv[1], "pf", 2)) { + ice_dump_pf(pf); + } else if (!strncmp(argv[1], "vfs", 3)) { + ice_dump_all_vfs(pf); + } else if (!strncmp(argv[1], "fdir_stats", 10)) { + ice_dump_pf_fdir(pf); + } else if (!strncmp(argv[1], "reset_stats", 11)) { + dev_info(dev, "core reset count: %d\n", + pf->corer_count); + dev_info(dev, "global reset count: %d\n", + pf->globr_count); + dev_info(dev, "emp reset count: %d\n", pf->empr_count); + dev_info(dev, "pf reset count: %d\n", pf->pfr_count); + } + +#ifdef CONFIG_DCB + } else if (argc == 3 && !strncmp(argv[0], "lldp", 4) && + !strncmp(argv[1], "get", 3)) { + u8 mibtype; + u16 llen, rlen; + u8 *buff; + + if (!strncmp(argv[2], "local", 5)) + mibtype = ICE_AQ_LLDP_MIB_LOCAL; + else if (!strncmp(argv[2], "remote", 6)) + mibtype = ICE_AQ_LLDP_MIB_REMOTE; + else + goto command_help; + + buff = devm_kzalloc(dev, ICE_LLDPDU_SIZE, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto command_write_done; + } + + ret = ice_aq_get_lldp_mib(hw, + ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID, + mibtype, (void *)buff, + ICE_LLDPDU_SIZE, + &llen, &rlen, NULL); + + if (!ret) { + if (mibtype == ICE_AQ_LLDP_MIB_LOCAL) { + dev_info(dev, "LLDP MIB (local)\n"); + print_hex_dump(KERN_INFO, "LLDP MIB (local): ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, llen, true); + } else if (mibtype == ICE_AQ_LLDP_MIB_REMOTE) { + dev_info(dev, "LLDP MIB (remote)\n"); + print_hex_dump(KERN_INFO, "LLDP MIB (remote): ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, rlen, true); + } + } else { + dev_err(dev, "GET LLDP MIB failed. 
Status: %ld\n", ret); + } + devm_kfree(dev, buff); +#endif /* CONFIG_DCB */ + } else if ((argc > 1) && !strncmp(argv[1], "scheduling", 10)) { + if (argc == 4 && !strncmp(argv[0], "get", 3) && + !strncmp(argv[2], "tree", 4) && + !strncmp(argv[3], "topology", 8)) { + ice_dump_port_topo(hw->port_info); + } + } else if (argc == 4 && !strncmp(argv[0], "set_ts_pll", 10)) { + u8 time_ref_freq; + u8 time_ref_sel; + u8 src_tmr_mode; + + ret = kstrtou8(argv[1], 0, &time_ref_freq); + if (ret) + goto command_help; + ret = kstrtou8(argv[2], 0, &time_ref_sel); + if (ret) + goto command_help; + ret = kstrtou8(argv[3], 0, &src_tmr_mode); + if (ret) + goto command_help; + + ice_cgu_cfg_ts_pll(pf, false, (enum ice_time_ref_freq)time_ref_freq, + (enum ice_cgu_time_ref_sel)time_ref_sel, + (enum ice_src_tmr_mode)src_tmr_mode); + ice_cgu_cfg_ts_pll(pf, true, (enum ice_time_ref_freq)time_ref_freq, + (enum ice_cgu_time_ref_sel)time_ref_sel, + (enum ice_src_tmr_mode)src_tmr_mode); + } else { +command_help: + dev_info(dev, "unknown or invalid command '%s'\n", cmd_buf); + dev_info(dev, "available commands\n"); + dev_info(dev, "\t get vsi \n"); + dev_info(dev, "\t dump switch\n"); + dev_info(dev, "\t dump ports\n"); + dev_info(dev, "\t dump capabilities\n"); + dev_info(dev, "\t dump fwlog_cfg\n"); + dev_info(dev, "\t dump ptp func capabilities\n"); + dev_info(dev, "\t dump ptp dev capabilities\n"); + dev_info(dev, "\t dump mmcast\n"); + dev_info(dev, "\t dump vlan\n"); + dev_info(dev, "\t dump eth\n"); + dev_info(dev, "\t dump pf_vsi\n"); + dev_info(dev, "\t dump pf\n"); + dev_info(dev, "\t dump pf_port_num\n"); + dev_info(dev, "\t dump vfs\n"); + dev_info(dev, "\t dump reset_stats\n"); + dev_info(dev, "\t dump fdir_stats\n"); + dev_info(dev, "\t get scheduling tree topology\n"); + dev_info(dev, "\t get scheduling tree topology portnum \n"); +#ifdef CONFIG_DCB + dev_info(dev, "\t lldp get local\n"); + dev_info(dev, "\t lldp get remote\n"); +#endif /* CONFIG_DCB */ +#ifdef ICE_ADD_PROBES + dev_info(dev, "\t dump arfs_stats\n"); +#endif /* ICE_ADD_PROBES */ + ret = -EINVAL; + goto command_write_done; + } + + /* if we get here, nothing went wrong; return bytes copied */ + ret = (ssize_t)count; + +command_write_done: + argv_free(argv); +err_copy_from_user: + kfree(cmd_buf); + return ret; +} + +static const struct file_operations ice_debugfs_command_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = ice_debugfs_command_write, +}; + +/** + * ice_debugfs_pf_init - setup the debugfs directory + * @pf: the ice that is starting up + */ +void ice_debugfs_pf_init(struct ice_pf *pf) +{ + const char *name = pci_name(pf->pdev); + struct dentry *pfile; + + pf->ice_debugfs_pf = debugfs_create_dir(name, ice_debugfs_root); + if (IS_ERR(pf->ice_debugfs_pf)) + return; + + pfile = debugfs_create_file("command", 0600, pf->ice_debugfs_pf, pf, + &ice_debugfs_command_fops); + if (!pfile) + goto create_failed; + + return; + +create_failed: + dev_err(ice_pf_to_dev(pf), "debugfs dir/file for %s failed\n", name); + debugfs_remove_recursive(pf->ice_debugfs_pf); +} + +/** + * ice_debugfs_pf_exit - clear out the ices debugfs entries + * @pf: the ice that is stopping + */ +void ice_debugfs_pf_exit(struct ice_pf *pf) +{ + debugfs_remove_recursive(pf->ice_debugfs_pf); + pf->ice_debugfs_pf = NULL; +} + +/** + * ice_debugfs_init - create root directory for debugfs entries + */ +void ice_debugfs_init(void) +{ + ice_debugfs_root = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (IS_ERR(ice_debugfs_root)) + pr_info("init of debugfs 
failed\n"); +} + +/** + * ice_debugfs_exit - remove debugfs entries + */ +void ice_debugfs_exit(void) +{ + debugfs_remove_recursive(ice_debugfs_root); + ice_debugfs_root = NULL; +} diff --git a/drivers/net/ethernet/intel/ice/ice_devids.h b/drivers/net/ethernet/intel/ice/ice_devids.h index f8d5c661d0baf8c115884a5b1e32178a72a21f66..a9c1d294def5e478a020d1f047af6a69b8405ab5 100644 --- a/drivers/net/ethernet/intel/ice/ice_devids.h +++ b/drivers/net/ethernet/intel/ice/ice_devids.h @@ -1,15 +1,60 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_DEVIDS_H_ #define _ICE_DEVIDS_H_ + /* Device IDs */ +/* Intel(R) Ethernet Connection E823-L for backplane */ +#define ICE_DEV_ID_E823L_BACKPLANE 0x124C +/* Intel(R) Ethernet Connection E823-L for SFP */ +#define ICE_DEV_ID_E823L_SFP 0x124D +/* Intel(R) Ethernet Connection E823-L/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E823L_10G_BASE_T 0x124E +/* Intel(R) Ethernet Connection E823-L 1GbE */ +#define ICE_DEV_ID_E823L_1GBE 0x124F +/* Intel(R) Ethernet Connection E823-L for QSFP */ +#define ICE_DEV_ID_E823L_QSFP 0x151D /* Intel(R) Ethernet Controller E810-C for backplane */ #define ICE_DEV_ID_E810C_BACKPLANE 0x1591 /* Intel(R) Ethernet Controller E810-C for QSFP */ #define ICE_DEV_ID_E810C_QSFP 0x1592 /* Intel(R) Ethernet Controller E810-C for SFP */ #define ICE_DEV_ID_E810C_SFP 0x1593 +/* Intel(R) Ethernet Controller E810-XXV for backplane */ +#define ICE_DEV_ID_E810_XXV_BACKPLANE 0x1599 +/* Intel(R) Ethernet Controller E810-XXV for QSFP */ +#define ICE_DEV_ID_E810_XXV_QSFP 0x159A +/* Intel(R) Ethernet Controller E810-XXV for SFP */ +#define ICE_DEV_ID_E810_XXV_SFP 0x159B +/* Intel(R) Ethernet Connection E823-C for backplane */ +#define ICE_DEV_ID_E823C_BACKPLANE 0x188A +/* Intel(R) Ethernet Connection E823-C for QSFP */ +#define ICE_DEV_ID_E823C_QSFP 0x188B +/* Intel(R) Ethernet Connection E823-C for SFP */ +#define ICE_DEV_ID_E823C_SFP 0x188C +/* Intel(R) Ethernet Connection E823-C/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E823C_10G_BASE_T 0x188D +/* Intel(R) Ethernet Connection E823-C 1GbE */ +#define ICE_DEV_ID_E823C_SGMII 0x188E +/* Intel(R) Ethernet Connection E822-C for backplane */ +#define ICE_DEV_ID_E822C_BACKPLANE 0x1890 +/* Intel(R) Ethernet Connection E822-C for QSFP */ +#define ICE_DEV_ID_E822C_QSFP 0x1891 +/* Intel(R) Ethernet Connection E822-C for SFP */ +#define ICE_DEV_ID_E822C_SFP 0x1892 +/* Intel(R) Ethernet Connection E822-C/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E822C_10G_BASE_T 0x1893 +/* Intel(R) Ethernet Connection E822-C 1GbE */ +#define ICE_DEV_ID_E822C_SGMII 0x1894 +/* Intel(R) Ethernet Connection E822-L for backplane */ +#define ICE_DEV_ID_E822L_BACKPLANE 0x1897 +/* Intel(R) Ethernet Connection E822-L for SFP */ +#define ICE_DEV_ID_E822L_SFP 0x1898 +/* Intel(R) Ethernet Connection E822-L/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E822L_10G_BASE_T 0x1899 +/* Intel(R) Ethernet Connection E822-L 1GbE */ +#define ICE_DEV_ID_E822L_SGMII 0x189A #endif /* _ICE_DEVIDS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c new file mode 100644 index 0000000000000000000000000000000000000000..8b7fcbc0a32dfff21773fd2ed122274248a3526c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -0,0 +1,1090 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice.h" +#include "ice_lib.h" +#include "ice_devlink.h" +#include "ice_eswitch.h" +#include "ice_fw_update.h" + +#ifdef HAVE_DEVLINK_INFO_GET +/* context for devlink info version reporting */ +struct ice_info_ctx { + char buf[128]; + struct ice_orom_info pending_orom; + struct ice_nvm_info pending_nvm; + struct ice_netlist_info pending_netlist; + struct ice_hw_dev_caps dev_caps; +}; + +/* + * The following functions are used to format specific strings for various + * devlink info versions. The ctx parameter is used to provide the storage + * buffer, as well as any ancillary information calculated when the info + * request was made. + * + * If a version does not exist, for example a "stored" version that does not + * exist because no update is pending, the function should leave the buffer in + * the ctx structure empty and return 0. + */ + +static void ice_info_get_dsn(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + u8 dsn[8]; + + /* Copy the DSN into an array in Big Endian format */ + put_unaligned_be64(pci_get_dsn(pf->pdev), dsn); + + snprintf(ctx->buf, sizeof(ctx->buf), "%8phD", dsn); +} + +static void ice_info_pba(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + enum ice_status status; + + status = ice_read_pba_string(hw, (u8 *)ctx->buf, sizeof(ctx->buf)); + if (status) + /* We failed to locate the PBA, so just skip this entry */ + dev_dbg(ice_pf_to_dev(pf), "Failed to read Product Board Assembly string, status %s\n", + ice_stat_str(status)); +} + +static void ice_info_fw_mgmt(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", hw->fw_maj_ver, hw->fw_min_ver, + hw->fw_patch); +} + +static void ice_info_fw_api(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u", hw->api_maj_ver, hw->api_min_ver); +} + +static void ice_info_fw_build(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", hw->fw_build); +} + +static void ice_info_fw_srev(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &pf->hw.flash.nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u", nvm->srev); +} + +static void ice_info_pending_fw_srev(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &ctx->pending_nvm; + + if (ctx->dev_caps.common_cap.nvm_update_pending_nvm) + snprintf(ctx->buf, sizeof(ctx->buf), "%u", nvm->srev); +} + +static void ice_info_orom_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_orom_info *orom = &pf->hw.flash.orom; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", orom->major, orom->build, orom->patch); +} + +static void ice_info_pending_orom_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_orom_info *orom = &ctx->pending_orom; + + if (ctx->dev_caps.common_cap.nvm_update_pending_orom) + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", + orom->major, orom->build, orom->patch); +} + +static void ice_info_orom_srev(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_orom_info *orom = &pf->hw.flash.orom; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u", orom->srev); +} + +static void ice_info_pending_orom_srev(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_orom_info *orom = &ctx->pending_orom; + + if (ctx->dev_caps.common_cap.nvm_update_pending_orom) + snprintf(ctx->buf, sizeof(ctx->buf), "%u", orom->srev); +} + +static void 
ice_info_nvm_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &pf->hw.flash.nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%02x", nvm->major, nvm->minor); +} + +static void ice_info_pending_nvm_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &ctx->pending_nvm; + + if (ctx->dev_caps.common_cap.nvm_update_pending_nvm) + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%02x", nvm->major, nvm->minor); +} + +static void ice_info_eetrack(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &pf->hw.flash.nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", nvm->eetrack); +} + +static void ice_info_pending_eetrack(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_nvm_info *nvm = &ctx->pending_nvm; + + if (ctx->dev_caps.common_cap.nvm_update_pending_nvm) + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", nvm->eetrack); +} + +static void ice_info_ddp_pkg_name(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_hw *hw = &pf->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "%s", hw->active_pkg_name); +} + +static void ice_info_ddp_pkg_version(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_pkg_ver *pkg = &pf->hw.active_pkg_ver; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u.%u", pkg->major, pkg->minor, pkg->update, + pkg->draft); +} + +static void ice_info_ddp_pkg_bundle_id(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", pf->hw.active_track_id); +} + +static void ice_info_netlist_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_netlist_info *netlist = &pf->hw.flash.netlist; + + /* The netlist versions are BCD formatted */ + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x-%x.%x.%x", netlist->major, netlist->minor, + netlist->type >> 16, netlist->type & 0xFFFF, netlist->rev, + netlist->cust_ver); +} + +static void ice_info_netlist_build(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_netlist_info *netlist = &pf->hw.flash.netlist; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", netlist->hash); +} + +static void ice_info_pending_netlist_ver(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_netlist_info *netlist = &ctx->pending_netlist; + + /* The netlist versions are BCD formatted */ + if (ctx->dev_caps.common_cap.nvm_update_pending_netlist) + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x-%x.%x.%x", + netlist->major, netlist->minor, + netlist->type >> 16, netlist->type & 0xFFFF, netlist->rev, + netlist->cust_ver); +} + +static void ice_info_pending_netlist_build(struct ice_pf *pf, struct ice_info_ctx *ctx) +{ + struct ice_netlist_info *netlist = &ctx->pending_netlist; + + if (ctx->dev_caps.common_cap.nvm_update_pending_netlist) + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", netlist->hash); +} + +#define fixed(key, getter) { ICE_VERSION_FIXED, key, getter } +#define running(key, getter) { ICE_VERSION_RUNNING, key, getter } +#define stored(key, getter) { ICE_VERSION_STORED, key, getter } + +enum ice_version_type { + ICE_VERSION_FIXED, + ICE_VERSION_RUNNING, + ICE_VERSION_STORED, +}; + +static const struct ice_devlink_version { + enum ice_version_type type; + const char *key; + void (*getter)(struct ice_pf *pf, struct ice_info_ctx *ctx); +} ice_devlink_versions[] = { + fixed(DEVLINK_INFO_VERSION_GENERIC_BOARD_ID, ice_info_pba), + running(DEVLINK_INFO_VERSION_GENERIC_FW_MGMT, ice_info_fw_mgmt), + running("fw.mgmt.api", ice_info_fw_api), + running("fw.mgmt.build", ice_info_fw_build), + 
running("fw.mgmt.srev", ice_info_fw_srev), + stored("fw.mgmt.srev", ice_info_pending_fw_srev), + running(DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, ice_info_orom_ver), + stored(DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, ice_info_pending_orom_ver), + running("fw.undi.srev", ice_info_orom_srev), + stored("fw.undi.srev", ice_info_pending_orom_srev), + running("fw.psid.api", ice_info_nvm_ver), + stored("fw.psid.api", ice_info_pending_nvm_ver), + running(DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, ice_info_eetrack), + stored(DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, ice_info_pending_eetrack), + running("fw.app.name", ice_info_ddp_pkg_name), + running(DEVLINK_INFO_VERSION_GENERIC_FW_APP, ice_info_ddp_pkg_version), + running("fw.app.bundle_id", ice_info_ddp_pkg_bundle_id), + running("fw.netlist", ice_info_netlist_ver), + stored("fw.netlist", ice_info_pending_netlist_ver), + running("fw.netlist.build", ice_info_netlist_build), + stored("fw.netlist.build", ice_info_pending_netlist_build), +}; + +/** + * ice_devlink_info_get - .info_get devlink handler + * @devlink: devlink instance structure + * @req: the devlink info request + * @extack: extended netdev ack structure + * + * Callback for the devlink .info_get operation. Reports information about the + * device. + * + * Return: zero on success or an error code on failure. + */ +static int ice_devlink_info_get(struct devlink *devlink, + struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + struct ice_info_ctx *ctx; + enum ice_status status; + size_t i; + int err; + + err = ice_wait_for_reset(pf, 10 * HZ); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Device is busy resetting"); + return err; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + /* discover capabilities first */ + status = ice_discover_dev_caps(hw, &ctx->dev_caps); + if (status) { + dev_dbg(dev, "Failed to discover device capabilities, status %s aq_err %s\n", + ice_stat_str(status), ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Unable to discover device capabilities"); + err = -EIO; + goto out_free_ctx; + } + + if (ctx->dev_caps.common_cap.nvm_update_pending_orom) { + status = ice_get_inactive_orom_ver(hw, &ctx->pending_orom); + if (status) { + dev_dbg(dev, "Unable to read inactive Option ROM version data, status %s aq_err %s\n", + ice_stat_str(status), ice_aq_str(hw->adminq.sq_last_status)); + + /* disable display of pending Option ROM */ + ctx->dev_caps.common_cap.nvm_update_pending_orom = false; + } + } + + if (ctx->dev_caps.common_cap.nvm_update_pending_nvm) { + status = ice_get_inactive_nvm_ver(hw, &ctx->pending_nvm); + if (status) { + dev_dbg(dev, "Unable to read inactive NVM version data, status %s aq_err %s\n", + ice_stat_str(status), ice_aq_str(hw->adminq.sq_last_status)); + + /* disable display of pending Option ROM */ + ctx->dev_caps.common_cap.nvm_update_pending_nvm = false; + } + } + + if (ctx->dev_caps.common_cap.nvm_update_pending_netlist) { + status = ice_get_inactive_netlist_ver(hw, &ctx->pending_netlist); + if (status) { + dev_dbg(dev, "Unable to read inactive Netlist version data, status %s aq_err %s\n", + ice_stat_str(status), ice_aq_str(hw->adminq.sq_last_status)); + + /* disable display of pending Option ROM */ + ctx->dev_caps.common_cap.nvm_update_pending_netlist = false; + } + } + + err = devlink_info_driver_name_put(req, KBUILD_MODNAME); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to 
set driver name"); + goto out_free_ctx; + } + + ice_info_get_dsn(pf, ctx); + + err = devlink_info_serial_number_put(req, ctx->buf); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to set serial number"); + goto out_free_ctx; + } + + for (i = 0; i < ARRAY_SIZE(ice_devlink_versions); i++) { + enum ice_version_type type = ice_devlink_versions[i].type; + const char *key = ice_devlink_versions[i].key; + + memset(ctx->buf, 0, sizeof(ctx->buf)); + + ice_devlink_versions[i].getter(pf, ctx); + + /* Do not report missing versions */ + if (ctx->buf[0] == '\0') + continue; + + switch (type) { + case ICE_VERSION_FIXED: + err = devlink_info_version_fixed_put(req, key, ctx->buf); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to set fixed version"); + goto out_free_ctx; + } + break; + case ICE_VERSION_RUNNING: + err = devlink_info_version_running_put(req, key, ctx->buf); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to set running version"); + goto out_free_ctx; + } + break; + case ICE_VERSION_STORED: + err = devlink_info_version_stored_put(req, key, ctx->buf); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to set stored version"); + goto out_free_ctx; + } + break; + } + } + +out_free_ctx: + kfree(ctx); + return err; +} +#endif /* HAVE_DEVLINK_INFO_GET */ + +#ifdef HAVE_DEVLINK_PARAMS +enum ice_devlink_param_id { + ICE_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, + ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV, + ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV, +}; + +/** + * ice_devlink_minsrev_get - Get the current minimum security revision + * @devlink: pointer to the devlink instance + * @id: the parameter ID to get + * @ctx: context to return the parameter value + * + * Returns: zero on success, or an error code on failure. + */ +static int +ice_devlink_minsrev_get(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_minsrev_info minsrevs = {}; + enum ice_status status; + + if (id != ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV && + id != ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV) + return -EINVAL; + + status = ice_get_nvm_minsrevs(&pf->hw, &minsrevs); + if (status) { + dev_warn(dev, "Failed to read minimum security revision data from flash\n"); + return -EIO; + } + + /* We report zero if the device has not yet had a valid minimum + * security revision programmed for the associated module. This makes + * sense because it is not possible to have a security revision of + * less than zero. Thus, all images will be able to load if the + * minimum security revision is zero, the same as the case where the + * minimum value is indicated as invalid. + */ + switch (id) { + case ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV: + if (minsrevs.nvm_valid) + ctx->val.vu32 = minsrevs.nvm; + else + ctx->val.vu32 = 0; + break; + case ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV: + if (minsrevs.orom_valid) + ctx->val.vu32 = minsrevs.orom; + else + ctx->val.vu32 = 0; + break; + } + + return 0; +} + +/** + * ice_devlink_minsrev_set - Set the minimum security revision + * @devlink: pointer to the devlink instance + * @id: the parameter ID to set + * @ctx: context to return the parameter value + * + * Set the minimum security revision value for fw.mgmt or fw.undi. The kernel + * calls the validate handler before calling this, so we do not need to + * duplicate those checks here. + * + * Returns: zero on success, or an error code on failure. 
+ */ +static int +ice_devlink_minsrev_set(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_minsrev_info minsrevs = {}; + enum ice_status status; + + switch (id) { + case ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV: + minsrevs.nvm_valid = true; + minsrevs.nvm = ctx->val.vu32; + break; + case ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV: + minsrevs.orom_valid = true; + minsrevs.orom = ctx->val.vu32; + break; + default: + return -EINVAL; + } + + status = ice_update_nvm_minsrevs(&pf->hw, &minsrevs); + if (status) { + dev_warn(dev, "Failed to update minimum security revision data\n"); + return -EIO; + } + + return 0; +} + +/** + * ice_devlink_minsrev_validate - Validate a minimum security revision update + * @devlink: unused pointer to devlink instance + * @id: the parameter ID to validate + * @val: value to validate + * @extack: netlink extended ACK structure + * + * Check that a proposed update to a minimum security revision field is valid. + * Each minimum security revision can only be increased, not decreased. + * Additionally, we verify that the value is never set higher than the + * security revision of the active flash component. + * + * Returns: zero if the value is valid, -ERANGE if it is out of range, and + * -EINVAL if this function is called with the wrong ID. + */ +static int +ice_devlink_minsrev_validate(struct devlink *devlink, u32 id, union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_minsrev_info minsrevs = {}; + enum ice_status status; + + if (id != ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV && + id != ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV) + return -EINVAL; + + status = ice_get_nvm_minsrevs(&pf->hw, &minsrevs); + if (status) { + NL_SET_ERR_MSG_MOD(extack, "Failed to read minimum security revision data from flash"); + return -EIO; + } + + switch (id) { + case ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV: + if (val.vu32 > pf->hw.flash.nvm.srev) { + NL_SET_ERR_MSG_MOD(extack, "Cannot update fw.mgmt minimum security revision higher than the currently running firmware"); + dev_dbg(dev, "Attempted to set fw.mgmt.minsrev to %u, but running firmware has srev %u\n", + val.vu32, pf->hw.flash.nvm.srev); + return -EPERM; + } + + if (minsrevs.nvm_valid && val.vu32 < minsrevs.nvm) { + NL_SET_ERR_MSG_MOD(extack, "Cannot lower the minimum security revision for fw.mgmt flash section"); + dev_dbg(dev, "Attempted to set fw.mgmt.minsrev to %u, but current minsrev is %u\n", + val.vu32, minsrevs.nvm); + return -EPERM; + } + break; + case ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV: + if (val.vu32 > pf->hw.flash.orom.srev) { + NL_SET_ERR_MSG_MOD(extack, "Cannot update fw.undi minimum security revision higher than the currently running firmware"); + dev_dbg(dev, "Attempted to set fw.undi.minsrev to %u, but running firmware has srev %u\n", + val.vu32, pf->hw.flash.orom.srev); + return -EPERM; + } + + if (minsrevs.orom_valid && val.vu32 < minsrevs.orom) { + NL_SET_ERR_MSG_MOD(extack, "Cannot lower the minimum security revision for fw.undi flash section"); + dev_dbg(dev, "Attempted to set fw.undi.minsrev to %u, but current minsrev is %u\n", + val.vu32, minsrevs.orom); + return -EPERM; + } + break; + } + + return 0; +} + +/* devlink parameters for the ice driver */ +static const struct devlink_param ice_devlink_params[] = { + DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_FW_MGMT_MINSREV, + "fw.mgmt.minsrev", 
+ DEVLINK_PARAM_TYPE_U32, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + ice_devlink_minsrev_get, + ice_devlink_minsrev_set, + ice_devlink_minsrev_validate), + DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_FW_UNDI_MINSREV, + "fw.undi.minsrev", + DEVLINK_PARAM_TYPE_U32, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + ice_devlink_minsrev_get, + ice_devlink_minsrev_set, + ice_devlink_minsrev_validate), +}; +#endif /* HAVE_DEVLINK_PARAMS */ + +#ifdef HAVE_DEVLINK_FLASH_UPDATE +/** + * ice_devlink_flash_update - Update firmware stored in flash on the device + * @devlink: pointer to devlink associated with device to update + * @params: flash update parameters + * @extack: netlink extended ACK structure + * + * Perform a device flash update. The bulk of the update logic is contained + * within the ice_flash_pldm_image function. + * + * Returns: zero on success, or an error code on failure. + */ +static int +ice_devlink_flash_update(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack) +{ + struct ice_pf *pf = devlink_priv(devlink); +#ifndef HAVE_DEVLINK_FLASH_UPDATE_PARAMS_FW + struct device *dev = &pf->pdev->dev; +#endif + struct ice_hw *hw = &pf->hw; +#ifndef HAVE_DEVLINK_FLASH_UPDATE_PARAMS_FW + const struct firmware *fw; +#endif + u8 preservation; + int err; + + if (!params->overwrite_mask) { + /* preserve all settings and identifiers */ + preservation = ICE_AQC_NVM_PRESERVE_ALL; + } else if (params->overwrite_mask == DEVLINK_FLASH_OVERWRITE_SETTINGS) { + /* overwrite settings, but preserve the vital device identifiers */ + preservation = ICE_AQC_NVM_PRESERVE_SELECTED; + } else if (params->overwrite_mask == (DEVLINK_FLASH_OVERWRITE_SETTINGS | + DEVLINK_FLASH_OVERWRITE_IDENTIFIERS)) { + /* overwrite both settings and identifiers, preserve nothing */ + preservation = ICE_AQC_NVM_NO_PRESERVATION; + } else { + NL_SET_ERR_MSG_MOD(extack, "Requested overwrite mask is not supported"); + return -EOPNOTSUPP; + } + + if (!hw->dev_caps.common_cap.nvm_unified_update) { + NL_SET_ERR_MSG_MOD(extack, "Current firmware does not support unified update"); + return -EOPNOTSUPP; + } + + err = ice_check_for_pending_update(pf, NULL, extack); + if (err) + return err; + + devlink_flash_update_status_notify(devlink, "Preparing to flash", NULL, 0, 0); + +#ifndef HAVE_DEVLINK_FLASH_UPDATE_PARAMS_FW + err = request_firmware(&fw, params->file_name, dev); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Unable to read file from disk"); + return err; + } + + err = ice_flash_pldm_image(pf, fw, preservation, extack); + + release_firmware(fw); + + return err; +#else + return ice_flash_pldm_image(pf, params->fw, preservation, extack); +#endif +} + +#ifdef HAVE_DEVLINK_FLASH_UPDATE_BEGIN_END_NOTIFY +static int +ice_devlink_flash_update_notify_compat(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack) +{ + int err; + + devlink_flash_update_begin_notify(devlink); + err = ice_devlink_flash_update(devlink, params, extack); + devlink_flash_update_end_notify(devlink); + + return err; +} +#endif + +#ifndef HAVE_DEVLINK_FLASH_UPDATE_PARAMS +static int +ice_devlink_flash_update_params_compat(struct devlink *devlink, const char *file_name, + const char *component, struct netlink_ext_ack *extack) +{ + struct devlink_flash_update_params params = {}; + + /* individual component update is not yet supported, and older kernels + * did not check this for us. 
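/*
 * Illustrative sketch, not part of the patch: the flash handler above maps
 * devlink's overwrite mask onto the firmware's NVM preservation levels. The
 * same three-way decision, distilled; the enum values are stand-ins for the
 * ICE_AQC_NVM_* constants, while the bit values match devlink's
 * DEVLINK_FLASH_OVERWRITE_SETTINGS/IDENTIFIERS (BIT(0)/BIT(1)).
 */
#include <stdio.h>

enum preservation { PRESERVE_ALL, PRESERVE_SELECTED, NO_PRESERVATION };

#define OVERWRITE_SETTINGS	0x1
#define OVERWRITE_IDENTIFIERS	0x2

static int overwrite_mask_to_preservation(unsigned int mask,
					   enum preservation *out)
{
	if (!mask)
		*out = PRESERVE_ALL;		/* keep settings and identifiers */
	else if (mask == OVERWRITE_SETTINGS)
		*out = PRESERVE_SELECTED;	/* keep only vital identifiers */
	else if (mask == (OVERWRITE_SETTINGS | OVERWRITE_IDENTIFIERS))
		*out = NO_PRESERVATION;		/* clean flash */
	else
		return -1;			/* unsupported combination */
	return 0;
}

int main(void)
{
	enum preservation p;

	if (!overwrite_mask_to_preservation(OVERWRITE_SETTINGS, &p))
		printf("preservation level = %d\n", p);
	return 0;
}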
+ */ + if (component) + return -EOPNOTSUPP; + + params.file_name = file_name; + +#ifdef HAVE_DEVLINK_FLASH_UPDATE_BEGIN_END_NOTIFY + return ice_devlink_flash_update_notify_compat(devlink, &params, extack); +#else + return ice_devlink_flash_update(devlink, &params, extack); +#endif +} +#endif /* !HAVE_DEVLINK_FLASH_UPDATE_PARAMS */ +#endif /* HAVE_DEVLINK_FLASH_UPDATE */ + +static const struct devlink_ops ice_devlink_ops = { +#ifdef HAVE_DEVLINK_FLASH_UPDATE_PARAMS + .supported_flash_update_params = DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK, +#endif /* HAVE_DEVLINK_FLASH_UPDATE_PARAMS */ + .eswitch_mode_get = ice_eswitch_mode_get, + .eswitch_mode_set = ice_eswitch_mode_set, +#ifdef HAVE_DEVLINK_INFO_GET + .info_get = ice_devlink_info_get, +#endif /* HAVE_DEVLINK_INFO_GET */ +#ifdef HAVE_DEVLINK_FLASH_UPDATE +#if !defined(HAVE_DEVLINK_FLASH_UPDATE_PARAMS) + .flash_update = ice_devlink_flash_update_params_compat, +#elif defined(HAVE_DEVLINK_FLASH_UPDATE_BEGIN_END_NOTIFY) + .flash_update = ice_devlink_flash_update_notify_compat, +#else + .flash_update = ice_devlink_flash_update, +#endif +#endif /* HAVE_DEVLINK_FLASH_UPDATE */ +}; + +static void ice_devlink_free(void *devlink_ptr) +{ + devlink_free((struct devlink *)devlink_ptr); +} + +/** + * ice_allocate_pf - Allocate devlink and return PF structure pointer + * @dev: the device to allocate for + * + * Allocate a devlink instance for this device and return the private area as + * the PF structure. The devlink memory is kept track of through devres by + * adding an action to remove it when unwinding. + */ +struct ice_pf *ice_allocate_pf(struct device *dev) +{ + struct devlink *devlink; + + devlink = devlink_alloc(&ice_devlink_ops, sizeof(struct ice_pf)); + if (!devlink) + return NULL; + + /* Add an action to teardown the devlink when unwinding the driver */ + if (devm_add_action(dev, ice_devlink_free, devlink)) { + devlink_free(devlink); + return NULL; + } + + return devlink_priv(devlink); +} + +/** + * ice_devlink_register - Register devlink interface for this PF + * @pf: the PF to register the devlink for. + * + * Register the devlink instance associated with this physical function. + * + * Return: zero on success or an error code on failure. + */ +int ice_devlink_register(struct ice_pf *pf) +{ + struct devlink *devlink = priv_to_devlink(pf); + struct device *dev = ice_pf_to_dev(pf); + int err; + + err = devlink_register(devlink, dev); + if (err) { + dev_err(dev, "devlink registration failed: %d\n", err); + return err; + } + +#ifdef HAVE_DEVLINK_PARAMS + err = devlink_params_register(devlink, ice_devlink_params, + ARRAY_SIZE(ice_devlink_params)); + if (err) { + dev_err(dev, "devlink params registration failed: %d\n", err); + return err; + } +#endif /* HAVE_DEVLINK_PARAMS */ + + return 0; +} + +/** + * ice_devlink_unregister - Unregister devlink resources for this PF. + * @pf: the PF structure to cleanup + * + * Releases resources used by devlink and cleans up associated memory. + */ +void ice_devlink_unregister(struct ice_pf *pf) +{ + struct devlink *devlink = priv_to_devlink(pf); + +#ifdef HAVE_DEVLINK_PARAMS + devlink_params_unregister(devlink, ice_devlink_params, + ARRAY_SIZE(ice_devlink_params)); +#endif /* HAVE_DEVLINK_PARAMS */ + devlink_unregister(devlink); +} + +/** + * ice_devlink_params_publish - Publish parameters to allow user access.
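/*
 * Illustrative note, not part of the patch: these helpers are intended to be
 * called in a fixed order from the driver's probe path. The call sites live
 * outside this hunk, so the exact sequence below is an assumption:
 *
 *	pf = ice_allocate_pf(dev);	// devlink_alloc() + devres teardown hook
 *	... hardware/VSI init ...
 *	err = ice_devlink_register(pf);	// devlink + parameter registration
 *	ice_devlink_init_regions(pf);	// nvm-flash and device-caps regions
 *	ice_devlink_params_publish(pf);	// expose params to user space
 *
 * Teardown mirrors this in reverse: unpublish, destroy regions, unregister.
 * devlink_free() itself is not called directly on remove; it runs through
 * the devres action installed by ice_allocate_pf().
 */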
+ * @pf: the PF structure pointer + */ +void ice_devlink_params_publish(struct ice_pf __maybe_unused *pf) +{ +#ifdef HAVE_DEVLINK_PARAMS + devlink_params_publish(priv_to_devlink(pf)); +#endif +} + +/** + * ice_devlink_params_unpublish - Unpublish parameters to prevent user access. + * @pf: the PF structure pointer + */ +void ice_devlink_params_unpublish(struct ice_pf __maybe_unused *pf) +{ +#ifdef HAVE_DEVLINK_PARAMS + devlink_params_unpublish(priv_to_devlink(pf)); +#endif +} + +/** + * ice_devlink_create_pf_port - Create a devlink port for this PF + * @pf: the PF to create a devlink port for + * + * Create and register a devlink_port for this PF. + * + * Return: zero on success or an error code on failure. + */ +int ice_devlink_create_pf_port(struct ice_pf *pf) +{ + struct devlink_port_attrs attrs = {}; + struct devlink_port *devlink_port; + struct devlink *devlink; + struct ice_vsi *vsi; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + + devlink_port = &pf->devlink_port; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return -EIO; + + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + attrs.phys.port_number = vsi->port_info->lport; + + devlink_port_attrs_set(devlink_port, &attrs); + devlink = priv_to_devlink(pf); + + err = devlink_port_register(devlink, devlink_port, vsi->idx); + if (err) { + dev_err(dev, "Failed to create devlink port for PF %d, error %d\n", + pf->hw.pf_id, err); + return err; + } + + return 0; +} + +/** + * ice_devlink_destroy_pf_port - Destroy the devlink_port for this PF + * @pf: the PF to cleanup + * + * Unregisters the devlink_port structure associated with this PF. + */ +void ice_devlink_destroy_pf_port(struct ice_pf *pf) +{ + struct devlink_port *devlink_port; + + devlink_port = &pf->devlink_port; + + devlink_port_type_clear(devlink_port); + devlink_port_unregister(devlink_port); +} + +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF +/** + * ice_devlink_create_vf_port - Create a devlink port for this VF + * @vf: the VF to create a port for + * + * Create and register a devlink_port for this VF. + * + * Return: zero on success or an error code on failure. + */ +int ice_devlink_create_vf_port(struct ice_vf *vf) +{ + struct devlink_port_attrs attrs = {}; + struct devlink_port *devlink_port; + struct devlink *devlink; + struct ice_vsi *vsi; + struct device *dev; + struct ice_pf *pf; + int err; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + vsi = ice_get_vf_vsi(vf); + devlink_port = &vf->devlink_port; + + attrs.flavour = DEVLINK_PORT_FLAVOUR_PCI_VF; + attrs.pci_vf.pf = pf->hw.bus.func; + attrs.pci_vf.vf = vf->vf_id; + + devlink_port_attrs_set(devlink_port, &attrs); + devlink = priv_to_devlink(pf); + + err = devlink_port_register(devlink, devlink_port, vsi->idx); + if (err) { + dev_err(dev, "Failed to create devlink port for VF %d, error %d\n", + vf->vf_id, err); + return err; + } + + return 0; +} + +/** + * ice_devlink_destroy_vf_port - Destroy the devlink_port for this VF + * @vf: the VF to cleanup + * + * Unregisters the devlink_port structure associated with this VF. 
+ */ +void ice_devlink_destroy_vf_port(struct ice_vf *vf) +{ + struct devlink_port *devlink_port; + + devlink_port = &vf->devlink_port; + + devlink_port_type_clear(devlink_port); + devlink_port_unregister(devlink_port); +} +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ + +#ifdef HAVE_DEVLINK_REGIONS +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS +/** + * ice_devlink_nvm_snapshot - Capture a snapshot of the Shadow RAM contents + * @devlink: the devlink instance + * @ops: the devlink region being snapshotted + * @extack: extended ACK response structure + * @data: on exit points to snapshot data buffer + * + * This function is called in response to the DEVLINK_CMD_REGION_TRIGGER for + * the shadow-ram devlink region. It captures a snapshot of the shadow ram + * contents. This snapshot can later be viewed via the devlink-region + * interface. + * + * @returns zero on success, and updates the data pointer. Returns a non-zero + * error code on failure. + */ +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS */ +static int +ice_devlink_nvm_snapshot(struct devlink *devlink, +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS + const struct devlink_region_ops __always_unused *ops, +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS */ + struct netlink_ext_ack *extack, u8 **data) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 *nvm_data; + u32 nvm_size; + + nvm_size = hw->flash.flash_size; + nvm_data = vzalloc(nvm_size); + if (!nvm_data) + return -ENOMEM; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) { + dev_dbg(dev, "ice_acquire_nvm failed, err %d aq_err %d\n", + status, hw->adminq.sq_last_status); + NL_SET_ERR_MSG_MOD(extack, "Failed to acquire NVM semaphore"); + vfree(nvm_data); + return -EIO; + } + + status = ice_read_flat_nvm(hw, 0, &nvm_size, nvm_data, false); + if (status) { + dev_dbg(dev, "ice_read_flat_nvm failed after reading %u bytes, err %d aq_err %d\n", + nvm_size, status, hw->adminq.sq_last_status); + NL_SET_ERR_MSG_MOD(extack, "Failed to read NVM contents"); + ice_release_nvm(hw); + vfree(nvm_data); + return -EIO; + } + + ice_release_nvm(hw); + + *data = nvm_data; + + return 0; +} + +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS +/** + * ice_devlink_devcaps_snapshot - Capture snapshot of device capabilities + * @devlink: the devlink instance + * @ops: the devlink region being snapshotted + * @extack: extended ACK response structure + * @data: on exit points to snapshot data buffer + * + * This function is called in response to the DEVLINK_CMD_REGION_TRIGGER for + * the device-caps devlink region. It captures a snapshot of the device + * capabilities reported by firmware. + * + * @returns zero on success, and updates the data pointer. Returns a non-zero + * error code on failure. 
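/*
 * Illustrative note, not part of the patch: with the regions below
 * registered (max_snapshots of 1 for nvm-flash and 10 for device-caps),
 * snapshots are driven entirely from user space with iproute2's devlink
 * tool, e.g. (PCI address assumed):
 *
 *	devlink region new pci/0000:3b:00.0/nvm-flash snapshot 1
 *	devlink region dump pci/0000:3b:00.0/nvm-flash snapshot 1
 *	devlink region del pci/0000:3b:00.0/nvm-flash snapshot 1
 *
 * The snapshot callbacks hand buffer ownership to devlink, which is why
 * both region ops supply vfree() as the destructor.
 */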
+ */ +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS */ +static int +ice_devlink_devcaps_snapshot(struct devlink *devlink, +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS + const struct devlink_region_ops __always_unused *ops, +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS */ + struct netlink_ext_ack *extack, u8 **data) +{ + struct ice_pf *pf = devlink_priv(devlink); + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + void *devcaps; + + devcaps = vzalloc(ICE_AQ_MAX_BUF_LEN); + if (!devcaps) + return -ENOMEM; + + status = ice_aq_list_caps(hw, devcaps, ICE_AQ_MAX_BUF_LEN, NULL, + ice_aqc_opc_list_dev_caps, NULL); + if (status) { + dev_dbg(dev, "ice_aq_list_caps: failed to read device capabilities, err %d aq_err %d\n", + status, hw->adminq.sq_last_status); + NL_SET_ERR_MSG_MOD(extack, "Failed to read device capabilities"); + vfree(devcaps); + return -EIO; + } + + *data = (u8 *)devcaps; + + return 0; +} +#endif /* HAVE_DEVLINK_REGION_OPS_SNAPSHOT */ + +static const struct devlink_region_ops ice_nvm_region_ops = { + .name = "nvm-flash", + .destructor = vfree, +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT + .snapshot = ice_devlink_nvm_snapshot, +#endif +}; + +static const struct devlink_region_ops ice_devcaps_region_ops = { + .name = "device-caps", + .destructor = vfree, +#ifdef HAVE_DEVLINK_REGION_OPS_SNAPSHOT + .snapshot = ice_devlink_devcaps_snapshot, +#endif +}; + +/** + * ice_devlink_init_regions - Initialize devlink regions + * @pf: the PF device structure + * + * Create devlink regions used to enable access to dump the contents of the + * flash memory on the device. + */ +void ice_devlink_init_regions(struct ice_pf *pf) +{ + struct devlink *devlink = priv_to_devlink(pf); + struct device *dev = ice_pf_to_dev(pf); + u64 nvm_size; + + nvm_size = pf->hw.flash.flash_size; + pf->nvm_region = devlink_region_create(devlink, &ice_nvm_region_ops, 1, + nvm_size); + if (IS_ERR(pf->nvm_region)) { + dev_err(dev, "failed to create NVM devlink region, err %ld\n", + PTR_ERR(pf->nvm_region)); + pf->nvm_region = NULL; + } + + pf->devcaps_region = devlink_region_create(devlink, + &ice_devcaps_region_ops, 10, + ICE_AQ_MAX_BUF_LEN); + if (IS_ERR(pf->devcaps_region)) { + dev_err(dev, "failed to create device-caps devlink region, err %ld\n", + PTR_ERR(pf->devcaps_region)); + pf->devcaps_region = NULL; + } +} + +/** + * ice_devlink_destroy_regions - Destroy devlink regions + * @pf: the PF device structure + * + * Remove previously created regions for this PF. + */ +void ice_devlink_destroy_regions(struct ice_pf *pf) +{ + if (pf->nvm_region) + devlink_region_destroy(pf->nvm_region); + + if (pf->devcaps_region) + devlink_region_destroy(pf->devcaps_region); +} +#endif /* HAVE_DEVLINK_REGIONS */ diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.h b/drivers/net/ethernet/intel/ice/ice_devlink.h new file mode 100644 index 0000000000000000000000000000000000000000..33464adf4ee66c02d52c4123b5af4d1e7cf0ce5b --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_devlink.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_DEVLINK_H_ +#define _ICE_DEVLINK_H_ + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +struct ice_pf *ice_allocate_pf(struct device *dev); + +int ice_devlink_register(struct ice_pf *pf); +void ice_devlink_unregister(struct ice_pf *pf); +void ice_devlink_params_publish(struct ice_pf *pf); +void ice_devlink_params_unpublish(struct ice_pf *pf); +int ice_devlink_create_pf_port(struct ice_pf *pf); +void ice_devlink_destroy_pf_port(struct ice_pf *pf); +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF +int ice_devlink_create_vf_port(struct ice_vf *vf); +void ice_devlink_destroy_vf_port(struct ice_vf *vf); +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#else /* CONFIG_NET_DEVLINK */ +static inline struct ice_pf *ice_allocate_pf(struct device *dev) +{ + return devm_kzalloc(dev, sizeof(struct ice_pf), GFP_KERNEL); +} + +static inline int ice_devlink_register(struct ice_pf *pf) { return 0; } +static inline void ice_devlink_unregister(struct ice_pf *pf) { } +static inline void ice_devlink_params_publish(struct ice_pf *pf) { } +static inline void ice_devlink_params_unpublish(struct ice_pf *pf) { } +static inline int ice_devlink_create_pf_port(struct ice_pf *pf) { return 0; } +static inline void ice_devlink_destroy_pf_port(struct ice_pf *pf) { } +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF +static inline int ice_devlink_create_vf_port(struct ice_vf *vf) { return 0; } +static inline void ice_devlink_destroy_vf_port(struct ice_vf *vf) { } +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* !CONFIG_NET_DEVLINK */ + +#if IS_ENABLED(CONFIG_NET_DEVLINK) && defined(HAVE_DEVLINK_REGIONS) +void ice_devlink_init_regions(struct ice_pf *pf); +void ice_devlink_destroy_regions(struct ice_pf *pf); +#else +static inline void ice_devlink_init_regions(struct ice_pf *pf) { } +static inline void ice_devlink_destroy_regions(struct ice_pf *pf) { } +#endif + +#endif /* _ICE_DEVLINK_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c new file mode 100644 index 0000000000000000000000000000000000000000..d520934b420ff7cf462776324a70103c9f254e29 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -0,0 +1,719 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#include "ice.h" +#include "ice_lib.h" +#include "ice_eswitch.h" +#include "ice_fltr.h" +#include "ice_repr.h" +#include "ice_devlink.h" +#include "ice_pf_vsi_vlan_ops.h" +#include "ice_tc_lib.h" + +/** + * ice_eswitch_setup_env - configure switchdev HW filters + * @pf: pointer to PF struct + * + * This function adds HW filters configuration specific for switchdev + * mode. 
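/*
 * Illustrative sketch, not part of the patch: ice_eswitch_setup_env() below
 * uses the kernel's goto-unwind idiom, where each failure label undoes
 * exactly the steps that already succeeded, in reverse order. The shape of
 * that ladder, reduced to a runnable toy with made-up step names:
 */
#include <stdio.h>

static int step(const char *name, int ok)
{
	printf("%s: %s\n", name, ok ? "ok" : "failed");
	return ok ? 0 : -1;
}

static void undo(const char *name)
{
	printf("undo %s\n", name);
}

static int setup_env_shape(int fail_at)
{
	if (step("default rx rule", fail_at != 1))
		goto err_def_rx;
	if (step("default tx rule", fail_at != 2))
		goto err_def_tx;
	if (step("uplink override", fail_at != 3))
		goto err_override_uplink;
	return 0;

err_override_uplink:
	undo("default tx rule");
err_def_tx:
	undo("default rx rule");
err_def_rx:
	return -1;
}

int main(void)
{
	return setup_env_shape(3) ? 1 : 0; /* unwinds tx rule, then rx rule */
}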
+ */ +static int ice_eswitch_setup_env(struct ice_pf *pf) +{ + struct ice_vsi *uplink_vsi = pf->switchdev.uplink_vsi; + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + struct ice_port_info *pi = pf->hw.port_info; + struct ice_vsi_vlan_ops *vlan_ops; + bool rule_added = false; + + vlan_ops = ice_get_compat_vsi_vlan_ops(ctrl_vsi); + if (vlan_ops->dis_stripping(ctrl_vsi) || + vlan_ops->dis_insertion(ctrl_vsi)) + return -ENODEV; + + ice_remove_vsi_fltr(&pf->hw, uplink_vsi->idx); + + if (ice_vsi_add_vlan_zero(uplink_vsi)) + goto err_def_rx; + + if (!ice_is_dflt_vsi_in_use(uplink_vsi->vsw)) { + if (ice_set_dflt_vsi(uplink_vsi->vsw, uplink_vsi)) + goto err_def_rx; + rule_added = true; + } + + if (ice_cfg_dflt_vsi(pi, ctrl_vsi->idx, true, ICE_FLTR_TX)) + goto err_def_tx; + + if (ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_set_allow_override)) + goto err_override_uplink; + + if (ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_set_allow_override)) + goto err_override_control; + + if (ice_fltr_update_flags_dflt_rule(ctrl_vsi, pi->dflt_tx_vsi_rule_id, + ICE_FLTR_TX, + ICE_SINGLE_ACT_LB_ENABLE)) + goto err_update_action; + + return 0; + +err_update_action: + ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_clear_allow_override); +err_override_control: + ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_clear_allow_override); +err_override_uplink: + ice_cfg_dflt_vsi(pi, ctrl_vsi->idx, false, ICE_FLTR_TX); +err_def_tx: + if (rule_added) + ice_clear_dflt_vsi(uplink_vsi->vsw); +err_def_rx: + ice_fltr_add_mac_and_broadcast(uplink_vsi, + uplink_vsi->port_info->mac.perm_addr, + ICE_FWD_TO_VSI); + return -ENODEV; +} + +/** + * ice_eswitch_release_env - clear switchdev HW filters + * @pf: pointer to PF struct + * + * This function removes HW filters configuration specific for switchdev + * mode and restores default legacy mode settings. + */ +static void +ice_eswitch_release_env(struct ice_pf *pf) +{ + struct ice_vsi *uplink_vsi = pf->switchdev.uplink_vsi; + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + struct ice_port_info *pi = pf->hw.port_info; + + ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_clear_allow_override); + ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_clear_allow_override); + ice_cfg_dflt_vsi(pi, ctrl_vsi->idx, false, ICE_FLTR_TX); + ice_clear_dflt_vsi(uplink_vsi->vsw); + ice_fltr_add_mac_and_broadcast(uplink_vsi, + uplink_vsi->port_info->mac.perm_addr, + ICE_FWD_TO_VSI); +} + +#ifdef HAVE_METADATA_PORT_INFO +/** + * ice_eswitch_remap_ring - reconfigure ring of switchdev ctrl VSI + * @ring: pointer to ring + * @q_vector: pointer to the q_vector connected with this ring + * @netdev: netdevice connected with this ring + */ +static void +ice_eswitch_remap_ring(struct ice_ring *ring, struct ice_q_vector *q_vector, + struct net_device *netdev) +{ + ring->q_vector = q_vector; + ring->next = NULL; + ring->netdev = netdev; +} + +/** + * ice_eswitch_remap_rings_to_vectors - reconfigure rings of switchdev ctrl VSI + * @pf: pointer to PF struct + * + * In switchdev mode, the numbers of allocated Tx and Rx rings are equal. + * + * This function fills the q_vector structures associated with each port + * representor and moves each ring pair to a port representor netdev. Each + * port representor gets one dedicated Tx/Rx ring pair, so the number of ring + * pairs equals the number of VFs.
+ */ +static void +ice_eswitch_remap_rings_to_vectors(struct ice_pf *pf) +{ + struct ice_vsi *vsi = pf->switchdev.control_vsi; + int q_id; + + ice_for_each_txq(vsi, q_id) { + struct ice_repr *repr = pf->vf[q_id].repr; + struct ice_q_vector *q_vector = repr->q_vector; + struct ice_ring *tx_ring = vsi->tx_rings[q_id]; + struct ice_ring *rx_ring = vsi->rx_rings[q_id]; + + q_vector->vsi = vsi; + q_vector->reg_idx = vsi->q_vectors[0]->reg_idx; + + q_vector->num_ring_tx = 1; + q_vector->tx.ring = tx_ring; + ice_eswitch_remap_ring(tx_ring, q_vector, repr->netdev); + /* In switchdev mode, from OS stack perspective, there is only + * one queue for given netdev, so it needs to be indexed as 0. + */ + tx_ring->q_index = 0; + + q_vector->num_ring_rx = 1; + q_vector->rx.ring = rx_ring; + ice_eswitch_remap_ring(rx_ring, q_vector, repr->netdev); + } +} + +/** + * ice_eswitch_setup_reprs - configure port reprs to run in switchdev mode + * @pf: pointer to PF struct + */ +static int ice_eswitch_setup_reprs(struct ice_pf *pf) +{ + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + int max_vsi_num = 0; + int i; + + ice_for_each_vf(pf, i) { + struct ice_vsi *vsi = pf->vf[i].repr->src_vsi; + struct ice_vf *vf = &pf->vf[i]; + + ice_remove_vsi_fltr(&pf->hw, vsi->idx); + vf->repr->dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, + GFP_KERNEL); + if (!vf->repr->dst) { + ice_fltr_add_mac_and_broadcast(vsi, + vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + goto err; + } + + if (ice_vsi_update_security(vsi, ice_vsi_ctx_clear_antispoof)) { + ice_fltr_add_mac_and_broadcast(vsi, + vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + metadata_dst_free(vf->repr->dst); + goto err; + } + + if (ice_vsi_add_vlan_zero(vsi)) { + ice_fltr_add_mac_and_broadcast(vsi, + vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + metadata_dst_free(vf->repr->dst); + ice_vsi_update_security(vsi, ice_vsi_ctx_set_antispoof); + goto err; + } + + if (max_vsi_num < vsi->vsi_num) + max_vsi_num = vsi->vsi_num; + + netif_napi_add(vf->repr->netdev, &vf->repr->q_vector->napi, ice_napi_poll, + NAPI_POLL_WEIGHT); + + netif_keep_dst(vf->repr->netdev); + } + + kfree(ctrl_vsi->target_netdevs); + + ctrl_vsi->target_netdevs = kcalloc(max_vsi_num + 1, + sizeof(*ctrl_vsi->target_netdevs), + GFP_KERNEL); + if (!ctrl_vsi->target_netdevs) + goto err; + + ice_for_each_vf(pf, i) { + struct ice_repr *repr = pf->vf[i].repr; + struct ice_vsi *vsi = repr->src_vsi; + struct metadata_dst *dst; + + ctrl_vsi->target_netdevs[vsi->vsi_num] = repr->netdev; + + dst = repr->dst; + dst->u.port_info.port_id = vsi->vsi_num; + dst->u.port_info.lower_dev = repr->netdev; + ice_repr_set_traffic_vsi(repr, ctrl_vsi); + } + + return 0; + +err: + for (i = i - 1; i >= 0; i--) { + struct ice_vsi *vsi = pf->vf[i].repr->src_vsi; + struct ice_vf *vf = &pf->vf[i]; + + ice_vsi_update_security(vsi, ice_vsi_ctx_set_antispoof); + metadata_dst_free(vf->repr->dst); + ice_fltr_add_mac_and_broadcast(vsi, vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + } + + return -ENODEV; +} + +/** + * ice_eswitch_release_reprs - clear PR VSIs configuration + * @pf: poiner to PF struct + * @ctrl_vsi: pointer to switchdev control VSI + */ +static void ice_eswitch_release_reprs(struct ice_pf *pf, + struct ice_vsi *ctrl_vsi) +{ + int i; + + kfree(ctrl_vsi->target_netdevs); + ice_for_each_vf(pf, i) { + struct ice_vsi *vsi = pf->vf[i].repr->src_vsi; + struct ice_vf *vf = &pf->vf[i]; + + ice_vsi_update_security(vsi, ice_vsi_ctx_set_antispoof); + metadata_dst_free(vf->repr->dst); + ice_fltr_add_mac_and_broadcast(vsi, vf->hw_lan_addr.addr, + 
ICE_FWD_TO_VSI); + + netif_napi_del(&vf->repr->q_vector->napi); + } +} + +/** + * ice_eswitch_update_repr - reconfigure VF port representor + * @vsi: VF VSI for which port representor is configured + */ +void ice_eswitch_update_repr(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + struct ice_repr *repr; + struct ice_vf *vf; + int ret; + + if (!ice_is_switchdev_running(pf)) + return; + + vf = &pf->vf[vsi->vf_id]; + repr = vf->repr; + repr->src_vsi = vsi; + repr->dst->u.port_info.port_id = vsi->vsi_num; + + ret = ice_vsi_update_security(vsi, ice_vsi_ctx_clear_antispoof); + if (ret) { + ice_fltr_add_mac_and_broadcast(vsi, vf->hw_lan_addr.addr, ICE_FWD_TO_VSI); + dev_err(ice_pf_to_dev(pf), "Failed to update VF %d port representor", vsi->vf_id); + return; + } +} + +/** + * ice_eswitch_port_start_xmit - callback for packets transmit + * @skb: send buffer + * @netdev: network interface device structure + * + * Returns NETDEV_TX_OK if sent, else an error code + */ +netdev_tx_t +ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct ice_netdev_priv *np; + struct ice_repr *repr; + struct ice_vsi *vsi; + + np = netdev_priv(netdev); + vsi = np->vsi; + + if (ice_is_reset_in_progress(vsi->back->state)) + return NETDEV_TX_BUSY; + + repr = ice_netdev_to_repr(netdev); + skb_dst_drop(skb); + dst_hold((struct dst_entry *)repr->dst); + skb_dst_set(skb, (struct dst_entry *)repr->dst); + skb->queue_mapping = repr->vf->vf_id; + + return ice_start_xmit(skb, netdev); +} + +/** + * ice_eswitch_set_target_vsi - set switchdev context in Tx context descriptor + * @skb: pointer to send buffer + * @off: pointer to offload struct + */ +void ice_eswitch_set_target_vsi(struct sk_buff *skb, + struct ice_tx_offload_params *off) +{ + struct metadata_dst *dst = skb_metadata_dst(skb); + u64 cd_cmd, dst_vsi; + + if (!dst) { + cd_cmd = ICE_TX_CTX_DESC_SWTCH_UPLINK << ICE_TXD_CTX_QW1_CMD_S; + off->cd_qw1 |= (cd_cmd | ICE_TX_DESC_DTYPE_CTX); + } else { + cd_cmd = ICE_TX_CTX_DESC_SWTCH_VSI << ICE_TXD_CTX_QW1_CMD_S; + dst_vsi = ((u64)dst->u.port_info.port_id << + ICE_TXD_CTX_QW1_VSI_S) & ICE_TXD_CTX_QW1_VSI_M; + off->cd_qw1 = cd_cmd | dst_vsi | ICE_TX_DESC_DTYPE_CTX; + } +} +#else +static void +ice_eswitch_release_reprs(struct ice_pf __always_unused *pf, + struct ice_vsi __always_unused *ctrl_vsi) +{ +} + +static void +ice_eswitch_remap_rings_to_vectors(struct ice_pf *pf) +{ +} + +static int ice_eswitch_setup_reprs(struct ice_pf __always_unused *pf) +{ + return -ENODEV; +} + +netdev_tx_t +ice_eswitch_port_start_xmit(struct sk_buff __always_unused *skb, + struct net_device __always_unused *netdev) +{ + return -EOPNOTSUPP; +} +#endif /* HAVE_METADATA_PORT_INFO */ + +/** + * ice_eswitch_vsi_setup - configure switchdev control VSI + * @pf: pointer to PF structure + * @pi: pointer to port_info structure + */ +static struct ice_vsi * +ice_eswitch_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi) +{ + return ice_vsi_setup(pf, pi, ICE_VSI_SWITCHDEV_CTRL, ICE_INVAL_VFID, NULL, 0); +} + + +/** + * ice_eswitch_napi_del - remove NAPI handle for all port representors + * @pf: pointer to PF structure + */ +static void ice_eswitch_napi_del(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) + netif_napi_del(&pf->vf[i].repr->q_vector->napi); +} + +/** + * ice_eswitch_napi_enable - enable NAPI for all port representors + * @pf: pointer to PF structure + */ +static void ice_eswitch_napi_enable(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) + napi_enable(&pf->vf[i].repr->q_vector->napi); 
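+	/* the NAPI instances enabled above were registered on the
+	 * representor netdevs by netif_napi_add() in
+	 * ice_eswitch_setup_reprs()
+	 */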
+} + +/** + * ice_eswitch_napi_disable - disable NAPI for all port representors + * @pf: pointer to PF structure + */ +static void ice_eswitch_napi_disable(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) + napi_disable(&pf->vf[i].repr->q_vector->napi); +} + +/** + * ice_eswitch_set_rxdid - configure rxdid on all rx queues from VSI + * @vsi: vsi to setup rxdid on + * @rxdid: flex descriptor id + */ +static void ice_eswitch_set_rxdid(struct ice_vsi *vsi, u32 rxdid) +{ + struct ice_hw *hw = &vsi->back->hw; + int i; + + ice_for_each_rxq(vsi, i) { + struct ice_ring *ring = vsi->rx_rings[i]; + u16 pf_q = vsi->rxq_map[ring->q_index]; + + ice_write_qrxflxp_cntxt(hw, pf_q, rxdid, 0x3, true); + } +} + +/** + * ice_eswitch_enable_switchdev - configure eswitch in switchdev mode + * @pf: pointer to PF structure + */ +static int +ice_eswitch_enable_switchdev(struct ice_pf *pf) +{ + struct ice_vsi *ctrl_vsi; + + pf->switchdev.control_vsi = ice_eswitch_vsi_setup(pf, pf->hw.port_info); + if (!pf->switchdev.control_vsi) + return -ENODEV; + + ctrl_vsi = pf->switchdev.control_vsi; + pf->switchdev.uplink_vsi = ice_get_main_vsi(pf); + if (!pf->switchdev.uplink_vsi) + goto err_vsi; + + if (ice_eswitch_setup_env(pf)) + goto err_vsi; + + if (ice_repr_add_for_all_vfs(pf)) + goto err_repr_add; + + if (ice_eswitch_setup_reprs(pf)) + goto err_setup_reprs; + + ice_eswitch_remap_rings_to_vectors(pf); + + if (ice_vsi_open(ctrl_vsi)) + goto err_setup_reprs; + + ice_eswitch_napi_enable(pf); + + ice_eswitch_set_rxdid(ctrl_vsi, ICE_RXDID_FLEX_NIC_2); + + return 0; + +err_setup_reprs: + ice_repr_rem_from_all_vfs(pf); +err_repr_add: + ice_eswitch_release_env(pf); +err_vsi: + ice_vsi_release(ctrl_vsi); + return -ENODEV; +} + +/** + * ice_eswitch_disable_switchdev - disable switchdev resources + * @pf: pointer to PF structure + */ +static void ice_eswitch_disable_switchdev(struct ice_pf *pf) +{ + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + + ice_eswitch_napi_disable(pf); + ice_eswitch_release_env(pf); + ice_vsi_release(ctrl_vsi); + ice_eswitch_release_reprs(pf, ctrl_vsi); + ice_repr_rem_from_all_vfs(pf); +} + +#ifdef HAVE_METADATA_PORT_INFO +#ifdef HAVE_DEVLINK_ESWITCH_OPS_EXTACK +/** + * ice_eswitch_mode_set - set new eswitch mode + * @devlink: pointer to devlink structure + * @mode: eswitch mode to switch to + * @extack: pointer to extack structure + */ +int ice_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) +#else +int ice_eswitch_mode_set(struct devlink *devlink, u16 mode) +#endif /* HAVE_DEVLINK_ESWITCH_OPS_EXTACK */ +{ + struct ice_pf *pf = devlink_priv(devlink); + + if (pf->eswitch_mode == mode) + return 0; + + if (pf->num_alloc_vfs) { + dev_info(ice_pf_to_dev(pf), + "Changing eswitch mode is allowed only if there is no VFs created"); + return -EOPNOTSUPP; + } + + switch (mode) { + case DEVLINK_ESWITCH_MODE_LEGACY: + dev_info(ice_pf_to_dev(pf), "PF %d changed eswitch mode to legacy", pf->hw.pf_id); + break; + case DEVLINK_ESWITCH_MODE_SWITCHDEV: + { +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + dev_err(ice_pf_to_dev(pf), "switchdev cannot be configured - ADQ is active. 
Delete ADQ configs using TC and try again\n"); + return -EOPNOTSUPP; + } +#endif /* NETIF_F_HW_TC */ + +#ifdef HAVE_NETDEV_SB_DEV + if (ice_is_offloaded_macvlan_ena(pf)) { + dev_err(ice_pf_to_dev(pf), "switchdev cannot be configured - L2 Forwarding Offload is currently enabled.\n"); + return -EOPNOTSUPP; + } +#endif /* HAVE_NETDEV_SB_DEV */ + + dev_info(ice_pf_to_dev(pf), + "PF %d changed eswitch mode to switchdev", pf->hw.pf_id); + break; + } + default: +#ifdef HAVE_DEVLINK_ESWITCH_OPS_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Unknown eswitch mode"); +#else + dev_err(ice_pf_to_dev(pf), "Unknown eswitch mode"); +#endif /* HAVE_DEVLINK_ESWITCH_OPS_EXTACK */ + return -EINVAL; + } + + pf->eswitch_mode = mode; + return 0; +} +#endif /* HAVE_METADATA_PORT_INFO */ + +/** + * ice_eswitch_get_target_netdev - return port representor netdev + * @rx_ring: pointer to rx ring + * @rx_desc: pointer to rx descriptor + * + * When working in switchdev mode context (when control vsi is used), this + * function returns netdev of appropriate port representor. For non-switchdev + * context, regular netdev associated with rx ring is returned. + */ +struct net_device * +ice_eswitch_get_target_netdev(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc) +{ + struct ice_32b_rx_flex_desc_nic_2 *desc; + struct ice_vsi *vsi = rx_ring->vsi; + struct ice_vsi *control_vsi; + u16 target_vsi_id; + + control_vsi = vsi->back->switchdev.control_vsi; + if (vsi != control_vsi) + return rx_ring->netdev; + + desc = (struct ice_32b_rx_flex_desc_nic_2 *)rx_desc; + target_vsi_id = le16_to_cpu(desc->src_vsi); + + return vsi->target_netdevs[target_vsi_id]; +} + +/** + * ice_eswitch_mode_get - get current eswitch mode + * @devlink: pointer to devlink structure + * @mode: output parameter for current eswitch mode + */ +int ice_eswitch_mode_get(struct devlink *devlink, u16 *mode) +{ + struct ice_pf *pf = devlink_priv(devlink); + + *mode = pf->eswitch_mode; + return 0; +} + +/** + * ice_is_eswitch_mode_switchdev - check if eswitch mode is set to switchdev + * @pf: pointer to PF structure + * + * Returns true if eswitch mode is set to DEVLINK_ESWITCH_MODE_SWITCHDEV, + * false otherwise. 
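+ *
+ * The mode itself is selected from user space through devlink, e.g.
+ * (the PCI address below is only illustrative):
+ *
+ *	devlink dev eswitch set pci/0000:03:00.0 mode switchdev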
+ */ +bool ice_is_eswitch_mode_switchdev(struct ice_pf *pf) +{ + return pf->eswitch_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV; +} + +/** + * ice_eswitch_release - cleanup eswitch + * @pf: pointer to PF structure + */ +void ice_eswitch_release(struct ice_pf *pf) +{ + if (pf->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY) + return; + + ice_eswitch_disable_switchdev(pf); + pf->switchdev.is_running = false; +} + +/** + * ice_eswitch_configure - configure eswitch + * @pf: pointer to PF structure + */ +int ice_eswitch_configure(struct ice_pf *pf) +{ + int status; + + if (pf->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY || pf->switchdev.is_running) + return 0; + + status = ice_eswitch_enable_switchdev(pf); + if (status) + return status; + + pf->switchdev.is_running = true; + return 0; +} + +/** + * ice_eswitch_start_all_tx_queues - start Tx queues of all port representors + * @pf: pointer to PF structure + */ +static void ice_eswitch_start_all_tx_queues(struct ice_pf *pf) +{ + struct ice_repr *repr; + int i; + + if (test_bit(ICE_DOWN, pf->state)) + return; + + ice_for_each_vf(pf, i) { + repr = pf->vf[i].repr; + if (repr) + ice_repr_start_tx_queues(repr); + } +} + +/** + * ice_eswitch_stop_all_tx_queues - stop Tx queues of all port representors + * @pf: pointer to PF structure + */ +void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf) +{ + struct ice_repr *repr; + int i; + + if (test_bit(ICE_DOWN, pf->state)) + return; + + ice_for_each_vf(pf, i) { + repr = pf->vf[i].repr; + if (repr) + ice_repr_stop_tx_queues(repr); + } +} + +/** + * ice_eswitch_rebuild - rebuild eswitch + * @pf: pointer to PF structure + */ +int ice_eswitch_rebuild(struct ice_pf *pf) +{ + struct ice_vsi *ctrl_vsi = pf->switchdev.control_vsi; + int status; + + ice_eswitch_napi_disable(pf); + ice_eswitch_napi_del(pf); + + status = ice_eswitch_setup_env(pf); + if (status) + return status; + + status = ice_eswitch_setup_reprs(pf); + if (status) + return status; + + ice_eswitch_remap_rings_to_vectors(pf); + +#ifdef HAVE_TC_SETUP_CLSFLOWER + ice_replay_tc_fltrs(pf); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + + status = ice_vsi_open(ctrl_vsi); + if (status) + return status; + + ice_eswitch_napi_enable(pf); + ice_eswitch_set_rxdid(ctrl_vsi, ICE_RXDID_FLEX_NIC_2); + ice_eswitch_start_all_tx_queues(pf); + + return 0; +} +#endif /* CONFIG_NET_DEVLINK */ diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.h b/drivers/net/ethernet/intel/ice/ice_eswitch.h new file mode 100644 index 0000000000000000000000000000000000000000..1f8a39493cb76f47cea1893349b4545324fe4f77 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
+ */
+
+#ifndef _ICE_ESWITCH_H_
+#define _ICE_ESWITCH_H_
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+#include <net/devlink.h>
+
+void ice_eswitch_release(struct ice_pf *pf);
+int ice_eswitch_configure(struct ice_pf *pf);
+int ice_eswitch_rebuild(struct ice_pf *pf);
+int ice_eswitch_mode_get(struct devlink *devlink, u16 *mode);
+void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf);
+
+struct net_device *
+ice_eswitch_get_target_netdev(struct ice_ring *rx_ring,
+			      union ice_32b_rx_flex_desc *rx_desc);
+#ifdef HAVE_METADATA_PORT_INFO
+void ice_eswitch_set_target_vsi(struct sk_buff *skb,
+				struct ice_tx_offload_params *off);
+void ice_eswitch_update_repr(struct ice_vsi *vsi);
+#else
+static inline
+void ice_eswitch_set_target_vsi(struct sk_buff *skb, struct ice_tx_offload_params *off) { }
+static inline void ice_eswitch_update_repr(struct ice_vsi *vsi) { }
+#endif /* HAVE_METADATA_PORT_INFO */
+netdev_tx_t
+ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+#ifdef HAVE_DEVLINK_ESWITCH_OPS_EXTACK
+#ifdef HAVE_METADATA_PORT_INFO
+int ice_eswitch_mode_set(struct devlink *devlink, u16 mode,
+			 struct netlink_ext_ack *extack);
+#else
+static inline int
+ice_eswitch_mode_set(struct devlink __always_unused *devlink,
+		     u16 __always_unused mode,
+		     struct netlink_ext_ack __always_unused *extack)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* HAVE_METADATA_PORT_INFO */
+#else
+#ifdef HAVE_METADATA_PORT_INFO
+int ice_eswitch_mode_set(struct devlink *devlink, u16 mode);
+#else
+static inline int ice_eswitch_mode_set(struct devlink __always_unused *devlink,
+				       u16 __always_unused mode)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* HAVE_METADATA_PORT_INFO */
+#endif /* HAVE_DEVLINK_ESWITCH_OPS_EXTACK */
+bool ice_is_eswitch_mode_switchdev(struct ice_pf *pf);
+#else /* !CONFIG_NET_DEVLINK */
+static inline void ice_eswitch_release(struct ice_pf *pf) { }
+static inline
+void ice_eswitch_set_target_vsi(struct sk_buff *skb, struct ice_tx_offload_params *off) { }
+static inline void ice_eswitch_update_repr(struct ice_vsi *vsi) { }
+static inline void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf) { }
+
+static inline int
+ice_eswitch_configure(struct ice_pf *pf)
+{
+	return 0;
+}
+
+static inline bool
+ice_is_eswitch_mode_switchdev(struct ice_pf __always_unused *pf)
+{
+	return false;
+}
+
+static inline int
+ice_eswitch_rebuild(struct ice_pf __always_unused *pf)
+{
+	return 0;
+}
+
+static inline netdev_tx_t
+ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	return 0;
+}
+
+static inline struct net_device *
+ice_eswitch_get_target_netdev(struct ice_ring *rx_ring,
+			      union ice_32b_rx_flex_desc *rx_desc)
+{
+	return NULL;
+}
+#endif /* CONFIG_NET_DEVLINK */
+#endif /* _ICE_ESWITCH_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index fc9ff985a62bd2ec74b27284f8f62040e609e420..9d9adef809ddcd505d816fa7e88ee461596d24f1 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -1,64 +1,111 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2018, Intel Corporation. */
+/* Copyright (C) 2018-2021, Intel Corporation.
*/ /* ethtool support for ice */ #include "ice.h" +#include "ice_ethtool.h" #include "ice_lib.h" +#include "ice_fltr.h" #include "ice_dcb_lib.h" - -struct ice_stats { - char stat_string[ETH_GSTRING_LEN]; - int sizeof_stat; - int stat_offset; -}; - -#define ICE_STAT(_type, _name, _stat) { \ - .stat_string = _name, \ - .sizeof_stat = FIELD_SIZEOF(_type, _stat), \ - .stat_offset = offsetof(_type, _stat) \ -} - -#define ICE_VSI_STAT(_name, _stat) \ - ICE_STAT(struct ice_vsi, _name, _stat) -#define ICE_PF_STAT(_name, _stat) \ - ICE_STAT(struct ice_pf, _name, _stat) +#include "ice_dcb_nl.h" static int ice_q_stats_len(struct net_device *netdev) { struct ice_netdev_priv *np = netdev_priv(netdev); - - return ((np->vsi->alloc_txq + np->vsi->alloc_rxq) * - (sizeof(struct ice_q_stats) / sizeof(u64))); + int stats_size, total_slen = 0; + +#ifdef ADQ_PERF_COUNTERS + /* Tx stats */ + stats_size = sizeof(struct ice_q_stats) + + sizeof(struct ice_ch_q_poll_stats) + + sizeof(struct ice_ch_tx_q_stats); + total_slen += np->vsi->alloc_txq * (stats_size / sizeof(u64)); + + /* Rx stats */ + stats_size = sizeof(struct ice_q_stats) + + sizeof(struct ice_ch_q_poll_stats) + + sizeof(struct ice_ch_rx_q_stats); + total_slen += np->vsi->alloc_rxq * (stats_size / sizeof(u64)); + + stats_size = sizeof(struct ice_q_vector_ch_stats); + total_slen += np->vsi->alloc_rxq * (stats_size / sizeof(u64)); +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + + stats_size = sizeof(struct ice_q_stats); + total_slen += (ICE_MAX_MACVLANS * 2) * (stats_size / sizeof(u64)); + /* the napi_poll_cnt isn't included in the MACVLAN stats so reduce + * the count by that many so the stats get printed correctly + */ + total_slen -= ICE_MAX_MACVLANS * 2; +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ +#else + stats_size = sizeof(struct ice_q_stats); + + total_slen += np->vsi->alloc_txq * (stats_size / sizeof(u64)); + total_slen += np->vsi->alloc_rxq * (stats_size / sizeof(u64)); +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + total_slen += (ICE_MAX_MACVLANS * 2) * (stats_size / sizeof(u64)); + /* the napi_poll_cnt isn't included in the MACVLAN stats so reduce + * the count by that many so the stats get printed correctly + */ + total_slen -= ICE_MAX_MACVLANS * 2; +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ +#endif /* ADQ_PERF_COUNTERS */ +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + stats_size = sizeof(struct ice_q_stats); + total_slen += np->vsi->num_xdp_txq * (stats_size / sizeof(u64)); + + stats_size = sizeof(struct ice_xdp_stats); + total_slen += np->vsi->alloc_rxq * (stats_size / sizeof(u64)); +#endif +#endif + + return total_slen; } #define ICE_PF_STATS_LEN ARRAY_SIZE(ice_gstrings_pf_stats) #define ICE_VSI_STATS_LEN ARRAY_SIZE(ice_gstrings_vsi_stats) #define ICE_PFC_STATS_LEN ( \ - (FIELD_SIZEOF(struct ice_pf, stats.priority_xoff_rx) + \ - FIELD_SIZEOF(struct ice_pf, stats.priority_xon_rx) + \ - FIELD_SIZEOF(struct ice_pf, stats.priority_xoff_tx) + \ - FIELD_SIZEOF(struct ice_pf, stats.priority_xon_tx)) \ + (sizeof_field(struct ice_pf, stats.priority_xoff_rx) + \ + sizeof_field(struct ice_pf, stats.priority_xon_rx) + \ + sizeof_field(struct ice_pf, stats.priority_xoff_tx) + \ + sizeof_field(struct ice_pf, stats.priority_xon_tx)) \ / sizeof(u64)) #define ICE_ALL_STATS_LEN(n) (ICE_PF_STATS_LEN + ICE_PFC_STATS_LEN + \ ICE_VSI_STATS_LEN + ice_q_stats_len(n)) static const struct ice_stats ice_gstrings_vsi_stats[] = { - ICE_VSI_STAT("rx_unicast", eth_stats.rx_unicast), - ICE_VSI_STAT("tx_unicast", 
eth_stats.tx_unicast), - ICE_VSI_STAT("rx_multicast", eth_stats.rx_multicast), - ICE_VSI_STAT("tx_multicast", eth_stats.tx_multicast), - ICE_VSI_STAT("rx_broadcast", eth_stats.rx_broadcast), - ICE_VSI_STAT("tx_broadcast", eth_stats.tx_broadcast), - ICE_VSI_STAT("rx_bytes", eth_stats.rx_bytes), - ICE_VSI_STAT("tx_bytes", eth_stats.tx_bytes), - ICE_VSI_STAT("rx_dropped", eth_stats.rx_discards), - ICE_VSI_STAT("rx_unknown_protocol", eth_stats.rx_unknown_protocol), - ICE_VSI_STAT("rx_alloc_fail", rx_buf_failed), - ICE_VSI_STAT("rx_pg_alloc_fail", rx_page_failed), - ICE_VSI_STAT("tx_errors", eth_stats.tx_errors), - ICE_VSI_STAT("tx_linearize", tx_linearize), + ICE_VSI_STAT(ICE_RX_UNICAST, eth_stats.rx_unicast), + ICE_VSI_STAT(ICE_TX_UNICAST, eth_stats.tx_unicast), + ICE_VSI_STAT(ICE_RX_MULTICAST, eth_stats.rx_multicast), + ICE_VSI_STAT(ICE_TX_MULTICAST, eth_stats.tx_multicast), + ICE_VSI_STAT(ICE_RX_BROADCAST, eth_stats.rx_broadcast), + ICE_VSI_STAT(ICE_TX_BROADCAST, eth_stats.tx_broadcast), + ICE_VSI_STAT(ICE_RX_BYTES, eth_stats.rx_bytes), + ICE_VSI_STAT(ICE_TX_BYTES, eth_stats.tx_bytes), + ICE_VSI_STAT(ICE_RX_DROPPED, eth_stats.rx_discards), + ICE_VSI_STAT(ICE_RX_UNKNOWN_PROTO, eth_stats.rx_unknown_protocol), + ICE_VSI_STAT(ICE_RX_ALLOC_FAIL, rx_buf_failed), + ICE_VSI_STAT(ICE_RX_PAGE_ALLOC_FAIL, rx_page_failed), +#ifdef ICE_ADD_PROBES + ICE_VSI_STAT(ICE_RX_PAGE_REUSE, rx_page_reuse), +#endif /* ICE_ADD_PROBES */ + ICE_VSI_STAT(ICE_TX_ERRORS, eth_stats.tx_errors), + ICE_VSI_STAT(ICE_TX_LINEARIZE, tx_linearize), + ICE_VSI_STAT(ICE_TX_BUSY, tx_busy), + ICE_VSI_STAT(ICE_TX_RESTART, tx_restart), +#ifdef ADQ_PERF_COUNTERS + ICE_VSI_STAT("chnl_trans_inline_fd", cnt_inline_fd_transition), + ICE_VSI_STAT("chnl_fd_table_flushed", cnt_table_flushed), + ICE_VSI_STAT("chnl_fd_table_full", cnt_tbl_full), +#endif /* ADQ_PERF_COUNTERS */ }; enum ice_ethtool_test_id { @@ -89,45 +136,77 @@ static const char ice_gstrings_test[][ETH_GSTRING_LEN] = { * is queried on the base PF netdev. 
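+ * They can be read from user space with 'ethtool -S' on the PF netdev
+ * (command shown for illustration).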
*/ static const struct ice_stats ice_gstrings_pf_stats[] = { - ICE_PF_STAT("rx_bytes.nic", stats.eth.rx_bytes), - ICE_PF_STAT("tx_bytes.nic", stats.eth.tx_bytes), - ICE_PF_STAT("rx_unicast.nic", stats.eth.rx_unicast), - ICE_PF_STAT("tx_unicast.nic", stats.eth.tx_unicast), - ICE_PF_STAT("rx_multicast.nic", stats.eth.rx_multicast), - ICE_PF_STAT("tx_multicast.nic", stats.eth.tx_multicast), - ICE_PF_STAT("rx_broadcast.nic", stats.eth.rx_broadcast), - ICE_PF_STAT("tx_broadcast.nic", stats.eth.tx_broadcast), - ICE_PF_STAT("tx_errors.nic", stats.eth.tx_errors), - ICE_PF_STAT("rx_size_64.nic", stats.rx_size_64), - ICE_PF_STAT("tx_size_64.nic", stats.tx_size_64), - ICE_PF_STAT("rx_size_127.nic", stats.rx_size_127), - ICE_PF_STAT("tx_size_127.nic", stats.tx_size_127), - ICE_PF_STAT("rx_size_255.nic", stats.rx_size_255), - ICE_PF_STAT("tx_size_255.nic", stats.tx_size_255), - ICE_PF_STAT("rx_size_511.nic", stats.rx_size_511), - ICE_PF_STAT("tx_size_511.nic", stats.tx_size_511), - ICE_PF_STAT("rx_size_1023.nic", stats.rx_size_1023), - ICE_PF_STAT("tx_size_1023.nic", stats.tx_size_1023), - ICE_PF_STAT("rx_size_1522.nic", stats.rx_size_1522), - ICE_PF_STAT("tx_size_1522.nic", stats.tx_size_1522), - ICE_PF_STAT("rx_size_big.nic", stats.rx_size_big), - ICE_PF_STAT("tx_size_big.nic", stats.tx_size_big), - ICE_PF_STAT("link_xon_rx.nic", stats.link_xon_rx), - ICE_PF_STAT("link_xon_tx.nic", stats.link_xon_tx), - ICE_PF_STAT("link_xoff_rx.nic", stats.link_xoff_rx), - ICE_PF_STAT("link_xoff_tx.nic", stats.link_xoff_tx), - ICE_PF_STAT("tx_dropped_link_down.nic", stats.tx_dropped_link_down), - ICE_PF_STAT("rx_undersize.nic", stats.rx_undersize), - ICE_PF_STAT("rx_fragments.nic", stats.rx_fragments), - ICE_PF_STAT("rx_oversize.nic", stats.rx_oversize), - ICE_PF_STAT("rx_jabber.nic", stats.rx_jabber), - ICE_PF_STAT("rx_csum_bad.nic", hw_csum_rx_error), - ICE_PF_STAT("rx_length_errors.nic", stats.rx_len_errors), - ICE_PF_STAT("rx_dropped.nic", stats.eth.rx_discards), - ICE_PF_STAT("rx_crc_errors.nic", stats.crc_errors), - ICE_PF_STAT("illegal_bytes.nic", stats.illegal_bytes), - ICE_PF_STAT("mac_local_faults.nic", stats.mac_local_faults), - ICE_PF_STAT("mac_remote_faults.nic", stats.mac_remote_faults), + ICE_PF_STAT(ICE_PORT_RX_BYTES, stats.eth.rx_bytes), + ICE_PF_STAT(ICE_PORT_TX_BYTES, stats.eth.tx_bytes), + ICE_PF_STAT(ICE_PORT_RX_UNICAST, stats.eth.rx_unicast), + ICE_PF_STAT(ICE_PORT_TX_UNICAST, stats.eth.tx_unicast), + ICE_PF_STAT(ICE_PORT_RX_MULTICAST, stats.eth.rx_multicast), + ICE_PF_STAT(ICE_PORT_TX_MULTICAST, stats.eth.tx_multicast), + ICE_PF_STAT(ICE_PORT_RX_BROADCAST, stats.eth.rx_broadcast), + ICE_PF_STAT(ICE_PORT_TX_BROADCAST, stats.eth.tx_broadcast), + ICE_PF_STAT(ICE_PORT_TX_ERRORS, stats.eth.tx_errors), + ICE_PF_STAT(ICE_PORT_TX_TIMEOUT, tx_timeout_count), + ICE_PF_STAT(ICE_PORT_RX_SIZE_64, stats.rx_size_64), + ICE_PF_STAT(ICE_PORT_TX_SIZE_64, stats.tx_size_64), + ICE_PF_STAT(ICE_PORT_RX_SIZE_127, stats.rx_size_127), + ICE_PF_STAT(ICE_PORT_TX_SIZE_127, stats.tx_size_127), + ICE_PF_STAT(ICE_PORT_RX_SIZE_255, stats.rx_size_255), + ICE_PF_STAT(ICE_PORT_TX_SIZE_255, stats.tx_size_255), + ICE_PF_STAT(ICE_PORT_RX_SIZE_511, stats.rx_size_511), + ICE_PF_STAT(ICE_PORT_TX_SIZE_511, stats.tx_size_511), + ICE_PF_STAT(ICE_PORT_RX_SIZE_1023, stats.rx_size_1023), + ICE_PF_STAT(ICE_PORT_TX_SIZE_1023, stats.tx_size_1023), + ICE_PF_STAT(ICE_PORT_RX_SIZE_1522, stats.rx_size_1522), + ICE_PF_STAT(ICE_PORT_TX_SIZE_1522, stats.tx_size_1522), + ICE_PF_STAT(ICE_PORT_RX_SIZE_JUMBO, stats.rx_size_big), + 
ICE_PF_STAT(ICE_PORT_TX_SIZE_JUMBO, stats.tx_size_big), + ICE_PF_STAT(ICE_PORT_RX_LINK_XON, stats.link_xon_rx), + ICE_PF_STAT(ICE_PORT_TX_LINK_XON, stats.link_xon_tx), + ICE_PF_STAT(ICE_PORT_RX_LINK_XOFF, stats.link_xoff_rx), + ICE_PF_STAT(ICE_PORT_TX_LINK_XOFF, stats.link_xoff_tx), + ICE_PF_STAT(ICE_PORT_TX_DROP_LINK_DOWN, stats.tx_dropped_link_down), + ICE_PF_STAT(ICE_PORT_RX_UNDERSIZE, stats.rx_undersize), + ICE_PF_STAT(ICE_PORT_RX_FRAGMENTS, stats.rx_fragments), + ICE_PF_STAT(ICE_PORT_RX_OVERSIZE, stats.rx_oversize), + ICE_PF_STAT(ICE_PORT_RX_JABBER, stats.rx_jabber), + ICE_PF_STAT(ICE_PORT_RX_CSUM_BAD, hw_csum_rx_error), + ICE_PF_STAT(ICE_PORT_RX_LEN_ERRORS, stats.rx_len_errors), + ICE_PF_STAT(ICE_PORT_RX_DROPPED, stats.eth.rx_discards), + ICE_PF_STAT(ICE_PORT_RX_CRC_ERRORS, stats.crc_errors), + ICE_PF_STAT(ICE_PORT_ILLEGAL_BYTES, stats.illegal_bytes), + ICE_PF_STAT(ICE_PORT_MAC_LOCAL_FAULTS, stats.mac_local_faults), + ICE_PF_STAT(ICE_PORT_MAC_REMOTE_FAULTS, stats.mac_remote_faults), +#ifdef ICE_ADD_PROBES + ICE_PF_STAT(ICE_PORT_TX_TCP_SEGMENTS, tcp_segs), + ICE_PF_STAT(ICE_PORT_TX_UDP_SEGMENTS, udp_segs), + ICE_PF_STAT(ICE_PORT_RX_TCP_CSO, rx_tcp_cso), + ICE_PF_STAT(ICE_PORT_TX_TCP_CSO, tx_tcp_cso), + ICE_PF_STAT(ICE_PORT_RX_UDP_CSO, rx_udp_cso), + ICE_PF_STAT(ICE_PORT_TX_UDP_CSO, tx_udp_cso), + ICE_PF_STAT(ICE_PORT_RX_SCTP_CSO, rx_sctp_cso), + ICE_PF_STAT(ICE_PORT_TX_SCTP_CSO, tx_sctp_cso), + ICE_PF_STAT(ICE_PORT_RX_IP4_CSO, rx_ip4_cso), + ICE_PF_STAT(ICE_PORT_TX_IP4_CSO, tx_ip4_cso), + ICE_PF_STAT(ICE_PORT_RX_IP4_CSO_ERROR, rx_ip4_cso_err), + ICE_PF_STAT(ICE_PORT_RX_TCP_CSO_ERROR, rx_tcp_cso_err), + ICE_PF_STAT(ICE_PORT_RX_UDP_CSO_ERROR, rx_udp_cso_err), + ICE_PF_STAT(ICE_PORT_RX_SCTP_CSO_ERROR, rx_sctp_cso_err), + ICE_PF_STAT(ICE_PORT_TX_L3_CSO_ERROR, tx_l3_cso_err), + ICE_PF_STAT(ICE_PORT_TX_L4_CSO_ERROR, tx_l4_cso_err), + ICE_PF_STAT(ICE_PORT_RX_Q_VLANO, rx_q_vlano), + ICE_PF_STAT(ICE_PORT_TX_Q_VLANO, tx_q_vlano), + ICE_PF_STAT(ICE_PORT_RX_AD_VLANO, rx_ad_vlano), + ICE_PF_STAT(ICE_PORT_TX_AD_VLANO, tx_ad_vlano), +#endif + ICE_PF_STAT(ICE_PORT_FDIR_SB_MATCH, stats.fd_sb_match), + ICE_PF_STAT(ICE_PORT_FDIR_SB_STATUS, stats.fd_sb_status), + ICE_PF_STAT("chnl_inline_fd_match", stats.ch_atr_match), +#ifdef ICE_ADD_PROBES + ICE_PF_STAT(ICE_PORT_ARFS_TCPV4_MATCH, stats.arfs_tcpv4_match), + ICE_PF_STAT(ICE_PORT_ARFS_TCPV6_MATCH, stats.arfs_tcpv6_match), + ICE_PF_STAT(ICE_PORT_ARFS_UDP4_MATCH, stats.arfs_udpv4_match), + ICE_PF_STAT(ICE_PORT_ARFS_UDP6_MATCH, stats.arfs_udpv6_match), +#endif /* ICE_ADD_PROBES */ }; static const u32 ice_regs_dump_list[] = { @@ -138,9 +217,6 @@ static const u32 ice_regs_dump_list[] = { QINT_RQCTL(0), PFINT_OICR_ENA, QRX_ITR(0), - PF0INT_ITR_0(0), - PF0INT_ITR_1(0), - PF0INT_ITR_2(0), }; struct ice_priv_flag { @@ -155,27 +231,98 @@ struct ice_priv_flag { static const struct ice_priv_flag ice_gstrings_priv_flags[] = { ICE_PRIV_FLAG("link-down-on-close", ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA), +#ifndef ETHTOOL_GFECPARAM + ICE_PRIV_FLAG("rs-fec", ICE_FLAG_RS_FEC), + ICE_PRIV_FLAG("base-r-fec", ICE_FLAG_BASE_R_FEC), +#endif /* !ETHTOOL_GFECPARAM */ ICE_PRIV_FLAG("fw-lldp-agent", ICE_FLAG_FW_LLDP_AGENT), +#ifdef NETIF_F_HW_TC + ICE_PRIV_FLAG("channel-inline-flow-director", + ICE_FLAG_CHNL_INLINE_FD_ENA), + ICE_PRIV_FLAG("channel-inline-fd-mark", + ICE_FLAG_CHNL_INLINE_FD_MARK_ENA), + ICE_PRIV_FLAG("channel-pkt-inspect-optimize", + ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA), + ICE_PRIV_FLAG("channel-pkt-clean-bp-stop", + ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA), + 
ICE_PRIV_FLAG("channel-pkt-clean-bp-stop-cfg", + ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_CFG), +#endif /* NETIF_F_HW_TC */ + ICE_PRIV_FLAG("vf-true-promisc-support", + ICE_FLAG_VF_TRUE_PROMISC_ENA), + ICE_PRIV_FLAG("mdd-auto-reset-vf", ICE_FLAG_MDD_AUTO_RESET_VF), + ICE_PRIV_FLAG("vf-vlan-prune-disable", ICE_FLAG_VF_VLAN_PRUNE_DIS), + ICE_PRIV_FLAG("legacy-rx", ICE_FLAG_LEGACY_RX), }; #define ICE_PRIV_FLAG_ARRAY_SIZE ARRAY_SIZE(ice_gstrings_priv_flags) static void -ice_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) +__ice_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo, + struct ice_vsi *vsi) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + struct ice_orom_info *orom; + struct ice_nvm_info *nvm; + + nvm = &hw->flash.nvm; + orom = &hw->flash.orom; + + strscpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); + strscpy(drvinfo->version, ice_drv_ver, sizeof(drvinfo->version)); + + /* Display NVM version (from which the firmware version can be + * determined) which contains more pertinent information. + */ + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%x.%02x 0x%x %d.%d.%d", nvm->major, nvm->minor, + nvm->eetrack, orom->major, orom->build, orom->patch); + + /* When called via 'ethtool -i|--driver ', log the above with + * additional Netlist version information as a kernel message since it + * will not all fit in the 32-byte fixed-length buffer. + */ + if (!strncmp(current->comm, "ethtool", 7)) { + struct ice_netlist_info *netlist = &hw->flash.netlist; + + /* The netlist versions are stored in packed BCD format */ + netdev_info(netdev, "NVM version details - %x.%02x, 0x%x, %x.%x.%x-%x.%x.%x.%08x, %d.%d.%d\n", + nvm->major, nvm->minor, nvm->eetrack, + netlist->major, netlist->minor, + netlist->type >> 16, netlist->type & 0xffff, + netlist->rev, netlist->cust_ver, netlist->hash, + orom->major, orom->build, orom->patch); + } - strlcpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); - strlcpy(drvinfo->version, ice_drv_ver, sizeof(drvinfo->version)); - strlcpy(drvinfo->fw_version, ice_nvm_version_str(&pf->hw), - sizeof(drvinfo->fw_version)); - strlcpy(drvinfo->bus_info, pci_name(pf->pdev), + strscpy(drvinfo->bus_info, pci_name(pf->pdev), sizeof(drvinfo->bus_info)); + + if (test_bit(ICE_RECOVERY_MODE, pf->state)) + return; + drvinfo->n_priv_flags = ICE_PRIV_FLAG_ARRAY_SIZE; } +static void +ice_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + + __ice_get_drvinfo(netdev, drvinfo, np->vsi); +} + +static void +ice_repr_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) +{ + struct ice_repr *repr = ice_netdev_to_repr(netdev); + + if (ice_check_vf_ready_for_cfg(repr->vf)) + return; + + __ice_get_drvinfo(netdev, drvinfo, repr->src_vsi); +} + static int ice_get_regs_len(struct net_device __always_unused *netdev) { return sizeof(ice_regs_dump_list); @@ -188,7 +335,7 @@ ice_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *p) struct ice_pf *pf = np->vsi->back; struct ice_hw *hw = &pf->hw; u32 *regs_buf = (u32 *)p; - int i; + unsigned int i; regs->version = 1; @@ -230,7 +377,8 @@ static int ice_get_eeprom_len(struct net_device *netdev) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_pf *pf = np->vsi->back; - return (int)(pf->hw.nvm.sr_words * sizeof(u16)); + /* Report the flash size, or at least 10MB */ 
+ return max_t(int, pf->hw.flash.flash_size, 10 * 1024 * 1024); } static int @@ -238,42 +386,121 @@ ice_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, u8 *bytes) { struct ice_netdev_priv *np = netdev_priv(netdev); - u16 first_word, last_word, nwords; struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; enum ice_status status; struct device *dev; int ret = 0; - u16 *buf; + u32 magic; + u8 *buf; + + dev = ice_pf_to_dev(pf); - dev = &pf->pdev->dev; + magic = hw->vendor_id | (hw->device_id << 16); + if (eeprom->magic && eeprom->magic != magic) { + struct ice_nvm_access_cmd *nvm; + union ice_nvm_access_data *data; + + nvm = (struct ice_nvm_access_cmd *)eeprom; + data = (union ice_nvm_access_data *)bytes; + + netdev_dbg(netdev, "GEEPROM config 0x%08x, offset 0x%08x, data_size 0x%08x\n", + nvm->config, nvm->offset, nvm->data_size); + + status = ice_handle_nvm_access(hw, nvm, data); + + ice_debug_array(hw, ICE_DBG_NVM, 16, 1, (u8 *)data, + nvm->data_size); + + if (status) { + int err = ice_status_to_errno(status); + + netdev_err(netdev, "NVM read offset 0x%x failed with status %s, error %d\n", + nvm->offset, ice_stat_str(status), err); + + return err; + } - eeprom->magic = hw->vendor_id | (hw->device_id << 16); + return 0; + } - first_word = eeprom->offset >> 1; - last_word = (eeprom->offset + eeprom->len - 1) >> 1; - nwords = last_word - first_word + 1; + eeprom->magic = magic; + netdev_dbg(netdev, "GEEPROM offset 0x%08x, len 0x%08x\n", + eeprom->offset, eeprom->len); - buf = devm_kcalloc(dev, nwords, sizeof(u16), GFP_KERNEL); + buf = kzalloc(eeprom->len, GFP_KERNEL); if (!buf) return -ENOMEM; - status = ice_read_sr_buf(hw, first_word, &nwords, buf); + status = ice_acquire_nvm(hw, ICE_RES_READ); if (status) { - dev_err(dev, "ice_read_sr_buf failed, err %d aq_err %d\n", - status, hw->adminq.sq_last_status); - eeprom->len = sizeof(u16) * nwords; + dev_err(dev, "ice_acquire_nvm failed: %s %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); ret = -EIO; goto out; } - memcpy(bytes, (u8 *)buf + (eeprom->offset & 1), eeprom->len); + status = ice_read_flat_nvm(hw, eeprom->offset, &eeprom->len, buf, + false); + if (status) { + dev_err(dev, "ice_read_flat_nvm failed: %s %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + ret = -EIO; + goto release; + } + + memcpy(bytes, buf, eeprom->len); +release: + ice_release_nvm(hw); out: - devm_kfree(dev, buf); + kfree(buf); return ret; } +static int +ice_set_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, + u8 *bytes) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_hw *hw = &np->vsi->back->hw; + struct ice_pf *pf = np->vsi->back; + union ice_nvm_access_data *data; + struct ice_nvm_access_cmd *nvm; + enum ice_status status = 0; + int err = 0; + u32 magic; + + /* normal ethtool set_eeprom is not supported */ + nvm = (struct ice_nvm_access_cmd *)eeprom; + data = (union ice_nvm_access_data *)bytes; + magic = hw->vendor_id | (hw->device_id << 16); + + netdev_dbg(netdev, "SEEPROM cmd 0x%08x, config 0x%08x, offset 0x%08x, data_size 0x%08x\n", + nvm->command, nvm->config, nvm->offset, nvm->data_size); + ice_debug_array(hw, ICE_DBG_NVM, 16, 1, (u8 *)data, nvm->data_size); + + if (eeprom->magic == magic) + err = -EOPNOTSUPP; + /* check for NVM access method */ + else if (!eeprom->magic || (eeprom->magic >> 16) != hw->device_id) + err = -EINVAL; + else if (ice_is_reset_in_progress(pf->state)) + err = -EBUSY; + else + status = 
ice_handle_nvm_access(hw, nvm, data); + + if (status) { + err = ice_status_to_errno(status); + netdev_err(netdev, "NVM write offset 0x%x failed with status %s, error %d\n", + nvm->offset, ice_stat_str(status), err); + } + + return err; +} + /** * ice_active_vfs - check if there are any active VFs * @pf: board private structure @@ -282,12 +509,15 @@ ice_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, */ static bool ice_active_vfs(struct ice_pf *pf) { - struct ice_vf *vf = pf->vf; - int i; + unsigned int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; - for (i = 0; i < pf->num_alloc_vfs; i++, vf++) if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) return true; + } + return false; } @@ -307,7 +537,8 @@ static u64 ice_link_test(struct net_device *netdev) netdev_info(netdev, "link test\n"); status = ice_get_link_status(np->vsi->port_info, &link_up); if (status) { - netdev_err(netdev, "link query error, status = %d\n", status); + netdev_err(netdev, "link query error, status = %s\n", + ice_stat_str(status)); return 1; } @@ -341,14 +572,16 @@ static u64 ice_eeprom_test(struct net_device *netdev) */ static int ice_reg_pattern_test(struct ice_hw *hw, u32 reg, u32 mask) { - struct ice_pf *pf = (struct ice_pf *)hw->back; static const u32 patterns[] = { 0x5A5A5A5A, 0xA5A5A5A5, 0x00000000, 0xFFFFFFFF }; + struct ice_pf *pf = hw->back; + struct device *dev; u32 val, orig_val; - int i; + unsigned int i; + dev = ice_pf_to_dev(pf); orig_val = rd32(hw, reg); for (i = 0; i < ARRAY_SIZE(patterns); ++i) { u32 pattern = patterns[i] & mask; @@ -357,8 +590,7 @@ static int ice_reg_pattern_test(struct ice_hw *hw, u32 reg, u32 mask) val = rd32(hw, reg); if (val == pattern) continue; - dev_err(&pf->pdev->dev, - "%s: reg pattern test failed - reg 0x%08x pat 0x%08x val 0x%08x\n" + dev_err(dev, "%s: reg pattern test failed - reg 0x%08x pat 0x%08x val 0x%08x\n" , __func__, reg, pattern, val); return 1; } @@ -366,8 +598,7 @@ static int ice_reg_pattern_test(struct ice_hw *hw, u32 reg, u32 mask) wr32(hw, reg, orig_val); val = rd32(hw, reg); if (val != orig_val) { - dev_err(&pf->pdev->dev, - "%s: reg restore test failed - reg 0x%08x orig 0x%08x val 0x%08x\n" + dev_err(dev, "%s: reg restore test failed - reg 0x%08x orig 0x%08x val 0x%08x\n" , __func__, reg, orig_val, val); return 1; } @@ -402,7 +633,7 @@ static u64 ice_reg_test(struct net_device *netdev) GLINT_ITR(2, 1) - GLINT_ITR(2, 0)}, {GLINT_CTL, 0xffff0001, 1, 0} }; - int i; + unsigned int i; netdev_dbg(netdev, "Register test\n"); for (i = 0; i < ARRAY_SIZE(ice_reg_list); ++i) { @@ -447,7 +678,7 @@ static int ice_lbtest_prepare_rings(struct ice_vsi *vsi) if (status) goto err_setup_rx_ring; - status = ice_vsi_start_rx_rings(vsi); + status = ice_vsi_start_all_rx_rings(vsi); if (status) goto err_start_rx_ring; @@ -479,7 +710,7 @@ static int ice_lbtest_disable_rings(struct ice_vsi *vsi) netdev_err(vsi->netdev, "Failed to stop Tx rings, VSI %d error %d\n", vsi->vsi_num, status); - status = ice_vsi_stop_rx_rings(vsi); + status = ice_vsi_stop_all_rx_rings(vsi); if (status) netdev_err(vsi->netdev, "Failed to stop Rx rings, VSI %d error %d\n", vsi->vsi_num, status); @@ -506,7 +737,7 @@ static int ice_lbtest_create_frame(struct ice_pf *pf, u8 **ret_data, u16 size) if (!pf) return -EINVAL; - data = devm_kzalloc(&pf->pdev->dev, size, GFP_KERNEL); + data = devm_kzalloc(ice_pf_to_dev(pf), size, GFP_KERNEL); if (!data) return -ENOMEM; @@ -623,7 +854,7 @@ static int ice_lbtest_receive_frames(struct ice_ring *rx_ring) continue; rx_buf = 
&rx_ring->rx_buf[i]; - received_buf = page_address(rx_buf->page); + received_buf = page_address(rx_buf->page) + rx_buf->page_offset; if (ice_lbtest_check_frame(received_buf)) valid_frames++; @@ -647,15 +878,16 @@ static u64 ice_loopback_test(struct net_device *netdev) struct ice_ring *tx_ring, *rx_ring; u8 broadcast[ETH_ALEN], ret = 0; int num_frames, valid_frames; - LIST_HEAD(tmp_list); + struct device *dev; u8 *tx_frame; int i; + dev = ice_pf_to_dev(pf); netdev_info(netdev, "loopback test\n"); test_vsi = ice_lb_vsi_setup(pf, pf->hw.port_info); if (!test_vsi) { - netdev_err(netdev, "Failed to create a VSI for the loopback test"); + netdev_err(netdev, "Failed to create a VSI for the loopback test\n"); return 1; } @@ -681,16 +913,11 @@ static u64 ice_loopback_test(struct net_device *netdev) /* Test VSI needs to receive broadcast packets */ eth_broadcast_addr(broadcast); - if (ice_add_mac_to_list(test_vsi, &tmp_list, broadcast)) { + if (ice_fltr_add_mac(test_vsi, broadcast, ICE_FWD_TO_VSI)) { ret = 5; goto lbtest_mac_dis; } - if (ice_add_mac(&pf->hw, &tmp_list)) { - ret = 6; - goto free_mac_list; - } - if (ice_lbtest_create_frame(pf, &tx_frame, ICE_LB_FRAME_SIZE)) { ret = 7; goto remove_mac_filters; @@ -711,12 +938,10 @@ static u64 ice_loopback_test(struct net_device *netdev) ret = 10; lbtest_free_frame: - devm_kfree(&pf->pdev->dev, tx_frame); + devm_kfree(dev, tx_frame); remove_mac_filters: - if (ice_remove_mac(&pf->hw, &tmp_list)) - netdev_err(netdev, "Could not remove MAC filter for the test VSI"); -free_mac_list: - ice_free_fltr_list(&pf->pdev->dev, &tmp_list); + if (ice_fltr_remove_mac(test_vsi, broadcast, ICE_FWD_TO_VSI)) + netdev_err(netdev, "Could not remove MAC filter for the test VSI\n"); lbtest_mac_dis: /* Disable MAC loopback after the test is completed. 
*/ if (ice_aq_set_mac_loopback(&pf->hw, false, NULL)) @@ -727,7 +952,7 @@ static u64 ice_loopback_test(struct net_device *netdev) lbtest_vsi_close: test_vsi->netdev = NULL; if (ice_vsi_release(test_vsi)) - netdev_err(netdev, "Failed to remove the test VSI"); + netdev_err(netdev, "Failed to remove the test VSI\n"); return ret; } @@ -773,22 +998,24 @@ ice_self_test(struct net_device *netdev, struct ethtool_test *eth_test, struct ice_netdev_priv *np = netdev_priv(netdev); bool if_running = netif_running(netdev); struct ice_pf *pf = np->vsi->back; + struct device *dev; + + dev = ice_pf_to_dev(pf); if (eth_test->flags == ETH_TEST_FL_OFFLINE) { netdev_info(netdev, "offline testing starting\n"); - set_bit(__ICE_TESTING, pf->state); + set_bit(ICE_TESTING, pf->state); - if (ice_active_vfs(pf)) { - dev_warn(&pf->pdev->dev, - "Please take active VFs and Netqueues offline and restart the adapter before running NIC diagnostics\n"); + if (ice_active_vfs(pf) || ice_active_vmdqs(pf)) { + dev_warn(dev, "Please take active VFs and Netqueues offline and restart the adapter before running NIC diagnostics\n"); data[ICE_ETH_TEST_REG] = 1; data[ICE_ETH_TEST_EEPROM] = 1; data[ICE_ETH_TEST_INTR] = 1; data[ICE_ETH_TEST_LOOP] = 1; data[ICE_ETH_TEST_LINK] = 1; eth_test->flags |= ETH_TEST_FL_FAILED; - clear_bit(__ICE_TESTING, pf->state); + clear_bit(ICE_TESTING, pf->state); goto skip_ol_tests; } /* If the device is online then take it offline */ @@ -809,14 +1036,13 @@ ice_self_test(struct net_device *netdev, struct ethtool_test *eth_test, data[ICE_ETH_TEST_REG]) eth_test->flags |= ETH_TEST_FL_FAILED; - clear_bit(__ICE_TESTING, pf->state); + clear_bit(ICE_TESTING, pf->state); if (if_running) { int status = ice_open(netdev); if (status) { - dev_err(&pf->pdev->dev, - "Could not open device %s, err %d", + dev_err(dev, "Could not open device %s, err %d\n", pf->int_name, status); } } @@ -839,1441 +1065,2806 @@ ice_self_test(struct net_device *netdev, struct ethtool_test *eth_test, netdev_info(netdev, "testing finished\n"); } -static void ice_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES +/** + * ice_get_xdp_rx_strings + * @q: queue index + * @num_rxq: number of rx queue + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Rx queue XDP related counters strings + */ +static void +ice_get_xdp_rx_strings(unsigned int q, u16 num_rxq, char **loc_in_buf) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - char *p = (char *)data; - unsigned int i; + char *p; - switch (stringset) { - case ETH_SS_STATS: - for (i = 0; i < ICE_VSI_STATS_LEN; i++) { - snprintf(p, ETH_GSTRING_LEN, "%s", - ice_gstrings_vsi_stats[i].stat_string); - p += ETH_GSTRING_LEN; - } + if (!loc_in_buf || q >= num_rxq) + return; - ice_for_each_alloc_txq(vsi, i) { - snprintf(p, ETH_GSTRING_LEN, - "tx_queue_%u_packets", i); - p += ETH_GSTRING_LEN; - snprintf(p, ETH_GSTRING_LEN, "tx_queue_%u_bytes", i); - p += ETH_GSTRING_LEN; - } + p = *loc_in_buf; + + snprintf(p, ETH_GSTRING_LEN, "xdp-rx_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx_q-%u_bytes", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-passed_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-dropped_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-tx_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-tx-fail_q-%u_pkts", q); + 
p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-unknown_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-redirected_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-rx-redir-fail_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + + /* copy back updated length */ + *loc_in_buf = p; +} - ice_for_each_alloc_rxq(vsi, i) { - snprintf(p, ETH_GSTRING_LEN, - "rx_queue_%u_packets", i); - p += ETH_GSTRING_LEN; - snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_bytes", i); - p += ETH_GSTRING_LEN; - } +/** + * ice_get_xdp_tx_strings + * @vsi: pointer to the VSI structure + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Tx queue XDP related counters strings + */ +static void +ice_get_xdp_tx_strings(struct ice_vsi *vsi, char **loc_in_buf) +{ + char *p; + u16 q; - if (vsi->type != ICE_VSI_PF) - return; + if (!vsi || !loc_in_buf) + return; - for (i = 0; i < ICE_PF_STATS_LEN; i++) { - snprintf(p, ETH_GSTRING_LEN, "%s", - ice_gstrings_pf_stats[i].stat_string); - p += ETH_GSTRING_LEN; - } + for (q = 0; q < vsi->num_xdp_txq; ++q) { + p = *loc_in_buf; - for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { - snprintf(p, ETH_GSTRING_LEN, - "tx_priority_%u_xon.nic", i); - p += ETH_GSTRING_LEN; - snprintf(p, ETH_GSTRING_LEN, - "tx_priority_%u_xoff.nic", i); - p += ETH_GSTRING_LEN; - } - for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { - snprintf(p, ETH_GSTRING_LEN, - "rx_priority_%u_xon.nic", i); - p += ETH_GSTRING_LEN; - snprintf(p, ETH_GSTRING_LEN, - "rx_priority_%u_xoff.nic", i); - p += ETH_GSTRING_LEN; - } - break; - case ETH_SS_TEST: - memcpy(data, ice_gstrings_test, ICE_TEST_LEN * ETH_GSTRING_LEN); - break; - case ETH_SS_PRIV_FLAGS: - for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) { - snprintf(p, ETH_GSTRING_LEN, "%s", - ice_gstrings_priv_flags[i].name); - p += ETH_GSTRING_LEN; - } - break; - default: - break; + snprintf(p, ETH_GSTRING_LEN, "xdp-tx_q-%u_pkts", q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "xdp-tx_q-%u_bytes", q); + p += ETH_GSTRING_LEN; + + /* copy back updated length */ + *loc_in_buf = p; } } -static int -ice_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) +/** + * ice_get_xdp_rx_stats - get stats for Rx rings if XDP is enabled + * @xdp_stats: ptr to stats being updated + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * + * This function reads XDP per-action Rx statistics. + */ +static void +ice_get_xdp_rx_stats(struct ice_xdp_stats *xdp_stats, u64 *data, int *idx) { - struct ice_netdev_priv *np = netdev_priv(netdev); - bool led_active; + bool set = !!xdp_stats; + int i; - switch (state) { - case ETHTOOL_ID_ACTIVE: - led_active = true; - break; - case ETHTOOL_ID_INACTIVE: - led_active = false; - break; - default: - return -EINVAL; - } + if (!idx) + return; - if (ice_aq_set_port_id_led(np->vsi->port_info, !led_active, NULL)) - return -EIO; + i = *idx; /* start index in data buffer */ - return 0; + data[i++] = set ? xdp_stats->xdp_rx_pkts : 0; + data[i++] = set ? xdp_stats->xdp_rx_bytes : 0; + data[i++] = set ? xdp_stats->xdp_pass : 0; + data[i++] = set ? xdp_stats->xdp_drop : 0; + data[i++] = set ? xdp_stats->xdp_tx : 0; + data[i++] = set ? xdp_stats->xdp_tx_fail : 0; + data[i++] = set ? xdp_stats->xdp_unknown : 0; + data[i++] = set ? xdp_stats->xdp_redirect : 0; + data[i++] = set ? 
xdp_stats->xdp_redirect_fail : 0; + + /* copy back updated index */ + *idx = i; } /** - * ice_set_fec_cfg - Set link FEC options - * @netdev: network interface device structure - * @req_fec: FEC mode to configure + * ice_get_xdp_tx_stats - get stats for XDP Tx rings if XDP is enabled + * @vsi: pointer to the VSI structure + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * + * This function reads XDP per-action Tx statistics. */ -static int ice_set_fec_cfg(struct net_device *netdev, enum ice_fec_mode req_fec) +static void +ice_get_xdp_tx_stats(struct ice_vsi *vsi, u64 *data, int *idx) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_aqc_set_phy_cfg_data config = { 0 }; - struct ice_aqc_get_phy_caps_data *caps; - struct ice_vsi *vsi = np->vsi; - u8 sw_cfg_caps, sw_cfg_fec; - struct ice_port_info *pi; - enum ice_status status; - int err = 0; - - pi = vsi->port_info; - if (!pi) - return -EOPNOTSUPP; + int i, q; - /* Changing the FEC parameters is not supported if not the PF VSI */ - if (vsi->type != ICE_VSI_PF) { - netdev_info(netdev, "Changing FEC parameters only supported for PF VSI\n"); - return -EOPNOTSUPP; - } + if (!idx || !vsi || !vsi->xdp_rings) + return; - /* Get last SW configuration */ - caps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*caps), GFP_KERNEL); - if (!caps) - return -ENOMEM; + i = *idx; /* start index in data buffer */ - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, - caps, NULL); - if (status) { - err = -EAGAIN; - goto done; - } + for (q = 0; q < vsi->num_xdp_txq; ++q) { + struct ice_q_stats *stats; + struct ice_ring *ring; - /* Copy SW configuration returned from PHY caps to PHY config */ - ice_copy_phy_caps_to_cfg(caps, &config); - sw_cfg_caps = caps->caps; - sw_cfg_fec = caps->link_fec_options; + ring = READ_ONCE(vsi->xdp_rings[q]); - /* Get toloplogy caps, then copy PHY FEC topoloy caps to PHY config */ - memset(caps, 0, sizeof(*caps)); + stats = !!ring ? &ring->stats : NULL; - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP, - caps, NULL); - if (status) { - err = -EAGAIN; - goto done; + data[i++] = !!ring ? stats->pkts : 0; + data[i++] = !!ring ? stats->bytes : 0; } - config.caps |= (caps->caps & ICE_AQC_PHY_EN_AUTO_FEC); - config.link_fec_opt = caps->link_fec_options; - - ice_cfg_phy_fec(&config, req_fec); - - /* If FEC mode has changed, then set PHY configuration and enable AN. 
*/ - if ((config.caps & ICE_AQ_PHY_ENA_AUTO_FEC) != - (sw_cfg_caps & ICE_AQC_PHY_EN_AUTO_FEC) || - config.link_fec_opt != sw_cfg_fec) { - if (caps->caps & ICE_AQC_PHY_AN_MODE) - config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + /* copy back updated index */ + *idx = i; +} +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ - status = ice_aq_set_phy_cfg(pi->hw, pi->lport, &config, NULL); +#ifdef ADQ_PERF_COUNTERS +/** + * ice_get_chnl_tx_strings + * @vsi: ptr to VSI + * @q: queue index + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Tx queue related strings for ADQ performance counters + */ +static void +ice_get_chnl_tx_strings(struct ice_vsi *vsi, unsigned int q, char **loc_in_buf) +{ + char *p; - if (status) - err = -EAGAIN; - } + if (!loc_in_buf) + return; + if (q >= vsi->num_txq) + return; -done: - devm_kfree(&vsi->back->pdev->dev, caps); - return err; + p = *loc_in_buf; + + /* Tx queue specific extra counters */ + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_BUSY_POLL, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_NOT_BUSY_POLL, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_ATR_SETUP, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_MARK_ATR_SETUP, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_ATR_TEARDOWN, q); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_ATR_BAIL, q); + p += ETH_GSTRING_LEN; + + /* copy back updated length */ + *loc_in_buf = p; } /** - * ice_set_fecparam - Set FEC link options - * @netdev: network interface device structure - * @fecparam: Ethtool structure to retrieve FEC parameters + * ice_get_chnl_tx_stats - get stats for Tx rings if channel enabled + * @vsi: ptr to VSI + * @q: queue index + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * @set: copy counters if true otherwise copy zero + * + * This function is used to collect performance counters for specific Tx ring. */ -static int -ice_set_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) +static void +ice_get_chnl_tx_stats(struct ice_vsi *vsi, int q, u64 *data, int *idx, bool set) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - enum ice_fec_mode fec; + struct ice_ch_q_stats *ch_stats; + struct ice_ring *tx_ring; + int i; - switch (fecparam->fec) { - case ETHTOOL_FEC_AUTO: - fec = ICE_FEC_AUTO; - break; - case ETHTOOL_FEC_RS: - fec = ICE_FEC_RS; - break; - case ETHTOOL_FEC_BASER: - fec = ICE_FEC_BASER; - break; - case ETHTOOL_FEC_OFF: - case ETHTOOL_FEC_NONE: - fec = ICE_FEC_NONE; - break; - default: - dev_warn(&vsi->back->pdev->dev, "Unsupported FEC mode: %d\n", - fecparam->fec); - return -EINVAL; - } + if (!idx) + return; + if (q >= vsi->num_txq) + return; - return ice_set_fec_cfg(netdev, fec); + tx_ring = vsi->tx_rings[q]; + ch_stats = &tx_ring->ch_q_stats; + set = set && ch_stats; + + i = *idx; /* start index in data buffer */ + + /* Tx queue specific extra counters */ + data[i++] = set ? ch_stats->poll.bp_packets : 0; + data[i++] = set ? ch_stats->poll.np_packets : 0; + data[i++] = set ? ch_stats->tx.num_atr_setup : 0; + data[i++] = set ? ch_stats->tx.num_mark_atr_setup : 0; + data[i++] = set ? ch_stats->tx.num_atr_evict : 0; + data[i++] = set ? 
ch_stats->tx.num_atr_bailouts : 0; + + /* copy back updated index */ + *idx = i; } /** - * ice_get_fecparam - Get link FEC options - * @netdev: network interface device structure - * @fecparam: Ethtool structure to retrieve FEC parameters + * ice_get_chnl_rx_stats - get stats for Rx rings if channel enabled + * @vsi: ptr to VSI + * @q: queue index + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * @set: copy counters if true otherwise copy zero + * + * This function is used to collect performance counters for specific Rx ring + * and related vector. All these counters are related to ADQ. */ -static int -ice_get_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) +static void +ice_get_chnl_rx_stats(struct ice_vsi *vsi, int q, u64 *data, int *idx, bool set) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_aqc_get_phy_caps_data *caps; - struct ice_link_status *link_info; - struct ice_vsi *vsi = np->vsi; - struct ice_port_info *pi; - enum ice_status status; - int err = 0; + struct ice_q_vector_ch_stats *vector_ch_stats = NULL; + struct ice_ch_q_stats *ch_stats; + struct ice_ring *rx_ring; + bool orig_set = set; + int i; - pi = vsi->port_info; + if (!idx) + return; + if (q >= vsi->num_rxq) + return; - if (!pi) - return -EOPNOTSUPP; - link_info = &pi->phy.link_info; + rx_ring = vsi->rx_rings[q]; + ch_stats = &rx_ring->ch_q_stats; + if (rx_ring->q_vector) + vector_ch_stats = &rx_ring->q_vector->ch_stats; + + i = *idx; /* start index in data buffer */ + + /* Rx queue specific extra counters */ + set = orig_set && ch_stats; + /* busy_poll and not busy_poll packets */ + data[i++] = set ? ch_stats->poll.bp_packets : 0; + data[i++] = set ? ch_stats->poll.np_packets : 0; + /* Rx queue set/bailout from override */ + data[i++] = set ? ch_stats->rx.num_rx_queue_set : 0; + data[i++] = set ? ch_stats->rx.num_rx_queue_bailouts : 0; + /* ctrl pkts, only ctrl_pkts,, FIN/RST/SYN */ + data[i++] = set ? ch_stats->rx.num_tcp_ctrl_pkts : 0; + data[i++] = set ? ch_stats->rx.num_only_ctrl_pkts : 0; + data[i++] = set ? ch_stats->rx.num_tcp_flags_fin : 0; + data[i++] = set ? ch_stats->rx.num_tcp_flags_rst : 0; + data[i++] = set ? ch_stats->rx.num_tcp_flags_syn : 0; + /* BP: no data packets cleaned */ + data[i++] = set ? ch_stats->rx.num_no_data_pkt_bp : 0; + + /* vector specific extra counters */ + set = orig_set && vector_ch_stats; + /* state machine */ + data[i++] = set ? vector_ch_stats->in_bp : 0; + data[i++] = set ? vector_ch_stats->real_int_to_bp : 0; + data[i++] = set ? vector_ch_stats->real_bp_to_bp : 0; + data[i++] = set ? vector_ch_stats->in_int : 0; + data[i++] = set ? vector_ch_stats->real_bp_to_int : 0; + data[i++] = set ? vector_ch_stats->real_int_to_int : 0; + /* unlikely_cb_to_bp, once_in_bp */ + data[i++] = set ? vector_ch_stats->unlikely_cb_to_bp : 0; + data[i++] = set ? vector_ch_stats->ucb_o_bp : 0; + data[i++] = set ? vector_ch_stats->once_bp_false : 0; + /* Busypoll stop due to either need_resched() or possible timeout */ + data[i++] = set ? vector_ch_stats->num_need_resched_bp_stop : 0; + data[i++] = set ? vector_ch_stats->num_timeout_bp_stop : 0; + /* Busypoll->Interrupt, last time "cleaned data packets" */ + data[i++] = set ? vector_ch_stats->cleaned_any_data_pkt : 0; + /* need_resched() and !cleaned data packets */ + data[i++] = set ? vector_ch_stats->num_l_c_data_pkt : 0; + /* possible timeout and !cleaned data packets */ + data[i++] = set ? 
+ /* software triggered interrupt either from napi_poll based
+ * on channel specific heuristic or from service_task
+ */
+ data[i++] = set ? vector_ch_stats->num_sw_intr_timeout : 0;
+ data[i++] = set ? vector_ch_stats->num_sw_intr_serv_task : 0;
+ /* times SW triggered interrupt was not fired */
+ data[i++] = set ? vector_ch_stats->num_no_sw_intr_opt_off : 0;
+ /* number of times WB_ON_ITR is set */
+ data[i++] = set ? vector_ch_stats->num_wb_on_itr_set : 0;
+
+ /* number of Rx packets processed when busy_poll_stop is invoked */
+ data[i++] = set ? vector_ch_stats->pkt_bp_stop_bp_budget : 0;
+
+ /* number of Rx packets processed when napi_schedule is invoked because
+ * busy_poll_stop:napi_poll returned budget
+ */
+ data[i++] = set ? vector_ch_stats->pkt_bp_stop_napi_budget : 0;

- /* Set FEC mode based on negotiated link info */
- switch (link_info->fec_info) {
- case ICE_AQ_LINK_25G_KR_FEC_EN:
- fecparam->active_fec = ETHTOOL_FEC_BASER;
- break;
- case ICE_AQ_LINK_25G_RS_528_FEC_EN:
- /* fall through */
- case ICE_AQ_LINK_25G_RS_544_FEC_EN:
- fecparam->active_fec = ETHTOOL_FEC_RS;
- break;
- default:
- fecparam->active_fec = ETHTOOL_FEC_OFF;
- break;
- }
+ /* num of times work_done == budget from busy_poll_stop code path */
+ data[i++] = set ? vector_ch_stats->bp_wd_equals_budget8 : 0;

- caps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*caps), GFP_KERNEL);
- if (!caps)
- return -ENOMEM;
+ /* num of times work_done == budget from napi_schedule which gets invoked
+ * if busy_poll_stop:napi_poll returned "budget"
+ */
+ data[i++] = set ? vector_ch_stats->bp_wd_equals_budget64 : 0;

- status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP,
- caps, NULL);
- if (status) {
- err = -EAGAIN;
- goto done;
- }
+ /* how many times internal state was kept in BUSY_POLL
+ * when napi_poll is invoked due to busy_poll_stop
+ */
+ data[i++] = set ? vector_ch_stats->keep_state_bp_budget8 : 0;

- /* Set supported/configured FEC modes based on PHY capability */
- if (caps->caps & ICE_AQC_PHY_EN_AUTO_FEC)
- fecparam->fec |= ETHTOOL_FEC_AUTO;
- if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN ||
- caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ ||
- caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN ||
- caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_REQ)
- fecparam->fec |= ETHTOOL_FEC_BASER;
- if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ ||
- caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ ||
- caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN)
- fecparam->fec |= ETHTOOL_FEC_RS;
- if (caps->link_fec_options == 0)
- fecparam->fec |= ETHTOOL_FEC_OFF;
+ /* how many times internal state was kept in BUSY_POLL
+ * when napi_poll is invoked due to napi_schedule.
+ */
+ data[i++] = set ? vector_ch_stats->keep_state_bp_budget64 : 0;

-done:
- devm_kfree(&vsi->back->pdev->dev, caps);
- return err;
+ /* copy back updated index */
+ *idx = i;
}

 /**
- * ice_get_priv_flags - report device private flags
- * @netdev: network interface device structure
+ * ice_get_chnl_rx_strings
+ * @vsi: ptr to VSI
+ * @q: queue index
+ * @loc_in_buf: ptr to ptr to location in buffer (input and output param)
 *
- * The get string set count and the string set should be matched for each
- * flag returned. Add new strings for each flag to the ice_gstrings_priv_flags
- * array.
- *
- * Returns a u32 bitmap of flags.
+ * This function returns Rx queue and vector related strings for
+ * ADQ performance counters
 */
-static u32 ice_get_priv_flags(struct net_device *netdev)
+static void
+ice_get_chnl_rx_strings(struct ice_vsi *vsi, unsigned int q, char **loc_in_buf)
 {
- struct ice_netdev_priv *np = netdev_priv(netdev);
- struct ice_vsi *vsi = np->vsi;
- struct ice_pf *pf = vsi->back;
- u32 i, ret_flags = 0;
+ char *p;

- for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) {
- const struct ice_priv_flag *priv_flag;
+ if (!loc_in_buf)
+ return;
+ if (q >= vsi->num_rxq)
+ return;

- priv_flag = &ice_gstrings_priv_flags[i];
+ p = *loc_in_buf;
+
+ /* Rx queue specific extra counters */
+
+ /* busy and non-busy poll packets */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BUSY_POLL, q);
+ p += ETH_GSTRING_LEN;
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_NOT_BUSY_POLL, q);
+ p += ETH_GSTRING_LEN;
+ /* number of times Rx queue was set thru' Rx queue override logic */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_SET, q);
+ p += ETH_GSTRING_LEN;
+ /* number of times Rx queue was not set thru' Rx queue override logic */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BAIL, q);
+ p += ETH_GSTRING_LEN;
+ /* total TCP ctrl pkts */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TCP_CTRL_PKTS, q);
+ p += ETH_GSTRING_LEN;
+ /* total "only ctrl pkts" */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_ONLY_CTRL_PKTS, q);
+ p += ETH_GSTRING_LEN;
+ /* number of FIN recv */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TCP_FIN_RECV, q);
+ p += ETH_GSTRING_LEN;
+ /* number of RST recv */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TCP_RST_RECV, q);
+ p += ETH_GSTRING_LEN;
+ /* number of SYN recv */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TCP_SYN_RECV, q);
+ p += ETH_GSTRING_LEN;
+ /* BP, but didn't clean any data packets */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_NO_DATA_PKT, q);
+ p += ETH_GSTRING_LEN;
+
+ /* Vector specific extra counters */
+
+ /* tracking BP, INT, BP->INT, INT->BP */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_IN_BP, q);
+ p += ETH_GSTRING_LEN;
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_INTR_TO_BP, q);
+ p += ETH_GSTRING_LEN;
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_TO_BP, q);
+ p += ETH_GSTRING_LEN;
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_IN_INTR, q);
+ p += ETH_GSTRING_LEN;
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_TO_INTR, q);
+ p += ETH_GSTRING_LEN;
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_INTR_TO_INTR, q);
+ p += ETH_GSTRING_LEN;
+ /* unlikely comeback to busy_poll */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_UNLIKELY_CB_TO_BP, q);
+ p += ETH_GSTRING_LEN;
+ /* unlikely comeback to busy_poll and once_in_bp is true */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_UCB_ONCE_IN_BP, q);
+ p += ETH_GSTRING_LEN;
+ /* once_in_bp is false */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_INTR_ONCE_IN_BP_FALSE, q);
+ p += ETH_GSTRING_LEN;
+ /* busy_poll stop due to need_resched() */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_STOP_NEED_RESCHED, q);
+ p += ETH_GSTRING_LEN;
+ /* busy_poll stop due to possible timeout */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_STOP_TIMEOUT, q);
+ p += ETH_GSTRING_LEN;
+ /* Transition: BP->INT: previously cleaned data packets */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_CLEANED_ANY_DATA_PKT, q);
+ p += ETH_GSTRING_LEN;
+ /* need_resched(), but didn't clean any data packets */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_NEED_RESCHED_NO_DATA, q);
+ p += ETH_GSTRING_LEN;
+ /* possible timeout(), but didn't clean any data packets */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_TIMEOUT_NO_DATA, q);
+ p += ETH_GSTRING_LEN;
+ /* number of SW triggered interrupt from napi_poll due to
+ * possible timeout detected
+ */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_SW_INTR_TIMEOUT, q);
+ p += ETH_GSTRING_LEN;
+ /* number of SW triggered interrupt from service_task */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_SW_INTR_SERV_TASK, q);
+ p += ETH_GSTRING_LEN;
+ /* number of times SW triggered interrupt is not triggered from
+ * napi_poll even when unlikely_cb_to_bp is set, once_in_bp is set
+ * but ethtool private feature flag is off (for interrupt optimization
+ * strategy)
+ */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_NO_SW_INTR_OPT_OFF, q);
+ p += ETH_GSTRING_LEN;
+ /* number of times WB_ON_ITR is set */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_WB_ON_ITR_SET, q);
+ p += ETH_GSTRING_LEN;
+
+ /* number of Rx packets processed due to busy_poll_stop */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_PKTS_BP_STOP_BUDGET8, q);
+ p += ETH_GSTRING_LEN;
+
+ /* number of Rx packets processed due to napi_schedule which gets invoked
+ * if busy_poll_stop returned budget
+ */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_PKTS_BP_STOP_BUDGET64, q);
+ p += ETH_GSTRING_LEN;

- if (test_bit(priv_flag->bitno, pf->flags))
- ret_flags |= BIT(i);
- }
+ /* num of times work_done == budget condition met from
+ * busy_poll_stop:napi_poll code path
+ */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_WD_EQUAL_BUDGET8, q);
+ p += ETH_GSTRING_LEN;

- return ret_flags;
+ /* num of times work_done == budget condition met from
+ * napi_schedule:napi_poll code path (this happens if busy_poll_stop
+ * returned "budget")
+ */
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BP_WD_EQUAL_BUDGET64, q);
+ p += ETH_GSTRING_LEN;
+
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_KEEP_STATE_BP_BUDGET8, q);
+ p += ETH_GSTRING_LEN;
+ snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_KEEP_STATE_BP_BUDGET64, q);
+ p += ETH_GSTRING_LEN;
+
+ /* copy back updated length */
+ *loc_in_buf = p;
}
+#endif /* ADQ_PERF_COUNTERS */
+#ifdef ICE_ADD_PROBES
+#ifdef HAVE_NETDEV_SB_DEV
 /**
- * ice_set_priv_flags - set private flags
- * @netdev: network interface device structure
- * @flags: bit flags to be set
+ * ice_get_macvlan
+ * @id: macvlan ID
+ * @pf: pointer to the PF structure
+ *
+ * Returns the MACVLAN matching the provided ID
 */
-static int ice_set_priv_flags(struct net_device *netdev, u32 flags)
+static struct ice_macvlan *ice_get_macvlan(int id, struct ice_pf *pf)
 {
- struct ice_netdev_priv *np = netdev_priv(netdev);
- DECLARE_BITMAP(change_flags, ICE_PF_FLAGS_NBITS);
- DECLARE_BITMAP(orig_flags, ICE_PF_FLAGS_NBITS);
- struct ice_vsi *vsi = np->vsi;
- struct ice_pf *pf = vsi->back;
- int ret = 0;
- u32 i;
+ struct ice_macvlan *mv;

- if (flags > BIT(ICE_PRIV_FLAG_ARRAY_SIZE))
- return -EINVAL;
+ /* If the ID is not marked as in use, no need to search */
+ if (!(test_bit(id, pf->avail_macvlan)))
+ return NULL;

- set_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags);
+ list_for_each_entry(mv, &pf->macvlan_list, list) {
+ if (id == mv->id)
+ return mv;
+ }

- bitmap_copy(orig_flags, pf->flags, ICE_PF_FLAGS_NBITS);
- for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) {
- const struct ice_priv_flag *priv_flag;
+ return NULL;
+}

- priv_flag = &ice_gstrings_priv_flags[i];
+/**
+ * ice_get_macvlan_tx_strings
+ * @pf: pointer to the PF structure
+ * @loc_in_buf: ptr to ptr to location in buffer (input and output param)
+ *
+ * This function returns Tx related strings for MACVLAN offload
+ */
+static void ice_get_macvlan_tx_strings(struct ice_pf *pf, char **loc_in_buf)
+{
+ char *p;
+ int i;

- if (flags & BIT(i))
- set_bit(priv_flag->bitno, pf->flags);
- else
- clear_bit(priv_flag->bitno, pf->flags);
- }
+ if (!loc_in_buf)
+ return; + p = *loc_in_buf; - bitmap_xor(change_flags, pf->flags, orig_flags, ICE_PF_FLAGS_NBITS); + for (i = 0; i < ICE_MAX_MACVLANS; i++) { + struct ice_macvlan *mv = ice_get_macvlan(i, pf); - if (test_bit(ICE_FLAG_FW_LLDP_AGENT, change_flags)) { - if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) { - enum ice_status status; + if (mv) { + snprintf(p, ETH_GSTRING_LEN, L2_FWD_TX_PKTS1, + mv->vdev->name); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, L2_FWD_TX_BYTES1, + mv->vdev->name); + p += ETH_GSTRING_LEN; + } else { + snprintf(p, ETH_GSTRING_LEN, L2_FWD_TX_PKTS2, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, L2_FWD_TX_BYTES2, i); + p += ETH_GSTRING_LEN; + } + } - /* Disable FW LLDP engine */ - status = ice_cfg_lldp_mib_change(&pf->hw, false); + /* copy back updated length */ + *loc_in_buf = p; +} - /* If unregistering for LLDP events fails, this is - * not an error state, as there shouldn't be any - * events to respond to. - */ - if (status) - dev_info(&pf->pdev->dev, - "Failed to unreg for LLDP events\n"); +/** + * ice_get_macvlan_tx_stats + * @pf: pointer to the PF structure + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * + * This function is used to collect Tx statistics for MACVLAN offload + */ +static void ice_get_macvlan_tx_stats(struct ice_pf *pf, u64 *data, int *idx) +{ + int i, j; - /* The AQ call to stop the FW LLDP agent will generate - * an error if the agent is already stopped. - */ - status = ice_aq_stop_lldp(&pf->hw, true, true, NULL); - if (status) - dev_warn(&pf->pdev->dev, - "Fail to stop LLDP agent\n"); - /* Use case for having the FW LLDP agent stopped - * will likely not need DCB, so failure to init is - * not a concern of ethtool - */ - status = ice_init_pf_dcb(pf, true); - if (status) - dev_warn(&pf->pdev->dev, "Fail to init DCB\n"); + if (!idx) + return; + j = *idx; - /* Forward LLDP packets to default VSI so that they - * are passed up the stack - */ - ice_cfg_sw_lldp(vsi, false, true); - } else { - enum ice_status status; - bool dcbx_agent_status; + for (i = 0; i < ICE_MAX_MACVLANS; i++) { + struct ice_macvlan *mv; - /* AQ command to start FW LLDP agent will return an - * error if the agent is already started - */ - status = ice_aq_start_lldp(&pf->hw, true, NULL); - if (status) - dev_warn(&pf->pdev->dev, - "Fail to start LLDP Agent\n"); + mv = ice_get_macvlan(i, pf); + if (mv) { + data[j++] = mv->vsi->net_stats.tx_packets; + data[j++] = mv->vsi->net_stats.tx_bytes; + } else { + data[j++] = 0; + data[j++] = 0; + } + } - /* AQ command to start FW DCBX agent will fail if - * the agent is already started - */ - status = ice_aq_start_stop_dcbx(&pf->hw, true, - &dcbx_agent_status, - NULL); - if (status) - dev_dbg(&pf->pdev->dev, - "Failed to start FW DCBX\n"); + /* copy back updated index */ + *idx = j; +} - dev_info(&pf->pdev->dev, "FW DCBX agent is %s\n", - dcbx_agent_status ? "ACTIVE" : "DISABLED"); +/** + * ice_get_macvlan_rx_strings + * @pf: pointer to the PF structure + * @loc_in_buf: ptr to ptr to location in buffer (input and output param) + * + * This function returns Rx related strings for MACVLAN offload + */ +static void ice_get_macvlan_rx_strings(struct ice_pf *pf, char **loc_in_buf) +{ + char *p; + int i; - /* Failure to configure MIB change or init DCB is not - * relevant to ethtool. 
Print notification that - * registration/init failed but do not return error - * state to ethtool - */ - status = ice_init_pf_dcb(pf, true); - if (status) - dev_dbg(&pf->pdev->dev, "Fail to init DCB\n"); + if (!loc_in_buf) + return; + p = *loc_in_buf; - /* Remove rule to direct LLDP packets to default VSI. - * The FW LLDP engine will now be consuming them. - */ - ice_cfg_sw_lldp(vsi, false, false); + for (i = 0; i < ICE_MAX_MACVLANS; i++) { + struct ice_macvlan *mv = ice_get_macvlan(i, pf); - /* Register for MIB change events */ - status = ice_cfg_lldp_mib_change(&pf->hw, true); - if (status) - dev_dbg(&pf->pdev->dev, - "Fail to enable MIB change events\n"); + if (mv) { + snprintf(p, ETH_GSTRING_LEN, L2_FWD_RX_PKTS1, + mv->vdev->name); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, L2_FWD_RX_BYTES1, + mv->vdev->name); + p += ETH_GSTRING_LEN; + } else { + snprintf(p, ETH_GSTRING_LEN, L2_FWD_RX_PKTS2, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, L2_FWD_RX_BYTES2, i); + p += ETH_GSTRING_LEN; } } - clear_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags); - return ret; + + /* copy back updated length */ + *loc_in_buf = p; } -static int ice_get_sset_count(struct net_device *netdev, int sset) +/** + * ice_get_macvlan_rx_stats + * @pf: private board structure + * @data: ptr to data + * @idx: ptr to idx in data buffer (input/output param) + * + * This function is used to collect statistics for MACVLAN offload + */ +static void +ice_get_macvlan_rx_stats(struct ice_pf *pf, u64 *data, int *idx) { - switch (sset) { - case ETH_SS_STATS: - /* The number (and order) of strings reported *must* remain - * constant for a given netdevice. This function must not - * report a different number based on run time parameters - * (such as the number of queues in use, or the setting of - * a private ethtool flag). This is due to the nature of the - * ethtool stats API. - * - * Userspace programs such as ethtool must make 3 separate - * ioctl requests, one for size, one for the strings, and - * finally one for the stats. Since these cross into - * userspace, changes to the number or size could result in - * undefined memory access or incorrect string<->value - * correlations for statistics. - * - * Even if it appears to be safe, changes to the size or - * order of strings will suffer from race conditions and are - * not safe. 
- */ - return ICE_ALL_STATS_LEN(netdev); - case ETH_SS_TEST: - return ICE_TEST_LEN; - case ETH_SS_PRIV_FLAGS: - return ICE_PRIV_FLAG_ARRAY_SIZE; - default: - return -EOPNOTSUPP; + int i, j; + + if (!idx) + return; + j = *idx; + + for (i = 0; i < ICE_MAX_MACVLANS; i++) { + struct ice_macvlan *mv; + + mv = ice_get_macvlan(i, pf); + if (mv) { + data[j++] = mv->vsi->net_stats.rx_packets; + data[j++] = mv->vsi->net_stats.rx_bytes; + } else { + data[j++] = 0; + data[j++] = 0; + } } + + /* copy back updated index */ + *idx = j; } +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ -static void -ice_get_ethtool_stats(struct net_device *netdev, - struct ethtool_stats __always_unused *stats, u64 *data) +static void ice_get_strings(struct net_device *netdev, u32 stringset, u8 *data) { struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - struct ice_ring *ring; - unsigned int j; - int i = 0; - char *p; + struct ice_vsi *vsi = ice_get_netdev_priv_vsi(np); + char *p = (char *)data; + unsigned int i; - ice_update_pf_stats(pf); - ice_update_vsi_stats(vsi); + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < ICE_VSI_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + ice_gstrings_vsi_stats[i].stat_string); + p += ETH_GSTRING_LEN; + } + if (ice_is_port_repr_netdev(netdev)) + return; - for (j = 0; j < ICE_VSI_STATS_LEN; j++) { - p = (char *)vsi + ice_gstrings_vsi_stats[j].stat_offset; - data[i++] = (ice_gstrings_vsi_stats[j].sizeof_stat == - sizeof(u64)) ? *(u64 *)p : *(u32 *)p; - } + ice_for_each_alloc_txq(vsi, i) { + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_PACKETS, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_BYTES, i); + p += ETH_GSTRING_LEN; +#ifdef ICE_ADD_PROBES + snprintf(p, ETH_GSTRING_LEN, ICE_TXQ_NAPI_POLL, i); + p += ETH_GSTRING_LEN; +#endif /* ICE_ADD_PROBES */ +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_tx_strings(vsi, i, &p); +#endif /* ADQ_PERF_COUNTERS */ + } +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_tx_strings(vsi, &p); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + ice_get_macvlan_tx_strings(vsi->back, &p); +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ - /* populate per queue stats */ - rcu_read_lock(); + ice_for_each_alloc_rxq(vsi, i) { + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_PACKETS, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_BYTES, i); + p += ETH_GSTRING_LEN; +#ifdef ICE_ADD_PROBES + snprintf(p, ETH_GSTRING_LEN, ICE_RXQ_NAPI_POLL, i); + p += ETH_GSTRING_LEN; +#endif /* ICE_ADD_PROBES */ +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_rx_strings(vsi, i, &p); +#endif /* ADQ_PERF_COUNTERS */ +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_rx_strings(i, vsi->num_rxq, &p); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ + } +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + ice_get_macvlan_rx_strings(vsi->back, &p); +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ - ice_for_each_alloc_txq(vsi, j) { - ring = READ_ONCE(vsi->tx_rings[j]); - if (ring) { - data[i++] = ring->stats.pkts; - data[i++] = ring->stats.bytes; - } else { - data[i++] = 0; - data[i++] = 0; + if (vsi->type != ICE_VSI_PF) + return; + + for (i = 0; i < ICE_PF_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + ice_gstrings_pf_stats[i].stat_string); + p += ETH_GSTRING_LEN; } - } - ice_for_each_alloc_rxq(vsi, j) { - ring = READ_ONCE(vsi->rx_rings[j]); - if (ring) { - 
data[i++] = ring->stats.pkts; - data[i++] = ring->stats.bytes; - } else { - data[i++] = 0; - data[i++] = 0; + for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { + snprintf(p, ETH_GSTRING_LEN, PORT_TX_PRIO_XON, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, PORT_TX_PRIO_XOFF, i); + p += ETH_GSTRING_LEN; + } + for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { + snprintf(p, ETH_GSTRING_LEN, PORT_RX_PRIO_XON, i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, PORT_RX_PRIO_XOFF, i); + p += ETH_GSTRING_LEN; + } + break; + case ETH_SS_TEST: + memcpy(data, ice_gstrings_test, ICE_TEST_LEN * ETH_GSTRING_LEN); + break; + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + ice_gstrings_priv_flags[i].name); + p += ETH_GSTRING_LEN; } + break; + default: + break; } +} - rcu_read_unlock(); - - if (vsi->type != ICE_VSI_PF) - return; +static int +ice_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + bool led_active; - for (j = 0; j < ICE_PF_STATS_LEN; j++) { - p = (char *)pf + ice_gstrings_pf_stats[j].stat_offset; - data[i++] = (ice_gstrings_pf_stats[j].sizeof_stat == - sizeof(u64)) ? *(u64 *)p : *(u32 *)p; + switch (state) { + case ETHTOOL_ID_ACTIVE: + led_active = true; + break; + case ETHTOOL_ID_INACTIVE: + led_active = false; + break; + default: + return -EINVAL; } - for (j = 0; j < ICE_MAX_USER_PRIORITY; j++) { - data[i++] = pf->stats.priority_xon_tx[j]; - data[i++] = pf->stats.priority_xoff_tx[j]; - } + if (ice_aq_set_port_id_led(np->vsi->port_info, !led_active, NULL)) + return -EIO; - for (j = 0; j < ICE_MAX_USER_PRIORITY; j++) { - data[i++] = pf->stats.priority_xon_rx[j]; - data[i++] = pf->stats.priority_xoff_rx[j]; - } + return 0; } /** - * ice_phy_type_to_ethtool - convert the phy_types to ethtool link modes + * ice_set_fec_cfg - Set link FEC options * @netdev: network interface device structure - * @ks: ethtool link ksettings struct to fill out + * @req_fec: FEC mode to configure */ -static void -ice_phy_type_to_ethtool(struct net_device *netdev, - struct ethtool_link_ksettings *ks) +static int ice_set_fec_cfg(struct net_device *netdev, enum ice_fec_mode req_fec) { struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_link_status *hw_link_info; - bool need_add_adv_mode = false; + struct ice_aqc_set_phy_cfg_data config = { 0 }; struct ice_vsi *vsi = np->vsi; - u64 phy_types_high; - u64 phy_types_low; - - hw_link_info = &vsi->port_info->phy.link_info; - phy_types_low = vsi->port_info->phy.phy_type_low; - phy_types_high = vsi->port_info->phy.phy_type_high; + struct ice_port_info *pi; - ethtool_link_ksettings_zero_link_mode(ks, supported); - ethtool_link_ksettings_zero_link_mode(ks, advertising); + pi = vsi->port_info; + if (!pi) + return -EOPNOTSUPP; - if (phy_types_low & ICE_PHY_TYPE_LOW_100BASE_TX || - phy_types_low & ICE_PHY_TYPE_LOW_100M_SGMII) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100baseT_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_1G_SGMII) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseT_Full); - } - if (phy_types_low 
& ICE_PHY_TYPE_LOW_1000BASE_KX) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseKX_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseKX_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_SX || - phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_LX) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseX_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseX_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_T) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_2500MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 2500baseT_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_X || - phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_KX) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseX_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_2500MB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 2500baseX_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_KR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 5000baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_5GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 5000baseT_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_DA || - phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_C2C) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseT_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseT_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_KR_CR1) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseKR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseKR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_SR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseSR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseSR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_LR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseLR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseLR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR_S || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR1 || - phy_types_low & ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_25G_AUI_C2C) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseCR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseCR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_SR || - 
phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_LR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseSR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseSR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR_S || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR1) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseKR_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseKR_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_KR4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseKR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseKR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_CR4 || - phy_types_low & ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_40G_XLAUI) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseCR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseCR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_SR4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseSR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseSR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_LR4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseLR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseLR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_CR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_50G_LAUI2 || - phy_types_low & ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_50G_AUI2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_CP || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_SR || - phy_types_low & ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_50G_AUI1) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseCR2_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_50GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseCR2_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_KR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseKR2_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_50GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseKR2_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_SR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_LR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_FR || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_LR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseSR2_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_50GB) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseSR2_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CR4 
|| - phy_types_low & ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_100G_CAUI4 || - phy_types_low & ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC || - phy_types_low & ICE_PHY_TYPE_LOW_100G_AUI4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CP2 || - phy_types_high & ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC || - phy_types_high & ICE_PHY_TYPE_HIGH_100G_CAUI2 || - phy_types_high & ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC || - phy_types_high & ICE_PHY_TYPE_HIGH_100G_AUI2) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100GB) - need_add_adv_mode = true; - } - if (need_add_adv_mode) { - need_add_adv_mode = false; - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseCR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_SR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_SR2) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseSR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100GB) - need_add_adv_mode = true; - } - if (need_add_adv_mode) { - need_add_adv_mode = false; - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseSR4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_LR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_DR) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseLR4_ER4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100GB) - need_add_adv_mode = true; - } - if (need_add_adv_mode) { - need_add_adv_mode = false; - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseLR4_ER4_Full); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_KR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4 || - phy_types_high & ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseKR4_Full); - if (!hw_link_info->req_speeds || - hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100GB) - need_add_adv_mode = true; + /* Changing the FEC parameters is not supported if not the PF VSI */ + if (vsi->type != ICE_VSI_PF) { + netdev_info(netdev, "Changing FEC parameters only supported for PF VSI\n"); + return -EOPNOTSUPP; } - if (need_add_adv_mode) - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseKR4_Full); - /* Autoneg PHY types */ - if (phy_types_low & ICE_PHY_TYPE_LOW_100BASE_TX || - phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_KX || - phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_KX || - phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_KR || - phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_KR_CR1 || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_T || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR_S || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR1 || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR_S || - phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR1 || - phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_CR4 || - phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_KR4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Autoneg); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_CR2 || - 
phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_KR2 || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_CP || - phy_types_low & ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4) { - ethtool_link_ksettings_add_link_mode(ks, supported, - Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Autoneg); - } - if (phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_KR4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4 || - phy_types_low & ICE_PHY_TYPE_LOW_100GBASE_CP2) { - ethtool_link_ksettings_add_link_mode(ks, supported, - Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Autoneg); - } -} + /* Proceed only if requesting different FEC mode */ + if (pi->phy.curr_user_fec_req == req_fec) + return 0; -#define TEST_SET_BITS_TIMEOUT 50 -#define TEST_SET_BITS_SLEEP_MAX 2000 -#define TEST_SET_BITS_SLEEP_MIN 1000 + /* Copy the current user PHY configuration. The current user PHY + * configuration is initialized during probe from PHY capabilities + * software mode, and updated on set PHY configuration. + */ + memcpy(&config, &pi->phy.curr_user_phy_cfg, sizeof(config)); + + ice_cfg_phy_fec(pi, &config, req_fec); + config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + + if (ice_aq_set_phy_cfg(pi->hw, pi, &config, NULL)) + return -EAGAIN; + + /* Save requested FEC config */ + pi->phy.curr_user_fec_req = req_fec; + + return 0; +} +#ifdef ETHTOOL_GFECPARAM /** - * ice_get_settings_link_up - Get Link settings for when link is up - * @ks: ethtool ksettings to fill in + * ice_set_fecparam - Set FEC link options * @netdev: network interface device structure + * @fecparam: Ethtool structure to retrieve FEC parameters */ -static void -ice_get_settings_link_up(struct ethtool_link_ksettings *ks, - struct net_device *netdev) +static int +ice_set_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) { struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_port_info *pi = np->vsi->port_info; - struct ethtool_link_ksettings cap_ksettings; - struct ice_link_status *link_info; struct ice_vsi *vsi = np->vsi; - bool unrecog_phy_high = false; - bool unrecog_phy_low = false; - - link_info = &vsi->port_info->phy.link_info; + enum ice_fec_mode fec; - /* Initialize supported and advertised settings based on PHY settings */ - switch (link_info->phy_type_low) { - case ICE_PHY_TYPE_LOW_100BASE_TX: + switch (fecparam->fec) { + case ETHTOOL_FEC_AUTO: + fec = ICE_FEC_AUTO; + break; + case ETHTOOL_FEC_RS: + fec = ICE_FEC_RS; + break; + case ETHTOOL_FEC_BASER: + fec = ICE_FEC_BASER; + break; + case ETHTOOL_FEC_OFF: + case ETHTOOL_FEC_NONE: + fec = ICE_FEC_NONE; + break; + default: + dev_warn(ice_pf_to_dev(vsi->back), "Unsupported FEC mode: %d\n", + fecparam->fec); + return -EINVAL; + } + + return ice_set_fec_cfg(netdev, fec); +} + +/** + * ice_get_fecparam - Get link FEC options + * @netdev: network interface device structure + * @fecparam: Ethtool structure to retrieve FEC parameters + */ +static int +ice_get_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_aqc_get_phy_caps_data *caps; + struct ice_link_status *link_info; + struct ice_vsi *vsi = np->vsi; + struct ice_port_info *pi; + enum ice_status status; + int err = 0; + + pi = vsi->port_info; + + if (!pi) + return -EOPNOTSUPP; + link_info = &pi->phy.link_info; + + /* Set FEC mode based on negotiated link info */ + switch (link_info->fec_info) { + case ICE_AQ_LINK_25G_KR_FEC_EN: + fecparam->active_fec = ETHTOOL_FEC_BASER; + 
break;
+ case ICE_AQ_LINK_25G_RS_528_FEC_EN:
+ case ICE_AQ_LINK_25G_RS_544_FEC_EN:
+ fecparam->active_fec = ETHTOOL_FEC_RS;
+ break;
+ default:
+ fecparam->active_fec = ETHTOOL_FEC_OFF;
+ break;
+ }
+
+ caps = kzalloc(sizeof(*caps), GFP_KERNEL);
+ if (!caps)
+ return -ENOMEM;
+
+ status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA,
+ caps, NULL);
+ if (status) {
+ err = -EAGAIN;
+ goto done;
+ }
+
+ /* Set supported/configured FEC modes based on PHY capability */
+ if (caps->caps & ICE_AQC_PHY_EN_AUTO_FEC)
+ fecparam->fec |= ETHTOOL_FEC_AUTO;
+ if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN ||
+ caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ ||
+ caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN ||
+ caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_REQ)
+ fecparam->fec |= ETHTOOL_FEC_BASER;
+ if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ ||
+ caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ ||
+ caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN)
+ fecparam->fec |= ETHTOOL_FEC_RS;
+ if (caps->link_fec_options == 0)
+ fecparam->fec |= ETHTOOL_FEC_OFF;
+
+done:
+ kfree(caps);
+ return err;
+}
+#endif /* ETHTOOL_GFECPARAM */
+
+/**
+ * ice_nway_reset - restart autonegotiation
+ * @netdev: network interface device structure
+ */
+static int ice_nway_reset(struct net_device *netdev)
+{
+ struct ice_netdev_priv *np = netdev_priv(netdev);
+ struct ice_vsi *vsi = np->vsi;
+ int err;
+
+ /* If VSI state is up, then restart autoneg with link up */
+ if (!test_bit(ICE_DOWN, vsi->back->state))
+ err = ice_set_link(vsi, true);
+ else
+ err = ice_set_link(vsi, false);
+
+ return err;
+}
+
+/**
+ * ice_get_priv_flags - report device private flags
+ * @netdev: network interface device structure
+ *
+ * The get string set count and the string set should be matched for each
+ * flag returned. Add new strings for each flag to the ice_gstrings_priv_flags
+ * array.
+ *
+ * Returns a u32 bitmap of flags.
+ */
+static u32 ice_get_priv_flags(struct net_device *netdev)
+{
+ struct ice_netdev_priv *np = netdev_priv(netdev);
+ struct ice_vsi *vsi = np->vsi;
+ struct ice_pf *pf = vsi->back;
+ u32 i, ret_flags = 0;
+
+ for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) {
+ const struct ice_priv_flag *priv_flag;
+
+ priv_flag = &ice_gstrings_priv_flags[i];
+
+ if (test_bit(priv_flag->bitno, pf->flags))
+ ret_flags |= BIT(i);
+ }
+
+ return ret_flags;
+}
+
+#ifdef NETIF_F_HW_TC
+/**
+ * ice_recfg_chnl_vsis - reconfig channel VSIs
+ * @pf: ptr to PF
+ * @vsi: ptr to main VSI
+ *
+ * This function adjusts ADQ VSI's feature flags based on changes
+ * in private flag settings - to avoid stale bits in per ADQ VSI's
+ * feature flags.
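+ * For example, when ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA is cleared at the
+ * PF level, ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA below must also be
+ * cleared on every channel VSI so that no stale per-VSI bit survives
+ * the private flag change.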
+ */
+static void ice_recfg_chnl_vsis(struct ice_pf *pf, struct ice_vsi *vsi)
+{
+ struct ice_channel *ch;
+
+ /* Nothing to be done if there is no active ADQ config */
+ if (!ice_is_adq_active(pf))
+ return;
+
+ list_for_each_entry(ch, &vsi->ch_list, list) {
+ struct ice_vsi *ch_vsi;
+
+ ch_vsi = ch->ch_vsi;
+ if (!ch_vsi)
+ continue;
+ /* set/clear VSI level feature flag for ADQ (aka channel) VSIs
+ * based on PF level private flags
+ */
+ if (test_bit(ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA, pf->flags))
+ set_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA,
+ ch_vsi->features);
+ else
+ clear_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA,
+ ch_vsi->features);
+
+ /* set/clear VSI level feature flag for ADQ (aka channel) VSIs
+ * based on PF level private flags: this flag is meant to
+ * harvest (clean) the Rx queue upon busy_poll stop and after
+ * that clean once only.
+ */
+ if (test_bit(ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA, pf->flags))
+ set_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA,
+ ch_vsi->features);
+ else
+ clear_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA,
+ ch_vsi->features);
+
+ /* set/clear inline flow-director bits for ADQ (aka channel)
+ * VSIs based on PF level private flags
+ */
+ if (test_bit(ICE_FLAG_CHNL_INLINE_FD_ENA, pf->flags))
+ set_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA,
+ ch_vsi->features);
+ else
+ clear_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA,
+ ch_vsi->features);
+ if (test_bit(ICE_FLAG_CHNL_INLINE_FD_MARK_ENA, pf->flags))
+ set_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA,
+ ch_vsi->features);
+ else
+ clear_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA,
+ ch_vsi->features);
+ }
+}
+
+/**
+ * ice_recfg_vsi - reconfig specified VSI
+ * @pf: ptr to PF
+ * @vsi: ptr to main VSI
+ *
+ * Set up per vector configurable param which allows cleanup of Tx and
+ * Rx packets up to that many times if napi_schedule is invoked after
+ * busy_poll_stop (where driver returned "budget") based on driver maintained
+ * state for ADQ specific vector.
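+ * In effect, each vector's max_limit_process_rx_queues is switched
+ * between ICE_MAX_LIMIT_PROCESS_RX_PKTS and
+ * ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT, depending on the
+ * ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_CFG private flag.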
+ */
+static void ice_recfg_vsi(struct ice_pf *pf, struct ice_vsi *vsi)
+{
+ int q_vectors = vsi->num_q_vectors;
+ int vector;
+
+ if (!q_vectors)
+ return;
+
+ for (vector = 0; vector < q_vectors; vector++) {
+ struct ice_q_vector *qv = vsi->q_vectors[vector];
+
+ if (!qv)
+ continue;
+ if (test_bit(ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_CFG, pf->flags))
+ qv->max_limit_process_rx_queues =
+ ICE_MAX_LIMIT_PROCESS_RX_PKTS;
+ else
+ qv->max_limit_process_rx_queues =
+ ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT;
+ }
+}
+#endif /* NETIF_F_HW_TC */
+
+/**
+ * ice_set_priv_flags - set private flags
+ * @netdev: network interface device structure
+ * @flags: bit flags to be set
+ */
+static int ice_set_priv_flags(struct net_device *netdev, u32 flags)
+{
+ struct ice_netdev_priv *np = netdev_priv(netdev);
+ DECLARE_BITMAP(change_flags, ICE_PF_FLAGS_NBITS);
+ DECLARE_BITMAP(orig_flags, ICE_PF_FLAGS_NBITS);
+ struct ice_vsi *vsi = np->vsi;
+ struct ice_pf *pf = vsi->back;
+ struct device *dev;
+ int ret = 0;
+ u32 i;
+
+ if (flags > BIT(ICE_PRIV_FLAG_ARRAY_SIZE))
+ return -EINVAL;
+
+ dev = ice_pf_to_dev(pf);
+ set_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags);
+
+ bitmap_copy(orig_flags, pf->flags, ICE_PF_FLAGS_NBITS);
+ for (i = 0; i < ICE_PRIV_FLAG_ARRAY_SIZE; i++) {
+ const struct ice_priv_flag *priv_flag;
+
+ priv_flag = &ice_gstrings_priv_flags[i];
+
+ if (flags & BIT(i))
+ set_bit(priv_flag->bitno, pf->flags);
+ else
+ clear_bit(priv_flag->bitno, pf->flags);
+ }
+
+#ifdef NETIF_F_HW_TC
+ ice_recfg_chnl_vsis(pf, vsi);
+ ice_recfg_vsi(pf, vsi);
+#endif /* NETIF_F_HW_TC */
+
+ bitmap_xor(change_flags, pf->flags, orig_flags, ICE_PF_FLAGS_NBITS);
+
+#ifndef ETHTOOL_GFECPARAM
+ if (test_bit(ICE_FLAG_RS_FEC, change_flags) ||
+ test_bit(ICE_FLAG_BASE_R_FEC, change_flags)) {
+ enum ice_fec_mode fec = ICE_FEC_NONE;
+ int err;
+
+ /* Check if FEC is supported */
+ if (pf->hw.device_id != ICE_DEV_ID_E810C_BACKPLANE &&
+ pf->hw.device_id != ICE_DEV_ID_E810C_QSFP &&
+ pf->hw.device_id != ICE_DEV_ID_E810C_SFP) {
+ dev_warn(dev, "Device does not support changing FEC configuration\n");
+ ret = -EOPNOTSUPP;
+ goto ethtool_exit;
+ }
+
+ /* Set FEC configuration */
+ if (test_bit(ICE_FLAG_RS_FEC, pf->flags) &&
+ test_bit(ICE_FLAG_BASE_R_FEC, pf->flags))
+ fec = ICE_FEC_AUTO;
+ else if (test_bit(ICE_FLAG_RS_FEC, pf->flags))
+ fec = ICE_FEC_RS;
+ else if (test_bit(ICE_FLAG_BASE_R_FEC, pf->flags))
+ fec = ICE_FEC_BASER;
+
+ err = ice_set_fec_cfg(netdev, fec);
+
+ /* If FEC configuration fails, restore original FEC flags */
+ if (err) {
+ if (test_bit(ICE_FLAG_BASE_R_FEC, orig_flags))
+ set_bit(ICE_FLAG_BASE_R_FEC, pf->flags);
+ else
+ clear_bit(ICE_FLAG_BASE_R_FEC, pf->flags);
+
+ if (test_bit(ICE_FLAG_RS_FEC, orig_flags))
+ set_bit(ICE_FLAG_RS_FEC, pf->flags);
+ else
+ clear_bit(ICE_FLAG_RS_FEC, pf->flags);
+
+ ret = err;
+ goto ethtool_exit;
+ }
+ }
+#endif /* !ETHTOOL_GFECPARAM */
+
+ /* Do not allow change to link-down-on-close when Total Port Shutdown
+ * is enabled.
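+ * If it is, force the link-down-on-close bit back on and return
+ * -EINVAL to the caller.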
+ */
+ if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, change_flags) &&
+ test_bit(ICE_FLAG_TOTAL_PORT_SHUTDOWN_ENA, pf->flags)) {
+ dev_err(dev, "Setting link-down-on-close not supported on this port\n");
+ set_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags);
+ ret = -EINVAL;
+ goto ethtool_exit;
+ }
+
+ if (test_bit(ICE_FLAG_FW_LLDP_AGENT, change_flags)) {
+ if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) {
+ enum ice_status status;
+
+ /* Disable FW LLDP engine */
+ status = ice_cfg_lldp_mib_change(&pf->hw, false);
+
+ /* If unregistering for LLDP events fails, this is
+ * not an error state, as there shouldn't be any
+ * events to respond to.
+ */
+ if (status)
+ dev_info(dev, "Failed to unreg for LLDP events\n");
+
+ /* The AQ call to stop the FW LLDP agent will generate
+ * an error if the agent is already stopped.
+ */
+ status = ice_aq_stop_lldp(&pf->hw, true, true, NULL);
+ if (status)
+ dev_warn(dev, "Fail to stop LLDP agent\n");
+ /* Use case for having the FW LLDP agent stopped
+ * will likely not need DCB, so failure to init is
+ * not a concern of ethtool
+ */
+ status = ice_init_pf_dcb(pf, true);
+ if (status)
+ dev_warn(dev, "Fail to init DCB\n");
+
+ pf->dcbx_cap &= ~DCB_CAP_DCBX_LLD_MANAGED;
+ pf->dcbx_cap |= DCB_CAP_DCBX_HOST;
+ } else {
+ enum ice_status status;
+ bool dcbx_agent_status;
+
+#ifdef NETIF_F_HW_TC
+ if (ice_is_adq_active(pf)) {
+ dev_err(dev, "Disable ADQ and try again ex:'tc qdisc del dev <ethX> root'\n");
+ /* fw-lldp flag is set without checking if
+ * the operation is successful or not, so
+ * clear this flag when it fails
+ */
+ clear_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags);
+ ret = -EOPNOTSUPP;
+ goto ethtool_exit;
+ }
+#endif /* NETIF_F_HW_TC */
+ if (ice_get_pfc_mode(pf) == ICE_QOS_MODE_DSCP) {
+ clear_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags);
+ dev_err(dev, "QoS in L3 DSCP mode, FW Agent not allowed to start\n");
+ ret = -EOPNOTSUPP;
+ goto ethtool_exit;
+ }
+
+ /* Remove rule to direct LLDP packets to default VSI.
+ * The FW LLDP engine will now be consuming them.
+ */
+ ice_cfg_sw_lldp(vsi, false, false);
+
+ /* AQ command to start FW LLDP agent will return an
+ * error if the agent is already started
+ */
+ status = ice_aq_start_lldp(&pf->hw, true, NULL);
+ if (status)
+ dev_warn(dev, "Fail to start LLDP Agent\n");
+
+ /* AQ command to start FW DCBX agent will fail if
+ * the agent is already started
+ */
+ status = ice_aq_start_stop_dcbx(&pf->hw, true,
+ &dcbx_agent_status,
+ NULL);
+ if (status)
+ dev_dbg(dev, "Failed to start FW DCBX\n");
+
+ dev_info(dev, "FW DCBX agent is %s\n",
+ dcbx_agent_status ? "ACTIVE" : "DISABLED");
+
+ /* Failure to configure MIB change or init DCB is not
+ * relevant to ethtool. Print notification that
+ * registration/init failed but do not return error
+ * state to ethtool
+ */
+ status = ice_init_pf_dcb(pf, true);
+ if (status)
+ dev_dbg(dev, "Fail to init DCB\n");
+
+ /* Register for MIB change events */
+ status = ice_cfg_lldp_mib_change(&pf->hw, true);
+ if (status)
+ dev_dbg(dev, "Fail to enable MIB change events\n");
+
+ pf->dcbx_cap &= ~DCB_CAP_DCBX_HOST;
+ pf->dcbx_cap |= DCB_CAP_DCBX_LLD_MANAGED;
+
+ ice_nway_reset(netdev);
+ }
+ }
+ if (test_bit(ICE_FLAG_LEGACY_RX, change_flags)) {
+ /* down and up VSI so that changes of Rx cfg are reflected.
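+ * Toggling ICE_FLAG_LEGACY_RX requires the Rx rings to be torn down
+ * and rebuilt, which the ice_down()/ice_up() pair below performs.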
*/ + ice_down(vsi); + ice_up(vsi); + } + /* don't allow modification of this flag when a single VF is in + * promiscuous mode because it's not supported + */ + if (test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, change_flags) && + ice_is_any_vf_in_promisc(pf)) { + dev_err(dev, "Changing vf-true-promisc-support flag while VF(s) are in promiscuous mode not supported\n"); + /* toggle bit back to previous state */ + change_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags); + ret = -EAGAIN; + } + + if (test_bit(ICE_FLAG_VF_VLAN_PRUNE_DIS, change_flags) && + pf->num_alloc_vfs) { + dev_err(dev, "Changing vf-vlan-prune-disable flag while VF(s) are active is not supported\n"); + /* toggle bit back to previous state */ + change_bit(ICE_FLAG_VF_VLAN_PRUNE_DIS, change_flags); + ret = -EOPNOTSUPP; + } +ethtool_exit: + clear_bit(ICE_FLAG_ETHTOOL_CTXT, pf->flags); + return ret; +} + +static int ice_get_sset_count(struct net_device *netdev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + /* The number (and order) of strings reported *must* remain + * constant for a given netdevice. This function must not + * report a different number based on run time parameters + * (such as the number of queues in use, or the setting of + * a private ethtool flag). This is due to the nature of the + * ethtool stats API. + * + * Userspace programs such as ethtool must make 3 separate + * ioctl requests, one for size, one for the strings, and + * finally one for the stats. Since these cross into + * userspace, changes to the number or size could result in + * undefined memory access or incorrect string<->value + * correlations for statistics. + * + * Even if it appears to be safe, changes to the size or + * order of strings will suffer from race conditions and are + * not safe. + */ + if (ice_is_port_repr_netdev(netdev)) + return ICE_VSI_STATS_LEN; + else + return ICE_ALL_STATS_LEN(netdev); + case ETH_SS_TEST: + return ICE_TEST_LEN; + case ETH_SS_PRIV_FLAGS: + return ICE_PRIV_FLAG_ARRAY_SIZE; + default: + return -EOPNOTSUPP; + } +} + +static void +ice_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats __always_unused *stats, u64 *data) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = ice_get_netdev_priv_vsi(np); + struct ice_pf *pf = vsi->back; + struct ice_ring *ring; + unsigned int j; + int i = 0; + char *p; + + ice_update_pf_stats(pf); + ice_update_vsi_stats(vsi); + + for (j = 0; j < ICE_VSI_STATS_LEN; j++) { + p = (char *)vsi + ice_gstrings_vsi_stats[j].stat_offset; + data[i++] = (ice_gstrings_vsi_stats[j].sizeof_stat == + sizeof(u64)) ? 
*(u64 *)p : *(u32 *)p; + } + if (ice_is_port_repr_netdev(netdev)) + return; + /* populate per queue stats */ + rcu_read_lock(); + + ice_for_each_alloc_txq(vsi, j) { + ring = READ_ONCE(vsi->tx_rings[j]); + if (ring) { + data[i++] = ring->stats.pkts; + data[i++] = ring->stats.bytes; +#ifdef ICE_ADD_PROBES + data[i++] = ring->stats.napi_poll_cnt; +#endif /* ICE_ADD_PROBES */ +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_tx_stats(vsi, j, data, &i, true); +#endif /* ADQ_PERF_COUNTERS */ + } else { + data[i++] = 0; + data[i++] = 0; +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_tx_stats(vsi, j, data, &i, false); +#endif /* ADQ_PERF_COUNTERS */ + } + } +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_tx_stats(vsi, data, &i); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + ice_get_macvlan_tx_stats(vsi->back, data, &i); +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ + + ice_for_each_alloc_rxq(vsi, j) { + ring = READ_ONCE(vsi->rx_rings[j]); + if (ring) { + data[i++] = ring->stats.pkts; + data[i++] = ring->stats.bytes; +#ifdef ICE_ADD_PROBES + data[i++] = ring->stats.napi_poll_cnt; +#endif /* ICE_ADD_PROBES */ +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_rx_stats(vsi, j, data, &i, true); +#endif /* ADQ_PERF_COUNTERS */ +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_rx_stats(&ring->xdp_stats, data, &i); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ + } else { + data[i++] = 0; + data[i++] = 0; +#ifdef ADQ_PERF_COUNTERS + ice_get_chnl_rx_stats(vsi, j, data, &i, false); +#endif /* ADQ_PERF_COUNTERS */ +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES + ice_get_xdp_rx_stats(NULL, data, &i); +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_XDP_SUPPORT */ + } + } +#ifdef ICE_ADD_PROBES +#ifdef HAVE_NETDEV_SB_DEV + ice_get_macvlan_rx_stats(vsi->back, data, &i); +#endif /* HAVE_NETDEV_SB_DEV */ +#endif /* ICE_ADD_PROBES */ + + rcu_read_unlock(); + + if (vsi->type != ICE_VSI_PF) + return; + + for (j = 0; j < ICE_PF_STATS_LEN; j++) { + p = (char *)pf + ice_gstrings_pf_stats[j].stat_offset; + data[i++] = (ice_gstrings_pf_stats[j].sizeof_stat == + sizeof(u64)) ? 
*(u64 *)p : *(u32 *)p;
+ }
+
+ for (j = 0; j < ICE_MAX_USER_PRIORITY; j++) {
+ data[i++] = pf->stats.priority_xon_tx[j];
+ data[i++] = pf->stats.priority_xoff_tx[j];
+ }
+
+ for (j = 0; j < ICE_MAX_USER_PRIORITY; j++) {
+ data[i++] = pf->stats.priority_xon_rx[j];
+ data[i++] = pf->stats.priority_xoff_rx[j];
+ }
+}
+
+#define ICE_PHY_TYPE_LOW_MASK_MIN_1G (ICE_PHY_TYPE_LOW_100BASE_TX | \
+ ICE_PHY_TYPE_LOW_100M_SGMII)
+
+#define ICE_PHY_TYPE_LOW_MASK_MIN_25G (ICE_PHY_TYPE_LOW_MASK_MIN_1G | \
+ ICE_PHY_TYPE_LOW_1000BASE_T | \
+ ICE_PHY_TYPE_LOW_1000BASE_SX | \
+ ICE_PHY_TYPE_LOW_1000BASE_LX | \
+ ICE_PHY_TYPE_LOW_1000BASE_KX | \
+ ICE_PHY_TYPE_LOW_1G_SGMII | \
+ ICE_PHY_TYPE_LOW_2500BASE_T | \
+ ICE_PHY_TYPE_LOW_2500BASE_X | \
+ ICE_PHY_TYPE_LOW_2500BASE_KX | \
+ ICE_PHY_TYPE_LOW_5GBASE_T | \
+ ICE_PHY_TYPE_LOW_5GBASE_KR | \
+ ICE_PHY_TYPE_LOW_10GBASE_T | \
+ ICE_PHY_TYPE_LOW_10G_SFI_DA | \
+ ICE_PHY_TYPE_LOW_10GBASE_SR | \
+ ICE_PHY_TYPE_LOW_10GBASE_LR | \
+ ICE_PHY_TYPE_LOW_10GBASE_KR_CR1 | \
+ ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC | \
+ ICE_PHY_TYPE_LOW_10G_SFI_C2C)
+
+#define ICE_PHY_TYPE_LOW_MASK_100G (ICE_PHY_TYPE_LOW_100GBASE_CR4 | \
+ ICE_PHY_TYPE_LOW_100GBASE_SR4 | \
+ ICE_PHY_TYPE_LOW_100GBASE_LR4 | \
+ ICE_PHY_TYPE_LOW_100GBASE_KR4 | \
+ ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC | \
+ ICE_PHY_TYPE_LOW_100G_CAUI4 | \
+ ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC | \
+ ICE_PHY_TYPE_LOW_100G_AUI4 | \
+ ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4 | \
+ ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4 | \
+ ICE_PHY_TYPE_LOW_100GBASE_CP2 | \
+ ICE_PHY_TYPE_LOW_100GBASE_SR2 | \
+ ICE_PHY_TYPE_LOW_100GBASE_DR)
+
+#define ICE_PHY_TYPE_HIGH_MASK_100G (ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4 | \
+ ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC |\
+ ICE_PHY_TYPE_HIGH_100G_CAUI2 | \
+ ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC | \
+ ICE_PHY_TYPE_HIGH_100G_AUI2)
+
+#ifdef HAVE_ETHTOOL_100G_BITS
+/**
+ * ice_mask_min_supported_speeds
+ * @phy_types_high: PHY type high
+ * @phy_types_low: PHY type low to apply minimum supported speeds mask
+ *
+ * Apply minimum supported speeds mask to PHY type low. These are the speeds
+ * for ethtool supported link mode.
+ */
+static
+void ice_mask_min_supported_speeds(u64 phy_types_high, u64 *phy_types_low)
+#else
+static void ice_mask_min_supported_speeds(u64 *phy_types_low)
+#endif /* !HAVE_ETHTOOL_100G_BITS */
+{
+ /* if QSFP connection with 100G speed, minimum supported speed is 25G */
+#ifdef HAVE_ETHTOOL_100G_BITS
+ if (*phy_types_low & ICE_PHY_TYPE_LOW_MASK_100G ||
+ phy_types_high & ICE_PHY_TYPE_HIGH_MASK_100G)
+#else /* HAVE_ETHTOOL_100G_BITS */
+ if (*phy_types_low & ICE_PHY_TYPE_LOW_MASK_100G)
+#endif /* !HAVE_ETHTOOL_100G_BITS */
+ *phy_types_low &= ~ICE_PHY_TYPE_LOW_MASK_MIN_25G;
+ else
+ *phy_types_low &= ~ICE_PHY_TYPE_LOW_MASK_MIN_1G;
+}
+
+#ifdef HAVE_ETHTOOL_100G_BITS
+#define ice_ethtool_advertise_link_mode(aq_link_speed, ethtool_link_mode) \
+ do { \
+ if (req_speeds & (aq_link_speed) || \
+ (!req_speeds && \
+ (advert_phy_type_lo & phy_type_mask_lo || \
+ advert_phy_type_hi & phy_type_mask_hi))) \
+ ethtool_link_ksettings_add_link_mode(ks, advertising,\
+ ethtool_link_mode); \
+ } while (0)
+#else /* HAVE_ETHTOOL_100G_BITS */
+#define ice_ethtool_advertise_link_mode(aq_link_speed, ethtool_link_mode) \
+ do { \
+ if (req_speeds & (aq_link_speed) || \
+ (!req_speeds && advert_phy_type_lo & phy_type_mask_lo)) \
+ ethtool_link_ksettings_add_link_mode(ks, advertising,\
+ ethtool_link_mode); \
+ } while (0)
+#endif /* !
HAVE_ETHTOOL_100G_BITS */ + +/** + * ice_phy_type_to_ethtool - convert the phy_types to ethtool link modes + * @netdev: network interface device structure + * @ks: ethtool link ksettings struct to fill out + */ +static void +ice_phy_type_to_ethtool(struct net_device *netdev, + struct ethtool_link_ksettings *ks) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + u64 phy_type_mask_lo = 0; +#ifdef HAVE_ETHTOOL_100G_BITS + u64 phy_type_mask_hi = 0; +#endif /* HAVE_ETHTOOL_100G_BITS */ + u64 advert_phy_type_lo = 0; +#ifdef HAVE_ETHTOOL_100G_BITS + u64 advert_phy_type_hi = 0; + u64 phy_types_high = 0; +#endif /* HAVE_ETHTOOL_100G_BITS */ + u64 phy_types_low = 0; + u16 req_speeds; + + req_speeds = vsi->port_info->phy.link_info.req_speeds; + + /* Check if lenient mode is supported and enabled, or in strict mode. + * + * In lenient mode the Supported link modes are the PHY types without + * media. The Advertising link mode is either 1. the user requested + * speed, 2. the override PHY mask, or 3. the PHY types with media. + * + * In strict mode Supported link mode are the PHY type with media, + * and Advertising link modes are the media PHY type or the speed + * requested by user. + */ + if (test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags)) { + phy_types_low = le64_to_cpu(pf->nvm_phy_type_lo); +#ifdef HAVE_ETHTOOL_100G_BITS + phy_types_high = le64_to_cpu(pf->nvm_phy_type_hi); + + ice_mask_min_supported_speeds(phy_types_high, &phy_types_low); +#else /* HAVE_ETHTOOL_100G_BITS */ + ice_mask_min_supported_speeds(&phy_types_low); +#endif /* !HAVE_ETHTOOL_100G_BITS */ + /* determine advertised modes based on link override only + * if it's supported and if the FW doesn't abstract the + * driver from having to account for link overrides + */ + if (ice_fw_supports_link_override(&pf->hw) && + !ice_fw_supports_report_dflt_cfg(&pf->hw)) { + struct ice_link_default_override_tlv *ldo; + + ldo = &pf->link_dflt_override; + /* If override enabled and PHY mask set, then + * Advertising link mode is the intersection of the PHY + * types without media and the override PHY mask. + */ + if (ldo->options & ICE_LINK_OVERRIDE_EN && + (ldo->phy_type_low || ldo->phy_type_high)) { + advert_phy_type_lo = + le64_to_cpu(pf->nvm_phy_type_lo) & + ldo->phy_type_low; +#ifdef HAVE_ETHTOOL_100G_BITS + advert_phy_type_hi = + le64_to_cpu(pf->nvm_phy_type_hi) & + ldo->phy_type_high; +#endif /* HAVE_ETHTOOL_100G_BITS */ + } + } + } else { + /* strict mode */ + phy_types_low = vsi->port_info->phy.phy_type_low; +#ifdef HAVE_ETHTOOL_100G_BITS + phy_types_high = vsi->port_info->phy.phy_type_high; +#endif /* HAVE_ETHTOOL_100G_BITS */ + } + + /* If Advertising link mode PHY type is not using override PHY type, + * then use PHY type with media. 
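+ * This is the case in strict mode, and in lenient mode whenever the
+ * link override did not assign advert_phy_type_lo/hi above.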
+ */ +#ifdef HAVE_ETHTOOL_100G_BITS + if (!advert_phy_type_lo && !advert_phy_type_hi) { + advert_phy_type_lo = vsi->port_info->phy.phy_type_low; + advert_phy_type_hi = vsi->port_info->phy.phy_type_high; + } +#else + if (!advert_phy_type_lo) + advert_phy_type_lo = vsi->port_info->phy.phy_type_low; +#endif /* !HAVE_ETHTOOL_100G_BITS */ + + ethtool_link_ksettings_zero_link_mode(ks, supported); + ethtool_link_ksettings_zero_link_mode(ks, advertising); + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100BASE_TX | + ICE_PHY_TYPE_LOW_100M_SGMII; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100baseT_Full); + + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100MB, + 100baseT_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_1000BASE_T | + ICE_PHY_TYPE_LOW_1G_SGMII; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_1000MB, + 1000baseT_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_1000BASE_KX; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseKX_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_1000MB, + 1000baseKX_Full); + } +#ifdef HAVE_ETHTOOL_NEW_1G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_1000BASE_SX | + ICE_PHY_TYPE_LOW_1000BASE_LX; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseX_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_1000MB, + 1000baseX_Full); + } +#else + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_1000BASE_SX | + ICE_PHY_TYPE_LOW_1000BASE_LX; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_1000MB, + 1000baseT_Full); + } +#endif /* HAVE_ETHTOOL_NEW_1G_BITS */ +#ifdef HAVE_ETHTOOL_NEW_2500MB_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_2500BASE_T; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 2500baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_2500MB, + 2500baseT_Full); + } +#else + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_2500BASE_T; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 2500baseX_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_2500MB, + 2500baseX_Full); + } +#endif /* HAVE_ETHTOOL_NEW_2500MB_BITS */ + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_2500BASE_X | + ICE_PHY_TYPE_LOW_2500BASE_KX; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 2500baseX_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_2500MB, + 2500baseX_Full); + } +#ifdef HAVE_ETHTOOL_5G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_5GBASE_T | + ICE_PHY_TYPE_LOW_5GBASE_KR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 5000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_5GB, + 5000baseT_Full); + } +#endif /* HAVE_ETHTOOL_5G_BITS */ + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_T | + ICE_PHY_TYPE_LOW_10G_SFI_DA | + ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseT_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_KR_CR1 | + ICE_PHY_TYPE_LOW_10G_SFI_C2C; + if (phy_types_low & 
phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseKR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseKR_Full); + } +#ifdef HAVE_ETHTOOL_NEW_10G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_SR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseSR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseSR_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_LR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseLR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseLR_Full); + } +#else + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_10GBASE_SR | + ICE_PHY_TYPE_LOW_10GBASE_LR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_10GB, + 10000baseT_Full); + } +#endif /* HAVE_ETHTOOL_NEW_10G_BITS */ +#ifdef HAVE_ETHTOOL_25G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_25GBASE_T | + ICE_PHY_TYPE_LOW_25GBASE_CR | + ICE_PHY_TYPE_LOW_25GBASE_CR_S | + ICE_PHY_TYPE_LOW_25GBASE_CR1 | + ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseCR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_25GB, + 25000baseCR_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_25GBASE_SR | + ICE_PHY_TYPE_LOW_25GBASE_LR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseSR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_25GB, + 25000baseSR_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_25GBASE_KR | + ICE_PHY_TYPE_LOW_25GBASE_KR_S | + ICE_PHY_TYPE_LOW_25GBASE_KR1 | + ICE_PHY_TYPE_LOW_25G_AUI_C2C; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseKR_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_25GB, + 25000baseKR_Full); + } +#endif /* HAVE_ETHTOOL_25G_BITS */ + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_40GBASE_KR4; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseKR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_40GB, + 40000baseKR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_40GBASE_CR4 | + ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC | + ICE_PHY_TYPE_LOW_40G_XLAUI; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseCR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_40GB, + 40000baseCR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_40GBASE_SR4; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseSR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_40GB, + 40000baseSR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_40GBASE_LR4; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseLR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_40GB, + 40000baseLR4_Full); + } +#ifdef HAVE_ETHTOOL_50G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_50GBASE_CR2 | + ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC | + ICE_PHY_TYPE_LOW_50G_LAUI2 | + ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC | + ICE_PHY_TYPE_LOW_50G_AUI2 | + ICE_PHY_TYPE_LOW_50GBASE_CP | + ICE_PHY_TYPE_LOW_50GBASE_SR | + ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC | + 
ICE_PHY_TYPE_LOW_50G_AUI1; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 50000baseCR2_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_50GB, + 50000baseCR2_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_50GBASE_KR2 | + ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 50000baseKR2_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_50GB, + 50000baseKR2_Full); + } +#endif /* HAVE_ETHTOOL_50G_BITS */ +#ifdef HAVE_ETHTOOL_NEW_50G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_50GBASE_SR2 | + ICE_PHY_TYPE_LOW_50GBASE_LR2 | + ICE_PHY_TYPE_LOW_50GBASE_FR | + ICE_PHY_TYPE_LOW_50GBASE_LR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 50000baseSR2_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_50GB, + 50000baseSR2_Full); + } +#endif /* HAVE_ETHTOOL_NEW_50G_BITS */ +#ifdef HAVE_ETHTOOL_100G_BITS + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100GBASE_CR4 | + ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC | + ICE_PHY_TYPE_LOW_100G_CAUI4 | + ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC | + ICE_PHY_TYPE_LOW_100G_AUI4 | + ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4 | + ICE_PHY_TYPE_LOW_100GBASE_CP2; + phy_type_mask_hi = ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC | + ICE_PHY_TYPE_HIGH_100G_CAUI2 | + ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC | + ICE_PHY_TYPE_HIGH_100G_AUI2; + if (phy_types_low & phy_type_mask_lo || + phy_types_high & phy_type_mask_hi) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100000baseCR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100GB, + 100000baseCR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100GBASE_SR4 | + ICE_PHY_TYPE_LOW_100GBASE_SR2; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100000baseSR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100GB, + 100000baseSR4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100GBASE_LR4 | + ICE_PHY_TYPE_LOW_100GBASE_DR; + if (phy_types_low & phy_type_mask_lo) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100000baseLR4_ER4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100GB, + 100000baseLR4_ER4_Full); + } + + phy_type_mask_lo = ICE_PHY_TYPE_LOW_100GBASE_KR4 | + ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4; + phy_type_mask_hi = ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4; + if (phy_types_low & phy_type_mask_lo || + phy_types_high & phy_type_mask_hi) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100000baseKR4_Full); + ice_ethtool_advertise_link_mode(ICE_AQ_LINK_SPEED_100GB, + 100000baseKR4_Full); + } +#endif /* HAVE_ETHTOOL_100G_BITS */ +} + +#define TEST_SET_BITS_TIMEOUT 50 +#define TEST_SET_BITS_SLEEP_MAX 2000 +#define TEST_SET_BITS_SLEEP_MIN 1000 + +#ifdef ETHTOOL_GLINKSETTINGS +/** + * ice_get_settings_link_up - Get Link settings for when link is up + * @ks: ethtool ksettings to fill in + * @netdev: network interface device structure + */ +static void +ice_get_settings_link_up(struct ethtool_link_ksettings *ks, + struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_port_info *pi = np->vsi->port_info; + struct ice_link_status *link_info; + struct ice_vsi *vsi = np->vsi; + + link_info = &vsi->port_info->phy.link_info; + + /* Get supported and advertised settings from PHY ability with media */ + ice_phy_type_to_ethtool(netdev, ks); + + switch (link_info->link_speed) { +#ifdef HAVE_ETHTOOL_100G_BITS + case 
ICE_AQ_LINK_SPEED_100GB: + ks->base.speed = SPEED_100000; + break; +#endif /* HAVE_ETHTOOL_100G_BITS */ +#if defined(HAVE_ETHTOOL_50G_BITS) || defined(HAVE_ETHTOOL_NEW_50G_BITS) + case ICE_AQ_LINK_SPEED_50GB: + ks->base.speed = SPEED_50000; + break; +#endif /* HAVE_ETHTOOL_50G_BITS || HAVE_ETHTOOL_NEW_50G_BITS */ + case ICE_AQ_LINK_SPEED_40GB: + ks->base.speed = SPEED_40000; + break; +#ifdef HAVE_ETHTOOL_25G_BITS + case ICE_AQ_LINK_SPEED_25GB: + ks->base.speed = SPEED_25000; + break; +#endif /* HAVE_ETHTOOL_25G_BITS */ + case ICE_AQ_LINK_SPEED_20GB: + ks->base.speed = SPEED_20000; + break; + case ICE_AQ_LINK_SPEED_10GB: + ks->base.speed = SPEED_10000; + break; +#ifdef HAVE_ETHTOOL_5G_BITS + case ICE_AQ_LINK_SPEED_5GB: + ks->base.speed = SPEED_5000; + break; +#endif /* HAVE_ETHTOOL_5G_BITS */ + case ICE_AQ_LINK_SPEED_2500MB: + ks->base.speed = SPEED_2500; + break; + case ICE_AQ_LINK_SPEED_1000MB: + ks->base.speed = SPEED_1000; + break; + case ICE_AQ_LINK_SPEED_100MB: + ks->base.speed = SPEED_100; + break; + default: + netdev_info(netdev, "WARNING: Unrecognized link_speed (0x%x).\n", + link_info->link_speed); + break; + } + ks->base.duplex = DUPLEX_FULL; + + if (link_info->an_info & ICE_AQ_AN_COMPLETED) + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, + Autoneg); + + /* Set flow control negotiated Rx/Tx pause */ + switch (pi->fc.current_mode) { + case ICE_FC_FULL: + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, Pause); + break; + case ICE_FC_TX_PAUSE: + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, Pause); + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, + Asym_Pause); + break; + case ICE_FC_RX_PAUSE: + ethtool_link_ksettings_add_link_mode(ks, lp_advertising, + Asym_Pause); + break; + case ICE_FC_PFC: + default: + ethtool_link_ksettings_del_link_mode(ks, lp_advertising, Pause); + ethtool_link_ksettings_del_link_mode(ks, lp_advertising, + Asym_Pause); + break; + } +} + +/** + * ice_get_settings_link_down - Get the Link settings when link is down + * @ks: ethtool ksettings to fill in + * @netdev: network interface device structure + * + * Reports link settings that can be determined when link is down + */ +static void +ice_get_settings_link_down(struct ethtool_link_ksettings *ks, + struct net_device *netdev) +{ + /* link is down and the driver needs to fall back on + * supported PHY types to figure out what info to display + */ + ice_phy_type_to_ethtool(netdev, ks); + + /* With no link, speed and duplex are unknown */ + ks->base.speed = SPEED_UNKNOWN; + ks->base.duplex = DUPLEX_UNKNOWN; +} + +/** + * ice_get_link_ksettings - Get Link Speed and Duplex settings + * @netdev: network interface device structure + * @ks: ethtool ksettings + * + * Reports speed/duplex settings based on media_type + */ +static int +ice_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *ks) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_aqc_get_phy_caps_data *caps; + struct ice_link_status *hw_link_info; + struct ice_vsi *vsi = np->vsi; + enum ice_status status; + int err = 0; + + ethtool_link_ksettings_zero_link_mode(ks, supported); + ethtool_link_ksettings_zero_link_mode(ks, advertising); + ethtool_link_ksettings_zero_link_mode(ks, lp_advertising); + hw_link_info = &vsi->port_info->phy.link_info; + + + /* set speed and duplex */ + if (hw_link_info->link_info & ICE_AQ_LINK_UP) + ice_get_settings_link_up(ks, netdev); + else + ice_get_settings_link_down(ks, netdev); + + /* set autoneg settings */ + ks->base.autoneg = 
(hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE; + + /* set media type settings */ + switch (vsi->port_info->phy.media_type) { + case ICE_MEDIA_FIBER: + ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); + ks->base.port = PORT_FIBRE; + break; + case ICE_MEDIA_BASET: + ethtool_link_ksettings_add_link_mode(ks, supported, TP); + ethtool_link_ksettings_add_link_mode(ks, advertising, TP); + ks->base.port = PORT_TP; + break; + case ICE_MEDIA_BACKPLANE: + ethtool_link_ksettings_add_link_mode(ks, supported, Backplane); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Backplane); + ks->base.port = PORT_NONE; + break; + case ICE_MEDIA_DA: + ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); + ethtool_link_ksettings_add_link_mode(ks, advertising, FIBRE); + ks->base.port = PORT_DA; + break; + default: + ks->base.port = PORT_OTHER; + break; + } + + /* flow control is symmetric and always supported */ + ethtool_link_ksettings_add_link_mode(ks, supported, Pause); + + caps = kzalloc(sizeof(*caps), GFP_KERNEL); + if (!caps) + return -ENOMEM; + + status = ice_aq_get_phy_caps(vsi->port_info, false, + ICE_AQC_REPORT_ACTIVE_CFG, caps, NULL); + if (status) { + err = -EIO; + goto done; + } + + /* Set the advertised flow control based on the PHY capability */ + if ((caps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) && + (caps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE)) { + ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Asym_Pause); + } else if (caps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) { + ethtool_link_ksettings_add_link_mode(ks, advertising, + Asym_Pause); + } else if (caps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) { + ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Asym_Pause); + } else { + ethtool_link_ksettings_del_link_mode(ks, advertising, Pause); + ethtool_link_ksettings_del_link_mode(ks, advertising, + Asym_Pause); + } + +#ifdef ETHTOOL_GFECPARAM + /* Set advertised FEC modes based on PHY capability */ + ethtool_link_ksettings_add_link_mode(ks, advertising, FEC_NONE); + + if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_REQ) + ethtool_link_ksettings_add_link_mode(ks, advertising, + FEC_BASER); + if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ) + ethtool_link_ksettings_add_link_mode(ks, advertising, FEC_RS); +#endif /* ETHTOOL_GFECPARAM */ + + status = ice_aq_get_phy_caps(vsi->port_info, false, + ICE_AQC_REPORT_TOPO_CAP_MEDIA, caps, NULL); + if (status) { + err = -EIO; + goto done; + } + +#ifdef ETHTOOL_GFECPARAM + /* Set supported FEC modes based on PHY capability */ + ethtool_link_ksettings_add_link_mode(ks, supported, FEC_NONE); + + if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN || + caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN) + ethtool_link_ksettings_add_link_mode(ks, supported, FEC_BASER); + if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN) + ethtool_link_ksettings_add_link_mode(ks, supported, FEC_RS); +#endif /* ETHTOOL_GFECPARAM */ + + /* Set supported and advertised autoneg */ + if (ice_is_phy_caps_an_enabled(caps)) { ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100baseT_Full); ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - 
ethtool_link_ksettings_add_link_mode(ks, advertising, - 100baseT_Full); - break; + } + +done: + kfree(caps); + return err; +} + +/** + * ice_ksettings_find_adv_link_speed - Find advertising link speed + * @ks: ethtool ksettings + */ +static u16 +ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks) +{ + u16 adv_link_speed = 0; + + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 100baseT_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_100MB; +#ifdef HAVE_ETHTOOL_NEW_1G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseX_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; +#endif /* HAVE_ETHTOOL_NEW_1G_BITS */ + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseT_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseKX_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; +#ifdef HAVE_ETHTOOL_NEW_2500MB_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 2500baseT_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB; +#endif /* HAVE_ETHTOOL_NEW_2500MB_BITS */ + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 2500baseX_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB; +#ifdef HAVE_ETHTOOL_5G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 5000baseT_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_5GB; +#endif /* HAVE_ETHTOOL_5G_BITS */ + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseT_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseKR_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; +#ifdef HAVE_ETHTOOL_NEW_10G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseSR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseLR_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; +#endif /* HAVE_ETHTOOL_NEW_10G_BITS */ +#ifdef HAVE_ETHTOOL_25G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseCR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseSR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseKR_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_25GB; +#endif /* HAVE_ETHTOOL_25G_BITS */ + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseCR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseSR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseLR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseKR4_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_40GB; +#ifdef HAVE_ETHTOOL_50G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 50000baseCR2_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 50000baseKR2_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_50GB; +#endif /* HAVE_ETHTOOL_50G_BITS */ +#ifdef HAVE_ETHTOOL_NEW_50G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 50000baseSR2_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_50GB; +#endif /* HAVE_ETHTOOL_NEW_50G_BITS */ +#ifdef HAVE_ETHTOOL_100G_BITS + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 100000baseCR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 100000baseSR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 100000baseLR4_ER4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 100000baseKR4_Full)) + adv_link_speed |= ICE_AQ_LINK_SPEED_100GB; +#endif /* HAVE_ETHTOOL_100G_BITS */ + + return adv_link_speed; 
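+	/* An adv_link_speed of 0 means no recognized modes were advertised;
+	 * callers treat that as "keep the current link speed".
+	 */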
+}
+
+/**
+ * ice_setup_autoneg
+ * @p: port info
+ * @ks: ethtool_link_ksettings
+ * @config: configuration that will be sent down to FW
+ * @autoneg_enabled: autonegotiation is enabled or not
+ * @autoneg_changed: will there be a change in autonegotiation
+ * @netdev: network interface device structure
+ *
+ * Setup PHY autonegotiation feature
+ */
+static int
+ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks,
+		  struct ice_aqc_set_phy_cfg_data *config,
+		  u8 autoneg_enabled, u8 *autoneg_changed,
+		  struct net_device *netdev)
+{
+	int err = 0;
+
+	*autoneg_changed = 0;
+
+	/* Check autoneg */
+	if (autoneg_enabled == AUTONEG_ENABLE) {
+		/* If autoneg was not already enabled */
+		if (!(p->phy.link_info.an_info & ICE_AQ_AN_COMPLETED)) {
+			/* If autoneg is not supported, return error */
+			if (!ethtool_link_ksettings_test_link_mode(ks,
+								   supported,
+								   Autoneg)) {
+				netdev_info(netdev, "Autoneg not supported on this phy.\n");
+				err = -EINVAL;
+			} else {
+				/* Autoneg is allowed to change */
+				config->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+				*autoneg_changed = 1;
+			}
+		}
+	} else {
+		/* If autoneg is currently enabled */
+		if (p->phy.link_info.an_info & ICE_AQ_AN_COMPLETED) {
+			/* If autoneg is supported, 10GBASE_T is the only PHY
+			 * that can disable it, so otherwise return error
+			 */
+			if (ethtool_link_ksettings_test_link_mode(ks,
+								  supported,
+								  Autoneg)) {
+				netdev_info(netdev, "Autoneg cannot be disabled on this phy\n");
+				err = -EINVAL;
+			} else {
+				/* Autoneg is allowed to change */
+				config->caps &= ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+				*autoneg_changed = 1;
+			}
+		}
+	}
+
+	return err;
+}
+
+/**
+ * ice_set_link_ksettings - Set Speed and Duplex
+ * @netdev: network interface device structure
+ * @ks: ethtool ksettings
+ *
+ * Set speed/duplex per media_types advertised/forced
+ */
+static int
+ice_set_link_ksettings(struct net_device *netdev,
+		       const struct ethtool_link_ksettings *ks)
+{
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	u8 autoneg, timeout = TEST_SET_BITS_TIMEOUT;
+	struct ethtool_link_ksettings copy_ks = *ks;
+	struct ethtool_link_ksettings safe_ks = {};
+	struct ice_aqc_get_phy_caps_data *phy_caps;
+	struct ice_aqc_set_phy_cfg_data config;
+	u16 adv_link_speed, curr_link_speed;
+	struct ice_pf *pf = np->vsi->back;
+	struct ice_port_info *pi;
+	u8 autoneg_changed = 0;
+	enum ice_status status;
+	u64 phy_type_high = 0;
+	u64 phy_type_low = 0;
+	int err = 0;
+	bool linkup;
+
+	pi = np->vsi->port_info;
+
+	if (!pi)
+		return -EIO;
+
+	if (pi->phy.media_type != ICE_MEDIA_BASET &&
+	    pi->phy.media_type != ICE_MEDIA_FIBER &&
+	    pi->phy.media_type != ICE_MEDIA_BACKPLANE &&
+	    pi->phy.media_type != ICE_MEDIA_DA &&
+	    pi->phy.link_info.link_info & ICE_AQ_LINK_UP)
+		return -EOPNOTSUPP;
+
+	phy_caps = kzalloc(sizeof(*phy_caps), GFP_KERNEL);
+	if (!phy_caps)
+		return -ENOMEM;
+
+	/* Get the PHY capabilities based on media */
+	if (ice_fw_supports_report_dflt_cfg(pi->hw))
+		status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_DFLT_CFG,
+					     phy_caps, NULL);
+	else
+		status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA,
+					     phy_caps, NULL);
+	if (status) {
+		err = -EIO;
+		goto done;
+	}
+
+	/* save autoneg out of ksettings */
+	autoneg = copy_ks.base.autoneg;
+
+	/* Get link modes supported by hardware. */
+	ice_phy_type_to_ethtool(netdev, &safe_ks);
+
+	/* and check against modes requested by user.
+	 * Return an error if unsupported mode was set.
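+	 * (For example, a request to advertise 25000baseCR_Full is rejected
+	 * here when the PHY reports no 25G PHY types.)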
+	 */
+	if (!bitmap_subset(copy_ks.link_modes.advertising,
+			   safe_ks.link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS)) {
+		if (!test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags))
+			netdev_info(netdev, "The selected speed is not supported by the current media. Please select a link speed that is supported by the current media.\n");
+		err = -EOPNOTSUPP;
+		goto done;
+	}
+
+	/* get our own copy of the bits to check against */
+	memset(&safe_ks, 0, sizeof(safe_ks));
+	safe_ks.base.cmd = copy_ks.base.cmd;
+	safe_ks.base.link_mode_masks_nwords =
+		copy_ks.base.link_mode_masks_nwords;
+	ice_get_link_ksettings(netdev, &safe_ks);
+
+	/* set autoneg back to what it currently is */
+	copy_ks.base.autoneg = safe_ks.base.autoneg;
+	/* we don't compare the speed */
+	copy_ks.base.speed = safe_ks.base.speed;
+
+	/* If copy_ks.base and safe_ks.base are not the same now, then they are
+	 * trying to set something that we do not support.
+	 */
+	if (memcmp(&copy_ks.base, &safe_ks.base, sizeof(copy_ks.base))) {
+		err = -EOPNOTSUPP;
+		goto done;
+	}
+
+	while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) {
+		timeout--;
+		if (!timeout) {
+			err = -EBUSY;
+			goto done;
+		}
+		usleep_range(TEST_SET_BITS_SLEEP_MIN, TEST_SET_BITS_SLEEP_MAX);
+	}
+
+	/* Copy the current user PHY configuration. The current user PHY
+	 * configuration is initialized during probe from PHY capabilities
+	 * software mode, and updated on set PHY configuration.
+	 */
+	config = pi->phy.curr_user_phy_cfg;
+
+	config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+
+	/* Check autoneg */
+	err = ice_setup_autoneg(pi, &safe_ks, &config, autoneg, &autoneg_changed,
+				netdev);
+
+	if (err)
+		goto done;
+
+	/* Call to get the current link speed */
+	pi->phy.get_link_info = true;
+	status = ice_get_link_status(pi, &linkup);
+	if (status) {
+		err = -EIO;
+		goto done;
+	}
+
+	curr_link_speed = pi->phy.link_info.link_speed;
+	adv_link_speed = ice_ksettings_find_adv_link_speed(ks);
+
+	/* If speed didn't get set, set it to what it currently is.
+	 * This is needed because if advertise is 0 (as it is when autoneg
+	 * is disabled) then speed won't get set.
+	 */
+	if (!adv_link_speed)
+		adv_link_speed = curr_link_speed;
+
+	/* Convert the advertised link speeds to their corresponding PHY_TYPE */
+	ice_update_phy_type(&phy_type_low, &phy_type_high, adv_link_speed);
+
+	if (!autoneg_changed && adv_link_speed == curr_link_speed) {
+		netdev_info(netdev, "Nothing changed, exiting without setting anything.\n");
+		goto done;
+	}
+
+	/* save the requested speeds */
+	pi->phy.link_info.req_speeds = adv_link_speed;
+
+	/* set link and auto negotiation so changes take effect */
+	config.caps |= ICE_AQ_PHY_ENA_LINK;
+
+	/* check if there is a PHY type for the requested advertised speed */
+	if (!(phy_type_low || phy_type_high)) {
+		netdev_info(netdev, "The selected speed is not supported by the current media. Please select a link speed that is supported by the current media.\n");
+		err = -EOPNOTSUPP;
+		goto done;
+	}
+
+	/* intersect requested advertised speed PHY types with media PHY types
+	 * for set PHY configuration
+	 */
+	config.phy_type_high = cpu_to_le64(phy_type_high) &
+			phy_caps->phy_type_high;
+	config.phy_type_low = cpu_to_le64(phy_type_low) &
+			phy_caps->phy_type_low;
+
+	if (!(config.phy_type_high || config.phy_type_low)) {
+		/* If there is no intersection and lenient mode is enabled, then
+		 * intersect the requested advertised speed with NVM media type
+		 * PHY types.
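+		 * (In lenient mode the NVM PHY types, rather than the media
+		 * PHY types, bound what may be configured.)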
+ */ + if (test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags)) { + config.phy_type_high = cpu_to_le64(phy_type_high) & + pf->nvm_phy_type_hi; + config.phy_type_low = cpu_to_le64(phy_type_low) & + pf->nvm_phy_type_lo; + } else { + netdev_info(netdev, "The selected speed is not supported by the current media. Please select a link speed that is supported by the current media.\n"); + err = -EOPNOTSUPP; + goto done; + } + } + + /* If link is up put link down */ + if (pi->phy.link_info.link_info & ICE_AQ_LINK_UP) { + /* Tell the OS link is going down, the link will go + * back up when fw says it is ready asynchronously + */ + ice_print_link_msg(np->vsi, false); + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + } + + /* make the aq call */ + status = ice_aq_set_phy_cfg(&pf->hw, pi, &config, NULL); + if (status) { + netdev_info(netdev, "Set phy config failed,\n"); + err = -EIO; + goto done; + } + + /* Save speed request */ + pi->phy.curr_user_speed_req = adv_link_speed; +done: + kfree(phy_caps); + clear_bit(ICE_CFG_BUSY, pf->state); + + return err; +} +#else /* ETHTOOL_GLINKSETTINGS */ + +/** + * ice_get_legacy_settings_link_up - Get the Link settings for when link is up + * @ecmd: ethtool command to fill in + * @netdev: network interface device structure + * + * Reports link settings that can be determined when link is up + */ +static void +ice_get_legacy_settings_link_up(struct ethtool_cmd *ecmd, + struct net_device *netdev) +{ + struct ethtool_link_ksettings ks, cap_ks; + struct ice_link_status *hw_link_info; + struct ice_netdev_priv *np; + struct ice_vsi *vsi; + u64 phy_types_low; + + np = netdev_priv(netdev); + vsi = np->vsi; + hw_link_info = &vsi->port_info->phy.link_info; + phy_types_low = hw_link_info->phy_type_low; + + /* Initialize supported and advertised settings based on PHY settings */ + switch (phy_types_low) { + case ICE_PHY_TYPE_LOW_100BASE_TX: case ICE_PHY_TYPE_LOW_100M_SGMII: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100baseT_Full); + ecmd->supported = SUPPORTED_100baseT_Full; + if (phy_types_low == ICE_PHY_TYPE_LOW_100M_SGMII) + ecmd->advertising = ADVERTISED_100baseT_Full; break; case ICE_PHY_TYPE_LOW_1000BASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_1G_SGMII: - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseT_Full); - break; case ICE_PHY_TYPE_LOW_1000BASE_SX: case ICE_PHY_TYPE_LOW_1000BASE_LX: - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseX_Full); + case ICE_PHY_TYPE_LOW_1G_SGMII: + ecmd->supported = SUPPORTED_1000baseT_Full; + if (phy_types_low == ICE_PHY_TYPE_LOW_1000BASE_T) + ecmd->advertising = ADVERTISED_1000baseT_Full; break; case ICE_PHY_TYPE_LOW_1000BASE_KX: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseKX_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseKX_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 
2500baseT_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_X: - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseX_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_KX: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseX_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 2500baseX_Full); - break; - case ICE_PHY_TYPE_LOW_5GBASE_T: - case ICE_PHY_TYPE_LOW_5GBASE_KR: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 5000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 5000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10G_SFI_DA: - case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: - case ICE_PHY_TYPE_LOW_10G_SFI_C2C: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_SR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseSR_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseLR_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseKR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseKR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_T: - case ICE_PHY_TYPE_LOW_25GBASE_CR: - case ICE_PHY_TYPE_LOW_25GBASE_CR_S: - case ICE_PHY_TYPE_LOW_25GBASE_CR1: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseCR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseCR_Full); - break; - case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_25G_AUI_C2C: - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseCR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_SR: - case ICE_PHY_TYPE_LOW_25GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseSR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_KR: - case ICE_PHY_TYPE_LOW_25GBASE_KR1: - case ICE_PHY_TYPE_LOW_25GBASE_KR_S: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseKR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseKR_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_CR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_40G_XLAUI: - ethtool_link_ksettings_add_link_mode(ks, supported, - 
40000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_SR4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseSR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_LR4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseLR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_KR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseKR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseKR4_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_CR2: - case ICE_PHY_TYPE_LOW_50GBASE_CP: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseCR2_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseCR2_Full); - break; - case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_LAUI2: - case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI2: - case ICE_PHY_TYPE_LOW_50GBASE_SR: - case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI1: - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseCR2_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_KR2: - case ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseKR2_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseKR2_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_SR2: - case ICE_PHY_TYPE_LOW_50GBASE_LR2: - case ICE_PHY_TYPE_LOW_50GBASE_FR: - case ICE_PHY_TYPE_LOW_50GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseSR2_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_CR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_CAUI4: - case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_AUI4: - case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_CP2: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseCR4_Full); + ecmd->supported = SUPPORTED_1000baseKX_Full; + ecmd->advertising = ADVERTISED_1000baseKX_Full; break; - case ICE_PHY_TYPE_LOW_100GBASE_SR4: - case ICE_PHY_TYPE_LOW_100GBASE_SR2: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseSR4_Full); + case ICE_PHY_TYPE_LOW_2500BASE_T: + case ICE_PHY_TYPE_LOW_2500BASE_X: + case ICE_PHY_TYPE_LOW_2500BASE_KX: + ecmd->supported = SUPPORTED_2500baseX_Full; + if (phy_types_low == ICE_PHY_TYPE_LOW_2500BASE_T || + phy_types_low == ICE_PHY_TYPE_LOW_2500BASE_KX) + ecmd->advertising = ADVERTISED_2500baseX_Full; break; - case ICE_PHY_TYPE_LOW_100GBASE_LR4: - case ICE_PHY_TYPE_LOW_100GBASE_DR: - ethtool_link_ksettings_add_link_mode(ks, 
supported,
-						     100000baseLR4_ER4_Full);
+	case ICE_PHY_TYPE_LOW_10GBASE_T:
+	case ICE_PHY_TYPE_LOW_10G_SFI_DA:
+	case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC:
+	case ICE_PHY_TYPE_LOW_10GBASE_SR:
+	case ICE_PHY_TYPE_LOW_10GBASE_LR:
+		ecmd->supported = SUPPORTED_10000baseT_Full;
+		if (phy_types_low == ICE_PHY_TYPE_LOW_10GBASE_T)
+			ecmd->advertising = ADVERTISED_10000baseT_Full;
 		break;
-	case ICE_PHY_TYPE_LOW_100GBASE_KR4:
-	case ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4:
-		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
-		ethtool_link_ksettings_add_link_mode(ks, supported,
-						     100000baseKR4_Full);
-		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
-		ethtool_link_ksettings_add_link_mode(ks, advertising,
-						     100000baseKR4_Full);
+	case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1:
+	case ICE_PHY_TYPE_LOW_10G_SFI_C2C:
+		ecmd->supported = SUPPORTED_10000baseKR_Full;
+		ecmd->advertising = ADVERTISED_10000baseKR_Full;
 		break;
-	default:
-		unrecog_phy_low = true;
-	}
-
-	switch (link_info->phy_type_high) {
-	case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4:
-		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
-		ethtool_link_ksettings_add_link_mode(ks, supported,
-						     100000baseKR4_Full);
-		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
-		ethtool_link_ksettings_add_link_mode(ks, advertising,
-						     100000baseKR4_Full);
+	case ICE_PHY_TYPE_LOW_40GBASE_CR4:
+	case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC:
+	case ICE_PHY_TYPE_LOW_40G_XLAUI:
+		ecmd->supported = SUPPORTED_40000baseCR4_Full;
+		if (phy_types_low == ICE_PHY_TYPE_LOW_40GBASE_CR4)
+			ecmd->advertising = ADVERTISED_40000baseCR4_Full;
 		break;
-	case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC:
-	case ICE_PHY_TYPE_HIGH_100G_CAUI2:
-	case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC:
-	case ICE_PHY_TYPE_HIGH_100G_AUI2:
-		ethtool_link_ksettings_add_link_mode(ks, supported,
-						     100000baseCR4_Full);
+	case ICE_PHY_TYPE_LOW_40GBASE_SR4:
+		ecmd->supported = SUPPORTED_40000baseSR4_Full;
+		break;
+	case ICE_PHY_TYPE_LOW_40GBASE_LR4:
+		ecmd->supported = SUPPORTED_40000baseLR4_Full;
+		break;
+	case ICE_PHY_TYPE_LOW_40GBASE_KR4:
+		ecmd->supported = SUPPORTED_40000baseKR4_Full;
+		ecmd->advertising = ADVERTISED_40000baseKR4_Full;
 		break;
 	default:
-		unrecog_phy_high = true;
-	}
-
-	if (unrecog_phy_low && unrecog_phy_high) {
 		/* if we got here and link is up something bad is afoot */
-		netdev_info(netdev,
-			    "WARNING: Unrecognized PHY_Low (0x%llx).\n",
-			    (u64)link_info->phy_type_low);
-		netdev_info(netdev,
-			    "WARNING: Unrecognized PHY_High (0x%llx).\n",
-			    (u64)link_info->phy_type_high);
+		netdev_info(netdev, "WARNING: Link up but PhyType isn't recognized.\n");
+		netdev_info(netdev, "WARNING: Unrecognized PHY_Low (0x%llx).\n",
+			    (u64)phy_types_low);
 	}
 
 	/* Now that we've worked out everything that could be supported by the
-	 * current PHY type, get what is supported by the NVM and intersect
-	 * them to get what is truly supported
+	 * current PHY type, get what is supported by the NVM and AND them
+	 * together to get what is truly supported
+	 *
+	 * ice_phy_type_to_ethtool uses the new API ethtool_link_ksettings
+	 * struct, so we need to convert ecmd to a ksettings to intersect them,
+	 * then convert back to legacy ethtool_cmd.
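+	 * Only link_modes word 0 survives the round trip, since the legacy
+	 * ethtool_cmd supported/advertising masks are plain u32 bitmaps.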
*/ - memset(&cap_ksettings, 0, sizeof(cap_ksettings)); - ice_phy_type_to_ethtool(netdev, &cap_ksettings); - ethtool_intersect_link_masks(ks, &cap_ksettings); - - switch (link_info->link_speed) { - case ICE_AQ_LINK_SPEED_100GB: - ks->base.speed = SPEED_100000; - break; - case ICE_AQ_LINK_SPEED_50GB: - ks->base.speed = SPEED_50000; - break; + ks.link_modes.supported[0] = ecmd->supported; + ks.link_modes.advertising[0] = ecmd->advertising; + ice_phy_type_to_ethtool(netdev, &cap_ks); + ethtool_intersect_link_masks(&ks, &cap_ks); + ecmd->supported = (u32)ks.link_modes.supported[0]; + ecmd->advertising = (u32)ks.link_modes.advertising[0]; + + /* Set speed and duplex */ + switch (hw_link_info->link_speed) { case ICE_AQ_LINK_SPEED_40GB: - ks->base.speed = SPEED_40000; - break; - case ICE_AQ_LINK_SPEED_25GB: - ks->base.speed = SPEED_25000; - break; - case ICE_AQ_LINK_SPEED_20GB: - ks->base.speed = SPEED_20000; + ethtool_cmd_speed_set(ecmd, SPEED_40000); break; case ICE_AQ_LINK_SPEED_10GB: - ks->base.speed = SPEED_10000; - break; - case ICE_AQ_LINK_SPEED_5GB: - ks->base.speed = SPEED_5000; + ethtool_cmd_speed_set(ecmd, SPEED_10000); break; case ICE_AQ_LINK_SPEED_2500MB: - ks->base.speed = SPEED_2500; + ethtool_cmd_speed_set(ecmd, SPEED_2500); break; case ICE_AQ_LINK_SPEED_1000MB: - ks->base.speed = SPEED_1000; + ethtool_cmd_speed_set(ecmd, SPEED_1000); break; case ICE_AQ_LINK_SPEED_100MB: - ks->base.speed = SPEED_100; - break; - default: - netdev_info(netdev, - "WARNING: Unrecognized link_speed (0x%x).\n", - link_info->link_speed); - break; - } - ks->base.duplex = DUPLEX_FULL; - - if (link_info->an_info & ICE_AQ_AN_COMPLETED) - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, - Autoneg); - - /* Set flow control negotiated Rx/Tx pause */ - switch (pi->fc.current_mode) { - case ICE_FC_FULL: - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, Pause); - break; - case ICE_FC_TX_PAUSE: - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, Pause); - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, - Asym_Pause); - break; - case ICE_FC_RX_PAUSE: - ethtool_link_ksettings_add_link_mode(ks, lp_advertising, - Asym_Pause); + ethtool_cmd_speed_set(ecmd, SPEED_100); break; - case ICE_FC_PFC: - /* fall through */ default: - ethtool_link_ksettings_del_link_mode(ks, lp_advertising, Pause); - ethtool_link_ksettings_del_link_mode(ks, lp_advertising, - Asym_Pause); + netdev_info(netdev, "WARNING: Unrecognized link_speed (0x%x).\n", + hw_link_info->link_speed); break; } + ecmd->duplex = DUPLEX_FULL; } /** - * ice_get_settings_link_down - Get the Link settings when link is down - * @ks: ethtool ksettings to fill in + * ice_get_legacy_settings_link_down - Get the Link settings when link is down + * @ecmd: ethtool command to fill in * @netdev: network interface device structure * * Reports link settings that can be determined when link is down */ static void -ice_get_settings_link_down(struct ethtool_link_ksettings *ks, - struct net_device *netdev) +ice_get_legacy_settings_link_down(struct ethtool_cmd *ecmd, + struct net_device *netdev) { + struct ethtool_link_ksettings ks; + /* link is down and the driver needs to fall back on * supported PHY types to figure out what info to display */ - ice_phy_type_to_ethtool(netdev, ks); + ice_phy_type_to_ethtool(netdev, &ks); + ecmd->supported = (u32)ks.link_modes.supported[0]; + ecmd->advertising = (u32)ks.link_modes.advertising[0]; - /* With no link, speed and duplex are unknown */ - ks->base.speed = SPEED_UNKNOWN; - ks->base.duplex = DUPLEX_UNKNOWN; + 
ethtool_cmd_speed_set(ecmd, SPEED_UNKNOWN); + ecmd->duplex = DUPLEX_UNKNOWN; } /** - * ice_get_link_ksettings - Get Link Speed and Duplex settings + * ice_get_settings - Get Link Speed and Duplex settings * @netdev: network interface device structure - * @ks: ethtool ksettings + * @ecmd: ethtool command * * Reports speed/duplex settings based on media_type */ -static int -ice_get_link_ksettings(struct net_device *netdev, - struct ethtool_link_ksettings *ks) +static int ice_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_aqc_get_phy_caps_data *caps; struct ice_link_status *hw_link_info; struct ice_vsi *vsi = np->vsi; enum ice_status status; - int err = 0; + bool link_up; - ethtool_link_ksettings_zero_link_mode(ks, supported); - ethtool_link_ksettings_zero_link_mode(ks, advertising); - ethtool_link_ksettings_zero_link_mode(ks, lp_advertising); hw_link_info = &vsi->port_info->phy.link_info; + link_up = hw_link_info->link_info & ICE_AQ_LINK_UP; + /* set speed and duplex */ - if (hw_link_info->link_info & ICE_AQ_LINK_UP) - ice_get_settings_link_up(ks, netdev); + if (link_up) + ice_get_legacy_settings_link_up(ecmd, netdev); else - ice_get_settings_link_down(ks, netdev); + ice_get_legacy_settings_link_down(ecmd, netdev); /* set autoneg settings */ - ks->base.autoneg = (hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? + ecmd->autoneg = (hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? AUTONEG_ENABLE : AUTONEG_DISABLE; - /* set media type settings */ + /* Set media type settings */ switch (vsi->port_info->phy.media_type) { case ICE_MEDIA_FIBER: - ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); - ks->base.port = PORT_FIBRE; + ecmd->supported |= SUPPORTED_FIBRE; + ecmd->port = PORT_FIBRE; break; case ICE_MEDIA_BASET: - ethtool_link_ksettings_add_link_mode(ks, supported, TP); - ethtool_link_ksettings_add_link_mode(ks, advertising, TP); - ks->base.port = PORT_TP; + ecmd->supported |= SUPPORTED_TP; + ecmd->advertising |= ADVERTISED_TP; + ecmd->port = PORT_TP; break; case ICE_MEDIA_BACKPLANE: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, Backplane); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Backplane); - ks->base.port = PORT_NONE; + ecmd->supported |= SUPPORTED_Backplane; + ecmd->advertising |= ADVERTISED_Backplane; + ecmd->port = PORT_NONE; break; case ICE_MEDIA_DA: - ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); - ethtool_link_ksettings_add_link_mode(ks, advertising, FIBRE); - ks->base.port = PORT_DA; + ecmd->supported |= SUPPORTED_FIBRE; + ecmd->advertising |= ADVERTISED_FIBRE; + ecmd->port = PORT_DA; break; default: - ks->base.port = PORT_OTHER; + ecmd->port = PORT_OTHER; break; } - /* flow control is symmetric and always supported */ - ethtool_link_ksettings_add_link_mode(ks, supported, Pause); - - caps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*caps), GFP_KERNEL); + caps = kzalloc(sizeof(*caps), GFP_KERNEL); if (!caps) return -ENOMEM; status = ice_aq_get_phy_caps(vsi->port_info, false, - ICE_AQC_REPORT_SW_CFG, caps, NULL); + ICE_AQC_REPORT_TOPO_CAP_MEDIA, caps, NULL); if (status) { - err = -EIO; - goto done; + dev_dbg(ice_pf_to_dev(vsi->back), "get PHY caps failed, status %s\n", + ice_stat_str(status)); + kfree(caps); + return -EIO; } - /* Set the advertised flow control based on the PHY capability */ - if ((caps->caps & 
ICE_AQC_PHY_EN_TX_LINK_PAUSE) && - (caps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE)) { - ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Asym_Pause); - } else if (caps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) { - ethtool_link_ksettings_add_link_mode(ks, advertising, - Asym_Pause); - } else if (caps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) { - ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); - ethtool_link_ksettings_add_link_mode(ks, advertising, - Asym_Pause); - } else { - ethtool_link_ksettings_del_link_mode(ks, advertising, Pause); - ethtool_link_ksettings_del_link_mode(ks, advertising, - Asym_Pause); + /* Set supported and advertised autoneg */ + if (ice_is_phy_caps_an_enabled(caps)) { + ecmd->supported |= SUPPORTED_Autoneg; + ecmd->advertising |= ADVERTISED_Autoneg; } - /* Set advertised FEC modes based on PHY capability */ - ethtool_link_ksettings_add_link_mode(ks, advertising, FEC_NONE); + ecmd->transceiver = XCVR_EXTERNAL; - if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_REQ || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_REQ) - ethtool_link_ksettings_add_link_mode(ks, advertising, - FEC_BASER); - if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ) - ethtool_link_ksettings_add_link_mode(ks, advertising, FEC_RS); + /* flow control is symmetric and always supported */ + ecmd->supported |= SUPPORTED_Pause; - status = ice_aq_get_phy_caps(vsi->port_info, false, - ICE_AQC_REPORT_TOPO_CAP, caps, NULL); - if (status) { - err = -EIO; - goto done; + switch (vsi->port_info->fc.req_mode) { + case ICE_FC_RX_PAUSE: + ecmd->advertising |= ADVERTISED_Pause | ADVERTISED_Asym_Pause; + break; + case ICE_FC_TX_PAUSE: + ecmd->advertising |= ADVERTISED_Asym_Pause; + break; + case ICE_FC_FULL: + ecmd->advertising |= ADVERTISED_Pause; + break; + case ICE_FC_PFC: + default: + ecmd->advertising &= ~(ADVERTISED_Pause | + ADVERTISED_Asym_Pause); + break; } - /* Set supported FEC modes based on PHY capability */ - ethtool_link_ksettings_add_link_mode(ks, supported, FEC_NONE); - - if (caps->link_fec_options & ICE_AQC_PHY_FEC_10G_KR_40G_KR4_EN || - caps->link_fec_options & ICE_AQC_PHY_FEC_25G_KR_CLAUSE74_EN) - ethtool_link_ksettings_add_link_mode(ks, supported, FEC_BASER); - if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_CLAUSE91_EN) - ethtool_link_ksettings_add_link_mode(ks, supported, FEC_RS); - -done: - devm_kfree(&vsi->back->pdev->dev, caps); - return err; + kfree(caps); + return 0; } /** - * ice_ksettings_find_adv_link_speed - Find advertising link speed - * @ks: ethtool ksettings + * ice_legacy_find_adv_link_speed + * @advertise_phy: advertisement PHY value + * + * Find advertising link speed */ static u16 -ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks) +ice_legacy_find_adv_link_speed(u32 advertise_phy) { u16 adv_link_speed = 0; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 100baseT_Full)) + if (advertise_phy & ADVERTISED_100baseT_Full) adv_link_speed |= ICE_AQ_LINK_SPEED_100MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 1000baseX_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 1000baseT_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 1000baseKX_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 2500baseT_Full)) - adv_link_speed 
|= ICE_AQ_LINK_SPEED_2500MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 2500baseX_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 5000baseT_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_5GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 10000baseT_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 10000baseKR_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 10000baseSR_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 10000baseLR_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 25000baseCR_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 25000baseSR_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 25000baseKR_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_25GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 40000baseCR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 40000baseSR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 40000baseLR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 40000baseKR4_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_40GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 50000baseCR2_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 50000baseKR2_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_50GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 50000baseSR2_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_50GB; - if (ethtool_link_ksettings_test_link_mode(ks, advertising, - 100000baseCR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 100000baseSR4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 100000baseLR4_ER4_Full) || - ethtool_link_ksettings_test_link_mode(ks, advertising, - 100000baseKR4_Full)) - adv_link_speed |= ICE_AQ_LINK_SPEED_100GB; + if (advertise_phy & ADVERTISED_1000baseT_Full || + advertise_phy & ADVERTISED_1000baseKX_Full) + adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB; + if (advertise_phy & ADVERTISED_2500baseX_Full) + adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB; + if (advertise_phy & ADVERTISED_10000baseT_Full || + advertise_phy & ADVERTISED_10000baseKR_Full) + adv_link_speed |= ICE_AQ_LINK_SPEED_10GB; + if (advertise_phy & ADVERTISED_40000baseKR4_Full || + advertise_phy & ADVERTISED_40000baseCR4_Full || + advertise_phy & ADVERTISED_40000baseSR4_Full || + advertise_phy & ADVERTISED_40000baseLR4_Full) + adv_link_speed |= ICE_AQ_LINK_SPEED_40GB; return adv_link_speed; } @@ -2281,7 +3872,7 @@ ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks) /** * ice_setup_autoneg * @p: port info - * @ks: ethtool_link_ksettings + * @ecmd: ethtool command * @config: configuration that will be sent down to FW * @autoneg_enabled: autonegotiation is enabled or not * @autoneg_changed: will there a change in autonegotiation @@ -2290,7 +3881,7 @@ ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks) * Setup PHY autonegotiation feature */ static int -ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks, +ice_setup_autoneg(struct ice_port_info *p, struct ethtool_cmd *ecmd, struct ice_aqc_set_phy_cfg_data *config, u8 autoneg_enabled, u8 *autoneg_changed, struct net_device *netdev) @@ -2299,21 +3890,18 @@ ice_setup_autoneg(struct ice_port_info *p, struct 
ethtool_link_ksettings *ks,
 	*autoneg_changed = 0;
 
-	/* Check autoneg */
 	if (autoneg_enabled == AUTONEG_ENABLE) {
 		/* If autoneg was not already enabled */
 		if (!(p->phy.link_info.an_info & ICE_AQ_AN_COMPLETED)) {
 			/* If autoneg is not supported, return error */
-			if (!ethtool_link_ksettings_test_link_mode(ks,
-								   supported,
-								   Autoneg)) {
+			if (!(ecmd->supported & SUPPORTED_Autoneg)) {
 				netdev_info(netdev, "Autoneg not supported on this phy.\n");
-				err = -EINVAL;
-			} else {
-				/* Autoneg is allowed to change */
-				config->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
-				*autoneg_changed = 1;
+				return -EINVAL;
 			}
+
+			/* Autoneg is allowed to change */
+			config->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+			*autoneg_changed = 1;
 		}
 	} else {
 		/* If autoneg is currently enabled */
@@ -2321,16 +3909,14 @@ ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks,
 			/* If autoneg is supported 10GBASE_T is the only PHY
 			 * that can disable it, so otherwise return error
 			 */
-			if (ethtool_link_ksettings_test_link_mode(ks,
-								  supported,
-								  Autoneg)) {
-				netdev_info(netdev, "Autoneg cannot be disabled on this phy\n");
-				err = -EINVAL;
-			} else {
-				/* Autoneg is allowed to change */
-				config->caps &= ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
-				*autoneg_changed = 1;
+			if (ecmd->supported & SUPPORTED_Autoneg) {
+				netdev_info(netdev, "Autoneg cannot be disabled.\n");
+				return -EINVAL;
 			}
+
+			/* Autoneg is allowed to change */
+			config->caps &= ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+			*autoneg_changed = 1;
 		}
 	}
 
@@ -2338,28 +3924,27 @@ ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks,
 }
 
 /**
- * ice_set_link_ksettings - Set Speed and Duplex
+ * ice_set_settings - Set Speed and Duplex
  * @netdev: network interface device structure
- * @ks: ethtool ksettings
+ * @ecmd: ethtool command
  *
 * Set speed/duplex per media_types advertised/forced
 */
-static int
-ice_set_link_ksettings(struct net_device *netdev,
-		       const struct ethtool_link_ksettings *ks)
+static int ice_set_settings(struct net_device *netdev, struct ethtool_cmd *ecmd)
 {
-	u8 autoneg, timeout = TEST_SET_BITS_TIMEOUT, lport = 0;
 	struct ice_netdev_priv *np = netdev_priv(netdev);
-	struct ethtool_link_ksettings safe_ks, copy_ks;
 	struct ice_aqc_get_phy_caps_data *abilities;
+	u8 autoneg, timeout = TEST_SET_BITS_TIMEOUT;
 	u16 adv_link_speed, curr_link_speed, idx;
 	struct ice_aqc_set_phy_cfg_data config;
 	struct ice_pf *pf = np->vsi->back;
+	struct ethtool_cmd safe_ecmd;
 	struct ice_port_info *p;
 	u8 autoneg_changed = 0;
 	enum ice_status status;
-	u64 phy_type_high;
-	u64 phy_type_low;
+	u64 phy_type_high = 0;
+	u64 phy_type_low = 0;
+	u32 advertise;
 	int err = 0;
 	bool linkup;
 
@@ -2383,76 +3968,64 @@ ice_set_link_ksettings(struct net_device *netdev,
 	    p->phy.link_info.link_info & ICE_AQ_LINK_UP)
 		return -EOPNOTSUPP;
 
-	/* copy the ksettings to copy_ks to avoid modifying the original */
-	memcpy(&copy_ks, ks, sizeof(copy_ks));
-
-	/* save autoneg out of ksettings */
-	autoneg = copy_ks.base.autoneg;
-
-	memset(&safe_ks, 0, sizeof(safe_ks));
-
-	/* Get link modes supported by hardware.*/
-	ice_phy_type_to_ethtool(netdev, &safe_ks);
-
-	/* and check against modes requested by user.
-	 * Return an error if unsupported mode was set.
- */ - if (!bitmap_subset(copy_ks.link_modes.advertising, - safe_ks.link_modes.supported, - __ETHTOOL_LINK_MODE_MASK_NBITS)) - return -EINVAL; - /* get our own copy of the bits to check against */ - memset(&safe_ks, 0, sizeof(safe_ks)); - safe_ks.base.cmd = copy_ks.base.cmd; - safe_ks.base.link_mode_masks_nwords = - copy_ks.base.link_mode_masks_nwords; - ice_get_link_ksettings(netdev, &safe_ks); + memset(&safe_ecmd, 0, sizeof(safe_ecmd)); + ice_get_settings(netdev, &safe_ecmd); - /* set autoneg back to what it currently is */ - copy_ks.base.autoneg = safe_ks.base.autoneg; - /* we don't compare the speed */ - copy_ks.base.speed = safe_ks.base.speed; + /* save autoneg and speed out of ecmd */ + autoneg = ecmd->autoneg; + advertise = ecmd->advertising; - /* If copy_ks.base and safe_ks.base are not the same now, then they are - * trying to set something that we do not support. + /* set autoneg and speed back to what they currently are */ + ecmd->autoneg = safe_ecmd.autoneg; + ecmd->speed = safe_ecmd.speed; + ecmd->advertising = safe_ecmd.advertising; + ecmd->cmd = safe_ecmd.cmd; + + /* If ecmd and safe_ecmd are not the same now, then they are + * trying to set something that we do not support */ - if (memcmp(&copy_ks.base, &safe_ks.base, sizeof(copy_ks.base))) + if (memcmp(ecmd, &safe_ecmd, sizeof(*ecmd))) return -EOPNOTSUPP; - while (test_and_set_bit(__ICE_CFG_BUSY, pf->state)) { + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) { timeout--; if (!timeout) return -EBUSY; usleep_range(TEST_SET_BITS_SLEEP_MIN, TEST_SET_BITS_SLEEP_MAX); } - abilities = devm_kzalloc(&pf->pdev->dev, sizeof(*abilities), - GFP_KERNEL); + abilities = kzalloc(sizeof(*abilities), GFP_KERNEL); if (!abilities) return -ENOMEM; /* Get the current PHY config */ - status = ice_aq_get_phy_caps(p, false, ICE_AQC_REPORT_SW_CFG, abilities, - NULL); + status = ice_aq_get_phy_caps(p, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA, + abilities, NULL); if (status) { err = -EAGAIN; goto done; } - /* Copy abilities to config in case autoneg is not set below */ - memset(&config, 0, sizeof(config)); - config.caps = abilities->caps & ~ICE_AQC_PHY_AN_MODE; - if (abilities->caps & ICE_AQC_PHY_AN_MODE) - config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + /* Copy the current user PHY configuration. The current user PHY + * configuration is initialized during probe from PHY capabilities + * software mode, and updated on set PHY configuration. + */ + memcpy(&config, &p->phy.curr_user_phy_cfg, sizeof(config)); - /* Check autoneg */ - err = ice_setup_autoneg(p, &safe_ks, &config, autoneg, &autoneg_changed, - netdev); + config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT; + /* Check autoneg */ + err = ice_setup_autoneg(p, &safe_ecmd, &config, autoneg, + &autoneg_changed, netdev); if (err) goto done; + if (advertise & ~safe_ecmd.supported) { + err = -EINVAL; + goto done; + } + /* Call to get the current link speed */ p->phy.get_link_info = true; status = ice_get_link_status(p, &linkup); @@ -2462,7 +4035,7 @@ ice_set_link_ksettings(struct net_device *netdev, } curr_link_speed = p->phy.link_info.link_speed; - adv_link_speed = ice_ksettings_find_adv_link_speed(ks); + adv_link_speed = ice_legacy_find_adv_link_speed(advertise); /* If speed didn't get set, set it to what it currently is.
* This is needed because if advertise is 0 (as it is when autoneg @@ -2479,12 +4052,6 @@ ice_set_link_ksettings(struct net_device *netdev, goto done; } - /* copy over the rest of the abilities */ - config.low_power_ctrl = abilities->low_power_ctrl; - config.eee_cap = abilities->eee_cap; - config.eeer_value = abilities->eeer_value; - config.link_fec_opt = abilities->link_fec_options; - /* save the requested speeds */ p->phy.link_info.req_speeds = adv_link_speed; @@ -2497,34 +4064,283 @@ ice_set_link_ksettings(struct net_device *netdev, config.phy_type_low = cpu_to_le64(phy_type_low) & abilities->phy_type_low; } else { - err = -EAGAIN; netdev_info(netdev, "Nothing changed. No PHY_TYPE is corresponded to advertised link speed.\n"); goto done; } - /* If link is up put link down */ + /* If link is up, put link down */ if (p->phy.link_info.link_info & ICE_AQ_LINK_UP) { /* Tell the OS link is going down, the link will go - * back up when fw says it is ready asynchronously + * back up when FW says it is ready asynchronously */ ice_print_link_msg(np->vsi, false); netif_carrier_off(netdev); netif_tx_stop_all_queues(netdev); } - /* make the aq call */ - status = ice_aq_set_phy_cfg(&pf->hw, lport, &config, NULL); + /* make the AQ call */ + status = ice_aq_set_phy_cfg(&pf->hw, p, &config, NULL); if (status) { netdev_info(netdev, "Set phy config failed,\n"); err = -EAGAIN; + goto done; } + /* Save speed request */ + p->phy.curr_user_speed_req = adv_link_speed; done: - devm_kfree(&pf->pdev->dev, abilities); - clear_bit(__ICE_CFG_BUSY, pf->state); + kfree(abilities); + clear_bit(ICE_CFG_BUSY, pf->state); return err; } +#endif /* ETHTOOL_GLINKSETTINGS */ + +/** + * ice_parse_hdrs - parses headers from RSS hash input + * @nfc: ethtool rxnfc command + * + * This function parses the rxnfc command and returns intended + * header types for RSS configuration + */ +static u32 ice_parse_hdrs(struct ethtool_rxnfc *nfc) +{ + u32 hdrs = ICE_FLOW_SEG_HDR_NONE; + + switch (nfc->flow_type) { + case TCP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV4; + break; + case UDP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV4; + break; + case SCTP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV4; + break; + case TCP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV6; + break; + case UDP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV6; + break; + case SCTP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV6; + break; + default: + break; + } + return hdrs; +} + +#define ICE_FLOW_HASH_FLD_IPV4_SA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) +#define ICE_FLOW_HASH_FLD_IPV6_SA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) +#define ICE_FLOW_HASH_FLD_IPV4_DA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) +#define ICE_FLOW_HASH_FLD_IPV6_DA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) +#define ICE_FLOW_HASH_FLD_TCP_SRC_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_TCP_DST_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT) +#define ICE_FLOW_HASH_FLD_UDP_SRC_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_UDP_DST_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT) +#define ICE_FLOW_HASH_FLD_SCTP_SRC_PORT \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_SCTP_DST_PORT \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT) + +/** + * ice_parse_hash_flds - parses hash fields from RSS hash input + * @nfc: ethtool rxnfc command + * + * This function parses the rxnfc command and returns intended + * hash fields for RSS configuration + */ 
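+/* For example, "ethtool -N <dev> rx-flow-hash tcp4 sdfn" reaches this parser as flow_type TCP_V4_FLOW with nfc->data set to RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3, which the function below maps to the IPv4 source/destination address and TCP source/destination port hash fields. */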
+static u64 ice_parse_hash_flds(struct ethtool_rxnfc *nfc) +{ + u64 hfld = ICE_HASH_INVALID; + + if (nfc->data & RXH_IP_SRC || nfc->data & RXH_IP_DST) { + switch (nfc->flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + if (nfc->data & RXH_IP_SRC) + hfld |= ICE_FLOW_HASH_FLD_IPV4_SA; + if (nfc->data & RXH_IP_DST) + hfld |= ICE_FLOW_HASH_FLD_IPV4_DA; + break; + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + if (nfc->data & RXH_IP_SRC) + hfld |= ICE_FLOW_HASH_FLD_IPV6_SA; + if (nfc->data & RXH_IP_DST) + hfld |= ICE_FLOW_HASH_FLD_IPV6_DA; + break; + default: + break; + } + } + + if (nfc->data & RXH_L4_B_0_1 || nfc->data & RXH_L4_B_2_3) { + switch (nfc->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_TCP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_TCP_DST_PORT; + break; + case UDP_V4_FLOW: + case UDP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_UDP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_UDP_DST_PORT; + break; + case SCTP_V4_FLOW: + case SCTP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_SCTP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_SCTP_DST_PORT; + break; + default: + break; + } + } + + return hfld; +} + +/** + * ice_set_rss_hash_opt - Enable/Disable flow types for RSS hash + * @vsi: the VSI being configured + * @nfc: ethtool rxnfc command + * + * Returns Success if the flow input set is supported. + */ +static int +ice_set_rss_hash_opt(struct ice_vsi *vsi, struct ethtool_rxnfc *nfc) +{ + struct ice_pf *pf = vsi->back; + struct ice_rss_hash_cfg cfg; + enum ice_status status; + struct device *dev; + u64 hashed_flds; + u32 hdrs; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. Package download failed, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + hashed_flds = ice_parse_hash_flds(nfc); + if (hashed_flds == ICE_HASH_INVALID) { + dev_dbg(dev, "Invalid hash fields, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + hdrs = ice_parse_hdrs(nfc); + if (hdrs == ICE_FLOW_SEG_HDR_NONE) { + dev_dbg(dev, "Header type is not valid, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + cfg.hash_flds = hashed_flds; + cfg.addl_hdrs = hdrs; + cfg.hdr_type = ICE_RSS_ANY_HEADERS; + cfg.symm = false; + status = ice_add_rss_cfg(&pf->hw, vsi->idx, &cfg); + if (status) { + dev_dbg(dev, "ice_add_rss_cfg failed, vsi num = %d, error = %s\n", + vsi->vsi_num, ice_stat_str(status)); + return -EINVAL; + } + + return 0; +} + +/** + * ice_get_rss_hash_opt - Retrieve hash fields for a given flow-type + * @vsi: the VSI being configured + * @nfc: ethtool rxnfc command + */ +static void +ice_get_rss_hash_opt(struct ice_vsi *vsi, struct ethtool_rxnfc *nfc) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + u64 hash_flds; + u32 hdrs; + + dev = ice_pf_to_dev(pf); + + nfc->data = 0; + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. 
Package download failed, vsi num = %d\n", + vsi->vsi_num); + return; + } + + hdrs = ice_parse_hdrs(nfc); + if (hdrs == ICE_FLOW_SEG_HDR_NONE) { + dev_dbg(dev, "Header type is not valid, vsi num = %d\n", + vsi->vsi_num); + return; + } + + hash_flds = ice_get_rss_cfg(&pf->hw, vsi->idx, hdrs); + if (hash_flds == ICE_HASH_INVALID) { + dev_dbg(dev, "No hash fields found for the given header type, vsi num = %d\n", + vsi->vsi_num); + return; + } + + if (hash_flds & ICE_FLOW_HASH_FLD_IPV4_SA || + hash_flds & ICE_FLOW_HASH_FLD_IPV6_SA) + nfc->data |= (u64)RXH_IP_SRC; + + if (hash_flds & ICE_FLOW_HASH_FLD_IPV4_DA || + hash_flds & ICE_FLOW_HASH_FLD_IPV6_DA) + nfc->data |= (u64)RXH_IP_DST; + + if (hash_flds & ICE_FLOW_HASH_FLD_TCP_SRC_PORT || + hash_flds & ICE_FLOW_HASH_FLD_UDP_SRC_PORT || + hash_flds & ICE_FLOW_HASH_FLD_SCTP_SRC_PORT) + nfc->data |= (u64)RXH_L4_B_0_1; + + if (hash_flds & ICE_FLOW_HASH_FLD_TCP_DST_PORT || + hash_flds & ICE_FLOW_HASH_FLD_UDP_DST_PORT || + hash_flds & ICE_FLOW_HASH_FLD_SCTP_DST_PORT) + nfc->data |= (u64)RXH_L4_B_2_3; +} + +/** + * ice_set_rxnfc - command to set Rx flow rules. + * @netdev: network interface device structure + * @cmd: ethtool rxnfc command + * + * Returns 0 for success and negative values for errors + */ +static int ice_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + + switch (cmd->cmd) { + case ETHTOOL_SRXCLSRLINS: + return ice_add_ntuple_ethtool(vsi, cmd); + case ETHTOOL_SRXCLSRLDEL: + return ice_del_ntuple_ethtool(vsi, cmd); + case ETHTOOL_SRXFH: + return ice_set_rss_hash_opt(vsi, cmd); + default: + break; + } + return -EOPNOTSUPP; +} /** * ice_get_rxnfc - command to get Rx flow classification rules @@ -2541,12 +4357,31 @@ ice_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; int ret = -EOPNOTSUPP; + struct ice_hw *hw; + + hw = &vsi->back->hw; switch (cmd->cmd) { case ETHTOOL_GRXRINGS: cmd->data = vsi->rss_size; ret = 0; break; + case ETHTOOL_GRXCLSRLCNT: + cmd->rule_cnt = hw->fdir_active_fltr; + /* report max rule count */ + cmd->data = ice_ntuple_get_max_fltr_cnt(hw); + ret = 0; + break; + case ETHTOOL_GRXCLSRULE: + ret = ice_get_ethtool_fdir_entry(hw, cmd); + break; + case ETHTOOL_GRXCLSRLALL: + ret = ice_get_fdir_fltr_ids(hw, cmd, (u32 *)rule_locs); + break; + case ETHTOOL_GRXFH: + ice_get_rss_hash_opt(vsi, cmd); + ret = 0; + break; default: break; } @@ -2577,10 +4412,13 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) { struct ice_ring *tx_rings = NULL, *rx_rings = NULL; struct ice_netdev_priv *np = netdev_priv(netdev); +#ifdef HAVE_XDP_SUPPORT + struct ice_ring *xdp_rings = NULL; +#endif /* HAVE_XDP_SUPPORT */ struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; int i, timeout = 50, err = 0; - u32 new_rx_cnt, new_tx_cnt; + u16 new_rx_cnt, new_tx_cnt; if (ring->tx_pending > ICE_MAX_NUM_DESC || ring->tx_pending < ICE_MIN_NUM_DESC || @@ -2595,13 +4433,11 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) new_tx_cnt = ALIGN(ring->tx_pending, ICE_REQ_DESC_MULTIPLE); if (new_tx_cnt != ring->tx_pending) - netdev_info(netdev, - "Requested Tx descriptor count rounded up to %d\n", + netdev_info(netdev, "Requested Tx descriptor count rounded up to %d\n", new_tx_cnt); new_rx_cnt = ALIGN(ring->rx_pending, ICE_REQ_DESC_MULTIPLE); if (new_rx_cnt != ring->rx_pending) - netdev_info(netdev, - 
"Requested Rx descriptor count rounded up to %d\n", + netdev_info(netdev, "Requested Rx descriptor count rounded up to %d\n", new_rx_cnt); /* if nothing to do return success */ @@ -2611,7 +4447,16 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) return 0; } - while (test_and_set_bit(__ICE_CFG_BUSY, pf->state)) { +#ifdef HAVE_AF_XDP_ZC_SUPPORT + /* If there is a AF_XDP UMEM attached to any of Rx rings, + * disallow changing the number of descriptors -- regardless + * if the netdev is running or not. + */ + if (ice_xsk_any_rx_ring_ena(vsi)) + return -EBUSY; + +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) { timeout--; if (!timeout) return -EBUSY; @@ -2624,6 +4469,13 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) vsi->tx_rings[i]->count = new_tx_cnt; for (i = 0; i < vsi->alloc_rxq; i++) vsi->rx_rings[i]->count = new_rx_cnt; +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(vsi)) + for (i = 0; i < vsi->num_xdp_txq; i++) + vsi->xdp_rings[i]->count = new_tx_cnt; +#endif /* HAVE_XDP_SUPPORT */ + vsi->num_tx_desc = (u16)new_tx_cnt; + vsi->num_rx_desc = (u16)new_rx_cnt; netdev_dbg(netdev, "Link is down, descriptor count change happens when link is brought up\n"); goto done; } @@ -2635,8 +4487,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) netdev_info(netdev, "Changing Tx descriptor count from %d to %d\n", vsi->tx_rings[0]->count, new_tx_cnt); - tx_rings = devm_kcalloc(&pf->pdev->dev, vsi->num_txq, - sizeof(*tx_rings), GFP_KERNEL); + tx_rings = kcalloc(vsi->num_txq, sizeof(*tx_rings), GFP_KERNEL); if (!tx_rings) { err = -ENOMEM; goto done; @@ -2650,15 +4501,44 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) tx_rings[i].tx_buf = NULL; err = ice_setup_tx_ring(&tx_rings[i]); if (err) { - while (i) { - i--; + while (i--) ice_clean_tx_ring(&tx_rings[i]); - } - devm_kfree(&pf->pdev->dev, tx_rings); + kfree(tx_rings); goto done; } } +#ifdef HAVE_XDP_SUPPORT + if (!ice_is_xdp_ena_vsi(vsi)) + goto process_rx; + + /* alloc updated XDP resources */ + netdev_info(netdev, "Changing XDP descriptor count from %d to %d\n", + vsi->xdp_rings[0]->count, new_tx_cnt); + + xdp_rings = kcalloc(vsi->num_xdp_txq, sizeof(*xdp_rings), GFP_KERNEL); + if (!xdp_rings) { + err = -ENOMEM; + goto free_tx; + } + + for (i = 0; i < vsi->num_xdp_txq; i++) { + /* clone ring and setup updated count */ + xdp_rings[i] = *vsi->xdp_rings[i]; + xdp_rings[i].count = new_tx_cnt; + xdp_rings[i].desc = NULL; + xdp_rings[i].tx_buf = NULL; + err = ice_setup_tx_ring(&xdp_rings[i]); + if (err) { + while (i--) + ice_clean_tx_ring(&xdp_rings[i]); + kfree(xdp_rings); + goto free_tx; + } + ice_set_ring_xdp(&xdp_rings[i]); + } +#endif /* HAVE_XDP_SUPPORT */ + process_rx: if (new_rx_cnt == vsi->rx_rings[0]->count) goto process_link; @@ -2667,8 +4547,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) netdev_info(netdev, "Changing Rx descriptor count from %d to %d\n", vsi->rx_rings[0]->count, new_rx_cnt); - rx_rings = devm_kcalloc(&pf->pdev->dev, vsi->num_rxq, - sizeof(*rx_rings), GFP_KERNEL); + rx_rings = kcalloc(vsi->num_rxq, sizeof(*rx_rings), GFP_KERNEL); if (!rx_rings) { err = -ENOMEM; goto done; @@ -2698,7 +4577,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) i--; ice_free_rx_ring(&rx_rings[i]); } - devm_kfree(&pf->pdev->dev, rx_rings); + kfree(rx_rings); err = -ENOMEM; goto free_tx; } @@ -2708,7 +4587,7 @@ 
ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) /* Bring interface down, copy in the new ring info, then restore the * interface. if VSI is up, bring it down and then back up */ - if (!test_and_set_bit(__ICE_DOWN, vsi->state)) { + if (!test_and_set_bit(ICE_VSI_DOWN, vsi->state)) { ice_down(vsi); if (tx_rings) { @@ -2716,7 +4595,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) ice_free_tx_ring(vsi->tx_rings[i]); *vsi->tx_rings[i] = tx_rings[i]; } - devm_kfree(&pf->pdev->dev, tx_rings); + kfree(tx_rings); } if (rx_rings) { @@ -2734,9 +4613,21 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) rx_rings[i].next_to_alloc = 0; *vsi->rx_rings[i] = rx_rings[i]; } - devm_kfree(&pf->pdev->dev, rx_rings); + kfree(rx_rings); + } + +#ifdef HAVE_XDP_SUPPORT + if (xdp_rings) { + for (i = 0; i < vsi->num_xdp_txq; i++) { + ice_free_tx_ring(vsi->xdp_rings[i]); + *vsi->xdp_rings[i] = xdp_rings[i]; + } + kfree(xdp_rings); } +#endif /* HAVE_XDP_SUPPORT */ + vsi->num_tx_desc = new_tx_cnt; + vsi->num_rx_desc = new_rx_cnt; ice_up(vsi); } goto done; @@ -2746,38 +4637,15 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) if (tx_rings) { ice_for_each_txq(vsi, i) ice_free_tx_ring(&tx_rings[i]); - devm_kfree(&pf->pdev->dev, tx_rings); + kfree(tx_rings); } done: - clear_bit(__ICE_CFG_BUSY, pf->state); + clear_bit(ICE_CFG_BUSY, pf->state); return err; } -static int ice_nway_reset(struct net_device *netdev) -{ - /* restart autonegotiation */ - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_port_info *pi; - enum ice_status status; - - pi = vsi->port_info; - /* If VSI state is up, then restart autoneg with link up */ - if (!test_bit(__ICE_DOWN, vsi->back->state)) - status = ice_aq_set_link_restart_an(pi, true, NULL); - else - status = ice_aq_set_link_restart_an(pi, false, NULL); - - if (status) { - netdev_info(netdev, "link restart failed, err %d aq_err %d\n", - status, pi->hw->adminq.sq_last_status); - return -EIO; - } - - return 0; -} - +#ifdef ETHTOOL_GLINKSETTINGS /** * ice_get_pauseparam - Get Flow Control status * @netdev: network interface device structure @@ -2794,7 +4662,6 @@ ice_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_port_info *pi = np->vsi->port_info; struct ice_aqc_get_phy_caps_data *pcaps; - struct ice_vsi *vsi = np->vsi; struct ice_dcbx_cfg *dcbx_cfg; enum ice_status status; @@ -2802,34 +4669,82 @@ ice_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) pause->rx_pause = 0; pause->tx_pause = 0; - dcbx_cfg = &pi->local_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; - pcaps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*pcaps), - GFP_KERNEL); + pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); if (!pcaps) return; /* Get current PHY config */ - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps, + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps, NULL); if (status) goto out; - pause->autoneg = ((pcaps->caps & ICE_AQC_PHY_AN_MODE) ? - AUTONEG_ENABLE : AUTONEG_DISABLE); + pause->autoneg = ice_is_phy_caps_an_enabled(pcaps) ? 
AUTONEG_ENABLE : + AUTONEG_DISABLE; + + if (dcbx_cfg->pfc.pfcena) + /* PFC enabled so report LFC as off */ + goto out; + + if (pcaps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) + pause->tx_pause = 1; + if (pcaps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) + pause->rx_pause = 1; + +out: + kfree(pcaps); +} +#else /* ETHTOOL_GLINKSETTINGS */ + +/** + * ice_get_pauseparam - Get Flow Control status + * @netdev: network interface device structure + * @pause: ethernet pause (flow control) parameters + * + * Get autonegotiated flow control status from link status. + */ +static void +ice_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_port_info *pi = np->vsi->port_info; + struct ice_link_status *hw_link_info; + struct ice_dcbx_cfg *dcbx_cfg; + + hw_link_info = &pi->phy.link_info; + + /* Initialize pause params */ + pause->rx_pause = 0; + pause->tx_pause = 0; + + pause->autoneg = ((hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE); + + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; if (dcbx_cfg->pfc.pfcena) /* PFC enabled so report LFC as off */ - goto out; + return; - if (pcaps->caps & ICE_AQC_PHY_EN_TX_LINK_PAUSE) + /* Get flow control status based on autonegotiation */ + switch (pi->fc.current_mode) { + case ICE_FC_TX_PAUSE: pause->tx_pause = 1; - if (pcaps->caps & ICE_AQC_PHY_EN_RX_LINK_PAUSE) + break; + case ICE_FC_RX_PAUSE: pause->rx_pause = 1; - -out: - devm_kfree(&vsi->back->pdev->dev, pcaps); + break; + case ICE_FC_FULL: + pause->tx_pause = 1; + pause->rx_pause = 1; + break; + default: + break; + } } +#endif /* ETHTOOL_GLINKSETTINGS */ /** * ice_set_pauseparam - Set Flow Control parameter @@ -2840,7 +4755,9 @@ static int ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { struct ice_netdev_priv *np = netdev_priv(netdev); +#ifdef ETHTOOL_GLINKSETTINGS struct ice_aqc_get_phy_caps_data *pcaps; +#endif struct ice_link_status *hw_link_info; struct ice_pf *pf = np->vsi->back; struct ice_dcbx_cfg *dcbx_cfg; @@ -2855,7 +4772,7 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) pi = vsi->port_info; hw_link_info = &pi->phy.link_info; - dcbx_cfg = &pi->local_dcbx_cfg; + dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; link_up = hw_link_info->link_info & ICE_AQ_LINK_UP; /* Changing the port's flow control is not supported if this isn't the @@ -2866,6 +4783,7 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) return -EOPNOTSUPP; } +#ifdef ETHTOOL_GLINKSETTINGS /* Get pause param reports configured and negotiated flow control pause * when ETHTOOL_GLINKSETTINGS is defined. Since ETHTOOL_GLINKSETTINGS is * defined get pause param pause->autoneg reports SW configured setting, @@ -2877,17 +4795,23 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) return -ENOMEM; /* Get current PHY config */ - status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps, + status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps, NULL); if (status) { kfree(pcaps); return -EIO; } - is_an = ((pcaps->caps & ICE_AQC_PHY_AN_MODE) ? + is_an = ice_is_phy_caps_an_enabled(pcaps) ? AUTONEG_ENABLE : + AUTONEG_DISABLE; +#else /* ETHTOOL_GLINKSETTINGS */ + is_an = ((hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ? 
AUTONEG_ENABLE : AUTONEG_DISABLE); +#endif /* ETHTOOL_GLINKSETTINGS */ +#ifdef ETHTOOL_GLINKSETTINGS kfree(pcaps); +#endif if (pause->autoneg != is_an) { netdev_info(netdev, "To change autoneg please use: ethtool -s <dev> autoneg <on|off>\n"); @@ -2895,7 +4819,7 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) } /* If we have link and don't have autoneg */ - if (!test_bit(__ICE_DOWN, pf->state) && + if (!test_bit(ICE_DOWN, pf->state) && !(hw_link_info->an_info & ICE_AQ_AN_COMPLETED)) { /* Send message that it might not necessarily work*/ netdev_info(netdev, "Autoneg did not complete so changing settings may not result in an actual change.\n"); @@ -2920,34 +4844,26 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) status = ice_set_fc(pi, &aq_failures, link_up); if (aq_failures & ICE_SET_FC_AQ_FAIL_GET) { - netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with err %d aq_err %d\n", - status, hw->adminq.sq_last_status); + netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); err = -EAGAIN; } else if (aq_failures & ICE_SET_FC_AQ_FAIL_SET) { - netdev_info(netdev, "Set fc failed on the set_phy_config call with err %d aq_err %d\n", - status, hw->adminq.sq_last_status); + netdev_info(netdev, "Set fc failed on the set_phy_config call with err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); err = -EAGAIN; } else if (aq_failures & ICE_SET_FC_AQ_FAIL_UPDATE) { - netdev_info(netdev, "Set fc failed on the get_link_info call with err %d aq_err %d\n", - status, hw->adminq.sq_last_status); + netdev_info(netdev, "Set fc failed on the get_link_info call with err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); err = -EAGAIN; } - if (!test_bit(__ICE_DOWN, pf->state)) { - /* Give it a little more time to try to come back. If still - * down, restart autoneg link or reinitialize the interface. - */ - msleep(75); - if (!test_bit(__ICE_DOWN, pf->state)) - return ice_nway_reset(netdev); - - ice_down(vsi); - ice_up(vsi); - } - return err; } +#if defined(ETHTOOL_GRSSH) && defined(ETHTOOL_SRSSH) /** * ice_get_rxfh_key_size - get the RSS hash key size * @netdev: network interface device structure @@ -2972,116 +4888,463 @@ static u32 ice_get_rxfh_indir_size(struct net_device *netdev) return np->vsi->rss_table_size; } +#ifdef HAVE_RXFH_HASHFUNC +/** + * ice_get_rxfh - get the Rx flow hash indirection table + * @netdev: network interface device structure + * @indir: indirection table + * @key: hash key + * @hfunc: hash function + * + * Reads the indirection table directly from the hardware.
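+ * + * Returns 0 on success; -EIO if RSS is not enabled on this VSI, or a negative error code if reading the RSS key or LUT from hardware fails.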
+ */ +static int +ice_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) +#else +static int ice_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +#endif +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int err, i; + u8 *lut; + +#ifdef HAVE_RXFH_HASHFUNC + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; +#endif + + if (!indir) + return 0; + + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + /* RSS not supported return error here */ + netdev_warn(netdev, "RSS is not configured on this VSI!\n"); + return -EIO; + } + + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); + if (!lut) + return -ENOMEM; + + err = ice_get_rss_key(vsi, key); + if (err) + goto out; + + err = ice_get_rss_lut(vsi, lut, vsi->rss_table_size); + if (err) + goto out; + + for (i = 0; i < vsi->rss_table_size; i++) + indir[i] = (u32)(lut[i]); + +out: + kfree(lut); + return err; +} + +#ifdef HAVE_RXFH_HASHFUNC +/** + * ice_set_rxfh - set the Rx flow hash indirection table + * @netdev: network interface device structure + * @indir: indirection table + * @key: hash key + * @hfunc: hash function + * + * Returns -EINVAL if the table specifies an invalid queue ID, otherwise + * returns 0 after programming the table. + */ +static int +ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key, + const u8 hfunc) +#elif defined(HAVE_RXFH_NONCONST) +static int ice_set_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +#else +static int +ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key) +#endif /* HAVE_RXFH_HASHFUNC */ +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); +#ifdef HAVE_RXFH_HASHFUNC + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; +#endif + + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + /* RSS not supported return error here */ + netdev_warn(netdev, "RSS is not configured on this VSI!\n"); + return -EIO; + } + + /* Verify user input. 
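+ * Every entry in the indirection table must reference a queue index below the current RSS size; out-of-range entries are rejected with -EINVAL.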
*/ + if (indir) { + int i; + + for (i = 0; i < vsi->rss_table_size; i++) + if (indir[i] >= vsi->rss_size) + return -EINVAL; + } + +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(netdev, + "Cannot change RSS params with ADQ configured.\n"); + return -EOPNOTSUPP; + } + +#endif /* NETIF_F_HW_TC */ + if (key) { + if (!vsi->rss_hkey_user) { + vsi->rss_hkey_user = + devm_kzalloc(dev, ICE_VSIQF_HKEY_ARRAY_SIZE, + GFP_KERNEL); + if (!vsi->rss_hkey_user) + return -ENOMEM; + } + memcpy(vsi->rss_hkey_user, key, ICE_VSIQF_HKEY_ARRAY_SIZE); + + err = ice_set_rss_key(vsi, vsi->rss_hkey_user); + if (err) + return err; + } + + if (!vsi->rss_lut_user) { + vsi->rss_lut_user = devm_kzalloc(dev, vsi->rss_table_size, + GFP_KERNEL); + if (!vsi->rss_lut_user) + return -ENOMEM; + } + + /* Each 32 bits pointed by 'indir' is stored with a lut entry */ + if (indir) { + int i; + + for (i = 0; i < vsi->rss_table_size; i++) + vsi->rss_lut_user[i] = (u8)(indir[i]); + } else { + ice_fill_rss_lut(vsi->rss_lut_user, vsi->rss_table_size, + vsi->rss_size); + } + + err = ice_set_rss_lut(vsi, vsi->rss_lut_user, vsi->rss_table_size); + if (err) + return err; + + return 0; +} +#endif /* ETHTOOL_GRSSH && ETHTOOL_SRSSH */ + +static int +ice_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) +{ + struct ice_pf *pf = ice_netdev_to_pf(dev); + + /* only report timestamping if PTP is enabled */ + if (!test_bit(ICE_FLAG_PTP, pf->flags)) + return ethtool_op_get_ts_info(dev, info); + + info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + info->phc_index = ice_get_ptp_clock_index(pf); + + info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON); + + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | BIT(HWTSTAMP_FILTER_ALL); + + return 0; +} + +/** + * ice_get_max_txq - return the maximum number of Tx queues in a PF + * @pf: PF structure + */ +static int ice_get_max_txq(struct ice_pf *pf) +{ + return min3(pf->num_lan_msix, (u16)num_online_cpus(), + (u16)pf->hw.func_caps.common_cap.num_txq); +} + +/** + * ice_get_max_rxq - return the maximum number of Rx queues in a PF + * @pf: PF structure + */ +static int ice_get_max_rxq(struct ice_pf *pf) +{ + return min3(pf->num_lan_msix, (u16)num_online_cpus(), + (u16)pf->hw.func_caps.common_cap.num_rxq); +} + +/** + * ice_get_combined_cnt - return the current number of combined channels + * @vsi: PF VSI pointer + * + * Go through all queue vectors and count ones that have both Rx and Tx ring + * attached + */ +static u32 ice_get_combined_cnt(struct ice_vsi *vsi) +{ + u32 combined = 0; + int q_idx; + + ice_for_each_q_vector(vsi, q_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; + + if (q_vector->rx.ring && q_vector->tx.ring) + combined++; + } + + return combined; +} + +/** + * ice_get_channels - get the current and max supported channels + * @dev: network interface device structure + * @ch: ethtool channel data structure + */ +static void +ice_get_channels(struct net_device *dev, struct ethtool_channels *ch) +{ + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + + /* report maximum channels */ + ch->max_rx = ice_get_max_rxq(pf); + ch->max_tx = ice_get_max_txq(pf); + ch->max_combined = min_t(int, ch->max_rx, ch->max_tx); + + /* report current channels */ + ch->combined_count = ice_get_combined_cnt(vsi); + ch->rx_count =
vsi->num_rxq - ch->combined_count; + ch->tx_count = vsi->num_txq - ch->combined_count; +#ifdef HAVE_NETDEV_SB_DEV + + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) { + /* L2 forwarding devices are single queue so we infer one + * device is one channel + */ + ch->max_combined += pf->max_num_macvlan; + ch->combined_count += pf->num_macvlan; + } +#endif /* HAVE_NETDEV_SB_DEV */ + + /* report other queues */ + ch->other_count = test_bit(ICE_FLAG_FD_ENA, pf->flags) ? 1 : 0; + ch->max_other = ch->other_count; +} + +/** + * ice_get_valid_rss_size - return valid number of RSS queues + * @hw: pointer to the HW structure + * @new_size: requested RSS queues + */ +static int ice_get_valid_rss_size(struct ice_hw *hw, int new_size) +{ + struct ice_hw_common_caps *caps = &hw->func_caps.common_cap; + + return min_t(int, new_size, BIT(caps->rss_table_entry_width)); +} + +/** + * ice_vsi_set_dflt_rss_lut - set default RSS LUT with requested RSS size + * @vsi: VSI to reconfigure RSS LUT on + * @req_rss_size: requested range of queue numbers for hashing + * + * Set the VSI's RSS parameters, configure the RSS LUT based on these. + */ +static int ice_vsi_set_dflt_rss_lut(struct ice_vsi *vsi, int req_rss_size) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + struct ice_hw *hw; + int err; + u8 *lut; + + dev = ice_pf_to_dev(pf); + hw = &pf->hw; + + if (!req_rss_size) + return -EINVAL; + + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); + if (!lut) + return -ENOMEM; + + /* set RSS LUT parameters */ + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + vsi->rss_size = 1; + else + vsi->rss_size = ice_get_valid_rss_size(hw, req_rss_size); + + /* create/set RSS LUT */ + ice_fill_rss_lut(lut, vsi->rss_table_size, vsi->rss_size); + err = ice_set_rss_lut(vsi, lut, vsi->rss_table_size); + if (err) + dev_err(dev, "Cannot set RSS lut, err %d aq_err %s\n", err, + ice_aq_str(hw->adminq.sq_last_status)); + + kfree(lut); + return err; +} + +/** + * ice_set_channels - set the number of channels + * @dev: network interface device structure + * @ch: ethtool channel data structure + */ +static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) +{ + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int new_rx = 0, new_tx = 0; + u32 curr_combined; + + /* do not support changing channels in Safe Mode */ + if (ice_is_safe_mode(pf)) { + netdev_err(dev, "Changing channel in Safe Mode is not supported\n"); + return -EOPNOTSUPP; + } + /* do not support changing other_count */ + if (ch->other_count != (test_bit(ICE_FLAG_FD_ENA, pf->flags) ?
1U : 0U)) + return -EINVAL; + +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + netdev_err(dev, "Cannot set channels with ADQ configured.\n"); + return -EOPNOTSUPP; + } +#endif /* NETIF_F_HW_TC */ +#ifdef HAVE_NETDEV_SB_DEV + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) { + netdev_err(dev, "Cannot set channels when L2 forwarding enabled\n"); + return -EOPNOTSUPP; + } +#endif /* HAVE_NETDEV_SB_DEV */ + + if (test_bit(ICE_FLAG_FD_ENA, pf->flags) && pf->hw.fdir_active_fltr) { + netdev_err(dev, "Cannot set channels when Flow Director filters are active\n"); + return -EOPNOTSUPP; + } + + curr_combined = ice_get_combined_cnt(vsi); + + /* these checks are for cases where user didn't specify a particular + * value on cmd line but we get non-zero value anyway via + * get_channels(); look at ethtool.c in ethtool repository (the user + * space part), particularly, do_schannels() routine + */ + if (ch->rx_count == vsi->num_rxq - curr_combined) + ch->rx_count = 0; + if (ch->tx_count == vsi->num_txq - curr_combined) + ch->tx_count = 0; + if (ch->combined_count == curr_combined) + ch->combined_count = 0; + + if (!(ch->combined_count || (ch->rx_count && ch->tx_count))) { + netdev_err(dev, "Please specify at least 1 Rx and 1 Tx channel\n"); + return -EINVAL; + } + + new_rx = ch->combined_count + ch->rx_count; + new_tx = ch->combined_count + ch->tx_count; + + if (new_rx > ice_get_max_rxq(pf)) { + netdev_err(dev, "Maximum allowed Rx channels is %d\n", + ice_get_max_rxq(pf)); + return -EINVAL; + } + if (new_tx > ice_get_max_txq(pf)) { + netdev_err(dev, "Maximum allowed Tx channels is %d\n", + ice_get_max_txq(pf)); + return -EINVAL; + } + + ice_vsi_recfg_qs(vsi, new_rx, new_tx); + +#ifdef IFF_RXFH_CONFIGURED + if (!netif_is_rxfh_configured(dev)) + return ice_vsi_set_dflt_rss_lut(vsi, new_rx); + + /* Update rss_size due to change in Rx queues */ + vsi->rss_size = ice_get_valid_rss_size(&pf->hw, new_rx); + + return 0; +#else + /* Clear the previous vsi->rss_lut_user because it is assumed to + * be invalid at this point. + */ + if (vsi->rss_lut_user) { + netdev_info(vsi->netdev, "Rx queue count changed, clearing user modified RSS LUT, re-run ethtool [-x|-X] to [check|set] settings if needed\n"); + devm_kfree(ice_pf_to_dev(pf), vsi->rss_lut_user); + vsi->rss_lut_user = NULL; + } + + return ice_vsi_set_dflt_rss_lut(vsi, new_rx); +#endif /* IFF_RXFH_CONFIGURED */ +} + /** - * ice_get_rxfh - get the Rx flow hash indirection table + * ice_get_wol - get current Wake on LAN configuration * @netdev: network interface device structure - * @indir: indirection table - * @key: hash key - * @hfunc: hash function - * - * Reads the indirection table directly from the hardware. 
+ * @wol: Ethtool structure to retrieve WoL settings */ -static int -ice_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) +static void ice_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - int ret = 0, i; - u8 *lut; - - if (hfunc) - *hfunc = ETH_RSS_HASH_TOP; - - if (!indir) - return 0; - - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { - /* RSS not supported return error here */ - netdev_warn(netdev, "RSS is not configured on this VSI!\n"); - return -EIO; - } + struct ice_pf *pf = np->vsi->back; - lut = devm_kzalloc(&pf->pdev->dev, vsi->rss_table_size, GFP_KERNEL); - if (!lut) - return -ENOMEM; + if (np->vsi->type != ICE_VSI_PF) + netdev_warn(netdev, "Wake on LAN is not supported on this interface!\n"); - if (ice_get_rss(vsi, key, lut, vsi->rss_table_size)) { - ret = -EIO; - goto out; + /* Get WoL settings based on the HW capability */ + if (ice_is_wol_supported(&pf->hw)) { + wol->supported = WAKE_MAGIC; + wol->wolopts = pf->wol_ena ? WAKE_MAGIC : 0; + } else { + wol->supported = 0; + wol->wolopts = 0; } - - for (i = 0; i < vsi->rss_table_size; i++) - indir[i] = (u32)(lut[i]); - -out: - devm_kfree(&pf->pdev->dev, lut); - return ret; } /** - * ice_set_rxfh - set the Rx flow hash indirection table + * ice_set_wol - set Wake on LAN on supported device * @netdev: network interface device structure - * @indir: indirection table - * @key: hash key - * @hfunc: hash function - * - * Returns -EINVAL if the table specifies an invalid queue ID, otherwise - * returns 0 after programming the table. + * @wol: Ethtool structure to set WoL */ -static int -ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key, - const u8 hfunc) +static int ice_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - u8 *seed = NULL; - if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + if (vsi->type != ICE_VSI_PF || !ice_is_wol_supported(&pf->hw)) return -EOPNOTSUPP; - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { - /* RSS not supported return error here */ - netdev_warn(netdev, "RSS is not configured on this VSI!\n"); - return -EIO; - } - - if (key) { - if (!vsi->rss_hkey_user) { - vsi->rss_hkey_user = - devm_kzalloc(&pf->pdev->dev, - ICE_VSIQF_HKEY_ARRAY_SIZE, - GFP_KERNEL); - if (!vsi->rss_hkey_user) - return -ENOMEM; - } - memcpy(vsi->rss_hkey_user, key, ICE_VSIQF_HKEY_ARRAY_SIZE); - seed = vsi->rss_hkey_user; - } - - if (!vsi->rss_lut_user) { - vsi->rss_lut_user = devm_kzalloc(&pf->pdev->dev, - vsi->rss_table_size, - GFP_KERNEL); - if (!vsi->rss_lut_user) - return -ENOMEM; - } - - /* Each 32 bits pointed by 'indir' is stored with a lut entry */ - if (indir) { - int i; + /* only magic packet is supported */ + if (wol->wolopts && wol->wolopts != WAKE_MAGIC) + return -EOPNOTSUPP; - for (i = 0; i < vsi->rss_table_size; i++) - vsi->rss_lut_user[i] = (u8)(indir[i]); - } else { - ice_fill_rss_lut(vsi->rss_lut_user, vsi->rss_table_size, - vsi->rss_size); + /* Set WoL only if there is a new value */ + if (pf->wol_ena != !!wol->wolopts) { + pf->wol_ena = !!wol->wolopts; + device_set_wakeup_enable(ice_pf_to_dev(pf), pf->wol_ena); + netdev_dbg(netdev, "WoL magic packet %sabled\n", + pf->wol_ena ? 
"en" : "dis"); } - if (ice_set_rss(vsi, seed, vsi->rss_lut_user, vsi->rss_table_size)) - return -EIO; - return 0; } @@ -3106,25 +5369,21 @@ static int ice_get_rc_coalesce(struct ethtool_coalesce *ec, enum ice_container_type c_type, struct ice_ring_container *rc) { - struct ice_pf *pf; - if (!rc->ring) return -EINVAL; - pf = rc->ring->vsi->back; - switch (c_type) { case ICE_RX_CONTAINER: - ec->use_adaptive_rx_coalesce = ITR_IS_DYNAMIC(rc->itr_setting); - ec->rx_coalesce_usecs = rc->itr_setting & ~ICE_ITR_DYNAMIC; + ec->use_adaptive_rx_coalesce = ITR_IS_DYNAMIC(rc); + ec->rx_coalesce_usecs = rc->itr_setting; ec->rx_coalesce_usecs_high = rc->ring->q_vector->intrl; break; case ICE_TX_CONTAINER: - ec->use_adaptive_tx_coalesce = ITR_IS_DYNAMIC(rc->itr_setting); - ec->tx_coalesce_usecs = rc->itr_setting & ~ICE_ITR_DYNAMIC; + ec->use_adaptive_tx_coalesce = ITR_IS_DYNAMIC(rc); + ec->tx_coalesce_usecs = rc->itr_setting; break; default: - dev_dbg(&pf->pdev->dev, "Invalid c_type %d\n", c_type); + dev_dbg(ice_pf_to_dev(rc->ring->vsi->back), "Invalid c_type %d\n", c_type); return -EINVAL; } @@ -3197,12 +5456,14 @@ ice_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) return __ice_get_coalesce(netdev, ec, -1); } +#ifdef ETHTOOL_PERQUEUE static int ice_get_per_q_coalesce(struct net_device *netdev, u32 q_num, struct ethtool_coalesce *ec) { return __ice_get_coalesce(netdev, ec, q_num); } +#endif /* ETHTOOL_PERQUEUE */ /** * ice_set_rc_coalesce - set ITR values for specific ring container @@ -3234,17 +5495,21 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec, if (ec->rx_coalesce_usecs_high > ICE_MAX_INTRL || (ec->rx_coalesce_usecs_high && ec->rx_coalesce_usecs_high < pf->hw.intrl_gran)) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs-high valid values are 0 (disabled), %d-%d\n", + netdev_info(vsi->netdev, "Invalid value, %s-usecs-high valid values are 0 (disabled), %d-%d\n", c_type_str, pf->hw.intrl_gran, ICE_MAX_INTRL); return -EINVAL; } + if (ec->rx_coalesce_usecs_high != rc->ring->q_vector->intrl && + (ec->use_adaptive_rx_coalesce || ec->use_adaptive_tx_coalesce)) { + netdev_info(vsi->netdev, "Invalid value, %s-usecs-high cannot be changed if adaptive-tx or adaptive-rx is enabled\n", + c_type_str); + return -EINVAL; + } if (ec->rx_coalesce_usecs_high != rc->ring->q_vector->intrl) { rc->ring->q_vector->intrl = ec->rx_coalesce_usecs_high; - wr32(&pf->hw, GLINT_RATE(rc->ring->q_vector->reg_idx), - ice_intrl_usec_to_reg(ec->rx_coalesce_usecs_high, - pf->hw.intrl_gran)); + ice_write_intrl(rc->ring->q_vector, + ec->rx_coalesce_usecs_high); } use_adaptive_coalesce = ec->use_adaptive_rx_coalesce; @@ -3252,53 +5517,47 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec, break; case ICE_TX_CONTAINER: - if (ec->tx_coalesce_usecs_high) { - netdev_info(vsi->netdev, - "setting %s-usecs-high is not supported\n", - c_type_str); - return -EINVAL; - } +#ifndef ETHTOOL_COALESCE_USECS + if (ec->tx_coalesce_usecs_high) + return -EOPNOTSUPP; +#endif /* !ETHTOOL_COALESCE_USECS */ use_adaptive_coalesce = ec->use_adaptive_tx_coalesce; coalesce_usecs = ec->tx_coalesce_usecs; break; default: - dev_dbg(&pf->pdev->dev, "Invalid container type %d\n", c_type); + dev_dbg(ice_pf_to_dev(pf), "Invalid container type %d\n", + c_type); return -EINVAL; } - itr_setting = rc->itr_setting & ~ICE_ITR_DYNAMIC; + itr_setting = rc->itr_setting; if (coalesce_usecs != itr_setting && use_adaptive_coalesce) { - netdev_info(vsi->netdev, - "%s interrupt throttling 
cannot be changed if adaptive-%s is enabled\n", + netdev_info(vsi->netdev, "%s interrupt throttling cannot be changed if adaptive-%s is enabled\n", c_type_str, c_type_str); return -EINVAL; } if (coalesce_usecs > ICE_ITR_MAX) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs range is 0-%d\n", + netdev_info(vsi->netdev, "Invalid value, %s-usecs range is 0-%d\n", c_type_str, ICE_ITR_MAX); return -EINVAL; } - /* hardware only supports an ITR granularity of 2us */ - if (coalesce_usecs % 2 != 0) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs must be even\n", - c_type_str); - return -EINVAL; - } - if (use_adaptive_coalesce) { - rc->itr_setting |= ICE_ITR_DYNAMIC; + rc->itr_mode = ITR_DYNAMIC; } else { + rc->itr_mode = ITR_STATIC; /* store user facing value how it was set */ rc->itr_setting = coalesce_usecs; - /* set to static and convert to value HW understands */ - rc->target_itr = - ITR_TO_REG(ITR_REG_ALIGN(rc->itr_setting)); + /* write the change to the register */ + ice_write_itr(rc, coalesce_usecs); + /* force writes to take effect immediately, the flush shouldn't + * be done in the functions above because the intent is for + * them to do lazy writes. + */ + ice_flush(&pf->hw); } return 0; @@ -3344,6 +5603,50 @@ ice_set_q_coalesce(struct ice_vsi *vsi, struct ethtool_coalesce *ec, int q_num) return 0; } +#ifndef ETHTOOL_COALESCE_USECS +/** + * ice_is_coalesce_param_invalid - check for unsupported coalesce parameters + * @ec: ethtool structure to fill with driver's coalesce settings + */ +static bool ice_is_coalesce_param_invalid(struct ethtool_coalesce *ec) +{ + if (ec->stats_block_coalesce_usecs || ec->rate_sample_interval || + ec->pkt_rate_low || ec->pkt_rate_high || + ec->rx_max_coalesced_frames || ec->rx_coalesce_usecs_irq || + ec->rx_max_coalesced_frames_irq || ec->tx_max_coalesced_frames || + ec->tx_coalesce_usecs_irq || ec->tx_max_coalesced_frames_irq || + ec->rx_coalesce_usecs_low || ec->rx_max_coalesced_frames_low || + ec->tx_coalesce_usecs_low || ec->tx_max_coalesced_frames_low || + ec->rx_max_coalesced_frames_high || + ec->tx_max_coalesced_frames_high) + return true; + + return false; +} +#endif /* !ETHTOOL_COALESCE_USECS */ + +/** + * ice_print_if_odd_usecs - print message if user tries to set odd [tx|rx]-usecs + * @netdev: netdev used for print + * @itr_setting: previous user setting + * @use_adaptive_coalesce: if adaptive coalesce is enabled or being enabled + * @coalesce_usecs: requested value of [tx|rx]-usecs + * @c_type_str: either "rx" or "tx" to match user set field of [tx|rx]-usecs + */ +static void +ice_print_if_odd_usecs(struct net_device *netdev, u16 itr_setting, + u32 use_adaptive_coalesce, u32 coalesce_usecs, + const char *c_type_str) +{ + if (use_adaptive_coalesce) + return; + + if (itr_setting != coalesce_usecs && (coalesce_usecs % 2)) + netdev_info(netdev, "User set %s-usecs to %d, device only supports even values. 
Rounding down and attempting to set %s-usecs to %d\n", + c_type_str, coalesce_usecs, c_type_str, + ITR_REG_ALIGN(coalesce_usecs)); +} + /** * __ice_set_coalesce - set ITR/INTRL values for the device * @netdev: pointer to the netdev associated with this query @@ -3360,9 +5663,25 @@ __ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; +#ifndef ETHTOOL_COALESCE_USECS + if (ice_is_coalesce_param_invalid(ec)) + return -EOPNOTSUPP; +#endif /* !ETHTOOL_COALESCE_USECS */ + if (q_num < 0) { + struct ice_q_vector *q_vector = vsi->q_vectors[0]; int v_idx; + if (q_vector) { + ice_print_if_odd_usecs(netdev, q_vector->rx.itr_setting, + ec->use_adaptive_rx_coalesce, + ec->rx_coalesce_usecs, "rx"); + + ice_print_if_odd_usecs(netdev, q_vector->tx.itr_setting, + ec->use_adaptive_tx_coalesce, + ec->tx_coalesce_usecs, "tx"); + } + ice_for_each_q_vector(vsi, v_idx) { /* In some cases if DCB is configured the num_[rx|tx]q * can be less than vsi->num_q_vectors. This check @@ -3381,7 +5700,6 @@ __ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, return -EINVAL; set_complete: - return 0; } @@ -3391,25 +5709,306 @@ ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) return __ice_set_coalesce(netdev, ec, -1); } +#ifdef ETHTOOL_PERQUEUE static int ice_set_per_q_coalesce(struct net_device *netdev, u32 q_num, struct ethtool_coalesce *ec) { return __ice_set_coalesce(netdev, ec, q_num); } +#endif /* ETHTOOL_PERQUEUE */ + +#ifndef ETHTOOL_COALESCE_USECS +/** + * ice_repr_is_coalesce_param_invalid - check for unsupported coalesce params + * @ec: ethtool structure to fill with driver's coalesce settings + * + * Returns true if anything but ec->rx_coalesce_usecs_high is set, + * returns false otherwise. + * + */ +static bool +ice_repr_is_coalesce_param_invalid(struct ethtool_coalesce *ec) +{ + if (ec->rx_coalesce_usecs || ec->rx_max_coalesced_frames || + ec->rx_coalesce_usecs_irq || ec->rx_max_coalesced_frames_irq || + ec->tx_coalesce_usecs || ec->tx_max_coalesced_frames || + ec->tx_coalesce_usecs_irq || ec->tx_max_coalesced_frames_irq || + ec->stats_block_coalesce_usecs || ec->use_adaptive_rx_coalesce || + ec->use_adaptive_tx_coalesce || ec->pkt_rate_low || + ec->rx_coalesce_usecs_low || ec->rx_max_coalesced_frames_low || + ec->tx_coalesce_usecs_low || ec->tx_max_coalesced_frames_low || + ec->pkt_rate_high || ec->rx_max_coalesced_frames_high || + ec->tx_coalesce_usecs_high || ec->tx_max_coalesced_frames_high || + ec->rate_sample_interval) + return true; + + return false; +} +#endif /* !ETHTOOL_COALESCE_USECS */ + +/** + * ice_repr_set_coalesce - set coalesce settings for all queues + * @netdev: pointer to the netdev associated with this query + * @ec: ethtool structure to read the requested coalesce settings + * + * Return 0 on success, negative otherwise. 
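+ * + * Only rx-usecs-high is accepted for port representors; the value is applied to the interrupt rate limit of every queue vector on the representor's VSI.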
+ */ +static int +ice_repr_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct ice_ring *rx_ring; + int v_idx; + +#ifndef ETHTOOL_COALESCE_USECS + if (ice_repr_is_coalesce_param_invalid(ec)) + return -EOPNOTSUPP; +#endif + + if (ec->rx_coalesce_usecs_high > ICE_MAX_INTRL || + (ec->rx_coalesce_usecs_high && + ec->rx_coalesce_usecs_high < pf->hw.intrl_gran)) { + netdev_info(vsi->netdev, "Invalid value, rx-usecs-high valid values are 0 (disabled), %d-%d\n", + pf->hw.intrl_gran, ICE_MAX_INTRL); + return -EINVAL; + } + + ice_for_each_q_vector(vsi, v_idx) { + rx_ring = vsi->rx_rings[v_idx]; + ice_write_intrl(rx_ring->q_vector, ec->rx_coalesce_usecs_high); + rx_ring->q_vector->intrl = ec->rx_coalesce_usecs_high; + } + + return 0; +} + +/** + * ice_repr_get_coalesce - get coalesce settings + * @netdev: pointer to the netdev associated with this query + * @ec: ethtool structure to read the requested coalesce settings + * + * Since all queues have the same Rx coalesce high settings, + * read the value from the first queue. + * + * Return 0 on success, negative otherwise. + */ +static int +ice_repr_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + + if (!vsi->rx_rings || !vsi->rx_rings[0]->q_vector->rx.ring) + return -EINVAL; + + ec->rx_coalesce_usecs_high = vsi->rx_rings[0]->q_vector->intrl; + + return 0; +} + +#ifdef ETHTOOL_GMODULEINFO +#define ICE_I2C_EEPROM_DEV_ADDR 0xA0 +#define ICE_I2C_EEPROM_DEV_ADDR2 0xA2 +#define ICE_MODULE_TYPE_SFP 0x03 +#define ICE_MODULE_TYPE_QSFP_PLUS 0x0D +#define ICE_MODULE_TYPE_QSFP28 0x11 +#define ICE_MODULE_SFF_ADDR_MODE 0x04 +#define ICE_MODULE_SFF_DIAG_CAPAB 0x40 +#define ICE_MODULE_REVISION_ADDR 0x01 +#define ICE_MODULE_SFF_8472_COMP 0x5E +#define ICE_MODULE_SFF_8472_SWAP 0x5C +#define ICE_MODULE_QSFP_MAX_LEN 640 + +/** + * ice_get_module_info - get SFF module type and revision information + * @netdev: network interface device structure + * @modinfo: module EEPROM size and layout information structure + */ +static int +ice_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 sff8472_comp = 0; + u8 sff8472_swap = 0; + u8 sff8636_rev = 0; + u8 value = 0; + + status = ice_aq_sff_eeprom(hw, 0, ICE_I2C_EEPROM_DEV_ADDR, 0x00, 0x00, + 0, &value, 1, 0, NULL); + if (status) + return -EIO; + + switch (value) { + case ICE_MODULE_TYPE_SFP: + status = ice_aq_sff_eeprom(hw, 0, ICE_I2C_EEPROM_DEV_ADDR, + ICE_MODULE_SFF_8472_COMP, 0x00, 0, + &sff8472_comp, 1, 0, NULL); + if (status) + return -EIO; + status = ice_aq_sff_eeprom(hw, 0, ICE_I2C_EEPROM_DEV_ADDR, + ICE_MODULE_SFF_8472_SWAP, 0x00, 0, + &sff8472_swap, 1, 0, NULL); + if (status) + return -EIO; + + if (sff8472_swap & ICE_MODULE_SFF_ADDR_MODE) { + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } else if (sff8472_comp && + (sff8472_swap & ICE_MODULE_SFF_DIAG_CAPAB)) { + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } + break; + case ICE_MODULE_TYPE_QSFP_PLUS: + case ICE_MODULE_TYPE_QSFP28:
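+ /* QSFP+ and QSFP28 modules share this path: the revision compliance byte at offset ICE_MODULE_REVISION_ADDR of the lower page selects between the SFF-8436 and SFF-8636 memory maps. */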
+ status = ice_aq_sff_eeprom(hw, 0, ICE_I2C_EEPROM_DEV_ADDR, + ICE_MODULE_REVISION_ADDR, 0x00, 0, + &sff8636_rev, 1, 0, NULL); + if (status) + return -EIO; + /* Check revision compliance */ + if (sff8636_rev > 0x02) { + /* Module is SFF-8636 compliant */ + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ICE_MODULE_QSFP_MAX_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ICE_MODULE_QSFP_MAX_LEN; + } + break; + default: + netdev_warn(netdev, "SFF Module Type not recognized.\n"); + return -EINVAL; + } + return 0; +} + +/** + * ice_get_module_eeprom - fill buffer with SFF EEPROM contents + * @netdev: network interface device structure + * @ee: EEPROM dump request structure + * @data: buffer to be filled with EEPROM contents + */ +static int +ice_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, u8 *data) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); +#define SFF_READ_BLOCK_SIZE 8 + u8 value[SFF_READ_BLOCK_SIZE] = {0}; + u8 addr = ICE_I2C_EEPROM_DEV_ADDR; + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + bool is_sfp = false; + unsigned int i, j; + u16 offset = 0; + u8 page = 0; + + if (!ee || !ee->len || !data) + return -EINVAL; + + status = ice_aq_sff_eeprom(hw, 0, addr, offset, page, 0, value, 1, 0, + NULL); + if (status) + return -EIO; + + if (value[0] == ICE_MODULE_TYPE_SFP) + is_sfp = true; + + memset(data, 0, ee->len); + for (i = 0; i < ee->len; i += SFF_READ_BLOCK_SIZE) { + offset = i + ee->offset; + page = 0; + + /* Check if we need to access the other memory page */ + if (is_sfp) { + if (offset >= ETH_MODULE_SFF_8079_LEN) { + offset -= ETH_MODULE_SFF_8079_LEN; + addr = ICE_I2C_EEPROM_DEV_ADDR2; + } + } else { + while (offset >= ETH_MODULE_SFF_8436_LEN) { + /* Compute memory page number and offset. */ + offset -= ETH_MODULE_SFF_8436_LEN / 2; + page++; + } + } + + /* Bit 2 of eeprom address 0x02 declares upper + * pages are disabled on QSFP modules. + * SFP modules only ever use page 0. + */ + if (page == 0 || !(data[0x2] & 0x4)) { + /* If i2c bus is busy due to slow page change or + * link management access, call can fail. This is normal. + * So we retry this a few times. 
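+ * A short delay between attempts gives the module time to finish the page change before the read is reissued.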
+ */ + for (j = 0; j < 4; j++) { + status = ice_aq_sff_eeprom(hw, 0, addr, offset, page, + !is_sfp, value, + SFF_READ_BLOCK_SIZE, + 0, NULL); + netdev_dbg(netdev, "SFF %02X %02X %02X %X = %02X%02X%02X%02X.%02X%02X%02X%02X (%X)\n", + addr, offset, page, is_sfp, + value[0], value[1], value[2], value[3], + value[4], value[5], value[6], value[7], + status); + if (status) { + usleep_range(1500, 2500); + memset(value, 0, SFF_READ_BLOCK_SIZE); + continue; + } + break; + } + + /* Make sure we have enough room for the new block */ + if ((i + SFF_READ_BLOCK_SIZE) < ee->len) + memcpy(data + i, value, SFF_READ_BLOCK_SIZE); + } + } + return 0; +} +#endif /* ETHTOOL_GMODULEINFO */ static const struct ethtool_ops ice_ethtool_ops = { +#ifdef ETHTOOL_COALESCE_USECS + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USE_ADAPTIVE | + ETHTOOL_COALESCE_RX_USECS_HIGH, +#endif /* ETHTOOL_COALESCE_USECS */ +#ifdef ETHTOOL_GLINKSETTINGS .get_link_ksettings = ice_get_link_ksettings, .set_link_ksettings = ice_set_link_ksettings, - .get_drvinfo = ice_get_drvinfo, - .get_regs_len = ice_get_regs_len, - .get_regs = ice_get_regs, - .get_msglevel = ice_get_msglevel, - .set_msglevel = ice_set_msglevel, +#else + .get_settings = ice_get_settings, + .set_settings = ice_set_settings, +#endif /* ETHTOOL_GLINKSETTINGS */ + .get_drvinfo = ice_get_drvinfo, + .get_regs_len = ice_get_regs_len, + .get_regs = ice_get_regs, + .get_wol = ice_get_wol, + .set_wol = ice_set_wol, + .get_msglevel = ice_get_msglevel, + .set_msglevel = ice_set_msglevel, .self_test = ice_self_test, .get_link = ethtool_op_get_link, .get_eeprom_len = ice_get_eeprom_len, .get_eeprom = ice_get_eeprom, + .set_eeprom = ice_set_eeprom, .get_coalesce = ice_get_coalesce, .set_coalesce = ice_set_coalesce, .get_strings = ice_get_strings, @@ -3419,38 +6018,68 @@ static const struct ethtool_ops ice_ethtool_ops = { .set_priv_flags = ice_set_priv_flags, .get_sset_count = ice_get_sset_count, .get_rxnfc = ice_get_rxnfc, + .set_rxnfc = ice_set_rxnfc, .get_ringparam = ice_get_ringparam, .set_ringparam = ice_set_ringparam, .nway_reset = ice_nway_reset, .get_pauseparam = ice_get_pauseparam, .set_pauseparam = ice_set_pauseparam, +#if defined(ETHTOOL_GRSSH) && defined(ETHTOOL_SRSSH) .get_rxfh_key_size = ice_get_rxfh_key_size, .get_rxfh_indir_size = ice_get_rxfh_indir_size, .get_rxfh = ice_get_rxfh, .set_rxfh = ice_set_rxfh, - .get_ts_info = ethtool_op_get_ts_info, - .get_per_queue_coalesce = ice_get_per_q_coalesce, - .set_per_queue_coalesce = ice_set_per_q_coalesce, +#endif /* ETHTOOL_GRSSH && ETHTOOL_SRSSH */ + .get_channels = ice_get_channels, + .set_channels = ice_set_channels, + .get_ts_info = ice_get_ts_info, +#ifdef ETHTOOL_PERQUEUE + .get_per_queue_coalesce = ice_get_per_q_coalesce, + .set_per_queue_coalesce = ice_set_per_q_coalesce, +#endif /* ETHTOOL_PERQUEUE */ +#ifdef ETHTOOL_GFECPARAM .get_fecparam = ice_get_fecparam, .set_fecparam = ice_set_fecparam, +#endif /* ETHTOOL_GFECPARAM */ +#ifdef ETHTOOL_GMODULEINFO + .get_module_info = ice_get_module_info, + .get_module_eeprom = ice_get_module_eeprom, +#endif /* ETHTOOL_GMODULEINFO */ +}; + +static const struct ethtool_ops ice_ethtool_recovery_ops = { + .get_drvinfo = ice_get_drvinfo, + .get_eeprom_len = ice_get_eeprom_len, + .get_eeprom = ice_get_eeprom, + .set_eeprom = ice_set_eeprom, }; static const struct ethtool_ops ice_ethtool_safe_mode_ops = { +#ifdef ETHTOOL_GLINKSETTINGS .get_link_ksettings = ice_get_link_ksettings, .set_link_ksettings = ice_set_link_ksettings, +#else + .get_settings = 
ice_get_settings, + .set_settings = ice_set_settings, +#endif /* ETHTOOL_GLINKSETTINGS */ .get_drvinfo = ice_get_drvinfo, .get_regs_len = ice_get_regs_len, .get_regs = ice_get_regs, + .get_wol = ice_get_wol, + .set_wol = ice_set_wol, .get_msglevel = ice_get_msglevel, .set_msglevel = ice_set_msglevel, + .get_link = ethtool_op_get_link, .get_eeprom_len = ice_get_eeprom_len, .get_eeprom = ice_get_eeprom, + .set_eeprom = ice_set_eeprom, .get_strings = ice_get_strings, - .get_ethtool_stats = ice_get_ethtool_stats, + .get_ethtool_stats = ice_get_ethtool_stats, .get_sset_count = ice_get_sset_count, .get_ringparam = ice_get_ringparam, .set_ringparam = ice_set_ringparam, .nway_reset = ice_nway_reset, + .get_channels = ice_get_channels, }; /** @@ -3462,6 +6091,38 @@ void ice_set_ethtool_safe_mode_ops(struct net_device *netdev) netdev->ethtool_ops = &ice_ethtool_safe_mode_ops; } + +static const struct ethtool_ops ice_ethtool_repr_ops = { +#ifdef ETHTOOL_COALESCE_USECS + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS_HIGH, +#endif + .get_coalesce = ice_repr_get_coalesce, + .set_coalesce = ice_repr_set_coalesce, + .get_drvinfo = ice_repr_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = ice_get_strings, + .get_ethtool_stats = ice_get_ethtool_stats, + .get_sset_count = ice_get_sset_count, +}; + +/** + * ice_set_ethtool_repr_ops - setup VF's port representor ethtool ops + * @netdev: network interface device structure + */ +void ice_set_ethtool_repr_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &ice_ethtool_repr_ops; +} + +/** + * ice_set_ethtool_recovery_ops - setup FW recovery ethtool ops + * @netdev: network interface device structure + */ +void ice_set_ethtool_recovery_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &ice_ethtool_recovery_ops; +} + /** * ice_set_ethtool_ops - setup netdev ethtool ops * @netdev: network interface device structure diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.h b/drivers/net/ethernet/intel/ice/ice_ethtool.h new file mode 100644 index 0000000000000000000000000000000000000000..3b167694aa188c56afc5ee0b7f04d5ff7fc32ef5 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_ETHTOOL_H_ +#define _ICE_ETHTOOL_H_ + +struct ice_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +#define ICE_STAT(_type, _name, _stat) { \ + .stat_string = _name, \ + .sizeof_stat = sizeof_field(_type, _stat), \ + .stat_offset = offsetof(_type, _stat) \ +} + +#define ICE_VSI_STAT(_name, _stat) \ + ICE_STAT(struct ice_vsi, _name, _stat) +#define ICE_PF_STAT(_name, _stat) \ + ICE_STAT(struct ice_pf, _name, _stat) + +#ifdef UNIFIED_STATS +#define PICK(legacy_stat, unified_stat) unified_stat +#else +#define PICK(legacy_stat, unified_stat) legacy_stat +#endif + +/* VSI stats */ +#define ICE_RX_UNICAST "rx_unicast" +#define ICE_TX_UNICAST "tx_unicast" +#define ICE_RX_MULTICAST "rx_multicast" +#define ICE_TX_MULTICAST "tx_multicast" +#define ICE_RX_BROADCAST "rx_broadcast" +#define ICE_TX_BROADCAST "tx_broadcast" +#define ICE_RX_BYTES "rx_bytes" +#define ICE_TX_BYTES "tx_bytes" +#define ICE_RX_DROPPED PICK("rx_dropped", "rx-dropped_pkts") +#define ICE_RX_UNKNOWN_PROTO PICK("rx_unknown_protocol", "rx-unknown-protocol_pkts") +#define ICE_RX_ALLOC_FAIL PICK("rx_alloc_fail", "rx-buf-alloc-fail_events") +#define ICE_RX_PAGE_ALLOC_FAIL PICK("rx_pg_alloc_fail", "rx-page-alloc-fail_events") +#define ICE_TX_ERRORS "tx_errors" +#define ICE_TX_LINEARIZE PICK("tx_linearized", "tx-linearized_pkts") +#define ICE_TX_BUSY PICK("tx_busy", "tx-busy_events") +#define ICE_TX_RESTART "tx_restart" +#ifdef ICE_ADD_PROBES +#define ICE_RX_PAGE_REUSE "rx_page_reuse" +#endif +#ifdef ADQ_PERF_COUNTERS +#endif /* ADQ_PERF_COUNTERS */ + +/* port stats */ +#define ICE_PORT_RX_BYTES PICK("rx_bytes.nic", "port-rx_bytes") +#define ICE_PORT_TX_BYTES PICK("tx_bytes.nic", "port-tx_bytes") +#define ICE_PORT_RX_UNICAST PICK("rx_unicast.nic", "port-rx-unicast_pkts") +#define ICE_PORT_TX_UNICAST PICK("tx_unicast.nic", "port-tx-unicast_pkts") +#define ICE_PORT_RX_MULTICAST PICK("rx_multicast.nic", "port-rx-multicast_pkts") +#define ICE_PORT_TX_MULTICAST PICK("tx_multicast.nic", "port-tx-multicast_pkts") +#define ICE_PORT_RX_BROADCAST PICK("rx_broadcast.nic", "port-rx-broadcast_pkts") +#define ICE_PORT_TX_BROADCAST PICK("tx_broadcast.nic", "port-tx-broadcast_pkts") +#define ICE_PORT_TX_ERRORS PICK("tx_errors.nic", "port-tx_errors") +#define ICE_PORT_TX_TIMEOUT PICK("tx_timeout.nic", "port-tx-timeout_events") +#define ICE_PORT_RX_SIZE_64 PICK("rx_size_64.nic", "port-rx_size-64_pkts") +#define ICE_PORT_TX_SIZE_64 PICK("tx_size_64.nic", "port-tx_size-64_pkts") +#define ICE_PORT_RX_SIZE_127 PICK("rx_size_127.nic", "port-rx_size-127_pkts") +#define ICE_PORT_TX_SIZE_127 PICK("tx_size_127.nic", "port-tx_size-127_pkts") +#define ICE_PORT_RX_SIZE_255 PICK("rx_size_255.nic", "port-rx_size-255_pkts") +#define ICE_PORT_TX_SIZE_255 PICK("tx_size_255.nic", "port-tx_size-255_pkts") +#define ICE_PORT_RX_SIZE_511 PICK("rx_size_511.nic", "port-rx_size-511_pkts") +#define ICE_PORT_TX_SIZE_511 PICK("tx_size_511.nic", "port-tx_size-511_pkts") +#define ICE_PORT_RX_SIZE_1023 PICK("rx_size_1023.nic", "port-rx_size-1023_pkts") +#define ICE_PORT_TX_SIZE_1023 PICK("tx_size_1023.nic", "port-tx_size-1023_pkts") +#define ICE_PORT_RX_SIZE_1522 PICK("rx_size_1522.nic", "port-rx_size-1522_pkts") +#define ICE_PORT_TX_SIZE_1522 PICK("tx_size_1522.nic", "port-tx_size-1522_pkts") +#define ICE_PORT_RX_SIZE_JUMBO PICK("rx_size_big.nic", "port-rx_size-jumbo_pkts") +#define ICE_PORT_TX_SIZE_JUMBO PICK("tx_size_big.nic", "port-tx_size-jumbo_pkts") +#define ICE_PORT_RX_LINK_XON PICK("link_xon_rx.nic", 
"port-rx-xon_events") +#define ICE_PORT_TX_LINK_XON PICK("link_xon_tx.nic", "port-tx-xon_events") +#define ICE_PORT_RX_LINK_XOFF PICK("link_xoff_rx.nic", "port-rx-xoff_events") +#define ICE_PORT_TX_LINK_XOFF PICK("link_xoff_tx.nic", "port-tx-xoff_events") +#define ICE_PORT_TX_DROP_LINK_DOWN PICK("tx_dropped_link_down.nic", "port-tx-dropped_link-down_pkts") +#define ICE_PORT_RX_UNDERSIZE PICK("rx_undersize.nic", "port-rx-undersized_pkts") +#define ICE_PORT_RX_FRAGMENTS PICK("rx_fragments.nic", "port-rx-fragmented_pkts") +#define ICE_PORT_RX_OVERSIZE PICK("rx_oversize.nic", "port-rx-oversized_pkts") +#define ICE_PORT_RX_JABBER PICK("rx_jabber.nic", "port-rx-jabber_pkts") +#define ICE_PORT_RX_CSUM_BAD PICK("rx_csum_bad.nic", "port-rx-csum_errors") +#define ICE_PORT_RX_LEN_ERRORS PICK("rx_length_errors.nic", "port-rx-length_errors") +#define ICE_PORT_RX_DROPPED PICK("rx_dropped.nic", "port-rx-dropped_pkts") +#define ICE_PORT_RX_CRC_ERRORS PICK("rx_crc_errors.nic", "port-rx-crc_errors") +#define ICE_PORT_ILLEGAL_BYTES PICK("illegal_bytes.nic", "port-rx-illegal_bytes") +#define ICE_PORT_MAC_LOCAL_FAULTS PICK("mac_local_faults.nic", "port-mac-local_faults") +#define ICE_PORT_MAC_REMOTE_FAULTS PICK("mac_remote_faults.nic", "port-mac-remote_faults") +#ifdef ICE_ADD_PROBES +#define ICE_PORT_TX_TCP_SEGMENTS PICK("tx_tcp_segments.nic", "port-tx-tcp-segments_count") +#define ICE_PORT_TX_UDP_SEGMENTS PICK("tx_udp_segments.nic", "port-tx-udp-segments_count") +#define ICE_PORT_RX_TCP_CSO PICK("rx_tcp_cso.nic", "port-rx-tcp-csum-offload_count") +#define ICE_PORT_TX_TCP_CSO PICK("tx_tcp_cso.nic", "port-tx-tcp-csum-offload_count") +#define ICE_PORT_RX_UDP_CSO PICK("rx_udp_cso.nic", "port-rx-udp-csum-offload_count") +#define ICE_PORT_TX_UDP_CSO PICK("tx_udp_cso.nic", "port-tx-udp-csum-offload_count") +#define ICE_PORT_RX_SCTP_CSO PICK("rx_sctp_cso.nic", "port-rx-sctp-csum-offload_count") +#define ICE_PORT_TX_SCTP_CSO PICK("tx_sctp_cso.nic", "port-tx-sctp-csum-offload_count") +#define ICE_PORT_RX_IP4_CSO PICK("rx_ip4_cso.nic", "port-rx-ipv4-csum-offload_count") +#define ICE_PORT_TX_IP4_CSO PICK("tx_ip4_cso.nic", "port-tx-ipv4-csum-offload_count") +#define ICE_PORT_RX_IP4_CSO_ERROR PICK("rx_ip4_cso_error.nic", "port-rx-ipv4-csum_errors") +#define ICE_PORT_RX_TCP_CSO_ERROR PICK("rx_tcp_cso_error.nic", "port-rx-tcp-csum_errors") +#define ICE_PORT_RX_UDP_CSO_ERROR PICK("rx_udp_cso_error.nic", "port-rx-udp-csum_errors") +#define ICE_PORT_RX_SCTP_CSO_ERROR PICK("rx_sctp_cso_error.nic", "port-rx-sctp-csum_errors") +#define ICE_PORT_TX_L3_CSO_ERROR PICK("tx_l3_cso_err.nic", "port-tx-layer-3-csum_errors") +#define ICE_PORT_TX_L4_CSO_ERROR PICK("tx_l4_cso_err.nic", "port-tx-layer-4-csum_errors") +#define ICE_PORT_RX_Q_VLANO PICK("rx_vlano.nic", "port-rx-q-vlan-offload_pkts") +#define ICE_PORT_TX_Q_VLANO PICK("tx_vlano.nic", "port-tx-q-vlan-offload_pkts") +#define ICE_PORT_RX_AD_VLANO PICK("rx_ad_vlano.nic", "port-rx-ad-vlan-offload_pkts") +#define ICE_PORT_TX_AD_VLANO PICK("tx_ad_vlano.nic", "port-tx-ad-vlan-offload_pkts") +#endif /* ICE_ADD_PROBES */ +#define ICE_PORT_FDIR_SB_MATCH PICK("fdir_sb_match.nic", "port-rx-fdir-sideband") +#define ICE_PORT_FDIR_SB_STATUS PICK("fdir_sb_status.nic", "port-rx-fdir-sideband-status") +#ifdef ICE_ADD_PROBES +#define ICE_PORT_ARFS_TCPV4_MATCH PICK("arfs_tcpv4_match.nic", "port-rx-arfs-tcpv4-pkts") +#define ICE_PORT_ARFS_TCPV6_MATCH PICK("arfs_tcpv6_match.nic", "port-rx-arfs-tcpv6-pkts") +#define ICE_PORT_ARFS_UDP4_MATCH PICK("arfs_udpv4_match.nic", "port-rx-arfs-udpv4-pkts") 
+#define ICE_PORT_ARFS_UDP6_MATCH PICK("arfs_udpv6_match.nic", "port-rx-arfs-udpv6-pkts") +#endif /* ICE_ADD_PROBES */ +#define PORT_TX_PRIO_XON PICK("tx_priority_%u_xon.nic", "port-tx-xon_prio-%u_events") +#define PORT_TX_PRIO_XOFF PICK("tx_priority_%u_xoff.nic", "port-tx-xoff_prio-%u_events") +#define PORT_RX_PRIO_XON PICK("rx_priority_%u_xon.nic", "port-rx-xon_prio-%u_events") +#define PORT_RX_PRIO_XOFF PICK("rx_priority_%u_xoff.nic", "port-rx-xoff_prio-%u_events") + +/* per-queue stats */ +#define ICE_TXQ_PACKETS PICK("tx_queue_%u_packets", "tx_q-%u_pkts") +#define ICE_TXQ_BYTES PICK("tx_queue_%u_bytes", "tx_q-%u_bytes") +#define ICE_RXQ_PACKETS PICK("rx_queue_%u_packets", "rx_q-%u_pkts") +#define ICE_RXQ_BYTES PICK("rx_queue_%u_bytes", "rx_q-%u_bytes") +#ifdef ICE_ADD_PROBES +#define ICE_TXQ_NAPI_POLL PICK("tx_queue_%u_napi_poll_cnt", "tx_q-%u_napi_poll_count") +#define ICE_RXQ_NAPI_POLL PICK("rx_queue_%u_napi_poll_cnt", "rx_q-%u_napi_poll_count") +#endif /* ICE_ADD_PROBES */ + +#ifdef HAVE_NETDEV_SB_DEV +#ifdef ICE_ADD_PROBES +/* macvlan stats */ +#define L2_FWD_TX_PKTS1 PICK("l2-fwd-%s-tx_pkts", "tx-l2-forward_q-%s_pkts") +#define L2_FWD_TX_BYTES1 PICK("l2-fwd-%s-tx_bytes", "tx-l2-forward_q-%s_bytes") +#define L2_FWD_TX_PKTS2 PICK("l2-fwd-%i-tx_pkts", "tx-l2-forward_q-%i_pkts") +#define L2_FWD_TX_BYTES2 PICK("l2-fwd-%i-tx_bytes", "tx-l2-forward_q-%i_bytes") +#define L2_FWD_RX_PKTS1 PICK("l2-fwd-%s-rx_pkts", "rx-l2-forward_q-%s_pkts") +#define L2_FWD_RX_BYTES1 PICK("l2-fwd-%s-rx_bytes", "rx-l2-forward_q-%s_bytes") +#define L2_FWD_RX_PKTS2 PICK("l2-fwd-%i-rx_pkts", "rx-l2-forward_q-%i_pkts") +#define L2_FWD_RX_BYTES2 PICK("l2-fwd-%i-rx_bytes", "rx-l2-forward_q-%i_bytes") +#endif /* ICE_ADD_PROBES */ +#endif /* HAVE_NETDEV_SB_DEV */ + +#ifdef ADQ_PERF_COUNTERS +/* ADQ stats */ +#define ICE_TXQ_BUSY_POLL PICK("tx_%u.pkt_busy_poll", "tx_q-%u_pkt_busy_poll") +#define ICE_TXQ_NOT_BUSY_POLL PICK("tx_%u.pkt_not_busy_poll", "tx_q-%u_pkt_not_busy_poll") +#define ICE_TXQ_ATR_SETUP PICK("tx_%u.atr_setup", "tx_q-%u_atr_setup") +#define ICE_TXQ_MARK_ATR_SETUP PICK("tx_%u.mark_atr_setup", "tx_q-%u_mark_atr_setup") +#define ICE_TXQ_ATR_TEARDOWN PICK("tx_%u.atr_teardown", "tx_q-%u_atr_teardown") +#define ICE_TXQ_ATR_BAIL PICK("tx_%u.atr_bailouts", "tx_q-%u_atr_bailouts") +#define ICE_RXQ_BUSY_POLL PICK("rx_%u.pkt_busy_poll", "rx_q-%u_pkt_busy_poll") +#define ICE_RXQ_NOT_BUSY_POLL PICK("rx_%u.pkt_not_busy_poll", "rx_q-%u_pkt_not_busy_poll") +#define ICE_RXQ_SET PICK("rx_%u.queue_set", "rx_q-%u_queue_set") +#define ICE_RXQ_BAIL PICK("rx_%u.queue_bailouts", "rx_q-%u_queue_bailouts") +#define ICE_RXQ_TCP_CTRL_PKTS PICK("rx_%u.tcp_ctrl_pkts", "rx_q-%u_tcp_ctrl_pkts") +#define ICE_RXQ_ONLY_CTRL_PKTS PICK("rx_%u.only_ctrl_pkts", "rx_q-%u_only_ctrl_pkts") +#define ICE_RXQ_TCP_FIN_RECV PICK("rx_%u.tcp_fin_recv", "rx_q-%u_tcp_fin_recv") +#define ICE_RXQ_TCP_RST_RECV PICK("rx_%u.tcp_rst_recv", "rx_q-%u_tcp_rst_recv") +#define ICE_RXQ_TCP_SYN_RECV PICK("rx_%u.tcp_syn_recv", "rx_q-%u_tcp_syn_recv") +#define ICE_RXQ_BP_NO_DATA_PKT PICK("rx_%u.bp_no_data_pkt", "rx_q-%u_bp_no_data_pkt") +#define ICE_RXQ_IN_BP PICK("rx_%u.in_bp", "rx_q-%u_in_bp") +#define ICE_RXQ_INTR_TO_BP PICK("rx_%u.intr_to_bp", "rx_q-%u_intr_to_bp") +#define ICE_RXQ_BP_TO_BP PICK("rx_%u.bp_to_bp", "rx_q-%u_bp_to_bp") +#define ICE_RXQ_IN_INTR PICK("rx_%u.in_intr", "rx_q-%u_in_intr") +#define ICE_RXQ_BP_TO_INTR PICK("rx_%u.bp_to_intr", "rx_q-%u_bp_to_intr") +#define ICE_RXQ_INTR_TO_INTR PICK("rx_%u.intr_to_intr", "rx_q-%u_intr_to_intr") +#define 
ICE_RXQ_UNLIKELY_CB_TO_BP PICK("rx_%u.unlikely_cb_to_bp", "rx_q-%u_unlikely_cb_to_bp") +#define ICE_RXQ_UCB_ONCE_IN_BP PICK("rx_%u.ucb_once_in_bp", "rx_q-%u_ucb_once_in_bp") +#define ICE_RXQ_INTR_ONCE_IN_BP_FALSE PICK("rx_%u.intr_once_in_bp_false", "rx_q-%u_intr_once_in_bp_false") +#define ICE_RXQ_BP_STOP_NEED_RESCHED PICK("rx_%u.bp_stop_need_resched", "rx_q-%u_bp_stop_need_resched") +#define ICE_RXQ_BP_STOP_TIMEOUT PICK("rx_%u.bp_stop_timeout", "rx_q-%u_bp_stop_timeout") +#define ICE_RXQ_CLEANED_ANY_DATA_PKT PICK("rx_%u.cleaned_any_data_pkt", "rx_q-%u_cleaned_any_data_pkt") +#define ICE_RXQ_NEED_RESCHED_NO_DATA PICK("rx_%u.need_resched_no_data", "rx_q-%u_need_resched_no_data") +#define ICE_RXQ_TIMEOUT_NO_DATA PICK("rx_%u.timeout_no_data", "rx_q-%u_timeout_no_data") +#define ICE_RXQ_SW_INTR_TIMEOUT PICK("rx_%u.sw_intr_timeout", "rx_q-%u_sw_intr_timeout") +#define ICE_RXQ_SW_INTR_SERV_TASK PICK("rx_%u.sw_intr_service_task", "rx_q-%u_sw_intr_service_task") +#define ICE_RXQ_NO_SW_INTR_OPT_OFF PICK("rx_%u.no_sw_intr_opt_off", "rx_q-%u_no_sw_intr_opt_off") +#define ICE_RXQ_WB_ON_ITR_SET PICK("rx_%u.wb_on_itr_set", "rx_q-%u_wb_on_itr_set") +#define ICE_RXQ_PKTS_BP_STOP_BUDGET8 PICK("rx_%u.pkts_bp_stop_budget8", "rx_q-%u_pkts_bp_stop_budget8") +#define ICE_RXQ_PKTS_BP_STOP_BUDGET64 PICK("rx_%u.pkts_bp_stop_budget64", "rx_q-%u_pkts_bp_stop_budget64") +#define ICE_RXQ_BP_WD_EQUAL_BUDGET8 PICK("rx_%u.bp_wd_equal_budget8", "rx_q-%u_bp_wd_equal_budget8") +#define ICE_RXQ_BP_WD_EQUAL_BUDGET64 PICK("rx_%u.bp_wd_equal_budget64", "rx_q-%u_bp_wd_equal_budget64") +#define ICE_RXQ_KEEP_STATE_BP_BUDGET8 PICK("rx_%u.keep_state_bp_budget8", "rx_q-%u_keep_state_bp_budget8") +#define ICE_RXQ_KEEP_STATE_BP_BUDGET64 PICK("rx_%u.keep_state_bp_budget64", "rx_q-%u_keep_state_bp_budget64") +#endif /* ADQ_PERF_COUNTERS */ +#endif /* !_ICE_ETHTOOL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c new file mode 100644 index 0000000000000000000000000000000000000000..87d09c332eacb7248599af4e9463958e105ede94 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c @@ -0,0 +1,2180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* flow director ethtool support for ice */ + +#include "ice.h" +#include "ice_lib.h" +#include "ice_fdir.h" +#include "ice_flow.h" + +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC +static struct in6_addr full_ipv6_addr_mask = { + .in6_u = { + .u6_addr8 = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + } + } +}; + +static struct in6_addr zero_ipv6_addr_mask = { + .in6_u = { + .u6_addr8 = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + } + } +}; +#endif + +/* calls to ice_flow_add_prof require the number of segments in the array + * for segs_cnt. In this code that is one more than the index. + */ +#define TNL_SEG_CNT(_TNL_) ((_TNL_) + 1) + +/** + * ice_fltr_to_ethtool_flow - convert filter type values to ethtool + * flow type values + * @flow: filter type to be converted + * + * Returns the corresponding ethtool flow type.
+ */ +static int ice_fltr_to_ethtool_flow(enum ice_fltr_ptype flow) +{ + switch (flow) { + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + return TCP_V4_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + return UDP_V4_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV4_SCTP: + return SCTP_V4_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV4_OTHER: + return IPV4_USER_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + return TCP_V6_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + return UDP_V6_FLOW; + case ICE_FLTR_PTYPE_NONF_IPV6_SCTP: + return SCTP_V6_FLOW; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case ICE_FLTR_PTYPE_NONF_IPV6_OTHER: + return IPV6_USER_FLOW; +#endif + default: + /* 0 is undefined ethtool flow */ + return 0; + } +} + +/** + * ice_ethtool_flow_to_fltr - convert ethtool flow type to filter enum + * @eth: Ethtool flow type to be converted + * + * Returns flow enum + */ +enum ice_fltr_ptype ice_ethtool_flow_to_fltr(int eth) +{ + switch (eth) { + case TCP_V4_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV4_TCP; + case UDP_V4_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV4_UDP; + case SCTP_V4_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV4_SCTP; + case IPV4_USER_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV4_OTHER; + case TCP_V6_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV6_TCP; + case UDP_V6_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV6_UDP; + case SCTP_V6_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV6_SCTP; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case IPV6_USER_FLOW: + return ICE_FLTR_PTYPE_NONF_IPV6_OTHER; +#endif + default: + return ICE_FLTR_PTYPE_NONF_NONE; + } +} + +/** + * ice_is_mask_valid - check mask field set + * @mask: full mask to check + * @field: field for which mask should be valid + * + * If the mask is fully set return true. If it is not valid for field return + * false. + */ +static bool ice_is_mask_valid(u64 mask, u64 field) +{ + return (mask & field) == field; +} + +/** + * ice_get_ethtool_fdir_entry - fill ethtool structure with fdir filter data + * @hw: hardware structure that contains filter list + * @cmd: ethtool command data structure to receive the filter data + * + * Returns 0 on success and -EINVAL on failure + */ +int ice_get_ethtool_fdir_entry(struct ice_hw *hw, struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp; + struct ice_fdir_fltr *rule; + int ret = 0; + u16 idx; + + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + mutex_lock(&hw->fdir_fltr_lock); + + rule = ice_fdir_find_fltr_by_idx(hw, fsp->location); + + if (!rule || fsp->location != rule->fltr_id) { + ret = -EINVAL; + goto release_lock; + } + + fsp->flow_type = ice_fltr_to_ethtool_flow(rule->flow_type); + + memset(&fsp->m_u, 0, sizeof(fsp->m_u)); + memset(&fsp->m_ext, 0, sizeof(fsp->m_ext)); + + switch (fsp->flow_type) { + case IPV4_USER_FLOW: + fsp->h_u.usr_ip4_spec.ip_ver = ETH_RX_NFC_IP4; + fsp->h_u.usr_ip4_spec.proto = 0; + fsp->h_u.usr_ip4_spec.l4_4_bytes = rule->ip.v4.l4_header; + fsp->h_u.usr_ip4_spec.tos = rule->ip.v4.tos; + fsp->h_u.usr_ip4_spec.ip4src = rule->ip.v4.src_ip; + fsp->h_u.usr_ip4_spec.ip4dst = rule->ip.v4.dst_ip; + fsp->m_u.usr_ip4_spec.ip4src = rule->mask.v4.src_ip; + fsp->m_u.usr_ip4_spec.ip4dst = rule->mask.v4.dst_ip; + fsp->m_u.usr_ip4_spec.ip_ver = 0xFF; + fsp->m_u.usr_ip4_spec.proto = 0; + fsp->m_u.usr_ip4_spec.l4_4_bytes = rule->mask.v4.l4_header; + fsp->m_u.usr_ip4_spec.tos = rule->mask.v4.tos; + break; + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + fsp->h_u.tcp_ip4_spec.psrc = rule->ip.v4.src_port; + fsp->h_u.tcp_ip4_spec.pdst = rule->ip.v4.dst_port; + fsp->h_u.tcp_ip4_spec.ip4src = rule->ip.v4.src_ip; + 
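/* Note: ethtool_rx_flow_spec reports each rule as a value/mask pair.
 * The h_u union carries the matched header values and m_u the
 * corresponding masks, so userspace (for example "ethtool -u <dev>")
 * can reconstruct the match exactly as it was programmed.
 */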
fsp->h_u.tcp_ip4_spec.ip4dst = rule->ip.v4.dst_ip; + fsp->m_u.tcp_ip4_spec.psrc = rule->mask.v4.src_port; + fsp->m_u.tcp_ip4_spec.pdst = rule->mask.v4.dst_port; + fsp->m_u.tcp_ip4_spec.ip4src = rule->mask.v4.src_ip; + fsp->m_u.tcp_ip4_spec.ip4dst = rule->mask.v4.dst_ip; + break; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case IPV6_USER_FLOW: + fsp->h_u.usr_ip6_spec.l4_4_bytes = rule->ip.v6.l4_header; + fsp->h_u.usr_ip6_spec.tclass = rule->ip.v6.tc; + fsp->h_u.usr_ip6_spec.l4_proto = rule->ip.v6.proto; + memcpy(fsp->h_u.tcp_ip6_spec.ip6src, rule->ip.v6.src_ip, + sizeof(struct in6_addr)); + memcpy(fsp->h_u.tcp_ip6_spec.ip6dst, rule->ip.v6.dst_ip, + sizeof(struct in6_addr)); + memcpy(fsp->m_u.tcp_ip6_spec.ip6src, rule->mask.v6.src_ip, + sizeof(struct in6_addr)); + memcpy(fsp->m_u.tcp_ip6_spec.ip6dst, rule->mask.v6.dst_ip, + sizeof(struct in6_addr)); + fsp->m_u.usr_ip6_spec.l4_4_bytes = rule->mask.v6.l4_header; + fsp->m_u.usr_ip6_spec.tclass = rule->mask.v6.tc; + fsp->m_u.usr_ip6_spec.l4_proto = rule->mask.v6.proto; + break; + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + memcpy(fsp->h_u.tcp_ip6_spec.ip6src, rule->ip.v6.src_ip, + sizeof(struct in6_addr)); + memcpy(fsp->h_u.tcp_ip6_spec.ip6dst, rule->ip.v6.dst_ip, + sizeof(struct in6_addr)); + fsp->h_u.tcp_ip6_spec.psrc = rule->ip.v6.src_port; + fsp->h_u.tcp_ip6_spec.pdst = rule->ip.v6.dst_port; + memcpy(fsp->m_u.tcp_ip6_spec.ip6src, + rule->mask.v6.src_ip, + sizeof(struct in6_addr)); + memcpy(fsp->m_u.tcp_ip6_spec.ip6dst, + rule->mask.v6.dst_ip, + sizeof(struct in6_addr)); + fsp->m_u.tcp_ip6_spec.psrc = rule->mask.v6.src_port; + fsp->m_u.tcp_ip6_spec.pdst = rule->mask.v6.dst_port; + fsp->h_u.tcp_ip6_spec.tclass = rule->ip.v6.tc; + fsp->m_u.tcp_ip6_spec.tclass = rule->mask.v6.tc; + break; +#endif /* HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC */ + default: + break; + } + + if (rule->dest_ctl == ICE_FLTR_PRGM_DESC_DEST_DROP_PKT) + fsp->ring_cookie = RX_CLS_FLOW_DISC; + else + fsp->ring_cookie = rule->orig_q_index; + + + idx = ice_ethtool_flow_to_fltr(fsp->flow_type); + if (idx == ICE_FLTR_PTYPE_NONF_NONE) { + dev_err(ice_hw_to_dev(hw), "Missing input index for flow_type %d\n", + rule->flow_type); + ret = -EINVAL; + } + +release_lock: + mutex_unlock(&hw->fdir_fltr_lock); + return ret; +} + +/** + * ice_ntuple_get_max_fltr_cnt - return the maximum number of allowed filters + * @hw: hardware structure containing filter information + */ +u32 ice_ntuple_get_max_fltr_cnt(struct ice_hw *hw) +{ + int acl_cnt; + + if (hw->dev_caps.num_funcs < 8) + acl_cnt = ICE_AQC_ACL_TCAM_DEPTH / ICE_ACL_ENTIRE_SLICE; + else + acl_cnt = ICE_AQC_ACL_TCAM_DEPTH / ICE_ACL_HALF_SLICE; + + return ice_get_fdir_cnt_all(hw) + acl_cnt; +} + +/** + * ice_get_fdir_fltr_ids - fill buffer with filter IDs of active filters + * @hw: hardware structure containing the filter list + * @cmd: ethtool command data structure + * @rule_locs: ethtool array passed in from OS to receive filter IDs + * + * Returns 0 as expected for success by ethtool + */ +int +ice_get_fdir_fltr_ids(struct ice_hw *hw, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct ice_fdir_fltr *f_rule; + unsigned int cnt = 0; + int val = 0; + + /* report max rule count */ + cmd->data = ice_ntuple_get_max_fltr_cnt(hw); + + mutex_lock(&hw->fdir_fltr_lock); + + list_for_each_entry(f_rule, &hw->fdir_list_head, fltr_node) { + if (cnt == cmd->rule_cnt) { + val = -EMSGSIZE; + goto release_lock; + } + rule_locs[cnt] = f_rule->fltr_id; + cnt++; + } + +release_lock: + mutex_unlock(&hw->fdir_fltr_lock); + if (!val) + 
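/* Per the ETHTOOL_GRXCLSRLALL contract, rule_cnt is rewritten from the
 * caller-supplied capacity to the number of IDs actually copied into
 * rule_locs, but only on success; -EMSGSIZE above signals that the
 * caller's rule_locs array was too small.
 */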
cmd->rule_cnt = cnt; + return val; +} + +/** + * ice_fdir_remap_entries - update the FDir entries in profile + * @prof: FDir structure pointer + * @tun: tunneled or non-tunneled packet + * @idx: FDir entry index + */ +static void +ice_fdir_remap_entries(struct ice_fd_hw_prof *prof, int tun, int idx) +{ + if (idx != prof->cnt && tun < ICE_FD_HW_SEG_MAX) { + int i; + + for (i = idx; i < (prof->cnt - 1); i++) { + u64 old_entry_h; + + old_entry_h = prof->entry_h[i + 1][tun]; + prof->entry_h[i][tun] = old_entry_h; + prof->vsi_h[i] = prof->vsi_h[i + 1]; + } + + prof->entry_h[i][tun] = 0; + prof->vsi_h[i] = 0; + } +} + +/** + * ice_fdir_rem_adq_chnl - remove an ADQ channel from HW filter rules + * @hw: hardware structure containing filter list + * @vsi_idx: VSI handle + */ +void ice_fdir_rem_adq_chnl(struct ice_hw *hw, u16 vsi_idx) +{ + enum ice_status status; + int flow; + + if (!hw->fdir_prof) + return; + + for (flow = 0; flow < ICE_FLTR_PTYPE_MAX; flow++) { + struct ice_fd_hw_prof *prof = hw->fdir_prof[flow]; + int tun, i; + + if (!prof) + continue; + + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + u64 prof_id; + + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + + for (i = 0; i < prof->cnt; i++) { + if (prof->vsi_h[i] != vsi_idx) + continue; + + prof->entry_h[i][tun] = 0; + prof->vsi_h[i] = 0; + break; + } + + /* after clearing FDir entries update the remaining */ + ice_fdir_remap_entries(prof, tun, i); + + /* find flow profile corresponding to prof_id and clear + * vsi_idx from bitmap. + */ + status = ice_flow_rem_vsi_prof(hw, ICE_BLK_FD, vsi_idx, prof_id); + if (status) { + dev_err(ice_hw_to_dev(hw), + "ice_flow_rem_vsi_prof() failed status=%d\n", + status); + } + } + prof->cnt--; + } +} + +/** + * ice_fdir_get_hw_prof - return the ice_fd_hw_prof associated with a flow + * @hw: hardware structure containing the filter list + * @blk: hardware block + * @flow: FDir flow type to release + */ +static struct ice_fd_hw_prof * +ice_fdir_get_hw_prof(struct ice_hw *hw, enum ice_block blk, int flow) +{ + if (blk == ICE_BLK_ACL && hw->acl_prof) + return hw->acl_prof[flow]; + + if (blk == ICE_BLK_FD && hw->fdir_prof) + return hw->fdir_prof[flow]; + + return NULL; +} + +/** + * ice_fdir_erase_flow_from_hw - remove a flow from the HW profile tables + * @hw: hardware structure containing the filter list + * @blk: hardware block + * @flow: FDir flow type to release + */ +static void +ice_fdir_erase_flow_from_hw(struct ice_hw *hw, enum ice_block blk, int flow) +{ + struct ice_fd_hw_prof *prof = ice_fdir_get_hw_prof(hw, blk, flow); + int tun; + + if (!prof) + return; + + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + u64 prof_id; + int j; + + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + + for (j = 0; j < prof->cnt; j++) { + u16 vsi_num; + + vsi_num = ice_get_hw_vsi_num(hw, prof->vsi_h[j]); + + if (!prof->entry_h[j][tun] || !prof->vsi_h[j]) + continue; + + ice_rem_prof_id_flow(hw, blk, vsi_num, prof_id); + prof->entry_h[j][tun] = 0; + } + + ice_flow_rem_prof(hw, blk, prof_id); + } +} + +/** + * ice_fdir_rem_flow - release the ice_flow structures for a filter type + * @hw: hardware structure containing the filter list + * @blk: hardware block + * @flow_type: FDir flow type to release + */ +static void +ice_fdir_rem_flow(struct ice_hw *hw, enum ice_block blk, + enum ice_fltr_ptype flow_type) +{ + int flow = (int)flow_type & ~FLOW_EXT; + struct ice_fd_hw_prof *prof; + int tun, i; + + prof = ice_fdir_get_hw_prof(hw, blk, flow); + if (!prof) + return; + + ice_fdir_erase_flow_from_hw(hw, blk, flow); + for (i = 0;
i < prof->cnt; i++) + prof->vsi_h[i] = 0; + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + if (!prof->fdir_seg[tun]) + continue; + devm_kfree(ice_hw_to_dev(hw), prof->fdir_seg[tun]); + prof->fdir_seg[tun] = NULL; + } + prof->cnt = 0; +} + +/** + * ice_fdir_release_flows - release all flows in use for later replay + * @hw: pointer to HW instance + */ +void ice_fdir_release_flows(struct ice_hw *hw) +{ + int flow; + + /* release Flow Director HW table entries */ + for (flow = 0; flow < ICE_FLTR_PTYPE_MAX; flow++) + ice_fdir_erase_flow_from_hw(hw, ICE_BLK_FD, flow); +} + +/** + * ice_fdir_replay_flows - replay HW Flow Director filter info + * @hw: pointer to HW instance + */ +void ice_fdir_replay_flows(struct ice_hw *hw) +{ + int flow; + + for (flow = 0; flow < ICE_FLTR_PTYPE_MAX; flow++) { + int tun; + + if (!hw->fdir_prof[flow] || !hw->fdir_prof[flow]->cnt) + continue; + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + struct ice_flow_prof *hw_prof; + struct ice_fd_hw_prof *prof; + u64 prof_id; + int j; + + prof = hw->fdir_prof[flow]; + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + ice_flow_add_prof(hw, ICE_BLK_FD, ICE_FLOW_RX, prof_id, + prof->fdir_seg[tun], TNL_SEG_CNT(tun), + NULL, 0, &hw_prof); + for (j = 0; j < prof->cnt; j++) { + enum ice_flow_priority prio; + u64 entry_h = 0; + int err; + + prio = ICE_FLOW_PRIO_NORMAL; + err = ice_flow_add_entry(hw, ICE_BLK_FD, + prof_id, + prof->vsi_h[0], + prof->vsi_h[j], + prio, prof->fdir_seg, + NULL, 0, &entry_h); + if (err) { + dev_err(ice_hw_to_dev(hw), "Could not replay Flow Director, flow type %d\n", + flow); + continue; + } + prof->entry_h[j][tun] = entry_h; + } + } + } +} + +/** + * ice_parse_rx_flow_user_data - deconstruct user-defined data + * @fsp: pointer to ethtool Rx flow specification + * @data: pointer to userdef data structure for storage + * + * Returns 0 on success, negative error value on failure + */ +static int +ice_parse_rx_flow_user_data(struct ethtool_rx_flow_spec *fsp, + struct ice_rx_flow_userdef *data) +{ + u64 value, mask; + + memset(data, 0, sizeof(*data)); + if (!(fsp->flow_type & FLOW_EXT)) + return 0; + + value = be64_to_cpu(*((__force __be64 *)fsp->h_ext.data)); + mask = be64_to_cpu(*((__force __be64 *)fsp->m_ext.data)); + if (!mask) + return 0; + +#define ICE_USERDEF_FLEX_WORD_M GENMASK_ULL(15, 0) +#define ICE_USERDEF_FLEX_OFFS_S 16 +#define ICE_USERDEF_FLEX_OFFS_M GENMASK_ULL(31, ICE_USERDEF_FLEX_OFFS_S) +#define ICE_USERDEF_FLEX_FLTR_M GENMASK_ULL(31, 0) + + /* 0x1fe is the maximum value for offsets stored in the internal + * filtering tables. + */ +#define ICE_USERDEF_FLEX_MAX_OFFS_VAL 0x1fe + + if (!ice_is_mask_valid(mask, ICE_USERDEF_FLEX_FLTR_M) || + value > ICE_USERDEF_FLEX_FLTR_M) + return -EINVAL; + + data->flex_word = value & ICE_USERDEF_FLEX_WORD_M; + data->flex_offset = (value & ICE_USERDEF_FLEX_OFFS_M) >> + ICE_USERDEF_FLEX_OFFS_S; + if (data->flex_offset > ICE_USERDEF_FLEX_MAX_OFFS_VAL) + return -EINVAL; + + data->flex_fltr = true; + + return 0; +} + +/** + * ice_fdir_num_avail_fltr - return the number of unused flow director filters + * @hw: pointer to hardware structure + * @vsi: software VSI structure + * + * There are 2 filter pools: guaranteed and best effort (shared). Each VSI can + * use filters from either pool. The guaranteed pool is divided between VSIs. + * The best effort filter pool is common to all VSIs and is a device shared + * resource pool.
The number of filters available to this VSI is the sum of + * the VSI's guaranteed filter pool and the global available best effort + * filter pool. + * + * Returns the number of available flow director filters to this VSI + */ +static int ice_fdir_num_avail_fltr(struct ice_hw *hw, struct ice_vsi *vsi) +{ + u16 vsi_num = ice_get_hw_vsi_num(hw, vsi->idx); + u16 num_guar; + u16 num_be; + + /* total guaranteed filters assigned to this VSI */ + num_guar = vsi->num_gfltr; + + /* minus the guaranteed filters programmed by this VSI */ + num_guar -= (rd32(hw, VSIQF_FD_CNT(vsi_num)) & + VSIQF_FD_CNT_FD_GCNT_M) >> VSIQF_FD_CNT_FD_GCNT_S; + + /* total global best effort filters */ + num_be = hw->func_caps.fd_fltr_best_effort; + + /* minus the global best effort filters programmed */ + num_be -= (rd32(hw, GLQF_FD_CNT) & GLQF_FD_CNT_FD_BCNT_M) >> + GLQF_FD_CNT_FD_BCNT_S; + + return num_guar + num_be; +} + +/** + * ice_fdir_alloc_flow_prof - allocate FDir flow profile structure(s) + * @hw: HW structure containing the FDir flow profile structure(s) + * @flow: flow type to allocate the flow profile for + * + * Allocate the fdir_prof and fdir_prof[flow] if not already created. Return 0 + * on success and negative on error. + */ +static int +ice_fdir_alloc_flow_prof(struct ice_hw *hw, enum ice_fltr_ptype flow) +{ + if (!hw) + return -EINVAL; + + if (!hw->fdir_prof) { + hw->fdir_prof = devm_kcalloc(ice_hw_to_dev(hw), + ICE_FLTR_PTYPE_MAX, + sizeof(*hw->fdir_prof), + GFP_KERNEL); + if (!hw->fdir_prof) + return -ENOMEM; + } + + if (!hw->fdir_prof[flow]) { + hw->fdir_prof[flow] = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(**hw->fdir_prof), + GFP_KERNEL); + if (!hw->fdir_prof[flow]) + return -ENOMEM; + } + + return 0; +} + +#ifdef NETIF_F_HW_TC +/** + * ice_fdir_prof_vsi_idx - find or insert a vsi_idx in structure + * @prof: pointer to flow director HW profile + * @vsi_idx: vsi_idx to locate + * + * Return the index of vsi_idx. If vsi_idx is not found, insert it + * into the vsi_h table. + */ +static u16 +ice_fdir_prof_vsi_idx(struct ice_fd_hw_prof *prof, int vsi_idx) +{ + u16 idx = 0; + + for (idx = 0; idx < prof->cnt; idx++) + if (prof->vsi_h[idx] == vsi_idx) + return idx; + + if (idx == prof->cnt) + prof->vsi_h[prof->cnt++] = vsi_idx; + return idx; +} +#endif /* NETIF_F_HW_TC */ + +/** + * ice_fdir_set_hw_fltr_rule - Configure HW tables to generate a FDir rule + * @pf: pointer to the PF structure + * @seg: protocol header description pointer + * @flow: filter enum + * @tun: FDir segment to program + */ +static int +ice_fdir_set_hw_fltr_rule(struct ice_pf *pf, struct ice_flow_seg_info *seg, + enum ice_fltr_ptype flow, enum ice_fd_hw_seg tun) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_vsi *main_vsi, *ctrl_vsi; + struct ice_flow_seg_info *old_seg; + struct ice_flow_prof *prof = NULL; + struct ice_fd_hw_prof *hw_prof; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u64 entry1_h = 0; + u64 entry2_h = 0; +#ifdef NETIF_F_HW_TC + bool del_last; +#endif /* NETIF_F_HW_TC */ + u64 prof_id; + int err; +#ifdef NETIF_F_HW_TC + int idx; +#endif /* NETIF_F_HW_TC */ + + main_vsi = ice_get_main_vsi(pf); + if (!main_vsi) + return -EINVAL; + + ctrl_vsi = ice_get_ctrl_vsi(pf); + if (!ctrl_vsi) + return -EINVAL; + + err = ice_fdir_alloc_flow_prof(hw, flow); + if (err) + return err; + + hw_prof = hw->fdir_prof[flow]; + old_seg = hw_prof->fdir_seg[tun]; + if (old_seg) { + /* This flow_type already has a changed input set. + * If it matches the requested input set then we are + * done.
Or, if it's different, then it's an error. + */ + if (!memcmp(old_seg, seg, sizeof(*seg))) + return -EEXIST; + + /* if there are FDir filters using this flow, + * then return error. + */ + if (hw->fdir_fltr_cnt[flow]) { + dev_err(dev, "Failed to add filter. Flow director filters on each port must have the same input set.\n"); + return -EINVAL; + } + if (ice_is_arfs_using_perfect_flow(hw, flow)) { + dev_err(dev, "aRFS using perfect flow type %d, cannot change input set\n", + flow); + return -EINVAL; + } + + /* remove HW filter definition */ + ice_fdir_rem_flow(hw, ICE_BLK_FD, flow); + } + + /* Adding a profile, but there is only one header supported. + * That is, the final parameters are one header (segment), no + * actions (NULL), and an action count of zero. + */ + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + status = ice_flow_add_prof(hw, ICE_BLK_FD, ICE_FLOW_RX, prof_id, seg, + TNL_SEG_CNT(tun), NULL, 0, &prof); + if (status) + return ice_status_to_errno(status); + status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, main_vsi->idx, + main_vsi->idx, ICE_FLOW_PRIO_NORMAL, + seg, NULL, 0, &entry1_h); + if (status) { + err = ice_status_to_errno(status); + goto err_prof; + } + status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, main_vsi->idx, + ctrl_vsi->idx, ICE_FLOW_PRIO_NORMAL, + seg, NULL, 0, &entry2_h); + if (status) { + err = ice_status_to_errno(status); + goto err_entry; + } + + hw_prof->fdir_seg[tun] = seg; + hw_prof->entry_h[0][tun] = entry1_h; + hw_prof->entry_h[1][tun] = entry2_h; + hw_prof->vsi_h[0] = main_vsi->idx; + hw_prof->vsi_h[1] = ctrl_vsi->idx; + if (!hw_prof->cnt) + hw_prof->cnt = 2; + +#ifdef NETIF_F_HW_TC + for (idx = 1; idx < ICE_CHNL_MAX_TC; idx++) { + u16 vsi_idx; + u16 vsi_h; + + if (!ice_is_adq_active(pf) || !main_vsi->tc_map_vsi[idx]) + continue; + + entry1_h = 0; + vsi_h = main_vsi->tc_map_vsi[idx]->idx; + status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, + main_vsi->idx, vsi_h, + ICE_FLOW_PRIO_NORMAL, seg, NULL, 0, + &entry1_h); + if (status) { + dev_err(dev, "Could not add Channel VSI %d to flow group\n", + idx); + goto err_unroll; + } + + vsi_idx = ice_fdir_prof_vsi_idx(hw_prof, + main_vsi->tc_map_vsi[idx]->idx); + hw_prof->entry_h[vsi_idx][tun] = entry1_h; + } +#endif /* NETIF_F_HW_TC */ + + return 0; + +#ifdef NETIF_F_HW_TC +err_unroll: + entry1_h = 0; + hw_prof->fdir_seg[tun] = NULL; + + /* The variable del_last will be used to determine when to clean up + * the VSI group data. The VSI data is not needed if there are no + * segments. + */ + del_last = true; + for (idx = 0; idx < ICE_FD_HW_SEG_MAX; idx++) + if (hw_prof->fdir_seg[idx]) { + del_last = false; + break; + } + + for (idx = 0; idx < hw_prof->cnt; idx++) { + u16 vsi_num = ice_get_hw_vsi_num(hw, hw_prof->vsi_h[idx]); + + if (!hw_prof->entry_h[idx][tun]) + continue; + ice_rem_prof_id_flow(hw, ICE_BLK_FD, vsi_num, prof_id); + ice_flow_rem_entry(hw, ICE_BLK_FD, hw_prof->entry_h[idx][tun]); + hw_prof->entry_h[idx][tun] = 0; + if (del_last) + hw_prof->vsi_h[idx] = 0; + } + if (del_last) + hw_prof->cnt = 0; +#endif /* NETIF_F_HW_TC */ +err_entry: + ice_rem_prof_id_flow(hw, ICE_BLK_FD, + ice_get_hw_vsi_num(hw, main_vsi->idx), prof_id); + ice_flow_rem_entry(hw, ICE_BLK_FD, entry1_h); +err_prof: + ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) + dev_err(dev, "Failed to add filter. Flow director filters must have the same input set as ADQ filters.\n"); + else + dev_err(dev, "Failed to add filter.
Flow director filters on each port must have the same input set.\n"); +#else /* !NETIF_F_HW_TC */ + dev_err(dev, "Failed to add filter. Flow director filters on each port must have the same input set.\n"); +#endif /* !NETIF_F_HW_TC */ + + return err; +} + +/** + * ice_set_init_fdir_seg + * @seg: flow segment for programming + * @l3_proto: ICE_FLOW_SEG_HDR_IPV4 or ICE_FLOW_SEG_HDR_IPV6 + * @l4_proto: ICE_FLOW_SEG_HDR_TCP or ICE_FLOW_SEG_HDR_UDP + * + * Set the configuration for perfect filters to the provided flow segment for + * programming the HW filter. This is to be called only when initializing + * filters as it assumes no filters exist. + */ +static int +ice_set_init_fdir_seg(struct ice_flow_seg_info *seg, + enum ice_flow_seg_hdr l3_proto, + enum ice_flow_seg_hdr l4_proto) +{ + enum ice_flow_field src_addr, dst_addr, src_port, dst_port; + + if (!seg) + return -EINVAL; + + if (l3_proto == ICE_FLOW_SEG_HDR_IPV4) { + src_addr = ICE_FLOW_FIELD_IDX_IPV4_SA; + dst_addr = ICE_FLOW_FIELD_IDX_IPV4_DA; + } else if (l3_proto == ICE_FLOW_SEG_HDR_IPV6) { + src_addr = ICE_FLOW_FIELD_IDX_IPV6_SA; + dst_addr = ICE_FLOW_FIELD_IDX_IPV6_DA; + } else { + return -EINVAL; + } + + if (l4_proto == ICE_FLOW_SEG_HDR_TCP) { + src_port = ICE_FLOW_FIELD_IDX_TCP_SRC_PORT; + dst_port = ICE_FLOW_FIELD_IDX_TCP_DST_PORT; + } else if (l4_proto == ICE_FLOW_SEG_HDR_UDP) { + src_port = ICE_FLOW_FIELD_IDX_UDP_SRC_PORT; + dst_port = ICE_FLOW_FIELD_IDX_UDP_DST_PORT; + } else { + return -EINVAL; + } + + ICE_FLOW_SET_HDRS(seg, l3_proto | l4_proto); + + /* IP source address */ + ice_flow_set_fld(seg, src_addr, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, false); + + /* IP destination address */ + ice_flow_set_fld(seg, dst_addr, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, false); + + /* Layer 4 source port */ + ice_flow_set_fld(seg, src_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, false); + + /* Layer 4 destination port */ + ice_flow_set_fld(seg, dst_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, false); + + return 0; +} + +/** + * ice_create_init_fdir_rule + * @pf: PF structure + * @flow: filter enum + * + * Return error value or 0 on success.
+ */ +static int +ice_create_init_fdir_rule(struct ice_pf *pf, enum ice_fltr_ptype flow) +{ + struct ice_flow_seg_info *seg, *tun_seg; + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + int ret; + + /* if there is already a filter rule for this kind, return -EINVAL */ + if (hw->fdir_prof && hw->fdir_prof[flow] && + hw->fdir_prof[flow]->fdir_seg[0]) + return -EINVAL; + + seg = devm_kzalloc(dev, sizeof(*seg), GFP_KERNEL); + if (!seg) + return -ENOMEM; + + tun_seg = devm_kzalloc(dev, sizeof(*seg) * ICE_FD_HW_SEG_MAX, + GFP_KERNEL); + if (!tun_seg) { + devm_kfree(dev, seg); + return -ENOMEM; + } + + if (flow == ICE_FLTR_PTYPE_NONF_IPV4_TCP) + ret = ice_set_init_fdir_seg(seg, ICE_FLOW_SEG_HDR_IPV4, + ICE_FLOW_SEG_HDR_TCP); + else if (flow == ICE_FLTR_PTYPE_NONF_IPV4_UDP) + ret = ice_set_init_fdir_seg(seg, ICE_FLOW_SEG_HDR_IPV4, + ICE_FLOW_SEG_HDR_UDP); + else if (flow == ICE_FLTR_PTYPE_NONF_IPV6_TCP) + ret = ice_set_init_fdir_seg(seg, ICE_FLOW_SEG_HDR_IPV6, + ICE_FLOW_SEG_HDR_TCP); + else if (flow == ICE_FLTR_PTYPE_NONF_IPV6_UDP) + ret = ice_set_init_fdir_seg(seg, ICE_FLOW_SEG_HDR_IPV6, + ICE_FLOW_SEG_HDR_UDP); + else + ret = -EINVAL; + if (ret) + goto err_exit; + + /* add filter for outer headers */ + ret = ice_fdir_set_hw_fltr_rule(pf, seg, flow, ICE_FD_HW_SEG_NON_TUN); + if (ret) + /* could not write filter, free memory */ + goto err_exit; + + /* make tunneled filter HW entries if possible */ + memcpy(&tun_seg[1], seg, sizeof(*seg)); + ret = ice_fdir_set_hw_fltr_rule(pf, tun_seg, flow, ICE_FD_HW_SEG_TUN); + if (ret) + /* could not write tunnel filter, but outer header filter + * exists + */ + devm_kfree(dev, tun_seg); + + set_bit(flow, hw->fdir_perfect_fltr); + return ret; +err_exit: + devm_kfree(dev, tun_seg); + devm_kfree(dev, seg); + + return -EOPNOTSUPP; +} + +/** + * ice_ntuple_check_ip4_seg - Check valid fields are provided for filter + * @tcp_ip4_spec: mask data from ethtool + */ +int ice_ntuple_check_ip4_seg(struct ethtool_tcpip4_spec *tcp_ip4_spec) +{ + /* make sure we don't have any empty rule */ + if (!tcp_ip4_spec->psrc && !tcp_ip4_spec->ip4src && + !tcp_ip4_spec->pdst && !tcp_ip4_spec->ip4dst) + return -EINVAL; + + /* filtering on TOS not supported */ + if (tcp_ip4_spec->tos) + return -EOPNOTSUPP; + + return 0; +} + +/** + * ice_ntuple_l4_proto_to_port + * @l4_proto: Layer 4 protocol to program + * @src_port: source flow field value for provided l4 protocol + * @dst_port: destination flow field value for provided l4 protocol + * + * Set associated src and dst port for given l4 protocol + */ +int +ice_ntuple_l4_proto_to_port(enum ice_flow_seg_hdr l4_proto, + enum ice_flow_field *src_port, + enum ice_flow_field *dst_port) +{ + if (l4_proto == ICE_FLOW_SEG_HDR_TCP) { + *src_port = ICE_FLOW_FIELD_IDX_TCP_SRC_PORT; + *dst_port = ICE_FLOW_FIELD_IDX_TCP_DST_PORT; + } else if (l4_proto == ICE_FLOW_SEG_HDR_UDP) { + *src_port = ICE_FLOW_FIELD_IDX_UDP_SRC_PORT; + *dst_port = ICE_FLOW_FIELD_IDX_UDP_DST_PORT; + } else if (l4_proto == ICE_FLOW_SEG_HDR_SCTP) { + *src_port = ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT; + *dst_port = ICE_FLOW_FIELD_IDX_SCTP_DST_PORT; + } else { + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * ice_set_fdir_ip4_seg + * @seg: flow segment for programming + * @tcp_ip4_spec: mask data from ethtool + * @l4_proto: Layer 4 protocol to program + * @perfect_fltr: only valid on success; returns true if perfect filter, + * false if not + * + * Set the mask data into the flow segment to be used to program HW + * table based on provided L4 protocol for IPv4 + */
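/* Editor's sketch: the mask handling in ice_set_fdir_ip4_seg() below is
 * all-or-nothing per field. A mask of all ones programs the field as an
 * exact match, a mask of zero leaves the field wildcarded (and the rule
 * is no longer counted as "perfect"), and any partial mask is rejected
 * with -EOPNOTSUPP. Kernel-side view of a rule that would pass, with
 * illustrative values (not part of this patch):
 */
#if 0
struct ethtool_rx_flow_spec example = {
	.flow_type = TCP_V4_FLOW,
	.h_u.tcp_ip4_spec = {
		.ip4src	= example_saddr,	/* value to match */
		.pdst	= htons(80),
	},
	.m_u.tcp_ip4_spec = {
		.ip4src	= htonl(0xFFFFFFFF),	/* exact match */
		.pdst	= htons(0xFFFF),	/* exact match */
		/* ip4dst and psrc masks left zero: wildcarded */
	},
};
#endif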
+static int +ice_set_fdir_ip4_seg(struct ice_flow_seg_info *seg, + struct ethtool_tcpip4_spec *tcp_ip4_spec, + enum ice_flow_seg_hdr l4_proto, bool *perfect_fltr) +{ + enum ice_flow_field src_port, dst_port; + int ret; + + ret = ice_ntuple_check_ip4_seg(tcp_ip4_spec); + if (ret) + return ret; + + ret = ice_ntuple_l4_proto_to_port(l4_proto, &src_port, &dst_port); + if (ret) + return ret; + + *perfect_fltr = true; + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4 | l4_proto); + + /* IP source address */ + if (tcp_ip4_spec->ip4src == htonl(0xFFFFFFFF)) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_SA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!tcp_ip4_spec->ip4src) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* IP destination address */ + if (tcp_ip4_spec->ip4dst == htonl(0xFFFFFFFF)) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_DA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!tcp_ip4_spec->ip4dst) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* Layer 4 source port */ + if (tcp_ip4_spec->psrc == htons(0xFFFF)) + ice_flow_set_fld(seg, src_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + false); + else if (!tcp_ip4_spec->psrc) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* Layer 4 destination port */ + if (tcp_ip4_spec->pdst == htons(0xFFFF)) + ice_flow_set_fld(seg, dst_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + false); + else if (!tcp_ip4_spec->pdst) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + return 0; +} + +/** + * ice_ntuple_check_ip4_usr_seg - Check valid fields are provided for filter + * @usr_ip4_spec: ethtool userdef packet offset + */ +int ice_ntuple_check_ip4_usr_seg(struct ethtool_usrip4_spec *usr_ip4_spec) +{ + /* first 4 bytes of Layer 4 header */ + if (usr_ip4_spec->l4_4_bytes) + return -EINVAL; + if (usr_ip4_spec->tos) + return -EINVAL; + if (usr_ip4_spec->ip_ver) + return -EINVAL; + /* Filtering on Layer 4 protocol not supported */ + if (usr_ip4_spec->proto) + return -EOPNOTSUPP; + /* empty rules are not valid */ + if (!usr_ip4_spec->ip4src && !usr_ip4_spec->ip4dst) + return -EINVAL; + + return 0; +} + +/** + * ice_set_fdir_ip4_usr_seg + * @seg: flow segment for programming + * @usr_ip4_spec: ethtool userdef packet offset + * @perfect_fltr: only set on success; returns true if perfect filter, false if + * not + * + * Set the offset data into the flow segment to be used to program HW + * table for IPv4 + */ +static int +ice_set_fdir_ip4_usr_seg(struct ice_flow_seg_info *seg, + struct ethtool_usrip4_spec *usr_ip4_spec, + bool *perfect_fltr) +{ + int ret; + + ret = ice_ntuple_check_ip4_usr_seg(usr_ip4_spec); + if (ret) + return ret; + + *perfect_fltr = true; + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4); + + /* IP source address */ + if (usr_ip4_spec->ip4src == htonl(0xFFFFFFFF)) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_SA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!usr_ip4_spec->ip4src) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* IP destination address */ + if (usr_ip4_spec->ip4dst == htonl(0xFFFFFFFF)) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV4_DA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!usr_ip4_spec->ip4dst) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + return 0; +} + +#ifdef 
HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC +/** + * ice_ntuple_check_ip6_seg - Check valid fields are provided for filter + * @tcp_ip6_spec: mask data from ethtool + */ +static int ice_ntuple_check_ip6_seg(struct ethtool_tcpip6_spec *tcp_ip6_spec) +{ + /* make sure we don't have any empty rule */ + if (!memcmp(tcp_ip6_spec->ip6src, &zero_ipv6_addr_mask, + sizeof(struct in6_addr)) && + !memcmp(tcp_ip6_spec->ip6dst, &zero_ipv6_addr_mask, + sizeof(struct in6_addr)) && + !tcp_ip6_spec->psrc && !tcp_ip6_spec->pdst) + return -EINVAL; + + /* filtering on TC not supported */ + if (tcp_ip6_spec->tclass) + return -EOPNOTSUPP; + + return 0; +} + +/** + * ice_set_fdir_ip6_seg + * @seg: flow segment for programming + * @tcp_ip6_spec: mask data from ethtool + * @l4_proto: Layer 4 protocol to program + * @perfect_fltr: only valid on success; returns true if perfect filter, + * false if not + * + * Set the mask data into the flow segment to be used to program HW + * table based on provided L4 protocol for IPv6 + */ +static int +ice_set_fdir_ip6_seg(struct ice_flow_seg_info *seg, + struct ethtool_tcpip6_spec *tcp_ip6_spec, + enum ice_flow_seg_hdr l4_proto, bool *perfect_fltr) +{ + enum ice_flow_field src_port, dst_port; + int ret; + + ret = ice_ntuple_check_ip6_seg(tcp_ip6_spec); + if (ret) + return ret; + + ret = ice_ntuple_l4_proto_to_port(l4_proto, &src_port, &dst_port); + if (ret) + return ret; + + *perfect_fltr = true; + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV6 | l4_proto); + + if (!memcmp(tcp_ip6_spec->ip6src, &full_ipv6_addr_mask, + sizeof(struct in6_addr))) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV6_SA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!memcmp(tcp_ip6_spec->ip6src, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + if (!memcmp(tcp_ip6_spec->ip6dst, &full_ipv6_addr_mask, + sizeof(struct in6_addr))) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV6_DA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!memcmp(tcp_ip6_spec->ip6dst, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* Layer 4 source port */ + if (tcp_ip6_spec->psrc == htons(0xFFFF)) + ice_flow_set_fld(seg, src_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + false); + else if (!tcp_ip6_spec->psrc) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + /* Layer 4 destination port */ + if (tcp_ip6_spec->pdst == htons(0xFFFF)) + ice_flow_set_fld(seg, dst_port, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + false); + else if (!tcp_ip6_spec->pdst) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + return 0; +} + +/** + * ice_ntuple_check_ip6_usr_seg - Check valid fields are provided for filter + * @usr_ip6_spec: ethtool userdef packet offset + */ +static int +ice_ntuple_check_ip6_usr_seg(struct ethtool_usrip6_spec *usr_ip6_spec) +{ + /* filtering on Layer 4 bytes not supported */ + if (usr_ip6_spec->l4_4_bytes) + return -EOPNOTSUPP; + /* filtering on TC not supported */ + if (usr_ip6_spec->tclass) + return -EOPNOTSUPP; + /* filtering on Layer 4 protocol not supported */ + if (usr_ip6_spec->l4_proto) + return -EOPNOTSUPP; + /* empty rules are not valid */ + if (!memcmp(usr_ip6_spec->ip6src, &zero_ipv6_addr_mask, + sizeof(struct in6_addr)) && + !memcmp(usr_ip6_spec->ip6dst, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + return -EINVAL; + + return 0; +} 
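/* Editor's sketch: the IPv6 handlers above cannot compare an address
 * mask as a scalar, so they memcmp() against full_ipv6_addr_mask and
 * zero_ipv6_addr_mask to make the same three-way decision the IPv4
 * paths make. A compact restatement of that classification (helper
 * name is illustrative, not part of this patch):
 */
#if 0
enum ice_v6_mask_class {
	ICE_V6_MASK_FULL,	/* program field as exact match */
	ICE_V6_MASK_ZERO,	/* wildcard; rule is not "perfect" */
	ICE_V6_MASK_PARTIAL,	/* unsupported: -EOPNOTSUPP */
};

static enum ice_v6_mask_class ice_classify_v6_mask(const __be32 mask[4])
{
	if (!memcmp(mask, &full_ipv6_addr_mask, sizeof(struct in6_addr)))
		return ICE_V6_MASK_FULL;
	if (!memcmp(mask, &zero_ipv6_addr_mask, sizeof(struct in6_addr)))
		return ICE_V6_MASK_ZERO;
	return ICE_V6_MASK_PARTIAL;
}
#endif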
+ +/** + * ice_set_fdir_ip6_usr_seg + * @seg: flow segment for programming + * @usr_ip6_spec: ethtool userdef packet offset + * @perfect_fltr: only set on success; returns true if perfect filter, false if + * not + * + * Set the offset data into the flow segment to be used to program HW + * table for IPv6 + */ +static int +ice_set_fdir_ip6_usr_seg(struct ice_flow_seg_info *seg, + struct ethtool_usrip6_spec *usr_ip6_spec, + bool *perfect_fltr) +{ + int ret; + + ret = ice_ntuple_check_ip6_usr_seg(usr_ip6_spec); + if (ret) + return ret; + + *perfect_fltr = true; + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV6); + + if (!memcmp(usr_ip6_spec->ip6src, &full_ipv6_addr_mask, + sizeof(struct in6_addr))) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV6_SA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!memcmp(usr_ip6_spec->ip6src, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + if (!memcmp(usr_ip6_spec->ip6dst, &full_ipv6_addr_mask, + sizeof(struct in6_addr))) + ice_flow_set_fld(seg, ICE_FLOW_FIELD_IDX_IPV6_DA, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + else if (!memcmp(usr_ip6_spec->ip6dst, &zero_ipv6_addr_mask, + sizeof(struct in6_addr))) + *perfect_fltr = false; + else + return -EOPNOTSUPP; + + return 0; +} +#endif /* HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC */ + +/** + * ice_cfg_fdir_xtrct_seq - Configure extraction sequence for the given filter + * @pf: PF structure + * @fsp: pointer to ethtool Rx flow specification + * @user: user defined data from flow specification + * + * Returns 0 on success. + */ +static int +ice_cfg_fdir_xtrct_seq(struct ice_pf *pf, struct ethtool_rx_flow_spec *fsp, + struct ice_rx_flow_userdef *user) +{ + struct ice_flow_seg_info *seg, *tun_seg; + struct device *dev = ice_pf_to_dev(pf); + enum ice_fltr_ptype fltr_idx; + struct ice_hw *hw = &pf->hw; + bool perfect_filter; + int ret; + + seg = devm_kzalloc(dev, sizeof(*seg), GFP_KERNEL); + if (!seg) + return -ENOMEM; + + tun_seg = devm_kzalloc(dev, sizeof(*seg) * ICE_FD_HW_SEG_MAX, + GFP_KERNEL); + if (!tun_seg) { + devm_kfree(dev, seg); + return -ENOMEM; + } + + switch (fsp->flow_type & ~FLOW_EXT) { + case TCP_V4_FLOW: + ret = ice_set_fdir_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_TCP, + &perfect_filter); + break; + case UDP_V4_FLOW: + ret = ice_set_fdir_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_UDP, + &perfect_filter); + break; + case SCTP_V4_FLOW: + ret = ice_set_fdir_ip4_seg(seg, &fsp->m_u.tcp_ip4_spec, + ICE_FLOW_SEG_HDR_SCTP, + &perfect_filter); + break; + case IPV4_USER_FLOW: + ret = ice_set_fdir_ip4_usr_seg(seg, &fsp->m_u.usr_ip4_spec, + &perfect_filter); + break; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case TCP_V6_FLOW: + ret = ice_set_fdir_ip6_seg(seg, &fsp->m_u.tcp_ip6_spec, + ICE_FLOW_SEG_HDR_TCP, + &perfect_filter); + break; + case UDP_V6_FLOW: + ret = ice_set_fdir_ip6_seg(seg, &fsp->m_u.tcp_ip6_spec, + ICE_FLOW_SEG_HDR_UDP, + &perfect_filter); + break; + case SCTP_V6_FLOW: + ret = ice_set_fdir_ip6_seg(seg, &fsp->m_u.tcp_ip6_spec, + ICE_FLOW_SEG_HDR_SCTP, + &perfect_filter); + break; + case IPV6_USER_FLOW: + ret = ice_set_fdir_ip6_usr_seg(seg, &fsp->m_u.usr_ip6_spec, + &perfect_filter); + break; +#endif /* HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC */ + default: + ret = -EINVAL; + } + if (ret) + goto err_exit; + + /* tunnel segments are shifted up one. 
*/ + memcpy(&tun_seg[1], seg, sizeof(*seg)); + + if (user && user->flex_fltr) { + perfect_filter = false; + ice_flow_add_fld_raw(seg, user->flex_offset, + ICE_FLTR_PRGM_FLEX_WORD_SIZE, + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL); + ice_flow_add_fld_raw(&tun_seg[1], user->flex_offset, + ICE_FLTR_PRGM_FLEX_WORD_SIZE, + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL); + } + + /* add filter for outer headers */ + fltr_idx = ice_ethtool_flow_to_fltr(fsp->flow_type & ~FLOW_EXT); + ret = ice_fdir_set_hw_fltr_rule(pf, seg, fltr_idx, + ICE_FD_HW_SEG_NON_TUN); + if (ret == -EEXIST) + /* Rule already exists, free memory and continue */ + devm_kfree(dev, seg); + else if (ret) + /* could not write filter, free memory */ + goto err_exit; + + /* make tunneled filter HW entries if possible */ + memcpy(&tun_seg[1], seg, sizeof(*seg)); + ret = ice_fdir_set_hw_fltr_rule(pf, tun_seg, fltr_idx, + ICE_FD_HW_SEG_TUN); + if (ret == -EEXIST) { + /* Rule already exists, free memory and count as success */ + devm_kfree(dev, tun_seg); + ret = 0; + } else if (ret) { + /* could not write tunnel filter, but outer filter exists */ + devm_kfree(dev, tun_seg); + } + + if (perfect_filter) + set_bit(fltr_idx, hw->fdir_perfect_fltr); + else + clear_bit(fltr_idx, hw->fdir_perfect_fltr); + + return ret; + +err_exit: + devm_kfree(dev, tun_seg); + devm_kfree(dev, seg); + + return -EOPNOTSUPP; +} + +/** + * ice_update_per_q_fltr + * @vsi: ptr to VSI + * @q_index: queue index + * @inc: true to increment or false to decrement per queue filter count + * + * This function is used to keep track of per queue sideband filters + */ +static void ice_update_per_q_fltr(struct ice_vsi *vsi, u32 q_index, bool inc) +{ + struct ice_ring *rx_ring; + + if (!vsi->num_rxq || q_index >= vsi->num_rxq) + return; + + rx_ring = vsi->rx_rings[q_index]; + if (!rx_ring || !rx_ring->ch) + return; + + if (inc) + atomic_inc(&rx_ring->ch->num_sb_fltr); + else + atomic_dec_if_positive(&rx_ring->ch->num_sb_fltr); +} + +/** + * ice_fdir_write_fltr - send a flow director filter to the hardware + * @pf: PF data structure + * @input: filter structure + * @add: true adds filter and false removes filter + * @is_tun: true adds inner filter on tunnel and false outer headers + * + * returns 0 on success and negative value on error + */ +int +ice_fdir_write_fltr(struct ice_pf *pf, struct ice_fdir_fltr *input, bool add, + bool is_tun) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + struct ice_fltr_desc desc; + struct ice_vsi *ctrl_vsi; + enum ice_status status; + u8 *pkt, *frag_pkt; + bool has_frag; + int err; + + ctrl_vsi = ice_get_ctrl_vsi(pf); + if (!ctrl_vsi) + return -EINVAL; + + pkt = devm_kzalloc(dev, ICE_FDIR_MAX_RAW_PKT_SIZE, GFP_KERNEL); + if (!pkt) + return -ENOMEM; + frag_pkt = devm_kzalloc(dev, ICE_FDIR_MAX_RAW_PKT_SIZE, GFP_KERNEL); + if (!frag_pkt) { + err = -ENOMEM; + goto err_free; + } + + ice_fdir_get_prgm_desc(hw, input, &desc, add); + status = ice_fdir_get_gen_prgm_pkt(hw, input, pkt, false, is_tun); + if (status) { + err = ice_status_to_errno(status); + goto err_free_all; + } + err = ice_prgm_fdir_fltr(ctrl_vsi, &desc, pkt); + if (err) + goto err_free_all; + + /* repeat for fragment packet */ + has_frag = ice_fdir_has_frag(input->flow_type); + if (has_frag) { + /* does not return error */ + ice_fdir_get_prgm_desc(hw, input, &desc, add); + status = ice_fdir_get_gen_prgm_pkt(hw, input, frag_pkt, true, + is_tun); + if (status) { + err = ice_status_to_errno(status); + goto err_frag; + } + err =
ice_prgm_fdir_fltr(ctrl_vsi, &desc, frag_pkt);
+		if (err)
+			goto err_frag;
+	} else {
+		devm_kfree(dev, frag_pkt);
+	}
+
+	return 0;
+
+err_free_all:
+	devm_kfree(dev, frag_pkt);
+err_free:
+	devm_kfree(dev, pkt);
+	return err;
+
+err_frag:
+	devm_kfree(dev, frag_pkt);
+	return err;
+}
+
+/**
+ * ice_fdir_write_all_fltr - write a flow director filter for both
+ *			     non-tunneled and tunneled flows
+ * @pf: PF data structure
+ * @input: filter structure
+ * @add: true to add the filter, false to remove it
+ *
+ * returns 0 on success and a negative value on error
+ */
+static int
+ice_fdir_write_all_fltr(struct ice_pf *pf, struct ice_fdir_fltr *input,
+			bool add)
+{
+	u16 port_num;
+	int tun;
+
+	for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) {
+		bool is_tun = tun == ICE_FD_HW_SEG_TUN;
+		int err;
+
+		if (is_tun && !ice_get_open_tunnel_port(&pf->hw, TNL_ALL,
+							&port_num))
+			continue;
+		err = ice_fdir_write_fltr(pf, input, add, is_tun);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/**
+ * ice_fdir_replay_fltrs - replay filters from the HW filter list
+ * @pf: board private structure
+ */
+void ice_fdir_replay_fltrs(struct ice_pf *pf)
+{
+	struct ice_fdir_fltr *f_rule;
+	struct ice_hw *hw = &pf->hw;
+
+	list_for_each_entry(f_rule, &hw->fdir_list_head, fltr_node) {
+		int err = ice_fdir_write_all_fltr(pf, f_rule, true);
+
+		if (err)
+			dev_dbg(ice_pf_to_dev(pf), "Flow Director error %d, could not reprogram filter %d\n",
+				err, f_rule->fltr_id);
+	}
+}
+
+/**
+ * ice_fdir_create_dflt_rules - create default perfect filters
+ * @pf: PF data structure
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ice_fdir_create_dflt_rules(struct ice_pf *pf)
+{
+	int err;
+
+	/* Create perfect TCP and UDP rules in hardware. */
+	err = ice_create_init_fdir_rule(pf, ICE_FLTR_PTYPE_NONF_IPV4_TCP);
+	if (err)
+		return err;
+
+	err = ice_create_init_fdir_rule(pf, ICE_FLTR_PTYPE_NONF_IPV4_UDP);
+	if (err)
+		return err;
+
+	err = ice_create_init_fdir_rule(pf, ICE_FLTR_PTYPE_NONF_IPV6_TCP);
+	if (err)
+		return err;
+
+	err = ice_create_init_fdir_rule(pf, ICE_FLTR_PTYPE_NONF_IPV6_UDP);
+
+	return err;
+}
+
+/**
+ * ice_vsi_manage_fdir - turn on/off flow director
+ * @vsi: the VSI being changed
+ * @ena: boolean value indicating if this is an enable or disable request
+ */
+void ice_vsi_manage_fdir(struct ice_vsi *vsi, bool ena)
+{
+	struct ice_fdir_fltr *f_rule, *tmp;
+	struct ice_pf *pf = vsi->back;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_fltr_ptype flow;
+
+	if (ena) {
+		set_bit(ICE_FLAG_FD_ENA, pf->flags);
+		ice_fdir_create_dflt_rules(pf);
+		return;
+	}
+
+	mutex_lock(&hw->fdir_fltr_lock);
+	if (!test_and_clear_bit(ICE_FLAG_FD_ENA, pf->flags))
+		goto release_lock;
+	list_for_each_entry_safe(f_rule, tmp, &hw->fdir_list_head, fltr_node) {
+		if (!f_rule->acl_fltr)
+			ice_fdir_write_all_fltr(pf, f_rule, false);
+		ice_fdir_update_cntrs(hw, f_rule->flow_type, f_rule->acl_fltr,
+				      false);
+		list_del(&f_rule->fltr_node);
+		devm_kfree(ice_pf_to_dev(pf), f_rule);
+	}
+
+	if (hw->fdir_prof)
+		for (flow = ICE_FLTR_PTYPE_NONF_NONE; flow < ICE_FLTR_PTYPE_MAX;
+		     flow++)
+			if (hw->fdir_prof[flow])
+				ice_fdir_rem_flow(hw, ICE_BLK_FD, flow);
+
+	if (hw->acl_prof)
+		for (flow = ICE_FLTR_PTYPE_NONF_NONE; flow < ICE_FLTR_PTYPE_MAX;
+		     flow++)
+			if (hw->acl_prof[flow])
+				ice_fdir_rem_flow(hw, ICE_BLK_ACL, flow);
+
+release_lock:
+	mutex_unlock(&hw->fdir_fltr_lock);
+}
+
+/**
+ * ice_del_acl_ethtool - delete an ACL rule entry
+ * @hw: pointer to HW instance
+ * @fltr: filter structure
+ *
+ * returns 0 on success and a negative value on error
+ */
+static int
+ice_del_acl_ethtool(struct ice_hw *hw, struct ice_fdir_fltr *fltr)
+{
+	u64 entry;
+
+	entry = ice_flow_find_entry(hw, ICE_BLK_ACL, fltr->fltr_id);
+	return ice_status_to_errno(ice_flow_rem_entry(hw, ICE_BLK_ACL, entry));
+}
+
+/**
+ * ice_fdir_do_rem_flow - delete flow and possibly add perfect flow
+ * @pf: PF structure
+ * @flow_type: FDir flow type to release
+ */
+static void
+ice_fdir_do_rem_flow(struct ice_pf *pf, enum ice_fltr_ptype flow_type)
+{
+	struct ice_hw *hw = &pf->hw;
+	bool need_perfect = false;
+
+	if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP ||
+	    flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP ||
+	    flow_type == ICE_FLTR_PTYPE_NONF_IPV6_TCP ||
+	    flow_type == ICE_FLTR_PTYPE_NONF_IPV6_UDP)
+		need_perfect = true;
+
+	if (need_perfect && test_bit(flow_type, hw->fdir_perfect_fltr))
+		return;
+
+	ice_fdir_rem_flow(hw, ICE_BLK_FD, flow_type);
+	if (need_perfect)
+		ice_create_init_fdir_rule(pf, flow_type);
+}
+
+/**
+ * ice_ntuple_update_list_entry - add or delete a filter from the filter list
+ * @pf: PF structure
+ * @input: filter structure
+ * @fltr_idx: ethtool index of filter to modify
+ *
+ * returns 0 on success and a negative value on error
+ */
+int
+ice_ntuple_update_list_entry(struct ice_pf *pf, struct ice_fdir_fltr *input,
+			     int fltr_idx)
+{
+	struct ice_fdir_fltr *old_fltr;
+	struct ice_hw *hw = &pf->hw;
+	struct ice_vsi *vsi;
+	int err = -ENOENT;
+
+	/* Do not update filters during reset */
+	if (ice_is_reset_in_progress(pf->state))
+		return -EBUSY;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi)
+		return -EINVAL;
+
+	old_fltr = ice_fdir_find_fltr_by_idx(hw, fltr_idx);
+	if (old_fltr) {
+		if (!old_fltr->acl_fltr) {
+			/* FD filter */
+			err = ice_fdir_write_all_fltr(pf, old_fltr, false);
+			if (err)
+				return err;
+		} else {
+			/* ACL filter - if the input buffer is present then
+			 * this is an update and we don't want to delete the
+			 * filter from the HW; we've already written the
+			 * change to the HW at this point, so just update the
+			 * SW structures to make sure everything is
+			 * hunky-dory. If there is no input then this is a
+			 * delete, so delete the filter from the HW and clean
+			 * up our SW structures.
+			 */
+			if (!input) {
+				err = ice_del_acl_ethtool(hw, old_fltr);
+				if (err)
+					return err;
+			}
+		}
+		ice_fdir_update_cntrs(hw, old_fltr->flow_type,
+				      old_fltr->acl_fltr, false);
+		/* update sb-filters count, specific to ring->channel */
+		ice_update_per_q_fltr(vsi, old_fltr->orig_q_index, false);
+		/* Also delete the HW filter info if we have just deleted the
+		 * last filter of flow_type.
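+		 * For the default perfect flow types (IPv4/IPv6 TCP and
+		 * UDP), ice_fdir_do_rem_flow() re-creates the initial rule
+		 * after removing the flow.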
+		 */
+		if (!old_fltr->acl_fltr && !input &&
+		    !hw->fdir_fltr_cnt[old_fltr->flow_type])
+			ice_fdir_do_rem_flow(pf, old_fltr->flow_type);
+		else if (old_fltr->acl_fltr && !input &&
+			 !hw->acl_fltr_cnt[old_fltr->flow_type])
+			ice_fdir_rem_flow(hw, ICE_BLK_ACL, old_fltr->flow_type);
+		list_del(&old_fltr->fltr_node);
+		devm_kfree(ice_pf_to_dev(pf), old_fltr);
+	}
+	if (!input)
+		return err;
+	ice_fdir_list_add_fltr(hw, input);
+	/* update sb-filters count, specific to ring->channel */
+	ice_update_per_q_fltr(vsi, input->orig_q_index, true);
+	ice_fdir_update_cntrs(hw, input->flow_type, input->acl_fltr, true);
+	return 0;
+}
+
+/**
+ * ice_del_ntuple_ethtool - delete Flow Director or ACL filter
+ * @vsi: pointer to target VSI
+ * @cmd: command to add or delete the filter
+ *
+ * Returns 0 on success and a negative value on failure
+ */
+int ice_del_ntuple_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd)
+{
+	struct ethtool_rx_flow_spec *fsp =
+		(struct ethtool_rx_flow_spec *)&cmd->fs;
+	struct ice_pf *pf = vsi->back;
+	struct ice_hw *hw = &pf->hw;
+	int val;
+
+	if (!test_bit(ICE_FLAG_FD_ENA, pf->flags))
+		return -EOPNOTSUPP;
+
+	/* Do not delete filters during reset */
+	if (ice_is_reset_in_progress(pf->state)) {
+		dev_err(ice_pf_to_dev(pf), "Device is resetting - deleting Flow Director filters not supported during reset\n");
+		return -EBUSY;
+	}
+
+	if (test_bit(ICE_FD_FLUSH_REQ, pf->state))
+		return -EBUSY;
+
+	mutex_lock(&hw->fdir_fltr_lock);
+	val = ice_ntuple_update_list_entry(pf, NULL, fsp->location);
+	mutex_unlock(&hw->fdir_fltr_lock);
+
+	return val;
+}
+
+/**
+ * ice_update_ring_dest_vsi - update dest ring and dest VSI
+ * @vsi: pointer to target VSI
+ * @dest_vsi: ptr to dest VSI index
+ * @ring: ptr to dest ring
+ *
+ * This function updates the destination VSI and queue if the user-specified
+ * target queue falls within a channel's (aka ADQ) queue region
+ */
+void
+ice_update_ring_dest_vsi(struct ice_vsi *vsi, u16 *dest_vsi, u32 *ring)
+{
+	struct ice_channel *ch;
+
+	if (!ring || !dest_vsi)
+		return;
+
+	list_for_each_entry(ch, &vsi->ch_list, list) {
+		if (!ch->ch_vsi)
+			continue;
+
+		/* locate the channel that owns the specified queue */
+		if ((*ring < ch->base_q) ||
+		    (*ring > (ch->base_q + ch->num_rxq)))
+			continue;
+
+		/* update the dest_vsi based on channel */
+		*dest_vsi = ch->ch_vsi->idx;
+
+		/* update the "ring" to be correct based on channel */
+		*ring -= ch->base_q;
+	}
+}
+
+/**
+ * ice_is_acl_filter - check whether a filter is an FD or an ACL filter
+ * @fsp: pointer to ethtool Rx flow specification
+ *
+ * If any field of the provided filter is using a partial mask then this is
+ * an ACL filter.
+ *
+ * Returns true for an ACL filter, false otherwise.
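+ * For example, an ip4src mask of 255.255.255.0 in m_u is a partial mask
+ * and selects the ACL path, while an all-ones mask (exact match) or an
+ * all-zeroes mask (don't care) keeps the filter in the FD block.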
+ */
+static bool ice_is_acl_filter(struct ethtool_rx_flow_spec *fsp)
+{
+	struct ethtool_tcpip4_spec *tcp_ip4_spec;
+	struct ethtool_usrip4_spec *usr_ip4_spec;
+
+	switch (fsp->flow_type & ~FLOW_EXT) {
+	case TCP_V4_FLOW:
+	case UDP_V4_FLOW:
+	case SCTP_V4_FLOW:
+		tcp_ip4_spec = &fsp->m_u.tcp_ip4_spec;
+
+		/* IP source address */
+		if (tcp_ip4_spec->ip4src &&
+		    tcp_ip4_spec->ip4src != htonl(0xFFFFFFFF))
+			return true;
+
+		/* IP destination address */
+		if (tcp_ip4_spec->ip4dst &&
+		    tcp_ip4_spec->ip4dst != htonl(0xFFFFFFFF))
+			return true;
+
+		/* Layer 4 source port */
+		if (tcp_ip4_spec->psrc && tcp_ip4_spec->psrc != htons(0xFFFF))
+			return true;
+
+		/* Layer 4 destination port */
+		if (tcp_ip4_spec->pdst && tcp_ip4_spec->pdst != htons(0xFFFF))
+			return true;
+
+		break;
+	case IPV4_USER_FLOW:
+		usr_ip4_spec = &fsp->m_u.usr_ip4_spec;
+
+		/* IP source address */
+		if (usr_ip4_spec->ip4src &&
+		    usr_ip4_spec->ip4src != htonl(0xFFFFFFFF))
+			return true;
+
+		/* IP destination address */
+		if (usr_ip4_spec->ip4dst &&
+		    usr_ip4_spec->ip4dst != htonl(0xFFFFFFFF))
+			return true;
+
+		break;
+	}
+
+	return false;
+}
+
+/**
+ * ice_ntuple_set_input_set - set the input set for the specified block
+ * @vsi: pointer to target VSI
+ * @blk: filter block to configure
+ * @fsp: pointer to ethtool Rx flow specification
+ * @input: filter structure
+ *
+ * Returns 0 on success and a negative value on error
+ */
+int
+ice_ntuple_set_input_set(struct ice_vsi *vsi, enum ice_block blk,
+			 struct ethtool_rx_flow_spec *fsp,
+			 struct ice_fdir_fltr *input)
+{
+	u16 dest_vsi, q_index = 0;
+	int flow_type, flow_mask;
+	u16 orig_q_index = 0;
+	struct ice_pf *pf;
+	struct ice_hw *hw;
+	u8 dest_ctl;
+
+	if (blk == ICE_BLK_FD)
+		flow_mask = FLOW_EXT;
+	else if (blk == ICE_BLK_ACL)
+		flow_mask = FLOW_MAC_EXT;
+	else
+		return -EINVAL;
+
+	pf = vsi->back;
+	hw = &pf->hw;
+
+	dest_vsi = vsi->idx;
+	if (fsp->ring_cookie == RX_CLS_FLOW_DISC) {
+		dest_ctl = ICE_FLTR_PRGM_DESC_DEST_DROP_PKT;
+	} else {
+		u32 ring = ethtool_get_flow_spec_ring(fsp->ring_cookie);
+		u8 vf = ethtool_get_flow_spec_ring_vf(fsp->ring_cookie);
+
+		if (!vf) {
+			if (ring >= vsi->num_rxq)
+				return -EINVAL;
+			orig_q_index = ring;
+			ice_update_ring_dest_vsi(vsi, &dest_vsi, &ring);
+		} else {
+			dev_err(ice_pf_to_dev(pf), "Failed to add filter. %s filters are not supported on VF queues.\n",
+				blk == ICE_BLK_FD ?
"Flow Director" : "ACL"); + return -EINVAL; + } + dest_ctl = ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QINDEX; + q_index = ring; + } + + input->fltr_id = fsp->location; + input->q_index = q_index; + flow_type = fsp->flow_type & ~flow_mask; + + /* Record the original queue as specified by user, because + * due to channel, configuration 'q_index' gets adjusted + * accordingly, but to keep user experience same - queue of + * flow-director filter shall report original queue number + * as specified by user, hence record it and use it later + */ + input->orig_q_index = orig_q_index; + input->dest_vsi = dest_vsi; + input->dest_ctl = dest_ctl; + input->fltr_status = ICE_FLTR_PRGM_DESC_FD_STATUS_FD_ID; + input->cnt_index = ICE_FD_SB_STAT_IDX(hw->fd_ctr_base); + input->flow_type = ice_ethtool_flow_to_fltr(flow_type); + + if (fsp->flow_type & FLOW_EXT) { + memcpy(input->ext_data.usr_def, fsp->h_ext.data, + sizeof(input->ext_data.usr_def)); + input->ext_data.vlan_type = fsp->h_ext.vlan_etype; + input->ext_data.vlan_tag = fsp->h_ext.vlan_tci; + memcpy(input->ext_mask.usr_def, fsp->m_ext.data, + sizeof(input->ext_mask.usr_def)); + input->ext_mask.vlan_type = fsp->m_ext.vlan_etype; + input->ext_mask.vlan_tag = fsp->m_ext.vlan_tci; + } + + switch (flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + input->ip.v4.dst_port = fsp->h_u.tcp_ip4_spec.pdst; + input->ip.v4.src_port = fsp->h_u.tcp_ip4_spec.psrc; + input->ip.v4.dst_ip = fsp->h_u.tcp_ip4_spec.ip4dst; + input->ip.v4.src_ip = fsp->h_u.tcp_ip4_spec.ip4src; + input->mask.v4.dst_port = fsp->m_u.tcp_ip4_spec.pdst; + input->mask.v4.src_port = fsp->m_u.tcp_ip4_spec.psrc; + input->mask.v4.dst_ip = fsp->m_u.tcp_ip4_spec.ip4dst; + input->mask.v4.src_ip = fsp->m_u.tcp_ip4_spec.ip4src; + break; + case IPV4_USER_FLOW: + input->ip.v4.dst_ip = fsp->h_u.usr_ip4_spec.ip4dst; + input->ip.v4.src_ip = fsp->h_u.usr_ip4_spec.ip4src; + input->ip.v4.l4_header = fsp->h_u.usr_ip4_spec.l4_4_bytes; + input->ip.v4.proto = fsp->h_u.usr_ip4_spec.proto; + input->ip.v4.ip_ver = fsp->h_u.usr_ip4_spec.ip_ver; + input->ip.v4.tos = fsp->h_u.usr_ip4_spec.tos; + input->mask.v4.dst_ip = fsp->m_u.usr_ip4_spec.ip4dst; + input->mask.v4.src_ip = fsp->m_u.usr_ip4_spec.ip4src; + input->mask.v4.l4_header = fsp->m_u.usr_ip4_spec.l4_4_bytes; + input->mask.v4.proto = fsp->m_u.usr_ip4_spec.proto; + input->mask.v4.ip_ver = fsp->m_u.usr_ip4_spec.ip_ver; + input->mask.v4.tos = fsp->m_u.usr_ip4_spec.tos; + break; +#ifdef HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + memcpy(input->ip.v6.dst_ip, fsp->h_u.tcp_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + memcpy(input->ip.v6.src_ip, fsp->h_u.tcp_ip6_spec.ip6src, + sizeof(struct in6_addr)); + input->ip.v6.dst_port = fsp->h_u.tcp_ip6_spec.pdst; + input->ip.v6.src_port = fsp->h_u.tcp_ip6_spec.psrc; + input->ip.v6.tc = fsp->h_u.tcp_ip6_spec.tclass; + memcpy(input->mask.v6.dst_ip, fsp->m_u.tcp_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + memcpy(input->mask.v6.src_ip, fsp->m_u.tcp_ip6_spec.ip6src, + sizeof(struct in6_addr)); + input->mask.v6.dst_port = fsp->m_u.tcp_ip6_spec.pdst; + input->mask.v6.src_port = fsp->m_u.tcp_ip6_spec.psrc; + input->mask.v6.tc = fsp->m_u.tcp_ip6_spec.tclass; + break; + case IPV6_USER_FLOW: + memcpy(input->ip.v6.dst_ip, fsp->h_u.usr_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + memcpy(input->ip.v6.src_ip, fsp->h_u.usr_ip6_spec.ip6src, + sizeof(struct in6_addr)); + input->ip.v6.l4_header = fsp->h_u.usr_ip6_spec.l4_4_bytes; + input->ip.v6.tc = 
fsp->h_u.usr_ip6_spec.tclass; + + /* if no protocol requested, use IPPROTO_NONE */ + if (!fsp->m_u.usr_ip6_spec.l4_proto) + input->ip.v6.proto = IPPROTO_NONE; + else + input->ip.v6.proto = fsp->h_u.usr_ip6_spec.l4_proto; + + memcpy(input->mask.v6.dst_ip, fsp->m_u.usr_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + memcpy(input->mask.v6.src_ip, fsp->m_u.usr_ip6_spec.ip6src, + sizeof(struct in6_addr)); + input->mask.v6.l4_header = fsp->m_u.usr_ip6_spec.l4_4_bytes; + input->mask.v6.tc = fsp->m_u.usr_ip6_spec.tclass; + input->mask.v6.proto = fsp->m_u.usr_ip6_spec.l4_proto; + break; +#endif /* HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC */ + default: + /* not doing un-parsed flow types */ + return -EINVAL; + } + + return 0; +} + +/** + * ice_add_ntuple_ethtool - Add/Remove Flow Director or ACL filter + * @vsi: pointer to target VSI + * @cmd: command to add or delete the filter + * + * Returns 0 on success and negative values for failure + */ +int ice_add_ntuple_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd) +{ + struct ice_rx_flow_userdef userdata; + struct ethtool_rx_flow_spec *fsp; + struct ice_fdir_fltr *input; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + int fltrs_needed; + u16 tunnel_port; + int ret; + + if (!vsi) + return -EINVAL; + + pf = vsi->back; + hw = &pf->hw; + dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_FLAG_FD_ENA, pf->flags)) + return -EOPNOTSUPP; + + + /* Do not program filters during reset */ + if (ice_is_reset_in_progress(pf->state)) { + dev_err(dev, "Device is resetting - adding ntuple filters not supported during reset\n"); + return -EBUSY; + } + + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + if (ice_parse_rx_flow_user_data(fsp, &userdata)) + return -EINVAL; + + if (fsp->flow_type & FLOW_MAC_EXT) + return -EINVAL; + + if (fsp->location >= ice_ntuple_get_max_fltr_cnt(hw)) { + dev_err(dev, "Failed to add filter. The maximum number of ntuple filters has been reached.\n"); + return -ENOSPC; + } + + /* ACL filter */ + if (pf->hw.acl_tbl && ice_is_acl_filter(fsp)) + return ice_acl_add_rule_ethtool(vsi, cmd); + + ret = ice_cfg_fdir_xtrct_seq(pf, fsp, &userdata); + if (ret) + return ret; + + /* return error if not an update and no available filters */ + fltrs_needed = ice_get_open_tunnel_port(hw, TNL_ALL, &tunnel_port) ? + 2 : 1; + if (!ice_fdir_find_fltr_by_idx(hw, fsp->location) && + ice_fdir_num_avail_fltr(hw, pf->vsi[vsi->idx]) < fltrs_needed) { + dev_err(dev, "Failed to add filter. 
The maximum number of flow director filters has been reached.\n"); + return -ENOSPC; + } + + input = devm_kzalloc(dev, sizeof(*input), GFP_KERNEL); + if (!input) + return -ENOMEM; + + ret = ice_ntuple_set_input_set(vsi, ICE_BLK_FD, fsp, input); + if (ret) + goto free_input; + + mutex_lock(&hw->fdir_fltr_lock); + if (ice_fdir_is_dup_fltr(hw, input)) { + ret = -EINVAL; + goto release_lock; + } + + if (userdata.flex_fltr) { + input->flex_fltr = true; + input->flex_word = cpu_to_be16(userdata.flex_word); + input->flex_offset = userdata.flex_offset; + } + + input->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_THREE; + input->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL; + input->cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; + + /* input struct is added to the HW filter list */ + ice_ntuple_update_list_entry(pf, input, fsp->location); + + ret = ice_fdir_write_all_fltr(pf, input, true); + if (ret) + goto remove_sw_rule; + + goto release_lock; + +remove_sw_rule: + ice_fdir_update_cntrs(hw, input->flow_type, false, false); + /* update sb-filters count, specific to ring->channel */ + ice_update_per_q_fltr(vsi, input->orig_q_index, false); + list_del(&input->fltr_node); +release_lock: + mutex_unlock(&hw->fdir_fltr_lock); +free_input: + if (ret) + devm_kfree(dev, input); + + return ret; +} diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.c b/drivers/net/ethernet/intel/ice/ice_fdir.c new file mode 100644 index 0000000000000000000000000000000000000000..d63f5d85590318346b8fc4d88fa92a989600a593 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fdir.c @@ -0,0 +1,2103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_common.h" +#include "ice_fdir.h" + +/* These are training packet headers used to program flow director filters. 
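+ * Each array is a minimal template for one flow type. A template is copied
+ * and the filter's match values are written into the copy (see
+ * ice_fdir_get_gen_prgm_pkt(), used by ice_fdir_write_fltr() above) before
+ * the packet is sent on the control VSI's programming queue.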
*/ +static const u8 ice_fdir_tcpv4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x28, 0x00, 0x01, 0x00, 0x00, 0x40, 0x06, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const u8 ice_fdir_udpv4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x1C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_sctpv4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x20, 0x00, 0x00, 0x40, 0x00, 0x40, 0x84, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x40, 0x00, 0x40, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 +}; + +static const u8 ice_fdir_udp4_vxlan_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4e, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x38, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xb2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x24, + 0xbf, 0xc0, 0x30, 0xff, 0x00, 0x14, 0x00, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x14, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x00, 0x3a, 0x3d, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x40, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x2c, + 0x00, 0x6f, 0x30, 0xff, 0x00, 0x1c, 0x00, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x1c, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x11, 0x3a, 0x24, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x08, 0xbe, 0xc7, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4c, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x9e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x38, + 0x00, 0x4c, 0x30, 0xff, 0x00, 0x28, 0x00, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x28, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x06, 0x3a, 0x23, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x02, 
0x20, 0x00, 0x4e, 0xd2, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_gtpu4_eh_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x42, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa8, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x2e, + 0xba, 0x1d, 0x34, 0xff, 0x00, 0x1e, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x16, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x00, 0x7c, 0xe5, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_gtpu4_eh_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4a, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa0, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x36, + 0xb8, 0x23, 0x34, 0xff, 0x00, 0x26, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x1e, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x11, 0x7c, 0xcc, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x0a, 0x01, 0xd8, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_gtpu4_eh_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x56, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x94, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x42, + 0xb8, 0x00, 0x34, 0xff, 0x00, 0x32, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x2a, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x06, 0x7c, 0xcb, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x02, 0x20, 0x00, 0x91, 0xde, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_gtpu4_eh_dw_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x42, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa8, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x2e, + 0xba, 0x1d, 0x34, 0xff, 0x00, 0x1e, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x16, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x00, 0x7c, 0xe5, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_gtpu4_eh_dw_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4a, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa0, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x36, + 0xb8, 0x23, 0x34, 0xff, 0x00, 0x26, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x1e, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x11, 0x7c, 0xcc, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x0a, 0x01, 0xd8, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_gtpu4_eh_dw_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x56, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x94, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x42, + 0xb8, 0x00, 0x34, 0xff, 0x00, 0x32, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x00, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x2a, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x06, 0x7c, 
0xcb, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x02, 0x20, 0x00, 0x91, 0xde, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_gtpu4_eh_up_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x42, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa8, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x2e, + 0xba, 0x0d, 0x34, 0xff, 0x00, 0x1e, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x10, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x16, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x00, 0x7c, 0xe5, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_gtpu4_eh_up_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4a, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0xa0, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x36, + 0xb8, 0x13, 0x34, 0xff, 0x00, 0x26, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x10, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x1e, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x11, 0x7c, 0xcc, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x0a, 0x01, 0xd8, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_gtpu4_eh_up_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x56, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x94, 0x7f, 0x00, 0x00, 0x01, 0x7f, 0x00, + 0x00, 0x01, 0x08, 0x68, 0x08, 0x68, 0x00, 0x42, + 0xb7, 0xf0, 0x34, 0xff, 0x00, 0x32, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x01, 0x10, + 0x00, 0x00, 0x45, 0x00, 0x00, 0x2a, 0x00, 0x01, + 0x00, 0x00, 0x40, 0x06, 0x7c, 0xcb, 0x7f, 0x00, + 0x00, 0x01, 0x7f, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x02, 0x20, 0x00, 0x91, 0xde, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_icmp4_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4c, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x00, + 0x00, 0x00, 0x34, 0xff, 0x00, 0x28, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x45, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00, 0x40, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4c, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x9e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x38, + 0x24, 0x42, 0x30, 0xff, 0x00, 0x28, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp6_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x54, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x40, + 0x4e, 0x3d, 0x30, 0xff, 0x00, 0x30, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x08, + 0x11, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, + 0xff, 0xdc, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp6_gtpu4_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x62, 0x00, 0x01, 0x00, 0x00, 0x40, 0x11, + 0x7c, 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x68, 0x08, 0x68, 0x00, 0x4e, + 0x59, 0x08, 0x30, 0xff, 0x00, 0x3e, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x16, + 0x06, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x50, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x02, + 0x20, 0x00, 0x8f, 0x7b, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x38, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x68, + 0x08, 0x68, 0x00, 0x38, 0x22, 0x43, 0x30, 0xff, + 0x00, 0x28, 0x00, 0x00, 0x00, 0x00, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3b, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu6_eh_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x44, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x68, + 0x08, 0x68, 0x00, 0x44, 0x1b, 0x9a, 0x34, 0xff, + 0x00, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x85, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu6_eh_dw_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x44, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x68, + 0x08, 0x68, 0x00, 0x44, 0x1b, 0x9a, 0x34, 0xff, + 0x00, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x85, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_gtpu6_eh_up_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x44, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x68, + 0x08, 0x68, 0x00, 0x44, 0x1b, 0x8a, 0x34, 0xff, + 0x00, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x85, 0x02, 0x10, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3b, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_l2tpv3_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x40, 0x00, 0x40, 0x73, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_l2tpv3_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x73, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_esp_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x40, 0x00, 0x40, 0x32, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 +}; + +static const u8 ice_fdir_ipv6_esp_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x32, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_ah_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x40, 0x00, 0x40, 0x33, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 +}; + +static const u8 ice_fdir_ipv6_ah_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x33, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_nat_t_esp_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x1C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, 0x11, 0x94, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_nat_t_esp_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x08, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x11, 0x94, 0x00, 0x00, 0x00, 0x08, +}; + +static const u8 ice_fdir_ipv4_pfcp_node_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x2C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x22, 0x65, 0x22, 0x65, 0x00, 0x00, + 0x00, 0x00, 0x20, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_pfcp_session_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x2C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x22, 0x65, 0x22, 0x65, 0x00, 0x00, + 0x00, 0x00, 0x21, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_pfcp_node_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x18, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x22, 0x65, + 0x22, 0x65, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, + 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_pfcp_session_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x18, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x22, 0x65, + 0x22, 0x65, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, + 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_non_ip_l2_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ecpri_tp0_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xAE, 0xFE, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_udp_ecpri_tp0_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x1C, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_frag_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2C, 0x40, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3B, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv4_frag_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x20, 0x00, 0x40, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 +}; + +static const u8 ice_fdir_tcpv6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x06, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_udpv6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x08, 0x11, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, +}; + +static const u8 ice_fdir_sctpv6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x0C, 0x84, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const u8 ice_fdir_ipv6_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x86, 0xDD, 0x60, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3B, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp4_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x5a, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x28, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp4_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x4e, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00, + 0x40, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_sctp4_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x52, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x20, 0x00, 0x01, 0x00, 0x00, + 0x40, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ip4_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x46, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_tcp6_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x6e, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, + 0x60, 0x00, 0x00, 0x00, 0x00, 0x14, 0x06, 0x40, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x50, 0x00, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_udp6_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x62, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, + 0x60, 0x00, 0x00, 0x00, 0x00, 0x08, 0x11, 0x40, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_sctp6_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x66, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, + 0x60, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x84, 0x40, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; + +static const u8 ice_fdir_ip6_tun_pkt[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x45, 0x00, + 0x00, 0x5a, 0x00, 0x00, 0x40, 0x00, 0x40, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0xdd, + 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3b, 0x40, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* Flow Director no-op training packet table */ +static const struct ice_fdir_base_pkt ice_fdir_pkt[] = { + { + ICE_FLTR_PTYPE_NONF_IPV4_TCP, + sizeof(ice_fdir_tcpv4_pkt), ice_fdir_tcpv4_pkt, + sizeof(ice_fdir_tcp4_tun_pkt), ice_fdir_tcp4_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_UDP, + sizeof(ice_fdir_udpv4_pkt), ice_fdir_udpv4_pkt, + sizeof(ice_fdir_udp4_tun_pkt), ice_fdir_udp4_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_SCTP, + sizeof(ice_fdir_sctpv4_pkt), ice_fdir_sctpv4_pkt, + sizeof(ice_fdir_sctp4_tun_pkt), ice_fdir_sctp4_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_OTHER, + sizeof(ice_fdir_ipv4_pkt), ice_fdir_ipv4_pkt, + sizeof(ice_fdir_ip4_tun_pkt), ice_fdir_ip4_tun_pkt, + }, + { + ICE_FLTR_PTYPE_FRAG_IPV4, + sizeof(ice_fdir_ipv4_frag_pkt), ice_fdir_ipv4_frag_pkt, + sizeof(ice_fdir_ipv4_frag_pkt), ice_fdir_ipv4_frag_pkt, + }, + { + ICE_FLTR_PTYPE_FRAG_IPV6, + sizeof(ice_fdir_ipv6_frag_pkt), ice_fdir_ipv6_frag_pkt, + sizeof(ice_fdir_ipv6_frag_pkt), ice_fdir_ipv6_frag_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW, + sizeof(ice_fdir_ipv4_gtpu4_eh_dw_pkt), + ice_fdir_ipv4_gtpu4_eh_dw_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_dw_pkt), + ice_fdir_ipv4_gtpu4_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP, + sizeof(ice_fdir_ipv4_gtpu4_eh_up_pkt), + ice_fdir_ipv4_gtpu4_eh_up_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_up_pkt), + ice_fdir_ipv4_gtpu4_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP, + sizeof(ice_fdir_udp4_gtpu4_pkt), + ice_fdir_udp4_gtpu4_pkt, + sizeof(ice_fdir_udp4_gtpu4_pkt), + ice_fdir_udp4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP, + sizeof(ice_fdir_tcp4_gtpu4_pkt), + ice_fdir_tcp4_gtpu4_pkt, + sizeof(ice_fdir_tcp4_gtpu4_pkt), + ice_fdir_tcp4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_UDP, + sizeof(ice_fdir_udp4_gtpu4_eh_pkt), + ice_fdir_udp4_gtpu4_eh_pkt, + sizeof(ice_fdir_udp4_gtpu4_eh_pkt), + ice_fdir_udp4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_TCP, + sizeof(ice_fdir_tcp4_gtpu4_eh_pkt), + 
ice_fdir_tcp4_gtpu4_eh_pkt, + sizeof(ice_fdir_tcp4_gtpu4_eh_pkt), + ice_fdir_tcp4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4, + sizeof(ice_fdir_ipv4_gtpu4_eh_dw_pkt), + ice_fdir_ipv4_gtpu4_eh_dw_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_dw_pkt), + ice_fdir_ipv4_gtpu4_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_UDP, + sizeof(ice_fdir_udp4_gtpu4_eh_dw_pkt), + ice_fdir_udp4_gtpu4_eh_dw_pkt, + sizeof(ice_fdir_udp4_gtpu4_eh_dw_pkt), + ice_fdir_udp4_gtpu4_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_TCP, + sizeof(ice_fdir_tcp4_gtpu4_eh_dw_pkt), + ice_fdir_tcp4_gtpu4_eh_dw_pkt, + sizeof(ice_fdir_tcp4_gtpu4_eh_dw_pkt), + ice_fdir_tcp4_gtpu4_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4, + sizeof(ice_fdir_ipv4_gtpu4_eh_up_pkt), + ice_fdir_ipv4_gtpu4_eh_up_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_up_pkt), + ice_fdir_ipv4_gtpu4_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_UDP, + sizeof(ice_fdir_udp4_gtpu4_eh_up_pkt), + ice_fdir_udp4_gtpu4_eh_up_pkt, + sizeof(ice_fdir_udp4_gtpu4_eh_up_pkt), + ice_fdir_udp4_gtpu4_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_TCP, + sizeof(ice_fdir_tcp4_gtpu4_eh_up_pkt), + ice_fdir_tcp4_gtpu4_eh_up_pkt, + sizeof(ice_fdir_tcp4_gtpu4_eh_up_pkt), + ice_fdir_tcp4_gtpu4_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_ICMP, + sizeof(ice_fdir_icmp4_gtpu4_pkt), + ice_fdir_icmp4_gtpu4_pkt, + sizeof(ice_fdir_icmp4_gtpu4_pkt), + ice_fdir_icmp4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_OTHER, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + sizeof(ice_fdir_ipv4_gtpu4_pkt), + ice_fdir_ipv4_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6, + sizeof(ice_fdir_ipv6_gtpu4_pkt), + ice_fdir_ipv6_gtpu4_pkt, + sizeof(ice_fdir_ipv6_gtpu4_pkt), + ice_fdir_ipv6_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_UDP, + sizeof(ice_fdir_udp6_gtpu4_pkt), + ice_fdir_udp6_gtpu4_pkt, + sizeof(ice_fdir_udp6_gtpu4_pkt), + ice_fdir_udp6_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_TCP, + sizeof(ice_fdir_tcp6_gtpu4_pkt), + ice_fdir_tcp6_gtpu4_pkt, + sizeof(ice_fdir_tcp6_gtpu4_pkt), + ice_fdir_tcp6_gtpu4_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU, + sizeof(ice_fdir_ipv6_gtpu6_pkt), + ice_fdir_ipv6_gtpu6_pkt, + sizeof(ice_fdir_ipv6_gtpu6_pkt), + ice_fdir_ipv6_gtpu6_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH, + sizeof(ice_fdir_ipv6_gtpu6_eh_pkt), + ice_fdir_ipv6_gtpu6_eh_pkt, + sizeof(ice_fdir_ipv6_gtpu6_eh_pkt), + ice_fdir_ipv6_gtpu6_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW, + sizeof(ice_fdir_ipv6_gtpu6_eh_dw_pkt), + ice_fdir_ipv6_gtpu6_eh_dw_pkt, + sizeof(ice_fdir_ipv6_gtpu6_eh_dw_pkt), + ice_fdir_ipv6_gtpu6_eh_dw_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP, + sizeof(ice_fdir_ipv6_gtpu6_eh_up_pkt), + ice_fdir_ipv6_gtpu6_eh_up_pkt, + sizeof(ice_fdir_ipv6_gtpu6_eh_up_pkt), + ice_fdir_ipv6_gtpu6_eh_up_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_IPV6_OTHER, + sizeof(ice_fdir_ipv6_gtpu6_pkt), + ice_fdir_ipv6_gtpu6_pkt, + sizeof(ice_fdir_ipv6_gtpu6_pkt), + ice_fdir_ipv6_gtpu6_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_OTHER, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + sizeof(ice_fdir_ipv4_gtpu4_eh_pkt), + ice_fdir_ipv4_gtpu4_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_IPV6_OTHER, + sizeof(ice_fdir_ipv6_gtpu6_eh_pkt), + ice_fdir_ipv6_gtpu6_eh_pkt, + sizeof(ice_fdir_ipv6_gtpu6_eh_pkt), + ice_fdir_ipv6_gtpu6_eh_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3, + 
sizeof(ice_fdir_ipv4_l2tpv3_pkt), ice_fdir_ipv4_l2tpv3_pkt, + sizeof(ice_fdir_ipv4_l2tpv3_pkt), ice_fdir_ipv4_l2tpv3_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3, + sizeof(ice_fdir_ipv6_l2tpv3_pkt), ice_fdir_ipv6_l2tpv3_pkt, + sizeof(ice_fdir_ipv6_l2tpv3_pkt), ice_fdir_ipv6_l2tpv3_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_ESP, + sizeof(ice_fdir_ipv4_esp_pkt), ice_fdir_ipv4_esp_pkt, + sizeof(ice_fdir_ipv4_esp_pkt), ice_fdir_ipv4_esp_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_ESP, + sizeof(ice_fdir_ipv6_esp_pkt), ice_fdir_ipv6_esp_pkt, + sizeof(ice_fdir_ipv6_esp_pkt), ice_fdir_ipv6_esp_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_AH, + sizeof(ice_fdir_ipv4_ah_pkt), ice_fdir_ipv4_ah_pkt, + sizeof(ice_fdir_ipv4_ah_pkt), ice_fdir_ipv4_ah_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_AH, + sizeof(ice_fdir_ipv6_ah_pkt), ice_fdir_ipv6_ah_pkt, + sizeof(ice_fdir_ipv6_ah_pkt), ice_fdir_ipv6_ah_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP, + sizeof(ice_fdir_ipv4_nat_t_esp_pkt), + ice_fdir_ipv4_nat_t_esp_pkt, + sizeof(ice_fdir_ipv4_nat_t_esp_pkt), + ice_fdir_ipv4_nat_t_esp_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP, + sizeof(ice_fdir_ipv6_nat_t_esp_pkt), + ice_fdir_ipv6_nat_t_esp_pkt, + sizeof(ice_fdir_ipv6_nat_t_esp_pkt), + ice_fdir_ipv6_nat_t_esp_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE, + sizeof(ice_fdir_ipv4_pfcp_node_pkt), + ice_fdir_ipv4_pfcp_node_pkt, + sizeof(ice_fdir_ipv4_pfcp_node_pkt), + ice_fdir_ipv4_pfcp_node_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION, + sizeof(ice_fdir_ipv4_pfcp_session_pkt), + ice_fdir_ipv4_pfcp_session_pkt, + sizeof(ice_fdir_ipv4_pfcp_session_pkt), + ice_fdir_ipv4_pfcp_session_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE, + sizeof(ice_fdir_ipv6_pfcp_node_pkt), + ice_fdir_ipv6_pfcp_node_pkt, + sizeof(ice_fdir_ipv6_pfcp_node_pkt), + ice_fdir_ipv6_pfcp_node_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION, + sizeof(ice_fdir_ipv6_pfcp_session_pkt), + ice_fdir_ipv6_pfcp_session_pkt, + sizeof(ice_fdir_ipv6_pfcp_session_pkt), + ice_fdir_ipv6_pfcp_session_pkt, + }, + { + ICE_FLTR_PTYPE_NON_IP_L2, + sizeof(ice_fdir_non_ip_l2_pkt), ice_fdir_non_ip_l2_pkt, + sizeof(ice_fdir_non_ip_l2_pkt), ice_fdir_non_ip_l2_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_UDP_VXLAN, + sizeof(ice_fdir_udp4_vxlan_pkt), ice_fdir_udp4_vxlan_pkt, + sizeof(ice_fdir_udp4_vxlan_pkt), ice_fdir_udp4_vxlan_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_ECPRI_TP0, + sizeof(ice_fdir_ecpri_tp0_pkt), ice_fdir_ecpri_tp0_pkt, + sizeof(ice_fdir_ecpri_tp0_pkt), ice_fdir_ecpri_tp0_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0, + sizeof(ice_fdir_ipv4_udp_ecpri_tp0_pkt), + ice_fdir_ipv4_udp_ecpri_tp0_pkt, + sizeof(ice_fdir_ipv4_udp_ecpri_tp0_pkt), + ice_fdir_ipv4_udp_ecpri_tp0_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_TCP, + sizeof(ice_fdir_tcpv6_pkt), ice_fdir_tcpv6_pkt, + sizeof(ice_fdir_tcp6_tun_pkt), ice_fdir_tcp6_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_UDP, + sizeof(ice_fdir_udpv6_pkt), ice_fdir_udpv6_pkt, + sizeof(ice_fdir_udp6_tun_pkt), ice_fdir_udp6_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_SCTP, + sizeof(ice_fdir_sctpv6_pkt), ice_fdir_sctpv6_pkt, + sizeof(ice_fdir_sctp6_tun_pkt), ice_fdir_sctp6_tun_pkt, + }, + { + ICE_FLTR_PTYPE_NONF_IPV6_OTHER, + sizeof(ice_fdir_ipv6_pkt), ice_fdir_ipv6_pkt, + sizeof(ice_fdir_ip6_tun_pkt), ice_fdir_ip6_tun_pkt, + }, +}; + +#define ICE_FDIR_NUM_PKT ARRAY_SIZE(ice_fdir_pkt) + +/** + * ice_set_dflt_val_fd_desc + * @fd_fltr_ctx: pointer to fd filter descriptor + */ +void ice_set_dflt_val_fd_desc(struct ice_fd_fltr_desc_ctx *fd_fltr_ctx) +{ + 
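/* Populate hardware defaults for every descriptor-context field;
+	 * per-filter values that this helper does not set (e.g. qindex,
+	 * cnt_index, fd_vsi, pcmd) are filled in by the caller before
+	 * ice_set_fd_desc_val() builds the descriptor words.
+	 */
+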
fd_fltr_ctx->comp_q = ICE_FXD_FLTR_QW0_COMP_Q_ZERO; + fd_fltr_ctx->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL; + fd_fltr_ctx->fd_space = ICE_FXD_FLTR_QW0_FD_SPACE_GUAR_BEST; + fd_fltr_ctx->cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; + fd_fltr_ctx->evict_ena = ICE_FXD_FLTR_QW0_EVICT_ENA_TRUE; + fd_fltr_ctx->toq = ICE_FXD_FLTR_QW0_TO_Q_EQUALS_QINDEX; + fd_fltr_ctx->toq_prio = ICE_FXD_FLTR_QW0_TO_Q_PRIO1; + fd_fltr_ctx->dpu_recipe = ICE_FXD_FLTR_QW0_DPU_RECIPE_DFLT; + fd_fltr_ctx->drop = ICE_FXD_FLTR_QW0_DROP_NO; + fd_fltr_ctx->flex_prio = ICE_FXD_FLTR_QW0_FLEX_PRI_NONE; + fd_fltr_ctx->flex_mdid = ICE_FXD_FLTR_QW0_FLEX_MDID0; + fd_fltr_ctx->flex_val = ICE_FXD_FLTR_QW0_FLEX_VAL0; + fd_fltr_ctx->dtype = ICE_TX_DESC_DTYPE_FLTR_PROG; + fd_fltr_ctx->desc_prof_prio = ICE_FXD_FLTR_QW1_PROF_PRIO_ZERO; + fd_fltr_ctx->desc_prof = ICE_FXD_FLTR_QW1_PROF_ZERO; + fd_fltr_ctx->swap = ICE_FXD_FLTR_QW1_SWAP_SET; + fd_fltr_ctx->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_ONE; + fd_fltr_ctx->fdid_mdid = ICE_FXD_FLTR_QW1_FDID_MDID_FD; + fd_fltr_ctx->fdid = ICE_FXD_FLTR_QW1_FDID_ZERO; +} + +/** + * ice_set_fd_desc_val + * @ctx: pointer to fd filter descriptor context + * @fdir_desc: populated with fd filter descriptor values + */ +void +ice_set_fd_desc_val(struct ice_fd_fltr_desc_ctx *ctx, + struct ice_fltr_desc *fdir_desc) +{ + u64 qword; + + /* prep QW0 of FD filter programming desc */ + qword = ((u64)ctx->qindex << ICE_FXD_FLTR_QW0_QINDEX_S) & + ICE_FXD_FLTR_QW0_QINDEX_M; + qword |= ((u64)ctx->comp_q << ICE_FXD_FLTR_QW0_COMP_Q_S) & + ICE_FXD_FLTR_QW0_COMP_Q_M; + qword |= ((u64)ctx->comp_report << ICE_FXD_FLTR_QW0_COMP_REPORT_S) & + ICE_FXD_FLTR_QW0_COMP_REPORT_M; + qword |= ((u64)ctx->fd_space << ICE_FXD_FLTR_QW0_FD_SPACE_S) & + ICE_FXD_FLTR_QW0_FD_SPACE_M; + qword |= ((u64)ctx->cnt_index << ICE_FXD_FLTR_QW0_STAT_CNT_S) & + ICE_FXD_FLTR_QW0_STAT_CNT_M; + qword |= ((u64)ctx->cnt_ena << ICE_FXD_FLTR_QW0_STAT_ENA_S) & + ICE_FXD_FLTR_QW0_STAT_ENA_M; + qword |= ((u64)ctx->evict_ena << ICE_FXD_FLTR_QW0_EVICT_ENA_S) & + ICE_FXD_FLTR_QW0_EVICT_ENA_M; + qword |= ((u64)ctx->toq << ICE_FXD_FLTR_QW0_TO_Q_S) & + ICE_FXD_FLTR_QW0_TO_Q_M; + qword |= ((u64)ctx->toq_prio << ICE_FXD_FLTR_QW0_TO_Q_PRI_S) & + ICE_FXD_FLTR_QW0_TO_Q_PRI_M; + qword |= ((u64)ctx->dpu_recipe << ICE_FXD_FLTR_QW0_DPU_RECIPE_S) & + ICE_FXD_FLTR_QW0_DPU_RECIPE_M; + qword |= ((u64)ctx->drop << ICE_FXD_FLTR_QW0_DROP_S) & + ICE_FXD_FLTR_QW0_DROP_M; + qword |= ((u64)ctx->flex_prio << ICE_FXD_FLTR_QW0_FLEX_PRI_S) & + ICE_FXD_FLTR_QW0_FLEX_PRI_M; + qword |= ((u64)ctx->flex_mdid << ICE_FXD_FLTR_QW0_FLEX_MDID_S) & + ICE_FXD_FLTR_QW0_FLEX_MDID_M; + qword |= ((u64)ctx->flex_val << ICE_FXD_FLTR_QW0_FLEX_VAL_S) & + ICE_FXD_FLTR_QW0_FLEX_VAL_M; + fdir_desc->qidx_compq_space_stat = cpu_to_le64(qword); + + /* prep QW1 of FD filter programming desc */ + qword = ((u64)ctx->dtype << ICE_FXD_FLTR_QW1_DTYPE_S) & + ICE_FXD_FLTR_QW1_DTYPE_M; + qword |= ((u64)ctx->pcmd << ICE_FXD_FLTR_QW1_PCMD_S) & + ICE_FXD_FLTR_QW1_PCMD_M; + qword |= ((u64)ctx->desc_prof_prio << ICE_FXD_FLTR_QW1_PROF_PRI_S) & + ICE_FXD_FLTR_QW1_PROF_PRI_M; + qword |= ((u64)ctx->desc_prof << ICE_FXD_FLTR_QW1_PROF_S) & + ICE_FXD_FLTR_QW1_PROF_M; + qword |= ((u64)ctx->fd_vsi << ICE_FXD_FLTR_QW1_FD_VSI_S) & + ICE_FXD_FLTR_QW1_FD_VSI_M; + qword |= ((u64)ctx->swap << ICE_FXD_FLTR_QW1_SWAP_S) & + ICE_FXD_FLTR_QW1_SWAP_M; + qword |= ((u64)ctx->fdid_prio << ICE_FXD_FLTR_QW1_FDID_PRI_S) & + ICE_FXD_FLTR_QW1_FDID_PRI_M; + qword |= ((u64)ctx->fdid_mdid << ICE_FXD_FLTR_QW1_FDID_MDID_S) & + ICE_FXD_FLTR_QW1_FDID_MDID_M; + 
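ice_set_fd_desc_val() builds each half of the programming descriptor with the same idiom: every context field is shifted into position, masked to its width, and OR-ed into a 64-bit qword that is finally stored little-endian. A minimal standalone sketch of that shift-and-mask pattern follows; the EX_* shift/mask constants are invented for the example, the real positions and widths come from the ICE_FXD_FLTR_QW0/QW1 definitions.

	/*
	 * Illustrative sketch (not part of the patch): the shift-and-mask
	 * packing pattern used by ice_set_fd_desc_val(). EX_* values are
	 * made up; the driver uses the ICE_FXD_FLTR_QW* layout instead.
	 */
	#include <stdint.h>
	#include <stdio.h>

	#define EX_QINDEX_S 0
	#define EX_QINDEX_M (0x7FFULL << EX_QINDEX_S)	/* hypothetical 11-bit field */
	#define EX_DROP_S   26
	#define EX_DROP_M   (0x1ULL << EX_DROP_S)	/* hypothetical 1-bit flag */

	int main(void)
	{
		uint16_t qindex = 5;
		uint8_t drop = 1;
		uint64_t qword = 0;

		/* shift each field into place, then mask it to its width */
		qword |= ((uint64_t)qindex << EX_QINDEX_S) & EX_QINDEX_M;
		qword |= ((uint64_t)drop << EX_DROP_S) & EX_DROP_M;

		/* the driver would then store this with cpu_to_le64() */
		printf("packed qword: 0x%016llx\n", (unsigned long long)qword);
		return 0;
	}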
qword |= ((u64)ctx->fdid << ICE_FXD_FLTR_QW1_FDID_S) & + ICE_FXD_FLTR_QW1_FDID_M; + fdir_desc->dtype_cmd_vsi_fdid = cpu_to_le64(qword); +} + +/** + * ice_fdir_get_prgm_desc - set a fdir descriptor from a fdir filter struct + * @hw: pointer to the hardware structure + * @input: filter + * @fdesc: filter descriptor + * @add: if add is true, this is an add operation, false implies delete + */ +void +ice_fdir_get_prgm_desc(struct ice_hw *hw, struct ice_fdir_fltr *input, + struct ice_fltr_desc *fdesc, bool add) +{ + struct ice_fd_fltr_desc_ctx fdir_fltr_ctx = { 0 }; + + /* set default context info */ + ice_set_dflt_val_fd_desc(&fdir_fltr_ctx); + + /* change sideband filtering values */ + fdir_fltr_ctx.fdid = input->fltr_id; + if (input->dest_ctl == ICE_FLTR_PRGM_DESC_DEST_DROP_PKT) { + fdir_fltr_ctx.drop = ICE_FXD_FLTR_QW0_DROP_YES; + fdir_fltr_ctx.qindex = 0; + } else if (input->dest_ctl == + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_OTHER) { + fdir_fltr_ctx.drop = ICE_FXD_FLTR_QW0_DROP_NO; + fdir_fltr_ctx.qindex = 0; + } else { + if (input->dest_ctl == + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QGROUP) + fdir_fltr_ctx.toq = input->q_region; + fdir_fltr_ctx.drop = ICE_FXD_FLTR_QW0_DROP_NO; + fdir_fltr_ctx.qindex = input->q_index; + } + fdir_fltr_ctx.cnt_ena = input->cnt_ena; + fdir_fltr_ctx.cnt_index = input->cnt_index; + fdir_fltr_ctx.fd_vsi = ice_get_hw_vsi_num(hw, input->dest_vsi); + fdir_fltr_ctx.evict_ena = ICE_FXD_FLTR_QW0_EVICT_ENA_FALSE; + if (input->dest_ctl == ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_OTHER) + fdir_fltr_ctx.toq_prio = 0; + else + fdir_fltr_ctx.toq_prio = 3; + fdir_fltr_ctx.pcmd = add ? ICE_FXD_FLTR_QW1_PCMD_ADD : + ICE_FXD_FLTR_QW1_PCMD_REMOVE; + fdir_fltr_ctx.swap = ICE_FXD_FLTR_QW1_SWAP_NOT_SET; + fdir_fltr_ctx.comp_q = ICE_FXD_FLTR_QW0_COMP_Q_ZERO; + fdir_fltr_ctx.comp_report = input->comp_report; + fdir_fltr_ctx.fdid_prio = input->fdid_prio; + fdir_fltr_ctx.desc_prof = 1; + fdir_fltr_ctx.desc_prof_prio = 3; + ice_set_fd_desc_val(&fdir_fltr_ctx, fdesc); +} + +/** + * ice_alloc_fd_res_cntr - obtain counter resource for FD type + * @hw: pointer to the hardware structure + * @cntr_id: returns counter index + */ +enum ice_status ice_alloc_fd_res_cntr(struct ice_hw *hw, u16 *cntr_id) +{ + return ice_alloc_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_COUNTER_BLOCK, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, 1, cntr_id); +} + +/** + * ice_free_fd_res_cntr - Free counter resource for FD type + * @hw: pointer to the hardware structure + * @cntr_id: counter index to be freed + */ +enum ice_status ice_free_fd_res_cntr(struct ice_hw *hw, u16 cntr_id) +{ + return ice_free_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_COUNTER_BLOCK, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, 1, cntr_id); +} + +/** + * ice_alloc_fd_guar_item - allocate resource for FD guaranteed entries + * @hw: pointer to the hardware structure + * @cntr_id: returns counter index + * @num_fltr: number of filter entries to be allocated + */ +enum ice_status +ice_alloc_fd_guar_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr) +{ + return ice_alloc_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_GUARANTEED_ENTRIES, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, num_fltr, + cntr_id); +} + +/** + * ice_free_fd_guar_item - Free flow director guaranteed entries + * @hw: pointer to the hardware structure + * @cntr_id: counter index that needs to be freed + * @num_fltr: number of filters to be freed + */ +enum ice_status +ice_free_fd_guar_item(struct ice_hw *hw, u16 cntr_id, u16 num_fltr) +{ + return ice_free_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_GUARANTEED_ENTRIES, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, 
num_fltr, + cntr_id); +} + +/** + * ice_alloc_fd_shrd_item - allocate resource for flow director shared entries + * @hw: pointer to the hardware structure + * @cntr_id: returns counter index + * @num_fltr: number of filter entries to be allocated + */ +enum ice_status +ice_alloc_fd_shrd_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr) +{ + return ice_alloc_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_SHARED_ENTRIES, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, num_fltr, + cntr_id); +} + +/** + * ice_free_fd_shrd_item - Free flow director shared entries + * @hw: pointer to the hardware structure + * @cntr_id: counter index that needs to be freed + * @num_fltr: number of filters to be freed + */ +enum ice_status +ice_free_fd_shrd_item(struct ice_hw *hw, u16 cntr_id, u16 num_fltr) +{ + return ice_free_res_cntr(hw, ICE_AQC_RES_TYPE_FDIR_SHARED_ENTRIES, + ICE_AQC_RES_TYPE_FLAG_DEDICATED, num_fltr, + cntr_id); +} + +/** + * ice_get_fdir_cnt_all - get the number of Flow Director filters + * @hw: hardware data structure + * + * Returns the number of filters available on device + */ +int ice_get_fdir_cnt_all(struct ice_hw *hw) +{ + return hw->func_caps.fd_fltr_guar + hw->func_caps.fd_fltr_best_effort; +} + +/** + * ice_pkt_insert_ipv6_addr - insert an IPv6 address into a memory buffer. + * @pkt: packet buffer + * @offset: offset into buffer + * @addr: IPv6 address (as be32 words) to convert and insert into pkt at offset + */ +static void ice_pkt_insert_ipv6_addr(u8 *pkt, int offset, __be32 *addr) +{ + int idx; + + for (idx = 0; idx < ICE_IPV6_ADDR_LEN_AS_U32; idx++) + memcpy(pkt + offset + idx * sizeof(*addr), &addr[idx], + sizeof(*addr)); +} + +/** + * ice_pkt_insert_u6_qfi - insert a 6-bit QFI value into a memory buffer for GTPU + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 8 bit value to convert and insert into pkt at offset + * + * This function is designed for inserting the 6-bit QFI field for GTPU. + */ +static void ice_pkt_insert_u6_qfi(u8 *pkt, int offset, u8 data) +{ + u8 ret; + + ret = (data & 0x3F) + (*(pkt + offset) & 0xC0); + memcpy(pkt + offset, &ret, sizeof(ret)); +} + +/** + * ice_pkt_insert_u8 - insert a u8 value into a memory buffer. + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 8 bit value to convert and insert into pkt at offset + */ +static void ice_pkt_insert_u8(u8 *pkt, int offset, u8 data) +{ + memcpy(pkt + offset, &data, sizeof(data)); +} + +/** + * ice_pkt_insert_u8_tc - insert a u8 value into a memory buffer for TC ipv6. + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 8 bit value to convert and insert into pkt at offset + * + * This function is designed for inserting the Traffic Class (TC) for IPv6, + * since the TC field is not byte aligned. Here we split it into two parts, + * merge each part with the adjacent bits copied from pkt, and then insert + * the two resulting bytes one by one. + */ +static void ice_pkt_insert_u8_tc(u8 *pkt, int offset, u8 data) +{ + u8 high, low; + + high = (data >> 4) + (*(pkt + offset) & 0xF0); + memcpy(pkt + offset, &high, sizeof(high)); + + low = (*(pkt + offset + 1) & 0x0F) + ((data & 0x0F) << 4); + memcpy(pkt + offset + 1, &low, sizeof(low)); +} + +/** + * ice_pkt_insert_u16 - insert a be16 value into a memory buffer. + * @pkt: packet buffer + * @offset: offset into buffer + * @data: 16 bit value to convert and insert into pkt at offset + */ +static void ice_pkt_insert_u16(u8 *pkt, int offset, __be16 data) +{ + memcpy(pkt + offset, &data, sizeof(data)); +} + +/** + * ice_pkt_insert_u32 - insert a be32 value into a memory buffer.
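ice_pkt_insert_u6_qfi() and ice_pkt_insert_u8_tc() above both do a read-modify-write so that neighboring bits already present in the dummy packet survive the insertion. A standalone sketch of the TC variant, with a worked example (the buffer contents are invented for illustration):

	/*
	 * Illustrative sketch (not part of the patch): the read-modify-write
	 * logic of ice_pkt_insert_u8_tc(), which spreads the 8-bit IPv6
	 * Traffic Class across the low nibble of one byte and the high nibble
	 * of the next, preserving the adjacent Version and Flow Label bits.
	 */
	#include <stdint.h>
	#include <stdio.h>

	static void insert_u8_tc(uint8_t *pkt, int offset, uint8_t tc)
	{
		/* high TC nibble -> low nibble of byte 0 (keep Version nibble) */
		pkt[offset] = (pkt[offset] & 0xF0) | (tc >> 4);
		/* low TC nibble -> high nibble of byte 1 (keep Flow Label bits) */
		pkt[offset + 1] = (pkt[offset + 1] & 0x0F) | ((tc & 0x0F) << 4);
	}

	int main(void)
	{
		uint8_t hdr[2] = { 0x60, 0x0A };	/* version 6, some flow label bits */

		insert_u8_tc(hdr, 0, 0xB8);		/* example traffic class */
		printf("%02x %02x\n", hdr[0], hdr[1]);	/* prints "6b 8a" */
		return 0;
	}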
+ * @pkt: packet buffer + * @offset: offset into buffer + * @data: 32 bit value to convert and insert into pkt at offset + */ +static void ice_pkt_insert_u32(u8 *pkt, int offset, __be32 data) +{ + memcpy(pkt + offset, &data, sizeof(data)); +} + +/** + * ice_pkt_insert_mac_addr - insert a MAC addr into a memory buffer. + * @pkt: packet buffer + * @addr: MAC address to convert and insert into pkt at offset + */ +static void ice_pkt_insert_mac_addr(u8 *pkt, u8 *addr) +{ + ether_addr_copy(pkt, addr); +} + +/** + * ice_fdir_get_open_tunnel_port + * @hw: pointer to the hardware structure + * @flow: flow ptype + * @port: returns open port + * + * returns an open tunnel port specified for this flow type + */ +static enum ice_status +ice_fdir_get_open_tunnel_port(struct ice_hw *hw, enum ice_fltr_ptype flow, + u16 *port) +{ + switch (flow) { + case ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0: + /* eCPRI tunnel */ + if (!ice_get_open_tunnel_port(hw, TNL_ECPRI, port)) + return ICE_ERR_DOES_NOT_EXIST; + break; + default: + if (!ice_get_open_tunnel_port(hw, TNL_VXLAN, port) && + !ice_get_open_tunnel_port(hw, TNL_GENEVE, port)) + return ICE_ERR_DOES_NOT_EXIST; + } + + return 0; +} + +/** + * ice_fdir_get_gen_prgm_pkt - generate a training packet + * @hw: pointer to the hardware structure + * @input: flow director filter data structure + * @pkt: pointer to return filter packet + * @frag: generate a fragment packet + * @tun: true implies generate a tunnel packet + */ +enum ice_status +ice_fdir_get_gen_prgm_pkt(struct ice_hw *hw, struct ice_fdir_fltr *input, + u8 *pkt, bool frag, bool tun) +{ + enum ice_fltr_ptype flow; + u16 tnl_port; + u8 *loc; + u16 idx; + + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_OTHER) { + switch (input->ip.v4.proto) { + case IPPROTO_TCP: + flow = ICE_FLTR_PTYPE_NONF_IPV4_TCP; + break; + case IPPROTO_UDP: + flow = ICE_FLTR_PTYPE_NONF_IPV4_UDP; + break; + case IPPROTO_SCTP: + flow = ICE_FLTR_PTYPE_NONF_IPV4_SCTP; + break; + default: + flow = ICE_FLTR_PTYPE_NONF_IPV4_OTHER; + break; + } + } else if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV6_OTHER) { + switch (input->ip.v6.proto) { + case IPPROTO_TCP: + flow = ICE_FLTR_PTYPE_NONF_IPV6_TCP; + break; + case IPPROTO_UDP: + flow = ICE_FLTR_PTYPE_NONF_IPV6_UDP; + break; + case IPPROTO_SCTP: + flow = ICE_FLTR_PTYPE_NONF_IPV6_SCTP; + break; + default: + flow = ICE_FLTR_PTYPE_NONF_IPV6_OTHER; + break; + } + } else { + flow = input->flow_type; + } + + for (idx = 0; idx < ICE_FDIR_NUM_PKT; idx++) + if (ice_fdir_pkt[idx].flow == flow) + break; + if (idx == ICE_FDIR_NUM_PKT) + return ICE_ERR_PARAM; + if (!tun) { + memcpy(pkt, ice_fdir_pkt[idx].pkt, ice_fdir_pkt[idx].pkt_len); + loc = pkt; + } else { + if (!ice_fdir_pkt[idx].tun_pkt) + return ICE_ERR_PARAM; + + switch (flow) { + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_TCP: + memcpy(pkt, ice_fdir_pkt[idx].tun_pkt, + ice_fdir_pkt[idx].tun_pkt_len); + loc = &pkt[ICE_FDIR_GTPU_IP_INNER_PKT_OFF]; + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4: + case 
ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_TCP: + memcpy(pkt, ice_fdir_pkt[idx].tun_pkt, + ice_fdir_pkt[idx].tun_pkt_len); + loc = &pkt[ICE_FDIR_GTPU_EH_INNER_PKT_OFF]; + break; + default: + if (ice_fdir_get_open_tunnel_port(hw, flow, &tnl_port)) + return ICE_ERR_DOES_NOT_EXIST; + + memcpy(pkt, ice_fdir_pkt[idx].tun_pkt, + ice_fdir_pkt[idx].tun_pkt_len); + ice_pkt_insert_u16(pkt, ICE_IPV4_UDP_DST_PORT_OFFSET, + htons(tnl_port)); + loc = &pkt[ICE_FDIR_TUN_PKT_OFF]; + break; + } + } + + /* Reverse the src and dst, since the HW expects them to be from Tx + * perspective. The input from user is from Rx filter perspective. + */ + switch (flow) { + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_TCP_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_TCP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + if (frag) + loc[20] = ICE_FDIR_IPV4_PKT_FLAG_MF; + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + ice_pkt_insert_mac_addr(pkt, input->ext_data_outer.dst_mac); + ice_pkt_insert_mac_addr(pkt + ETH_ALEN, + input->ext_data_outer.src_mac); + ice_pkt_insert_u32(pkt, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip_outer.v4.dst_ip); + ice_pkt_insert_u32(pkt, ICE_IPV4_DST_ADDR_OFFSET, + input->ip_outer.v4.src_ip); + ice_pkt_insert_u8(pkt, ICE_IPV4_TOS_OFFSET, input->ip_outer.v4.tos); + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_UDP_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_UDP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + ice_pkt_insert_mac_addr(loc + ETH_ALEN, + input->ext_data.src_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_SCTP: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_SCTP_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_SCTP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_OTHER: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_u8(loc, ICE_IPV4_PROTO_OFFSET, + input->ip.v4.proto); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP_VXLAN: + ice_pkt_insert_mac_addr(pkt, input->ext_data_outer.dst_mac); + ice_pkt_insert_mac_addr(pkt + ETH_ALEN, input->ext_data_outer.src_mac); + ice_pkt_insert_u32(pkt, ICE_IPV4_SRC_ADDR_OFFSET, + 
input->ip_outer.v4.dst_ip); + ice_pkt_insert_u32(pkt, ICE_IPV4_DST_ADDR_OFFSET, + input->ip_outer.v4.src_ip); + ice_pkt_insert_u8(pkt, ICE_IPV4_TOS_OFFSET, input->ip_outer.v4.tos); + ice_pkt_insert_u32(pkt, ICE_IPV4_VXLAN_VNI_OFFSET, + input->vxlan_data.vni); + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_UDP_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV4_UDP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + ice_pkt_insert_mac_addr(loc + ETH_ALEN, input->ext_data.src_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_GTPU_TEID_OFFSET, + input->gtpu_data.teid); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4: + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_PROTO_OFFSET, + input->ip.v4.proto); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_GTPU_TEID_OFFSET, + input->gtpu_data.teid); + ice_pkt_insert_u6_qfi(loc, ICE_IPV4_GTPU_QFI_OFFSET, + input->gtpu_data.qfi); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_UDP: + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_UDP4_NO_MAC_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_UDP4_NO_MAC_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TTL_OFFSET, input->ip.v4.ttl); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_TCP: + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u16(loc, ICE_TCP4_NO_MAC_DST_PORT_OFFSET, + input->ip.v4.src_port); + ice_pkt_insert_u32(loc, ICE_IPV4_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u16(loc, ICE_TCP4_NO_MAC_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u8(loc, ICE_IPV4_NO_MAC_TTL_OFFSET, input->ip.v4.ttl); + break; + case 
ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_NO_MAC_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_NO_MAC_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_u8(loc, ICE_IPV6_NO_MAC_PROTO_OFFSET, + input->ip.v6.proto); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_UDP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_UDP6_NO_MAC_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_UDP6_NO_MAC_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_NO_MAC_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_NO_MAC_HLIM_OFFSET, input->ip.v6.hlim); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_TCP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_NO_MAC_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_TCP6_NO_MAC_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_TCP6_NO_MAC_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_NO_MAC_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_NO_MAC_HLIM_OFFSET, input->ip.v6.hlim); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_IPV6_OTHER: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV6_GTPU_TEID_OFFSET, + input->gtpu_data.teid); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_IPV6_OTHER: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV6_GTPU_TEID_OFFSET, + input->gtpu_data.teid); + ice_pkt_insert_u6_qfi(loc, ICE_IPV6_GTPU_QFI_OFFSET, + input->gtpu_data.qfi); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3: + ice_pkt_insert_u32(loc, ICE_IPV4_L2TPV3_SESS_ID_OFFSET, + input->l2tpv3_data.session_id); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3: + ice_pkt_insert_u32(loc, ICE_IPV6_L2TPV3_SESS_ID_OFFSET, + input->l2tpv3_data.session_id); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_ESP: + ice_pkt_insert_u32(loc, ICE_IPV4_ESP_SPI_OFFSET, + input->ip.v4.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_ESP: + ice_pkt_insert_u32(loc, ICE_IPV6_ESP_SPI_OFFSET, + input->ip.v6.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_AH: + ice_pkt_insert_u32(loc, ICE_IPV4_AH_SPI_OFFSET, + input->ip.v4.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_AH: + ice_pkt_insert_u32(loc, ICE_IPV6_AH_SPI_OFFSET, + input->ip.v6.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_NAT_T_ESP_SPI_OFFSET, + input->ip.v4.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP: + ice_pkt_insert_ipv6_addr(loc, 
ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u32(loc, ICE_IPV6_NAT_T_ESP_SPI_OFFSET, + input->ip.v6.sec_parm_idx); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE: + case ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION: + ice_pkt_insert_u16(loc, ICE_IPV4_UDP_SRC_PORT_OFFSET, + input->ip.v4.dst_port); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE: + case ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION: + ice_pkt_insert_u16(loc, ICE_IPV6_UDP_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + break; + case ICE_FLTR_PTYPE_NON_IP_L2: + ice_pkt_insert_u16(loc, ICE_MAC_ETHTYPE_OFFSET, + input->ext_data.ether_type); + break; + case ICE_FLTR_PTYPE_NONF_ECPRI_TP0: + ice_pkt_insert_u16(loc, ICE_ECPRI_TP0_PC_ID_OFFSET, + input->ecpri_data.pc_id); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0: + /* Use pkt instead of loc, since PC_ID is in the outer part */ + ice_pkt_insert_u16(pkt, ICE_IPV4_UDP_ECPRI_TP0_PC_ID_OFFSET, + input->ecpri_data.pc_id); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV6_TCP_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_IPV6_TCP_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV6_UDP_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_IPV6_UDP_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_SCTP: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u16(loc, ICE_IPV6_SCTP_DST_PORT_OFFSET, + input->ip.v6.src_port); + ice_pkt_insert_u16(loc, ICE_IPV6_SCTP_SRC_PORT_OFFSET, + input->ip.v6.dst_port); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_OTHER: + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_DST_ADDR_OFFSET, + input->ip.v6.src_ip); + ice_pkt_insert_ipv6_addr(loc, ICE_IPV6_SRC_ADDR_OFFSET, + input->ip.v6.dst_ip); + ice_pkt_insert_u8_tc(loc, ICE_IPV6_TC_OFFSET, input->ip.v6.tc); + ice_pkt_insert_u8(loc, ICE_IPV6_HLIM_OFFSET, input->ip.v6.hlim); + ice_pkt_insert_u8(loc, ICE_IPV6_PROTO_OFFSET, + input->ip.v6.proto); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_FRAG_IPV4: + ice_pkt_insert_u32(loc, ICE_IPV4_DST_ADDR_OFFSET, + input->ip.v4.src_ip); + ice_pkt_insert_u32(loc, ICE_IPV4_SRC_ADDR_OFFSET, + input->ip.v4.dst_ip); + ice_pkt_insert_u8(loc, ICE_IPV4_TOS_OFFSET, input->ip.v4.tos); + ice_pkt_insert_u16(loc, ICE_IPV4_ID_OFFSET, + input->ip.v4.packet_id); + 
ice_pkt_insert_u8(loc, ICE_IPV4_TTL_OFFSET, input->ip.v4.ttl); + ice_pkt_insert_u8(loc, ICE_IPV4_PROTO_OFFSET, + input->ip.v4.proto); + ice_pkt_insert_mac_addr(loc, input->ext_data.dst_mac); + break; + case ICE_FLTR_PTYPE_FRAG_IPV6: + ice_pkt_insert_u32(loc, ICE_IPV6_ID_OFFSET, + input->ip.v6.packet_id); + break; + default: + return ICE_ERR_PARAM; + } + + if (input->flex_fltr) + ice_pkt_insert_u16(loc, input->flex_offset, input->flex_word); + + return 0; +} + +/** + * ice_fdir_get_prgm_pkt - generate a training packet + * @input: flow director filter data structure + * @pkt: pointer to return filter packet + * @frag: generate a fragment packet + */ +enum ice_status +ice_fdir_get_prgm_pkt(struct ice_fdir_fltr *input, u8 *pkt, bool frag) +{ + return ice_fdir_get_gen_prgm_pkt(NULL, input, pkt, frag, false); +} + +/** + * ice_fdir_has_frag - does flow type have 2 ptypes + * @flow: flow ptype + * + * returns true if there is a fragment packet for this ptype + */ +bool ice_fdir_has_frag(enum ice_fltr_ptype flow) +{ + if (flow == ICE_FLTR_PTYPE_FRAG_IPV4 || + flow == ICE_FLTR_PTYPE_FRAG_IPV6) + return true; + else + return false; +} + +/** + * ice_fdir_find_fltr_by_idx - find filter with idx + * @hw: pointer to hardware structure + * @fltr_idx: index to find. + * + * Returns pointer to filter if found, or NULL + */ +struct ice_fdir_fltr * +ice_fdir_find_fltr_by_idx(struct ice_hw *hw, u32 fltr_idx) +{ + struct ice_fdir_fltr *rule; + + list_for_each_entry(rule, &hw->fdir_list_head, fltr_node) { + /* rule ID found in the list */ + if (fltr_idx == rule->fltr_id) + return rule; + if (fltr_idx < rule->fltr_id) + break; + } + return NULL; +} + +/** + * ice_fdir_list_add_fltr - add a new node to the flow director filter list + * @hw: hardware structure + * @fltr: filter node to add to structure + */ +void ice_fdir_list_add_fltr(struct ice_hw *hw, struct ice_fdir_fltr *fltr) +{ + struct ice_fdir_fltr *rule, *parent = NULL; + + list_for_each_entry(rule, &hw->fdir_list_head, fltr_node) { + /* rule ID found or pass its spot in the list */ + if (rule->fltr_id >= fltr->fltr_id) + break; + parent = rule; + } + + if (parent) + list_add(&fltr->fltr_node, &parent->fltr_node); + else + list_add(&fltr->fltr_node, &hw->fdir_list_head); +} + +/** + * ice_fdir_update_cntrs - increment / decrement filter counter + * @hw: pointer to hardware structure + * @flow: filter flow type + * @acl_fltr: true indicates an ACL filter + * @add: true implies filters added + */ +void +ice_fdir_update_cntrs(struct ice_hw *hw, enum ice_fltr_ptype flow, + bool acl_fltr, bool add) +{ + int incr; + + incr = add ? 
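ice_fdir_find_fltr_by_idx() and ice_fdir_list_add_fltr() rely on the same invariant: the per-PF filter list stays sorted by fltr_id, so both walks can stop as soon as they pass the requested ID. A minimal userspace sketch of that sorted-insert/early-exit pairing on a singly linked list (types and names invented for the example):

	/*
	 * Illustrative sketch (not part of the patch): sorted insert keyed by
	 * fltr_id, mirroring ice_fdir_list_add_fltr(); the lookup then bails
	 * out as soon as it walks past the requested ID.
	 */
	#include <stddef.h>
	#include <stdio.h>

	struct fltr {
		unsigned int fltr_id;
		struct fltr *next;
	};

	static void add_sorted(struct fltr **head, struct fltr *n)
	{
		struct fltr **pos = head;

		/* walk until the next node's ID is >= ours, then splice in */
		while (*pos && (*pos)->fltr_id < n->fltr_id)
			pos = &(*pos)->next;
		n->next = *pos;
		*pos = n;
	}

	static struct fltr *find(struct fltr *head, unsigned int id)
	{
		for (; head; head = head->next) {
			if (head->fltr_id == id)
				return head;
			if (head->fltr_id > id)	/* sorted: stop early */
				break;
		}
		return NULL;
	}

	int main(void)
	{
		struct fltr a = { 7, NULL }, b = { 3, NULL }, c = { 5, NULL };
		struct fltr *head = NULL;

		add_sorted(&head, &a);
		add_sorted(&head, &b);
		add_sorted(&head, &c);
		printf("found 5: %s\n", find(head, 5) ? "yes" : "no");
		return 0;
	}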
1 : -1; + hw->fdir_active_fltr += incr; + if (flow == ICE_FLTR_PTYPE_NONF_NONE || flow >= ICE_FLTR_PTYPE_MAX) { + ice_debug(hw, ICE_DBG_SW, "Unknown filter type %d\n", flow); + } else { + if (acl_fltr) + hw->acl_fltr_cnt[flow] += incr; + else + hw->fdir_fltr_cnt[flow] += incr; + } +} + +/** + * ice_cmp_ipv6_addr - compare two IPv6 addresses + * @a: IPv6 address + * @b: IPv6 address + * + * Returns 0 if equal, non-zero if different + */ +static int ice_cmp_ipv6_addr(__be32 *a, __be32 *b) +{ + return memcmp(a, b, 4 * sizeof(__be32)); +} + +/** + * ice_fdir_comp_rules - compare two filters + * @a: a Flow Director filter data structure + * @b: a Flow Director filter data structure + * @v6: bool true if v6 filter + * + * Returns true if the filters match + */ +static bool +ice_fdir_comp_rules(struct ice_fdir_fltr *a, struct ice_fdir_fltr *b, bool v6) +{ + enum ice_fltr_ptype flow_type = a->flow_type; + + /* The calling function already checks that the two filters have the + * same flow_type. + */ + if (!v6) { + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_SCTP) { + if (a->ip.v4.dst_ip == b->ip.v4.dst_ip && + a->ip.v4.src_ip == b->ip.v4.src_ip && + a->ip.v4.dst_port == b->ip.v4.dst_port && + a->ip.v4.src_port == b->ip.v4.src_port) + return true; + } else if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_OTHER) { + if (a->ip.v4.dst_ip == b->ip.v4.dst_ip && + a->ip.v4.src_ip == b->ip.v4.src_ip && + a->ip.v4.l4_header == b->ip.v4.l4_header && + a->ip.v4.proto == b->ip.v4.proto && + a->ip.v4.ip_ver == b->ip.v4.ip_ver && + a->ip.v4.tos == b->ip.v4.tos) + return true; + } + } else { + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV6_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_SCTP) { + if (a->ip.v6.dst_port == b->ip.v6.dst_port && + a->ip.v6.src_port == b->ip.v6.src_port && + !ice_cmp_ipv6_addr(a->ip.v6.dst_ip, + b->ip.v6.dst_ip) && + !ice_cmp_ipv6_addr(a->ip.v6.src_ip, + b->ip.v6.src_ip)) + return true; + } else if (flow_type == ICE_FLTR_PTYPE_NONF_IPV6_OTHER) { + if (a->ip.v6.dst_port == b->ip.v6.dst_port && + a->ip.v6.src_port == b->ip.v6.src_port) + return true; + } + } + + return false; +} + +/** + * ice_fdir_is_dup_fltr - test if filter is already in list for PF + * @hw: hardware data structure + * @input: Flow Director filter data structure + * + * Returns true if the filter is found in the list + */ +bool ice_fdir_is_dup_fltr(struct ice_hw *hw, struct ice_fdir_fltr *input) +{ + struct ice_fdir_fltr *rule; + bool ret = false; + + list_for_each_entry(rule, &hw->fdir_list_head, fltr_node) { + enum ice_fltr_ptype flow_type; + + if (rule->flow_type != input->flow_type) + continue; + + flow_type = input->flow_type; + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_SCTP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_OTHER) + ret = ice_fdir_comp_rules(rule, input, false); + else + ret = ice_fdir_comp_rules(rule, input, true); + if (ret) { + if (rule->fltr_id == input->fltr_id && + rule->q_index != input->q_index) + ret = false; + else + break; + } + } + + return ret; +} + +/** + * ice_clear_vsi_fd_table - admin command to clear FD table for a VSI + * @hw: hardware data structure + * @vsi_num: vsi_num (HW VSI num) + * + * Clears FD table entries by issuing admin command (direct, 0x0B06). + * A valid vsi_num, as returned by "AddVSI", must be passed. 
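ice_fdir_comp_rules() compares only the fields a given flow type actually filters on: the 4-tuple for TCP/UDP/SCTP, and addresses plus protocol, TOS, and the L4 header word for the "other" types. A toy sketch of that tuple comparison (struct and names invented for the example):

	/*
	 * Illustrative sketch (not part of the patch): duplicate detection
	 * compares only the fields a flow type matches on, in the spirit of
	 * ice_fdir_comp_rules().
	 */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct tuple4 {
		uint32_t src_ip, dst_ip;
		uint16_t src_port, dst_port;
	};

	static bool tuple4_equal(const struct tuple4 *a, const struct tuple4 *b)
	{
		return a->src_ip == b->src_ip && a->dst_ip == b->dst_ip &&
		       a->src_port == b->src_port && a->dst_port == b->dst_port;
	}

	int main(void)
	{
		struct tuple4 a = { 0x0a000001, 0x0a000002, 1024, 80 };
		struct tuple4 b = a;	/* identical rule: a duplicate */

		printf("duplicate: %s\n", tuple4_equal(&a, &b) ? "yes" : "no");
		return 0;
	}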
+ */ +enum ice_status ice_clear_vsi_fd_table(struct ice_hw *hw, u16 vsi_num) +{ + struct ice_aqc_clear_fd_table *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.clear_fd_table; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_fd_table); + cmd->clear_type = CL_FD_VM_VF_TYPE_VSI_IDX; + + cmd->vsi_index = cpu_to_le16(vsi_num); + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} + +/** + * ice_clear_pf_fd_table - admin command to clear FD table for PF + * @hw: hardware data structure + * + * Clears FD table entries for a PF by issuing admin command (direct, 0x0B06) + */ +enum ice_status ice_clear_pf_fd_table(struct ice_hw *hw) +{ + struct ice_aqc_clear_fd_table *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.clear_fd_table; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_fd_table); + cmd->clear_type = CL_FD_VM_VF_TYPE_PF_IDX; + /* vsi_index must be 0 to clear FD table for a PF */ + cmd->vsi_index = cpu_to_le16(0); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.h b/drivers/net/ethernet/intel/ice/ice_fdir.h new file mode 100644 index 0000000000000000000000000000000000000000..7bdf45458fbbec59d165374d5c6eaca72398f925 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fdir.h @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_FDIR_H_ +#define _ICE_FDIR_H_ + +#include "ice_common.h" + + +#define ICE_FDIR_GTPU_IP_INNER_PKT_OFF 50 +#define ICE_FDIR_GTPU_EH_INNER_PKT_OFF 58 + +#define ICE_FDIR_TUN_PKT_OFF 50 +#define ICE_FDIR_MAX_RAW_PKT_SIZE (512 + ICE_FDIR_TUN_PKT_OFF) +#define ICE_FDIR_BUF_FULL_MARGIN 10 + +/* macros for offsets into packets for flow director programming */ +#define ICE_IPV4_SRC_ADDR_OFFSET 26 +#define ICE_IPV4_DST_ADDR_OFFSET 30 +#define ICE_IPV4_TCP_SRC_PORT_OFFSET 34 +#define ICE_IPV4_TCP_DST_PORT_OFFSET 36 +#define ICE_IPV4_UDP_SRC_PORT_OFFSET 34 +#define ICE_IPV4_UDP_DST_PORT_OFFSET 36 +#define ICE_IPV4_SCTP_SRC_PORT_OFFSET 34 +#define ICE_IPV4_SCTP_DST_PORT_OFFSET 36 +#define ICE_IPV4_PROTO_OFFSET 23 +#define ICE_IPV6_SRC_ADDR_OFFSET 22 +#define ICE_IPV6_DST_ADDR_OFFSET 38 +#define ICE_IPV6_TCP_SRC_PORT_OFFSET 54 +#define ICE_IPV6_TCP_DST_PORT_OFFSET 56 +#define ICE_IPV6_UDP_SRC_PORT_OFFSET 54 +#define ICE_IPV6_UDP_DST_PORT_OFFSET 56 +#define ICE_IPV6_SCTP_SRC_PORT_OFFSET 54 +#define ICE_IPV6_SCTP_DST_PORT_OFFSET 56 + +#define ICE_MAC_ETHTYPE_OFFSET 12 +#define ICE_IPV4_TOS_OFFSET 15 +#define ICE_IPV4_ID_OFFSET 18 +#define ICE_IPV4_TTL_OFFSET 22 +#define ICE_IPV6_TC_OFFSET 14 +#define ICE_IPV6_HLIM_OFFSET 21 +#define ICE_IPV6_PROTO_OFFSET 20 +#define ICE_IPV6_ID_OFFSET 58 +/* For TUN inner (without inner MAC) */ +#define ICE_IPV4_NO_MAC_TOS_OFFSET 1 +#define ICE_IPV4_NO_MAC_TTL_OFFSET 8 +#define ICE_IPV4_NO_MAC_PROTO_OFFSET 9 +#define ICE_IPV4_NO_MAC_SRC_ADDR_OFFSET 12 +#define ICE_IPV4_NO_MAC_DST_ADDR_OFFSET 16 +#define ICE_TCP4_NO_MAC_SRC_PORT_OFFSET 20 +#define ICE_TCP4_NO_MAC_DST_PORT_OFFSET 22 +#define ICE_UDP4_NO_MAC_SRC_PORT_OFFSET 20 +#define ICE_UDP4_NO_MAC_DST_PORT_OFFSET 22 +#define ICE_IPV6_NO_MAC_TC_OFFSET 0 +#define ICE_IPV6_NO_MAC_HLIM_OFFSET 7 +#define ICE_IPV6_NO_MAC_PROTO_OFFSET 6 +#define ICE_IPV6_NO_MAC_SRC_ADDR_OFFSET 8 +#define ICE_IPV6_NO_MAC_DST_ADDR_OFFSET 24 +#define ICE_TCP6_NO_MAC_SRC_PORT_OFFSET 40 +#define ICE_TCP6_NO_MAC_DST_PORT_OFFSET 42 +#define ICE_UDP6_NO_MAC_SRC_PORT_OFFSET 40 +#define ICE_UDP6_NO_MAC_DST_PORT_OFFSET 42 +#define ICE_IPV4_GTPU_TEID_OFFSET 46 +#define 
ICE_IPV4_GTPU_QFI_OFFSET 56 +#define ICE_IPV6_GTPU_TEID_OFFSET 66 +#define ICE_IPV6_GTPU_QFI_OFFSET 76 +#define ICE_IPV4_L2TPV3_SESS_ID_OFFSET 34 +#define ICE_IPV6_L2TPV3_SESS_ID_OFFSET 54 +#define ICE_IPV4_ESP_SPI_OFFSET 34 +#define ICE_IPV6_ESP_SPI_OFFSET 54 +#define ICE_IPV4_AH_SPI_OFFSET 38 +#define ICE_IPV6_AH_SPI_OFFSET 58 +#define ICE_IPV4_NAT_T_ESP_SPI_OFFSET 42 +#define ICE_IPV6_NAT_T_ESP_SPI_OFFSET 62 +#define ICE_IPV4_VXLAN_VNI_OFFSET 45 +#define ICE_ECPRI_TP0_PC_ID_OFFSET 18 +#define ICE_IPV4_UDP_ECPRI_TP0_PC_ID_OFFSET 46 + +#define ICE_FDIR_MAX_FLTRS 16384 + +/* IPv4 has two flag bits that affect fragment processing: DF and MF. DF + * requests that the packet not be fragmented. MF indicates that a packet + * has been fragmented; the exception is the last fragment, which has a + * non-zero Fragment Offset field and the MF flag cleared. + */ +#define ICE_FDIR_IPV4_PKT_FLAG_MF 0x20 +#define ICE_FDIR_IPV4_PKT_FLAG_MF_SHIFT 8 +#define ICE_FDIR_IPV4_PKT_FLAG_DF 0x40 + +/* For IPv6 fragmented packets, all fragments except the last have + * the MF flag set. + */ +#define ICE_FDIR_IPV6_PKT_FLAG_MF 0x100 +#define ICE_FDIR_IPV6_PKT_FLAG_MF_SHIFT 8 + +enum ice_fltr_prgm_desc_dest { + ICE_FLTR_PRGM_DESC_DEST_DROP_PKT, + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QINDEX, + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QGROUP, + ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_OTHER, +}; + +enum ice_fltr_prgm_desc_fd_status { + ICE_FLTR_PRGM_DESC_FD_STATUS_NONE, + ICE_FLTR_PRGM_DESC_FD_STATUS_FD_ID, + ICE_FLTR_PRGM_DESC_FD_STATUS_FD_ID_4FLEX_BYTES, + ICE_FLTR_PRGM_DESC_FD_STATUS_8FLEX_BYTES, +}; + +/* Flow Director (FD) Filter Programming descriptor */ +struct ice_fd_fltr_desc_ctx { + u32 fdid; + u16 qindex; + u16 cnt_index; + u16 fd_vsi; + u16 flex_val; + u8 comp_q; + u8 comp_report; + u8 fd_space; + u8 cnt_ena; + u8 evict_ena; + u8 toq; + u8 toq_prio; + u8 dpu_recipe; + u8 drop; + u8 flex_prio; + u8 flex_mdid; + u8 dtype; + u8 pcmd; + u8 desc_prof_prio; + u8 desc_prof; + u8 swap; + u8 fdid_prio; + u8 fdid_mdid; +}; + +#define ICE_FLTR_PRGM_FLEX_WORD_SIZE sizeof(__be16) + +struct ice_rx_flow_userdef { + u16 flex_word; + u16 flex_offset; + u16 flex_fltr; +}; + +struct ice_fdir_v4 { + __be32 dst_ip; + __be32 src_ip; + __be16 dst_port; + __be16 src_port; + __be32 l4_header; + __be32 sec_parm_idx; /* security parameter index */ + u8 tos; + u8 ip_ver; + u8 proto; + u8 ttl; + __be16 packet_id; +}; + +#define ICE_IPV6_ADDR_LEN_AS_U32 4 + +struct ice_fdir_v6 { + __be32 dst_ip[ICE_IPV6_ADDR_LEN_AS_U32]; + __be32 src_ip[ICE_IPV6_ADDR_LEN_AS_U32]; + __be16 dst_port; + __be16 src_port; + __be32 l4_header; /* next header */ + __be32 sec_parm_idx; /* security parameter index */ + u8 tc; + u8 proto; + u8 hlim; + __be32 packet_id; +}; + +struct ice_fdir_udp_gtp { + u8 flags; + u8 msg_type; + __be16 rsrvd_len; + __be32 teid; + __be16 rsrvd_seq_nbr; + u8 rsrvd_n_pdu_nbr; + u8 rsrvd_next_ext_type; + u8 rsvrd_ext_len; + u8 pdu_type:4, + spare:4; + u8 ppp:1, + rqi:1, + qfi:6; + u32 rsvrd; + u8 next_ext; +}; + +struct ice_fdir_l2tpv3 { + __be32 session_id; +}; + +struct ice_fdir_udp_vxlan { + __be32 vni; /* 8 bits reserved, always zero */ +}; + +struct ice_fdir_ecpri { + __be16 pc_id; +}; + +struct ice_fdir_extra { + u8 dst_mac[ETH_ALEN]; /* dest MAC address */ + u8 src_mac[ETH_ALEN]; /* src MAC address */ + __be16 ether_type; /* for NON_IP_L2 */ + u32 usr_def[2]; /* user data */ + __be16 vlan_type; /* VLAN ethertype */ + __be16 vlan_tag; /* VLAN tag info */ +}; + +struct ice_fdir_fltr { + struct list_head fltr_node; + enum ice_fltr_ptype flow_type; + + union { 
+ struct ice_fdir_v4 v4; + struct ice_fdir_v6 v6; + } ip, mask; + + /* for tunnel outer part */ + union { + struct ice_fdir_v4 v4; + struct ice_fdir_v6 v6; + } ip_outer, mask_outer; + + struct ice_fdir_extra ext_data_outer; + struct ice_fdir_extra ext_mask_outer; + + struct ice_fdir_udp_vxlan vxlan_data; + struct ice_fdir_udp_vxlan vxlan_mask; + + struct ice_fdir_udp_gtp gtpu_data; + struct ice_fdir_udp_gtp gtpu_mask; + + struct ice_fdir_l2tpv3 l2tpv3_data; + struct ice_fdir_l2tpv3 l2tpv3_mask; + + struct ice_fdir_ecpri ecpri_data; + struct ice_fdir_ecpri ecpri_mask; + + struct ice_fdir_extra ext_data; + struct ice_fdir_extra ext_mask; + + /* flex byte filter data */ + __be16 flex_word; + /* queue region size (=2^q_region) */ + u8 q_region; + u16 flex_offset; + u16 flex_fltr; + + /* filter control */ + u16 q_index; + u16 orig_q_index; + u16 dest_vsi; + u8 dest_ctl; + u8 cnt_ena; + u8 fltr_status; + u16 cnt_index; + u32 fltr_id; + u8 fdid_prio; + u8 comp_report; + /* Set to true for an ACL filter */ + bool acl_fltr; +}; + +/* Dummy packet filter definition structure */ +struct ice_fdir_base_pkt { + enum ice_fltr_ptype flow; + u16 pkt_len; + const u8 *pkt; + u16 tun_pkt_len; + const u8 *tun_pkt; +}; + +enum ice_status ice_alloc_fd_res_cntr(struct ice_hw *hw, u16 *cntr_id); +enum ice_status ice_free_fd_res_cntr(struct ice_hw *hw, u16 cntr_id); +void +ice_set_fd_desc_val(struct ice_fd_fltr_desc_ctx *fd_fltr_ctx, + struct ice_fltr_desc *fdir_desc); +void ice_set_dflt_val_fd_desc(struct ice_fd_fltr_desc_ctx *fd_fltr_ctx); +enum ice_status +ice_alloc_fd_guar_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr); +enum ice_status +ice_free_fd_guar_item(struct ice_hw *hw, u16 cntr_id, u16 num_fltr); +enum ice_status +ice_alloc_fd_shrd_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr); +enum ice_status +ice_free_fd_shrd_item(struct ice_hw *hw, u16 cntr_id, u16 num_fltr); +enum ice_status ice_clear_vsi_fd_table(struct ice_hw *hw, u16 vsi_num); +enum ice_status ice_clear_pf_fd_table(struct ice_hw *hw); +void +ice_fdir_get_prgm_desc(struct ice_hw *hw, struct ice_fdir_fltr *input, + struct ice_fltr_desc *fdesc, bool add); +enum ice_status +ice_fdir_get_gen_prgm_pkt(struct ice_hw *hw, struct ice_fdir_fltr *input, + u8 *pkt, bool frag, bool tun); +enum ice_status +ice_fdir_get_prgm_pkt(struct ice_fdir_fltr *input, u8 *pkt, bool frag); +int ice_get_fdir_cnt_all(struct ice_hw *hw); +bool ice_fdir_is_dup_fltr(struct ice_hw *hw, struct ice_fdir_fltr *input); +bool ice_fdir_has_frag(enum ice_fltr_ptype flow); +struct ice_fdir_fltr * +ice_fdir_find_fltr_by_idx(struct ice_hw *hw, u32 fltr_idx); +void +ice_fdir_update_cntrs(struct ice_hw *hw, enum ice_fltr_ptype flow, + bool acl_fltr, bool add); +void ice_fdir_list_add_fltr(struct ice_hw *hw, struct ice_fdir_fltr *input); +#endif /* _ICE_FDIR_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c index 6cfe8eb7f47dcfc3b52dd027e5d084b4a7105a03..b98e4984f7ffce153635c81197538a4e920e2284 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c +++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c @@ -1,8 +1,110 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_common.h" #include "ice_flex_pipe.h" +#include "ice_protocol_type.h" +#include "ice_flow.h" + + + +/* For supporting double VLAN mode, it is necessary to enable or disable certain + * boost tcam entries. 
The metadata label names that match the following + * prefixes will be saved to allow enabling double VLAN mode. + */ +#define ICE_DVM_PRE "BOOST_MAC_VLAN_DVM" /* enable these entries */ +#define ICE_SVM_PRE "BOOST_MAC_VLAN_SVM" /* disable these entries */ + +/* To support tunneling entries by PF, the package will append the PF number to + * the label; for example TNL_VXLAN_PF0, TNL_VXLAN_PF1, TNL_VXLAN_PF2, etc. + */ +#define ICE_TNL_PRE "TNL_" +static const struct ice_tunnel_type_scan tnls[] = { + { TNL_VXLAN, "TNL_VXLAN_PF" }, + { TNL_GENEVE, "TNL_GENEVE_PF" }, + { TNL_ECPRI, "TNL_UDP_ECPRI_PF" }, + { TNL_LAST, "" } +}; + +static const u32 ice_sect_lkup[ICE_BLK_COUNT][ICE_SECT_COUNT] = { + /* SWITCH */ + { + ICE_SID_XLT0_SW, + ICE_SID_XLT_KEY_BUILDER_SW, + ICE_SID_XLT1_SW, + ICE_SID_XLT2_SW, + ICE_SID_PROFID_TCAM_SW, + ICE_SID_PROFID_REDIR_SW, + ICE_SID_FLD_VEC_SW, + ICE_SID_CDID_KEY_BUILDER_SW, + ICE_SID_CDID_REDIR_SW + }, + + /* ACL */ + { + ICE_SID_XLT0_ACL, + ICE_SID_XLT_KEY_BUILDER_ACL, + ICE_SID_XLT1_ACL, + ICE_SID_XLT2_ACL, + ICE_SID_PROFID_TCAM_ACL, + ICE_SID_PROFID_REDIR_ACL, + ICE_SID_FLD_VEC_ACL, + ICE_SID_CDID_KEY_BUILDER_ACL, + ICE_SID_CDID_REDIR_ACL + }, + + /* FD */ + { + ICE_SID_XLT0_FD, + ICE_SID_XLT_KEY_BUILDER_FD, + ICE_SID_XLT1_FD, + ICE_SID_XLT2_FD, + ICE_SID_PROFID_TCAM_FD, + ICE_SID_PROFID_REDIR_FD, + ICE_SID_FLD_VEC_FD, + ICE_SID_CDID_KEY_BUILDER_FD, + ICE_SID_CDID_REDIR_FD + }, + + /* RSS */ + { + ICE_SID_XLT0_RSS, + ICE_SID_XLT_KEY_BUILDER_RSS, + ICE_SID_XLT1_RSS, + ICE_SID_XLT2_RSS, + ICE_SID_PROFID_TCAM_RSS, + ICE_SID_PROFID_REDIR_RSS, + ICE_SID_FLD_VEC_RSS, + ICE_SID_CDID_KEY_BUILDER_RSS, + ICE_SID_CDID_REDIR_RSS + }, + + /* PE */ + { + ICE_SID_XLT0_PE, + ICE_SID_XLT_KEY_BUILDER_PE, + ICE_SID_XLT1_PE, + ICE_SID_XLT2_PE, + ICE_SID_PROFID_TCAM_PE, + ICE_SID_PROFID_REDIR_PE, + ICE_SID_FLD_VEC_PE, + ICE_SID_CDID_KEY_BUILDER_PE, + ICE_SID_CDID_REDIR_PE + } +}; + +/** + * ice_sect_id - returns section ID + * @blk: block type + * @sect: section type + * + * This helper function returns the proper section ID given a block type and a + * section type. + */ +static u32 ice_sect_id(enum ice_block blk, enum ice_sect sect) +{ + return ice_sect_lkup[blk][sect]; +} /** * ice_pkg_val_buf @@ -159,1101 +261,3869 @@ ice_pkg_enum_section(struct ice_seg *ice_seg, struct ice_pkg_enum *state, } /** - * ice_acquire_global_cfg_lock - * @hw: pointer to the HW structure - * @access: access type (read or write) + * ice_pkg_enum_entry + * @ice_seg: pointer to the ice segment (or NULL on subsequent calls) + * @state: pointer to the enum state + * @sect_type: section type to enumerate + * @offset: pointer to variable that receives the offset in the table (optional) + * @handler: function that handles access to the entries in the section type * - * This function will request ownership of the global config lock for reading - * or writing of the package. When attempting to obtain write access, the - * caller must check for the following two return values: + * This function will enumerate all the entries in a particular section type in + * the ice segment. The first call is made with the ice_seg parameter non-NULL; + * on subsequent calls, ice_seg is set to NULL which continues the enumeration. + * When the function returns a NULL pointer, the end of the entries has + * been reached. * - * ICE_SUCCESS - Means the caller has acquired the global config lock - * and can perform writing of the package. 
- * ICE_ERR_AQ_NO_WORK - Indicates another driver has already written the - * package or has found that no update was necessary; in - * this case, the caller can just skip performing any - * update of the package. + * Since each section may have a different header and entry size, the handler + * function is needed to determine the number and location of entries in each + * section. + * + * The offset parameter is optional, but should be used for sections that + * contain an offset for each section table. For such cases, the section handler + * function must return the appropriate offset + index to give the absolute + * offset for each entry. For example, if the base for a section's header + * indicates a base offset of 10, and the index for the entry is 2, then the + * section handler function should set the offset to 10 + 2 = 12. */ -static enum ice_status -ice_acquire_global_cfg_lock(struct ice_hw *hw, - enum ice_aq_res_access_type access) +static void * +ice_pkg_enum_entry(struct ice_seg *ice_seg, struct ice_pkg_enum *state, + u32 sect_type, u32 *offset, + void *(*handler)(u32 sect_type, void *section, + u32 index, u32 *offset)) { - enum ice_status status; + void *entry; - status = ice_acquire_res(hw, ICE_GLOBAL_CFG_LOCK_RES_ID, access, - ICE_GLOBAL_CFG_LOCK_TIMEOUT); + if (ice_seg) { + if (!handler) + return NULL; - if (!status) - mutex_lock(&ice_global_cfg_lock_sw); - else if (status == ICE_ERR_AQ_NO_WORK) - ice_debug(hw, ICE_DBG_PKG, - "Global config lock: No work to do\n"); + if (!ice_pkg_enum_section(ice_seg, state, sect_type)) + return NULL; - return status; + state->entry_idx = 0; + state->handler = handler; + } else { + state->entry_idx++; + } + + if (!state->handler) + return NULL; + + /* get entry */ + entry = state->handler(state->sect_type, state->sect, state->entry_idx, + offset); + if (!entry) { + /* end of a section, look for another section of this type */ + if (!ice_pkg_enum_section(NULL, state, 0)) + return NULL; + + state->entry_idx = 0; + entry = state->handler(state->sect_type, state->sect, + state->entry_idx, offset); + } + + return entry; } /** - * ice_release_global_cfg_lock + * ice_hw_ptype_ena - check if the PTYPE is enabled or not * @hw: pointer to the HW structure - * - * This function will release the global config lock. + * @ptype: the hardware PTYPE */ -static void ice_release_global_cfg_lock(struct ice_hw *hw) +bool ice_hw_ptype_ena(struct ice_hw *hw, u16 ptype) { - mutex_unlock(&ice_global_cfg_lock_sw); - ice_release_res(hw, ICE_GLOBAL_CFG_LOCK_RES_ID); + return ptype < ICE_FLOW_PTYPE_MAX && + test_bit(ptype, hw->hw_ptype); } /** - * ice_aq_download_pkg - * @hw: pointer to the hardware structure - * @pkg_buf: the package buffer to transfer - * @buf_size: the size of the package buffer - * @last_buf: last buffer indicator - * @error_offset: returns error offset - * @error_info: returns error information - * @cd: pointer to command details structure or NULL + * ice_marker_ptype_tcam_handler + * @sect_type: section type + * @section: pointer to section + * @index: index of the Marker PType TCAM entry to be returned + * @offset: pointer to receive absolute offset, always 0 for ptype TCAM sections * - * Download Package (0x0C40) + * This is a callback function that can be passed to ice_pkg_enum_entry. + * Handles enumeration of individual Marker PType TCAM entries. 
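ice_pkg_enum_entry() follows the same start/continue protocol as ice_pkg_enum_section(): the first call passes a non-NULL segment, every later call passes NULL, and a NULL return terminates the walk. A standalone sketch of that iterator shape (the state and entry types here are stand-ins, not the real package structures):

	/*
	 * Illustrative sketch (not part of the patch): the start/continue
	 * iterator protocol used by ice_pkg_enum_entry().
	 */
	#include <stddef.h>
	#include <stdio.h>

	struct enum_state {
		const int *tbl;
		size_t count, idx;
	};

	static const int *enum_entry(const int *tbl, size_t count,
				     struct enum_state *state)
	{
		if (tbl) {			/* first call: prime the state */
			state->tbl = tbl;
			state->count = count;
			state->idx = 0;
		} else {			/* later calls: advance */
			state->idx++;
		}
		return state->idx < state->count ? &state->tbl[state->idx] : NULL;
	}

	int main(void)
	{
		static const int entries[] = { 10, 20, 30 };
		struct enum_state st;
		const int *e;

		/* start with the table, continue with NULL, stop on NULL */
		for (e = enum_entry(entries, 3, &st); e; e = enum_entry(NULL, 0, &st))
			printf("entry %d\n", *e);
		return 0;
	}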
*/ -static enum ice_status -ice_aq_download_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, - u16 buf_size, bool last_buf, u32 *error_offset, - u32 *error_info, struct ice_sq_cd *cd) +static void * +ice_marker_ptype_tcam_handler(u32 sect_type, void *section, u32 index, + u32 *offset) { - struct ice_aqc_download_pkg *cmd; - struct ice_aq_desc desc; - enum ice_status status; + struct ice_marker_ptype_tcam_section *marker_ptype; - if (error_offset) - *error_offset = 0; - if (error_info) - *error_info = 0; + if (!section) + return NULL; - cmd = &desc.params.download_pkg; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_download_pkg); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + if (sect_type != ICE_SID_RXPARSER_MARKER_PTYPE) + return NULL; - if (last_buf) - cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF; + if (index > ICE_MAX_MARKER_PTYPE_TCAMS_IN_BUF) + return NULL; - status = ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); - if (status == ICE_ERR_AQ_ERROR) { - /* Read error from buffer only when the FW returned an error */ - struct ice_aqc_download_pkg_resp *resp; + if (offset) + *offset = 0; - resp = (struct ice_aqc_download_pkg_resp *)pkg_buf; - if (error_offset) - *error_offset = le32_to_cpu(resp->error_offset); - if (error_info) - *error_info = le32_to_cpu(resp->error_info); - } + marker_ptype = section; + if (index >= le16_to_cpu(marker_ptype->count)) + return NULL; - return status; + return marker_ptype->tcam + index; } /** - * ice_find_seg_in_pkg - * @hw: pointer to the hardware structure - * @seg_type: the segment type to search for (i.e., SEGMENT_TYPE_CPK) - * @pkg_hdr: pointer to the package header to be searched - * - * This function searches a package file for a particular segment type. On - * success it returns a pointer to the segment header, otherwise it will - * return NULL. 
+ * ice_fill_hw_ptype - fill the enabled PTYPE bit information + * @hw: pointer to the HW structure */ -static struct ice_generic_seg_hdr * -ice_find_seg_in_pkg(struct ice_hw *hw, u32 seg_type, - struct ice_pkg_hdr *pkg_hdr) +static void +ice_fill_hw_ptype(struct ice_hw *hw) { - u32 i; - - ice_debug(hw, ICE_DBG_PKG, "Package format version: %d.%d.%d.%d\n", - pkg_hdr->format_ver.major, pkg_hdr->format_ver.minor, - pkg_hdr->format_ver.update, pkg_hdr->format_ver.draft); - - /* Search all package segments for the requested segment type */ - for (i = 0; i < le32_to_cpu(pkg_hdr->seg_count); i++) { - struct ice_generic_seg_hdr *seg; + struct ice_marker_ptype_tcam_entry *tcam; + struct ice_seg *seg = hw->seg; + struct ice_pkg_enum state; - seg = (struct ice_generic_seg_hdr *) - ((u8 *)pkg_hdr + le32_to_cpu(pkg_hdr->seg_offset[i])); + bitmap_zero(hw->hw_ptype, ICE_FLOW_PTYPE_MAX); + if (!seg) + return; - if (le32_to_cpu(seg->seg_type) == seg_type) - return seg; - } + memset(&state, 0, sizeof(state)); - return NULL; + do { + tcam = ice_pkg_enum_entry(seg, &state, + ICE_SID_RXPARSER_MARKER_PTYPE, NULL, + ice_marker_ptype_tcam_handler); + if (tcam && + le16_to_cpu(tcam->addr) < ICE_MARKER_PTYPE_TCAM_ADDR_MAX && + le16_to_cpu(tcam->ptype) < ICE_FLOW_PTYPE_MAX) + set_bit(le16_to_cpu(tcam->ptype), hw->hw_ptype); + + seg = NULL; + } while (tcam); } /** - * ice_dwnld_cfg_bufs - * @hw: pointer to the hardware structure - * @bufs: pointer to an array of buffers - * @count: the number of buffers in the array + * ice_boost_tcam_handler + * @sect_type: section type + * @section: pointer to section + * @index: index of the boost TCAM entry to be returned + * @offset: pointer to receive absolute offset, always 0 for boost TCAM sections * - * Obtains global config lock and downloads the package configuration buffers - * to the firmware. Metadata buffers are skipped, and the first metadata buffer - * found indicates that the rest of the buffers are all metadata buffers. + * This is a callback function that can be passed to ice_pkg_enum_entry. + * Handles enumeration of individual boost TCAM entries. */ -static enum ice_status -ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count) +static void * +ice_boost_tcam_handler(u32 sect_type, void *section, u32 index, u32 *offset) { - enum ice_status status; - struct ice_buf_hdr *bh; - u32 offset, info, i; + struct ice_boost_tcam_section *boost; - if (!bufs || !count) - return ICE_ERR_PARAM; - - /* If the first buffer's first section has its metadata bit set - * then there are no buffers to be downloaded, and the operation is - * considered a success. 
- */ - bh = (struct ice_buf_hdr *)bufs; - if (le32_to_cpu(bh->section_entry[0].type) & ICE_METADATA_BUF) - return 0; + if (!section) + return NULL; - /* reset pkg_dwnld_status in case this function is called in the - * reset/rebuild flow - */ - hw->pkg_dwnld_status = ICE_AQ_RC_OK; + if (sect_type != ICE_SID_RXPARSER_BOOST_TCAM) + return NULL; - status = ice_acquire_global_cfg_lock(hw, ICE_RES_WRITE); - if (status) { - if (status == ICE_ERR_AQ_NO_WORK) - hw->pkg_dwnld_status = ICE_AQ_RC_EEXIST; - else - hw->pkg_dwnld_status = hw->adminq.sq_last_status; - return status; - } + if (index > ICE_MAX_BST_TCAMS_IN_BUF) + return NULL; - for (i = 0; i < count; i++) { - bool last = ((i + 1) == count); + if (offset) + *offset = 0; - if (!last) { - /* check next buffer for metadata flag */ - bh = (struct ice_buf_hdr *)(bufs + i + 1); + boost = section; + if (index >= le16_to_cpu(boost->count)) + return NULL; - /* A set metadata flag in the next buffer will signal - * that the current buffer will be the last buffer - * downloaded - */ - if (le16_to_cpu(bh->section_count)) - if (le32_to_cpu(bh->section_entry[0].type) & - ICE_METADATA_BUF) - last = true; - } + return boost->tcam + index; +} - bh = (struct ice_buf_hdr *)(bufs + i); +/** + * ice_find_boost_entry + * @ice_seg: pointer to the ice segment (non-NULL) + * @addr: Boost TCAM address of entry to search for + * @entry: returns pointer to the entry + * + * Finds a particular Boost TCAM entry and returns a pointer to that entry + * if it is found. The ice_seg parameter must not be NULL since the first call + * to ice_pkg_enum_entry requires a pointer to an actual ice_segment structure. + */ +static enum ice_status +ice_find_boost_entry(struct ice_seg *ice_seg, u16 addr, + struct ice_boost_tcam_entry **entry) +{ + struct ice_boost_tcam_entry *tcam; + struct ice_pkg_enum state; - status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, last, - &offset, &info, NULL); + memset(&state, 0, sizeof(state)); - /* Save AQ status from download package */ - hw->pkg_dwnld_status = hw->adminq.sq_last_status; - if (status) { - ice_debug(hw, ICE_DBG_PKG, - "Pkg download failed: err %d off %d inf %d\n", - status, offset, info); + if (!ice_seg) + return ICE_ERR_PARAM; - break; + do { + tcam = ice_pkg_enum_entry(ice_seg, &state, + ICE_SID_RXPARSER_BOOST_TCAM, NULL, + ice_boost_tcam_handler); + if (tcam && le16_to_cpu(tcam->addr) == addr) { + *entry = tcam; + return 0; } - if (last) - break; - } - - ice_release_global_cfg_lock(hw); + ice_seg = NULL; + } while (tcam); - return status; + *entry = NULL; + return ICE_ERR_CFG; } /** - * ice_aq_get_pkg_info_list - * @hw: pointer to the hardware structure - * @pkg_info: the buffer which will receive the information list - * @buf_size: the size of the pkg_info information buffer - * @cd: pointer to command details structure or NULL + * ice_label_enum_handler + * @sect_type: section type + * @section: pointer to section + * @index: index of the label entry to be returned + * @offset: pointer to receive absolute offset, always zero for label sections * - * Get Package Info List (0x0C43) + * This is a callback function that can be passed to ice_pkg_enum_entry. + * Handles enumeration of individual label entries. 
*/ -static enum ice_status -ice_aq_get_pkg_info_list(struct ice_hw *hw, - struct ice_aqc_get_pkg_info_resp *pkg_info, - u16 buf_size, struct ice_sq_cd *cd) +static void * +ice_label_enum_handler(u32 __always_unused sect_type, void *section, u32 index, + u32 *offset) { - struct ice_aq_desc desc; + struct ice_label_section *labels; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_pkg_info_list); + if (!section) + return NULL; - return ice_aq_send_cmd(hw, &desc, pkg_info, buf_size, cd); + if (index > ICE_MAX_LABELS_IN_BUF) + return NULL; + + if (offset) + *offset = 0; + + labels = section; + if (index >= le16_to_cpu(labels->count)) + return NULL; + + return labels->label + index; } /** - * ice_download_pkg - * @hw: pointer to the hardware structure - * @ice_seg: pointer to the segment of the package to be downloaded + * ice_enum_labels + * @ice_seg: pointer to the ice segment (NULL on subsequent calls) + * @type: the section type that will contain the label (0 on subsequent calls) + * @state: ice_pkg_enum structure that will hold the state of the enumeration + * @value: pointer to a value that will return the label's value if found * - * Handles the download of a complete package. + * Enumerates a list of labels in the package. The caller will call + * ice_enum_labels(ice_seg, type, ...) to start the enumeration, then call + * ice_enum_labels(NULL, 0, ...) to continue. When the function returns a NULL + * the end of the list has been reached. */ -static enum ice_status -ice_download_pkg(struct ice_hw *hw, struct ice_seg *ice_seg) +static char * +ice_enum_labels(struct ice_seg *ice_seg, u32 type, struct ice_pkg_enum *state, + u16 *value) { - struct ice_buf_table *ice_buf_tbl; + struct ice_label *label; - ice_debug(hw, ICE_DBG_PKG, "Segment version: %d.%d.%d.%d\n", - ice_seg->hdr.seg_ver.major, ice_seg->hdr.seg_ver.minor, - ice_seg->hdr.seg_ver.update, ice_seg->hdr.seg_ver.draft); + /* Check for valid label section on first call */ + if (type && !(type >= ICE_SID_LBL_FIRST && type <= ICE_SID_LBL_LAST)) + return NULL; - ice_debug(hw, ICE_DBG_PKG, "Seg: type 0x%X, size %d, name %s\n", - le32_to_cpu(ice_seg->hdr.seg_type), - le32_to_cpu(ice_seg->hdr.seg_size), ice_seg->hdr.seg_name); + label = ice_pkg_enum_entry(ice_seg, state, type, NULL, + ice_label_enum_handler); + if (!label) + return NULL; - ice_buf_tbl = ice_find_buf_table(ice_seg); + *value = le16_to_cpu(label->value); + return label->name; +} - ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n", - le32_to_cpu(ice_buf_tbl->buf_count)); +/** + * ice_add_tunnel_hint + * @hw: pointer to the HW structure + * @label_name: label text + * @val: value of the tunnel port boost entry + */ +static void ice_add_tunnel_hint(struct ice_hw *hw, char *label_name, u16 val) +{ + if (hw->tnl.count < ICE_TUNNEL_MAX_ENTRIES) { + u16 i; + + for (i = 0; tnls[i].type != TNL_LAST; i++) { + size_t len = strlen(tnls[i].label_prefix); + + /* Look for matching label start, before continuing */ + if (strncmp(label_name, tnls[i].label_prefix, len)) + continue; - return ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array, - le32_to_cpu(ice_buf_tbl->buf_count)); + /* Make sure this label matches our PF. Note that the PF + * character ('0' - '7') will be located where our + * prefix string's null terminator is located. 
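ice_add_tunnel_hint keys off exactly that layout: the label must begin with a known prefix, and the single character after the prefix is the owning PF number as an ASCII digit. A small sketch of just the string test, with a made-up "TNL_VXLAN_PF" prefix standing in for the real prefixes carried in the tnls[] table:

#include <stdio.h>
#include <string.h>

/* Hypothetical prefix; the driver's real prefixes come from tnls[]. */
#define DEMO_PREFIX "TNL_VXLAN_PF"

/* Return 1 when the label matches the prefix and names the given PF. */
static int label_matches_pf(const char *label, unsigned int pf_id)
{
	size_t len = strlen(DEMO_PREFIX);

	if (strncmp(label, DEMO_PREFIX, len))
		return 0;
	/* The PF digit sits where the prefix's NUL terminator would be. */
	return (label[len] - '0') == (int)pf_id;
}

int main(void)
{
	printf("%d\n", label_matches_pf("TNL_VXLAN_PF0", 0)); /* 1 */
	printf("%d\n", label_matches_pf("TNL_VXLAN_PF3", 0)); /* 0 */
	return 0;
}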
+ */ + if ((label_name[len] - '0') == hw->pf_id) { + hw->tnl.tbl[hw->tnl.count].type = tnls[i].type; + hw->tnl.tbl[hw->tnl.count].valid = false; + hw->tnl.tbl[hw->tnl.count].in_use = false; + hw->tnl.tbl[hw->tnl.count].marked = false; + hw->tnl.tbl[hw->tnl.count].boost_addr = val; + hw->tnl.tbl[hw->tnl.count].port = 0; + hw->tnl.count++; + break; + } + } + } } /** - * ice_init_pkg_info - * @hw: pointer to the hardware structure - * @pkg_hdr: pointer to the driver's package hdr + * ice_add_dvm_hint + * @hw: pointer to the HW structure + * @val: value of the boost entry + * @enable: true if entry needs to be enabled, or false if needs to be disabled + */ +static void ice_add_dvm_hint(struct ice_hw *hw, u16 val, bool enable) +{ + if (hw->dvm_upd.count < ICE_DVM_MAX_ENTRIES) { + hw->dvm_upd.tbl[hw->dvm_upd.count].boost_addr = val; + hw->dvm_upd.tbl[hw->dvm_upd.count].enable = enable; + hw->dvm_upd.count++; + } +} + +/** + * ice_init_pkg_hints + * @hw: pointer to the HW structure + * @ice_seg: pointer to the segment of the package scan (non-NULL) * - * Saves off the package details into the HW structure. + * This function will scan the package and save off relevant information + * (hints or metadata) for driver use. The ice_seg parameter must not be NULL + * since the first call to ice_enum_labels requires a pointer to an actual + * ice_seg structure. */ -static enum ice_status -ice_init_pkg_info(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) +static void ice_init_pkg_hints(struct ice_hw *hw, struct ice_seg *ice_seg) { - struct ice_global_metadata_seg *meta_seg; - struct ice_generic_seg_hdr *seg_hdr; + struct ice_pkg_enum state; + char *label_name; + u16 val; + int i; - if (!pkg_hdr) - return ICE_ERR_PARAM; + memset(&hw->tnl, 0, sizeof(hw->tnl)); + memset(&state, 0, sizeof(state)); - meta_seg = (struct ice_global_metadata_seg *) - ice_find_seg_in_pkg(hw, SEGMENT_TYPE_METADATA, pkg_hdr); - if (meta_seg) { - hw->pkg_ver = meta_seg->pkg_ver; - memcpy(hw->pkg_name, meta_seg->pkg_name, sizeof(hw->pkg_name)); + if (!ice_seg) + return; - ice_debug(hw, ICE_DBG_PKG, "Pkg: %d.%d.%d.%d, %s\n", - meta_seg->pkg_ver.major, meta_seg->pkg_ver.minor, - meta_seg->pkg_ver.update, meta_seg->pkg_ver.draft, - meta_seg->pkg_name); - } else { - ice_debug(hw, ICE_DBG_INIT, - "Did not find metadata segment in driver package\n"); - return ICE_ERR_CFG; + label_name = ice_enum_labels(ice_seg, ICE_SID_LBL_RXPARSER_TMEM, &state, + &val); + + while (label_name) { + if (!strncmp(label_name, ICE_TNL_PRE, strlen(ICE_TNL_PRE))) + /* check for a tunnel entry */ + ice_add_tunnel_hint(hw, label_name, val); + + /* check for a dvm mode entry */ + else if (!strncmp(label_name, ICE_DVM_PRE, strlen(ICE_DVM_PRE))) + ice_add_dvm_hint(hw, val, true); + + /* check for a svm mode entry */ + else if (!strncmp(label_name, ICE_SVM_PRE, strlen(ICE_SVM_PRE))) + ice_add_dvm_hint(hw, val, false); + + label_name = ice_enum_labels(NULL, 0, &state, &val); } - seg_hdr = ice_find_seg_in_pkg(hw, SEGMENT_TYPE_ICE, pkg_hdr); - if (seg_hdr) { - hw->ice_pkg_ver = seg_hdr->seg_ver; - memcpy(hw->ice_pkg_name, seg_hdr->seg_name, - sizeof(hw->ice_pkg_name)); - - ice_debug(hw, ICE_DBG_PKG, "Ice Pkg: %d.%d.%d.%d, %s\n", - seg_hdr->seg_ver.major, seg_hdr->seg_ver.minor, - seg_hdr->seg_ver.update, seg_hdr->seg_ver.draft, - seg_hdr->seg_name); - } else { - ice_debug(hw, ICE_DBG_INIT, - "Did not find ice segment in driver package\n"); - return ICE_ERR_CFG; + /* Cache the appropriate boost TCAM entry pointers for tunnels */ + for (i = 0; i < hw->tnl.count; i++) { + 
ice_find_boost_entry(ice_seg, hw->tnl.tbl[i].boost_addr, + &hw->tnl.tbl[i].boost_entry); + if (hw->tnl.tbl[i].boost_entry) + hw->tnl.tbl[i].valid = true; } - return 0; + /* Cache the appropriate boost TCAM entry pointers for DVM and SVM */ + for (i = 0; i < hw->dvm_upd.count; i++) + ice_find_boost_entry(ice_seg, hw->dvm_upd.tbl[i].boost_addr, + &hw->dvm_upd.tbl[i].boost_entry); } +/* Key creation */ + +#define ICE_DC_KEY 0x1 /* don't care */ +#define ICE_DC_KEYINV 0x1 +#define ICE_NM_KEY 0x0 /* never match */ +#define ICE_NM_KEYINV 0x0 +#define ICE_0_KEY 0x1 /* match 0 */ +#define ICE_0_KEYINV 0x0 +#define ICE_1_KEY 0x0 /* match 1 */ +#define ICE_1_KEYINV 0x1 + /** - * ice_get_pkg_info - * @hw: pointer to the hardware structure + * ice_gen_key_word - generate 16 bits of a key/mask word + * @val: the value + * @valid: valid bits mask (change only the valid bits) + * @dont_care: don't care mask + * @nvr_mtch: never match mask + * @key: pointer to the array where the resulting key portion will be stored + * @key_inv: pointer to the array where the resulting key invert portion will + * be stored * - * Store details of the package currently loaded in HW into the HW structure. + * This function generates 16 bits from an 8-bit value, an 8-bit don't care mask + * and an 8-bit never match mask. The 16 bits of output are divided into 8 bits + * of key and 8 bits of key invert. + * + * '0' = b01, always match a 0 bit + * '1' = b10, always match a 1 bit + * '?' = b11, don't care bit (always matches) + * '~' = b00, never match bit + * + * Input: + * val: b0 1 0 1 0 1 + * dont_care: b0 0 1 1 0 0 + * nvr_mtch: b0 0 0 0 1 1 + * ------------------------------ + * Result: key: b01 10 11 11 00 00 */ -static enum ice_status ice_get_pkg_info(struct ice_hw *hw) +static enum ice_status +ice_gen_key_word(u8 val, u8 valid, u8 dont_care, u8 nvr_mtch, u8 *key, + u8 *key_inv) { - struct ice_aqc_get_pkg_info_resp *pkg_info; - enum ice_status status; - u16 size; - u32 i; - - size = sizeof(*pkg_info) + (sizeof(pkg_info->pkg_info[0]) * - (ICE_PKG_CNT - 1)); - pkg_info = kzalloc(size, GFP_KERNEL); - if (!pkg_info) - return ICE_ERR_NO_MEMORY; - - status = ice_aq_get_pkg_info_list(hw, pkg_info, size, NULL); - if (status) - goto init_pkg_free_alloc; + u8 in_key = *key, in_key_inv = *key_inv; + u8 i; - for (i = 0; i < le32_to_cpu(pkg_info->count); i++) { -#define ICE_PKG_FLAG_COUNT 4 - char flags[ICE_PKG_FLAG_COUNT + 1] = { 0 }; - u8 place = 0; + /* 'dont_care' and 'nvr_mtch' masks cannot overlap */ + if ((dont_care ^ nvr_mtch) != (dont_care | nvr_mtch)) + return ICE_ERR_CFG; - if (pkg_info->pkg_info[i].is_active) { - flags[place++] = 'A'; - hw->active_pkg_ver = pkg_info->pkg_info[i].ver; - memcpy(hw->active_pkg_name, - pkg_info->pkg_info[i].name, - sizeof(hw->active_pkg_name)); - hw->active_pkg_in_nvm = pkg_info->pkg_info[i].is_in_nvm; + *key = 0; + *key_inv = 0; + + /* encode the 8 bits into 8-bit key and 8-bit key invert */ + for (i = 0; i < 8; i++) { + *key >>= 1; + *key_inv >>= 1; + + if (!(valid & 0x1)) { /* change only valid bits */ + *key |= (in_key & 0x1) << 7; + *key_inv |= (in_key_inv & 0x1) << 7; + } else if (dont_care & 0x1) { /* don't care bit */ + *key |= ICE_DC_KEY << 7; + *key_inv |= ICE_DC_KEYINV << 7; + } else if (nvr_mtch & 0x1) { /* never match bit */ + *key |= ICE_NM_KEY << 7; + *key_inv |= ICE_NM_KEYINV << 7; + } else if (val & 0x01) { /* exact 1 match */ + *key |= ICE_1_KEY << 7; + *key_inv |= ICE_1_KEYINV << 7; + } else { /* exact 0 match */ + *key |= ICE_0_KEY << 7; + *key_inv |= ICE_0_KEYINV << 7; } - if 
(pkg_info->pkg_info[i].is_active_at_boot) - flags[place++] = 'B'; - if (pkg_info->pkg_info[i].is_modified) - flags[place++] = 'M'; - if (pkg_info->pkg_info[i].is_in_nvm) - flags[place++] = 'N'; - ice_debug(hw, ICE_DBG_PKG, "Pkg[%d]: %d.%d.%d.%d,%s,%s\n", - i, pkg_info->pkg_info[i].ver.major, - pkg_info->pkg_info[i].ver.minor, - pkg_info->pkg_info[i].ver.update, - pkg_info->pkg_info[i].ver.draft, - pkg_info->pkg_info[i].name, flags); + dont_care >>= 1; + nvr_mtch >>= 1; + valid >>= 1; + val >>= 1; + in_key >>= 1; + in_key_inv >>= 1; } -init_pkg_free_alloc: - kfree(pkg_info); + return 0; +} - return status; +/** + * ice_bits_max_set - determine if the number of bits set is within a maximum + * @mask: pointer to the byte array which is the mask + * @size: the number of bytes in the mask + * @max: the max number of set bits + * + * This function determines if there are at most 'max' bits set in an array. + * Returns true if the number of bits set is <= max, or false otherwise. + */ +static bool ice_bits_max_set(const u8 *mask, u16 size, u16 max) +{ + u16 count = 0; + u16 i; + + /* check each byte */ + for (i = 0; i < size; i++) { + /* if 0, go to next byte */ + if (!mask[i]) + continue; + + /* We know there is at least one set bit in this byte because of + * the above check; if we already have found 'max' number of + * bits set, then we can return failure now. + */ + if (count == max) + return false; + + /* count the bits in this byte, checking threshold */ + count += hweight8(mask[i]); + if (count > max) + return false; + } + + return true; } /** - * ice_verify_pkg - verify package - * @pkg: pointer to the package buffer - * @len: size of the package buffer + * ice_set_key - generate a variable-sized key in multiples of 16 bits + * @key: pointer to where the key will be stored + * @size: the size of the complete key in bytes (must be even) + * @val: array of 8-bit values that makes up the value portion of the key + * @upd: array of 8-bit masks that determine what key portion to update + * @dc: array of 8-bit masks that make up the don't care mask + * @nm: array of 8-bit masks that make up the never match mask + * @off: the offset of the first byte in the key to update + * @len: the number of bytes in the key update * - * Verifies various attributes of the package file, including length, format - * version, and the requirement of at least one segment. + * This function generates a key from a value, a don't care mask and a never + * match mask. + * upd, dc, and nm are optional parameters, and can be NULL: + * upd == NULL --> upd mask is all 1's (update all bits) + * dc == NULL --> dc mask is all 0's (no don't care bits) + * nm == NULL --> nm mask is all 0's (no never match bits) */ -static enum ice_status ice_verify_pkg(struct ice_pkg_hdr *pkg, u32 len) +enum ice_status +ice_set_key(u8 *key, u16 size, u8 *val, u8 *upd, u8 *dc, u8 *nm, u16 off, + u16 len) { - u32 seg_count; - u32 i; + u16 half_size; + u16 i; - if (len < sizeof(*pkg)) - return ICE_ERR_BUF_TOO_SHORT; + /* size must be a multiple of 2 bytes. */
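To make the b01/b10/b11/b00 encoding documented for ice_gen_key_word easy to check by hand, here is a simplified standalone restatement for one byte. It omits the 'valid' masking and the shift-based accumulation of the real function, but per bit it produces the same (key, key_inv) values:

#include <stdio.h>

/* Encode one byte: for each bit choose (key, key_inv) per the table
 * match-0 -> (1,0), match-1 -> (0,1), don't-care -> (1,1), never -> (0,0).
 */
static void gen_key_byte(unsigned char val, unsigned char dont_care,
			 unsigned char nvr_mtch, unsigned char *key,
			 unsigned char *key_inv)
{
	unsigned char k = 0, ki = 0;
	int i;

	for (i = 0; i < 8; i++) {
		unsigned char bit = 1u << i;

		if (dont_care & bit) {		/* don't care: b11 */
			k |= bit;
			ki |= bit;
		} else if (nvr_mtch & bit) {	/* never match: b00 */
			;
		} else if (val & bit) {		/* exact 1: key 0, invert 1 */
			ki |= bit;
		} else {			/* exact 0: key 1, invert 0 */
			k |= bit;
		}
	}
	*key = k;
	*key_inv = ki;
}

int main(void)
{
	unsigned char key, key_inv;

	/* The kdoc's example vectors: val 0b010101, dont_care 0b001100,
	 * never match 0b000011. Prints: key 0xec key_inv 0x1c.
	 */
	gen_key_byte(0x15, 0x0c, 0x03, &key, &key_inv);
	printf("key 0x%02x key_inv 0x%02x\n", key, key_inv);
	return 0;
}

The top two bits, not covered by the six-bit kdoc example, fall through to the exact-0 case, which is why they contribute set bits to the key and clear bits to the invert.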
+ if (size % 2) + return ICE_ERR_CFG; + half_size = size / 2; - if (pkg->format_ver.major != ICE_PKG_FMT_VER_MAJ || - pkg->format_ver.minor != ICE_PKG_FMT_VER_MNR || - pkg->format_ver.update != ICE_PKG_FMT_VER_UPD || - pkg->format_ver.draft != ICE_PKG_FMT_VER_DFT) + if (off + len > half_size) return ICE_ERR_CFG; - /* pkg must have at least one segment */ - seg_count = le32_to_cpu(pkg->seg_count); - if (seg_count < 1) + /* Make sure at most one bit is set in the never match mask. Having more + * than one never match mask bit set will cause the HW to consume excessive + * power; this is a power management efficiency check. + */ +#define ICE_NVR_MTCH_BITS_MAX 1 + if (nm && !ice_bits_max_set(nm, len, ICE_NVR_MTCH_BITS_MAX)) return ICE_ERR_CFG; - /* make sure segment array fits in package length */ - if (len < sizeof(*pkg) + ((seg_count - 1) * sizeof(pkg->seg_offset))) - return ICE_ERR_BUF_TOO_SHORT; + for (i = 0; i < len; i++) + if (ice_gen_key_word(val[i], upd ? upd[i] : 0xff, + dc ? dc[i] : 0, nm ? nm[i] : 0, + key + off + i, key + half_size + off + i)) + return ICE_ERR_CFG; - /* all segments must fit within length */ - for (i = 0; i < seg_count; i++) { - u32 off = le32_to_cpu(pkg->seg_offset[i]); - struct ice_generic_seg_hdr *seg; + return 0; +} - /* segment header must fit */ - if (len < off + sizeof(*seg)) - return ICE_ERR_BUF_TOO_SHORT; +/** + * ice_acquire_global_cfg_lock + * @hw: pointer to the HW structure + * @access: access type (read or write) + * + * This function will request ownership of the global config lock for reading + * or writing of the package. When attempting to obtain write access, the + * caller must check for the following two return values: + * + * 0 - Means the caller has acquired the global config lock + * and can perform writing of the package. + * ICE_ERR_AQ_NO_WORK - Indicates another driver has already written the + * package or has found that no update was necessary; in + * this case, the caller can just skip performing any + * update of the package. + */ +static enum ice_status +ice_acquire_global_cfg_lock(struct ice_hw *hw, + enum ice_aq_res_access_type access) +{ + enum ice_status status; - seg = (struct ice_generic_seg_hdr *)((u8 *)pkg + off); + status = ice_acquire_res(hw, ICE_GLOBAL_CFG_LOCK_RES_ID, access, + ICE_GLOBAL_CFG_LOCK_TIMEOUT); - /* segment body must fit */ - if (len < off + le32_to_cpu(seg->seg_size)) - return ICE_ERR_BUF_TOO_SHORT; - } + if (!status) + mutex_lock(&ice_global_cfg_lock_sw); + else if (status == ICE_ERR_AQ_NO_WORK) + ice_debug(hw, ICE_DBG_PKG, "Global config lock: No work to do\n"); - return 0; + return status; } /** - * ice_free_seg - free package segment pointer - * @hw: pointer to the hardware structure + * ice_release_global_cfg_lock + * @hw: pointer to the HW structure * - * Frees the package segment pointer in the proper manner, depending on if the - * segment was allocated or just the passed in pointer was stored. + * This function will release the global config lock.
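On the caller side, the write-access contract above means ICE_ERR_AQ_NO_WORK is not a failure: it is how a second PF learns that the package is already in place and the download can be skipped. A minimal sketch of that calling pattern, with invented demo_* stand-ins for the driver's status codes:

#include <stdio.h>

/* Invented stand-in status codes for a standalone sketch. */
enum demo_status { DEMO_OK = 0, DEMO_ERR_AQ_NO_WORK = -1, DEMO_ERR_OTHER = -2 };

static enum demo_status demo_acquire_write_lock(void)
{
	/* Pretend another PF already downloaded the package. */
	return DEMO_ERR_AQ_NO_WORK;
}

static void demo_release_lock(void)
{
}

int main(void)
{
	enum demo_status status = demo_acquire_write_lock();

	if (status == DEMO_ERR_AQ_NO_WORK) {
		/* Someone else finished the download: skip it, not an error. */
		printf("package already written, nothing to do\n");
		return 0;
	}
	if (status)
		return 1;	/* a real acquisition failure */

	/* ... download configuration buffers here ... */
	demo_release_lock();
	return 0;
}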
*/ -void ice_free_seg(struct ice_hw *hw) +static void ice_release_global_cfg_lock(struct ice_hw *hw) { - if (hw->pkg_copy) { - devm_kfree(ice_hw_to_dev(hw), hw->pkg_copy); - hw->pkg_copy = NULL; - hw->pkg_size = 0; - } - hw->seg = NULL; + mutex_unlock(&ice_global_cfg_lock_sw); + ice_release_res(hw, ICE_GLOBAL_CFG_LOCK_RES_ID); } /** - * ice_init_pkg_regs - initialize additional package registers - * @hw: pointer to the hardware structure + * ice_acquire_change_lock + * @hw: pointer to the HW structure + * @access: access type (read or write) + * + * This function will request ownership of the change lock. */ -static void ice_init_pkg_regs(struct ice_hw *hw) +enum ice_status +ice_acquire_change_lock(struct ice_hw *hw, enum ice_aq_res_access_type access) { -#define ICE_SW_BLK_INP_MASK_L 0xFFFFFFFF -#define ICE_SW_BLK_INP_MASK_H 0x0000FFFF -#define ICE_SW_BLK_IDX 0 - - /* setup Switch block input mask, which is 48-bits in two parts */ - wr32(hw, GL_PREEXT_L2_PMASK0(ICE_SW_BLK_IDX), ICE_SW_BLK_INP_MASK_L); - wr32(hw, GL_PREEXT_L2_PMASK1(ICE_SW_BLK_IDX), ICE_SW_BLK_INP_MASK_H); + return ice_acquire_res(hw, ICE_CHANGE_LOCK_RES_ID, access, + ICE_CHANGE_LOCK_TIMEOUT); } /** - * ice_chk_pkg_version - check package version for compatibility with driver - * @pkg_ver: pointer to a version structure to check + * ice_release_change_lock + * @hw: pointer to the HW structure * - * Check to make sure that the package about to be downloaded is compatible with - * the driver. To be compatible, the major and minor components of the package - * version must match our ICE_PKG_SUPP_VER_MAJ and ICE_PKG_SUPP_VER_MNR - * definitions. + * This function will release the change lock using the proper Admin Command. */ -static enum ice_status ice_chk_pkg_version(struct ice_pkg_ver *pkg_ver) +void ice_release_change_lock(struct ice_hw *hw) { - if (pkg_ver->major != ICE_PKG_SUPP_VER_MAJ || - pkg_ver->minor != ICE_PKG_SUPP_VER_MNR) - return ICE_ERR_NOT_SUPPORTED; - - return 0; + ice_release_res(hw, ICE_CHANGE_LOCK_RES_ID); } /** - * ice_init_pkg - initialize/download package + * ice_aq_download_pkg * @hw: pointer to the hardware structure - * @buf: pointer to the package buffer - * @len: size of the package buffer - * - * This function initializes a package. The package contains HW tables - * required to do packet processing. First, the function extracts package - * information such as version. Then it finds the ice configuration segment - * within the package; this function then saves a copy of the segment pointer - * within the supplied package buffer. Next, the function will cache any hints - * from the package, followed by downloading the package itself. Note, that if - * a previous PF driver has already downloaded the package successfully, then - * the current driver will not have to download the package again. - * - * The local package contents will be used to query default behavior and to - * update specific sections of the HW's version of the package (e.g. to update - * the parse graph to understand new protocols). + * @pkg_buf: the package buffer to transfer + * @buf_size: the size of the package buffer + * @last_buf: last buffer indicator + * @error_offset: returns error offset + * @error_info: returns error information + * @cd: pointer to command details structure or NULL * - * This function stores a pointer to the package buffer memory, and it is - * expected that the supplied buffer will not be freed immediately. 
If the - * package buffer needs to be freed, such as when read from a file, use - * ice_copy_and_init_pkg() instead of directly calling ice_init_pkg() in this - * case. + * Download Package (0x0C40) */ -enum ice_status ice_init_pkg(struct ice_hw *hw, u8 *buf, u32 len) +static enum ice_status +ice_aq_download_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, + u16 buf_size, bool last_buf, u32 *error_offset, + u32 *error_info, struct ice_sq_cd *cd) { - struct ice_pkg_hdr *pkg; + struct ice_aqc_download_pkg *cmd; + struct ice_aq_desc desc; enum ice_status status; - struct ice_seg *seg; - if (!buf || !len) - return ICE_ERR_PARAM; - - pkg = (struct ice_pkg_hdr *)buf; - status = ice_verify_pkg(pkg, len); - if (status) { - ice_debug(hw, ICE_DBG_INIT, "failed to verify pkg (err: %d)\n", - status); - return status; - } - - /* initialize package info */ - status = ice_init_pkg_info(hw, pkg); - if (status) - return status; + if (error_offset) + *error_offset = 0; + if (error_info) + *error_info = 0; - /* before downloading the package, check package version for - * compatibility with driver - */ - status = ice_chk_pkg_version(&hw->pkg_ver); - if (status) - return status; + cmd = &desc.params.download_pkg; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_download_pkg); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - /* find segment in given package */ - seg = (struct ice_seg *)ice_find_seg_in_pkg(hw, SEGMENT_TYPE_ICE, pkg); - if (!seg) { - ice_debug(hw, ICE_DBG_INIT, "no ice segment in package.\n"); - return ICE_ERR_CFG; - } - /* download package */ - status = ice_download_pkg(hw, seg); - if (status == ICE_ERR_AQ_NO_WORK) { - ice_debug(hw, ICE_DBG_INIT, - "package previously loaded - no work.\n"); - status = 0; - } + if (last_buf) + cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF; - /* Get information on the package currently loaded in HW, then make sure - * the driver is compatible with this version. - */ - if (!status) { - status = ice_get_pkg_info(hw); - if (!status) - status = ice_chk_pkg_version(&hw->active_pkg_ver); - } + status = ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); + if (status == ICE_ERR_AQ_ERROR) { + /* Read error from buffer only when the FW returned an error */ + struct ice_aqc_download_pkg_resp *resp; - if (!status) { - hw->seg = seg; - /* on successful package download update other required - * registers to support the package and fill HW tables - * with package content. - */ - ice_init_pkg_regs(hw); - ice_fill_blk_tbls(hw); - } else { - ice_debug(hw, ICE_DBG_INIT, "package load failed, %d\n", - status); + resp = (struct ice_aqc_download_pkg_resp *)pkg_buf; + if (error_offset) + *error_offset = le32_to_cpu(resp->error_offset); + if (error_info) + *error_info = le32_to_cpu(resp->error_info); } return status; } /** - * ice_copy_and_init_pkg - initialize/download a copy of the package + * ice_aq_upload_section * @hw: pointer to the hardware structure - * @buf: pointer to the package buffer - * @len: size of the package buffer - * - * This function copies the package buffer, and then calls ice_init_pkg() to - * initialize the copied package contents. - * - * The copying is necessary if the package buffer supplied is constant, or if - * the memory may disappear shortly after calling this function. - * - * If the package buffer resides in the data segment and can be modified, the - * caller is free to use ice_init_pkg() instead of ice_copy_and_init_pkg(). 
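Returning to ice_aq_download_pkg above, one subtlety is easy to miss: on an admin-queue error the firmware has overwritten the head of the submitted buffer with a response structure, which is why pkg_buf itself is cast to ice_aqc_download_pkg_resp to recover error_offset and error_info. A standalone illustration of that buffer reinterpretation:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for the response layout the firmware writes over the buffer. */
struct demo_resp { uint32_t error_offset; uint32_t error_info; };

int main(void)
{
	uint8_t buf[64] = { 0 };
	struct demo_resp fw_resp = { .error_offset = 128, .error_info = 7 };
	struct demo_resp resp;

	/* Firmware side: report the failure by rewriting the buffer head. */
	memcpy(buf, &fw_resp, sizeof(fw_resp));

	/* Driver side: reinterpret the same buffer to recover the details. */
	memcpy(&resp, buf, sizeof(resp));
	printf("err off %u info %u\n", resp.error_offset, resp.error_info);
	return 0;
}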
- * - * However, if the package buffer needs to be copied first, such as when being - * read from a file, the caller should use ice_copy_and_init_pkg(). + * @pkg_buf: the package buffer which will receive the section + * @buf_size: the size of the package buffer + * @cd: pointer to command details structure or NULL * - * This function will first copy the package buffer, before calling - * ice_init_pkg(). The caller is free to immediately destroy the original - * package buffer, as the new copy will be managed by this function and - * related routines. + * Upload Section (0x0C41) */ -enum ice_status ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, u32 len) +enum ice_status +ice_aq_upload_section(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, + u16 buf_size, struct ice_sq_cd *cd) { - enum ice_status status; - u8 *buf_copy; - - if (!buf || !len) - return ICE_ERR_PARAM; + struct ice_aq_desc desc; - buf_copy = devm_kmemdup(ice_hw_to_dev(hw), buf, len, GFP_KERNEL); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_upload_section); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - status = ice_init_pkg(hw, buf_copy, len); - if (status) { - /* Free the copy, since we failed to initialize the package */ - devm_kfree(ice_hw_to_dev(hw), buf_copy); - } else { - /* Track the copied pkg so we can free it later */ - hw->pkg_copy = buf_copy; - hw->pkg_size = len; - } - return status; + return ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); } -/* PTG Management */ - /** - * ice_ptg_find_ptype - Search for packet type group using packet type (ptype) + * ice_aq_update_pkg * @hw: pointer to the hardware structure - * @blk: HW block - * @ptype: the ptype to search for - * @ptg: pointer to variable that receives the PTG + * @pkg_buf: the package cmd buffer + * @buf_size: the size of the package cmd buffer + * @last_buf: last buffer indicator + * @error_offset: returns error offset + * @error_info: returns error information + * @cd: pointer to command details structure or NULL * - * This function will search the PTGs for a particular ptype, returning the - * PTG ID that contains it through the PTG parameter, with the value of - * ICE_DEFAULT_PTG (0) meaning it is part the default PTG. 
+ * Update Package (0x0C42) */ static enum ice_status -ice_ptg_find_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 *ptg) +ice_aq_update_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, u16 buf_size, + bool last_buf, u32 *error_offset, u32 *error_info, + struct ice_sq_cd *cd) { - if (ptype >= ICE_XLT1_CNT || !ptg) - return ICE_ERR_PARAM; + struct ice_aqc_download_pkg *cmd; + struct ice_aq_desc desc; + enum ice_status status; - *ptg = hw->blk[blk].xlt1.ptypes[ptype].ptg; - return 0; + if (error_offset) + *error_offset = 0; + if (error_info) + *error_info = 0; + + cmd = &desc.params.download_pkg; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_update_pkg); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + + if (last_buf) + cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF; + + status = ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); + if (status == ICE_ERR_AQ_ERROR) { + /* Read error from buffer only when the FW returned an error */ + struct ice_aqc_download_pkg_resp *resp; + + resp = (struct ice_aqc_download_pkg_resp *)pkg_buf; + if (error_offset) + *error_offset = le32_to_cpu(resp->error_offset); + if (error_info) + *error_info = le32_to_cpu(resp->error_info); + } + + return status; } /** - * ice_ptg_alloc_val - Allocates a new packet type group ID by value + * ice_find_seg_in_pkg * @hw: pointer to the hardware structure - * @blk: HW block - * @ptg: the PTG to allocate + * @seg_type: the segment type to search for (i.e., SEGMENT_TYPE_CPK) + * @pkg_hdr: pointer to the package header to be searched * - * This function allocates a given packet type group ID specified by the PTG - * parameter. + * This function searches a package file for a particular segment type. On + * success it returns a pointer to the segment header, otherwise it will + * return NULL. */ -static void ice_ptg_alloc_val(struct ice_hw *hw, enum ice_block blk, u8 ptg) +static struct ice_generic_seg_hdr * +ice_find_seg_in_pkg(struct ice_hw *hw, u32 seg_type, + struct ice_pkg_hdr *pkg_hdr) { - hw->blk[blk].xlt1.ptg_tbl[ptg].in_use = true; + u32 i; + + ice_debug(hw, ICE_DBG_PKG, "Package format version: %d.%d.%d.%d\n", + pkg_hdr->pkg_format_ver.major, pkg_hdr->pkg_format_ver.minor, + pkg_hdr->pkg_format_ver.update, + pkg_hdr->pkg_format_ver.draft); + + /* Search all package segments for the requested segment type */ + for (i = 0; i < le32_to_cpu(pkg_hdr->seg_count); i++) { + struct ice_generic_seg_hdr *seg; + + seg = (struct ice_generic_seg_hdr *) + ((u8 *)pkg_hdr + le32_to_cpu(pkg_hdr->seg_offset[i])); + + if (le32_to_cpu(seg->seg_type) == seg_type) + return seg; + } + + return NULL; } /** - * ice_ptg_remove_ptype - Removes ptype from a particular packet type group + * ice_update_pkg_no_lock * @hw: pointer to the hardware structure - * @blk: HW block - * @ptype: the ptype to remove - * @ptg: the PTG to remove the ptype from - * - * This function will remove the ptype from the specific PTG, and move it to - * the default PTG (ICE_DEFAULT_PTG). 
+ * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array */ static enum ice_status -ice_ptg_remove_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 ptg) +ice_update_pkg_no_lock(struct ice_hw *hw, struct ice_buf *bufs, u32 count) { - struct ice_ptg_ptype **ch; - struct ice_ptg_ptype *p; - - if (ptype > ICE_XLT1_CNT - 1) - return ICE_ERR_PARAM; + enum ice_status status = 0; + u32 i; - if (!hw->blk[blk].xlt1.ptg_tbl[ptg].in_use) - return ICE_ERR_DOES_NOT_EXIST; + for (i = 0; i < count; i++) { + struct ice_buf_hdr *bh = (struct ice_buf_hdr *)(bufs + i); + bool last = ((i + 1) == count); + u32 offset, info; - /* Should not happen if .in_use is set, bad config */ - if (!hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype) - return ICE_ERR_CFG; + status = ice_aq_update_pkg(hw, bh, le16_to_cpu(bh->data_end), + last, &offset, &info, NULL); - /* find the ptype within this PTG, and bypass the link over it */ - p = hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype; - ch = &hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype; - while (p) { - if (ptype == (p - hw->blk[blk].xlt1.ptypes)) { - *ch = p->next_ptype; + if (status) { + ice_debug(hw, ICE_DBG_PKG, "Update pkg failed: err %d off %d inf %d\n", + status, offset, info); break; } - - ch = &p->next_ptype; - p = p->next_ptype; } - hw->blk[blk].xlt1.ptypes[ptype].ptg = ICE_DEFAULT_PTG; - hw->blk[blk].xlt1.ptypes[ptype].next_ptype = NULL; - - return 0; + return status; } /** - * ice_ptg_add_mv_ptype - Adds/moves ptype to a particular packet type group + * ice_update_pkg * @hw: pointer to the hardware structure - * @blk: HW block - * @ptype: the ptype to add or move - * @ptg: the PTG to add or move the ptype to + * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array * - * This function will either add or move a ptype to a particular PTG depending - * on if the ptype is already part of another group. Note that using a - * a destination PTG ID of ICE_DEFAULT_PTG (0) will move the ptype to the - * default PTG. + * Obtains change lock and updates package. */ static enum ice_status -ice_ptg_add_mv_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 ptg) +ice_update_pkg(struct ice_hw *hw, struct ice_buf *bufs, u32 count) { enum ice_status status; - u8 original_ptg; - - if (ptype > ICE_XLT1_CNT - 1) - return ICE_ERR_PARAM; - if (!hw->blk[blk].xlt1.ptg_tbl[ptg].in_use && ptg != ICE_DEFAULT_PTG) - return ICE_ERR_DOES_NOT_EXIST; - - status = ice_ptg_find_ptype(hw, blk, ptype, &original_ptg); + status = ice_acquire_change_lock(hw, ICE_RES_WRITE); if (status) return status; - /* Is ptype already in the correct PTG? */ - if (original_ptg == ptg) - return 0; + status = ice_update_pkg_no_lock(hw, bufs, count); - /* Remove from original PTG and move back to the default PTG */ - if (original_ptg != ICE_DEFAULT_PTG) - ice_ptg_remove_ptype(hw, blk, ptype, original_ptg); + ice_release_change_lock(hw); - /* Moving to default PTG? 
Then we're done with this request */ - if (ptg == ICE_DEFAULT_PTG) - return 0; + return status; +} - /* Add ptype to PTG at beginning of list */ - hw->blk[blk].xlt1.ptypes[ptype].next_ptype = - hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype; - hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype = - &hw->blk[blk].xlt1.ptypes[ptype]; - - hw->blk[blk].xlt1.ptypes[ptype].ptg = ptg; - hw->blk[blk].xlt1.t[ptype] = ptg; +/** + * ice_dwnld_cfg_bufs + * @hw: pointer to the hardware structure + * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array + * + * Obtains global config lock and downloads the package configuration buffers + * to the firmware. Metadata buffers are skipped, and the first metadata buffer + * found indicates that the rest of the buffers are all metadata buffers. + */ +static enum ice_status +ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count) +{ + enum ice_status status; + struct ice_buf_hdr *bh; + u32 offset, info, i; - return 0; -} + if (!bufs || !count) + return ICE_ERR_PARAM; -/* Block / table size info */ -struct ice_blk_size_details { - u16 xlt1; /* # XLT1 entries */ - u16 xlt2; /* # XLT2 entries */ - u16 prof_tcam; /* # profile ID TCAM entries */ - u16 prof_id; /* # profile IDs */ - u8 prof_cdid_bits; /* # CDID one-hot bits used in key */ - u16 prof_redir; /* # profile redirection entries */ - u16 es; /* # extraction sequence entries */ - u16 fvw; /* # field vector words */ - u8 overwrite; /* overwrite existing entries allowed */ - u8 reverse; /* reverse FV order */ -}; + /* If the first buffer's first section has its metadata bit set + * then there are no buffers to be downloaded, and the operation is + * considered a success. + */ + bh = (struct ice_buf_hdr *)bufs; + if (le32_to_cpu(bh->section_entry[0].type) & ICE_METADATA_BUF) + return 0; -static const struct ice_blk_size_details blk_sizes[ICE_BLK_COUNT] = { - /** - * Table Definitions - * XLT1 - Number of entries in XLT1 table - * XLT2 - Number of entries in XLT2 table - * TCAM - Number of entries Profile ID TCAM table - * CDID - Control Domain ID of the hardware block - * PRED - Number of entries in the Profile Redirection Table - * FV - Number of entries in the Field Vector - * FVW - Width (in WORDs) of the Field Vector - * OVR - Overwrite existing table entries - * REV - Reverse FV + /* reset pkg_dwnld_status in case this function is called in the + * reset/rebuild flow */ - /* XLT1 , XLT2 ,TCAM, PID,CDID,PRED, FV, FVW */ - /* Overwrite , Reverse FV */ - /* SW */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 256, 0, 256, 256, 48, - false, false }, - /* ACL */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 32, - false, false }, - /* FD */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 24, - false, true }, - /* RSS */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 24, - true, true }, - /* PE */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 64, 32, 0, 32, 32, 24, - false, false }, -}; + hw->pkg_dwnld_status = ICE_AQ_RC_OK; -enum ice_sid_all { - ICE_SID_XLT1_OFF = 0, - ICE_SID_XLT2_OFF, - ICE_SID_PR_OFF, - ICE_SID_PR_REDIR_OFF, - ICE_SID_ES_OFF, - ICE_SID_OFF_COUNT, -}; + status = ice_acquire_global_cfg_lock(hw, ICE_RES_WRITE); + if (status) { + if (status == ICE_ERR_AQ_NO_WORK) + hw->pkg_dwnld_status = ICE_AQ_RC_EEXIST; + else + hw->pkg_dwnld_status = hw->adminq.sq_last_status; + return status; + } -/* VSIG Management */ + for (i = 0; i < count; i++) { + bool last = ((i + 1) == count); + + if (!last) { + /* check next buffer for metadata flag */ + bh = (struct ice_buf_hdr *)(bufs + i + 
1); + + /* A set metadata flag in the next buffer will signal + * that the current buffer will be the last buffer + * downloaded + */ + if (le16_to_cpu(bh->section_count)) + if (le32_to_cpu(bh->section_entry[0].type) & + ICE_METADATA_BUF) + last = true; + } + + bh = (struct ice_buf_hdr *)(bufs + i); + + status = ice_aq_download_pkg(hw, bh, ICE_PKG_BUF_SIZE, last, + &offset, &info, NULL); + + /* Save AQ status from download package */ + hw->pkg_dwnld_status = hw->adminq.sq_last_status; + if (status) { + ice_debug(hw, ICE_DBG_PKG, "Pkg download failed: err %d off %d inf %d\n", + status, offset, info); + + break; + } + + if (last) + break; + } + + if (!status) { + status = ice_set_vlan_mode(hw); + if (status) + ice_debug(hw, ICE_DBG_PKG, "Failed to set VLAN mode: err %d\n", + status); + } + + ice_release_global_cfg_lock(hw); + + return status; +} /** - * ice_vsig_find_vsi - find a VSIG that contains a specified VSI + * ice_aq_get_pkg_info_list * @hw: pointer to the hardware structure - * @blk: HW block - * @vsi: VSI of interest - * @vsig: pointer to receive the VSI group + * @pkg_info: the buffer which will receive the information list + * @buf_size: the size of the pkg_info information buffer + * @cd: pointer to command details structure or NULL * - * This function will lookup the VSI entry in the XLT2 list and return - * the VSI group its associated with. + * Get Package Info List (0x0C43) */ static enum ice_status -ice_vsig_find_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 *vsig) +ice_aq_get_pkg_info_list(struct ice_hw *hw, + struct ice_aqc_get_pkg_info_resp *pkg_info, + u16 buf_size, struct ice_sq_cd *cd) { - if (!vsig || vsi >= ICE_MAX_VSI) - return ICE_ERR_PARAM; + struct ice_aq_desc desc; - /* As long as there's a default or valid VSIG associated with the input - * VSI, the functions returns a success. Any handling of VSIG will be - * done by the following add, update or remove functions. - */ - *vsig = hw->blk[blk].xlt2.vsis[vsi].vsig; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_pkg_info_list); - return 0; + + return ice_aq_send_cmd(hw, &desc, pkg_info, buf_size, cd); } /** - * ice_vsig_alloc_val - allocate a new VSIG by value + * ice_download_pkg * @hw: pointer to the hardware structure - * @blk: HW block - * @vsig: the VSIG to allocate + * @ice_seg: pointer to the segment of the package to be downloaded * - * This function will allocate a given VSIG specified by the VSIG parameter. + * Handles the download of a complete package. 
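The look-ahead in ice_dwnld_cfg_bufs above is worth isolating: the loop peeks at buffer i + 1, and a metadata section type there makes buffer i the last one sent to firmware, so trailing metadata buffers are never downloaded. A sketch with an invented DEMO_METADATA_BUF flag standing in for ICE_METADATA_BUF:

#include <stdio.h>
#include <stdbool.h>

#define DEMO_METADATA_BUF 0x80000000u	/* stand-in for ICE_METADATA_BUF */

struct demo_buf { unsigned int first_section_type; };

int main(void)
{
	struct demo_buf bufs[] = {
		{ 0x10 }, { 0x11 }, { DEMO_METADATA_BUF | 0x1 },
	};
	unsigned int count = 3, i;

	for (i = 0; i < count; i++) {
		bool last = (i + 1 == count);

		/* Peek ahead: metadata in the next buffer ends the download. */
		if (!last && (bufs[i + 1].first_section_type & DEMO_METADATA_BUF))
			last = true;

		printf("download buf %u%s\n", i, last ? " (last)" : "");
		if (last)
			break;
	}
	return 0;
}

Only buffers 0 and 1 are "downloaded" here; the metadata buffer itself is never submitted, matching the driver's behavior.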
*/ -static u16 ice_vsig_alloc_val(struct ice_hw *hw, enum ice_block blk, u16 vsig) +static enum ice_status +ice_download_pkg(struct ice_hw *hw, struct ice_seg *ice_seg) { - u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_buf_table *ice_buf_tbl; + enum ice_status status; - if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) { - INIT_LIST_HEAD(&hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst); - hw->blk[blk].xlt2.vsig_tbl[idx].in_use = true; - } + ice_debug(hw, ICE_DBG_PKG, "Segment format version: %d.%d.%d.%d\n", + ice_seg->hdr.seg_format_ver.major, + ice_seg->hdr.seg_format_ver.minor, + ice_seg->hdr.seg_format_ver.update, + ice_seg->hdr.seg_format_ver.draft); - return ICE_VSIG_VALUE(idx, hw->pf_id); + ice_debug(hw, ICE_DBG_PKG, "Seg: type 0x%X, size %d, name %s\n", + le32_to_cpu(ice_seg->hdr.seg_type), + le32_to_cpu(ice_seg->hdr.seg_size), ice_seg->hdr.seg_id); + + ice_buf_tbl = ice_find_buf_table(ice_seg); + + ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n", + le32_to_cpu(ice_buf_tbl->buf_count)); + + status = ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array, + le32_to_cpu(ice_buf_tbl->buf_count)); + + ice_post_pkg_dwnld_vlan_mode_cfg(hw); + + return status; } /** - * ice_vsig_remove_vsi - remove VSI from VSIG + * ice_init_pkg_info * @hw: pointer to the hardware structure - * @blk: HW block - * @vsi: VSI to remove - * @vsig: VSI group to remove from + * @pkg_hdr: pointer to the driver's package hdr * - * The function will remove the input VSI from its VSI group and move it - * to the DEFAULT_VSIG. + * Saves off the package details into the HW structure. */ static enum ice_status -ice_vsig_remove_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +ice_init_pkg_info(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) { - struct ice_vsig_vsi **vsi_head, *vsi_cur, *vsi_tgt; - u16 idx; - - idx = vsig & ICE_VSIG_IDX_M; + struct ice_generic_seg_hdr *seg_hdr; - if (vsi >= ICE_MAX_VSI || idx >= ICE_MAX_VSIGS) + if (!pkg_hdr) return ICE_ERR_PARAM; - if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) - return ICE_ERR_DOES_NOT_EXIST; - - /* entry already in default VSIG, don't have to remove */ - if (idx == ICE_DEFAULT_VSIG) - return 0; - - vsi_head = &hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; - if (!(*vsi_head)) - return ICE_ERR_CFG; + seg_hdr = (struct ice_generic_seg_hdr *) + ice_find_seg_in_pkg(hw, SEGMENT_TYPE_ICE, pkg_hdr); + if (seg_hdr) { + struct ice_meta_sect *meta; + struct ice_pkg_enum state; - vsi_tgt = &hw->blk[blk].xlt2.vsis[vsi]; - vsi_cur = (*vsi_head); + memset(&state, 0, sizeof(state)); - /* iterate the VSI list, skip over the entry to be removed */ - while (vsi_cur) { - if (vsi_tgt == vsi_cur) { - (*vsi_head) = vsi_cur->next_vsi; - break; + /* Get package information from the Metadata Section */ + meta = ice_pkg_enum_section((struct ice_seg *)seg_hdr, &state, + ICE_SID_METADATA); + if (!meta) { + ice_debug(hw, ICE_DBG_INIT, "Did not find ice metadata section in package\n"); + return ICE_ERR_CFG; } - vsi_head = &vsi_cur->next_vsi; - vsi_cur = vsi_cur->next_vsi; - } - /* verify if VSI was removed from group list */ - if (!vsi_cur) - return ICE_ERR_DOES_NOT_EXIST; + hw->pkg_ver = meta->ver; + memcpy(hw->pkg_name, meta->name, sizeof(meta->name)); - vsi_cur->vsig = ICE_DEFAULT_VSIG; - vsi_cur->changed = 1; - vsi_cur->next_vsi = NULL; + ice_debug(hw, ICE_DBG_PKG, "Pkg: %d.%d.%d.%d, %s\n", + meta->ver.major, meta->ver.minor, meta->ver.update, + meta->ver.draft, meta->name); + + hw->ice_seg_fmt_ver = seg_hdr->seg_format_ver; + memcpy(hw->ice_seg_id, seg_hdr->seg_id, + sizeof(hw->ice_seg_id)); + + 
ice_debug(hw, ICE_DBG_PKG, "Ice Seg: %d.%d.%d.%d, %s\n", + seg_hdr->seg_format_ver.major, + seg_hdr->seg_format_ver.minor, + seg_hdr->seg_format_ver.update, + seg_hdr->seg_format_ver.draft, + seg_hdr->seg_id); + } else { + ice_debug(hw, ICE_DBG_INIT, "Did not find ice segment in driver package\n"); + return ICE_ERR_CFG; + } return 0; } /** - * ice_vsig_add_mv_vsi - add or move a VSI to a VSI group + * ice_get_pkg_info * @hw: pointer to the hardware structure - * @blk: HW block - * @vsi: VSI to move - * @vsig: destination VSI group * - * This function will move or add the input VSI to the target VSIG. - * The function will find the original VSIG the VSI belongs to and - * move the entry to the DEFAULT_VSIG, update the original VSIG and - * then move entry to the new VSIG. + * Store details of the package currently loaded in HW into the HW structure. */ -static enum ice_status -ice_vsig_add_mv_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +static enum ice_status ice_get_pkg_info(struct ice_hw *hw) { - struct ice_vsig_vsi *tmp; + struct ice_aqc_get_pkg_info_resp *pkg_info; enum ice_status status; - u16 orig_vsig, idx; + u16 size; + u32 i; - idx = vsig & ICE_VSIG_IDX_M; + size = struct_size(pkg_info, pkg_info, ICE_PKG_CNT); + pkg_info = devm_kzalloc(ice_hw_to_dev(hw), size, GFP_KERNEL); + if (!pkg_info) + return ICE_ERR_NO_MEMORY; - if (vsi >= ICE_MAX_VSI || idx >= ICE_MAX_VSIGS) - return ICE_ERR_PARAM; - - /* if VSIG not in use and VSIG is not default type this VSIG - * doesn't exist. - */ - if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use && - vsig != ICE_DEFAULT_VSIG) - return ICE_ERR_DOES_NOT_EXIST; - - status = ice_vsig_find_vsi(hw, blk, vsi, &orig_vsig); + status = ice_aq_get_pkg_info_list(hw, pkg_info, size, NULL); if (status) - return status; + goto init_pkg_free_alloc; - /* no update required if vsigs match */ - if (orig_vsig == vsig) - return 0; + for (i = 0; i < le32_to_cpu(pkg_info->count); i++) { +#define ICE_PKG_FLAG_COUNT 4 + char flags[ICE_PKG_FLAG_COUNT + 1] = { 0 }; + u8 place = 0; - if (orig_vsig != ICE_DEFAULT_VSIG) { - /* remove entry from orig_vsig and add to default VSIG */ - status = ice_vsig_remove_vsi(hw, blk, vsi, orig_vsig); - if (status) - return status; + if (pkg_info->pkg_info[i].is_active) { + flags[place++] = 'A'; + hw->active_pkg_ver = pkg_info->pkg_info[i].ver; + hw->active_track_id = + le32_to_cpu(pkg_info->pkg_info[i].track_id); + memcpy(hw->active_pkg_name, + pkg_info->pkg_info[i].name, + sizeof(pkg_info->pkg_info[i].name)); + hw->active_pkg_in_nvm = pkg_info->pkg_info[i].is_in_nvm; + } + if (pkg_info->pkg_info[i].is_active_at_boot) + flags[place++] = 'B'; + if (pkg_info->pkg_info[i].is_modified) + flags[place++] = 'M'; + if (pkg_info->pkg_info[i].is_in_nvm) + flags[place++] = 'N'; + + ice_debug(hw, ICE_DBG_PKG, "Pkg[%d]: %d.%d.%d.%d,%s,%s\n", + i, pkg_info->pkg_info[i].ver.major, + pkg_info->pkg_info[i].ver.minor, + pkg_info->pkg_info[i].ver.update, + pkg_info->pkg_info[i].ver.draft, + pkg_info->pkg_info[i].name, flags); } - if (idx == ICE_DEFAULT_VSIG) - return 0; +init_pkg_free_alloc: + devm_kfree(ice_hw_to_dev(hw), pkg_info); - /* Create VSI entry and add VSIG and prop_mask values */ - hw->blk[blk].xlt2.vsis[vsi].vsig = vsig; - hw->blk[blk].xlt2.vsis[vsi].changed = 1; + return status; +} - /* Add new entry to the head of the VSIG list */ - tmp = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; - hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi = - &hw->blk[blk].xlt2.vsis[vsi]; - hw->blk[blk].xlt2.vsis[vsi].next_vsi = tmp; - hw->blk[blk].xlt2.t[vsi] = 
vsig; +/** + * ice_find_label_value + * @ice_seg: pointer to the ice segment (non-NULL) + * @name: name of the label to search for + * @type: the section type that will contain the label + * @value: pointer to a value that will return the label's value if found + * + * Finds a label's value given the label name and the section type to search. + * The ice_seg parameter must not be NULL since the first call to + * ice_enum_labels requires a pointer to an actual ice_seg structure. + */ +enum ice_status +ice_find_label_value(struct ice_seg *ice_seg, char const *name, u32 type, + u16 *value) +{ + struct ice_pkg_enum state; + char *label_name; + u16 val; - return 0; + memset(&state, 0, sizeof(state)); + + if (!ice_seg) + return ICE_ERR_PARAM; + + do { + label_name = ice_enum_labels(ice_seg, type, &state, &val); + if (label_name && !strcmp(label_name, name)) { + *value = val; + return 0; + } + + ice_seg = NULL; + } while (label_name); + + return ICE_ERR_CFG; } -/* Block / table section IDs */ -static const u32 ice_blk_sids[ICE_BLK_COUNT][ICE_SID_OFF_COUNT] = { - /* SWITCH */ - { ICE_SID_XLT1_SW, - ICE_SID_XLT2_SW, - ICE_SID_PROFID_TCAM_SW, - ICE_SID_PROFID_REDIR_SW, - ICE_SID_FLD_VEC_SW - }, +/** + * ice_verify_pkg - verify package + * @pkg: pointer to the package buffer + * @len: size of the package buffer + * + * Verifies various attributes of the package file, including length, format + * version, and the requirement of at least one segment. + */ +static enum ice_status ice_verify_pkg(struct ice_pkg_hdr *pkg, u32 len) +{ + u32 seg_count; + u32 i; - /* ACL */ - { ICE_SID_XLT1_ACL, - ICE_SID_XLT2_ACL, - ICE_SID_PROFID_TCAM_ACL, - ICE_SID_PROFID_REDIR_ACL, - ICE_SID_FLD_VEC_ACL - }, + if (len < struct_size(pkg, seg_offset, 1)) + return ICE_ERR_BUF_TOO_SHORT; - /* FD */ - { ICE_SID_XLT1_FD, - ICE_SID_XLT2_FD, - ICE_SID_PROFID_TCAM_FD, - ICE_SID_PROFID_REDIR_FD, - ICE_SID_FLD_VEC_FD - }, + if (pkg->pkg_format_ver.major != ICE_PKG_FMT_VER_MAJ || + pkg->pkg_format_ver.minor != ICE_PKG_FMT_VER_MNR || + pkg->pkg_format_ver.update != ICE_PKG_FMT_VER_UPD || + pkg->pkg_format_ver.draft != ICE_PKG_FMT_VER_DFT) + return ICE_ERR_CFG; - /* RSS */ - { ICE_SID_XLT1_RSS, - ICE_SID_XLT2_RSS, - ICE_SID_PROFID_TCAM_RSS, - ICE_SID_PROFID_REDIR_RSS, - ICE_SID_FLD_VEC_RSS - }, + /* pkg must have at least one segment */ + seg_count = le32_to_cpu(pkg->seg_count); + if (seg_count < 1) + return ICE_ERR_CFG; - /* PE */ - { ICE_SID_XLT1_PE, - ICE_SID_XLT2_PE, - ICE_SID_PROFID_TCAM_PE, - ICE_SID_PROFID_REDIR_PE, - ICE_SID_FLD_VEC_PE + /* make sure segment array fits in package length */ + if (len < struct_size(pkg, seg_offset, seg_count)) + return ICE_ERR_BUF_TOO_SHORT; + + /* all segments must fit within length */ + for (i = 0; i < seg_count; i++) { + u32 off = le32_to_cpu(pkg->seg_offset[i]); + struct ice_generic_seg_hdr *seg; + + /* segment header must fit */ + if (len < off + sizeof(*seg)) + return ICE_ERR_BUF_TOO_SHORT; + + seg = (struct ice_generic_seg_hdr *)((u8 *)pkg + off); + + /* segment body must fit */ + if (len < off + le32_to_cpu(seg->seg_size)) + return ICE_ERR_BUF_TOO_SHORT; } -}; + + return 0; +} /** - * ice_init_sw_xlt1_db - init software XLT1 database from HW tables + * ice_free_seg - free package segment pointer * @hw: pointer to the hardware structure - * @blk: the HW block to initialize + * + * Frees the package segment pointer in the proper manner, depending on if the + * segment was allocated or just the passed in pointer was stored. 
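The bounds checks in ice_verify_pkg above follow the usual nested-TLV validation order: prove a segment header fits before dereferencing it, and only then check the size the header claims against the outer length. A standalone restatement with simplified types (the driver performs the same arithmetic on u32 lengths; a hardened variant would also guard the off + size addition against wrap-around):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct demo_seg_hdr { uint32_t seg_type; uint32_t seg_size; };

/* Return 0 when a segment at 'off' fits entirely inside 'len' bytes. */
static int demo_check_seg(const uint8_t *pkg, uint32_t len, uint32_t off)
{
	struct demo_seg_hdr hdr;

	/* the header must fit before we read seg_size out of it */
	if (len < off + sizeof(hdr))
		return -1;
	memcpy(&hdr, pkg + off, sizeof(hdr));

	/* the body must fit, using the size the header claims */
	if (len < off + hdr.seg_size)
		return -1;
	return 0;
}

int main(void)
{
	uint8_t pkg[64] = { 0 };
	struct demo_seg_hdr hdr = { .seg_type = 0x10, .seg_size = 32 };

	memcpy(pkg + 8, &hdr, sizeof(hdr));
	printf("fits: %s\n", demo_check_seg(pkg, sizeof(pkg), 8) ? "no" : "yes");
	printf("fits: %s\n", demo_check_seg(pkg, 24, 8) ? "no" : "yes");
	return 0;
}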
*/ -static void ice_init_sw_xlt1_db(struct ice_hw *hw, enum ice_block blk) +void ice_free_seg(struct ice_hw *hw) { - u16 pt; - - for (pt = 0; pt < hw->blk[blk].xlt1.count; pt++) { - u8 ptg; - - ptg = hw->blk[blk].xlt1.t[pt]; - if (ptg != ICE_DEFAULT_PTG) { - ice_ptg_alloc_val(hw, blk, ptg); - ice_ptg_add_mv_ptype(hw, blk, pt, ptg); - } + if (hw->pkg_copy) { + devm_kfree(ice_hw_to_dev(hw), hw->pkg_copy); + hw->pkg_copy = NULL; + hw->pkg_size = 0; } + hw->seg = NULL; } /** - * ice_init_sw_xlt2_db - init software XLT2 database from HW tables + * ice_init_pkg_regs - initialize additional package registers * @hw: pointer to the hardware structure - * @blk: the HW block to initialize */ -static void ice_init_sw_xlt2_db(struct ice_hw *hw, enum ice_block blk) +static void ice_init_pkg_regs(struct ice_hw *hw) { - u16 vsi; - - for (vsi = 0; vsi < hw->blk[blk].xlt2.count; vsi++) { - u16 vsig; +#define ICE_SW_BLK_INP_MASK_L 0xFFFFFFFF +#define ICE_SW_BLK_INP_MASK_H 0x0000FFFF +#define ICE_SW_BLK_IDX 0 - vsig = hw->blk[blk].xlt2.t[vsi]; - if (vsig) { - ice_vsig_alloc_val(hw, blk, vsig); - ice_vsig_add_mv_vsi(hw, blk, vsi, vsig); - /* no changes at this time, since this has been - * initialized from the original package - */ - hw->blk[blk].xlt2.vsis[vsi].changed = 0; - } - } + /* setup Switch block input mask, which is 48-bits in two parts */ + wr32(hw, GL_PREEXT_L2_PMASK0(ICE_SW_BLK_IDX), ICE_SW_BLK_INP_MASK_L); + wr32(hw, GL_PREEXT_L2_PMASK1(ICE_SW_BLK_IDX), ICE_SW_BLK_INP_MASK_H); } /** - * ice_init_sw_db - init software database from HW tables - * @hw: pointer to the hardware structure + * ice_chk_pkg_version - check package version for compatibility with driver + * @pkg_ver: pointer to a version structure to check + * + * Check to make sure that the package about to be downloaded is compatible with + * the driver. To be compatible, the major and minor components of the package + * version must match our ICE_PKG_SUPP_VER_MAJ and ICE_PKG_SUPP_VER_MNR + * definitions. */ -static void ice_init_sw_db(struct ice_hw *hw) +static enum ice_status ice_chk_pkg_version(struct ice_pkg_ver *pkg_ver) { - u16 i; + /* A major version of 0xFF indicates a custom DDP package */ + if (pkg_ver->major == 0xFF) + return 0; - for (i = 0; i < ICE_BLK_COUNT; i++) { - ice_init_sw_xlt1_db(hw, (enum ice_block)i); - ice_init_sw_xlt2_db(hw, (enum ice_block)i); - } + if (pkg_ver->major != ICE_PKG_SUPP_VER_MAJ || + pkg_ver->minor != ICE_PKG_SUPP_VER_MNR) + return ICE_ERR_NOT_SUPPORTED; + + return 0; } /** - * ice_fill_tbl - Reads content of a single table type into database + * ice_chk_pkg_compat * @hw: pointer to the hardware structure - * @block_id: Block ID of the table to copy - * @sid: Section ID of the table to copy + * @ospkg: pointer to the package hdr + * @seg: pointer to the package segment hdr * - * Will attempt to read the entire content of a given table of a single block - * into the driver database. We assume that the buffer will always - * be as large or larger than the data contained in the package. If - * this condition is not met, there is most likely an error in the package - * contents. 
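The policy encoded by ice_chk_pkg_version above is compact: a major version of 0xFF (a custom DDP package) is always accepted, and anything else must match the driver's supported major/minor pair exactly. A standalone restatement, with made-up values for the two supported-version constants:

#include <stdio.h>
#include <stdbool.h>

#define DEMO_SUPP_MAJ 1	/* stand-ins for ICE_PKG_SUPP_VER_MAJ/MNR */
#define DEMO_SUPP_MNR 3

static bool demo_pkg_compatible(unsigned char major, unsigned char minor)
{
	if (major == 0xFF)	/* custom DDP packages bypass the check */
		return true;
	return major == DEMO_SUPP_MAJ && minor == DEMO_SUPP_MNR;
}

int main(void)
{
	printf("%d %d %d\n",
	       demo_pkg_compatible(1, 3),	/* 1: exact match */
	       demo_pkg_compatible(1, 4),	/* 0: minor differs */
	       demo_pkg_compatible(0xFF, 0));	/* 1: custom DDP */
	return 0;
}

ice_chk_pkg_compat, whose body follows, then applies a looser rule against the package stored in the NVM: the majors must be equal and the file's minor must not be newer than the NVM's.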
+ * This function checks the package version compatibility with driver and NVM */ -static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid) +static enum ice_status +ice_chk_pkg_compat(struct ice_hw *hw, struct ice_pkg_hdr *ospkg, + struct ice_seg **seg) { - u32 dst_len, sect_len, offset = 0; - struct ice_prof_redir_section *pr; - struct ice_prof_id_section *pid; - struct ice_xlt1_section *xlt1; - struct ice_xlt2_section *xlt2; - struct ice_sw_fv_section *es; - struct ice_pkg_enum state; - u8 *src, *dst; - void *sect; + struct ice_aqc_get_pkg_info_resp *pkg; + enum ice_status status; + u16 size; + u32 i; - /* if the HW segment pointer is null then the first iteration of - * ice_pkg_enum_section() will fail. In this case the HW tables will - * not be filled and return success. - */ - if (!hw->seg) { - ice_debug(hw, ICE_DBG_PKG, "hw->seg is NULL, tables are not filled\n"); - return; + /* Check package version compatibility */ + status = ice_chk_pkg_version(&hw->pkg_ver); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Package version check failed.\n"); + return status; } - memset(&state, 0, sizeof(state)); + /* find ICE segment in given package */ + *seg = (struct ice_seg *)ice_find_seg_in_pkg(hw, SEGMENT_TYPE_ICE, + ospkg); + if (!*seg) { + ice_debug(hw, ICE_DBG_INIT, "no ice segment in package.\n"); + return ICE_ERR_CFG; + } - sect = ice_pkg_enum_section(hw->seg, &state, sid); + /* Check if FW is compatible with the OS package */ + size = struct_size(pkg, pkg_info, ICE_PKG_CNT); + pkg = devm_kzalloc(ice_hw_to_dev(hw), size, GFP_KERNEL); + if (!pkg) + return ICE_ERR_NO_MEMORY; + + status = ice_aq_get_pkg_info_list(hw, pkg, size, NULL); + if (status) + goto fw_ddp_compat_free_alloc; + + for (i = 0; i < le32_to_cpu(pkg->count); i++) { + /* loop till we find the NVM package */ + if (!pkg->pkg_info[i].is_in_nvm) + continue; + if ((*seg)->hdr.seg_format_ver.major != + pkg->pkg_info[i].ver.major || + (*seg)->hdr.seg_format_ver.minor > + pkg->pkg_info[i].ver.minor) { + status = ICE_ERR_FW_DDP_MISMATCH; + ice_debug(hw, ICE_DBG_INIT, "OS package is not compatible with NVM.\n"); + } + /* done processing NVM package so break */ + break; + } +fw_ddp_compat_free_alloc: + devm_kfree(ice_hw_to_dev(hw), pkg); + return status; +} + +/** + * ice_sw_fv_handler + * @sect_type: section type + * @section: pointer to section + * @index: index of the field vector entry to be returned + * @offset: ptr to variable that receives the offset in the field vector table + * + * This is a callback function that can be passed to ice_pkg_enum_entry. + * This function treats the given section as of type ice_sw_fv_section and + * enumerates offset field. "offset" is an index into the field vector table. + */ +static void * +ice_sw_fv_handler(u32 sect_type, void *section, u32 index, u32 *offset) +{ + struct ice_sw_fv_section *fv_section = section; + + if (!section || sect_type != ICE_SID_FLD_VEC_SW) + return NULL; + if (index >= le16_to_cpu(fv_section->count)) + return NULL; + if (offset) + /* "index" passed in to this function is relative to a given + * 4k block. 
To get the true index into the field vector + * table, add the relative index to the base_offset + * field of this section + */ + *offset = le16_to_cpu(fv_section->base_offset) + index; + return fv_section->fv + index; +} + +/** + * ice_get_prof_index_max - get the max profile index for used profiles + * @hw: pointer to the HW struct + * + * Calling this function will get the max profile index for used profiles + * and store the index number in struct ice_switch_info *switch_info + * in hw for later use. + */ +static int ice_get_prof_index_max(struct ice_hw *hw) +{ + u16 prof_index = 0, j, max_prof_index = 0; + struct ice_pkg_enum state; + struct ice_seg *ice_seg; + bool flag = false; + struct ice_fv *fv; + u32 offset; + + memset(&state, 0, sizeof(state)); + + if (!hw->seg) + return ICE_ERR_PARAM; + + ice_seg = hw->seg; + + do { + fv = ice_pkg_enum_entry(ice_seg, &state, ICE_SID_FLD_VEC_SW, + &offset, ice_sw_fv_handler); + if (!fv) + break; + ice_seg = NULL; + + /* in a profile that is not used, the prot_id is set to 0xff + * and the off is set to 0x1ff for all the field vectors. + */ + for (j = 0; j < hw->blk[ICE_BLK_SW].es.fvw; j++) + if (fv->ew[j].prot_id != ICE_PROT_INVALID || + fv->ew[j].off != ICE_FV_OFFSET_INVAL) + flag = true; + if (flag && prof_index > max_prof_index) + max_prof_index = prof_index; + + prof_index++; + flag = false; + } while (fv); + + hw->switch_info->max_used_prof_index = max_prof_index; + + return 0; +} + +/** + * ice_init_pkg - initialize/download package + * @hw: pointer to the hardware structure + * @buf: pointer to the package buffer + * @len: size of the package buffer + * + * This function initializes a package. The package contains HW tables + * required to do packet processing. First, the function extracts package + * information such as version. Then it finds the ice configuration segment + * within the package; this function then saves a copy of the segment pointer + * within the supplied package buffer. Next, the function will cache any hints + * from the package, followed by downloading the package itself. Note that if + * a previous PF driver has already downloaded the package successfully, then + * the current driver will not have to download the package again. + * + * The local package contents will be used to query default behavior and to + * update specific sections of the HW's version of the package (e.g. to update + * the parse graph to understand new protocols). + * + * This function stores a pointer to the package buffer memory, and it is + * expected that the supplied buffer will not be freed immediately. If the + * package buffer needs to be freed, such as when read from a file, use + * ice_copy_and_init_pkg() instead of directly calling ice_init_pkg() in this + * case. 
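Given the unused-profile convention noted in ice_get_prof_index_max above (every word reads prot_id 0xff and offset 0x1ff), finding the highest used profile index reduces to scanning each field vector for any word that deviates. A small standalone version with fixed-size stand-in types:

#include <stdio.h>
#include <stdint.h>

#define DEMO_PROT_INVALID 0xff	/* stand-in for ICE_PROT_INVALID */
#define DEMO_OFF_INVALID  0x1ff	/* stand-in for ICE_FV_OFFSET_INVAL */
#define DEMO_FVW          4	/* words per field vector */

struct demo_fv_word { uint8_t prot_id; uint16_t off; };

/* A profile is "used" if any of its words carries a real extraction. */
static int demo_fv_used(const struct demo_fv_word *fv)
{
	int j;

	for (j = 0; j < DEMO_FVW; j++)
		if (fv[j].prot_id != DEMO_PROT_INVALID ||
		    fv[j].off != DEMO_OFF_INVALID)
			return 1;
	return 0;
}

int main(void)
{
	struct demo_fv_word profs[3][DEMO_FVW] = {
		{ { 0x20, 12 }, { 0xff, 0x1ff }, { 0xff, 0x1ff }, { 0xff, 0x1ff } },
		{ { 0xff, 0x1ff }, { 0xff, 0x1ff }, { 0xff, 0x1ff }, { 0xff, 0x1ff } },
		{ { 0x31, 0 }, { 0xff, 0x1ff }, { 0xff, 0x1ff }, { 0xff, 0x1ff } },
	};
	int i, max_used = -1;

	for (i = 0; i < 3; i++)
		if (demo_fv_used(profs[i]))
			max_used = i;

	printf("max used profile index: %d\n", max_used);	/* 2 */
	return 0;
}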
+ */ +enum ice_status ice_init_pkg(struct ice_hw *hw, u8 *buf, u32 len) +{ + struct ice_pkg_hdr *pkg; + enum ice_status status; + struct ice_seg *seg; + + if (!buf || !len) + return ICE_ERR_PARAM; + + pkg = (struct ice_pkg_hdr *)buf; + status = ice_verify_pkg(pkg, len); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "failed to verify pkg (err: %d)\n", + status); + return status; + } + + /* initialize package info */ + status = ice_init_pkg_info(hw, pkg); + if (status) + return status; + + /* before downloading the package, check package version for + * compatibility with driver + */ + status = ice_chk_pkg_compat(hw, pkg, &seg); + if (status) + return status; + + /* initialize package hints and then download package */ + ice_init_pkg_hints(hw, seg); + status = ice_download_pkg(hw, seg); + if (status == ICE_ERR_AQ_NO_WORK) { + ice_debug(hw, ICE_DBG_INIT, "package previously loaded - no work.\n"); + status = 0; + } + + /* Get information on the package currently loaded in HW, then make sure + * the driver is compatible with this version. + */ + if (!status) { + status = ice_get_pkg_info(hw); + if (!status) + status = ice_chk_pkg_version(&hw->active_pkg_ver); + } + + if (!status) { + hw->seg = seg; + /* on successful package download update other required + * registers to support the package and fill HW tables + * with package content. + */ + ice_init_pkg_regs(hw); + ice_fill_blk_tbls(hw); + ice_fill_hw_ptype(hw); + ice_get_prof_index_max(hw); + } else { + ice_debug(hw, ICE_DBG_INIT, "package load failed, %d\n", + status); + } + + return status; +} + +/** + * ice_copy_and_init_pkg - initialize/download a copy of the package + * @hw: pointer to the hardware structure + * @buf: pointer to the package buffer + * @len: size of the package buffer + * + * This function copies the package buffer, and then calls ice_init_pkg() to + * initialize the copied package contents. + * + * The copying is necessary if the package buffer supplied is constant, or if + * the memory may disappear shortly after calling this function. + * + * If the package buffer resides in the data segment and can be modified, the + * caller is free to use ice_init_pkg() instead of ice_copy_and_init_pkg(). + * + * However, if the package buffer needs to be copied first, such as when being + * read from a file, the caller should use ice_copy_and_init_pkg(). + * + * This function will first copy the package buffer, before calling + * ice_init_pkg(). The caller is free to immediately destroy the original + * package buffer, as the new copy will be managed by this function and + * related routines. + */ +enum ice_status ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, u32 len) +{ + enum ice_status status; + u8 *buf_copy; + + if (!buf || !len) + return ICE_ERR_PARAM; + + buf_copy = devm_kmemdup(ice_hw_to_dev(hw), buf, len, GFP_KERNEL); + + status = ice_init_pkg(hw, buf_copy, len); + if (status) { + /* Free the copy, since we failed to initialize the package */ + devm_kfree(ice_hw_to_dev(hw), buf_copy); + } else { + /* Track the copied pkg so we can free it later */ + hw->pkg_copy = buf_copy; + hw->pkg_size = len; + } + + return status; +} + +/** + * ice_pkg_buf_alloc + * @hw: pointer to the HW structure + * + * Allocates a package buffer and returns a pointer to the buffer header. + * Note: all package contents must be in Little Endian form. 
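+ *
+ * A sketch of the typical build/update cycle around this allocator
+ * (section ID and size are illustrative; error paths elided):
+ *
+ *	struct ice_buf_build *bld = ice_pkg_buf_alloc(hw);
+ *	void *sect;
+ *
+ *	if (bld && !ice_pkg_buf_reserve_section(bld, 1)) {
+ *		sect = ice_pkg_buf_alloc_section(bld, sid, size);
+ *		// ... fill *sect, Little Endian ...
+ *		ice_update_pkg(hw, ice_pkg_buf(bld), 1);
+ *	}
+ *	ice_pkg_buf_free(hw, bld);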
+ */ +static struct ice_buf_build *ice_pkg_buf_alloc(struct ice_hw *hw) +{ + struct ice_buf_build *bld; + struct ice_buf_hdr *buf; + + bld = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*bld), GFP_KERNEL); + if (!bld) + return NULL; + + buf = (struct ice_buf_hdr *)bld; + buf->data_end = cpu_to_le16(offsetof(struct ice_buf_hdr, + section_entry)); + return bld; +} + +/** + * ice_get_sw_prof_type - determine switch profile type + * @hw: pointer to the HW structure + * @fv: pointer to the switch field vector + */ +static enum ice_prof_type +ice_get_sw_prof_type(struct ice_hw *hw, struct ice_fv *fv) +{ + u16 i; + + for (i = 0; i < hw->blk[ICE_BLK_SW].es.fvw; i++) { + /* UDP tunnel will have UDP_OF protocol ID and VNI offset */ + if (fv->ew[i].prot_id == (u8)ICE_PROT_UDP_OF && + fv->ew[i].off == ICE_VNI_OFFSET) + return ICE_PROF_TUN_UDP; + + /* GRE tunnel will have GRE protocol */ + if (fv->ew[i].prot_id == (u8)ICE_PROT_GRE_OF) + return ICE_PROF_TUN_GRE; + } + + return ICE_PROF_NON_TUN; +} + +/** + * ice_get_sw_fv_bitmap - Get switch field vector bitmap based on profile type + * @hw: pointer to hardware structure + * @req_profs: type of profiles requested + * @bm: pointer to memory for returning the bitmap of field vectors + */ +void +ice_get_sw_fv_bitmap(struct ice_hw *hw, enum ice_prof_type req_profs, + unsigned long *bm) +{ + struct ice_pkg_enum state; + struct ice_seg *ice_seg; + struct ice_fv *fv; + + if (req_profs == ICE_PROF_ALL) { + bitmap_set(bm, 0, ICE_MAX_NUM_PROFILES); + return; + } + + memset(&state, 0, sizeof(state)); + bitmap_zero(bm, ICE_MAX_NUM_PROFILES); + ice_seg = hw->seg; + do { + enum ice_prof_type prof_type; + u32 offset; + + fv = ice_pkg_enum_entry(ice_seg, &state, ICE_SID_FLD_VEC_SW, + &offset, ice_sw_fv_handler); + ice_seg = NULL; + + if (fv) { + /* Determine field vector type */ + prof_type = ice_get_sw_prof_type(hw, fv); + + if (req_profs & prof_type) + set_bit((u16)offset, bm); + } + } while (fv); +} + +/** + * ice_get_sw_fv_list + * @hw: pointer to the HW structure + * @prot_ids: field vector to search for with a given protocol ID + * @ids_cnt: lookup/protocol count + * @bm: bitmap of field vectors to consider + * @fv_list: Head of a list + * + * Finds all the field vector entries from switch block that contain + * a given protocol ID and returns a list of structures of type + * "ice_sw_fv_list_entry". Every structure in the list has a field vector + * definition and profile ID information + * NOTE: The caller of the function is responsible for freeing the memory + * allocated for every list entry. + */ +enum ice_status +ice_get_sw_fv_list(struct ice_hw *hw, u8 *prot_ids, u16 ids_cnt, + unsigned long *bm, struct list_head *fv_list) +{ + struct ice_sw_fv_list_entry *fvl; + struct ice_sw_fv_list_entry *tmp; + struct ice_pkg_enum state; + struct ice_seg *ice_seg; + struct ice_fv *fv; + u32 offset; + + memset(&state, 0, sizeof(state)); + + if (!ids_cnt || !hw->seg) + return ICE_ERR_PARAM; + + ice_seg = hw->seg; + do { + u16 i; + + fv = ice_pkg_enum_entry(ice_seg, &state, ICE_SID_FLD_VEC_SW, + &offset, ice_sw_fv_handler); + if (!fv) + break; + ice_seg = NULL; + + /* If field vector is not in the bitmap list, then skip this + * profile. + */ + if (!test_bit((u16)offset, bm)) + continue; + + for (i = 0; i < ids_cnt; i++) { + int j; + + /* This code assumes that if a switch field vector line + * has a matching protocol, then this line will contain + * the entries necessary to represent every field in + * that protocol header. 
+ */ + for (j = 0; j < hw->blk[ICE_BLK_SW].es.fvw; j++) + if (fv->ew[j].prot_id == prot_ids[i]) + break; + if (j >= hw->blk[ICE_BLK_SW].es.fvw) + break; + if (i + 1 == ids_cnt) { + fvl = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*fvl), GFP_KERNEL); + if (!fvl) + goto err; + fvl->fv_ptr = fv; + fvl->profile_id = offset; + list_add(&fvl->list_entry, fv_list); + break; + } + } + } while (fv); + if (list_empty(fv_list)) + return ICE_ERR_CFG; + return 0; + +err: + list_for_each_entry_safe(fvl, tmp, fv_list, list_entry) { + list_del(&fvl->list_entry); + devm_kfree(ice_hw_to_dev(hw), fvl); + } + + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_init_prof_result_bm - Initialize the profile result index bitmap + * @hw: pointer to hardware structure + */ +void ice_init_prof_result_bm(struct ice_hw *hw) +{ + struct ice_pkg_enum state; + struct ice_seg *ice_seg; + struct ice_fv *fv; + + memset(&state, 0, sizeof(state)); + + if (!hw->seg) + return; + + ice_seg = hw->seg; + do { + u32 off; + u16 i; + + fv = ice_pkg_enum_entry(ice_seg, &state, ICE_SID_FLD_VEC_SW, + &off, ice_sw_fv_handler); + ice_seg = NULL; + if (!fv) + break; + + bitmap_zero(hw->switch_info->prof_res_bm[off], + ICE_MAX_FV_WORDS); + + /* Determine empty field vector indices, these can be + * used for recipe results. Skip index 0, since it is + * always used for Switch ID. + */ + for (i = 1; i < ICE_MAX_FV_WORDS; i++) + if (fv->ew[i].prot_id == ICE_PROT_INVALID && + fv->ew[i].off == ICE_FV_OFFSET_INVAL) + set_bit(i, hw->switch_info->prof_res_bm[off]); + } while (fv); +} + +/** + * ice_pkg_buf_free + * @hw: pointer to the HW structure + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * + * Frees a package buffer + */ +void ice_pkg_buf_free(struct ice_hw *hw, struct ice_buf_build *bld) +{ + devm_kfree(ice_hw_to_dev(hw), bld); +} + +/** + * ice_pkg_buf_reserve_section + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * @count: the number of sections to reserve + * + * Reserves one or more section table entries in a package buffer. This routine + * can be called multiple times as long as they are made before calling + * ice_pkg_buf_alloc_section(). Once ice_pkg_buf_alloc_section() + * is called once, the number of sections that can be allocated will not be able + * to be increased; not using all reserved sections is fine, but this will + * result in some wasted space in the buffer. + * Note: all package contents must be in Little Endian form. + */ +static enum ice_status +ice_pkg_buf_reserve_section(struct ice_buf_build *bld, u16 count) +{ + struct ice_buf_hdr *buf; + u16 section_count; + u16 data_end; + + if (!bld) + return ICE_ERR_PARAM; + + buf = (struct ice_buf_hdr *)&bld->buf; + + /* already an active section, can't increase table size */ + section_count = le16_to_cpu(buf->section_count); + if (section_count > 0) + return ICE_ERR_CFG; + + if (bld->reserved_section_table_entries + count > ICE_MAX_S_COUNT) + return ICE_ERR_CFG; + bld->reserved_section_table_entries += count; + + data_end = le16_to_cpu(buf->data_end) + + flex_array_size(buf, section_entry, count); + buf->data_end = cpu_to_le16(data_end); + + return 0; +} + +/** + * ice_pkg_buf_alloc_section + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * @type: the section type value + * @size: the size of the section to reserve (in bytes) + * + * Reserves memory in the buffer for a section's content and updates the + * buffers' status accordingly. 
This routine returns a pointer to the first + * byte of the section start within the buffer, which is used to fill in the + * section contents. + * Note: all package contents must be in Little Endian form. + */ +static void * +ice_pkg_buf_alloc_section(struct ice_buf_build *bld, u32 type, u16 size) +{ + struct ice_buf_hdr *buf; + u16 sect_count; + u16 data_end; + + if (!bld || !type || !size) + return NULL; + + buf = (struct ice_buf_hdr *)&bld->buf; + + /* check for enough space left in buffer */ + data_end = le16_to_cpu(buf->data_end); + + /* section start must align on 4 byte boundary */ + data_end = ALIGN(data_end, 4); + + if ((data_end + size) > ICE_MAX_S_DATA_END) + return NULL; + + /* check for more available section table entries */ + sect_count = le16_to_cpu(buf->section_count); + if (sect_count < bld->reserved_section_table_entries) { + void *section_ptr = ((u8 *)buf) + data_end; + + buf->section_entry[sect_count].offset = cpu_to_le16(data_end); + buf->section_entry[sect_count].size = cpu_to_le16(size); + buf->section_entry[sect_count].type = cpu_to_le32(type); + + data_end += size; + buf->data_end = cpu_to_le16(data_end); + + buf->section_count = cpu_to_le16(sect_count + 1); + return section_ptr; + } + + /* no free section table entries */ + return NULL; +} + +/** + * ice_pkg_buf_alloc_single_section + * @hw: pointer to the HW structure + * @type: the section type value + * @size: the size of the section to reserve (in bytes) + * @section: returns pointer to the section + * + * Allocates a package buffer with a single section. + * Note: all package contents must be in Little Endian form. + */ +struct ice_buf_build * +ice_pkg_buf_alloc_single_section(struct ice_hw *hw, u32 type, u16 size, + void **section) +{ + struct ice_buf_build *buf; + + if (!section) + return NULL; + + buf = ice_pkg_buf_alloc(hw); + if (!buf) + return NULL; + + if (ice_pkg_buf_reserve_section(buf, 1)) + goto ice_pkg_buf_alloc_single_section_err; + + *section = ice_pkg_buf_alloc_section(buf, type, size); + if (!*section) + goto ice_pkg_buf_alloc_single_section_err; + + return buf; + +ice_pkg_buf_alloc_single_section_err: + ice_pkg_buf_free(hw, buf); + return NULL; +} + +/** + * ice_pkg_buf_unreserve_section + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * @count: the number of sections to unreserve + * + * Unreserves one or more section table entries in a package buffer, releasing + * space that can be used for section data. This routine can be called + * multiple times as long as they are made before calling + * ice_pkg_buf_alloc_section(). Once ice_pkg_buf_alloc_section() + * is called once, the number of sections that can be allocated will not be able + * to be increased; not using all reserved sections is fine, but this will + * result in some wasted space in the buffer. + * Note: all package contents must be in Little Endian form. 
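+ *
+ * As a worked example (assuming the 8-byte section table entry implied
+ * by flex_array_size() above): reserving two sections moves data_end
+ * forward by 16 bytes; unreserving one of them before any section is
+ * allocated returns 8 of those bytes to the data area.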
+ */
+enum ice_status
+ice_pkg_buf_unreserve_section(struct ice_buf_build *bld, u16 count)
+{
+	struct ice_buf_hdr *buf;
+	u16 section_count;
+	u16 data_end;
+
+	if (!bld)
+		return ICE_ERR_PARAM;
+
+	buf = (struct ice_buf_hdr *)&bld->buf;
+
+	/* already an active section, can't decrease table size */
+	section_count = le16_to_cpu(buf->section_count);
+	if (section_count > 0)
+		return ICE_ERR_CFG;
+
+	if (count > bld->reserved_section_table_entries)
+		return ICE_ERR_CFG;
+	bld->reserved_section_table_entries -= count;
+
+	data_end = le16_to_cpu(buf->data_end) -
+		flex_array_size(buf, section_entry, count);
+	buf->data_end = cpu_to_le16(data_end);
+
+	return 0;
+}
+
+/**
+ * ice_pkg_buf_get_free_space
+ * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc())
+ *
+ * Returns the number of free bytes remaining in the buffer.
+ * Note: all package contents must be in Little Endian form.
+ */
+u16 ice_pkg_buf_get_free_space(struct ice_buf_build *bld)
+{
+	struct ice_buf_hdr *buf;
+
+	if (!bld)
+		return 0;
+
+	buf = (struct ice_buf_hdr *)&bld->buf;
+	return ICE_MAX_S_DATA_END - le16_to_cpu(buf->data_end);
+}
+
+/**
+ * ice_pkg_buf_get_active_sections
+ * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc())
+ *
+ * Returns the number of active sections. Before using the package buffer
+ * in an update package command, the caller should make sure that there is at
+ * least one active section - otherwise, the buffer is not legal and should
+ * not be used.
+ * Note: all package contents must be in Little Endian form.
+ */
+static u16 ice_pkg_buf_get_active_sections(struct ice_buf_build *bld)
+{
+	struct ice_buf_hdr *buf;
+
+	if (!bld)
+		return 0;
+
+	buf = (struct ice_buf_hdr *)&bld->buf;
+	return le16_to_cpu(buf->section_count);
+}
+
+/**
+ * ice_pkg_buf
+ * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc())
+ *
+ * Return a pointer to the buffer's header
+ */
+struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld)
+{
+	if (!bld)
+		return NULL;
+
+	return &bld->buf;
+}
+
+/**
+ * ice_tunnel_port_in_use_hlpr - helper function to determine tunnel usage
+ * @hw: pointer to the HW structure
+ * @port: port to search for
+ * @index: optionally returns index
+ *
+ * Returns whether a port is already in use as a tunnel, and optionally its
+ * index
+ */
+static bool ice_tunnel_port_in_use_hlpr(struct ice_hw *hw, u16 port, u16 *index)
+{
+	u16 i;
+
+	for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++)
+		if (hw->tnl.tbl[i].in_use && hw->tnl.tbl[i].port == port) {
+			if (index)
+				*index = i;
+			return true;
+		}
+
+	return false;
+}
+
+/**
+ * ice_tunnel_port_in_use
+ * @hw: pointer to the HW structure
+ * @port: port to search for
+ * @index: optionally returns index
+ *
+ * Returns whether a port is already in use as a tunnel, and optionally its
+ * index
+ */
+bool ice_tunnel_port_in_use(struct ice_hw *hw, u16 port, u16 *index)
+{
+	bool res;
+
+	mutex_lock(&hw->tnl_lock);
+	res = ice_tunnel_port_in_use_hlpr(hw, port, index);
+	mutex_unlock(&hw->tnl_lock);
+
+	return res;
+}
+
+/**
+ * ice_tunnel_get_type
+ * @hw: pointer to the HW structure
+ * @port: port to search for
+ * @type: returns tunnel type
+ *
+ * For a given port number, will return the type of tunnel.
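+ *
+ * A usage sketch (handle_tunnel() is a hypothetical caller helper):
+ *
+ *	enum ice_tunnel_type type;
+ *
+ *	if (ice_tunnel_get_type(hw, port, &type))
+ *		handle_tunnel(type);	// port is an active tunnel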
+ */ +bool +ice_tunnel_get_type(struct ice_hw *hw, u16 port, enum ice_tunnel_type *type) +{ + bool res = false; + u16 i; + + mutex_lock(&hw->tnl_lock); + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].in_use && hw->tnl.tbl[i].port == port) { + *type = hw->tnl.tbl[i].type; + res = true; + break; + } + + mutex_unlock(&hw->tnl_lock); + + return res; +} + +/** + * ice_find_free_tunnel_entry + * @hw: pointer to the HW structure + * @type: tunnel type + * @index: optionally returns index + * + * Returns whether there is a free tunnel entry, and optionally its index + */ +static bool +ice_find_free_tunnel_entry(struct ice_hw *hw, enum ice_tunnel_type type, + u16 *index) +{ + u16 i; + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && !hw->tnl.tbl[i].in_use && + hw->tnl.tbl[i].type == type) { + if (index) + *index = i; + return true; + } + + return false; +} + +/** + * ice_get_open_tunnel_port - retrieve an open tunnel port + * @hw: pointer to the HW structure + * @type: tunnel type (TNL_ALL will return any open port) + * @port: returns open port + */ +bool +ice_get_open_tunnel_port(struct ice_hw *hw, enum ice_tunnel_type type, + u16 *port) +{ + bool res = false; + u16 i; + + mutex_lock(&hw->tnl_lock); + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].in_use && + (type == TNL_ALL || hw->tnl.tbl[i].type == type)) { + *port = hw->tnl.tbl[i].port; + res = true; + break; + } + + mutex_unlock(&hw->tnl_lock); + + return res; +} + +/** + * ice_is_create_tunnel_possible + * @hw: pointer to the HW structure + * @type: type of tunnel + * @port: port of tunnel to create + * + * Function returns ICE_SUCCESS if a tunnel can be created using specified + * tunnel type and port. If the tunnel is already present in hardware then + * ICE_ERR_ALREADY_EXISTS is returned, or if there's no space, then + * ICE_ERR_OUT_OF_RANGE. 
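+ *
+ * A pre-check sketch (illustrative; assumes a VXLAN entry type such as
+ * TNL_VXLAN from enum ice_tunnel_type):
+ *
+ *	if (!ice_is_create_tunnel_possible(hw, TNL_VXLAN, port))
+ *		status = ice_create_tunnel(hw, TNL_VXLAN, port);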
+ */
+enum ice_status
+ice_is_create_tunnel_possible(struct ice_hw *hw, enum ice_tunnel_type type,
+			      u16 port)
+{
+	u16 index;
+
+	if (ice_tunnel_port_in_use_hlpr(hw, port, &index))
+		return ICE_ERR_ALREADY_EXISTS;
+
+	if (!ice_find_free_tunnel_entry(hw, type, &index))
+		return ICE_ERR_OUT_OF_RANGE;
+
+	return 0;
+}
+
+/**
+ * ice_is_tunnel_empty - check if udp tunnel is empty
+ * @hw: pointer to the HW structure
+ */
+bool ice_is_tunnel_empty(struct ice_hw *hw)
+{
+	int i;
+
+	for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++)
+		if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].in_use)
+			return false;
+	return true;
+}
+
+/**
+ * ice_upd_dvm_boost_entry
+ * @hw: pointer to the HW structure
+ * @entry: pointer to double vlan boost entry info
+ */
+static enum ice_status
+ice_upd_dvm_boost_entry(struct ice_hw *hw, struct ice_dvm_entry *entry)
+{
+	struct ice_boost_tcam_section *sect_rx, *sect_tx;
+	enum ice_status status = ICE_ERR_MAX_LIMIT;
+	struct ice_buf_build *bld;
+	u8 val, dc, nm;
+
+	bld = ice_pkg_buf_alloc(hw);
+	if (!bld)
+		return ICE_ERR_NO_MEMORY;
+
+	/* allocate 2 sections, one for Rx parser, one for Tx parser */
+	if (ice_pkg_buf_reserve_section(bld, 2))
+		goto ice_upd_dvm_boost_entry_err;
+
+	sect_rx = ice_pkg_buf_alloc_section(bld, ICE_SID_RXPARSER_BOOST_TCAM,
+					    struct_size(sect_rx, tcam, 1));
+	if (!sect_rx)
+		goto ice_upd_dvm_boost_entry_err;
+	sect_rx->count = cpu_to_le16(1);
+
+	sect_tx = ice_pkg_buf_alloc_section(bld, ICE_SID_TXPARSER_BOOST_TCAM,
+					    struct_size(sect_tx, tcam, 1));
+	if (!sect_tx)
+		goto ice_upd_dvm_boost_entry_err;
+	sect_tx->count = cpu_to_le16(1);
+
+	/* copy original boost entry to update package buffer */
+	memcpy(sect_rx->tcam, entry->boost_entry, sizeof(*sect_rx->tcam));
+
+	/* re-write the don't care and never match bits accordingly */
+	if (entry->enable) {
+		/* all bits are don't care */
+		val = 0x00;
+		dc = 0xFF;
+		nm = 0x00;
+	} else {
+		/* disable, one never match bit, the rest are don't care */
+		val = 0x00;
+		dc = 0xF7;
+		nm = 0x08;
+	}
+
+	ice_set_key((u8 *)&sect_rx->tcam[0].key, sizeof(sect_rx->tcam[0].key),
+		    &val, NULL, &dc, &nm, 0, sizeof(u8));
+
+	/* exact copy of entry to Tx section entry */
+	memcpy(sect_tx->tcam, sect_rx->tcam, sizeof(*sect_tx->tcam));
+
+	status = ice_update_pkg_no_lock(hw, ice_pkg_buf(bld), 1);
+
+ice_upd_dvm_boost_entry_err:
+	ice_pkg_buf_free(hw, bld);
+
+	return status;
+}
+
+/**
+ * ice_set_dvm_boost_entries
+ * @hw: pointer to the HW structure
+ *
+ * Enable double vlan by updating the appropriate boost tcam entries.
+ */
+enum ice_status ice_set_dvm_boost_entries(struct ice_hw *hw)
+{
+	enum ice_status status;
+	u16 i;
+
+	for (i = 0; i < hw->dvm_upd.count; i++) {
+		status = ice_upd_dvm_boost_entry(hw, &hw->dvm_upd.tbl[i]);
+		if (status)
+			return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_create_tunnel
+ * @hw: pointer to the HW structure
+ * @type: type of tunnel
+ * @port: port of tunnel to create
+ *
+ * Create a tunnel by updating the parse graph in the parser. We do that by
+ * creating a package buffer with the tunnel info and issuing an update package
+ * command.
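+ *
+ * Creating a port that already exists only bumps its reference count,
+ * e.g. (illustrative, assuming TNL_VXLAN):
+ *
+ *	ice_create_tunnel(hw, TNL_VXLAN, port);	// ref = 1, TCAM written
+ *	ice_create_tunnel(hw, TNL_VXLAN, port);	// ref = 2, no HW update
+ *	ice_destroy_tunnel(hw, port, false);	// ref = 1, entry stays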
+ */
+enum ice_status
+ice_create_tunnel(struct ice_hw *hw, enum ice_tunnel_type type, u16 port)
+{
+	struct ice_boost_tcam_section *sect_rx, *sect_tx;
+	enum ice_status status = ICE_ERR_MAX_LIMIT;
+	struct ice_buf_build *bld;
+	u16 index;
+
+	mutex_lock(&hw->tnl_lock);
+
+	if (ice_tunnel_port_in_use_hlpr(hw, port, &index)) {
+		hw->tnl.tbl[index].ref++;
+		status = 0;
+		goto ice_create_tunnel_end;
+	}
+
+	if (!ice_find_free_tunnel_entry(hw, type, &index)) {
+		status = ICE_ERR_OUT_OF_RANGE;
+		goto ice_create_tunnel_end;
+	}
+
+	bld = ice_pkg_buf_alloc(hw);
+	if (!bld) {
+		status = ICE_ERR_NO_MEMORY;
+		goto ice_create_tunnel_end;
+	}
+
+	/* allocate 2 sections, one for Rx parser, one for Tx parser */
+	if (ice_pkg_buf_reserve_section(bld, 2))
+		goto ice_create_tunnel_err;
+
+	sect_rx = ice_pkg_buf_alloc_section(bld, ICE_SID_RXPARSER_BOOST_TCAM,
+					    struct_size(sect_rx, tcam, 1));
+	if (!sect_rx)
+		goto ice_create_tunnel_err;
+	sect_rx->count = cpu_to_le16(1);
+
+	sect_tx = ice_pkg_buf_alloc_section(bld, ICE_SID_TXPARSER_BOOST_TCAM,
+					    struct_size(sect_tx, tcam, 1));
+	if (!sect_tx)
+		goto ice_create_tunnel_err;
+	sect_tx->count = cpu_to_le16(1);
+
+	/* copy original boost entry to update package buffer */
+	memcpy(sect_rx->tcam, hw->tnl.tbl[index].boost_entry,
+	       sizeof(*sect_rx->tcam));
+
+	/* over-write the never-match dest port key bits with the encoded port
+	 * bits
+	 */
+	ice_set_key((u8 *)&sect_rx->tcam[0].key, sizeof(sect_rx->tcam[0].key),
+		    (u8 *)&port, NULL, NULL, NULL,
+		    (u16)offsetof(struct ice_boost_key_value, hv_dst_port_key),
+		    sizeof(sect_rx->tcam[0].key.key.hv_dst_port_key));
+
+	/* exact copy of entry to Tx section entry */
+	memcpy(sect_tx->tcam, sect_rx->tcam, sizeof(*sect_tx->tcam));
+
+	status = ice_update_pkg(hw, ice_pkg_buf(bld), 1);
+	if (!status) {
+		hw->tnl.tbl[index].port = port;
+		hw->tnl.tbl[index].in_use = true;
+		hw->tnl.tbl[index].ref = 1;
+	}
+
+ice_create_tunnel_err:
+	ice_pkg_buf_free(hw, bld);
+
+ice_create_tunnel_end:
+	mutex_unlock(&hw->tnl_lock);
+
+	return status;
+}
+
+/**
+ * ice_destroy_tunnel
+ * @hw: pointer to the HW structure
+ * @port: port of tunnel to destroy (ignored if the all parameter is true)
+ * @all: flag that states to destroy all tunnels
+ *
+ * Destroys a tunnel or all tunnels by creating an update package buffer
+ * targeting the specific updates requested and then performing an update
+ * package.
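+ *
+ * For example, tearing down every tunnel in a single update package
+ * call (the port argument is ignored when @all is true):
+ *
+ *	status = ice_destroy_tunnel(hw, 0, true);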
+ */ +enum ice_status ice_destroy_tunnel(struct ice_hw *hw, u16 port, bool all) +{ + struct ice_boost_tcam_section *sect_rx, *sect_tx; + enum ice_status status = ICE_ERR_MAX_LIMIT; + struct ice_buf_build *bld; + u16 count = 0; + u16 index; + u16 size; + u16 i, j; + + mutex_lock(&hw->tnl_lock); + + if (!all && ice_tunnel_port_in_use_hlpr(hw, port, &index)) + if (hw->tnl.tbl[index].ref > 1) { + hw->tnl.tbl[index].ref--; + status = 0; + goto ice_destroy_tunnel_end; + } + + /* determine count */ + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].in_use && + (all || hw->tnl.tbl[i].port == port)) + count++; + + if (!count) { + status = ICE_ERR_PARAM; + goto ice_destroy_tunnel_end; + } + + /* size of section - there is at least one entry */ + size = struct_size(sect_rx, tcam, count); + + bld = ice_pkg_buf_alloc(hw); + if (!bld) { + status = ICE_ERR_NO_MEMORY; + goto ice_destroy_tunnel_end; + } + + /* allocate 2 sections, one for Rx parser, one for Tx parser */ + if (ice_pkg_buf_reserve_section(bld, 2)) + goto ice_destroy_tunnel_err; + + sect_rx = ice_pkg_buf_alloc_section(bld, ICE_SID_RXPARSER_BOOST_TCAM, + size); + if (!sect_rx) + goto ice_destroy_tunnel_err; + sect_rx->count = cpu_to_le16(count); + + sect_tx = ice_pkg_buf_alloc_section(bld, ICE_SID_TXPARSER_BOOST_TCAM, + size); + if (!sect_tx) + goto ice_destroy_tunnel_err; + sect_tx->count = cpu_to_le16(count); + + /* copy original boost entry to update package buffer, one copy to Rx + * section, another copy to the Tx section + */ + for (i = 0, j = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].valid && hw->tnl.tbl[i].in_use && + (all || hw->tnl.tbl[i].port == port)) { + memcpy(sect_rx->tcam + j, hw->tnl.tbl[i].boost_entry, + sizeof(*sect_rx->tcam)); + memcpy(sect_tx->tcam + j, hw->tnl.tbl[i].boost_entry, + sizeof(*sect_tx->tcam)); + hw->tnl.tbl[i].marked = true; + j++; + } + + status = ice_update_pkg(hw, ice_pkg_buf(bld), 1); + if (!status) + for (i = 0; i < hw->tnl.count && + i < ICE_TUNNEL_MAX_ENTRIES; i++) + if (hw->tnl.tbl[i].marked) { + hw->tnl.tbl[i].ref = 0; + hw->tnl.tbl[i].port = 0; + hw->tnl.tbl[i].in_use = false; + hw->tnl.tbl[i].marked = false; + } + +ice_destroy_tunnel_err: + ice_pkg_buf_free(hw, bld); + +ice_destroy_tunnel_end: + mutex_unlock(&hw->tnl_lock); + + return status; +} + +/** + * ice_replay_tunnels + * @hw: pointer to the HW structure + * + * Replays all tunnels + */ +enum ice_status ice_replay_tunnels(struct ice_hw *hw) +{ + enum ice_status status = 0; + u16 i; + + for (i = 0; i < hw->tnl.count && i < ICE_TUNNEL_MAX_ENTRIES; i++) { + enum ice_tunnel_type type = hw->tnl.tbl[i].type; + u16 refs = hw->tnl.tbl[i].ref; + u16 port = hw->tnl.tbl[i].port; + + if (!hw->tnl.tbl[i].in_use) + continue; + + /* Replay tunnels one at a time by destroying them, then + * recreating them + */ + hw->tnl.tbl[i].ref = 1; /* make sure to destroy in one call */ + status = ice_destroy_tunnel(hw, port, false); + if (status) { + ice_debug(hw, ICE_DBG_PKG, "ERR: 0x%x - destroy tunnel port 0x%x\n", + status, port); + break; + } + + status = ice_create_tunnel(hw, type, port); + if (status) { + ice_debug(hw, ICE_DBG_PKG, "ERR: 0x%x - create tunnel port 0x%x\n", + status, port); + break; + } + + /* reset to original ref count */ + hw->tnl.tbl[i].ref = refs; + } + + return status; +} + +/** + * ice_find_prot_off - find prot ID and offset pair, based on prof and FV index + * @hw: pointer to the hardware structure + * @blk: hardware block + * @prof: profile 
ID
+ * @fv_idx: field vector word index
+ * @prot: variable to receive the protocol ID
+ * @off: variable to receive the protocol offset
+ */
+enum ice_status
+ice_find_prot_off(struct ice_hw *hw, enum ice_block blk, u8 prof, u16 fv_idx,
+		  u8 *prot, u16 *off)
+{
+	struct ice_fv_word *fv_ext;
+
+	if (prof >= hw->blk[blk].es.count)
+		return ICE_ERR_PARAM;
+
+	if (fv_idx >= hw->blk[blk].es.fvw)
+		return ICE_ERR_PARAM;
+
+	fv_ext = hw->blk[blk].es.t + (prof * hw->blk[blk].es.fvw);
+
+	*prot = fv_ext[fv_idx].prot_id;
+	*off = fv_ext[fv_idx].off;
+
+	return 0;
+}
+
+/* PTG Management */
+
+/**
+ * ice_ptg_update_xlt1 - Updates packet type groups in HW via XLT1 table
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ *
+ * This function will update the XLT1 hardware table to reflect the new
+ * packet type group configuration.
+ */
+enum ice_status ice_ptg_update_xlt1(struct ice_hw *hw, enum ice_block blk)
+{
+	struct ice_xlt1_section *sect;
+	struct ice_buf_build *bld;
+	enum ice_status status;
+	u16 index;
+
+	bld = ice_pkg_buf_alloc_single_section(hw, ice_sect_id(blk, ICE_XLT1),
+					       struct_size(sect, value, ICE_XLT1_CNT),
+					       (void **)&sect);
+	if (!bld)
+		return ICE_ERR_NO_MEMORY;
+
+	sect->count = cpu_to_le16(ICE_XLT1_CNT);
+	sect->offset = cpu_to_le16(0);
+	for (index = 0; index < ICE_XLT1_CNT; index++)
+		sect->value[index] = hw->blk[blk].xlt1.ptypes[index].ptg;
+
+	status = ice_update_pkg(hw, ice_pkg_buf(bld), 1);
+
+	ice_pkg_buf_free(hw, bld);
+
+	return status;
+}
+
+/**
+ * ice_ptg_find_ptype - Search for packet type group using packet type (ptype)
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @ptype: the ptype to search for
+ * @ptg: pointer to variable that receives the PTG
+ *
+ * This function will search the PTGs for a particular ptype, returning the
+ * PTG ID that contains it through the PTG parameter, with the value of
+ * ICE_DEFAULT_PTG (0) meaning it is part of the default PTG.
+ */
+static enum ice_status
+ice_ptg_find_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 *ptg)
+{
+	if (ptype >= ICE_XLT1_CNT || !ptg)
+		return ICE_ERR_PARAM;
+
+	*ptg = hw->blk[blk].xlt1.ptypes[ptype].ptg;
+	return 0;
+}
+
+/**
+ * ice_ptg_alloc_val - Allocates a new packet type group ID by value
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @ptg: the PTG to allocate
+ *
+ * This function allocates a given packet type group ID specified by the PTG
+ * parameter.
+ */
+static void ice_ptg_alloc_val(struct ice_hw *hw, enum ice_block blk, u8 ptg)
+{
+	hw->blk[blk].xlt1.ptg_tbl[ptg].in_use = true;
+}
+
+/**
+ * ice_ptg_free - Frees a packet type group
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @ptg: the PTG ID to free
+ *
+ * This function frees a packet type group, and returns all the current ptypes
+ * within it to the default PTG.
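+ *
+ * The PTG keeps its members on a singly linked list rooted at
+ * first_ptype; a sketch of walking that list (illustrative only):
+ *
+ *	struct ice_ptg_ptype *p = hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype;
+ *	u16 members = 0;
+ *
+ *	while (p) {
+ *		members++;	// ptype index is p - hw->blk[blk].xlt1.ptypes
+ *		p = p->next_ptype;
+ *	}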
+ */
+void ice_ptg_free(struct ice_hw *hw, enum ice_block blk, u8 ptg)
+{
+	struct ice_ptg_ptype *p, *temp;
+
+	hw->blk[blk].xlt1.ptg_tbl[ptg].in_use = false;
+	p = hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype;
+	while (p) {
+		p->ptg = ICE_DEFAULT_PTG;
+		temp = p->next_ptype;
+		p->next_ptype = NULL;
+		p = temp;
+	}
+
+	hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype = NULL;
+}
+
+/**
+ * ice_ptg_remove_ptype - Removes ptype from a particular packet type group
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @ptype: the ptype to remove
+ * @ptg: the PTG to remove the ptype from
+ *
+ * This function will remove the ptype from the specific PTG, and move it to
+ * the default PTG (ICE_DEFAULT_PTG).
+ */
+static enum ice_status
+ice_ptg_remove_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 ptg)
+{
+	struct ice_ptg_ptype **ch;
+	struct ice_ptg_ptype *p;
+
+	if (ptype > ICE_XLT1_CNT - 1)
+		return ICE_ERR_PARAM;
+
+	if (!hw->blk[blk].xlt1.ptg_tbl[ptg].in_use)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	/* Should not happen if .in_use is set, bad config */
+	if (!hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype)
+		return ICE_ERR_CFG;
+
+	/* find the ptype within this PTG, and bypass the link over it */
+	p = hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype;
+	ch = &hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype;
+	while (p) {
+		if (ptype == (p - hw->blk[blk].xlt1.ptypes)) {
+			*ch = p->next_ptype;
+			break;
+		}
+
+		ch = &p->next_ptype;
+		p = p->next_ptype;
+	}
+
+	hw->blk[blk].xlt1.ptypes[ptype].ptg = ICE_DEFAULT_PTG;
+	hw->blk[blk].xlt1.ptypes[ptype].next_ptype = NULL;
+
+	return 0;
+}
+
+/**
+ * ice_ptg_add_mv_ptype - Adds/moves ptype to a particular packet type group
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @ptype: the ptype to add or move
+ * @ptg: the PTG to add or move the ptype to
+ *
+ * This function will either add or move a ptype to a particular PTG depending
+ * on whether the ptype is already part of another group. Note that using a
+ * destination PTG ID of ICE_DEFAULT_PTG (0) will move the ptype to the
+ * default PTG.
+ */
+static enum ice_status
+ice_ptg_add_mv_ptype(struct ice_hw *hw, enum ice_block blk, u16 ptype, u8 ptg)
+{
+	enum ice_status status;
+	u8 original_ptg;
+
+	if (ptype > ICE_XLT1_CNT - 1)
+		return ICE_ERR_PARAM;
+
+	if (!hw->blk[blk].xlt1.ptg_tbl[ptg].in_use && ptg != ICE_DEFAULT_PTG)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	status = ice_ptg_find_ptype(hw, blk, ptype, &original_ptg);
+	if (status)
+		return status;
+
+	/* Is ptype already in the correct PTG? */
+	if (original_ptg == ptg)
+		return 0;
+
+	/* Remove from original PTG and move back to the default PTG */
+	if (original_ptg != ICE_DEFAULT_PTG)
+		ice_ptg_remove_ptype(hw, blk, ptype, original_ptg);
+
+	/* Moving to default PTG?
Then we're done with this request */ + if (ptg == ICE_DEFAULT_PTG) + return 0; + + /* Add ptype to PTG at beginning of list */ + hw->blk[blk].xlt1.ptypes[ptype].next_ptype = + hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype; + hw->blk[blk].xlt1.ptg_tbl[ptg].first_ptype = + &hw->blk[blk].xlt1.ptypes[ptype]; + + hw->blk[blk].xlt1.ptypes[ptype].ptg = ptg; + hw->blk[blk].xlt1.t[ptype] = ptg; + + return 0; +} + +/* Block / table size info */ +struct ice_blk_size_details { + u16 xlt1; /* # XLT1 entries */ + u16 xlt2; /* # XLT2 entries */ + u16 prof_tcam; /* # profile ID TCAM entries */ + u16 prof_id; /* # profile IDs */ + u8 prof_cdid_bits; /* # CDID one-hot bits used in key */ + u16 prof_redir; /* # profile redirection entries */ + u16 es; /* # extraction sequence entries */ + u16 fvw; /* # field vector words */ + u8 overwrite; /* overwrite existing entries allowed */ + u8 reverse; /* reverse FV order */ +}; + +static const struct ice_blk_size_details blk_sizes[ICE_BLK_COUNT] = { + /** + * Table Definitions + * XLT1 - Number of entries in XLT1 table + * XLT2 - Number of entries in XLT2 table + * TCAM - Number of entries Profile ID TCAM table + * CDID - Control Domain ID of the hardware block + * PRED - Number of entries in the Profile Redirection Table + * FV - Number of entries in the Field Vector + * FVW - Width (in WORDs) of the Field Vector + * OVR - Overwrite existing table entries + * REV - Reverse FV + */ + /* XLT1 , XLT2 ,TCAM, PID,CDID,PRED, FV, FVW */ + /* Overwrite , Reverse FV */ + /* SW */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 256, 0, 256, 256, 48, + false, false }, + /* ACL */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 32, + false, false }, + /* FD */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 24, + false, true }, + /* RSS */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 512, 128, 0, 128, 128, 24, + true, true }, + /* PE */ { ICE_XLT1_CNT, ICE_XLT2_CNT, 64, 32, 0, 32, 32, 24, + false, false }, +}; + +enum ice_sid_all { + ICE_SID_XLT1_OFF = 0, + ICE_SID_XLT2_OFF, + ICE_SID_PR_OFF, + ICE_SID_PR_REDIR_OFF, + ICE_SID_ES_OFF, + ICE_SID_OFF_COUNT, +}; + +/* Characteristic handling */ + +/** + * ice_match_prop_lst - determine if properties of two lists match + * @list1: first properties list + * @list2: second properties list + * + * Count, cookies and the order must match in order to be considered equivalent. + */ +static bool +ice_match_prop_lst(struct list_head *list1, struct list_head *list2) +{ + struct ice_vsig_prof *tmp1; + struct ice_vsig_prof *tmp2; + u16 chk_count = 0; + u16 count = 0; + + /* compare counts */ + list_for_each_entry(tmp1, list1, list) + count++; + list_for_each_entry(tmp2, list2, list) + chk_count++; + /* cppcheck-suppress knownConditionTrueFalse */ + if (!count || count != chk_count) + return false; + + tmp1 = list_first_entry(list1, struct ice_vsig_prof, list); + tmp2 = list_first_entry(list2, struct ice_vsig_prof, list); + + /* profile cookies must compare, and in the exact same order to take + * into account priority + */ + while (count--) { + if (tmp2->profile_cookie != tmp1->profile_cookie) + return false; + + tmp1 = list_next_entry(tmp1, list); + tmp2 = list_next_entry(tmp2, list); + } + + return true; +} + +/* VSIG Management */ + +/** + * ice_vsig_update_xlt2_sect - update one section of XLT2 table + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsi: HW VSI number to program + * @vsig: VSIG for the VSI + * + * This function will update the XLT2 hardware table with the input VSI + * group configuration. 
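+ *
+ * The section format generalizes to ranges: offset is the first VSI and
+ * value[] carries one VSIG per VSI. A sketch for two consecutive VSIs
+ * (illustrative; error handling elided):
+ *
+ *	struct ice_xlt2_section *sect;
+ *	struct ice_buf_build *bld;
+ *
+ *	bld = ice_pkg_buf_alloc_single_section(hw, ice_sect_id(blk, ICE_XLT2),
+ *					       struct_size(sect, value, 2),
+ *					       (void **)&sect);
+ *	sect->count = cpu_to_le16(2);
+ *	sect->offset = cpu_to_le16(first_vsi);
+ *	sect->value[0] = cpu_to_le16(vsig_a);
+ *	sect->value[1] = cpu_to_le16(vsig_b);
+ *	ice_update_pkg(hw, ice_pkg_buf(bld), 1);
+ *	ice_pkg_buf_free(hw, bld);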
+ */
+static enum ice_status
+ice_vsig_update_xlt2_sect(struct ice_hw *hw, enum ice_block blk, u16 vsi,
+			  u16 vsig)
+{
+	struct ice_xlt2_section *sect;
+	struct ice_buf_build *bld;
+	enum ice_status status;
+
+	bld = ice_pkg_buf_alloc_single_section(hw, ice_sect_id(blk, ICE_XLT2),
+					       struct_size(sect, value, 1),
+					       (void **)&sect);
+	if (!bld)
+		return ICE_ERR_NO_MEMORY;
+
+	sect->count = cpu_to_le16(1);
+	sect->offset = cpu_to_le16(vsi);
+	sect->value[0] = cpu_to_le16(vsig);
+
+	status = ice_update_pkg(hw, ice_pkg_buf(bld), 1);
+
+	ice_pkg_buf_free(hw, bld);
+
+	return status;
+}
+
+/**
+ * ice_vsig_update_xlt2 - update XLT2 table with VSIG configuration
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ *
+ * This function will update the XLT2 hardware table with the input VSI
+ * group configuration of used vsis.
+ */
+enum ice_status ice_vsig_update_xlt2(struct ice_hw *hw, enum ice_block blk)
+{
+	u16 vsi;
+
+	for (vsi = 0; vsi < ICE_MAX_VSI; vsi++) {
+		/* update only vsis that have been changed */
+		if (hw->blk[blk].xlt2.vsis[vsi].changed) {
+			enum ice_status status;
+			u16 vsig;
+
+			vsig = hw->blk[blk].xlt2.vsis[vsi].vsig;
+			status = ice_vsig_update_xlt2_sect(hw, blk, vsi, vsig);
+			if (status)
+				return status;
+
+			hw->blk[blk].xlt2.vsis[vsi].changed = 0;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ice_vsig_find_vsi - find a VSIG that contains a specified VSI
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @vsi: VSI of interest
+ * @vsig: pointer to receive the VSI group
+ *
+ * This function will lookup the VSI entry in the XLT2 list and return
+ * the VSI group it is associated with.
+ */
+static enum ice_status
+ice_vsig_find_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 *vsig)
+{
+	if (!vsig || vsi >= ICE_MAX_VSI)
+		return ICE_ERR_PARAM;
+
+	/* As long as there's a default or valid VSIG associated with the input
+	 * VSI, the function returns success. Any handling of VSIG will be
+	 * done by the following add, update or remove functions.
+	 */
+	*vsig = hw->blk[blk].xlt2.vsis[vsi].vsig;
+
+	return 0;
+}
+
+/**
+ * ice_vsig_alloc_val - allocate a new VSIG by value
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @vsig: the VSIG to allocate
+ *
+ * This function will allocate a given VSIG specified by the VSIG parameter.
+ */
+static u16 ice_vsig_alloc_val(struct ice_hw *hw, enum ice_block blk, u16 vsig)
+{
+	u16 idx = vsig & ICE_VSIG_IDX_M;
+
+	if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) {
+		INIT_LIST_HEAD(&hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst);
+		hw->blk[blk].xlt2.vsig_tbl[idx].in_use = true;
+	}
+
+	return ICE_VSIG_VALUE(idx, hw->pf_id);
+}
+
+/**
+ * ice_vsig_alloc - Finds a free entry and allocates a new VSIG
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ *
+ * This function will iterate through the VSIG list and mark the first
+ * unused entry for the new VSIG entry as used and return that value.
+ */
+static u16 ice_vsig_alloc(struct ice_hw *hw, enum ice_block blk)
+{
+	u16 i;
+
+	for (i = 1; i < ICE_MAX_VSIGS; i++)
+		if (!hw->blk[blk].xlt2.vsig_tbl[i].in_use)
+			return ice_vsig_alloc_val(hw, blk, i);
+
+	return ICE_DEFAULT_VSIG;
+}
+
+/**
+ * ice_find_dup_props_vsig - find VSI group with a specified set of properties
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @chs: characteristic list
+ * @vsig: returns the VSIG with the matching profiles, if found
+ *
+ * Each VSIG is associated with a characteristic set; i.e. all VSIs under
+ * a group have the same characteristic set.
To check if there exists a VSIG
+ * which has the same characteristics as the input characteristics, this
+ * function will iterate through the XLT2 list and return the VSIG that has a
+ * matching configuration. In order to make sure that priorities are accounted
+ * for, the list must match exactly, including the order in which the
+ * characteristics are listed.
+ */
+static enum ice_status
+ice_find_dup_props_vsig(struct ice_hw *hw, enum ice_block blk,
+			struct list_head *chs, u16 *vsig)
+{
+	struct ice_xlt2 *xlt2 = &hw->blk[blk].xlt2;
+	u16 i;
+
+	for (i = 0; i < xlt2->count; i++)
+		if (xlt2->vsig_tbl[i].in_use &&
+		    ice_match_prop_lst(chs, &xlt2->vsig_tbl[i].prop_lst)) {
+			*vsig = ICE_VSIG_VALUE(i, hw->pf_id);
+			return 0;
+		}
+
+	return ICE_ERR_DOES_NOT_EXIST;
+}
+
+/**
+ * ice_vsig_free - free VSI group
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @vsig: VSIG to remove
+ *
+ * The function will remove all VSIs associated with the input VSIG and move
+ * them to the DEFAULT_VSIG and mark the VSIG available.
+ */
+static enum ice_status
+ice_vsig_free(struct ice_hw *hw, enum ice_block blk, u16 vsig)
+{
+	struct ice_vsig_prof *dtmp, *del;
+	struct ice_vsig_vsi *vsi_cur;
+	u16 idx;
+
+	idx = vsig & ICE_VSIG_IDX_M;
+	if (idx >= ICE_MAX_VSIGS)
+		return ICE_ERR_PARAM;
+
+	if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	hw->blk[blk].xlt2.vsig_tbl[idx].in_use = false;
+
+	vsi_cur = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi;
+	/* If the VSIG has at least 1 VSI then iterate through the
+	 * list and remove the VSIs before deleting the group.
+	 */
+	if (vsi_cur) {
+		/* remove all vsis associated with this VSIG XLT2 entry */
+		do {
+			struct ice_vsig_vsi *tmp = vsi_cur->next_vsi;
+
+			vsi_cur->vsig = ICE_DEFAULT_VSIG;
+			vsi_cur->changed = 1;
+			vsi_cur->next_vsi = NULL;
+			vsi_cur = tmp;
+		} while (vsi_cur);
+
+		/* NULL terminate head of VSI list */
+		hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi = NULL;
+	}
+
+	/* free characteristic list */
+	list_for_each_entry_safe(del, dtmp,
+				 &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst,
+				 list) {
+		list_del(&del->list);
+		devm_kfree(ice_hw_to_dev(hw), del);
+	}
+
+	/* if VSIG characteristic list was cleared for reset
+	 * re-initialize the list head
+	 */
+	INIT_LIST_HEAD(&hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst);
+
+	return 0;
+}
+
+/**
+ * ice_vsig_remove_vsi - remove VSI from VSIG
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @vsi: VSI to remove
+ * @vsig: VSI group to remove from
+ *
+ * The function will remove the input VSI from its VSI group and move it
+ * to the DEFAULT_VSIG.
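+ *
+ * Removal only updates the software database; the change reaches
+ * hardware on the next XLT2 flush, e.g. (illustrative):
+ *
+ *	ice_vsig_remove_vsi(hw, blk, vsi, vsig); // back to ICE_DEFAULT_VSIG
+ *	ice_vsig_update_xlt2(hw, blk);		 // write changed VSIs to HW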
+ */ +static enum ice_status +ice_vsig_remove_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +{ + struct ice_vsig_vsi **vsi_head, *vsi_cur, *vsi_tgt; + u16 idx; + + idx = vsig & ICE_VSIG_IDX_M; + + if (vsi >= ICE_MAX_VSI || idx >= ICE_MAX_VSIGS) + return ICE_ERR_PARAM; + + if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) + return ICE_ERR_DOES_NOT_EXIST; + + /* entry already in default VSIG, don't have to remove */ + if (idx == ICE_DEFAULT_VSIG) + return 0; + + vsi_head = &hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + if (!(*vsi_head)) + return ICE_ERR_CFG; + + vsi_tgt = &hw->blk[blk].xlt2.vsis[vsi]; + vsi_cur = (*vsi_head); + + /* iterate the VSI list, skip over the entry to be removed */ + while (vsi_cur) { + if (vsi_tgt == vsi_cur) { + (*vsi_head) = vsi_cur->next_vsi; + break; + } + vsi_head = &vsi_cur->next_vsi; + vsi_cur = vsi_cur->next_vsi; + } + + /* verify if VSI was removed from group list */ + if (!vsi_cur) + return ICE_ERR_DOES_NOT_EXIST; + + vsi_cur->vsig = ICE_DEFAULT_VSIG; + vsi_cur->changed = 1; + vsi_cur->next_vsi = NULL; + + return 0; +} + +/** + * ice_vsig_add_mv_vsi - add or move a VSI to a VSI group + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsi: VSI to move + * @vsig: destination VSI group + * + * This function will move or add the input VSI to the target VSIG. + * The function will find the original VSIG the VSI belongs to and + * move the entry to the DEFAULT_VSIG, update the original VSIG and + * then move entry to the new VSIG. + */ +static enum ice_status +ice_vsig_add_mv_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +{ + struct ice_vsig_vsi *tmp; + enum ice_status status; + u16 orig_vsig, idx; + + idx = vsig & ICE_VSIG_IDX_M; + + if (vsi >= ICE_MAX_VSI || idx >= ICE_MAX_VSIGS) + return ICE_ERR_PARAM; + + /* if VSIG not in use and VSIG is not default type this VSIG + * doesn't exist. 
+	 */
+	if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use &&
+	    vsig != ICE_DEFAULT_VSIG)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	status = ice_vsig_find_vsi(hw, blk, vsi, &orig_vsig);
+	if (status)
+		return status;
+
+	/* no update required if vsigs match */
+	if (orig_vsig == vsig)
+		return 0;
+
+	if (orig_vsig != ICE_DEFAULT_VSIG) {
+		/* remove entry from orig_vsig and add to default VSIG */
+		status = ice_vsig_remove_vsi(hw, blk, vsi, orig_vsig);
+		if (status)
+			return status;
+	}
+
+	if (idx == ICE_DEFAULT_VSIG)
+		return 0;
+
+	/* Create VSI entry and add VSIG and prop_mask values */
+	hw->blk[blk].xlt2.vsis[vsi].vsig = vsig;
+	hw->blk[blk].xlt2.vsis[vsi].changed = 1;
+
+	/* Add new entry to the head of the VSIG list */
+	tmp = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi;
+	hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi =
+		&hw->blk[blk].xlt2.vsis[vsi];
+	hw->blk[blk].xlt2.vsis[vsi].next_vsi = tmp;
+	hw->blk[blk].xlt2.t[vsi] = vsig;
+
+	return 0;
+}
+
+/**
+ * ice_prof_has_mask_idx - determine if profile index masking is identical
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @prof: profile to check
+ * @idx: profile index to check
+ * @mask: mask to match
+ */
+static bool
+ice_prof_has_mask_idx(struct ice_hw *hw, enum ice_block blk, u8 prof, u16 idx,
+		      u16 mask)
+{
+	bool expect_no_mask = false;
+	bool found = false;
+	bool match = false;
+	u16 i;
+
+	/* If mask is 0x0000 or 0xffff, then there is no masking */
+	if (mask == 0 || mask == 0xffff)
+		expect_no_mask = true;
+
+	/* Scan the enabled masks on this profile, for the specified idx */
+	for (i = hw->blk[blk].masks.first; i < hw->blk[blk].masks.first +
+	     hw->blk[blk].masks.count; i++)
+		if (hw->blk[blk].es.mask_ena[prof] & BIT(i))
+			if (hw->blk[blk].masks.masks[i].in_use &&
+			    hw->blk[blk].masks.masks[i].idx == idx) {
+				found = true;
+				if (hw->blk[blk].masks.masks[i].mask == mask)
+					match = true;
+				break;
+			}
+
+	if (expect_no_mask) {
+		if (found)
+			return false;
+	} else {
+		if (!match)
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * ice_prof_has_mask - determine if profile masking is identical
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @prof: profile to check
+ * @masks: masks to match
+ */
+static bool
+ice_prof_has_mask(struct ice_hw *hw, enum ice_block blk, u8 prof, u16 *masks)
+{
+	u16 i;
+
+	/* es->mask_ena[prof] will have the mask */
+	for (i = 0; i < hw->blk[blk].es.fvw; i++)
+		if (!ice_prof_has_mask_idx(hw, blk, prof, i, masks[i]))
+			return false;
+
+	return true;
+}
+
+/**
+ * ice_find_prof_id_with_mask - find profile ID for a given field vector
+ * @hw: pointer to the hardware structure
+ * @blk: HW block
+ * @fv: field vector to search for
+ * @masks: masks for fv
+ * @prof_id: receives the profile ID
+ */
+static enum ice_status
+ice_find_prof_id_with_mask(struct ice_hw *hw, enum ice_block blk,
+			   struct ice_fv_word *fv, u16 *masks, u8 *prof_id)
+{
+	struct ice_es *es = &hw->blk[blk].es;
+	u8 i;
+
+	/* For FD and RSS we don't want to re-use an existing profile with the
+	 * same field vector and mask. This will cause rule interference.
+ */ + if (blk == ICE_BLK_FD || blk == ICE_BLK_RSS) + return ICE_ERR_DOES_NOT_EXIST; + + for (i = 0; i < (u8)es->count; i++) { + u16 off = i * es->fvw; + + if (memcmp(&es->t[off], fv, es->fvw * sizeof(*fv))) + continue; + + /* check if masks settings are the same for this profile */ + if (masks && !ice_prof_has_mask(hw, blk, i, masks)) + continue; + + *prof_id = i; + return 0; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_prof_id_rsrc_type - get profile ID resource type for a block type + * @blk: the block type + * @rsrc_type: pointer to variable to receive the resource type + */ +static bool ice_prof_id_rsrc_type(enum ice_block blk, u16 *rsrc_type) +{ + switch (blk) { + case ICE_BLK_SW: + *rsrc_type = ICE_AQC_RES_TYPE_SWITCH_PROF_BLDR_PROFID; + break; + case ICE_BLK_ACL: + *rsrc_type = ICE_AQC_RES_TYPE_ACL_PROF_BLDR_PROFID; + break; + case ICE_BLK_FD: + *rsrc_type = ICE_AQC_RES_TYPE_FD_PROF_BLDR_PROFID; + break; + case ICE_BLK_RSS: + *rsrc_type = ICE_AQC_RES_TYPE_HASH_PROF_BLDR_PROFID; + break; + case ICE_BLK_PE: + *rsrc_type = ICE_AQC_RES_TYPE_QHASH_PROF_BLDR_PROFID; + break; + default: + return false; + } + return true; +} + +/** + * ice_tcam_ent_rsrc_type - get TCAM entry resource type for a block type + * @blk: the block type + * @rsrc_type: pointer to variable to receive the resource type + */ +static bool ice_tcam_ent_rsrc_type(enum ice_block blk, u16 *rsrc_type) +{ + switch (blk) { + case ICE_BLK_SW: + *rsrc_type = ICE_AQC_RES_TYPE_SWITCH_PROF_BLDR_TCAM; + break; + case ICE_BLK_ACL: + *rsrc_type = ICE_AQC_RES_TYPE_ACL_PROF_BLDR_TCAM; + break; + case ICE_BLK_FD: + *rsrc_type = ICE_AQC_RES_TYPE_FD_PROF_BLDR_TCAM; + break; + case ICE_BLK_RSS: + *rsrc_type = ICE_AQC_RES_TYPE_HASH_PROF_BLDR_TCAM; + break; + case ICE_BLK_PE: + *rsrc_type = ICE_AQC_RES_TYPE_QHASH_PROF_BLDR_TCAM; + break; + default: + return false; + } + return true; +} + +/** + * ice_alloc_tcam_ent - allocate hardware TCAM entry + * @hw: pointer to the HW struct + * @blk: the block to allocate the TCAM for + * @btm: true to allocate from bottom of table, false to allocate from top + * @tcam_idx: pointer to variable to receive the TCAM entry + * + * This function allocates a new entry in a Profile ID TCAM for a specific + * block. + */ +static enum ice_status +ice_alloc_tcam_ent(struct ice_hw *hw, enum ice_block blk, bool btm, + u16 *tcam_idx) +{ + u16 res_type; + + if (!ice_tcam_ent_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_alloc_hw_res(hw, res_type, 1, btm, tcam_idx); +} + +/** + * ice_free_tcam_ent - free hardware TCAM entry + * @hw: pointer to the HW struct + * @blk: the block from which to free the TCAM entry + * @tcam_idx: the TCAM entry to free + * + * This function frees an entry in a Profile ID TCAM for a specific block. + */ +static enum ice_status +ice_free_tcam_ent(struct ice_hw *hw, enum ice_block blk, u16 tcam_idx) +{ + u16 res_type; + + if (!ice_tcam_ent_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_free_hw_res(hw, res_type, 1, &tcam_idx); +} + +/** + * ice_alloc_prof_id - allocate profile ID + * @hw: pointer to the HW struct + * @blk: the block to allocate the profile ID for + * @prof_id: pointer to variable to receive the profile ID + * + * This function allocates a new profile ID, which also corresponds to a Field + * Vector (Extraction Sequence) entry. 
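+ *
+ * A sketch of the allocate/program/track flow built from helpers in
+ * this file (illustrative; error handling elided):
+ *
+ *	u8 prof_id;
+ *
+ *	if (!ice_alloc_prof_id(hw, blk, &prof_id)) {
+ *		ice_write_es(hw, blk, prof_id, fv);	// program the FV
+ *		ice_prof_inc_ref(hw, blk, prof_id);	// track usage
+ *	}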
+ */ +static enum ice_status +ice_alloc_prof_id(struct ice_hw *hw, enum ice_block blk, u8 *prof_id) +{ + enum ice_status status; + u16 res_type; + u16 get_prof; + + if (!ice_prof_id_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + status = ice_alloc_hw_res(hw, res_type, 1, false, &get_prof); + if (!status) + *prof_id = (u8)get_prof; + + return status; +} + +/** + * ice_free_prof_id - free profile ID + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID to free + * + * This function frees a profile ID, which also corresponds to a Field Vector. + */ +static enum ice_status +ice_free_prof_id(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + u16 tmp_prof_id = (u16)prof_id; + u16 res_type; + + if (!ice_prof_id_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_free_hw_res(hw, res_type, 1, &tmp_prof_id); +} + +/** + * ice_prof_inc_ref - increment reference count for profile + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID for which to increment the reference count + */ +static enum ice_status +ice_prof_inc_ref(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + if (prof_id > hw->blk[blk].es.count) + return ICE_ERR_PARAM; + + hw->blk[blk].es.ref_count[prof_id]++; + + return 0; +} + +/** + * ice_write_prof_mask_reg - write profile mask register + * @hw: pointer to the HW struct + * @blk: hardware block + * @mask_idx: mask index + * @idx: index of the FV which will use the mask + * @mask: the 16-bit mask + */ +static void +ice_write_prof_mask_reg(struct ice_hw *hw, enum ice_block blk, u16 mask_idx, + u16 idx, u16 mask) +{ + u32 offset; + u32 val; + + switch (blk) { + case ICE_BLK_RSS: + offset = GLQF_HMASK(mask_idx); + val = (idx << GLQF_HMASK_MSK_INDEX_S) & + GLQF_HMASK_MSK_INDEX_M; + val |= (mask << GLQF_HMASK_MASK_S) & GLQF_HMASK_MASK_M; + break; + case ICE_BLK_FD: + offset = GLQF_FDMASK(mask_idx); + val = (idx << GLQF_FDMASK_MSK_INDEX_S) & + GLQF_FDMASK_MSK_INDEX_M; + val |= (mask << GLQF_FDMASK_MASK_S) & + GLQF_FDMASK_MASK_M; + break; + default: + ice_debug(hw, ICE_DBG_PKG, "No profile masks for block %d\n", + blk); + return; + } + + wr32(hw, offset, val); + ice_debug(hw, ICE_DBG_PKG, "write mask, blk %d (%d): %x = %x\n", + blk, idx, offset, val); +} + +/** + * ice_write_prof_mask_enable_res - write profile mask enable register + * @hw: pointer to the HW struct + * @blk: hardware block + * @prof_id: profile ID + * @enable_mask: enable mask + */ +static void +ice_write_prof_mask_enable_res(struct ice_hw *hw, enum ice_block blk, + u16 prof_id, u32 enable_mask) +{ + u32 offset; + + switch (blk) { + case ICE_BLK_RSS: + offset = GLQF_HMASK_SEL(prof_id); + break; + case ICE_BLK_FD: + offset = GLQF_FDMASK_SEL(prof_id); + break; + default: + ice_debug(hw, ICE_DBG_PKG, "No profile masks for block %d\n", + blk); + return; + } + + wr32(hw, offset, enable_mask); + ice_debug(hw, ICE_DBG_PKG, "write mask enable, blk %d (%d): %x = %x\n", + blk, prof_id, offset, enable_mask); +} + +/** + * ice_init_prof_masks - initial prof masks + * @hw: pointer to the HW struct + * @blk: hardware block + */ +static void ice_init_prof_masks(struct ice_hw *hw, enum ice_block blk) +{ + u16 per_pf; + u16 i; + + mutex_init(&hw->blk[blk].masks.lock); + + per_pf = ICE_PROF_MASK_COUNT / hw->dev_caps.num_funcs; + + hw->blk[blk].masks.count = per_pf; + hw->blk[blk].masks.first = hw->pf_id * per_pf; + + memset(hw->blk[blk].masks.masks, 0, sizeof(hw->blk[blk].masks.masks)); + 
+	for (i = hw->blk[blk].masks.first;
+	     i < hw->blk[blk].masks.first + hw->blk[blk].masks.count; i++)
+		ice_write_prof_mask_reg(hw, blk, i, 0, 0);
+}
+
+/**
+ * ice_init_all_prof_masks - initial all prof masks
+ * @hw: pointer to the HW struct
+ */
+void ice_init_all_prof_masks(struct ice_hw *hw)
+{
+	ice_init_prof_masks(hw, ICE_BLK_RSS);
+	ice_init_prof_masks(hw, ICE_BLK_FD);
+}
+
+/**
+ * ice_alloc_prof_mask - allocate profile mask
+ * @hw: pointer to the HW struct
+ * @blk: hardware block
+ * @idx: index of FV which will use the mask
+ * @mask: the 16-bit mask
+ * @mask_idx: variable to receive the mask index
+ */
+static enum ice_status
+ice_alloc_prof_mask(struct ice_hw *hw, enum ice_block blk, u16 idx, u16 mask,
+		    u16 *mask_idx)
+{
+	bool found_unused = false, found_copy = false;
+	enum ice_status status = ICE_ERR_MAX_LIMIT;
+	u16 unused_idx = 0, copy_idx = 0;
+	u16 i;
+
+	if (blk != ICE_BLK_RSS && blk != ICE_BLK_FD)
+		return ICE_ERR_PARAM;
+
+	mutex_lock(&hw->blk[blk].masks.lock);
+
+	for (i = hw->blk[blk].masks.first;
+	     i < hw->blk[blk].masks.first + hw->blk[blk].masks.count; i++)
+		if (hw->blk[blk].masks.masks[i].in_use) {
+			/* if mask is in use and it exactly duplicates the
+			 * desired mask and index, then it can be reused
+			 */
+			if (hw->blk[blk].masks.masks[i].mask == mask &&
+			    hw->blk[blk].masks.masks[i].idx == idx) {
+				found_copy = true;
+				copy_idx = i;
+				break;
+			}
+		} else {
+			/* save off unused index, but keep searching in case
+			 * there is an exact match later on
+			 */
+			if (!found_unused) {
+				found_unused = true;
+				unused_idx = i;
+			}
+		}
+
+	if (found_copy)
+		i = copy_idx;
+	else if (found_unused)
+		i = unused_idx;
+	else
+		goto err_ice_alloc_prof_mask;
+
+	/* update mask for a new entry */
+	if (found_unused) {
+		hw->blk[blk].masks.masks[i].in_use = true;
+		hw->blk[blk].masks.masks[i].mask = mask;
+		hw->blk[blk].masks.masks[i].idx = idx;
+		hw->blk[blk].masks.masks[i].ref = 0;
+		ice_write_prof_mask_reg(hw, blk, i, idx, mask);
+	}
+
+	hw->blk[blk].masks.masks[i].ref++;
+	*mask_idx = i;
+	status = 0;
+
+err_ice_alloc_prof_mask:
+	mutex_unlock(&hw->blk[blk].masks.lock);
+
+	return status;
+}
+
+/**
+ * ice_free_prof_mask - free profile mask
+ * @hw: pointer to the HW struct
+ * @blk: hardware block
+ * @mask_idx: index of mask
+ */
+static enum ice_status
+ice_free_prof_mask(struct ice_hw *hw, enum ice_block blk, u16 mask_idx)
+{
+	if (blk != ICE_BLK_RSS && blk != ICE_BLK_FD)
+		return ICE_ERR_PARAM;
+
+	if (!(mask_idx >= hw->blk[blk].masks.first &&
+	      mask_idx < hw->blk[blk].masks.first + hw->blk[blk].masks.count))
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	mutex_lock(&hw->blk[blk].masks.lock);
+
+	if (!hw->blk[blk].masks.masks[mask_idx].in_use)
+		goto exit_ice_free_prof_mask;
+
+	if (hw->blk[blk].masks.masks[mask_idx].ref > 1) {
+		hw->blk[blk].masks.masks[mask_idx].ref--;
+		goto exit_ice_free_prof_mask;
+	}
+
+	/* remove mask */
+	hw->blk[blk].masks.masks[mask_idx].in_use = false;
+	hw->blk[blk].masks.masks[mask_idx].mask = 0;
+	hw->blk[blk].masks.masks[mask_idx].idx = 0;
+
+	/* update mask as unused entry */
+	ice_debug(hw, ICE_DBG_PKG, "Free mask, blk %d, mask %d\n", blk,
+		  mask_idx);
+	ice_write_prof_mask_reg(hw, blk, mask_idx, 0, 0);
+
+exit_ice_free_prof_mask:
+	mutex_unlock(&hw->blk[blk].masks.lock);
+
+	return 0;
+}
+
+/**
+ * ice_free_prof_masks - free all profile masks for a profile
+ * @hw: pointer to the HW struct
+ * @blk: hardware block
+ * @prof_id: profile ID
+ */
+static enum ice_status
+ice_free_prof_masks(struct ice_hw *hw, enum ice_block blk, u16 prof_id)
+{
+
u32 mask_bm; + u16 i; + + if (blk != ICE_BLK_RSS && blk != ICE_BLK_FD) + return ICE_ERR_PARAM; + + mask_bm = hw->blk[blk].es.mask_ena[prof_id]; + for (i = 0; i < BITS_PER_BYTE * sizeof(mask_bm); i++) + if (mask_bm & BIT(i)) + ice_free_prof_mask(hw, blk, i); + + return 0; +} + +/** + * ice_shutdown_prof_masks - releases lock for masking + * @hw: pointer to the HW struct + * @blk: hardware block + * + * This should be called before unloading the driver + */ +static void ice_shutdown_prof_masks(struct ice_hw *hw, enum ice_block blk) +{ + u16 i; + + mutex_lock(&hw->blk[blk].masks.lock); + + for (i = hw->blk[blk].masks.first; + i < hw->blk[blk].masks.first + hw->blk[blk].masks.count; i++) { + ice_write_prof_mask_reg(hw, blk, i, 0, 0); + + hw->blk[blk].masks.masks[i].in_use = false; + hw->blk[blk].masks.masks[i].idx = 0; + hw->blk[blk].masks.masks[i].mask = 0; + } + + mutex_unlock(&hw->blk[blk].masks.lock); + mutex_destroy(&hw->blk[blk].masks.lock); +} + +/** + * ice_shutdown_all_prof_masks - releases all locks for masking + * @hw: pointer to the HW struct + * + * This should be called before unloading the driver + */ +void ice_shutdown_all_prof_masks(struct ice_hw *hw) +{ + ice_shutdown_prof_masks(hw, ICE_BLK_RSS); + ice_shutdown_prof_masks(hw, ICE_BLK_FD); +} + +/** + * ice_update_prof_masking - set registers according to masking + * @hw: pointer to the HW struct + * @blk: hardware block + * @prof_id: profile ID + * @masks: masks + */ +static enum ice_status +ice_update_prof_masking(struct ice_hw *hw, enum ice_block blk, u16 prof_id, + u16 *masks) +{ + bool err = false; + u32 ena_mask = 0; + u16 idx; + u16 i; + + /* Only support FD and RSS masking, otherwise nothing to be done */ + if (blk != ICE_BLK_RSS && blk != ICE_BLK_FD) + return 0; + + for (i = 0; i < hw->blk[blk].es.fvw; i++) + if (masks[i] && masks[i] != 0xFFFF) { + if (!ice_alloc_prof_mask(hw, blk, i, masks[i], &idx)) { + ena_mask |= BIT(idx); + } else { + /* not enough bitmaps */ + err = true; + break; + } + } + + if (err) { + /* free any bitmaps we have allocated */ + for (i = 0; i < BITS_PER_BYTE * sizeof(ena_mask); i++) + if (ena_mask & BIT(i)) + ice_free_prof_mask(hw, blk, i); + + return ICE_ERR_OUT_OF_RANGE; + } + + /* enable the masks for this profile */ + ice_write_prof_mask_enable_res(hw, blk, prof_id, ena_mask); + + /* store enabled masks with profile so that they can be freed later */ + hw->blk[blk].es.mask_ena[prof_id] = ena_mask; + + return 0; +} + +/** + * ice_write_es - write an extraction sequence to hardware + * @hw: pointer to the HW struct + * @blk: the block in which to write the extraction sequence + * @prof_id: the profile ID to write + * @fv: pointer to the extraction sequence to write - NULL to clear extraction + */ +static void +ice_write_es(struct ice_hw *hw, enum ice_block blk, u8 prof_id, + struct ice_fv_word *fv) +{ + u16 off; + + off = prof_id * hw->blk[blk].es.fvw; + if (!fv) { + memset(&hw->blk[blk].es.t[off], 0, + hw->blk[blk].es.fvw * sizeof(*fv)); + hw->blk[blk].es.written[prof_id] = false; + } else { + memcpy(&hw->blk[blk].es.t[off], fv, + hw->blk[blk].es.fvw * sizeof(*fv)); + } +} + +/** + * ice_prof_dec_ref - decrement reference count for profile + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID for which to decrement the reference count + */ +static enum ice_status +ice_prof_dec_ref(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + if (prof_id > hw->blk[blk].es.count) + return ICE_ERR_PARAM; + + if 
(hw->blk[blk].es.ref_count[prof_id] > 0) { + if (!--hw->blk[blk].es.ref_count[prof_id]) { + ice_write_es(hw, blk, prof_id, NULL); + ice_free_prof_masks(hw, blk, prof_id); + return ice_free_prof_id(hw, blk, prof_id); + } + } + + return 0; +} + +/* Block / table section IDs */ +static const u32 ice_blk_sids[ICE_BLK_COUNT][ICE_SID_OFF_COUNT] = { + /* SWITCH */ + { ICE_SID_XLT1_SW, + ICE_SID_XLT2_SW, + ICE_SID_PROFID_TCAM_SW, + ICE_SID_PROFID_REDIR_SW, + ICE_SID_FLD_VEC_SW + }, + + /* ACL */ + { ICE_SID_XLT1_ACL, + ICE_SID_XLT2_ACL, + ICE_SID_PROFID_TCAM_ACL, + ICE_SID_PROFID_REDIR_ACL, + ICE_SID_FLD_VEC_ACL + }, + + /* FD */ + { ICE_SID_XLT1_FD, + ICE_SID_XLT2_FD, + ICE_SID_PROFID_TCAM_FD, + ICE_SID_PROFID_REDIR_FD, + ICE_SID_FLD_VEC_FD + }, + + /* RSS */ + { ICE_SID_XLT1_RSS, + ICE_SID_XLT2_RSS, + ICE_SID_PROFID_TCAM_RSS, + ICE_SID_PROFID_REDIR_RSS, + ICE_SID_FLD_VEC_RSS + }, + + /* PE */ + { ICE_SID_XLT1_PE, + ICE_SID_XLT2_PE, + ICE_SID_PROFID_TCAM_PE, + ICE_SID_PROFID_REDIR_PE, + ICE_SID_FLD_VEC_PE + } +}; + +/** + * ice_init_sw_xlt1_db - init software XLT1 database from HW tables + * @hw: pointer to the hardware structure + * @blk: the HW block to initialize + */ +static void ice_init_sw_xlt1_db(struct ice_hw *hw, enum ice_block blk) +{ + u16 pt; + + for (pt = 0; pt < hw->blk[blk].xlt1.count; pt++) { + u8 ptg; + + ptg = hw->blk[blk].xlt1.t[pt]; + if (ptg != ICE_DEFAULT_PTG) { + ice_ptg_alloc_val(hw, blk, ptg); + ice_ptg_add_mv_ptype(hw, blk, pt, ptg); + } + } +} + +/** + * ice_init_sw_xlt2_db - init software XLT2 database from HW tables + * @hw: pointer to the hardware structure + * @blk: the HW block to initialize + */ +static void ice_init_sw_xlt2_db(struct ice_hw *hw, enum ice_block blk) +{ + u16 vsi; + + for (vsi = 0; vsi < hw->blk[blk].xlt2.count; vsi++) { + u16 vsig; + + vsig = hw->blk[blk].xlt2.t[vsi]; + if (vsig) { + ice_vsig_alloc_val(hw, blk, vsig); + ice_vsig_add_mv_vsi(hw, blk, vsi, vsig); + /* no changes at this time, since this has been + * initialized from the original package + */ + hw->blk[blk].xlt2.vsis[vsi].changed = 0; + } + } +} + +/** + * ice_init_sw_db - init software database from HW tables + * @hw: pointer to the hardware structure + */ +static void ice_init_sw_db(struct ice_hw *hw) +{ + u16 i; + + for (i = 0; i < ICE_BLK_COUNT; i++) { + ice_init_sw_xlt1_db(hw, (enum ice_block)i); + ice_init_sw_xlt2_db(hw, (enum ice_block)i); + } +} + +/** + * ice_fill_tbl - Reads content of a single table type into database + * @hw: pointer to the hardware structure + * @block_id: Block ID of the table to copy + * @sid: Section ID of the table to copy + * + * Will attempt to read the entire content of a given table of a single block + * into the driver database. We assume that the buffer will always + * be as large or larger than the data contained in the package. If + * this condition is not met, there is most likely an error in the package + * contents. + */ +static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid) +{ + u32 dst_len, sect_len, offset = 0; + struct ice_prof_redir_section *pr; + struct ice_prof_id_section *pid; + struct ice_xlt1_section *xlt1; + struct ice_xlt2_section *xlt2; + struct ice_sw_fv_section *es; + struct ice_pkg_enum state; + u8 *src, *dst; + void *sect; + + /* if the HW segment pointer is null then the first iteration of + * ice_pkg_enum_section() will fail. In this case the HW tables will + * not be filled and return success. 
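+	 * The segment pointer is typically NULL when no DDP package has been
+	 * loaded, for example when the driver has fallen back to safe mode,
+	 * in which case there is simply nothing to copy from.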
+ */ + if (!hw->seg) { + ice_debug(hw, ICE_DBG_PKG, "hw->seg is NULL, tables are not filled\n"); + return; + } + + memset(&state, 0, sizeof(state)); + + sect = ice_pkg_enum_section(hw->seg, &state, sid); while (sect) { switch (sid) { @@ -1262,7 +4132,7 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid) case ICE_SID_XLT1_RSS: case ICE_SID_XLT1_ACL: case ICE_SID_XLT1_PE: - xlt1 = (struct ice_xlt1_section *)sect; + xlt1 = sect; src = xlt1->value; sect_len = le16_to_cpu(xlt1->count) * sizeof(*hw->blk[block_id].xlt1.t); @@ -1270,282 +4140,2522 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid) dst_len = hw->blk[block_id].xlt1.count * sizeof(*hw->blk[block_id].xlt1.t); break; - case ICE_SID_XLT2_SW: - case ICE_SID_XLT2_FD: - case ICE_SID_XLT2_RSS: - case ICE_SID_XLT2_ACL: - case ICE_SID_XLT2_PE: - xlt2 = (struct ice_xlt2_section *)sect; - src = (__force u8 *)xlt2->value; - sect_len = le16_to_cpu(xlt2->count) * - sizeof(*hw->blk[block_id].xlt2.t); - dst = (u8 *)hw->blk[block_id].xlt2.t; - dst_len = hw->blk[block_id].xlt2.count * - sizeof(*hw->blk[block_id].xlt2.t); + case ICE_SID_XLT2_SW: + case ICE_SID_XLT2_FD: + case ICE_SID_XLT2_RSS: + case ICE_SID_XLT2_ACL: + case ICE_SID_XLT2_PE: + xlt2 = sect; + src = (__force u8 *)xlt2->value; + sect_len = le16_to_cpu(xlt2->count) * + sizeof(*hw->blk[block_id].xlt2.t); + dst = (u8 *)hw->blk[block_id].xlt2.t; + dst_len = hw->blk[block_id].xlt2.count * + sizeof(*hw->blk[block_id].xlt2.t); + break; + case ICE_SID_PROFID_TCAM_SW: + case ICE_SID_PROFID_TCAM_FD: + case ICE_SID_PROFID_TCAM_RSS: + case ICE_SID_PROFID_TCAM_ACL: + case ICE_SID_PROFID_TCAM_PE: + pid = sect; + src = (u8 *)pid->entry; + sect_len = le16_to_cpu(pid->count) * + sizeof(*hw->blk[block_id].prof.t); + dst = (u8 *)hw->blk[block_id].prof.t; + dst_len = hw->blk[block_id].prof.count * + sizeof(*hw->blk[block_id].prof.t); + break; + case ICE_SID_PROFID_REDIR_SW: + case ICE_SID_PROFID_REDIR_FD: + case ICE_SID_PROFID_REDIR_RSS: + case ICE_SID_PROFID_REDIR_ACL: + case ICE_SID_PROFID_REDIR_PE: + pr = sect; + src = pr->redir_value; + sect_len = le16_to_cpu(pr->count) * + sizeof(*hw->blk[block_id].prof_redir.t); + dst = hw->blk[block_id].prof_redir.t; + dst_len = hw->blk[block_id].prof_redir.count * + sizeof(*hw->blk[block_id].prof_redir.t); + break; + case ICE_SID_FLD_VEC_SW: + case ICE_SID_FLD_VEC_FD: + case ICE_SID_FLD_VEC_RSS: + case ICE_SID_FLD_VEC_ACL: + case ICE_SID_FLD_VEC_PE: + es = sect; + src = (u8 *)es->fv; + sect_len = (u32)(le16_to_cpu(es->count) * + hw->blk[block_id].es.fvw) * + sizeof(*hw->blk[block_id].es.t); + dst = (u8 *)hw->blk[block_id].es.t; + dst_len = (u32)(hw->blk[block_id].es.count * + hw->blk[block_id].es.fvw) * + sizeof(*hw->blk[block_id].es.t); + break; + default: + return; + } + + /* if the section offset exceeds destination length, terminate + * table fill. + */ + if (offset > dst_len) + return; + + /* if the sum of section size and offset exceed destination size + * then we are out of bounds of the HW table size for that PF. + * Changing section length to fill the remaining table space + * of that PF. 
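+		 * As a worked example: with offset 100, sect_len 50 and
+		 * dst_len 120, only the remaining 20 bytes are copied.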
+ */ + if ((offset + sect_len) > dst_len) + sect_len = dst_len - offset; + + memcpy(dst + offset, src, sect_len); + offset += sect_len; + sect = ice_pkg_enum_section(NULL, &state, sid); + } +} + +/** + * ice_fill_blk_tbls - Read package context for tables + * @hw: pointer to the hardware structure + * + * Reads the current package contents and populates the driver + * database with the data iteratively for all advanced feature + * blocks. Assume that the HW tables have been allocated. + */ +void ice_fill_blk_tbls(struct ice_hw *hw) +{ + u8 i; + + for (i = 0; i < ICE_BLK_COUNT; i++) { + enum ice_block blk_id = (enum ice_block)i; + + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].xlt1.sid); + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].xlt2.sid); + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].prof.sid); + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].prof_redir.sid); + ice_fill_tbl(hw, blk_id, hw->blk[blk_id].es.sid); + } + + ice_init_sw_db(hw); +} + +/** + * ice_free_prof_map - free profile map + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_free_prof_map(struct ice_hw *hw, u8 blk_idx) +{ + struct ice_es *es = &hw->blk[blk_idx].es; + struct ice_prof_map *del, *tmp; + + mutex_lock(&es->prof_map_lock); + list_for_each_entry_safe(del, tmp, &es->prof_map, list) { + list_del(&del->list); + devm_kfree(ice_hw_to_dev(hw), del); + } + INIT_LIST_HEAD(&es->prof_map); + mutex_unlock(&es->prof_map_lock); +} + +/** + * ice_free_flow_profs - free flow profile entries + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_free_flow_profs(struct ice_hw *hw, u8 blk_idx) +{ + struct ice_flow_prof *p, *tmp; + + mutex_lock(&hw->fl_profs_locks[blk_idx]); + list_for_each_entry_safe(p, tmp, &hw->fl_profs[blk_idx], l_entry) { + struct ice_flow_entry *e, *t; + + list_for_each_entry_safe(e, t, &p->entries, l_entry) + ice_flow_rem_entry(hw, (enum ice_block)blk_idx, + ICE_FLOW_ENTRY_HNDL(e)); + + list_del(&p->l_entry); + if (p->acts) + devm_kfree(ice_hw_to_dev(hw), p->acts); + + mutex_destroy(&p->entries_lock); + devm_kfree(ice_hw_to_dev(hw), p); + } + mutex_unlock(&hw->fl_profs_locks[blk_idx]); + + /* if driver is in reset and tables are being cleared + * re-initialize the flow profile list heads + */ + INIT_LIST_HEAD(&hw->fl_profs[blk_idx]); +} + +/** + * ice_free_vsig_tbl - free complete VSIG table entries + * @hw: pointer to the hardware structure + * @blk: the HW block on which to free the VSIG table entries + */ +static void ice_free_vsig_tbl(struct ice_hw *hw, enum ice_block blk) +{ + u16 i; + + if (!hw->blk[blk].xlt2.vsig_tbl) + return; + + for (i = 1; i < ICE_MAX_VSIGS; i++) + if (hw->blk[blk].xlt2.vsig_tbl[i].in_use) + ice_vsig_free(hw, blk, i); +} + +/** + * ice_free_hw_tbls - free hardware table memory + * @hw: pointer to the hardware structure + */ +void ice_free_hw_tbls(struct ice_hw *hw) +{ + struct ice_rss_cfg *r, *rt; + u8 i; + + for (i = 0; i < ICE_BLK_COUNT; i++) { + if (hw->blk[i].is_list_init) { + struct ice_es *es = &hw->blk[i].es; + + ice_free_prof_map(hw, i); + mutex_destroy(&es->prof_map_lock); + + ice_free_flow_profs(hw, i); + mutex_destroy(&hw->fl_profs_locks[i]); + + hw->blk[i].is_list_init = false; + } + ice_free_vsig_tbl(hw, (enum ice_block)i); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptypes); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptg_tbl); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.vsig_tbl); + 
devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.vsis); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].prof.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].prof_redir.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.t); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.ref_count); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.written); + devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.mask_ena); + } + + list_for_each_entry_safe(r, rt, &hw->rss_list_head, l_entry) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + mutex_destroy(&hw->rss_locks); + ice_shutdown_all_prof_masks(hw); + memset(hw->blk, 0, sizeof(hw->blk)); +} + +/** + * ice_init_flow_profs - init flow profile locks and list heads + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_init_flow_profs(struct ice_hw *hw, u8 blk_idx) +{ + mutex_init(&hw->fl_profs_locks[blk_idx]); + INIT_LIST_HEAD(&hw->fl_profs[blk_idx]); +} + +/** + * ice_clear_hw_tbls - clear HW tables and flow profiles + * @hw: pointer to the hardware structure + */ +void ice_clear_hw_tbls(struct ice_hw *hw) +{ + u8 i; + + for (i = 0; i < ICE_BLK_COUNT; i++) { + struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; + struct ice_prof_tcam *prof = &hw->blk[i].prof; + struct ice_xlt1 *xlt1 = &hw->blk[i].xlt1; + struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; + struct ice_es *es = &hw->blk[i].es; + + if (hw->blk[i].is_list_init) { + ice_free_prof_map(hw, i); + ice_free_flow_profs(hw, i); + } + + ice_free_vsig_tbl(hw, (enum ice_block)i); + + memset(xlt1->ptypes, 0, xlt1->count * sizeof(*xlt1->ptypes)); + memset(xlt1->ptg_tbl, 0, + ICE_MAX_PTGS * sizeof(*xlt1->ptg_tbl)); + memset(xlt1->t, 0, xlt1->count * sizeof(*xlt1->t)); + + memset(xlt2->vsis, 0, xlt2->count * sizeof(*xlt2->vsis)); + memset(xlt2->vsig_tbl, 0, + xlt2->count * sizeof(*xlt2->vsig_tbl)); + memset(xlt2->t, 0, xlt2->count * sizeof(*xlt2->t)); + + memset(prof->t, 0, prof->count * sizeof(*prof->t)); + memset(prof_redir->t, 0, + prof_redir->count * sizeof(*prof_redir->t)); + + memset(es->t, 0, es->count * sizeof(*es->t) * es->fvw); + memset(es->ref_count, 0, es->count * sizeof(*es->ref_count)); + memset(es->written, 0, es->count * sizeof(*es->written)); + memset(es->mask_ena, 0, es->count * sizeof(*es->mask_ena)); + } +} + +/** + * ice_init_hw_tbls - init hardware table memory + * @hw: pointer to the hardware structure + */ +enum ice_status ice_init_hw_tbls(struct ice_hw *hw) +{ + u8 i; + + mutex_init(&hw->rss_locks); + INIT_LIST_HEAD(&hw->rss_list_head); + ice_init_all_prof_masks(hw); + for (i = 0; i < ICE_BLK_COUNT; i++) { + struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; + struct ice_prof_tcam *prof = &hw->blk[i].prof; + struct ice_xlt1 *xlt1 = &hw->blk[i].xlt1; + struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; + struct ice_es *es = &hw->blk[i].es; + u16 j; + + if (hw->blk[i].is_list_init) + continue; + + ice_init_flow_profs(hw, i); + mutex_init(&es->prof_map_lock); + INIT_LIST_HEAD(&es->prof_map); + hw->blk[i].is_list_init = true; + + hw->blk[i].overwrite = blk_sizes[i].overwrite; + es->reverse = blk_sizes[i].reverse; + + xlt1->sid = ice_blk_sids[i][ICE_SID_XLT1_OFF]; + xlt1->count = blk_sizes[i].xlt1; + + xlt1->ptypes = devm_kcalloc(ice_hw_to_dev(hw), xlt1->count, + sizeof(*xlt1->ptypes), GFP_KERNEL); + + if (!xlt1->ptypes) + goto err; + + xlt1->ptg_tbl = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_PTGS, + sizeof(*xlt1->ptg_tbl), + GFP_KERNEL); + + if (!xlt1->ptg_tbl) + goto err; + + xlt1->t = devm_kcalloc(ice_hw_to_dev(hw), xlt1->count, + sizeof(*xlt1->t), 
GFP_KERNEL); + if (!xlt1->t) + goto err; + + xlt2->sid = ice_blk_sids[i][ICE_SID_XLT2_OFF]; + xlt2->count = blk_sizes[i].xlt2; + + xlt2->vsis = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, + sizeof(*xlt2->vsis), GFP_KERNEL); + + if (!xlt2->vsis) + goto err; + + xlt2->vsig_tbl = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, + sizeof(*xlt2->vsig_tbl), + GFP_KERNEL); + if (!xlt2->vsig_tbl) + goto err; + + for (j = 0; j < xlt2->count; j++) + INIT_LIST_HEAD(&xlt2->vsig_tbl[j].prop_lst); + + xlt2->t = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, + sizeof(*xlt2->t), GFP_KERNEL); + if (!xlt2->t) + goto err; + + prof->sid = ice_blk_sids[i][ICE_SID_PR_OFF]; + prof->count = blk_sizes[i].prof_tcam; + prof->max_prof_id = blk_sizes[i].prof_id; + prof->cdid_bits = blk_sizes[i].prof_cdid_bits; + prof->t = devm_kcalloc(ice_hw_to_dev(hw), prof->count, + sizeof(*prof->t), GFP_KERNEL); + + if (!prof->t) + goto err; + + prof_redir->sid = ice_blk_sids[i][ICE_SID_PR_REDIR_OFF]; + prof_redir->count = blk_sizes[i].prof_redir; + prof_redir->t = devm_kcalloc(ice_hw_to_dev(hw), + prof_redir->count, + sizeof(*prof_redir->t), + GFP_KERNEL); + + if (!prof_redir->t) + goto err; + + es->sid = ice_blk_sids[i][ICE_SID_ES_OFF]; + es->count = blk_sizes[i].es; + es->fvw = blk_sizes[i].fvw; + es->t = devm_kcalloc(ice_hw_to_dev(hw), + (u32)(es->count * es->fvw), + sizeof(*es->t), GFP_KERNEL); + if (!es->t) + goto err; + + es->ref_count = devm_kcalloc(ice_hw_to_dev(hw), es->count, + sizeof(*es->ref_count), + GFP_KERNEL); + + if (!es->ref_count) + goto err; + + es->written = devm_kcalloc(ice_hw_to_dev(hw), es->count, + sizeof(*es->written), GFP_KERNEL); + + if (!es->written) + goto err; + + es->mask_ena = devm_kcalloc(ice_hw_to_dev(hw), es->count, + sizeof(*es->mask_ena), GFP_KERNEL); + + if (!es->mask_ena) + goto err; + } + return 0; + +err: + ice_free_hw_tbls(hw); + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_prof_gen_key - generate profile ID key + * @hw: pointer to the HW struct + * @blk: the block in which to write profile ID to + * @ptg: packet type group (PTG) portion of key + * @vsig: VSIG portion of key + * @cdid: CDID portion of key + * @flags: flag portion of key + * @vl_msk: valid mask + * @dc_msk: don't care mask + * @nm_msk: never match mask + * @key: output of profile ID key + */ +static enum ice_status +ice_prof_gen_key(struct ice_hw *hw, enum ice_block blk, u8 ptg, u16 vsig, + u8 cdid, u16 flags, u8 vl_msk[ICE_TCAM_KEY_VAL_SZ], + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ], u8 nm_msk[ICE_TCAM_KEY_VAL_SZ], + u8 key[ICE_TCAM_KEY_SZ]) +{ + struct ice_prof_id_key inkey; + + inkey.xlt1 = ptg; + inkey.xlt2_cdid = cpu_to_le16(vsig); + inkey.flags = cpu_to_le16(flags); + + switch (hw->blk[blk].prof.cdid_bits) { + case 0: + break; + case 2: +#define ICE_CD_2_M 0xC000U +#define ICE_CD_2_S 14 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_2_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_2_S); + break; + case 4: +#define ICE_CD_4_M 0xF000U +#define ICE_CD_4_S 12 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_4_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_4_S); + break; + case 8: +#define ICE_CD_8_M 0xFF00U +#define ICE_CD_8_S 16 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_8_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_8_S); + break; + default: + ice_debug(hw, ICE_DBG_PKG, "Error in profile config\n"); + break; + } + + return ice_set_key(key, ICE_TCAM_KEY_SZ, (u8 *)&inkey, vl_msk, dc_msk, + nm_msk, 0, ICE_TCAM_KEY_SZ / 2); +} + +/** + * ice_tcam_write_entry - write TCAM entry + * @hw: pointer to the HW struct + * @blk: 
the block in which to write profile ID to + * @idx: the entry index to write to + * @prof_id: profile ID + * @ptg: packet type group (PTG) portion of key + * @vsig: VSIG portion of key + * @cdid: CDID portion of key + * @flags: flag portion of key + * @vl_msk: valid mask + * @dc_msk: don't care mask + * @nm_msk: never match mask + */ +static enum ice_status +ice_tcam_write_entry(struct ice_hw *hw, enum ice_block blk, u16 idx, + u8 prof_id, u8 ptg, u16 vsig, u8 cdid, u16 flags, + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ], + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ], + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ]) +{ + struct ice_prof_tcam_entry; + enum ice_status status; + + status = ice_prof_gen_key(hw, blk, ptg, vsig, cdid, flags, vl_msk, + dc_msk, nm_msk, hw->blk[blk].prof.t[idx].key); + if (!status) { + hw->blk[blk].prof.t[idx].addr = cpu_to_le16(idx); + hw->blk[blk].prof.t[idx].prof_id = prof_id; + } + + return status; +} + +/** + * ice_vsig_get_ref - returns number of VSIs belong to a VSIG + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: VSIG to query + * @refs: pointer to variable to receive the reference count + */ +static enum ice_status +ice_vsig_get_ref(struct ice_hw *hw, enum ice_block blk, u16 vsig, u16 *refs) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_vsi *ptr; + + *refs = 0; + + if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) + return ICE_ERR_DOES_NOT_EXIST; + + ptr = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + while (ptr) { + (*refs)++; + ptr = ptr->next_vsi; + } + + return 0; +} + +/** + * ice_has_prof_vsig - check to see if VSIG has a specific profile + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: VSIG to check against + * @hdl: profile handle + */ +static bool +ice_has_prof_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_prof *ent; + + list_for_each_entry(ent, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + if (ent->profile_cookie == hdl) + return true; + + ice_debug(hw, ICE_DBG_INIT, "Characteristic list for VSI group %d not found.\n", + vsig); + return false; +} + +/** + * ice_prof_bld_es - build profile ID extraction sequence changes + * @hw: pointer to the HW struct + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_es(struct ice_hw *hw, enum ice_block blk, + struct ice_buf_build *bld, struct list_head *chgs) +{ + u16 vec_size = hw->blk[blk].es.fvw * sizeof(struct ice_fv_word); + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == ICE_PTG_ES_ADD && tmp->add_prof) { + u16 off = tmp->prof_id * hw->blk[blk].es.fvw; + struct ice_pkg_es *p; + u32 id; + + id = ice_sect_id(blk, ICE_VEC_TBL); + p = ice_pkg_buf_alloc_section(bld, id, + struct_size(p, es, 1) + vec_size - sizeof(p->es[0])); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->prof_id); + + memcpy(p->es, &hw->blk[blk].es.t[off], vec_size); + } + + return 0; +} + +/** + * ice_prof_bld_tcam - build profile ID TCAM changes + * @hw: pointer to the HW struct + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_tcam(struct ice_hw *hw, enum ice_block blk, + struct ice_buf_build *bld, struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == 
ICE_TCAM_ADD && tmp->add_tcam_idx) { + struct ice_prof_id_section *p; + u32 id; + + id = ice_sect_id(blk, ICE_PROF_TCAM); + p = ice_pkg_buf_alloc_section(bld, id, + struct_size(p, entry, 1)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->entry[0].addr = cpu_to_le16(tmp->tcam_idx); + p->entry[0].prof_id = tmp->prof_id; + + memcpy(p->entry[0].key, + &hw->blk[blk].prof.t[tmp->tcam_idx].key, + sizeof(hw->blk[blk].prof.t->key)); + } + + return 0; +} + +/** + * ice_prof_bld_xlt1 - build XLT1 changes + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_xlt1(enum ice_block blk, struct ice_buf_build *bld, + struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == ICE_PTG_ES_ADD && tmp->add_ptg) { + struct ice_xlt1_section *p; + u32 id; + + id = ice_sect_id(blk, ICE_XLT1); + p = ice_pkg_buf_alloc_section(bld, id, + struct_size(p, value, 1)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->ptype); + p->value[0] = tmp->ptg; + } + + return 0; +} + +/** + * ice_prof_bld_xlt2 - build XLT2 changes + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_xlt2(enum ice_block blk, struct ice_buf_build *bld, + struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) { + struct ice_xlt2_section *p; + u32 id; + + switch (tmp->type) { + case ICE_VSIG_ADD: + case ICE_VSI_MOVE: + case ICE_VSIG_REM: + id = ice_sect_id(blk, ICE_XLT2); + p = ice_pkg_buf_alloc_section(bld, id, + struct_size(p, value, 1)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->vsi); + p->value[0] = cpu_to_le16(tmp->vsig); + break; + default: break; - case ICE_SID_PROFID_TCAM_SW: - case ICE_SID_PROFID_TCAM_FD: - case ICE_SID_PROFID_TCAM_RSS: - case ICE_SID_PROFID_TCAM_ACL: - case ICE_SID_PROFID_TCAM_PE: - pid = (struct ice_prof_id_section *)sect; - src = (u8 *)pid->entry; - sect_len = le16_to_cpu(pid->count) * - sizeof(*hw->blk[block_id].prof.t); - dst = (u8 *)hw->blk[block_id].prof.t; - dst_len = hw->blk[block_id].prof.count * - sizeof(*hw->blk[block_id].prof.t); + } + } + + return 0; +} + +/** + * ice_upd_prof_hw - update hardware using the change list + * @hw: pointer to the HW struct + * @blk: hardware block + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_upd_prof_hw(struct ice_hw *hw, enum ice_block blk, + struct list_head *chgs) +{ + struct ice_buf_build *b; + struct ice_chs_chg *tmp; + enum ice_status status; + u16 pkg_sects; + u16 xlt1 = 0; + u16 xlt2 = 0; + u16 tcam = 0; + u16 es = 0; + u16 sects; + + /* count number of sections we need */ + list_for_each_entry(tmp, chgs, list_entry) { + switch (tmp->type) { + case ICE_PTG_ES_ADD: + if (tmp->add_ptg) + xlt1++; + if (tmp->add_prof) + es++; break; - case ICE_SID_PROFID_REDIR_SW: - case ICE_SID_PROFID_REDIR_FD: - case ICE_SID_PROFID_REDIR_RSS: - case ICE_SID_PROFID_REDIR_ACL: - case ICE_SID_PROFID_REDIR_PE: - pr = (struct ice_prof_redir_section *)sect; - src = pr->redir_value; - sect_len = le16_to_cpu(pr->count) * - sizeof(*hw->blk[block_id].prof_redir.t); - dst = hw->blk[block_id].prof_redir.t; - dst_len = hw->blk[block_id].prof_redir.count * - 
sizeof(*hw->blk[block_id].prof_redir.t); + case ICE_TCAM_ADD: + tcam++; break; - case ICE_SID_FLD_VEC_SW: - case ICE_SID_FLD_VEC_FD: - case ICE_SID_FLD_VEC_RSS: - case ICE_SID_FLD_VEC_ACL: - case ICE_SID_FLD_VEC_PE: - es = (struct ice_sw_fv_section *)sect; - src = (u8 *)es->fv; - sect_len = (u32)(le16_to_cpu(es->count) * - hw->blk[block_id].es.fvw) * - sizeof(*hw->blk[block_id].es.t); - dst = (u8 *)hw->blk[block_id].es.t; - dst_len = (u32)(hw->blk[block_id].es.count * - hw->blk[block_id].es.fvw) * - sizeof(*hw->blk[block_id].es.t); + case ICE_VSIG_ADD: + case ICE_VSI_MOVE: + case ICE_VSIG_REM: + xlt2++; break; default: - return; + break; + } + } + sects = xlt1 + xlt2 + tcam + es; + + if (!sects) + return 0; + + /* Build update package buffer */ + b = ice_pkg_buf_alloc(hw); + if (!b) + return ICE_ERR_NO_MEMORY; + + status = ice_pkg_buf_reserve_section(b, sects); + if (status) + goto error_tmp; + + /* Preserve order of table update: ES, TCAM, PTG, VSIG */ + if (es) { + status = ice_prof_bld_es(hw, blk, b, chgs); + if (status) + goto error_tmp; + } + + if (tcam) { + status = ice_prof_bld_tcam(hw, blk, b, chgs); + if (status) + goto error_tmp; + } + + if (xlt1) { + status = ice_prof_bld_xlt1(blk, b, chgs); + if (status) + goto error_tmp; + } + + if (xlt2) { + status = ice_prof_bld_xlt2(blk, b, chgs); + if (status) + goto error_tmp; + } + + /* After package buffer build check if the section count in buffer is + * non-zero and matches the number of sections detected for package + * update. + */ + pkg_sects = ice_pkg_buf_get_active_sections(b); + if (!pkg_sects || pkg_sects != sects) { + status = ICE_ERR_INVAL_SIZE; + goto error_tmp; + } + + /* update package */ + status = ice_update_pkg(hw, ice_pkg_buf(b), 1); + if (status == ICE_ERR_AQ_ERROR) + ice_debug(hw, ICE_DBG_INIT, "Unable to update HW profile\n"); + +error_tmp: + ice_pkg_buf_free(hw, b); + return status; +} + +/** + * ice_update_fd_mask - set Flow Director Field Vector mask for a profile + * @hw: pointer to the HW struct + * @prof_id: profile ID + * @mask_sel: mask select + * + * This function enable any of the masks selected by the mask select parameter + * for the profile specified. 
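+ * The enable mask is written verbatim to GLQF_FDMASK_SEL for the given
+ * profile; each set bit enables the correspondingly indexed FD mask
+ * register.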
+ */ +static void ice_update_fd_mask(struct ice_hw *hw, u16 prof_id, u32 mask_sel) +{ + wr32(hw, GLQF_FDMASK_SEL(prof_id), mask_sel); + + ice_debug(hw, ICE_DBG_INIT, "fd mask(%d): %x = %x\n", prof_id, + GLQF_FDMASK_SEL(prof_id), mask_sel); +} + +struct ice_fd_src_dst_pair { + u8 prot_id; + u8 count; + u16 off; +}; + +static const struct ice_fd_src_dst_pair ice_fd_pairs[] = { + /* These are defined in pairs */ + { ICE_PROT_IPV4_OF_OR_S, 2, 12 }, + { ICE_PROT_IPV4_OF_OR_S, 2, 16 }, + + { ICE_PROT_IPV4_IL, 2, 12 }, + { ICE_PROT_IPV4_IL, 2, 16 }, + + { ICE_PROT_IPV6_OF_OR_S, 8, 8 }, + { ICE_PROT_IPV6_OF_OR_S, 8, 24 }, + + { ICE_PROT_IPV6_IL, 8, 8 }, + { ICE_PROT_IPV6_IL, 8, 24 }, + + { ICE_PROT_TCP_IL, 1, 0 }, + { ICE_PROT_TCP_IL, 1, 2 }, + + { ICE_PROT_UDP_OF, 1, 0 }, + { ICE_PROT_UDP_OF, 1, 2 }, + + { ICE_PROT_UDP_IL_OR_S, 1, 0 }, + { ICE_PROT_UDP_IL_OR_S, 1, 2 }, + + { ICE_PROT_SCTP_IL, 1, 0 }, + { ICE_PROT_SCTP_IL, 1, 2 } +}; + +#define ICE_FD_SRC_DST_PAIR_COUNT ARRAY_SIZE(ice_fd_pairs) + +/** + * ice_update_fd_swap - set register appropriately for a FD FV extraction + * @hw: pointer to the HW struct + * @prof_id: profile ID + * @es: extraction sequence (length of array is determined by the block) + */ +static enum ice_status +ice_update_fd_swap(struct ice_hw *hw, u16 prof_id, struct ice_fv_word *es) +{ + DECLARE_BITMAP(pair_list, ICE_FD_SRC_DST_PAIR_COUNT); + u8 pair_start[ICE_FD_SRC_DST_PAIR_COUNT] = { 0 }; +#define ICE_FD_FV_NOT_FOUND (-2) + s8 first_free = ICE_FD_FV_NOT_FOUND; + u8 used[ICE_MAX_FV_WORDS] = { 0 }; + s8 orig_free, si; + u32 mask_sel = 0; + u8 i, j, k; + + bitmap_zero(pair_list, ICE_FD_SRC_DST_PAIR_COUNT); + + /* This code assumes that the Flow Director field vectors are assigned + * from the end of the FV indexes working towards the zero index, that + * only complete fields will be included and will be consecutive, and + * that there are no gaps between valid indexes. + */ + + /* Determine swap fields present */ + for (i = 0; i < hw->blk[ICE_BLK_FD].es.fvw; i++) { + /* Find the first free entry, assuming right to left population. + * This is where we can start adding additional pairs if needed. 
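+		 * As an illustration, if FV words fvw-1 down through 3 are
+		 * populated, then index 2 is the first free slot and any
+		 * added pair entries grow downward from there.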
+ */ + if (first_free == ICE_FD_FV_NOT_FOUND && es[i].prot_id != + ICE_PROT_INVALID) + first_free = i - 1; + + for (j = 0; j < ICE_FD_SRC_DST_PAIR_COUNT; j++) + if (es[i].prot_id == ice_fd_pairs[j].prot_id && + es[i].off == ice_fd_pairs[j].off) { + set_bit(j, pair_list); + pair_start[j] = i; + } + } + + orig_free = first_free; + + /* determine missing swap fields that need to be added */ + for (i = 0; i < ICE_FD_SRC_DST_PAIR_COUNT; i += 2) { + u8 bit1 = test_bit(i + 1, pair_list); + u8 bit0 = test_bit(i, pair_list); + + if (bit0 ^ bit1) { + u8 index; + + /* add the appropriate 'paired' entry */ + if (!bit0) + index = i; + else + index = i + 1; + + /* check for room */ + if (first_free + 1 < (s8)ice_fd_pairs[index].count) + return ICE_ERR_MAX_LIMIT; + + /* place in extraction sequence */ + for (k = 0; k < ice_fd_pairs[index].count; k++) { + es[first_free - k].prot_id = + ice_fd_pairs[index].prot_id; + es[first_free - k].off = + ice_fd_pairs[index].off + (k * 2); + + if (k > first_free) + return ICE_ERR_OUT_OF_RANGE; + + /* keep track of non-relevant fields */ + mask_sel |= BIT(first_free - k); + } + + pair_start[index] = first_free; + first_free -= ice_fd_pairs[index].count; + } + } + + /* fill in the swap array */ + si = hw->blk[ICE_BLK_FD].es.fvw - 1; + while (si >= 0) { + u8 indexes_used = 1; + + /* assume flat at this index */ +#define ICE_SWAP_VALID 0x80 + used[si] = si | ICE_SWAP_VALID; + + if (orig_free == ICE_FD_FV_NOT_FOUND || si <= orig_free) { + si -= indexes_used; + continue; + } + + /* check for a swap location */ + for (j = 0; j < ICE_FD_SRC_DST_PAIR_COUNT; j++) + if (es[si].prot_id == ice_fd_pairs[j].prot_id && + es[si].off == ice_fd_pairs[j].off) { + u8 idx; + + /* determine the appropriate matching field */ + idx = j + ((j % 2) ? -1 : 1); + + indexes_used = ice_fd_pairs[idx].count; + for (k = 0; k < indexes_used; k++) { + used[si - k] = (pair_start[idx] - k) | + ICE_SWAP_VALID; + } + + break; + } + + si -= indexes_used; + } + + /* for each set of 4 swap and 4 inset indexes, write the appropriate + * register + */ + for (j = 0; j < hw->blk[ICE_BLK_FD].es.fvw / 4; j++) { + u32 raw_swap = 0; + u32 raw_in = 0; + + for (k = 0; k < 4; k++) { + u8 idx; + + idx = (j * 4) + k; + if (used[idx] && !(mask_sel & BIT(idx))) { + raw_swap |= used[idx] << (k * BITS_PER_BYTE); +#define ICE_INSET_DFLT 0x9f + raw_in |= ICE_INSET_DFLT << (k * BITS_PER_BYTE); + } + } + + /* write the appropriate swap register set */ + wr32(hw, GLQF_FDSWAP(prof_id, j), raw_swap); + + ice_debug(hw, ICE_DBG_INIT, "swap wr(%d, %d): %x = %08x\n", + prof_id, j, GLQF_FDSWAP(prof_id, j), raw_swap); + + /* write the appropriate inset register set */ + wr32(hw, GLQF_FDINSET(prof_id, j), raw_in); + + ice_debug(hw, ICE_DBG_INIT, "inset wr(%d, %d): %x = %08x\n", + prof_id, j, GLQF_FDINSET(prof_id, j), raw_in); + } + + /* initially clear the mask select for this profile */ + ice_update_fd_mask(hw, prof_id, 0); + + return 0; +} + +/* The entries here needs to match the order of enum ice_ptype_attrib */ +static const struct ice_ptype_attrib_info ice_ptype_attributes[] = { + { ICE_GTP_PDU_EH, ICE_GTP_PDU_FLAG_MASK }, + { ICE_GTP_SESSION, ICE_GTP_FLAGS_MASK }, + { ICE_GTP_DOWNLINK, ICE_GTP_FLAGS_MASK }, + { ICE_GTP_UPLINK, ICE_GTP_FLAGS_MASK }, +}; + +/** + * ice_get_ptype_attrib_info - get ptype attribute information + * @type: attribute type + * @info: pointer to variable to the attribute information + */ +static void +ice_get_ptype_attrib_info(enum ice_ptype_attrib_type type, + struct ice_ptype_attrib_info *info) +{ + *info = 
ice_ptype_attributes[type];
+}
+
+/**
+ * ice_add_prof_attrib - add any PTG with attributes to profile
+ * @prof: pointer to the profile to which PTG entries will be added
+ * @ptg: PTG to be added
+ * @ptype: PTYPE that needs to be looked up
+ * @attr: array of attributes that will be considered
+ * @attr_cnt: number of elements in the attribute array
+ */
+static enum ice_status
+ice_add_prof_attrib(struct ice_prof_map *prof, u8 ptg, u16 ptype,
+		    const struct ice_ptype_attributes *attr, u16 attr_cnt)
+{
+	bool found = false;
+	u16 i;
+
+	for (i = 0; i < attr_cnt; i++) {
+		if (attr[i].ptype == ptype) {
+			found = true;
+
+			prof->ptg[prof->ptg_cnt] = ptg;
+			ice_get_ptype_attrib_info(attr[i].attrib,
+						  &prof->attr[prof->ptg_cnt]);
+
+			if (++prof->ptg_cnt >= ICE_MAX_PTG_PER_PROFILE)
+				return ICE_ERR_MAX_LIMIT;
+		}
+	}
+
+	if (!found)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	return 0;
+}
+
+/**
+ * ice_add_prof - add profile
+ * @hw: pointer to the HW struct
+ * @blk: hardware block
+ * @id: profile tracking ID
+ * @ptypes: array of bitmaps indicating ptypes (ICE_FLOW_PTYPE_MAX bits)
+ * @attr: array of attributes
+ * @attr_cnt: number of elements in the attribute array
+ * @es: extraction sequence (length of array is determined by the block)
+ * @masks: mask for extraction sequence
+ *
+ * This function registers a profile, which matches a set of PTYPES with a
+ * particular extraction sequence. While the hardware profile is allocated,
+ * it will not be written until the first call to ice_add_flow that specifies
+ * the ID value used here.
+ */
+enum ice_status
+ice_add_prof(struct ice_hw *hw, enum ice_block blk, u64 id, u8 ptypes[],
+	     const struct ice_ptype_attributes *attr, u16 attr_cnt,
+	     struct ice_fv_word *es, u16 *masks)
+{
+	u32 bytes = DIV_ROUND_UP(ICE_FLOW_PTYPE_MAX, BITS_PER_BYTE);
+	DECLARE_BITMAP(ptgs_used, ICE_XLT1_CNT);
+	struct ice_prof_map *prof;
+	enum ice_status status;
+	u8 byte = 0;
+	u8 prof_id;
+
+	bitmap_zero(ptgs_used, ICE_XLT1_CNT);
+
+	mutex_lock(&hw->blk[blk].es.prof_map_lock);
+
+	/* search for existing profile */
+	status = ice_find_prof_id_with_mask(hw, blk, es, masks, &prof_id);
+	if (status) {
+		/* allocate profile ID */
+		status = ice_alloc_prof_id(hw, blk, &prof_id);
+		if (status)
+			goto err_ice_add_prof;
+		if (blk == ICE_BLK_FD) {
+			/* For Flow Director block, the extraction sequence may
+			 * need to be altered in the case where there are paired
+			 * fields that have no match. This is necessary because
+			 * for Flow Director, src and dest fields need to be
+			 * paired for filter programming and these values are
+			 * swapped during Tx.
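+			 * For instance, if a profile extracts only the IPv4
+			 * source address, the swap update appends the matching
+			 * destination address words so that the src/dest pair
+			 * is complete before the swap registers are programmed.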
+			 */
+			status = ice_update_fd_swap(hw, prof_id, es);
+			if (status)
+				goto err_ice_add_prof;
+		}
+		status = ice_update_prof_masking(hw, blk, prof_id, masks);
+		if (status)
+			goto err_ice_add_prof;
+
+		/* and write new es */
+		ice_write_es(hw, blk, prof_id, es);
+	}
+
+	ice_prof_inc_ref(hw, blk, prof_id);
+
+	/* add profile info */
+	prof = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*prof), GFP_KERNEL);
+	if (!prof) {
+		status = ICE_ERR_NO_MEMORY;
+		goto err_ice_add_prof;
+	}
+
+	prof->profile_cookie = id;
+	prof->prof_id = prof_id;
+	prof->ptg_cnt = 0;
+	prof->context = 0;
+
+	/* build list of ptgs */
+	while (bytes && prof->ptg_cnt < ICE_MAX_PTG_PER_PROFILE) {
+		u8 bit;
+
+		if (!ptypes[byte]) {
+			bytes--;
+			byte++;
+			continue;
+		}
+
+		/* Examine 8 bits per byte */
+		for_each_set_bit(bit, (unsigned long *)&ptypes[byte],
+				 BITS_PER_BYTE) {
+			u16 ptype;
+			u8 ptg;
+
+			ptype = byte * BITS_PER_BYTE + bit;
+
+			/* The package should place all ptypes in a non-zero
+			 * PTG, so the following call should never fail.
+			 */
+			if (ice_ptg_find_ptype(hw, blk, ptype, &ptg))
+				continue;
+
+			/* If PTG is already added, skip and continue */
+			if (test_bit(ptg, ptgs_used))
+				continue;
+
+			set_bit(ptg, ptgs_used);
+			/* Check to see if there are any attributes for this
+			 * ptype, and add them if found.
+			 */
+			status = ice_add_prof_attrib(prof, ptg, ptype, attr,
+						     attr_cnt);
+			if (status == ICE_ERR_MAX_LIMIT)
+				break;
+			if (status) {
+				/* This is simply a ptype/PTG with no
+				 * attributes
+				 */
+				prof->ptg[prof->ptg_cnt] = ptg;
+				prof->attr[prof->ptg_cnt].flags = 0;
+				prof->attr[prof->ptg_cnt].mask = 0;
+
+				if (++prof->ptg_cnt >= ICE_MAX_PTG_PER_PROFILE)
+					break;
+			}
+		}
+
+		bytes--;
+		byte++;
+	}
+
+	list_add(&prof->list, &hw->blk[blk].es.prof_map);
+	status = 0;
+
+err_ice_add_prof:
+	mutex_unlock(&hw->blk[blk].es.prof_map_lock);
+	return status;
+}
+
+/**
+ * ice_search_prof_id - Search for a profile tracking ID
+ * @hw: pointer to the HW struct
+ * @blk: hardware block
+ * @id: profile tracking ID
+ *
+ * This will search for a profile tracking ID which was previously added.
+ * The profile map lock should be held before calling this function.
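+ *
+ * Returns a pointer to the matching entry, or NULL if the tracking ID was
+ * never added.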
+ */ +struct ice_prof_map * +ice_search_prof_id(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_prof_map *entry = NULL; + struct ice_prof_map *map; + + list_for_each_entry(map, &hw->blk[blk].es.prof_map, list) + if (map->profile_cookie == id) { + entry = map; + break; + } + + return entry; +} + +/** + * ice_set_prof_context - Set context for a given profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * @cntxt: context + */ +enum ice_status +ice_set_prof_context(struct ice_hw *hw, enum ice_block blk, u64 id, u64 cntxt) +{ + enum ice_status status = ICE_ERR_DOES_NOT_EXIST; + struct ice_prof_map *entry; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + entry = ice_search_prof_id(hw, blk, id); + if (entry) { + entry->context = cntxt; + status = 0; + } + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_get_prof_context - Get context for a given profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * @cntxt: pointer to variable to receive the context + */ +enum ice_status +ice_get_prof_context(struct ice_hw *hw, enum ice_block blk, u64 id, u64 *cntxt) +{ + enum ice_status status = ICE_ERR_DOES_NOT_EXIST; + struct ice_prof_map *entry; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + entry = ice_search_prof_id(hw, blk, id); + if (entry) { + *cntxt = entry->context; + status = 0; + } + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_vsig_prof_id_count - count profiles in a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG to remove the profile from + */ +static u16 +ice_vsig_prof_id_count(struct ice_hw *hw, enum ice_block blk, u16 vsig) +{ + u16 idx = vsig & ICE_VSIG_IDX_M, count = 0; + struct ice_vsig_prof *p; + + list_for_each_entry(p, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + count++; + + return count; +} + +/** + * ice_rel_tcam_idx - release a TCAM index + * @hw: pointer to the HW struct + * @blk: hardware block + * @idx: the index to release + */ +static enum ice_status +ice_rel_tcam_idx(struct ice_hw *hw, enum ice_block blk, u16 idx) +{ + /* Masks to invoke a never match entry */ + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFE, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x01, 0x00, 0x00, 0x00, 0x00 }; + enum ice_status status; + + /* write the TCAM entry */ + status = ice_tcam_write_entry(hw, blk, idx, 0, 0, 0, 0, 0, vl_msk, + dc_msk, nm_msk); + if (status) + return status; + + /* release the TCAM entry */ + status = ice_free_tcam_ent(hw, blk, idx); + + return status; +} + +/** + * ice_rem_prof_id - remove one profile from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @prof: pointer to profile structure to remove + */ +static enum ice_status +ice_rem_prof_id(struct ice_hw *hw, enum ice_block blk, + struct ice_vsig_prof *prof) +{ + enum ice_status status; + u16 i; + + for (i = 0; i < prof->tcam_count; i++) + if (prof->tcam[i].in_use) { + prof->tcam[i].in_use = false; + status = ice_rel_tcam_idx(hw, blk, + prof->tcam[i].tcam_idx); + if (status) + return ICE_ERR_HW_TABLE; } - /* if the section offset exceeds destination length, terminate - * table fill. 
+ return 0; +} + +/** + * ice_rem_vsig - remove VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG to remove + * @chg: the change list + */ +static enum ice_status +ice_rem_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *chg) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_vsi *vsi_cur; + struct ice_vsig_prof *d, *t; + enum ice_status status; + + /* remove TCAM entries */ + list_for_each_entry_safe(d, t, + &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + status = ice_rem_prof_id(hw, blk, d); + if (status) + return status; + + list_del(&d->list); + devm_kfree(ice_hw_to_dev(hw), d); + } + + /* Move all VSIS associated with this VSIG to the default VSIG */ + vsi_cur = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + /* If the VSIG has at least 1 VSI then iterate through the list + * and remove the VSIs before deleting the group. + */ + if (vsi_cur) + do { + struct ice_vsig_vsi *tmp = vsi_cur->next_vsi; + struct ice_chs_chg *p; + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), + GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + p->type = ICE_VSIG_REM; + p->orig_vsig = vsig; + p->vsig = ICE_DEFAULT_VSIG; + p->vsi = vsi_cur - hw->blk[blk].xlt2.vsis; + + list_add(&p->list_entry, chg); + + vsi_cur = tmp; + } while (vsi_cur); + + return ice_vsig_free(hw, blk, vsig); +} + +/** + * ice_rem_prof_id_vsig - remove a specific profile from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG to remove the profile from + * @hdl: profile handle indicating which profile to remove + * @chg: list to receive a record of changes + */ +static enum ice_status +ice_rem_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl, + struct list_head *chg) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_prof *p, *t; + enum ice_status status; + + list_for_each_entry_safe(p, t, + &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + if (p->profile_cookie == hdl) { + if (ice_vsig_prof_id_count(hw, blk, vsig) == 1) + /* this is the last profile, remove the VSIG */ + return ice_rem_vsig(hw, blk, vsig, chg); + + status = ice_rem_prof_id(hw, blk, p); + if (!status) { + list_del(&p->list); + devm_kfree(ice_hw_to_dev(hw), p); + } + return status; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_rem_flow_all - remove all flows with a particular profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + */ +static enum ice_status +ice_rem_flow_all(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_chs_chg *del, *tmp; + enum ice_status status; + struct list_head chg; + u16 i; + + INIT_LIST_HEAD(&chg); + + for (i = 1; i < ICE_MAX_VSIGS; i++) + if (hw->blk[blk].xlt2.vsig_tbl[i].in_use) { + if (ice_has_prof_vsig(hw, blk, i, id)) { + status = ice_rem_prof_id_vsig(hw, blk, i, id, + &chg); + if (status) + goto err_ice_rem_flow_all; + } + } + + status = ice_upd_prof_hw(hw, blk, &chg); + +err_ice_rem_flow_all: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } + + return status; +} + +/** + * ice_rem_prof - remove profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * + * This will remove the profile specified by the ID parameter, which was + * previously created through ice_add_prof. If any existing entries + * are associated with this profile, they will be removed as well. 
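+ * The profile's reference count is also decremented; once it reaches zero
+ * the hardware profile ID, its extraction sequence and any enabled masks
+ * are released.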
+ */ +enum ice_status ice_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_prof_map *pmap; + enum ice_status status; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + + pmap = ice_search_prof_id(hw, blk, id); + if (!pmap) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_rem_prof; + } + + /* remove all flows with this profile */ + status = ice_rem_flow_all(hw, blk, pmap->profile_cookie); + if (status) + goto err_ice_rem_prof; + + /* dereference profile, and possibly remove */ + ice_prof_dec_ref(hw, blk, pmap->prof_id); + + list_del(&pmap->list); + devm_kfree(ice_hw_to_dev(hw), pmap); + +err_ice_rem_prof: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_get_prof - get profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @hdl: profile handle + * @chg: change list + */ +static enum ice_status +ice_get_prof(struct ice_hw *hw, enum ice_block blk, u64 hdl, + struct list_head *chg) +{ + enum ice_status status = 0; + struct ice_prof_map *map; + struct ice_chs_chg *p; + u16 i; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + /* Get the details on the profile specified by the handle ID */ + map = ice_search_prof_id(hw, blk, hdl); + if (!map) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_get_prof; + } + + for (i = 0; i < map->ptg_cnt; i++) + if (!hw->blk[blk].es.written[map->prof_id]) { + /* add ES to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), + GFP_KERNEL); + if (!p) { + status = ICE_ERR_NO_MEMORY; + goto err_ice_get_prof; + } + + p->type = ICE_PTG_ES_ADD; + p->ptype = 0; + p->ptg = map->ptg[i]; + p->attr = map->attr[i]; + p->add_ptg = 0; + + p->add_prof = 1; + p->prof_id = map->prof_id; + + hw->blk[blk].es.written[map->prof_id] = true; + + list_add(&p->list_entry, chg); + } + +err_ice_get_prof: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + /* let caller clean up the change list */ + return status; +} + +/** + * ice_get_profs_vsig - get a copy of the list of profiles from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG from which to copy the list + * @lst: output list + * + * This routine makes a copy of the list of profiles in the specified VSIG. 
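+ * On an allocation failure, any entries already copied to the output list
+ * are freed and ICE_ERR_NO_MEMORY is returned.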
+ */ +static enum ice_status +ice_get_profs_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *lst) +{ + struct ice_vsig_prof *ent1, *ent2; + u16 idx = vsig & ICE_VSIG_IDX_M; + + list_for_each_entry(ent1, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + struct ice_vsig_prof *p; + + /* copy to the input list */ + p = devm_kmemdup(ice_hw_to_dev(hw), ent1, sizeof(*p), + GFP_KERNEL); + if (!p) + goto err_ice_get_profs_vsig; + + list_add_tail(&p->list, lst); + } + + return 0; + +err_ice_get_profs_vsig: + list_for_each_entry_safe(ent1, ent2, lst, list) { + list_del(&ent1->list); + devm_kfree(ice_hw_to_dev(hw), ent1); + } + + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_add_prof_to_lst - add profile entry to a list + * @hw: pointer to the HW struct + * @blk: hardware block + * @lst: the list to be added to + * @hdl: profile handle of entry to add + */ +static enum ice_status +ice_add_prof_to_lst(struct ice_hw *hw, enum ice_block blk, + struct list_head *lst, u64 hdl) +{ + enum ice_status status = 0; + struct ice_prof_map *map; + struct ice_vsig_prof *p; + u16 i; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + map = ice_search_prof_id(hw, blk, hdl); + if (!map) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_add_prof_to_lst; + } + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) { + status = ICE_ERR_NO_MEMORY; + goto err_ice_add_prof_to_lst; + } + + p->profile_cookie = map->profile_cookie; + p->prof_id = map->prof_id; + p->tcam_count = map->ptg_cnt; + + for (i = 0; i < map->ptg_cnt; i++) { + p->tcam[i].prof_id = map->prof_id; + p->tcam[i].tcam_idx = ICE_INVALID_TCAM; + p->tcam[i].ptg = map->ptg[i]; + p->tcam[i].attr = map->attr[i]; + } + + list_add(&p->list, lst); + +err_ice_add_prof_to_lst: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_move_vsi - move VSI to another VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI to move + * @vsig: the VSIG to move the VSI to + * @chg: the change list + */ +static enum ice_status +ice_move_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig, + struct list_head *chg) +{ + enum ice_status status; + struct ice_chs_chg *p; + u16 orig_vsig; + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + status = ice_vsig_find_vsi(hw, blk, vsi, &orig_vsig); + if (!status) + status = ice_vsig_add_mv_vsi(hw, blk, vsi, vsig); + + if (status) { + devm_kfree(ice_hw_to_dev(hw), p); + return status; + } + + p->type = ICE_VSI_MOVE; + p->vsi = vsi; + p->orig_vsig = orig_vsig; + p->vsig = vsig; + + list_add(&p->list_entry, chg); + + return 0; +} + +/** + * ice_set_tcam_flags - set TCAM flag don't care mask + * @mask: mask for flags + * @dc_mask: pointer to the don't care mask + */ +static void ice_set_tcam_flags(u16 mask, u8 dc_mask[ICE_TCAM_KEY_VAL_SZ]) +{ + u16 *flag_word; + + /* flags are lowest u16 */ + flag_word = (u16 *)dc_mask; + *flag_word = ~mask; +} + +/** + * ice_rem_chg_tcam_ent - remove a specific TCAM entry from change list + * @hw: pointer to the HW struct + * @idx: the index of the TCAM entry to remove + * @chg: the list of change structures to search + */ +static void +ice_rem_chg_tcam_ent(struct ice_hw *hw, u16 idx, struct list_head *chg) +{ + struct ice_chs_chg *pos, *tmp; + + list_for_each_entry_safe(tmp, pos, chg, list_entry) + if (tmp->type == ICE_TCAM_ADD && tmp->tcam_idx == idx) { + list_del(&tmp->list_entry); + devm_kfree(ice_hw_to_dev(hw), tmp); + } +} + +/** + * 
ice_prof_tcam_ena_dis - add enable or disable TCAM change + * @hw: pointer to the HW struct + * @blk: hardware block + * @enable: true to enable, false to disable + * @vsig: the VSIG of the TCAM entry + * @tcam: pointer the TCAM info structure of the TCAM to disable + * @chg: the change list + * + * This function appends an enable or disable TCAM entry in the change log + */ +static enum ice_status +ice_prof_tcam_ena_dis(struct ice_hw *hw, enum ice_block blk, bool enable, + u16 vsig, struct ice_tcam_inf *tcam, + struct list_head *chg) +{ + enum ice_status status; + struct ice_chs_chg *p; + + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0x00, 0x00, 0x00 }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x00, 0x00, 0x00, 0x00, 0x00 }; + + /* if disabling, free the TCAM */ + if (!enable) { + status = ice_rel_tcam_idx(hw, blk, tcam->tcam_idx); + + /* if we have already created a change for this TCAM entry, then + * we need to remove that entry, in order to prevent writing to + * a TCAM entry we no longer will have ownership of. */ - if (offset > dst_len) - return; + ice_rem_chg_tcam_ent(hw, tcam->tcam_idx, chg); + tcam->tcam_idx = 0; + tcam->in_use = 0; + return status; + } - /* if the sum of section size and offset exceed destination size - * then we are out of bounds of the HW table size for that PF. - * Changing section length to fill the remaining table space - * of that PF. + /* for re-enabling, reallocate a TCAM */ + /* for entries with empty attribute masks, allocate entry from + * the bottom of the TCAM table; otherwise, allocate from the + * top of the table in order to give it higher priority + */ + status = ice_alloc_tcam_ent(hw, blk, tcam->attr.mask == 0, + &tcam->tcam_idx); + if (status) + return status; + + /* add TCAM to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + /* set don't care masks for TCAM flags */ + ice_set_tcam_flags(tcam->attr.mask, dc_msk); + + status = ice_tcam_write_entry(hw, blk, tcam->tcam_idx, tcam->prof_id, + tcam->ptg, vsig, 0, tcam->attr.flags, + vl_msk, dc_msk, nm_msk); + if (status) + goto err_ice_prof_tcam_ena_dis; + + tcam->in_use = 1; + + p->type = ICE_TCAM_ADD; + p->add_tcam_idx = true; + p->prof_id = tcam->prof_id; + p->ptg = tcam->ptg; + p->vsig = 0; + p->tcam_idx = tcam->tcam_idx; + + /* log change */ + list_add(&p->list_entry, chg); + + return 0; + +err_ice_prof_tcam_ena_dis: + devm_kfree(ice_hw_to_dev(hw), p); + return status; +} + +/** + * ice_ptg_attr_in_use - determine if PTG and attribute pair is in use + * @ptg_attr: pointer to the PTG and attribute pair to check + * @ptgs_used: bitmap that denotes which PTGs are in use + * @attr_used: array of PTG and attributes pairs already used + * @attr_cnt: count of entries in the attr_used array + */ +static bool +ice_ptg_attr_in_use(struct ice_tcam_inf *ptg_attr, unsigned long *ptgs_used, + struct ice_tcam_inf *attr_used[], u16 attr_cnt) +{ + u16 i; + + if (!test_bit(ptg_attr->ptg, ptgs_used)) + return false; + + /* the PTG is used, so now look for correct attributes */ + for (i = 0; i < attr_cnt; i++) + if (attr_used[i]->ptg == ptg_attr->ptg && + attr_used[i]->attr.flags == ptg_attr->attr.flags && + attr_used[i]->attr.mask == ptg_attr->attr.mask) + return true; + + return false; +} + +/** + * ice_adj_prof_priorities - adjust profile based on priorities + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG for which to adjust profile priorities 
+ * @chg: the change list + */ +static enum ice_status +ice_adj_prof_priorities(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *chg) +{ + DECLARE_BITMAP(ptgs_used, ICE_XLT1_CNT); + struct ice_tcam_inf **attr_used; + enum ice_status status = 0; + struct ice_vsig_prof *t; + u16 attr_used_cnt = 0; + u16 idx; + +#define ICE_MAX_PTG_ATTRS 1024 + attr_used = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_PTG_ATTRS, + sizeof(*attr_used), GFP_KERNEL); + if (!attr_used) + return ICE_ERR_NO_MEMORY; + + bitmap_zero(ptgs_used, ICE_XLT1_CNT); + idx = vsig & ICE_VSIG_IDX_M; + + /* Priority is based on the order in which the profiles are added. The + * newest added profile has highest priority and the oldest added + * profile has the lowest priority. Since the profile property list for + * a VSIG is sorted from newest to oldest, this code traverses the list + * in order and enables the first of each PTG that it finds (that is not + * already enabled); it also disables any duplicate PTGs that it finds + * in the older profiles (that are currently enabled). + */ + + list_for_each_entry(t, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + u16 i; + + for (i = 0; i < t->tcam_count; i++) { + bool used; + + /* Scan the priorities from newest to oldest. + * Make sure that the newest profiles take priority. + */ + used = ice_ptg_attr_in_use(&t->tcam[i], ptgs_used, + attr_used, attr_used_cnt); + + if (used && t->tcam[i].in_use) { + /* need to mark this PTG as never match, as it + * was already in use and therefore duplicate + * (and lower priority) + */ + status = ice_prof_tcam_ena_dis(hw, blk, false, + vsig, + &t->tcam[i], + chg); + if (status) + goto err_ice_adj_prof_priorities; + } else if (!used && !t->tcam[i].in_use) { + /* need to enable this PTG, as it in not in use + * and not enabled (highest priority) + */ + status = ice_prof_tcam_ena_dis(hw, blk, true, + vsig, + &t->tcam[i], + chg); + if (status) + goto err_ice_adj_prof_priorities; + } + + /* keep track of used ptgs */ + set_bit(t->tcam[i].ptg, ptgs_used); + if (attr_used_cnt < ICE_MAX_PTG_ATTRS) + attr_used[attr_used_cnt++] = &t->tcam[i]; + else + ice_debug(hw, ICE_DBG_INIT, "Warn: ICE_MAX_PTG_ATTRS exceeded\n"); + } + } + +err_ice_adj_prof_priorities: + devm_kfree(ice_hw_to_dev(hw), attr_used); + return status; +} + +/** + * ice_add_prof_id_vsig - add profile to VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG to which this profile is to be added + * @hdl: the profile handle indicating the profile to add + * @rev: true to add entries to the end of the list + * @chg: the change list + */ +static enum ice_status +ice_add_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl, + bool rev, struct list_head *chg) +{ + /* Masks that ignore flags */ + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0x00, 0x00, 0x00 }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x00, 0x00, 0x00, 0x00, 0x00 }; + enum ice_status status = 0; + struct ice_prof_map *map; + struct ice_vsig_prof *t; + struct ice_chs_chg *p; + u16 vsig_idx, i; + + /* Error, if this VSIG already has this profile */ + if (ice_has_prof_vsig(hw, blk, vsig, hdl)) + return ICE_ERR_ALREADY_EXISTS; + + /* new VSIG profile structure */ + t = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*t), GFP_KERNEL); + if (!t) + return ICE_ERR_NO_MEMORY; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + /* Get the details on the profile specified by the handle ID */ + map = ice_search_prof_id(hw, 
blk, hdl); + if (!map) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_add_prof_id_vsig; + } + + t->profile_cookie = map->profile_cookie; + t->prof_id = map->prof_id; + t->tcam_count = map->ptg_cnt; + + /* create TCAM entries */ + for (i = 0; i < map->ptg_cnt; i++) { + u16 tcam_idx; + + /* add TCAM to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) { + status = ICE_ERR_NO_MEMORY; + goto err_ice_add_prof_id_vsig; + } + + /* allocate the TCAM entry index */ + /* for entries with empty attribute masks, allocate entry from + * the bottom of the TCAM table; otherwise, allocate from the + * top of the table in order to give it higher priority */ - if ((offset + sect_len) > dst_len) - sect_len = dst_len - offset; + status = ice_alloc_tcam_ent(hw, blk, map->attr[i].mask == 0, + &tcam_idx); + if (status) { + devm_kfree(ice_hw_to_dev(hw), p); + goto err_ice_add_prof_id_vsig; + } + + t->tcam[i].ptg = map->ptg[i]; + t->tcam[i].prof_id = map->prof_id; + t->tcam[i].tcam_idx = tcam_idx; + t->tcam[i].attr = map->attr[i]; + t->tcam[i].in_use = true; + + p->type = ICE_TCAM_ADD; + p->add_tcam_idx = true; + p->prof_id = t->tcam[i].prof_id; + p->ptg = t->tcam[i].ptg; + p->vsig = vsig; + p->tcam_idx = t->tcam[i].tcam_idx; + + /* set don't care masks for TCAM flags */ + ice_set_tcam_flags(t->tcam[i].attr.mask, dc_msk); + + /* write the TCAM entry */ + status = ice_tcam_write_entry(hw, blk, t->tcam[i].tcam_idx, + t->tcam[i].prof_id, + t->tcam[i].ptg, vsig, 0, + t->tcam[i].attr.flags, vl_msk, + dc_msk, nm_msk); + if (status) { + devm_kfree(ice_hw_to_dev(hw), p); + goto err_ice_add_prof_id_vsig; + } + + /* log change */ + list_add(&p->list_entry, chg); + } + + /* add profile to VSIG */ + vsig_idx = vsig & ICE_VSIG_IDX_M; + if (rev) + list_add_tail(&t->list, + &hw->blk[blk].xlt2.vsig_tbl[vsig_idx].prop_lst); + else + list_add(&t->list, + &hw->blk[blk].xlt2.vsig_tbl[vsig_idx].prop_lst); + + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; - memcpy(dst + offset, src, sect_len); - offset += sect_len; - sect = ice_pkg_enum_section(NULL, &state, sid); - } +err_ice_add_prof_id_vsig: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + /* let caller clean up the change list */ + devm_kfree(ice_hw_to_dev(hw), t); + return status; } /** - * ice_fill_blk_tbls - Read package context for tables - * @hw: pointer to the hardware structure - * - * Reads the current package contents and populates the driver - * database with the data iteratively for all advanced feature - * blocks. Assume that the HW tables have been allocated. 
+ * ice_create_prof_id_vsig - add a new VSIG with a single profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the initial VSI that will be in the VSIG + * @hdl: the profile handle of the profile that will be added to the VSIG + * @chg: the change list */ -void ice_fill_blk_tbls(struct ice_hw *hw) +static enum ice_status +ice_create_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl, + struct list_head *chg) { - u8 i; + enum ice_status status; + struct ice_chs_chg *p; + u16 new_vsig; - for (i = 0; i < ICE_BLK_COUNT; i++) { - enum ice_block blk_id = (enum ice_block)i; + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].xlt1.sid); - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].xlt2.sid); - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].prof.sid); - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].prof_redir.sid); - ice_fill_tbl(hw, blk_id, hw->blk[blk_id].es.sid); + new_vsig = ice_vsig_alloc(hw, blk); + if (!new_vsig) { + status = ICE_ERR_HW_TABLE; + goto err_ice_create_prof_id_vsig; } - ice_init_sw_db(hw); + status = ice_move_vsi(hw, blk, vsi, new_vsig, chg); + if (status) + goto err_ice_create_prof_id_vsig; + + status = ice_add_prof_id_vsig(hw, blk, new_vsig, hdl, false, chg); + if (status) + goto err_ice_create_prof_id_vsig; + + p->type = ICE_VSIG_ADD; + p->vsi = vsi; + p->orig_vsig = ICE_DEFAULT_VSIG; + p->vsig = new_vsig; + + list_add(&p->list_entry, chg); + + return 0; + +err_ice_create_prof_id_vsig: + /* let caller clean up the change list */ + devm_kfree(ice_hw_to_dev(hw), p); + return status; } /** - * ice_free_hw_tbls - free hardware table memory - * @hw: pointer to the hardware structure + * ice_create_vsig_from_lst - create a new VSIG with a list of profiles + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the initial VSI that will be in the VSIG + * @lst: the list of profiles that will be added to the VSIG + * @new_vsig: returns the new VSIG + * @chg: the change list */ -void ice_free_hw_tbls(struct ice_hw *hw) +static enum ice_status +ice_create_vsig_from_lst(struct ice_hw *hw, enum ice_block blk, u16 vsi, + struct list_head *lst, u16 *new_vsig, + struct list_head *chg) { - u8 i; + struct ice_vsig_prof *t; + enum ice_status status; + u16 vsig; - for (i = 0; i < ICE_BLK_COUNT; i++) { - hw->blk[i].is_list_init = false; + vsig = ice_vsig_alloc(hw, blk); + if (!vsig) + return ICE_ERR_HW_TABLE; - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptypes); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptg_tbl); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.vsig_tbl); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt2.vsis); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].prof.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].prof_redir.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.t); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.ref_count); - devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.written); + status = ice_move_vsi(hw, blk, vsi, vsig, chg); + if (status) + return status; + + list_for_each_entry(t, lst, list) { + /* Reverse the order here since we are copying the list */ + status = ice_add_prof_id_vsig(hw, blk, vsig, t->profile_cookie, + true, chg); + if (status) + return status; } - memset(hw->blk, 0, sizeof(hw->blk)); + *new_vsig = vsig; + + return 0; } /** - * ice_clear_hw_tbls - clear HW tables and flow profiles - * @hw: pointer to the hardware structure + * 
ice_find_prof_vsig - find a VSIG with a specific profile handle + * @hw: pointer to the HW struct + * @blk: hardware block + * @hdl: the profile handle of the profile to search for + * @vsig: returns the VSIG with the matching profile */ -void ice_clear_hw_tbls(struct ice_hw *hw) +static bool +ice_find_prof_vsig(struct ice_hw *hw, enum ice_block blk, u64 hdl, u16 *vsig) { - u8 i; + struct ice_vsig_prof *t; + enum ice_status status; + struct list_head lst; - for (i = 0; i < ICE_BLK_COUNT; i++) { - struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; - struct ice_prof_tcam *prof = &hw->blk[i].prof; - struct ice_xlt1 *xlt1 = &hw->blk[i].xlt1; - struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; - struct ice_es *es = &hw->blk[i].es; + INIT_LIST_HEAD(&lst); - memset(xlt1->ptypes, 0, xlt1->count * sizeof(*xlt1->ptypes)); - memset(xlt1->ptg_tbl, 0, - ICE_MAX_PTGS * sizeof(*xlt1->ptg_tbl)); - memset(xlt1->t, 0, xlt1->count * sizeof(*xlt1->t)); + t = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*t), GFP_KERNEL); + if (!t) + return false; - memset(xlt2->vsis, 0, xlt2->count * sizeof(*xlt2->vsis)); - memset(xlt2->vsig_tbl, 0, - xlt2->count * sizeof(*xlt2->vsig_tbl)); - memset(xlt2->t, 0, xlt2->count * sizeof(*xlt2->t)); + t->profile_cookie = hdl; + list_add(&t->list, &lst); - memset(prof->t, 0, prof->count * sizeof(*prof->t)); - memset(prof_redir->t, 0, - prof_redir->count * sizeof(*prof_redir->t)); + status = ice_find_dup_props_vsig(hw, blk, &lst, vsig); - memset(es->t, 0, es->count * sizeof(*es->t)); - memset(es->ref_count, 0, es->count * sizeof(*es->ref_count)); - memset(es->written, 0, es->count * sizeof(*es->written)); + list_del(&t->list); + devm_kfree(ice_hw_to_dev(hw), t); + + return !status; +} + +/** + * ice_add_vsi_flow - add VSI flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: input VSI + * @vsig: target VSIG to include the input VSI + * + * Calling this function will add the VSI to a given VSIG and + * update the HW tables accordingly. This call can be used to + * add multiple VSIs to a VSIG if we know beforehand that those + * VSIs have the same characteristics as the VSIG. This avoids + * spending time generating a new VSIG and TCAM entries, and + * then rolling them back, when a matching VSIG already exists. + */ +enum ice_status +ice_add_vsi_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) +{ + struct ice_chs_chg *tmp, *del; + struct list_head chg; + enum ice_status status; + + /* if target VSIG is default the move is invalid */ + if ((vsig & ICE_VSIG_IDX_M) == ICE_DEFAULT_VSIG) + return ICE_ERR_PARAM; + + INIT_LIST_HEAD(&chg); + + /* move VSI to the VSIG that matches */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + /* update hardware if success */ + if (!status) + status = ice_upd_prof_hw(hw, blk, &chg); + + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); } + + return status; } /** - * ice_init_hw_tbls - init hardware table memory - * @hw: pointer to the hardware structure + * ice_add_prof_id_flow - add profile flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI to enable with the profile specified by ID + * @hdl: profile handle + * + * Calling this function will update the hardware tables to enable the + * profile indicated by the profile handle for the specified VSI. Once + * successfully called, the flow will be enabled.
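+ * + * A minimal caller-side sketch (vsi_num and prof_hdl are illustrative + * placeholders, not identifiers from this patch; any registered block, + * VSI and profile handle may be used): + * + *	status = ice_add_prof_id_flow(hw, ICE_BLK_RSS, vsi_num, prof_hdl); + *	if (status) + *		return status;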
*/ -enum ice_status ice_init_hw_tbls(struct ice_hw *hw) +enum ice_status +ice_add_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl) { - u8 i; + struct ice_vsig_prof *tmp1, *del1; + struct ice_chs_chg *tmp, *del; + struct list_head union_lst; + enum ice_status status; + struct list_head chg; + u16 vsig; - for (i = 0; i < ICE_BLK_COUNT; i++) { - struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; - struct ice_prof_tcam *prof = &hw->blk[i].prof; - struct ice_xlt1 *xlt1 = &hw->blk[i].xlt1; - struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; - struct ice_es *es = &hw->blk[i].es; - u16 j; + INIT_LIST_HEAD(&union_lst); + INIT_LIST_HEAD(&chg); - if (hw->blk[i].is_list_init) - continue; + /* Get profile */ + status = ice_get_prof(hw, blk, hdl, &chg); + if (status) + return status; - hw->blk[i].is_list_init = true; + /* determine if VSI is already part of a VSIG */ + status = ice_vsig_find_vsi(hw, blk, vsi, &vsig); + if (!status && vsig) { + bool only_vsi; + u16 or_vsig; + u16 ref; - hw->blk[i].overwrite = blk_sizes[i].overwrite; - es->reverse = blk_sizes[i].reverse; + /* found in VSIG */ + or_vsig = vsig; - xlt1->sid = ice_blk_sids[i][ICE_SID_XLT1_OFF]; - xlt1->count = blk_sizes[i].xlt1; + /* make sure that there is no overlap/conflict between the new + * characteristics and the existing ones; we don't support that + * scenario + */ + if (ice_has_prof_vsig(hw, blk, vsig, hdl)) { + status = ICE_ERR_ALREADY_EXISTS; + goto err_ice_add_prof_id_flow; + } - xlt1->ptypes = devm_kcalloc(ice_hw_to_dev(hw), xlt1->count, - sizeof(*xlt1->ptypes), GFP_KERNEL); + /* last VSI in the VSIG? */ + status = ice_vsig_get_ref(hw, blk, vsig, &ref); + if (status) + goto err_ice_add_prof_id_flow; + only_vsi = (ref == 1); - if (!xlt1->ptypes) - goto err; + /* create a union of the current profiles and the one being + * added + */ + status = ice_get_profs_vsig(hw, blk, vsig, &union_lst); + if (status) + goto err_ice_add_prof_id_flow; - xlt1->ptg_tbl = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_PTGS, - sizeof(*xlt1->ptg_tbl), - GFP_KERNEL); + status = ice_add_prof_to_lst(hw, blk, &union_lst, hdl); + if (status) + goto err_ice_add_prof_id_flow; + + /* search for an existing VSIG with an exact charc match */ + status = ice_find_dup_props_vsig(hw, blk, &union_lst, &vsig); + if (!status) { + /* move VSI to the VSIG that matches */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* VSI has been moved out of or_vsig. If the or_vsig had + * only that VSI it is now empty and can be removed. + */ + if (only_vsi) { + status = ice_rem_vsig(hw, blk, or_vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } else if (only_vsi) { + /* If the original VSIG only contains one VSI, then it + * will be the requesting VSI. In this case the VSI is + * not sharing entries and we can simply add the new + * profile to the VSIG. 
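+ * No VSI move is needed in this case; the VSIG membership stays + * the same and only its profile list and TCAM entries are + * extended.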
+ */ + status = ice_add_prof_id_vsig(hw, blk, vsig, hdl, false, + &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } else { + /* No match, so we need a new VSIG */ + status = ice_create_vsig_from_lst(hw, blk, vsi, + &union_lst, &vsig, + &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } else { + /* need to find or add a VSIG */ + /* search for an existing VSIG with an exact characteristic match */ + if (ice_find_prof_vsig(hw, blk, hdl, &vsig)) { + /* found an exact match; add or move VSI to the VSIG + * that matches + */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } else { + /* no exact match was found, so we need to add a VSIG */ + status = ice_create_prof_id_vsig(hw, blk, vsi, hdl, + &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } - if (!xlt1->ptg_tbl) - goto err; + /* update hardware */ + if (!status) + status = ice_upd_prof_hw(hw, blk, &chg); - xlt1->t = devm_kcalloc(ice_hw_to_dev(hw), xlt1->count, - sizeof(*xlt1->t), GFP_KERNEL); - if (!xlt1->t) - goto err; +err_ice_add_prof_id_flow: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } - xlt2->sid = ice_blk_sids[i][ICE_SID_XLT2_OFF]; - xlt2->count = blk_sizes[i].xlt2; + list_for_each_entry_safe(del1, tmp1, &union_lst, list) { + list_del(&del1->list); + devm_kfree(ice_hw_to_dev(hw), del1); + } - xlt2->vsis = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, - sizeof(*xlt2->vsis), GFP_KERNEL); + return status; +} - if (!xlt2->vsis) - goto err; +/** + * ice_add_flow - add flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: array of VSIs to enable with the profile specified by ID + * @count: number of elements in the VSI array + * @id: profile tracking ID + * + * Calling this function will update the hardware tables to enable the + * profile indicated by the ID parameter for the VSIs specified in the VSI + * array. Once successfully called, the flow will be enabled.
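+ * + * For example, to enable one profile on two VSIs (the block, VSI + * numbers and profile_id below are illustrative only): + * + *	u16 vsi[] = { 3, 5 }; + * + *	status = ice_add_flow(hw, ICE_BLK_FD, vsi, 2, profile_id);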
+ */ +enum ice_status +ice_add_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi[], u8 count, + u64 id) +{ + enum ice_status status; + u16 i; - xlt2->vsig_tbl = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, - sizeof(*xlt2->vsig_tbl), - GFP_KERNEL); - if (!xlt2->vsig_tbl) - goto err; + for (i = 0; i < count; i++) { + status = ice_add_prof_id_flow(hw, blk, vsi[i], id); + if (status) + return status; + } - for (j = 0; j < xlt2->count; j++) - INIT_LIST_HEAD(&xlt2->vsig_tbl[j].prop_lst); + return 0; +} - xlt2->t = devm_kcalloc(ice_hw_to_dev(hw), xlt2->count, - sizeof(*xlt2->t), GFP_KERNEL); - if (!xlt2->t) - goto err; +/** + * ice_rem_prof_from_list - remove a profile from list + * @hw: pointer to the HW struct + * @lst: list to remove the profile from + * @hdl: the profile handle indicating the profile to remove + */ +static enum ice_status +ice_rem_prof_from_list(struct ice_hw *hw, struct list_head *lst, u64 hdl) +{ + struct ice_vsig_prof *ent, *tmp; - prof->sid = ice_blk_sids[i][ICE_SID_PR_OFF]; - prof->count = blk_sizes[i].prof_tcam; - prof->max_prof_id = blk_sizes[i].prof_id; - prof->cdid_bits = blk_sizes[i].prof_cdid_bits; - prof->t = devm_kcalloc(ice_hw_to_dev(hw), prof->count, - sizeof(*prof->t), GFP_KERNEL); + list_for_each_entry_safe(ent, tmp, lst, list) + if (ent->profile_cookie == hdl) { + list_del(&ent->list); + devm_kfree(ice_hw_to_dev(hw), ent); + return 0; + } - if (!prof->t) - goto err; + return ICE_ERR_DOES_NOT_EXIST; +} - prof_redir->sid = ice_blk_sids[i][ICE_SID_PR_REDIR_OFF]; - prof_redir->count = blk_sizes[i].prof_redir; - prof_redir->t = devm_kcalloc(ice_hw_to_dev(hw), - prof_redir->count, - sizeof(*prof_redir->t), - GFP_KERNEL); +/** + * ice_rem_prof_id_flow - remove flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI from which to remove the profile specified by ID + * @hdl: profile tracking handle + * + * Calling this function will update the hardware tables to remove the + * profile indicated by the profile handle for the specified VSI. Once + * successfully called, the flow will be disabled. + */ +enum ice_status +ice_rem_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl) +{ + struct ice_vsig_prof *tmp1, *del1; + struct ice_chs_chg *tmp, *del; + struct list_head chg, copy; + enum ice_status status; + u16 vsig; - if (!prof_redir->t) - goto err; + INIT_LIST_HEAD(&copy); + INIT_LIST_HEAD(&chg); - es->sid = ice_blk_sids[i][ICE_SID_ES_OFF]; - es->count = blk_sizes[i].es; - es->fvw = blk_sizes[i].fvw; - es->t = devm_kcalloc(ice_hw_to_dev(hw), - (u32)(es->count * es->fvw), - sizeof(*es->t), GFP_KERNEL); - if (!es->t) - goto err; + /* determine if VSI is already part of a VSIG */ + status = ice_vsig_find_vsi(hw, blk, vsi, &vsig); + if (!status && vsig) { + bool last_profile; + bool only_vsi; + u16 ref; - es->ref_count = devm_kcalloc(ice_hw_to_dev(hw), es->count, - sizeof(*es->ref_count), - GFP_KERNEL); - if (!es->ref_count) - goto err; + /* found in VSIG */ + last_profile = ice_vsig_prof_id_count(hw, blk, vsig) == 1; + status = ice_vsig_get_ref(hw, blk, vsig, &ref); + if (status) + goto err_ice_rem_prof_id_flow; + only_vsi = (ref == 1); + + if (only_vsi) { + /* If the original VSIG only contains one reference, + * which will be the requesting VSI, then the VSI is not + * sharing entries and we can simply remove the specific + * characteristics from the VSIG.
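+ * If this is also the last profile in the VSIG, the VSIG + * itself is removed below; otherwise only this profile's + * TCAM entries are torn down and the remaining profile + * priorities are re-adjusted.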
+ */ - es->written = devm_kcalloc(ice_hw_to_dev(hw), es->count, - sizeof(*es->written), GFP_KERNEL); - if (!es->written) - goto err; + if (last_profile) { + /* If there are no profiles left for this VSIG, + * then simply remove the VSIG. + */ + status = ice_rem_vsig(hw, blk, vsig, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } else { + status = ice_rem_prof_id_vsig(hw, blk, vsig, + hdl, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, + &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } + + } else { + /* Make a copy of the VSIG's list of profiles */ + status = ice_get_profs_vsig(hw, blk, vsig, &copy); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Remove specified profile entry from the list */ + status = ice_rem_prof_from_list(hw, &copy, hdl); + if (status) + goto err_ice_rem_prof_id_flow; + + if (list_empty(&copy)) { + status = ice_move_vsi(hw, blk, vsi, + ICE_DEFAULT_VSIG, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + } else if (!ice_find_dup_props_vsig(hw, blk, &copy, + &vsig)) { + /* Found a VSIG with a matching profile list; + * move the VSI to that VSIG + */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } else { + /* since no existing VSIG supports this + * characteristic pattern, we need to create a + * new VSIG and TCAM entries + */ + status = ice_create_vsig_from_lst(hw, blk, vsi, + &copy, &vsig, + &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, + &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } + } + } else { + status = ICE_ERR_DOES_NOT_EXIST; } - return 0; -err: - ice_free_hw_tbls(hw); - return ICE_ERR_NO_MEMORY; + /* update hardware tables */ + if (!status) + status = ice_upd_prof_hw(hw, blk, &chg); + +err_ice_rem_prof_id_flow: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } + + list_for_each_entry_safe(del1, tmp1, &copy, list) { + list_del(&del1->list); + devm_kfree(ice_hw_to_dev(hw), del1); + } + + return status; +} + +/** + * ice_rem_flow - remove flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: array of VSIs from which to remove the profile specified by ID + * @count: number of elements in the VSI array + * @id: profile tracking ID + * + * The function will remove flows from the specified VSIs that were enabled + * using ice_add_flow. The ID value will indicate which profile will be + * removed. Once successfully called, the flow will be disabled. + */ +enum ice_status +ice_rem_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi[], u8 count, + u64 id) +{ + enum ice_status status; + u16 i; + + for (i = 0; i < count; i++) { + status = ice_rem_prof_id_flow(hw, blk, vsi[i], id); + if (status) + return status; + } + + return 0; } diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.h b/drivers/net/ethernet/intel/ice/ice_flex_pipe.h index 37eb282742d1744a1ddcdb1187999182b7f1305f..356d9f46f554e69529986b65ab9f53d03ab946af 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.h +++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_FLEX_PIPE_H_ #define _ICE_FLEX_PIPE_H_ @@ -18,6 +18,72 @@ #define ICE_PKG_CNT 4 +enum ice_status +ice_acquire_change_lock(struct ice_hw *hw, enum ice_aq_res_access_type access); +void ice_release_change_lock(struct ice_hw *hw); +enum ice_status +ice_find_prot_off(struct ice_hw *hw, enum ice_block blk, u8 prof, u16 fv_idx, + u8 *prot, u16 *off); +enum ice_status +ice_find_label_value(struct ice_seg *ice_seg, char const *name, u32 type, + u16 *value); +void +ice_get_sw_fv_bitmap(struct ice_hw *hw, enum ice_prof_type type, + unsigned long *bm); +void +ice_init_prof_result_bm(struct ice_hw *hw); +enum ice_status +ice_get_sw_fv_list(struct ice_hw *hw, u8 *prot_ids, u16 ids_cnt, + unsigned long *bm, struct list_head *fv_list); +enum ice_status +ice_pkg_buf_unreserve_section(struct ice_buf_build *bld, u16 count); +u16 ice_pkg_buf_get_free_space(struct ice_buf_build *bld); +enum ice_status +ice_aq_upload_section(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, + u16 buf_size, struct ice_sq_cd *cd); +bool +ice_get_open_tunnel_port(struct ice_hw *hw, enum ice_tunnel_type type, + u16 *port); +enum ice_status +ice_is_create_tunnel_possible(struct ice_hw *hw, enum ice_tunnel_type type, + u16 port); +bool ice_is_tunnel_empty(struct ice_hw *hw); +enum ice_status +ice_create_tunnel(struct ice_hw *hw, enum ice_tunnel_type type, u16 port); +enum ice_status ice_set_dvm_boost_entries(struct ice_hw *hw); +enum ice_status ice_destroy_tunnel(struct ice_hw *hw, u16 port, bool all); +bool ice_tunnel_port_in_use(struct ice_hw *hw, u16 port, u16 *index); +bool +ice_tunnel_get_type(struct ice_hw *hw, u16 port, enum ice_tunnel_type *type); +enum ice_status ice_replay_tunnels(struct ice_hw *hw); + +/* RX parser PType functions */ +bool ice_hw_ptype_ena(struct ice_hw *hw, u16 ptype); + +/* XLT1/PType group functions */ +enum ice_status ice_ptg_update_xlt1(struct ice_hw *hw, enum ice_block blk); +void ice_ptg_free(struct ice_hw *hw, enum ice_block blk, u8 ptg); + +/* XLT2/VSI group functions */ +enum ice_status ice_vsig_update_xlt2(struct ice_hw *hw, enum ice_block blk); +enum ice_status +ice_add_prof(struct ice_hw *hw, enum ice_block blk, u64 id, u8 ptypes[], + const struct ice_ptype_attributes *attr, u16 attr_cnt, + struct ice_fv_word *es, u16 *masks); +void ice_init_all_prof_masks(struct ice_hw *hw); +void ice_shutdown_all_prof_masks(struct ice_hw *hw); +struct ice_prof_map * +ice_search_prof_id(struct ice_hw *hw, enum ice_block blk, u64 id); +enum ice_status +ice_add_vsi_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig); +enum ice_status +ice_add_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl); +enum ice_status +ice_rem_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl); +enum ice_status +ice_set_prof_context(struct ice_hw *hw, enum ice_block blk, u64 id, u64 cntxt); +enum ice_status +ice_get_prof_context(struct ice_hw *hw, enum ice_block blk, u64 id, u64 *cntxt); enum ice_status ice_init_pkg(struct ice_hw *hw, u8 *buff, u32 len); enum ice_status ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, u32 len); @@ -26,4 +92,21 @@ void ice_free_seg(struct ice_hw *hw); void ice_fill_blk_tbls(struct ice_hw *hw); void ice_clear_hw_tbls(struct ice_hw *hw); void ice_free_hw_tbls(struct ice_hw *hw); +enum ice_status +ice_add_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi[], u8 count, + u64 id); +enum ice_status +ice_rem_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi[], u8 count, + u64 id); +enum ice_status +ice_rem_prof(struct ice_hw *hw, enum 
ice_block blk, u64 id); +struct ice_buf_build * +ice_pkg_buf_alloc_single_section(struct ice_hw *hw, u32 type, u16 size, + void **section); +struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld); +void ice_pkg_buf_free(struct ice_hw *hw, struct ice_buf_build *bld); + +enum ice_status +ice_set_key(u8 *key, u16 size, u8 *val, u8 *upd, u8 *dc, u8 *nm, u16 off, + u16 len); #endif /* _ICE_FLEX_PIPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_flex_type.h b/drivers/net/ethernet/intel/ice/ice_flex_type.h index 5d5a7eaffa308f51772ca28ca1347678c896e62c..32bcb77040482e6e6224cf93ef8cfd5bcc31da23 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_type.h +++ b/drivers/net/ethernet/intel/ice/ice_flex_type.h @@ -1,8 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_FLEX_TYPE_H_ #define _ICE_FLEX_TYPE_H_ + +#define ICE_FV_OFFSET_INVAL 0x1FF + /* Extraction Sequence (Field Vector) Table */ struct ice_fv_word { u8 prot_id; @@ -10,6 +13,8 @@ struct ice_fv_word { u8 resvrd; } __packed; +#define ICE_MAX_NUM_PROFILES 256 + #define ICE_MAX_FV_WORDS 48 struct ice_fv { struct ice_fv_word ew[ICE_MAX_FV_WORDS]; @@ -17,9 +22,9 @@ struct ice_fv { /* Package and segment headers and tables */ struct ice_pkg_hdr { - struct ice_pkg_ver format_ver; + struct ice_pkg_ver pkg_format_ver; __le32 seg_count; - __le32 seg_offset[1]; + __le32 seg_offset[]; }; /* generic segment */ @@ -27,9 +32,9 @@ struct ice_generic_seg_hdr { #define SEGMENT_TYPE_METADATA 0x00000001 #define SEGMENT_TYPE_ICE 0x00000010 __le32 seg_type; - struct ice_pkg_ver seg_ver; + struct ice_pkg_ver seg_format_ver; __le32 seg_size; - char seg_name[ICE_PKG_NAME_SIZE]; + char seg_id[ICE_PKG_NAME_SIZE]; }; /* ice specific segment */ @@ -50,12 +55,12 @@ struct ice_device_id_entry { struct ice_seg { struct ice_generic_seg_hdr hdr; __le32 device_table_count; - struct ice_device_id_entry device_table[1]; + struct ice_device_id_entry device_table[]; }; struct ice_nvm_table { __le32 table_count; - __le32 vers[1]; + __le32 vers[]; }; struct ice_buf { @@ -65,14 +70,14 @@ struct ice_buf { struct ice_buf_table { __le32 buf_count; - struct ice_buf buf_array[1]; + struct ice_buf buf_array[]; }; /* global metadata specific segment */ struct ice_global_metadata_seg { struct ice_generic_seg_hdr hdr; struct ice_pkg_ver pkg_ver; - __le32 track_id; + __le32 rsvd; char pkg_name[ICE_PKG_NAME_SIZE]; }; @@ -98,48 +103,141 @@ struct ice_section_entry { struct ice_buf_hdr { __le16 section_count; __le16 data_end; - struct ice_section_entry section_entry[1]; + struct ice_section_entry section_entry[]; }; #define ICE_MAX_ENTRIES_IN_BUF(hd_sz, ent_sz) ((ICE_PKG_BUF_SIZE - \ - sizeof(struct ice_buf_hdr) - (hd_sz)) / (ent_sz)) + struct_size((struct ice_buf_hdr *)0, section_entry, 1) - (hd_sz)) /\ + (ent_sz)) /* ice package section IDs */ +#define ICE_SID_METADATA 1 +#define ICE_SID_XLT0_SW 10 +#define ICE_SID_XLT_KEY_BUILDER_SW 11 #define ICE_SID_XLT1_SW 12 #define ICE_SID_XLT2_SW 13 #define ICE_SID_PROFID_TCAM_SW 14 #define ICE_SID_PROFID_REDIR_SW 15 #define ICE_SID_FLD_VEC_SW 16 +#define ICE_SID_CDID_KEY_BUILDER_SW 17 +#define ICE_SID_CDID_REDIR_SW 18 +#define ICE_SID_XLT0_ACL 20 +#define ICE_SID_XLT_KEY_BUILDER_ACL 21 #define ICE_SID_XLT1_ACL 22 #define ICE_SID_XLT2_ACL 23 #define ICE_SID_PROFID_TCAM_ACL 24 #define ICE_SID_PROFID_REDIR_ACL 25 #define ICE_SID_FLD_VEC_ACL 26 +#define ICE_SID_CDID_KEY_BUILDER_ACL 27 +#define ICE_SID_CDID_REDIR_ACL 28 +#define ICE_SID_XLT0_FD 
30 +#define ICE_SID_XLT_KEY_BUILDER_FD 31 #define ICE_SID_XLT1_FD 32 #define ICE_SID_XLT2_FD 33 #define ICE_SID_PROFID_TCAM_FD 34 #define ICE_SID_PROFID_REDIR_FD 35 #define ICE_SID_FLD_VEC_FD 36 +#define ICE_SID_CDID_KEY_BUILDER_FD 37 +#define ICE_SID_CDID_REDIR_FD 38 +#define ICE_SID_XLT0_RSS 40 +#define ICE_SID_XLT_KEY_BUILDER_RSS 41 #define ICE_SID_XLT1_RSS 42 #define ICE_SID_XLT2_RSS 43 #define ICE_SID_PROFID_TCAM_RSS 44 #define ICE_SID_PROFID_REDIR_RSS 45 #define ICE_SID_FLD_VEC_RSS 46 - +#define ICE_SID_CDID_KEY_BUILDER_RSS 47 +#define ICE_SID_CDID_REDIR_RSS 48 + +#define ICE_SID_RXPARSER_CAM 50 +#define ICE_SID_RXPARSER_NOMATCH_CAM 51 +#define ICE_SID_RXPARSER_IMEM 52 +#define ICE_SID_RXPARSER_XLT0_BUILDER 53 +#define ICE_SID_RXPARSER_NODE_PTYPE 54 +#define ICE_SID_RXPARSER_MARKER_PTYPE 55 #define ICE_SID_RXPARSER_BOOST_TCAM 56 - +#define ICE_SID_RXPARSER_PROTO_GRP 57 +#define ICE_SID_RXPARSER_METADATA_INIT 58 +#define ICE_SID_RXPARSER_XLT0 59 + +#define ICE_SID_TXPARSER_CAM 60 +#define ICE_SID_TXPARSER_NOMATCH_CAM 61 +#define ICE_SID_TXPARSER_IMEM 62 +#define ICE_SID_TXPARSER_XLT0_BUILDER 63 +#define ICE_SID_TXPARSER_NODE_PTYPE 64 +#define ICE_SID_TXPARSER_MARKER_PTYPE 65 +#define ICE_SID_TXPARSER_BOOST_TCAM 66 +#define ICE_SID_TXPARSER_PROTO_GRP 67 +#define ICE_SID_TXPARSER_METADATA_INIT 68 +#define ICE_SID_TXPARSER_XLT0 69 + +#define ICE_SID_RXPARSER_INIT_REDIR 70 +#define ICE_SID_TXPARSER_INIT_REDIR 71 +#define ICE_SID_RXPARSER_MARKER_GRP 72 +#define ICE_SID_TXPARSER_MARKER_GRP 73 +#define ICE_SID_RXPARSER_LAST_PROTO 74 +#define ICE_SID_TXPARSER_LAST_PROTO 75 +#define ICE_SID_RXPARSER_PG_SPILL 76 +#define ICE_SID_TXPARSER_PG_SPILL 77 +#define ICE_SID_RXPARSER_NOMATCH_SPILL 78 +#define ICE_SID_TXPARSER_NOMATCH_SPILL 79 + +#define ICE_SID_XLT0_PE 80 +#define ICE_SID_XLT_KEY_BUILDER_PE 81 #define ICE_SID_XLT1_PE 82 #define ICE_SID_XLT2_PE 83 #define ICE_SID_PROFID_TCAM_PE 84 #define ICE_SID_PROFID_REDIR_PE 85 #define ICE_SID_FLD_VEC_PE 86 +#define ICE_SID_CDID_KEY_BUILDER_PE 87 +#define ICE_SID_CDID_REDIR_PE 88 /* Label Metadata section IDs */ #define ICE_SID_LBL_FIRST 0x80000010 +#define ICE_SID_LBL_RXPARSER_IMEM 0x80000010 +#define ICE_SID_LBL_TXPARSER_IMEM 0x80000011 +#define ICE_SID_LBL_RESERVED_12 0x80000012 +#define ICE_SID_LBL_RESERVED_13 0x80000013 +#define ICE_SID_LBL_RXPARSER_MARKER 0x80000014 +#define ICE_SID_LBL_TXPARSER_MARKER 0x80000015 +#define ICE_SID_LBL_PTYPE 0x80000016 +#define ICE_SID_LBL_PROTOCOL_ID 0x80000017 #define ICE_SID_LBL_RXPARSER_TMEM 0x80000018 +#define ICE_SID_LBL_TXPARSER_TMEM 0x80000019 +#define ICE_SID_LBL_RXPARSER_PG 0x8000001A +#define ICE_SID_LBL_TXPARSER_PG 0x8000001B +#define ICE_SID_LBL_RXPARSER_M_TCAM 0x8000001C +#define ICE_SID_LBL_TXPARSER_M_TCAM 0x8000001D +#define ICE_SID_LBL_SW_PROFID_TCAM 0x8000001E +#define ICE_SID_LBL_ACL_PROFID_TCAM 0x8000001F +#define ICE_SID_LBL_PE_PROFID_TCAM 0x80000020 +#define ICE_SID_LBL_RSS_PROFID_TCAM 0x80000021 +#define ICE_SID_LBL_FD_PROFID_TCAM 0x80000022 +#define ICE_SID_LBL_FLAG 0x80000023 +#define ICE_SID_LBL_REG 0x80000024 +#define ICE_SID_LBL_SW_PTG 0x80000025 +#define ICE_SID_LBL_ACL_PTG 0x80000026 +#define ICE_SID_LBL_PE_PTG 0x80000027 +#define ICE_SID_LBL_RSS_PTG 0x80000028 +#define ICE_SID_LBL_FD_PTG 0x80000029 +#define ICE_SID_LBL_SW_VSIG 0x8000002A +#define ICE_SID_LBL_ACL_VSIG 0x8000002B +#define ICE_SID_LBL_PE_VSIG 0x8000002C +#define ICE_SID_LBL_RSS_VSIG 0x8000002D +#define ICE_SID_LBL_FD_VSIG 0x8000002E +#define ICE_SID_LBL_PTYPE_META 0x8000002F +#define ICE_SID_LBL_SW_PROFID 0x80000030 
+#define ICE_SID_LBL_ACL_PROFID 0x80000031 +#define ICE_SID_LBL_PE_PROFID 0x80000032 +#define ICE_SID_LBL_RSS_PROFID 0x80000033 +#define ICE_SID_LBL_FD_PROFID 0x80000034 +#define ICE_SID_LBL_RXPARSER_MARKER_GRP 0x80000035 +#define ICE_SID_LBL_TXPARSER_MARKER_GRP 0x80000036 +#define ICE_SID_LBL_RXPARSER_PROTO 0x80000037 +#define ICE_SID_LBL_TXPARSER_PROTO 0x80000038 /* The following define MUST be updated to reflect the last label section ID */ #define ICE_SID_LBL_LAST 0x80000038 @@ -152,6 +250,293 @@ enum ice_block { ICE_BLK_COUNT }; +enum ice_sect { + ICE_XLT0 = 0, + ICE_XLT_KB, + ICE_XLT1, + ICE_XLT2, + ICE_PROF_TCAM, + ICE_PROF_REDIR, + ICE_VEC_TBL, + ICE_CDID_KB, + ICE_CDID_REDIR, + ICE_SECT_COUNT +}; + +/* Packet Type (PTYPE) values */ +#define ICE_PTYPE_MAC_PAY 1 +#define ICE_MAC_PTP 2 +#define ICE_MAC_LLDP 6 +#define ICE_MAC_ARP 11 +#define ICE_PTYPE_IPV4FRAG_PAY 22 +#define ICE_PTYPE_IPV4_PAY 23 +#define ICE_PTYPE_IPV4_UDP_PAY 24 +#define ICE_PTYPE_IPV4_TCP_PAY 26 +#define ICE_PTYPE_IPV4_SCTP_PAY 27 +#define ICE_PTYPE_IPV4_ICMP_PAY 28 +#define ICE_MAC_IPV4_IPV4_FRAG 29 +#define ICE_MAC_IPV4_IPV4_PAY 30 +#define ICE_MAC_IPV4_IPV4_UDP_PAY 31 +#define ICE_MAC_IPV4_IPV4_TCP 33 +#define ICE_MAC_IPV4_IPV4_SCTP 34 +#define ICE_MAC_IPV4_IPV4_ICMP 35 +#define ICE_MAC_IPV4_IPV6_FRAG 36 +#define ICE_MAC_IPV4_IPV6_PAY 37 +#define ICE_MAC_IPV4_IPV6_UDP_PAY 38 +#define ICE_MAC_IPV4_IPV6_TCP 40 +#define ICE_MAC_IPV4_IPV6_SCTP 41 +#define ICE_MAC_IPV4_IPV6_ICMPV6 42 +#define ICE_MAC_IPV4_TUN_PAY 43 +#define ICE_MAC_IPV4_TUN_IPV4_FRAG 44 +#define ICE_MAC_IPV4_TUN_IPV4_PAY 45 +#define ICE_MAC_IPV4_TUN_IPV4_UDP_PAY 46 +#define ICE_MAC_IPV4_TUN_IPV4_TCP 48 +#define ICE_MAC_IPV4_TUN_IPV4_SCTP 49 +#define ICE_MAC_IPV4_TUN_IPV4_ICMP 50 +#define ICE_MAC_IPV4_TUN_IPV6_FRAG 51 +#define ICE_MAC_IPV4_TUN_IPV6_PAY 52 +#define ICE_MAC_IPV4_TUN_IPV6_UDP_PAY 53 +#define ICE_MAC_IPV4_TUN_IPV6_TCP 55 +#define ICE_MAC_IPV4_TUN_IPV6_SCTP 56 +#define ICE_MAC_IPV4_TUN_IPV6_ICMPV6 57 +#define ICE_MAC_IPV4_TUN_ICE_MAC_PAY 58 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_FRAG 59 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_PAY 60 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_UDP_PAY 61 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_TCP 63 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_SCTP 64 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV4_ICMP 65 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_FRAG 66 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_PAY 67 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_UDP_PAY 68 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_TCP 70 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_SCTP 71 +#define ICE_MAC_IPV4_TUN_ICE_MAC_IPV6_ICMPV6 72 +#define ICE_PTYPE_IPV6FRAG_PAY 88 +#define ICE_PTYPE_IPV6_PAY 89 +#define ICE_PTYPE_IPV6_UDP_PAY 90 +#define ICE_PTYPE_IPV6_TCP_PAY 92 +#define ICE_PTYPE_IPV6_SCTP_PAY 93 +#define ICE_PTYPE_IPV6_ICMP_PAY 94 +#define ICE_MAC_IPV6_IPV4_FRAG 95 +#define ICE_MAC_IPV6_IPV4_PAY 96 +#define ICE_MAC_IPV6_IPV4_UDP_PAY 97 +#define ICE_MAC_IPV6_IPV4_TCP 99 +#define ICE_MAC_IPV6_IPV4_SCTP 100 +#define ICE_MAC_IPV6_IPV4_ICMP 101 +#define ICE_MAC_IPV6_IPV6_FRAG 102 +#define ICE_MAC_IPV6_IPV6_PAY 103 +#define ICE_MAC_IPV6_IPV6_UDP_PAY 104 +#define ICE_MAC_IPV6_IPV6_TCP 106 +#define ICE_MAC_IPV6_IPV6_SCTP 107 +#define ICE_MAC_IPV6_IPV6_ICMPV6 108 +#define ICE_MAC_IPV6_TUN_PAY 109 +#define ICE_MAC_IPV6_TUN_IPV4_FRAG 110 +#define ICE_MAC_IPV6_TUN_IPV4_PAY 111 +#define ICE_MAC_IPV6_TUN_IPV4_UDP_PAY 112 +#define ICE_MAC_IPV6_TUN_IPV4_TCP 114 +#define ICE_MAC_IPV6_TUN_IPV4_SCTP 115 +#define ICE_MAC_IPV6_TUN_IPV4_ICMP 116 +#define ICE_MAC_IPV6_TUN_IPV6_FRAG 117 
+#define ICE_MAC_IPV6_TUN_IPV6_PAY 118 +#define ICE_MAC_IPV6_TUN_IPV6_UDP_PAY 119 +#define ICE_MAC_IPV6_TUN_IPV6_TCP 121 +#define ICE_MAC_IPV6_TUN_IPV6_SCTP 122 +#define ICE_MAC_IPV6_TUN_IPV6_ICMPV6 123 +#define ICE_MAC_IPV6_TUN_MAC_PAY 124 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_FRAG 125 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_PAY 126 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_UDP_PAY 127 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_TCP 129 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_SCTP 130 +#define ICE_MAC_IPV6_TUN_MAC_IPV4_ICMP 131 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_FRAG 132 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_PAY 133 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_UDP_PAY 134 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_TCP 136 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_SCTP 137 +#define ICE_MAC_IPV6_TUN_MAC_IPV6_ICMPV6 138 +#define ICE_MAC_IPV4_ESP 160 +#define ICE_MAC_IPV6_ESP 161 +#define ICE_MAC_IPV4_AH 162 +#define ICE_MAC_IPV6_AH 163 +#define ICE_MAC_IPV4_NAT_T_ESP 164 +#define ICE_MAC_IPV6_NAT_T_ESP 165 +#define ICE_MAC_IPV4_NAT_T_IKE 166 +#define ICE_MAC_IPV6_NAT_T_IKE 167 +#define ICE_MAC_IPV4_NAT_T_KEEP 168 +#define ICE_MAC_IPV6_NAT_T_KEEP 169 +#define ICE_MAC_CONTROL 278 +#define ICE_MAC_PPPOD_PAY 300 +#define ICE_MAC_PPPOE_PAY 301 +#define ICE_MAC_PPPOE_IPV4_FRAG 302 +#define ICE_MAC_PPPOE_IPV4_PAY 303 +#define ICE_MAC_PPPOE_IPV4_UDP_PAY 304 +#define ICE_MAC_PPPOE_IPV4_TCP 305 +#define ICE_MAC_PPPOE_IPV4_SCTP 306 +#define ICE_MAC_PPPOE_IPV4_ICMP 307 +#define ICE_MAC_PPPOE_IPV6_FRAG 308 +#define ICE_MAC_PPPOE_IPV6_PAY 309 +#define ICE_MAC_PPPOE_IPV6_UDP_PAY 310 +#define ICE_MAC_PPPOE_IPV6_TCP 311 +#define ICE_MAC_PPPOE_IPV6_SCTP 312 +#define ICE_MAC_PPPOE_IPV6_ICMPV6 313 +#define ICE_MAC_IPV4_GTPC_TEID 325 +#define ICE_MAC_IPV6_GTPC_TEID 326 +#define ICE_MAC_IPV4_GTPC 327 +#define ICE_MAC_IPV6_GTPC 328 +#define ICE_MAC_IPV4_GTPU 329 +#define ICE_MAC_IPV6_GTPU 330 +#define ICE_MAC_IPV4_GTPU_IPV4_FRAG 331 +#define ICE_MAC_IPV4_GTPU_IPV4_PAY 332 +#define ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY 333 +#define ICE_MAC_IPV4_GTPU_IPV4_TCP 334 +#define ICE_MAC_IPV4_GTPU_IPV4_ICMP 335 +#define ICE_MAC_IPV6_GTPU_IPV4_FRAG 336 +#define ICE_MAC_IPV6_GTPU_IPV4_PAY 337 +#define ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY 338 +#define ICE_MAC_IPV6_GTPU_IPV4_TCP 339 +#define ICE_MAC_IPV6_GTPU_IPV4_ICMP 340 +#define ICE_MAC_IPV4_GTPU_IPV6_FRAG 341 +#define ICE_MAC_IPV4_GTPU_IPV6_PAY 342 +#define ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY 343 +#define ICE_MAC_IPV4_GTPU_IPV6_TCP 344 +#define ICE_MAC_IPV4_GTPU_IPV6_ICMPV6 345 +#define ICE_MAC_IPV6_GTPU_IPV6_FRAG 346 +#define ICE_MAC_IPV6_GTPU_IPV6_PAY 347 +#define ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY 348 +#define ICE_MAC_IPV6_GTPU_IPV6_TCP 349 +#define ICE_MAC_IPV6_GTPU_IPV6_ICMPV6 350 +#define ICE_MAC_IPV4_PFCP_NODE 351 +#define ICE_MAC_IPV4_PFCP_SESSION 352 +#define ICE_MAC_IPV6_PFCP_NODE 353 +#define ICE_MAC_IPV6_PFCP_SESSION 354 +#define ICE_MAC_IPV4_L2TPV3 360 +#define ICE_MAC_IPV6_L2TPV3 361 +#define ICE_MAC_IPV4_L2TPV2_CONTROL 392 +#define ICE_MAC_IPV6_L2TPV2_CONTROL 393 +#define ICE_MAC_IPV4_L2TPV2 394 +#define ICE_MAC_IPV6_L2TPV2 395 +#define ICE_MAC_IPV4_PPPOL2TPV2 396 +#define ICE_MAC_IPV6_PPPOL2TPV2 397 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_FRAG 398 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_PAY 399 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_UDP_PAY 400 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_TCP 401 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_SCTP 402 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV4_ICMP 403 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_FRAG 404 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_PAY 405 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_UDP_PAY 406 +#define 
ICE_MAC_IPV4_PPPOL2TPV2_IPV6_TCP 407 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_SCTP 408 +#define ICE_MAC_IPV4_PPPOL2TPV2_IPV6_ICMPV6 409 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_FRAG 410 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_PAY 411 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_UDP_PAY 412 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_TCP 413 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_SCTP 414 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV4_ICMP 415 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_FRAG 416 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_PAY 417 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_UDP_PAY 418 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_TCP 419 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_SCTP 420 +#define ICE_MAC_IPV6_PPPOL2TPV2_IPV6_ICMPV6 421 + +/* Attributes that can modify PTYPE definitions. + * + * These values will represent special attributes for PTYPES, which will + * resolve into metadata packet flags definitions that can be used in the TCAM + * for identifying a PTYPE with specific characteristics. + */ +enum ice_ptype_attrib_type { + /* GTP PTYPES */ + ICE_PTYPE_ATTR_GTP_PDU_EH, + ICE_PTYPE_ATTR_GTP_SESSION, + ICE_PTYPE_ATTR_GTP_DOWNLINK, + ICE_PTYPE_ATTR_GTP_UPLINK, +}; + +struct ice_ptype_attrib_info { + u16 flags; + u16 mask; +}; + +/* TCAM flag definitions */ +#define ICE_GTP_PDU BIT(14) +#define ICE_GTP_PDU_LINK BIT(13) + +/* GTP attributes */ +#define ICE_GTP_PDU_FLAG_MASK (ICE_GTP_PDU) +#define ICE_GTP_PDU_EH ICE_GTP_PDU + +#define ICE_GTP_FLAGS_MASK (ICE_GTP_PDU | ICE_GTP_PDU_LINK) +#define ICE_GTP_SESSION 0 +#define ICE_GTP_DOWNLINK ICE_GTP_PDU +#define ICE_GTP_UPLINK (ICE_GTP_PDU | ICE_GTP_PDU_LINK) + +struct ice_ptype_attributes { + u16 ptype; + enum ice_ptype_attrib_type attrib; +}; + +struct ice_meta_sect { + struct ice_pkg_ver ver; +#define ICE_META_SECT_NAME_SIZE 28 + char name[ICE_META_SECT_NAME_SIZE]; + __le32 track_id; +}; + +/* Packet Type Groups (PTG) - Inner Most fields (IM) */ +#define ICE_PTG_IM_IPV4_TCP 16 +#define ICE_PTG_IM_IPV4_UDP 17 +#define ICE_PTG_IM_IPV4_SCTP 18 +#define ICE_PTG_IM_IPV4_PAY 20 +#define ICE_PTG_IM_IPV4_OTHER 21 +#define ICE_PTG_IM_IPV6_TCP 32 +#define ICE_PTG_IM_IPV6_UDP 33 +#define ICE_PTG_IM_IPV6_SCTP 34 +#define ICE_PTG_IM_IPV6_OTHER 37 +#define ICE_PTG_IM_L2_OTHER 67 + +struct ice_flex_fields { + union { + struct { + u8 src_ip; + u8 dst_ip; + u8 flow_label; /* valid for IPv6 only */ + } ip_fields; + + struct { + u8 src_prt; + u8 dst_prt; + } tcp_udp_fields; + + struct { + u8 src_ip; + u8 dst_ip; + u8 src_prt; + u8 dst_prt; + } ip_tcp_udp_fields; + + struct { + u8 src_prt; + u8 dst_prt; + u8 flow_label; /* valid for IPv6 only */ + u8 spi; + } ip_esp_fields; + + struct { + u32 offset; + u32 length; + } off_len; + } fields; +}; + +#define ICE_XLT1_DFLT_GRP 0 +#define ICE_XLT1_TABLE_SIZE 1024 + /* package labels */ struct ice_label { __le16 value; @@ -161,17 +546,22 @@ struct ice_label { struct ice_label_section { __le16 count; - struct ice_label label[1]; + struct ice_label label[]; }; -#define ICE_MAX_LABELS_IN_BUF ICE_MAX_ENTRIES_IN_BUF( \ - sizeof(struct ice_label_section) - sizeof(struct ice_label), \ - sizeof(struct ice_label)) +#define ICE_MAX_LABELS_IN_BUF ICE_MAX_ENTRIES_IN_BUF(struct_size((struct ice_label_section *)0, label, 1) - \ + sizeof(struct ice_label), sizeof(struct ice_label)) struct ice_sw_fv_section { __le16 count; __le16 base_offset; - struct ice_fv fv[1]; + struct ice_fv fv[]; +}; + +struct ice_sw_fv_list_entry { + struct list_head list_entry; + u32 profile_id; + struct ice_fv *fv_ptr; }; /* The BOOST TCAM stores the match packet header in reverse order, meaning 
@@ -208,30 +598,54 @@ struct ice_boost_tcam_entry { struct ice_boost_tcam_section { __le16 count; __le16 reserved; - struct ice_boost_tcam_entry tcam[1]; + struct ice_boost_tcam_entry tcam[]; }; -#define ICE_MAX_BST_TCAMS_IN_BUF ICE_MAX_ENTRIES_IN_BUF( \ - sizeof(struct ice_boost_tcam_section) - \ +#define ICE_MAX_BST_TCAMS_IN_BUF ICE_MAX_ENTRIES_IN_BUF(struct_size((struct ice_boost_tcam_section *)0, tcam, 1) - \ sizeof(struct ice_boost_tcam_entry), \ sizeof(struct ice_boost_tcam_entry)) +/* package Marker PType TCAM entry */ +struct ice_marker_ptype_tcam_entry { +#define ICE_MARKER_PTYPE_TCAM_ADDR_MAX 1024 + __le16 addr; + __le16 ptype; + u8 keys[20]; +}; + +struct ice_marker_ptype_tcam_section { + __le16 count; + __le16 reserved; + struct ice_marker_ptype_tcam_entry tcam[]; +}; + +#define ICE_MAX_MARKER_PTYPE_TCAMS_IN_BUF ICE_MAX_ENTRIES_IN_BUF(struct_size((struct ice_marker_ptype_tcam_section *)0, tcam, 1) - \ + sizeof(struct ice_marker_ptype_tcam_entry), \ + sizeof(struct ice_marker_ptype_tcam_entry)) + struct ice_xlt1_section { __le16 count; __le16 offset; - u8 value[1]; -} __packed; + u8 value[]; +}; struct ice_xlt2_section { __le16 count; __le16 offset; - __le16 value[1]; + __le16 value[]; }; struct ice_prof_redir_section { __le16 count; __le16 offset; - u8 redir_value[1]; + u8 redir_value[]; +}; + +/* package buffer building */ + +struct ice_buf_build { + struct ice_buf buf; + u16 reserved_section_table_entries; }; struct ice_pkg_enum { @@ -248,11 +662,65 @@ struct ice_pkg_enum { void *(*handler)(u32 sect_type, void *section, u32 index, u32 *offset); }; +/* Tunnel enabling */ + +enum ice_tunnel_type { + TNL_VXLAN = 0, + TNL_GENEVE, + TNL_ECPRI, + TNL_GTP, + TNL_LAST = 0xFF, + TNL_ALL = 0xFF, +}; + +struct ice_tunnel_type_scan { + enum ice_tunnel_type type; + const char *label_prefix; +}; + +struct ice_tunnel_entry { + enum ice_tunnel_type type; + u16 boost_addr; + u16 port; + u16 ref; + struct ice_boost_tcam_entry *boost_entry; + u8 valid; + u8 in_use; + u8 marked; +}; + +#define ICE_TUNNEL_MAX_ENTRIES 16 + +struct ice_tunnel_table { + struct ice_tunnel_entry tbl[ICE_TUNNEL_MAX_ENTRIES]; + u16 count; +}; + +struct ice_dvm_entry { + u16 boost_addr; + u16 enable; + struct ice_boost_tcam_entry *boost_entry; +}; + +#define ICE_DVM_MAX_ENTRIES 48 + +struct ice_dvm_table { + struct ice_dvm_entry tbl[ICE_DVM_MAX_ENTRIES]; + u16 count; +}; + +struct ice_pkg_es { + __le16 count; + __le16 offset; + struct ice_fv_word es[]; +}; + struct ice_es { u32 sid; u16 count; u16 fvw; u16 *ref_count; + u32 *mask_ena; struct list_head prof_map; struct ice_fv_word *t; struct mutex prof_map_lock; /* protect access to profiles list */ @@ -280,6 +748,37 @@ struct ice_ptg_ptype { u8 ptg; }; +#define ICE_MAX_TCAM_PER_PROFILE 32 +#define ICE_MAX_PTG_PER_PROFILE 32 + +struct ice_prof_map { + struct list_head list; + u64 profile_cookie; + u64 context; + u8 prof_id; + u8 ptg_cnt; + u8 ptg[ICE_MAX_PTG_PER_PROFILE]; + struct ice_ptype_attrib_info attr[ICE_MAX_PTG_PER_PROFILE]; +}; + +#define ICE_INVALID_TCAM 0xFFFF + +struct ice_tcam_inf { + u16 tcam_idx; + struct ice_ptype_attrib_info attr; + u8 ptg; + u8 prof_id; + u8 in_use; +}; + +struct ice_vsig_prof { + struct list_head list; + u64 profile_cookie; + u8 prof_id; + u8 tcam_count; + struct ice_tcam_inf tcam[ICE_MAX_TCAM_PER_PROFILE]; +}; + struct ice_vsig_entry { struct list_head prop_lst; struct ice_vsig_vsi *first_vsi; @@ -316,8 +815,8 @@ struct ice_xlt1 { #define ICE_PF_NUM_S 13 #define ICE_PF_NUM_M (0x07 << ICE_PF_NUM_S) #define ICE_VSIG_VALUE(vsig, pf_id) \ 
- (u16)((((u16)(vsig)) & ICE_VSIG_IDX_M) | \ - (((u16)(pf_id) << ICE_PF_NUM_S) & ICE_PF_NUM_M)) + ((u16)((((u16)(vsig)) & ICE_VSIG_IDX_M) | \ + (((u16)(pf_id) << ICE_PF_NUM_S) & ICE_PF_NUM_M))) #define ICE_DEFAULT_VSIG 0 /* XLT2 Table */ @@ -329,6 +828,32 @@ struct ice_xlt2 { u16 count; }; +/* Extraction sequence - list of match fields: + * protocol ID, offset, profile length + */ +union ice_match_fld { + struct { + u8 prot_id; + u8 offset; + u8 length; + u8 reserved; /* must be zero */ + } fld; + u32 val; +}; + +#define ICE_MATCH_LIST_SZ 20 +struct ice_match { + u8 count; + union ice_match_fld list[ICE_MATCH_LIST_SZ]; +} __packed; + +/* Profile ID Management */ +struct ice_prof_id_key { + __le16 flags; + u8 xlt1; + __le16 xlt2_cdid; +} __packed; + /* Keys are made up of two values, each one-half the size of the key. * For TCAM, the entire key is 80 bits wide (or 2, 40-bit wide values) */ @@ -343,8 +868,8 @@ struct ice_prof_tcam_entry { struct ice_prof_id_section { __le16 count; - struct ice_prof_tcam_entry entry[1]; -} __packed; + struct ice_prof_tcam_entry entry[]; +}; struct ice_prof_tcam { u32 sid; @@ -360,6 +885,21 @@ struct ice_prof_redir { u16 count; }; +struct ice_mask { + u16 mask; /* 16-bit mask */ + u16 idx; /* index */ + u16 ref; /* reference count */ + u8 in_use; /* non-zero if used */ +}; + +struct ice_masks { + struct mutex lock; /* lock to protect this structure */ + u16 first; /* first mask owned by the PF */ + u16 count; /* number of masks owned by the PF */ +#define ICE_PROF_MASK_COUNT 32 + struct ice_mask masks[ICE_PROF_MASK_COUNT]; +}; + /* Tables per block */ struct ice_blk_info { struct ice_xlt1 xlt1; @@ -367,8 +907,72 @@ struct ice_blk_info { struct ice_prof_tcam prof; struct ice_prof_redir prof_redir; struct ice_es es; + struct ice_masks masks; u8 overwrite; /* set to true to allow overwrite of table entries */ u8 is_list_init; }; +enum ice_chg_type { + ICE_TCAM_NONE = 0, + ICE_PTG_ES_ADD, + ICE_TCAM_ADD, + ICE_VSIG_ADD, + ICE_VSIG_REM, + ICE_VSI_MOVE, +}; + +struct ice_chs_chg { + struct list_head list_entry; + enum ice_chg_type type; + + u8 add_ptg; + u8 add_vsig; + u8 add_tcam_idx; + u8 add_prof; + u16 ptype; + u8 ptg; + u8 prof_id; + u16 vsi; + u16 vsig; + u16 orig_vsig; + u16 tcam_idx; + struct ice_ptype_attrib_info attr; +}; + +#define ICE_FLOW_PTYPE_MAX ICE_XLT1_CNT + +enum ice_prof_type { + ICE_PROF_NON_TUN = 0x1, + ICE_PROF_TUN_UDP = 0x2, + ICE_PROF_TUN_GRE = 0x4, + ICE_PROF_TUN_PPPOE = 0x8, + ICE_PROF_TUN_ALL = 0xE, + ICE_PROF_ALL = 0xFF, +}; + +/* Number of bits/bytes contained in meta init entry. Note, this should be a + * multiple of 32 bits. 
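+ * For example, ICE_META_INIT_BITS = 192 gives ICE_META_INIT_DW_CNT = + * 192 / 32 = 6 __le32 words per meta init entry.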
+ */ +#define ICE_META_INIT_BITS 192 +#define ICE_META_INIT_DW_CNT (ICE_META_INIT_BITS / (sizeof(__le32) * \ + BITS_PER_BYTE)) + +/* The meta init Flag field starts at this bit */ +#define ICE_META_FLAGS_ST 123 + +/* The entry and bit to check for Double VLAN Mode (DVM) support */ +#define ICE_META_VLAN_MODE_ENTRY 0 +#define ICE_META_FLAG_VLAN_MODE 60 +#define ICE_META_VLAN_MODE_BIT (ICE_META_FLAGS_ST + \ + ICE_META_FLAG_VLAN_MODE) + +struct ice_meta_init_entry { + __le32 bm[ICE_META_INIT_DW_CNT]; +}; + +struct ice_meta_init_section { + __le16 count; + __le16 offset; + struct ice_meta_init_entry entry[1]; +}; #endif /* _ICE_FLEX_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_flow.c b/drivers/net/ethernet/intel/ice/ice_flow.c new file mode 100644 index 0000000000000000000000000000000000000000..74802e3d0754fc7b3d221c6240524de48c2b2d5d --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_flow.c @@ -0,0 +1,4222 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_common.h" +#include "ice_flow.h" + + +/* Size of known protocol header fields */ +#define ICE_FLOW_FLD_SZ_ETH_TYPE 2 +#define ICE_FLOW_FLD_SZ_VLAN 2 +#define ICE_FLOW_FLD_SZ_IPV4_ADDR 4 +#define ICE_FLOW_FLD_SZ_IPV6_ADDR 16 +#define ICE_FLOW_FLD_SZ_IPV6_PRE32_ADDR 4 +#define ICE_FLOW_FLD_SZ_IPV6_PRE48_ADDR 6 +#define ICE_FLOW_FLD_SZ_IPV6_PRE64_ADDR 8 +#define ICE_FLOW_FLD_SZ_IPV4_ID 2 +#define ICE_FLOW_FLD_SZ_IPV6_ID 4 +#define ICE_FLOW_FLD_SZ_IP_DSCP 1 +#define ICE_FLOW_FLD_SZ_IP_TTL 1 +#define ICE_FLOW_FLD_SZ_IP_PROT 1 +#define ICE_FLOW_FLD_SZ_PORT 2 +#define ICE_FLOW_FLD_SZ_TCP_FLAGS 1 +#define ICE_FLOW_FLD_SZ_ICMP_TYPE 1 +#define ICE_FLOW_FLD_SZ_ICMP_CODE 1 +#define ICE_FLOW_FLD_SZ_ARP_OPER 2 +#define ICE_FLOW_FLD_SZ_GRE_KEYID 4 +#define ICE_FLOW_FLD_SZ_GTP_TEID 4 +#define ICE_FLOW_FLD_SZ_GTP_QFI 2 +#define ICE_FLOW_FLD_SZ_PPPOE_SESS_ID 2 +#define ICE_FLOW_FLD_SZ_PFCP_SEID 8 +#define ICE_FLOW_FLD_SZ_L2TPV3_SESS_ID 4 +#define ICE_FLOW_FLD_SZ_ESP_SPI 4 +#define ICE_FLOW_FLD_SZ_AH_SPI 4 +#define ICE_FLOW_FLD_SZ_NAT_T_ESP_SPI 4 +#define ICE_FLOW_FLD_SZ_VXLAN_VNI 4 +#define ICE_FLOW_FLD_SZ_ECPRI_TP0_PC_ID 2 + +/* Describe properties of a protocol header field */ +struct ice_flow_field_info { + enum ice_flow_seg_hdr hdr; + s16 off; /* Offset from start of a protocol header, in bits */ + u16 size; /* Size of fields in bits */ + u16 mask; /* 16-bit mask for field */ +}; + +#define ICE_FLOW_FLD_INFO(_hdr, _offset_bytes, _size_bytes) { \ + .hdr = _hdr, \ + .off = (_offset_bytes) * BITS_PER_BYTE, \ + .size = (_size_bytes) * BITS_PER_BYTE, \ + .mask = 0, \ +} + +#define ICE_FLOW_FLD_INFO_MSK(_hdr, _offset_bytes, _size_bytes, _mask) { \ + .hdr = _hdr, \ + .off = (_offset_bytes) * BITS_PER_BYTE, \ + .size = (_size_bytes) * BITS_PER_BYTE, \ + .mask = _mask, \ +} + +/* Table containing properties of supported protocol header fields */ +static const +struct ice_flow_field_info ice_flds_info[ICE_FLOW_FIELD_IDX_MAX] = { + /* Ether */ + /* ICE_FLOW_FIELD_IDX_ETH_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ETH, 0, ETH_ALEN), + /* ICE_FLOW_FIELD_IDX_ETH_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ETH, ETH_ALEN, ETH_ALEN), + /* ICE_FLOW_FIELD_IDX_S_VLAN */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_VLAN, 12, ICE_FLOW_FLD_SZ_VLAN), + /* ICE_FLOW_FIELD_IDX_C_VLAN */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_VLAN, 14, ICE_FLOW_FLD_SZ_VLAN), + /* ICE_FLOW_FIELD_IDX_ETH_TYPE */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ETH, 0, ICE_FLOW_FLD_SZ_ETH_TYPE), + /* IPv4 / IPv6 */ + /* ICE_FLOW_FIELD_IDX_IPV4_DSCP 
*/ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_IPV4, 0, ICE_FLOW_FLD_SZ_IP_DSCP, + 0x00fc), + /* ICE_FLOW_FIELD_IDX_IPV6_DSCP */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_IPV6, 0, ICE_FLOW_FLD_SZ_IP_DSCP, + 0x0ff0), + /* ICE_FLOW_FIELD_IDX_IPV4_TTL */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_NONE, 8, + ICE_FLOW_FLD_SZ_IP_TTL, 0xff00), + /* ICE_FLOW_FIELD_IDX_IPV4_PROT */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_NONE, 8, + ICE_FLOW_FLD_SZ_IP_PROT, 0x00ff), + /* ICE_FLOW_FIELD_IDX_IPV6_TTL */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_NONE, 6, + ICE_FLOW_FLD_SZ_IP_TTL, 0x00ff), + /* ICE_FLOW_FIELD_IDX_IPV6_PROT */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_NONE, 6, + ICE_FLOW_FLD_SZ_IP_PROT, 0xff00), + /* ICE_FLOW_FIELD_IDX_IPV4_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV4, 12, ICE_FLOW_FLD_SZ_IPV4_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV4_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV4, 16, ICE_FLOW_FLD_SZ_IPV4_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, ICE_FLOW_FLD_SZ_IPV6_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, ICE_FLOW_FLD_SZ_IPV6_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV4_FRAG */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV_FRAG, 4, + ICE_FLOW_FLD_SZ_IPV4_ID), + /* ICE_FLOW_FIELD_IDX_IPV6_FRAG */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV_FRAG, 4, + ICE_FLOW_FLD_SZ_IPV6_ID), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE32_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, + ICE_FLOW_FLD_SZ_IPV6_PRE32_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE32_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, + ICE_FLOW_FLD_SZ_IPV6_PRE32_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE48_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, + ICE_FLOW_FLD_SZ_IPV6_PRE48_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE48_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, + ICE_FLOW_FLD_SZ_IPV6_PRE48_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, + ICE_FLOW_FLD_SZ_IPV6_PRE64_ADDR), + /* ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, + ICE_FLOW_FLD_SZ_IPV6_PRE64_ADDR), + /* Transport */ + /* ICE_FLOW_FIELD_IDX_TCP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_TCP, 0, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_TCP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_TCP, 2, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_UDP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_UDP, 0, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_UDP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_UDP, 2, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_SCTP, 0, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_SCTP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_SCTP, 2, ICE_FLOW_FLD_SZ_PORT), + /* ICE_FLOW_FIELD_IDX_TCP_FLAGS */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_TCP, 13, ICE_FLOW_FLD_SZ_TCP_FLAGS), + /* ARP */ + /* ICE_FLOW_FIELD_IDX_ARP_SIP */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 14, ICE_FLOW_FLD_SZ_IPV4_ADDR), + /* ICE_FLOW_FIELD_IDX_ARP_DIP */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 24, ICE_FLOW_FLD_SZ_IPV4_ADDR), + /* ICE_FLOW_FIELD_IDX_ARP_SHA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 8, ETH_ALEN), + /* ICE_FLOW_FIELD_IDX_ARP_DHA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 18, ETH_ALEN), + /* ICE_FLOW_FIELD_IDX_ARP_OP */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ARP, 6, ICE_FLOW_FLD_SZ_ARP_OPER), + /* ICMP */ + /* ICE_FLOW_FIELD_IDX_ICMP_TYPE */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ICMP, 0, ICE_FLOW_FLD_SZ_ICMP_TYPE), + /* 
ICE_FLOW_FIELD_IDX_ICMP_CODE */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ICMP, 1, ICE_FLOW_FLD_SZ_ICMP_CODE), + /* GRE */ + /* ICE_FLOW_FIELD_IDX_GRE_KEYID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GRE, 12, ICE_FLOW_FLD_SZ_GRE_KEYID), + /* GTP */ + /* ICE_FLOW_FIELD_IDX_GTPC_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPC_TEID, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* ICE_FLOW_FIELD_IDX_GTPU_IP_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPU_IP, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* ICE_FLOW_FIELD_IDX_GTPU_EH_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPU_EH, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* ICE_FLOW_FIELD_IDX_GTPU_EH_QFI */ + ICE_FLOW_FLD_INFO_MSK(ICE_FLOW_SEG_HDR_GTPU_EH, 22, + ICE_FLOW_FLD_SZ_GTP_QFI, 0x3f00), + /* ICE_FLOW_FIELD_IDX_GTPU_UP_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPU_UP, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* ICE_FLOW_FIELD_IDX_GTPU_DWN_TEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_GTPU_DWN, 12, + ICE_FLOW_FLD_SZ_GTP_TEID), + /* PPPOE */ + /* ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_PPPOE, 2, + ICE_FLOW_FLD_SZ_PPPOE_SESS_ID), + /* PFCP */ + /* ICE_FLOW_FIELD_IDX_PFCP_SEID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_PFCP_SESSION, 12, + ICE_FLOW_FLD_SZ_PFCP_SEID), + /* L2TPV3 */ + /* ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_L2TPV3, 0, + ICE_FLOW_FLD_SZ_L2TPV3_SESS_ID), + /* ESP */ + /* ICE_FLOW_FIELD_IDX_ESP_SPI */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ESP, 0, + ICE_FLOW_FLD_SZ_ESP_SPI), + /* AH */ + /* ICE_FLOW_FIELD_IDX_AH_SPI */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_AH, 4, + ICE_FLOW_FLD_SZ_AH_SPI), + /* NAT_T_ESP */ + /* ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_NAT_T_ESP, 8, + ICE_FLOW_FLD_SZ_NAT_T_ESP_SPI), + /* ICE_FLOW_FIELD_IDX_VXLAN_VNI */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_VXLAN, 12, + ICE_FLOW_FLD_SZ_VXLAN_VNI), + /* ECPRI_TP0 */ + /* ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_ECPRI_TP0, 4, + ICE_FLOW_FLD_SZ_ECPRI_TP0_PC_ID), + /* UDP_ECPRI_TP0 */ + /* ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0, 12, + ICE_FLOW_FLD_SZ_ECPRI_TP0_PC_ID), +}; + +/* Bitmaps indicating relevant packet types for a particular protocol header + * + * Packet types for packets with an Outer/First/Single MAC header + */ +static const u32 ice_ptypes_mac_ofos[] = { + 0xFDC00846, 0xBFBF7F7E, 0xF70001DF, 0xFEFDFDFB, + 0x0000077E, 0x000003FF, 0x00000000, 0x00000000, + 0x00400000, 0x03FFF000, 0xFFFFFFE0, 0x00100707, + 0xFFFFFF00, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last MAC VLAN header */ +static const u32 ice_ptypes_macvlan_il[] = { + 0x00000000, 0xBC000000, 0x000001DF, 0xF0000000, + 0x0000077E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv4 header, does NOT + * include IPV4 other PTYPEs + */ +static const u32 ice_ptypes_ipv4_ofos[] = { + 0x1DC00000, 0x24000800, 0x00000000, 0x00000000, + 0x00000000, 0x00000155, 
0x00000000, 0x00000000, + 0x00000000, 0x000FC000, 0x000002A0, 0x00100000, + 0x00001500, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv4 header, includes + * IPV4 other PTYPEs + */ +static const u32 ice_ptypes_ipv4_ofos_all[] = { + 0x1DC00000, 0x24000800, 0x00000000, 0x00000000, + 0x00000000, 0x00000155, 0x00000000, 0x00000000, + 0x00000000, 0x000FC000, 0x83E0FAA0, 0x00000101, + 0x03FFD500, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv4 header */ +static const u32 ice_ptypes_ipv4_il[] = { + 0xE0000000, 0xB807700E, 0x80000003, 0xE01DC03B, + 0x0000000E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x001FF800, 0x00100000, + 0xFC0FC000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv6 header, does NOT + * include IPV6 other PTYPEs + */ +static const u32 ice_ptypes_ipv6_ofos[] = { + 0x00000000, 0x00000000, 0x77000000, 0x10002000, + 0x00000000, 0x000002AA, 0x00000000, 0x00000000, + 0x00000000, 0x03F00000, 0x00000540, 0x00000000, + 0x00002A00, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv6 header, includes + * IPV6 other PTYPEs + */ +static const u32 ice_ptypes_ipv6_ofos_all[] = { + 0x00000000, 0x00000000, 0x77000000, 0x10002000, + 0x00000000, 0x000002AA, 0x00000000, 0x00000000, + 0x00000000, 0x03F00000, 0x7C1F0540, 0x00000206, + 0xFC002A00, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv6 header */ +static const u32 ice_ptypes_ipv6_il[] = { + 0x00000000, 0x03B80770, 0x000001DC, 0x0EE00000, + 0x00000770, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x7FE00000, 0x00000000, + 0x03F00000, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv4 header - no L4 */ +static const u32 ice_ptypes_ipv4_ofos_no_l4[] = { + 0x10C00000, 0x04000800, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000,
0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv4 header - no L4 */ +static const u32 ice_ptypes_ipv4_il_no_l4[] = { + 0x60000000, 0x18043008, 0x80000002, 0x6010c021, + 0x00000008, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv6 header - no L4 */ +static const u32 ice_ptypes_ipv6_ofos_no_l4[] = { + 0x00000000, 0x00000000, 0x43000000, 0x10002000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv6 header - no L4 */ +static const u32 ice_ptypes_ipv6_il_no_l4[] = { + 0x00000000, 0x02180430, 0x0000010c, 0x086010c0, + 0x00000430, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outermost/First ARP header */ +static const u32 ice_ptypes_arp_of[] = { + 0x00000800, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* UDP Packet types for non-tunneled packets or tunneled + * packets with inner UDP. 
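+ * Like the other ice_ptypes_* tables, this is one bit per packet type, + * stored as 32-bit words and consumed via bitmap_and() against + * ICE_FLOW_PTYPE_MAX bits.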
+ */ +static const u32 ice_ptypes_udp_il[] = { + 0x81000000, 0x20204040, 0x04000010, 0x80810102, + 0x00000040, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00410000, 0x908427E0, 0x00100007, + 0x10410000, 0x00000004, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last TCP header */ +static const u32 ice_ptypes_tcp_il[] = { + 0x04000000, 0x80810102, 0x10000040, 0x02040408, + 0x00000102, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00820000, 0x21084000, 0x00000000, + 0x20820000, 0x00000008, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last SCTP header */ +static const u32 ice_ptypes_sctp_il[] = { + 0x08000000, 0x01020204, 0x20000081, 0x04080810, + 0x00000204, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x01040000, 0x00000000, 0x00000000, + 0x41040000, 0x00000010, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outermost/First ICMP header */ +static const u32 ice_ptypes_icmp_of[] = { + 0x10000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last ICMP header */ +static const u32 ice_ptypes_icmp_il[] = { + 0x00000000, 0x02040408, 0x40000102, 0x08101020, + 0x00000408, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x42108000, 0x00000000, + 0x82080000, 0x00000020, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outermost/First GRE header */ +static const u32 ice_ptypes_gre_of[] = { + 0x00000000, 0xBFBF7800, 0x000001DF, 0xFEFDE000, + 0x0000017E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last MAC header */ +static const u32 ice_ptypes_mac_il[] = { + 0x00000000, 0x20000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* 
Packet types for GTPC */ +static const u32 ice_ptypes_gtpc[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x000001E0, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for VXLAN with VNI */ +static const u32 ice_ptypes_vxlan_vni[] = { + 0x00000000, 0xBFBFF800, 0x00EFDFDF, 0xFEFDE000, + 0x03BF7F7E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for GTPC with TEID */ +static const u32 ice_ptypes_gtpc_tid[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000060, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for GTPU */ +static const struct ice_ptype_attributes ice_attr_gtpu_session[] = { + { ICE_MAC_IPV4_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV4_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_SESSION }, + { ICE_MAC_IPV6_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_SESSION }, +}; + +static const struct ice_ptype_attributes ice_attr_gtpu_eh[] = { + { ICE_MAC_IPV4_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_FRAG, 
ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV4_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_PDU_EH }, + { ICE_MAC_IPV6_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_PDU_EH }, +}; + +static const struct ice_ptype_attributes ice_attr_gtpu_down[] = { + { ICE_MAC_IPV4_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_DOWNLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_DOWNLINK }, +}; + +static const struct ice_ptype_attributes ice_attr_gtpu_up[] = { + { ICE_MAC_IPV4_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_FRAG, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_UDP_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_TCP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV4_ICMP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV4_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_FRAG, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_UDP_PAY, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_TCP, ICE_PTYPE_ATTR_GTP_UPLINK }, + { ICE_MAC_IPV6_GTPU_IPV6_ICMPV6, ICE_PTYPE_ATTR_GTP_UPLINK }, +}; + +static const u32 ice_ptypes_gtpu[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x7FFFFE00, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 
0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for pppoe */ +static const u32 ice_ptypes_pppoe[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x03ffe000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with PFCP NODE header */ +static const u32 ice_ptypes_pfcp_node[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x80000000, 0x00000002, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with PFCP SESSION header */ +static const u32 ice_ptypes_pfcp_session[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000005, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for l2tpv3 */ +static const u32 ice_ptypes_l2tpv3[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000300, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for esp */ +static const u32 ice_ptypes_esp[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000003, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for ah */ +static const u32 ice_ptypes_ah[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x0000000C, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with NAT_T ESP header */ +static const u32 ice_ptypes_nat_t_esp[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000030, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_mac_non_ip_ofos[] = { + 0x00000846, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00400000, 0x03FFF000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_gtpu_no_ip[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000600, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_ecpri_tp0[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000400, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_udp_ecpri_tp0[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00100000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_l2tpv2[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFF00, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_ppp[] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFF000, 0x0000003F, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_ipv4_frag[] = { + 0x00400000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +static const u32 ice_ptypes_ipv6_frag[] = { + 0x00000000, 0x00000000, 0x01000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 
0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Manage parameters and information used during the creation of a flow profile */ +struct ice_flow_prof_params { + enum ice_block blk; + u16 entry_length; /* # of bytes a formatted entry will require */ + u8 es_cnt; + struct ice_flow_prof *prof; + + /* For ACL, es[0] will have the data of ICE_RX_MDID_PKT_FLAGS_15_0. + * This will give us the direction flags. + */ + struct ice_fv_word es[ICE_MAX_FV_WORDS]; + /* attributes can be used to add attributes to a particular PTYPE */ + const struct ice_ptype_attributes *attr; + u16 attr_cnt; + + u16 mask[ICE_MAX_FV_WORDS]; + DECLARE_BITMAP(ptypes, ICE_FLOW_PTYPE_MAX); +}; + +#define ICE_FLOW_RSS_HDRS_INNER_MASK \ + (ICE_FLOW_SEG_HDR_PPPOE | ICE_FLOW_SEG_HDR_GTPC | \ + ICE_FLOW_SEG_HDR_GTPC_TEID | ICE_FLOW_SEG_HDR_GTPU | \ + ICE_FLOW_SEG_HDR_PFCP_SESSION | ICE_FLOW_SEG_HDR_L2TPV3 | \ + ICE_FLOW_SEG_HDR_ESP | ICE_FLOW_SEG_HDR_AH | \ + ICE_FLOW_SEG_HDR_NAT_T_ESP | ICE_FLOW_SEG_HDR_GTPU_NON_IP | \ + ICE_FLOW_SEG_HDR_ECPRI_TP0 | ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0 | \ + ICE_FLOW_SEG_HDR_L2TPV2 | ICE_FLOW_SEG_HDR_PPP) + +#define ICE_FLOW_SEG_HDRS_L3_MASK \ + (ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV6 | \ + ICE_FLOW_SEG_HDR_ARP) +#define ICE_FLOW_SEG_HDRS_L4_MASK \ + (ICE_FLOW_SEG_HDR_ICMP | ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | \ + ICE_FLOW_SEG_HDR_SCTP) +/* mask for L4 protocols that are NOT part of IPV4/6 OTHER PTYPE groups */ +#define ICE_FLOW_SEG_HDRS_L4_MASK_NO_OTHER \ + (ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_SCTP) + +/** + * ice_flow_val_hdrs - validates packet segments for valid protocol headers + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + */ +static enum ice_status +ice_flow_val_hdrs(struct ice_flow_seg_info *segs, u8 segs_cnt) +{ + u8 i; + + for (i = 0; i < segs_cnt; i++) { + /* Multiple L3 headers */ + if (segs[i].hdrs & ICE_FLOW_SEG_HDRS_L3_MASK && + !is_power_of_2(segs[i].hdrs & ICE_FLOW_SEG_HDRS_L3_MASK)) + return ICE_ERR_PARAM; + + /* Multiple L4 headers */ + if (segs[i].hdrs & ICE_FLOW_SEG_HDRS_L4_MASK && + !is_power_of_2(segs[i].hdrs & ICE_FLOW_SEG_HDRS_L4_MASK)) + return ICE_ERR_PARAM; + } + + return 0; +} + +/* Sizes of fixed known protocol headers without header options */ +#define ICE_FLOW_PROT_HDR_SZ_MAC 14 +#define ICE_FLOW_PROT_HDR_SZ_MAC_VLAN (ICE_FLOW_PROT_HDR_SZ_MAC + 2) +#define ICE_FLOW_PROT_HDR_SZ_IPV4 20 +#define ICE_FLOW_PROT_HDR_SZ_IPV6 40 +#define ICE_FLOW_PROT_HDR_SZ_ARP 28 +#define ICE_FLOW_PROT_HDR_SZ_ICMP 8 +#define ICE_FLOW_PROT_HDR_SZ_TCP 20 +#define ICE_FLOW_PROT_HDR_SZ_UDP 8 +#define ICE_FLOW_PROT_HDR_SZ_SCTP 12 + +/** + * ice_flow_calc_seg_sz - calculates size of a packet segment based on headers + * @params: information about the flow to be processed + * @seg: index of packet segment whose header size is to be determined + */ +static u16 ice_flow_calc_seg_sz(struct ice_flow_prof_params *params, u8 seg) +{ + u16 sz; + + /* L2 headers */ + sz = (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_VLAN) ?
+ ICE_FLOW_PROT_HDR_SZ_MAC_VLAN : ICE_FLOW_PROT_HDR_SZ_MAC; + + /* L3 headers */ + if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_IPV4) + sz += ICE_FLOW_PROT_HDR_SZ_IPV4; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_IPV6) + sz += ICE_FLOW_PROT_HDR_SZ_IPV6; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_ARP) + sz += ICE_FLOW_PROT_HDR_SZ_ARP; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDRS_L4_MASK) + /* An L3 header is required if L4 is specified */ + return 0; + + /* L4 headers */ + if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_ICMP) + sz += ICE_FLOW_PROT_HDR_SZ_ICMP; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_TCP) + sz += ICE_FLOW_PROT_HDR_SZ_TCP; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_UDP) + sz += ICE_FLOW_PROT_HDR_SZ_UDP; + else if (params->prof->segs[seg].hdrs & ICE_FLOW_SEG_HDR_SCTP) + sz += ICE_FLOW_PROT_HDR_SZ_SCTP; + + return sz; +} + +/** + * ice_flow_proc_seg_hdrs - process protocol headers present in pkt segments + * @params: information about the flow to be processed + * + * This function identifies the packet types associated with the protocol + * headers present in the packet segments of the specified flow profile. + */ +static enum ice_status +ice_flow_proc_seg_hdrs(struct ice_flow_prof_params *params) +{ + struct ice_flow_prof *prof; + u8 i; + + memset(params->ptypes, 0xff, sizeof(params->ptypes)); + + prof = params->prof; + + for (i = 0; i < params->prof->segs_cnt; i++) { + const unsigned long *src; + u32 hdrs; + + hdrs = prof->segs[i].hdrs; + + if (hdrs & ICE_FLOW_SEG_HDR_ETH) { + src = !i ? (const unsigned long *)ice_ptypes_mac_ofos : + (const unsigned long *)ice_ptypes_mac_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (i && hdrs & ICE_FLOW_SEG_HDR_VLAN) { + src = (const unsigned long *)ice_ptypes_macvlan_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (!i && hdrs & ICE_FLOW_SEG_HDR_ARP) { + bitmap_and(params->ptypes, params->ptypes, + (const unsigned long *)ice_ptypes_arp_of, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_ECPRI_TP0) { + src = (const unsigned long *)ice_ptypes_ecpri_tp0; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if ((hdrs & ICE_FLOW_SEG_HDR_IPV4) && + (hdrs & ICE_FLOW_SEG_HDR_IPV_OTHER)) { + src = i ? + (const unsigned long *)ice_ptypes_ipv4_il : + (const unsigned long *)ice_ptypes_ipv4_ofos_all; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV6) && + (hdrs & ICE_FLOW_SEG_HDR_IPV_OTHER)) { + src = i ? + (const unsigned long *)ice_ptypes_ipv6_il : + (const unsigned long *)ice_ptypes_ipv6_ofos_all; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV4) && + (hdrs & ICE_FLOW_SEG_HDR_IPV_FRAG)) { + src = (const unsigned long *)ice_ptypes_ipv4_frag; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV6) && + (hdrs & ICE_FLOW_SEG_HDR_IPV_FRAG)) { + src = (const unsigned long *)ice_ptypes_ipv6_frag; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV4) && + !(hdrs & ICE_FLOW_SEG_HDRS_L4_MASK_NO_OTHER)) { + src = !i ?
(const unsigned long *)ice_ptypes_ipv4_ofos_no_l4 : + (const unsigned long *)ice_ptypes_ipv4_il_no_l4; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_IPV4) { + src = !i ? (const unsigned long *)ice_ptypes_ipv4_ofos : + (const unsigned long *)ice_ptypes_ipv4_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if ((hdrs & ICE_FLOW_SEG_HDR_IPV6) && + !(hdrs & ICE_FLOW_SEG_HDRS_L4_MASK_NO_OTHER)) { + src = !i ? (const unsigned long *)ice_ptypes_ipv6_ofos_no_l4 : + (const unsigned long *)ice_ptypes_ipv6_il_no_l4; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_IPV6) { + src = !i ? (const unsigned long *)ice_ptypes_ipv6_ofos : + (const unsigned long *)ice_ptypes_ipv6_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_ETH_NON_IP) { + src = (const unsigned long *)ice_ptypes_mac_non_ip_ofos; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_PPPOE) { + src = (const unsigned long *)ice_ptypes_pppoe; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else { + src = (const unsigned long *)ice_ptypes_pppoe; + bitmap_andnot(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + if (hdrs & ICE_FLOW_SEG_HDR_UDP) { + src = (const unsigned long *)ice_ptypes_udp_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_TCP) { + bitmap_and(params->ptypes, params->ptypes, + (const unsigned long *)ice_ptypes_tcp_il, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_SCTP) { + src = (const unsigned long *)ice_ptypes_sctp_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_ICMP) { + src = !i ? 
(const unsigned long *)ice_ptypes_icmp_of : + (const unsigned long *)ice_ptypes_icmp_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_GRE) { + if (!i) { + src = (const unsigned long *)ice_ptypes_gre_of; + bitmap_and(params->ptypes, params->ptypes, + src, ICE_FLOW_PTYPE_MAX); + } + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPC) { + src = (const unsigned long *)ice_ptypes_gtpc; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPC_TEID) { + src = (const unsigned long *)ice_ptypes_gtpc_tid; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_NON_IP) { + src = (const unsigned long *)ice_ptypes_gtpu_no_ip; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_DWN) { + src = (const unsigned long *)ice_ptypes_gtpu; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + /* Attributes for GTP packet with downlink */ + params->attr = ice_attr_gtpu_down; + params->attr_cnt = ARRAY_SIZE(ice_attr_gtpu_down); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_UP) { + src = (const unsigned long *)ice_ptypes_gtpu; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + /* Attributes for GTP packet with uplink */ + params->attr = ice_attr_gtpu_up; + params->attr_cnt = ARRAY_SIZE(ice_attr_gtpu_up); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_EH) { + src = (const unsigned long *)ice_ptypes_gtpu; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + /* Attributes for GTP packet with Extension Header */ + params->attr = ice_attr_gtpu_eh; + params->attr_cnt = ARRAY_SIZE(ice_attr_gtpu_eh); + } else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_IP) { + src = (const unsigned long *)ice_ptypes_gtpu; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + /* Attributes for GTP packet without Extension Header */ + params->attr = ice_attr_gtpu_session; + params->attr_cnt = ARRAY_SIZE(ice_attr_gtpu_session); + } else if (hdrs & ICE_FLOW_SEG_HDR_L2TPV2) { + src = (const unsigned long *)ice_ptypes_l2tpv2; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_L2TPV3) { + src = (const unsigned long *)ice_ptypes_l2tpv3; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_ESP) { + src = (const unsigned long *)ice_ptypes_esp; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_AH) { + src = (const unsigned long *)ice_ptypes_ah; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_NAT_T_ESP) { + src = (const unsigned long *)ice_ptypes_nat_t_esp; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_VXLAN) { + src = (const unsigned long *)ice_ptypes_vxlan_vni; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0) { + src = (const unsigned long *)ice_ptypes_udp_ecpri_tp0; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_PPP) { + src = (const unsigned long *)ice_ptypes_ppp; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_PFCP) { + if (hdrs & 
ICE_FLOW_SEG_HDR_PFCP_NODE) + src = + (const unsigned long *)ice_ptypes_pfcp_node; + else + src = + (const unsigned long *)ice_ptypes_pfcp_session; + + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else { + src = (const unsigned long *)ice_ptypes_pfcp_node; + bitmap_andnot(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + + src = (const unsigned long *)ice_ptypes_pfcp_session; + bitmap_andnot(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + } + + return 0; +} + +/** + * ice_flow_xtract_pkt_flags - Create an extraction sequence entry for packet flags + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * @flags: The value of pkt_flags[x:x] in Rx/Tx MDID metadata. + * + * This function allocates an extraction sequence entry for a DWORD-sized + * chunk of the packet flags. + */ +static enum ice_status +ice_flow_xtract_pkt_flags(struct ice_hw *hw, + struct ice_flow_prof_params *params, + enum ice_flex_mdid_pkt_flags flags) +{ + u8 fv_words = hw->blk[params->blk].es.fvw; + u8 idx; + + /* Make sure the number of extraction sequence entries required does not + * exceed the block's capacity. + */ + if (params->es_cnt >= fv_words) + return ICE_ERR_MAX_LIMIT; + + /* some blocks require a reversed field vector layout */ + if (hw->blk[params->blk].es.reverse) + idx = fv_words - params->es_cnt - 1; + else + idx = params->es_cnt; + + params->es[idx].prot_id = ICE_PROT_META_ID; + params->es[idx].off = flags; + params->es_cnt++; + + return 0; +} + +/** + * ice_flow_xtract_fld - Create an extraction sequence entry for the given field + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * @seg: packet segment index of the field to be extracted + * @fld: ID of field to be extracted + * @match: bitfield of all fields + * + * This function determines the protocol ID, offset, and size of the given + * field. It then allocates one or more extraction sequence entries for the + * given field, and fills the entries with protocol ID and offset information. + */ +static enum ice_status +ice_flow_xtract_fld(struct ice_hw *hw, struct ice_flow_prof_params *params, + u8 seg, enum ice_flow_field fld, u64 match) +{ + enum ice_flow_field sib = ICE_FLOW_FIELD_IDX_MAX; + enum ice_prot_id prot_id = ICE_PROT_ID_INVAL; + u8 fv_words = hw->blk[params->blk].es.fvw; + struct ice_flow_fld_info *flds; + u16 cnt, ese_bits, i; + u16 sib_mask = 0; + u16 mask; + u16 off; + + flds = params->prof->segs[seg].fields; + + switch (fld) { + case ICE_FLOW_FIELD_IDX_ETH_DA: + case ICE_FLOW_FIELD_IDX_ETH_SA: + case ICE_FLOW_FIELD_IDX_S_VLAN: + case ICE_FLOW_FIELD_IDX_C_VLAN: + prot_id = seg == 0 ? ICE_PROT_MAC_OF_OR_S : ICE_PROT_MAC_IL; + break; + case ICE_FLOW_FIELD_IDX_ETH_TYPE: + prot_id = seg == 0 ? ICE_PROT_ETYPE_OL : ICE_PROT_ETYPE_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV4_DSCP: + prot_id = seg == 0 ? ICE_PROT_IPV4_OF_OR_S : ICE_PROT_IPV4_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV6_DSCP: + prot_id = seg == 0 ? ICE_PROT_IPV6_OF_OR_S : ICE_PROT_IPV6_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV4_TTL: + case ICE_FLOW_FIELD_IDX_IPV4_PROT: + prot_id = seg == 0 ? ICE_PROT_IPV4_OF_OR_S : ICE_PROT_IPV4_IL; + + /* TTL and PROT share the same extraction seq. entry. + * Each is considered a sibling to the other in terms of sharing + * the same extraction sequence entry.
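+ * (Per ice_flds_info above, IPv4 TTL is offset 8 with mask 0xff00 and + * IPv4 PROT is offset 8 with mask 0x00ff, the two halves of one word.)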
+ */ + if (fld == ICE_FLOW_FIELD_IDX_IPV4_TTL) + sib = ICE_FLOW_FIELD_IDX_IPV4_PROT; + else + sib = ICE_FLOW_FIELD_IDX_IPV4_TTL; + + /* If the sibling field is also included, that field's + * mask needs to be included. + */ + if (match & BIT(sib)) + sib_mask = ice_flds_info[sib].mask; + break; + case ICE_FLOW_FIELD_IDX_IPV6_TTL: + case ICE_FLOW_FIELD_IDX_IPV6_PROT: + prot_id = seg == 0 ? ICE_PROT_IPV6_OF_OR_S : ICE_PROT_IPV6_IL; + + /* TTL and PROT share the same extraction seq. entry. + * Each is considered a sibling to the other in terms of sharing + * the same extraction sequence entry. + */ + if (fld == ICE_FLOW_FIELD_IDX_IPV6_TTL) + sib = ICE_FLOW_FIELD_IDX_IPV6_PROT; + else + sib = ICE_FLOW_FIELD_IDX_IPV6_TTL; + + /* If the sibling field is also included, that field's + * mask needs to be included. + */ + if (match & BIT(sib)) + sib_mask = ice_flds_info[sib].mask; + break; + case ICE_FLOW_FIELD_IDX_IPV4_SA: + case ICE_FLOW_FIELD_IDX_IPV4_DA: + prot_id = seg == 0 ? ICE_PROT_IPV4_OF_OR_S : ICE_PROT_IPV4_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV4_ID: + prot_id = ICE_PROT_IPV4_OF_OR_S; + break; + case ICE_FLOW_FIELD_IDX_IPV6_SA: + case ICE_FLOW_FIELD_IDX_IPV6_DA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE32_SA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE32_DA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE48_SA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE48_DA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA: + case ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA: + prot_id = seg == 0 ? ICE_PROT_IPV6_OF_OR_S : ICE_PROT_IPV6_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV6_ID: + prot_id = ICE_PROT_IPV6_FRAG; + break; + case ICE_FLOW_FIELD_IDX_TCP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_TCP_DST_PORT: + case ICE_FLOW_FIELD_IDX_TCP_FLAGS: + prot_id = ICE_PROT_TCP_IL; + break; + case ICE_FLOW_FIELD_IDX_UDP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_UDP_DST_PORT: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_SCTP_DST_PORT: + prot_id = ICE_PROT_SCTP_IL; + break; + case ICE_FLOW_FIELD_IDX_VXLAN_VNI: + case ICE_FLOW_FIELD_IDX_GTPC_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_IP_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_UP_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_DWN_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_EH_TEID: + case ICE_FLOW_FIELD_IDX_GTPU_EH_QFI: + /* GTP is accessed through UDP OF protocol */ + prot_id = ICE_PROT_UDP_OF; + break; + case ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID: + prot_id = ICE_PROT_PPPOE; + break; + case ICE_FLOW_FIELD_IDX_PFCP_SEID: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID: + prot_id = ICE_PROT_L2TPV3; + break; + case ICE_FLOW_FIELD_IDX_ESP_SPI: + prot_id = ICE_PROT_ESP_F; + break; + case ICE_FLOW_FIELD_IDX_AH_SPI: + prot_id = ICE_PROT_ESP_2; + break; + case ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID: + prot_id = ICE_PROT_ECPRI; + break; + case ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_ARP_SIP: + case ICE_FLOW_FIELD_IDX_ARP_DIP: + case ICE_FLOW_FIELD_IDX_ARP_SHA: + case ICE_FLOW_FIELD_IDX_ARP_DHA: + case ICE_FLOW_FIELD_IDX_ARP_OP: + prot_id = ICE_PROT_ARP_OF; + break; + case ICE_FLOW_FIELD_IDX_ICMP_TYPE: + case ICE_FLOW_FIELD_IDX_ICMP_CODE: + /* ICMP type and code share the same extraction seq. entry */ + prot_id = (params->prof->segs[seg].hdrs & + ICE_FLOW_SEG_HDR_IPV4) ? + ICE_PROT_ICMP_IL : ICE_PROT_ICMPV6_IL; + sib = fld == ICE_FLOW_FIELD_IDX_ICMP_TYPE ? 
+ ICE_FLOW_FIELD_IDX_ICMP_CODE : + ICE_FLOW_FIELD_IDX_ICMP_TYPE; + break; + case ICE_FLOW_FIELD_IDX_GRE_KEYID: + prot_id = ICE_PROT_GRE_OF; + break; + default: + return ICE_ERR_NOT_IMPL; + } + + /* Each extraction sequence entry is a word in size, and extracts a + * word-aligned offset from a protocol header. + */ + ese_bits = ICE_FLOW_FV_EXTRACT_SZ * BITS_PER_BYTE; + + flds[fld].xtrct.prot_id = prot_id; + flds[fld].xtrct.off = (ice_flds_info[fld].off / ese_bits) * + ICE_FLOW_FV_EXTRACT_SZ; + flds[fld].xtrct.disp = (u8)(ice_flds_info[fld].off % ese_bits); + flds[fld].xtrct.idx = params->es_cnt; + flds[fld].xtrct.mask = ice_flds_info[fld].mask; + + /* Adjust the next field-entry index after accommodating the number of + * entries this field consumes + */ + cnt = DIV_ROUND_UP(flds[fld].xtrct.disp + ice_flds_info[fld].size, + ese_bits); + + /* Fill in the extraction sequence entries needed for this field */ + off = flds[fld].xtrct.off; + mask = flds[fld].xtrct.mask; + for (i = 0; i < cnt; i++) { + /* Only consume an extraction sequence entry if there is no + * sibling field associated with this field, or the sibling entry + * does not already extract the word shared with this field. + */ + if (sib == ICE_FLOW_FIELD_IDX_MAX || + flds[sib].xtrct.prot_id == ICE_PROT_ID_INVAL || + flds[sib].xtrct.off != off) { + u8 idx; + + /* Make sure the number of extraction sequence entries + * required does not exceed the block's capacity + */ + if (params->es_cnt >= fv_words) + return ICE_ERR_MAX_LIMIT; + + /* some blocks require a reversed field vector layout */ + if (hw->blk[params->blk].es.reverse) + idx = fv_words - params->es_cnt - 1; + else + idx = params->es_cnt; + + params->es[idx].prot_id = prot_id; + params->es[idx].off = off; + params->mask[idx] = mask | sib_mask; + params->es_cnt++; + } + + off += ICE_FLOW_FV_EXTRACT_SZ; + } + + return 0; +} + +/** + * ice_flow_xtract_raws - Create extraction sequence entries for raw bytes + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * @seg: index of packet segment whose raw fields are to be extracted + */ +static enum ice_status +ice_flow_xtract_raws(struct ice_hw *hw, struct ice_flow_prof_params *params, + u8 seg) +{ + u16 fv_words; + u16 hdrs_sz; + u8 i; + + if (!params->prof->segs[seg].raws_cnt) + return 0; + + if (params->prof->segs[seg].raws_cnt > + ARRAY_SIZE(params->prof->segs[seg].raws)) + return ICE_ERR_MAX_LIMIT; + + /* Offsets within the segment headers are not supported */ + hdrs_sz = ice_flow_calc_seg_sz(params, seg); + if (!hdrs_sz) + return ICE_ERR_PARAM; + + fv_words = hw->blk[params->blk].es.fvw; + + for (i = 0; i < params->prof->segs[seg].raws_cnt; i++) { + struct ice_flow_seg_fld_raw *raw; + u16 off, cnt, j; + + raw = &params->prof->segs[seg].raws[i]; + + /* Storing extraction information */ + raw->info.xtrct.prot_id = ICE_PROT_MAC_OF_OR_S; + raw->info.xtrct.off = (raw->off / ICE_FLOW_FV_EXTRACT_SZ) * + ICE_FLOW_FV_EXTRACT_SZ; + raw->info.xtrct.disp = (raw->off % ICE_FLOW_FV_EXTRACT_SZ) * + BITS_PER_BYTE; + raw->info.xtrct.idx = params->es_cnt; + + /* Determine the number of field vector entries this raw field + * consumes.
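+ * Each entry extracts ICE_FLOW_FV_EXTRACT_SZ bytes, so a span that + * starts at a non-zero bit displacement may spill into one extra entry.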
+ */ + cnt = DIV_ROUND_UP(raw->info.xtrct.disp + (raw->info.src.last * BITS_PER_BYTE), + (ICE_FLOW_FV_EXTRACT_SZ * BITS_PER_BYTE)); + off = raw->info.xtrct.off; + for (j = 0; j < cnt; j++) { + u16 idx; + + /* Make sure the number of extraction sequence entries + * required does not exceed the block's capacity + */ + if (params->es_cnt >= hw->blk[params->blk].es.count || + params->es_cnt >= ICE_MAX_FV_WORDS) + return ICE_ERR_MAX_LIMIT; + + /* some blocks require a reversed field vector layout */ + if (hw->blk[params->blk].es.reverse) + idx = fv_words - params->es_cnt - 1; + else + idx = params->es_cnt; + + params->es[idx].prot_id = raw->info.xtrct.prot_id; + params->es[idx].off = off; + params->es_cnt++; + off += ICE_FLOW_FV_EXTRACT_SZ; + } + } + + return 0; +} + +/** + * ice_flow_create_xtrct_seq - Create an extraction sequence for given segments + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * + * This function iterates through all matched fields in the given segments, and + * creates an extraction sequence for the fields. + */ +static enum ice_status +ice_flow_create_xtrct_seq(struct ice_hw *hw, + struct ice_flow_prof_params *params) +{ + enum ice_status status = 0; + u8 i; + + /* For ACL, we also need to extract the direction bit (Rx,Tx) data from + * packet flags + */ + if (params->blk == ICE_BLK_ACL) { + status = ice_flow_xtract_pkt_flags(hw, params, + ICE_RX_MDID_PKT_FLAGS_15_0); + if (status) + return status; + } + + for (i = 0; i < params->prof->segs_cnt; i++) { + u64 match = params->prof->segs[i].match; + enum ice_flow_field j; + + for_each_set_bit(j, (unsigned long *)&match, + ICE_FLOW_FIELD_IDX_MAX) { + status = ice_flow_xtract_fld(hw, params, i, j, match); + if (status) + return status; + clear_bit(j, (unsigned long *)&match); + } + + /* Process raw matching bytes */ + status = ice_flow_xtract_raws(hw, params, i); + if (status) + return status; + } + + return status; +} + +/** + * ice_flow_sel_acl_scen - select an ACL scenario for the flow profile + * @hw: pointer to the hardware structure + * @params: information about the flow to be processed + * + * This function selects the best-fit ACL scenario for the entry width + * required by the given flow profile. + */ +static enum ice_status +ice_flow_sel_acl_scen(struct ice_hw *hw, struct ice_flow_prof_params *params) +{ + /* Find the best-fit scenario for the provided match width */ + struct ice_acl_scen *cand_scen = NULL, *scen; + + if (!hw->acl_tbl) + return ICE_ERR_DOES_NOT_EXIST; + + /* Loop through each scenario and match against the scenario width + * to select the specific scenario + */ + list_for_each_entry(scen, &hw->acl_tbl->scens, list_entry) + if (scen->eff_width >= params->entry_length && + (!cand_scen || cand_scen->eff_width > scen->eff_width)) + cand_scen = scen; + if (!cand_scen) + return ICE_ERR_DOES_NOT_EXIST; + + params->prof->cfg.scen = cand_scen; + + return 0; +} + +/** + * ice_flow_acl_def_entry_frmt - Determine the layout of flow entries + * @params: information about the flow to be processed + */ +static enum ice_status +ice_flow_acl_def_entry_frmt(struct ice_flow_prof_params *params) +{ + u16 index, i, range_idx = 0; + + index = ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + + for (i = 0; i < params->prof->segs_cnt; i++) { + struct ice_flow_seg_info *seg = &params->prof->segs[i]; + u8 j; + + for_each_set_bit(j, (unsigned long *)&seg->match, + ICE_FLOW_FIELD_IDX_MAX) { + struct ice_flow_fld_info *fld = &seg->fields[j]; + + fld->entry.mask = ICE_FLOW_FLD_OFF_INVAL; + + if (fld->type == ICE_FLOW_FLD_TYPE_RANGE) { +
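/* A ranged field consumes one of the profile's range checkers + * rather than bytes in the byte-selection area. + */ +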
fld->entry.last = ICE_FLOW_FLD_OFF_INVAL; + + /* Range checking is only supported for single + * words + */ + if (DIV_ROUND_UP(ice_flds_info[j].size + fld->xtrct.disp, BITS_PER_BYTE * 2) > 1) + return ICE_ERR_PARAM; + + /* Ranges must define low and high values */ + if (fld->src.val == ICE_FLOW_FLD_OFF_INVAL || + fld->src.last == ICE_FLOW_FLD_OFF_INVAL) + return ICE_ERR_PARAM; + + fld->entry.val = range_idx++; + } else { + /* Store adjusted byte-length of field for later + * use, taking into account potential + * non-byte-aligned displacement + */ + fld->entry.last = DIV_ROUND_UP(ice_flds_info[j].size + (fld->xtrct.disp % BITS_PER_BYTE), + BITS_PER_BYTE); + fld->entry.val = index; + index += fld->entry.last; + } + } + + for (j = 0; j < seg->raws_cnt; j++) { + struct ice_flow_seg_fld_raw *raw = &seg->raws[j]; + + raw->info.entry.mask = ICE_FLOW_FLD_OFF_INVAL; + raw->info.entry.val = index; + raw->info.entry.last = raw->info.src.last; + index += raw->info.entry.last; + } + } + + /* We currently only support using the byte selection base, which + * allows for an effective entry size of 30 bytes. Reject anything + * larger. + */ + if (index > ICE_AQC_ACL_PROF_BYTE_SEL_ELEMS) + return ICE_ERR_PARAM; + + /* Only 8 range checkers are available per profile; reject anything + * trying to use more + */ + if (range_idx > ICE_AQC_ACL_PROF_RANGES_NUM_CFG) + return ICE_ERR_PARAM; + + /* Store the number of bytes required for the entry for later use */ + params->entry_length = index - ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + + return 0; +} + +/** + * ice_flow_proc_segs - process all packet segments associated with a profile + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + */ +static enum ice_status +ice_flow_proc_segs(struct ice_hw *hw, struct ice_flow_prof_params *params) +{ + enum ice_status status; + + status = ice_flow_proc_seg_hdrs(params); + if (status) + return status; + + status = ice_flow_create_xtrct_seq(hw, params); + if (status) + return status; + + switch (params->blk) { + case ICE_BLK_FD: + case ICE_BLK_RSS: + status = 0; + break; + case ICE_BLK_ACL: + status = ice_flow_acl_def_entry_frmt(params); + if (status) + return status; + status = ice_flow_sel_acl_scen(hw, params); + if (status) + return status; + break; + default: + return ICE_ERR_NOT_IMPL; + } + + return status; +} + +#define ICE_FLOW_FIND_PROF_CHK_FLDS 0x00000001 +#define ICE_FLOW_FIND_PROF_CHK_VSI 0x00000002 +#define ICE_FLOW_FIND_PROF_NOT_CHK_DIR 0x00000004 + +/** + * ice_flow_find_prof_conds - Find a profile matching headers and conditions + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + * @vsi_handle: software VSI handle to check VSI (ICE_FLOW_FIND_PROF_CHK_VSI) + * @conds: additional conditions to be checked (ICE_FLOW_FIND_PROF_CHK_*) + */ +static struct ice_flow_prof * +ice_flow_find_prof_conds(struct ice_hw *hw, enum ice_block blk, + enum ice_flow_dir dir, struct ice_flow_seg_info *segs, + u8 segs_cnt, u16 vsi_handle, u32 conds) +{ + struct ice_flow_prof *p, *prof = NULL; + + mutex_lock(&hw->fl_profs_locks[blk]); + list_for_each_entry(p, &hw->fl_profs[blk], l_entry) + if ((p->dir == dir || conds & ICE_FLOW_FIND_PROF_NOT_CHK_DIR) && + segs_cnt && segs_cnt == p->segs_cnt) { + u8 i; + + /* Check for profile-VSI association if specified */ + if ((conds & ICE_FLOW_FIND_PROF_CHK_VSI) && + ice_is_vsi_valid(hw, vsi_handle) && + !test_bit(vsi_handle, p->vsis))
+ continue; + + /* Protocol headers must be checked. Matched fields are + * checked if specified. + */ + for (i = 0; i < segs_cnt; i++) + if (segs[i].hdrs != p->segs[i].hdrs || + ((conds & ICE_FLOW_FIND_PROF_CHK_FLDS) && + segs[i].match != p->segs[i].match)) + break; + + /* A match is found if all segments are matched */ + if (i == segs_cnt) { + prof = p; + break; + } + } + mutex_unlock(&hw->fl_profs_locks[blk]); + + return prof; +} + +/** + * ice_flow_find_prof - Look up a profile matching headers and matched fields + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + */ +u64 +ice_flow_find_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + struct ice_flow_seg_info *segs, u8 segs_cnt) +{ + struct ice_flow_prof *p; + + p = ice_flow_find_prof_conds(hw, blk, dir, segs, segs_cnt, + ICE_MAX_VSI, ICE_FLOW_FIND_PROF_CHK_FLDS); + + return p ? p->id : ICE_FLOW_PROF_ID_INVAL; +} + +/** + * ice_flow_find_prof_id - Look up a profile with given profile ID + * @hw: pointer to the HW struct + * @blk: classification stage + * @prof_id: unique ID to identify this flow profile + */ +static struct ice_flow_prof * +ice_flow_find_prof_id(struct ice_hw *hw, enum ice_block blk, u64 prof_id) +{ + struct ice_flow_prof *p; + + list_for_each_entry(p, &hw->fl_profs[blk], l_entry) + if (p->id == prof_id) + return p; + + return NULL; +} + +/** + * ice_dealloc_flow_entry - Deallocate flow entry memory + * @hw: pointer to the HW struct + * @entry: flow entry to be removed + */ +static void +ice_dealloc_flow_entry(struct ice_hw *hw, struct ice_flow_entry *entry) +{ + if (!entry) + return; + + if (entry->entry) + devm_kfree(ice_hw_to_dev(hw), entry->entry); + + if (entry->range_buf) { + devm_kfree(ice_hw_to_dev(hw), entry->range_buf); + entry->range_buf = NULL; + } + + if (entry->acts) { + devm_kfree(ice_hw_to_dev(hw), entry->acts); + entry->acts = NULL; + entry->acts_cnt = 0; + } + + devm_kfree(ice_hw_to_dev(hw), entry); +} + +/** + * ice_flow_get_hw_prof - return the HW profile for a specific profile ID handle + * @hw: pointer to the HW struct + * @blk: classification stage + * @prof_id: the profile ID handle + * @hw_prof_id: pointer to variable to receive the HW profile ID + */ +enum ice_status +ice_flow_get_hw_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id, + u8 *hw_prof_id) +{ + enum ice_status status = ICE_ERR_DOES_NOT_EXIST; + struct ice_prof_map *map; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + map = ice_search_prof_id(hw, blk, prof_id); + if (map) { + *hw_prof_id = map->prof_id; + status = 0; + } + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +#define ICE_ACL_INVALID_SCEN 0x3f + +/** + * ice_flow_acl_is_prof_in_use - Verify if the profile is associated with any PF + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * @buf: destination buffer the function writes the partial extraction sequence to + * + * Returns 0 if no PF is associated with the given profile, + * returns ICE_ERR_IN_USE if at least one PF is associated with the given + * profile, and returns another error code on a real error. + */ +static enum ice_status +ice_flow_acl_is_prof_in_use(struct ice_hw *hw, struct ice_flow_prof *prof, + struct ice_aqc_acl_prof_generic_frmt *buf) +{ + enum ice_status status; + u8 prof_id = 0; + + status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, &prof_id); + if (status) + return status;
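+ /* The query below retrieves the per-PF scenario table for this profile */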
+ + status = ice_query_acl_prof(hw, prof_id, buf, NULL); + if (status) + return status; + + /* If the given profile's per-PF scenario numbers are all 0 or all + * ICE_ACL_INVALID_SCEN (63), the profile has not been configured + * yet. + */ + if (buf->pf_scenario_num[0] == 0 && buf->pf_scenario_num[1] == 0 && + buf->pf_scenario_num[2] == 0 && buf->pf_scenario_num[3] == 0 && + buf->pf_scenario_num[4] == 0 && buf->pf_scenario_num[5] == 0 && + buf->pf_scenario_num[6] == 0 && buf->pf_scenario_num[7] == 0) + return 0; + + if (buf->pf_scenario_num[0] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[1] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[2] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[3] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[4] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[5] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[6] == ICE_ACL_INVALID_SCEN && + buf->pf_scenario_num[7] == ICE_ACL_INVALID_SCEN) + return 0; + + return ICE_ERR_IN_USE; +} + +/** + * ice_flow_acl_free_act_cntr - Free the ACL rule's actions + * @hw: pointer to the hardware structure + * @acts: array of actions to be performed on a match + * @acts_cnt: number of actions + */ +static enum ice_status +ice_flow_acl_free_act_cntr(struct ice_hw *hw, struct ice_flow_action *acts, + u8 acts_cnt) +{ + int i; + + for (i = 0; i < acts_cnt; i++) { + if (acts[i].type == ICE_FLOW_ACT_CNTR_PKT || + acts[i].type == ICE_FLOW_ACT_CNTR_BYTES || + acts[i].type == ICE_FLOW_ACT_CNTR_PKT_BYTES) { + struct ice_acl_cntrs cntrs = { 0 }; + enum ice_status status; + + /* amount is unused in the dealloc path but the common + * parameter check routine wants a value set, as zero + * is invalid for the check. Just set it. + */ + cntrs.amount = 1; + cntrs.bank = 0; /* Only bank0 for the moment */ + cntrs.first_cntr = + le16_to_cpu(acts[i].data.acl_act.value); + cntrs.last_cntr = + le16_to_cpu(acts[i].data.acl_act.value); + + if (acts[i].type == ICE_FLOW_ACT_CNTR_PKT_BYTES) + cntrs.type = ICE_AQC_ACL_CNT_TYPE_DUAL; + else + cntrs.type = ICE_AQC_ACL_CNT_TYPE_SINGLE; + + status = ice_aq_dealloc_acl_cntrs(hw, &cntrs, NULL); + if (status) + return status; + } + } + return 0; +} + +/** + * ice_flow_acl_disassoc_scen - Disassociate the scenario from the profile + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * + * Disassociate the scenario from the profile for the PF of the VSI.
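+ * This is done by writing ICE_ACL_INVALID_SCEN into this PF's slot of the + * profile's scenario table and reprogramming the extraction sequence.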
+ */
+static enum ice_status
+ice_flow_acl_disassoc_scen(struct ice_hw *hw, struct ice_flow_prof *prof)
+{
+	struct ice_aqc_acl_prof_generic_frmt buf;
+	enum ice_status status = 0;
+	u8 prof_id = 0;
+
+	memset(&buf, 0, sizeof(buf));
+
+	status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, &prof_id);
+	if (status)
+		return status;
+
+	status = ice_query_acl_prof(hw, prof_id, &buf, NULL);
+	if (status)
+		return status;
+
+	/* Clear scenario for this PF */
+	buf.pf_scenario_num[hw->pf_id] = ICE_ACL_INVALID_SCEN;
+	status = ice_prgm_acl_prof_xtrct(hw, prof_id, &buf, NULL);
+
+	return status;
+}
+
+/**
+ * ice_flow_rem_entry_sync - Remove a flow entry
+ * @hw: pointer to the HW struct
+ * @blk: classification stage
+ * @entry: flow entry to be removed
+ */
+static enum ice_status
+ice_flow_rem_entry_sync(struct ice_hw *hw, enum ice_block blk,
+			struct ice_flow_entry *entry)
+{
+	if (!entry)
+		return ICE_ERR_BAD_PTR;
+
+	if (blk == ICE_BLK_ACL) {
+		enum ice_status status;
+
+		if (ice_dcf_is_acl_capable(hw))
+			return ICE_ERR_IN_USE;
+		if (!entry->prof)
+			return ICE_ERR_BAD_PTR;
+
+		status = ice_acl_rem_entry(hw, entry->prof->cfg.scen,
+					   entry->scen_entry_idx);
+		if (status)
+			return status;
+
+		/* Checks if we need to release an ACL counter. */
+		if (entry->acts_cnt && entry->acts)
+			ice_flow_acl_free_act_cntr(hw, entry->acts,
+						   entry->acts_cnt);
+	}
+
+	list_del(&entry->l_entry);
+
+	ice_dealloc_flow_entry(hw, entry);
+
+	return 0;
+}
+
+/**
+ * ice_flow_add_prof_sync - Add a flow profile for packet segments and fields
+ * @hw: pointer to the HW struct
+ * @blk: classification stage
+ * @dir: flow direction
+ * @prof_id: unique ID to identify this flow profile
+ * @segs: array of one or more packet segments that describe the flow
+ * @segs_cnt: number of packet segments provided
+ * @acts: array of default actions
+ * @acts_cnt: number of default actions
+ * @prof: stores the returned flow profile added
+ *
+ * Assumption: the caller has acquired the lock to the profile list
+ */
+static enum ice_status
+ice_flow_add_prof_sync(struct ice_hw *hw, enum ice_block blk,
+		       enum ice_flow_dir dir, u64 prof_id,
+		       struct ice_flow_seg_info *segs, u8 segs_cnt,
+		       struct ice_flow_action *acts, u8 acts_cnt,
+		       struct ice_flow_prof **prof)
+{
+	struct ice_flow_prof_params *params;
+	enum ice_status status;
+	u8 i;
+
+	if (!prof || (acts_cnt && !acts))
+		return ICE_ERR_BAD_PTR;
+
+	params = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*params), GFP_KERNEL);
+	if (!params)
+		return ICE_ERR_NO_MEMORY;
+
+	params->prof = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*params->prof),
+				    GFP_KERNEL);
+	if (!params->prof) {
+		status = ICE_ERR_NO_MEMORY;
+		goto free_params;
+	}
+
+	/* initialize extraction sequence to all invalid (0xff) */
+	for (i = 0; i < ICE_MAX_FV_WORDS; i++) {
+		params->es[i].prot_id = ICE_PROT_INVALID;
+		params->es[i].off = ICE_FV_OFFSET_INVAL;
+	}
+
+	params->blk = blk;
+	params->prof->id = prof_id;
+	params->prof->dir = dir;
+	params->prof->segs_cnt = segs_cnt;
+
+	/* Make a copy of the segments that need to be persistent in the flow
+	 * profile instance
+	 */
+	for (i = 0; i < segs_cnt; i++)
+		memcpy(&params->prof->segs[i], &segs[i], sizeof(*segs));
+
+	/* Make a copy of the actions that need to be persistent in the flow
+	 * profile instance.
+	 */
+	if (acts_cnt) {
+		params->prof->acts = devm_kmemdup(ice_hw_to_dev(hw), acts,
+						  acts_cnt * sizeof(*acts),
+						  GFP_KERNEL);
+
+		if (!params->prof->acts) {
+			status = ICE_ERR_NO_MEMORY;
+			goto out;
+		}
+	}
+
+	status = ice_flow_proc_segs(hw, params);
+	if (status) {
+		ice_debug(hw, ICE_DBG_FLOW, "Error processing a flow's packet segments\n");
+		goto out;
+	}
+
+	/* Add a HW profile for this flow profile */
+	status = ice_add_prof(hw, blk, prof_id, (u8 *)params->ptypes,
+			      params->attr, params->attr_cnt, params->es,
+			      params->mask);
+	if (status) {
+		ice_debug(hw, ICE_DBG_FLOW, "Error adding a HW flow profile\n");
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&params->prof->entries);
+	mutex_init(&params->prof->entries_lock);
+	*prof = params->prof;
+
+out:
+	if (status) {
+		if (params->prof->acts)
+			devm_kfree(ice_hw_to_dev(hw), params->prof->acts);
+		devm_kfree(ice_hw_to_dev(hw), params->prof);
+	}
+free_params:
+	devm_kfree(ice_hw_to_dev(hw), params);
+
+	return status;
+}
+
+/**
+ * ice_flow_rem_prof_sync - remove a flow profile
+ * @hw: pointer to the hardware structure
+ * @blk: classification stage
+ * @prof: pointer to flow profile to remove
+ *
+ * Assumption: the caller has acquired the lock to the profile list
+ */
+static enum ice_status
+ice_flow_rem_prof_sync(struct ice_hw *hw, enum ice_block blk,
+		       struct ice_flow_prof *prof)
+{
+	enum ice_status status;
+
+	/* Remove all remaining flow entries before removing the flow profile */
+	if (!list_empty(&prof->entries)) {
+		struct ice_flow_entry *e, *t;
+
+		mutex_lock(&prof->entries_lock);
+
+		list_for_each_entry_safe(e, t, &prof->entries, l_entry) {
+			status = ice_flow_rem_entry_sync(hw, blk, e);
+			if (status)
+				break;
+		}
+
+		mutex_unlock(&prof->entries_lock);
+	}
+
+	if (blk == ICE_BLK_ACL) {
+		struct ice_aqc_acl_profile_ranges query_rng_buf;
+		struct ice_aqc_acl_prof_generic_frmt buf;
+		u8 prof_id = 0;
+
+		/* Disassociate the scenario from the profile for the PF */
+		status = ice_flow_acl_disassoc_scen(hw, prof);
+		if (status)
+			return status;
+
+		/* Clear the range-checker if the profile ID is no longer
+		 * used by any PF
+		 */
+		status = ice_flow_acl_is_prof_in_use(hw, prof, &buf);
+		if (status && status != ICE_ERR_IN_USE) {
+			return status;
+		} else if (!status) {
+			/* Clear the range-checker value for profile ID */
+			memset(&query_rng_buf, 0,
+			       sizeof(struct ice_aqc_acl_profile_ranges));
+
+			status = ice_flow_get_hw_prof(hw, blk, prof->id,
+						      &prof_id);
+			if (status)
+				return status;
+
+			status = ice_prog_acl_prof_ranges(hw, prof_id,
+							  &query_rng_buf, NULL);
+			if (status)
+				return status;
+		}
+	}
+
+	/* Remove all hardware profiles associated with this flow profile */
+	status = ice_rem_prof(hw, blk, prof->id);
+	if (!status) {
+		list_del(&prof->l_entry);
+		mutex_destroy(&prof->entries_lock);
+		if (prof->acts)
+			devm_kfree(ice_hw_to_dev(hw), prof->acts);
+		devm_kfree(ice_hw_to_dev(hw), prof);
+	}
+
+	return status;
+}
+
+/**
+ * ice_flow_acl_set_xtrct_seq_fld - Populate xtrct seq for single field
+ * @buf: destination buffer the function writes the partial xtrct sequence to
+ * @info: info about field
+ */
+static void
+ice_flow_acl_set_xtrct_seq_fld(struct ice_aqc_acl_prof_generic_frmt *buf,
+			       struct ice_flow_fld_info *info)
+{
+	u16 dst, i;
+	u8 src;
+
+	src = info->xtrct.idx * ICE_FLOW_FV_EXTRACT_SZ +
+		info->xtrct.disp / BITS_PER_BYTE;
+	dst = info->entry.val;
+	for (i = 0; i < info->entry.last; i++)
+		/* HW stores field vector words in LE, convert words back to BE
+		 * so constructed entries will end up in network order
+		 */
+		buf->byte_selection[dst++] = src++ ^ 1;
+}
+
+/**
+ * ice_flow_acl_set_xtrct_seq - Program ACL extraction sequence
+ * @hw: pointer to the hardware structure
+ * @prof: pointer to flow profile
+ */
+static enum ice_status
+ice_flow_acl_set_xtrct_seq(struct ice_hw *hw, struct ice_flow_prof *prof)
+{
+	struct ice_aqc_acl_prof_generic_frmt buf;
+	struct ice_flow_fld_info *info;
+	enum ice_status status;
+	u8 prof_id = 0;
+	u16 i;
+
+	memset(&buf, 0, sizeof(buf));
+
+	status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, &prof_id);
+	if (status)
+		return status;
+
+	status = ice_flow_acl_is_prof_in_use(hw, prof, &buf);
+	if (status && status != ICE_ERR_IN_USE)
+		return status;
+
+	if (!status) {
+		/* Program the profile dependent configuration. This is done
+		 * only once regardless of the number of PFs using that profile
+		 */
+		memset(&buf, 0, sizeof(buf));
+
+		for (i = 0; i < prof->segs_cnt; i++) {
+			struct ice_flow_seg_info *seg = &prof->segs[i];
+			u16 j;
+
+			for_each_set_bit(j, (unsigned long *)&seg->match,
+					 ICE_FLOW_FIELD_IDX_MAX) {
+				info = &seg->fields[j];
+
+				if (info->type == ICE_FLOW_FLD_TYPE_RANGE)
+					buf.word_selection[info->entry.val] =
+						info->xtrct.idx;
+				else
+					ice_flow_acl_set_xtrct_seq_fld(&buf,
+								       info);
+			}
+
+			for (j = 0; j < seg->raws_cnt; j++) {
+				info = &seg->raws[j].info;
+				ice_flow_acl_set_xtrct_seq_fld(&buf, info);
+			}
+		}
+
+		memset(&buf.pf_scenario_num[0], ICE_ACL_INVALID_SCEN,
+		       ICE_AQC_ACL_PROF_PF_SCEN_NUM_ELEMS);
+	}
+
+	/* Update the current PF */
+	buf.pf_scenario_num[hw->pf_id] = (u8)prof->cfg.scen->id;
+	status = ice_prgm_acl_prof_xtrct(hw, prof_id, &buf, NULL);
+
+	return status;
+}
+
+/**
+ * ice_flow_assoc_vsig_vsi - associate a VSI with VSIG
+ * @hw: pointer to the hardware structure
+ * @blk: classification stage
+ * @vsi_handle: software VSI handle
+ * @vsig: target VSI group
+ *
+ * Assumption: the caller has already verified that the VSI to
+ * be added has the same characteristics as the VSIG and will
+ * thereby have access to all resources added to that VSIG.
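+ *
+ * A minimal usage sketch (the VSIG value here is purely illustrative and
+ * would normally come from an earlier profile/VSIG lookup):
+ *
+ *	status = ice_flow_assoc_vsig_vsi(hw, ICE_BLK_FD, vsi_handle, vsig);
+ *	if (status)
+ *		return status;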
+ */ +enum ice_status +ice_flow_assoc_vsig_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi_handle, + u16 vsig) +{ + enum ice_status status; + + if (!ice_is_vsi_valid(hw, vsi_handle) || blk >= ICE_BLK_COUNT) + return ICE_ERR_PARAM; + + mutex_lock(&hw->fl_profs_locks[blk]); + status = ice_add_vsi_flow(hw, blk, ice_get_hw_vsi_num(hw, vsi_handle), + vsig); + mutex_unlock(&hw->fl_profs_locks[blk]); + + return status; +} + +/** + * ice_flow_assoc_prof - associate a VSI with a flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @prof: pointer to flow profile + * @vsi_handle: software VSI handle + * + * Assumption: the caller has acquired the lock to the profile list + * and the software VSI handle has been validated + */ +static enum ice_status +ice_flow_assoc_prof(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_prof *prof, u16 vsi_handle) +{ + enum ice_status status = 0; + + if (!test_bit(vsi_handle, prof->vsis)) { + if (blk == ICE_BLK_ACL) { + status = ice_flow_acl_set_xtrct_seq(hw, prof); + if (status) + return status; + } + status = ice_add_prof_id_flow(hw, blk, + ice_get_hw_vsi_num(hw, + vsi_handle), + prof->id); + if (!status) + set_bit(vsi_handle, prof->vsis); + else + ice_debug(hw, ICE_DBG_FLOW, "HW profile add failed, %d\n", + status); + } + + return status; +} + +/** + * ice_flow_disassoc_prof - disassociate a VSI from a flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @prof: pointer to flow profile + * @vsi_handle: software VSI handle + * + * Assumption: the caller has acquired the lock to the profile list + * and the software VSI handle has been validated + */ +static enum ice_status +ice_flow_disassoc_prof(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_prof *prof, u16 vsi_handle) +{ + enum ice_status status = 0; + + if (test_bit(vsi_handle, prof->vsis)) { + status = ice_rem_prof_id_flow(hw, blk, + ice_get_hw_vsi_num(hw, + vsi_handle), + prof->id); + if (!status) + clear_bit(vsi_handle, prof->vsis); + else + ice_debug(hw, ICE_DBG_FLOW, "HW profile remove failed, %d\n", + status); + } + + return status; +} + +/** + * ice_flow_add_prof - Add a flow profile for packet segments and matched fields + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @prof_id: unique ID to identify this flow profile + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + * @acts: array of default actions + * @acts_cnt: number of default actions + * @prof: stores the returned flow profile added + */ +enum ice_status +ice_flow_add_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + u64 prof_id, struct ice_flow_seg_info *segs, u8 segs_cnt, + struct ice_flow_action *acts, u8 acts_cnt, + struct ice_flow_prof **prof) +{ + enum ice_status status; + + if (segs_cnt > ICE_FLOW_SEG_MAX) + return ICE_ERR_MAX_LIMIT; + + if (!segs_cnt) + return ICE_ERR_PARAM; + + if (!segs) + return ICE_ERR_BAD_PTR; + + status = ice_flow_val_hdrs(segs, segs_cnt); + if (status) + return status; + + mutex_lock(&hw->fl_profs_locks[blk]); + + status = ice_flow_add_prof_sync(hw, blk, dir, prof_id, segs, segs_cnt, + acts, acts_cnt, prof); + if (!status) + list_add(&(*prof)->l_entry, &hw->fl_profs[blk]); + + mutex_unlock(&hw->fl_profs_locks[blk]); + + return status; +} + +/** + * ice_flow_rem_prof - Remove a flow profile and all entries associated with it + * @hw: pointer to the HW struct + * @blk: the block for which 
+ *	 the flow profile is to be removed
+ * @prof_id: unique ID of the flow profile to be removed
+ */
+enum ice_status
+ice_flow_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id)
+{
+	struct ice_flow_prof *prof;
+	enum ice_status status;
+
+	mutex_lock(&hw->fl_profs_locks[blk]);
+
+	prof = ice_flow_find_prof_id(hw, blk, prof_id);
+	if (!prof) {
+		status = ICE_ERR_DOES_NOT_EXIST;
+		goto out;
+	}
+
+	/* prof becomes invalid after the call */
+	status = ice_flow_rem_prof_sync(hw, blk, prof);
+
+out:
+	mutex_unlock(&hw->fl_profs_locks[blk]);
+
+	return status;
+}
+
+/**
+ * ice_flow_find_entry - look for a flow entry using its unique ID
+ * @hw: pointer to the HW struct
+ * @blk: classification stage
+ * @entry_id: unique ID to identify this flow entry
+ *
+ * This function looks for the flow entry with the specified unique ID in all
+ * flow profiles of the specified classification stage. If the entry is found,
+ * it returns the handle to the flow entry. Otherwise, it returns
+ * ICE_FLOW_ENTRY_HANDLE_INVAL.
+ */
+u64 ice_flow_find_entry(struct ice_hw *hw, enum ice_block blk, u64 entry_id)
+{
+	struct ice_flow_entry *found = NULL;
+	struct ice_flow_prof *p;
+
+	mutex_lock(&hw->fl_profs_locks[blk]);
+
+	list_for_each_entry(p, &hw->fl_profs[blk], l_entry) {
+		struct ice_flow_entry *e;
+
+		mutex_lock(&p->entries_lock);
+		list_for_each_entry(e, &p->entries, l_entry)
+			if (e->id == entry_id) {
+				found = e;
+				break;
+			}
+		mutex_unlock(&p->entries_lock);
+
+		if (found)
+			break;
+	}
+
+	mutex_unlock(&hw->fl_profs_locks[blk]);
+
+	return found ? ICE_FLOW_ENTRY_HNDL(found) : ICE_FLOW_ENTRY_HANDLE_INVAL;
+}
+
+/**
+ * ice_flow_acl_check_actions - Checks the ACL rule's actions
+ * @hw: pointer to the hardware structure
+ * @acts: array of actions to be performed on a match
+ * @acts_cnt: number of actions
+ * @cnt_alloc: indicates if an ACL counter has been allocated.
+ */
+static enum ice_status
+ice_flow_acl_check_actions(struct ice_hw *hw, struct ice_flow_action *acts,
+			   u8 acts_cnt, bool *cnt_alloc)
+{
+	DECLARE_BITMAP(dup_check, ICE_AQC_TBL_MAX_ACTION_PAIRS * 2);
+	int i;
+
+	bitmap_zero(dup_check, ICE_AQC_TBL_MAX_ACTION_PAIRS * 2);
+	*cnt_alloc = false;
+
+	if (acts_cnt > ICE_FLOW_ACL_MAX_NUM_ACT)
+		return ICE_ERR_OUT_OF_RANGE;
+
+	for (i = 0; i < acts_cnt; i++) {
+		if (acts[i].type != ICE_FLOW_ACT_NOP &&
+		    acts[i].type != ICE_FLOW_ACT_DROP &&
+		    acts[i].type != ICE_FLOW_ACT_CNTR_PKT &&
+		    acts[i].type != ICE_FLOW_ACT_FWD_QUEUE)
+			return ICE_ERR_CFG;
+
+		/* If the caller wants to add two actions of the same type,
+		 * it is considered an invalid configuration.
+		 */
+		if (test_and_set_bit(acts[i].type, dup_check))
+			return ICE_ERR_PARAM;
+	}
+
+	/* Checks if ACL counters are needed.
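+	 * Packet and byte counters are allocated from bank 0, and the base
+	 * counter index is written back into the action's value field.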
*/ + for (i = 0; i < acts_cnt; i++) { + if (acts[i].type == ICE_FLOW_ACT_CNTR_PKT || + acts[i].type == ICE_FLOW_ACT_CNTR_BYTES || + acts[i].type == ICE_FLOW_ACT_CNTR_PKT_BYTES) { + struct ice_acl_cntrs cntrs = { 0 }; + enum ice_status status; + + cntrs.amount = 1; + cntrs.bank = 0; /* Only bank0 for the moment */ + + if (acts[i].type == ICE_FLOW_ACT_CNTR_PKT_BYTES) + cntrs.type = ICE_AQC_ACL_CNT_TYPE_DUAL; + else + cntrs.type = ICE_AQC_ACL_CNT_TYPE_SINGLE; + + status = ice_aq_alloc_acl_cntrs(hw, &cntrs, NULL); + if (status) + return status; + /* Counter index within the bank */ + acts[i].data.acl_act.value = + cpu_to_le16(cntrs.first_cntr); + *cnt_alloc = true; + } + } + + return 0; +} + +/** + * ice_flow_acl_frmt_entry_range - Format an ACL range checker for a given field + * @fld: number of the given field + * @info: info about field + * @range_buf: range checker configuration buffer + * @data: pointer to a data buffer containing flow entry's match values/masks + * @range: Input/output param indicating which range checkers are being used + */ +static void +ice_flow_acl_frmt_entry_range(u16 fld, struct ice_flow_fld_info *info, + struct ice_aqc_acl_profile_ranges *range_buf, + u8 *data, u8 *range) +{ + u16 new_mask; + + /* If not specified, default mask is all bits in field */ + new_mask = (info->src.mask == ICE_FLOW_FLD_OFF_INVAL ? + BIT(ice_flds_info[fld].size) - 1 : + (*(u16 *)(data + info->src.mask))) << info->xtrct.disp; + + /* If the mask is 0, then we don't need to worry about this input + * range checker value. + */ + if (new_mask) { + u16 new_high = + (*(u16 *)(data + info->src.last)) << info->xtrct.disp; + u16 new_low = + (*(u16 *)(data + info->src.val)) << info->xtrct.disp; + u8 range_idx = info->entry.val; + + range_buf->checker_cfg[range_idx].low_boundary = + cpu_to_be16(new_low); + range_buf->checker_cfg[range_idx].high_boundary = + cpu_to_be16(new_high); + range_buf->checker_cfg[range_idx].mask = cpu_to_be16(new_mask); + + /* Indicate which range checker is being used */ + *range |= BIT(range_idx); + } +} + +/** + * ice_flow_acl_frmt_entry_fld - Partially format ACL entry for a given field + * @fld: number of the given field + * @info: info about the field + * @buf: buffer containing the entry + * @dontcare: buffer containing don't care mask for entry + * @data: pointer to a data buffer containing flow entry's match values/masks + */ +static void +ice_flow_acl_frmt_entry_fld(u16 fld, struct ice_flow_fld_info *info, u8 *buf, + u8 *dontcare, u8 *data) +{ + u16 dst, src, mask, k, end_disp, tmp_s = 0, tmp_m = 0; + bool use_mask = false; + u8 disp; + + src = info->src.val; + mask = info->src.mask; + dst = info->entry.val - ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + disp = info->xtrct.disp % BITS_PER_BYTE; + + if (mask != ICE_FLOW_FLD_OFF_INVAL) + use_mask = true; + + for (k = 0; k < info->entry.last; k++, dst++) { + /* Add overflow bits from previous byte */ + buf[dst] = (tmp_s & 0xff00) >> 8; + + /* If mask is not valid, tmp_m is always zero, so just setting + * dontcare to 0 (no masked bits). 
If mask is valid, pulls in + * overflow bits of mask from prev byte + */ + dontcare[dst] = (tmp_m & 0xff00) >> 8; + + /* If there is displacement, last byte will only contain + * displaced data, but there is no more data to read from user + * buffer, so skip so as not to potentially read beyond end of + * user buffer + */ + if (!disp || k < info->entry.last - 1) { + /* Store shifted data to use in next byte */ + tmp_s = data[src++] << disp; + + /* Add current (shifted) byte */ + buf[dst] |= tmp_s & 0xff; + + /* Handle mask if valid */ + if (use_mask) { + tmp_m = (~data[mask++] & 0xff) << disp; + dontcare[dst] |= tmp_m & 0xff; + } + } + } + + /* Fill in don't care bits at beginning of field */ + if (disp) { + dst = info->entry.val - ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX; + for (k = 0; k < disp; k++) + dontcare[dst] |= BIT(k); + } + + end_disp = (disp + ice_flds_info[fld].size) % BITS_PER_BYTE; + + /* Fill in don't care bits at end of field */ + if (end_disp) { + dst = info->entry.val - ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX + + info->entry.last - 1; + for (k = end_disp; k < BITS_PER_BYTE; k++) + dontcare[dst] |= BIT(k); + } +} + +/** + * ice_flow_acl_frmt_entry - Format ACL entry + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * @e: pointer to the flow entry + * @data: pointer to a data buffer containing flow entry's match values/masks + * @acts: array of actions to be performed on a match + * @acts_cnt: number of actions + * + * Formats the key (and key_inverse) to be matched from the data passed in, + * along with data from the flow profile. This key/key_inverse pair makes up + * the 'entry' for an ACL flow entry. + */ +static enum ice_status +ice_flow_acl_frmt_entry(struct ice_hw *hw, struct ice_flow_prof *prof, + struct ice_flow_entry *e, u8 *data, + struct ice_flow_action *acts, u8 acts_cnt) +{ + u8 *buf = NULL, *dontcare = NULL, *key = NULL, range = 0, dir_flag_msk; + struct ice_aqc_acl_profile_ranges *range_buf = NULL; + enum ice_status status; + bool cnt_alloc; + u8 prof_id = 0; + u16 i, buf_sz; + + status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id, &prof_id); + if (status) + return status; + + /* Format the result action */ + + status = ice_flow_acl_check_actions(hw, acts, acts_cnt, &cnt_alloc); + if (status) + return status; + + status = ICE_ERR_NO_MEMORY; + + e->acts = devm_kmemdup(ice_hw_to_dev(hw), acts, + acts_cnt * sizeof(*acts), GFP_KERNEL); + if (!e->acts) + goto out; + + e->acts_cnt = acts_cnt; + + /* Format the matching data */ + buf_sz = prof->cfg.scen->width; + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_sz, GFP_KERNEL); + if (!buf) + goto out; + + dontcare = devm_kzalloc(ice_hw_to_dev(hw), buf_sz, GFP_KERNEL); + if (!dontcare) + goto out; + + /* 'key' buffer will store both key and key_inverse, so must be twice + * size of buf + */ + key = devm_kzalloc(ice_hw_to_dev(hw), buf_sz * 2, GFP_KERNEL); + if (!key) + goto out; + + range_buf = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(struct ice_aqc_acl_profile_ranges), + GFP_KERNEL); + if (!range_buf) + goto out; + + /* Set don't care mask to all 1's to start, will zero out used bytes */ + memset(dontcare, 0xff, buf_sz); + + for (i = 0; i < prof->segs_cnt; i++) { + struct ice_flow_seg_info *seg = &prof->segs[i]; + u8 j; + + for_each_set_bit(j, (unsigned long *)&seg->match, + ICE_FLOW_FIELD_IDX_MAX) { + struct ice_flow_fld_info *info = &seg->fields[j]; + + if (info->type == ICE_FLOW_FLD_TYPE_RANGE) + ice_flow_acl_frmt_entry_range(j, info, + range_buf, data, + &range); + else + 
ice_flow_acl_frmt_entry_fld(j, info, buf,
+							    dontcare, data);
+		}
+
+		for (j = 0; j < seg->raws_cnt; j++) {
+			struct ice_flow_fld_info *info = &seg->raws[j].info;
+			u16 dst, src, mask, k;
+			bool use_mask = false;
+
+			src = info->src.val;
+			dst = info->entry.val -
+				ICE_AQC_ACL_PROF_BYTE_SEL_START_IDX;
+			mask = info->src.mask;
+
+			if (mask != ICE_FLOW_FLD_OFF_INVAL)
+				use_mask = true;
+
+			for (k = 0; k < info->entry.last; k++, dst++) {
+				buf[dst] = data[src++];
+				if (use_mask)
+					dontcare[dst] = ~data[mask++];
+				else
+					dontcare[dst] = 0;
+			}
+		}
+	}
+
+	buf[prof->cfg.scen->pid_idx] = (u8)prof_id;
+	dontcare[prof->cfg.scen->pid_idx] = 0;
+
+	/* Format the buffer for direction flags */
+	dir_flag_msk = BIT(ICE_FLG_PKT_DIR);
+
+	if (prof->dir == ICE_FLOW_RX)
+		buf[prof->cfg.scen->pkt_dir_idx] = dir_flag_msk;
+
+	if (range) {
+		buf[prof->cfg.scen->rng_chk_idx] = range;
+		/* Mark any unused range checkers as don't care */
+		dontcare[prof->cfg.scen->rng_chk_idx] = ~range;
+		e->range_buf = range_buf;
+	} else {
+		devm_kfree(ice_hw_to_dev(hw), range_buf);
+	}
+
+	status = ice_set_key(key, buf_sz * 2, buf, NULL, dontcare, NULL, 0,
+			     buf_sz);
+	if (status)
+		goto out;
+
+	e->entry = key;
+	e->entry_sz = buf_sz * 2;
+
+out:
+	if (buf)
+		devm_kfree(ice_hw_to_dev(hw), buf);
+
+	if (dontcare)
+		devm_kfree(ice_hw_to_dev(hw), dontcare);
+
+	if (status && key)
+		devm_kfree(ice_hw_to_dev(hw), key);
+
+	if (status && range_buf) {
+		devm_kfree(ice_hw_to_dev(hw), range_buf);
+		e->range_buf = NULL;
+	}
+
+	if (status && e->acts) {
+		devm_kfree(ice_hw_to_dev(hw), e->acts);
+		e->acts = NULL;
+		e->acts_cnt = 0;
+	}
+
+	if (status && cnt_alloc)
+		ice_flow_acl_free_act_cntr(hw, acts, acts_cnt);
+
+	return status;
+}
+
+/**
+ * ice_flow_acl_find_scen_entry_cond - Find an ACL scenario entry that matches
+ *				       the compared data.
+ * @prof: pointer to flow profile
+ * @e: pointer to the flow entry to compare against
+ * @do_chg_action: decide if we want to change the ACL action
+ * @do_add_entry: decide if we want to add the new ACL entry
+ * @do_rem_entry: decide if we want to remove the current ACL entry
+ *
+ * Find an ACL scenario entry that matches the compared data. At the same
+ * time, this function also figures out:
+ * a/ If we want to change the ACL action
+ * b/ If we want to add the new ACL entry
+ * c/ If we want to remove the current ACL entry
+ */
+static struct ice_flow_entry *
+ice_flow_acl_find_scen_entry_cond(struct ice_flow_prof *prof,
+				  struct ice_flow_entry *e, bool *do_chg_action,
+				  bool *do_add_entry, bool *do_rem_entry)
+{
+	struct ice_flow_entry *p, *return_entry = NULL;
+	u8 i, j;
+
+	/* Check if:
+	 * a/ There exists an entry with the same matching data, but different
+	 *    priority; then we remove this existing ACL entry and add the new
+	 *    entry to the ACL scenario.
+	 * b/ There exists an entry with the same matching data, priority, and
+	 *    result action; then we do nothing.
+	 * c/ There exists an entry with the same matching data and priority,
+	 *    but a different action; then we only change the entry's action.
+	 * d/ Else, we add this new entry to the ACL scenario.
+	 */
+	*do_chg_action = false;
+	*do_add_entry = true;
+	*do_rem_entry = false;
+	list_for_each_entry(p, &prof->entries, l_entry) {
+		if (memcmp(p->entry, e->entry, p->entry_sz))
+			continue;
+
+		/* From this point, we have the same matching_data. */
+		*do_add_entry = false;
+		return_entry = p;
+
+		if (p->priority != e->priority) {
+			/* matching data && !priority */
+			*do_add_entry = true;
+			*do_rem_entry = true;
+			break;
+		}
+
+		/* From this point, we will have matching_data && priority */
+		if (p->acts_cnt != e->acts_cnt)
+			*do_chg_action = true;
+		for (i = 0; i < p->acts_cnt; i++) {
+			bool found_not_match = false;
+
+			for (j = 0; j < e->acts_cnt; j++)
+				if (memcmp(&p->acts[i], &e->acts[j],
+					   sizeof(struct ice_flow_action))) {
+					found_not_match = true;
+					break;
+				}
+
+			if (found_not_match) {
+				*do_chg_action = true;
+				break;
+			}
+		}
+
+		/* (do_chg_action = true) means:
+		 * matching_data && priority && !result_action
+		 * (do_chg_action = false) means:
+		 * matching_data && priority && result_action
+		 */
+		break;
+	}
+
+	return return_entry;
+}
+
+/**
+ * ice_flow_acl_convert_to_acl_prio - Convert to ACL priority
+ * @p: flow priority
+ */
+static enum ice_acl_entry_prio
+ice_flow_acl_convert_to_acl_prio(enum ice_flow_priority p)
+{
+	enum ice_acl_entry_prio acl_prio;
+
+	switch (p) {
+	case ICE_FLOW_PRIO_LOW:
+		acl_prio = ICE_ACL_PRIO_LOW;
+		break;
+	case ICE_FLOW_PRIO_NORMAL:
+		acl_prio = ICE_ACL_PRIO_NORMAL;
+		break;
+	case ICE_FLOW_PRIO_HIGH:
+		acl_prio = ICE_ACL_PRIO_HIGH;
+		break;
+	default:
+		acl_prio = ICE_ACL_PRIO_NORMAL;
+		break;
+	}
+
+	return acl_prio;
+}
+
+/**
+ * ice_flow_acl_union_rng_chk - Perform union operation between two
+ *				range checker buffers
+ * @dst_buf: pointer to destination range checker buffer
+ * @src_buf: pointer to source range checker buffer
+ *
+ * Compute the union of the dst_buf and src_buf range checker buffers and
+ * save the result back to dst_buf.
+ */
+static enum ice_status
+ice_flow_acl_union_rng_chk(struct ice_aqc_acl_profile_ranges *dst_buf,
+			   struct ice_aqc_acl_profile_ranges *src_buf)
+{
+	u8 i, j;
+
+	if (!dst_buf || !src_buf)
+		return ICE_ERR_BAD_PTR;
+
+	for (i = 0; i < ICE_AQC_ACL_PROF_RANGES_NUM_CFG; i++) {
+		struct ice_acl_rng_data *cfg_data = NULL, *in_data;
+		bool will_populate = false;
+
+		in_data = &src_buf->checker_cfg[i];
+
+		if (!in_data->mask)
+			break;
+
+		for (j = 0; j < ICE_AQC_ACL_PROF_RANGES_NUM_CFG; j++) {
+			cfg_data = &dst_buf->checker_cfg[j];
+
+			if (!cfg_data->mask ||
+			    !memcmp(cfg_data, in_data,
+				    sizeof(struct ice_acl_rng_data))) {
+				will_populate = true;
+				break;
+			}
+		}
+
+		if (will_populate) {
+			memcpy(cfg_data, in_data,
+			       sizeof(struct ice_acl_rng_data));
+		} else {
+			/* No available slot left to program range checker */
+			return ICE_ERR_MAX_LIMIT;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ice_flow_acl_add_scen_entry_sync - Add entry to ACL scenario sync
+ * @hw: pointer to the hardware structure
+ * @prof: pointer to flow profile
+ * @entry: double pointer to the flow entry
+ *
+ * For this function, we will look at the current added entries in the
+ * corresponding ACL scenario. Then, we will perform matching logic to
+ * see if we want to add/modify/do nothing with this new entry.
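+ *
+ * The possible outcomes are: add the new entry as-is, replace an existing
+ * entry that has the same match data but a different priority, update only
+ * the action and/or range-checker data of an existing entry, or do nothing.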
+ */
+static enum ice_status
+ice_flow_acl_add_scen_entry_sync(struct ice_hw *hw, struct ice_flow_prof *prof,
+				 struct ice_flow_entry **entry)
+{
+	bool do_add_entry, do_rem_entry, do_chg_action, do_chg_rng_chk;
+	struct ice_aqc_acl_profile_ranges query_rng_buf, cfg_rng_buf;
+	struct ice_acl_act_entry *acts = NULL;
+	struct ice_flow_entry *exist;
+	enum ice_status status = 0;
+	struct ice_flow_entry *e;
+	u8 i;
+
+	if (!entry || !(*entry) || !prof)
+		return ICE_ERR_BAD_PTR;
+
+	e = *entry;
+
+	do_chg_rng_chk = false;
+	if (e->range_buf) {
+		u8 prof_id = 0;
+
+		status = ice_flow_get_hw_prof(hw, ICE_BLK_ACL, prof->id,
+					      &prof_id);
+		if (status)
+			return status;
+
+		/* Query the current range-checker value in FW */
+		status = ice_query_acl_prof_ranges(hw, prof_id, &query_rng_buf,
+						   NULL);
+		if (status)
+			return status;
+		memcpy(&cfg_rng_buf, &query_rng_buf,
+		       sizeof(struct ice_aqc_acl_profile_ranges));
+
+		/* Generate the new range-checker value */
+		status = ice_flow_acl_union_rng_chk(&cfg_rng_buf, e->range_buf);
+		if (status)
+			return status;
+
+		/* Reconfigure the range checker if the buffer has changed. */
+		if (memcmp(&query_rng_buf, &cfg_rng_buf,
+			   sizeof(struct ice_aqc_acl_profile_ranges))) {
+			status = ice_prog_acl_prof_ranges(hw, prof_id,
+							  &cfg_rng_buf, NULL);
+			if (status)
+				return status;
+
+			do_chg_rng_chk = true;
+		}
+	}
+
+	/* Figure out if we want to change the ACL action, add the new ACL
+	 * entry, and/or remove the current ACL entry.
+	 */
+	exist = ice_flow_acl_find_scen_entry_cond(prof, e, &do_chg_action,
+						  &do_add_entry, &do_rem_entry);
+	if (do_rem_entry) {
+		status = ice_flow_rem_entry_sync(hw, ICE_BLK_ACL, exist);
+		if (status)
+			return status;
+	}
+
+	/* Prepare the result action buffer */
+	acts = devm_kcalloc(ice_hw_to_dev(hw), e->acts_cnt,
+			    sizeof(struct ice_acl_act_entry), GFP_KERNEL);
+	if (!acts)
+		return ICE_ERR_NO_MEMORY;
+
+	for (i = 0; i < e->acts_cnt; i++)
+		memcpy(&acts[i], &e->acts[i].data.acl_act,
+		       sizeof(struct ice_acl_act_entry));
+
+	if (do_add_entry) {
+		enum ice_acl_entry_prio prio;
+		u8 *keys, *inverts;
+		u16 entry_idx;
+
+		keys = (u8 *)e->entry;
+		inverts = keys + (e->entry_sz / 2);
+		prio = ice_flow_acl_convert_to_acl_prio(e->priority);
+
+		status = ice_acl_add_entry(hw, prof->cfg.scen, prio, keys,
+					   inverts, acts, e->acts_cnt,
+					   &entry_idx);
+		if (status)
+			goto out;
+
+		e->scen_entry_idx = entry_idx;
+		list_add(&e->l_entry, &prof->entries);
+	} else {
+		if (do_chg_action) {
+			/* Update the SW copy of the existing entry's actions
+			 * with e's action information
+			 */
+			devm_kfree(ice_hw_to_dev(hw), exist->acts);
+			exist->acts_cnt = e->acts_cnt;
+			exist->acts = devm_kcalloc(ice_hw_to_dev(hw),
+						   exist->acts_cnt,
+						   sizeof(struct ice_flow_action),
+						   GFP_KERNEL);
+			if (!exist->acts) {
+				status = ICE_ERR_NO_MEMORY;
+				goto out;
+			}
+
+			memcpy(exist->acts, e->acts,
+			       sizeof(struct ice_flow_action) * e->acts_cnt);
+
+			status = ice_acl_prog_act(hw, prof->cfg.scen, acts,
+						  e->acts_cnt,
+						  exist->scen_entry_idx);
+			if (status)
+				goto out;
+		}
+
+		if (do_chg_rng_chk) {
+			/* In this case, we want to update the range-checker
+			 * information of the existing entry
+			 */
+			status = ice_flow_acl_union_rng_chk(exist->range_buf,
+							    e->range_buf);
+			if (status)
+				goto out;
+		}
+
+		/* As we don't add the new entry to our SW DB, deallocate its
+		 * memory and return the existing entry to the caller
+		 */
+		ice_dealloc_flow_entry(hw, e);
+		*(entry) = exist;
+	}
+out:
+	devm_kfree(ice_hw_to_dev(hw), acts);
+
+	return status;
+}
+
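+/* Illustrative sketch (not part of this change): callers normally reach the
+ * scenario-add path above through ice_flow_add_entry(). The profile ID,
+ * entry ID, and match data below are hypothetical:
+ *
+ *	struct ice_flow_action act = { .type = ICE_FLOW_ACT_DROP };
+ *	u64 entry_h;
+ *
+ *	status = ice_flow_add_entry(hw, ICE_BLK_ACL, prof_id, entry_id,
+ *				    vsi_handle, ICE_FLOW_PRIO_NORMAL,
+ *				    data, &act, 1, &entry_h);
+ */
+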
+/** + * ice_flow_acl_add_scen_entry - Add entry to ACL scenario + * @hw: pointer to the hardware structure + * @prof: pointer to flow profile + * @e: double pointer to the flow entry + */ +static enum ice_status +ice_flow_acl_add_scen_entry(struct ice_hw *hw, struct ice_flow_prof *prof, + struct ice_flow_entry **e) +{ + enum ice_status status; + + mutex_lock(&prof->entries_lock); + status = ice_flow_acl_add_scen_entry_sync(hw, prof, e); + mutex_unlock(&prof->entries_lock); + + return status; +} + +/** + * ice_flow_add_entry - Add a flow entry + * @hw: pointer to the HW struct + * @blk: classification stage + * @prof_id: ID of the profile to add a new flow entry to + * @entry_id: unique ID to identify this flow entry + * @vsi_handle: software VSI handle for the flow entry + * @prio: priority of the flow entry + * @data: pointer to a data buffer containing flow entry's match values/masks + * @acts: arrays of actions to be performed on a match + * @acts_cnt: number of actions + * @entry_h: pointer to buffer that receives the new flow entry's handle + */ +enum ice_status +ice_flow_add_entry(struct ice_hw *hw, enum ice_block blk, u64 prof_id, + u64 entry_id, u16 vsi_handle, enum ice_flow_priority prio, + void *data, struct ice_flow_action *acts, u8 acts_cnt, + u64 *entry_h) +{ + struct ice_flow_entry *e = NULL; + struct ice_flow_prof *prof; + enum ice_status status = 0; + + /* ACL entries must indicate an action */ + if (blk == ICE_BLK_ACL && (!acts || !acts_cnt)) + return ICE_ERR_PARAM; + + + /* No flow entry data is expected for RSS */ + if (!entry_h || (!data && blk != ICE_BLK_RSS)) + return ICE_ERR_BAD_PTR; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&hw->fl_profs_locks[blk]); + + prof = ice_flow_find_prof_id(hw, blk, prof_id); + if (!prof) { + status = ICE_ERR_DOES_NOT_EXIST; + } else { + /* Allocate memory for the entry being added and associate + * the VSI to the found flow profile + */ + e = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*e), GFP_KERNEL); + if (!e) + status = ICE_ERR_NO_MEMORY; + else + status = ice_flow_assoc_prof(hw, blk, prof, vsi_handle); + } + + mutex_unlock(&hw->fl_profs_locks[blk]); + if (status) + goto out; + + e->id = entry_id; + e->vsi_handle = vsi_handle; + e->prof = prof; + e->priority = prio; + + switch (blk) { + case ICE_BLK_FD: + case ICE_BLK_RSS: + break; + case ICE_BLK_ACL: + /* ACL will handle the entry management */ + if (ice_dcf_is_acl_capable(hw)) { + status = ICE_ERR_IN_USE; + goto out; + } + status = ice_flow_acl_frmt_entry(hw, prof, e, (u8 *)data, acts, + acts_cnt); + if (status) + goto out; + + status = ice_flow_acl_add_scen_entry(hw, prof, &e); + if (status) + goto out; + + break; + default: + status = ICE_ERR_NOT_IMPL; + goto out; + } + + if (blk != ICE_BLK_ACL) { + /* ACL will handle the entry management */ + mutex_lock(&prof->entries_lock); + list_add(&e->l_entry, &prof->entries); + mutex_unlock(&prof->entries_lock); + } + + *entry_h = ICE_FLOW_ENTRY_HNDL(e); + +out: + if (status && e) { + if (e->entry) + devm_kfree(ice_hw_to_dev(hw), e->entry); + devm_kfree(ice_hw_to_dev(hw), e); + } + + return status; +} + +/** + * ice_flow_rem_entry - Remove a flow entry + * @hw: pointer to the HW struct + * @blk: classification stage + * @entry_h: handle to the flow entry to be removed + */ +enum ice_status ice_flow_rem_entry(struct ice_hw *hw, enum ice_block blk, + u64 entry_h) +{ + struct ice_flow_entry *entry; + struct ice_flow_prof *prof; + enum ice_status status = 0; + + if (entry_h == ICE_FLOW_ENTRY_HANDLE_INVAL) + 
return ICE_ERR_PARAM; + + entry = ICE_FLOW_ENTRY_PTR(entry_h); + + /* Retain the pointer to the flow profile as the entry will be freed */ + prof = entry->prof; + + if (prof) { + mutex_lock(&prof->entries_lock); + status = ice_flow_rem_entry_sync(hw, blk, entry); + mutex_unlock(&prof->entries_lock); + } + + return status; +} + +/** + * ice_flow_set_fld_ext - specifies locations of field from entry's input buffer + * @seg: packet segment the field being set belongs to + * @fld: field to be set + * @field_type: type of the field + * @val_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of the value to match from + * entry's input buffer + * @mask_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of mask value from entry's + * input buffer + * @last_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of last/upper value from + * entry's input buffer + * + * This helper function stores information of a field being matched, including + * the type of the field and the locations of the value to match, the mask, and + * the upper-bound value in the start of the input buffer for a flow entry. + * This function should only be used for fixed-size data structures. + * + * This function also opportunistically determines the protocol headers to be + * present based on the fields being set. Some fields cannot be used alone to + * determine the protocol headers present. Sometimes, fields for particular + * protocol headers are not matched. In those cases, the protocol headers + * must be explicitly set. + */ +static void +ice_flow_set_fld_ext(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + enum ice_flow_fld_match_type field_type, u16 val_loc, + u16 mask_loc, u16 last_loc) +{ + u64 bit = BIT_ULL(fld); + + seg->match |= bit; + if (field_type == ICE_FLOW_FLD_TYPE_RANGE) + seg->range |= bit; + + seg->fields[fld].type = field_type; + seg->fields[fld].src.val = val_loc; + seg->fields[fld].src.mask = mask_loc; + seg->fields[fld].src.last = last_loc; + + ICE_FLOW_SET_HDRS(seg, ice_flds_info[fld].hdr); +} + +/** + * ice_flow_set_fld - specifies locations of field from entry's input buffer + * @seg: packet segment the field being set belongs to + * @fld: field to be set + * @val_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of the value to match from + * entry's input buffer + * @mask_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of mask value from entry's + * input buffer + * @last_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of last/upper value from + * entry's input buffer + * @range: indicate if field being matched is to be in a range + * + * This function specifies the locations, in the form of byte offsets from the + * start of the input buffer for a flow entry, from where the value to match, + * the mask value, and upper value can be extracted. These locations are then + * stored in the flow profile. When adding a flow entry associated with the + * flow profile, these locations will be used to quickly extract the values and + * create the content of a match entry. This function should only be used for + * fixed-size data structures. + */ +void +ice_flow_set_fld(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + u16 val_loc, u16 mask_loc, u16 last_loc, bool range) +{ + enum ice_flow_fld_match_type t = range ? 
+		ICE_FLOW_FLD_TYPE_RANGE : ICE_FLOW_FLD_TYPE_REG;
+
+	ice_flow_set_fld_ext(seg, fld, t, val_loc, mask_loc, last_loc);
+}
+
+/**
+ * ice_flow_set_fld_prefix - sets locations of prefix field from entry's buf
+ * @seg: packet segment the field being set belongs to
+ * @fld: field to be set
+ * @val_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of the value to match from
+ *	     entry's input buffer
+ * @pref_loc: location of prefix value from entry's input buffer
+ * @pref_sz: size of the location holding the prefix value
+ *
+ * This function specifies the locations, in the form of byte offsets from the
+ * start of the input buffer for a flow entry, from where the value to match
+ * and the IPv4 prefix value can be extracted. These locations are then stored
+ * in the flow profile. When adding flow entries to the associated flow profile,
+ * these locations can be used to quickly extract the values to create the
+ * content of a match entry. This function should only be used for fixed-size
+ * data structures.
+ */
+void
+ice_flow_set_fld_prefix(struct ice_flow_seg_info *seg, enum ice_flow_field fld,
+			u16 val_loc, u16 pref_loc, u8 pref_sz)
+{
+	/* For this type of field, the "mask" location is for the prefix value's
+	 * location and the "last" location is for the size of the location of
+	 * the prefix value.
+	 */
+	ice_flow_set_fld_ext(seg, fld, ICE_FLOW_FLD_TYPE_PREFIX, val_loc,
+			     pref_loc, (u16)pref_sz);
+}
+
+/**
+ * ice_flow_add_fld_raw - sets locations of a raw field from entry's input buf
+ * @seg: packet segment the field being set belongs to
+ * @off: offset of the raw field from the beginning of the segment in bytes
+ * @len: length of the raw pattern to be matched
+ * @val_loc: location of the value to match from entry's input buffer
+ * @mask_loc: location of mask value from entry's input buffer
+ *
+ * This function specifies the offset of the raw field to be matched from the
+ * beginning of the specified packet segment, and the locations, in the form of
+ * byte offsets from the start of the input buffer for a flow entry, from where
+ * the value to match and the mask value are to be extracted. These locations
+ * are then stored in the flow profile. When adding flow entries to the
+ * associated flow profile, these locations can be used to quickly extract the
+ * values to create the content of a match entry. This function should only be
+ * used for fixed-size data structures.
+ */
+void
+ice_flow_add_fld_raw(struct ice_flow_seg_info *seg, u16 off, u8 len,
+		     u16 val_loc, u16 mask_loc)
+{
+	if (seg->raws_cnt < ICE_FLOW_SEG_RAW_FLD_MAX) {
+		seg->raws[seg->raws_cnt].off = off;
+		seg->raws[seg->raws_cnt].info.type = ICE_FLOW_FLD_TYPE_SIZE;
+		seg->raws[seg->raws_cnt].info.src.val = val_loc;
+		seg->raws[seg->raws_cnt].info.src.mask = mask_loc;
+		/* The "last" field is used to store the length of the field */
+		seg->raws[seg->raws_cnt].info.src.last = len;
+	}
+
+	/* Overflows of "raws" will be handled as an error condition later in
+	 * the flow when this information is processed.
+	 */
+	seg->raws_cnt++;
+}
+
+/**
+ * ice_flow_rem_vsi_prof - remove VSI from flow profile
+ * @hw: pointer to the hardware structure
+ * @blk: classification stage
+ * @vsi_handle: software VSI handle
+ * @prof_id: unique ID to identify this flow profile
+ *
+ * This function removes the flow entries associated with the input
+ * VSI handle and disassociates the VSI from the flow profile.
+ */
+enum ice_status ice_flow_rem_vsi_prof(struct ice_hw *hw, enum ice_block blk, u16 vsi_handle,
+				      u64 prof_id)
+{
+	struct ice_flow_prof *prof = NULL;
+	enum ice_status status = 0;
+
+	if (blk >= ICE_BLK_COUNT || !ice_is_vsi_valid(hw, vsi_handle))
+		return ICE_ERR_PARAM;
+
+	/* find flow profile pointer with input package block and profile ID */
+	prof = ice_flow_find_prof_id(hw, blk, prof_id);
+	if (!prof) {
+		ice_debug(hw, ICE_DBG_PKG,
+			  "Cannot find flow profile id=%llu\n", prof_id);
+		return ICE_ERR_DOES_NOT_EXIST;
+	}
+
+	/* Remove all remaining flow entries before removing the flow profile */
+	if (!list_empty(&prof->entries)) {
+		struct ice_flow_entry *e, *t;
+
+		mutex_lock(&prof->entries_lock);
+		list_for_each_entry_safe(e, t, &prof->entries, l_entry) {
+			if (e->vsi_handle != vsi_handle)
+				continue;
+
+			status = ice_flow_rem_entry_sync(hw, blk, e);
+			if (status)
+				break;
+		}
+		mutex_unlock(&prof->entries_lock);
+	}
+	if (status)
+		return status;
+
+	/* disassociate the flow profile from sw VSI handle */
+	status = ice_flow_disassoc_prof(hw, blk, prof, vsi_handle);
+	if (status)
+		ice_debug(hw, ICE_DBG_PKG,
+			  "ice_flow_disassoc_prof() failed with status=%d\n",
+			  status);
+	return status;
+}
+
+#define ICE_FLOW_RSS_SEG_HDR_L2_MASKS \
+	(ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN)
+
+#define ICE_FLOW_RSS_SEG_HDR_L3_MASKS \
+	(ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV6)
+
+#define ICE_FLOW_RSS_SEG_HDR_L4_MASKS \
+	(ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_SCTP)
+
+#define ICE_FLOW_RSS_SEG_HDR_VAL_MASKS \
+	(ICE_FLOW_RSS_SEG_HDR_L2_MASKS | \
+	 ICE_FLOW_RSS_SEG_HDR_L3_MASKS | \
+	 ICE_FLOW_RSS_SEG_HDR_L4_MASKS)
+
+/**
+ * ice_flow_set_rss_seg_info - setup packet segments for RSS
+ * @segs: pointer to the flow field segment(s)
+ * @seg_cnt: segment count
+ * @cfg: configure parameters
+ *
+ * Helper function to extract fields from hash bitmap and use flow
+ * header value to set flow field segment for further use in flow
+ * profile entry or removal.
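+ *
+ * Each bit set in @cfg->hash_flds selects one ice_flow_field; the field is
+ * recorded via ice_flow_set_fld() with ICE_FLOW_FLD_OFF_INVAL locations,
+ * since RSS hashes on fields rather than matching entry data.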
+ */ +static enum ice_status +ice_flow_set_rss_seg_info(struct ice_flow_seg_info *segs, u8 seg_cnt, + const struct ice_rss_hash_cfg *cfg) +{ + struct ice_flow_seg_info *seg; + u64 val; + u8 i; + + /* set inner most segment */ + seg = &segs[seg_cnt - 1]; + + for_each_set_bit(i, (const unsigned long *)&cfg->hash_flds, + ICE_FLOW_FIELD_IDX_MAX) + ice_flow_set_fld(seg, (enum ice_flow_field)i, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + + ICE_FLOW_SET_HDRS(seg, cfg->addl_hdrs); + + /* set outer most header */ + if (cfg->hdr_type == ICE_RSS_INNER_HEADERS_W_OUTER_IPV4) + segs[ICE_RSS_OUTER_HEADERS].hdrs |= ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_FRAG | + ICE_FLOW_SEG_HDR_IPV_OTHER; + else if (cfg->hdr_type == ICE_RSS_INNER_HEADERS_W_OUTER_IPV6) + segs[ICE_RSS_OUTER_HEADERS].hdrs |= ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_FRAG | + ICE_FLOW_SEG_HDR_IPV_OTHER; + + if (seg->hdrs & ~ICE_FLOW_RSS_SEG_HDR_VAL_MASKS & + ~ICE_FLOW_RSS_HDRS_INNER_MASK & ~ICE_FLOW_SEG_HDR_IPV_OTHER & + ~ICE_FLOW_SEG_HDR_IPV_FRAG) + return ICE_ERR_PARAM; + + val = (u64)(seg->hdrs & ICE_FLOW_RSS_SEG_HDR_L3_MASKS); + if (val && !is_power_of_2(val)) + return ICE_ERR_CFG; + + val = (u64)(seg->hdrs & ICE_FLOW_RSS_SEG_HDR_L4_MASKS); + if (val && !is_power_of_2(val)) + return ICE_ERR_CFG; + + return 0; +} + +/** + * ice_rem_vsi_rss_list - remove VSI from RSS list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * + * Remove the VSI from all RSS configurations in the list. + */ +void ice_rem_vsi_rss_list(struct ice_hw *hw, u16 vsi_handle) +{ + struct ice_rss_cfg *r, *tmp; + + if (list_empty(&hw->rss_list_head)) + return; + + mutex_lock(&hw->rss_locks); + list_for_each_entry_safe(r, tmp, &hw->rss_list_head, l_entry) + if (test_and_clear_bit(vsi_handle, r->vsis)) + if (bitmap_empty(r->vsis, ICE_MAX_VSI)) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + mutex_unlock(&hw->rss_locks); +} + +/** + * ice_rem_vsi_rss_cfg - remove RSS configurations associated with VSI + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * + * This function will iterate through all flow profiles and disassociate + * the VSI from that profile. If the flow profile has no VSIs it will + * be removed. 
+ */ +enum ice_status ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle) +{ + const enum ice_block blk = ICE_BLK_RSS; + struct ice_flow_prof *p, *t; + enum ice_status status = 0; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + if (list_empty(&hw->fl_profs[blk])) + return 0; + + mutex_lock(&hw->rss_locks); + list_for_each_entry_safe(p, t, &hw->fl_profs[blk], l_entry) + if (test_bit(vsi_handle, p->vsis)) { + status = ice_flow_disassoc_prof(hw, blk, p, vsi_handle); + if (status) + break; + + if (bitmap_empty(p->vsis, ICE_MAX_VSI)) { + status = ice_flow_rem_prof(hw, blk, p->id); + if (status) + break; + } + } + mutex_unlock(&hw->rss_locks); + + return status; +} + +/** + * ice_get_rss_hdr_type - get a RSS profile's header type + * @prof: RSS flow profile + */ +static enum ice_rss_cfg_hdr_type +ice_get_rss_hdr_type(struct ice_flow_prof *prof) +{ + enum ice_rss_cfg_hdr_type hdr_type = ICE_RSS_ANY_HEADERS; + + if (prof->segs_cnt == ICE_FLOW_SEG_SINGLE) { + hdr_type = ICE_RSS_OUTER_HEADERS; + } else if (prof->segs_cnt == ICE_FLOW_SEG_MAX) { + if (prof->segs[ICE_RSS_OUTER_HEADERS].hdrs == ICE_FLOW_SEG_HDR_NONE) + hdr_type = ICE_RSS_INNER_HEADERS; + if (prof->segs[ICE_RSS_OUTER_HEADERS].hdrs & ICE_FLOW_SEG_HDR_IPV4) + hdr_type = ICE_RSS_INNER_HEADERS_W_OUTER_IPV4; + if (prof->segs[ICE_RSS_OUTER_HEADERS].hdrs & ICE_FLOW_SEG_HDR_IPV6) + hdr_type = ICE_RSS_INNER_HEADERS_W_OUTER_IPV6; + } + + return hdr_type; +} + +/** + * ice_rem_rss_list - remove RSS configuration from list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @prof: pointer to flow profile + * + * Assumption: lock has already been acquired for RSS list + */ +static void +ice_rem_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof) +{ + enum ice_rss_cfg_hdr_type hdr_type; + struct ice_rss_cfg *r, *tmp; + + /* Search for RSS hash fields associated to the VSI that match the + * hash configurations associated to the flow profile. If found + * remove from the RSS entry list of the VSI context and delete entry. 
+ */ + hdr_type = ice_get_rss_hdr_type(prof); + list_for_each_entry_safe(r, tmp, &hw->rss_list_head, l_entry) + if (r->hash.hash_flds == prof->segs[prof->segs_cnt - 1].match && + r->hash.addl_hdrs == prof->segs[prof->segs_cnt - 1].hdrs && + r->hash.hdr_type == hdr_type) { + clear_bit(vsi_handle, r->vsis); + if (bitmap_empty(r->vsis, ICE_MAX_VSI)) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + return; + } +} + +/** + * ice_add_rss_list - add RSS configuration to list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @prof: pointer to flow profile + * + * Assumption: lock has already been acquired for RSS list + */ +static enum ice_status +ice_add_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof) +{ + enum ice_rss_cfg_hdr_type hdr_type; + struct ice_rss_cfg *r, *rss_cfg; + + hdr_type = ice_get_rss_hdr_type(prof); + list_for_each_entry(r, &hw->rss_list_head, l_entry) + if (r->hash.hash_flds == prof->segs[prof->segs_cnt - 1].match && + r->hash.addl_hdrs == prof->segs[prof->segs_cnt - 1].hdrs && + r->hash.hdr_type == hdr_type) { + set_bit(vsi_handle, r->vsis); + return 0; + } + + rss_cfg = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rss_cfg), + GFP_KERNEL); + if (!rss_cfg) + return ICE_ERR_NO_MEMORY; + + rss_cfg->hash.hash_flds = prof->segs[prof->segs_cnt - 1].match; + rss_cfg->hash.addl_hdrs = prof->segs[prof->segs_cnt - 1].hdrs; + rss_cfg->hash.hdr_type = hdr_type; + rss_cfg->hash.symm = prof->cfg.symm; + set_bit(vsi_handle, rss_cfg->vsis); + + list_add_tail(&rss_cfg->l_entry, &hw->rss_list_head); + + return 0; +} + +#define ICE_FLOW_PROF_HASH_S 0 +#define ICE_FLOW_PROF_HASH_M (0xFFFFFFFFULL << ICE_FLOW_PROF_HASH_S) +#define ICE_FLOW_PROF_HDR_S 32 +#define ICE_FLOW_PROF_HDR_M (0x3FFFFFFFULL << ICE_FLOW_PROF_HDR_S) +#define ICE_FLOW_PROF_ENCAP_S 62 +#define ICE_FLOW_PROF_ENCAP_M (0x3ULL << ICE_FLOW_PROF_ENCAP_S) + +/* Flow profile ID format: + * [0:31] - Packet match fields + * [32:61] - Protocol header + * [62:63] - Encapsulation flag: + * 0 if non-tunneled + * 1 if tunneled + * 2 for tunneled with outer ipv4 + * 3 for tunneled with outer ipv6 + */ +#define ICE_FLOW_GEN_PROFID(hash, hdr, encap) \ + ((u64)(((u64)(hash) & ICE_FLOW_PROF_HASH_M) | \ + (((u64)(hdr) << ICE_FLOW_PROF_HDR_S) & ICE_FLOW_PROF_HDR_M) | \ + (((u64)(encap) << ICE_FLOW_PROF_ENCAP_S) & ICE_FLOW_PROF_ENCAP_M))) + +static void +ice_rss_config_xor_word(struct ice_hw *hw, u8 prof_id, u8 src, u8 dst) +{ + u32 s = ((src % 4) << 3); /* byte shift */ + u32 v = dst | 0x80; /* value to program */ + u8 i = src / 4; /* register index */ + u32 reg; + + reg = rd32(hw, GLQF_HSYMM(prof_id, i)); + reg = (reg & ~(0xff << s)) | (v << s); + wr32(hw, GLQF_HSYMM(prof_id, i), reg); +} + +static void +ice_rss_config_xor(struct ice_hw *hw, u8 prof_id, u8 src, u8 dst, u8 len) +{ + int fv_last_word = + ICE_FLOW_SW_FIELD_VECTOR_MAX / ICE_FLOW_FV_EXTRACT_SZ - 1; + int i; + + for (i = 0; i < len; i++) { + ice_rss_config_xor_word(hw, prof_id, + /* Yes, field vector in GLQF_HSYMM and + * GLQF_HINSET is inversed! 
+ */ + fv_last_word - (src + i), + fv_last_word - (dst + i)); + ice_rss_config_xor_word(hw, prof_id, + fv_last_word - (dst + i), + fv_last_word - (src + i)); + } +} + +static void +ice_rss_update_symm(struct ice_hw *hw, + struct ice_flow_prof *prof) +{ + struct ice_prof_map *map; + u8 prof_id, m; + + mutex_lock(&hw->blk[ICE_BLK_RSS].es.prof_map_lock); + map = ice_search_prof_id(hw, ICE_BLK_RSS, prof->id); + if (map) + prof_id = map->prof_id; + mutex_unlock(&hw->blk[ICE_BLK_RSS].es.prof_map_lock); + if (!map) + return; + /* clear to default */ + for (m = 0; m < 6; m++) + wr32(hw, GLQF_HSYMM(prof_id, m), 0); + if (prof->cfg.symm) { + struct ice_flow_seg_info *seg = + &prof->segs[prof->segs_cnt - 1]; + + struct ice_flow_seg_xtrct *ipv4_src = + &seg->fields[ICE_FLOW_FIELD_IDX_IPV4_SA].xtrct; + struct ice_flow_seg_xtrct *ipv4_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_IPV4_DA].xtrct; + struct ice_flow_seg_xtrct *ipv6_src = + &seg->fields[ICE_FLOW_FIELD_IDX_IPV6_SA].xtrct; + struct ice_flow_seg_xtrct *ipv6_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_IPV6_DA].xtrct; + + struct ice_flow_seg_xtrct *tcp_src = + &seg->fields[ICE_FLOW_FIELD_IDX_TCP_SRC_PORT].xtrct; + struct ice_flow_seg_xtrct *tcp_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_TCP_DST_PORT].xtrct; + + struct ice_flow_seg_xtrct *udp_src = + &seg->fields[ICE_FLOW_FIELD_IDX_UDP_SRC_PORT].xtrct; + struct ice_flow_seg_xtrct *udp_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_UDP_DST_PORT].xtrct; + + struct ice_flow_seg_xtrct *sctp_src = + &seg->fields[ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT].xtrct; + struct ice_flow_seg_xtrct *sctp_dst = + &seg->fields[ICE_FLOW_FIELD_IDX_SCTP_DST_PORT].xtrct; + + /* xor IPv4 */ + if (ipv4_src->prot_id != 0 && ipv4_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + ipv4_src->idx, ipv4_dst->idx, 2); + + /* xor IPv6 */ + if (ipv6_src->prot_id != 0 && ipv6_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + ipv6_src->idx, ipv6_dst->idx, 8); + + /* xor TCP */ + if (tcp_src->prot_id != 0 && tcp_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + tcp_src->idx, tcp_dst->idx, 1); + + /* xor UDP */ + if (udp_src->prot_id != 0 && udp_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + udp_src->idx, udp_dst->idx, 1); + + /* xor SCTP */ + if (sctp_src->prot_id != 0 && sctp_dst->prot_id != 0) + ice_rss_config_xor(hw, prof_id, + sctp_src->idx, sctp_dst->idx, 1); + } +} + +/** + * ice_add_rss_cfg_sync - add an RSS configuration + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @cfg: configure parameters + * + * Assumption: lock has already been acquired for RSS list + */ +static enum ice_status +ice_add_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg) +{ + const enum ice_block blk = ICE_BLK_RSS; + struct ice_flow_prof *prof = NULL; + struct ice_flow_seg_info *segs; + enum ice_status status; + u8 segs_cnt; + + + segs_cnt = (cfg->hdr_type == ICE_RSS_OUTER_HEADERS) ? + ICE_FLOW_SEG_SINGLE : ICE_FLOW_SEG_MAX; + + segs = devm_kcalloc(ice_hw_to_dev(hw), segs_cnt, sizeof(*segs), + GFP_KERNEL); + if (!segs) + return ICE_ERR_NO_MEMORY; + + /* Construct the packet segment info from the hashed fields */ + status = ice_flow_set_rss_seg_info(segs, segs_cnt, cfg); + if (status) + goto exit; + + /* Search for a flow profile that has matching headers, hash fields + * and has the input VSI associated to it. If found, no further + * operations required and exit. 
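+	 * Otherwise the VSI may need to be moved off an old profile with the
+	 * same headers, attached to an existing profile with the same match
+	 * fields, or attached to a newly created profile.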
+	 */
+	prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt,
+					vsi_handle,
+					ICE_FLOW_FIND_PROF_CHK_FLDS |
+					ICE_FLOW_FIND_PROF_CHK_VSI);
+	if (prof) {
+		if (prof->cfg.symm == cfg->symm)
+			goto exit;
+		prof->cfg.symm = cfg->symm;
+		goto update_symm;
+	}
+
+	/* Check if a flow profile exists with the same protocol headers and
+	 * associated with the input VSI. If so, disassociate the VSI from
+	 * this profile. The VSI will be added to a new profile created with
+	 * the protocol header and new hash field configuration.
+	 */
+	prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt,
+					vsi_handle, ICE_FLOW_FIND_PROF_CHK_VSI);
+	if (prof) {
+		status = ice_flow_disassoc_prof(hw, blk, prof, vsi_handle);
+		if (!status)
+			ice_rem_rss_list(hw, vsi_handle, prof);
+		else
+			goto exit;
+
+		/* Remove profile if it has no VSIs associated */
+		if (bitmap_empty(prof->vsis, ICE_MAX_VSI)) {
+			status = ice_flow_rem_prof(hw, blk, prof->id);
+			if (status)
+				goto exit;
+		}
+	}
+
+	/* Search for a profile that has the same match fields only. If this
+	 * exists then associate the VSI to this profile.
+	 */
+	prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt,
+					vsi_handle,
+					ICE_FLOW_FIND_PROF_CHK_FLDS);
+	if (prof) {
+		if (prof->cfg.symm == cfg->symm) {
+			status = ice_flow_assoc_prof(hw, blk, prof,
+						     vsi_handle);
+			if (!status)
+				status = ice_add_rss_list(hw, vsi_handle,
+							  prof);
+		} else {
+			/* If a profile exists but with a different symmetric
+			 * requirement, just return an error.
+			 */
+			status = ICE_ERR_NOT_SUPPORTED;
+		}
+		goto exit;
+	}
+
+	/* Create a new flow profile with generated profile and packet
+	 * segment information.
+	 */
+	status = ice_flow_add_prof(hw, blk, ICE_FLOW_RX,
+				   ICE_FLOW_GEN_PROFID(cfg->hash_flds,
+						       segs[segs_cnt - 1].hdrs,
+						       cfg->hdr_type),
+				   segs, segs_cnt, NULL, 0, &prof);
+	if (status)
+		goto exit;
+
+	status = ice_flow_assoc_prof(hw, blk, prof, vsi_handle);
+	/* If association to a new flow profile failed then this profile can
+	 * be removed.
+	 */
+	if (status) {
+		ice_flow_rem_prof(hw, blk, prof->id);
+		goto exit;
+	}
+
+	/* Record the symmetric hash setting before adding the profile to the
+	 * RSS list, which copies it.
+	 */
+	prof->cfg.symm = cfg->symm;
+	status = ice_add_rss_list(hw, vsi_handle, prof);
+
+update_symm:
+	ice_rss_update_symm(hw, prof);
+
+exit:
+	devm_kfree(ice_hw_to_dev(hw), segs);
+	return status;
+}
+
+/**
+ * ice_add_rss_cfg - add an RSS configuration with specified hashed fields
+ * @hw: pointer to the hardware structure
+ * @vsi_handle: software VSI handle
+ * @cfg: configure parameters
+ *
+ * This function will generate a flow profile based on the input fields to
+ * hash on and the flow type, and use the VSI number to add a flow entry to
+ * the profile.
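+ *
+ * A minimal sketch (the header and hash-field choices are only examples):
+ *
+ *	struct ice_rss_hash_cfg rcfg = {
+ *		.addl_hdrs = ICE_FLOW_SEG_HDR_IPV4,
+ *		.hash_flds = BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) |
+ *			     BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA),
+ *		.hdr_type = ICE_RSS_OUTER_HEADERS,
+ *		.symm = false,
+ *	};
+ *
+ *	status = ice_add_rss_cfg(hw, vsi_handle, &rcfg);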
+ */ +enum ice_status +ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg) +{ + struct ice_rss_hash_cfg local_cfg; + enum ice_status status; + + if (!ice_is_vsi_valid(hw, vsi_handle) || + !cfg || cfg->hdr_type > ICE_RSS_ANY_HEADERS || + cfg->hash_flds == ICE_HASH_INVALID) + return ICE_ERR_PARAM; + + local_cfg = *cfg; + if (cfg->hdr_type < ICE_RSS_ANY_HEADERS) { + mutex_lock(&hw->rss_locks); + status = ice_add_rss_cfg_sync(hw, vsi_handle, &local_cfg); + mutex_unlock(&hw->rss_locks); + } else { + mutex_lock(&hw->rss_locks); + local_cfg.hdr_type = ICE_RSS_OUTER_HEADERS; + status = ice_add_rss_cfg_sync(hw, vsi_handle, &local_cfg); + if (!status) { + local_cfg.hdr_type = ICE_RSS_INNER_HEADERS; + status = ice_add_rss_cfg_sync(hw, vsi_handle, + &local_cfg); + } + mutex_unlock(&hw->rss_locks); + } + + return status; +} + +/** + * ice_rem_rss_cfg_sync - remove an existing RSS configuration + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @cfg: configure parameters + * + * Assumption: lock has already been acquired for RSS list + */ +static enum ice_status +ice_rem_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg) +{ + const enum ice_block blk = ICE_BLK_RSS; + struct ice_flow_seg_info *segs; + struct ice_flow_prof *prof; + enum ice_status status; + u8 segs_cnt; + + segs_cnt = (cfg->hdr_type == ICE_RSS_OUTER_HEADERS) ? + ICE_FLOW_SEG_SINGLE : ICE_FLOW_SEG_MAX; + segs = devm_kcalloc(ice_hw_to_dev(hw), segs_cnt, sizeof(*segs), + GFP_KERNEL); + if (!segs) + return ICE_ERR_NO_MEMORY; + + /* Construct the packet segment info from the hashed fields */ + status = ice_flow_set_rss_seg_info(segs, segs_cnt, cfg); + if (status) + goto out; + + prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt, + vsi_handle, + ICE_FLOW_FIND_PROF_CHK_FLDS); + if (!prof) { + status = ICE_ERR_DOES_NOT_EXIST; + goto out; + } + + status = ice_flow_disassoc_prof(hw, blk, prof, vsi_handle); + if (status) + goto out; + + /* Remove RSS configuration from VSI context before deleting + * the flow profile. + */ + ice_rem_rss_list(hw, vsi_handle, prof); + + if (bitmap_empty(prof->vsis, ICE_MAX_VSI)) + status = ice_flow_rem_prof(hw, blk, prof->id); + +out: + devm_kfree(ice_hw_to_dev(hw), segs); + return status; +} + +/** + * ice_rem_rss_cfg - remove an existing RSS config with matching hashed fields + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @cfg: configure parameters + * + * This function will lookup the flow profile based on the input + * hash field bitmap, iterate through the profile entry list of + * that profile and find entry associated with input VSI to be + * removed. Calls are made to underlying flow apis which will in + * turn build or update buffers for RSS XLT1 section. 
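 + * + * A minimal usage sketch follows; the field values are illustrative, and + * the configuration passed here must match the one used when the RSS + * entry was added: + * + *	struct ice_rss_hash_cfg cfg = { + *		.addl_hdrs = ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_TCP, + *		.hash_flds = ICE_HASH_TCP_IPV4, + *		.hdr_type = ICE_RSS_OUTER_HEADERS, + *		.symm = false, + *	}; + * + *	status = ice_rem_rss_cfg(hw, vsi_handle, &cfg);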
+ */ +enum ice_status +ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg) +{ + struct ice_rss_hash_cfg local_cfg; + enum ice_status status; + + if (!ice_is_vsi_valid(hw, vsi_handle) || + !cfg || cfg->hdr_type > ICE_RSS_ANY_HEADERS || + cfg->hash_flds == ICE_HASH_INVALID) + return ICE_ERR_PARAM; + + mutex_lock(&hw->rss_locks); + local_cfg = *cfg; + if (cfg->hdr_type < ICE_RSS_ANY_HEADERS) { + status = ice_rem_rss_cfg_sync(hw, vsi_handle, &local_cfg); + } else { + local_cfg.hdr_type = ICE_RSS_OUTER_HEADERS; + status = ice_rem_rss_cfg_sync(hw, vsi_handle, &local_cfg); + + if (!status) { + local_cfg.hdr_type = ICE_RSS_INNER_HEADERS; + status = ice_rem_rss_cfg_sync(hw, vsi_handle, + &local_cfg); + } + } + mutex_unlock(&hw->rss_locks); + + return status; +} + +/* Mapping of AVF hash bit fields to an L3-L4 hash combination. + * As the ice_flow_avf_hdr_field represent individual bit shifts in a hash, + * convert its values to their appropriate flow L3, L4 values. + */ +#define ICE_FLOW_AVF_RSS_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV4)) +#define ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP)) +#define ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP)) +#define ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS \ + (ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS | ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS | \ + ICE_FLOW_AVF_RSS_IPV4_MASKS | BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP)) + +#define ICE_FLOW_AVF_RSS_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV6)) +#define ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_UDP)) +#define ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP)) +#define ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS \ + (ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS | ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS | \ + ICE_FLOW_AVF_RSS_IPV6_MASKS | BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP)) + +/** + * ice_add_avf_rss_cfg - add an RSS configuration for AVF driver + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @avf_hash: hash bit fields (ICE_AVF_FLOW_FIELD_*) to configure + * + * This function will take the hash bitmap provided by the AVF driver via a + * message, convert it to ICE-compatible values, and configure RSS flow + * profiles. 
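 + * + * For example, a request consisting only of + * BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP) is first widened with the IPv4 L3 + * bits and then programmed as two configurations: ICE_FLOW_HASH_IPV4, + * followed by ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_TCP_PORT.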
+ */ +enum ice_status +ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 avf_hash) +{ + enum ice_status status = 0; + struct ice_rss_hash_cfg hcfg; + u64 hash_flds; + + if (avf_hash == ICE_AVF_FLOW_FIELD_INVALID || + !ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + /* Make sure no unsupported bits are specified */ + if (avf_hash & ~(ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS | + ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS)) + return ICE_ERR_CFG; + + hash_flds = avf_hash; + + /* Always create an L3 RSS configuration for any L4 RSS configuration */ + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS) + hash_flds |= ICE_FLOW_AVF_RSS_IPV4_MASKS; + + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS) + hash_flds |= ICE_FLOW_AVF_RSS_IPV6_MASKS; + + /* Create the corresponding RSS configuration for each valid hash bit */ + while (hash_flds) { + u64 rss_hash = ICE_HASH_INVALID; + + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS) { + if (hash_flds & ICE_FLOW_AVF_RSS_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4; + hash_flds &= ~ICE_FLOW_AVF_RSS_IPV4_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_TCP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_UDP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS; + } else if (hash_flds & + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP)) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_SCTP_PORT; + hash_flds &= + ~BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP); + } + } else if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS) { + if (hash_flds & ICE_FLOW_AVF_RSS_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6; + hash_flds &= ~ICE_FLOW_AVF_RSS_IPV6_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_TCP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_UDP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS; + } else if (hash_flds & + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP)) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_SCTP_PORT; + hash_flds &= + ~BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP); + } + } + + if (rss_hash == ICE_HASH_INVALID) + return ICE_ERR_OUT_OF_RANGE; + + hcfg.addl_hdrs = ICE_FLOW_SEG_HDR_NONE; + hcfg.hash_flds = rss_hash; + hcfg.symm = false; + hcfg.hdr_type = ICE_RSS_ANY_HEADERS; + status = ice_add_rss_cfg(hw, vsi_handle, &hcfg); + if (status) + break; + } + + return status; +} + +/** + * ice_replay_rss_cfg - replay RSS configurations associated with VSI + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + */ +enum ice_status ice_replay_rss_cfg(struct ice_hw *hw, u16 vsi_handle) +{ + enum ice_status status = 0; + struct ice_rss_cfg *r; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&hw->rss_locks); + list_for_each_entry(r, &hw->rss_list_head, l_entry) { + if (test_bit(vsi_handle, r->vsis)) { + status = ice_add_rss_cfg_sync(hw, vsi_handle, &r->hash); + if (status) + break; + } + } + mutex_unlock(&hw->rss_locks); + + return status; +} + +/** + * ice_get_rss_cfg - returns hashed fields for the given header types + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @hdrs: protocol header type + * + * This function will return the match fields of the first instance of flow + * profile having the given header 
types and containing input VSI + */ +u64 ice_get_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u32 hdrs) +{ + u64 rss_hash = ICE_HASH_INVALID; + struct ice_rss_cfg *r; + + /* verify if the protocol header is non zero and VSI is valid */ + if (hdrs == ICE_FLOW_SEG_HDR_NONE || !ice_is_vsi_valid(hw, vsi_handle)) + return ICE_HASH_INVALID; + + mutex_lock(&hw->rss_locks); + list_for_each_entry(r, &hw->rss_list_head, l_entry) + if (test_bit(vsi_handle, r->vsis) && + r->hash.addl_hdrs == hdrs) { + rss_hash = r->hash.hash_flds; + break; + } + mutex_unlock(&hw->rss_locks); + + return rss_hash; +} diff --git a/drivers/net/ethernet/intel/ice/ice_flow.h b/drivers/net/ethernet/intel/ice/ice_flow.h new file mode 100644 index 0000000000000000000000000000000000000000..d41fcfff3bd3da4587999811bdc6e582f06af021 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_flow.h @@ -0,0 +1,572 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_FLOW_H_ +#define _ICE_FLOW_H_ + +#include "ice_flex_type.h" +#include "ice_acl.h" + +#define ICE_IPV4_MAKE_PREFIX_MASK(prefix) ((u32)(~0) << (32 - (prefix))) +#define ICE_FLOW_PROF_ID_INVAL 0xfffffffffffffffful +#define ICE_FLOW_PROF_ID_BYPASS 0 +#define ICE_FLOW_PROF_ID_DEFAULT 1 +#define ICE_FLOW_ENTRY_HANDLE_INVAL 0 +#define ICE_FLOW_VSI_INVAL 0xffff +#define ICE_FLOW_FLD_OFF_INVAL 0xffff + +/* Generate flow hash field from flow field type(s) */ +#define ICE_FLOW_HASH_ETH \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_DA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_SA)) +#define ICE_FLOW_HASH_IPV4 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)) +#define ICE_FLOW_HASH_IPV6 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)) +#define ICE_FLOW_HASH_IPV6_PRE32 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE32_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE32_DA)) +#define ICE_FLOW_HASH_IPV6_PRE48 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE48_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE48_DA)) +#define ICE_FLOW_HASH_IPV6_PRE64 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA)) +#define ICE_FLOW_HASH_TCP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)) +#define ICE_FLOW_HASH_UDP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)) +#define ICE_FLOW_HASH_SCTP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)) + +#define ICE_HASH_INVALID 0 +#define ICE_HASH_TCP_IPV4 (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_TCP_IPV6 (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV4 (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_UDP_IPV6 (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_SCTP_IPV4 (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_SCTP_PORT) +#define ICE_HASH_SCTP_IPV6 (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_SCTP_PORT) + +#define ICE_HASH_TCP_IPV6_PRE32 \ + (ICE_FLOW_HASH_IPV6_PRE32 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV6_PRE32 \ + (ICE_FLOW_HASH_IPV6_PRE32 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_SCTP_IPV6_PRE32 \ + (ICE_FLOW_HASH_IPV6_PRE32 | ICE_FLOW_HASH_SCTP_PORT) +#define ICE_HASH_TCP_IPV6_PRE48 \ + (ICE_FLOW_HASH_IPV6_PRE48 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV6_PRE48 \ + (ICE_FLOW_HASH_IPV6_PRE48 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_SCTP_IPV6_PRE48 \ + (ICE_FLOW_HASH_IPV6_PRE48 | 
ICE_FLOW_HASH_SCTP_PORT) +#define ICE_HASH_TCP_IPV6_PRE64 \ + (ICE_FLOW_HASH_IPV6_PRE64 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV6_PRE64 \ + (ICE_FLOW_HASH_IPV6_PRE64 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_SCTP_IPV6_PRE64 \ + (ICE_FLOW_HASH_IPV6_PRE64 | ICE_FLOW_HASH_SCTP_PORT) + +#define ICE_FLOW_HASH_VXLAN_VNI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_VXLAN_VNI)) + +#define ICE_FLOW_HASH_GTP_TEID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_GTPC_TEID)) + +#define ICE_FLOW_HASH_GTP_IPV4_TEID \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_GTP_TEID) +#define ICE_FLOW_HASH_GTP_IPV6_TEID \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_GTP_TEID) + +#define ICE_FLOW_HASH_GTP_U_TEID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_IP_TEID)) + +#define ICE_FLOW_HASH_GTP_U_IPV4_TEID \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_GTP_U_TEID) +#define ICE_FLOW_HASH_GTP_U_IPV6_TEID \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_GTP_U_TEID) + +#define ICE_FLOW_HASH_GTP_U_EH_TEID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_EH_TEID)) + +#define ICE_FLOW_HASH_GTP_U_EH_QFI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_EH_QFI)) + +#define ICE_FLOW_HASH_GTP_U_IPV4_EH \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_GTP_U_EH_TEID | \ + ICE_FLOW_HASH_GTP_U_EH_QFI) +#define ICE_FLOW_HASH_GTP_U_IPV6_EH \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_GTP_U_EH_TEID | \ + ICE_FLOW_HASH_GTP_U_EH_QFI) + +#define ICE_FLOW_HASH_PPPOE_SESS_ID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID)) + +#define ICE_FLOW_HASH_PPPOE_SESS_ID_ETH \ + (ICE_FLOW_HASH_ETH | ICE_FLOW_HASH_PPPOE_SESS_ID) +#define ICE_FLOW_HASH_PPPOE_TCP_ID \ + (ICE_FLOW_HASH_TCP_PORT | ICE_FLOW_HASH_PPPOE_SESS_ID) +#define ICE_FLOW_HASH_PPPOE_UDP_ID \ + (ICE_FLOW_HASH_UDP_PORT | ICE_FLOW_HASH_PPPOE_SESS_ID) + +#define ICE_FLOW_HASH_PFCP_SEID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_PFCP_SEID)) +#define ICE_FLOW_HASH_PFCP_IPV4_SEID \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_PFCP_SEID) +#define ICE_FLOW_HASH_PFCP_IPV6_SEID \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_PFCP_SEID) + +#define ICE_FLOW_HASH_L2TPV3_SESS_ID \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID)) +#define ICE_FLOW_HASH_L2TPV3_IPV4_SESS_ID \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_L2TPV3_SESS_ID) +#define ICE_FLOW_HASH_L2TPV3_IPV6_SESS_ID \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_L2TPV3_SESS_ID) + +#define ICE_FLOW_HASH_ESP_SPI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_ESP_SPI)) +#define ICE_FLOW_HASH_ESP_IPV4_SPI \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_ESP_SPI) +#define ICE_FLOW_HASH_ESP_IPV6_SPI \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_ESP_SPI) + +#define ICE_FLOW_HASH_AH_SPI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_AH_SPI)) +#define ICE_FLOW_HASH_AH_IPV4_SPI \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_AH_SPI) +#define ICE_FLOW_HASH_AH_IPV6_SPI \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_AH_SPI) + +#define ICE_FLOW_HASH_NAT_T_ESP_SPI \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI)) +#define ICE_FLOW_HASH_NAT_T_ESP_IPV4_SPI \ + (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_NAT_T_ESP_SPI) +#define ICE_FLOW_HASH_NAT_T_ESP_IPV6_SPI \ + (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_NAT_T_ESP_SPI) + +/* Protocol header fields within a packet segment. A segment consists of one or + * more protocol headers that make up a logical group of protocol headers. Each + * logical group of protocol headers encapsulates or is encapsulated using/by + * tunneling or encapsulation protocols for network virtualization such as GRE, + * VxLAN, etc. 
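 + * + * For example, a VXLAN-encapsulated TCP/IPv4 packet can be described by + * two segments: an outer segment (ETH | IPV4 | UDP | VXLAN) and an inner + * segment (ETH | IPV4 | TCP). ICE_FLOW_SEG_MAX below caps a flow profile + * at two such segments.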
+ */ +enum ice_flow_seg_hdr { + ICE_FLOW_SEG_HDR_NONE = 0x00000000, + ICE_FLOW_SEG_HDR_ETH = 0x00000001, + ICE_FLOW_SEG_HDR_VLAN = 0x00000002, + ICE_FLOW_SEG_HDR_IPV4 = 0x00000004, + ICE_FLOW_SEG_HDR_IPV6 = 0x00000008, + ICE_FLOW_SEG_HDR_ARP = 0x00000010, + ICE_FLOW_SEG_HDR_ICMP = 0x00000020, + ICE_FLOW_SEG_HDR_TCP = 0x00000040, + ICE_FLOW_SEG_HDR_UDP = 0x00000080, + ICE_FLOW_SEG_HDR_SCTP = 0x00000100, + ICE_FLOW_SEG_HDR_GRE = 0x00000200, + ICE_FLOW_SEG_HDR_GTPC = 0x00000400, + ICE_FLOW_SEG_HDR_GTPC_TEID = 0x00000800, + ICE_FLOW_SEG_HDR_GTPU_IP = 0x00001000, + ICE_FLOW_SEG_HDR_GTPU_EH = 0x00002000, + ICE_FLOW_SEG_HDR_GTPU_DWN = 0x00004000, + ICE_FLOW_SEG_HDR_GTPU_UP = 0x00008000, + ICE_FLOW_SEG_HDR_PPPOE = 0x00010000, + ICE_FLOW_SEG_HDR_PFCP_NODE = 0x00020000, + ICE_FLOW_SEG_HDR_PFCP_SESSION = 0x00040000, + ICE_FLOW_SEG_HDR_L2TPV3 = 0x00080000, + ICE_FLOW_SEG_HDR_ESP = 0x00100000, + ICE_FLOW_SEG_HDR_AH = 0x00200000, + ICE_FLOW_SEG_HDR_NAT_T_ESP = 0x00400000, + ICE_FLOW_SEG_HDR_ETH_NON_IP = 0x00800000, + ICE_FLOW_SEG_HDR_GTPU_NON_IP = 0x01000000, + ICE_FLOW_SEG_HDR_VXLAN = 0x02000000, + ICE_FLOW_SEG_HDR_ECPRI_TP0 = 0x04000000, + ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0 = 0x08000000, + ICE_FLOW_SEG_HDR_L2TPV2 = 0x10000000, + ICE_FLOW_SEG_HDR_PPP = 0x20000000, + /* The following is an additive bit for ICE_FLOW_SEG_HDR_IPV4 and + * ICE_FLOW_SEG_HDR_IPV6. + */ + ICE_FLOW_SEG_HDR_IPV_FRAG = 0x40000000, + ICE_FLOW_SEG_HDR_IPV_OTHER = 0x80000000, +}; + +/* These segments all have the same PTYPES, but are otherwise distinguished by + * the value of the gtp_eh_pdu and gtp_eh_pdu_link flags: + * + * gtp_eh_pdu gtp_eh_pdu_link + * ICE_FLOW_SEG_HDR_GTPU_IP 0 0 + * ICE_FLOW_SEG_HDR_GTPU_EH 1 don't care + * ICE_FLOW_SEG_HDR_GTPU_DWN 1 0 + * ICE_FLOW_SEG_HDR_GTPU_UP 1 1 + */ +#define ICE_FLOW_SEG_HDR_GTPU (ICE_FLOW_SEG_HDR_GTPU_IP | \ + ICE_FLOW_SEG_HDR_GTPU_EH | \ + ICE_FLOW_SEG_HDR_GTPU_DWN | \ + ICE_FLOW_SEG_HDR_GTPU_UP) +#define ICE_FLOW_SEG_HDR_PFCP (ICE_FLOW_SEG_HDR_PFCP_NODE | \ + ICE_FLOW_SEG_HDR_PFCP_SESSION) + +enum ice_flow_field { + /* L2 */ + ICE_FLOW_FIELD_IDX_ETH_DA, + ICE_FLOW_FIELD_IDX_ETH_SA, + ICE_FLOW_FIELD_IDX_S_VLAN, + ICE_FLOW_FIELD_IDX_C_VLAN, + ICE_FLOW_FIELD_IDX_ETH_TYPE, + /* L3 */ + ICE_FLOW_FIELD_IDX_IPV4_DSCP, + ICE_FLOW_FIELD_IDX_IPV6_DSCP, + ICE_FLOW_FIELD_IDX_IPV4_TTL, + ICE_FLOW_FIELD_IDX_IPV4_PROT, + ICE_FLOW_FIELD_IDX_IPV6_TTL, + ICE_FLOW_FIELD_IDX_IPV6_PROT, + ICE_FLOW_FIELD_IDX_IPV4_SA, + ICE_FLOW_FIELD_IDX_IPV4_DA, + ICE_FLOW_FIELD_IDX_IPV6_SA, + ICE_FLOW_FIELD_IDX_IPV6_DA, + ICE_FLOW_FIELD_IDX_IPV4_ID, + ICE_FLOW_FIELD_IDX_IPV6_ID, + ICE_FLOW_FIELD_IDX_IPV6_PRE32_SA, + ICE_FLOW_FIELD_IDX_IPV6_PRE32_DA, + ICE_FLOW_FIELD_IDX_IPV6_PRE48_SA, + ICE_FLOW_FIELD_IDX_IPV6_PRE48_DA, + ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA, + ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA, + /* L4 */ + ICE_FLOW_FIELD_IDX_TCP_SRC_PORT, + ICE_FLOW_FIELD_IDX_TCP_DST_PORT, + ICE_FLOW_FIELD_IDX_UDP_SRC_PORT, + ICE_FLOW_FIELD_IDX_UDP_DST_PORT, + ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT, + ICE_FLOW_FIELD_IDX_SCTP_DST_PORT, + ICE_FLOW_FIELD_IDX_TCP_FLAGS, + /* ARP */ + ICE_FLOW_FIELD_IDX_ARP_SIP, + ICE_FLOW_FIELD_IDX_ARP_DIP, + ICE_FLOW_FIELD_IDX_ARP_SHA, + ICE_FLOW_FIELD_IDX_ARP_DHA, + ICE_FLOW_FIELD_IDX_ARP_OP, + /* ICMP */ + ICE_FLOW_FIELD_IDX_ICMP_TYPE, + ICE_FLOW_FIELD_IDX_ICMP_CODE, + /* GRE */ + ICE_FLOW_FIELD_IDX_GRE_KEYID, + /* GTPC_TEID */ + ICE_FLOW_FIELD_IDX_GTPC_TEID, + /* GTPU_IP */ + ICE_FLOW_FIELD_IDX_GTPU_IP_TEID, + /* GTPU_EH */ + ICE_FLOW_FIELD_IDX_GTPU_EH_TEID, + ICE_FLOW_FIELD_IDX_GTPU_EH_QFI, + /* GTPU_UP */ +
ICE_FLOW_FIELD_IDX_GTPU_UP_TEID, + /* GTPU_DWN */ + ICE_FLOW_FIELD_IDX_GTPU_DWN_TEID, + /* PPPOE */ + ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID, + /* PFCP */ + ICE_FLOW_FIELD_IDX_PFCP_SEID, + /* L2TPV3 */ + ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID, + /* ESP */ + ICE_FLOW_FIELD_IDX_ESP_SPI, + /* AH */ + ICE_FLOW_FIELD_IDX_AH_SPI, + /* NAT_T ESP */ + ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI, + /* VXLAN VNI */ + ICE_FLOW_FIELD_IDX_VXLAN_VNI, + /* ECPRI_TP0 */ + ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID, + /* UDP_ECPRI_TP0 */ + ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID, + /* The total number of enums must not exceed 64 */ + ICE_FLOW_FIELD_IDX_MAX +}; + +/* Flow headers and fields for AVF support */ +enum ice_flow_avf_hdr_field { + /* Values 0 - 28 are reserved for future use */ + ICE_AVF_FLOW_FIELD_INVALID = 0, + ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP = 29, + ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP, + ICE_AVF_FLOW_FIELD_IPV4_UDP, + ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK, + ICE_AVF_FLOW_FIELD_IPV4_TCP, + ICE_AVF_FLOW_FIELD_IPV4_SCTP, + ICE_AVF_FLOW_FIELD_IPV4_OTHER, + ICE_AVF_FLOW_FIELD_FRAG_IPV4, + /* Values 37-38 are reserved */ + ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP = 39, + ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP, + ICE_AVF_FLOW_FIELD_IPV6_UDP, + ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK, + ICE_AVF_FLOW_FIELD_IPV6_TCP, + ICE_AVF_FLOW_FIELD_IPV6_SCTP, + ICE_AVF_FLOW_FIELD_IPV6_OTHER, + ICE_AVF_FLOW_FIELD_FRAG_IPV6, + ICE_AVF_FLOW_FIELD_RSVD47, + ICE_AVF_FLOW_FIELD_FCOE_OX, + ICE_AVF_FLOW_FIELD_FCOE_RX, + ICE_AVF_FLOW_FIELD_FCOE_OTHER, + /* Values 51-62 are reserved */ + ICE_AVF_FLOW_FIELD_L2_PAYLOAD = 63, + ICE_AVF_FLOW_FIELD_MAX +}; + +/* Supported RSS offloads. This macro is defined to support + * VIRTCHNL_OP_GET_RSS_HENA_CAPS ops. The PF driver sends the RSS hardware + * capabilities to the caller of these ops. + */ +#define ICE_DEFAULT_RSS_HENA ( \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV4) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV6) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP)) + +enum ice_rss_cfg_hdr_type { + ICE_RSS_OUTER_HEADERS, /* take outer headers as inputset. */ + ICE_RSS_INNER_HEADERS, /* take inner headers as inputset. */ + /* take inner headers as inputset for packet with outer ipv4. */ + ICE_RSS_INNER_HEADERS_W_OUTER_IPV4, + /* take inner headers as inputset for packet with outer ipv6.
*/ + ICE_RSS_INNER_HEADERS_W_OUTER_IPV6, + /* take outer headers first then inner headers as inputset */ + ICE_RSS_ANY_HEADERS +}; + +struct ice_rss_hash_cfg { + u32 addl_hdrs; /* protocol header fields */ + u64 hash_flds; /* hash bit field (ICE_FLOW_HASH_*) to configure */ + enum ice_rss_cfg_hdr_type hdr_type; /* to specify inner or outer */ + bool symm; /* symmetric or asymmetric hash */ +}; + +enum ice_flow_dir { + ICE_FLOW_DIR_UNDEFINED = 0, + ICE_FLOW_TX = 0x01, + ICE_FLOW_RX = 0x02, + ICE_FLOW_TX_RX = ICE_FLOW_RX | ICE_FLOW_TX +}; + +enum ice_flow_priority { + ICE_FLOW_PRIO_LOW, + ICE_FLOW_PRIO_NORMAL, + ICE_FLOW_PRIO_HIGH +}; + +#define ICE_FLOW_SEG_SINGLE 1 +#define ICE_FLOW_SEG_MAX 2 +#define ICE_FLOW_SEG_RAW_FLD_MAX 2 +#define ICE_FLOW_PROFILE_MAX 1024 +#define ICE_FLOW_SW_FIELD_VECTOR_MAX 48 +#define ICE_FLOW_ACL_FIELD_VECTOR_MAX 32 +#define ICE_FLOW_FV_EXTRACT_SZ 2 + +#define ICE_FLOW_SET_HDRS(seg, val) ((seg)->hdrs |= (u32)(val)) + +struct ice_flow_seg_xtrct { + u8 prot_id; /* Protocol ID of extracted header field */ + u16 off; /* Starting offset of the field in header in bytes */ + u8 idx; /* Index of FV entry used */ + u8 disp; /* Displacement of field in bits fr. FV entry's start */ + u16 mask; /* Mask for field */ +}; + + +enum ice_flow_fld_match_type { + ICE_FLOW_FLD_TYPE_REG, /* Value, mask */ + ICE_FLOW_FLD_TYPE_RANGE, /* Value, mask, last (upper bound) */ + ICE_FLOW_FLD_TYPE_PREFIX, /* IP address, prefix, size of prefix */ + ICE_FLOW_FLD_TYPE_SIZE, /* Value, mask, size of match */ +}; + +struct ice_flow_fld_loc { + /* Describe offsets of field information relative to the beginning of + * input buffer provided when adding flow entries. + */ + u16 val; /* Offset where the value is located */ + u16 mask; /* Offset where the mask/prefix value is located */ + u16 last; /* Length or offset where the upper value is located */ +}; + +struct ice_flow_fld_info { + enum ice_flow_fld_match_type type; + /* Location where to retrieve data from an input buffer */ + struct ice_flow_fld_loc src; + /* Location where to put the data into the final entry buffer */ + struct ice_flow_fld_loc entry; + struct ice_flow_seg_xtrct xtrct; +}; + +struct ice_flow_seg_fld_raw { + struct ice_flow_fld_info info; + u16 off; /* Offset from the start of the segment */ +}; + +struct ice_flow_seg_info { + u32 hdrs; /* Bitmask indicating protocol headers present */ + u64 match; /* Bitmask indicating header fields to be matched */ + u64 range; /* Bitmask indicating header fields matched as ranges */ + + struct ice_flow_fld_info fields[ICE_FLOW_FIELD_IDX_MAX]; + + u8 raws_cnt; /* Number of raw fields to be matched */ + struct ice_flow_seg_fld_raw raws[ICE_FLOW_SEG_RAW_FLD_MAX]; +}; + +/* This structure describes a flow entry, and is tracked only in this file */ +struct ice_flow_entry { + struct list_head l_entry; + + u64 id; + struct ice_flow_prof *prof; + /* Action list */ + struct ice_flow_action *acts; + /* Flow entry's content */ + void *entry; + /* Range buffer (For ACL only) */ + struct ice_aqc_acl_profile_ranges *range_buf; + enum ice_flow_priority priority; + u16 vsi_handle; + u16 entry_sz; + /* Entry index in the ACL's scenario */ + u16 scen_entry_idx; +#define ICE_FLOW_ACL_MAX_NUM_ACT 2 + u8 acts_cnt; +}; + +#define ICE_FLOW_ENTRY_HNDL(e) ((u64)e) +#define ICE_FLOW_ENTRY_PTR(h) ((struct ice_flow_entry *)(h)) + +struct ice_flow_prof { + struct list_head l_entry; + + u64 id; + enum ice_flow_dir dir; + u8 segs_cnt; + u8 acts_cnt; + + /* Keep track of flow entries associated with this flow profile */ + 
struct mutex entries_lock; + struct list_head entries; + + struct ice_flow_seg_info segs[ICE_FLOW_SEG_MAX]; + + /* software VSI handles referenced by this flow profile */ + DECLARE_BITMAP(vsis, ICE_MAX_VSI); + + union { + /* struct sw_recipe */ + struct ice_acl_scen *scen; + /* struct fd */ + u32 data; + bool symm; /* Symmetric Hash for RSS */ + } cfg; + + /* Default actions */ + struct ice_flow_action *acts; +}; + +struct ice_rss_cfg { + struct list_head l_entry; + /* bitmap of VSIs added to the RSS entry */ + DECLARE_BITMAP(vsis, ICE_MAX_VSI); + struct ice_rss_hash_cfg hash; +}; + +enum ice_flow_action_type { + ICE_FLOW_ACT_NOP, + ICE_FLOW_ACT_ALLOW, + ICE_FLOW_ACT_DROP, + ICE_FLOW_ACT_CNTR_PKT, + ICE_FLOW_ACT_FWD_VSI, + ICE_FLOW_ACT_FWD_VSI_LIST, /* Should be abstracted away */ + ICE_FLOW_ACT_FWD_QUEUE, /* Can Queues be abstracted away? */ + ICE_FLOW_ACT_FWD_QUEUE_GROUP, /* Can Queues be abstracted away? */ + ICE_FLOW_ACT_PUSH, + ICE_FLOW_ACT_POP, + ICE_FLOW_ACT_MODIFY, + ICE_FLOW_ACT_CNTR_BYTES, + ICE_FLOW_ACT_CNTR_PKT_BYTES, + ICE_FLOW_ACT_GENERIC_0, + ICE_FLOW_ACT_GENERIC_1, + ICE_FLOW_ACT_GENERIC_2, + ICE_FLOW_ACT_GENERIC_3, + ICE_FLOW_ACT_GENERIC_4, + ICE_FLOW_ACT_RPT_FLOW_ID, + ICE_FLOW_ACT_BUILD_PROF_IDX, +}; + +struct ice_flow_action { + enum ice_flow_action_type type; + union { + struct ice_acl_act_entry acl_act; + u32 dummy; + } data; +}; + +u64 +ice_flow_find_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + struct ice_flow_seg_info *segs, u8 segs_cnt); +enum ice_status +ice_flow_add_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + u64 prof_id, struct ice_flow_seg_info *segs, u8 segs_cnt, + struct ice_flow_action *acts, u8 acts_cnt, + struct ice_flow_prof **prof); +enum ice_status +ice_flow_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id); +enum ice_status +ice_flow_assoc_vsig_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi_handle, + u16 vsig); +enum ice_status +ice_flow_get_hw_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id, + u8 *hw_prof); +u64 ice_flow_find_entry(struct ice_hw *hw, enum ice_block blk, u64 entry_id); +enum ice_status +ice_flow_add_entry(struct ice_hw *hw, enum ice_block blk, u64 prof_id, + u64 entry_id, u16 vsi, enum ice_flow_priority prio, + void *data, struct ice_flow_action *acts, u8 acts_cnt, + u64 *entry_h); +enum ice_status +ice_flow_rem_entry(struct ice_hw *hw, enum ice_block blk, u64 entry_h); +void +ice_flow_set_fld(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + u16 val_loc, u16 mask_loc, u16 last_loc, bool range); +void +ice_flow_set_fld_prefix(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + u16 val_loc, u16 prefix_loc, u8 prefix_sz); +void +ice_flow_add_fld_raw(struct ice_flow_seg_info *seg, u16 off, u8 len, + u16 val_loc, u16 mask_loc); +enum ice_status ice_flow_rem_vsi_prof(struct ice_hw *hw, enum ice_block blk, + u16 vsi_handle, u64 prof_id); +void ice_rem_vsi_rss_list(struct ice_hw *hw, u16 vsi_handle); +enum ice_status ice_replay_rss_cfg(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds); +enum ice_status ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg); +enum ice_status +ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle, + const struct ice_rss_hash_cfg *cfg); +u64 ice_get_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u32 hdrs); +#endif /* _ICE_FLOW_H_ */ diff --git 
a/drivers/net/ethernet/intel/ice/ice_fltr.c b/drivers/net/ethernet/intel/ice/ice_fltr.c new file mode 100644 index 0000000000000000000000000000000000000000..3acf5656479f6f0b0e614bd7dff27f8ba80e18d2 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fltr.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_fltr.h" + +/** + * ice_fltr_free_list - free filter lists helper + * @dev: pointer to the device struct + * @h: pointer to the list head to be freed + * + * Helper function to free filter lists previously created using + * ice_fltr_add_mac_to_list + */ +void ice_fltr_free_list(struct device *dev, struct list_head *h) +{ + struct ice_fltr_list_entry *e, *tmp; + + list_for_each_entry_safe(e, tmp, h, list_entry) { + list_del(&e->list_entry); + devm_kfree(dev, e); + } +} + +/** + * ice_fltr_add_entry_to_list - allocate and add filter entry to list + * @dev: pointer to device needed by alloc function + * @info: filter info struct that gets added to the passed in list + * @list: pointer to the list which contains MAC filters entry + */ +static int +ice_fltr_add_entry_to_list(struct device *dev, struct ice_fltr_info *info, + struct list_head *list) +{ + struct ice_fltr_list_entry *entry; + + entry = devm_kzalloc(dev, sizeof(*entry), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + entry->fltr_info = *info; + + INIT_LIST_HEAD(&entry->list_entry); + list_add(&entry->list_entry, list); + + return 0; +} + +/** + * ice_fltr_set_vlan_vsi_promisc + * @hw: pointer to the hardware structure + * @vsi: the VSI being configured + * @promisc_mask: mask of promiscuous config bits + * + * Set VSI with all associated VLANs to given promiscuous mode(s) + */ +enum ice_status +ice_fltr_set_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask) +{ + return ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_mask, false); +} + +/** + * ice_fltr_clear_vlan_vsi_promisc + * @hw: pointer to the hardware structure + * @vsi: the VSI being configured + * @promisc_mask: mask of promiscuous config bits + * + * Clear VSI with all associated VLANs to given promiscuous mode(s) + */ +enum ice_status +ice_fltr_clear_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask) +{ + return ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_mask, true); +} + +/** + * ice_fltr_clear_vsi_promisc - clear specified promiscuous mode(s) + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to clear mode + * @promisc_mask: mask of promiscuous config bits to clear + * @vid: VLAN ID to clear VLAN promiscuous + * @lport: logical port number to clear mode + */ +enum ice_status +ice_fltr_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport) +{ + return ice_clear_vsi_promisc(hw, vsi_handle, promisc_mask, vid); +} + +/** + * ice_fltr_set_vsi_promisc - set given VSI to given promiscuous mode(s) + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @vid: VLAN ID to set VLAN promiscuous + * @lport: logical port number to set promiscuous mode + */ +enum ice_status +ice_fltr_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport) +{ + return ice_set_vsi_promisc(hw, vsi_handle, promisc_mask, vid); +} + +/** + * ice_fltr_add_mac_list - add list of MAC filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +enum ice_status +ice_fltr_add_mac_list(struct 
ice_vsi *vsi, struct list_head *list) +{ + return ice_add_mac(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_mac_list - remove list of MAC filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +enum ice_status +ice_fltr_remove_mac_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_remove_mac(&vsi->back->hw, list); +} + +/** + * ice_fltr_add_vlan_list - add list of VLAN filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_add_vlan_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_add_vlan(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_vlan_list - remove list of VLAN filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_remove_vlan_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_remove_vlan(&vsi->back->hw, list); +} + +/** + * ice_fltr_add_mac_vlan_list - add list of MAC VLAN filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_add_mac_vlan_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_add_mac_vlan(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_mac_vlan_list - remove list of MAC VLAN filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_remove_mac_vlan_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_remove_mac_vlan(&vsi->back->hw, list); +} + +/** + * ice_fltr_add_eth_list - add list of ethertype filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_add_eth_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_add_eth_mac(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_eth_list - remove list of ethertype filters + * @vsi: pointer to VSI struct + * @list: list of filters + */ +static enum ice_status +ice_fltr_remove_eth_list(struct ice_vsi *vsi, struct list_head *list) +{ + return ice_remove_eth_mac(&vsi->back->hw, list); +} + +/** + * ice_fltr_remove_all - remove all filters associated with VSI + * @vsi: pointer to VSI struct + */ +void ice_fltr_remove_all(struct ice_vsi *vsi) +{ + ice_remove_vsi_fltr(&vsi->back->hw, vsi->idx); +} + +/** + * ice_fltr_add_mac_to_list - add MAC filter info to existing list + * @vsi: pointer to VSI struct + * @list: list to add filter info to + * @mac: MAC address to add + * @action: filter action + */ +int +ice_fltr_add_mac_to_list(struct ice_vsi *vsi, struct list_head *list, + const u8 *mac, enum ice_sw_fwd_act_type action) +{ + struct ice_fltr_info info = { 0 }; + + info.flag = ICE_FLTR_TX; + info.src_id = ICE_SRC_ID_VSI; + info.lkup_type = ICE_SW_LKUP_MAC; + info.fltr_act = action; + info.vsi_handle = vsi->idx; + + ether_addr_copy(info.l_data.mac.mac_addr, mac); + + return ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info, + list); +} + +/** + * ice_fltr_add_vlan_to_list - add VLAN filter info to existing list + * @vsi: pointer to VSI struct + * @list: list to add filter info to + * @vlan: VLAN filter details + */ +static int +ice_fltr_add_vlan_to_list(struct ice_vsi *vsi, struct list_head *list, + struct ice_vlan *vlan) +{ + struct ice_fltr_info info = { 0 }; + + info.flag = ICE_FLTR_TX; + info.src_id = ICE_SRC_ID_VSI; + info.lkup_type = ICE_SW_LKUP_VLAN; + info.fltr_act = vlan->fwd_act; + info.vsi_handle = vsi->idx; + info.l_data.vlan.vlan_id = vlan->vid; + info.l_data.vlan.tpid = vlan->tpid; + info.l_data.vlan.tpid_valid = true; + + return
ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info, + list); +} + +/** + * ice_fltr_add_mac_vlan_to_list - add MAC VLAN filter info to + * existing list + * @vsi: pointer to VSI struct + * @list: list to add filter info to + * @mac: MAC addr to add + * @vlan_id: VLAN ID to add + * @action: filter action + */ +static int +ice_fltr_add_mac_vlan_to_list(struct ice_vsi *vsi, struct list_head *list, + const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action) +{ + struct ice_fltr_info info = { 0 }; + + if (!is_valid_ether_addr(mac) || + is_broadcast_ether_addr(mac) || !vlan_id) + return -EINVAL; + + info.flag = ICE_FLTR_TX_RX; + info.lkup_type = ICE_SW_LKUP_MAC_VLAN; + info.fltr_act = action; + info.vsi_handle = vsi->idx; + info.src = vsi->vsi_num; + + info.l_data.mac_vlan.vlan_id = vlan_id; + ether_addr_copy(info.l_data.mac_vlan.mac_addr, mac); + + return ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info, + list); +} + +/** + * ice_fltr_add_eth_to_list - add ethertype filter info to existing list + * @vsi: pointer to VSI struct + * @list: list to add filter info to + * @ethertype: ethertype of packet that matches filter + * @flag: filter direction, Tx or Rx + * @action: filter action + */ +static int +ice_fltr_add_eth_to_list(struct ice_vsi *vsi, struct list_head *list, + u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action) +{ + struct ice_fltr_info info = { 0 }; + + info.flag = flag; + info.lkup_type = ICE_SW_LKUP_ETHERTYPE; + info.fltr_act = action; + info.vsi_handle = vsi->idx; + info.l_data.ethertype_mac.ethertype = ethertype; + + if (flag == ICE_FLTR_TX) + info.src_id = ICE_SRC_ID_VSI; + else + info.src_id = ICE_SRC_ID_LPORT; + + return ice_fltr_add_entry_to_list(ice_pf_to_dev(vsi->back), &info, + list); +} + +/** + * ice_fltr_prepare_mac - add or remove MAC rule + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @action: action to be performed on filter match + * @mac_action: pointer to add or remove MAC function + */ +static enum ice_status +ice_fltr_prepare_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action, + enum ice_status (*mac_action)(struct ice_vsi *, + struct list_head *)) +{ + enum ice_status result; + LIST_HEAD(tmp_list); + + if (ice_fltr_add_mac_to_list(vsi, &tmp_list, mac, action)) { + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return ICE_ERR_NO_MEMORY; + } + + result = mac_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_prepare_mac_and_broadcast - add or remove MAC and broadcast filter + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @action: action to be performed on filter match + * @mac_action: pointer to add or remove MAC function + */ +static enum ice_status +ice_fltr_prepare_mac_and_broadcast(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action, + enum ice_status(*mac_action) + (struct ice_vsi *, struct list_head *)) +{ + u8 broadcast[ETH_ALEN]; + enum ice_status result; + LIST_HEAD(tmp_list); + + eth_broadcast_addr(broadcast); + if (ice_fltr_add_mac_to_list(vsi, &tmp_list, mac, action) || + ice_fltr_add_mac_to_list(vsi, &tmp_list, broadcast, action)) { + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return ICE_ERR_NO_MEMORY; + } + + result = mac_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_prepare_vlan - add or remove VLAN filter + * @vsi: pointer to VSI struct + * @vlan: VLAN filter
details + * @vlan_action: pointer to add or remove VLAN function + */ +static enum ice_status +ice_fltr_prepare_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan, + enum ice_status (*vlan_action)(struct ice_vsi *, + struct list_head *)) +{ + enum ice_status result; + LIST_HEAD(tmp_list); + + if (ice_fltr_add_vlan_to_list(vsi, &tmp_list, vlan)) + return ICE_ERR_NO_MEMORY; + + result = vlan_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_prepare_mac_vlan - add or remove MAC VLAN filter + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @vlan_id: VLAN ID to add + * @action: action to be performed on filter match + * @mac_vlan_action: pointer to add or remove MAC VLAN function + */ +static enum ice_status +ice_fltr_prepare_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action, + enum ice_status (mac_vlan_action)(struct ice_vsi *, + struct list_head *)) +{ + enum ice_status result; + LIST_HEAD(tmp_list); + + if (ice_fltr_add_mac_vlan_to_list(vsi, &tmp_list, mac, vlan_id, + action)) + return ICE_ERR_NO_MEMORY; + + result = mac_vlan_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_prepare_eth - add or remove ethertype filter + * @vsi: pointer to VSI struct + * @ethertype: ethertype of packet to be filtered + * @flag: direction of packet, Tx or Rx + * @action: action to be performed on filter match + * @eth_action: pointer to add or remove ethertype function + */ +static enum ice_status +ice_fltr_prepare_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action, + enum ice_status (*eth_action)(struct ice_vsi *, + struct list_head *)) +{ + enum ice_status result; + LIST_HEAD(tmp_list); + + if (ice_fltr_add_eth_to_list(vsi, &tmp_list, ethertype, flag, action)) + return ICE_ERR_NO_MEMORY; + + result = eth_action(vsi, &tmp_list); + ice_fltr_free_list(ice_pf_to_dev(vsi->back), &tmp_list); + return result; +} + +/** + * ice_fltr_add_mac - add single MAC filter + * @vsi: pointer to VSI struct + * @mac: MAC to add + * @action: action to be performed on filter match + */ +enum ice_status ice_fltr_add_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac(vsi, mac, action, ice_fltr_add_mac_list); +} + +/** + * ice_fltr_add_mac_and_broadcast - add single MAC and broadcast + * @vsi: pointer to VSI struct + * @mac: MAC to add + * @action: action to be performed on filter match + */ +enum ice_status +ice_fltr_add_mac_and_broadcast(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac_and_broadcast(vsi, mac, action, + ice_fltr_add_mac_list); +} + +/** + * ice_fltr_remove_mac - remove MAC filter + * @vsi: pointer to VSI struct + * @mac: filter MAC to remove + * @action: action to remove + */ +enum ice_status ice_fltr_remove_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac(vsi, mac, action, ice_fltr_remove_mac_list); +} + +/** + * ice_fltr_add_vlan - add single VLAN filter + * @vsi: pointer to VSI struct + * @vlan: VLAN filter details + */ +enum ice_status ice_fltr_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + return ice_fltr_prepare_vlan(vsi, vlan, ice_fltr_add_vlan_list); +} + +/** + * ice_fltr_remove_vlan - remove VLAN filter + * @vsi: pointer to VSI struct + * @vlan: VLAN filter details + */ +enum ice_status 
ice_fltr_remove_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + return ice_fltr_prepare_vlan(vsi, vlan, ice_fltr_remove_vlan_list); +} + +/** + * ice_fltr_add_mac_vlan - add single MAC VLAN filter + * @vsi: pointer to VSI struct + * @mac: MAC address to add + * @vlan_id: VLAN ID to add + * @action: action to be performed on filter match + */ +enum ice_status +ice_fltr_add_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac_vlan(vsi, mac, vlan_id, action, + ice_fltr_add_mac_vlan_list); +} + +/** + * ice_fltr_remove_mac_vlan - remove MAC VLAN filter + * @vsi: pointer to VSI struct + * @mac: MAC address of the filter to remove + * @vlan_id: VLAN ID of the filter to remove + * @action: action to remove + */ +enum ice_status +ice_fltr_remove_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_mac_vlan(vsi, mac, vlan_id, action, + ice_fltr_remove_mac_vlan_list); +} + +/** + * ice_fltr_add_eth - add specific ethertype filter + * @vsi: pointer to VSI struct + * @ethertype: ethertype of filter + * @flag: direction of packet to be filtered, Tx or Rx + * @action: action to be performed on filter match + */ +enum ice_status ice_fltr_add_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_eth(vsi, ethertype, flag, action, + ice_fltr_add_eth_list); +} + +/** + * ice_fltr_remove_eth - remove ethertype filter + * @vsi: pointer to VSI struct + * @ethertype: ethertype of filter + * @flag: direction of filter + * @action: action to remove + */ +enum ice_status ice_fltr_remove_eth(struct ice_vsi *vsi, u16 ethertype, + u16 flag, enum ice_sw_fwd_act_type action) +{ + return ice_fltr_prepare_eth(vsi, ethertype, flag, action, + ice_fltr_remove_eth_list); +} + +/** + * ice_fltr_update_rule_flags - update lan_en/lb_en flags + * @hw: pointer to hw + * @rule_id: id of rule being updated + * @recipe_id: recipe id of rule + * @act: current action field + * @type: Rx or Tx + * @src: source VSI + * @new_flags: combinations of lb_en and lan_en + */ +static enum ice_status +ice_fltr_update_rule_flags(struct ice_hw *hw, u16 rule_id, u16 recipe_id, + u32 act, u16 type, u16 src, u32 new_flags) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status err; + u32 flags_mask; + + s_rule = kzalloc(ICE_SW_RULE_RX_TX_NO_HDR_SIZE, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + + flags_mask = ICE_SINGLE_ACT_LB_ENABLE | ICE_SINGLE_ACT_LAN_ENABLE; + act &= ~flags_mask; + act |= (flags_mask & new_flags); + + s_rule->pdata.lkup_tx_rx.recipe_id = cpu_to_le16(recipe_id); + s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(rule_id); + s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act); + + if (type & ICE_FLTR_RX) { + s_rule->pdata.lkup_tx_rx.src = + cpu_to_le16(hw->port_info->lport); + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); + + } else { + s_rule->pdata.lkup_tx_rx.src = cpu_to_le16(src); + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); + } + + err = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_NO_HDR_SIZE, 1, + ice_aqc_opc_update_sw_rules, NULL); + + kfree(s_rule); + return err; +} + +/** + * ice_fltr_build_action - build action for rule + * @vsi_id: id of VSI which is used to build the action + */ +static u32 +ice_fltr_build_action(u16 vsi_id) +{ + return ((vsi_id << ICE_SINGLE_ACT_VSI_ID_S) & ICE_SINGLE_ACT_VSI_ID_M) | + ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_VALID_BIT; +} + +/** + * ice_fltr_find_adv_entry - find
advanced rule + * @rules: list of rules + * @rule_id: id of wanted rule + */ +static struct ice_adv_fltr_mgmt_list_entry * +ice_fltr_find_adv_entry(struct list_head *rules, u16 rule_id) +{ + struct ice_adv_fltr_mgmt_list_entry *entry; + + list_for_each_entry(entry, rules, list_entry) { + if (entry->rule_info.fltr_rule_id == rule_id) + return entry; + } + + return NULL; +} + +/** + * ice_fltr_update_adv_rule_flags - update flags on advanced rule + * @vsi: pointer to VSI + * @recipe_id: id of recipe + * @entry: advanced rule entry + * @new_flags: flags to update + */ +static enum ice_status +ice_fltr_update_adv_rule_flags(struct ice_vsi *vsi, u16 recipe_id, + struct ice_adv_fltr_mgmt_list_entry *entry, + u32 new_flags) +{ + struct ice_adv_rule_info *info = &entry->rule_info; + struct ice_sw_act_ctrl *act = &info->sw_act; + u32 action; + + if (act->fltr_act != ICE_FWD_TO_VSI) + return ICE_ERR_NOT_SUPPORTED; + + action = ice_fltr_build_action(act->fwd_id.hw_vsi_id); + + return ice_fltr_update_rule_flags(&vsi->back->hw, info->fltr_rule_id, + recipe_id, action, info->sw_act.flag, + act->src, new_flags); +} + +/** + * ice_fltr_find_regular_entry - find regular rule + * @rules: list of rules + * @rule_id: id of wanted rule + */ +static struct ice_fltr_mgmt_list_entry * +ice_fltr_find_regular_entry(struct list_head *rules, u16 rule_id) +{ + struct ice_fltr_mgmt_list_entry *entry; + + list_for_each_entry(entry, rules, list_entry) { + if (entry->fltr_info.fltr_rule_id == rule_id) + return entry; + } + + return NULL; +} + +/** + * ice_fltr_update_regular_rule - update flags on regular rule + * @vsi: pointer to VSI + * @recipe_id: id of recipe + * @entry: regular rule entry + * @new_flags: flags to update + */ +static enum ice_status +ice_fltr_update_regular_rule(struct ice_vsi *vsi, u16 recipe_id, + struct ice_fltr_mgmt_list_entry *entry, + u32 new_flags) +{ + struct ice_fltr_info *info = &entry->fltr_info; + u32 action; + + if (info->fltr_act != ICE_FWD_TO_VSI) + return ICE_ERR_NOT_SUPPORTED; + + action = ice_fltr_build_action(info->fwd_id.hw_vsi_id); + + return ice_fltr_update_rule_flags(&vsi->back->hw, info->fltr_rule_id, + recipe_id, action, info->flag, + info->src, new_flags); +} + +/** + * ice_fltr_update_flags - update flags on rule + * @vsi: pointer to VSI + * @rule_id: id of rule + * @recipe_id: id of recipe + * @new_flags: flags to update + * + * Function updates flags on regular and advanced rules. + * + * Flags should be a combination of ICE_SINGLE_ACT_LB_ENABLE and + * ICE_SINGLE_ACT_LAN_ENABLE.
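 + * + * For example, enabling both loopback and LAN forwarding on a rule could + * look as follows, where rule_id and recipe_id are placeholders for the + * identifiers obtained when the rule was created: + * + *	status = ice_fltr_update_flags(vsi, rule_id, recipe_id, + *				       ICE_SINGLE_ACT_LB_ENABLE | + *				       ICE_SINGLE_ACT_LAN_ENABLE);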
+ */ +enum ice_status +ice_fltr_update_flags(struct ice_vsi *vsi, u16 rule_id, u16 recipe_id, + u32 new_flags) +{ + struct ice_adv_fltr_mgmt_list_entry *adv_entry; + struct ice_fltr_mgmt_list_entry *regular_entry; + struct ice_hw *hw = &vsi->back->hw; + struct ice_sw_recipe *recp_list; + struct list_head *fltr_rules; + + recp_list = &hw->switch_info->recp_list[recipe_id]; + if (!recp_list) + return ICE_ERR_DOES_NOT_EXIST; + + fltr_rules = &recp_list->filt_rules; + regular_entry = ice_fltr_find_regular_entry(fltr_rules, rule_id); + if (regular_entry) + return ice_fltr_update_regular_rule(vsi, recipe_id, + regular_entry, new_flags); + + adv_entry = ice_fltr_find_adv_entry(fltr_rules, rule_id); + if (adv_entry) + return ice_fltr_update_adv_rule_flags(vsi, recipe_id, + adv_entry, new_flags); + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_fltr_update_flags_dflt_rule - update flags on default rule + * @vsi: pointer to VSI + * @rule_id: id of rule + * @direction: Tx or Rx + * @new_flags: flags to update + * + * Function updates flags on default rule with ICE_SW_LKUP_DFLT. + * + * Flags should be a combination of ICE_SINGLE_ACT_LB_ENABLE and + * ICE_SINGLE_ACT_LAN_ENABLE. + */ +enum ice_status +ice_fltr_update_flags_dflt_rule(struct ice_vsi *vsi, u16 rule_id, u8 direction, + u32 new_flags) +{ + u32 action = ice_fltr_build_action(vsi->vsi_num); + struct ice_hw *hw = &vsi->back->hw; + + return ice_fltr_update_rule_flags(hw, rule_id, ICE_SW_LKUP_DFLT, action, + direction, vsi->vsi_num, new_flags); +} diff --git a/drivers/net/ethernet/intel/ice/ice_fltr.h b/drivers/net/ethernet/intel/ice/ice_fltr.h new file mode 100644 index 0000000000000000000000000000000000000000..20723d0b88e3c3fa343891c7b4eba0683429d0b9 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fltr.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_FLTR_H_ +#define _ICE_FLTR_H_ + +#include "ice_vlan.h" + +void ice_fltr_free_list(struct device *dev, struct list_head *h); +enum ice_status +ice_fltr_set_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask); +enum ice_status +ice_fltr_clear_vlan_vsi_promisc(struct ice_hw *hw, struct ice_vsi *vsi, u8 promisc_mask); +enum ice_status +ice_fltr_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport); +enum ice_status +ice_fltr_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport); +int +ice_fltr_add_mac_to_list(struct ice_vsi *vsi, struct list_head *list, + const u8 *mac, enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_add_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_add_mac_and_broadcast(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_add_mac_list(struct ice_vsi *vsi, struct list_head *list); +enum ice_status +ice_fltr_remove_mac(struct ice_vsi *vsi, const u8 *mac, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_remove_mac_list(struct ice_vsi *vsi, struct list_head *list); + +enum ice_status ice_fltr_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +enum ice_status ice_fltr_remove_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +enum ice_status +ice_fltr_add_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_remove_mac_vlan(struct ice_vsi *vsi, const u8 *mac, u16 vlan_id, + enum ice_sw_fwd_act_type action); + +enum ice_status +ice_fltr_add_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action); +enum ice_status +ice_fltr_remove_eth(struct ice_vsi *vsi, u16 ethertype, u16 flag, + enum ice_sw_fwd_act_type action); +void ice_fltr_remove_all(struct ice_vsi *vsi); + +enum ice_status +ice_fltr_update_flags(struct ice_vsi *vsi, u16 rule_id, u16 recipe_id, + u32 new_flags); +enum ice_status +ice_fltr_update_flags_dflt_rule(struct ice_vsi *vsi, u16 rule_id, u8 direction, + u32 new_flags); +#endif diff --git a/drivers/net/ethernet/intel/ice/ice_fw_update.c b/drivers/net/ethernet/intel/ice/ice_fw_update.c new file mode 100644 index 0000000000000000000000000000000000000000..8cb0d88351fb5c3e023f9512e074b5e019798d00 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_fw_update.c @@ -0,0 +1,845 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include <asm/unaligned.h> +#include <linux/uuid.h> +#include <linux/crc32.h> +#if IS_ENABLED(CONFIG_PLDMFW) +#include <linux/pldmfw.h> +#else +#include "kcompat_pldmfw.h" +#endif + +#include "ice.h" +#include "ice_fw_update.h" + +struct ice_fwu_priv { + struct pldmfw context; + + struct ice_pf *pf; + struct netlink_ext_ack *extack; + + /* Track which NVM banks to activate at the end of the update */ + u8 activate_flags; +}; + +/** + * ice_send_package_data - Send record package data to firmware + * @context: PLDM fw update structure + * @data: pointer to the package data + * @length: length of the package data + * + * Send a copy of the package data associated with the PLDM record matching + * this device to the firmware. + * + * Note that this function sends an AdminQ command that will fail unless the + * NVM resource has been acquired. + * + * Returns: zero on success, or a negative error code on failure.
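 + * + * The package data is duplicated with kmemdup() before being passed down, + * most likely because ice_nvm_set_pkg_data() takes a non-const buffer + * while the PLDM layer hands this callback const data.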
+ */ +static int +ice_send_package_data(struct pldmfw *context, const u8 *data, u16 length) +{ + struct ice_fwu_priv *priv = container_of(context, struct ice_fwu_priv, context); + struct netlink_ext_ack *extack = priv->extack; + struct device *dev = context->dev; + struct ice_pf *pf = priv->pf; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 *package_data; + + dev_dbg(dev, "Sending PLDM record package data to firmware\n"); + + package_data = kmemdup(data, length, GFP_KERNEL); + if (!package_data) + return -ENOMEM; + + status = ice_nvm_set_pkg_data(hw, false, package_data, length, NULL); + + kfree(package_data); + + if (status) { + dev_err(dev, "Failed to send record package data to firmware, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + NL_SET_ERR_MSG_MOD(extack, "Failed to send record package data to firmware"); + return -EIO; + } + + return 0; +} + +/** + * ice_check_component_response - Report firmware response to a component + * @pf: device private data structure + * @id: component id being checked + * @response: indicates whether this component can be updated + * @code: code indicating reason for response + * @extack: netlink extended ACK structure + * + * Check whether firmware indicates if this component can be updated. Report + * a suitable error message over the netlink extended ACK if the component + * cannot be updated. + * + * Returns: zero if the component can be updated, or -ECANCELED if the + * firmware indicates the component cannot be updated. + */ +static int +ice_check_component_response(struct ice_pf *pf, u16 id, u8 response, u8 code, + struct netlink_ext_ack *extack) +{ + struct device *dev = ice_pf_to_dev(pf); + const char *component; + + switch (id) { + case NVM_COMP_ID_OROM: + component = "fw.undi"; + break; + case NVM_COMP_ID_NVM: + component = "fw.mgmt"; + break; + case NVM_COMP_ID_NETLIST: + component = "fw.netlist"; + break; + default: + WARN(1, "Unexpected unknown component identifier 0x%02x", id); + return -EINVAL; + } + + dev_dbg(dev, "%s: firmware response 0x%x, code 0x%x\n", + component, response, code); + + switch (response) { + case ICE_AQ_NVM_PASS_COMP_CAN_BE_UPDATED: + /* firmware indicated this update is good to proceed */ + return 0; + case ICE_AQ_NVM_PASS_COMP_CAN_MAY_BE_UPDATEABLE: + dev_warn(dev, "firmware recommends not updating %s, as it may result in a downgrade.
Continuing anyways\n", + component); + return 0; + case ICE_AQ_NVM_PASS_COMP_CAN_NOT_BE_UPDATED: + dev_info(dev, "firmware has rejected updating %s\n", component); + break; + } + + switch (code) { + case ICE_AQ_NVM_PASS_COMP_STAMP_IDENTICAL_CODE: + dev_err(dev, "Component comparison stamp for %s is identical to the running image\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component comparison stamp is identical to running image"); + break; + case ICE_AQ_NVM_PASS_COMP_STAMP_LOWER: + dev_err(dev, "Component comparison stamp for %s is lower than the running image\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component comparison stamp is lower than running image"); + break; + case ICE_AQ_NVM_PASS_COMP_INVALID_STAMP_CODE: + dev_err(dev, "Component comparison stamp for %s is invalid\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component comparison stamp is invalid"); + break; + case ICE_AQ_NVM_PASS_COMP_CONFLICT_CODE: + dev_err(dev, "%s conflicts with a previous component table\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component table conflict occurred"); + break; + case ICE_AQ_NVM_PASS_COMP_PRE_REQ_NOT_MET_CODE: + dev_err(dev, "Pre-requisites for component %s have not been met\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component pre-requisites not met"); + break; + case ICE_AQ_NVM_PASS_COMP_NOT_SUPPORTED_CODE: + dev_err(dev, "%s is not a supported component\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component not supported"); + break; + case ICE_AQ_NVM_PASS_COMP_CANNOT_DOWNGRADE_CODE: + dev_err(dev, "Security restrictions prevent %s from being downgraded\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component cannot be downgraded"); + break; + case ICE_AQ_NVM_PASS_COMP_INCOMPLETE_IMAGE_CODE: + dev_err(dev, "Received an incomplete component image for %s\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Incomplete component image"); + break; + case ICE_AQ_NVM_PASS_COMP_VER_STR_IDENTICAL_CODE: + dev_err(dev, "Component version for %s is identical to the running image\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component version is identical to running image"); + break; + case ICE_AQ_NVM_PASS_COMP_VER_STR_LOWER_CODE: + dev_err(dev, "Component version for %s is lower than the running image\n", + component); + NL_SET_ERR_MSG_MOD(extack, "Component version is lower than the running image"); + break; + default: + dev_err(dev, "Unexpected response code 0x02%x for %s\n", + code, component); + NL_SET_ERR_MSG_MOD(extack, "Received unexpected response code from firmware"); + break; + } + + return -ECANCELED; +} + +/** + * ice_send_component_table - Send PLDM component table to firmware + * @context: PLDM fw update structure + * @component: the component to process + * @transfer_flag: relative transfer order of this component + * + * Read relevant data from the component and forward it to the device + * firmware. Check the response to determine if the firmware indicates that + * the update can proceed. + * + * This function sends AdminQ commands related to the NVM, and assumes that + * the NVM resource has been acquired. + * + * Returns: zero on success, or a negative error code on failure. 
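+ *
+ * The transfer_flag describes where this component sits in the overall
+ * transfer, i.e. whether it is the first, a middle, or the last component
+ * table being passed, as determined by the PLDM update library.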
+ */
+static int
+ice_send_component_table(struct pldmfw *context, struct pldmfw_component *component,
+			 u8 transfer_flag)
+{
+	struct ice_fwu_priv *priv = container_of(context, struct ice_fwu_priv, context);
+	struct netlink_ext_ack *extack = priv->extack;
+	struct ice_aqc_nvm_comp_tbl *comp_tbl;
+	u8 comp_response, comp_response_code;
+	struct device *dev = context->dev;
+	struct ice_pf *pf = priv->pf;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	size_t length;
+
+	switch (component->identifier) {
+	case NVM_COMP_ID_OROM:
+	case NVM_COMP_ID_NVM:
+	case NVM_COMP_ID_NETLIST:
+		break;
+	default:
+		dev_err(dev, "Unable to update due to a firmware component with unknown ID %u\n",
+			component->identifier);
+		NL_SET_ERR_MSG_MOD(extack, "Unable to update due to unknown firmware component");
+		return -EOPNOTSUPP;
+	}
+
+	length = struct_size(comp_tbl, cvs, component->version_len);
+	comp_tbl = kzalloc(length, GFP_KERNEL);
+	if (!comp_tbl)
+		return -ENOMEM;
+
+	comp_tbl->comp_class = cpu_to_le16(component->classification);
+	comp_tbl->comp_id = cpu_to_le16(component->identifier);
+	comp_tbl->comp_class_idx = FWU_COMP_CLASS_IDX_NOT_USE;
+	comp_tbl->comp_cmp_stamp = cpu_to_le32(component->comparison_stamp);
+	comp_tbl->cvs_type = component->version_type;
+	comp_tbl->cvs_len = component->version_len;
+	memcpy(comp_tbl->cvs, component->version_string, component->version_len);
+
+	dev_dbg(dev, "Sending component table to firmware\n");
+
+	status = ice_nvm_pass_component_tbl(hw, (u8 *)comp_tbl, length,
+					    transfer_flag, &comp_response,
+					    &comp_response_code, NULL);
+
+	kfree(comp_tbl);
+
+	if (status) {
+		dev_err(dev, "Failed to transfer component table to firmware, err %s aq_err %s\n",
+			ice_stat_str(status),
+			ice_aq_str(hw->adminq.sq_last_status));
+		NL_SET_ERR_MSG_MOD(extack, "Failed to transfer component table to firmware");
+		return -EIO;
+	}
+
+	return ice_check_component_response(pf, component->identifier, comp_response,
+					    comp_response_code, extack);
+}
+
+/**
+ * ice_write_one_nvm_block - Write an NVM block and await completion response
+ * @pf: the PF data structure
+ * @module: the module to write to
+ * @offset: offset in bytes
+ * @block_size: size of the block to write, up to 4k
+ * @block: pointer to block of data to write
+ * @last_cmd: whether this is the last command
+ * @extack: netlink extended ACK structure
+ *
+ * Write a block of data to a flash module, and wait for the completion
+ * response message from firmware.
+ *
+ * Note this function assumes the caller has acquired the NVM resource.
+ *
+ * Returns: zero on success, or a negative error code on failure.
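+ *
+ * Note that the completion reports the written offset split across two
+ * fields: for example, an offset of 0x12345 comes back as offset_low 0x2345
+ * and offset_high 0x1, and is reassembled as offset_low | (offset_high << 16).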
+ */
+static int
+ice_write_one_nvm_block(struct ice_pf *pf, u16 module, u32 offset,
+			u16 block_size, u8 *block, bool last_cmd,
+			struct netlink_ext_ack *extack)
+{
+	u16 completion_module, completion_retval;
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_rq_event_info event;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	u32 completion_offset;
+	int err;
+
+	memset(&event, 0, sizeof(event));
+
+	dev_dbg(dev, "Writing block of %u bytes for module 0x%02x at offset %u\n",
+		block_size, module, offset);
+
+	status = ice_aq_update_nvm(hw, module, offset, block_size, block,
+				   last_cmd, 0, NULL);
+	if (status) {
+		dev_err(dev, "Failed to flash module 0x%02x with block of size %u at offset %u, err %s aq_err %s\n",
+			module, block_size, offset, ice_stat_str(status),
+			ice_aq_str(hw->adminq.sq_last_status));
+		NL_SET_ERR_MSG_MOD(extack, "Failed to program flash module");
+		return -EIO;
+	}
+
+	/* In most cases, firmware reports a write completion within a few
+	 * milliseconds. However, it has been observed that a completion might
+	 * take more than a second in some cases. The timeout here is
+	 * conservative and is intended to prevent failure to update when
+	 * firmware is slow to respond.
+	 */
+	err = ice_aq_wait_for_event(pf, ice_aqc_opc_nvm_write, 15 * HZ, &event);
+	if (err) {
+		dev_err(dev, "Timed out while trying to flash module 0x%02x with block of size %u at offset %u, err %d\n",
+			module, block_size, offset, err);
+		NL_SET_ERR_MSG_MOD(extack, "Timed out waiting for firmware");
+		return -EIO;
+	}
+
+	completion_module = le16_to_cpu(event.desc.params.nvm.module_typeid);
+	completion_retval = le16_to_cpu(event.desc.retval);
+
+	completion_offset = le16_to_cpu(event.desc.params.nvm.offset_low);
+	completion_offset |= event.desc.params.nvm.offset_high << 16;
+
+	if (completion_module != module) {
+		dev_err(dev, "Unexpected module_typeid in write completion: got 0x%x, expected 0x%x\n",
+			completion_module, module);
+		NL_SET_ERR_MSG_MOD(extack, "Unexpected firmware response");
+		return -EIO;
+	}
+
+	if (completion_offset != offset) {
+		dev_err(dev, "Unexpected offset in write completion: got %u, expected %u\n",
+			completion_offset, offset);
+		NL_SET_ERR_MSG_MOD(extack, "Unexpected firmware response");
+		return -EIO;
+	}
+
+	if (completion_retval) {
+		dev_err(dev, "Firmware failed to flash module 0x%02x with block of size %u at offset %u, err %s\n",
+			module, block_size, offset,
+			ice_aq_str((enum ice_aq_err)completion_retval));
+		NL_SET_ERR_MSG_MOD(extack, "Firmware failed to program flash module");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_write_nvm_module - Write data to an NVM module
+ * @pf: the PF driver structure
+ * @module: the module id to program
+ * @component: the name of the component being updated
+ * @image: buffer of image data to write to the NVM
+ * @length: length of the buffer
+ * @extack: netlink extended ACK structure
+ *
+ * Loop over the data for a given NVM module and program it in 4 KB
+ * blocks. Notify the devlink core of progress after each block is
+ * programmed.
+ *
+ * Note this function assumes the caller has acquired the NVM resource.
+ *
+ * Returns: zero on success, or a negative error code on failure.
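+ *
+ * For example, a 1 MB (0x100000 byte) image is written as 256 blocks of
+ * 4 KB each, with last_cmd set only for the final block.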
+ */
+static int
+ice_write_nvm_module(struct ice_pf *pf, u16 module, const char *component,
+		     const u8 *image, u32 length,
+		     struct netlink_ext_ack *extack)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct devlink *devlink;
+	u32 offset = 0;
+	bool last_cmd;
+	u8 *block;
+	int err;
+
+	dev_dbg(dev, "Beginning write of flash component '%s', module 0x%02x\n", component, module);
+
+	devlink = priv_to_devlink(pf);
+
+	devlink_flash_update_status_notify(devlink, "Flashing",
+					   component, 0, length);
+
+	block = kzalloc(ICE_AQ_MAX_BUF_LEN, GFP_KERNEL);
+	if (!block)
+		return -ENOMEM;
+
+	do {
+		u32 block_size;
+
+		block_size = min_t(u32, ICE_AQ_MAX_BUF_LEN, length - offset);
+		last_cmd = !(offset + block_size < length);
+
+		/* ice_aq_update_nvm may copy the firmware response into the
+		 * buffer, so we must make a copy since the source data is
+		 * constant.
+		 */
+		memcpy(block, image + offset, block_size);
+
+		err = ice_write_one_nvm_block(pf, module, offset, block_size,
+					      block, last_cmd, extack);
+		if (err)
+			break;
+
+		offset += block_size;
+
+		devlink_flash_update_status_notify(devlink, "Flashing",
+						   component, offset, length);
+	} while (!last_cmd);
+
+	dev_dbg(dev, "Completed write of flash component '%s', module 0x%02x\n", component, module);
+
+	if (err)
+		devlink_flash_update_status_notify(devlink, "Flashing failed",
+						   component, length, length);
+	else
+		devlink_flash_update_status_notify(devlink, "Flashing done",
+						   component, length, length);
+
+	kfree(block);
+	return err;
+}
+
+/**
+ * ice_erase_nvm_module - Erase an NVM module and await firmware completion
+ * @pf: the PF data structure
+ * @module: the module to erase
+ * @component: name of the component being updated
+ * @extack: netlink extended ACK structure
+ *
+ * Erase the inactive NVM bank associated with this module, and wait for
+ * a completion response message from firmware.
+ *
+ * Note this function assumes the caller has acquired the NVM resource.
+ *
+ * Returns: zero on success, or a negative error code on failure.
+ */
+static int
+ice_erase_nvm_module(struct ice_pf *pf, u16 module, const char *component,
+		     struct netlink_ext_ack *extack)
+{
+	u16 completion_module, completion_retval;
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_rq_event_info event;
+	struct ice_hw *hw = &pf->hw;
+	struct devlink *devlink;
+	enum ice_status status;
+	int err;
+
+	dev_dbg(dev, "Beginning erase of flash component '%s', module 0x%02x\n", component, module);
+
+	memset(&event, 0, sizeof(event));
+
+	devlink = priv_to_devlink(pf);
+
+	devlink_flash_update_status_notify(devlink, "Erasing", component, 0, 0);
+
+	status = ice_aq_erase_nvm(hw, module, NULL);
+	if (status) {
+		dev_err(dev, "Failed to erase %s (module 0x%02x), err %s aq_err %s\n",
+			component, module, ice_stat_str(status),
+			ice_aq_str(hw->adminq.sq_last_status));
+		NL_SET_ERR_MSG_MOD(extack, "Failed to erase flash module");
+		err = -EIO;
+		goto out_notify_devlink;
+	}
+
+	/* Yes, this really can take minutes to complete */
+	err = ice_aq_wait_for_event(pf, ice_aqc_opc_nvm_erase, 300 * HZ, &event);
+	if (err) {
+		dev_err(dev, "Timed out waiting for firmware to respond with erase completion for %s (module 0x%02x), err %d\n",
+			component, module, err);
+		NL_SET_ERR_MSG_MOD(extack, "Timed out waiting for firmware");
+		goto out_notify_devlink;
+	}
+
+	completion_module = le16_to_cpu(event.desc.params.nvm.module_typeid);
+	completion_retval = le16_to_cpu(event.desc.retval);
+
+	if (completion_module != module) {
+		dev_err(dev, "Unexpected module_typeid in erase completion for %s: got 0x%x, expected 0x%x\n",
+			component, completion_module, module);
+		NL_SET_ERR_MSG_MOD(extack, "Unexpected firmware response");
+		err = -EIO;
+		goto out_notify_devlink;
+	}
+
+	if (completion_retval) {
+		dev_err(dev, "Firmware failed to erase %s (module 0x%02x), aq_err %s\n",
+			component, module,
+			ice_aq_str((enum ice_aq_err)completion_retval));
+		NL_SET_ERR_MSG_MOD(extack, "Firmware failed to erase flash");
+		err = -EIO;
+		goto out_notify_devlink;
+	}
+
+	dev_dbg(dev, "Completed erase of flash component '%s', module 0x%02x\n", component, module);
+
+out_notify_devlink:
+	if (err)
+		devlink_flash_update_status_notify(devlink, "Erasing failed",
+						   component, 0, 0);
+	else
+		devlink_flash_update_status_notify(devlink, "Erasing done",
+						   component, 0, 0);
+
+	return err;
+}
+
+/**
+ * ice_switch_flash_banks - Tell firmware to switch NVM banks
+ * @pf: Pointer to the PF data structure
+ * @activate_flags: flags used for the activation command
+ * @extack: netlink extended ACK structure
+ *
+ * Notify firmware to activate the newly written flash banks, and wait for the
+ * firmware response.
+ *
+ * Returns: zero on success or an error code on failure.
+ */
+static int ice_switch_flash_banks(struct ice_pf *pf, u8 activate_flags,
+				  struct netlink_ext_ack *extack)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_rq_event_info event;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	u16 completion_retval;
+	int err;
+
+	memset(&event, 0, sizeof(event));
+
+	status = ice_nvm_write_activate(hw, activate_flags);
+	if (status) {
+		dev_err(dev, "Failed to switch active flash banks, err %s aq_err %s\n",
+			ice_stat_str(status),
+			ice_aq_str(hw->adminq.sq_last_status));
+		NL_SET_ERR_MSG_MOD(extack, "Failed to switch active flash banks");
+		return -EIO;
+	}
+
+	/* In most cases, we expect firmware to respond with a completion
+	 * within a few milliseconds. However, it has been observed in
+	 * practice that firmware may sometimes take longer. The wait time
+	 * here is conservative to reduce the risk of a failed update simply
+	 * because we did not wait long enough for firmware to respond.
+	 */
+	err = ice_aq_wait_for_event(pf, ice_aqc_opc_nvm_write_activate, 30 * HZ,
+				    &event);
+	if (err) {
+		dev_err(dev, "Timed out waiting for firmware to switch active flash banks, err %d\n",
+			err);
+		NL_SET_ERR_MSG_MOD(extack, "Timed out waiting for firmware");
+		return err;
+	}
+
+	completion_retval = le16_to_cpu(event.desc.retval);
+	if (completion_retval) {
+		dev_err(dev, "Firmware failed to switch active flash banks aq_err %s\n",
+			ice_aq_str((enum ice_aq_err)completion_retval));
+		NL_SET_ERR_MSG_MOD(extack, "Firmware failed to switch active flash banks");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_flash_component - Flash a component of the NVM
+ * @context: PLDM fw update structure
+ * @component: the component table to program
+ *
+ * Program the flash contents for a given component. First, determine the
+ * module id. Then, erase the secondary bank for this module. Finally, write
+ * the contents of the component to the NVM.
+ *
+ * Note this function assumes the caller has acquired the NVM resource.
+ *
+ * Returns: zero on success, or a negative error code on failure.
+ */
+static int
+ice_flash_component(struct pldmfw *context, struct pldmfw_component *component)
+{
+	struct ice_fwu_priv *priv = container_of(context, struct ice_fwu_priv, context);
+	struct netlink_ext_ack *extack = priv->extack;
+	struct ice_pf *pf = priv->pf;
+	const char *name;
+	u16 module;
+	u8 flag;
+	int err;
+
+	switch (component->identifier) {
+	case NVM_COMP_ID_OROM:
+		module = ICE_SR_1ST_OROM_BANK_PTR;
+		flag = ICE_AQC_NVM_ACTIV_SEL_OROM;
+		name = "fw.undi";
+		break;
+	case NVM_COMP_ID_NVM:
+		module = ICE_SR_1ST_NVM_BANK_PTR;
+		flag = ICE_AQC_NVM_ACTIV_SEL_NVM;
+		name = "fw.mgmt";
+		break;
+	case NVM_COMP_ID_NETLIST:
+		module = ICE_SR_NETLIST_BANK_PTR;
+		flag = ICE_AQC_NVM_ACTIV_SEL_NETLIST;
+		name = "fw.netlist";
+		break;
+	default:
+		/* This should not trigger, since we check the id before
+		 * sending the component table to firmware.
+		 */
+		WARN(1, "Unexpected unknown component identifier 0x%02x",
+		     component->identifier);
+		return -EINVAL;
+	}
+
+	/* Mark this component for activating at the end */
+	priv->activate_flags |= flag;
+
+	err = ice_erase_nvm_module(pf, module, name, extack);
+	if (err)
+		return err;
+
+	return ice_write_nvm_module(pf, module, name, component->component_data,
+				    component->component_size, extack);
+}
+
+/**
+ * ice_finalize_update - Perform last steps to complete device update
+ * @context: PLDM fw update structure
+ *
+ * Called as the last step of the update process. Complete the update by
+ * telling the firmware to switch active banks, and perform a reset if so
+ * configured.
+ *
+ * Returns: 0 on success, or an error code on failure.
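+ *
+ * For example, after flashing fw.mgmt and fw.undi with a preservation level
+ * of ICE_AQC_NVM_PRESERVE_ALL, activate_flags contains that preservation
+ * level OR'ed with ICE_AQC_NVM_ACTIV_SEL_NVM and ICE_AQC_NVM_ACTIV_SEL_OROM.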
+ */
+static int ice_finalize_update(struct pldmfw *context)
+{
+	struct ice_fwu_priv *priv = container_of(context, struct ice_fwu_priv, context);
+	struct netlink_ext_ack *extack = priv->extack;
+	struct ice_pf *pf = priv->pf;
+	int err;
+
+	/* Finally, notify firmware to activate the written NVM banks */
+	err = ice_switch_flash_banks(pf, priv->activate_flags, extack);
+	if (err)
+		return err;
+
+	/* Perform an immediate reset only if PRESERVE_ALL is selected */
+	if ((priv->activate_flags & ICE_AQC_NVM_PRESERVATION_M) == ICE_AQC_NVM_PRESERVE_ALL) {
+		struct device *dev = ice_pf_to_dev(pf);
+		struct ice_hw *hw = &pf->hw;
+		enum ice_status status;
+
+		status = ice_aq_nvm_update_empr(hw);
+		if (status) {
+			dev_err(dev, "Failed to trigger immediate device reset, err %s aq_err %s\n",
+				ice_stat_str(status),
+				ice_aq_str(hw->adminq.sq_last_status));
+			NL_SET_ERR_MSG_MOD(extack, "Failed to trigger immediate device reset");
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+static const struct pldmfw_ops ice_fwu_ops = {
+	.match_record = &pldmfw_op_pci_match_record,
+	.send_package_data = &ice_send_package_data,
+	.send_component_table = &ice_send_component_table,
+	.flash_component = &ice_flash_component,
+	.finalize_update = &ice_finalize_update,
+};
+
+/**
+ * ice_flash_pldm_image - Write a PLDM-formatted firmware image to the device
+ * @pf: private device driver structure
+ * @fw: firmware object pointing to the relevant firmware file
+ * @preservation: preservation level to request from firmware
+ * @extack: netlink extended ACK structure
+ *
+ * Parse the data for a given firmware file, verifying that it is a valid PLDM
+ * formatted image that matches this device.
+ *
+ * Extract the device record Package Data and Component Tables and send them
+ * to the firmware. Extract and write the flash data for each of the three
+ * main flash components, "fw.mgmt", "fw.undi", and "fw.netlist". Notify
+ * firmware once the data is written to the inactive banks.
+ *
+ * Returns: zero on success or a negative error code on failure.
+ */
+int ice_flash_pldm_image(struct ice_pf *pf, const struct firmware *fw,
+			 u8 preservation, struct netlink_ext_ack *extack)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_hw *hw = &pf->hw;
+	struct ice_fwu_priv priv;
+	enum ice_status status;
+	int err;
+
+	switch (preservation) {
+	case ICE_AQC_NVM_PRESERVE_ALL:
+	case ICE_AQC_NVM_PRESERVE_SELECTED:
+	case ICE_AQC_NVM_NO_PRESERVATION:
+	case ICE_AQC_NVM_FACTORY_DEFAULT:
+		break;
+	default:
+		WARN(1, "Unexpected preservation level request %u", preservation);
+		return -EINVAL;
+	}
+
+	memset(&priv, 0, sizeof(priv));
+
+	priv.context.ops = &ice_fwu_ops;
+	priv.context.dev = dev;
+	priv.extack = extack;
+	priv.pf = pf;
+	priv.activate_flags = preservation;
+
+	status = ice_acquire_nvm(hw, ICE_RES_WRITE);
+	if (status) {
+		dev_err(dev, "Failed to acquire device flash lock, err %s aq_err %s\n",
+			ice_stat_str(status),
+			ice_aq_str(hw->adminq.sq_last_status));
+		NL_SET_ERR_MSG_MOD(extack, "Failed to acquire device flash lock");
+		return -EIO;
+	}
+
+	err = pldmfw_flash_image(&priv.context, fw);
+	if (err == -ENOENT) {
+		dev_err(dev, "Firmware image has no record matching this device\n");
+		NL_SET_ERR_MSG_MOD(extack, "Firmware image has no record matching this device");
+	} else if (err) {
+		/* Do not set a generic extended ACK message here. A more
+		 * specific message may already have been set by one of our
+		 * ops.
+		 */
+		dev_err(dev, "Failed to flash PLDM image, err %d\n", err);
+	}
+
+	ice_release_nvm(hw);
+
+	return err;
+}
+
+/**
+ * ice_check_for_pending_update - Check for a pending flash update
+ * @pf: the PF driver structure
+ * @component: if not NULL, the name of the component being updated
+ * @extack: netlink extended ACK structure
+ *
+ * Check whether the device already has a pending flash update. If such an
+ * update is found, cancel it so that the requested update may proceed.
+ *
+ * Returns: zero on success, or a negative error code on failure.
+ */
+int ice_check_for_pending_update(struct ice_pf *pf, const char *component,
+				 struct netlink_ext_ack *extack)
+{
+	struct devlink *devlink = priv_to_devlink(pf);
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_hw_dev_caps *dev_caps;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	u8 pending = 0;
+	int err;
+
+	dev_caps = kzalloc(sizeof(*dev_caps), GFP_KERNEL);
+	if (!dev_caps)
+		return -ENOMEM;
+
+	/* Read the most recent device capabilities from firmware. Do not use
+	 * the cached values in hw->dev_caps, because the pending update flag
+	 * may have changed, e.g. if an update was previously completed and
+	 * the system has not yet rebooted.
+	 */
+	status = ice_discover_dev_caps(hw, dev_caps);
+	if (status) {
+		NL_SET_ERR_MSG_MOD(extack, "Unable to read device capabilities");
+		kfree(dev_caps);
+		return -EIO;
+	}
+
+	if (dev_caps->common_cap.nvm_update_pending_nvm) {
+		dev_info(dev, "The fw.mgmt flash component has a pending update\n");
+		pending |= ICE_AQC_NVM_ACTIV_SEL_NVM;
+	}
+
+	if (dev_caps->common_cap.nvm_update_pending_orom) {
+		dev_info(dev, "The fw.undi flash component has a pending update\n");
+		pending |= ICE_AQC_NVM_ACTIV_SEL_OROM;
+	}
+
+	if (dev_caps->common_cap.nvm_update_pending_netlist) {
+		dev_info(dev, "The fw.netlist flash component has a pending update\n");
+		pending |= ICE_AQC_NVM_ACTIV_SEL_NETLIST;
+	}
+
+	kfree(dev_caps);
+
+	/* If the flash_update request is for a specific component, ignore all
+	 * of the other components.
+	 */
+	if (component) {
+		if (strcmp(component, "fw.mgmt") == 0)
+			pending &= ICE_AQC_NVM_ACTIV_SEL_NVM;
+		else if (strcmp(component, "fw.undi") == 0)
+			pending &= ICE_AQC_NVM_ACTIV_SEL_OROM;
+		else if (strcmp(component, "fw.netlist") == 0)
+			pending &= ICE_AQC_NVM_ACTIV_SEL_NETLIST;
+		else
+			WARN(1, "Unexpected flash component %s", component);
+	}
+
+	/* There is no previous pending update, so this request may continue */
+	if (!pending)
+		return 0;
+
+	/* In order to allow overwriting a previous pending update, notify
+	 * firmware to cancel that update by issuing the appropriate command.
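+	 *
+	 * The ICE_AQC_NVM_REVERT_LAST_ACTIV flag OR'ed in below requests that
+	 * firmware revert the last activation for the pending components
+	 * rather than activate them.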
+	 */
+	devlink_flash_update_status_notify(devlink,
+					   "Canceling previous pending update",
+					   component, 0, 0);
+
+	status = ice_acquire_nvm(hw, ICE_RES_WRITE);
+	if (status) {
+		dev_err(dev, "Failed to acquire device flash lock, err %s aq_err %s\n",
+			ice_stat_str(status),
+			ice_aq_str(hw->adminq.sq_last_status));
+		NL_SET_ERR_MSG_MOD(extack, "Failed to acquire device flash lock");
+		return -EIO;
+	}
+
+	pending |= ICE_AQC_NVM_REVERT_LAST_ACTIV;
+	err = ice_switch_flash_banks(pf, pending, extack);
+
+	ice_release_nvm(hw);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_fw_update.h b/drivers/net/ethernet/intel/ice/ice_fw_update.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e083c0f9695b05c8d3a032255b13faf4b66b2ec
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_fw_update.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_FW_UPDATE_H_
+#define _ICE_FW_UPDATE_H_
+
+int ice_flash_pldm_image(struct ice_pf *pf, const struct firmware *fw,
+			 u8 preservation, struct netlink_ext_ack *extack);
+int ice_check_for_pending_update(struct ice_pf *pf, const char *component,
+				 struct netlink_ext_ack *extack);
+
+#endif
diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c
new file mode 100644
index 0000000000000000000000000000000000000000..cee8cf7ef36b67513f1a8c6c34494f3eec1587ec
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_fwlog.c
@@ -0,0 +1,477 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#include "ice_common.h"
+#include "ice_fwlog.h"
+
+/**
+ * cache_cfg - Cache FW logging config
+ * @hw: pointer to the HW structure
+ * @cfg: config to cache
+ */
+static void cache_cfg(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
+{
+	hw->fwlog_cfg = *cfg;
+}
+
+/**
+ * valid_module_entries - validate all the module entry IDs and log levels
+ * @hw: pointer to the HW structure
+ * @entries: entries to validate
+ * @num_entries: number of entries to validate
+ */
+static bool
+valid_module_entries(struct ice_hw *hw, struct ice_fwlog_module_entry *entries,
+		     u16 num_entries)
+{
+	u16 i;
+
+	if (!entries) {
+		ice_debug(hw, ICE_DBG_FW_LOG, "Null ice_fwlog_module_entry array\n");
+		return false;
+	}
+
+	if (!num_entries) {
+		ice_debug(hw, ICE_DBG_FW_LOG, "num_entries must be non-zero\n");
+		return false;
+	}
+
+	for (i = 0; i < num_entries; i++) {
+		struct ice_fwlog_module_entry *entry = &entries[i];
+
+		if (entry->module_id >= ICE_AQC_FW_LOG_ID_MAX) {
+			ice_debug(hw, ICE_DBG_FW_LOG, "Invalid module_id %u, max valid module_id is %u\n",
+				  entry->module_id, ICE_AQC_FW_LOG_ID_MAX - 1);
+			return false;
+		}
+
+		if (entry->log_level >= ICE_FWLOG_LEVEL_INVALID) {
+			ice_debug(hw, ICE_DBG_FW_LOG, "Invalid log_level %u, max valid log_level is %u\n",
+				  entry->log_level,
+				  ICE_FWLOG_LEVEL_INVALID - 1);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * valid_cfg - validate entire configuration
+ * @hw: pointer to the HW structure
+ * @cfg: config to validate
+ */
+static bool valid_cfg(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
+{
+	if (!cfg) {
+		ice_debug(hw, ICE_DBG_FW_LOG, "Null ice_fwlog_cfg\n");
+		return false;
+	}
+
+	if (cfg->log_resolution < ICE_AQC_FW_LOG_MIN_RESOLUTION ||
+	    cfg->log_resolution > ICE_AQC_FW_LOG_MAX_RESOLUTION) {
+		ice_debug(hw, ICE_DBG_FW_LOG, "Unsupported log_resolution %u, must be between %u and %u\n",
+			  cfg->log_resolution, ICE_AQC_FW_LOG_MIN_RESOLUTION,
+			  ICE_AQC_FW_LOG_MAX_RESOLUTION);
+		return false;
+	}
+
+	if (!valid_module_entries(hw, cfg->module_entries,
+				  ICE_AQC_FW_LOG_ID_MAX))
+		return false;
+
+	return true;
+}
+
+/**
+ * ice_fwlog_init - Initialize cached structures for tracking FW logging
+ * @hw: pointer to the HW structure
+ * @cfg: config used to initialize the cached structures
+ *
+ * This function should be called on driver initialization and before calling
+ * ice_init_hw(). Firmware logging will be configured based on these settings,
+ * and the PF will also be registered for FW logging during init if
+ * ICE_FWLOG_OPTION_REGISTER_ON_INIT is set.
+ */
+enum ice_status
+ice_fwlog_init(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
+{
+	if (!valid_cfg(hw, cfg))
+		return ICE_ERR_PARAM;
+
+	cache_cfg(hw, cfg);
+
+	return 0;
+}
+
+/**
+ * ice_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30)
+ * @hw: pointer to the HW structure
+ * @entries: entries to configure
+ * @num_entries: number of @entries
+ * @options: options from ice_fwlog_cfg->options structure
+ * @log_resolution: logging resolution
+ */
+static enum ice_status
+ice_aq_fwlog_set(struct ice_hw *hw, struct ice_fwlog_module_entry *entries,
+		 u16 num_entries, u16 options, u16 log_resolution)
+{
+	struct ice_aqc_fw_log_cfg_resp *fw_modules;
+	struct ice_aqc_fw_log *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+	u16 i;
+
+	fw_modules = devm_kcalloc(ice_hw_to_dev(hw), num_entries,
+				  sizeof(*fw_modules), GFP_KERNEL);
+	if (!fw_modules)
+		return ICE_ERR_NO_MEMORY;
+
+	for (i = 0; i < num_entries; i++) {
+		fw_modules[i].module_identifier =
+			cpu_to_le16(entries[i].module_id);
+		fw_modules[i].log_level = entries[i].log_level;
+	}
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_config);
+	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+
+	cmd = &desc.params.fw_log;
+
+	cmd->cmd_flags = ICE_AQC_FW_LOG_CONF_SET_VALID;
+	cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution);
+	cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries);
+
+	if (options & ICE_FWLOG_OPTION_ARQ_ENA)
+		cmd->cmd_flags |= ICE_AQC_FW_LOG_CONF_AQ_EN;
+	if (options & ICE_FWLOG_OPTION_UART_ENA)
+		cmd->cmd_flags |= ICE_AQC_FW_LOG_CONF_UART_EN;
+
+	status = ice_aq_send_cmd(hw, &desc, fw_modules,
+				 sizeof(*fw_modules) * num_entries,
+				 NULL);
+
+	devm_kfree(ice_hw_to_dev(hw), fw_modules);
+
+	return status;
+}
+
+/**
+ * ice_fwlog_supported - Report whether the FW supports FW logging
+ * @hw: pointer to the HW structure
+ *
+ * This always returns false if called before ice_init_hw(), so it must only
+ * be called after ice_init_hw().
+ */
+bool ice_fwlog_supported(struct ice_hw *hw)
+{
+	return hw->fwlog_support_ena;
+}
+
+/**
+ * ice_fwlog_set - Set the firmware logging settings
+ * @hw: pointer to the HW structure
+ * @cfg: config used to set firmware logging
+ *
+ * This function should be called whenever the driver needs to set the firmware
+ * logging configuration. It can be called on initialization, reset, or during
+ * runtime.
+ *
+ * If the PF wishes to receive FW logging then it must register via
+ * ice_fwlog_register. Note that ice_fwlog_register does not need to be called
+ * for init.
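+ *
+ * A minimal usage sketch (hypothetical caller; values are illustrative
+ * only):
+ *
+ *	struct ice_fwlog_cfg cfg = hw->fwlog_cfg;
+ *
+ *	cfg.options |= ICE_FWLOG_OPTION_ARQ_ENA;
+ *	cfg.module_entries[0].log_level = ICE_FWLOG_LEVEL_VERBOSE;
+ *	status = ice_fwlog_set(hw, &cfg);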
+ */
+enum ice_status
+ice_fwlog_set(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
+{
+	enum ice_status status;
+
+	if (!ice_fwlog_supported(hw))
+		return ICE_ERR_NOT_SUPPORTED;
+
+	if (!valid_cfg(hw, cfg))
+		return ICE_ERR_PARAM;
+
+	status = ice_aq_fwlog_set(hw, cfg->module_entries,
+				  ICE_AQC_FW_LOG_ID_MAX, cfg->options,
+				  cfg->log_resolution);
+	if (!status)
+		cache_cfg(hw, cfg);
+
+	return status;
+}
+
+/**
+ * update_cached_entries - Update module entries in cached FW logging config
+ * @hw: pointer to the HW structure
+ * @entries: entries to cache
+ * @num_entries: number of @entries
+ */
+static void
+update_cached_entries(struct ice_hw *hw, struct ice_fwlog_module_entry *entries,
+		      u16 num_entries)
+{
+	u16 i;
+
+	for (i = 0; i < num_entries; i++) {
+		struct ice_fwlog_module_entry *updated = &entries[i];
+		u16 j;
+
+		for (j = 0; j < ICE_AQC_FW_LOG_ID_MAX; j++) {
+			struct ice_fwlog_module_entry *cached =
+				&hw->fwlog_cfg.module_entries[j];
+
+			if (cached->module_id == updated->module_id) {
+				cached->log_level = updated->log_level;
+				break;
+			}
+		}
+	}
+}
+
+/**
+ * ice_fwlog_update_modules - Update the log level of 1 or more FW logging modules
+ * @hw: pointer to the HW structure
+ * @entries: array of ice_fwlog_module_entry(s)
+ * @num_entries: number of entries
+ *
+ * This function should be called to update the log level of 1 or more FW
+ * logging modules via module ID.
+ *
+ * Only the entries passed in will be affected. All other firmware logging
+ * settings will be unaffected.
+ */
+enum ice_status
+ice_fwlog_update_modules(struct ice_hw *hw,
+			 struct ice_fwlog_module_entry *entries,
+			 u16 num_entries)
+{
+	struct ice_fwlog_cfg *cfg;
+	enum ice_status status;
+
+	if (!ice_fwlog_supported(hw))
+		return ICE_ERR_NOT_SUPPORTED;
+
+	if (!valid_module_entries(hw, entries, num_entries))
+		return ICE_ERR_PARAM;
+
+	cfg = devm_kcalloc(ice_hw_to_dev(hw), 1, sizeof(*cfg), GFP_KERNEL);
+	if (!cfg)
+		return ICE_ERR_NO_MEMORY;
+
+	status = ice_fwlog_get(hw, cfg);
+	if (status)
+		goto status_out;
+
+	status = ice_aq_fwlog_set(hw, entries, num_entries, cfg->options,
+				  cfg->log_resolution);
+	if (!status)
+		update_cached_entries(hw, entries, num_entries);
+
+status_out:
+	devm_kfree(ice_hw_to_dev(hw), cfg);
+	return status;
+}
+
+/**
+ * ice_aq_fwlog_register - Register PF for firmware logging events (0xFF31)
+ * @hw: pointer to the HW structure
+ * @reg: true to register and false to unregister
+ */
+static enum ice_status ice_aq_fwlog_register(struct ice_hw *hw, bool reg)
+{
+	struct ice_aq_desc desc;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_register);
+
+	if (reg)
+		desc.params.fw_log.cmd_flags = ICE_AQC_FW_LOG_AQ_REGISTER;
+
+	return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
+}
+
+/**
+ * ice_fwlog_register - Register the PF for firmware logging
+ * @hw: pointer to the HW structure
+ *
+ * After this call the PF will start to receive firmware logging based on the
+ * configuration set in ice_fwlog_set.
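+ *
+ * A typical sequence is therefore ice_fwlog_set() to program the desired
+ * configuration, followed by ice_fwlog_register() once the PF wants the
+ * events delivered over the ARQ.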
+ */
+enum ice_status ice_fwlog_register(struct ice_hw *hw)
+{
+	enum ice_status status;
+
+	if (!ice_fwlog_supported(hw))
+		return ICE_ERR_NOT_SUPPORTED;
+
+	status = ice_aq_fwlog_register(hw, true);
+	if (status)
+		ice_debug(hw, ICE_DBG_FW_LOG, "Failed to register for firmware logging events over ARQ\n");
+	else
+		hw->fwlog_cfg.options |= ICE_FWLOG_OPTION_IS_REGISTERED;
+
+	return status;
+}
+
+/**
+ * ice_fwlog_unregister - Unregister the PF from firmware logging
+ * @hw: pointer to the HW structure
+ */
+enum ice_status ice_fwlog_unregister(struct ice_hw *hw)
+{
+	enum ice_status status;
+
+	if (!ice_fwlog_supported(hw))
+		return ICE_ERR_NOT_SUPPORTED;
+
+	status = ice_aq_fwlog_register(hw, false);
+	if (status)
+		ice_debug(hw, ICE_DBG_FW_LOG, "Failed to unregister from firmware logging events over ARQ\n");
+	else
+		hw->fwlog_cfg.options &= ~ICE_FWLOG_OPTION_IS_REGISTERED;
+
+	return status;
+}
+
+/**
+ * ice_aq_fwlog_get - Get the current firmware logging configuration (0xFF32)
+ * @hw: pointer to the HW structure
+ * @cfg: firmware logging configuration to populate
+ */
+static enum ice_status
+ice_aq_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
+{
+	struct ice_aqc_fw_log_cfg_resp *fw_modules;
+	struct ice_aqc_fw_log *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+	u16 i, module_id_cnt;
+	void *buf;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	buf = devm_kcalloc(ice_hw_to_dev(hw), 1, ICE_AQ_MAX_BUF_LEN,
+			   GFP_KERNEL);
+	if (!buf)
+		return ICE_ERR_NO_MEMORY;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_query);
+	cmd = &desc.params.fw_log;
+
+	cmd->cmd_flags = ICE_AQC_FW_LOG_AQ_QUERY;
+
+	status = ice_aq_send_cmd(hw, &desc, buf, ICE_AQ_MAX_BUF_LEN, NULL);
+	if (status) {
+		ice_debug(hw, ICE_DBG_FW_LOG, "Failed to get FW log configuration\n");
+		goto status_out;
+	}
+
+	module_id_cnt = le16_to_cpu(cmd->ops.cfg.mdl_cnt);
+	if (module_id_cnt < ICE_AQC_FW_LOG_ID_MAX) {
+		ice_debug(hw, ICE_DBG_FW_LOG, "FW returned less than the expected number of FW log module IDs\n");
+	} else {
+		if (module_id_cnt > ICE_AQC_FW_LOG_ID_MAX)
+			ice_debug(hw, ICE_DBG_FW_LOG, "FW returned more than expected number of FW log module IDs, setting module_id_cnt to software expected max %u\n",
+				  ICE_AQC_FW_LOG_ID_MAX);
+		module_id_cnt = ICE_AQC_FW_LOG_ID_MAX;
+	}
+
+	cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution);
+	if (cmd->cmd_flags & ICE_AQC_FW_LOG_CONF_AQ_EN)
+		cfg->options |= ICE_FWLOG_OPTION_ARQ_ENA;
+	if (cmd->cmd_flags & ICE_AQC_FW_LOG_CONF_UART_EN)
+		cfg->options |= ICE_FWLOG_OPTION_UART_ENA;
+	if (cmd->cmd_flags & ICE_AQC_FW_LOG_QUERY_REGISTERED)
+		cfg->options |= ICE_FWLOG_OPTION_IS_REGISTERED;
+
+	fw_modules = (struct ice_aqc_fw_log_cfg_resp *)buf;
+
+	for (i = 0; i < module_id_cnt; i++) {
+		struct ice_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i];
+
+		cfg->module_entries[i].module_id =
+			le16_to_cpu(fw_module->module_identifier);
+		cfg->module_entries[i].log_level = fw_module->log_level;
+	}
+
+status_out:
+	devm_kfree(ice_hw_to_dev(hw), buf);
+	return status;
+}
+
+/**
+ * ice_fwlog_set_support_ena - Set if FW logging is supported by FW
+ * @hw: pointer to the HW struct
+ *
+ * If FW returns success to the ice_aq_fwlog_get call then it supports FW
+ * logging, else it doesn't. Set the fwlog_support_ena flag accordingly.
+ *
+ * This function is only meant to be called during driver init to determine if
+ * the FW supports FW logging.
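+ *
+ * The expected ordering during probe is therefore (a sketch; this assumes
+ * the function is invoked from within ice_init_hw()):
+ *
+ *	ice_fwlog_init(hw, &cfg);	// cache the desired configuration
+ *	ice_init_hw(hw);		// sets hw->fwlog_support_ena
+ *	ice_fwlog_register(hw);		// optional, to receive events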
+ */
+void ice_fwlog_set_support_ena(struct ice_hw *hw)
+{
+	struct ice_fwlog_cfg *cfg;
+	enum ice_status status;
+
+	hw->fwlog_support_ena = false;
+
+	cfg = devm_kcalloc(ice_hw_to_dev(hw), 1, sizeof(*cfg), GFP_KERNEL);
+	if (!cfg)
+		return;
+
+	/* don't call ice_fwlog_get() because that would overwrite the cached
+	 * configuration from the call to ice_fwlog_init(), which is expected to
+	 * be called prior to this function
+	 */
+	status = ice_aq_fwlog_get(hw, cfg);
+	if (status)
+		ice_debug(hw, ICE_DBG_FW_LOG, "ice_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n",
+			  status);
+	else
+		hw->fwlog_support_ena = true;
+
+	devm_kfree(ice_hw_to_dev(hw), cfg);
+}
+
+/**
+ * ice_fwlog_get - Get the firmware logging settings
+ * @hw: pointer to the HW structure
+ * @cfg: config to populate based on current firmware logging settings
+ */
+enum ice_status
+ice_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
+{
+	enum ice_status status;
+
+	if (!ice_fwlog_supported(hw))
+		return ICE_ERR_NOT_SUPPORTED;
+
+	if (!cfg)
+		return ICE_ERR_PARAM;
+
+	status = ice_aq_fwlog_get(hw, cfg);
+	if (status)
+		return status;
+
+	cache_cfg(hw, cfg);
+
+	return 0;
+}
+
+/**
+ * ice_fwlog_event_dump - Dump the event received over the Admin Receive Queue
+ * @hw: pointer to the HW structure
+ * @desc: Admin Receive Queue descriptor
+ * @buf: buffer that contains the FW log event data
+ *
+ * If the driver receives the ice_aqc_opc_fw_logs_event on the Admin Receive
+ * Queue, then it should call this function to dump the FW log data.
+ */
+void
+ice_fwlog_event_dump(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf)
+{
+	if (!ice_fwlog_supported(hw))
+		return;
+
+	ice_info_fwlog(hw, 32, 1, buf, le16_to_cpu(desc->datalen));
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.h b/drivers/net/ethernet/intel/ice/ice_fwlog.h
new file mode 100644
index 0000000000000000000000000000000000000000..0914cb7c627b4fd664ce3d350cb0e93c362235fe
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_fwlog.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_FWLOG_H_
+#define _ICE_FWLOG_H_
+#include "ice_adminq_cmd.h"
+
+struct ice_hw;
+
+/* Only a single log level should be set and all log levels under the set value
+ * are enabled, e.g. if log level is set to ICE_FWLOG_LEVEL_VERBOSE, then all
+ * other log levels are included (except ICE_FWLOG_LEVEL_NONE)
+ */
+enum ice_fwlog_level {
+	ICE_FWLOG_LEVEL_NONE = 0,
+	ICE_FWLOG_LEVEL_ERROR = 1,
+	ICE_FWLOG_LEVEL_WARNING = 2,
+	ICE_FWLOG_LEVEL_NORMAL = 3,
+	ICE_FWLOG_LEVEL_VERBOSE = 4,
+	ICE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */
+};
+
+struct ice_fwlog_module_entry {
+	/* module ID for the corresponding firmware logging event */
+	u16 module_id;
+	/* verbosity level for the module_id */
+	u8 log_level;
+};
+
+struct ice_fwlog_cfg {
+	/* list of modules for configuring log level */
+	struct ice_fwlog_module_entry module_entries[ICE_AQC_FW_LOG_ID_MAX];
+#define ICE_FWLOG_OPTION_ARQ_ENA		BIT(0)
+#define ICE_FWLOG_OPTION_UART_ENA		BIT(1)
+	/* set before calling ice_fwlog_init() so the PF registers for firmware
+	 * logging on initialization
+	 */
+#define ICE_FWLOG_OPTION_REGISTER_ON_INIT	BIT(2)
+	/* set in the ice_fwlog_get() response if the PF is registered for FW
+	 * logging events over ARQ
+	 */
+#define ICE_FWLOG_OPTION_IS_REGISTERED		BIT(3)
+	/* options used to configure firmware logging */
+	u16 options;
+	/* minimum number of log events sent per Admin Receive Queue event */
+	u8 log_resolution;
+};
+
+void ice_fwlog_set_support_ena(struct ice_hw *hw);
+bool ice_fwlog_supported(struct ice_hw *hw);
+enum ice_status ice_fwlog_init(struct ice_hw *hw, struct ice_fwlog_cfg *cfg);
+enum ice_status ice_fwlog_set(struct ice_hw *hw, struct ice_fwlog_cfg *cfg);
+enum ice_status ice_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg);
+enum ice_status
+ice_fwlog_update_modules(struct ice_hw *hw,
+			 struct ice_fwlog_module_entry *entries,
+			 u16 num_entries);
+enum ice_status ice_fwlog_register(struct ice_hw *hw);
+enum ice_status ice_fwlog_unregister(struct ice_hw *hw);
+void
+ice_fwlog_event_dump(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf);
+#endif /* _ICE_FWLOG_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
index 9138b19de87e02237a7a62e5694eb3aa4cdda94a..a42c1af118a8ed89fed754fdb7d37f07aef361f0 100644
--- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
+++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
@@ -1,63 +1,3006 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2018, Intel Corporation. */
-
-/* Machine-generated file */
+/* Copyright (C) 2018-2021, Intel Corporation.
*/ +/* Machine-generated file; do not edit */ #ifndef _ICE_HW_AUTOGEN_H_ #define _ICE_HW_AUTOGEN_H_ -#define PF0INT_ITR_0(_i) (0x03000004 + ((_i) * 4096)) -#define PF0INT_ITR_1(_i) (0x03000008 + ((_i) * 4096)) -#define PF0INT_ITR_2(_i) (0x0300000C + ((_i) * 4096)) -#define QTX_COMM_DBELL(_DBQM) (0x002C0000 + ((_DBQM) * 4)) -#define QTX_COMM_HEAD(_DBQM) (0x000E0000 + ((_DBQM) * 4)) + + +#define GL_RDPU_CNTRL 0x00052054 /* Reset Source: CORER */ +#define GL_RDPU_CNTRL_RX_PAD_EN_S 0 +#define GL_RDPU_CNTRL_RX_PAD_EN_M BIT(0) +#define GL_RDPU_CNTRL_UDP_ZERO_EN_S 1 +#define GL_RDPU_CNTRL_UDP_ZERO_EN_M BIT(1) +#define GL_RDPU_CNTRL_BLNC_EN_S 2 +#define GL_RDPU_CNTRL_BLNC_EN_M BIT(2) +#define GL_RDPU_CNTRL_RECIPE_BYPASS_S 3 +#define GL_RDPU_CNTRL_RECIPE_BYPASS_M BIT(3) +#define GL_RDPU_CNTRL_RLAN_ACK_REQ_PM_TH_S 4 +#define GL_RDPU_CNTRL_RLAN_ACK_REQ_PM_TH_M ICE_M(0x3F, 4) +#define GL_RDPU_CNTRL_PE_ACK_REQ_PM_TH_S 10 +#define GL_RDPU_CNTRL_PE_ACK_REQ_PM_TH_M ICE_M(0x3F, 10) +#define GL_RDPU_CNTRL_REQ_WB_PM_TH_S 16 +#define GL_RDPU_CNTRL_REQ_WB_PM_TH_M ICE_M(0x1F, 16) +#define GL_RDPU_CNTRL_ECO_S 21 +#define GL_RDPU_CNTRL_ECO_M ICE_M(0x7FF, 21) +#define MSIX_PBA(_i) (0x00008000 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: FLR */ +#define MSIX_PBA_MAX_INDEX 2 +#define MSIX_PBA_PENBIT_S 0 +#define MSIX_PBA_PENBIT_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TADD(_i) (0x00000000 + ((_i) * 16)) /* _i=0...64 */ /* Reset Source: FLR */ +#define MSIX_TADD_MAX_INDEX 64 +#define MSIX_TADD_MSIXTADD10_S 0 +#define MSIX_TADD_MSIXTADD10_M ICE_M(0x3, 0) +#define MSIX_TADD_MSIXTADD_S 2 +#define MSIX_TADD_MSIXTADD_M ICE_M(0x3FFFFFFF, 2) +#define MSIX_TUADD(_i) (0x00000004 + ((_i) * 16)) /* _i=0...64 */ /* Reset Source: FLR */ +#define MSIX_TUADD_MAX_INDEX 64 +#define MSIX_TUADD_MSIXTUADD_S 0 +#define MSIX_TUADD_MSIXTUADD_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TVCTRL(_i) (0x0000000C + ((_i) * 16)) /* _i=0...64 */ /* Reset Source: FLR */ +#define MSIX_TVCTRL_MAX_INDEX 64 +#define MSIX_TVCTRL_MASK_S 0 +#define MSIX_TVCTRL_MASK_M BIT(0) +#define PF0_FW_HLP_ARQBAH_PAGE 0x02D00180 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_FW_HLP_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_HLP_ARQBAL_PAGE 0x02D00080 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_FW_HLP_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_HLP_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_FW_HLP_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_HLP_ARQH_PAGE 0x02D00380 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQH_PAGE_ARQH_S 0 +#define PF0_FW_HLP_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ARQLEN_PAGE 0x02D00280 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_FW_HLP_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_FW_HLP_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_FW_HLP_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_FW_HLP_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_FW_HLP_ARQT_PAGE 0x02D00480 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ARQT_PAGE_ARQT_S 0 +#define PF0_FW_HLP_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQBAH_PAGE 0x02D00100 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_FW_HLP_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define 
PF0_FW_HLP_ATQBAL_PAGE 0x02D00000 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQBAL_PAGE_ATQBAL_LSB_S 0 +#define PF0_FW_HLP_ATQBAL_PAGE_ATQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_HLP_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_FW_HLP_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_HLP_ATQH_PAGE 0x02D00300 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQH_PAGE_ATQH_S 0 +#define PF0_FW_HLP_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQLEN_PAGE 0x02D00200 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_HLP_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_FW_HLP_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_FW_HLP_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_FW_HLP_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_FW_HLP_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_FW_HLP_ATQT_PAGE 0x02D00400 /* Reset Source: EMPR */ +#define PF0_FW_HLP_ATQT_PAGE_ATQT_S 0 +#define PF0_FW_HLP_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQBAH_PAGE 0x02D40180 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_FW_PSM_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_PSM_ARQBAL_PAGE 0x02D40080 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_FW_PSM_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_PSM_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_FW_PSM_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_PSM_ARQH_PAGE 0x02D40380 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQH_PAGE_ARQH_S 0 +#define PF0_FW_PSM_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQLEN_PAGE 0x02D40280 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_FW_PSM_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_FW_PSM_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_FW_PSM_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_FW_PSM_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_FW_PSM_ARQT_PAGE 0x02D40480 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ARQT_PAGE_ARQT_S 0 +#define PF0_FW_PSM_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQBAH_PAGE 0x02D40100 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_FW_PSM_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_FW_PSM_ATQBAL_PAGE 0x02D40000 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQBAL_PAGE_ATQBAL_LSB_S 0 +#define PF0_FW_PSM_ATQBAL_PAGE_ATQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_FW_PSM_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_FW_PSM_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_FW_PSM_ATQH_PAGE 0x02D40300 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQH_PAGE_ATQH_S 0 +#define PF0_FW_PSM_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQLEN_PAGE 0x02D40200 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_FW_PSM_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_FW_PSM_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_FW_PSM_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define 
PF0_FW_PSM_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_FW_PSM_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_FW_PSM_ATQT_PAGE 0x02D40400 /* Reset Source: EMPR */ +#define PF0_FW_PSM_ATQT_PAGE_ATQT_S 0 +#define PF0_FW_PSM_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQBAH_PAGE 0x02D80190 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_MBX_CPM_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_CPM_ARQBAL_PAGE 0x02D80090 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_MBX_CPM_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_CPM_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_MBX_CPM_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_CPM_ARQH_PAGE 0x02D80390 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQH_PAGE_ARQH_S 0 +#define PF0_MBX_CPM_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQLEN_PAGE 0x02D80290 /* Reset Source: PFR */ +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_MBX_CPM_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_MBX_CPM_ARQT_PAGE 0x02D80490 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ARQT_PAGE_ARQT_S 0 +#define PF0_MBX_CPM_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQBAH_PAGE 0x02D80110 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_MBX_CPM_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_CPM_ATQBAL_PAGE 0x02D80010 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_MBX_CPM_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_CPM_ATQH_PAGE 0x02D80310 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQH_PAGE_ATQH_S 0 +#define PF0_MBX_CPM_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQLEN_PAGE 0x02D80210 /* Reset Source: PFR */ +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_MBX_CPM_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_MBX_CPM_ATQT_PAGE 0x02D80410 /* Reset Source: CORER */ +#define PF0_MBX_CPM_ATQT_PAGE_ATQT_S 0 +#define PF0_MBX_CPM_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQBAH_PAGE 0x02D00190 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_MBX_HLP_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_HLP_ARQBAL_PAGE 0x02D00090 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_MBX_HLP_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_HLP_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_MBX_HLP_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_HLP_ARQH_PAGE 0x02D00390 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQH_PAGE_ARQH_S 0 +#define PF0_MBX_HLP_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQLEN_PAGE 0x02D00290 /* Reset Source: PFR */ +#define 
PF0_MBX_HLP_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_MBX_HLP_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_MBX_HLP_ARQT_PAGE 0x02D00490 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ARQT_PAGE_ARQT_S 0 +#define PF0_MBX_HLP_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQBAH_PAGE 0x02D00110 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_MBX_HLP_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_HLP_ATQBAL_PAGE 0x02D00010 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_MBX_HLP_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_HLP_ATQH_PAGE 0x02D00310 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQH_PAGE_ATQH_S 0 +#define PF0_MBX_HLP_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQLEN_PAGE 0x02D00210 /* Reset Source: PFR */ +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_MBX_HLP_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_MBX_HLP_ATQT_PAGE 0x02D00410 /* Reset Source: CORER */ +#define PF0_MBX_HLP_ATQT_PAGE_ATQT_S 0 +#define PF0_MBX_HLP_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQBAH_PAGE 0x02D40190 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_MBX_PSM_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_PSM_ARQBAL_PAGE 0x02D40090 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_MBX_PSM_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_MBX_PSM_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_MBX_PSM_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_PSM_ARQH_PAGE 0x02D40390 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQH_PAGE_ARQH_S 0 +#define PF0_MBX_PSM_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQLEN_PAGE 0x02D40290 /* Reset Source: PFR */ +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_MBX_PSM_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_MBX_PSM_ARQT_PAGE 0x02D40490 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ARQT_PAGE_ARQT_S 0 +#define PF0_MBX_PSM_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQBAH_PAGE 0x02D40110 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_MBX_PSM_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_MBX_PSM_ATQBAL_PAGE 0x02D40010 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQBAL_PAGE_ATQBAL_S 6 +#define 
PF0_MBX_PSM_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_MBX_PSM_ATQH_PAGE 0x02D40310 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQH_PAGE_ATQH_S 0 +#define PF0_MBX_PSM_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQLEN_PAGE 0x02D40210 /* Reset Source: PFR */ +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_MBX_PSM_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_MBX_PSM_ATQT_PAGE 0x02D40410 /* Reset Source: CORER */ +#define PF0_MBX_PSM_ATQT_PAGE_ATQT_S 0 +#define PF0_MBX_PSM_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQBAH_PAGE 0x02D801A0 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQBAH_PAGE_ARQBAH_S 0 +#define PF0_SB_CPM_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_CPM_ARQBAL_PAGE 0x02D800A0 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQBAL_PAGE_ARQBAL_LSB_S 0 +#define PF0_SB_CPM_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define PF0_SB_CPM_ARQBAL_PAGE_ARQBAL_S 6 +#define PF0_SB_CPM_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_CPM_ARQH_PAGE 0x02D803A0 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQH_PAGE_ARQH_S 0 +#define PF0_SB_CPM_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQLEN_PAGE 0x02D802A0 /* Reset Source: PFR */ +#define PF0_SB_CPM_ARQLEN_PAGE_ARQLEN_S 0 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ARQLEN_PAGE_ARQVFE_S 28 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQVFE_M BIT(28) +#define PF0_SB_CPM_ARQLEN_PAGE_ARQOVFL_S 29 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQOVFL_M BIT(29) +#define PF0_SB_CPM_ARQLEN_PAGE_ARQCRIT_S 30 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQCRIT_M BIT(30) +#define PF0_SB_CPM_ARQLEN_PAGE_ARQENABLE_S 31 +#define PF0_SB_CPM_ARQLEN_PAGE_ARQENABLE_M BIT(31) +#define PF0_SB_CPM_ARQT_PAGE 0x02D804A0 /* Reset Source: CORER */ +#define PF0_SB_CPM_ARQT_PAGE_ARQT_S 0 +#define PF0_SB_CPM_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQBAH_PAGE 0x02D80120 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQBAH_PAGE_ATQBAH_S 0 +#define PF0_SB_CPM_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define PF0_SB_CPM_ATQBAL_PAGE 0x02D80020 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQBAL_PAGE_ATQBAL_S 6 +#define PF0_SB_CPM_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define PF0_SB_CPM_ATQH_PAGE 0x02D80320 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQH_PAGE_ATQH_S 0 +#define PF0_SB_CPM_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQLEN_PAGE 0x02D80220 /* Reset Source: PFR */ +#define PF0_SB_CPM_ATQLEN_PAGE_ATQLEN_S 0 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0) +#define PF0_SB_CPM_ATQLEN_PAGE_ATQVFE_S 28 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQVFE_M BIT(28) +#define PF0_SB_CPM_ATQLEN_PAGE_ATQOVFL_S 29 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQOVFL_M BIT(29) +#define PF0_SB_CPM_ATQLEN_PAGE_ATQCRIT_S 30 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQCRIT_M BIT(30) +#define PF0_SB_CPM_ATQLEN_PAGE_ATQENABLE_S 31 +#define PF0_SB_CPM_ATQLEN_PAGE_ATQENABLE_M BIT(31) +#define PF0_SB_CPM_ATQT_PAGE 0x02D80420 /* Reset Source: CORER */ +#define PF0_SB_CPM_ATQT_PAGE_ATQT_S 0 +#define PF0_SB_CPM_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_ARQBAH_PAGE 0x02D001A0 /* Reset Source: CORER */ +#define 
+#define PF0_SB_HLP_ARQBAH_PAGE_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_SB_HLP_ARQBAL_PAGE 0x02D000A0 /* Reset Source: CORER */
+#define PF0_SB_HLP_ARQBAL_PAGE_ARQBAL_LSB_S 0
+#define PF0_SB_HLP_ARQBAL_PAGE_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_SB_HLP_ARQBAL_PAGE_ARQBAL_S 6
+#define PF0_SB_HLP_ARQBAL_PAGE_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_SB_HLP_ARQH_PAGE 0x02D003A0 /* Reset Source: CORER */
+#define PF0_SB_HLP_ARQH_PAGE_ARQH_S 0
+#define PF0_SB_HLP_ARQH_PAGE_ARQH_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ARQLEN_PAGE 0x02D002A0 /* Reset Source: PFR */
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQLEN_S 0
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQVFE_S 28
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQVFE_M BIT(28)
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQOVFL_S 29
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQOVFL_M BIT(29)
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQCRIT_S 30
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQCRIT_M BIT(30)
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQENABLE_S 31
+#define PF0_SB_HLP_ARQLEN_PAGE_ARQENABLE_M BIT(31)
+#define PF0_SB_HLP_ARQT_PAGE 0x02D004A0 /* Reset Source: CORER */
+#define PF0_SB_HLP_ARQT_PAGE_ARQT_S 0
+#define PF0_SB_HLP_ARQT_PAGE_ARQT_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ATQBAH_PAGE 0x02D00120 /* Reset Source: CORER */
+#define PF0_SB_HLP_ATQBAH_PAGE_ATQBAH_S 0
+#define PF0_SB_HLP_ATQBAH_PAGE_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_SB_HLP_ATQBAL_PAGE 0x02D00020 /* Reset Source: CORER */
+#define PF0_SB_HLP_ATQBAL_PAGE_ATQBAL_S 6
+#define PF0_SB_HLP_ATQBAL_PAGE_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_SB_HLP_ATQH_PAGE 0x02D00320 /* Reset Source: CORER */
+#define PF0_SB_HLP_ATQH_PAGE_ATQH_S 0
+#define PF0_SB_HLP_ATQH_PAGE_ATQH_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ATQLEN_PAGE 0x02D00220 /* Reset Source: PFR */
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQLEN_S 0
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQVFE_S 28
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQVFE_M BIT(28)
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQOVFL_S 29
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQOVFL_M BIT(29)
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQCRIT_S 30
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQCRIT_M BIT(30)
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQENABLE_S 31
+#define PF0_SB_HLP_ATQLEN_PAGE_ATQENABLE_M BIT(31)
+#define PF0_SB_HLP_ATQT_PAGE 0x02D00420 /* Reset Source: CORER */
+#define PF0_SB_HLP_ATQT_PAGE_ATQT_S 0
+#define PF0_SB_HLP_ATQT_PAGE_ATQT_M ICE_M(0x3FF, 0)
+#define PF0INT_DYN_CTL(_i) (0x03000000 + ((_i) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */
+#define PF0INT_DYN_CTL_MAX_INDEX 2047
+#define PF0INT_DYN_CTL_INTENA_S 0
+#define PF0INT_DYN_CTL_INTENA_M BIT(0)
+#define PF0INT_DYN_CTL_CLEARPBA_S 1
+#define PF0INT_DYN_CTL_CLEARPBA_M BIT(1)
+#define PF0INT_DYN_CTL_SWINT_TRIG_S 2
+#define PF0INT_DYN_CTL_SWINT_TRIG_M BIT(2)
+#define PF0INT_DYN_CTL_ITR_INDX_S 3
+#define PF0INT_DYN_CTL_ITR_INDX_M ICE_M(0x3, 3)
+#define PF0INT_DYN_CTL_INTERVAL_S 5
+#define PF0INT_DYN_CTL_INTERVAL_M ICE_M(0xFFF, 5)
+#define PF0INT_DYN_CTL_SW_ITR_INDX_ENA_S 24
+#define PF0INT_DYN_CTL_SW_ITR_INDX_ENA_M BIT(24)
+#define PF0INT_DYN_CTL_SW_ITR_INDX_S 25
+#define PF0INT_DYN_CTL_SW_ITR_INDX_M ICE_M(0x3, 25)
+#define PF0INT_DYN_CTL_WB_ON_ITR_S 30
+#define PF0INT_DYN_CTL_WB_ON_ITR_M BIT(30)
+#define PF0INT_DYN_CTL_INTENA_MSK_S 31
+#define PF0INT_DYN_CTL_INTENA_MSK_M BIT(31)
+#define PF0INT_ITR_0(_i) (0x03000004 + ((_i) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */
+#define PF0INT_ITR_0_MAX_INDEX 2047
+#define PF0INT_ITR_0_INTERVAL_S 0
+#define PF0INT_ITR_0_INTERVAL_M ICE_M(0xFFF, 0)
+#define PF0INT_ITR_1(_i) (0x03000008 + ((_i) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */
+#define PF0INT_ITR_1_MAX_INDEX 2047
+#define PF0INT_ITR_1_INTERVAL_S 0
+#define PF0INT_ITR_1_INTERVAL_M ICE_M(0xFFF, 0)
+#define PF0INT_ITR_2(_i) (0x0300000C + ((_i) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */
+#define PF0INT_ITR_2_MAX_INDEX 2047
+#define PF0INT_ITR_2_INTERVAL_S 0
+#define PF0INT_ITR_2_INTERVAL_M ICE_M(0xFFF, 0)
+#define PF0INT_OICR_CPM_PAGE 0x02D03000 /* Reset Source: CORER */
+#define PF0INT_OICR_CPM_PAGE_INTEVENT_S 0
+#define PF0INT_OICR_CPM_PAGE_INTEVENT_M BIT(0)
+#define PF0INT_OICR_CPM_PAGE_QUEUE_S 1
+#define PF0INT_OICR_CPM_PAGE_QUEUE_M BIT(1)
+#define PF0INT_OICR_CPM_PAGE_RSV1_S 2
+#define PF0INT_OICR_CPM_PAGE_RSV1_M ICE_M(0xFF, 2)
+#define PF0INT_OICR_CPM_PAGE_HH_COMP_S 10
+#define PF0INT_OICR_CPM_PAGE_HH_COMP_M BIT(10)
+#define PF0INT_OICR_CPM_PAGE_TSYN_TX_S 11
+#define PF0INT_OICR_CPM_PAGE_TSYN_TX_M BIT(11)
+#define PF0INT_OICR_CPM_PAGE_TSYN_EVNT_S 12
+#define PF0INT_OICR_CPM_PAGE_TSYN_EVNT_M BIT(12)
+#define PF0INT_OICR_CPM_PAGE_TSYN_TGT_S 13
+#define PF0INT_OICR_CPM_PAGE_TSYN_TGT_M BIT(13)
+#define PF0INT_OICR_CPM_PAGE_HLP_RDY_S 14
+#define PF0INT_OICR_CPM_PAGE_HLP_RDY_M BIT(14)
+#define PF0INT_OICR_CPM_PAGE_CPM_RDY_S 15
+#define PF0INT_OICR_CPM_PAGE_CPM_RDY_M BIT(15)
+#define PF0INT_OICR_CPM_PAGE_ECC_ERR_S 16
+#define PF0INT_OICR_CPM_PAGE_ECC_ERR_M BIT(16)
+#define PF0INT_OICR_CPM_PAGE_RSV2_S 17
+#define PF0INT_OICR_CPM_PAGE_RSV2_M ICE_M(0x3, 17)
+#define PF0INT_OICR_CPM_PAGE_MAL_DETECT_S 19
+#define PF0INT_OICR_CPM_PAGE_MAL_DETECT_M BIT(19)
+#define PF0INT_OICR_CPM_PAGE_GRST_S 20
+#define PF0INT_OICR_CPM_PAGE_GRST_M BIT(20)
+#define PF0INT_OICR_CPM_PAGE_PCI_EXCEPTION_S 21
+#define PF0INT_OICR_CPM_PAGE_PCI_EXCEPTION_M BIT(21)
+#define PF0INT_OICR_CPM_PAGE_GPIO_S 22
+#define PF0INT_OICR_CPM_PAGE_GPIO_M BIT(22)
+#define PF0INT_OICR_CPM_PAGE_RSV3_S 23
+#define PF0INT_OICR_CPM_PAGE_RSV3_M BIT(23)
+#define PF0INT_OICR_CPM_PAGE_STORM_DETECT_S 24
+#define PF0INT_OICR_CPM_PAGE_STORM_DETECT_M BIT(24)
+#define PF0INT_OICR_CPM_PAGE_LINK_STAT_CHANGE_S 25
+#define PF0INT_OICR_CPM_PAGE_LINK_STAT_CHANGE_M BIT(25)
+#define PF0INT_OICR_CPM_PAGE_HMC_ERR_S 26
+#define PF0INT_OICR_CPM_PAGE_HMC_ERR_M BIT(26)
+#define PF0INT_OICR_CPM_PAGE_PE_PUSH_S 27
+#define PF0INT_OICR_CPM_PAGE_PE_PUSH_M BIT(27)
+#define PF0INT_OICR_CPM_PAGE_PE_CRITERR_S 28
+#define PF0INT_OICR_CPM_PAGE_PE_CRITERR_M BIT(28)
+#define PF0INT_OICR_CPM_PAGE_VFLR_S 29
+#define PF0INT_OICR_CPM_PAGE_VFLR_M BIT(29)
+#define PF0INT_OICR_CPM_PAGE_XLR_HW_DONE_S 30
+#define PF0INT_OICR_CPM_PAGE_XLR_HW_DONE_M BIT(30)
+#define PF0INT_OICR_CPM_PAGE_SWINT_S 31
+#define PF0INT_OICR_CPM_PAGE_SWINT_M BIT(31)
+#define PF0INT_OICR_ENA_CPM_PAGE 0x02D03100 /* Reset Source: CORER */
+#define PF0INT_OICR_ENA_CPM_PAGE_RSV0_S 0
+#define PF0INT_OICR_ENA_CPM_PAGE_RSV0_M BIT(0)
+#define PF0INT_OICR_ENA_CPM_PAGE_INT_ENA_S 1
+#define PF0INT_OICR_ENA_CPM_PAGE_INT_ENA_M ICE_M(0x7FFFFFFF, 1)
+#define PF0INT_OICR_ENA_HLP_PAGE 0x02D01100 /* Reset Source: CORER */
+#define PF0INT_OICR_ENA_HLP_PAGE_RSV0_S 0
+#define PF0INT_OICR_ENA_HLP_PAGE_RSV0_M BIT(0)
+#define PF0INT_OICR_ENA_HLP_PAGE_INT_ENA_S 1
+#define PF0INT_OICR_ENA_HLP_PAGE_INT_ENA_M ICE_M(0x7FFFFFFF, 1)
+#define PF0INT_OICR_ENA_PSM_PAGE 0x02D02100 /* Reset Source: CORER */
+#define PF0INT_OICR_ENA_PSM_PAGE_RSV0_S 0
+#define PF0INT_OICR_ENA_PSM_PAGE_RSV0_M BIT(0)
+#define PF0INT_OICR_ENA_PSM_PAGE_INT_ENA_S 1
+#define PF0INT_OICR_ENA_PSM_PAGE_INT_ENA_M ICE_M(0x7FFFFFFF, 1)
+#define PF0INT_OICR_HLP_PAGE 0x02D01000 /* Reset Source: CORER */
+#define PF0INT_OICR_HLP_PAGE_INTEVENT_S 0
+#define PF0INT_OICR_HLP_PAGE_INTEVENT_M BIT(0)
+#define PF0INT_OICR_HLP_PAGE_QUEUE_S 1
+#define PF0INT_OICR_HLP_PAGE_QUEUE_M BIT(1)
+#define PF0INT_OICR_HLP_PAGE_RSV1_S 2
+#define PF0INT_OICR_HLP_PAGE_RSV1_M ICE_M(0xFF, 2)
+#define PF0INT_OICR_HLP_PAGE_HH_COMP_S 10
+#define PF0INT_OICR_HLP_PAGE_HH_COMP_M BIT(10)
+#define PF0INT_OICR_HLP_PAGE_TSYN_TX_S 11
+#define PF0INT_OICR_HLP_PAGE_TSYN_TX_M BIT(11)
+#define PF0INT_OICR_HLP_PAGE_TSYN_EVNT_S 12
+#define PF0INT_OICR_HLP_PAGE_TSYN_EVNT_M BIT(12)
+#define PF0INT_OICR_HLP_PAGE_TSYN_TGT_S 13
+#define PF0INT_OICR_HLP_PAGE_TSYN_TGT_M BIT(13)
+#define PF0INT_OICR_HLP_PAGE_HLP_RDY_S 14
+#define PF0INT_OICR_HLP_PAGE_HLP_RDY_M BIT(14)
+#define PF0INT_OICR_HLP_PAGE_CPM_RDY_S 15
+#define PF0INT_OICR_HLP_PAGE_CPM_RDY_M BIT(15)
+#define PF0INT_OICR_HLP_PAGE_ECC_ERR_S 16
+#define PF0INT_OICR_HLP_PAGE_ECC_ERR_M BIT(16)
+#define PF0INT_OICR_HLP_PAGE_RSV2_S 17
+#define PF0INT_OICR_HLP_PAGE_RSV2_M ICE_M(0x3, 17)
+#define PF0INT_OICR_HLP_PAGE_MAL_DETECT_S 19
+#define PF0INT_OICR_HLP_PAGE_MAL_DETECT_M BIT(19)
+#define PF0INT_OICR_HLP_PAGE_GRST_S 20
+#define PF0INT_OICR_HLP_PAGE_GRST_M BIT(20)
+#define PF0INT_OICR_HLP_PAGE_PCI_EXCEPTION_S 21
+#define PF0INT_OICR_HLP_PAGE_PCI_EXCEPTION_M BIT(21)
+#define PF0INT_OICR_HLP_PAGE_GPIO_S 22
+#define PF0INT_OICR_HLP_PAGE_GPIO_M BIT(22)
+#define PF0INT_OICR_HLP_PAGE_RSV3_S 23
+#define PF0INT_OICR_HLP_PAGE_RSV3_M BIT(23)
+#define PF0INT_OICR_HLP_PAGE_STORM_DETECT_S 24
+#define PF0INT_OICR_HLP_PAGE_STORM_DETECT_M BIT(24)
+#define PF0INT_OICR_HLP_PAGE_LINK_STAT_CHANGE_S 25
+#define PF0INT_OICR_HLP_PAGE_LINK_STAT_CHANGE_M BIT(25)
+#define PF0INT_OICR_HLP_PAGE_HMC_ERR_S 26
+#define PF0INT_OICR_HLP_PAGE_HMC_ERR_M BIT(26)
+#define PF0INT_OICR_HLP_PAGE_PE_PUSH_S 27
+#define PF0INT_OICR_HLP_PAGE_PE_PUSH_M BIT(27)
+#define PF0INT_OICR_HLP_PAGE_PE_CRITERR_S 28
+#define PF0INT_OICR_HLP_PAGE_PE_CRITERR_M BIT(28)
+#define PF0INT_OICR_HLP_PAGE_VFLR_S 29
+#define PF0INT_OICR_HLP_PAGE_VFLR_M BIT(29)
+#define PF0INT_OICR_HLP_PAGE_XLR_HW_DONE_S 30
+#define PF0INT_OICR_HLP_PAGE_XLR_HW_DONE_M BIT(30)
+#define PF0INT_OICR_HLP_PAGE_SWINT_S 31
+#define PF0INT_OICR_HLP_PAGE_SWINT_M BIT(31)
+#define PF0INT_OICR_PSM_PAGE 0x02D02000 /* Reset Source: CORER */
+#define PF0INT_OICR_PSM_PAGE_INTEVENT_S 0
+#define PF0INT_OICR_PSM_PAGE_INTEVENT_M BIT(0)
+#define PF0INT_OICR_PSM_PAGE_QUEUE_S 1
+#define PF0INT_OICR_PSM_PAGE_QUEUE_M BIT(1)
+#define PF0INT_OICR_PSM_PAGE_RSV1_S 2
+#define PF0INT_OICR_PSM_PAGE_RSV1_M ICE_M(0xFF, 2)
+#define PF0INT_OICR_PSM_PAGE_HH_COMP_S 10
+#define PF0INT_OICR_PSM_PAGE_HH_COMP_M BIT(10)
+#define PF0INT_OICR_PSM_PAGE_TSYN_TX_S 11
+#define PF0INT_OICR_PSM_PAGE_TSYN_TX_M BIT(11)
+#define PF0INT_OICR_PSM_PAGE_TSYN_EVNT_S 12
+#define PF0INT_OICR_PSM_PAGE_TSYN_EVNT_M BIT(12)
+#define PF0INT_OICR_PSM_PAGE_TSYN_TGT_S 13
+#define PF0INT_OICR_PSM_PAGE_TSYN_TGT_M BIT(13)
+#define PF0INT_OICR_PSM_PAGE_HLP_RDY_S 14
+#define PF0INT_OICR_PSM_PAGE_HLP_RDY_M BIT(14)
+#define PF0INT_OICR_PSM_PAGE_CPM_RDY_S 15
+#define PF0INT_OICR_PSM_PAGE_CPM_RDY_M BIT(15)
+#define PF0INT_OICR_PSM_PAGE_ECC_ERR_S 16
+#define PF0INT_OICR_PSM_PAGE_ECC_ERR_M BIT(16)
+#define PF0INT_OICR_PSM_PAGE_RSV2_S 17
+#define PF0INT_OICR_PSM_PAGE_RSV2_M ICE_M(0x3, 17)
+#define PF0INT_OICR_PSM_PAGE_MAL_DETECT_S 19
+#define PF0INT_OICR_PSM_PAGE_MAL_DETECT_M BIT(19)
+#define PF0INT_OICR_PSM_PAGE_GRST_S 20
+#define PF0INT_OICR_PSM_PAGE_GRST_M BIT(20)
+#define PF0INT_OICR_PSM_PAGE_PCI_EXCEPTION_S 21
+#define PF0INT_OICR_PSM_PAGE_PCI_EXCEPTION_M BIT(21)
+#define PF0INT_OICR_PSM_PAGE_GPIO_S 22
+#define PF0INT_OICR_PSM_PAGE_GPIO_M BIT(22)
+#define PF0INT_OICR_PSM_PAGE_RSV3_S 23
+#define PF0INT_OICR_PSM_PAGE_RSV3_M BIT(23)
+#define PF0INT_OICR_PSM_PAGE_STORM_DETECT_S 24
+#define PF0INT_OICR_PSM_PAGE_STORM_DETECT_M BIT(24)
+#define PF0INT_OICR_PSM_PAGE_LINK_STAT_CHANGE_S 25
+#define PF0INT_OICR_PSM_PAGE_LINK_STAT_CHANGE_M BIT(25)
+#define PF0INT_OICR_PSM_PAGE_HMC_ERR_S 26
+#define PF0INT_OICR_PSM_PAGE_HMC_ERR_M BIT(26)
+#define PF0INT_OICR_PSM_PAGE_PE_PUSH_S 27
+#define PF0INT_OICR_PSM_PAGE_PE_PUSH_M BIT(27)
+#define PF0INT_OICR_PSM_PAGE_PE_CRITERR_S 28
+#define PF0INT_OICR_PSM_PAGE_PE_CRITERR_M BIT(28)
+#define PF0INT_OICR_PSM_PAGE_VFLR_S 29
+#define PF0INT_OICR_PSM_PAGE_VFLR_M BIT(29)
+#define PF0INT_OICR_PSM_PAGE_XLR_HW_DONE_S 30
+#define PF0INT_OICR_PSM_PAGE_XLR_HW_DONE_M BIT(30)
+#define PF0INT_OICR_PSM_PAGE_SWINT_S 31
+#define PF0INT_OICR_PSM_PAGE_SWINT_M BIT(31)
+#define QRX_TAIL_PAGE(_QRX) (0x03800000 + ((_QRX) * 4096)) /* _i=0...2047 */ /* Reset Source: CORER */
+#define QRX_TAIL_PAGE_MAX_INDEX 2047
+#define QRX_TAIL_PAGE_TAIL_S 0
+#define QRX_TAIL_PAGE_TAIL_M ICE_M(0x1FFF, 0)
+#define QTX_COMM_DBELL_PAGE(_DBQM) (0x04000000 + ((_DBQM) * 4096)) /* _i=0...16383 */ /* Reset Source: CORER */
+#define QTX_COMM_DBELL_PAGE_MAX_INDEX 16383
+#define QTX_COMM_DBELL_PAGE_QTX_COMM_DBELL_S 0
+#define QTX_COMM_DBELL_PAGE_QTX_COMM_DBELL_M ICE_M(0xFFFFFFFF, 0)
+#define QTX_COMM_DBLQ_DBELL_PAGE(_DBLQ) (0x02F00000 + ((_DBLQ) * 4096)) /* _i=0...255 */ /* Reset Source: CORER */
+#define QTX_COMM_DBLQ_DBELL_PAGE_MAX_INDEX 255
+#define QTX_COMM_DBLQ_DBELL_PAGE_TAIL_S 0
+#define QTX_COMM_DBLQ_DBELL_PAGE_TAIL_M ICE_M(0x1FFF, 0)
+#define VSI_MBX_ARQBAH(_VSI) (0x02000018 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_MBX_ARQBAH_MAX_INDEX 767
+#define VSI_MBX_ARQBAH_ARQBAH_S 0
+#define VSI_MBX_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define VSI_MBX_ARQBAL(_VSI) (0x02000014 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_MBX_ARQBAL_MAX_INDEX 767
+#define VSI_MBX_ARQBAL_ARQBAL_LSB_S 0
+#define VSI_MBX_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define VSI_MBX_ARQBAL_ARQBAL_S 6
+#define VSI_MBX_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define VSI_MBX_ARQH(_VSI) (0x02000020 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_MBX_ARQH_MAX_INDEX 767
+#define VSI_MBX_ARQH_ARQH_S 0
+#define VSI_MBX_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define VSI_MBX_ARQLEN(_VSI) (0x0200001C + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: PFR */
+#define VSI_MBX_ARQLEN_MAX_INDEX 767
+#define VSI_MBX_ARQLEN_ARQLEN_S 0
+#define VSI_MBX_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define VSI_MBX_ARQLEN_ARQVFE_S 28
+#define VSI_MBX_ARQLEN_ARQVFE_M BIT(28)
+#define VSI_MBX_ARQLEN_ARQOVFL_S 29
+#define VSI_MBX_ARQLEN_ARQOVFL_M BIT(29)
+#define VSI_MBX_ARQLEN_ARQCRIT_S 30
+#define VSI_MBX_ARQLEN_ARQCRIT_M BIT(30)
+#define VSI_MBX_ARQLEN_ARQENABLE_S 31
+#define VSI_MBX_ARQLEN_ARQENABLE_M BIT(31)
+#define VSI_MBX_ARQT(_VSI) (0x02000024 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_MBX_ARQT_MAX_INDEX 767
+#define VSI_MBX_ARQT_ARQT_S 0
+#define VSI_MBX_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define VSI_MBX_ATQBAH(_VSI) (0x02000004 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_MBX_ATQBAH_MAX_INDEX 767
+#define VSI_MBX_ATQBAH_ATQBAH_S 0
+#define VSI_MBX_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define VSI_MBX_ATQBAL(_VSI) (0x02000000 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_MBX_ATQBAL_MAX_INDEX 767
+#define VSI_MBX_ATQBAL_ATQBAL_S 6
+#define VSI_MBX_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define VSI_MBX_ATQH(_VSI) (0x0200000C + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_MBX_ATQH_MAX_INDEX 767
+#define VSI_MBX_ATQH_ATQH_S 0
+#define VSI_MBX_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define VSI_MBX_ATQLEN(_VSI) (0x02000008 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: PFR */
+#define VSI_MBX_ATQLEN_MAX_INDEX 767
+#define VSI_MBX_ATQLEN_ATQLEN_S 0
+#define VSI_MBX_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define VSI_MBX_ATQLEN_ATQVFE_S 28
+#define VSI_MBX_ATQLEN_ATQVFE_M BIT(28)
+#define VSI_MBX_ATQLEN_ATQOVFL_S 29
+#define VSI_MBX_ATQLEN_ATQOVFL_M BIT(29)
+#define VSI_MBX_ATQLEN_ATQCRIT_S 30
+#define VSI_MBX_ATQLEN_ATQCRIT_M BIT(30)
+#define VSI_MBX_ATQLEN_ATQENABLE_S 31
+#define VSI_MBX_ATQLEN_ATQENABLE_M BIT(31)
+#define VSI_MBX_ATQT(_VSI) (0x02000010 + ((_VSI) * 4096)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_MBX_ATQT_MAX_INDEX 767
+#define VSI_MBX_ATQT_ATQT_S 0
+#define VSI_MBX_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define GL_ACL_ACCESS_CMD 0x00391000 /* Reset Source: CORER */
+#define GL_ACL_ACCESS_CMD_TABLE_ID_S 0
+#define GL_ACL_ACCESS_CMD_TABLE_ID_M ICE_M(0xFF, 0)
+#define GL_ACL_ACCESS_CMD_ENTRY_INDEX_S 8
+#define GL_ACL_ACCESS_CMD_ENTRY_INDEX_M ICE_M(0xFFF, 8)
+#define GL_ACL_ACCESS_CMD_OPERATION_S 20
+#define GL_ACL_ACCESS_CMD_OPERATION_M BIT(20)
+#define GL_ACL_ACCESS_CMD_OBJ_TYPE_S 24
+#define GL_ACL_ACCESS_CMD_OBJ_TYPE_M ICE_M(0xF, 24)
+#define GL_ACL_ACCESS_CMD_EXECUTE_S 31
+#define GL_ACL_ACCESS_CMD_EXECUTE_M BIT(31)
+#define GL_ACL_ACCESS_STATUS 0x00391004 /* Reset Source: CORER */
+#define GL_ACL_ACCESS_STATUS_BUSY_S 0
+#define GL_ACL_ACCESS_STATUS_BUSY_M BIT(0)
+#define GL_ACL_ACCESS_STATUS_DONE_S 1
+#define GL_ACL_ACCESS_STATUS_DONE_M BIT(1)
+#define GL_ACL_ACCESS_STATUS_ERROR_S 2
+#define GL_ACL_ACCESS_STATUS_ERROR_M BIT(2)
+#define GL_ACL_ACCESS_STATUS_OPERATION_S 3
+#define GL_ACL_ACCESS_STATUS_OPERATION_M BIT(3)
+#define GL_ACL_ACCESS_STATUS_ERROR_CODE_S 4
+#define GL_ACL_ACCESS_STATUS_ERROR_CODE_M ICE_M(0xF, 4)
+#define GL_ACL_ACCESS_STATUS_TABLE_ID_S 8
+#define GL_ACL_ACCESS_STATUS_TABLE_ID_M ICE_M(0xFF, 8)
+#define GL_ACL_ACCESS_STATUS_ENTRY_INDEX_S 16
+#define GL_ACL_ACCESS_STATUS_ENTRY_INDEX_M ICE_M(0xFFF, 16)
+#define GL_ACL_ACCESS_STATUS_OBJ_TYPE_S 28
+#define GL_ACL_ACCESS_STATUS_OBJ_TYPE_M ICE_M(0xF, 28)
+#define GL_ACL_ACTMEM_ACT(_i) (0x00393824 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */
+#define GL_ACL_ACTMEM_ACT_MAX_INDEX 1
+#define GL_ACL_ACTMEM_ACT_VALUE_S 0
+#define GL_ACL_ACTMEM_ACT_VALUE_M ICE_M(0xFFFF, 0)
+#define GL_ACL_ACTMEM_ACT_MDID_S 20
+#define GL_ACL_ACTMEM_ACT_MDID_M ICE_M(0x3F, 20)
+#define GL_ACL_ACTMEM_ACT_PRIORITY_S 28
+#define GL_ACL_ACTMEM_ACT_PRIORITY_M ICE_M(0x7, 28)
+#define GL_ACL_CHICKEN_REGISTER 0x00393810 /* Reset Source: CORER */
+#define GL_ACL_CHICKEN_REGISTER_TCAM_DATA_POL_CH_S 0
+#define GL_ACL_CHICKEN_REGISTER_TCAM_DATA_POL_CH_M BIT(0)
+#define GL_ACL_CHICKEN_REGISTER_TCAM_ADDR_POL_CH_S 1
+#define GL_ACL_CHICKEN_REGISTER_TCAM_ADDR_POL_CH_M BIT(1)
+#define GL_ACL_DEFAULT_ACT(_i) (0x00391168 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GL_ACL_DEFAULT_ACT_MAX_INDEX 15
+#define GL_ACL_DEFAULT_ACT_VALUE_S 0
+#define GL_ACL_DEFAULT_ACT_VALUE_M ICE_M(0xFFFF, 0)
+#define GL_ACL_DEFAULT_ACT_MDID_S 20
+#define GL_ACL_DEFAULT_ACT_MDID_M ICE_M(0x3F, 20)
+#define GL_ACL_DEFAULT_ACT_PRIORITY_S 28
+#define GL_ACL_DEFAULT_ACT_PRIORITY_M ICE_M(0x7, 28)
+#define GL_ACL_PROFILE_BWSB_SEL(_i) (0x00391008 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GL_ACL_PROFILE_BWSB_SEL_MAX_INDEX 31
+#define GL_ACL_PROFILE_BWSB_SEL_BSB_SRC_OFF_S 0
+#define GL_ACL_PROFILE_BWSB_SEL_BSB_SRC_OFF_M ICE_M(0x3F, 0)
+#define GL_ACL_PROFILE_BWSB_SEL_WSB_SRC_OFF_S 8
+#define GL_ACL_PROFILE_BWSB_SEL_WSB_SRC_OFF_M ICE_M(0x1F, 8)
+#define GL_ACL_PROFILE_DWSB_SEL(_i) (0x00391088 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GL_ACL_PROFILE_DWSB_SEL_MAX_INDEX 15
+#define GL_ACL_PROFILE_DWSB_SEL_DWORD_SEL_OFF_S 0
+#define GL_ACL_PROFILE_DWSB_SEL_DWORD_SEL_OFF_M ICE_M(0xF, 0)
+#define GL_ACL_PROFILE_PF_CFG(_i) (0x003910C8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GL_ACL_PROFILE_PF_CFG_MAX_INDEX 7
+#define GL_ACL_PROFILE_PF_CFG_SCEN_SEL_S 0
+#define GL_ACL_PROFILE_PF_CFG_SCEN_SEL_M ICE_M(0x3F, 0)
+#define GL_ACL_PROFILE_RC_CFG(_i) (0x003910E8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GL_ACL_PROFILE_RC_CFG_MAX_INDEX 7
+#define GL_ACL_PROFILE_RC_CFG_LOW_BOUND_S 0
+#define GL_ACL_PROFILE_RC_CFG_LOW_BOUND_M ICE_M(0xFFFF, 0)
+#define GL_ACL_PROFILE_RC_CFG_HIGH_BOUND_S 16
+#define GL_ACL_PROFILE_RC_CFG_HIGH_BOUND_M ICE_M(0xFFFF, 16)
+#define GL_ACL_PROFILE_RCF_MASK(_i) (0x00391108 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GL_ACL_PROFILE_RCF_MASK_MAX_INDEX 7
+#define GL_ACL_PROFILE_RCF_MASK_MASK_S 0
+#define GL_ACL_PROFILE_RCF_MASK_MASK_M ICE_M(0xFFFF, 0)
+#define GL_ACL_SCENARIO_ACT_CFG(_i) (0x003938AC + ((_i) * 4)) /* _i=0...19 */ /* Reset Source: CORER */
+#define GL_ACL_SCENARIO_ACT_CFG_MAX_INDEX 19
+#define GL_ACL_SCENARIO_ACT_CFG_ACTMEM_SEL_S 0
+#define GL_ACL_SCENARIO_ACT_CFG_ACTMEM_SEL_M ICE_M(0xF, 0)
+#define GL_ACL_SCENARIO_ACT_CFG_ACTMEM_EN_S 8
+#define GL_ACL_SCENARIO_ACT_CFG_ACTMEM_EN_M BIT(8)
+#define GL_ACL_SCENARIO_CFG_H(_i) (0x0039386C + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GL_ACL_SCENARIO_CFG_H_MAX_INDEX 15
+#define GL_ACL_SCENARIO_CFG_H_SELECT4_S 0
+#define GL_ACL_SCENARIO_CFG_H_SELECT4_M ICE_M(0x1F, 0)
+#define GL_ACL_SCENARIO_CFG_H_CHUNKMASK_S 8
+#define GL_ACL_SCENARIO_CFG_H_CHUNKMASK_M ICE_M(0xFF, 8)
+#define GL_ACL_SCENARIO_CFG_H_START_COMPARE_S 24
+#define GL_ACL_SCENARIO_CFG_H_START_COMPARE_M BIT(24)
+#define GL_ACL_SCENARIO_CFG_H_START_SET_S 28
+#define GL_ACL_SCENARIO_CFG_H_START_SET_M BIT(28)
+#define GL_ACL_SCENARIO_CFG_L(_i) (0x0039382C + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GL_ACL_SCENARIO_CFG_L_MAX_INDEX 15
+#define GL_ACL_SCENARIO_CFG_L_SELECT0_S 0
+#define GL_ACL_SCENARIO_CFG_L_SELECT0_M ICE_M(0x7F, 0)
+#define GL_ACL_SCENARIO_CFG_L_SELECT1_S 8
+#define GL_ACL_SCENARIO_CFG_L_SELECT1_M ICE_M(0x7F, 8)
+#define GL_ACL_SCENARIO_CFG_L_SELECT2_S 16
+#define GL_ACL_SCENARIO_CFG_L_SELECT2_M ICE_M(0x7F, 16)
+#define GL_ACL_SCENARIO_CFG_L_SELECT3_S 24
+#define GL_ACL_SCENARIO_CFG_L_SELECT3_M ICE_M(0x7F, 24)
+#define GL_ACL_TCAM_KEY_H 0x00393818 /* Reset Source: CORER */
+#define GL_ACL_TCAM_KEY_H_GL_ACL_FFU_TCAM_KEY_H_S 0
+#define GL_ACL_TCAM_KEY_H_GL_ACL_FFU_TCAM_KEY_H_M ICE_M(0xFF, 0)
+#define GL_ACL_TCAM_KEY_INV_H 0x00393820 /* Reset Source: CORER */
+#define GL_ACL_TCAM_KEY_INV_H_GL_ACL_FFU_TCAM_KEY_INV_H_S 0
+#define GL_ACL_TCAM_KEY_INV_H_GL_ACL_FFU_TCAM_KEY_INV_H_M ICE_M(0xFF, 0)
+#define GL_ACL_TCAM_KEY_INV_L 0x0039381C /* Reset Source: CORER */
+#define GL_ACL_TCAM_KEY_INV_L_GL_ACL_FFU_TCAM_KEY_INV_L_S 0
+#define GL_ACL_TCAM_KEY_INV_L_GL_ACL_FFU_TCAM_KEY_INV_L_M ICE_M(0xFFFFFFFF, 0)
+#define GL_ACL_TCAM_KEY_L 0x00393814 /* Reset Source: CORER */
+#define GL_ACL_TCAM_KEY_L_GL_ACL_FFU_TCAM_KEY_L_S 0
+#define GL_ACL_TCAM_KEY_L_GL_ACL_FFU_TCAM_KEY_L_M ICE_M(0xFFFFFFFF, 0)
+#define VSI_ACL_DEF_SEL(_VSI) (0x00391800 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSI_ACL_DEF_SEL_MAX_INDEX 767
+#define VSI_ACL_DEF_SEL_RX_PROFILE_MISS_SEL_S 0
+#define VSI_ACL_DEF_SEL_RX_PROFILE_MISS_SEL_M ICE_M(0x3, 0)
+#define VSI_ACL_DEF_SEL_RX_TABLES_MISS_SEL_S 4
+#define VSI_ACL_DEF_SEL_RX_TABLES_MISS_SEL_M ICE_M(0x3, 4)
+#define VSI_ACL_DEF_SEL_TX_PROFILE_MISS_SEL_S 8
+#define VSI_ACL_DEF_SEL_TX_PROFILE_MISS_SEL_M ICE_M(0x3, 8)
+#define VSI_ACL_DEF_SEL_TX_TABLES_MISS_SEL_S 12
+#define VSI_ACL_DEF_SEL_TX_TABLES_MISS_SEL_M ICE_M(0x3, 12)
+#define GL_SWT_L2TAG0(_i) (0x000492A8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GL_SWT_L2TAG0_MAX_INDEX 7
+#define GL_SWT_L2TAG0_DATA_S 0
+#define GL_SWT_L2TAG0_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_SWT_L2TAG1(_i) (0x000492C8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GL_SWT_L2TAG1_MAX_INDEX 7
+#define GL_SWT_L2TAG1_DATA_S 0
+#define GL_SWT_L2TAG1_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_SWT_L2TAGCTRL(_i) (0x001D2660 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GL_SWT_L2TAGCTRL_MAX_INDEX 7
+#define GL_SWT_L2TAGCTRL_LENGTH_S 0
+#define GL_SWT_L2TAGCTRL_LENGTH_M ICE_M(0x7F, 0)
+#define GL_SWT_L2TAGCTRL_HAS_UP_S 7
+#define GL_SWT_L2TAGCTRL_HAS_UP_M BIT(7)
+#define GL_SWT_L2TAGCTRL_ISVLAN_S 9
+#define GL_SWT_L2TAGCTRL_ISVLAN_M BIT(9)
+#define GL_SWT_L2TAGCTRL_INNERUP_S 10
+#define GL_SWT_L2TAGCTRL_INNERUP_M BIT(10)
+#define GL_SWT_L2TAGCTRL_OUTERUP_S 11
+#define GL_SWT_L2TAGCTRL_OUTERUP_M BIT(11)
+#define GL_SWT_L2TAGCTRL_LONG_S 12
+#define GL_SWT_L2TAGCTRL_LONG_M BIT(12)
+#define GL_SWT_L2TAGCTRL_ISMPLS_S 13
+#define GL_SWT_L2TAGCTRL_ISMPLS_M BIT(13)
+#define GL_SWT_L2TAGCTRL_ISNSH_S 14
+#define GL_SWT_L2TAGCTRL_ISNSH_M BIT(14)
+#define GL_SWT_L2TAGCTRL_ETHERTYPE_S 16
+#define GL_SWT_L2TAGCTRL_ETHERTYPE_M ICE_M(0xFFFF, 16)
+#define GL_SWT_L2TAGRXEB(_i) (0x00052000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GL_SWT_L2TAGRXEB_MAX_INDEX 7
+#define GL_SWT_L2TAGRXEB_OFFSET_S 0
+#define GL_SWT_L2TAGRXEB_OFFSET_M ICE_M(0xFF, 0)
+#define GL_SWT_L2TAGRXEB_LENGTH_S 8
+#define GL_SWT_L2TAGRXEB_LENGTH_M ICE_M(0x3, 8)
+#define GL_SWT_L2TAGTXIB(_i) (0x000492E8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GL_SWT_L2TAGTXIB_MAX_INDEX 7
+#define GL_SWT_L2TAGTXIB_OFFSET_S 0
+#define GL_SWT_L2TAGTXIB_OFFSET_M ICE_M(0xFF, 0)
+#define GL_SWT_L2TAGTXIB_LENGTH_S 8
+#define GL_SWT_L2TAGTXIB_LENGTH_M ICE_M(0x3, 8)
+#define GLCM_PE_CACHESIZE 0x005046B4 /* Reset Source: CORER */
+#define GLCM_PE_CACHESIZE_WORD_SIZE_S 0
+#define GLCM_PE_CACHESIZE_WORD_SIZE_M ICE_M(0xFFF, 0)
+#define GLCM_PE_CACHESIZE_SETS_S 12
+#define GLCM_PE_CACHESIZE_SETS_M ICE_M(0xF, 12)
+#define GLCM_PE_CACHESIZE_WAYS_S 16
+#define GLCM_PE_CACHESIZE_WAYS_M ICE_M(0x1FF, 16)
+#define GLCOMM_CQ_CTL(_CQ) (0x000F0000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLCOMM_CQ_CTL_MAX_INDEX 511
+#define GLCOMM_CQ_CTL_COMP_TYPE_S 0
+#define GLCOMM_CQ_CTL_COMP_TYPE_M ICE_M(0x7, 0)
+#define GLCOMM_CQ_CTL_CMD_S 4
+#define GLCOMM_CQ_CTL_CMD_M ICE_M(0x7, 4)
+#define GLCOMM_CQ_CTL_ID_S 16
+#define GLCOMM_CQ_CTL_ID_M ICE_M(0x3FFF, 16)
+#define GLCOMM_MIN_MAX_PKT 0x000FC064 /* Reset Source: CORER */
+#define GLCOMM_MIN_MAX_PKT_MAHDL_S 0
+#define GLCOMM_MIN_MAX_PKT_MAHDL_M ICE_M(0x3FFF, 0)
+#define GLCOMM_MIN_MAX_PKT_MIHDL_S 16
+#define GLCOMM_MIN_MAX_PKT_MIHDL_M ICE_M(0x3F, 16)
+#define GLCOMM_MIN_MAX_PKT_LSO_COMS_MIHDL_S 22
+#define GLCOMM_MIN_MAX_PKT_LSO_COMS_MIHDL_M ICE_M(0x3FF, 22)
+#define GLCOMM_PKT_SHAPER_PROF(_i) (0x002D2DA8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLCOMM_PKT_SHAPER_PROF_MAX_INDEX 7
+#define GLCOMM_PKT_SHAPER_PROF_PKTCNT_S 0
+#define GLCOMM_PKT_SHAPER_PROF_PKTCNT_M ICE_M(0x3F, 0)
+#define GLCOMM_QTX_CNTX_CTL 0x002D2DC8 /* Reset Source: CORER */
+#define GLCOMM_QTX_CNTX_CTL_QUEUE_ID_S 0
+#define GLCOMM_QTX_CNTX_CTL_QUEUE_ID_M ICE_M(0x3FFF, 0)
+#define GLCOMM_QTX_CNTX_CTL_CMD_S 16
+#define GLCOMM_QTX_CNTX_CTL_CMD_M ICE_M(0x7, 16)
+#define GLCOMM_QTX_CNTX_CTL_CMD_EXEC_S 19
+#define GLCOMM_QTX_CNTX_CTL_CMD_EXEC_M BIT(19)
+#define GLCOMM_QTX_CNTX_DATA(_i) (0x002D2D40 + ((_i) * 4)) /* _i=0...9 */ /* Reset Source: CORER */
+#define GLCOMM_QTX_CNTX_DATA_MAX_INDEX 9
+#define GLCOMM_QTX_CNTX_DATA_DATA_S 0
+#define GLCOMM_QTX_CNTX_DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GLCOMM_QTX_CNTX_STAT 0x002D2DCC /* Reset Source: CORER */
+#define GLCOMM_QTX_CNTX_STAT_CMD_IN_PROG_S 0
+#define GLCOMM_QTX_CNTX_STAT_CMD_IN_PROG_M BIT(0)
+#define GLCOMM_QUANTA_PROF(_i) (0x002D2D68 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GLCOMM_QUANTA_PROF_MAX_INDEX 15
+#define GLCOMM_QUANTA_PROF_QUANTA_SIZE_S 0
+#define GLCOMM_QUANTA_PROF_QUANTA_SIZE_M ICE_M(0x3FFF, 0)
+#define GLCOMM_QUANTA_PROF_MAX_CMD_S 16
+#define GLCOMM_QUANTA_PROF_MAX_CMD_M ICE_M(0xFF, 16)
+#define GLCOMM_QUANTA_PROF_MAX_DESC_S 24
+#define GLCOMM_QUANTA_PROF_MAX_DESC_M ICE_M(0x3F, 24)
+#define GLLAN_TCLAN_CACHE_CTL 0x000FC0B8 /* Reset Source: CORER */
+#define GLLAN_TCLAN_CACHE_CTL_MIN_FETCH_THRESH_S 0
+#define GLLAN_TCLAN_CACHE_CTL_MIN_FETCH_THRESH_M ICE_M(0x3F, 0)
+#define GLLAN_TCLAN_CACHE_CTL_FETCH_CL_ALIGN_S 6
+#define GLLAN_TCLAN_CACHE_CTL_FETCH_CL_ALIGN_M BIT(6)
+#define GLLAN_TCLAN_CACHE_CTL_MIN_ALLOC_THRESH_S 7
+#define GLLAN_TCLAN_CACHE_CTL_MIN_ALLOC_THRESH_M ICE_M(0x7F, 7)
+#define GLLAN_TCLAN_CACHE_CTL_CACHE_ENTRY_CNT_S 14
+#define GLLAN_TCLAN_CACHE_CTL_CACHE_ENTRY_CNT_M ICE_M(0xFF, 14)
+#define GLLAN_TCLAN_CACHE_CTL_CACHE_DESC_LIM_S 22
+#define GLLAN_TCLAN_CACHE_CTL_CACHE_DESC_LIM_M ICE_M(0x3FF, 22)
+#define GLTCLAN_CQ_CNTX0(_CQ) (0x000F0800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX0_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX0_RING_ADDR_LSB_S 0
+#define GLTCLAN_CQ_CNTX0_RING_ADDR_LSB_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX1(_CQ) (0x000F1000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX1_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX1_RING_ADDR_MSB_S 0
+#define GLTCLAN_CQ_CNTX1_RING_ADDR_MSB_M ICE_M(0x1FFFFFF, 0)
+#define GLTCLAN_CQ_CNTX10(_CQ) (0x000F5800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX10_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX10_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX10_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX11(_CQ) (0x000F6000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX11_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX11_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX11_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX12(_CQ) (0x000F6800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX12_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX12_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX12_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX13(_CQ) (0x000F7000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX13_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX13_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX13_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX14(_CQ) (0x000F7800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX14_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX14_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX14_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX15(_CQ) (0x000F8000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX15_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX15_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX15_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX16(_CQ) (0x000F8800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX16_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX16_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX16_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX17(_CQ) (0x000F9000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX17_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX17_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX17_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX18(_CQ) (0x000F9800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX18_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX18_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX18_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX19(_CQ) (0x000FA000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX19_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX19_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX19_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX2(_CQ) (0x000F1800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX2_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX2_RING_LEN_S 0
+#define GLTCLAN_CQ_CNTX2_RING_LEN_M ICE_M(0x3FFFF, 0)
+#define GLTCLAN_CQ_CNTX20(_CQ) (0x000FA800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX20_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX20_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX20_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX21(_CQ) (0x000FB000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX21_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX21_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX21_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX3(_CQ) (0x000F2000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX3_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX3_GENERATION_S 0
+#define GLTCLAN_CQ_CNTX3_GENERATION_M BIT(0)
+#define GLTCLAN_CQ_CNTX3_CQ_WR_PTR_S 1
+#define GLTCLAN_CQ_CNTX3_CQ_WR_PTR_M ICE_M(0x3FFFFF, 1)
+#define GLTCLAN_CQ_CNTX4(_CQ) (0x000F2800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX4_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX4_PF_NUM_S 0
+#define GLTCLAN_CQ_CNTX4_PF_NUM_M ICE_M(0x7, 0)
+#define GLTCLAN_CQ_CNTX4_VMVF_NUM_S 3
+#define GLTCLAN_CQ_CNTX4_VMVF_NUM_M ICE_M(0x3FF, 3)
+#define GLTCLAN_CQ_CNTX4_VMVF_TYPE_S 13
+#define GLTCLAN_CQ_CNTX4_VMVF_TYPE_M ICE_M(0x3, 13)
+#define GLTCLAN_CQ_CNTX5(_CQ) (0x000F3000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX5_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX5_TPH_EN_S 0
+#define GLTCLAN_CQ_CNTX5_TPH_EN_M BIT(0)
+#define GLTCLAN_CQ_CNTX5_CPU_ID_S 1
+#define GLTCLAN_CQ_CNTX5_CPU_ID_M ICE_M(0xFF, 1)
+#define GLTCLAN_CQ_CNTX5_FLUSH_ON_ITR_DIS_S 9
+#define GLTCLAN_CQ_CNTX5_FLUSH_ON_ITR_DIS_M BIT(9)
+#define GLTCLAN_CQ_CNTX6(_CQ) (0x000F3800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX6_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX6_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX6_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX7(_CQ) (0x000F4000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX7_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX7_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX7_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX8(_CQ) (0x000F4800 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX8_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX8_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX8_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define GLTCLAN_CQ_CNTX9(_CQ) (0x000F5000 + ((_CQ) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLTCLAN_CQ_CNTX9_MAX_INDEX 511
+#define GLTCLAN_CQ_CNTX9_CQ_CACHLINE_S 0
+#define GLTCLAN_CQ_CNTX9_CQ_CACHLINE_M ICE_M(0xFFFFFFFF, 0)
+#define QTX_COMM_DBELL(_DBQM) (0x002C0000 + ((_DBQM) * 4)) /* _i=0...16383 */ /* Reset Source: CORER */
+#define QTX_COMM_DBELL_MAX_INDEX 16383
+#define QTX_COMM_DBELL_QTX_COMM_DBELL_S 0
+#define QTX_COMM_DBELL_QTX_COMM_DBELL_M ICE_M(0xFFFFFFFF, 0)
+#define QTX_COMM_DBLQ_CNTX(_i, _DBLQ) (0x002D0000 + ((_i) * 1024 + (_DBLQ) * 4)) /* _i=0...4, _DBLQ=0...255 */ /* Reset Source: CORER */
+#define QTX_COMM_DBLQ_CNTX_MAX_INDEX 4
+#define QTX_COMM_DBLQ_CNTX_DATA_S 0
+#define QTX_COMM_DBLQ_CNTX_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define QTX_COMM_DBLQ_DBELL(_DBLQ) (0x002D1400 + ((_DBLQ) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define QTX_COMM_DBLQ_DBELL_MAX_INDEX 255
+#define QTX_COMM_DBLQ_DBELL_TAIL_S 0
+#define QTX_COMM_DBLQ_DBELL_TAIL_M ICE_M(0x1FFF, 0)
+#define QTX_COMM_HEAD(_DBQM) (0x000E0000 + ((_DBQM) * 4)) /* _i=0...16383 */ /* Reset Source: CORER */
+#define QTX_COMM_HEAD_MAX_INDEX 16383
 #define QTX_COMM_HEAD_HEAD_S 0
 #define QTX_COMM_HEAD_HEAD_M ICE_M(0x1FFF, 0)
-#define PF_FW_ARQBAH 0x00080180
-#define PF_FW_ARQBAL 0x00080080
-#define PF_FW_ARQH 0x00080380
+#define QTX_COMM_HEAD_RS_PENDING_S 16
+#define QTX_COMM_HEAD_RS_PENDING_M BIT(16)
+#define GL_FW_TOOL_ARQBAH 0x000801C0 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ARQBAH_ARQBAH_S 0
+#define GL_FW_TOOL_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define GL_FW_TOOL_ARQBAL 0x000800C0 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ARQBAL_ARQBAL_LSB_S 0
+#define GL_FW_TOOL_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define GL_FW_TOOL_ARQBAL_ARQBAL_S 6
+#define GL_FW_TOOL_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define GL_FW_TOOL_ARQH 0x000803C0 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ARQH_ARQH_S 0
+#define GL_FW_TOOL_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define GL_FW_TOOL_ARQLEN 0x000802C0 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ARQLEN_ARQLEN_S 0
+#define GL_FW_TOOL_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define GL_FW_TOOL_ARQLEN_ARQVFE_S 28
+#define GL_FW_TOOL_ARQLEN_ARQVFE_M BIT(28)
+#define GL_FW_TOOL_ARQLEN_ARQOVFL_S 29
+#define GL_FW_TOOL_ARQLEN_ARQOVFL_M BIT(29)
+#define GL_FW_TOOL_ARQLEN_ARQCRIT_S 30
+#define GL_FW_TOOL_ARQLEN_ARQCRIT_M BIT(30)
+#define GL_FW_TOOL_ARQLEN_ARQENABLE_S 31
+#define GL_FW_TOOL_ARQLEN_ARQENABLE_M BIT(31)
+#define GL_FW_TOOL_ARQT 0x000804C0 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ARQT_ARQT_S 0
+#define GL_FW_TOOL_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define GL_FW_TOOL_ATQBAH 0x00080140 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ATQBAH_ATQBAH_S 0
+#define GL_FW_TOOL_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define GL_FW_TOOL_ATQBAL 0x00080040 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ATQBAL_ATQBAL_LSB_S 0
+#define GL_FW_TOOL_ATQBAL_ATQBAL_LSB_M ICE_M(0x3F, 0)
+#define GL_FW_TOOL_ATQBAL_ATQBAL_S 6
+#define GL_FW_TOOL_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define GL_FW_TOOL_ATQH 0x00080340 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ATQH_ATQH_S 0
+#define GL_FW_TOOL_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define GL_FW_TOOL_ATQLEN 0x00080240 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ATQLEN_ATQLEN_S 0
+#define GL_FW_TOOL_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define GL_FW_TOOL_ATQLEN_ATQVFE_S 28
+#define GL_FW_TOOL_ATQLEN_ATQVFE_M BIT(28)
+#define GL_FW_TOOL_ATQLEN_ATQOVFL_S 29
+#define GL_FW_TOOL_ATQLEN_ATQOVFL_M BIT(29)
+#define GL_FW_TOOL_ATQLEN_ATQCRIT_S 30
+#define GL_FW_TOOL_ATQLEN_ATQCRIT_M BIT(30)
+#define GL_FW_TOOL_ATQLEN_ATQENABLE_S 31
+#define GL_FW_TOOL_ATQLEN_ATQENABLE_M BIT(31)
+#define GL_FW_TOOL_ATQT 0x00080440 /* Reset Source: EMPR */
+#define GL_FW_TOOL_ATQT_ATQT_S 0
+#define GL_FW_TOOL_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define GL_MBX_PASID 0x00231EC0 /* Reset Source: CORER */
+#define GL_MBX_PASID_PASID_MODE_S 0
+#define GL_MBX_PASID_PASID_MODE_M BIT(0)
+#define GL_MBX_PASID_PASID_MODE_VALID_S 1
+#define GL_MBX_PASID_PASID_MODE_VALID_M BIT(1)
+#define PF_FW_ARQBAH 0x00080180 /* Reset Source: EMPR */
+#define PF_FW_ARQBAH_ARQBAH_S 0
+#define PF_FW_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF_FW_ARQBAL 0x00080080 /* Reset Source: EMPR */
+#define PF_FW_ARQBAL_ARQBAL_LSB_S 0
+#define PF_FW_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF_FW_ARQBAL_ARQBAL_S 6
+#define PF_FW_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF_FW_ARQH 0x00080380 /* Reset Source: EMPR */
+#define PF_FW_ARQH_ARQH_S 0
 #define PF_FW_ARQH_ARQH_M ICE_M(0x3FF, 0)
-#define PF_FW_ARQLEN 0x00080280
+#define PF_FW_ARQLEN 0x00080280 /* Reset Source: EMPR */
+#define PF_FW_ARQLEN_ARQLEN_S 0
 #define PF_FW_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF_FW_ARQLEN_ARQVFE_S 28
 #define PF_FW_ARQLEN_ARQVFE_M BIT(28)
+#define PF_FW_ARQLEN_ARQOVFL_S 29
 #define PF_FW_ARQLEN_ARQOVFL_M BIT(29)
+#define PF_FW_ARQLEN_ARQCRIT_S 30
 #define PF_FW_ARQLEN_ARQCRIT_M BIT(30)
+#define PF_FW_ARQLEN_ARQENABLE_S 31
 #define PF_FW_ARQLEN_ARQENABLE_M BIT(31)
-#define PF_FW_ARQT 0x00080480
-#define PF_FW_ATQBAH 0x00080100
-#define PF_FW_ATQBAL 0x00080000
-#define PF_FW_ATQH 0x00080300
+#define PF_FW_ARQT 0x00080480 /* Reset Source: EMPR */
+#define PF_FW_ARQT_ARQT_S 0
+#define PF_FW_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF_FW_ATQBAH 0x00080100 /* Reset Source: EMPR */
+#define PF_FW_ATQBAH_ATQBAH_S 0
+#define PF_FW_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF_FW_ATQBAL 0x00080000 /* Reset Source: EMPR */
+#define PF_FW_ATQBAL_ATQBAL_LSB_S 0
+#define PF_FW_ATQBAL_ATQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF_FW_ATQBAL_ATQBAL_S 6
+#define PF_FW_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF_FW_ATQH 0x00080300 /* Reset Source: EMPR */
+#define PF_FW_ATQH_ATQH_S 0
 #define PF_FW_ATQH_ATQH_M ICE_M(0x3FF, 0)
-#define PF_FW_ATQLEN 0x00080200
+#define PF_FW_ATQLEN 0x00080200 /* Reset Source: EMPR */
+#define PF_FW_ATQLEN_ATQLEN_S 0
 #define PF_FW_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF_FW_ATQLEN_ATQVFE_S 28
 #define PF_FW_ATQLEN_ATQVFE_M BIT(28)
+#define PF_FW_ATQLEN_ATQOVFL_S 29
 #define PF_FW_ATQLEN_ATQOVFL_M BIT(29)
+#define PF_FW_ATQLEN_ATQCRIT_S 30
 #define PF_FW_ATQLEN_ATQCRIT_M BIT(30)
-#define VF_MBX_ARQLEN(_VF) (0x0022BC00 + ((_VF) * 4))
+#define PF_FW_ATQLEN_ATQENABLE_S 31
 #define PF_FW_ATQLEN_ATQENABLE_M BIT(31)
-#define PF_FW_ATQT 0x00080400
-#define PF_MBX_ARQBAH 0x0022E400
-#define PF_MBX_ARQBAL 0x0022E380
-#define PF_MBX_ARQH 0x0022E500
+#define PF_FW_ATQT 0x00080400 /* Reset Source: EMPR */
+#define PF_FW_ATQT_ATQT_S 0
+#define PF_FW_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF_MBX_ARQBAH 0x0022E400 /* Reset Source: CORER */
+#define PF_MBX_ARQBAH_ARQBAH_S 0
+#define PF_MBX_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF_MBX_ARQBAL 0x0022E380 /* Reset Source: CORER */
+#define PF_MBX_ARQBAL_ARQBAL_LSB_S 0
+#define PF_MBX_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF_MBX_ARQBAL_ARQBAL_S 6
+#define PF_MBX_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF_MBX_ARQH 0x0022E500 /* Reset Source: CORER */
+#define PF_MBX_ARQH_ARQH_S 0
 #define PF_MBX_ARQH_ARQH_M ICE_M(0x3FF, 0)
-#define PF_MBX_ARQLEN 0x0022E480
+#define PF_MBX_ARQLEN 0x0022E480 /* Reset Source: PFR */
+#define PF_MBX_ARQLEN_ARQLEN_S 0
 #define PF_MBX_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF_MBX_ARQLEN_ARQVFE_S 28
+#define PF_MBX_ARQLEN_ARQVFE_M BIT(28)
+#define PF_MBX_ARQLEN_ARQOVFL_S 29
+#define PF_MBX_ARQLEN_ARQOVFL_M BIT(29)
+#define PF_MBX_ARQLEN_ARQCRIT_S 30
+#define PF_MBX_ARQLEN_ARQCRIT_M BIT(30)
+#define PF_MBX_ARQLEN_ARQENABLE_S 31
 #define PF_MBX_ARQLEN_ARQENABLE_M BIT(31)
-#define PF_MBX_ARQT 0x0022E580
-#define PF_MBX_ATQBAH 0x0022E180
-#define PF_MBX_ATQBAL 0x0022E100
-#define PF_MBX_ATQH 0x0022E280
+#define PF_MBX_ARQT 0x0022E580 /* Reset Source: CORER */
+#define PF_MBX_ARQT_ARQT_S 0
+#define PF_MBX_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF_MBX_ATQBAH 0x0022E180 /* Reset Source: CORER */
+#define PF_MBX_ATQBAH_ATQBAH_S 0
+#define PF_MBX_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF_MBX_ATQBAL 0x0022E100 /* Reset Source: CORER */
+#define PF_MBX_ATQBAL_ATQBAL_S 6
+#define PF_MBX_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF_MBX_ATQH 0x0022E280 /* Reset Source: CORER */
+#define PF_MBX_ATQH_ATQH_S 0
 #define PF_MBX_ATQH_ATQH_M ICE_M(0x3FF, 0)
-#define PF_MBX_ATQLEN 0x0022E200
+#define PF_MBX_ATQLEN 0x0022E200 /* Reset Source: PFR */
+#define PF_MBX_ATQLEN_ATQLEN_S 0
 #define PF_MBX_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF_MBX_ATQLEN_ATQVFE_S 28
+#define PF_MBX_ATQLEN_ATQVFE_M BIT(28)
+#define PF_MBX_ATQLEN_ATQOVFL_S 29
+#define PF_MBX_ATQLEN_ATQOVFL_M BIT(29)
+#define PF_MBX_ATQLEN_ATQCRIT_S 30
+#define PF_MBX_ATQLEN_ATQCRIT_M BIT(30)
+#define PF_MBX_ATQLEN_ATQENABLE_S 31
 #define PF_MBX_ATQLEN_ATQENABLE_M BIT(31)
-#define PF_MBX_ATQT 0x0022E300
-#define PRTDCB_GENS 0x00083020
+#define PF_MBX_ATQT 0x0022E300 /* Reset Source: CORER */
+#define PF_MBX_ATQT_ATQT_S 0
+#define PF_MBX_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF_SB_ARQBAH 0x0022FF00 /* Reset Source: CORER */
+#define PF_SB_ARQBAH_ARQBAH_S 0
+#define PF_SB_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF_SB_ARQBAL 0x0022FE80 /* Reset Source: CORER */
+#define PF_SB_ARQBAL_ARQBAL_LSB_S 0
+#define PF_SB_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF_SB_ARQBAL_ARQBAL_S 6
+#define PF_SB_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF_SB_ARQH 0x00230000 /* Reset Source: CORER */
+#define PF_SB_ARQH_ARQH_S 0
+#define PF_SB_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define PF_SB_ARQLEN 0x0022FF80 /* Reset Source: PFR */
+#define PF_SB_ARQLEN_ARQLEN_S 0
+#define PF_SB_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF_SB_ARQLEN_ARQVFE_S 28
+#define PF_SB_ARQLEN_ARQVFE_M BIT(28)
+#define PF_SB_ARQLEN_ARQOVFL_S 29
+#define PF_SB_ARQLEN_ARQOVFL_M BIT(29)
+#define PF_SB_ARQLEN_ARQCRIT_S 30
+#define PF_SB_ARQLEN_ARQCRIT_M BIT(30)
+#define PF_SB_ARQLEN_ARQENABLE_S 31
+#define PF_SB_ARQLEN_ARQENABLE_M BIT(31)
+#define PF_SB_ARQT 0x00230080 /* Reset Source: CORER */
+#define PF_SB_ARQT_ARQT_S 0
+#define PF_SB_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF_SB_ATQBAH 0x0022FC80 /* Reset Source: CORER */
+#define PF_SB_ATQBAH_ATQBAH_S 0
+#define PF_SB_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF_SB_ATQBAL 0x0022FC00 /* Reset Source: CORER */
+#define PF_SB_ATQBAL_ATQBAL_S 6
+#define PF_SB_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF_SB_ATQH 0x0022FD80 /* Reset Source: CORER */
+#define PF_SB_ATQH_ATQH_S 0
+#define PF_SB_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define PF_SB_ATQLEN 0x0022FD00 /* Reset Source: PFR */
+#define PF_SB_ATQLEN_ATQLEN_S 0
+#define PF_SB_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF_SB_ATQLEN_ATQVFE_S 28
+#define PF_SB_ATQLEN_ATQVFE_M BIT(28)
+#define PF_SB_ATQLEN_ATQOVFL_S 29
+#define PF_SB_ATQLEN_ATQOVFL_M BIT(29)
+#define PF_SB_ATQLEN_ATQCRIT_S 30
+#define PF_SB_ATQLEN_ATQCRIT_M BIT(30)
+#define PF_SB_ATQLEN_ATQENABLE_S 31
+#define PF_SB_ATQLEN_ATQENABLE_M BIT(31)
+#define PF_SB_ATQT 0x0022FE00 /* Reset Source: CORER */
+#define PF_SB_ATQT_ATQT_S 0
+#define PF_SB_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF_SB_REM_DEV_CTL 0x002300F0 /* Reset Source: CORER */
+#define PF_SB_REM_DEV_CTL_DEST_EN_S 0
+#define PF_SB_REM_DEV_CTL_DEST_EN_M ICE_M(0xFFFF, 0)
+#define PF0_FW_HLP_ARQBAH 0x000801C8 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ARQBAH_ARQBAH_S 0
+#define PF0_FW_HLP_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_FW_HLP_ARQBAL 0x000800C8 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ARQBAL_ARQBAL_LSB_S 0
+#define PF0_FW_HLP_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_FW_HLP_ARQBAL_ARQBAL_S 6
+#define PF0_FW_HLP_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_FW_HLP_ARQH 0x000803C8 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ARQH_ARQH_S 0
+#define PF0_FW_HLP_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define PF0_FW_HLP_ARQLEN 0x000802C8 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ARQLEN_ARQLEN_S 0
+#define PF0_FW_HLP_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF0_FW_HLP_ARQLEN_ARQVFE_S 28
+#define PF0_FW_HLP_ARQLEN_ARQVFE_M BIT(28)
+#define PF0_FW_HLP_ARQLEN_ARQOVFL_S 29
+#define PF0_FW_HLP_ARQLEN_ARQOVFL_M BIT(29)
+#define PF0_FW_HLP_ARQLEN_ARQCRIT_S 30
+#define PF0_FW_HLP_ARQLEN_ARQCRIT_M BIT(30)
+#define PF0_FW_HLP_ARQLEN_ARQENABLE_S 31
+#define PF0_FW_HLP_ARQLEN_ARQENABLE_M BIT(31)
+#define PF0_FW_HLP_ARQT 0x000804C8 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ARQT_ARQT_S 0
+#define PF0_FW_HLP_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF0_FW_HLP_ATQBAH 0x00080148 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ATQBAH_ATQBAH_S 0
+#define PF0_FW_HLP_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_FW_HLP_ATQBAL 0x00080048 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ATQBAL_ATQBAL_LSB_S 0
+#define PF0_FW_HLP_ATQBAL_ATQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_FW_HLP_ATQBAL_ATQBAL_S 6
+#define PF0_FW_HLP_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_FW_HLP_ATQH 0x00080348 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ATQH_ATQH_S 0
+#define PF0_FW_HLP_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define PF0_FW_HLP_ATQLEN 0x00080248 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ATQLEN_ATQLEN_S 0
+#define PF0_FW_HLP_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF0_FW_HLP_ATQLEN_ATQVFE_S 28
+#define PF0_FW_HLP_ATQLEN_ATQVFE_M BIT(28)
+#define PF0_FW_HLP_ATQLEN_ATQOVFL_S 29
+#define PF0_FW_HLP_ATQLEN_ATQOVFL_M BIT(29)
+#define PF0_FW_HLP_ATQLEN_ATQCRIT_S 30
+#define PF0_FW_HLP_ATQLEN_ATQCRIT_M BIT(30)
+#define PF0_FW_HLP_ATQLEN_ATQENABLE_S 31
+#define PF0_FW_HLP_ATQLEN_ATQENABLE_M BIT(31)
+#define PF0_FW_HLP_ATQT 0x00080448 /* Reset Source: EMPR */
+#define PF0_FW_HLP_ATQT_ATQT_S 0
+#define PF0_FW_HLP_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF0_FW_PSM_ARQBAH 0x000801C4 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ARQBAH_ARQBAH_S 0
+#define PF0_FW_PSM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_FW_PSM_ARQBAL 0x000800C4 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ARQBAL_ARQBAL_LSB_S 0
+#define PF0_FW_PSM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_FW_PSM_ARQBAL_ARQBAL_S 6
+#define PF0_FW_PSM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_FW_PSM_ARQH 0x000803C4 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ARQH_ARQH_S 0
+#define PF0_FW_PSM_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define PF0_FW_PSM_ARQLEN 0x000802C4 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ARQLEN_ARQLEN_S 0
+#define PF0_FW_PSM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF0_FW_PSM_ARQLEN_ARQVFE_S 28
+#define PF0_FW_PSM_ARQLEN_ARQVFE_M BIT(28)
+#define PF0_FW_PSM_ARQLEN_ARQOVFL_S 29
+#define PF0_FW_PSM_ARQLEN_ARQOVFL_M BIT(29)
+#define PF0_FW_PSM_ARQLEN_ARQCRIT_S 30
+#define PF0_FW_PSM_ARQLEN_ARQCRIT_M BIT(30)
+#define PF0_FW_PSM_ARQLEN_ARQENABLE_S 31
+#define PF0_FW_PSM_ARQLEN_ARQENABLE_M BIT(31)
+#define PF0_FW_PSM_ARQT 0x000804C4 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ARQT_ARQT_S 0
+#define PF0_FW_PSM_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF0_FW_PSM_ATQBAH 0x00080144 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ATQBAH_ATQBAH_S 0
+#define PF0_FW_PSM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_FW_PSM_ATQBAL 0x00080044 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ATQBAL_ATQBAL_LSB_S 0
+#define PF0_FW_PSM_ATQBAL_ATQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_FW_PSM_ATQBAL_ATQBAL_S 6
+#define PF0_FW_PSM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_FW_PSM_ATQH 0x00080344 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ATQH_ATQH_S 0
+#define PF0_FW_PSM_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define PF0_FW_PSM_ATQLEN 0x00080244 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ATQLEN_ATQLEN_S 0
+#define PF0_FW_PSM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF0_FW_PSM_ATQLEN_ATQVFE_S 28
+#define PF0_FW_PSM_ATQLEN_ATQVFE_M BIT(28)
+#define PF0_FW_PSM_ATQLEN_ATQOVFL_S 29
+#define PF0_FW_PSM_ATQLEN_ATQOVFL_M BIT(29)
+#define PF0_FW_PSM_ATQLEN_ATQCRIT_S 30
+#define PF0_FW_PSM_ATQLEN_ATQCRIT_M BIT(30)
+#define PF0_FW_PSM_ATQLEN_ATQENABLE_S 31
+#define PF0_FW_PSM_ATQLEN_ATQENABLE_M BIT(31)
+#define PF0_FW_PSM_ATQT 0x00080444 /* Reset Source: EMPR */
+#define PF0_FW_PSM_ATQT_ATQT_S 0
+#define PF0_FW_PSM_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF0_MBX_CPM_ARQBAH 0x0022E5D8 /* Reset Source: CORER */
+#define PF0_MBX_CPM_ARQBAH_ARQBAH_S 0
+#define PF0_MBX_CPM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_MBX_CPM_ARQBAL 0x0022E5D4 /* Reset Source: CORER */
+#define PF0_MBX_CPM_ARQBAL_ARQBAL_LSB_S 0
+#define PF0_MBX_CPM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_MBX_CPM_ARQBAL_ARQBAL_S 6
+#define PF0_MBX_CPM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_MBX_CPM_ARQH 0x0022E5E0 /* Reset Source: CORER */
+#define PF0_MBX_CPM_ARQH_ARQH_S 0
+#define PF0_MBX_CPM_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define PF0_MBX_CPM_ARQLEN 0x0022E5DC /* Reset Source: PFR */
+#define PF0_MBX_CPM_ARQLEN_ARQLEN_S 0
+#define PF0_MBX_CPM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF0_MBX_CPM_ARQLEN_ARQVFE_S 28
+#define PF0_MBX_CPM_ARQLEN_ARQVFE_M BIT(28)
+#define PF0_MBX_CPM_ARQLEN_ARQOVFL_S 29
+#define PF0_MBX_CPM_ARQLEN_ARQOVFL_M BIT(29)
+#define PF0_MBX_CPM_ARQLEN_ARQCRIT_S 30
+#define PF0_MBX_CPM_ARQLEN_ARQCRIT_M BIT(30)
+#define PF0_MBX_CPM_ARQLEN_ARQENABLE_S 31
+#define PF0_MBX_CPM_ARQLEN_ARQENABLE_M BIT(31)
+#define PF0_MBX_CPM_ARQT 0x0022E5E4 /* Reset Source: CORER */
+#define PF0_MBX_CPM_ARQT_ARQT_S 0
+#define PF0_MBX_CPM_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF0_MBX_CPM_ATQBAH 0x0022E5C4 /* Reset Source: CORER */
+#define PF0_MBX_CPM_ATQBAH_ATQBAH_S 0
+#define PF0_MBX_CPM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_MBX_CPM_ATQBAL 0x0022E5C0 /* Reset Source: CORER */
+#define PF0_MBX_CPM_ATQBAL_ATQBAL_S 6
+#define PF0_MBX_CPM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_MBX_CPM_ATQH 0x0022E5CC /* Reset Source: CORER */
+#define PF0_MBX_CPM_ATQH_ATQH_S 0
+#define PF0_MBX_CPM_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define PF0_MBX_CPM_ATQLEN 0x0022E5C8 /* Reset Source: PFR */
+#define PF0_MBX_CPM_ATQLEN_ATQLEN_S 0
+#define PF0_MBX_CPM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF0_MBX_CPM_ATQLEN_ATQVFE_S 28
+#define PF0_MBX_CPM_ATQLEN_ATQVFE_M BIT(28)
+#define PF0_MBX_CPM_ATQLEN_ATQOVFL_S 29
+#define PF0_MBX_CPM_ATQLEN_ATQOVFL_M BIT(29)
+#define PF0_MBX_CPM_ATQLEN_ATQCRIT_S 30
+#define PF0_MBX_CPM_ATQLEN_ATQCRIT_M BIT(30)
+#define PF0_MBX_CPM_ATQLEN_ATQENABLE_S 31
+#define PF0_MBX_CPM_ATQLEN_ATQENABLE_M BIT(31)
+#define PF0_MBX_CPM_ATQT 0x0022E5D0 /* Reset Source: CORER */
+#define PF0_MBX_CPM_ATQT_ATQT_S 0
+#define PF0_MBX_CPM_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF0_MBX_HLP_ARQBAH 0x0022E600 /* Reset Source: CORER */
+#define PF0_MBX_HLP_ARQBAH_ARQBAH_S 0
+#define PF0_MBX_HLP_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_MBX_HLP_ARQBAL 0x0022E5FC /* Reset Source: CORER */
+#define PF0_MBX_HLP_ARQBAL_ARQBAL_LSB_S 0
+#define PF0_MBX_HLP_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_MBX_HLP_ARQBAL_ARQBAL_S 6
+#define PF0_MBX_HLP_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_MBX_HLP_ARQH 0x0022E608 /* Reset Source: CORER */
+#define PF0_MBX_HLP_ARQH_ARQH_S 0
+#define PF0_MBX_HLP_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define PF0_MBX_HLP_ARQLEN 0x0022E604 /* Reset Source: PFR */
+#define PF0_MBX_HLP_ARQLEN_ARQLEN_S 0
+#define PF0_MBX_HLP_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF0_MBX_HLP_ARQLEN_ARQVFE_S 28
+#define PF0_MBX_HLP_ARQLEN_ARQVFE_M BIT(28)
+#define PF0_MBX_HLP_ARQLEN_ARQOVFL_S 29
+#define PF0_MBX_HLP_ARQLEN_ARQOVFL_M BIT(29)
+#define PF0_MBX_HLP_ARQLEN_ARQCRIT_S 30
+#define PF0_MBX_HLP_ARQLEN_ARQCRIT_M BIT(30)
+#define PF0_MBX_HLP_ARQLEN_ARQENABLE_S 31
+#define PF0_MBX_HLP_ARQLEN_ARQENABLE_M BIT(31)
+#define PF0_MBX_HLP_ARQT 0x0022E60C /* Reset Source: CORER */
+#define PF0_MBX_HLP_ARQT_ARQT_S 0
+#define PF0_MBX_HLP_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF0_MBX_HLP_ATQBAH 0x0022E5EC /* Reset Source: CORER */
+#define PF0_MBX_HLP_ATQBAH_ATQBAH_S 0
+#define PF0_MBX_HLP_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_MBX_HLP_ATQBAL 0x0022E5E8 /* Reset Source: CORER */
+#define PF0_MBX_HLP_ATQBAL_ATQBAL_S 6
+#define PF0_MBX_HLP_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_MBX_HLP_ATQH 0x0022E5F4 /* Reset Source: CORER */
+#define PF0_MBX_HLP_ATQH_ATQH_S 0
+#define PF0_MBX_HLP_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define PF0_MBX_HLP_ATQLEN 0x0022E5F0 /* Reset Source: PFR */
+#define PF0_MBX_HLP_ATQLEN_ATQLEN_S 0
+#define PF0_MBX_HLP_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF0_MBX_HLP_ATQLEN_ATQVFE_S 28
+#define PF0_MBX_HLP_ATQLEN_ATQVFE_M BIT(28)
+#define PF0_MBX_HLP_ATQLEN_ATQOVFL_S 29
+#define PF0_MBX_HLP_ATQLEN_ATQOVFL_M BIT(29)
+#define PF0_MBX_HLP_ATQLEN_ATQCRIT_S 30
+#define PF0_MBX_HLP_ATQLEN_ATQCRIT_M BIT(30)
+#define PF0_MBX_HLP_ATQLEN_ATQENABLE_S 31
+#define PF0_MBX_HLP_ATQLEN_ATQENABLE_M BIT(31)
+#define PF0_MBX_HLP_ATQT 0x0022E5F8 /* Reset Source: CORER */
+#define PF0_MBX_HLP_ATQT_ATQT_S 0
+#define PF0_MBX_HLP_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF0_MBX_PSM_ARQBAH 0x0022E628 /* Reset Source: CORER */
+#define PF0_MBX_PSM_ARQBAH_ARQBAH_S 0
+#define PF0_MBX_PSM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_MBX_PSM_ARQBAL 0x0022E624 /* Reset Source: CORER */
+#define PF0_MBX_PSM_ARQBAL_ARQBAL_LSB_S 0
+#define PF0_MBX_PSM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_MBX_PSM_ARQBAL_ARQBAL_S 6
+#define PF0_MBX_PSM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_MBX_PSM_ARQH 0x0022E630 /* Reset Source: CORER */
+#define PF0_MBX_PSM_ARQH_ARQH_S 0
+#define PF0_MBX_PSM_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define PF0_MBX_PSM_ARQLEN 0x0022E62C /* Reset Source: PFR */
+#define PF0_MBX_PSM_ARQLEN_ARQLEN_S 0
+#define PF0_MBX_PSM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF0_MBX_PSM_ARQLEN_ARQVFE_S 28
+#define PF0_MBX_PSM_ARQLEN_ARQVFE_M BIT(28)
+#define PF0_MBX_PSM_ARQLEN_ARQOVFL_S 29
+#define PF0_MBX_PSM_ARQLEN_ARQOVFL_M BIT(29)
+#define PF0_MBX_PSM_ARQLEN_ARQCRIT_S 30
+#define PF0_MBX_PSM_ARQLEN_ARQCRIT_M BIT(30)
+#define PF0_MBX_PSM_ARQLEN_ARQENABLE_S 31
+#define PF0_MBX_PSM_ARQLEN_ARQENABLE_M BIT(31)
+#define PF0_MBX_PSM_ARQT 0x0022E634 /* Reset Source: CORER */
+#define PF0_MBX_PSM_ARQT_ARQT_S 0
+#define PF0_MBX_PSM_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF0_MBX_PSM_ATQBAH 0x0022E614 /* Reset Source: CORER */
+#define PF0_MBX_PSM_ATQBAH_ATQBAH_S 0
+#define PF0_MBX_PSM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_MBX_PSM_ATQBAL 0x0022E610 /* Reset Source: CORER */
+#define PF0_MBX_PSM_ATQBAL_ATQBAL_S 6
+#define PF0_MBX_PSM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_MBX_PSM_ATQH 0x0022E61C /* Reset Source: CORER */
+#define PF0_MBX_PSM_ATQH_ATQH_S 0
+#define PF0_MBX_PSM_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define PF0_MBX_PSM_ATQLEN 0x0022E618 /* Reset Source: PFR */
+#define PF0_MBX_PSM_ATQLEN_ATQLEN_S 0
+#define PF0_MBX_PSM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF0_MBX_PSM_ATQLEN_ATQVFE_S 28
+#define PF0_MBX_PSM_ATQLEN_ATQVFE_M BIT(28)
+#define PF0_MBX_PSM_ATQLEN_ATQOVFL_S 29
+#define PF0_MBX_PSM_ATQLEN_ATQOVFL_M BIT(29)
+#define PF0_MBX_PSM_ATQLEN_ATQCRIT_S 30
+#define PF0_MBX_PSM_ATQLEN_ATQCRIT_M BIT(30)
+#define PF0_MBX_PSM_ATQLEN_ATQENABLE_S 31
+#define PF0_MBX_PSM_ATQLEN_ATQENABLE_M BIT(31)
+#define PF0_MBX_PSM_ATQT 0x0022E620 /* Reset Source: CORER */
+#define PF0_MBX_PSM_ATQT_ATQT_S 0
+#define PF0_MBX_PSM_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF0_SB_CPM_ARQBAH 0x0022E650 /* Reset Source: CORER */
+#define PF0_SB_CPM_ARQBAH_ARQBAH_S 0
+#define PF0_SB_CPM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_SB_CPM_ARQBAL 0x0022E64C /* Reset Source: CORER */
+#define PF0_SB_CPM_ARQBAL_ARQBAL_LSB_S 0
+#define PF0_SB_CPM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_SB_CPM_ARQBAL_ARQBAL_S 6
+#define PF0_SB_CPM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_SB_CPM_ARQH 0x0022E658 /* Reset Source: CORER */
+#define PF0_SB_CPM_ARQH_ARQH_S 0
+#define PF0_SB_CPM_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define PF0_SB_CPM_ARQLEN 0x0022E654 /* Reset Source: PFR */
+#define PF0_SB_CPM_ARQLEN_ARQLEN_S 0
+#define PF0_SB_CPM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF0_SB_CPM_ARQLEN_ARQVFE_S 28
+#define PF0_SB_CPM_ARQLEN_ARQVFE_M BIT(28)
+#define PF0_SB_CPM_ARQLEN_ARQOVFL_S 29
+#define PF0_SB_CPM_ARQLEN_ARQOVFL_M BIT(29)
+#define PF0_SB_CPM_ARQLEN_ARQCRIT_S 30
+#define PF0_SB_CPM_ARQLEN_ARQCRIT_M BIT(30)
+#define PF0_SB_CPM_ARQLEN_ARQENABLE_S 31
+#define PF0_SB_CPM_ARQLEN_ARQENABLE_M BIT(31)
+#define PF0_SB_CPM_ARQT 0x0022E65C /* Reset Source: CORER */
+#define PF0_SB_CPM_ARQT_ARQT_S 0
+#define PF0_SB_CPM_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF0_SB_CPM_ATQBAH 0x0022E63C /* Reset Source: CORER */
+#define PF0_SB_CPM_ATQBAH_ATQBAH_S 0
+#define PF0_SB_CPM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_SB_CPM_ATQBAL 0x0022E638 /* Reset Source: CORER */
+#define PF0_SB_CPM_ATQBAL_ATQBAL_S 6
+#define PF0_SB_CPM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_SB_CPM_ATQH 0x0022E644 /* Reset Source: CORER */
+#define PF0_SB_CPM_ATQH_ATQH_S 0
+#define PF0_SB_CPM_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define PF0_SB_CPM_ATQLEN 0x0022E640 /* Reset Source: PFR */
+#define PF0_SB_CPM_ATQLEN_ATQLEN_S 0
+#define PF0_SB_CPM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF0_SB_CPM_ATQLEN_ATQVFE_S 28
+#define PF0_SB_CPM_ATQLEN_ATQVFE_M BIT(28)
+#define PF0_SB_CPM_ATQLEN_ATQOVFL_S 29
+#define PF0_SB_CPM_ATQLEN_ATQOVFL_M BIT(29)
+#define PF0_SB_CPM_ATQLEN_ATQCRIT_S 30
+#define PF0_SB_CPM_ATQLEN_ATQCRIT_M BIT(30)
+#define PF0_SB_CPM_ATQLEN_ATQENABLE_S 31
+#define PF0_SB_CPM_ATQLEN_ATQENABLE_M BIT(31)
+#define PF0_SB_CPM_ATQT 0x0022E648 /* Reset Source: CORER */
+#define PF0_SB_CPM_ATQT_ATQT_S 0
+#define PF0_SB_CPM_ATQT_ATQT_M ICE_M(0x3FF, 0)
+#define PF0_SB_CPM_REM_DEV_CTL 0x002300F4 /* Reset Source: CORER */
+#define PF0_SB_CPM_REM_DEV_CTL_DEST_EN_S 0
+#define PF0_SB_CPM_REM_DEV_CTL_DEST_EN_M ICE_M(0xFFFF, 0)
+#define PF0_SB_HLP_ARQBAH 0x002300D8 /* Reset Source: CORER */
+#define PF0_SB_HLP_ARQBAH_ARQBAH_S 0
+#define PF0_SB_HLP_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_SB_HLP_ARQBAL 0x002300D4 /* Reset Source: CORER */
+#define PF0_SB_HLP_ARQBAL_ARQBAL_LSB_S 0
+#define PF0_SB_HLP_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0)
+#define PF0_SB_HLP_ARQBAL_ARQBAL_S 6
+#define PF0_SB_HLP_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_SB_HLP_ARQH 0x002300E0 /* Reset Source: CORER */
+#define PF0_SB_HLP_ARQH_ARQH_S 0
+#define PF0_SB_HLP_ARQH_ARQH_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ARQLEN 0x002300DC /* Reset Source: PFR */
+#define PF0_SB_HLP_ARQLEN_ARQLEN_S 0
+#define PF0_SB_HLP_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ARQLEN_ARQVFE_S 28
+#define PF0_SB_HLP_ARQLEN_ARQVFE_M BIT(28)
+#define PF0_SB_HLP_ARQLEN_ARQOVFL_S 29
+#define PF0_SB_HLP_ARQLEN_ARQOVFL_M BIT(29)
+#define PF0_SB_HLP_ARQLEN_ARQCRIT_S 30
+#define PF0_SB_HLP_ARQLEN_ARQCRIT_M BIT(30)
+#define PF0_SB_HLP_ARQLEN_ARQENABLE_S 31
+#define PF0_SB_HLP_ARQLEN_ARQENABLE_M BIT(31)
+#define PF0_SB_HLP_ARQT 0x002300E4 /* Reset Source: CORER */
+#define PF0_SB_HLP_ARQT_ARQT_S 0
+#define PF0_SB_HLP_ARQT_ARQT_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ATQBAH 0x002300C4 /* Reset Source: CORER */
+#define PF0_SB_HLP_ATQBAH_ATQBAH_S 0
+#define PF0_SB_HLP_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define PF0_SB_HLP_ATQBAL 0x002300C0 /* Reset Source: CORER */
+#define PF0_SB_HLP_ATQBAL_ATQBAL_S 6
+#define PF0_SB_HLP_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define PF0_SB_HLP_ATQH 0x002300CC /* Reset Source: CORER */
+#define PF0_SB_HLP_ATQH_ATQH_S 0
+#define PF0_SB_HLP_ATQH_ATQH_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ATQLEN 0x002300C8 /* Reset Source: PFR */
+#define PF0_SB_HLP_ATQLEN_ATQLEN_S 0
+#define PF0_SB_HLP_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0)
+#define PF0_SB_HLP_ATQLEN_ATQVFE_S 28
+#define PF0_SB_HLP_ATQLEN_ATQVFE_M BIT(28)
+#define PF0_SB_HLP_ATQLEN_ATQOVFL_S 29
+#define PF0_SB_HLP_ATQLEN_ATQOVFL_M BIT(29)
+#define PF0_SB_HLP_ATQLEN_ATQCRIT_S 30
+#define PF0_SB_HLP_ATQLEN_ATQCRIT_M BIT(30)
+#define PF0_SB_HLP_ATQLEN_ATQENABLE_S 31
31 +#define PF0_SB_HLP_ATQLEN_ATQENABLE_M BIT(31) +#define PF0_SB_HLP_ATQT 0x002300D0 /* Reset Source: CORER */ +#define PF0_SB_HLP_ATQT_ATQT_S 0 +#define PF0_SB_HLP_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define PF0_SB_HLP_REM_DEV_CTL 0x002300E8 /* Reset Source: CORER */ +#define PF0_SB_HLP_REM_DEV_CTL_DEST_EN_S 0 +#define PF0_SB_HLP_REM_DEV_CTL_DEST_EN_M ICE_M(0xFFFF, 0) +#define SB_REM_DEV_DEST(_i) (0x002300F8 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define SB_REM_DEV_DEST_MAX_INDEX 7 +#define SB_REM_DEV_DEST_DEST_S 0 +#define SB_REM_DEV_DEST_DEST_M ICE_M(0xF, 0) +#define SB_REM_DEV_DEST_DEST_VALID_S 31 +#define SB_REM_DEV_DEST_DEST_VALID_M BIT(31) +#define VF_MBX_ARQBAH(_VF) (0x0022B800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ARQBAH_MAX_INDEX 255 +#define VF_MBX_ARQBAH_ARQBAH_S 0 +#define VF_MBX_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_ARQBAL(_VF) (0x0022B400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ARQBAL_MAX_INDEX 255 +#define VF_MBX_ARQBAL_ARQBAL_LSB_S 0 +#define VF_MBX_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_ARQBAL_ARQBAL_S 6 +#define VF_MBX_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_ARQH(_VF) (0x0022C000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ARQH_MAX_INDEX 255 +#define VF_MBX_ARQH_ARQH_S 0 +#define VF_MBX_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_ARQLEN(_VF) (0x0022BC00 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VF_MBX_ARQLEN_MAX_INDEX 255 +#define VF_MBX_ARQLEN_ARQLEN_S 0 +#define VF_MBX_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_ARQLEN_ARQVFE_S 28 +#define VF_MBX_ARQLEN_ARQVFE_M BIT(28) +#define VF_MBX_ARQLEN_ARQOVFL_S 29 +#define VF_MBX_ARQLEN_ARQOVFL_M BIT(29) +#define VF_MBX_ARQLEN_ARQCRIT_S 30 +#define VF_MBX_ARQLEN_ARQCRIT_M BIT(30) +#define VF_MBX_ARQLEN_ARQENABLE_S 31 +#define VF_MBX_ARQLEN_ARQENABLE_M BIT(31) +#define VF_MBX_ARQT(_VF) (0x0022C400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ARQT_MAX_INDEX 255 +#define VF_MBX_ARQT_ARQT_S 0 +#define VF_MBX_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQBAH(_VF) (0x0022A400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ATQBAH_MAX_INDEX 255 +#define VF_MBX_ATQBAH_ATQBAH_S 0 +#define VF_MBX_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_ATQBAL(_VF) (0x0022A000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ATQBAL_MAX_INDEX 255 +#define VF_MBX_ATQBAL_ATQBAL_S 6 +#define VF_MBX_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_ATQH(_VF) (0x0022AC00 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ATQH_MAX_INDEX 255 +#define VF_MBX_ATQH_ATQH_S 0 +#define VF_MBX_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQLEN(_VF) (0x0022A800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define VF_MBX_ATQLEN_MAX_INDEX 255 +#define VF_MBX_ATQLEN_ATQLEN_S 0 +#define VF_MBX_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_ATQLEN_ATQVFE_S 28 +#define VF_MBX_ATQLEN_ATQVFE_M BIT(28) +#define VF_MBX_ATQLEN_ATQOVFL_S 29 +#define VF_MBX_ATQLEN_ATQOVFL_M BIT(29) +#define VF_MBX_ATQLEN_ATQCRIT_S 30 +#define VF_MBX_ATQLEN_ATQCRIT_M BIT(30) +#define VF_MBX_ATQLEN_ATQENABLE_S 31 +#define VF_MBX_ATQLEN_ATQENABLE_M BIT(31) +#define VF_MBX_ATQT(_VF) (0x0022B000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VF_MBX_ATQT_MAX_INDEX 255 +#define VF_MBX_ATQT_ATQT_S 0 +#define VF_MBX_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQBAH(_VF128) 
(0x0022D400 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQBAH_MAX_INDEX 127 +#define VF_MBX_CPM_ARQBAH_ARQBAH_S 0 +#define VF_MBX_CPM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ARQBAL(_VF128) (0x0022D200 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQBAL_MAX_INDEX 127 +#define VF_MBX_CPM_ARQBAL_ARQBAL_LSB_S 0 +#define VF_MBX_CPM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_CPM_ARQBAL_ARQBAL_S 6 +#define VF_MBX_CPM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_CPM_ARQH(_VF128) (0x0022D800 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQH_MAX_INDEX 127 +#define VF_MBX_CPM_ARQH_ARQH_S 0 +#define VF_MBX_CPM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQLEN(_VF128) (0x0022D600 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: PFR */ +#define VF_MBX_CPM_ARQLEN_MAX_INDEX 127 +#define VF_MBX_CPM_ARQLEN_ARQLEN_S 0 +#define VF_MBX_CPM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQLEN_ARQVFE_S 28 +#define VF_MBX_CPM_ARQLEN_ARQVFE_M BIT(28) +#define VF_MBX_CPM_ARQLEN_ARQOVFL_S 29 +#define VF_MBX_CPM_ARQLEN_ARQOVFL_M BIT(29) +#define VF_MBX_CPM_ARQLEN_ARQCRIT_S 30 +#define VF_MBX_CPM_ARQLEN_ARQCRIT_M BIT(30) +#define VF_MBX_CPM_ARQLEN_ARQENABLE_S 31 +#define VF_MBX_CPM_ARQLEN_ARQENABLE_M BIT(31) +#define VF_MBX_CPM_ARQT(_VF128) (0x0022DA00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQT_MAX_INDEX 127 +#define VF_MBX_CPM_ARQT_ARQT_S 0 +#define VF_MBX_CPM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQBAH(_VF128) (0x0022CA00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQBAH_MAX_INDEX 127 +#define VF_MBX_CPM_ATQBAH_ATQBAH_S 0 +#define VF_MBX_CPM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ATQBAL(_VF128) (0x0022C800 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQBAL_MAX_INDEX 127 +#define VF_MBX_CPM_ATQBAL_ATQBAL_S 6 +#define VF_MBX_CPM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_CPM_ATQH(_VF128) (0x0022CE00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQH_MAX_INDEX 127 +#define VF_MBX_CPM_ATQH_ATQH_S 0 +#define VF_MBX_CPM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQLEN(_VF128) (0x0022CC00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: PFR */ +#define VF_MBX_CPM_ATQLEN_MAX_INDEX 127 +#define VF_MBX_CPM_ATQLEN_ATQLEN_S 0 +#define VF_MBX_CPM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQLEN_ATQVFE_S 28 +#define VF_MBX_CPM_ATQLEN_ATQVFE_M BIT(28) +#define VF_MBX_CPM_ATQLEN_ATQOVFL_S 29 +#define VF_MBX_CPM_ATQLEN_ATQOVFL_M BIT(29) +#define VF_MBX_CPM_ATQLEN_ATQCRIT_S 30 +#define VF_MBX_CPM_ATQLEN_ATQCRIT_M BIT(30) +#define VF_MBX_CPM_ATQLEN_ATQENABLE_S 31 +#define VF_MBX_CPM_ATQLEN_ATQENABLE_M BIT(31) +#define VF_MBX_CPM_ATQT(_VF128) (0x0022D000 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQT_MAX_INDEX 127 +#define VF_MBX_CPM_ATQT_ATQT_S 0 +#define VF_MBX_CPM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQBAH(_VF16) (0x0022DD80 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQBAH_MAX_INDEX 15 +#define VF_MBX_HLP_ARQBAH_ARQBAH_S 0 +#define VF_MBX_HLP_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_HLP_ARQBAL(_VF16) (0x0022DD40 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQBAL_MAX_INDEX 15 +#define VF_MBX_HLP_ARQBAL_ARQBAL_LSB_S 0 +#define 
VF_MBX_HLP_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_HLP_ARQBAL_ARQBAL_S 6 +#define VF_MBX_HLP_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_HLP_ARQH(_VF16) (0x0022DE00 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQH_MAX_INDEX 15 +#define VF_MBX_HLP_ARQH_ARQH_S 0 +#define VF_MBX_HLP_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQLEN(_VF16) (0x0022DDC0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: PFR */ +#define VF_MBX_HLP_ARQLEN_MAX_INDEX 15 +#define VF_MBX_HLP_ARQLEN_ARQLEN_S 0 +#define VF_MBX_HLP_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQLEN_ARQVFE_S 28 +#define VF_MBX_HLP_ARQLEN_ARQVFE_M BIT(28) +#define VF_MBX_HLP_ARQLEN_ARQOVFL_S 29 +#define VF_MBX_HLP_ARQLEN_ARQOVFL_M BIT(29) +#define VF_MBX_HLP_ARQLEN_ARQCRIT_S 30 +#define VF_MBX_HLP_ARQLEN_ARQCRIT_M BIT(30) +#define VF_MBX_HLP_ARQLEN_ARQENABLE_S 31 +#define VF_MBX_HLP_ARQLEN_ARQENABLE_M BIT(31) +#define VF_MBX_HLP_ARQT(_VF16) (0x0022DE40 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQT_MAX_INDEX 15 +#define VF_MBX_HLP_ARQT_ARQT_S 0 +#define VF_MBX_HLP_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQBAH(_VF16) (0x0022DC40 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQBAH_MAX_INDEX 15 +#define VF_MBX_HLP_ATQBAH_ATQBAH_S 0 +#define VF_MBX_HLP_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_HLP_ATQBAL(_VF16) (0x0022DC00 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQBAL_MAX_INDEX 15 +#define VF_MBX_HLP_ATQBAL_ATQBAL_S 6 +#define VF_MBX_HLP_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_HLP_ATQH(_VF16) (0x0022DCC0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQH_MAX_INDEX 15 +#define VF_MBX_HLP_ATQH_ATQH_S 0 +#define VF_MBX_HLP_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQLEN(_VF16) (0x0022DC80 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: PFR */ +#define VF_MBX_HLP_ATQLEN_MAX_INDEX 15 +#define VF_MBX_HLP_ATQLEN_ATQLEN_S 0 +#define VF_MBX_HLP_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQLEN_ATQVFE_S 28 +#define VF_MBX_HLP_ATQLEN_ATQVFE_M BIT(28) +#define VF_MBX_HLP_ATQLEN_ATQOVFL_S 29 +#define VF_MBX_HLP_ATQLEN_ATQOVFL_M BIT(29) +#define VF_MBX_HLP_ATQLEN_ATQCRIT_S 30 +#define VF_MBX_HLP_ATQLEN_ATQCRIT_M BIT(30) +#define VF_MBX_HLP_ATQLEN_ATQENABLE_S 31 +#define VF_MBX_HLP_ATQLEN_ATQENABLE_M BIT(31) +#define VF_MBX_HLP_ATQT(_VF16) (0x0022DD00 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQT_MAX_INDEX 15 +#define VF_MBX_HLP_ATQT_ATQT_S 0 +#define VF_MBX_HLP_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQBAH(_VF16) (0x0022E000 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQBAH_MAX_INDEX 15 +#define VF_MBX_PSM_ARQBAH_ARQBAH_S 0 +#define VF_MBX_PSM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_PSM_ARQBAL(_VF16) (0x0022DFC0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQBAL_MAX_INDEX 15 +#define VF_MBX_PSM_ARQBAL_ARQBAL_LSB_S 0 +#define VF_MBX_PSM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_PSM_ARQBAL_ARQBAL_S 6 +#define VF_MBX_PSM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_PSM_ARQH(_VF16) (0x0022E080 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQH_MAX_INDEX 15 +#define VF_MBX_PSM_ARQH_ARQH_S 0 +#define VF_MBX_PSM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQLEN(_VF16) (0x0022E040 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset 
Source: PFR */ +#define VF_MBX_PSM_ARQLEN_MAX_INDEX 15 +#define VF_MBX_PSM_ARQLEN_ARQLEN_S 0 +#define VF_MBX_PSM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQLEN_ARQVFE_S 28 +#define VF_MBX_PSM_ARQLEN_ARQVFE_M BIT(28) +#define VF_MBX_PSM_ARQLEN_ARQOVFL_S 29 +#define VF_MBX_PSM_ARQLEN_ARQOVFL_M BIT(29) +#define VF_MBX_PSM_ARQLEN_ARQCRIT_S 30 +#define VF_MBX_PSM_ARQLEN_ARQCRIT_M BIT(30) +#define VF_MBX_PSM_ARQLEN_ARQENABLE_S 31 +#define VF_MBX_PSM_ARQLEN_ARQENABLE_M BIT(31) +#define VF_MBX_PSM_ARQT(_VF16) (0x0022E0C0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQT_MAX_INDEX 15 +#define VF_MBX_PSM_ARQT_ARQT_S 0 +#define VF_MBX_PSM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQBAH(_VF16) (0x0022DEC0 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQBAH_MAX_INDEX 15 +#define VF_MBX_PSM_ATQBAH_ATQBAH_S 0 +#define VF_MBX_PSM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_PSM_ATQBAL(_VF16) (0x0022DE80 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQBAL_MAX_INDEX 15 +#define VF_MBX_PSM_ATQBAL_ATQBAL_S 6 +#define VF_MBX_PSM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_PSM_ATQH(_VF16) (0x0022DF40 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQH_MAX_INDEX 15 +#define VF_MBX_PSM_ATQH_ATQH_S 0 +#define VF_MBX_PSM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQLEN(_VF16) (0x0022DF00 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: PFR */ +#define VF_MBX_PSM_ATQLEN_MAX_INDEX 15 +#define VF_MBX_PSM_ATQLEN_ATQLEN_S 0 +#define VF_MBX_PSM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQLEN_ATQVFE_S 28 +#define VF_MBX_PSM_ATQLEN_ATQVFE_M BIT(28) +#define VF_MBX_PSM_ATQLEN_ATQOVFL_S 29 +#define VF_MBX_PSM_ATQLEN_ATQOVFL_M BIT(29) +#define VF_MBX_PSM_ATQLEN_ATQCRIT_S 30 +#define VF_MBX_PSM_ATQLEN_ATQCRIT_M BIT(30) +#define VF_MBX_PSM_ATQLEN_ATQENABLE_S 31 +#define VF_MBX_PSM_ATQLEN_ATQENABLE_M BIT(31) +#define VF_MBX_PSM_ATQT(_VF16) (0x0022DF80 + ((_VF16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQT_MAX_INDEX 15 +#define VF_MBX_PSM_ATQT_ATQT_S 0 +#define VF_MBX_PSM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQBAH(_VF128) (0x0022F400 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ARQBAH_MAX_INDEX 127 +#define VF_SB_CPM_ARQBAH_ARQBAH_S 0 +#define VF_SB_CPM_ARQBAH_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_SB_CPM_ARQBAL(_VF128) (0x0022F200 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ARQBAL_MAX_INDEX 127 +#define VF_SB_CPM_ARQBAL_ARQBAL_LSB_S 0 +#define VF_SB_CPM_ARQBAL_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_SB_CPM_ARQBAL_ARQBAL_S 6 +#define VF_SB_CPM_ARQBAL_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_SB_CPM_ARQH(_VF128) (0x0022F800 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ARQH_MAX_INDEX 127 +#define VF_SB_CPM_ARQH_ARQH_S 0 +#define VF_SB_CPM_ARQH_ARQH_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQLEN(_VF128) (0x0022F600 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: PFR */ +#define VF_SB_CPM_ARQLEN_MAX_INDEX 127 +#define VF_SB_CPM_ARQLEN_ARQLEN_S 0 +#define VF_SB_CPM_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQLEN_ARQVFE_S 28 +#define VF_SB_CPM_ARQLEN_ARQVFE_M BIT(28) +#define VF_SB_CPM_ARQLEN_ARQOVFL_S 29 +#define VF_SB_CPM_ARQLEN_ARQOVFL_M BIT(29) +#define VF_SB_CPM_ARQLEN_ARQCRIT_S 30 +#define VF_SB_CPM_ARQLEN_ARQCRIT_M BIT(30) +#define VF_SB_CPM_ARQLEN_ARQENABLE_S 31 +#define 
VF_SB_CPM_ARQLEN_ARQENABLE_M BIT(31) +#define VF_SB_CPM_ARQT(_VF128) (0x0022FA00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ARQT_MAX_INDEX 127 +#define VF_SB_CPM_ARQT_ARQT_S 0 +#define VF_SB_CPM_ARQT_ARQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQBAH(_VF128) (0x0022EA00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ATQBAH_MAX_INDEX 127 +#define VF_SB_CPM_ATQBAH_ATQBAH_S 0 +#define VF_SB_CPM_ATQBAH_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_SB_CPM_ATQBAL(_VF128) (0x0022E800 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ATQBAL_MAX_INDEX 127 +#define VF_SB_CPM_ATQBAL_ATQBAL_S 6 +#define VF_SB_CPM_ATQBAL_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_SB_CPM_ATQH(_VF128) (0x0022EE00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ATQH_MAX_INDEX 127 +#define VF_SB_CPM_ATQH_ATQH_S 0 +#define VF_SB_CPM_ATQH_ATQH_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQLEN(_VF128) (0x0022EC00 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: PFR */ +#define VF_SB_CPM_ATQLEN_MAX_INDEX 127 +#define VF_SB_CPM_ATQLEN_ATQLEN_S 0 +#define VF_SB_CPM_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQLEN_ATQVFE_S 28 +#define VF_SB_CPM_ATQLEN_ATQVFE_M BIT(28) +#define VF_SB_CPM_ATQLEN_ATQOVFL_S 29 +#define VF_SB_CPM_ATQLEN_ATQOVFL_M BIT(29) +#define VF_SB_CPM_ATQLEN_ATQCRIT_S 30 +#define VF_SB_CPM_ATQLEN_ATQCRIT_M BIT(30) +#define VF_SB_CPM_ATQLEN_ATQENABLE_S 31 +#define VF_SB_CPM_ATQLEN_ATQENABLE_M BIT(31) +#define VF_SB_CPM_ATQT(_VF128) (0x0022F000 + ((_VF128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VF_SB_CPM_ATQT_MAX_INDEX 127 +#define VF_SB_CPM_ATQT_ATQT_S 0 +#define VF_SB_CPM_ATQT_ATQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_REM_DEV_CTL 0x002300EC /* Reset Source: CORER */ +#define VF_SB_CPM_REM_DEV_CTL_DEST_EN_S 0 +#define VF_SB_CPM_REM_DEV_CTL_DEST_EN_M ICE_M(0xFFFF, 0) +#define VP_MBX_CPM_PF_VF_CTRL(_VP128) (0x00231800 + ((_VP128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VP_MBX_CPM_PF_VF_CTRL_MAX_INDEX 127 +#define VP_MBX_CPM_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_MBX_CPM_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define VP_MBX_HLP_PF_VF_CTRL(_VP16) (0x00231A00 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VP_MBX_HLP_PF_VF_CTRL_MAX_INDEX 15 +#define VP_MBX_HLP_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_MBX_HLP_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define VP_MBX_PF_VF_CTRL(_VSI) (0x00230800 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VP_MBX_PF_VF_CTRL_MAX_INDEX 767 +#define VP_MBX_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_MBX_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define VP_MBX_PSM_PF_VF_CTRL(_VP16) (0x00231A40 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VP_MBX_PSM_PF_VF_CTRL_MAX_INDEX 15 +#define VP_MBX_PSM_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_MBX_PSM_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define VP_SB_CPM_PF_VF_CTRL(_VP128) (0x00231C00 + ((_VP128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VP_SB_CPM_PF_VF_CTRL_MAX_INDEX 127 +#define VP_SB_CPM_PF_VF_CTRL_QUEUE_EN_S 0 +#define VP_SB_CPM_PF_VF_CTRL_QUEUE_EN_M BIT(0) +#define GL_DCB_TDSCP2TC_BLOCK_DIS 0x00049218 /* Reset Source: CORER */ +#define GL_DCB_TDSCP2TC_BLOCK_DIS_DSCP2TC_BLOCK_DIS_S 0 +#define GL_DCB_TDSCP2TC_BLOCK_DIS_DSCP2TC_BLOCK_DIS_M BIT(0) +#define GL_DCB_TDSCP2TC_BLOCK_IPV4(_i) (0x00049018 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_DCB_TDSCP2TC_BLOCK_IPV4_MAX_INDEX 63 +#define GL_DCB_TDSCP2TC_BLOCK_IPV4_TC_BLOCK_LUT_S 0 +#define 
GL_DCB_TDSCP2TC_BLOCK_IPV4_TC_BLOCK_LUT_M ICE_M(0xFFFFFFFF, 0) +#define GL_DCB_TDSCP2TC_BLOCK_IPV6(_i) (0x00049118 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_DCB_TDSCP2TC_BLOCK_IPV6_MAX_INDEX 63 +#define GL_DCB_TDSCP2TC_BLOCK_IPV6_TC_BLOCK_LUT_S 0 +#define GL_DCB_TDSCP2TC_BLOCK_IPV6_TC_BLOCK_LUT_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_GENC 0x00083044 /* Reset Source: CORER */ +#define GLDCB_GENC_PCIRTT_S 0 +#define GLDCB_GENC_PCIRTT_M ICE_M(0xFFFF, 0) +#define GLDCB_PRS_RETSTCC(_i) (0x002000B0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_PRS_RETSTCC_MAX_INDEX 31 +#define GLDCB_PRS_RETSTCC_BWSHARE_S 0 +#define GLDCB_PRS_RETSTCC_BWSHARE_M ICE_M(0x7F, 0) +#define GLDCB_PRS_RETSTCC_ETSTC_S 31 +#define GLDCB_PRS_RETSTCC_ETSTC_M BIT(31) +#define GLDCB_PRS_RSPMC 0x00200160 /* Reset Source: CORER */ +#define GLDCB_PRS_RSPMC_RSPM_S 0 +#define GLDCB_PRS_RSPMC_RSPM_M ICE_M(0xFF, 0) +#define GLDCB_PRS_RSPMC_RPM_MODE_S 8 +#define GLDCB_PRS_RSPMC_RPM_MODE_M ICE_M(0x3, 8) +#define GLDCB_PRS_RSPMC_PRR_MAX_EXP_S 10 +#define GLDCB_PRS_RSPMC_PRR_MAX_EXP_M ICE_M(0xF, 10) +#define GLDCB_PRS_RSPMC_PFCTIMER_S 14 +#define GLDCB_PRS_RSPMC_PFCTIMER_M ICE_M(0x3FFF, 14) +#define GLDCB_PRS_RSPMC_RPM_DIS_S 31 +#define GLDCB_PRS_RSPMC_RPM_DIS_M BIT(31) +#define GLDCB_RETSTCC(_i) (0x00122140 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_RETSTCC_MAX_INDEX 31 +#define GLDCB_RETSTCC_BWSHARE_S 0 +#define GLDCB_RETSTCC_BWSHARE_M ICE_M(0x7F, 0) +#define GLDCB_RETSTCC_ETSTC_S 31 +#define GLDCB_RETSTCC_ETSTC_M BIT(31) +#define GLDCB_RETSTCS(_i) (0x001221C0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_RETSTCS_MAX_INDEX 31 +#define GLDCB_RETSTCS_CREDITS_S 0 +#define GLDCB_RETSTCS_CREDITS_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_RTC2PFC_RCB 0x00122100 /* Reset Source: CORER */ +#define GLDCB_RTC2PFC_RCB_TC2PFC_S 0 +#define GLDCB_RTC2PFC_RCB_TC2PFC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_SWT_RETSTCC(_i) (0x0020A040 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_SWT_RETSTCC_MAX_INDEX 31 +#define GLDCB_SWT_RETSTCC_BWSHARE_S 0 +#define GLDCB_SWT_RETSTCC_BWSHARE_M ICE_M(0x7F, 0) +#define GLDCB_SWT_RETSTCC_ETSTC_S 31 +#define GLDCB_SWT_RETSTCC_ETSTC_M BIT(31) +#define GLDCB_TC2PFC 0x001D2694 /* Reset Source: CORER */ +#define GLDCB_TC2PFC_TC2PFC_S 0 +#define GLDCB_TC2PFC_TC2PFC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TCB_MNG_SP 0x000AE12C /* Reset Source: CORER */ +#define GLDCB_TCB_MNG_SP_MNG_SP_S 0 +#define GLDCB_TCB_MNG_SP_MNG_SP_M BIT(0) +#define GLDCB_TCB_TCLL_CFG 0x000AE134 /* Reset Source: CORER */ +#define GLDCB_TCB_TCLL_CFG_LLTC_S 0 +#define GLDCB_TCB_TCLL_CFG_LLTC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TCB_WB_SP 0x000AE310 /* Reset Source: CORER */ +#define GLDCB_TCB_WB_SP_WB_SP_S 0 +#define GLDCB_TCB_WB_SP_WB_SP_M BIT(0) +#define GLDCB_TCUPM_IMM_EN 0x000BC824 /* Reset Source: CORER */ +#define GLDCB_TCUPM_IMM_EN_IMM_EN_S 0 +#define GLDCB_TCUPM_IMM_EN_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TCUPM_LEGACY_TC 0x000BC828 /* Reset Source: CORER */ +#define GLDCB_TCUPM_LEGACY_TC_LEGTC_S 0 +#define GLDCB_TCUPM_LEGACY_TC_LEGTC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TCUPM_NO_EXCEED_DIS 0x000BC830 /* Reset Source: CORER */ +#define GLDCB_TCUPM_NO_EXCEED_DIS_NON_EXCEED_DIS_S 0 +#define GLDCB_TCUPM_NO_EXCEED_DIS_NON_EXCEED_DIS_M BIT(0) +#define GLDCB_TCUPM_WB_DIS 0x000BC834 /* Reset Source: CORER */ +#define GLDCB_TCUPM_WB_DIS_PORT_DISABLE_S 0 +#define GLDCB_TCUPM_WB_DIS_PORT_DISABLE_M BIT(0) +#define 
GLDCB_TCUPM_WB_DIS_TC_DISABLE_S 1 +#define GLDCB_TCUPM_WB_DIS_TC_DISABLE_M BIT(1) +#define GLDCB_TFPFCI 0x0009949C /* Reset Source: CORER */ +#define GLDCB_TFPFCI_GLDCB_TFPFCI_S 0 +#define GLDCB_TFPFCI_GLDCB_TFPFCI_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TLPM_IMM_TCB 0x000A0190 /* Reset Source: CORER */ +#define GLDCB_TLPM_IMM_TCB_IMM_EN_S 0 +#define GLDCB_TLPM_IMM_TCB_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TLPM_IMM_TCUPM 0x000A018C /* Reset Source: CORER */ +#define GLDCB_TLPM_IMM_TCUPM_IMM_EN_S 0 +#define GLDCB_TLPM_IMM_TCUPM_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TLPM_PCI_DM 0x000A0180 /* Reset Source: CORER */ +#define GLDCB_TLPM_PCI_DM_MONITOR_S 0 +#define GLDCB_TLPM_PCI_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define GLDCB_TLPM_PCI_DTHR 0x000A0184 /* Reset Source: CORER */ +#define GLDCB_TLPM_PCI_DTHR_PCI_TDATA_S 0 +#define GLDCB_TLPM_PCI_DTHR_PCI_TDATA_M ICE_M(0xFFF, 0) +#define GLDCB_TPB_IMM_TLPM 0x00099468 /* Reset Source: CORER */ +#define GLDCB_TPB_IMM_TLPM_IMM_EN_S 0 +#define GLDCB_TPB_IMM_TLPM_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TPB_IMM_TPB 0x0009946C /* Reset Source: CORER */ +#define GLDCB_TPB_IMM_TPB_IMM_EN_S 0 +#define GLDCB_TPB_IMM_TPB_IMM_EN_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_TPB_TCLL_CFG 0x00099464 /* Reset Source: CORER */ +#define GLDCB_TPB_TCLL_CFG_LLTC_S 0 +#define GLDCB_TPB_TCLL_CFG_LLTC_M ICE_M(0xFFFFFFFF, 0) +#define GLTCB_BULK_DWRR_REG_QUANTA 0x000AE0E0 /* Reset Source: CORER */ +#define GLTCB_BULK_DWRR_REG_QUANTA_QUANTA_S 0 +#define GLTCB_BULK_DWRR_REG_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define GLTCB_BULK_DWRR_REG_SAT 0x000AE0F0 /* Reset Source: CORER */ +#define GLTCB_BULK_DWRR_REG_SAT_SATURATION_S 0 +#define GLTCB_BULK_DWRR_REG_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define GLTCB_BULK_DWRR_WB_QUANTA 0x000AE0E4 /* Reset Source: CORER */ +#define GLTCB_BULK_DWRR_WB_QUANTA_QUANTA_S 0 +#define GLTCB_BULK_DWRR_WB_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define GLTCB_BULK_DWRR_WB_SAT 0x000AE0F4 /* Reset Source: CORER */ +#define GLTCB_BULK_DWRR_WB_SAT_SATURATION_S 0 +#define GLTCB_BULK_DWRR_WB_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define GLTCB_CREDIT_EXP_CTL 0x000AE120 /* Reset Source: CORER */ +#define GLTCB_CREDIT_EXP_CTL_EN_S 0 +#define GLTCB_CREDIT_EXP_CTL_EN_M BIT(0) +#define GLTCB_CREDIT_EXP_CTL_MIN_PKT_S 1 +#define GLTCB_CREDIT_EXP_CTL_MIN_PKT_M ICE_M(0x1FF, 1) +#define GLTCB_LL_DWRR_REG_QUANTA 0x000AE0E8 /* Reset Source: CORER */ +#define GLTCB_LL_DWRR_REG_QUANTA_QUANTA_S 0 +#define GLTCB_LL_DWRR_REG_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define GLTCB_LL_DWRR_REG_SAT 0x000AE0F8 /* Reset Source: CORER */ +#define GLTCB_LL_DWRR_REG_SAT_SATURATION_S 0 +#define GLTCB_LL_DWRR_REG_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define GLTCB_LL_DWRR_WB_QUANTA 0x000AE0EC /* Reset Source: CORER */ +#define GLTCB_LL_DWRR_WB_QUANTA_QUANTA_S 0 +#define GLTCB_LL_DWRR_WB_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define GLTCB_LL_DWRR_WB_SAT 0x000AE0FC /* Reset Source: CORER */ +#define GLTCB_LL_DWRR_WB_SAT_SATURATION_S 0 +#define GLTCB_LL_DWRR_WB_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define GLTCB_WB_RL 0x000AE238 /* Reset Source: CORER */ +#define GLTCB_WB_RL_PERIOD_S 0 +#define GLTCB_WB_RL_PERIOD_M ICE_M(0xFFFF, 0) +#define GLTCB_WB_RL_EN_S 16 +#define GLTCB_WB_RL_EN_M BIT(16) +#define GLTPB_WB_RL 0x00099460 /* Reset Source: CORER */ +#define GLTPB_WB_RL_PERIOD_S 0 +#define GLTPB_WB_RL_PERIOD_M ICE_M(0xFFFF, 0) +#define GLTPB_WB_RL_EN_S 16 +#define GLTPB_WB_RL_EN_M BIT(16) +#define PRTDCB_FCCFG 0x001E4640 /* Reset Source: GLOBR */ +#define PRTDCB_FCCFG_TFCE_S 3 +#define PRTDCB_FCCFG_TFCE_M 
ICE_M(0x3, 3) +#define PRTDCB_FCRTV 0x001E4600 /* Reset Source: GLOBR */ +#define PRTDCB_FCRTV_FC_REFRESH_TH_S 0 +#define PRTDCB_FCRTV_FC_REFRESH_TH_M ICE_M(0xFFFF, 0) +#define PRTDCB_FCTTVN(_i) (0x001E4580 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: GLOBR */ +#define PRTDCB_FCTTVN_MAX_INDEX 3 +#define PRTDCB_FCTTVN_TTV_2N_S 0 +#define PRTDCB_FCTTVN_TTV_2N_M ICE_M(0xFFFF, 0) +#define PRTDCB_FCTTVN_TTV_2N_P1_S 16 +#define PRTDCB_FCTTVN_TTV_2N_P1_M ICE_M(0xFFFF, 16) +#define PRTDCB_GENC 0x00083000 /* Reset Source: CORER */ +#define PRTDCB_GENC_NUMTC_S 2 +#define PRTDCB_GENC_NUMTC_M ICE_M(0xF, 2) +#define PRTDCB_GENC_FCOEUP_S 6 +#define PRTDCB_GENC_FCOEUP_M ICE_M(0x7, 6) +#define PRTDCB_GENC_FCOEUP_VALID_S 9 +#define PRTDCB_GENC_FCOEUP_VALID_M BIT(9) +#define PRTDCB_GENC_PFCLDA_S 16 +#define PRTDCB_GENC_PFCLDA_M ICE_M(0xFFFF, 16) +#define PRTDCB_GENS 0x00083020 /* Reset Source: CORER */ #define PRTDCB_GENS_DCBX_STATUS_S 0 #define PRTDCB_GENS_DCBX_STATUS_M ICE_M(0x7, 0) -#define GL_PREEXT_L2_PMASK0(_i) (0x0020F0FC + ((_i) * 4)) -#define GL_PREEXT_L2_PMASK1(_i) (0x0020F108 + ((_i) * 4)) -#define GLFLXP_RXDID_FLAGS(_i, _j) (0x0045D000 + ((_i) * 4 + (_j) * 256)) +#define PRTDCB_PRS_RETSC 0x002001A0 /* Reset Source: CORER */ +#define PRTDCB_PRS_RETSC_ETS_MODE_S 0 +#define PRTDCB_PRS_RETSC_ETS_MODE_M BIT(0) +#define PRTDCB_PRS_RETSC_NON_ETS_MODE_S 1 +#define PRTDCB_PRS_RETSC_NON_ETS_MODE_M BIT(1) +#define PRTDCB_PRS_RETSC_ETS_MAX_EXP_S 2 +#define PRTDCB_PRS_RETSC_ETS_MAX_EXP_M ICE_M(0xF, 2) +#define PRTDCB_PRS_RPRRC 0x00200180 /* Reset Source: CORER */ +#define PRTDCB_PRS_RPRRC_BWSHARE_S 0 +#define PRTDCB_PRS_RPRRC_BWSHARE_M ICE_M(0x3FF, 0) +#define PRTDCB_PRS_RPRRC_BWSHARE_DIS_S 31 +#define PRTDCB_PRS_RPRRC_BWSHARE_DIS_M BIT(31) +#define PRTDCB_RETSC 0x001222A0 /* Reset Source: CORER */ +#define PRTDCB_RETSC_ETS_MODE_S 0 +#define PRTDCB_RETSC_ETS_MODE_M BIT(0) +#define PRTDCB_RETSC_NON_ETS_MODE_S 1 +#define PRTDCB_RETSC_NON_ETS_MODE_M BIT(1) +#define PRTDCB_RETSC_ETS_MAX_EXP_S 2 +#define PRTDCB_RETSC_ETS_MAX_EXP_M ICE_M(0xF, 2) +#define PRTDCB_RPRRC 0x001220C0 /* Reset Source: CORER */ +#define PRTDCB_RPRRC_BWSHARE_S 0 +#define PRTDCB_RPRRC_BWSHARE_M ICE_M(0x3FF, 0) +#define PRTDCB_RPRRC_BWSHARE_DIS_S 31 +#define PRTDCB_RPRRC_BWSHARE_DIS_M BIT(31) +#define PRTDCB_RPRRS 0x001220E0 /* Reset Source: CORER */ +#define PRTDCB_RPRRS_CREDITS_S 0 +#define PRTDCB_RPRRS_CREDITS_M ICE_M(0xFFFFFFFF, 0) +#define PRTDCB_RUP_TDPU 0x00040960 /* Reset Source: CORER */ +#define PRTDCB_RUP_TDPU_NOVLANUP_S 0 +#define PRTDCB_RUP_TDPU_NOVLANUP_M ICE_M(0x7, 0) +#define PRTDCB_RUP2TC 0x001D2640 /* Reset Source: CORER */ +#define PRTDCB_RUP2TC_UP0TC_S 0 +#define PRTDCB_RUP2TC_UP0TC_M ICE_M(0x7, 0) +#define PRTDCB_RUP2TC_UP1TC_S 3 +#define PRTDCB_RUP2TC_UP1TC_M ICE_M(0x7, 3) +#define PRTDCB_RUP2TC_UP2TC_S 6 +#define PRTDCB_RUP2TC_UP2TC_M ICE_M(0x7, 6) +#define PRTDCB_RUP2TC_UP3TC_S 9 +#define PRTDCB_RUP2TC_UP3TC_M ICE_M(0x7, 9) +#define PRTDCB_RUP2TC_UP4TC_S 12 +#define PRTDCB_RUP2TC_UP4TC_M ICE_M(0x7, 12) +#define PRTDCB_RUP2TC_UP5TC_S 15 +#define PRTDCB_RUP2TC_UP5TC_M ICE_M(0x7, 15) +#define PRTDCB_RUP2TC_UP6TC_S 18 +#define PRTDCB_RUP2TC_UP6TC_M ICE_M(0x7, 18) +#define PRTDCB_RUP2TC_UP7TC_S 21 +#define PRTDCB_RUP2TC_UP7TC_M ICE_M(0x7, 21) +#define PRTDCB_SWT_RETSC 0x0020A140 /* Reset Source: CORER */ +#define PRTDCB_SWT_RETSC_ETS_MODE_S 0 +#define PRTDCB_SWT_RETSC_ETS_MODE_M BIT(0) +#define PRTDCB_SWT_RETSC_NON_ETS_MODE_S 1 +#define PRTDCB_SWT_RETSC_NON_ETS_MODE_M BIT(1) +#define PRTDCB_SWT_RETSC_ETS_MAX_EXP_S 2 
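/*
 * Illustrative sketch (not part of the generated register list): each
 * field above is described by an _S (shift) / _M (mask) define pair,
 * where ICE_M(m, s) is assumed to expand to ((m) << (s)) as defined
 * earlier in this header. A field is read by masking the 32-bit
 * register value and shifting it down; struct ice_hw and the rd32()
 * MMIO read helper are assumed from the driver's register-access layer.
 */
static u8 ice_dcbx_status_sketch(struct ice_hw *hw)
{
	u32 reg = rd32(hw, PRTDCB_GENS);	/* port DCB general status */

	return (u8)((reg & PRTDCB_GENS_DCBX_STATUS_M) >>
		    PRTDCB_GENS_DCBX_STATUS_S);
}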
+#define PRTDCB_SWT_RETSC_ETS_MAX_EXP_M ICE_M(0xF, 2) +#define PRTDCB_TCB_DWRR_CREDITS 0x000AE000 /* Reset Source: CORER */ +#define PRTDCB_TCB_DWRR_CREDITS_CREDITS_S 0 +#define PRTDCB_TCB_DWRR_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define PRTDCB_TCB_DWRR_QUANTA 0x000AE020 /* Reset Source: CORER */ +#define PRTDCB_TCB_DWRR_QUANTA_QUANTA_S 0 +#define PRTDCB_TCB_DWRR_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define PRTDCB_TCB_DWRR_SAT 0x000AE040 /* Reset Source: CORER */ +#define PRTDCB_TCB_DWRR_SAT_SATURATION_S 0 +#define PRTDCB_TCB_DWRR_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define PRTDCB_TCUPM_NO_EXCEED_DM 0x000BC3C0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_NO_EXCEED_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_NO_EXCEED_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TCUPM_REG_CM 0x000BC360 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_CM_MONITOR_S 0 +#define PRTDCB_TCUPM_REG_CM_MONITOR_M ICE_M(0x7FFF, 0) +#define PRTDCB_TCUPM_REG_CTHR 0x000BC380 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_CTHR_PORTOFFTH_H_S 0 +#define PRTDCB_TCUPM_REG_CTHR_PORTOFFTH_H_M ICE_M(0x7FFF, 0) +#define PRTDCB_TCUPM_REG_CTHR_PORTOFFTH_L_S 15 +#define PRTDCB_TCUPM_REG_CTHR_PORTOFFTH_L_M ICE_M(0x7FFF, 15) +#define PRTDCB_TCUPM_REG_DM 0x000BC3A0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_REG_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TCUPM_REG_DTHR 0x000BC3E0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_DTHR_PORTOFFTH_H_S 0 +#define PRTDCB_TCUPM_REG_DTHR_PORTOFFTH_H_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_REG_DTHR_PORTOFFTH_L_S 12 +#define PRTDCB_TCUPM_REG_DTHR_PORTOFFTH_L_M ICE_M(0xFFF, 12) +#define PRTDCB_TCUPM_REG_PE_HB_DM 0x000BC400 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_PE_HB_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_REG_PE_HB_DM_MONITOR_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_REG_PE_HB_DTHR 0x000BC420 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_REG_PE_HB_DTHR_PORTOFFTH_H_S 0 +#define PRTDCB_TCUPM_REG_PE_HB_DTHR_PORTOFFTH_H_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_REG_PE_HB_DTHR_PORTOFFTH_L_S 12 +#define PRTDCB_TCUPM_REG_PE_HB_DTHR_PORTOFFTH_L_M ICE_M(0xFFF, 12) +#define PRTDCB_TCUPM_WAIT_PFC_CM 0x000BC440 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_CM_MONITOR_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_CM_MONITOR_M ICE_M(0x7FFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_CTHR 0x000BC460 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_CTHR_PORTOFFTH_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_CTHR_PORTOFFTH_M ICE_M(0x7FFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_DM 0x000BC480 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_DTHR 0x000BC4A0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_DTHR_PORTOFFTH_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_DTHR_PORTOFFTH_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DM 0x000BC4C0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DM_MONITOR_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DM_MONITOR_M ICE_M(0xFFF, 0) +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DTHR 0x000BC4E0 /* Reset Source: CORER */ +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DTHR_PORTOFFTH_S 0 +#define PRTDCB_TCUPM_WAIT_PFC_PE_HB_DTHR_PORTOFFTH_M ICE_M(0xFFF, 0) +#define PRTDCB_TDPUC 0x00040940 /* Reset Source: CORER */ +#define PRTDCB_TDPUC_MAX_TXFRAME_S 0 +#define PRTDCB_TDPUC_MAX_TXFRAME_M ICE_M(0xFFFF, 0) +#define PRTDCB_TDPUC_MAL_LENGTH_S 16 +#define PRTDCB_TDPUC_MAL_LENGTH_M BIT(16) +#define PRTDCB_TDPUC_MAL_CMD_S 17 +#define 
PRTDCB_TDPUC_MAL_CMD_M BIT(17) +#define PRTDCB_TDPUC_TTL_DROP_S 18 +#define PRTDCB_TDPUC_TTL_DROP_M BIT(18) +#define PRTDCB_TDPUC_UR_DROP_S 19 +#define PRTDCB_TDPUC_UR_DROP_M BIT(19) +#define PRTDCB_TDPUC_DUMMY_S 20 +#define PRTDCB_TDPUC_DUMMY_M BIT(20) +#define PRTDCB_TDPUC_BIG_PKT_SIZE_S 21 +#define PRTDCB_TDPUC_BIG_PKT_SIZE_M BIT(21) +#define PRTDCB_TDPUC_L2_ACCEPT_FAIL_S 22 +#define PRTDCB_TDPUC_L2_ACCEPT_FAIL_M BIT(22) +#define PRTDCB_TDPUC_DSCP_CHECK_FAIL_S 23 +#define PRTDCB_TDPUC_DSCP_CHECK_FAIL_M BIT(23) +#define PRTDCB_TDPUC_RCU_ANTISPOOF_S 24 +#define PRTDCB_TDPUC_RCU_ANTISPOOF_M BIT(24) +#define PRTDCB_TDPUC_NIC_DSI_S 25 +#define PRTDCB_TDPUC_NIC_DSI_M BIT(25) +#define PRTDCB_TDPUC_NIC_IPSEC_S 26 +#define PRTDCB_TDPUC_NIC_IPSEC_M BIT(26) +#define PRTDCB_TDPUC_CLEAR_DROP_S 31 +#define PRTDCB_TDPUC_CLEAR_DROP_M BIT(31) +#define PRTDCB_TFCS 0x001E4560 /* Reset Source: GLOBR */ +#define PRTDCB_TFCS_TXOFF_S 0 +#define PRTDCB_TFCS_TXOFF_M BIT(0) +#define PRTDCB_TFCS_TXOFF0_S 8 +#define PRTDCB_TFCS_TXOFF0_M BIT(8) +#define PRTDCB_TFCS_TXOFF1_S 9 +#define PRTDCB_TFCS_TXOFF1_M BIT(9) +#define PRTDCB_TFCS_TXOFF2_S 10 +#define PRTDCB_TFCS_TXOFF2_M BIT(10) +#define PRTDCB_TFCS_TXOFF3_S 11 +#define PRTDCB_TFCS_TXOFF3_M BIT(11) +#define PRTDCB_TFCS_TXOFF4_S 12 +#define PRTDCB_TFCS_TXOFF4_M BIT(12) +#define PRTDCB_TFCS_TXOFF5_S 13 +#define PRTDCB_TFCS_TXOFF5_M BIT(13) +#define PRTDCB_TFCS_TXOFF6_S 14 +#define PRTDCB_TFCS_TXOFF6_M BIT(14) +#define PRTDCB_TFCS_TXOFF7_S 15 +#define PRTDCB_TFCS_TXOFF7_M BIT(15) +#define PRTDCB_TLPM_REG_DM 0x000A0000 /* Reset Source: CORER */ +#define PRTDCB_TLPM_REG_DM_MONITOR_S 0 +#define PRTDCB_TLPM_REG_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TLPM_REG_DTHR 0x000A0020 /* Reset Source: CORER */ +#define PRTDCB_TLPM_REG_DTHR_PORTOFFTH_H_S 0 +#define PRTDCB_TLPM_REG_DTHR_PORTOFFTH_H_M ICE_M(0xFFF, 0) +#define PRTDCB_TLPM_REG_DTHR_PORTOFFTH_L_S 12 +#define PRTDCB_TLPM_REG_DTHR_PORTOFFTH_L_M ICE_M(0xFFF, 12) +#define PRTDCB_TLPM_WAIT_PFC_DM 0x000A0040 /* Reset Source: CORER */ +#define PRTDCB_TLPM_WAIT_PFC_DM_MONITOR_S 0 +#define PRTDCB_TLPM_WAIT_PFC_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define PRTDCB_TLPM_WAIT_PFC_DTHR 0x000A0060 /* Reset Source: CORER */ +#define PRTDCB_TLPM_WAIT_PFC_DTHR_PORTOFFTH_S 0 +#define PRTDCB_TLPM_WAIT_PFC_DTHR_PORTOFFTH_M ICE_M(0xFFF, 0) +#define PRTDCB_TPFCTS(_i) (0x001E4660 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: GLOBR */ +#define PRTDCB_TPFCTS_MAX_INDEX 7 +#define PRTDCB_TPFCTS_PFCTIMER_S 0 +#define PRTDCB_TPFCTS_PFCTIMER_M ICE_M(0x3FFF, 0) +#define PRTDCB_TUP2TC 0x001D26C0 /* Reset Source: CORER */ +#define PRTDCB_TUP2TC_UP0TC_S 0 +#define PRTDCB_TUP2TC_UP0TC_M ICE_M(0x7, 0) +#define PRTDCB_TUP2TC_UP1TC_S 3 +#define PRTDCB_TUP2TC_UP1TC_M ICE_M(0x7, 3) +#define PRTDCB_TUP2TC_UP2TC_S 6 +#define PRTDCB_TUP2TC_UP2TC_M ICE_M(0x7, 6) +#define PRTDCB_TUP2TC_UP3TC_S 9 +#define PRTDCB_TUP2TC_UP3TC_M ICE_M(0x7, 9) +#define PRTDCB_TUP2TC_UP4TC_S 12 +#define PRTDCB_TUP2TC_UP4TC_M ICE_M(0x7, 12) +#define PRTDCB_TUP2TC_UP5TC_S 15 +#define PRTDCB_TUP2TC_UP5TC_M ICE_M(0x7, 15) +#define PRTDCB_TUP2TC_UP6TC_S 18 +#define PRTDCB_TUP2TC_UP6TC_M ICE_M(0x7, 18) +#define PRTDCB_TUP2TC_UP7TC_S 21 +#define PRTDCB_TUP2TC_UP7TC_M ICE_M(0x7, 21) +#define PRTDCB_TX_DSCP2UP_CTL 0x00040980 /* Reset Source: CORER */ +#define PRTDCB_TX_DSCP2UP_CTL_DSCP2UP_ENA_S 0 +#define PRTDCB_TX_DSCP2UP_CTL_DSCP2UP_ENA_M BIT(0) +#define PRTDCB_TX_DSCP2UP_CTL_DSCP_DEFAULT_UP_S 1 +#define PRTDCB_TX_DSCP2UP_CTL_DSCP_DEFAULT_UP_M ICE_M(0x7, 1) +#define 
PRTDCB_TX_DSCP2UP_IPV4_LUT(_i) (0x000409A0 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: CORER */ +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_MAX_INDEX 7 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_0_S 0 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_0_M ICE_M(0x7, 0) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_1_S 4 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_1_M ICE_M(0x7, 4) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_2_S 8 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_2_M ICE_M(0x7, 8) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_3_S 12 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_3_M ICE_M(0x7, 12) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_4_S 16 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_4_M ICE_M(0x7, 16) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_5_S 20 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_5_M ICE_M(0x7, 20) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_6_S 24 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_6_M ICE_M(0x7, 24) +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_7_S 28 +#define PRTDCB_TX_DSCP2UP_IPV4_LUT_DSCP2UP_LUT_7_M ICE_M(0x7, 28) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT(_i) (0x00040AA0 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: CORER */ +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_MAX_INDEX 7 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_0_S 0 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_0_M ICE_M(0x7, 0) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_1_S 4 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_1_M ICE_M(0x7, 4) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_2_S 8 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_2_M ICE_M(0x7, 8) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_3_S 12 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_3_M ICE_M(0x7, 12) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_4_S 16 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_4_M ICE_M(0x7, 16) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_5_S 20 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_5_M ICE_M(0x7, 20) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_6_S 24 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_6_M ICE_M(0x7, 24) +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_7_S 28 +#define PRTDCB_TX_DSCP2UP_IPV6_LUT_DSCP2UP_LUT_7_M ICE_M(0x7, 28) +#define PRTTCB_BULK_DWRR_REG_CREDITS 0x000AE060 /* Reset Source: CORER */ +#define PRTTCB_BULK_DWRR_REG_CREDITS_CREDITS_S 0 +#define PRTTCB_BULK_DWRR_REG_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define PRTTCB_BULK_DWRR_WB_CREDITS 0x000AE080 /* Reset Source: CORER */ +#define PRTTCB_BULK_DWRR_WB_CREDITS_CREDITS_S 0 +#define PRTTCB_BULK_DWRR_WB_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define PRTTCB_CREDIT_EXP 0x000AE100 /* Reset Source: CORER */ +#define PRTTCB_CREDIT_EXP_EXPANSION_S 0 +#define PRTTCB_CREDIT_EXP_EXPANSION_M ICE_M(0xFF, 0) +#define PRTTCB_LL_DWRR_REG_CREDITS 0x000AE0A0 /* Reset Source: CORER */ +#define PRTTCB_LL_DWRR_REG_CREDITS_CREDITS_S 0 +#define PRTTCB_LL_DWRR_REG_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define PRTTCB_LL_DWRR_WB_CREDITS 0x000AE0C0 /* Reset Source: CORER */ +#define PRTTCB_LL_DWRR_WB_CREDITS_CREDITS_S 0 +#define PRTTCB_LL_DWRR_WB_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TCDCB_TCUPM_WAIT_CM(_i) (0x000BC520 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_CM_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_CM_MONITOR_S 0 +#define TCDCB_TCUPM_WAIT_CM_MONITOR_M ICE_M(0x7FFF, 0) +#define TCDCB_TCUPM_WAIT_CTHR(_i) (0x000BC5A0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_CTHR_MAX_INDEX 31 +#define 
TCDCB_TCUPM_WAIT_CTHR_TCOFFTH_S 0 +#define TCDCB_TCUPM_WAIT_CTHR_TCOFFTH_M ICE_M(0x7FFF, 0) +#define TCDCB_TCUPM_WAIT_DM(_i) (0x000BC620 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_DM_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_DM_MONITOR_S 0 +#define TCDCB_TCUPM_WAIT_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define TCDCB_TCUPM_WAIT_DTHR(_i) (0x000BC6A0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_DTHR_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_DTHR_TCOFFTH_S 0 +#define TCDCB_TCUPM_WAIT_DTHR_TCOFFTH_M ICE_M(0xFFF, 0) +#define TCDCB_TCUPM_WAIT_PE_HB_DM(_i) (0x000BC720 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_PE_HB_DM_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_PE_HB_DM_MONITOR_S 0 +#define TCDCB_TCUPM_WAIT_PE_HB_DM_MONITOR_M ICE_M(0xFFF, 0) +#define TCDCB_TCUPM_WAIT_PE_HB_DTHR(_i) (0x000BC7A0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TCUPM_WAIT_PE_HB_DTHR_MAX_INDEX 31 +#define TCDCB_TCUPM_WAIT_PE_HB_DTHR_TCOFFTH_S 0 +#define TCDCB_TCUPM_WAIT_PE_HB_DTHR_TCOFFTH_M ICE_M(0xFFF, 0) +#define TCDCB_TLPM_WAIT_DM(_i) (0x000A0080 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TLPM_WAIT_DM_MAX_INDEX 31 +#define TCDCB_TLPM_WAIT_DM_MONITOR_S 0 +#define TCDCB_TLPM_WAIT_DM_MONITOR_M ICE_M(0x7FFFF, 0) +#define TCDCB_TLPM_WAIT_DTHR(_i) (0x000A0100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCDCB_TLPM_WAIT_DTHR_MAX_INDEX 31 +#define TCDCB_TLPM_WAIT_DTHR_TCOFFTH_S 0 +#define TCDCB_TLPM_WAIT_DTHR_TCOFFTH_M ICE_M(0xFFF, 0) +#define TCTCB_WB_RL_TC_CFG(_i) (0x000AE138 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCTCB_WB_RL_TC_CFG_MAX_INDEX 31 +#define TCTCB_WB_RL_TC_CFG_TOKENS_S 0 +#define TCTCB_WB_RL_TC_CFG_TOKENS_M ICE_M(0xFFF, 0) +#define TCTCB_WB_RL_TC_CFG_BURST_SIZE_S 12 +#define TCTCB_WB_RL_TC_CFG_BURST_SIZE_M ICE_M(0x3FF, 12) +#define TCTCB_WB_RL_TC_STAT(_i) (0x000AE1B8 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TCTCB_WB_RL_TC_STAT_MAX_INDEX 31 +#define TCTCB_WB_RL_TC_STAT_BUCKET_S 0 +#define TCTCB_WB_RL_TC_STAT_BUCKET_M ICE_M(0x1FFFF, 0) +#define TPB_BULK_DWRR_REG_QUANTA 0x00099340 /* Reset Source: CORER */ +#define TPB_BULK_DWRR_REG_QUANTA_QUANTA_S 0 +#define TPB_BULK_DWRR_REG_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_BULK_DWRR_REG_SAT 0x00099350 /* Reset Source: CORER */ +#define TPB_BULK_DWRR_REG_SAT_SATURATION_S 0 +#define TPB_BULK_DWRR_REG_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_BULK_DWRR_WB_QUANTA 0x00099344 /* Reset Source: CORER */ +#define TPB_BULK_DWRR_WB_QUANTA_QUANTA_S 0 +#define TPB_BULK_DWRR_WB_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_BULK_DWRR_WB_SAT 0x00099354 /* Reset Source: CORER */ +#define TPB_BULK_DWRR_WB_SAT_SATURATION_S 0 +#define TPB_BULK_DWRR_WB_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_GLDCB_TCB_WB_SP 0x0009966C /* Reset Source: CORER */ +#define TPB_GLDCB_TCB_WB_SP_WB_SP_S 0 +#define TPB_GLDCB_TCB_WB_SP_WB_SP_M BIT(0) +#define TPB_GLTCB_CREDIT_EXP_CTL 0x00099664 /* Reset Source: CORER */ +#define TPB_GLTCB_CREDIT_EXP_CTL_EN_S 0 +#define TPB_GLTCB_CREDIT_EXP_CTL_EN_M BIT(0) +#define TPB_GLTCB_CREDIT_EXP_CTL_MIN_PKT_S 1 +#define TPB_GLTCB_CREDIT_EXP_CTL_MIN_PKT_M ICE_M(0x1FF, 1) +#define TPB_LL_DWRR_REG_QUANTA 0x00099348 /* Reset Source: CORER */ +#define TPB_LL_DWRR_REG_QUANTA_QUANTA_S 0 +#define TPB_LL_DWRR_REG_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_LL_DWRR_REG_SAT 0x00099358 /* Reset Source: CORER */ +#define TPB_LL_DWRR_REG_SAT_SATURATION_S 
0 +#define TPB_LL_DWRR_REG_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_LL_DWRR_WB_QUANTA 0x0009934C /* Reset Source: CORER */ +#define TPB_LL_DWRR_WB_QUANTA_QUANTA_S 0 +#define TPB_LL_DWRR_WB_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_LL_DWRR_WB_SAT 0x0009935C /* Reset Source: CORER */ +#define TPB_LL_DWRR_WB_SAT_SATURATION_S 0 +#define TPB_LL_DWRR_WB_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_PRTDCB_TCB_DWRR_CREDITS 0x000991C0 /* Reset Source: CORER */ +#define TPB_PRTDCB_TCB_DWRR_CREDITS_CREDITS_S 0 +#define TPB_PRTDCB_TCB_DWRR_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_PRTDCB_TCB_DWRR_QUANTA 0x00099220 /* Reset Source: CORER */ +#define TPB_PRTDCB_TCB_DWRR_QUANTA_QUANTA_S 0 +#define TPB_PRTDCB_TCB_DWRR_QUANTA_QUANTA_M ICE_M(0x7FF, 0) +#define TPB_PRTDCB_TCB_DWRR_SAT 0x00099260 /* Reset Source: CORER */ +#define TPB_PRTDCB_TCB_DWRR_SAT_SATURATION_S 0 +#define TPB_PRTDCB_TCB_DWRR_SAT_SATURATION_M ICE_M(0x1FFFF, 0) +#define TPB_PRTTCB_BULK_DWRR_REG_CREDITS 0x000992A0 /* Reset Source: CORER */ +#define TPB_PRTTCB_BULK_DWRR_REG_CREDITS_CREDITS_S 0 +#define TPB_PRTTCB_BULK_DWRR_REG_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_PRTTCB_BULK_DWRR_WB_CREDITS 0x000992C0 /* Reset Source: CORER */ +#define TPB_PRTTCB_BULK_DWRR_WB_CREDITS_CREDITS_S 0 +#define TPB_PRTTCB_BULK_DWRR_WB_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_PRTTCB_CREDIT_EXP 0x00099644 /* Reset Source: CORER */ +#define TPB_PRTTCB_CREDIT_EXP_EXPANSION_S 0 +#define TPB_PRTTCB_CREDIT_EXP_EXPANSION_M ICE_M(0xFF, 0) +#define TPB_PRTTCB_LL_DWRR_REG_CREDITS 0x00099300 /* Reset Source: CORER */ +#define TPB_PRTTCB_LL_DWRR_REG_CREDITS_CREDITS_S 0 +#define TPB_PRTTCB_LL_DWRR_REG_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_PRTTCB_LL_DWRR_WB_CREDITS 0x00099320 /* Reset Source: CORER */ +#define TPB_PRTTCB_LL_DWRR_WB_CREDITS_CREDITS_S 0 +#define TPB_PRTTCB_LL_DWRR_WB_CREDITS_CREDITS_M ICE_M(0x3FFFF, 0) +#define TPB_WB_RL_TC_CFG(_i) (0x00099360 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TPB_WB_RL_TC_CFG_MAX_INDEX 31 +#define TPB_WB_RL_TC_CFG_TOKENS_S 0 +#define TPB_WB_RL_TC_CFG_TOKENS_M ICE_M(0xFFF, 0) +#define TPB_WB_RL_TC_CFG_BURST_SIZE_S 12 +#define TPB_WB_RL_TC_CFG_BURST_SIZE_M ICE_M(0x3FF, 12) +#define TPB_WB_RL_TC_STAT(_i) (0x000993E0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define TPB_WB_RL_TC_STAT_MAX_INDEX 31 +#define TPB_WB_RL_TC_STAT_BUCKET_S 0 +#define TPB_WB_RL_TC_STAT_BUCKET_M ICE_M(0x1FFFF, 0) +#define GL_ACLEXT_CDMD_L1SEL(_i) (0x00210054 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_CDMD_L1SEL_MAX_INDEX 2 +#define GL_ACLEXT_CDMD_L1SEL_RX_SEL_S 0 +#define GL_ACLEXT_CDMD_L1SEL_RX_SEL_M ICE_M(0x1F, 0) +#define GL_ACLEXT_CDMD_L1SEL_TX_SEL_S 8 +#define GL_ACLEXT_CDMD_L1SEL_TX_SEL_M ICE_M(0x1F, 8) +#define GL_ACLEXT_CDMD_L1SEL_AUX0_SEL_S 16 +#define GL_ACLEXT_CDMD_L1SEL_AUX0_SEL_M ICE_M(0x1F, 16) +#define GL_ACLEXT_CDMD_L1SEL_AUX1_SEL_S 24 +#define GL_ACLEXT_CDMD_L1SEL_AUX1_SEL_M ICE_M(0x1F, 24) +#define GL_ACLEXT_CDMD_L1SEL_BIDIR_ENA_S 30 +#define GL_ACLEXT_CDMD_L1SEL_BIDIR_ENA_M ICE_M(0x3, 30) +#define GL_ACLEXT_CTLTBL_L2ADDR(_i) (0x00210084 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_CTLTBL_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_CTLTBL_L2ADDR_LINE_OFF_S 0 +#define GL_ACLEXT_CTLTBL_L2ADDR_LINE_OFF_M ICE_M(0x7, 0) +#define GL_ACLEXT_CTLTBL_L2ADDR_LINE_IDX_S 8 +#define GL_ACLEXT_CTLTBL_L2ADDR_LINE_IDX_M ICE_M(0x7, 8) +#define GL_ACLEXT_CTLTBL_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_CTLTBL_L2ADDR_AUTO_INC_M 
BIT(31) +#define GL_ACLEXT_CTLTBL_L2DATA(_i) (0x00210090 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_CTLTBL_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_CTLTBL_L2DATA_DATA_S 0 +#define GL_ACLEXT_CTLTBL_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_DFLT_L2PRFL(_i) (0x00210138 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_DFLT_L2PRFL_MAX_INDEX 2 +#define GL_ACLEXT_DFLT_L2PRFL_DFLT_PRFL_S 0 +#define GL_ACLEXT_DFLT_L2PRFL_DFLT_PRFL_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_DFLT_L2PRFL_ACL(_i) (0x00393800 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_DFLT_L2PRFL_ACL_MAX_INDEX 2 +#define GL_ACLEXT_DFLT_L2PRFL_ACL_DFLT_PRFL_S 0 +#define GL_ACLEXT_DFLT_L2PRFL_ACL_DFLT_PRFL_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_FLGS_L1SEL0_1(_i) (0x0021006C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FLGS_L1SEL0_1_MAX_INDEX 2 +#define GL_ACLEXT_FLGS_L1SEL0_1_FLS0_S 0 +#define GL_ACLEXT_FLGS_L1SEL0_1_FLS0_M ICE_M(0x1FF, 0) +#define GL_ACLEXT_FLGS_L1SEL0_1_FLS1_S 16 +#define GL_ACLEXT_FLGS_L1SEL0_1_FLS1_M ICE_M(0x1FF, 16) +#define GL_ACLEXT_FLGS_L1SEL2_3(_i) (0x00210078 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FLGS_L1SEL2_3_MAX_INDEX 2 +#define GL_ACLEXT_FLGS_L1SEL2_3_FLS2_S 0 +#define GL_ACLEXT_FLGS_L1SEL2_3_FLS2_M ICE_M(0x1FF, 0) +#define GL_ACLEXT_FLGS_L1SEL2_3_FLS3_S 16 +#define GL_ACLEXT_FLGS_L1SEL2_3_FLS3_M ICE_M(0x1FF, 16) +#define GL_ACLEXT_FLGS_L1TBL(_i) (0x00210060 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FLGS_L1TBL_MAX_INDEX 2 +#define GL_ACLEXT_FLGS_L1TBL_LSB_S 0 +#define GL_ACLEXT_FLGS_L1TBL_LSB_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_FLGS_L1TBL_MSB_S 16 +#define GL_ACLEXT_FLGS_L1TBL_MSB_M ICE_M(0xFFFF, 16) +#define GL_ACLEXT_FORCE_L1CDID(_i) (0x00210018 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FORCE_L1CDID_MAX_INDEX 2 +#define GL_ACLEXT_FORCE_L1CDID_STATIC_CDID_S 0 +#define GL_ACLEXT_FORCE_L1CDID_STATIC_CDID_M ICE_M(0xF, 0) +#define GL_ACLEXT_FORCE_L1CDID_STATIC_CDID_EN_S 31 +#define GL_ACLEXT_FORCE_L1CDID_STATIC_CDID_EN_M BIT(31) +#define GL_ACLEXT_FORCE_PID(_i) (0x00210000 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_FORCE_PID_MAX_INDEX 2 +#define GL_ACLEXT_FORCE_PID_STATIC_PID_S 0 +#define GL_ACLEXT_FORCE_PID_STATIC_PID_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_FORCE_PID_STATIC_PID_EN_S 31 +#define GL_ACLEXT_FORCE_PID_STATIC_PID_EN_M BIT(31) +#define GL_ACLEXT_K2N_L2ADDR(_i) (0x00210144 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_K2N_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_K2N_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_K2N_L2ADDR_LINE_IDX_M ICE_M(0x7F, 0) +#define GL_ACLEXT_K2N_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_K2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_K2N_L2DATA(_i) (0x00210150 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_K2N_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_K2N_L2DATA_DATA0_S 0 +#define GL_ACLEXT_K2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_ACLEXT_K2N_L2DATA_DATA1_S 8 +#define GL_ACLEXT_K2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_ACLEXT_K2N_L2DATA_DATA2_S 16 +#define GL_ACLEXT_K2N_L2DATA_DATA2_M ICE_M(0xFF, 16) +#define GL_ACLEXT_K2N_L2DATA_DATA3_S 24 +#define GL_ACLEXT_K2N_L2DATA_DATA3_M ICE_M(0xFF, 24) +#define GL_ACLEXT_L2_PMASK0(_i) (0x002100FC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2_PMASK0_MAX_INDEX 2 +#define GL_ACLEXT_L2_PMASK0_BITMASK_S 0 +#define 
GL_ACLEXT_L2_PMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_L2_PMASK1(_i) (0x00210108 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2_PMASK1_MAX_INDEX 2 +#define GL_ACLEXT_L2_PMASK1_BITMASK_S 0 +#define GL_ACLEXT_L2_PMASK1_BITMASK_M ICE_M(0xFFFF, 0) +#define GL_ACLEXT_L2_TMASK0(_i) (0x00210498 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2_TMASK0_MAX_INDEX 2 +#define GL_ACLEXT_L2_TMASK0_BITMASK_S 0 +#define GL_ACLEXT_L2_TMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_L2_TMASK1(_i) (0x002104A4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2_TMASK1_MAX_INDEX 2 +#define GL_ACLEXT_L2_TMASK1_BITMASK_S 0 +#define GL_ACLEXT_L2_TMASK1_BITMASK_M ICE_M(0xFF, 0) +#define GL_ACLEXT_L2BMP0_3(_i) (0x002100A8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2BMP0_3_MAX_INDEX 2 +#define GL_ACLEXT_L2BMP0_3_BMP0_S 0 +#define GL_ACLEXT_L2BMP0_3_BMP0_M ICE_M(0xFF, 0) +#define GL_ACLEXT_L2BMP0_3_BMP1_S 8 +#define GL_ACLEXT_L2BMP0_3_BMP1_M ICE_M(0xFF, 8) +#define GL_ACLEXT_L2BMP0_3_BMP2_S 16 +#define GL_ACLEXT_L2BMP0_3_BMP2_M ICE_M(0xFF, 16) +#define GL_ACLEXT_L2BMP0_3_BMP3_S 24 +#define GL_ACLEXT_L2BMP0_3_BMP3_M ICE_M(0xFF, 24) +#define GL_ACLEXT_L2BMP4_7(_i) (0x002100B4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2BMP4_7_MAX_INDEX 2 +#define GL_ACLEXT_L2BMP4_7_BMP4_S 0 +#define GL_ACLEXT_L2BMP4_7_BMP4_M ICE_M(0xFF, 0) +#define GL_ACLEXT_L2BMP4_7_BMP5_S 8 +#define GL_ACLEXT_L2BMP4_7_BMP5_M ICE_M(0xFF, 8) +#define GL_ACLEXT_L2BMP4_7_BMP6_S 16 +#define GL_ACLEXT_L2BMP4_7_BMP6_M ICE_M(0xFF, 16) +#define GL_ACLEXT_L2BMP4_7_BMP7_S 24 +#define GL_ACLEXT_L2BMP4_7_BMP7_M ICE_M(0xFF, 24) +#define GL_ACLEXT_L2PRTMOD(_i) (0x0021009C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_L2PRTMOD_MAX_INDEX 2 +#define GL_ACLEXT_L2PRTMOD_XLT1_S 0 +#define GL_ACLEXT_L2PRTMOD_XLT1_M ICE_M(0x3, 0) +#define GL_ACLEXT_L2PRTMOD_XLT2_S 8 +#define GL_ACLEXT_L2PRTMOD_XLT2_M ICE_M(0x3, 8) +#define GL_ACLEXT_N2N_L2ADDR(_i) (0x0021015C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_N2N_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_N2N_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_N2N_L2ADDR_LINE_IDX_M ICE_M(0x3F, 0) +#define GL_ACLEXT_N2N_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_N2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_N2N_L2DATA(_i) (0x00210168 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_N2N_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_N2N_L2DATA_DATA0_S 0 +#define GL_ACLEXT_N2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_ACLEXT_N2N_L2DATA_DATA1_S 8 +#define GL_ACLEXT_N2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_ACLEXT_N2N_L2DATA_DATA2_S 16 +#define GL_ACLEXT_N2N_L2DATA_DATA2_M ICE_M(0xFF, 16) +#define GL_ACLEXT_N2N_L2DATA_DATA3_S 24 +#define GL_ACLEXT_N2N_L2DATA_DATA3_M ICE_M(0xFF, 24) +#define GL_ACLEXT_P2P_L1ADDR(_i) (0x00210024 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_P2P_L1ADDR_MAX_INDEX 2 +#define GL_ACLEXT_P2P_L1ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_P2P_L1ADDR_LINE_IDX_M BIT(0) +#define GL_ACLEXT_P2P_L1ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_P2P_L1ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_P2P_L1DATA(_i) (0x00210030 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_P2P_L1DATA_MAX_INDEX 2 +#define GL_ACLEXT_P2P_L1DATA_DATA_S 0 +#define GL_ACLEXT_P2P_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_PID_L2GKTYPE(_i) (0x002100F0 + ((_i) * 4)) /* _i=0...2 
*/ /* Reset Source: CORER */ +#define GL_ACLEXT_PID_L2GKTYPE_MAX_INDEX 2 +#define GL_ACLEXT_PID_L2GKTYPE_PID_GKTYPE_S 0 +#define GL_ACLEXT_PID_L2GKTYPE_PID_GKTYPE_M ICE_M(0x3, 0) +#define GL_ACLEXT_PLVL_SEL(_i) (0x0021000C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_PLVL_SEL_MAX_INDEX 2 +#define GL_ACLEXT_PLVL_SEL_PLVL_SEL_S 0 +#define GL_ACLEXT_PLVL_SEL_PLVL_SEL_M BIT(0) +#define GL_ACLEXT_TCAM_L2ADDR(_i) (0x00210114 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_TCAM_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_TCAM_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_TCAM_L2ADDR_LINE_IDX_M ICE_M(0x3FF, 0) +#define GL_ACLEXT_TCAM_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_TCAM_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_TCAM_L2DATALSB(_i) (0x00210120 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_TCAM_L2DATALSB_MAX_INDEX 2 +#define GL_ACLEXT_TCAM_L2DATALSB_DATALSB_S 0 +#define GL_ACLEXT_TCAM_L2DATALSB_DATALSB_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_TCAM_L2DATAMSB(_i) (0x0021012C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_TCAM_L2DATAMSB_MAX_INDEX 2 +#define GL_ACLEXT_TCAM_L2DATAMSB_DATAMSB_S 0 +#define GL_ACLEXT_TCAM_L2DATAMSB_DATAMSB_M ICE_M(0xFF, 0) +#define GL_ACLEXT_XLT0_L1ADDR(_i) (0x0021003C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT0_L1ADDR_MAX_INDEX 2 +#define GL_ACLEXT_XLT0_L1ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_XLT0_L1ADDR_LINE_IDX_M ICE_M(0xFF, 0) +#define GL_ACLEXT_XLT0_L1ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_XLT0_L1ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_XLT0_L1DATA(_i) (0x00210048 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT0_L1DATA_MAX_INDEX 2 +#define GL_ACLEXT_XLT0_L1DATA_DATA_S 0 +#define GL_ACLEXT_XLT0_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_XLT1_L2ADDR(_i) (0x002100C0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT1_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_XLT1_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_XLT1_L2ADDR_LINE_IDX_M ICE_M(0x7FF, 0) +#define GL_ACLEXT_XLT1_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_XLT1_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_XLT1_L2DATA(_i) (0x002100CC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT1_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_XLT1_L2DATA_DATA_S 0 +#define GL_ACLEXT_XLT1_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_ACLEXT_XLT2_L2ADDR(_i) (0x002100D8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT2_L2ADDR_MAX_INDEX 2 +#define GL_ACLEXT_XLT2_L2ADDR_LINE_IDX_S 0 +#define GL_ACLEXT_XLT2_L2ADDR_LINE_IDX_M ICE_M(0x1FF, 0) +#define GL_ACLEXT_XLT2_L2ADDR_AUTO_INC_S 31 +#define GL_ACLEXT_XLT2_L2ADDR_AUTO_INC_M BIT(31) +#define GL_ACLEXT_XLT2_L2DATA(_i) (0x002100E4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_ACLEXT_XLT2_L2DATA_MAX_INDEX 2 +#define GL_ACLEXT_XLT2_L2DATA_DATA_S 0 +#define GL_ACLEXT_XLT2_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_CDMD_L1SEL(_i) (0x0020F054 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_CDMD_L1SEL_MAX_INDEX 2 +#define GL_PREEXT_CDMD_L1SEL_RX_SEL_S 0 +#define GL_PREEXT_CDMD_L1SEL_RX_SEL_M ICE_M(0x1F, 0) +#define GL_PREEXT_CDMD_L1SEL_TX_SEL_S 8 +#define GL_PREEXT_CDMD_L1SEL_TX_SEL_M ICE_M(0x1F, 8) +#define GL_PREEXT_CDMD_L1SEL_AUX0_SEL_S 16 +#define GL_PREEXT_CDMD_L1SEL_AUX0_SEL_M ICE_M(0x1F, 16) +#define GL_PREEXT_CDMD_L1SEL_AUX1_SEL_S 24 +#define GL_PREEXT_CDMD_L1SEL_AUX1_SEL_M ICE_M(0x1F, 
24) +#define GL_PREEXT_CDMD_L1SEL_BIDIR_ENA_S 30 +#define GL_PREEXT_CDMD_L1SEL_BIDIR_ENA_M ICE_M(0x3, 30) +#define GL_PREEXT_CTLTBL_L2ADDR(_i) (0x0020F084 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_CTLTBL_L2ADDR_MAX_INDEX 2 +#define GL_PREEXT_CTLTBL_L2ADDR_LINE_OFF_S 0 +#define GL_PREEXT_CTLTBL_L2ADDR_LINE_OFF_M ICE_M(0x7, 0) +#define GL_PREEXT_CTLTBL_L2ADDR_LINE_IDX_S 8 +#define GL_PREEXT_CTLTBL_L2ADDR_LINE_IDX_M ICE_M(0x7, 8) +#define GL_PREEXT_CTLTBL_L2ADDR_AUTO_INC_S 31 +#define GL_PREEXT_CTLTBL_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_CTLTBL_L2DATA(_i) (0x0020F090 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_CTLTBL_L2DATA_MAX_INDEX 2 +#define GL_PREEXT_CTLTBL_L2DATA_DATA_S 0 +#define GL_PREEXT_CTLTBL_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_PREEXT_DFLT_L2PRFL(_i) (0x0020F138 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_DFLT_L2PRFL_MAX_INDEX 2 +#define GL_PREEXT_DFLT_L2PRFL_DFLT_PRFL_S 0 +#define GL_PREEXT_DFLT_L2PRFL_DFLT_PRFL_M ICE_M(0xFFFF, 0) +#define GL_PREEXT_FLGS_L1SEL0_1(_i) (0x0020F06C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FLGS_L1SEL0_1_MAX_INDEX 2 +#define GL_PREEXT_FLGS_L1SEL0_1_FLS0_S 0 +#define GL_PREEXT_FLGS_L1SEL0_1_FLS0_M ICE_M(0x1FF, 0) +#define GL_PREEXT_FLGS_L1SEL0_1_FLS1_S 16 +#define GL_PREEXT_FLGS_L1SEL0_1_FLS1_M ICE_M(0x1FF, 16) +#define GL_PREEXT_FLGS_L1SEL2_3(_i) (0x0020F078 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FLGS_L1SEL2_3_MAX_INDEX 2 +#define GL_PREEXT_FLGS_L1SEL2_3_FLS2_S 0 +#define GL_PREEXT_FLGS_L1SEL2_3_FLS2_M ICE_M(0x1FF, 0) +#define GL_PREEXT_FLGS_L1SEL2_3_FLS3_S 16 +#define GL_PREEXT_FLGS_L1SEL2_3_FLS3_M ICE_M(0x1FF, 16) +#define GL_PREEXT_FLGS_L1TBL(_i) (0x0020F060 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FLGS_L1TBL_MAX_INDEX 2 +#define GL_PREEXT_FLGS_L1TBL_LSB_S 0 +#define GL_PREEXT_FLGS_L1TBL_LSB_M ICE_M(0xFFFF, 0) +#define GL_PREEXT_FLGS_L1TBL_MSB_S 16 +#define GL_PREEXT_FLGS_L1TBL_MSB_M ICE_M(0xFFFF, 16) +#define GL_PREEXT_FORCE_L1CDID(_i) (0x0020F018 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FORCE_L1CDID_MAX_INDEX 2 +#define GL_PREEXT_FORCE_L1CDID_STATIC_CDID_S 0 +#define GL_PREEXT_FORCE_L1CDID_STATIC_CDID_M ICE_M(0xF, 0) +#define GL_PREEXT_FORCE_L1CDID_STATIC_CDID_EN_S 31 +#define GL_PREEXT_FORCE_L1CDID_STATIC_CDID_EN_M BIT(31) +#define GL_PREEXT_FORCE_PID(_i) (0x0020F000 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_FORCE_PID_MAX_INDEX 2 +#define GL_PREEXT_FORCE_PID_STATIC_PID_S 0 +#define GL_PREEXT_FORCE_PID_STATIC_PID_M ICE_M(0xFFFF, 0) +#define GL_PREEXT_FORCE_PID_STATIC_PID_EN_S 31 +#define GL_PREEXT_FORCE_PID_STATIC_PID_EN_M BIT(31) +#define GL_PREEXT_K2N_L2ADDR(_i) (0x0020F144 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_K2N_L2ADDR_MAX_INDEX 2 +#define GL_PREEXT_K2N_L2ADDR_LINE_IDX_S 0 +#define GL_PREEXT_K2N_L2ADDR_LINE_IDX_M ICE_M(0x7F, 0) +#define GL_PREEXT_K2N_L2ADDR_AUTO_INC_S 31 +#define GL_PREEXT_K2N_L2ADDR_AUTO_INC_M BIT(31) +#define GL_PREEXT_K2N_L2DATA(_i) (0x0020F150 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */ +#define GL_PREEXT_K2N_L2DATA_MAX_INDEX 2 +#define GL_PREEXT_K2N_L2DATA_DATA0_S 0 +#define GL_PREEXT_K2N_L2DATA_DATA0_M ICE_M(0xFF, 0) +#define GL_PREEXT_K2N_L2DATA_DATA1_S 8 +#define GL_PREEXT_K2N_L2DATA_DATA1_M ICE_M(0xFF, 8) +#define GL_PREEXT_K2N_L2DATA_DATA2_S 16 +#define GL_PREEXT_K2N_L2DATA_DATA2_M ICE_M(0xFF, 16) 
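/*
 * Illustrative sketch (not part of the generated register list): the
 * parameterized macros above compute a per-index MMIO offset, and the
 * companion _MAX_INDEX define gives the last valid index. Assuming the
 * driver's wr32() MMIO write helper, a hypothetical loader for the
 * GL_PREEXT_K2N_L2DATA array could simply iterate the full range:
 */
static void ice_load_k2n_l2data_sketch(struct ice_hw *hw, const u32 *data)
{
	int i;

	for (i = 0; i <= GL_PREEXT_K2N_L2DATA_MAX_INDEX; i++)
		wr32(hw, GL_PREEXT_K2N_L2DATA(i), data[i]);
}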
+#define GL_PREEXT_L2_PMASK0(_i) (0x0020F0FC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_L2_PMASK0_MAX_INDEX 2
+#define GL_PREEXT_L2_PMASK0_BITMASK_S 0
+#define GL_PREEXT_L2_PMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PREEXT_L2_PMASK1(_i) (0x0020F108 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_L2_PMASK1_MAX_INDEX 2
+#define GL_PREEXT_L2_PMASK1_BITMASK_S 0
+#define GL_PREEXT_L2_PMASK1_BITMASK_M ICE_M(0xFFFF, 0)
+#define GL_PREEXT_L2_TMASK0(_i) (0x0020F498 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_L2_TMASK0_MAX_INDEX 2
+#define GL_PREEXT_L2_TMASK0_BITMASK_S 0
+#define GL_PREEXT_L2_TMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PREEXT_L2_TMASK1(_i) (0x0020F4A4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_L2_TMASK1_MAX_INDEX 2
+#define GL_PREEXT_L2_TMASK1_BITMASK_S 0
+#define GL_PREEXT_L2_TMASK1_BITMASK_M ICE_M(0xFF, 0)
+#define GL_PREEXT_L2BMP0_3(_i) (0x0020F0A8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_L2BMP0_3_MAX_INDEX 2
+#define GL_PREEXT_L2BMP0_3_BMP0_S 0
+#define GL_PREEXT_L2BMP0_3_BMP0_M ICE_M(0xFF, 0)
+#define GL_PREEXT_L2BMP0_3_BMP1_S 8
+#define GL_PREEXT_L2BMP0_3_BMP1_M ICE_M(0xFF, 8)
+#define GL_PREEXT_L2BMP0_3_BMP2_S 16
+#define GL_PREEXT_L2BMP0_3_BMP2_M ICE_M(0xFF, 16)
+#define GL_PREEXT_L2BMP0_3_BMP3_S 24
+#define GL_PREEXT_L2BMP0_3_BMP3_M ICE_M(0xFF, 24)
+#define GL_PREEXT_L2BMP4_7(_i) (0x0020F0B4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_L2BMP4_7_MAX_INDEX 2
+#define GL_PREEXT_L2BMP4_7_BMP4_S 0
+#define GL_PREEXT_L2BMP4_7_BMP4_M ICE_M(0xFF, 0)
+#define GL_PREEXT_L2BMP4_7_BMP5_S 8
+#define GL_PREEXT_L2BMP4_7_BMP5_M ICE_M(0xFF, 8)
+#define GL_PREEXT_L2BMP4_7_BMP6_S 16
+#define GL_PREEXT_L2BMP4_7_BMP6_M ICE_M(0xFF, 16)
+#define GL_PREEXT_L2BMP4_7_BMP7_S 24
+#define GL_PREEXT_L2BMP4_7_BMP7_M ICE_M(0xFF, 24)
+#define GL_PREEXT_L2PRTMOD(_i) (0x0020F09C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_L2PRTMOD_MAX_INDEX 2
+#define GL_PREEXT_L2PRTMOD_XLT1_S 0
+#define GL_PREEXT_L2PRTMOD_XLT1_M ICE_M(0x3, 0)
+#define GL_PREEXT_L2PRTMOD_XLT2_S 8
+#define GL_PREEXT_L2PRTMOD_XLT2_M ICE_M(0x3, 8)
+#define GL_PREEXT_N2N_L2ADDR(_i) (0x0020F15C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_N2N_L2ADDR_MAX_INDEX 2
+#define GL_PREEXT_N2N_L2ADDR_LINE_IDX_S 0
+#define GL_PREEXT_N2N_L2ADDR_LINE_IDX_M ICE_M(0x3F, 0)
+#define GL_PREEXT_N2N_L2ADDR_AUTO_INC_S 31
+#define GL_PREEXT_N2N_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PREEXT_N2N_L2DATA(_i) (0x0020F168 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_N2N_L2DATA_MAX_INDEX 2
+#define GL_PREEXT_N2N_L2DATA_DATA0_S 0
+#define GL_PREEXT_N2N_L2DATA_DATA0_M ICE_M(0xFF, 0)
+#define GL_PREEXT_N2N_L2DATA_DATA1_S 8
+#define GL_PREEXT_N2N_L2DATA_DATA1_M ICE_M(0xFF, 8)
+#define GL_PREEXT_N2N_L2DATA_DATA2_S 16
+#define GL_PREEXT_N2N_L2DATA_DATA2_M ICE_M(0xFF, 16)
+#define GL_PREEXT_N2N_L2DATA_DATA3_S 24
+#define GL_PREEXT_N2N_L2DATA_DATA3_M ICE_M(0xFF, 24)
+#define GL_PREEXT_P2P_L1ADDR(_i) (0x0020F024 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_P2P_L1ADDR_MAX_INDEX 2
+#define GL_PREEXT_P2P_L1ADDR_LINE_IDX_S 0
+#define GL_PREEXT_P2P_L1ADDR_LINE_IDX_M BIT(0)
+#define GL_PREEXT_P2P_L1ADDR_AUTO_INC_S 31
+#define GL_PREEXT_P2P_L1ADDR_AUTO_INC_M BIT(31)
+#define GL_PREEXT_P2P_L1DATA(_i) (0x0020F030 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_P2P_L1DATA_MAX_INDEX 2
+#define GL_PREEXT_P2P_L1DATA_DATA_S 0
+#define GL_PREEXT_P2P_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PREEXT_PID_L2GKTYPE(_i) (0x0020F0F0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_PID_L2GKTYPE_MAX_INDEX 2
+#define GL_PREEXT_PID_L2GKTYPE_PID_GKTYPE_S 0
+#define GL_PREEXT_PID_L2GKTYPE_PID_GKTYPE_M ICE_M(0x3, 0)
+#define GL_PREEXT_PLVL_SEL(_i) (0x0020F00C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_PLVL_SEL_MAX_INDEX 2
+#define GL_PREEXT_PLVL_SEL_PLVL_SEL_S 0
+#define GL_PREEXT_PLVL_SEL_PLVL_SEL_M BIT(0)
+#define GL_PREEXT_TCAM_L2ADDR(_i) (0x0020F114 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_TCAM_L2ADDR_MAX_INDEX 2
+#define GL_PREEXT_TCAM_L2ADDR_LINE_IDX_S 0
+#define GL_PREEXT_TCAM_L2ADDR_LINE_IDX_M ICE_M(0x3FF, 0)
+#define GL_PREEXT_TCAM_L2ADDR_AUTO_INC_S 31
+#define GL_PREEXT_TCAM_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PREEXT_TCAM_L2DATALSB(_i) (0x0020F120 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_TCAM_L2DATALSB_MAX_INDEX 2
+#define GL_PREEXT_TCAM_L2DATALSB_DATALSB_S 0
+#define GL_PREEXT_TCAM_L2DATALSB_DATALSB_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PREEXT_TCAM_L2DATAMSB(_i) (0x0020F12C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_TCAM_L2DATAMSB_MAX_INDEX 2
+#define GL_PREEXT_TCAM_L2DATAMSB_DATAMSB_S 0
+#define GL_PREEXT_TCAM_L2DATAMSB_DATAMSB_M ICE_M(0xFF, 0)
+#define GL_PREEXT_XLT0_L1ADDR(_i) (0x0020F03C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_XLT0_L1ADDR_MAX_INDEX 2
+#define GL_PREEXT_XLT0_L1ADDR_LINE_IDX_S 0
+#define GL_PREEXT_XLT0_L1ADDR_LINE_IDX_M ICE_M(0xFF, 0)
+#define GL_PREEXT_XLT0_L1ADDR_AUTO_INC_S 31
+#define GL_PREEXT_XLT0_L1ADDR_AUTO_INC_M BIT(31)
+#define GL_PREEXT_XLT0_L1DATA(_i) (0x0020F048 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_XLT0_L1DATA_MAX_INDEX 2
+#define GL_PREEXT_XLT0_L1DATA_DATA_S 0
+#define GL_PREEXT_XLT0_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PREEXT_XLT1_L2ADDR(_i) (0x0020F0C0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_XLT1_L2ADDR_MAX_INDEX 2
+#define GL_PREEXT_XLT1_L2ADDR_LINE_IDX_S 0
+#define GL_PREEXT_XLT1_L2ADDR_LINE_IDX_M ICE_M(0x7FF, 0)
+#define GL_PREEXT_XLT1_L2ADDR_AUTO_INC_S 31
+#define GL_PREEXT_XLT1_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PREEXT_XLT1_L2DATA(_i) (0x0020F0CC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_XLT1_L2DATA_MAX_INDEX 2
+#define GL_PREEXT_XLT1_L2DATA_DATA_S 0
+#define GL_PREEXT_XLT1_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PREEXT_XLT2_L2ADDR(_i) (0x0020F0D8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_XLT2_L2ADDR_MAX_INDEX 2
+#define GL_PREEXT_XLT2_L2ADDR_LINE_IDX_S 0
+#define GL_PREEXT_XLT2_L2ADDR_LINE_IDX_M ICE_M(0x1FF, 0)
+#define GL_PREEXT_XLT2_L2ADDR_AUTO_INC_S 31
+#define GL_PREEXT_XLT2_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PREEXT_XLT2_L2DATA(_i) (0x0020F0E4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PREEXT_XLT2_L2DATA_MAX_INDEX 2
+#define GL_PREEXT_XLT2_L2DATA_DATA_S 0
+#define GL_PREEXT_XLT2_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_CDMD_L1SEL(_i) (0x0020E054 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_CDMD_L1SEL_MAX_INDEX 2
+#define GL_PSTEXT_CDMD_L1SEL_RX_SEL_S 0
+#define GL_PSTEXT_CDMD_L1SEL_RX_SEL_M ICE_M(0x1F, 0)
+#define GL_PSTEXT_CDMD_L1SEL_TX_SEL_S 8
+#define GL_PSTEXT_CDMD_L1SEL_TX_SEL_M ICE_M(0x1F, 8)
+#define GL_PSTEXT_CDMD_L1SEL_AUX0_SEL_S 16
+#define GL_PSTEXT_CDMD_L1SEL_AUX0_SEL_M ICE_M(0x1F, 16)
+#define GL_PSTEXT_CDMD_L1SEL_AUX1_SEL_S 24
+#define GL_PSTEXT_CDMD_L1SEL_AUX1_SEL_M ICE_M(0x1F, 24)
+#define GL_PSTEXT_CDMD_L1SEL_BIDIR_ENA_S 30
+#define GL_PSTEXT_CDMD_L1SEL_BIDIR_ENA_M ICE_M(0x3, 30)
+#define GL_PSTEXT_CTLTBL_L2ADDR(_i) (0x0020E084 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_CTLTBL_L2ADDR_MAX_INDEX 2
+#define GL_PSTEXT_CTLTBL_L2ADDR_LINE_OFF_S 0
+#define GL_PSTEXT_CTLTBL_L2ADDR_LINE_OFF_M ICE_M(0x7, 0)
+#define GL_PSTEXT_CTLTBL_L2ADDR_LINE_IDX_S 8
+#define GL_PSTEXT_CTLTBL_L2ADDR_LINE_IDX_M ICE_M(0x7, 8)
+#define GL_PSTEXT_CTLTBL_L2ADDR_AUTO_INC_S 31
+#define GL_PSTEXT_CTLTBL_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PSTEXT_CTLTBL_L2DATA(_i) (0x0020E090 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_CTLTBL_L2DATA_MAX_INDEX 2
+#define GL_PSTEXT_CTLTBL_L2DATA_DATA_S 0
+#define GL_PSTEXT_CTLTBL_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_DFLT_L2PRFL(_i) (0x0020E138 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_DFLT_L2PRFL_MAX_INDEX 2
+#define GL_PSTEXT_DFLT_L2PRFL_DFLT_PRFL_S 0
+#define GL_PSTEXT_DFLT_L2PRFL_DFLT_PRFL_M ICE_M(0xFFFF, 0)
+#define GL_PSTEXT_FL15_BMPLSB(_i) (0x0020E480 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_FL15_BMPLSB_MAX_INDEX 2
+#define GL_PSTEXT_FL15_BMPLSB_BMPLSB_S 0
+#define GL_PSTEXT_FL15_BMPLSB_BMPLSB_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_FL15_BMPMSB(_i) (0x0020E48C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_FL15_BMPMSB_MAX_INDEX 2
+#define GL_PSTEXT_FL15_BMPMSB_BMPMSB_S 0
+#define GL_PSTEXT_FL15_BMPMSB_BMPMSB_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_FLGS_L1SEL0_1(_i) (0x0020E06C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_FLGS_L1SEL0_1_MAX_INDEX 2
+#define GL_PSTEXT_FLGS_L1SEL0_1_FLS0_S 0
+#define GL_PSTEXT_FLGS_L1SEL0_1_FLS0_M ICE_M(0x1FF, 0)
+#define GL_PSTEXT_FLGS_L1SEL0_1_FLS1_S 16
+#define GL_PSTEXT_FLGS_L1SEL0_1_FLS1_M ICE_M(0x1FF, 16)
+#define GL_PSTEXT_FLGS_L1SEL2_3(_i) (0x0020E078 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_FLGS_L1SEL2_3_MAX_INDEX 2
+#define GL_PSTEXT_FLGS_L1SEL2_3_FLS2_S 0
+#define GL_PSTEXT_FLGS_L1SEL2_3_FLS2_M ICE_M(0x1FF, 0)
+#define GL_PSTEXT_FLGS_L1SEL2_3_FLS3_S 16
+#define GL_PSTEXT_FLGS_L1SEL2_3_FLS3_M ICE_M(0x1FF, 16)
+#define GL_PSTEXT_FLGS_L1TBL(_i) (0x0020E060 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_FLGS_L1TBL_MAX_INDEX 2
+#define GL_PSTEXT_FLGS_L1TBL_LSB_S 0
+#define GL_PSTEXT_FLGS_L1TBL_LSB_M ICE_M(0xFFFF, 0)
+#define GL_PSTEXT_FLGS_L1TBL_MSB_S 16
+#define GL_PSTEXT_FLGS_L1TBL_MSB_M ICE_M(0xFFFF, 16)
+#define GL_PSTEXT_FORCE_L1CDID(_i) (0x0020E018 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_FORCE_L1CDID_MAX_INDEX 2
+#define GL_PSTEXT_FORCE_L1CDID_STATIC_CDID_S 0
+#define GL_PSTEXT_FORCE_L1CDID_STATIC_CDID_M ICE_M(0xF, 0)
+#define GL_PSTEXT_FORCE_L1CDID_STATIC_CDID_EN_S 31
+#define GL_PSTEXT_FORCE_L1CDID_STATIC_CDID_EN_M BIT(31)
+#define GL_PSTEXT_FORCE_PID(_i) (0x0020E000 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_FORCE_PID_MAX_INDEX 2
+#define GL_PSTEXT_FORCE_PID_STATIC_PID_S 0
+#define GL_PSTEXT_FORCE_PID_STATIC_PID_M ICE_M(0xFFFF, 0)
+#define GL_PSTEXT_FORCE_PID_STATIC_PID_EN_S 31
+#define GL_PSTEXT_FORCE_PID_STATIC_PID_EN_M BIT(31)
+#define GL_PSTEXT_K2N_L2ADDR(_i) (0x0020E144 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_K2N_L2ADDR_MAX_INDEX 2
+#define GL_PSTEXT_K2N_L2ADDR_LINE_IDX_S 0
+#define GL_PSTEXT_K2N_L2ADDR_LINE_IDX_M ICE_M(0x7F, 0)
+#define GL_PSTEXT_K2N_L2ADDR_AUTO_INC_S 31
+#define GL_PSTEXT_K2N_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PSTEXT_K2N_L2DATA(_i) (0x0020E150 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_K2N_L2DATA_MAX_INDEX 2
+#define GL_PSTEXT_K2N_L2DATA_DATA0_S 0
+#define GL_PSTEXT_K2N_L2DATA_DATA0_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_K2N_L2DATA_DATA1_S 8
+#define GL_PSTEXT_K2N_L2DATA_DATA1_M ICE_M(0xFF, 8)
+#define GL_PSTEXT_K2N_L2DATA_DATA2_S 16
+#define GL_PSTEXT_K2N_L2DATA_DATA2_M ICE_M(0xFF, 16)
+#define GL_PSTEXT_K2N_L2DATA_DATA3_S 24
+#define GL_PSTEXT_K2N_L2DATA_DATA3_M ICE_M(0xFF, 24)
+#define GL_PSTEXT_L2_PMASK0(_i) (0x0020E0FC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_L2_PMASK0_MAX_INDEX 2
+#define GL_PSTEXT_L2_PMASK0_BITMASK_S 0
+#define GL_PSTEXT_L2_PMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_L2_PMASK1(_i) (0x0020E108 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_L2_PMASK1_MAX_INDEX 2
+#define GL_PSTEXT_L2_PMASK1_BITMASK_S 0
+#define GL_PSTEXT_L2_PMASK1_BITMASK_M ICE_M(0xFFFF, 0)
+#define GL_PSTEXT_L2_TMASK0(_i) (0x0020E498 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_L2_TMASK0_MAX_INDEX 2
+#define GL_PSTEXT_L2_TMASK0_BITMASK_S 0
+#define GL_PSTEXT_L2_TMASK0_BITMASK_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_L2_TMASK1(_i) (0x0020E4A4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_L2_TMASK1_MAX_INDEX 2
+#define GL_PSTEXT_L2_TMASK1_BITMASK_S 0
+#define GL_PSTEXT_L2_TMASK1_BITMASK_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_L2PRTMOD(_i) (0x0020E09C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_L2PRTMOD_MAX_INDEX 2
+#define GL_PSTEXT_L2PRTMOD_XLT1_S 0
+#define GL_PSTEXT_L2PRTMOD_XLT1_M ICE_M(0x3, 0)
+#define GL_PSTEXT_L2PRTMOD_XLT2_S 8
+#define GL_PSTEXT_L2PRTMOD_XLT2_M ICE_M(0x3, 8)
+#define GL_PSTEXT_N2N_L2ADDR(_i) (0x0020E15C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_N2N_L2ADDR_MAX_INDEX 2
+#define GL_PSTEXT_N2N_L2ADDR_LINE_IDX_S 0
+#define GL_PSTEXT_N2N_L2ADDR_LINE_IDX_M ICE_M(0x3F, 0)
+#define GL_PSTEXT_N2N_L2ADDR_AUTO_INC_S 31
+#define GL_PSTEXT_N2N_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PSTEXT_N2N_L2DATA(_i) (0x0020E168 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_N2N_L2DATA_MAX_INDEX 2
+#define GL_PSTEXT_N2N_L2DATA_DATA0_S 0
+#define GL_PSTEXT_N2N_L2DATA_DATA0_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_N2N_L2DATA_DATA1_S 8
+#define GL_PSTEXT_N2N_L2DATA_DATA1_M ICE_M(0xFF, 8)
+#define GL_PSTEXT_N2N_L2DATA_DATA2_S 16
+#define GL_PSTEXT_N2N_L2DATA_DATA2_M ICE_M(0xFF, 16)
+#define GL_PSTEXT_N2N_L2DATA_DATA3_S 24
+#define GL_PSTEXT_N2N_L2DATA_DATA3_M ICE_M(0xFF, 24)
+#define GL_PSTEXT_P2P_L1ADDR(_i) (0x0020E024 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_P2P_L1ADDR_MAX_INDEX 2
+#define GL_PSTEXT_P2P_L1ADDR_LINE_IDX_S 0
+#define GL_PSTEXT_P2P_L1ADDR_LINE_IDX_M BIT(0)
+#define GL_PSTEXT_P2P_L1ADDR_AUTO_INC_S 31
+#define GL_PSTEXT_P2P_L1ADDR_AUTO_INC_M BIT(31)
+#define GL_PSTEXT_P2P_L1DATA(_i) (0x0020E030 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_P2P_L1DATA_MAX_INDEX 2
+#define GL_PSTEXT_P2P_L1DATA_DATA_S 0
+#define GL_PSTEXT_P2P_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_PID_L2GKTYPE(_i) (0x0020E0F0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_PID_L2GKTYPE_MAX_INDEX 2
+#define GL_PSTEXT_PID_L2GKTYPE_PID_GKTYPE_S 0
+#define GL_PSTEXT_PID_L2GKTYPE_PID_GKTYPE_M ICE_M(0x3, 0)
+#define GL_PSTEXT_PLVL_SEL(_i) (0x0020E00C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_PLVL_SEL_MAX_INDEX 2
+#define GL_PSTEXT_PLVL_SEL_PLVL_SEL_S 0
+#define GL_PSTEXT_PLVL_SEL_PLVL_SEL_M BIT(0)
+#define GL_PSTEXT_PRFLM_CTRL(_i) (0x0020E474 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_PRFLM_CTRL_MAX_INDEX 2
+#define GL_PSTEXT_PRFLM_CTRL_PRFL_IDX_S 0
+#define GL_PSTEXT_PRFLM_CTRL_PRFL_IDX_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_PRFLM_CTRL_RD_REQ_S 30
+#define GL_PSTEXT_PRFLM_CTRL_RD_REQ_M BIT(30)
+#define GL_PSTEXT_PRFLM_CTRL_WR_REQ_S 31
+#define GL_PSTEXT_PRFLM_CTRL_WR_REQ_M BIT(31)
+#define GL_PSTEXT_PRFLM_DATA_0(_i) (0x0020E174 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GL_PSTEXT_PRFLM_DATA_0_MAX_INDEX 63
+#define GL_PSTEXT_PRFLM_DATA_0_PROT_S 0
+#define GL_PSTEXT_PRFLM_DATA_0_PROT_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_PRFLM_DATA_0_OFF_S 16
+#define GL_PSTEXT_PRFLM_DATA_0_OFF_M ICE_M(0x1FF, 16)
+#define GL_PSTEXT_PRFLM_DATA_1(_i) (0x0020E274 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GL_PSTEXT_PRFLM_DATA_1_MAX_INDEX 63
+#define GL_PSTEXT_PRFLM_DATA_1_PROT_S 0
+#define GL_PSTEXT_PRFLM_DATA_1_PROT_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_PRFLM_DATA_1_OFF_S 16
+#define GL_PSTEXT_PRFLM_DATA_1_OFF_M ICE_M(0x1FF, 16)
+#define GL_PSTEXT_PRFLM_DATA_2(_i) (0x0020E374 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GL_PSTEXT_PRFLM_DATA_2_MAX_INDEX 63
+#define GL_PSTEXT_PRFLM_DATA_2_PROT_S 0
+#define GL_PSTEXT_PRFLM_DATA_2_PROT_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_PRFLM_DATA_2_OFF_S 16
+#define GL_PSTEXT_PRFLM_DATA_2_OFF_M ICE_M(0x1FF, 16)
+#define GL_PSTEXT_TCAM_L2ADDR(_i) (0x0020E114 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_TCAM_L2ADDR_MAX_INDEX 2
+#define GL_PSTEXT_TCAM_L2ADDR_LINE_IDX_S 0
+#define GL_PSTEXT_TCAM_L2ADDR_LINE_IDX_M ICE_M(0x3FF, 0)
+#define GL_PSTEXT_TCAM_L2ADDR_AUTO_INC_S 31
+#define GL_PSTEXT_TCAM_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PSTEXT_TCAM_L2DATALSB(_i) (0x0020E120 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_TCAM_L2DATALSB_MAX_INDEX 2
+#define GL_PSTEXT_TCAM_L2DATALSB_DATALSB_S 0
+#define GL_PSTEXT_TCAM_L2DATALSB_DATALSB_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_TCAM_L2DATAMSB(_i) (0x0020E12C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_TCAM_L2DATAMSB_MAX_INDEX 2
+#define GL_PSTEXT_TCAM_L2DATAMSB_DATAMSB_S 0
+#define GL_PSTEXT_TCAM_L2DATAMSB_DATAMSB_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_XLT0_L1ADDR(_i) (0x0020E03C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_XLT0_L1ADDR_MAX_INDEX 2
+#define GL_PSTEXT_XLT0_L1ADDR_LINE_IDX_S 0
+#define GL_PSTEXT_XLT0_L1ADDR_LINE_IDX_M ICE_M(0xFF, 0)
+#define GL_PSTEXT_XLT0_L1ADDR_AUTO_INC_S 31
+#define GL_PSTEXT_XLT0_L1ADDR_AUTO_INC_M BIT(31)
+#define GL_PSTEXT_XLT0_L1DATA(_i) (0x0020E048 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_XLT0_L1DATA_MAX_INDEX 2
+#define GL_PSTEXT_XLT0_L1DATA_DATA_S 0
+#define GL_PSTEXT_XLT0_L1DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_XLT1_L2ADDR(_i) (0x0020E0C0 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_XLT1_L2ADDR_MAX_INDEX 2
+#define GL_PSTEXT_XLT1_L2ADDR_LINE_IDX_S 0
+#define GL_PSTEXT_XLT1_L2ADDR_LINE_IDX_M ICE_M(0x7FF, 0)
+#define GL_PSTEXT_XLT1_L2ADDR_AUTO_INC_S 31
+#define GL_PSTEXT_XLT1_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PSTEXT_XLT1_L2DATA(_i) (0x0020E0CC + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_XLT1_L2DATA_MAX_INDEX 2
+#define GL_PSTEXT_XLT1_L2DATA_DATA_S 0
+#define GL_PSTEXT_XLT1_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PSTEXT_XLT2_L2ADDR(_i) (0x0020E0D8 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_XLT2_L2ADDR_MAX_INDEX 2
+#define GL_PSTEXT_XLT2_L2ADDR_LINE_IDX_S 0
+#define GL_PSTEXT_XLT2_L2ADDR_LINE_IDX_M ICE_M(0x1FF, 0)
+#define GL_PSTEXT_XLT2_L2ADDR_AUTO_INC_S 31
+#define GL_PSTEXT_XLT2_L2ADDR_AUTO_INC_M BIT(31)
+#define GL_PSTEXT_XLT2_L2DATA(_i) (0x0020E0E4 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GL_PSTEXT_XLT2_L2DATA_MAX_INDEX 2
+#define GL_PSTEXT_XLT2_L2DATA_DATA_S 0
+#define GL_PSTEXT_XLT2_L2DATA_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GLFLXP_PTYPE_TRANSLATION(_i) (0x0045C000 + ((_i) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define GLFLXP_PTYPE_TRANSLATION_MAX_INDEX 255
+#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_S 0
+#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_M ICE_M(0xFF, 0)
+#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_1_S 8
+#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_1_M ICE_M(0xFF, 8)
+#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_2_S 16
+#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_2_M ICE_M(0xFF, 16)
+#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_3_S 24
+#define GLFLXP_PTYPE_TRANSLATION_PTYPE_4N_3_M ICE_M(0xFF, 24)
+#define GLFLXP_RX_CMD_LX_PROT_IDX(_i) (0x0045C400 + ((_i) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define GLFLXP_RX_CMD_LX_PROT_IDX_MAX_INDEX 255
+#define GLFLXP_RX_CMD_LX_PROT_IDX_INNER_CLOUD_OFFSET_INDEX_S 0
+#define GLFLXP_RX_CMD_LX_PROT_IDX_INNER_CLOUD_OFFSET_INDEX_M ICE_M(0x7, 0)
+#define GLFLXP_RX_CMD_LX_PROT_IDX_L4_OFFSET_INDEX_S 4
+#define GLFLXP_RX_CMD_LX_PROT_IDX_L4_OFFSET_INDEX_M ICE_M(0x7, 4)
+#define GLFLXP_RX_CMD_LX_PROT_IDX_PAYLOAD_OFFSET_INDEX_S 8
+#define GLFLXP_RX_CMD_LX_PROT_IDX_PAYLOAD_OFFSET_INDEX_M ICE_M(0x7, 8)
+#define GLFLXP_RX_CMD_LX_PROT_IDX_L3_PROTOCOL_S 12
+#define GLFLXP_RX_CMD_LX_PROT_IDX_L3_PROTOCOL_M ICE_M(0x3, 12)
+#define GLFLXP_RX_CMD_LX_PROT_IDX_L4_PROTOCOL_S 14
+#define GLFLXP_RX_CMD_LX_PROT_IDX_L4_PROTOCOL_M ICE_M(0x3, 14)
+#define GLFLXP_RX_CMD_PROTIDS(_i, _j) (0x0045A000 + ((_i) * 4 + (_j) * 1024)) /* _i=0...255, _j=0...5 */ /* Reset Source: CORER */
+#define GLFLXP_RX_CMD_PROTIDS_MAX_INDEX 255
+#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_S 0
+#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_M ICE_M(0xFF, 0)
+#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_1_S 8
+#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_1_M ICE_M(0xFF, 8)
+#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_2_S 16
+#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_2_M ICE_M(0xFF, 16)
+#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_3_S 24
+#define GLFLXP_RX_CMD_PROTIDS_PROTID_4N_3_M ICE_M(0xFF, 24)
+#define GLFLXP_RXDID_FLAGS(_i, _j) (0x0045D000 + ((_i) * 4 + (_j) * 256)) /* _i=0...63, _j=0...4 */ /* Reset Source: CORER */
+#define GLFLXP_RXDID_FLAGS_MAX_INDEX 63
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S 0
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M ICE_M(0x3F, 0)
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S 8
@@ -66,56 +3009,1480 @@
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_M ICE_M(0x3F, 16)
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_S 24
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_M ICE_M(0x3F, 24)
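/*
 * Editor's note (a hedged sketch, not from the patch): the _L2ADDR and
 * _L2DATA register pairs above look like a conventional indirect-access
 * window: the LINE_IDX field of the ADDR register selects a table line, the
 * AUTO_INC bit presumably advances that index after each data access, and
 * the DATA registers move the selected line's contents. Assuming the
 * driver's wr32() accessor and a valid hw pointer, programming consecutive
 * TCAM lines could look like:
 *
 *	wr32(hw, GL_PSTEXT_TCAM_L2ADDR(0),
 *	     GL_PSTEXT_TCAM_L2ADDR_AUTO_INC_M |
 *	     (start << GL_PSTEXT_TCAM_L2ADDR_LINE_IDX_S));
 *	for (i = 0; i < nlines; i++) {
 *		wr32(hw, GL_PSTEXT_TCAM_L2DATALSB(0), lo[i]);
 *		wr32(hw, GL_PSTEXT_TCAM_L2DATAMSB(0), hi[i]);
 *	}
 */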
-#define GLFLXP_RXDID_FLX_WRD_0(_i) (0x0045c800 + ((_i) * 4))
+#define GLFLXP_RXDID_FLAGS1_OVERRIDE(_i) (0x0045D600 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLFLXP_RXDID_FLAGS1_OVERRIDE_MAX_INDEX 63
+#define GLFLXP_RXDID_FLAGS1_OVERRIDE_FLEXIFLAGS1_OVERRIDE_S 0
+#define GLFLXP_RXDID_FLAGS1_OVERRIDE_FLEXIFLAGS1_OVERRIDE_M ICE_M(0xF, 0)
+#define GLFLXP_RXDID_FLX_WRD_0(_i) (0x0045C800 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLFLXP_RXDID_FLX_WRD_0_MAX_INDEX 63
 #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_S 0
 #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_M ICE_M(0xFF, 0)
+#define GLFLXP_RXDID_FLX_WRD_0_EXTRACTION_OFFSET_S 8
+#define GLFLXP_RXDID_FLX_WRD_0_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8)
 #define GLFLXP_RXDID_FLX_WRD_0_RXDID_OPCODE_S 30
 #define GLFLXP_RXDID_FLX_WRD_0_RXDID_OPCODE_M ICE_M(0x3, 30)
-#define GLFLXP_RXDID_FLX_WRD_1(_i) (0x0045c900 + ((_i) * 4))
+#define GLFLXP_RXDID_FLX_WRD_1(_i) (0x0045C900 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLFLXP_RXDID_FLX_WRD_1_MAX_INDEX 63
 #define GLFLXP_RXDID_FLX_WRD_1_PROT_MDID_S 0
 #define GLFLXP_RXDID_FLX_WRD_1_PROT_MDID_M ICE_M(0xFF, 0)
+#define GLFLXP_RXDID_FLX_WRD_1_EXTRACTION_OFFSET_S 8
+#define GLFLXP_RXDID_FLX_WRD_1_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8)
 #define GLFLXP_RXDID_FLX_WRD_1_RXDID_OPCODE_S 30
 #define GLFLXP_RXDID_FLX_WRD_1_RXDID_OPCODE_M ICE_M(0x3, 30)
-#define GLFLXP_RXDID_FLX_WRD_2(_i) (0x0045ca00 + ((_i) * 4))
+#define GLFLXP_RXDID_FLX_WRD_2(_i) (0x0045CA00 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLFLXP_RXDID_FLX_WRD_2_MAX_INDEX 63
 #define GLFLXP_RXDID_FLX_WRD_2_PROT_MDID_S 0
 #define GLFLXP_RXDID_FLX_WRD_2_PROT_MDID_M ICE_M(0xFF, 0)
+#define GLFLXP_RXDID_FLX_WRD_2_EXTRACTION_OFFSET_S 8
+#define GLFLXP_RXDID_FLX_WRD_2_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8)
 #define GLFLXP_RXDID_FLX_WRD_2_RXDID_OPCODE_S 30
 #define GLFLXP_RXDID_FLX_WRD_2_RXDID_OPCODE_M ICE_M(0x3, 30)
-#define GLFLXP_RXDID_FLX_WRD_3(_i) (0x0045cb00 + ((_i) * 4))
+#define GLFLXP_RXDID_FLX_WRD_3(_i) (0x0045CB00 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLFLXP_RXDID_FLX_WRD_3_MAX_INDEX 63
 #define GLFLXP_RXDID_FLX_WRD_3_PROT_MDID_S 0
 #define GLFLXP_RXDID_FLX_WRD_3_PROT_MDID_M ICE_M(0xFF, 0)
+#define GLFLXP_RXDID_FLX_WRD_3_EXTRACTION_OFFSET_S 8
+#define GLFLXP_RXDID_FLX_WRD_3_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8)
 #define GLFLXP_RXDID_FLX_WRD_3_RXDID_OPCODE_S 30
 #define GLFLXP_RXDID_FLX_WRD_3_RXDID_OPCODE_M ICE_M(0x3, 30)
-#define QRXFLXP_CNTXT(_QRX) (0x00480000 + ((_QRX) * 4))
+#define GLFLXP_RXDID_FLX_WRD_4(_i) (0x0045CC00 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLFLXP_RXDID_FLX_WRD_4_MAX_INDEX 63
+#define GLFLXP_RXDID_FLX_WRD_4_PROT_MDID_S 0
+#define GLFLXP_RXDID_FLX_WRD_4_PROT_MDID_M ICE_M(0xFF, 0)
+#define GLFLXP_RXDID_FLX_WRD_4_EXTRACTION_OFFSET_S 8
+#define GLFLXP_RXDID_FLX_WRD_4_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8)
+#define GLFLXP_RXDID_FLX_WRD_4_RXDID_OPCODE_S 30
+#define GLFLXP_RXDID_FLX_WRD_4_RXDID_OPCODE_M ICE_M(0x3, 30)
+#define GLFLXP_RXDID_FLX_WRD_5(_i) (0x0045CD00 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLFLXP_RXDID_FLX_WRD_5_MAX_INDEX 63
+#define GLFLXP_RXDID_FLX_WRD_5_PROT_MDID_S 0
+#define GLFLXP_RXDID_FLX_WRD_5_PROT_MDID_M ICE_M(0xFF, 0)
+#define GLFLXP_RXDID_FLX_WRD_5_EXTRACTION_OFFSET_S 8
+#define GLFLXP_RXDID_FLX_WRD_5_EXTRACTION_OFFSET_M ICE_M(0x3FF, 8)
+#define GLFLXP_RXDID_FLX_WRD_5_RXDID_OPCODE_S 30
+#define GLFLXP_RXDID_FLX_WRD_5_RXDID_OPCODE_M ICE_M(0x3, 30)
+#define GLFLXP_TX_SCHED_CORRECT(_i, _j) (0x00458000 + ((_i) * 4 + (_j) * 256)) /* _i=0...63, _j=0...31 */ /* Reset Source: CORER */
+#define GLFLXP_TX_SCHED_CORRECT_MAX_INDEX 63
+#define GLFLXP_TX_SCHED_CORRECT_PROTD_ID_2N_S 0
+#define GLFLXP_TX_SCHED_CORRECT_PROTD_ID_2N_M ICE_M(0xFF, 0)
+#define GLFLXP_TX_SCHED_CORRECT_RECIPE_2N_S 8
+#define GLFLXP_TX_SCHED_CORRECT_RECIPE_2N_M ICE_M(0x1F, 8)
+#define GLFLXP_TX_SCHED_CORRECT_PROTD_ID_2N_1_S 16
+#define GLFLXP_TX_SCHED_CORRECT_PROTD_ID_2N_1_M ICE_M(0xFF, 16)
+#define GLFLXP_TX_SCHED_CORRECT_RECIPE_2N_1_S 24
+#define GLFLXP_TX_SCHED_CORRECT_RECIPE_2N_1_M ICE_M(0x1F, 24)
+#define QRXFLXP_CNTXT(_QRX) (0x00480000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */
+#define QRXFLXP_CNTXT_MAX_INDEX 2047
 #define QRXFLXP_CNTXT_RXDID_IDX_S 0
 #define QRXFLXP_CNTXT_RXDID_IDX_M ICE_M(0x3F, 0)
 #define QRXFLXP_CNTXT_RXDID_PRIO_S 8
 #define QRXFLXP_CNTXT_RXDID_PRIO_M ICE_M(0x7, 8)
-#define GLGEN_RSTAT 0x000B8188
+#define QRXFLXP_CNTXT_TS_S 11
+#define QRXFLXP_CNTXT_TS_M BIT(11)
+#define GL_FWSTS 0x00083048 /* Reset Source: POR */
+#define GL_FWSTS_FWS0B_S 0
+#define GL_FWSTS_FWS0B_M ICE_M(0xFF, 0)
+#define GL_FWSTS_FWROWD_S 8
+#define GL_FWSTS_FWROWD_M BIT(8)
+#define GL_FWSTS_FWRI_S 9
+#define GL_FWSTS_FWRI_M BIT(9)
+#define GL_FWSTS_FWS1B_S 16
+#define GL_FWSTS_FWS1B_M ICE_M(0xFF, 16)
+#define GL_TCVMLR_DRAIN_CNTR_CTL 0x000A21E0 /* Reset Source: CORER */
+#define GL_TCVMLR_DRAIN_CNTR_CTL_OP_S 0
+#define GL_TCVMLR_DRAIN_CNTR_CTL_OP_M BIT(0)
+#define GL_TCVMLR_DRAIN_CNTR_CTL_PORT_S 1
+#define GL_TCVMLR_DRAIN_CNTR_CTL_PORT_M ICE_M(0x7, 1)
+#define GL_TCVMLR_DRAIN_CNTR_CTL_VALUE_S 4
+#define GL_TCVMLR_DRAIN_CNTR_CTL_VALUE_M ICE_M(0x3FFF, 4)
+#define GL_TCVMLR_DRAIN_DONE_DEC 0x000A21A8 /* Reset Source: CORER */
+#define GL_TCVMLR_DRAIN_DONE_DEC_TARGET_S 0
+#define GL_TCVMLR_DRAIN_DONE_DEC_TARGET_M BIT(0)
+#define GL_TCVMLR_DRAIN_DONE_DEC_INDEX_S 1
+#define GL_TCVMLR_DRAIN_DONE_DEC_INDEX_M ICE_M(0x1F, 1)
+#define GL_TCVMLR_DRAIN_DONE_DEC_VALUE_S 6
+#define GL_TCVMLR_DRAIN_DONE_DEC_VALUE_M ICE_M(0xFF, 6)
+#define GL_TCVMLR_DRAIN_DONE_TCLAN(_i) (0x000A20A8 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GL_TCVMLR_DRAIN_DONE_TCLAN_MAX_INDEX 31
+#define GL_TCVMLR_DRAIN_DONE_TCLAN_COUNT_S 0
+#define GL_TCVMLR_DRAIN_DONE_TCLAN_COUNT_M ICE_M(0xFF, 0)
+#define GL_TCVMLR_DRAIN_DONE_TPB(_i) (0x000A2128 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GL_TCVMLR_DRAIN_DONE_TPB_MAX_INDEX 31
+#define GL_TCVMLR_DRAIN_DONE_TPB_COUNT_S 0
+#define GL_TCVMLR_DRAIN_DONE_TPB_COUNT_M ICE_M(0xFF, 0)
+#define GL_TCVMLR_DRAIN_MARKER 0x000A2008 /* Reset Source: CORER */
+#define GL_TCVMLR_DRAIN_MARKER_PORT_S 0
+#define GL_TCVMLR_DRAIN_MARKER_PORT_M ICE_M(0x7, 0)
+#define GL_TCVMLR_DRAIN_MARKER_TC_S 3
+#define GL_TCVMLR_DRAIN_MARKER_TC_M ICE_M(0x1F, 3)
+#define GL_TCVMLR_ERR_STAT 0x000A2024 /* Reset Source: CORER */
+#define GL_TCVMLR_ERR_STAT_ERROR_S 0
+#define GL_TCVMLR_ERR_STAT_ERROR_M BIT(0)
+#define GL_TCVMLR_ERR_STAT_FW_REQ_S 1
+#define GL_TCVMLR_ERR_STAT_FW_REQ_M BIT(1)
+#define GL_TCVMLR_ERR_STAT_STAT_S 2
+#define GL_TCVMLR_ERR_STAT_STAT_M ICE_M(0x7, 2)
+#define GL_TCVMLR_ERR_STAT_ENT_TYPE_S 5
+#define GL_TCVMLR_ERR_STAT_ENT_TYPE_M ICE_M(0x7, 5)
+#define GL_TCVMLR_ERR_STAT_ENT_ID_S 8
+#define GL_TCVMLR_ERR_STAT_ENT_ID_M ICE_M(0x3FFF, 8)
+#define GL_TCVMLR_QCFG 0x000A2010 /* Reset Source: CORER */
+#define GL_TCVMLR_QCFG_QID_S 0
+#define GL_TCVMLR_QCFG_QID_M ICE_M(0x3FFF, 0)
+#define GL_TCVMLR_QCFG_OP_S 14
+#define GL_TCVMLR_QCFG_OP_M BIT(14)
+#define GL_TCVMLR_QCFG_PORT_S 15
+#define GL_TCVMLR_QCFG_PORT_M ICE_M(0x7, 15)
+#define GL_TCVMLR_QCFG_TC_S 18
+#define GL_TCVMLR_QCFG_TC_M ICE_M(0x1F, 18)
+#define GL_TCVMLR_QCFG_RD 0x000A2014 /* Reset Source: CORER */
+#define GL_TCVMLR_QCFG_RD_QID_S 0
+#define GL_TCVMLR_QCFG_RD_QID_M ICE_M(0x3FFF, 0)
+#define GL_TCVMLR_QCFG_RD_PORT_S 14
+#define GL_TCVMLR_QCFG_RD_PORT_M ICE_M(0x7, 14)
+#define GL_TCVMLR_QCFG_RD_TC_S 17
+#define GL_TCVMLR_QCFG_RD_TC_M ICE_M(0x1F, 17)
+#define GL_TCVMLR_QCNTR 0x000A200C /* Reset Source: CORER */
+#define GL_TCVMLR_QCNTR_CNTR_S 0
+#define GL_TCVMLR_QCNTR_CNTR_M ICE_M(0x7FFF, 0)
+#define GL_TCVMLR_QCTL 0x000A2004 /* Reset Source: CORER */
+#define GL_TCVMLR_QCTL_QID_S 0
+#define GL_TCVMLR_QCTL_QID_M ICE_M(0x3FFF, 0)
+#define GL_TCVMLR_QCTL_OP_S 14
+#define GL_TCVMLR_QCTL_OP_M BIT(14)
+#define GL_TCVMLR_REQ_STAT 0x000A2018 /* Reset Source: CORER */
+#define GL_TCVMLR_REQ_STAT_ENT_TYPE_S 0
+#define GL_TCVMLR_REQ_STAT_ENT_TYPE_M ICE_M(0x7, 0)
+#define GL_TCVMLR_REQ_STAT_ENT_ID_S 3
+#define GL_TCVMLR_REQ_STAT_ENT_ID_M ICE_M(0x3FFF, 3)
+#define GL_TCVMLR_REQ_STAT_OP_S 17
+#define GL_TCVMLR_REQ_STAT_OP_M BIT(17)
+#define GL_TCVMLR_REQ_STAT_WRITE_STATUS_S 18
+#define GL_TCVMLR_REQ_STAT_WRITE_STATUS_M ICE_M(0x7, 18)
+#define GL_TCVMLR_STAT 0x000A201C /* Reset Source: CORER */
+#define GL_TCVMLR_STAT_ENT_TYPE_S 0
+#define GL_TCVMLR_STAT_ENT_TYPE_M ICE_M(0x7, 0)
+#define GL_TCVMLR_STAT_ENT_ID_S 3
+#define GL_TCVMLR_STAT_ENT_ID_M ICE_M(0x3FFF, 3)
+#define GL_TCVMLR_STAT_STATUS_S 17
+#define GL_TCVMLR_STAT_STATUS_M ICE_M(0x7, 17)
+#define GL_XLR_MARKER_TRIG_TCVMLR 0x000A2000 /* Reset Source: CORER */
+#define GL_XLR_MARKER_TRIG_TCVMLR_VM_VF_NUM_S 0
+#define GL_XLR_MARKER_TRIG_TCVMLR_VM_VF_NUM_M ICE_M(0x3FF, 0)
+#define GL_XLR_MARKER_TRIG_TCVMLR_VM_VF_TYPE_S 10
+#define GL_XLR_MARKER_TRIG_TCVMLR_VM_VF_TYPE_M ICE_M(0x3, 10)
+#define GL_XLR_MARKER_TRIG_TCVMLR_PF_NUM_S 12
+#define GL_XLR_MARKER_TRIG_TCVMLR_PF_NUM_M ICE_M(0x7, 12)
+#define GL_XLR_MARKER_TRIG_TCVMLR_PORT_NUM_S 16
+#define GL_XLR_MARKER_TRIG_TCVMLR_PORT_NUM_M ICE_M(0x7, 16)
+#define GL_XLR_MARKER_TRIG_VMLR 0x00093804 /* Reset Source: CORER */
+#define GL_XLR_MARKER_TRIG_VMLR_VM_VF_NUM_S 0
+#define GL_XLR_MARKER_TRIG_VMLR_VM_VF_NUM_M ICE_M(0x3FF, 0)
+#define GL_XLR_MARKER_TRIG_VMLR_VM_VF_TYPE_S 10
+#define GL_XLR_MARKER_TRIG_VMLR_VM_VF_TYPE_M ICE_M(0x3, 10)
+#define GL_XLR_MARKER_TRIG_VMLR_PF_NUM_S 12
+#define GL_XLR_MARKER_TRIG_VMLR_PF_NUM_M ICE_M(0x7, 12)
+#define GL_XLR_MARKER_TRIG_VMLR_PORT_NUM_S 16
+#define GL_XLR_MARKER_TRIG_VMLR_PORT_NUM_M ICE_M(0x7, 16)
+#define GLGEN_ANA_ABORT_PTYPE 0x0020C21C /* Reset Source: CORER */
+#define GLGEN_ANA_ABORT_PTYPE_ABORT_S 0
+#define GLGEN_ANA_ABORT_PTYPE_ABORT_M ICE_M(0x3FF, 0)
+#define GLGEN_ANA_ALU_ACCSS_OUT_OF_PKT 0x0020C208 /* Reset Source: CORER */
+#define GLGEN_ANA_ALU_ACCSS_OUT_OF_PKT_NPC_S 0
+#define GLGEN_ANA_ALU_ACCSS_OUT_OF_PKT_NPC_M ICE_M(0xFF, 0)
+#define GLGEN_ANA_CFG_CTRL 0x0020C104 /* Reset Source: CORER */
+#define GLGEN_ANA_CFG_CTRL_LINE_IDX_S 0
+#define GLGEN_ANA_CFG_CTRL_LINE_IDX_M ICE_M(0x3FFFF, 0)
+#define GLGEN_ANA_CFG_CTRL_TABLE_ID_S 18
+#define GLGEN_ANA_CFG_CTRL_TABLE_ID_M ICE_M(0xFF, 18)
+#define GLGEN_ANA_CFG_CTRL_RESRVED_S 26
+#define GLGEN_ANA_CFG_CTRL_RESRVED_M ICE_M(0x7, 26)
+#define GLGEN_ANA_CFG_CTRL_OPERATION_ID_S 29
+#define GLGEN_ANA_CFG_CTRL_OPERATION_ID_M ICE_M(0x7, 29)
+#define GLGEN_ANA_CFG_HTBL_LU_RESULT 0x0020C158 /* Reset Source: CORER */
+#define GLGEN_ANA_CFG_HTBL_LU_RESULT_HIT_S 0
+#define GLGEN_ANA_CFG_HTBL_LU_RESULT_HIT_M BIT(0)
+#define GLGEN_ANA_CFG_HTBL_LU_RESULT_PG_MEM_IDX_S 1
+#define GLGEN_ANA_CFG_HTBL_LU_RESULT_PG_MEM_IDX_M ICE_M(0x7, 1)
+#define GLGEN_ANA_CFG_HTBL_LU_RESULT_ADDR_S 4
+#define GLGEN_ANA_CFG_HTBL_LU_RESULT_ADDR_M ICE_M(0x1FF, 4)
+#define GLGEN_ANA_CFG_LU_KEY(_i) (0x0020C14C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GLGEN_ANA_CFG_LU_KEY_MAX_INDEX 2
+#define GLGEN_ANA_CFG_LU_KEY_LU_KEY_S 0
+#define GLGEN_ANA_CFG_LU_KEY_LU_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_CFG_RDDATA(_i) (0x0020C10C + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GLGEN_ANA_CFG_RDDATA_MAX_INDEX 15
+#define GLGEN_ANA_CFG_RDDATA_RD_DATA_S 0
+#define GLGEN_ANA_CFG_RDDATA_RD_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT 0x0020C15C /* Reset Source: CORER */
+#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_HIT_S 0
+#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_HIT_M BIT(0)
+#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_RSV_S 1
+#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_RSV_M ICE_M(0x7, 1)
+#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_ADDR_S 4
+#define GLGEN_ANA_CFG_SPLBUF_LU_RESULT_ADDR_M ICE_M(0x1FF, 4)
+#define GLGEN_ANA_CFG_WRDATA 0x0020C108 /* Reset Source: CORER */
+#define GLGEN_ANA_CFG_WRDATA_WR_DATA_S 0
+#define GLGEN_ANA_CFG_WRDATA_WR_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_DEF_PTYPE 0x0020C100 /* Reset Source: CORER */
+#define GLGEN_ANA_DEF_PTYPE_DEF_PTYPE_S 0
+#define GLGEN_ANA_DEF_PTYPE_DEF_PTYPE_M ICE_M(0x3FF, 0)
+#define GLGEN_ANA_ERR_CTRL 0x0020C220 /* Reset Source: CORER */
+#define GLGEN_ANA_ERR_CTRL_ERR_MASK_EN_S 0
+#define GLGEN_ANA_ERR_CTRL_ERR_MASK_EN_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_FLAG_MAP(_i) (0x0020C000 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLGEN_ANA_FLAG_MAP_MAX_INDEX 63
+#define GLGEN_ANA_FLAG_MAP_FLAG_EN_S 0
+#define GLGEN_ANA_FLAG_MAP_FLAG_EN_M BIT(0)
+#define GLGEN_ANA_FLAG_MAP_EXT_FLAG_ID_S 1
+#define GLGEN_ANA_FLAG_MAP_EXT_FLAG_ID_M ICE_M(0x3F, 1)
+#define GLGEN_ANA_INV_NODE_PTYPE 0x0020C210 /* Reset Source: CORER */
+#define GLGEN_ANA_INV_NODE_PTYPE_INV_NODE_PTYPE_S 0
+#define GLGEN_ANA_INV_NODE_PTYPE_INV_NODE_PTYPE_M ICE_M(0x7FF, 0)
+#define GLGEN_ANA_INV_PTYPE_MARKER 0x0020C218 /* Reset Source: CORER */
+#define GLGEN_ANA_INV_PTYPE_MARKER_INV_PTYPE_MARKER_S 0
+#define GLGEN_ANA_INV_PTYPE_MARKER_INV_PTYPE_MARKER_M ICE_M(0x7F, 0)
+#define GLGEN_ANA_LAST_PROT_ID(_i) (0x0020C1E4 + ((_i) * 4)) /* _i=0...5 */ /* Reset Source: CORER */
+#define GLGEN_ANA_LAST_PROT_ID_MAX_INDEX 5
+#define GLGEN_ANA_LAST_PROT_ID_EN_S 0
+#define GLGEN_ANA_LAST_PROT_ID_EN_M BIT(0)
+#define GLGEN_ANA_LAST_PROT_ID_PROT_ID_S 1
+#define GLGEN_ANA_LAST_PROT_ID_PROT_ID_M ICE_M(0xFF, 1)
+#define GLGEN_ANA_NMPG_KEYMASK(_i) (0x0020C1D0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */
+#define GLGEN_ANA_NMPG_KEYMASK_MAX_INDEX 3
+#define GLGEN_ANA_NMPG_KEYMASK_HASH_KEY_S 0
+#define GLGEN_ANA_NMPG_KEYMASK_HASH_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_NMPG0_HASHKEY(_i) (0x0020C1B0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */
+#define GLGEN_ANA_NMPG0_HASHKEY_MAX_INDEX 3
+#define GLGEN_ANA_NMPG0_HASHKEY_HASH_KEY_S 0
+#define GLGEN_ANA_NMPG0_HASHKEY_HASH_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_NO_HIT_PG_NM_PG 0x0020C204 /* Reset Source: CORER */
+#define GLGEN_ANA_NO_HIT_PG_NM_PG_NPC_S 0
+#define GLGEN_ANA_NO_HIT_PG_NM_PG_NPC_M ICE_M(0xFF, 0)
+#define GLGEN_ANA_OUT_OF_PKT 0x0020C200 /* Reset Source: CORER */
+#define GLGEN_ANA_OUT_OF_PKT_NPC_S 0
+#define GLGEN_ANA_OUT_OF_PKT_NPC_M ICE_M(0xFF, 0)
+#define GLGEN_ANA_P2P(_i) (0x0020C160 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GLGEN_ANA_P2P_MAX_INDEX 15
+#define GLGEN_ANA_P2P_TARGET_PROF_S 0
+#define GLGEN_ANA_P2P_TARGET_PROF_M ICE_M(0xF, 0)
+#define GLGEN_ANA_PG_KEYMASK(_i) (0x0020C1C0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */
+#define GLGEN_ANA_PG_KEYMASK_MAX_INDEX 3
+#define GLGEN_ANA_PG_KEYMASK_HASH_KEY_S 0
+#define GLGEN_ANA_PG_KEYMASK_HASH_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_PG0_HASHKEY(_i) (0x0020C1A0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */
+#define GLGEN_ANA_PG0_HASHKEY_MAX_INDEX 3
+#define GLGEN_ANA_PG0_HASHKEY_HASH_KEY_S 0
+#define GLGEN_ANA_PG0_HASHKEY_HASH_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_PROFIL_CTRL 0x0020C1FC /* Reset Source: CORER */
+#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MDID_S 0
+#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MDID_M ICE_M(0x1F, 0)
+#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MDSTART_S 5
+#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MDSTART_M ICE_M(0xF, 5)
+#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MD_LEN_S 9
+#define GLGEN_ANA_PROFIL_CTRL_PROFILE_SELECT_MD_LEN_M ICE_M(0x1F, 9)
+#define GLGEN_ANA_PROFIL_CTRL_NUM_CTRL_DOMAIN_S 14
+#define GLGEN_ANA_PROFIL_CTRL_NUM_CTRL_DOMAIN_M ICE_M(0x3, 14)
+#define GLGEN_ANA_PROFIL_CTRL_DEF_PROF_ID_S 16
+#define GLGEN_ANA_PROFIL_CTRL_DEF_PROF_ID_M ICE_M(0xF, 16)
+#define GLGEN_ANA_PROFIL_CTRL_SEL_DEF_PROF_ID_S 20
+#define GLGEN_ANA_PROFIL_CTRL_SEL_DEF_PROF_ID_M BIT(20)
+#define GLGEN_ANA_TX_ABORT_PTYPE 0x0020D21C /* Reset Source: CORER */
+#define GLGEN_ANA_TX_ABORT_PTYPE_ABORT_S 0
+#define GLGEN_ANA_TX_ABORT_PTYPE_ABORT_M ICE_M(0x3FF, 0)
+#define GLGEN_ANA_TX_ALU_ACCSS_OUT_OF_PKT 0x0020D208 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_ALU_ACCSS_OUT_OF_PKT_NPC_S 0
+#define GLGEN_ANA_TX_ALU_ACCSS_OUT_OF_PKT_NPC_M ICE_M(0xFF, 0)
+#define GLGEN_ANA_TX_CFG_CTRL 0x0020D104 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_CFG_CTRL_LINE_IDX_S 0
+#define GLGEN_ANA_TX_CFG_CTRL_LINE_IDX_M ICE_M(0x3FFFF, 0)
+#define GLGEN_ANA_TX_CFG_CTRL_TABLE_ID_S 18
+#define GLGEN_ANA_TX_CFG_CTRL_TABLE_ID_M ICE_M(0xFF, 18)
+#define GLGEN_ANA_TX_CFG_CTRL_RESRVED_S 26
+#define GLGEN_ANA_TX_CFG_CTRL_RESRVED_M ICE_M(0x7, 26)
+#define GLGEN_ANA_TX_CFG_CTRL_OPERATION_ID_S 29
+#define GLGEN_ANA_TX_CFG_CTRL_OPERATION_ID_M ICE_M(0x7, 29)
+#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT 0x0020D158 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_HIT_S 0
+#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_HIT_M BIT(0)
+#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_PG_MEM_IDX_S 1
+#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_PG_MEM_IDX_M ICE_M(0x7, 1)
+#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_ADDR_S 4
+#define GLGEN_ANA_TX_CFG_HTBL_LU_RESULT_ADDR_M ICE_M(0x1FF, 4)
+#define GLGEN_ANA_TX_CFG_LU_KEY(_i) (0x0020D14C + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define GLGEN_ANA_TX_CFG_LU_KEY_MAX_INDEX 2
+#define GLGEN_ANA_TX_CFG_LU_KEY_LU_KEY_S 0
+#define GLGEN_ANA_TX_CFG_LU_KEY_LU_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_TX_CFG_RDDATA(_i) (0x0020D10C + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GLGEN_ANA_TX_CFG_RDDATA_MAX_INDEX 15
+#define GLGEN_ANA_TX_CFG_RDDATA_RD_DATA_S 0
+#define GLGEN_ANA_TX_CFG_RDDATA_RD_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT 0x0020D15C /* Reset Source: CORER */
+#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_HIT_S 0
+#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_HIT_M BIT(0)
+#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_RSV_S 1
+#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_RSV_M ICE_M(0x7, 1)
+#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_ADDR_S 4
+#define GLGEN_ANA_TX_CFG_SPLBUF_LU_RESULT_ADDR_M ICE_M(0x1FF, 4)
+#define GLGEN_ANA_TX_CFG_WRDATA 0x0020D108 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_CFG_WRDATA_WR_DATA_S 0
+#define GLGEN_ANA_TX_CFG_WRDATA_WR_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_TX_DEF_PTYPE 0x0020D100 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_DEF_PTYPE_DEF_PTYPE_S 0
+#define GLGEN_ANA_TX_DEF_PTYPE_DEF_PTYPE_M ICE_M(0x3FF, 0)
+#define GLGEN_ANA_TX_DFD_PACE_OUT 0x0020D4CC /* Reset Source: CORER */
+#define GLGEN_ANA_TX_DFD_PACE_OUT_PUSH_S 0
+#define GLGEN_ANA_TX_DFD_PACE_OUT_PUSH_M BIT(0)
+#define GLGEN_ANA_TX_ERR_CTRL 0x0020D220 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_ERR_CTRL_ERR_MASK_EN_S 0
+#define GLGEN_ANA_TX_ERR_CTRL_ERR_MASK_EN_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_TX_FLAG_MAP(_i) (0x0020D000 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define GLGEN_ANA_TX_FLAG_MAP_MAX_INDEX 63
+#define GLGEN_ANA_TX_FLAG_MAP_FLAG_EN_S 0
+#define GLGEN_ANA_TX_FLAG_MAP_FLAG_EN_M BIT(0)
+#define GLGEN_ANA_TX_FLAG_MAP_EXT_FLAG_ID_S 1
+#define GLGEN_ANA_TX_FLAG_MAP_EXT_FLAG_ID_M ICE_M(0x3F, 1)
+#define GLGEN_ANA_TX_INV_NODE_PTYPE 0x0020D210 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_INV_NODE_PTYPE_INV_NODE_PTYPE_S 0
+#define GLGEN_ANA_TX_INV_NODE_PTYPE_INV_NODE_PTYPE_M ICE_M(0x7FF, 0)
+#define GLGEN_ANA_TX_INV_PROT_ID 0x0020D214 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_INV_PROT_ID_INV_PROT_ID_S 0
+#define GLGEN_ANA_TX_INV_PROT_ID_INV_PROT_ID_M ICE_M(0xFF, 0)
+#define GLGEN_ANA_TX_INV_PTYPE_MARKER 0x0020D218 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_INV_PTYPE_MARKER_INV_PTYPE_MARKER_S 0
+#define GLGEN_ANA_TX_INV_PTYPE_MARKER_INV_PTYPE_MARKER_M ICE_M(0x7F, 0)
+#define GLGEN_ANA_TX_NMPG_KEYMASK(_i) (0x0020D1D0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */
+#define GLGEN_ANA_TX_NMPG_KEYMASK_MAX_INDEX 3
+#define GLGEN_ANA_TX_NMPG_KEYMASK_HASH_KEY_S 0
+#define GLGEN_ANA_TX_NMPG_KEYMASK_HASH_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_TX_NMPG0_HASHKEY(_i) (0x0020D1B0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */
+#define GLGEN_ANA_TX_NMPG0_HASHKEY_MAX_INDEX 3
+#define GLGEN_ANA_TX_NMPG0_HASHKEY_HASH_KEY_S 0
+#define GLGEN_ANA_TX_NMPG0_HASHKEY_HASH_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_TX_NO_HIT_PG_NM_PG 0x0020D204 /* Reset Source: CORER */
+#define GLGEN_ANA_TX_NO_HIT_PG_NM_PG_NPC_S 0
+#define GLGEN_ANA_TX_NO_HIT_PG_NM_PG_NPC_M ICE_M(0xFF, 0)
+#define GLGEN_ANA_TX_P2P(_i) (0x0020D160 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */
+#define GLGEN_ANA_TX_P2P_MAX_INDEX 15
+#define GLGEN_ANA_TX_P2P_TARGET_PROF_S 0
+#define GLGEN_ANA_TX_P2P_TARGET_PROF_M ICE_M(0xF, 0)
+#define GLGEN_ANA_TX_PG_KEYMASK(_i) (0x0020D1C0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */
+#define GLGEN_ANA_TX_PG_KEYMASK_MAX_INDEX 3
+#define GLGEN_ANA_TX_PG_KEYMASK_HASH_KEY_S 0
+#define GLGEN_ANA_TX_PG_KEYMASK_HASH_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_TX_PG0_HASHKEY(_i) (0x0020D1A0 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */
+#define GLGEN_ANA_TX_PG0_HASHKEY_MAX_INDEX 3
+#define GLGEN_ANA_TX_PG0_HASHKEY_HASH_KEY_S 0
+#define GLGEN_ANA_TX_PG0_HASHKEY_HASH_KEY_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ANA_TX_PROFIL_CTRL 0x0020D1FC /* Reset Source: CORER */
+#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MDID_S 0
+#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MDID_M ICE_M(0x1F, 0)
+#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MDSTART_S 5
+#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MDSTART_M ICE_M(0xF, 5)
+#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MD_LEN_S 9
+#define GLGEN_ANA_TX_PROFIL_CTRL_PROFILE_SELECT_MD_LEN_M ICE_M(0x1F, 9)
+#define GLGEN_ANA_TX_PROFIL_CTRL_NUM_CTRL_DOMAIN_S 14
+#define GLGEN_ANA_TX_PROFIL_CTRL_NUM_CTRL_DOMAIN_M ICE_M(0x3, 14)
+#define GLGEN_ANA_TX_PROFIL_CTRL_DEF_PROF_ID_S 16
+#define GLGEN_ANA_TX_PROFIL_CTRL_DEF_PROF_ID_M ICE_M(0xF, 16)
+#define GLGEN_ANA_TX_PROFIL_CTRL_SEL_DEF_PROF_ID_S 20
+#define GLGEN_ANA_TX_PROFIL_CTRL_SEL_DEF_PROF_ID_M BIT(20)
+#define GLGEN_ASSERT_HLP 0x000B81E4 /* Reset Source: POR */
+#define GLGEN_ASSERT_HLP_CORE_ON_RST_S 0
+#define GLGEN_ASSERT_HLP_CORE_ON_RST_M BIT(0)
+#define GLGEN_ASSERT_HLP_FULL_ON_RST_S 1
+#define GLGEN_ASSERT_HLP_FULL_ON_RST_M BIT(1)
+#define GLGEN_CLKSTAT 0x000B8184 /* Reset Source: POR */
+#define GLGEN_CLKSTAT_U_CLK_SPEED_S 0
+#define GLGEN_CLKSTAT_U_CLK_SPEED_M ICE_M(0x7, 0)
+#define GLGEN_CLKSTAT_L_CLK_SPEED_S 3
+#define GLGEN_CLKSTAT_L_CLK_SPEED_M ICE_M(0x7, 3)
+#define GLGEN_CLKSTAT_PSM_CLK_SPEED_S 6
+#define GLGEN_CLKSTAT_PSM_CLK_SPEED_M ICE_M(0x7, 6)
+#define GLGEN_CLKSTAT_RXCTL_CLK_SPEED_S 9
+#define GLGEN_CLKSTAT_RXCTL_CLK_SPEED_M ICE_M(0x7, 9)
+#define GLGEN_CLKSTAT_UANA_CLK_SPEED_S 12
+#define GLGEN_CLKSTAT_UANA_CLK_SPEED_M ICE_M(0x7, 12)
+#define GLGEN_CLKSTAT_PE_CLK_SPEED_S 18
+#define GLGEN_CLKSTAT_PE_CLK_SPEED_M ICE_M(0x7, 18)
+#define GLGEN_CLKSTAT_SRC 0x000B826C /* Reset Source: POR */
+#define GLGEN_CLKSTAT_SRC_U_CLK_SRC_S 0
+#define GLGEN_CLKSTAT_SRC_U_CLK_SRC_M ICE_M(0x3, 0)
+#define GLGEN_CLKSTAT_SRC_L_CLK_SRC_S 2
+#define GLGEN_CLKSTAT_SRC_L_CLK_SRC_M ICE_M(0x3, 2)
+#define GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_S 4
+#define GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_M ICE_M(0x3, 4)
+#define GLGEN_CLKSTAT_SRC_RXCTL_CLK_SRC_S 6
+#define GLGEN_CLKSTAT_SRC_RXCTL_CLK_SRC_M ICE_M(0x3, 6)
+#define GLGEN_CLKSTAT_SRC_UANA_CLK_SRC_S 8
+#define GLGEN_CLKSTAT_SRC_UANA_CLK_SRC_M ICE_M(0xF, 8)
+#define GLGEN_ECC_ERR_INT_TOG_MASK_H 0x00093A00 /* Reset Source: CORER */
+#define GLGEN_ECC_ERR_INT_TOG_MASK_H_CLIENT_NUM_S 0
+#define GLGEN_ECC_ERR_INT_TOG_MASK_H_CLIENT_NUM_M ICE_M(0x7F, 0)
+#define GLGEN_ECC_ERR_INT_TOG_MASK_L 0x000939FC /* Reset Source: CORER */
+#define GLGEN_ECC_ERR_INT_TOG_MASK_L_CLIENT_NUM_S 0
+#define GLGEN_ECC_ERR_INT_TOG_MASK_L_CLIENT_NUM_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_ECC_ERR_RST_MASK_H 0x000939F8 /* Reset Source: CORER */
+#define GLGEN_ECC_ERR_RST_MASK_H_CLIENT_NUM_S 0
+#define GLGEN_ECC_ERR_RST_MASK_H_CLIENT_NUM_M ICE_M(0x7F, 0)
+#define GLGEN_ECC_ERR_RST_MASK_L 0x000939F4 /* Reset Source: CORER */
+#define GLGEN_ECC_ERR_RST_MASK_L_CLIENT_NUM_S 0
+#define GLGEN_ECC_ERR_RST_MASK_L_CLIENT_NUM_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_GPIO_CTL(_i) (0x000880C8 + ((_i) * 4)) /* _i=0...6 */ /* Reset Source: POR */
+#define GLGEN_GPIO_CTL_MAX_INDEX 6
+#define GLGEN_GPIO_CTL_IN_VALUE_S 0
+#define GLGEN_GPIO_CTL_IN_VALUE_M BIT(0)
+#define GLGEN_GPIO_CTL_IN_TRANSIT_S 1
+#define GLGEN_GPIO_CTL_IN_TRANSIT_M BIT(1)
+#define GLGEN_GPIO_CTL_OUT_VALUE_S 2
+#define GLGEN_GPIO_CTL_OUT_VALUE_M BIT(2)
+#define GLGEN_GPIO_CTL_NO_P_UP_S 3
+#define GLGEN_GPIO_CTL_NO_P_UP_M BIT(3)
+#define GLGEN_GPIO_CTL_PIN_DIR_S 4
+#define GLGEN_GPIO_CTL_PIN_DIR_M BIT(4)
+#define GLGEN_GPIO_CTL_TRI_CTL_S 5
+#define GLGEN_GPIO_CTL_TRI_CTL_M BIT(5)
+#define GLGEN_GPIO_CTL_PIN_FUNC_S 8
+#define GLGEN_GPIO_CTL_PIN_FUNC_M ICE_M(0xF, 8)
+#define GLGEN_GPIO_CTL_INT_MODE_S 12
+#define GLGEN_GPIO_CTL_INT_MODE_M ICE_M(0x3, 12)
+#define GLGEN_MARKER_COUNT 0x000939E8 /* Reset Source: CORER */
+#define GLGEN_MARKER_COUNT_MARKER_COUNT_S 0
+#define GLGEN_MARKER_COUNT_MARKER_COUNT_M ICE_M(0xFF, 0)
+#define GLGEN_MARKER_COUNT_MARKER_COUNT_EN_S 31
+#define GLGEN_MARKER_COUNT_MARKER_COUNT_EN_M BIT(31)
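/*
 * Editor's note (illustration only, not part of the patch): a multi-field
 * register such as GLGEN_GPIO_CTL above is typically updated by
 * read-modify-write, clearing a field with its _M mask and OR-ing in the
 * new value shifted by its _S offset. Assuming the driver's rd32()/wr32()
 * accessors, a valid hw pointer and hypothetical pin/func values:
 *
 *	u32 ctl = rd32(hw, GLGEN_GPIO_CTL(pin));
 *
 *	ctl &= ~GLGEN_GPIO_CTL_PIN_FUNC_M;
 *	ctl |= (func << GLGEN_GPIO_CTL_PIN_FUNC_S) & GLGEN_GPIO_CTL_PIN_FUNC_M;
 *	ctl |= GLGEN_GPIO_CTL_PIN_DIR_M;
 *	wr32(hw, GLGEN_GPIO_CTL(pin), ctl);
 */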
+#define GLGEN_RSTAT 0x000B8188 /* Reset Source: POR */
+#define GLGEN_RSTAT_DEVSTATE_S 0
 #define GLGEN_RSTAT_DEVSTATE_M ICE_M(0x3, 0)
-#define GLGEN_RSTCTL 0x000B8180
-#define GLGEN_RSTCTL_GRSTDEL_S 0
-#define GLGEN_RSTCTL_GRSTDEL_M ICE_M(0x3F, GLGEN_RSTCTL_GRSTDEL_S)
 #define GLGEN_RSTAT_RESET_TYPE_S 2
 #define GLGEN_RSTAT_RESET_TYPE_M ICE_M(0x3, 2)
-#define GLGEN_RTRIG 0x000B8190
+#define GLGEN_RSTAT_CORERCNT_S 4
+#define GLGEN_RSTAT_CORERCNT_M ICE_M(0x3, 4)
+#define GLGEN_RSTAT_GLOBRCNT_S 6
+#define GLGEN_RSTAT_GLOBRCNT_M ICE_M(0x3, 6)
+#define GLGEN_RSTAT_EMPRCNT_S 8
+#define GLGEN_RSTAT_EMPRCNT_M ICE_M(0x3, 8)
+#define GLGEN_RSTAT_TIME_TO_RST_S 10
+#define GLGEN_RSTAT_TIME_TO_RST_M ICE_M(0x3F, 10)
+#define GLGEN_RSTAT_RTRIG_FLR_S 16
+#define GLGEN_RSTAT_RTRIG_FLR_M BIT(16)
+#define GLGEN_RSTAT_RTRIG_ECC_S 17
+#define GLGEN_RSTAT_RTRIG_ECC_M BIT(17)
+#define GLGEN_RSTAT_RTRIG_FW_AUX_S 18
+#define GLGEN_RSTAT_RTRIG_FW_AUX_M BIT(18)
+#define GLGEN_RSTCTL 0x000B8180 /* Reset Source: POR */
+#define GLGEN_RSTCTL_GRSTDEL_S 0
+#define GLGEN_RSTCTL_GRSTDEL_M ICE_M(0x3F, 0)
+#define GLGEN_RSTCTL_ECC_RST_ENA_S 8
+#define GLGEN_RSTCTL_ECC_RST_ENA_M BIT(8)
+#define GLGEN_RSTCTL_ECC_RT_EN_S 30
+#define GLGEN_RSTCTL_ECC_RT_EN_M BIT(30)
+#define GLGEN_RSTCTL_FLR_RT_EN_S 31
+#define GLGEN_RSTCTL_FLR_RT_EN_M BIT(31)
+#define GLGEN_RTRIG 0x000B8190 /* Reset Source: CORER */
+#define GLGEN_RTRIG_CORER_S 0
 #define GLGEN_RTRIG_CORER_M BIT(0)
+#define GLGEN_RTRIG_GLOBR_S 1
 #define GLGEN_RTRIG_GLOBR_M BIT(1)
-#define GLGEN_STAT 0x000B612C
-#define GLGEN_VFLRSTAT(_i) (0x00093A04 + ((_i) * 4))
-#define PFGEN_CTRL 0x00091000
+#define GLGEN_RTRIG_EMPFWR_S 2
+#define GLGEN_RTRIG_EMPFWR_M BIT(2)
+#define GLGEN_STAT 0x000B612C /* Reset Source: POR */
+#define GLGEN_STAT_RSVD4FW_S 0
+#define GLGEN_STAT_RSVD4FW_M ICE_M(0xFF, 0)
+#define GLGEN_VFLRSTAT(_i) (0x00093A04 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLGEN_VFLRSTAT_MAX_INDEX 7
+#define GLGEN_VFLRSTAT_VFLRS_S 0
+#define GLGEN_VFLRSTAT_VFLRS_M ICE_M(0xFFFFFFFF, 0)
+#define GLGEN_XLR_MSK2HLP_RDY 0x000939F0 /* Reset Source: CORER */
+#define GLGEN_XLR_MSK2HLP_RDY_GLGEN_XLR_MSK2HLP_RDY_S 0
+#define GLGEN_XLR_MSK2HLP_RDY_GLGEN_XLR_MSK2HLP_RDY_M BIT(0)
+#define GLGEN_XLR_TRNS_WAIT_COUNT 0x000939EC /* Reset Source: CORER */
+#define GLGEN_XLR_TRNS_WAIT_COUNT_W_BTWN_TRNS_COUNT_S 0
+#define GLGEN_XLR_TRNS_WAIT_COUNT_W_BTWN_TRNS_COUNT_M ICE_M(0x1F, 0)
+#define GLGEN_XLR_TRNS_WAIT_COUNT_W_PEND_TRNS_COUNT_S 8
+#define GLGEN_XLR_TRNS_WAIT_COUNT_W_PEND_TRNS_COUNT_M ICE_M(0xFF, 8)
+#define GLVFGEN_TIMER 0x000B8214 /* Reset Source: POR */
+#define GLVFGEN_TIMER_GTIME_S 0
+#define GLVFGEN_TIMER_GTIME_M ICE_M(0xFFFFFFFF, 0)
+#define PFGEN_CTRL 0x00091000 /* Reset Source: CORER */
+#define PFGEN_CTRL_PFSWR_S 0
 #define PFGEN_CTRL_PFSWR_M BIT(0)
-#define PFGEN_STATE 0x00088000
-#define PRTGEN_STATUS 0x000B8100
-#define VFGEN_RSTAT(_VF) (0x00074000 + ((_VF) * 4))
-#define VPGEN_VFRSTAT(_VF) (0x00090800 + ((_VF) * 4))
+#define PFGEN_DRUN 0x00091180 /* Reset Source: CORER */
+#define PFGEN_DRUN_DRVUNLD_S 0
+#define PFGEN_DRUN_DRVUNLD_M BIT(0)
+#define PFGEN_PFRSTAT 0x00091080 /* Reset Source: CORER */
+#define PFGEN_PFRSTAT_PFRD_S 0
+#define PFGEN_PFRSTAT_PFRD_M BIT(0)
+#define PFGEN_PORTNUM 0x001D2400 /* Reset Source: CORER */
+#define PFGEN_PORTNUM_PORT_NUM_S 0
+#define PFGEN_PORTNUM_PORT_NUM_M ICE_M(0x7, 0)
+#define PFGEN_STATE 0x00088000 /* Reset Source: CORER */
+#define PFGEN_STATE_PFPEEN_S 0
+#define PFGEN_STATE_PFPEEN_M BIT(0)
+#define PFGEN_STATE_RSVD_S 1
+#define PFGEN_STATE_RSVD_M BIT(1)
+#define PFGEN_STATE_PFLINKEN_S 2
+#define PFGEN_STATE_PFLINKEN_M BIT(2)
+#define PFGEN_STATE_PFSCEN_S 3
+#define PFGEN_STATE_PFSCEN_M BIT(3)
+#define PRT_TCVMLR_DRAIN_CNTR 0x000A21C0 /* Reset Source: CORER */
+#define PRT_TCVMLR_DRAIN_CNTR_CNTR_S 0
+#define PRT_TCVMLR_DRAIN_CNTR_CNTR_M ICE_M(0x3FFF, 0)
+#define PRTGEN_CNF 0x000B8120 /* Reset Source: POR */
+#define PRTGEN_CNF_PORT_DIS_S 0
+#define PRTGEN_CNF_PORT_DIS_M BIT(0)
+#define PRTGEN_CNF_ALLOW_PORT_DIS_S 1
+#define PRTGEN_CNF_ALLOW_PORT_DIS_M BIT(1)
+#define PRTGEN_CNF_EMP_PORT_DIS_S 2
+#define PRTGEN_CNF_EMP_PORT_DIS_M BIT(2)
+#define PRTGEN_CNF2 0x000B8160 /* Reset Source: POR */
+#define PRTGEN_CNF2_ACTIVATE_PORT_LINK_S 0
+#define PRTGEN_CNF2_ACTIVATE_PORT_LINK_M BIT(0)
+#define PRTGEN_CNF3 0x000B8280 /* Reset Source: POR */
+#define PRTGEN_CNF3_PORT_STAGERING_EN_S 0
+#define PRTGEN_CNF3_PORT_STAGERING_EN_M BIT(0)
+#define PRTGEN_STATUS 0x000B8100 /* Reset Source: POR */
+#define PRTGEN_STATUS_PORT_VALID_S 0
+#define PRTGEN_STATUS_PORT_VALID_M BIT(0)
+#define PRTGEN_STATUS_PORT_ACTIVE_S 1
+#define PRTGEN_STATUS_PORT_ACTIVE_M BIT(1)
+#define VFGEN_RSTAT(_VF) (0x00074000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: VFR */
+#define VFGEN_RSTAT_MAX_INDEX 255
+#define VFGEN_RSTAT_VFR_STATE_S 0
+#define VFGEN_RSTAT_VFR_STATE_M ICE_M(0x3, 0)
+#define VPGEN_VFRSTAT(_VF) (0x00090800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VPGEN_VFRSTAT_MAX_INDEX 255
+#define VPGEN_VFRSTAT_VFRD_S 0
 #define VPGEN_VFRSTAT_VFRD_M BIT(0)
-#define VPGEN_VFRTRIG(_VF) (0x00090000 + ((_VF) * 4))
+#define VPGEN_VFRTRIG(_VF) (0x00090000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VPGEN_VFRTRIG_MAX_INDEX 255
+#define VPGEN_VFRTRIG_VFSWR_S 0
 #define VPGEN_VFRTRIG_VFSWR_M BIT(0)
-#define PFHMC_ERRORDATA 0x00520500
-#define PFHMC_ERRORINFO 0x00520400
-#define GLINT_CTL 0x0016CC54
+#define VSIGEN_RSTAT(_VSI) (0x00092800 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSIGEN_RSTAT_MAX_INDEX 767
+#define VSIGEN_RSTAT_VMRD_S 0
+#define VSIGEN_RSTAT_VMRD_M BIT(0)
+#define VSIGEN_RTRIG(_VSI) (0x00091800 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSIGEN_RTRIG_MAX_INDEX 767
+#define VSIGEN_RTRIG_VMSWR_S 0
+#define VSIGEN_RTRIG_VMSWR_M BIT(0)
+#define GLHMC_APBVTINUSEBASE(_i) (0x00524A00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_APBVTINUSEBASE_MAX_INDEX 7
+#define GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_S 0
+#define GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_CEQPART(_i) (0x005031C0 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_CEQPART_MAX_INDEX 7
+#define GLHMC_CEQPART_PMCEQBASE_S 0
+#define GLHMC_CEQPART_PMCEQBASE_M ICE_M(0x3FF, 0)
+#define GLHMC_CEQPART_PMCEQSIZE_S 16
+#define GLHMC_CEQPART_PMCEQSIZE_M ICE_M(0x3FF, 16)
+#define GLHMC_DBCQMAX 0x005220F0 /* Reset Source: CORER */
+#define GLHMC_DBCQMAX_GLHMC_DBCQMAX_S 0
+#define GLHMC_DBCQMAX_GLHMC_DBCQMAX_M ICE_M(0xFFFFF, 0)
+#define GLHMC_DBCQPART(_i) (0x00503180 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_DBCQPART_MAX_INDEX 7
+#define GLHMC_DBCQPART_PMDBCQBASE_S 0
+#define GLHMC_DBCQPART_PMDBCQBASE_M ICE_M(0x3FFF, 0)
+#define GLHMC_DBCQPART_PMDBCQSIZE_S 16
+#define GLHMC_DBCQPART_PMDBCQSIZE_M ICE_M(0x7FFF, 16)
+#define GLHMC_DBQPMAX 0x005220EC /* Reset Source: CORER */
+#define GLHMC_DBQPMAX_GLHMC_DBQPMAX_S 0
+#define GLHMC_DBQPMAX_GLHMC_DBQPMAX_M ICE_M(0x7FFFF, 0)
+#define GLHMC_DBQPPART(_i) (0x005044C0 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_DBQPPART_MAX_INDEX 7
+#define GLHMC_DBQPPART_PMDBQPBASE_S 0
+#define GLHMC_DBQPPART_PMDBQPBASE_M ICE_M(0x3FFF, 0)
+#define GLHMC_DBQPPART_PMDBQPSIZE_S 16
+#define GLHMC_DBQPPART_PMDBQPSIZE_M ICE_M(0x7FFF, 16)
+#define GLHMC_FSIAVBASE(_i) (0x00525600 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_FSIAVBASE_MAX_INDEX 7
+#define GLHMC_FSIAVBASE_FPMFSIAVBASE_S 0
+#define GLHMC_FSIAVBASE_FPMFSIAVBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_FSIAVCNT(_i) (0x00525700 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_FSIAVCNT_MAX_INDEX 7
+#define GLHMC_FSIAVCNT_FPMFSIAVCNT_S 0
+#define GLHMC_FSIAVCNT_FPMFSIAVCNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_FSIAVMAX 0x00522068 /* Reset Source: CORER */
+#define GLHMC_FSIAVMAX_PMFSIAVMAX_S 0
+#define GLHMC_FSIAVMAX_PMFSIAVMAX_M ICE_M(0x3FFFF, 0)
+#define GLHMC_FSIAVOBJSZ 0x00522064 /* Reset Source: CORER */
+#define GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_S 0
+#define GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_FSIMCBASE(_i) (0x00526000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_FSIMCBASE_MAX_INDEX 7
+#define GLHMC_FSIMCBASE_FPMFSIMCBASE_S 0
+#define GLHMC_FSIMCBASE_FPMFSIMCBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_FSIMCCNT(_i) (0x00526100 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_FSIMCCNT_MAX_INDEX 7
+#define GLHMC_FSIMCCNT_FPMFSIMCSZ_S 0
+#define GLHMC_FSIMCCNT_FPMFSIMCSZ_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_FSIMCMAX 0x00522060 /* Reset Source: CORER */
+#define GLHMC_FSIMCMAX_PMFSIMCMAX_S 0
+#define GLHMC_FSIMCMAX_PMFSIMCMAX_M ICE_M(0x3FFF, 0)
+#define GLHMC_FSIMCOBJSZ 0x0052205C /* Reset Source: CORER */
+#define GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_S 0
+#define GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_FWPDINV 0x0052207C /* Reset Source: CORER */
+#define GLHMC_FWPDINV_PMSDIDX_S 0
+#define GLHMC_FWPDINV_PMSDIDX_M ICE_M(0xFFF, 0)
+#define GLHMC_FWPDINV_PMSDPARTSEL_S 15
+#define GLHMC_FWPDINV_PMSDPARTSEL_M BIT(15)
+#define GLHMC_FWPDINV_PMPDIDX_S 16
+#define GLHMC_FWPDINV_PMPDIDX_M ICE_M(0x1FF, 16)
+#define GLHMC_FWPDINV_FPMAT 0x0010207C /* Reset Source: CORER */
+#define GLHMC_FWPDINV_FPMAT_PMSDIDX_S 0
+#define GLHMC_FWPDINV_FPMAT_PMSDIDX_M ICE_M(0xFFF, 0)
+#define GLHMC_FWPDINV_FPMAT_PMSDPARTSEL_S 15
+#define GLHMC_FWPDINV_FPMAT_PMSDPARTSEL_M BIT(15)
+#define GLHMC_FWPDINV_FPMAT_PMPDIDX_S 16
+#define GLHMC_FWPDINV_FPMAT_PMPDIDX_M ICE_M(0x1FF, 16)
+#define GLHMC_FWSDDATAHIGH 0x00522078 /* Reset Source: CORER */
+#define GLHMC_FWSDDATAHIGH_PMSDDATAHIGH_S 0
+#define GLHMC_FWSDDATAHIGH_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_FWSDDATAHIGH_FPMAT 0x00102078 /* Reset Source: CORER */
+#define GLHMC_FWSDDATAHIGH_FPMAT_PMSDDATAHIGH_S 0
+#define GLHMC_FWSDDATAHIGH_FPMAT_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_FWSDDATALOW 0x00522074 /* Reset Source: CORER */
+#define GLHMC_FWSDDATALOW_PMSDVALID_S 0
+#define GLHMC_FWSDDATALOW_PMSDVALID_M BIT(0)
+#define GLHMC_FWSDDATALOW_PMSDTYPE_S 1
+#define GLHMC_FWSDDATALOW_PMSDTYPE_M BIT(1)
+#define GLHMC_FWSDDATALOW_PMSDBPCOUNT_S 2
+#define GLHMC_FWSDDATALOW_PMSDBPCOUNT_M ICE_M(0x3FF, 2)
+#define GLHMC_FWSDDATALOW_PMSDDATALOW_S 12
+#define GLHMC_FWSDDATALOW_PMSDDATALOW_M ICE_M(0xFFFFF, 12)
+#define GLHMC_FWSDDATALOW_FPMAT 0x00102074 /* Reset Source: CORER */
+#define GLHMC_FWSDDATALOW_FPMAT_PMSDVALID_S 0
+#define GLHMC_FWSDDATALOW_FPMAT_PMSDVALID_M BIT(0)
+#define GLHMC_FWSDDATALOW_FPMAT_PMSDTYPE_S 1
+#define GLHMC_FWSDDATALOW_FPMAT_PMSDTYPE_M BIT(1)
+#define GLHMC_FWSDDATALOW_FPMAT_PMSDBPCOUNT_S 2
+#define GLHMC_FWSDDATALOW_FPMAT_PMSDBPCOUNT_M ICE_M(0x3FF, 2)
+#define GLHMC_FWSDDATALOW_FPMAT_PMSDDATALOW_S 12
+#define GLHMC_FWSDDATALOW_FPMAT_PMSDDATALOW_M ICE_M(0xFFFFF, 12)
+#define GLHMC_PEARPBASE(_i) (0x00524800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEARPBASE_MAX_INDEX 7
+#define GLHMC_PEARPBASE_FPMPEARPBASE_S 0
+#define GLHMC_PEARPBASE_FPMPEARPBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEARPCNT(_i) (0x00524900 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEARPCNT_MAX_INDEX 7
+#define GLHMC_PEARPCNT_FPMPEARPCNT_S 0
+#define GLHMC_PEARPCNT_FPMPEARPCNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEARPMAX 0x00522038 /* Reset Source: CORER */
+#define GLHMC_PEARPMAX_PMPEARPMAX_S 0
+#define GLHMC_PEARPMAX_PMPEARPMAX_M ICE_M(0x1FFFF, 0)
+#define GLHMC_PEARPOBJSZ 0x00522034 /* Reset Source: CORER */
+#define GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_S 0
+#define GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_M ICE_M(0x7, 0)
+#define GLHMC_PECQBASE(_i) (0x00524200 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PECQBASE_MAX_INDEX 7
+#define GLHMC_PECQBASE_FPMPECQBASE_S 0
+#define GLHMC_PECQBASE_FPMPECQBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PECQCNT(_i) (0x00524300 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PECQCNT_MAX_INDEX 7
+#define GLHMC_PECQCNT_FPMPECQCNT_S 0
+#define GLHMC_PECQCNT_FPMPECQCNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PECQOBJSZ 0x00522020 /* Reset Source: CORER */
+#define GLHMC_PECQOBJSZ_PMPECQOBJSZ_S 0
+#define GLHMC_PECQOBJSZ_PMPECQOBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PEHDRBASE(_i) (0x00526200 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEHDRBASE_MAX_INDEX 7
+#define GLHMC_PEHDRBASE_GLHMC_PEHDRBASE_S 0
+#define GLHMC_PEHDRBASE_GLHMC_PEHDRBASE_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_PEHDRCNT(_i) (0x00526300 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEHDRCNT_MAX_INDEX 7
+#define GLHMC_PEHDRCNT_GLHMC_PEHDRCNT_S 0
+#define GLHMC_PEHDRCNT_GLHMC_PEHDRCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_PEHDRMAX 0x00522008 /* Reset Source: CORER */
+#define GLHMC_PEHDRMAX_PMPEHDRMAX_S 0
+#define GLHMC_PEHDRMAX_PMPEHDRMAX_M ICE_M(0x7FFFF, 0)
+#define GLHMC_PEHDRMAX_RSVD_S 19
+#define GLHMC_PEHDRMAX_RSVD_M ICE_M(0x1FFF, 19)
+#define GLHMC_PEHDROBJSZ 0x00522004 /* Reset Source: CORER */
+#define GLHMC_PEHDROBJSZ_PMPEHDROBJSZ_S 0
+#define GLHMC_PEHDROBJSZ_PMPEHDROBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PEHDROBJSZ_RSVD_S 4
+#define GLHMC_PEHDROBJSZ_RSVD_M ICE_M(0xFFFFFFF, 4)
+#define GLHMC_PEHTCNT(_i) (0x00524700 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEHTCNT_MAX_INDEX 7
+#define GLHMC_PEHTCNT_FPMPEHTCNT_S 0
+#define GLHMC_PEHTCNT_FPMPEHTCNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEHTCNT_FPMAT(_i) (0x00104700 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEHTCNT_FPMAT_MAX_INDEX 7
+#define GLHMC_PEHTCNT_FPMAT_FPMPEHTCNT_S 0
+#define GLHMC_PEHTCNT_FPMAT_FPMPEHTCNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEHTEBASE(_i) (0x00524600 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEHTEBASE_MAX_INDEX 7
+#define GLHMC_PEHTEBASE_FPMPEHTEBASE_S 0
+#define GLHMC_PEHTEBASE_FPMPEHTEBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEHTEBASE_FPMAT(_i) (0x00104600 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEHTEBASE_FPMAT_MAX_INDEX 7
+#define GLHMC_PEHTEBASE_FPMAT_FPMPEHTEBASE_S 0
+#define GLHMC_PEHTEBASE_FPMAT_FPMPEHTEBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEHTEOBJSZ 0x0052202C /* Reset Source: CORER */
+#define GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_S 0
+#define GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PEHTEOBJSZ_FPMAT 0x0010202C /* Reset Source: CORER */
+#define GLHMC_PEHTEOBJSZ_FPMAT_PMPEHTEOBJSZ_S 0
+#define GLHMC_PEHTEOBJSZ_FPMAT_PMPEHTEOBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PEHTMAX 0x00522030 /* Reset Source: CORER */
+#define GLHMC_PEHTMAX_PMPEHTMAX_S 0
+#define GLHMC_PEHTMAX_PMPEHTMAX_M ICE_M(0x1FFFFF, 0)
+#define GLHMC_PEHTMAX_FPMAT 0x00102030 /* Reset Source: CORER */
+#define GLHMC_PEHTMAX_FPMAT_PMPEHTMAX_S 0
+#define GLHMC_PEHTMAX_FPMAT_PMPEHTMAX_M ICE_M(0x1FFFFF, 0)
+#define GLHMC_PEMDBASE(_i) (0x00526400 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEMDBASE_MAX_INDEX 7
+#define GLHMC_PEMDBASE_GLHMC_PEMDBASE_S 0
+#define GLHMC_PEMDBASE_GLHMC_PEMDBASE_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_PEMDCNT(_i) (0x00526500 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEMDCNT_MAX_INDEX 7
+#define GLHMC_PEMDCNT_GLHMC_PEMDCNT_S 0
+#define GLHMC_PEMDCNT_GLHMC_PEMDCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_PEMDMAX 0x00522010 /* Reset Source: CORER */
+#define GLHMC_PEMDMAX_PMPEMDMAX_S 0
+#define GLHMC_PEMDMAX_PMPEMDMAX_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEMDMAX_RSVD_S 24
+#define GLHMC_PEMDMAX_RSVD_M ICE_M(0xFF, 24)
+#define GLHMC_PEMDOBJSZ 0x0052200C /* Reset Source: CORER */
+#define GLHMC_PEMDOBJSZ_PMPEMDOBJSZ_S 0
+#define GLHMC_PEMDOBJSZ_PMPEMDOBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PEMDOBJSZ_RSVD_S 4
+#define GLHMC_PEMDOBJSZ_RSVD_M ICE_M(0xFFFFFFF, 4)
+#define GLHMC_PEMRBASE(_i) (0x00524C00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEMRBASE_MAX_INDEX 7
+#define GLHMC_PEMRBASE_FPMPEMRBASE_S 0
+#define GLHMC_PEMRBASE_FPMPEMRBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEMRCNT(_i) (0x00524D00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEMRCNT_MAX_INDEX 7
+#define GLHMC_PEMRCNT_FPMPEMRSZ_S 0
+#define GLHMC_PEMRCNT_FPMPEMRSZ_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEMRMAX 0x00522040 /* Reset Source: CORER */
+#define GLHMC_PEMRMAX_PMPEMRMAX_S 0
+#define GLHMC_PEMRMAX_PMPEMRMAX_M ICE_M(0x7FFFFF, 0)
+#define GLHMC_PEMROBJSZ 0x0052203C /* Reset Source: CORER */
+#define GLHMC_PEMROBJSZ_PMPEMROBJSZ_S 0
+#define GLHMC_PEMROBJSZ_PMPEMROBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PEOOISCBASE(_i) (0x00526600 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEOOISCBASE_MAX_INDEX 7
+#define GLHMC_PEOOISCBASE_GLHMC_PEOOISCBASE_S 0
+#define GLHMC_PEOOISCBASE_GLHMC_PEOOISCBASE_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_PEOOISCCNT(_i) (0x00526700 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEOOISCCNT_MAX_INDEX 7
+#define GLHMC_PEOOISCCNT_GLHMC_PEOOISCCNT_S 0
+#define GLHMC_PEOOISCCNT_GLHMC_PEOOISCCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_PEOOISCFFLBASE(_i) (0x00526C00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEOOISCFFLBASE_MAX_INDEX 7
+#define GLHMC_PEOOISCFFLBASE_GLHMC_PEOOISCFFLBASE_S 0
+#define GLHMC_PEOOISCFFLBASE_GLHMC_PEOOISCFFLBASE_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_PEOOISCFFLCNT_PMAT(_i) (0x00526D00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEOOISCFFLCNT_PMAT_MAX_INDEX 7
+#define GLHMC_PEOOISCFFLCNT_PMAT_FPMPEOOISCFLCNT_S 0
+#define GLHMC_PEOOISCFFLCNT_PMAT_FPMPEOOISCFLCNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEOOISCFFLMAX 0x005220A4 /* Reset Source: CORER */
+#define GLHMC_PEOOISCFFLMAX_PMPEOOISCFFLMAX_S 0
+#define GLHMC_PEOOISCFFLMAX_PMPEOOISCFFLMAX_M ICE_M(0x7FFFF, 0)
+#define GLHMC_PEOOISCFFLMAX_RSVD_S 19
+#define GLHMC_PEOOISCFFLMAX_RSVD_M ICE_M(0x1FFF, 19)
+#define GLHMC_PEOOISCMAX 0x00522018 /* Reset Source: CORER */
+#define GLHMC_PEOOISCMAX_PMPEOOISCMAX_S 0
+#define GLHMC_PEOOISCMAX_PMPEOOISCMAX_M ICE_M(0x7FFFF, 0)
+#define GLHMC_PEOOISCMAX_RSVD_S 19
+#define GLHMC_PEOOISCMAX_RSVD_M ICE_M(0x1FFF, 19)
+#define GLHMC_PEOOISCOBJSZ 0x00522014 /* Reset Source: CORER */
+#define GLHMC_PEOOISCOBJSZ_PMPEOOISCOBJSZ_S 0
+#define GLHMC_PEOOISCOBJSZ_PMPEOOISCOBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PEOOISCOBJSZ_RSVD_S 4
+#define GLHMC_PEOOISCOBJSZ_RSVD_M ICE_M(0xFFFFFFF, 4)
+#define GLHMC_PEPBLBASE(_i) (0x00525800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEPBLBASE_MAX_INDEX 7
+#define GLHMC_PEPBLBASE_FPMPEPBLBASE_S 0
+#define GLHMC_PEPBLBASE_FPMPEPBLBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEPBLCNT(_i) (0x00525900 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEPBLCNT_MAX_INDEX 7
+#define GLHMC_PEPBLCNT_FPMPEPBLCNT_S 0
+#define GLHMC_PEPBLCNT_FPMPEPBLCNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEPBLMAX 0x0052206C /* Reset Source: CORER */
+#define GLHMC_PEPBLMAX_PMPEPBLMAX_S 0
+#define GLHMC_PEPBLMAX_PMPEPBLMAX_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEQ1BASE(_i) (0x00525200 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEQ1BASE_MAX_INDEX 7
+#define GLHMC_PEQ1BASE_FPMPEQ1BASE_S 0
+#define GLHMC_PEQ1BASE_FPMPEQ1BASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEQ1CNT(_i) (0x00525300 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEQ1CNT_MAX_INDEX 7
+#define GLHMC_PEQ1CNT_FPMPEQ1CNT_S 0
+#define GLHMC_PEQ1CNT_FPMPEQ1CNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEQ1FLBASE(_i) (0x00525400 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEQ1FLBASE_MAX_INDEX 7
+#define GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_S 0
+#define GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEQ1FLMAX 0x00522058 /* Reset Source: CORER */
+#define GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_S 0
+#define GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_M ICE_M(0x3FFFFFF, 0)
+#define GLHMC_PEQ1MAX 0x00522054 /* Reset Source: CORER */
+#define GLHMC_PEQ1MAX_PMPEQ1MAX_S 0
+#define GLHMC_PEQ1MAX_PMPEQ1MAX_M ICE_M(0xFFFFFFF, 0)
+#define GLHMC_PEQ1OBJSZ 0x00522050 /* Reset Source: CORER */
+#define GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_S 0
+#define GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PEQPBASE(_i) (0x00524000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEQPBASE_MAX_INDEX 7
+#define GLHMC_PEQPBASE_FPMPEQPBASE_S 0
+#define GLHMC_PEQPBASE_FPMPEQPBASE_M ICE_M(0xFFFFFF, 0)
+#define GLHMC_PEQPCNT(_i) (0x00524100 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PEQPCNT_MAX_INDEX 7
+#define GLHMC_PEQPCNT_FPMPEQPCNT_S 0
+#define GLHMC_PEQPCNT_FPMPEQPCNT_M ICE_M(0x1FFFFFFF, 0)
+#define GLHMC_PEQPOBJSZ 0x0052201C /* Reset Source: CORER */
+#define GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_S 0
+#define GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_M ICE_M(0xF, 0)
+#define GLHMC_PERRFBASE(_i) (0x00526800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLHMC_PERRFBASE_MAX_INDEX 7
+#define GLHMC_PERRFBASE_GLHMC_PERRFBASE_S 0
+#define GLHMC_PERRFBASE_GLHMC_PERRFBASE_M ICE_M(0xFFFFFFFF, 0)
+#define GLHMC_PERRFCNT(_i) (0x00526900 + ((_i) * 4)) /* _i=0...7 */
/* Reset Source: CORER */ +#define GLHMC_PERRFCNT_MAX_INDEX 7 +#define GLHMC_PERRFCNT_GLHMC_PERRFCNT_S 0 +#define GLHMC_PERRFCNT_GLHMC_PERRFCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PERRFFLBASE(_i) (0x00526A00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PERRFFLBASE_MAX_INDEX 7 +#define GLHMC_PERRFFLBASE_GLHMC_PERRFFLBASE_S 0 +#define GLHMC_PERRFFLBASE_GLHMC_PERRFFLBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_PERRFFLCNT_PMAT(_i) (0x00526B00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PERRFFLCNT_PMAT_MAX_INDEX 7 +#define GLHMC_PERRFFLCNT_PMAT_FPMPERRFFLCNT_S 0 +#define GLHMC_PERRFFLCNT_PMAT_FPMPERRFFLCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PERRFFLMAX 0x005220A0 /* Reset Source: CORER */ +#define GLHMC_PERRFFLMAX_PMPERRFFLMAX_S 0 +#define GLHMC_PERRFFLMAX_PMPERRFFLMAX_M ICE_M(0x3FFFFFF, 0) +#define GLHMC_PERRFFLMAX_RSVD_S 26 +#define GLHMC_PERRFFLMAX_RSVD_M ICE_M(0x3F, 26) +#define GLHMC_PERRFMAX 0x0052209C /* Reset Source: CORER */ +#define GLHMC_PERRFMAX_PMPERRFMAX_S 0 +#define GLHMC_PERRFMAX_PMPERRFMAX_M ICE_M(0xFFFFFFF, 0) +#define GLHMC_PERRFMAX_RSVD_S 28 +#define GLHMC_PERRFMAX_RSVD_M ICE_M(0xF, 28) +#define GLHMC_PERRFOBJSZ 0x00522098 /* Reset Source: CORER */ +#define GLHMC_PERRFOBJSZ_PMPERRFOBJSZ_S 0 +#define GLHMC_PERRFOBJSZ_PMPERRFOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PERRFOBJSZ_RSVD_S 4 +#define GLHMC_PERRFOBJSZ_RSVD_M ICE_M(0xFFFFFFF, 4) +#define GLHMC_PETIMERBASE(_i) (0x00525A00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PETIMERBASE_MAX_INDEX 7 +#define GLHMC_PETIMERBASE_FPMPETIMERBASE_S 0 +#define GLHMC_PETIMERBASE_FPMPETIMERBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PETIMERCNT(_i) (0x00525B00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PETIMERCNT_MAX_INDEX 7 +#define GLHMC_PETIMERCNT_FPMPETIMERCNT_S 0 +#define GLHMC_PETIMERCNT_FPMPETIMERCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PETIMERMAX 0x00522084 /* Reset Source: CORER */ +#define GLHMC_PETIMERMAX_PMPETIMERMAX_S 0 +#define GLHMC_PETIMERMAX_PMPETIMERMAX_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PETIMEROBJSZ 0x00522080 /* Reset Source: CORER */ +#define GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_S 0 +#define GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PEXFBASE(_i) (0x00524E00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEXFBASE_MAX_INDEX 7 +#define GLHMC_PEXFBASE_FPMPEXFBASE_S 0 +#define GLHMC_PEXFBASE_FPMPEXFBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEXFCNT(_i) (0x00524F00 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEXFCNT_MAX_INDEX 7 +#define GLHMC_PEXFCNT_FPMPEXFCNT_S 0 +#define GLHMC_PEXFCNT_FPMPEXFCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_PEXFFLBASE(_i) (0x00525000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PEXFFLBASE_MAX_INDEX 7 +#define GLHMC_PEXFFLBASE_FPMPEXFFLBASE_S 0 +#define GLHMC_PEXFFLBASE_FPMPEXFFLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_PEXFFLMAX 0x0052204C /* Reset Source: CORER */ +#define GLHMC_PEXFFLMAX_PMPEXFFLMAX_S 0 +#define GLHMC_PEXFFLMAX_PMPEXFFLMAX_M ICE_M(0xFFFFFFF, 0) +#define GLHMC_PEXFMAX 0x00522048 /* Reset Source: CORER */ +#define GLHMC_PEXFMAX_PMPEXFMAX_S 0 +#define GLHMC_PEXFMAX_PMPEXFMAX_M ICE_M(0xFFFFFFF, 0) +#define GLHMC_PEXFOBJSZ 0x00522044 /* Reset Source: CORER */ +#define GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_S 0 +#define GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_M ICE_M(0xF, 0) +#define GLHMC_PFPESDPART(_i) (0x00520880 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PFPESDPART_MAX_INDEX 7 +#define 
GLHMC_PFPESDPART_PMSDBASE_S 0 +#define GLHMC_PFPESDPART_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_PFPESDPART_PMSDSIZE_S 16 +#define GLHMC_PFPESDPART_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_PFPESDPART_FPMAT(_i) (0x00100880 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_PFPESDPART_FPMAT_MAX_INDEX 7 +#define GLHMC_PFPESDPART_FPMAT_PMSDBASE_S 0 +#define GLHMC_PFPESDPART_FPMAT_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_PFPESDPART_FPMAT_PMSDSIZE_S 16 +#define GLHMC_PFPESDPART_FPMAT_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_SDPART(_i) (0x00520800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_SDPART_MAX_INDEX 7 +#define GLHMC_SDPART_PMSDBASE_S 0 +#define GLHMC_SDPART_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_SDPART_PMSDSIZE_S 16 +#define GLHMC_SDPART_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_SDPART_FPMAT(_i) (0x00100800 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLHMC_SDPART_FPMAT_MAX_INDEX 7 +#define GLHMC_SDPART_FPMAT_PMSDBASE_S 0 +#define GLHMC_SDPART_FPMAT_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_SDPART_FPMAT_PMSDSIZE_S 16 +#define GLHMC_SDPART_FPMAT_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_VFAPBVTINUSEBASE(_i) (0x0052CA00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFAPBVTINUSEBASE_MAX_INDEX 31 +#define GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_S 0 +#define GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFCEQPART(_i) (0x00502F00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFCEQPART_MAX_INDEX 31 +#define GLHMC_VFCEQPART_PMCEQBASE_S 0 +#define GLHMC_VFCEQPART_PMCEQBASE_M ICE_M(0x3FF, 0) +#define GLHMC_VFCEQPART_PMCEQSIZE_S 16 +#define GLHMC_VFCEQPART_PMCEQSIZE_M ICE_M(0x3FF, 16) +#define GLHMC_VFDBCQPART(_i) (0x00502E00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFDBCQPART_MAX_INDEX 31 +#define GLHMC_VFDBCQPART_PMDBCQBASE_S 0 +#define GLHMC_VFDBCQPART_PMDBCQBASE_M ICE_M(0x3FFF, 0) +#define GLHMC_VFDBCQPART_PMDBCQSIZE_S 16 +#define GLHMC_VFDBCQPART_PMDBCQSIZE_M ICE_M(0x7FFF, 16) +#define GLHMC_VFDBQPPART(_i) (0x00504520 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFDBQPPART_MAX_INDEX 31 +#define GLHMC_VFDBQPPART_PMDBQPBASE_S 0 +#define GLHMC_VFDBQPPART_PMDBQPBASE_M ICE_M(0x3FFF, 0) +#define GLHMC_VFDBQPPART_PMDBQPSIZE_S 16 +#define GLHMC_VFDBQPPART_PMDBQPSIZE_M ICE_M(0x7FFF, 16) +#define GLHMC_VFFSIAVBASE(_i) (0x0052D600 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFFSIAVBASE_MAX_INDEX 31 +#define GLHMC_VFFSIAVBASE_FPMFSIAVBASE_S 0 +#define GLHMC_VFFSIAVBASE_FPMFSIAVBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFFSIAVCNT(_i) (0x0052D700 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFFSIAVCNT_MAX_INDEX 31 +#define GLHMC_VFFSIAVCNT_FPMFSIAVCNT_S 0 +#define GLHMC_VFFSIAVCNT_FPMFSIAVCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFFSIMCBASE(_i) (0x0052E000 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFFSIMCBASE_MAX_INDEX 31 +#define GLHMC_VFFSIMCBASE_FPMFSIMCBASE_S 0 +#define GLHMC_VFFSIMCBASE_FPMFSIMCBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFFSIMCCNT(_i) (0x0052E100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFFSIMCCNT_MAX_INDEX 31 +#define GLHMC_VFFSIMCCNT_FPMFSIMCSZ_S 0 +#define GLHMC_VFFSIMCCNT_FPMFSIMCSZ_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPDINV(_i) (0x00528300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPDINV_MAX_INDEX 31 +#define GLHMC_VFPDINV_PMSDIDX_S 
0 +#define GLHMC_VFPDINV_PMSDIDX_M ICE_M(0xFFF, 0) +#define GLHMC_VFPDINV_PMSDPARTSEL_S 15 +#define GLHMC_VFPDINV_PMSDPARTSEL_M BIT(15) +#define GLHMC_VFPDINV_PMPDIDX_S 16 +#define GLHMC_VFPDINV_PMPDIDX_M ICE_M(0x1FF, 16) +#define GLHMC_VFPDINV_FPMAT(_i) (0x00108300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPDINV_FPMAT_MAX_INDEX 31 +#define GLHMC_VFPDINV_FPMAT_PMSDIDX_S 0 +#define GLHMC_VFPDINV_FPMAT_PMSDIDX_M ICE_M(0xFFF, 0) +#define GLHMC_VFPDINV_FPMAT_PMSDPARTSEL_S 15 +#define GLHMC_VFPDINV_FPMAT_PMSDPARTSEL_M BIT(15) +#define GLHMC_VFPDINV_FPMAT_PMPDIDX_S 16 +#define GLHMC_VFPDINV_FPMAT_PMPDIDX_M ICE_M(0x1FF, 16) +#define GLHMC_VFPEARPBASE(_i) (0x0052C800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEARPBASE_MAX_INDEX 31 +#define GLHMC_VFPEARPBASE_FPMPEARPBASE_S 0 +#define GLHMC_VFPEARPBASE_FPMPEARPBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEARPCNT(_i) (0x0052C900 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEARPCNT_MAX_INDEX 31 +#define GLHMC_VFPEARPCNT_FPMPEARPCNT_S 0 +#define GLHMC_VFPEARPCNT_FPMPEARPCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPECQBASE(_i) (0x0052C200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPECQBASE_MAX_INDEX 31 +#define GLHMC_VFPECQBASE_FPMPECQBASE_S 0 +#define GLHMC_VFPECQBASE_FPMPECQBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPECQCNT(_i) (0x0052C300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPECQCNT_MAX_INDEX 31 +#define GLHMC_VFPECQCNT_FPMPECQCNT_S 0 +#define GLHMC_VFPECQCNT_FPMPECQCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEHDRBASE(_i) (0x0052E200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHDRBASE_MAX_INDEX 31 +#define GLHMC_VFPEHDRBASE_GLHMC_PEHDRBASE_S 0 +#define GLHMC_VFPEHDRBASE_GLHMC_PEHDRBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEHDRCNT(_i) (0x0052E300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHDRCNT_MAX_INDEX 31 +#define GLHMC_VFPEHDRCNT_GLHMC_PEHDRCNT_S 0 +#define GLHMC_VFPEHDRCNT_GLHMC_PEHDRCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEHTCNT(_i) (0x0052C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHTCNT_MAX_INDEX 31 +#define GLHMC_VFPEHTCNT_FPMPEHTCNT_S 0 +#define GLHMC_VFPEHTCNT_FPMPEHTCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEHTCNT_FPMAT(_i) (0x0010C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHTCNT_FPMAT_MAX_INDEX 31 +#define GLHMC_VFPEHTCNT_FPMAT_FPMPEHTCNT_S 0 +#define GLHMC_VFPEHTCNT_FPMAT_FPMPEHTCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEHTEBASE(_i) (0x0052C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHTEBASE_MAX_INDEX 31 +#define GLHMC_VFPEHTEBASE_FPMPEHTEBASE_S 0 +#define GLHMC_VFPEHTEBASE_FPMPEHTEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEHTEBASE_FPMAT(_i) (0x0010C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEHTEBASE_FPMAT_MAX_INDEX 31 +#define GLHMC_VFPEHTEBASE_FPMAT_FPMPEHTEBASE_S 0 +#define GLHMC_VFPEHTEBASE_FPMAT_FPMPEHTEBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEMDBASE(_i) (0x0052E400 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEMDBASE_MAX_INDEX 31 +#define GLHMC_VFPEMDBASE_GLHMC_PEMDBASE_S 0 +#define GLHMC_VFPEMDBASE_GLHMC_PEMDBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEMDCNT(_i) (0x0052E500 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEMDCNT_MAX_INDEX 31 +#define GLHMC_VFPEMDCNT_GLHMC_PEMDCNT_S 0 +#define 
GLHMC_VFPEMDCNT_GLHMC_PEMDCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEMRBASE(_i) (0x0052CC00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEMRBASE_MAX_INDEX 31 +#define GLHMC_VFPEMRBASE_FPMPEMRBASE_S 0 +#define GLHMC_VFPEMRBASE_FPMPEMRBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEMRCNT(_i) (0x0052CD00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEMRCNT_MAX_INDEX 31 +#define GLHMC_VFPEMRCNT_FPMPEMRSZ_S 0 +#define GLHMC_VFPEMRCNT_FPMPEMRSZ_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEOOISCBASE(_i) (0x0052E600 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEOOISCBASE_MAX_INDEX 31 +#define GLHMC_VFPEOOISCBASE_GLHMC_PEOOISCBASE_S 0 +#define GLHMC_VFPEOOISCBASE_GLHMC_PEOOISCBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEOOISCCNT(_i) (0x0052E700 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEOOISCCNT_MAX_INDEX 31 +#define GLHMC_VFPEOOISCCNT_GLHMC_PEOOISCCNT_S 0 +#define GLHMC_VFPEOOISCCNT_GLHMC_PEOOISCCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEOOISCFFLBASE(_i) (0x0052EC00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEOOISCFFLBASE_MAX_INDEX 31 +#define GLHMC_VFPEOOISCFFLBASE_GLHMC_PEOOISCFFLBASE_S 0 +#define GLHMC_VFPEOOISCFFLBASE_GLHMC_PEOOISCFFLBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPEPBLBASE(_i) (0x0052D800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEPBLBASE_MAX_INDEX 31 +#define GLHMC_VFPEPBLBASE_FPMPEPBLBASE_S 0 +#define GLHMC_VFPEPBLBASE_FPMPEPBLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEPBLCNT(_i) (0x0052D900 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEPBLCNT_MAX_INDEX 31 +#define GLHMC_VFPEPBLCNT_FPMPEPBLCNT_S 0 +#define GLHMC_VFPEPBLCNT_FPMPEPBLCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEQ1BASE(_i) (0x0052D200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQ1BASE_MAX_INDEX 31 +#define GLHMC_VFPEQ1BASE_FPMPEQ1BASE_S 0 +#define GLHMC_VFPEQ1BASE_FPMPEQ1BASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEQ1CNT(_i) (0x0052D300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQ1CNT_MAX_INDEX 31 +#define GLHMC_VFPEQ1CNT_FPMPEQ1CNT_S 0 +#define GLHMC_VFPEQ1CNT_FPMPEQ1CNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEQ1FLBASE(_i) (0x0052D400 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQ1FLBASE_MAX_INDEX 31 +#define GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_S 0 +#define GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEQPBASE(_i) (0x0052C000 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQPBASE_MAX_INDEX 31 +#define GLHMC_VFPEQPBASE_FPMPEQPBASE_S 0 +#define GLHMC_VFPEQPBASE_FPMPEQPBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEQPCNT(_i) (0x0052C100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEQPCNT_MAX_INDEX 31 +#define GLHMC_VFPEQPCNT_FPMPEQPCNT_S 0 +#define GLHMC_VFPEQPCNT_FPMPEQPCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPERRFBASE(_i) (0x0052E800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPERRFBASE_MAX_INDEX 31 +#define GLHMC_VFPERRFBASE_GLHMC_PERRFBASE_S 0 +#define GLHMC_VFPERRFBASE_GLHMC_PERRFBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPERRFCNT(_i) (0x0052E900 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPERRFCNT_MAX_INDEX 31 +#define GLHMC_VFPERRFCNT_GLHMC_PERRFCNT_S 0 +#define GLHMC_VFPERRFCNT_GLHMC_PERRFCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPERRFFLBASE(_i) (0x0052EA00 + 
((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPERRFFLBASE_MAX_INDEX 31 +#define GLHMC_VFPERRFFLBASE_GLHMC_PERRFFLBASE_S 0 +#define GLHMC_VFPERRFFLBASE_GLHMC_PERRFFLBASE_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFPETIMERBASE(_i) (0x0052DA00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPETIMERBASE_MAX_INDEX 31 +#define GLHMC_VFPETIMERBASE_FPMPETIMERBASE_S 0 +#define GLHMC_VFPETIMERBASE_FPMPETIMERBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPETIMERCNT(_i) (0x0052DB00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPETIMERCNT_MAX_INDEX 31 +#define GLHMC_VFPETIMERCNT_FPMPETIMERCNT_S 0 +#define GLHMC_VFPETIMERCNT_FPMPETIMERCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEXFBASE(_i) (0x0052CE00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEXFBASE_MAX_INDEX 31 +#define GLHMC_VFPEXFBASE_FPMPEXFBASE_S 0 +#define GLHMC_VFPEXFBASE_FPMPEXFBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFPEXFCNT(_i) (0x0052CF00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEXFCNT_MAX_INDEX 31 +#define GLHMC_VFPEXFCNT_FPMPEXFCNT_S 0 +#define GLHMC_VFPEXFCNT_FPMPEXFCNT_M ICE_M(0x1FFFFFFF, 0) +#define GLHMC_VFPEXFFLBASE(_i) (0x0052D000 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFPEXFFLBASE_MAX_INDEX 31 +#define GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_S 0 +#define GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_M ICE_M(0xFFFFFF, 0) +#define GLHMC_VFSDDATAHIGH(_i) (0x00528200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDDATAHIGH_MAX_INDEX 31 +#define GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_S 0 +#define GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFSDDATAHIGH_FPMAT(_i) (0x00108200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDDATAHIGH_FPMAT_MAX_INDEX 31 +#define GLHMC_VFSDDATAHIGH_FPMAT_PMSDDATAHIGH_S 0 +#define GLHMC_VFSDDATAHIGH_FPMAT_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define GLHMC_VFSDDATALOW(_i) (0x00528100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDDATALOW_MAX_INDEX 31 +#define GLHMC_VFSDDATALOW_PMSDVALID_S 0 +#define GLHMC_VFSDDATALOW_PMSDVALID_M BIT(0) +#define GLHMC_VFSDDATALOW_PMSDTYPE_S 1 +#define GLHMC_VFSDDATALOW_PMSDTYPE_M BIT(1) +#define GLHMC_VFSDDATALOW_PMSDBPCOUNT_S 2 +#define GLHMC_VFSDDATALOW_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define GLHMC_VFSDDATALOW_PMSDDATALOW_S 12 +#define GLHMC_VFSDDATALOW_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define GLHMC_VFSDDATALOW_FPMAT(_i) (0x00108100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDDATALOW_FPMAT_MAX_INDEX 31 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDVALID_S 0 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDVALID_M BIT(0) +#define GLHMC_VFSDDATALOW_FPMAT_PMSDTYPE_S 1 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDTYPE_M BIT(1) +#define GLHMC_VFSDDATALOW_FPMAT_PMSDBPCOUNT_S 2 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define GLHMC_VFSDDATALOW_FPMAT_PMSDDATALOW_S 12 +#define GLHMC_VFSDDATALOW_FPMAT_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define GLHMC_VFSDPART(_i) (0x00528800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDPART_MAX_INDEX 31 +#define GLHMC_VFSDPART_PMSDBASE_S 0 +#define GLHMC_VFSDPART_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_VFSDPART_PMSDSIZE_S 16 +#define GLHMC_VFSDPART_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLHMC_VFSDPART_FPMAT(_i) (0x00108800 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLHMC_VFSDPART_FPMAT_MAX_INDEX 31 +#define 
GLHMC_VFSDPART_FPMAT_PMSDBASE_S 0 +#define GLHMC_VFSDPART_FPMAT_PMSDBASE_M ICE_M(0xFFF, 0) +#define GLHMC_VFSDPART_FPMAT_PMSDSIZE_S 16 +#define GLHMC_VFSDPART_FPMAT_PMSDSIZE_M ICE_M(0x1FFF, 16) +#define GLMDOC_CACHESIZE 0x0051C06C /* Reset Source: CORER */ +#define GLMDOC_CACHESIZE_WORD_SIZE_S 0 +#define GLMDOC_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLMDOC_CACHESIZE_SETS_S 8 +#define GLMDOC_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLMDOC_CACHESIZE_WAYS_S 20 +#define GLMDOC_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPBLOC0_CACHESIZE 0x00518074 /* Reset Source: CORER */ +#define GLPBLOC0_CACHESIZE_WORD_SIZE_S 0 +#define GLPBLOC0_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPBLOC0_CACHESIZE_SETS_S 8 +#define GLPBLOC0_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPBLOC0_CACHESIZE_WAYS_S 20 +#define GLPBLOC0_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPBLOC1_CACHESIZE 0x0051A074 /* Reset Source: CORER */ +#define GLPBLOC1_CACHESIZE_WORD_SIZE_S 0 +#define GLPBLOC1_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPBLOC1_CACHESIZE_SETS_S 8 +#define GLPBLOC1_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPBLOC1_CACHESIZE_WAYS_S 20 +#define GLPBLOC1_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPDOC_CACHESIZE 0x00530048 /* Reset Source: CORER */ +#define GLPDOC_CACHESIZE_WORD_SIZE_S 0 +#define GLPDOC_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPDOC_CACHESIZE_SETS_S 8 +#define GLPDOC_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPDOC_CACHESIZE_WAYS_S 20 +#define GLPDOC_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPDOC_CACHESIZE_FPMAT 0x00110088 /* Reset Source: CORER */ +#define GLPDOC_CACHESIZE_FPMAT_WORD_SIZE_S 0 +#define GLPDOC_CACHESIZE_FPMAT_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPDOC_CACHESIZE_FPMAT_SETS_S 8 +#define GLPDOC_CACHESIZE_FPMAT_SETS_M ICE_M(0xFFF, 8) +#define GLPDOC_CACHESIZE_FPMAT_WAYS_S 20 +#define GLPDOC_CACHESIZE_FPMAT_WAYS_M ICE_M(0xF, 20) +#define GLPEOC0_CACHESIZE 0x005140A8 /* Reset Source: CORER */ +#define GLPEOC0_CACHESIZE_WORD_SIZE_S 0 +#define GLPEOC0_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPEOC0_CACHESIZE_SETS_S 8 +#define GLPEOC0_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPEOC0_CACHESIZE_WAYS_S 20 +#define GLPEOC0_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLPEOC1_CACHESIZE 0x005160A8 /* Reset Source: CORER */ +#define GLPEOC1_CACHESIZE_WORD_SIZE_S 0 +#define GLPEOC1_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLPEOC1_CACHESIZE_SETS_S 8 +#define GLPEOC1_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLPEOC1_CACHESIZE_WAYS_S 20 +#define GLPEOC1_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define PFHMC_ERRORDATA 0x00520500 /* Reset Source: PFR */ +#define PFHMC_ERRORDATA_HMC_ERROR_DATA_S 0 +#define PFHMC_ERRORDATA_HMC_ERROR_DATA_M ICE_M(0x3FFFFFFF, 0) +#define PFHMC_ERRORDATA_FPMAT 0x00100500 /* Reset Source: PFR */ +#define PFHMC_ERRORDATA_FPMAT_HMC_ERROR_DATA_S 0 +#define PFHMC_ERRORDATA_FPMAT_HMC_ERROR_DATA_M ICE_M(0x3FFFFFFF, 0) +#define PFHMC_ERRORINFO 0x00520400 /* Reset Source: PFR */ +#define PFHMC_ERRORINFO_PMF_INDEX_S 0 +#define PFHMC_ERRORINFO_PMF_INDEX_M ICE_M(0x1F, 0) +#define PFHMC_ERRORINFO_PMF_ISVF_S 7 +#define PFHMC_ERRORINFO_PMF_ISVF_M BIT(7) +#define PFHMC_ERRORINFO_HMC_ERROR_TYPE_S 8 +#define PFHMC_ERRORINFO_HMC_ERROR_TYPE_M ICE_M(0xF, 8) +#define PFHMC_ERRORINFO_HMC_OBJECT_TYPE_S 16 +#define PFHMC_ERRORINFO_HMC_OBJECT_TYPE_M ICE_M(0x1F, 16) +#define PFHMC_ERRORINFO_ERROR_DETECTED_S 31 +#define PFHMC_ERRORINFO_ERROR_DETECTED_M BIT(31) +#define PFHMC_ERRORINFO_FPMAT 0x00100400 /* Reset Source: PFR */ +#define PFHMC_ERRORINFO_FPMAT_PMF_INDEX_S 0 +#define 
PFHMC_ERRORINFO_FPMAT_PMF_INDEX_M ICE_M(0x1F, 0) +#define PFHMC_ERRORINFO_FPMAT_PMF_ISVF_S 7 +#define PFHMC_ERRORINFO_FPMAT_PMF_ISVF_M BIT(7) +#define PFHMC_ERRORINFO_FPMAT_HMC_ERROR_TYPE_S 8 +#define PFHMC_ERRORINFO_FPMAT_HMC_ERROR_TYPE_M ICE_M(0xF, 8) +#define PFHMC_ERRORINFO_FPMAT_HMC_OBJECT_TYPE_S 16 +#define PFHMC_ERRORINFO_FPMAT_HMC_OBJECT_TYPE_M ICE_M(0x1F, 16) +#define PFHMC_ERRORINFO_FPMAT_ERROR_DETECTED_S 31 +#define PFHMC_ERRORINFO_FPMAT_ERROR_DETECTED_M BIT(31) +#define PFHMC_PDINV 0x00520300 /* Reset Source: PFR */ +#define PFHMC_PDINV_PMSDIDX_S 0 +#define PFHMC_PDINV_PMSDIDX_M ICE_M(0xFFF, 0) +#define PFHMC_PDINV_PMSDPARTSEL_S 15 +#define PFHMC_PDINV_PMSDPARTSEL_M BIT(15) +#define PFHMC_PDINV_PMPDIDX_S 16 +#define PFHMC_PDINV_PMPDIDX_M ICE_M(0x1FF, 16) +#define PFHMC_PDINV_FPMAT 0x00100300 /* Reset Source: PFR */ +#define PFHMC_PDINV_FPMAT_PMSDIDX_S 0 +#define PFHMC_PDINV_FPMAT_PMSDIDX_M ICE_M(0xFFF, 0) +#define PFHMC_PDINV_FPMAT_PMSDPARTSEL_S 15 +#define PFHMC_PDINV_FPMAT_PMSDPARTSEL_M BIT(15) +#define PFHMC_PDINV_FPMAT_PMPDIDX_S 16 +#define PFHMC_PDINV_FPMAT_PMPDIDX_M ICE_M(0x1FF, 16) +#define PFHMC_SDCMD 0x00520000 /* Reset Source: PFR */ +#define PFHMC_SDCMD_PMSDIDX_S 0 +#define PFHMC_SDCMD_PMSDIDX_M ICE_M(0xFFF, 0) +#define PFHMC_SDCMD_PMSDPARTSEL_S 15 +#define PFHMC_SDCMD_PMSDPARTSEL_M BIT(15) +#define PFHMC_SDCMD_PMSDWR_S 31 +#define PFHMC_SDCMD_PMSDWR_M BIT(31) +#define PFHMC_SDCMD_FPMAT 0x00100000 /* Reset Source: PFR */ +#define PFHMC_SDCMD_FPMAT_PMSDIDX_S 0 +#define PFHMC_SDCMD_FPMAT_PMSDIDX_M ICE_M(0xFFF, 0) +#define PFHMC_SDCMD_FPMAT_PMSDPARTSEL_S 15 +#define PFHMC_SDCMD_FPMAT_PMSDPARTSEL_M BIT(15) +#define PFHMC_SDCMD_FPMAT_PMSDWR_S 31 +#define PFHMC_SDCMD_FPMAT_PMSDWR_M BIT(31) +#define PFHMC_SDDATAHIGH 0x00520200 /* Reset Source: PFR */ +#define PFHMC_SDDATAHIGH_PMSDDATAHIGH_S 0 +#define PFHMC_SDDATAHIGH_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define PFHMC_SDDATAHIGH_FPMAT 0x00100200 /* Reset Source: PFR */ +#define PFHMC_SDDATAHIGH_FPMAT_PMSDDATAHIGH_S 0 +#define PFHMC_SDDATAHIGH_FPMAT_PMSDDATAHIGH_M ICE_M(0xFFFFFFFF, 0) +#define PFHMC_SDDATALOW 0x00520100 /* Reset Source: PFR */ +#define PFHMC_SDDATALOW_PMSDVALID_S 0 +#define PFHMC_SDDATALOW_PMSDVALID_M BIT(0) +#define PFHMC_SDDATALOW_PMSDTYPE_S 1 +#define PFHMC_SDDATALOW_PMSDTYPE_M BIT(1) +#define PFHMC_SDDATALOW_PMSDBPCOUNT_S 2 +#define PFHMC_SDDATALOW_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define PFHMC_SDDATALOW_PMSDDATALOW_S 12 +#define PFHMC_SDDATALOW_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define PFHMC_SDDATALOW_FPMAT 0x00100100 /* Reset Source: PFR */ +#define PFHMC_SDDATALOW_FPMAT_PMSDVALID_S 0 +#define PFHMC_SDDATALOW_FPMAT_PMSDVALID_M BIT(0) +#define PFHMC_SDDATALOW_FPMAT_PMSDTYPE_S 1 +#define PFHMC_SDDATALOW_FPMAT_PMSDTYPE_M BIT(1) +#define PFHMC_SDDATALOW_FPMAT_PMSDBPCOUNT_S 2 +#define PFHMC_SDDATALOW_FPMAT_PMSDBPCOUNT_M ICE_M(0x3FF, 2) +#define PFHMC_SDDATALOW_FPMAT_PMSDDATALOW_S 12 +#define PFHMC_SDDATALOW_FPMAT_PMSDDATALOW_M ICE_M(0xFFFFF, 12) +#define GL_DSI_REPC 0x00294208 /* Reset Source: CORER */ +#define GL_DSI_REPC_NO_DESC_CNT_S 0 +#define GL_DSI_REPC_NO_DESC_CNT_M ICE_M(0xFFFF, 0) +#define GL_DSI_REPC_ERROR_CNT_S 16 +#define GL_DSI_REPC_ERROR_CNT_M ICE_M(0xFFFF, 16) +#define GL_MDCK_TDAT_TCLAN 0x000FC0DC /* Reset Source: CORER */ +#define GL_MDCK_TDAT_TCLAN_WRONG_ORDER_FORMAT_DESC_S 0 +#define GL_MDCK_TDAT_TCLAN_WRONG_ORDER_FORMAT_DESC_M BIT(0) +#define GL_MDCK_TDAT_TCLAN_UR_S 1 +#define GL_MDCK_TDAT_TCLAN_UR_M BIT(1) +#define GL_MDCK_TDAT_TCLAN_TAIL_DESC_NOT_DDESC_EOP_NOP_S 2 +#define 
GL_MDCK_TDAT_TCLAN_TAIL_DESC_NOT_DDESC_EOP_NOP_M BIT(2) +#define GL_MDCK_TDAT_TCLAN_FALSE_SCHEDULING_S 3 +#define GL_MDCK_TDAT_TCLAN_FALSE_SCHEDULING_M BIT(3) +#define GL_MDCK_TDAT_TCLAN_TAIL_VALUE_BIGGER_THAN_RING_LEN_S 4 +#define GL_MDCK_TDAT_TCLAN_TAIL_VALUE_BIGGER_THAN_RING_LEN_M BIT(4) +#define GL_MDCK_TDAT_TCLAN_MORE_THAN_8_DCMDS_IN_PKT_S 5 +#define GL_MDCK_TDAT_TCLAN_MORE_THAN_8_DCMDS_IN_PKT_M BIT(5) +#define GL_MDCK_TDAT_TCLAN_NO_HEAD_UPDATE_IN_QUANTA_S 6 +#define GL_MDCK_TDAT_TCLAN_NO_HEAD_UPDATE_IN_QUANTA_M BIT(6) +#define GL_MDCK_TDAT_TCLAN_PKT_LEN_NOT_LEGAL_S 7 +#define GL_MDCK_TDAT_TCLAN_PKT_LEN_NOT_LEGAL_M BIT(7) +#define GL_MDCK_TDAT_TCLAN_TSO_TLEN_NOT_COHERENT_WITH_SUM_BUFS_S 8 +#define GL_MDCK_TDAT_TCLAN_TSO_TLEN_NOT_COHERENT_WITH_SUM_BUFS_M BIT(8) +#define GL_MDCK_TDAT_TCLAN_TSO_TAIL_REACHED_BEFORE_TLEN_END_S 9 +#define GL_MDCK_TDAT_TCLAN_TSO_TAIL_REACHED_BEFORE_TLEN_END_M BIT(9) +#define GL_MDCK_TDAT_TCLAN_TSO_MORE_THAN_3_HDRS_S 10 +#define GL_MDCK_TDAT_TCLAN_TSO_MORE_THAN_3_HDRS_M BIT(10) +#define GL_MDCK_TDAT_TCLAN_TSO_SUM_BUFFS_LT_SUM_HDRS_S 11 +#define GL_MDCK_TDAT_TCLAN_TSO_SUM_BUFFS_LT_SUM_HDRS_M BIT(11) +#define GL_MDCK_TDAT_TCLAN_TSO_ZERO_MSS_TLEN_HDRS_S 12 +#define GL_MDCK_TDAT_TCLAN_TSO_ZERO_MSS_TLEN_HDRS_M BIT(12) +#define GL_MDCK_TDAT_TCLAN_TSO_CTX_DESC_IPSEC_S 13 +#define GL_MDCK_TDAT_TCLAN_TSO_CTX_DESC_IPSEC_M BIT(13) +#define GL_MDCK_TDAT_TCLAN_SSO_COMS_NOT_WHOLE_PKT_NUM_IN_QUANTA_S 14 +#define GL_MDCK_TDAT_TCLAN_SSO_COMS_NOT_WHOLE_PKT_NUM_IN_QUANTA_M BIT(14) +#define GL_MDCK_TDAT_TCLAN_COMS_QUANTA_BYTES_EXCEED_PKTLEN_X_64_S 15 +#define GL_MDCK_TDAT_TCLAN_COMS_QUANTA_BYTES_EXCEED_PKTLEN_X_64_M BIT(15) +#define GL_MDCK_TDAT_TCLAN_COMS_QUANTA_CMDS_EXCEED_S 16 +#define GL_MDCK_TDAT_TCLAN_COMS_QUANTA_CMDS_EXCEED_M BIT(16) +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_TSO_DESCS_LAST_LSO_QUANTA_S 17 +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_TSO_DESCS_LAST_LSO_QUANTA_M BIT(17) +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_TSO_DESCS_TLEN_S 18 +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_TSO_DESCS_TLEN_M BIT(18) +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_QUANTA_FINISHED_TOO_EARLY_S 19 +#define GL_MDCK_TDAT_TCLAN_TSO_COMS_QUANTA_FINISHED_TOO_EARLY_M BIT(19) +#define GL_MDCK_TDAT_TCLAN_COMS_NUM_PKTS_IN_QUANTA_S 20 +#define GL_MDCK_TDAT_TCLAN_COMS_NUM_PKTS_IN_QUANTA_M BIT(20) +#define GLCORE_CLKCTL_H 0x000B81E8 /* Reset Source: POR */ +#define GLCORE_CLKCTL_H_UPPER_CLK_SRC_H_S 0 +#define GLCORE_CLKCTL_H_UPPER_CLK_SRC_H_M ICE_M(0x3, 0) +#define GLCORE_CLKCTL_H_LOWER_CLK_SRC_H_S 2 +#define GLCORE_CLKCTL_H_LOWER_CLK_SRC_H_M ICE_M(0x3, 2) +#define GLCORE_CLKCTL_H_PSM_CLK_SRC_H_S 4 +#define GLCORE_CLKCTL_H_PSM_CLK_SRC_H_M ICE_M(0x3, 4) +#define GLCORE_CLKCTL_H_RXCTL_CLK_SRC_H_S 6 +#define GLCORE_CLKCTL_H_RXCTL_CLK_SRC_H_M ICE_M(0x3, 6) +#define GLCORE_CLKCTL_H_UANA_CLK_SRC_H_S 8 +#define GLCORE_CLKCTL_H_UANA_CLK_SRC_H_M ICE_M(0x7, 8) +#define GLCORE_CLKCTL_L 0x000B8254 /* Reset Source: POR */ +#define GLCORE_CLKCTL_L_UPPER_CLK_SRC_L_S 0 +#define GLCORE_CLKCTL_L_UPPER_CLK_SRC_L_M ICE_M(0x3, 0) +#define GLCORE_CLKCTL_L_LOWER_CLK_SRC_L_S 2 +#define GLCORE_CLKCTL_L_LOWER_CLK_SRC_L_M ICE_M(0x3, 2) +#define GLCORE_CLKCTL_L_PSM_CLK_SRC_L_S 4 +#define GLCORE_CLKCTL_L_PSM_CLK_SRC_L_M ICE_M(0x3, 4) +#define GLCORE_CLKCTL_L_RXCTL_CLK_SRC_L_S 6 +#define GLCORE_CLKCTL_L_RXCTL_CLK_SRC_L_M ICE_M(0x3, 6) +#define GLCORE_CLKCTL_L_UANA_CLK_SRC_L_S 8 +#define GLCORE_CLKCTL_L_UANA_CLK_SRC_L_M ICE_M(0x7, 8) +#define GLCORE_CLKCTL_M 0x000B8258 /* Reset Source: POR */ +#define GLCORE_CLKCTL_M_UPPER_CLK_SRC_M_S 0 +#define 
GLCORE_CLKCTL_M_UPPER_CLK_SRC_M_M ICE_M(0x3, 0) +#define GLCORE_CLKCTL_M_LOWER_CLK_SRC_M_S 2 +#define GLCORE_CLKCTL_M_LOWER_CLK_SRC_M_M ICE_M(0x3, 2) +#define GLCORE_CLKCTL_M_PSM_CLK_SRC_M_S 4 +#define GLCORE_CLKCTL_M_PSM_CLK_SRC_M_M ICE_M(0x3, 4) +#define GLCORE_CLKCTL_M_RXCTL_CLK_SRC_M_S 6 +#define GLCORE_CLKCTL_M_RXCTL_CLK_SRC_M_M ICE_M(0x3, 6) +#define GLCORE_CLKCTL_M_UANA_CLK_SRC_M_S 8 +#define GLCORE_CLKCTL_M_UANA_CLK_SRC_M_M ICE_M(0x7, 8) +#define GLFOC_CACHESIZE 0x000AA074 /* Reset Source: CORER */ +#define GLFOC_CACHESIZE_WORD_SIZE_S 0 +#define GLFOC_CACHESIZE_WORD_SIZE_M ICE_M(0xFF, 0) +#define GLFOC_CACHESIZE_SETS_S 8 +#define GLFOC_CACHESIZE_SETS_M ICE_M(0xFFF, 8) +#define GLFOC_CACHESIZE_WAYS_S 20 +#define GLFOC_CACHESIZE_WAYS_M ICE_M(0xF, 20) +#define GLMAC_CLKSTAT 0x000B8210 /* Reset Source: POR */ +#define GLMAC_CLKSTAT_P0_CLK_SPEED_S 0 +#define GLMAC_CLKSTAT_P0_CLK_SPEED_M ICE_M(0xF, 0) +#define GLMAC_CLKSTAT_P1_CLK_SPEED_S 4 +#define GLMAC_CLKSTAT_P1_CLK_SPEED_M ICE_M(0xF, 4) +#define GLMAC_CLKSTAT_P2_CLK_SPEED_S 8 +#define GLMAC_CLKSTAT_P2_CLK_SPEED_M ICE_M(0xF, 8) +#define GLMAC_CLKSTAT_P3_CLK_SPEED_S 12 +#define GLMAC_CLKSTAT_P3_CLK_SPEED_M ICE_M(0xF, 12) +#define GLMAC_CLKSTAT_P4_CLK_SPEED_S 16 +#define GLMAC_CLKSTAT_P4_CLK_SPEED_M ICE_M(0xF, 16) +#define GLMAC_CLKSTAT_P5_CLK_SPEED_S 20 +#define GLMAC_CLKSTAT_P5_CLK_SPEED_M ICE_M(0xF, 20) +#define GLMAC_CLKSTAT_P6_CLK_SPEED_S 24 +#define GLMAC_CLKSTAT_P6_CLK_SPEED_M ICE_M(0xF, 24) +#define GLMAC_CLKSTAT_P7_CLK_SPEED_S 28 +#define GLMAC_CLKSTAT_P7_CLK_SPEED_M ICE_M(0xF, 28) +#define GLTPB_100G_MAC_FC_THRESH 0x00099510 /* Reset Source: CORER */ +#define GLTPB_100G_MAC_FC_THRESH_PORT0_FC_THRESH_S 0 +#define GLTPB_100G_MAC_FC_THRESH_PORT0_FC_THRESH_M ICE_M(0xFFFF, 0) +#define GLTPB_100G_MAC_FC_THRESH_PORT1_FC_THRESH_S 16 +#define GLTPB_100G_MAC_FC_THRESH_PORT1_FC_THRESH_M ICE_M(0xFFFF, 16) +#define GLTPB_100G_RPB_FC_THRESH 0x0009963C /* Reset Source: CORER */ +#define GLTPB_100G_RPB_FC_THRESH_PORT0_FC_THRESH_S 0 +#define GLTPB_100G_RPB_FC_THRESH_PORT0_FC_THRESH_M ICE_M(0xFFFF, 0) +#define GLTPB_100G_RPB_FC_THRESH_PORT1_FC_THRESH_S 16 +#define GLTPB_100G_RPB_FC_THRESH_PORT1_FC_THRESH_M ICE_M(0xFFFF, 16) +#define GLTPB_PACING_10G 0x000994E4 /* Reset Source: CORER */ +#define GLTPB_PACING_10G_N_S 0 +#define GLTPB_PACING_10G_N_M ICE_M(0xFF, 0) +#define GLTPB_PACING_10G_K_S 8 +#define GLTPB_PACING_10G_K_M ICE_M(0xFF, 8) +#define GLTPB_PACING_10G_S_S 16 +#define GLTPB_PACING_10G_S_M ICE_M(0x1FF, 16) +#define GLTPB_PACING_25G 0x000994E0 /* Reset Source: CORER */ +#define GLTPB_PACING_25G_N_S 0 +#define GLTPB_PACING_25G_N_M ICE_M(0xFF, 0) +#define GLTPB_PACING_25G_K_S 8 +#define GLTPB_PACING_25G_K_M ICE_M(0xFF, 8) +#define GLTPB_PACING_25G_S_S 16 +#define GLTPB_PACING_25G_S_M ICE_M(0x1FF, 16) +#define GLTPB_PORT_PACING_SPEED 0x000994E8 /* Reset Source: CORER */ +#define GLTPB_PORT_PACING_SPEED_PORT0_SPEED_S 0 +#define GLTPB_PORT_PACING_SPEED_PORT0_SPEED_M BIT(0) +#define GLTPB_PORT_PACING_SPEED_PORT1_SPEED_S 1 +#define GLTPB_PORT_PACING_SPEED_PORT1_SPEED_M BIT(1) +#define GLTPB_PORT_PACING_SPEED_PORT2_SPEED_S 2 +#define GLTPB_PORT_PACING_SPEED_PORT2_SPEED_M BIT(2) +#define GLTPB_PORT_PACING_SPEED_PORT3_SPEED_S 3 +#define GLTPB_PORT_PACING_SPEED_PORT3_SPEED_M BIT(3) +#define GLTPB_PORT_PACING_SPEED_PORT4_SPEED_S 4 +#define GLTPB_PORT_PACING_SPEED_PORT4_SPEED_M BIT(4) +#define GLTPB_PORT_PACING_SPEED_PORT5_SPEED_S 5 +#define GLTPB_PORT_PACING_SPEED_PORT5_SPEED_M BIT(5) +#define GLTPB_PORT_PACING_SPEED_PORT6_SPEED_S 6 +#define 
GLTPB_PORT_PACING_SPEED_PORT6_SPEED_M BIT(6) +#define GLTPB_PORT_PACING_SPEED_PORT7_SPEED_S 7 +#define GLTPB_PORT_PACING_SPEED_PORT7_SPEED_M BIT(7) +#define TPB_CFG_SCHEDULED_BC_THRESHOLD 0x00099494 /* Reset Source: CORER */ +#define TPB_CFG_SCHEDULED_BC_THRESHOLD_THRESHOLD_S 0 +#define TPB_CFG_SCHEDULED_BC_THRESHOLD_THRESHOLD_M ICE_M(0x7FFF, 0) +#define GL_UFUSE_SOC 0x000A400C /* Reset Source: POR */ +#define GL_UFUSE_SOC_PORT_MODE_S 0 +#define GL_UFUSE_SOC_PORT_MODE_M ICE_M(0x3, 0) +#define GL_UFUSE_SOC_BANDWIDTH_S 2 +#define GL_UFUSE_SOC_BANDWIDTH_M ICE_M(0x3, 2) +#define GL_UFUSE_SOC_PE_DISABLE_S 4 +#define GL_UFUSE_SOC_PE_DISABLE_M BIT(4) +#define GL_UFUSE_SOC_SWITCH_MODE_S 5 +#define GL_UFUSE_SOC_SWITCH_MODE_M BIT(5) +#define GL_UFUSE_SOC_CSR_PROTECTION_ENABLE_S 6 +#define GL_UFUSE_SOC_CSR_PROTECTION_ENABLE_M BIT(6) +#define GL_UFUSE_SOC_SERIAL_50G_S 7 +#define GL_UFUSE_SOC_SERIAL_50G_M BIT(7) +#define GL_UFUSE_SOC_NIC_ID_S 8 +#define GL_UFUSE_SOC_NIC_ID_M BIT(8) +#define GL_UFUSE_SOC_BLOCK_BME_TO_FW_S 9 +#define GL_UFUSE_SOC_BLOCK_BME_TO_FW_M BIT(9) +#define GL_UFUSE_SOC_SOC_TYPE_S 10 +#define GL_UFUSE_SOC_SOC_TYPE_M BIT(10) +#define GL_UFUSE_SOC_BTS_MODE_S 11 +#define GL_UFUSE_SOC_BTS_MODE_M BIT(11) +#define GL_UFUSE_SOC_SPARE_FUSES_S 12 +#define GL_UFUSE_SOC_SPARE_FUSES_M ICE_M(0xF, 12) +#define EMPINT_GPIO_ENA 0x000880C0 /* Reset Source: POR */ +#define EMPINT_GPIO_ENA_GPIO0_ENA_S 0 +#define EMPINT_GPIO_ENA_GPIO0_ENA_M BIT(0) +#define EMPINT_GPIO_ENA_GPIO1_ENA_S 1 +#define EMPINT_GPIO_ENA_GPIO1_ENA_M BIT(1) +#define EMPINT_GPIO_ENA_GPIO2_ENA_S 2 +#define EMPINT_GPIO_ENA_GPIO2_ENA_M BIT(2) +#define EMPINT_GPIO_ENA_GPIO3_ENA_S 3 +#define EMPINT_GPIO_ENA_GPIO3_ENA_M BIT(3) +#define EMPINT_GPIO_ENA_GPIO4_ENA_S 4 +#define EMPINT_GPIO_ENA_GPIO4_ENA_M BIT(4) +#define EMPINT_GPIO_ENA_GPIO5_ENA_S 5 +#define EMPINT_GPIO_ENA_GPIO5_ENA_M BIT(5) +#define EMPINT_GPIO_ENA_GPIO6_ENA_S 6 +#define EMPINT_GPIO_ENA_GPIO6_ENA_M BIT(6) +#define GLGEN_MAC_LINK_TOPO 0x000B81DC /* Reset Source: GLOBR */ +#define GLGEN_MAC_LINK_TOPO_LINK_TOPO_S 0 +#define GLGEN_MAC_LINK_TOPO_LINK_TOPO_M ICE_M(0x3, 0) +#define GLINT_CEQCTL(_INT) (0x0015C000 + ((_INT) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLINT_CEQCTL_MAX_INDEX 2047 +#define GLINT_CEQCTL_MSIX_INDX_S 0 +#define GLINT_CEQCTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define GLINT_CEQCTL_ITR_INDX_S 11 +#define GLINT_CEQCTL_ITR_INDX_M ICE_M(0x3, 11) +#define GLINT_CEQCTL_CAUSE_ENA_S 30 +#define GLINT_CEQCTL_CAUSE_ENA_M BIT(30) +#define GLINT_CEQCTL_INTEVENT_S 31 +#define GLINT_CEQCTL_INTEVENT_M BIT(31) +#define GLINT_CTL 0x0016CC54 /* Reset Source: CORER */ +#define GLINT_CTL_DIS_AUTOMASK_S 0 #define GLINT_CTL_DIS_AUTOMASK_M BIT(0) +#define GLINT_CTL_RSVD_S 1 +#define GLINT_CTL_RSVD_M ICE_M(0x7FFF, 1) #define GLINT_CTL_ITR_GRAN_200_S 16 #define GLINT_CTL_ITR_GRAN_200_M ICE_M(0xF, 16) #define GLINT_CTL_ITR_GRAN_100_S 20 @@ -124,106 +4491,868 @@ #define GLINT_CTL_ITR_GRAN_50_M ICE_M(0xF, 24) #define GLINT_CTL_ITR_GRAN_25_S 28 #define GLINT_CTL_ITR_GRAN_25_M ICE_M(0xF, 28) -#define GLINT_DYN_CTL(_INT) (0x00160000 + ((_INT) * 4)) +#define GLINT_DYN_CTL(_INT) (0x00160000 + ((_INT) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLINT_DYN_CTL_MAX_INDEX 2047 +#define GLINT_DYN_CTL_INTENA_S 0 #define GLINT_DYN_CTL_INTENA_M BIT(0) +#define GLINT_DYN_CTL_CLEARPBA_S 1 #define GLINT_DYN_CTL_CLEARPBA_M BIT(1) +#define GLINT_DYN_CTL_SWINT_TRIG_S 2 #define GLINT_DYN_CTL_SWINT_TRIG_M BIT(2) #define GLINT_DYN_CTL_ITR_INDX_S 3 #define GLINT_DYN_CTL_ITR_INDX_M 
ICE_M(0x3, 3) #define GLINT_DYN_CTL_INTERVAL_S 5 #define GLINT_DYN_CTL_INTERVAL_M ICE_M(0xFFF, 5) +#define GLINT_DYN_CTL_SW_ITR_INDX_ENA_S 24 +#define GLINT_DYN_CTL_SW_ITR_INDX_ENA_M BIT(24) +#define GLINT_DYN_CTL_SW_ITR_INDX_S 25 #define GLINT_DYN_CTL_SW_ITR_INDX_M ICE_M(0x3, 25) +#define GLINT_DYN_CTL_WB_ON_ITR_S 30 #define GLINT_DYN_CTL_WB_ON_ITR_M BIT(30) +#define GLINT_DYN_CTL_INTENA_MSK_S 31 #define GLINT_DYN_CTL_INTENA_MSK_M BIT(31) -#define GLINT_ITR(_i, _INT) (0x00154000 + ((_i) * 8192 + (_INT) * 4)) -#define GLINT_RATE(_INT) (0x0015A000 + ((_INT) * 4)) +#define GLINT_FW_TOOL_CTL 0x0016C840 /* Reset Source: CORER */ +#define GLINT_FW_TOOL_CTL_MSIX_INDX_S 0 +#define GLINT_FW_TOOL_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define GLINT_FW_TOOL_CTL_ITR_INDX_S 11 +#define GLINT_FW_TOOL_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define GLINT_FW_TOOL_CTL_CAUSE_ENA_S 30 +#define GLINT_FW_TOOL_CTL_CAUSE_ENA_M BIT(30) +#define GLINT_FW_TOOL_CTL_INTEVENT_S 31 +#define GLINT_FW_TOOL_CTL_INTEVENT_M BIT(31) +#define GLINT_ITR(_i, _INT) (0x00154000 + ((_i) * 8192 + (_INT) * 4)) /* _i=0...2, _INT=0...2047 */ /* Reset Source: CORER */ +#define GLINT_ITR_MAX_INDEX 2 +#define GLINT_ITR_INTERVAL_S 0 +#define GLINT_ITR_INTERVAL_M ICE_M(0xFFF, 0) +#define GLINT_RATE(_INT) (0x0015A000 + ((_INT) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLINT_RATE_MAX_INDEX 2047 +#define GLINT_RATE_INTERVAL_S 0 +#define GLINT_RATE_INTERVAL_M ICE_M(0x3F, 0) +#define GLINT_RATE_INTRL_ENA_S 6 #define GLINT_RATE_INTRL_ENA_M BIT(6) -#define GLINT_VECT2FUNC(_INT) (0x00162000 + ((_INT) * 4)) +#define GLINT_TSYN_PFMSTR(_i) (0x0016CCC0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLINT_TSYN_PFMSTR_MAX_INDEX 1 +#define GLINT_TSYN_PFMSTR_PF_MASTER_S 0 +#define GLINT_TSYN_PFMSTR_PF_MASTER_M ICE_M(0x7, 0) +#define GLINT_TSYN_PHY 0x0016CC50 /* Reset Source: CORER */ +#define GLINT_TSYN_PHY_PHY_INDX_S 0 +#define GLINT_TSYN_PHY_PHY_INDX_M ICE_M(0x1F, 0) +#define GLINT_VECT2FUNC(_INT) (0x00162000 + ((_INT) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLINT_VECT2FUNC_MAX_INDEX 2047 #define GLINT_VECT2FUNC_VF_NUM_S 0 #define GLINT_VECT2FUNC_VF_NUM_M ICE_M(0xFF, 0) #define GLINT_VECT2FUNC_PF_NUM_S 12 #define GLINT_VECT2FUNC_PF_NUM_M ICE_M(0x7, 12) #define GLINT_VECT2FUNC_IS_PF_S 16 #define GLINT_VECT2FUNC_IS_PF_M BIT(16) -#define PFINT_FW_CTL 0x0016C800 +#define PF0INT_FW_HLP_CTL 0x0016C844 /* Reset Source: CORER */ +#define PF0INT_FW_HLP_CTL_MSIX_INDX_S 0 +#define PF0INT_FW_HLP_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_FW_HLP_CTL_ITR_INDX_S 11 +#define PF0INT_FW_HLP_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_FW_HLP_CTL_CAUSE_ENA_S 30 +#define PF0INT_FW_HLP_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_FW_HLP_CTL_INTEVENT_S 31 +#define PF0INT_FW_HLP_CTL_INTEVENT_M BIT(31) +#define PF0INT_FW_PSM_CTL 0x0016C848 /* Reset Source: CORER */ +#define PF0INT_FW_PSM_CTL_MSIX_INDX_S 0 +#define PF0INT_FW_PSM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_FW_PSM_CTL_ITR_INDX_S 11 +#define PF0INT_FW_PSM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_FW_PSM_CTL_CAUSE_ENA_S 30 +#define PF0INT_FW_PSM_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_FW_PSM_CTL_INTEVENT_S 31 +#define PF0INT_FW_PSM_CTL_INTEVENT_M BIT(31) +#define PF0INT_MBX_CPM_CTL 0x0016B2C0 /* Reset Source: CORER */ +#define PF0INT_MBX_CPM_CTL_MSIX_INDX_S 0 +#define PF0INT_MBX_CPM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_MBX_CPM_CTL_ITR_INDX_S 11 +#define PF0INT_MBX_CPM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_MBX_CPM_CTL_CAUSE_ENA_S 30 +#define 
PF0INT_MBX_CPM_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_MBX_CPM_CTL_INTEVENT_S 31 +#define PF0INT_MBX_CPM_CTL_INTEVENT_M BIT(31) +#define PF0INT_MBX_HLP_CTL 0x0016B2C4 /* Reset Source: CORER */ +#define PF0INT_MBX_HLP_CTL_MSIX_INDX_S 0 +#define PF0INT_MBX_HLP_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_MBX_HLP_CTL_ITR_INDX_S 11 +#define PF0INT_MBX_HLP_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_MBX_HLP_CTL_CAUSE_ENA_S 30 +#define PF0INT_MBX_HLP_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_MBX_HLP_CTL_INTEVENT_S 31 +#define PF0INT_MBX_HLP_CTL_INTEVENT_M BIT(31) +#define PF0INT_MBX_PSM_CTL 0x0016B2C8 /* Reset Source: CORER */ +#define PF0INT_MBX_PSM_CTL_MSIX_INDX_S 0 +#define PF0INT_MBX_PSM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_MBX_PSM_CTL_ITR_INDX_S 11 +#define PF0INT_MBX_PSM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_MBX_PSM_CTL_CAUSE_ENA_S 30 +#define PF0INT_MBX_PSM_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_MBX_PSM_CTL_INTEVENT_S 31 +#define PF0INT_MBX_PSM_CTL_INTEVENT_M BIT(31) +#define PF0INT_OICR_CPM 0x0016CC40 /* Reset Source: CORER */ +#define PF0INT_OICR_CPM_INTEVENT_S 0 +#define PF0INT_OICR_CPM_INTEVENT_M BIT(0) +#define PF0INT_OICR_CPM_QUEUE_S 1 +#define PF0INT_OICR_CPM_QUEUE_M BIT(1) +#define PF0INT_OICR_CPM_RSV1_S 2 +#define PF0INT_OICR_CPM_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_CPM_HH_COMP_S 10 +#define PF0INT_OICR_CPM_HH_COMP_M BIT(10) +#define PF0INT_OICR_CPM_TSYN_TX_S 11 +#define PF0INT_OICR_CPM_TSYN_TX_M BIT(11) +#define PF0INT_OICR_CPM_TSYN_EVNT_S 12 +#define PF0INT_OICR_CPM_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_CPM_TSYN_TGT_S 13 +#define PF0INT_OICR_CPM_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_CPM_HLP_RDY_S 14 +#define PF0INT_OICR_CPM_HLP_RDY_M BIT(14) +#define PF0INT_OICR_CPM_CPM_RDY_S 15 +#define PF0INT_OICR_CPM_CPM_RDY_M BIT(15) +#define PF0INT_OICR_CPM_ECC_ERR_S 16 +#define PF0INT_OICR_CPM_ECC_ERR_M BIT(16) +#define PF0INT_OICR_CPM_RSV2_S 17 +#define PF0INT_OICR_CPM_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_CPM_MAL_DETECT_S 19 +#define PF0INT_OICR_CPM_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_CPM_GRST_S 20 +#define PF0INT_OICR_CPM_GRST_M BIT(20) +#define PF0INT_OICR_CPM_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_CPM_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_CPM_GPIO_S 22 +#define PF0INT_OICR_CPM_GPIO_M BIT(22) +#define PF0INT_OICR_CPM_RSV3_S 23 +#define PF0INT_OICR_CPM_RSV3_M BIT(23) +#define PF0INT_OICR_CPM_STORM_DETECT_S 24 +#define PF0INT_OICR_CPM_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_CPM_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_CPM_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_CPM_HMC_ERR_S 26 +#define PF0INT_OICR_CPM_HMC_ERR_M BIT(26) +#define PF0INT_OICR_CPM_PE_PUSH_S 27 +#define PF0INT_OICR_CPM_PE_PUSH_M BIT(27) +#define PF0INT_OICR_CPM_PE_CRITERR_S 28 +#define PF0INT_OICR_CPM_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_CPM_VFLR_S 29 +#define PF0INT_OICR_CPM_VFLR_M BIT(29) +#define PF0INT_OICR_CPM_XLR_HW_DONE_S 30 +#define PF0INT_OICR_CPM_XLR_HW_DONE_M BIT(30) +#define PF0INT_OICR_CPM_SWINT_S 31 +#define PF0INT_OICR_CPM_SWINT_M BIT(31) +#define PF0INT_OICR_CTL_CPM 0x0016CC48 /* Reset Source: CORER */ +#define PF0INT_OICR_CTL_CPM_MSIX_INDX_S 0 +#define PF0INT_OICR_CTL_CPM_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_OICR_CTL_CPM_ITR_INDX_S 11 +#define PF0INT_OICR_CTL_CPM_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_OICR_CTL_CPM_CAUSE_ENA_S 30 +#define PF0INT_OICR_CTL_CPM_CAUSE_ENA_M BIT(30) +#define PF0INT_OICR_CTL_CPM_INTEVENT_S 31 +#define PF0INT_OICR_CTL_CPM_INTEVENT_M BIT(31) +#define PF0INT_OICR_CTL_HLP 0x0016CC5C /* Reset Source: 
CORER */ +#define PF0INT_OICR_CTL_HLP_MSIX_INDX_S 0 +#define PF0INT_OICR_CTL_HLP_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_OICR_CTL_HLP_ITR_INDX_S 11 +#define PF0INT_OICR_CTL_HLP_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_OICR_CTL_HLP_CAUSE_ENA_S 30 +#define PF0INT_OICR_CTL_HLP_CAUSE_ENA_M BIT(30) +#define PF0INT_OICR_CTL_HLP_INTEVENT_S 31 +#define PF0INT_OICR_CTL_HLP_INTEVENT_M BIT(31) +#define PF0INT_OICR_CTL_PSM 0x0016CC64 /* Reset Source: CORER */ +#define PF0INT_OICR_CTL_PSM_MSIX_INDX_S 0 +#define PF0INT_OICR_CTL_PSM_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_OICR_CTL_PSM_ITR_INDX_S 11 +#define PF0INT_OICR_CTL_PSM_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_OICR_CTL_PSM_CAUSE_ENA_S 30 +#define PF0INT_OICR_CTL_PSM_CAUSE_ENA_M BIT(30) +#define PF0INT_OICR_CTL_PSM_INTEVENT_S 31 +#define PF0INT_OICR_CTL_PSM_INTEVENT_M BIT(31) +#define PF0INT_OICR_ENA_CPM 0x0016CC60 /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_CPM_RSV0_S 0 +#define PF0INT_OICR_ENA_CPM_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_CPM_INT_ENA_S 1 +#define PF0INT_OICR_ENA_CPM_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_ENA_HLP 0x0016CC4C /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_HLP_RSV0_S 0 +#define PF0INT_OICR_ENA_HLP_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_HLP_INT_ENA_S 1 +#define PF0INT_OICR_ENA_HLP_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_ENA_PSM 0x0016CC58 /* Reset Source: CORER */ +#define PF0INT_OICR_ENA_PSM_RSV0_S 0 +#define PF0INT_OICR_ENA_PSM_RSV0_M BIT(0) +#define PF0INT_OICR_ENA_PSM_INT_ENA_S 1 +#define PF0INT_OICR_ENA_PSM_INT_ENA_M ICE_M(0x7FFFFFFF, 1) +#define PF0INT_OICR_HLP 0x0016CC68 /* Reset Source: CORER */ +#define PF0INT_OICR_HLP_INTEVENT_S 0 +#define PF0INT_OICR_HLP_INTEVENT_M BIT(0) +#define PF0INT_OICR_HLP_QUEUE_S 1 +#define PF0INT_OICR_HLP_QUEUE_M BIT(1) +#define PF0INT_OICR_HLP_RSV1_S 2 +#define PF0INT_OICR_HLP_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_HLP_HH_COMP_S 10 +#define PF0INT_OICR_HLP_HH_COMP_M BIT(10) +#define PF0INT_OICR_HLP_TSYN_TX_S 11 +#define PF0INT_OICR_HLP_TSYN_TX_M BIT(11) +#define PF0INT_OICR_HLP_TSYN_EVNT_S 12 +#define PF0INT_OICR_HLP_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_HLP_TSYN_TGT_S 13 +#define PF0INT_OICR_HLP_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_HLP_HLP_RDY_S 14 +#define PF0INT_OICR_HLP_HLP_RDY_M BIT(14) +#define PF0INT_OICR_HLP_CPM_RDY_S 15 +#define PF0INT_OICR_HLP_CPM_RDY_M BIT(15) +#define PF0INT_OICR_HLP_ECC_ERR_S 16 +#define PF0INT_OICR_HLP_ECC_ERR_M BIT(16) +#define PF0INT_OICR_HLP_RSV2_S 17 +#define PF0INT_OICR_HLP_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_HLP_MAL_DETECT_S 19 +#define PF0INT_OICR_HLP_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_HLP_GRST_S 20 +#define PF0INT_OICR_HLP_GRST_M BIT(20) +#define PF0INT_OICR_HLP_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_HLP_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_HLP_GPIO_S 22 +#define PF0INT_OICR_HLP_GPIO_M BIT(22) +#define PF0INT_OICR_HLP_RSV3_S 23 +#define PF0INT_OICR_HLP_RSV3_M BIT(23) +#define PF0INT_OICR_HLP_STORM_DETECT_S 24 +#define PF0INT_OICR_HLP_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_HLP_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_HLP_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_HLP_HMC_ERR_S 26 +#define PF0INT_OICR_HLP_HMC_ERR_M BIT(26) +#define PF0INT_OICR_HLP_PE_PUSH_S 27 +#define PF0INT_OICR_HLP_PE_PUSH_M BIT(27) +#define PF0INT_OICR_HLP_PE_CRITERR_S 28 +#define PF0INT_OICR_HLP_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_HLP_VFLR_S 29 +#define PF0INT_OICR_HLP_VFLR_M BIT(29) +#define PF0INT_OICR_HLP_XLR_HW_DONE_S 30 +#define PF0INT_OICR_HLP_XLR_HW_DONE_M BIT(30) +#define 
PF0INT_OICR_HLP_SWINT_S 31 +#define PF0INT_OICR_HLP_SWINT_M BIT(31) +#define PF0INT_OICR_PSM 0x0016CC44 /* Reset Source: CORER */ +#define PF0INT_OICR_PSM_INTEVENT_S 0 +#define PF0INT_OICR_PSM_INTEVENT_M BIT(0) +#define PF0INT_OICR_PSM_QUEUE_S 1 +#define PF0INT_OICR_PSM_QUEUE_M BIT(1) +#define PF0INT_OICR_PSM_RSV1_S 2 +#define PF0INT_OICR_PSM_RSV1_M ICE_M(0xFF, 2) +#define PF0INT_OICR_PSM_HH_COMP_S 10 +#define PF0INT_OICR_PSM_HH_COMP_M BIT(10) +#define PF0INT_OICR_PSM_TSYN_TX_S 11 +#define PF0INT_OICR_PSM_TSYN_TX_M BIT(11) +#define PF0INT_OICR_PSM_TSYN_EVNT_S 12 +#define PF0INT_OICR_PSM_TSYN_EVNT_M BIT(12) +#define PF0INT_OICR_PSM_TSYN_TGT_S 13 +#define PF0INT_OICR_PSM_TSYN_TGT_M BIT(13) +#define PF0INT_OICR_PSM_HLP_RDY_S 14 +#define PF0INT_OICR_PSM_HLP_RDY_M BIT(14) +#define PF0INT_OICR_PSM_CPM_RDY_S 15 +#define PF0INT_OICR_PSM_CPM_RDY_M BIT(15) +#define PF0INT_OICR_PSM_ECC_ERR_S 16 +#define PF0INT_OICR_PSM_ECC_ERR_M BIT(16) +#define PF0INT_OICR_PSM_RSV2_S 17 +#define PF0INT_OICR_PSM_RSV2_M ICE_M(0x3, 17) +#define PF0INT_OICR_PSM_MAL_DETECT_S 19 +#define PF0INT_OICR_PSM_MAL_DETECT_M BIT(19) +#define PF0INT_OICR_PSM_GRST_S 20 +#define PF0INT_OICR_PSM_GRST_M BIT(20) +#define PF0INT_OICR_PSM_PCI_EXCEPTION_S 21 +#define PF0INT_OICR_PSM_PCI_EXCEPTION_M BIT(21) +#define PF0INT_OICR_PSM_GPIO_S 22 +#define PF0INT_OICR_PSM_GPIO_M BIT(22) +#define PF0INT_OICR_PSM_RSV3_S 23 +#define PF0INT_OICR_PSM_RSV3_M BIT(23) +#define PF0INT_OICR_PSM_STORM_DETECT_S 24 +#define PF0INT_OICR_PSM_STORM_DETECT_M BIT(24) +#define PF0INT_OICR_PSM_LINK_STAT_CHANGE_S 25 +#define PF0INT_OICR_PSM_LINK_STAT_CHANGE_M BIT(25) +#define PF0INT_OICR_PSM_HMC_ERR_S 26 +#define PF0INT_OICR_PSM_HMC_ERR_M BIT(26) +#define PF0INT_OICR_PSM_PE_PUSH_S 27 +#define PF0INT_OICR_PSM_PE_PUSH_M BIT(27) +#define PF0INT_OICR_PSM_PE_CRITERR_S 28 +#define PF0INT_OICR_PSM_PE_CRITERR_M BIT(28) +#define PF0INT_OICR_PSM_VFLR_S 29 +#define PF0INT_OICR_PSM_VFLR_M BIT(29) +#define PF0INT_OICR_PSM_XLR_HW_DONE_S 30 +#define PF0INT_OICR_PSM_XLR_HW_DONE_M BIT(30) +#define PF0INT_OICR_PSM_SWINT_S 31 +#define PF0INT_OICR_PSM_SWINT_M BIT(31) +#define PF0INT_SB_CPM_CTL 0x0016B2CC /* Reset Source: CORER */ +#define PF0INT_SB_CPM_CTL_MSIX_INDX_S 0 +#define PF0INT_SB_CPM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_SB_CPM_CTL_ITR_INDX_S 11 +#define PF0INT_SB_CPM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_SB_CPM_CTL_CAUSE_ENA_S 30 +#define PF0INT_SB_CPM_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_SB_CPM_CTL_INTEVENT_S 31 +#define PF0INT_SB_CPM_CTL_INTEVENT_M BIT(31) +#define PF0INT_SB_HLP_CTL 0x0016B640 /* Reset Source: CORER */ +#define PF0INT_SB_HLP_CTL_MSIX_INDX_S 0 +#define PF0INT_SB_HLP_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PF0INT_SB_HLP_CTL_ITR_INDX_S 11 +#define PF0INT_SB_HLP_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define PF0INT_SB_HLP_CTL_CAUSE_ENA_S 30 +#define PF0INT_SB_HLP_CTL_CAUSE_ENA_M BIT(30) +#define PF0INT_SB_HLP_CTL_INTEVENT_S 31 +#define PF0INT_SB_HLP_CTL_INTEVENT_M BIT(31) +#define PFINT_AEQCTL 0x0016CB00 /* Reset Source: CORER */ +#define PFINT_AEQCTL_MSIX_INDX_S 0 +#define PFINT_AEQCTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define PFINT_AEQCTL_ITR_INDX_S 11 +#define PFINT_AEQCTL_ITR_INDX_M ICE_M(0x3, 11) +#define PFINT_AEQCTL_CAUSE_ENA_S 30 +#define PFINT_AEQCTL_CAUSE_ENA_M BIT(30) +#define PFINT_AEQCTL_INTEVENT_S 31 +#define PFINT_AEQCTL_INTEVENT_M BIT(31) +#define PFINT_ALLOC 0x001D2600 /* Reset Source: CORER */ +#define PFINT_ALLOC_FIRST_S 0 +#define PFINT_ALLOC_FIRST_M ICE_M(0x7FF, 0) +#define PFINT_ALLOC_LAST_S 12 +#define PFINT_ALLOC_LAST_M 
ICE_M(0x7FF, 12)
+#define PFINT_ALLOC_VALID_S 31
+#define PFINT_ALLOC_VALID_M BIT(31)
+#define PFINT_ALLOC_PCI 0x0009D800 /* Reset Source: PCIR */
+#define PFINT_ALLOC_PCI_FIRST_S 0
+#define PFINT_ALLOC_PCI_FIRST_M ICE_M(0x7FF, 0)
+#define PFINT_ALLOC_PCI_LAST_S 12
+#define PFINT_ALLOC_PCI_LAST_M ICE_M(0x7FF, 12)
+#define PFINT_ALLOC_PCI_VALID_S 31
+#define PFINT_ALLOC_PCI_VALID_M BIT(31)
+#define PFINT_FW_CTL 0x0016C800 /* Reset Source: CORER */
+#define PFINT_FW_CTL_MSIX_INDX_S 0
 #define PFINT_FW_CTL_MSIX_INDX_M ICE_M(0x7FF, 0)
 #define PFINT_FW_CTL_ITR_INDX_S 11
 #define PFINT_FW_CTL_ITR_INDX_M ICE_M(0x3, 11)
+#define PFINT_FW_CTL_CAUSE_ENA_S 30
 #define PFINT_FW_CTL_CAUSE_ENA_M BIT(30)
-#define PFINT_MBX_CTL 0x0016B280
+#define PFINT_FW_CTL_INTEVENT_S 31
+#define PFINT_FW_CTL_INTEVENT_M BIT(31)
+#define PFINT_GPIO_ENA 0x00088080 /* Reset Source: CORER */
+#define PFINT_GPIO_ENA_GPIO0_ENA_S 0
+#define PFINT_GPIO_ENA_GPIO0_ENA_M BIT(0)
+#define PFINT_GPIO_ENA_GPIO1_ENA_S 1
+#define PFINT_GPIO_ENA_GPIO1_ENA_M BIT(1)
+#define PFINT_GPIO_ENA_GPIO2_ENA_S 2
+#define PFINT_GPIO_ENA_GPIO2_ENA_M BIT(2)
+#define PFINT_GPIO_ENA_GPIO3_ENA_S 3
+#define PFINT_GPIO_ENA_GPIO3_ENA_M BIT(3)
+#define PFINT_GPIO_ENA_GPIO4_ENA_S 4
+#define PFINT_GPIO_ENA_GPIO4_ENA_M BIT(4)
+#define PFINT_GPIO_ENA_GPIO5_ENA_S 5
+#define PFINT_GPIO_ENA_GPIO5_ENA_M BIT(5)
+#define PFINT_GPIO_ENA_GPIO6_ENA_S 6
+#define PFINT_GPIO_ENA_GPIO6_ENA_M BIT(6)
+#define PFINT_MBX_CTL 0x0016B280 /* Reset Source: CORER */
+#define PFINT_MBX_CTL_MSIX_INDX_S 0
 #define PFINT_MBX_CTL_MSIX_INDX_M ICE_M(0x7FF, 0)
 #define PFINT_MBX_CTL_ITR_INDX_S 11
 #define PFINT_MBX_CTL_ITR_INDX_M ICE_M(0x3, 11)
+#define PFINT_MBX_CTL_CAUSE_ENA_S 30
 #define PFINT_MBX_CTL_CAUSE_ENA_M BIT(30)
-#define PFINT_OICR 0x0016CA00
+#define PFINT_MBX_CTL_INTEVENT_S 31
+#define PFINT_MBX_CTL_INTEVENT_M BIT(31)
+#define PFINT_OICR 0x0016CA00 /* Reset Source: CORER */
+#define PFINT_OICR_INTEVENT_S 0
+#define PFINT_OICR_INTEVENT_M BIT(0)
+#define PFINT_OICR_QUEUE_S 1
+#define PFINT_OICR_QUEUE_M BIT(1)
+#define PFINT_OICR_RSV1_S 2
+#define PFINT_OICR_RSV1_M ICE_M(0xFF, 2)
+#define PFINT_OICR_HH_COMP_S 10
+#define PFINT_OICR_HH_COMP_M BIT(10)
+#define PFINT_OICR_TSYN_TX_S 11
+#define PFINT_OICR_TSYN_TX_M BIT(11)
+#define PFINT_OICR_TSYN_EVNT_S 12
+#define PFINT_OICR_TSYN_EVNT_M BIT(12)
+#define PFINT_OICR_TSYN_TGT_S 13
+#define PFINT_OICR_TSYN_TGT_M BIT(13)
+#define PFINT_OICR_HLP_RDY_S 14
+#define PFINT_OICR_HLP_RDY_M BIT(14)
+#define PFINT_OICR_CPM_RDY_S 15
+#define PFINT_OICR_CPM_RDY_M BIT(15)
+#define PFINT_OICR_ECC_ERR_S 16
 #define PFINT_OICR_ECC_ERR_M BIT(16)
+#define PFINT_OICR_RSV2_S 17
+#define PFINT_OICR_RSV2_M ICE_M(0x3, 17)
+#define PFINT_OICR_MAL_DETECT_S 19
 #define PFINT_OICR_MAL_DETECT_M BIT(19)
+#define PFINT_OICR_GRST_S 20
 #define PFINT_OICR_GRST_M BIT(20)
+#define PFINT_OICR_PCI_EXCEPTION_S 21
 #define PFINT_OICR_PCI_EXCEPTION_M BIT(21)
+#define PFINT_OICR_GPIO_S 22
+#define PFINT_OICR_GPIO_M BIT(22)
+#define PFINT_OICR_RSV3_S 23
+#define PFINT_OICR_RSV3_M BIT(23)
+#define PFINT_OICR_STORM_DETECT_S 24
+#define PFINT_OICR_STORM_DETECT_M BIT(24)
+#define PFINT_OICR_LINK_STAT_CHANGE_S 25
+#define PFINT_OICR_LINK_STAT_CHANGE_M BIT(25)
+#define PFINT_OICR_HMC_ERR_S 26
 #define PFINT_OICR_HMC_ERR_M BIT(26)
+#define PFINT_OICR_PE_PUSH_S 27
+#define PFINT_OICR_PE_PUSH_M BIT(27)
+#define PFINT_OICR_PE_CRITERR_S 28
 #define PFINT_OICR_PE_CRITERR_M BIT(28)
+#define PFINT_OICR_VFLR_S 29
 #define PFINT_OICR_VFLR_M BIT(29)
+#define PFINT_OICR_XLR_HW_DONE_S 30
+#define PFINT_OICR_XLR_HW_DONE_M BIT(30)
+#define PFINT_OICR_SWINT_S 31
 #define PFINT_OICR_SWINT_M BIT(31)
-#define PFINT_OICR_CTL 0x0016CA80
+#define PFINT_OICR_CTL 0x0016CA80 /* Reset Source: CORER */
+#define PFINT_OICR_CTL_MSIX_INDX_S 0
 #define PFINT_OICR_CTL_MSIX_INDX_M ICE_M(0x7FF, 0)
 #define PFINT_OICR_CTL_ITR_INDX_S 11
 #define PFINT_OICR_CTL_ITR_INDX_M ICE_M(0x3, 11)
+#define PFINT_OICR_CTL_CAUSE_ENA_S 30
 #define PFINT_OICR_CTL_CAUSE_ENA_M BIT(30)
-#define PFINT_OICR_ENA 0x0016C900
-#define QINT_RQCTL(_QRX) (0x00150000 + ((_QRX) * 4))
+#define PFINT_OICR_CTL_INTEVENT_S 31
+#define PFINT_OICR_CTL_INTEVENT_M BIT(31)
+#define PFINT_OICR_ENA 0x0016C900 /* Reset Source: CORER */
+#define PFINT_OICR_ENA_RSV0_S 0
+#define PFINT_OICR_ENA_RSV0_M BIT(0)
+#define PFINT_OICR_ENA_INT_ENA_S 1
+#define PFINT_OICR_ENA_INT_ENA_M ICE_M(0x7FFFFFFF, 1)
+#define PFINT_SB_CTL 0x0016B600 /* Reset Source: CORER */
+#define PFINT_SB_CTL_MSIX_INDX_S 0
+#define PFINT_SB_CTL_MSIX_INDX_M ICE_M(0x7FF, 0)
+#define PFINT_SB_CTL_ITR_INDX_S 11
+#define PFINT_SB_CTL_ITR_INDX_M ICE_M(0x3, 11)
+#define PFINT_SB_CTL_CAUSE_ENA_S 30
+#define PFINT_SB_CTL_CAUSE_ENA_M BIT(30)
+#define PFINT_SB_CTL_INTEVENT_S 31
+#define PFINT_SB_CTL_INTEVENT_M BIT(31)
+#define PFINT_TSYN_MSK 0x0016C980 /* Reset Source: CORER */
+#define PFINT_TSYN_MSK_PHY_INDX_S 0
+#define PFINT_TSYN_MSK_PHY_INDX_M ICE_M(0x1F, 0)
+#define QINT_RQCTL(_QRX) (0x00150000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */
+#define QINT_RQCTL_MAX_INDEX 2047
 #define QINT_RQCTL_MSIX_INDX_S 0
 #define QINT_RQCTL_MSIX_INDX_M ICE_M(0x7FF, 0)
 #define QINT_RQCTL_ITR_INDX_S 11
 #define QINT_RQCTL_ITR_INDX_M ICE_M(0x3, 11)
+#define QINT_RQCTL_CAUSE_ENA_S 30
 #define QINT_RQCTL_CAUSE_ENA_M BIT(30)
-#define QINT_TQCTL(_DBQM) (0x00140000 + ((_DBQM) * 4))
+#define QINT_RQCTL_INTEVENT_S 31
+#define QINT_RQCTL_INTEVENT_M BIT(31)
+#define QINT_TQCTL(_DBQM) (0x00140000 + ((_DBQM) * 4)) /* _i=0...16383 */ /* Reset Source: CORER */
+#define QINT_TQCTL_MAX_INDEX 16383
 #define QINT_TQCTL_MSIX_INDX_S 0
 #define QINT_TQCTL_MSIX_INDX_M ICE_M(0x7FF, 0)
 #define QINT_TQCTL_ITR_INDX_S 11
 #define QINT_TQCTL_ITR_INDX_M ICE_M(0x3, 11)
+#define QINT_TQCTL_CAUSE_ENA_S 30
 #define QINT_TQCTL_CAUSE_ENA_M BIT(30)
-#define VPINT_ALLOC(_VF) (0x001D1000 + ((_VF) * 4))
+#define QINT_TQCTL_INTEVENT_S 31
+#define QINT_TQCTL_INTEVENT_M BIT(31)
+#define VPINT_AEQCTL(_VF) (0x0016B800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VPINT_AEQCTL_MAX_INDEX 255
+#define VPINT_AEQCTL_MSIX_INDX_S 0
+#define VPINT_AEQCTL_MSIX_INDX_M ICE_M(0x7FF, 0)
+#define VPINT_AEQCTL_ITR_INDX_S 11
+#define VPINT_AEQCTL_ITR_INDX_M ICE_M(0x3, 11)
+#define VPINT_AEQCTL_CAUSE_ENA_S 30
+#define VPINT_AEQCTL_CAUSE_ENA_M BIT(30)
+#define VPINT_AEQCTL_INTEVENT_S 31
+#define VPINT_AEQCTL_INTEVENT_M BIT(31)
+#define VPINT_ALLOC(_VF) (0x001D1000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VPINT_ALLOC_MAX_INDEX 255
 #define VPINT_ALLOC_FIRST_S 0
 #define VPINT_ALLOC_FIRST_M ICE_M(0x7FF, 0)
 #define VPINT_ALLOC_LAST_S 12
 #define VPINT_ALLOC_LAST_M ICE_M(0x7FF, 12)
+#define VPINT_ALLOC_VALID_S 31
 #define VPINT_ALLOC_VALID_M BIT(31)
-#define VPINT_ALLOC_PCI(_VF) (0x0009D000 + ((_VF) * 4))
+#define VPINT_ALLOC_PCI(_VF) (0x0009D000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PCIR */
+#define VPINT_ALLOC_PCI_MAX_INDEX 255
 #define VPINT_ALLOC_PCI_FIRST_S 0
 #define VPINT_ALLOC_PCI_FIRST_M ICE_M(0x7FF, 0)
 #define VPINT_ALLOC_PCI_LAST_S 12
 #define VPINT_ALLOC_PCI_LAST_M ICE_M(0x7FF, 12)
+#define
VPINT_ALLOC_PCI_VALID_S 31 #define VPINT_ALLOC_PCI_VALID_M BIT(31) -#define VPINT_MBX_CTL(_VSI) (0x0016A000 + ((_VSI) * 4)) +#define VPINT_MBX_CPM_CTL(_VP128) (0x0016B000 + ((_VP128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VPINT_MBX_CPM_CTL_MAX_INDEX 127 +#define VPINT_MBX_CPM_CTL_MSIX_INDX_S 0 +#define VPINT_MBX_CPM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_MBX_CPM_CTL_ITR_INDX_S 11 +#define VPINT_MBX_CPM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_MBX_CPM_CTL_CAUSE_ENA_S 30 +#define VPINT_MBX_CPM_CTL_CAUSE_ENA_M BIT(30) +#define VPINT_MBX_CPM_CTL_INTEVENT_S 31 +#define VPINT_MBX_CPM_CTL_INTEVENT_M BIT(31) +#define VPINT_MBX_CTL(_VSI) (0x0016A000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VPINT_MBX_CTL_MAX_INDEX 767 +#define VPINT_MBX_CTL_MSIX_INDX_S 0 +#define VPINT_MBX_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_MBX_CTL_ITR_INDX_S 11 +#define VPINT_MBX_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_MBX_CTL_CAUSE_ENA_S 30 #define VPINT_MBX_CTL_CAUSE_ENA_M BIT(30) -#define GLLAN_RCTL_0 0x002941F8 -#define QRX_CONTEXT(_i, _QRX) (0x00280000 + ((_i) * 8192 + (_QRX) * 4)) -#define QRX_CTRL(_QRX) (0x00120000 + ((_QRX) * 4)) +#define VPINT_MBX_CTL_INTEVENT_S 31 +#define VPINT_MBX_CTL_INTEVENT_M BIT(31) +#define VPINT_MBX_HLP_CTL(_VP16) (0x0016B200 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VPINT_MBX_HLP_CTL_MAX_INDEX 15 +#define VPINT_MBX_HLP_CTL_MSIX_INDX_S 0 +#define VPINT_MBX_HLP_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_MBX_HLP_CTL_ITR_INDX_S 11 +#define VPINT_MBX_HLP_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_MBX_HLP_CTL_CAUSE_ENA_S 30 +#define VPINT_MBX_HLP_CTL_CAUSE_ENA_M BIT(30) +#define VPINT_MBX_HLP_CTL_INTEVENT_S 31 +#define VPINT_MBX_HLP_CTL_INTEVENT_M BIT(31) +#define VPINT_MBX_PSM_CTL(_VP16) (0x0016B240 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VPINT_MBX_PSM_CTL_MAX_INDEX 15 +#define VPINT_MBX_PSM_CTL_MSIX_INDX_S 0 +#define VPINT_MBX_PSM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_MBX_PSM_CTL_ITR_INDX_S 11 +#define VPINT_MBX_PSM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_MBX_PSM_CTL_CAUSE_ENA_S 30 +#define VPINT_MBX_PSM_CTL_CAUSE_ENA_M BIT(30) +#define VPINT_MBX_PSM_CTL_INTEVENT_S 31 +#define VPINT_MBX_PSM_CTL_INTEVENT_M BIT(31) +#define VPINT_SB_CPM_CTL(_VP128) (0x0016B400 + ((_VP128) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define VPINT_SB_CPM_CTL_MAX_INDEX 127 +#define VPINT_SB_CPM_CTL_MSIX_INDX_S 0 +#define VPINT_SB_CPM_CTL_MSIX_INDX_M ICE_M(0x7FF, 0) +#define VPINT_SB_CPM_CTL_ITR_INDX_S 11 +#define VPINT_SB_CPM_CTL_ITR_INDX_M ICE_M(0x3, 11) +#define VPINT_SB_CPM_CTL_CAUSE_ENA_S 30 +#define VPINT_SB_CPM_CTL_CAUSE_ENA_M BIT(30) +#define VPINT_SB_CPM_CTL_INTEVENT_S 31 +#define VPINT_SB_CPM_CTL_INTEVENT_M BIT(31) +#define GL_HLP_PRT_IPG_PREAMBLE_SIZE(_i) (0x00049240 + ((_i) * 4)) /* _i=0...20 */ /* Reset Source: CORER */ +#define GL_HLP_PRT_IPG_PREAMBLE_SIZE_MAX_INDEX 20 +#define GL_HLP_PRT_IPG_PREAMBLE_SIZE_IPG_PREAMBLE_SIZE_S 0 +#define GL_HLP_PRT_IPG_PREAMBLE_SIZE_IPG_PREAMBLE_SIZE_M ICE_M(0xFF, 0) +#define GL_TDPU_PSM_DEFAULT_RECIPE(_i) (0x00049294 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GL_TDPU_PSM_DEFAULT_RECIPE_MAX_INDEX 3 +#define GL_TDPU_PSM_DEFAULT_RECIPE_ADD_IPG_S 0 +#define GL_TDPU_PSM_DEFAULT_RECIPE_ADD_IPG_M BIT(0) +#define GL_TDPU_PSM_DEFAULT_RECIPE_SUB_CRC_S 1 +#define GL_TDPU_PSM_DEFAULT_RECIPE_SUB_CRC_M BIT(1) +#define GL_TDPU_PSM_DEFAULT_RECIPE_SUB_ESP_TRAILER_S 2 +#define GL_TDPU_PSM_DEFAULT_RECIPE_SUB_ESP_TRAILER_M 
BIT(2) +#define GL_TDPU_PSM_DEFAULT_RECIPE_INCLUDE_L2_PAD_S 3 +#define GL_TDPU_PSM_DEFAULT_RECIPE_INCLUDE_L2_PAD_M BIT(3) +#define GL_TDPU_PSM_DEFAULT_RECIPE_DEFAULT_UPDATE_MODE_S 4 +#define GL_TDPU_PSM_DEFAULT_RECIPE_DEFAULT_UPDATE_MODE_M BIT(4) +#define GLLAN_PF_RECIPE(_i) (0x0029420C + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLLAN_PF_RECIPE_MAX_INDEX 7 +#define GLLAN_PF_RECIPE_RECIPE_S 0 +#define GLLAN_PF_RECIPE_RECIPE_M ICE_M(0x3, 0) +#define GLLAN_RCTL_0 0x002941F8 /* Reset Source: CORER */ +#define GLLAN_RCTL_0_PXE_MODE_S 0 +#define GLLAN_RCTL_0_PXE_MODE_M BIT(0) +#define GLLAN_RCTL_1 0x002941FC /* Reset Source: CORER */ +#define GLLAN_RCTL_1_RXMAX_EXPANSION_S 12 +#define GLLAN_RCTL_1_RXMAX_EXPANSION_M ICE_M(0xF, 12) +#define GLLAN_RCTL_1_RXDRDCTL_S 17 +#define GLLAN_RCTL_1_RXDRDCTL_M BIT(17) +#define GLLAN_RCTL_1_RXDESCRDROEN_S 18 +#define GLLAN_RCTL_1_RXDESCRDROEN_M BIT(18) +#define GLLAN_RCTL_1_RXDATAWRROEN_S 19 +#define GLLAN_RCTL_1_RXDATAWRROEN_M BIT(19) +#define GLLAN_TSOMSK_F 0x00049308 /* Reset Source: CORER */ +#define GLLAN_TSOMSK_F_TCPMSKF_S 0 +#define GLLAN_TSOMSK_F_TCPMSKF_M ICE_M(0xFFF, 0) +#define GLLAN_TSOMSK_L 0x00049310 /* Reset Source: CORER */ +#define GLLAN_TSOMSK_L_TCPMSKL_S 0 +#define GLLAN_TSOMSK_L_TCPMSKL_M ICE_M(0xFFF, 0) +#define GLLAN_TSOMSK_M 0x0004930C /* Reset Source: CORER */ +#define GLLAN_TSOMSK_M_TCPMSKM_S 0 +#define GLLAN_TSOMSK_M_TCPMSKM_M ICE_M(0xFFF, 0) +#define PFLAN_CP_QALLOC 0x00075700 /* Reset Source: CORER */ +#define PFLAN_CP_QALLOC_FIRSTQ_S 0 +#define PFLAN_CP_QALLOC_FIRSTQ_M ICE_M(0x1FF, 0) +#define PFLAN_CP_QALLOC_LASTQ_S 16 +#define PFLAN_CP_QALLOC_LASTQ_M ICE_M(0x1FF, 16) +#define PFLAN_CP_QALLOC_VALID_S 31 +#define PFLAN_CP_QALLOC_VALID_M BIT(31) +#define PFLAN_DB_QALLOC 0x00075680 /* Reset Source: CORER */ +#define PFLAN_DB_QALLOC_FIRSTQ_S 0 +#define PFLAN_DB_QALLOC_FIRSTQ_M ICE_M(0xFF, 0) +#define PFLAN_DB_QALLOC_LASTQ_S 16 +#define PFLAN_DB_QALLOC_LASTQ_M ICE_M(0xFF, 16) +#define PFLAN_DB_QALLOC_VALID_S 31 +#define PFLAN_DB_QALLOC_VALID_M BIT(31) +#define PFLAN_RX_QALLOC 0x001D2500 /* Reset Source: CORER */ +#define PFLAN_RX_QALLOC_FIRSTQ_S 0 +#define PFLAN_RX_QALLOC_FIRSTQ_M ICE_M(0x7FF, 0) +#define PFLAN_RX_QALLOC_LASTQ_S 16 +#define PFLAN_RX_QALLOC_LASTQ_M ICE_M(0x7FF, 16) +#define PFLAN_RX_QALLOC_VALID_S 31 +#define PFLAN_RX_QALLOC_VALID_M BIT(31) +#define PFLAN_TX_QALLOC 0x001D2580 /* Reset Source: CORER */ +#define PFLAN_TX_QALLOC_FIRSTQ_S 0 +#define PFLAN_TX_QALLOC_FIRSTQ_M ICE_M(0x3FFF, 0) +#define PFLAN_TX_QALLOC_LASTQ_S 16 +#define PFLAN_TX_QALLOC_LASTQ_M ICE_M(0x3FFF, 16) +#define PFLAN_TX_QALLOC_VALID_S 31 +#define PFLAN_TX_QALLOC_VALID_M BIT(31) +#define PRT_TDPUL2TAGSEN 0x00040BA0 /* Reset Source: CORER */ +#define PRT_TDPUL2TAGSEN_ENABLE_S 0 +#define PRT_TDPUL2TAGSEN_ENABLE_M ICE_M(0xFF, 0) +#define PRT_TDPUL2TAGSEN_NONLAST_TAG_S 8 +#define PRT_TDPUL2TAGSEN_NONLAST_TAG_M ICE_M(0xFF, 8) +#define QRX_CONTEXT(_i, _QRX) (0x00280000 + ((_i) * 8192 + (_QRX) * 4)) /* _i=0...7, _QRX=0...2047 */ /* Reset Source: CORER */ +#define QRX_CONTEXT_MAX_INDEX 7 +#define QRX_CONTEXT_RXQ_CONTEXT_S 0 +#define QRX_CONTEXT_RXQ_CONTEXT_M ICE_M(0xFFFFFFFF, 0) +#define QRX_CTRL(_QRX) (0x00120000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: PFR */ #define QRX_CTRL_MAX_INDEX 2047 #define QRX_CTRL_QENA_REQ_S 0 #define QRX_CTRL_QENA_REQ_M BIT(0) +#define QRX_CTRL_FAST_QDIS_S 1 +#define QRX_CTRL_FAST_QDIS_M BIT(1) #define QRX_CTRL_QENA_STAT_S 2 #define QRX_CTRL_QENA_STAT_M BIT(2) -#define QRX_ITR(_QRX) (0x00292000 + 
((_QRX) * 4)) -#define QRX_TAIL(_QRX) (0x00290000 + ((_QRX) * 4)) +#define QRX_CTRL_CDE_S 3 +#define QRX_CTRL_CDE_M BIT(3) +#define QRX_CTRL_CDS_S 4 +#define QRX_CTRL_CDS_M BIT(4) +#define QRX_ITR(_QRX) (0x00292000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define QRX_ITR_MAX_INDEX 2047 +#define QRX_ITR_NO_EXPR_S 0 +#define QRX_ITR_NO_EXPR_M BIT(0) +#define QRX_TAIL(_QRX) (0x00290000 + ((_QRX) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ #define QRX_TAIL_MAX_INDEX 2047 #define QRX_TAIL_TAIL_S 0 #define QRX_TAIL_TAIL_M ICE_M(0x1FFF, 0) -#define VPLAN_RX_QBASE(_VF) (0x00072000 + ((_VF) * 4)) +#define VPDSI_RX_QTABLE(_i, _VP16) (0x00074C00 + ((_i) * 64 + (_VP16) * 4)) /* _i=0...15, _VP16=0...15 */ /* Reset Source: CORER */ +#define VPDSI_RX_QTABLE_MAX_INDEX 15 +#define VPDSI_RX_QTABLE_PAGE_INDEX0_S 0 +#define VPDSI_RX_QTABLE_PAGE_INDEX0_M ICE_M(0x7F, 0) +#define VPDSI_RX_QTABLE_PAGE_INDEX1_S 8 +#define VPDSI_RX_QTABLE_PAGE_INDEX1_M ICE_M(0x7F, 8) +#define VPDSI_RX_QTABLE_PAGE_INDEX2_S 16 +#define VPDSI_RX_QTABLE_PAGE_INDEX2_M ICE_M(0x7F, 16) +#define VPDSI_RX_QTABLE_PAGE_INDEX3_S 24 +#define VPDSI_RX_QTABLE_PAGE_INDEX3_M ICE_M(0x7F, 24) +#define VPDSI_TX_QTABLE(_i, _VP16) (0x001D2000 + ((_i) * 64 + (_VP16) * 4)) /* _i=0...15, _VP16=0...15 */ /* Reset Source: CORER */ +#define VPDSI_TX_QTABLE_MAX_INDEX 15 +#define VPDSI_TX_QTABLE_PAGE_INDEX0_S 0 +#define VPDSI_TX_QTABLE_PAGE_INDEX0_M ICE_M(0x7F, 0) +#define VPDSI_TX_QTABLE_PAGE_INDEX1_S 8 +#define VPDSI_TX_QTABLE_PAGE_INDEX1_M ICE_M(0x7F, 8) +#define VPDSI_TX_QTABLE_PAGE_INDEX2_S 16 +#define VPDSI_TX_QTABLE_PAGE_INDEX2_M ICE_M(0x7F, 16) +#define VPDSI_TX_QTABLE_PAGE_INDEX3_S 24 +#define VPDSI_TX_QTABLE_PAGE_INDEX3_M ICE_M(0x7F, 24) +#define VPLAN_DB_QTABLE(_i, _VF) (0x00070000 + ((_i) * 2048 + (_VF) * 4)) /* _i=0...3, _VF=0...255 */ /* Reset Source: CORER */ +#define VPLAN_DB_QTABLE_MAX_INDEX 3 +#define VPLAN_DB_QTABLE_QINDEX_S 0 +#define VPLAN_DB_QTABLE_QINDEX_M ICE_M(0x1FF, 0) +#define VPLAN_DSI_VF_MODE(_VP16) (0x002D2C00 + ((_VP16) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define VPLAN_DSI_VF_MODE_MAX_INDEX 15 +#define VPLAN_DSI_VF_MODE_LAN_DSI_VF_MODE_S 0 +#define VPLAN_DSI_VF_MODE_LAN_DSI_VF_MODE_M BIT(0) +#define VPLAN_RX_QBASE(_VF) (0x00072000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPLAN_RX_QBASE_MAX_INDEX 255 #define VPLAN_RX_QBASE_VFFIRSTQ_S 0 #define VPLAN_RX_QBASE_VFFIRSTQ_M ICE_M(0x7FF, 0) #define VPLAN_RX_QBASE_VFNUMQ_S 16 #define VPLAN_RX_QBASE_VFNUMQ_M ICE_M(0xFF, 16) -#define VPLAN_RXQ_MAPENA(_VF) (0x00073000 + ((_VF) * 4)) +#define VPLAN_RX_QBASE_VFQTABLE_ENA_S 31 +#define VPLAN_RX_QBASE_VFQTABLE_ENA_M BIT(31) +#define VPLAN_RX_QTABLE(_i, _VF) (0x00060000 + ((_i) * 2048 + (_VF) * 4)) /* _i=0...15, _VF=0...255 */ /* Reset Source: CORER */ +#define VPLAN_RX_QTABLE_MAX_INDEX 15 +#define VPLAN_RX_QTABLE_QINDEX_S 0 +#define VPLAN_RX_QTABLE_QINDEX_M ICE_M(0xFFF, 0) +#define VPLAN_RXQ_MAPENA(_VF) (0x00073000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPLAN_RXQ_MAPENA_MAX_INDEX 255 +#define VPLAN_RXQ_MAPENA_RX_ENA_S 0 #define VPLAN_RXQ_MAPENA_RX_ENA_M BIT(0) -#define VPLAN_TX_QBASE(_VF) (0x001D1800 + ((_VF) * 4)) +#define VPLAN_TX_QBASE(_VF) (0x001D1800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPLAN_TX_QBASE_MAX_INDEX 255 #define VPLAN_TX_QBASE_VFFIRSTQ_S 0 #define VPLAN_TX_QBASE_VFFIRSTQ_M ICE_M(0x3FFF, 0) #define VPLAN_TX_QBASE_VFNUMQ_S 16 #define VPLAN_TX_QBASE_VFNUMQ_M ICE_M(0xFF, 16) -#define VPLAN_TXQ_MAPENA(_VF) 
(0x00073800 + ((_VF) * 4)) +#define VPLAN_TX_QBASE_VFQTABLE_ENA_S 31 +#define VPLAN_TX_QBASE_VFQTABLE_ENA_M BIT(31) +#define VPLAN_TX_QTABLE(_i, _VF) (0x001C0000 + ((_i) * 2048 + (_VF) * 4)) /* _i=0...15, _VF=0...255 */ /* Reset Source: CORER */ +#define VPLAN_TX_QTABLE_MAX_INDEX 15 +#define VPLAN_TX_QTABLE_QINDEX_S 0 +#define VPLAN_TX_QTABLE_QINDEX_M ICE_M(0x7FFF, 0) +#define VPLAN_TXQ_MAPENA(_VF) (0x00073800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPLAN_TXQ_MAPENA_MAX_INDEX 255 +#define VPLAN_TXQ_MAPENA_TX_ENA_S 0 #define VPLAN_TXQ_MAPENA_TX_ENA_M BIT(0) -#define GL_MDET_RX 0x00294C00 +#define VSILAN_QBASE(_VSI) (0x0044C000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSILAN_QBASE_MAX_INDEX 767 +#define VSILAN_QBASE_VSIBASE_S 0 +#define VSILAN_QBASE_VSIBASE_M ICE_M(0x7FF, 0) +#define VSILAN_QBASE_VSIQTABLE_ENA_S 11 +#define VSILAN_QBASE_VSIQTABLE_ENA_M BIT(11) +#define VSILAN_QTABLE(_i, _VSI) (0x00440000 + ((_i) * 4096 + (_VSI) * 4)) /* _i=0...7, _VSI=0...767 */ /* Reset Source: PFR */ +#define VSILAN_QTABLE_MAX_INDEX 7 +#define VSILAN_QTABLE_QINDEX_0_S 0 +#define VSILAN_QTABLE_QINDEX_0_M ICE_M(0x7FF, 0) +#define VSILAN_QTABLE_QINDEX_1_S 16 +#define VSILAN_QTABLE_QINDEX_1_M ICE_M(0x7FF, 16) +#define PRTMAC_HSEC_CTL_RX_ENABLE_GCP 0x001E31C0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_S 0 +#define PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_M BIT(0) +#define PRTMAC_HSEC_CTL_RX_ENABLE_GPP 0x001E34C0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_S 0 +#define PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_M BIT(0) +#define PRTMAC_HSEC_CTL_RX_ENABLE_PPP 0x001E35C0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_S 0 +#define PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_M BIT(0) +#define PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL 0x001E36C0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_S 0 +#define PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_M BIT(0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1 0x001E3220 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2 0x001E3240 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE 0x001E3180 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_M ICE_M(0x1FF, 0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1 0x001E3280 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2 0x001E32A0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_S 0 +#define PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_RX_QUANTA_S 0x001E3C40 /* Reset Source: GLOBR */ +#define 
PRTMAC_HSEC_CTL_RX_QUANTA_SHIFT_PRTMAC_HSEC_CTL_RX_QUANTA_SHIFT_S 0 +#define PRTMAC_HSEC_CTL_RX_QUANTA_SHIFT_PRTMAC_HSEC_CTL_RX_QUANTA_SHIFT_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE 0x001E31A0 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_S 0 +#define PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_M ICE_M(0x1FF, 0) +#define PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA(_i) (0x001E36E0 + ((_i) * 32)) /* _i=0...8 */ /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX 8 +#define PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_S 0 +#define PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER(_i) (0x001E3800 + ((_i) * 32)) /* _i=0...8 */ /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_MAX_INDEX 8 +#define PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_S 0 +#define PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_M ICE_M(0xFFFF, 0) +#define PRTMAC_HSEC_CTL_TX_SA_PART1 0x001E3960 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_S 0 +#define PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_HSEC_CTL_TX_SA_PART2 0x001E3980 /* Reset Source: GLOBR */ +#define PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_S 0 +#define PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_M ICE_M(0xFFFF, 0) +#define PRTMAC_LINK_DOWN_COUNTER 0x001E47C0 /* Reset Source: GLOBR */ +#define PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_S 0 +#define PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_M ICE_M(0xFFFF, 0) +#define PRTMAC_MD_OVRRIDE_ENABLE(_i) (0x001E3C60 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: GLOBR */ +#define PRTMAC_MD_OVRRIDE_ENABLE_MAX_INDEX 7 +#define PRTMAC_MD_OVRRIDE_ENABLE_PRTMAC_MD_OVRRIDE_ENABLE_S 0 +#define PRTMAC_MD_OVRRIDE_ENABLE_PRTMAC_MD_OVRRIDE_ENABLE_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_MD_OVRRIDE_VAL(_i) (0x001E3D60 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: GLOBR */ +#define PRTMAC_MD_OVRRIDE_VAL_MAX_INDEX 7 +#define PRTMAC_MD_OVRRIDE_VAL_PRTMAC_MD_OVRRIDE_ENABLE_S 0 +#define PRTMAC_MD_OVRRIDE_VAL_PRTMAC_MD_OVRRIDE_ENABLE_M ICE_M(0xFFFFFFFF, 0) +#define PRTMAC_RX_CNT_MRKR 0x001E48E0 /* Reset Source: GLOBR */ +#define PRTMAC_RX_CNT_MRKR_RX_CNT_MRKR_S 0 +#define PRTMAC_RX_CNT_MRKR_RX_CNT_MRKR_M ICE_M(0xFFFF, 0) +#define PRTMAC_RX_PKT_DRP_CNT 0x001E3C20 /* Reset Source: GLOBR */ +#define PRTMAC_RX_PKT_DRP_CNT_RX_PKT_DRP_CNT_S 0 +#define PRTMAC_RX_PKT_DRP_CNT_RX_PKT_DRP_CNT_M ICE_M(0xFFFF, 0) +#define PRTMAC_RX_PKT_DRP_CNT_RX_MKR_PKT_DRP_CNT_S 16 +#define PRTMAC_RX_PKT_DRP_CNT_RX_MKR_PKT_DRP_CNT_M ICE_M(0xFFFF, 16) +#define PRTMAC_TX_CNT_MRKR 0x001E48C0 /* Reset Source: GLOBR */ +#define PRTMAC_TX_CNT_MRKR_TX_CNT_MRKR_S 0 +#define PRTMAC_TX_CNT_MRKR_TX_CNT_MRKR_M ICE_M(0xFFFF, 0) +#define PRTMAC_TX_LNK_UP_CNT 0x001E4840 /* Reset Source: GLOBR */ +#define PRTMAC_TX_LNK_UP_CNT_TX_LINK_UP_CNT_S 0 +#define PRTMAC_TX_LNK_UP_CNT_TX_LINK_UP_CNT_M ICE_M(0xFFFF, 0) +#define GL_MDCK_CFG1_TX_PQM 0x002D2DF4 /* Reset Source: CORER */ +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_DATA_LEN_S 0 +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_DATA_LEN_M ICE_M(0xFF, 0) +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_PKT_CNT_S 8 +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_PKT_CNT_M ICE_M(0x3F, 8) +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_DESC_CNT_S 16 +#define GL_MDCK_CFG1_TX_PQM_SSO_MAX_DESC_CNT_M ICE_M(0x3F, 16) +#define GL_MDCK_EN_TX_PQM 0x002D2DFC /* Reset Source: CORER */ +#define 
GL_MDCK_EN_TX_PQM_PCI_DUMMY_COMP_S 0 +#define GL_MDCK_EN_TX_PQM_PCI_DUMMY_COMP_M BIT(0) +#define GL_MDCK_EN_TX_PQM_PCI_UR_COMP_S 1 +#define GL_MDCK_EN_TX_PQM_PCI_UR_COMP_M BIT(1) +#define GL_MDCK_EN_TX_PQM_RCV_SH_BE_LSO_S 3 +#define GL_MDCK_EN_TX_PQM_RCV_SH_BE_LSO_M BIT(3) +#define GL_MDCK_EN_TX_PQM_Q_FL_MNG_EPY_CH_S 4 +#define GL_MDCK_EN_TX_PQM_Q_FL_MNG_EPY_CH_M BIT(4) +#define GL_MDCK_EN_TX_PQM_Q_EPY_MNG_FL_CH_S 5 +#define GL_MDCK_EN_TX_PQM_Q_EPY_MNG_FL_CH_M BIT(5) +#define GL_MDCK_EN_TX_PQM_LSO_NUMDESCS_ZERO_S 6 +#define GL_MDCK_EN_TX_PQM_LSO_NUMDESCS_ZERO_M BIT(6) +#define GL_MDCK_EN_TX_PQM_LSO_LENGTH_ZERO_S 7 +#define GL_MDCK_EN_TX_PQM_LSO_LENGTH_ZERO_M BIT(7) +#define GL_MDCK_EN_TX_PQM_LSO_MSS_BELOW_MIN_S 8 +#define GL_MDCK_EN_TX_PQM_LSO_MSS_BELOW_MIN_M BIT(8) +#define GL_MDCK_EN_TX_PQM_LSO_MSS_ABOVE_MAX_S 9 +#define GL_MDCK_EN_TX_PQM_LSO_MSS_ABOVE_MAX_M BIT(9) +#define GL_MDCK_EN_TX_PQM_LSO_HDR_SIZE_ZERO_S 10 +#define GL_MDCK_EN_TX_PQM_LSO_HDR_SIZE_ZERO_M BIT(10) +#define GL_MDCK_EN_TX_PQM_RCV_CNT_BE_LSO_S 11 +#define GL_MDCK_EN_TX_PQM_RCV_CNT_BE_LSO_M BIT(11) +#define GL_MDCK_EN_TX_PQM_SKIP_ONE_QT_ONLY_S 12 +#define GL_MDCK_EN_TX_PQM_SKIP_ONE_QT_ONLY_M BIT(12) +#define GL_MDCK_EN_TX_PQM_LSO_PKTCNT_ZERO_S 13 +#define GL_MDCK_EN_TX_PQM_LSO_PKTCNT_ZERO_M BIT(13) +#define GL_MDCK_EN_TX_PQM_SSO_LENGTH_ZERO_S 14 +#define GL_MDCK_EN_TX_PQM_SSO_LENGTH_ZERO_M BIT(14) +#define GL_MDCK_EN_TX_PQM_SSO_LENGTH_EXCEED_S 15 +#define GL_MDCK_EN_TX_PQM_SSO_LENGTH_EXCEED_M BIT(15) +#define GL_MDCK_EN_TX_PQM_SSO_PKTCNT_ZERO_S 16 +#define GL_MDCK_EN_TX_PQM_SSO_PKTCNT_ZERO_M BIT(16) +#define GL_MDCK_EN_TX_PQM_SSO_PKTCNT_EXCEED_S 17 +#define GL_MDCK_EN_TX_PQM_SSO_PKTCNT_EXCEED_M BIT(17) +#define GL_MDCK_EN_TX_PQM_SSO_NUMDESCS_ZERO_S 18 +#define GL_MDCK_EN_TX_PQM_SSO_NUMDESCS_ZERO_M BIT(18) +#define GL_MDCK_EN_TX_PQM_SSO_NUMDESCS_EXCEED_S 19 +#define GL_MDCK_EN_TX_PQM_SSO_NUMDESCS_EXCEED_M BIT(19) +#define GL_MDCK_EN_TX_PQM_TAIL_GT_RING_LENGTH_S 20 +#define GL_MDCK_EN_TX_PQM_TAIL_GT_RING_LENGTH_M BIT(20) +#define GL_MDCK_EN_TX_PQM_RESERVED_DBL_TYPE_S 21 +#define GL_MDCK_EN_TX_PQM_RESERVED_DBL_TYPE_M BIT(21) +#define GL_MDCK_EN_TX_PQM_ILLEGAL_HEAD_DROP_DBL_S 22 +#define GL_MDCK_EN_TX_PQM_ILLEGAL_HEAD_DROP_DBL_M BIT(22) +#define GL_MDCK_EN_TX_PQM_LSO_OVER_COMMS_Q_S 23 +#define GL_MDCK_EN_TX_PQM_LSO_OVER_COMMS_Q_M BIT(23) +#define GL_MDCK_EN_TX_PQM_ILLEGAL_VF_QNUM_S 24 +#define GL_MDCK_EN_TX_PQM_ILLEGAL_VF_QNUM_M BIT(24) +#define GL_MDCK_EN_TX_PQM_QTAIL_GT_RING_LENGTH_S 25 +#define GL_MDCK_EN_TX_PQM_QTAIL_GT_RING_LENGTH_M BIT(25) +#define GL_MDCK_EN_TX_PQM_RSVD_S 26 +#define GL_MDCK_EN_TX_PQM_RSVD_M ICE_M(0x3F, 26) +#define GL_MDCK_RX 0x0029422C /* Reset Source: CORER */ +#define GL_MDCK_RX_DESC_ADDR_S 0 +#define GL_MDCK_RX_DESC_ADDR_M BIT(0) +#define GL_MDCK_TX_TDPU 0x00049348 /* Reset Source: CORER */ +#define GL_MDCK_TX_TDPU_TTL_ERR_ITR_DIS_S 0 +#define GL_MDCK_TX_TDPU_TTL_ERR_ITR_DIS_M BIT(0) +#define GL_MDCK_TX_TDPU_RCU_ANTISPOOF_ITR_DIS_S 1 +#define GL_MDCK_TX_TDPU_RCU_ANTISPOOF_ITR_DIS_M BIT(1) +#define GL_MDCK_TX_TDPU_PCIE_UR_ITR_DIS_S 2 +#define GL_MDCK_TX_TDPU_PCIE_UR_ITR_DIS_M BIT(2) +#define GL_MDCK_TX_TDPU_MAL_OFFSET_ITR_DIS_S 3 +#define GL_MDCK_TX_TDPU_MAL_OFFSET_ITR_DIS_M BIT(3) +#define GL_MDCK_TX_TDPU_MAL_CMD_ITR_DIS_S 4 +#define GL_MDCK_TX_TDPU_MAL_CMD_ITR_DIS_M BIT(4) +#define GL_MDCK_TX_TDPU_BIG_PKT_SIZE_ITR_DIS_S 5 +#define GL_MDCK_TX_TDPU_BIG_PKT_SIZE_ITR_DIS_M BIT(5) +#define GL_MDCK_TX_TDPU_L2_ACCEPT_FAIL_ITR_DIS_S 6 +#define GL_MDCK_TX_TDPU_L2_ACCEPT_FAIL_ITR_DIS_M BIT(6) +#define 
GL_MDCK_TX_TDPU_NIC_DSI_ITR_DIS_S 7
+#define GL_MDCK_TX_TDPU_NIC_DSI_ITR_DIS_M BIT(7)
+#define GL_MDCK_TX_TDPU_MAL_IPSEC_CMD_ITR_DIS_S 8
+#define GL_MDCK_TX_TDPU_MAL_IPSEC_CMD_ITR_DIS_M BIT(8)
+#define GL_MDCK_TX_TDPU_DSCP_CHECK_FAIL_ITR_DIS_S 9
+#define GL_MDCK_TX_TDPU_DSCP_CHECK_FAIL_ITR_DIS_M BIT(9)
+#define GL_MDCK_TX_TDPU_NIC_IPSEC_ITR_DIS_S 10
+#define GL_MDCK_TX_TDPU_NIC_IPSEC_ITR_DIS_M BIT(10)
+#define GL_MDET_RX 0x00294C00 /* Reset Source: CORER */
 #define GL_MDET_RX_QNUM_S 0
 #define GL_MDET_RX_QNUM_M ICE_M(0x7FFF, 0)
 #define GL_MDET_RX_VF_NUM_S 15
@@ -232,8 +5361,9 @@
 #define GL_MDET_RX_PF_NUM_M ICE_M(0x7, 23)
 #define GL_MDET_RX_MAL_TYPE_S 26
 #define GL_MDET_RX_MAL_TYPE_M ICE_M(0x1F, 26)
+#define GL_MDET_RX_VALID_S 31
 #define GL_MDET_RX_VALID_M BIT(31)
-#define GL_MDET_TX_PQM 0x002D2E00
+#define GL_MDET_TX_PQM 0x002D2E00 /* Reset Source: CORER */
 #define GL_MDET_TX_PQM_PF_NUM_S 0
 #define GL_MDET_TX_PQM_PF_NUM_M ICE_M(0x7, 0)
 #define GL_MDET_TX_PQM_VF_NUM_S 4
@@ -242,8 +5372,9 @@
 #define GL_MDET_TX_PQM_QNUM_M ICE_M(0x3FFF, 12)
 #define GL_MDET_TX_PQM_MAL_TYPE_S 26
 #define GL_MDET_TX_PQM_MAL_TYPE_M ICE_M(0x1F, 26)
+#define GL_MDET_TX_PQM_VALID_S 31
 #define GL_MDET_TX_PQM_VALID_M BIT(31)
-#define GL_MDET_TX_TCLAN 0x000FC068
+#define GL_MDET_TX_TCLAN 0x000FC068 /* Reset Source: CORER */
 #define GL_MDET_TX_TCLAN_QNUM_S 0
 #define GL_MDET_TX_TCLAN_QNUM_M ICE_M(0x7FFF, 0)
 #define GL_MDET_TX_TCLAN_VF_NUM_S 15
@@ -252,102 +5383,4071 @@
 #define GL_MDET_TX_TCLAN_PF_NUM_M ICE_M(0x7, 23)
 #define GL_MDET_TX_TCLAN_MAL_TYPE_S 26
 #define GL_MDET_TX_TCLAN_MAL_TYPE_M ICE_M(0x1F, 26)
+#define GL_MDET_TX_TCLAN_VALID_S 31
 #define GL_MDET_TX_TCLAN_VALID_M BIT(31)
-#define PF_MDET_RX 0x00294280
+#define GL_MDET_TX_TDPU 0x00049350 /* Reset Source: CORER */
+#define GL_MDET_TX_TDPU_QNUM_S 0
+#define GL_MDET_TX_TDPU_QNUM_M ICE_M(0x7FFF, 0)
+#define GL_MDET_TX_TDPU_VF_NUM_S 15
+#define GL_MDET_TX_TDPU_VF_NUM_M ICE_M(0xFF, 15)
+#define GL_MDET_TX_TDPU_PF_NUM_S 23
+#define GL_MDET_TX_TDPU_PF_NUM_M ICE_M(0x7, 23)
+#define GL_MDET_TX_TDPU_MAL_TYPE_S 26
+#define GL_MDET_TX_TDPU_MAL_TYPE_M ICE_M(0x1F, 26)
+#define GL_MDET_TX_TDPU_VALID_S 31
+#define GL_MDET_TX_TDPU_VALID_M BIT(31)
+#define GLRLAN_MDET 0x00294200 /* Reset Source: CORER */
+#define GLRLAN_MDET_PCKT_EXTRCT_ERR_S 0
+#define GLRLAN_MDET_PCKT_EXTRCT_ERR_M BIT(0)
+#define PF_MDET_RX 0x00294280 /* Reset Source: CORER */
+#define PF_MDET_RX_VALID_S 0
 #define PF_MDET_RX_VALID_M BIT(0)
-#define PF_MDET_TX_PQM 0x002D2C80
+#define PF_MDET_TX_PQM 0x002D2C80 /* Reset Source: CORER */
+#define PF_MDET_TX_PQM_VALID_S 0
 #define PF_MDET_TX_PQM_VALID_M BIT(0)
-#define PF_MDET_TX_TCLAN 0x000FC000
+#define PF_MDET_TX_TCLAN 0x000FC000 /* Reset Source: CORER */
+#define PF_MDET_TX_TCLAN_VALID_S 0
 #define PF_MDET_TX_TCLAN_VALID_M BIT(0)
-#define VP_MDET_RX(_VF) (0x00294400 + ((_VF) * 4))
+#define PF_MDET_TX_TDPU 0x00040800 /* Reset Source: CORER */
+#define PF_MDET_TX_TDPU_VALID_S 0
+#define PF_MDET_TX_TDPU_VALID_M BIT(0)
+#define VP_MDET_RX(_VF) (0x00294400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VP_MDET_RX_MAX_INDEX 255
+#define VP_MDET_RX_VALID_S 0
 #define VP_MDET_RX_VALID_M BIT(0)
-#define VP_MDET_TX_PQM(_VF) (0x002D2000 + ((_VF) * 4))
+#define VP_MDET_TX_PQM(_VF) (0x002D2000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define VP_MDET_TX_PQM_MAX_INDEX 255
+#define VP_MDET_TX_PQM_VALID_S 0
 #define VP_MDET_TX_PQM_VALID_M BIT(0)
-#define VP_MDET_TX_TCLAN(_VF) (0x000FB800 +
((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VP_MDET_TX_TCLAN_MAX_INDEX 255 +#define VP_MDET_TX_TCLAN_VALID_S 0 #define VP_MDET_TX_TCLAN_VALID_M BIT(0) -#define VP_MDET_TX_TDPU(_VF) (0x00040000 + ((_VF) * 4)) +#define VP_MDET_TX_TDPU(_VF) (0x00040000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VP_MDET_TX_TDPU_MAX_INDEX 255 +#define VP_MDET_TX_TDPU_VALID_S 0 #define VP_MDET_TX_TDPU_VALID_M BIT(0) -#define GLNVM_FLA 0x000B6108 +#define GENERAL_MNG_FW_DBG_CSR(_i) (0x000B6180 + ((_i) * 4)) /* _i=0...9 */ /* Reset Source: POR */ +#define GENERAL_MNG_FW_DBG_CSR_MAX_INDEX 9 +#define GENERAL_MNG_FW_DBG_CSR_GENERAL_FW_DBG_S 0 +#define GENERAL_MNG_FW_DBG_CSR_GENERAL_FW_DBG_M ICE_M(0xFFFFFFFF, 0) +#define GL_FWRESETCNT 0x00083100 /* Reset Source: POR */ +#define GL_FWRESETCNT_FWRESETCNT_S 0 +#define GL_FWRESETCNT_FWRESETCNT_M ICE_M(0xFFFFFFFF, 0) +#define GL_MNG_FW_RAM_STAT 0x0008309C /* Reset Source: POR */ +#define GL_MNG_FW_RAM_STAT_FW_RAM_RST_STAT_S 0 +#define GL_MNG_FW_RAM_STAT_FW_RAM_RST_STAT_M BIT(0) +#define GL_MNG_FW_RAM_STAT_MNG_MEM_ECC_ERR_S 1 +#define GL_MNG_FW_RAM_STAT_MNG_MEM_ECC_ERR_M BIT(1) +#define GL_MNG_FWSM 0x000B6134 /* Reset Source: POR */ +#define GL_MNG_FWSM_FW_MODES_S 0 +#define GL_MNG_FWSM_FW_MODES_M ICE_M(0x7, 0) +#define GL_MNG_FWSM_RSV0_S 3 +#define GL_MNG_FWSM_RSV0_M ICE_M(0x7F, 3) +#define GL_MNG_FWSM_EEP_RELOAD_IND_S 10 +#define GL_MNG_FWSM_EEP_RELOAD_IND_M BIT(10) +#define GL_MNG_FWSM_RSV1_S 11 +#define GL_MNG_FWSM_RSV1_M ICE_M(0xF, 11) +#define GL_MNG_FWSM_RSV2_S 15 +#define GL_MNG_FWSM_RSV2_M BIT(15) +#define GL_MNG_FWSM_PCIR_AL_FAILURE_S 16 +#define GL_MNG_FWSM_PCIR_AL_FAILURE_M BIT(16) +#define GL_MNG_FWSM_POR_AL_FAILURE_S 17 +#define GL_MNG_FWSM_POR_AL_FAILURE_M BIT(17) +#define GL_MNG_FWSM_RSV3_S 18 +#define GL_MNG_FWSM_RSV3_M BIT(18) +#define GL_MNG_FWSM_EXT_ERR_IND_S 19 +#define GL_MNG_FWSM_EXT_ERR_IND_M ICE_M(0x3F, 19) +#define GL_MNG_FWSM_RSV4_S 25 +#define GL_MNG_FWSM_RSV4_M BIT(25) +#define GL_MNG_FWSM_RESERVED_11_S 26 +#define GL_MNG_FWSM_RESERVED_11_M ICE_M(0xF, 26) +#define GL_MNG_FWSM_RSV5_S 30 +#define GL_MNG_FWSM_RSV5_M ICE_M(0x3, 30) +#define GL_MNG_HWARB_CTRL 0x000B6130 /* Reset Source: POR */ +#define GL_MNG_HWARB_CTRL_NCSI_ARB_EN_S 0 +#define GL_MNG_HWARB_CTRL_NCSI_ARB_EN_M BIT(0) +#define GL_MNG_SHA_EXTEND(_i) (0x00083120 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: EMPR */ +#define GL_MNG_SHA_EXTEND_MAX_INDEX 7 +#define GL_MNG_SHA_EXTEND_GL_MNG_SHA_EXTEND_S 0 +#define GL_MNG_SHA_EXTEND_GL_MNG_SHA_EXTEND_M ICE_M(0xFFFFFFFF, 0) +#define GL_MNG_SHA_EXTEND_ROM(_i) (0x00083160 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: EMPR */ +#define GL_MNG_SHA_EXTEND_ROM_MAX_INDEX 7 +#define GL_MNG_SHA_EXTEND_ROM_GL_MNG_SHA_EXTEND_ROM_S 0 +#define GL_MNG_SHA_EXTEND_ROM_GL_MNG_SHA_EXTEND_ROM_M ICE_M(0xFFFFFFFF, 0) +#define GL_MNG_SHA_EXTEND_STATUS 0x00083148 /* Reset Source: EMPR */ +#define GL_MNG_SHA_EXTEND_STATUS_STAGE_S 0 +#define GL_MNG_SHA_EXTEND_STATUS_STAGE_M ICE_M(0x7, 0) +#define GL_MNG_SHA_EXTEND_STATUS_FW_HALTED_S 30 +#define GL_MNG_SHA_EXTEND_STATUS_FW_HALTED_M BIT(30) +#define GL_MNG_SHA_EXTEND_STATUS_DONE_S 31 +#define GL_MNG_SHA_EXTEND_STATUS_DONE_M BIT(31) +#define GL_SWT_PRT2MDEF(_i) (0x00216018 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: POR */ +#define GL_SWT_PRT2MDEF_MAX_INDEX 31 +#define GL_SWT_PRT2MDEF_MDEFIDX_S 0 +#define GL_SWT_PRT2MDEF_MDEFIDX_M ICE_M(0x7, 0) +#define GL_SWT_PRT2MDEF_MDEFENA_S 31 +#define GL_SWT_PRT2MDEF_MDEFENA_M BIT(31) +#define PRT_MNG_MANC 0x00214720 /* Reset Source: 
POR */ +#define PRT_MNG_MANC_FLOW_CONTROL_DISCARD_S 0 +#define PRT_MNG_MANC_FLOW_CONTROL_DISCARD_M BIT(0) +#define PRT_MNG_MANC_NCSI_DISCARD_S 1 +#define PRT_MNG_MANC_NCSI_DISCARD_M BIT(1) +#define PRT_MNG_MANC_RCV_TCO_EN_S 17 +#define PRT_MNG_MANC_RCV_TCO_EN_M BIT(17) +#define PRT_MNG_MANC_RCV_ALL_S 19 +#define PRT_MNG_MANC_RCV_ALL_M BIT(19) +#define PRT_MNG_MANC_FIXED_NET_TYPE_S 25 +#define PRT_MNG_MANC_FIXED_NET_TYPE_M BIT(25) +#define PRT_MNG_MANC_NET_TYPE_S 26 +#define PRT_MNG_MANC_NET_TYPE_M BIT(26) +#define PRT_MNG_MANC_EN_BMC2OS_S 28 +#define PRT_MNG_MANC_EN_BMC2OS_M BIT(28) +#define PRT_MNG_MANC_EN_BMC2NET_S 29 +#define PRT_MNG_MANC_EN_BMC2NET_M BIT(29) +#define PRT_MNG_MAVTV(_i) (0x00214780 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: POR */ +#define PRT_MNG_MAVTV_MAX_INDEX 7 +#define PRT_MNG_MAVTV_VID_S 0 +#define PRT_MNG_MAVTV_VID_M ICE_M(0xFFF, 0) +#define PRT_MNG_MDEF(_i) (0x00214880 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: POR */ +#define PRT_MNG_MDEF_MAX_INDEX 7 +#define PRT_MNG_MDEF_MAC_EXACT_AND_S 0 +#define PRT_MNG_MDEF_MAC_EXACT_AND_M ICE_M(0xF, 0) +#define PRT_MNG_MDEF_BROADCAST_AND_S 4 +#define PRT_MNG_MDEF_BROADCAST_AND_M BIT(4) +#define PRT_MNG_MDEF_VLAN_AND_S 5 +#define PRT_MNG_MDEF_VLAN_AND_M ICE_M(0xFF, 5) +#define PRT_MNG_MDEF_IPV4_ADDRESS_AND_S 13 +#define PRT_MNG_MDEF_IPV4_ADDRESS_AND_M ICE_M(0xF, 13) +#define PRT_MNG_MDEF_IPV6_ADDRESS_AND_S 17 +#define PRT_MNG_MDEF_IPV6_ADDRESS_AND_M ICE_M(0xF, 17) +#define PRT_MNG_MDEF_MAC_EXACT_OR_S 21 +#define PRT_MNG_MDEF_MAC_EXACT_OR_M ICE_M(0xF, 21) +#define PRT_MNG_MDEF_BROADCAST_OR_S 25 +#define PRT_MNG_MDEF_BROADCAST_OR_M BIT(25) +#define PRT_MNG_MDEF_MULTICAST_AND_S 26 +#define PRT_MNG_MDEF_MULTICAST_AND_M BIT(26) +#define PRT_MNG_MDEF_ARP_REQUEST_OR_S 27 +#define PRT_MNG_MDEF_ARP_REQUEST_OR_M BIT(27) +#define PRT_MNG_MDEF_ARP_RESPONSE_OR_S 28 +#define PRT_MNG_MDEF_ARP_RESPONSE_OR_M BIT(28) +#define PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_S 29 +#define PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_M BIT(29) +#define PRT_MNG_MDEF_PORT_0X298_OR_S 30 +#define PRT_MNG_MDEF_PORT_0X298_OR_M BIT(30) +#define PRT_MNG_MDEF_PORT_0X26F_OR_S 31 +#define PRT_MNG_MDEF_PORT_0X26F_OR_M BIT(31) +#define PRT_MNG_MDEF_EXT(_i) (0x00214A00 + ((_i) * 32)) /* _i=0...7 */ /* Reset Source: POR */ +#define PRT_MNG_MDEF_EXT_MAX_INDEX 7 +#define PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_S 0 +#define PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_M ICE_M(0xF, 0) +#define PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_S 4 +#define PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_M ICE_M(0xF, 4) +#define PRT_MNG_MDEF_EXT_FLEX_PORT_OR_S 8 +#define PRT_MNG_MDEF_EXT_FLEX_PORT_OR_M ICE_M(0xFFFF, 8) +#define PRT_MNG_MDEF_EXT_FLEX_TCO_S 24 +#define PRT_MNG_MDEF_EXT_FLEX_TCO_M BIT(24) +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_S 25 +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_M BIT(25) +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_S 26 +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_M BIT(26) +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_S 27 +#define PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_M BIT(27) +#define PRT_MNG_MDEF_EXT_ICMP_OR_S 28 +#define PRT_MNG_MDEF_EXT_ICMP_OR_M BIT(28) +#define PRT_MNG_MDEF_EXT_MLD_S 29 +#define PRT_MNG_MDEF_EXT_MLD_M BIT(29) +#define PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_S 30 +#define PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_M BIT(30) +#define PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_S 31 +#define PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_M BIT(31) +#define PRT_MNG_MDEFVSI(_i) (0x00214980 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR 
*/ +#define PRT_MNG_MDEFVSI_MAX_INDEX 3 +#define PRT_MNG_MDEFVSI_MDEFVSI_2N_S 0 +#define PRT_MNG_MDEFVSI_MDEFVSI_2N_M ICE_M(0xFFFF, 0) +#define PRT_MNG_MDEFVSI_MDEFVSI_2NP1_S 16 +#define PRT_MNG_MDEFVSI_MDEFVSI_2NP1_M ICE_M(0xFFFF, 16) +#define PRT_MNG_METF(_i) (0x00214120 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR */ +#define PRT_MNG_METF_MAX_INDEX 3 +#define PRT_MNG_METF_ETYPE_S 0 +#define PRT_MNG_METF_ETYPE_M ICE_M(0xFFFF, 0) +#define PRT_MNG_METF_POLARITY_S 30 +#define PRT_MNG_METF_POLARITY_M BIT(30) +#define PRT_MNG_MFUTP(_i) (0x00214320 + ((_i) * 32)) /* _i=0...15 */ /* Reset Source: POR */ +#define PRT_MNG_MFUTP_MAX_INDEX 15 +#define PRT_MNG_MFUTP_MFUTP_N_S 0 +#define PRT_MNG_MFUTP_MFUTP_N_M ICE_M(0xFFFF, 0) +#define PRT_MNG_MFUTP_UDP_S 16 +#define PRT_MNG_MFUTP_UDP_M BIT(16) +#define PRT_MNG_MFUTP_TCP_S 17 +#define PRT_MNG_MFUTP_TCP_M BIT(17) +#define PRT_MNG_MFUTP_SOURCE_DESTINATION_S 18 +#define PRT_MNG_MFUTP_SOURCE_DESTINATION_M BIT(18) +#define PRT_MNG_MIPAF4(_i) (0x002141A0 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR */ +#define PRT_MNG_MIPAF4_MAX_INDEX 3 +#define PRT_MNG_MIPAF4_MIPAF_S 0 +#define PRT_MNG_MIPAF4_MIPAF_M ICE_M(0xFFFFFFFF, 0) +#define PRT_MNG_MIPAF6(_i) (0x00214520 + ((_i) * 32)) /* _i=0...15 */ /* Reset Source: POR */ +#define PRT_MNG_MIPAF6_MAX_INDEX 15 +#define PRT_MNG_MIPAF6_MIPAF_S 0 +#define PRT_MNG_MIPAF6_MIPAF_M ICE_M(0xFFFFFFFF, 0) +#define PRT_MNG_MMAH(_i) (0x00214220 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR */ +#define PRT_MNG_MMAH_MAX_INDEX 3 +#define PRT_MNG_MMAH_MMAH_S 0 +#define PRT_MNG_MMAH_MMAH_M ICE_M(0xFFFF, 0) +#define PRT_MNG_MMAL(_i) (0x002142A0 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: POR */ +#define PRT_MNG_MMAL_MAX_INDEX 3 +#define PRT_MNG_MMAL_MMAL_S 0 +#define PRT_MNG_MMAL_MMAL_M ICE_M(0xFFFFFFFF, 0) +#define PRT_MNG_MNGONLY 0x00214740 /* Reset Source: POR */ +#define PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_S 0 +#define PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_M ICE_M(0xFF, 0) +#define PRT_MNG_MSFM 0x00214760 /* Reset Source: POR */ +#define PRT_MNG_MSFM_PORT_26F_UDP_S 0 +#define PRT_MNG_MSFM_PORT_26F_UDP_M BIT(0) +#define PRT_MNG_MSFM_PORT_26F_TCP_S 1 +#define PRT_MNG_MSFM_PORT_26F_TCP_M BIT(1) +#define PRT_MNG_MSFM_PORT_298_UDP_S 2 +#define PRT_MNG_MSFM_PORT_298_UDP_M BIT(2) +#define PRT_MNG_MSFM_PORT_298_TCP_S 3 +#define PRT_MNG_MSFM_PORT_298_TCP_M BIT(3) +#define PRT_MNG_MSFM_IPV6_0_MASK_S 4 +#define PRT_MNG_MSFM_IPV6_0_MASK_M BIT(4) +#define PRT_MNG_MSFM_IPV6_1_MASK_S 5 +#define PRT_MNG_MSFM_IPV6_1_MASK_M BIT(5) +#define PRT_MNG_MSFM_IPV6_2_MASK_S 6 +#define PRT_MNG_MSFM_IPV6_2_MASK_M BIT(6) +#define PRT_MNG_MSFM_IPV6_3_MASK_S 7 +#define PRT_MNG_MSFM_IPV6_3_MASK_M BIT(7) +#define MSIX_PBA_PAGE(_i) (0x02E08000 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: FLR */ +#define MSIX_PBA_PAGE_MAX_INDEX 63 +#define MSIX_PBA_PAGE_PENBIT_S 0 +#define MSIX_PBA_PAGE_PENBIT_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_PBA1(_i) (0x00008000 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: FLR */ +#define MSIX_PBA1_MAX_INDEX 63 +#define MSIX_PBA1_PENBIT_S 0 +#define MSIX_PBA1_PENBIT_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TADD_PAGE(_i) (0x02E00000 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TADD_PAGE_MAX_INDEX 2047 +#define MSIX_TADD_PAGE_MSIXTADD10_S 0 +#define MSIX_TADD_PAGE_MSIXTADD10_M ICE_M(0x3, 0) +#define MSIX_TADD_PAGE_MSIXTADD_S 2 +#define MSIX_TADD_PAGE_MSIXTADD_M ICE_M(0x3FFFFFFF, 2) +#define MSIX_TADD1(_i) (0x00000000 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define 
MSIX_TADD1_MAX_INDEX 2047 +#define MSIX_TADD1_MSIXTADD10_S 0 +#define MSIX_TADD1_MSIXTADD10_M ICE_M(0x3, 0) +#define MSIX_TADD1_MSIXTADD_S 2 +#define MSIX_TADD1_MSIXTADD_M ICE_M(0x3FFFFFFF, 2) +#define MSIX_TMSG(_i) (0x00000008 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TMSG_MAX_INDEX 2047 +#define MSIX_TMSG_MSIXTMSG_S 0 +#define MSIX_TMSG_MSIXTMSG_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TMSG_PAGE(_i) (0x02E00008 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TMSG_PAGE_MAX_INDEX 2047 +#define MSIX_TMSG_PAGE_MSIXTMSG_S 0 +#define MSIX_TMSG_PAGE_MSIXTMSG_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TUADD_PAGE(_i) (0x02E00004 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TUADD_PAGE_MAX_INDEX 2047 +#define MSIX_TUADD_PAGE_MSIXTUADD_S 0 +#define MSIX_TUADD_PAGE_MSIXTUADD_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TUADD1(_i) (0x00000004 + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TUADD1_MAX_INDEX 2047 +#define MSIX_TUADD1_MSIXTUADD_S 0 +#define MSIX_TUADD1_MSIXTUADD_M ICE_M(0xFFFFFFFF, 0) +#define MSIX_TVCTRL_PAGE(_i) (0x02E0000C + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TVCTRL_PAGE_MAX_INDEX 2047 +#define MSIX_TVCTRL_PAGE_MASK_S 0 +#define MSIX_TVCTRL_PAGE_MASK_M BIT(0) +#define MSIX_TVCTRL1(_i) (0x0000000C + ((_i) * 16)) /* _i=0...2047 */ /* Reset Source: FLR */ +#define MSIX_TVCTRL1_MAX_INDEX 2047 +#define MSIX_TVCTRL1_MASK_S 0 +#define MSIX_TVCTRL1_MASK_M BIT(0) +#define GLNVM_AL_DONE_HLP 0x000824C4 /* Reset Source: POR */ +#define GLNVM_AL_DONE_HLP_HLP_CORER_S 0 +#define GLNVM_AL_DONE_HLP_HLP_CORER_M BIT(0) +#define GLNVM_AL_DONE_HLP_HLP_FULLR_S 1 +#define GLNVM_AL_DONE_HLP_HLP_FULLR_M BIT(1) +#define GLNVM_ALTIMERS 0x000B6140 /* Reset Source: POR */ +#define GLNVM_ALTIMERS_PCI_ALTIMER_S 0 +#define GLNVM_ALTIMERS_PCI_ALTIMER_M ICE_M(0xFFF, 0) +#define GLNVM_ALTIMERS_GEN_ALTIMER_S 12 +#define GLNVM_ALTIMERS_GEN_ALTIMER_M ICE_M(0xFFFFF, 12) +#define GLNVM_FLA 0x000B6108 /* Reset Source: POR */ +#define GLNVM_FLA_LOCKED_S 6 #define GLNVM_FLA_LOCKED_M BIT(6) -#define GLNVM_GENS 0x000B6100 +#define GLNVM_GENS 0x000B6100 /* Reset Source: POR */ +#define GLNVM_GENS_NVM_PRES_S 0 +#define GLNVM_GENS_NVM_PRES_M BIT(0) #define GLNVM_GENS_SR_SIZE_S 5 #define GLNVM_GENS_SR_SIZE_M ICE_M(0x7, 5) -#define GLNVM_ULD 0x000B6008 +#define GLNVM_GENS_BANK1VAL_S 8 +#define GLNVM_GENS_BANK1VAL_M BIT(8) +#define GLNVM_GENS_ALT_PRST_S 23 +#define GLNVM_GENS_ALT_PRST_M BIT(23) +#define GLNVM_GENS_FL_AUTO_RD_S 25 +#define GLNVM_GENS_FL_AUTO_RD_M BIT(25) +#define GLNVM_PROTCSR(_i) (0x000B6010 + ((_i) * 4)) /* _i=0...59 */ /* Reset Source: POR */ +#define GLNVM_PROTCSR_MAX_INDEX 59 +#define GLNVM_PROTCSR_ADDR_BLOCK_S 0 +#define GLNVM_PROTCSR_ADDR_BLOCK_M ICE_M(0xFFFFFF, 0) +#define GLNVM_ULD 0x000B6008 /* Reset Source: POR */ +#define GLNVM_ULD_PCIER_DONE_S 0 #define GLNVM_ULD_PCIER_DONE_M BIT(0) +#define GLNVM_ULD_PCIER_DONE_1_S 1 #define GLNVM_ULD_PCIER_DONE_1_M BIT(1) +#define GLNVM_ULD_CORER_DONE_S 3 #define GLNVM_ULD_CORER_DONE_M BIT(3) +#define GLNVM_ULD_GLOBR_DONE_S 4 #define GLNVM_ULD_GLOBR_DONE_M BIT(4) +#define GLNVM_ULD_POR_DONE_S 5 #define GLNVM_ULD_POR_DONE_M BIT(5) +#define GLNVM_ULD_POR_DONE_1_S 8 #define GLNVM_ULD_POR_DONE_1_M BIT(8) +#define GLNVM_ULD_PCIER_DONE_2_S 9 #define GLNVM_ULD_PCIER_DONE_2_M BIT(9) +#define GLNVM_ULD_PE_DONE_S 10 #define GLNVM_ULD_PE_DONE_M BIT(10) -#define GLPCI_CNF2 0x000BE004 +#define GLNVM_ULD_HLP_CORE_DONE_S 11 +#define GLNVM_ULD_HLP_CORE_DONE_M BIT(11) 
+#define GLNVM_ULD_HLP_FULL_DONE_S 12 +#define GLNVM_ULD_HLP_FULL_DONE_M BIT(12) +#define GLNVM_ULT 0x000B6154 /* Reset Source: POR */ +#define GLNVM_ULT_CONF_PCIR_AE_S 0 +#define GLNVM_ULT_CONF_PCIR_AE_M BIT(0) +#define GLNVM_ULT_CONF_PCIRTL_AE_S 1 +#define GLNVM_ULT_CONF_PCIRTL_AE_M BIT(1) +#define GLNVM_ULT_RESERVED_1_S 2 +#define GLNVM_ULT_RESERVED_1_M BIT(2) +#define GLNVM_ULT_CONF_CORE_AE_S 3 +#define GLNVM_ULT_CONF_CORE_AE_M BIT(3) +#define GLNVM_ULT_CONF_GLOBAL_AE_S 4 +#define GLNVM_ULT_CONF_GLOBAL_AE_M BIT(4) +#define GLNVM_ULT_CONF_POR_AE_S 5 +#define GLNVM_ULT_CONF_POR_AE_M BIT(5) +#define GLNVM_ULT_RESERVED_2_S 6 +#define GLNVM_ULT_RESERVED_2_M BIT(6) +#define GLNVM_ULT_RESERVED_3_S 7 +#define GLNVM_ULT_RESERVED_3_M BIT(7) +#define GLNVM_ULT_RESERVED_5_S 8 +#define GLNVM_ULT_RESERVED_5_M BIT(8) +#define GLNVM_ULT_CONF_PCIALT_AE_S 9 +#define GLNVM_ULT_CONF_PCIALT_AE_M BIT(9) +#define GLNVM_ULT_CONF_PE_AE_S 10 +#define GLNVM_ULT_CONF_PE_AE_M BIT(10) +#define GLNVM_ULT_RESERVED_4_S 11 +#define GLNVM_ULT_RESERVED_4_M ICE_M(0x1FFFFF, 11) +#define GL_COTF_MARKER_STATUS 0x00200200 /* Reset Source: CORER */ +#define GL_COTF_MARKER_STATUS_MRKR_BUSY_S 0 +#define GL_COTF_MARKER_STATUS_MRKR_BUSY_M ICE_M(0xFF, 0) +#define GL_COTF_MARKER_TRIG_RCU_PRS(_i) (0x002001D4 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GL_COTF_MARKER_TRIG_RCU_PRS_MAX_INDEX 7 +#define GL_COTF_MARKER_TRIG_RCU_PRS_SET_RST_S 0 +#define GL_COTF_MARKER_TRIG_RCU_PRS_SET_RST_M BIT(0) +#define GL_PRS_MARKER_ERROR 0x00200204 /* Reset Source: CORER */ +#define GL_PRS_MARKER_ERROR_XLR_CFG_ERR_S 0 +#define GL_PRS_MARKER_ERROR_XLR_CFG_ERR_M BIT(0) +#define GL_PRS_MARKER_ERROR_QH_CFG_ERR_S 1 +#define GL_PRS_MARKER_ERROR_QH_CFG_ERR_M BIT(1) +#define GL_PRS_MARKER_ERROR_COTF_CFG_ERR_S 2 +#define GL_PRS_MARKER_ERROR_COTF_CFG_ERR_M BIT(2) +#define GL_PRS_RX_PIPE_INIT0(_i) (0x0020000C + ((_i) * 4)) /* _i=0...6 */ /* Reset Source: CORER */ +#define GL_PRS_RX_PIPE_INIT0_MAX_INDEX 6 +#define GL_PRS_RX_PIPE_INIT0_GPCSR_INIT_S 0 +#define GL_PRS_RX_PIPE_INIT0_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_RX_PIPE_INIT1 0x00200028 /* Reset Source: CORER */ +#define GL_PRS_RX_PIPE_INIT1_GPCSR_INIT_S 0 +#define GL_PRS_RX_PIPE_INIT1_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_RX_PIPE_INIT2 0x0020002C /* Reset Source: CORER */ +#define GL_PRS_RX_PIPE_INIT2_GPCSR_INIT_S 0 +#define GL_PRS_RX_PIPE_INIT2_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_RX_SIZE_CTRL 0x00200004 /* Reset Source: CORER */ +#define GL_PRS_RX_SIZE_CTRL_MIN_SIZE_S 0 +#define GL_PRS_RX_SIZE_CTRL_MIN_SIZE_M ICE_M(0x3FF, 0) +#define GL_PRS_RX_SIZE_CTRL_MIN_SIZE_EN_S 15 +#define GL_PRS_RX_SIZE_CTRL_MIN_SIZE_EN_M BIT(15) +#define GL_PRS_RX_SIZE_CTRL_MAX_SIZE_S 16 +#define GL_PRS_RX_SIZE_CTRL_MAX_SIZE_M ICE_M(0x3FF, 16) +#define GL_PRS_RX_SIZE_CTRL_MAX_SIZE_EN_S 31 +#define GL_PRS_RX_SIZE_CTRL_MAX_SIZE_EN_M BIT(31) +#define GL_PRS_TX_PIPE_INIT0(_i) (0x00202018 + ((_i) * 4)) /* _i=0...6 */ /* Reset Source: CORER */ +#define GL_PRS_TX_PIPE_INIT0_MAX_INDEX 6 +#define GL_PRS_TX_PIPE_INIT0_GPCSR_INIT_S 0 +#define GL_PRS_TX_PIPE_INIT0_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_TX_PIPE_INIT1 0x00202034 /* Reset Source: CORER */ +#define GL_PRS_TX_PIPE_INIT1_GPCSR_INIT_S 0 +#define GL_PRS_TX_PIPE_INIT1_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_TX_PIPE_INIT2 0x00202038 /* Reset Source: CORER */ +#define GL_PRS_TX_PIPE_INIT2_GPCSR_INIT_S 0 +#define GL_PRS_TX_PIPE_INIT2_GPCSR_INIT_M ICE_M(0xFFFF, 0) +#define GL_PRS_TX_SIZE_CTRL 0x00202014 /* Reset Source: CORER */ 
+#define GL_PRS_TX_SIZE_CTRL_MIN_SIZE_S 0 +#define GL_PRS_TX_SIZE_CTRL_MIN_SIZE_M ICE_M(0x3FF, 0) +#define GL_PRS_TX_SIZE_CTRL_MIN_SIZE_EN_S 15 +#define GL_PRS_TX_SIZE_CTRL_MIN_SIZE_EN_M BIT(15) +#define GL_PRS_TX_SIZE_CTRL_MAX_SIZE_S 16 +#define GL_PRS_TX_SIZE_CTRL_MAX_SIZE_M ICE_M(0x3FF, 16) +#define GL_PRS_TX_SIZE_CTRL_MAX_SIZE_EN_S 31 +#define GL_PRS_TX_SIZE_CTRL_MAX_SIZE_EN_M BIT(31) +#define GL_QH_MARKER_STATUS 0x002001FC /* Reset Source: CORER */ +#define GL_QH_MARKER_STATUS_MRKR_BUSY_S 0 +#define GL_QH_MARKER_STATUS_MRKR_BUSY_M ICE_M(0xF, 0) +#define GL_QH_MARKER_TRIG_RCU_PRS(_i) (0x002001C4 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GL_QH_MARKER_TRIG_RCU_PRS_MAX_INDEX 3 +#define GL_QH_MARKER_TRIG_RCU_PRS_QPID_S 0 +#define GL_QH_MARKER_TRIG_RCU_PRS_QPID_M ICE_M(0x3FFFF, 0) +#define GL_QH_MARKER_TRIG_RCU_PRS_PE_TAG_S 18 +#define GL_QH_MARKER_TRIG_RCU_PRS_PE_TAG_M ICE_M(0xFF, 18) +#define GL_QH_MARKER_TRIG_RCU_PRS_PORT_NUM_S 26 +#define GL_QH_MARKER_TRIG_RCU_PRS_PORT_NUM_M ICE_M(0x7, 26) +#define GL_QH_MARKER_TRIG_RCU_PRS_SET_RST_S 31 +#define GL_QH_MARKER_TRIG_RCU_PRS_SET_RST_M BIT(31) +#define GL_RPRS_ANA_CSR_CTRL 0x00200708 /* Reset Source: CORER */ +#define GL_RPRS_ANA_CSR_CTRL_SELECT_EN_S 0 +#define GL_RPRS_ANA_CSR_CTRL_SELECT_EN_M BIT(0) +#define GL_RPRS_ANA_CSR_CTRL_SELECTED_ANA_S 1 +#define GL_RPRS_ANA_CSR_CTRL_SELECTED_ANA_M BIT(1) +#define GL_TPRS_ANA_CSR_CTRL 0x00202100 /* Reset Source: CORER */ +#define GL_TPRS_ANA_CSR_CTRL_SELECT_EN_S 0 +#define GL_TPRS_ANA_CSR_CTRL_SELECT_EN_M BIT(0) +#define GL_TPRS_ANA_CSR_CTRL_SELECTED_ANA_S 1 +#define GL_TPRS_ANA_CSR_CTRL_SELECTED_ANA_M BIT(1) +#define GL_TPRS_MNG_PM_THR 0x00202004 /* Reset Source: CORER */ +#define GL_TPRS_MNG_PM_THR_MNG_PM_THR_S 0 +#define GL_TPRS_MNG_PM_THR_MNG_PM_THR_M ICE_M(0x3FFF, 0) +#define GL_TPRS_PM_CNT(_i) (0x00202008 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GL_TPRS_PM_CNT_MAX_INDEX 1 +#define GL_TPRS_PM_CNT_GL_PRS_PM_CNT_S 0 +#define GL_TPRS_PM_CNT_GL_PRS_PM_CNT_M ICE_M(0x3FFF, 0) +#define GL_TPRS_PM_THR 0x00202000 /* Reset Source: CORER */ +#define GL_TPRS_PM_THR_PM_THR_S 0 +#define GL_TPRS_PM_THR_PM_THR_M ICE_M(0x3FFF, 0) +#define GL_XLR_MARKER_LOG_RCU_PRS(_i) (0x00200208 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_XLR_MARKER_LOG_RCU_PRS_MAX_INDEX 63 +#define GL_XLR_MARKER_LOG_RCU_PRS_XLR_TRIG_S 0 +#define GL_XLR_MARKER_LOG_RCU_PRS_XLR_TRIG_M ICE_M(0xFFFFFFFF, 0) +#define GL_XLR_MARKER_STATUS(_i) (0x002001F4 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GL_XLR_MARKER_STATUS_MAX_INDEX 1 +#define GL_XLR_MARKER_STATUS_MRKR_BUSY_S 0 +#define GL_XLR_MARKER_STATUS_MRKR_BUSY_M ICE_M(0xFFFFFFFF, 0) +#define GL_XLR_MARKER_TRIG_PE 0x005008C0 /* Reset Source: CORER */ +#define GL_XLR_MARKER_TRIG_PE_VM_VF_NUM_S 0 +#define GL_XLR_MARKER_TRIG_PE_VM_VF_NUM_M ICE_M(0x3FF, 0) +#define GL_XLR_MARKER_TRIG_PE_VM_VF_TYPE_S 10 +#define GL_XLR_MARKER_TRIG_PE_VM_VF_TYPE_M ICE_M(0x3, 10) +#define GL_XLR_MARKER_TRIG_PE_PF_NUM_S 12 +#define GL_XLR_MARKER_TRIG_PE_PF_NUM_M ICE_M(0x7, 12) +#define GL_XLR_MARKER_TRIG_PE_PORT_NUM_S 16 +#define GL_XLR_MARKER_TRIG_PE_PORT_NUM_M ICE_M(0x7, 16) +#define GL_XLR_MARKER_TRIG_RCU_PRS 0x002001C0 /* Reset Source: CORER */ +#define GL_XLR_MARKER_TRIG_RCU_PRS_VM_VF_NUM_S 0 +#define GL_XLR_MARKER_TRIG_RCU_PRS_VM_VF_NUM_M ICE_M(0x3FF, 0) +#define GL_XLR_MARKER_TRIG_RCU_PRS_VM_VF_TYPE_S 10 +#define GL_XLR_MARKER_TRIG_RCU_PRS_VM_VF_TYPE_M ICE_M(0x3, 10) +#define GL_XLR_MARKER_TRIG_RCU_PRS_PF_NUM_S 12 +#define 
GL_XLR_MARKER_TRIG_RCU_PRS_PF_NUM_M ICE_M(0x7, 12) +#define GL_XLR_MARKER_TRIG_RCU_PRS_PORT_NUM_S 16 +#define GL_XLR_MARKER_TRIG_RCU_PRS_PORT_NUM_M ICE_M(0x7, 16) +#define GL_CLKGATE_EVENTS 0x0009DE70 /* Reset Source: PERST */ +#define GL_CLKGATE_EVENTS_PRIMARY_CLKGATE_EVENTS_S 0 +#define GL_CLKGATE_EVENTS_PRIMARY_CLKGATE_EVENTS_M ICE_M(0xFFFF, 0) +#define GL_CLKGATE_EVENTS_SIDEBAND_CLKGATE_EVENTS_S 16 +#define GL_CLKGATE_EVENTS_SIDEBAND_CLKGATE_EVENTS_M ICE_M(0xFFFF, 16) +#define GLPCI_BYTCTH_NP_C 0x000BFDA8 /* Reset Source: PCIR */ +#define GLPCI_BYTCTH_NP_C_PCI_COUNT_BW_BCT_S 0 +#define GLPCI_BYTCTH_NP_C_PCI_COUNT_BW_BCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_BYTCTH_P 0x0009E970 /* Reset Source: PCIR */ +#define GLPCI_BYTCTH_P_PCI_COUNT_BW_BCT_S 0 +#define GLPCI_BYTCTH_P_PCI_COUNT_BW_BCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_BYTCTL_NP_C 0x000BFDAC /* Reset Source: PCIR */ +#define GLPCI_BYTCTL_NP_C_PCI_COUNT_BW_BCT_S 0 +#define GLPCI_BYTCTL_NP_C_PCI_COUNT_BW_BCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_BYTCTL_P 0x0009E994 /* Reset Source: PCIR */ +#define GLPCI_BYTCTL_P_PCI_COUNT_BW_BCT_S 0 +#define GLPCI_BYTCTL_P_PCI_COUNT_BW_BCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_CAPCTRL 0x0009DE88 /* Reset Source: PCIR */ +#define GLPCI_CAPCTRL_VPD_EN_S 0 +#define GLPCI_CAPCTRL_VPD_EN_M BIT(0) +#define GLPCI_CAPSUP 0x0009DE8C /* Reset Source: PCIR */ +#define GLPCI_CAPSUP_PCIE_VER_S 0 +#define GLPCI_CAPSUP_PCIE_VER_M BIT(0) +#define GLPCI_CAPSUP_RESERVED_2_S 1 +#define GLPCI_CAPSUP_RESERVED_2_M BIT(1) +#define GLPCI_CAPSUP_LTR_EN_S 2 +#define GLPCI_CAPSUP_LTR_EN_M BIT(2) +#define GLPCI_CAPSUP_TPH_EN_S 3 +#define GLPCI_CAPSUP_TPH_EN_M BIT(3) +#define GLPCI_CAPSUP_ARI_EN_S 4 +#define GLPCI_CAPSUP_ARI_EN_M BIT(4) +#define GLPCI_CAPSUP_IOV_EN_S 5 +#define GLPCI_CAPSUP_IOV_EN_M BIT(5) +#define GLPCI_CAPSUP_ACS_EN_S 6 +#define GLPCI_CAPSUP_ACS_EN_M BIT(6) +#define GLPCI_CAPSUP_SEC_EN_S 7 +#define GLPCI_CAPSUP_SEC_EN_M BIT(7) +#define GLPCI_CAPSUP_PASID_EN_S 8 +#define GLPCI_CAPSUP_PASID_EN_M BIT(8) +#define GLPCI_CAPSUP_DLFE_EN_S 9 +#define GLPCI_CAPSUP_DLFE_EN_M BIT(9) +#define GLPCI_CAPSUP_GEN4_EXT_EN_S 10 +#define GLPCI_CAPSUP_GEN4_EXT_EN_M BIT(10) +#define GLPCI_CAPSUP_GEN4_MARG_EN_S 11 +#define GLPCI_CAPSUP_GEN4_MARG_EN_M BIT(11) +#define GLPCI_CAPSUP_ECRC_GEN_EN_S 16 +#define GLPCI_CAPSUP_ECRC_GEN_EN_M BIT(16) +#define GLPCI_CAPSUP_ECRC_CHK_EN_S 17 +#define GLPCI_CAPSUP_ECRC_CHK_EN_M BIT(17) +#define GLPCI_CAPSUP_IDO_EN_S 18 +#define GLPCI_CAPSUP_IDO_EN_M BIT(18) +#define GLPCI_CAPSUP_MSI_MASK_S 19 +#define GLPCI_CAPSUP_MSI_MASK_M BIT(19) +#define GLPCI_CAPSUP_CSR_CONF_EN_S 20 +#define GLPCI_CAPSUP_CSR_CONF_EN_M BIT(20) +#define GLPCI_CAPSUP_WAKUP_EN_S 21 +#define GLPCI_CAPSUP_WAKUP_EN_M BIT(21) +#define GLPCI_CAPSUP_LOAD_SUBSYS_ID_S 30 +#define GLPCI_CAPSUP_LOAD_SUBSYS_ID_M BIT(30) +#define GLPCI_CAPSUP_LOAD_DEV_ID_S 31 +#define GLPCI_CAPSUP_LOAD_DEV_ID_M BIT(31) +#define GLPCI_CNF 0x0009DEA0 /* Reset Source: POR */ +#define GLPCI_CNF_FLEX10_S 1 +#define GLPCI_CNF_FLEX10_M BIT(1) +#define GLPCI_CNF_WAKE_PIN_EN_S 2 +#define GLPCI_CNF_WAKE_PIN_EN_M BIT(2) +#define GLPCI_CNF_MSIX_ECC_BLOCK_DISABLE_S 3 +#define GLPCI_CNF_MSIX_ECC_BLOCK_DISABLE_M BIT(3) +#define GLPCI_CNF2 0x000BE004 /* Reset Source: PCIR */ +#define GLPCI_CNF2_RO_DIS_S 0 +#define GLPCI_CNF2_RO_DIS_M BIT(0) +#define GLPCI_CNF2_CACHELINE_SIZE_S 1 #define GLPCI_CNF2_CACHELINE_SIZE_M BIT(1) -#define PF_FUNC_RID 0x0009E880 -#define PF_FUNC_RID_FUNC_NUM_S 0 -#define PF_FUNC_RID_FUNC_NUM_M ICE_M(0x7, 0) -#define PF_PCI_CIAA 0x0009E580 +#define 
GLPCI_DREVID 0x0009E9AC /* Reset Source: PCIR */ +#define GLPCI_DREVID_DEFAULT_REVID_S 0 +#define GLPCI_DREVID_DEFAULT_REVID_M ICE_M(0xFF, 0) +#define GLPCI_GSCL_1_NP_C 0x000BFDA4 /* Reset Source: PCIR */ +#define GLPCI_GSCL_1_NP_C_RT_MODE_S 8 +#define GLPCI_GSCL_1_NP_C_RT_MODE_M BIT(8) +#define GLPCI_GSCL_1_NP_C_RT_EVENT_S 9 +#define GLPCI_GSCL_1_NP_C_RT_EVENT_M ICE_M(0x1F, 9) +#define GLPCI_GSCL_1_NP_C_PCI_COUNT_BW_EN_S 14 +#define GLPCI_GSCL_1_NP_C_PCI_COUNT_BW_EN_M BIT(14) +#define GLPCI_GSCL_1_NP_C_PCI_COUNT_BW_EV_S 15 +#define GLPCI_GSCL_1_NP_C_PCI_COUNT_BW_EV_M ICE_M(0x1F, 15) +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_RESET_S 29 +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_RESET_M BIT(29) +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_STOP_S 30 +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_STOP_M BIT(30) +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_START_S 31 +#define GLPCI_GSCL_1_NP_C_GIO_COUNT_START_M BIT(31) +#define GLPCI_GSCL_1_P 0x0009E9B4 /* Reset Source: PCIR */ +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_0_S 0 +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_0_M BIT(0) +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_1_S 1 +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_1_M BIT(1) +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_2_S 2 +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_2_M BIT(2) +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_3_S 3 +#define GLPCI_GSCL_1_P_GIO_COUNT_EN_3_M BIT(3) +#define GLPCI_GSCL_1_P_LBC_ENABLE_0_S 4 +#define GLPCI_GSCL_1_P_LBC_ENABLE_0_M BIT(4) +#define GLPCI_GSCL_1_P_LBC_ENABLE_1_S 5 +#define GLPCI_GSCL_1_P_LBC_ENABLE_1_M BIT(5) +#define GLPCI_GSCL_1_P_LBC_ENABLE_2_S 6 +#define GLPCI_GSCL_1_P_LBC_ENABLE_2_M BIT(6) +#define GLPCI_GSCL_1_P_LBC_ENABLE_3_S 7 +#define GLPCI_GSCL_1_P_LBC_ENABLE_3_M BIT(7) +#define GLPCI_GSCL_1_P_PCI_COUNT_BW_EN_S 14 +#define GLPCI_GSCL_1_P_PCI_COUNT_BW_EN_M BIT(14) +#define GLPCI_GSCL_1_P_GIO_64_BIT_EN_S 28 +#define GLPCI_GSCL_1_P_GIO_64_BIT_EN_M BIT(28) +#define GLPCI_GSCL_1_P_GIO_COUNT_RESET_S 29 +#define GLPCI_GSCL_1_P_GIO_COUNT_RESET_M BIT(29) +#define GLPCI_GSCL_1_P_GIO_COUNT_STOP_S 30 +#define GLPCI_GSCL_1_P_GIO_COUNT_STOP_M BIT(30) +#define GLPCI_GSCL_1_P_GIO_COUNT_START_S 31 +#define GLPCI_GSCL_1_P_GIO_COUNT_START_M BIT(31) +#define GLPCI_GSCL_2 0x0009E998 /* Reset Source: PCIR */ +#define GLPCI_GSCL_2_GIO_EVENT_NUM_0_S 0 +#define GLPCI_GSCL_2_GIO_EVENT_NUM_0_M ICE_M(0xFF, 0) +#define GLPCI_GSCL_2_GIO_EVENT_NUM_1_S 8 +#define GLPCI_GSCL_2_GIO_EVENT_NUM_1_M ICE_M(0xFF, 8) +#define GLPCI_GSCL_2_GIO_EVENT_NUM_2_S 16 +#define GLPCI_GSCL_2_GIO_EVENT_NUM_2_M ICE_M(0xFF, 16) +#define GLPCI_GSCL_2_GIO_EVENT_NUM_3_S 24 +#define GLPCI_GSCL_2_GIO_EVENT_NUM_3_M ICE_M(0xFF, 24) +#define GLPCI_GSCL_5_8(_i) (0x0009E954 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: PCIR */ +#define GLPCI_GSCL_5_8_MAX_INDEX 3 +#define GLPCI_GSCL_5_8_LBC_THRESHOLD_N_S 0 +#define GLPCI_GSCL_5_8_LBC_THRESHOLD_N_M ICE_M(0xFFFF, 0) +#define GLPCI_GSCL_5_8_LBC_TIMER_N_S 16 +#define GLPCI_GSCL_5_8_LBC_TIMER_N_M ICE_M(0xFFFF, 16) +#define GLPCI_GSCN_0_3(_i) (0x0009E99C + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: PCIR */ +#define GLPCI_GSCN_0_3_MAX_INDEX 3 +#define GLPCI_GSCN_0_3_EVENT_COUNTER_S 0 +#define GLPCI_GSCN_0_3_EVENT_COUNTER_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_LATCT_NP_C 0x000BFDA0 /* Reset Source: PCIR */ +#define GLPCI_LATCT_NP_C_PCI_LATENCY_COUNT_S 0 +#define GLPCI_LATCT_NP_C_PCI_LATENCY_COUNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_LBARCTRL 0x0009DE74 /* Reset Source: POR */ +#define GLPCI_LBARCTRL_PREFBAR_S 0 +#define GLPCI_LBARCTRL_PREFBAR_M BIT(0) +#define GLPCI_LBARCTRL_BAR32_S 1 +#define GLPCI_LBARCTRL_BAR32_M BIT(1) +#define 
GLPCI_LBARCTRL_PAGES_SPACE_EN_PF_S 2 +#define GLPCI_LBARCTRL_PAGES_SPACE_EN_PF_M BIT(2) +#define GLPCI_LBARCTRL_FLASH_EXPOSE_S 3 +#define GLPCI_LBARCTRL_FLASH_EXPOSE_M BIT(3) +#define GLPCI_LBARCTRL_PE_DB_SIZE_S 4 +#define GLPCI_LBARCTRL_PE_DB_SIZE_M ICE_M(0x3, 4) +#define GLPCI_LBARCTRL_PAGES_SPACE_EN_VF_S 9 +#define GLPCI_LBARCTRL_PAGES_SPACE_EN_VF_M BIT(9) +#define GLPCI_LBARCTRL_EXROM_SIZE_S 11 +#define GLPCI_LBARCTRL_EXROM_SIZE_M ICE_M(0x7, 11) +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_S 14 +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_M ICE_M(0x3, 14) +#define GLPCI_LINKCAP 0x0009DE90 /* Reset Source: PCIR */ +#define GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_S 0 +#define GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_M ICE_M(0x3F, 0) +#define GLPCI_LINKCAP_MAX_LINK_WIDTH_S 9 +#define GLPCI_LINKCAP_MAX_LINK_WIDTH_M ICE_M(0xF, 9) +#define GLPCI_NPQ_CFG 0x000BFD80 /* Reset Source: PCIR */ +#define GLPCI_NPQ_CFG_EXTEND_TO_S 0 +#define GLPCI_NPQ_CFG_EXTEND_TO_M BIT(0) +#define GLPCI_NPQ_CFG_SMALL_TO_S 1 +#define GLPCI_NPQ_CFG_SMALL_TO_M BIT(1) +#define GLPCI_NPQ_CFG_WEIGHT_AVG_S 2 +#define GLPCI_NPQ_CFG_WEIGHT_AVG_M ICE_M(0xF, 2) +#define GLPCI_NPQ_CFG_NPQ_SPARE_S 6 +#define GLPCI_NPQ_CFG_NPQ_SPARE_M ICE_M(0x3FF, 6) +#define GLPCI_NPQ_CFG_NPQ_ERR_STAT_S 16 +#define GLPCI_NPQ_CFG_NPQ_ERR_STAT_M ICE_M(0xF, 16) +#define GLPCI_PKTCT_NP_C 0x000BFD9C /* Reset Source: PCIR */ +#define GLPCI_PKTCT_NP_C_PCI_COUNT_BW_PCT_S 0 +#define GLPCI_PKTCT_NP_C_PCI_COUNT_BW_PCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_PKTCT_P 0x0009E9B0 /* Reset Source: PCIR */ +#define GLPCI_PKTCT_P_PCI_COUNT_BW_PCT_S 0 +#define GLPCI_PKTCT_P_PCI_COUNT_BW_PCT_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_PMSUP 0x0009DE94 /* Reset Source: PCIR */ +#define GLPCI_PMSUP_RESERVED_0_S 0 +#define GLPCI_PMSUP_RESERVED_0_M ICE_M(0x3, 0) +#define GLPCI_PMSUP_RESERVED_1_S 2 +#define GLPCI_PMSUP_RESERVED_1_M ICE_M(0x7, 2) +#define GLPCI_PMSUP_RESERVED_2_S 5 +#define GLPCI_PMSUP_RESERVED_2_M ICE_M(0x7, 5) +#define GLPCI_PMSUP_L0S_ACC_LAT_S 8 +#define GLPCI_PMSUP_L0S_ACC_LAT_M ICE_M(0x7, 8) +#define GLPCI_PMSUP_L1_ACC_LAT_S 11 +#define GLPCI_PMSUP_L1_ACC_LAT_M ICE_M(0x7, 11) +#define GLPCI_PMSUP_RESERVED_3_S 14 +#define GLPCI_PMSUP_RESERVED_3_M BIT(14) +#define GLPCI_PMSUP_OBFF_SUP_S 15 +#define GLPCI_PMSUP_OBFF_SUP_M ICE_M(0x3, 15) +#define GLPCI_PUSH_PE_IF_TO_STATUS 0x0009DF44 /* Reset Source: PCIR */ +#define GLPCI_PUSH_PE_IF_TO_STATUS_GLPCI_PUSH_PE_IF_TO_STATUS_S 0 +#define GLPCI_PUSH_PE_IF_TO_STATUS_GLPCI_PUSH_PE_IF_TO_STATUS_M BIT(0) +#define GLPCI_PWRDATA 0x0009DE7C /* Reset Source: PCIR */ +#define GLPCI_PWRDATA_D0_POWER_S 0 +#define GLPCI_PWRDATA_D0_POWER_M ICE_M(0xFF, 0) +#define GLPCI_PWRDATA_COMM_POWER_S 8 +#define GLPCI_PWRDATA_COMM_POWER_M ICE_M(0xFF, 8) +#define GLPCI_PWRDATA_D3_POWER_S 16 +#define GLPCI_PWRDATA_D3_POWER_M ICE_M(0xFF, 16) +#define GLPCI_PWRDATA_DATA_SCALE_S 24 +#define GLPCI_PWRDATA_DATA_SCALE_M ICE_M(0x3, 24) +#define GLPCI_REVID 0x0009DE98 /* Reset Source: PCIR */ +#define GLPCI_REVID_NVM_REVID_S 0 +#define GLPCI_REVID_NVM_REVID_M ICE_M(0xFF, 0) +#define GLPCI_SERH 0x0009DE84 /* Reset Source: PCIR */ +#define GLPCI_SERH_SER_NUM_H_S 0 +#define GLPCI_SERH_SER_NUM_H_M ICE_M(0xFFFF, 0) +#define GLPCI_SERL 0x0009DE80 /* Reset Source: PCIR */ +#define GLPCI_SERL_SER_NUM_L_S 0 +#define GLPCI_SERL_SER_NUM_L_M ICE_M(0xFFFFFFFF, 0) +#define GLPCI_SUBVENID 0x0009DEE8 /* Reset Source: PCIR */ +#define GLPCI_SUBVENID_SUB_VEN_ID_S 0 +#define GLPCI_SUBVENID_SUB_VEN_ID_M ICE_M(0xFFFF, 0) +#define GLPCI_UPADD 0x000BE0D4 /* Reset Source: PCIR */ +#define 
+#define GLPCI_UPADD_ADDRESS_S 1
+#define GLPCI_UPADD_ADDRESS_M ICE_M(0x7FFFFFFF, 1)
+#define GLPCI_VENDORID 0x0009DEC8 /* Reset Source: PCIR */
+#define GLPCI_VENDORID_VENDORID_S 0
+#define GLPCI_VENDORID_VENDORID_M ICE_M(0xFFFF, 0)
+#define GLPCI_VFSUP 0x0009DE9C /* Reset Source: PCIR */
+#define GLPCI_VFSUP_VF_PREFETCH_S 0
+#define GLPCI_VFSUP_VF_PREFETCH_M BIT(0)
+#define GLPCI_VFSUP_VR_BAR_TYPE_S 1
+#define GLPCI_VFSUP_VR_BAR_TYPE_M BIT(1)
+#define GLPCI_WATMK_CLNT_PIPEMON 0x000BFD90 /* Reset Source: PCIR */
+#define GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_S 0
+#define GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_M ICE_M(0xFFFF, 0)
+#define PF_FUNC_RID 0x0009E880 /* Reset Source: PCIR */
+#define PF_FUNC_RID_FUNCTION_NUMBER_S 0
+#define PF_FUNC_RID_FUNCTION_NUMBER_M ICE_M(0x7, 0)
+#define PF_FUNC_RID_DEVICE_NUMBER_S 3
+#define PF_FUNC_RID_DEVICE_NUMBER_M ICE_M(0x1F, 3)
+#define PF_FUNC_RID_BUS_NUMBER_S 8
+#define PF_FUNC_RID_BUS_NUMBER_M ICE_M(0xFF, 8)
+#define PF_PCI_CIAA 0x0009E580 /* Reset Source: FLR */
+#define PF_PCI_CIAA_ADDRESS_S 0
+#define PF_PCI_CIAA_ADDRESS_M ICE_M(0xFFF, 0)
 #define PF_PCI_CIAA_VF_NUM_S 12
-#define PF_PCI_CIAD 0x0009E500
-#define GL_PWR_MODE_CTL 0x000B820C
+#define PF_PCI_CIAA_VF_NUM_M ICE_M(0xFF, 12)
+#define PF_PCI_CIAD 0x0009E500 /* Reset Source: FLR */
+#define PF_PCI_CIAD_DATA_S 0
+#define PF_PCI_CIAD_DATA_M ICE_M(0xFFFFFFFF, 0)
+#define PFPCI_CLASS 0x0009DB00 /* Reset Source: PCIR */
+#define PFPCI_CLASS_STORAGE_CLASS_S 0
+#define PFPCI_CLASS_STORAGE_CLASS_M BIT(0)
+#define PFPCI_CLASS_PF_IS_LAN_S 2
+#define PFPCI_CLASS_PF_IS_LAN_M BIT(2)
+#define PFPCI_CNF 0x0009DF00 /* Reset Source: PCIR */
+#define PFPCI_CNF_MSI_EN_S 2
+#define PFPCI_CNF_MSI_EN_M BIT(2)
+#define PFPCI_CNF_EXROM_DIS_S 3
+#define PFPCI_CNF_EXROM_DIS_M BIT(3)
+#define PFPCI_CNF_IO_BAR_S 4
+#define PFPCI_CNF_IO_BAR_M BIT(4)
+#define PFPCI_CNF_INT_PIN_S 5
+#define PFPCI_CNF_INT_PIN_M ICE_M(0x3, 5)
+#define PFPCI_DEVID 0x0009DE00 /* Reset Source: PCIR */
+#define PFPCI_DEVID_PF_DEV_ID_S 0
+#define PFPCI_DEVID_PF_DEV_ID_M ICE_M(0xFFFF, 0)
+#define PFPCI_DEVID_VF_DEV_ID_S 16
+#define PFPCI_DEVID_VF_DEV_ID_M ICE_M(0xFFFF, 16)
+#define PFPCI_FACTPS 0x0009E900 /* Reset Source: FLR */
+#define PFPCI_FACTPS_FUNC_POWER_STATE_S 0
+#define PFPCI_FACTPS_FUNC_POWER_STATE_M ICE_M(0x3, 0)
+#define PFPCI_FACTPS_FUNC_AUX_EN_S 3
+#define PFPCI_FACTPS_FUNC_AUX_EN_M BIT(3)
+#define PFPCI_FUNC 0x0009D980 /* Reset Source: POR */
+#define PFPCI_FUNC_FUNC_DIS_S 0
+#define PFPCI_FUNC_FUNC_DIS_M BIT(0)
+#define PFPCI_FUNC_ALLOW_FUNC_DIS_S 1
+#define PFPCI_FUNC_ALLOW_FUNC_DIS_M BIT(1)
+#define PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_S 2
+#define PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_M BIT(2)
+#define PFPCI_PF_FLUSH_DONE 0x0009E400 /* Reset Source: PCIR */
+#define PFPCI_PF_FLUSH_DONE_FLUSH_DONE_S 0
+#define PFPCI_PF_FLUSH_DONE_FLUSH_DONE_M BIT(0)
+#define PFPCI_PM 0x0009DA80 /* Reset Source: POR */
+#define PFPCI_PM_PME_EN_S 0
+#define PFPCI_PM_PME_EN_M BIT(0)
+#define PFPCI_STATUS1 0x0009DA00 /* Reset Source: POR */
+#define PFPCI_STATUS1_FUNC_VALID_S 0
+#define PFPCI_STATUS1_FUNC_VALID_M BIT(0)
+#define PFPCI_SUBSYSID 0x0009D880 /* Reset Source: PCIR */
+#define PFPCI_SUBSYSID_PF_SUBSYS_ID_S 0
+#define PFPCI_SUBSYSID_PF_SUBSYS_ID_M ICE_M(0xFFFF, 0)
+#define PFPCI_SUBSYSID_VF_SUBSYS_ID_S 16
+#define PFPCI_SUBSYSID_VF_SUBSYS_ID_M ICE_M(0xFFFF, 16)
+#define PFPCI_VF_FLUSH_DONE(_VF) (0x0009E000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PCIR */
+#define PFPCI_VF_FLUSH_DONE_MAX_INDEX 255
+#define PFPCI_VF_FLUSH_DONE_FLUSH_DONE_S 0
+#define PFPCI_VF_FLUSH_DONE_FLUSH_DONE_M BIT(0)
+#define PFPCI_VM_FLUSH_DONE 0x0009E480 /* Reset Source: PCIR */
+#define PFPCI_VM_FLUSH_DONE_FLUSH_DONE_S 0
+#define PFPCI_VM_FLUSH_DONE_FLUSH_DONE_M BIT(0)
+#define PFPCI_VMINDEX 0x0009E600 /* Reset Source: PCIR */
+#define PFPCI_VMINDEX_VMINDEX_S 0
+#define PFPCI_VMINDEX_VMINDEX_M ICE_M(0x3FF, 0)
+#define PFPCI_VMPEND 0x0009E800 /* Reset Source: PCIR */
+#define PFPCI_VMPEND_PENDING_S 0
+#define PFPCI_VMPEND_PENDING_M BIT(0)
+#define PQ_FIFO_STATUS 0x0009DF40 /* Reset Source: PCIR */
+#define PQ_FIFO_STATUS_PQ_FIFO_COUNT_S 0
+#define PQ_FIFO_STATUS_PQ_FIFO_COUNT_M ICE_M(0x7FFFFFFF, 0)
+#define PQ_FIFO_STATUS_PQ_FIFO_EMPTY_S 31
+#define PQ_FIFO_STATUS_PQ_FIFO_EMPTY_M BIT(31)
+#define GLPE_CPUSTATUS0 0x0050BA5C /* Reset Source: CORER */
+#define GLPE_CPUSTATUS0_PECPUSTATUS0_S 0
+#define GLPE_CPUSTATUS0_PECPUSTATUS0_M ICE_M(0xFFFFFFFF, 0)
+#define GLPE_CPUSTATUS1 0x0050BA60 /* Reset Source: CORER */
+#define GLPE_CPUSTATUS1_PECPUSTATUS1_S 0
+#define GLPE_CPUSTATUS1_PECPUSTATUS1_M ICE_M(0xFFFFFFFF, 0)
+#define GLPE_CPUSTATUS2 0x0050BA64 /* Reset Source: CORER */
+#define GLPE_CPUSTATUS2_PECPUSTATUS2_S 0
+#define GLPE_CPUSTATUS2_PECPUSTATUS2_M ICE_M(0xFFFFFFFF, 0)
+#define GLPE_MDQ_BASE(_i) (0x00536000 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLPE_MDQ_BASE_MAX_INDEX 511
+#define GLPE_MDQ_BASE_MDOC_INDEX_S 0
+#define GLPE_MDQ_BASE_MDOC_INDEX_M ICE_M(0xFFFFFFF, 0)
+#define GLPE_MDQ_PTR(_i) (0x00537000 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLPE_MDQ_PTR_MAX_INDEX 511
+#define GLPE_MDQ_PTR_MDQ_HEAD_S 0
+#define GLPE_MDQ_PTR_MDQ_HEAD_M ICE_M(0x3FFF, 0)
+#define GLPE_MDQ_PTR_MDQ_TAIL_S 16
+#define GLPE_MDQ_PTR_MDQ_TAIL_M ICE_M(0x3FFF, 16)
+#define GLPE_MDQ_SIZE(_i) (0x00536800 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLPE_MDQ_SIZE_MAX_INDEX 511
+#define GLPE_MDQ_SIZE_MDQ_SIZE_S 0
+#define GLPE_MDQ_SIZE_MDQ_SIZE_M ICE_M(0x3FFF, 0)
+#define GLPE_PEPM_CTRL 0x0050C000 /* Reset Source: PERST */
+#define GLPE_PEPM_CTRL_PEPM_ENABLE_S 0
+#define GLPE_PEPM_CTRL_PEPM_ENABLE_M BIT(0)
+#define GLPE_PEPM_CTRL_PEPM_HALT_S 8
+#define GLPE_PEPM_CTRL_PEPM_HALT_M BIT(8)
+#define GLPE_PEPM_CTRL_PEPM_PUSH_MARGIN_S 16
+#define GLPE_PEPM_CTRL_PEPM_PUSH_MARGIN_M ICE_M(0xFF, 16)
+#define GLPE_PEPM_DEALLOC 0x0050C004 /* Reset Source: PERST */
+#define GLPE_PEPM_DEALLOC_MDQ_CREDITS_S 0
+#define GLPE_PEPM_DEALLOC_MDQ_CREDITS_M ICE_M(0x3FFF, 0)
+#define GLPE_PEPM_DEALLOC_PSQ_CREDITS_S 14
+#define GLPE_PEPM_DEALLOC_PSQ_CREDITS_M ICE_M(0x1F, 14)
+#define GLPE_PEPM_DEALLOC_PQID_S 19
+#define GLPE_PEPM_DEALLOC_PQID_M ICE_M(0x1FF, 19)
+#define GLPE_PEPM_DEALLOC_PORT_S 28
+#define GLPE_PEPM_DEALLOC_PORT_M ICE_M(0x7, 28)
+#define GLPE_PEPM_DEALLOC_DEALLOC_RDY_S 31
+#define GLPE_PEPM_DEALLOC_DEALLOC_RDY_M BIT(31)
+#define GLPE_PEPM_PSQ_COUNT 0x0050C020 /* Reset Source: PERST */
+#define GLPE_PEPM_PSQ_COUNT_PEPM_PSQ_COUNT_S 0
+#define GLPE_PEPM_PSQ_COUNT_PEPM_PSQ_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_PEPM_THRESH(_i) (0x0050C840 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: PERST */
+#define GLPE_PEPM_THRESH_MAX_INDEX 511
+#define GLPE_PEPM_THRESH_PEPM_PSQ_THRESH_S 0
+#define GLPE_PEPM_THRESH_PEPM_PSQ_THRESH_M ICE_M(0x1F, 0)
+#define GLPE_PEPM_THRESH_PEPM_MDQ_THRESH_S 16
+#define GLPE_PEPM_THRESH_PEPM_MDQ_THRESH_M ICE_M(0x3FFF, 16)
+#define GLPE_PFAEQEDROPCNT(_i) (0x00503240 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPE_PFAEQEDROPCNT_MAX_INDEX 7
+#define GLPE_PFAEQEDROPCNT_AEQEDROPCNT_S 0
+#define GLPE_PFAEQEDROPCNT_AEQEDROPCNT_M ICE_M(0xFFFF, 0)
+#define GLPE_PFCEQEDROPCNT(_i) (0x00503220 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPE_PFCEQEDROPCNT_MAX_INDEX 7
+#define GLPE_PFCEQEDROPCNT_CEQEDROPCNT_S 0
+#define GLPE_PFCEQEDROPCNT_CEQEDROPCNT_M ICE_M(0xFFFF, 0)
+#define GLPE_PFCQEDROPCNT(_i) (0x00503200 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPE_PFCQEDROPCNT_MAX_INDEX 7
+#define GLPE_PFCQEDROPCNT_CQEDROPCNT_S 0
+#define GLPE_PFCQEDROPCNT_CQEDROPCNT_M ICE_M(0xFFFF, 0)
+#define GLPE_PFFLMOOISCALLOCERR(_i) (0x0050B960 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPE_PFFLMOOISCALLOCERR_MAX_INDEX 7
+#define GLPE_PFFLMOOISCALLOCERR_ERROR_COUNT_S 0
+#define GLPE_PFFLMOOISCALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_PFFLMQ1ALLOCERR(_i) (0x0050B920 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPE_PFFLMQ1ALLOCERR_MAX_INDEX 7
+#define GLPE_PFFLMQ1ALLOCERR_ERROR_COUNT_S 0
+#define GLPE_PFFLMQ1ALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_PFFLMRRFALLOCERR(_i) (0x0050B940 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPE_PFFLMRRFALLOCERR_MAX_INDEX 7
+#define GLPE_PFFLMRRFALLOCERR_ERROR_COUNT_S 0
+#define GLPE_PFFLMRRFALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_PFFLMXMITALLOCERR(_i) (0x0050B900 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPE_PFFLMXMITALLOCERR_MAX_INDEX 7
+#define GLPE_PFFLMXMITALLOCERR_ERROR_COUNT_S 0
+#define GLPE_PFFLMXMITALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_PFTCPNOW50USCNT(_i) (0x0050B8C0 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPE_PFTCPNOW50USCNT_MAX_INDEX 7
+#define GLPE_PFTCPNOW50USCNT_CNT_S 0
+#define GLPE_PFTCPNOW50USCNT_CNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPE_PUSH_PEPM 0x0053241C /* Reset Source: CORER */
+#define GLPE_PUSH_PEPM_MDQ_CREDITS_S 0
+#define GLPE_PUSH_PEPM_MDQ_CREDITS_M ICE_M(0xFF, 0)
+#define GLPE_VFAEQEDROPCNT(_i) (0x00503100 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLPE_VFAEQEDROPCNT_MAX_INDEX 31
+#define GLPE_VFAEQEDROPCNT_AEQEDROPCNT_S 0
+#define GLPE_VFAEQEDROPCNT_AEQEDROPCNT_M ICE_M(0xFFFF, 0)
+#define GLPE_VFCEQEDROPCNT(_i) (0x00503080 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLPE_VFCEQEDROPCNT_MAX_INDEX 31
+#define GLPE_VFCEQEDROPCNT_CEQEDROPCNT_S 0
+#define GLPE_VFCEQEDROPCNT_CEQEDROPCNT_M ICE_M(0xFFFF, 0)
+#define GLPE_VFCQEDROPCNT(_i) (0x00503000 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLPE_VFCQEDROPCNT_MAX_INDEX 31
+#define GLPE_VFCQEDROPCNT_CQEDROPCNT_S 0
+#define GLPE_VFCQEDROPCNT_CQEDROPCNT_M ICE_M(0xFFFF, 0)
+#define GLPE_VFFLMOOISCALLOCERR(_i) (0x0050B580 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLPE_VFFLMOOISCALLOCERR_MAX_INDEX 31
+#define GLPE_VFFLMOOISCALLOCERR_ERROR_COUNT_S 0
+#define GLPE_VFFLMOOISCALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_VFFLMQ1ALLOCERR(_i) (0x0050B480 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLPE_VFFLMQ1ALLOCERR_MAX_INDEX 31
+#define GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_S 0
+#define GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_VFFLMRRFALLOCERR(_i) (0x0050B500 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLPE_VFFLMRRFALLOCERR_MAX_INDEX 31
+#define GLPE_VFFLMRRFALLOCERR_ERROR_COUNT_S 0
+#define GLPE_VFFLMRRFALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_VFFLMXMITALLOCERR(_i) (0x0050B400 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLPE_VFFLMXMITALLOCERR_MAX_INDEX 31
+#define GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_S 0
+#define GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_M ICE_M(0xFFFF, 0)
+#define GLPE_VFTCPNOW50USCNT(_i) (0x0050B300 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: PE_CORER */
+#define GLPE_VFTCPNOW50USCNT_MAX_INDEX 31
+#define GLPE_VFTCPNOW50USCNT_CNT_S 0
+#define GLPE_VFTCPNOW50USCNT_CNT_M ICE_M(0xFFFFFFFF, 0)
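Every field in this hunk follows one convention: a _S macro carries the field's bit offset and the matching _M macro carries its mask, built as ICE_M(mask, shift). As a minimal sketch of how such a pair is consumed, assuming the ice driver's rd32() MMIO read helper; ice_show_rid() is a hypothetical name used only for this illustration, not part of the patch:

/* Illustrative sketch: decode the PCI routing ID fields of PF_FUNC_RID
 * (defined earlier in this hunk) with its _S/_M pairs. ice_show_rid()
 * is a made-up name; rd32() is the driver's register read helper.
 */
static void ice_show_rid(struct ice_hw *hw)
{
	u32 rid = rd32(hw, PF_FUNC_RID);
	u8 bus = (rid & PF_FUNC_RID_BUS_NUMBER_M) >> PF_FUNC_RID_BUS_NUMBER_S;
	u8 dev = (rid & PF_FUNC_RID_DEVICE_NUMBER_M) >> PF_FUNC_RID_DEVICE_NUMBER_S;
	u8 fn = (rid & PF_FUNC_RID_FUNCTION_NUMBER_M) >> PF_FUNC_RID_FUNCTION_NUMBER_S;

	pr_info("PF routing ID %02x:%02x.%d\n", bus, dev, fn);
}

The same mask-and-shift pattern decodes every multi-field register in this file, for example the WQTAIL pointer and CQP_OP_ERR bit of PFPE_CQPTAIL below.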
+#define PFPE_AEQALLOC 0x00502D00 /* Reset Source: PFR */
+#define PFPE_AEQALLOC_AECOUNT_S 0
+#define PFPE_AEQALLOC_AECOUNT_M ICE_M(0xFFFFFFFF, 0)
+#define PFPE_CCQPHIGH 0x0050A100 /* Reset Source: PFR */
+#define PFPE_CCQPHIGH_PECCQPHIGH_S 0
+#define PFPE_CCQPHIGH_PECCQPHIGH_M ICE_M(0xFFFFFFFF, 0)
+#define PFPE_CCQPLOW 0x0050A080 /* Reset Source: PFR */
+#define PFPE_CCQPLOW_PECCQPLOW_S 0
+#define PFPE_CCQPLOW_PECCQPLOW_M ICE_M(0xFFFFFFFF, 0)
+#define PFPE_CCQPSTATUS 0x0050A000 /* Reset Source: PFR */
+#define PFPE_CCQPSTATUS_CCQP_DONE_S 0
+#define PFPE_CCQPSTATUS_CCQP_DONE_M BIT(0)
+#define PFPE_CCQPSTATUS_HMC_PROFILE_S 4
+#define PFPE_CCQPSTATUS_HMC_PROFILE_M ICE_M(0x7, 4)
+#define PFPE_CCQPSTATUS_RDMA_EN_VFS_S 16
+#define PFPE_CCQPSTATUS_RDMA_EN_VFS_M ICE_M(0x3F, 16)
+#define PFPE_CCQPSTATUS_CCQP_ERR_S 31
+#define PFPE_CCQPSTATUS_CCQP_ERR_M BIT(31)
+#define PFPE_CQACK 0x00502C80 /* Reset Source: PFR */
+#define PFPE_CQACK_PECQID_S 0
+#define PFPE_CQACK_PECQID_M ICE_M(0x7FFFF, 0)
+#define PFPE_CQARM 0x00502C00 /* Reset Source: PFR */
+#define PFPE_CQARM_PECQID_S 0
+#define PFPE_CQARM_PECQID_M ICE_M(0x7FFFF, 0)
+#define PFPE_CQPDB 0x00500800 /* Reset Source: PFR */
+#define PFPE_CQPDB_WQHEAD_S 0
+#define PFPE_CQPDB_WQHEAD_M ICE_M(0x7FF, 0)
+#define PFPE_CQPERRCODES 0x0050A200 /* Reset Source: PFR */
+#define PFPE_CQPERRCODES_CQP_MINOR_CODE_S 0
+#define PFPE_CQPERRCODES_CQP_MINOR_CODE_M ICE_M(0xFFFF, 0)
+#define PFPE_CQPERRCODES_CQP_MAJOR_CODE_S 16
+#define PFPE_CQPERRCODES_CQP_MAJOR_CODE_M ICE_M(0xFFFF, 16)
+#define PFPE_CQPTAIL 0x00500880 /* Reset Source: PFR */
+#define PFPE_CQPTAIL_WQTAIL_S 0
+#define PFPE_CQPTAIL_WQTAIL_M ICE_M(0x7FF, 0)
+#define PFPE_CQPTAIL_CQP_OP_ERR_S 31
+#define PFPE_CQPTAIL_CQP_OP_ERR_M BIT(31)
+#define PFPE_IPCONFIG0 0x0050A180 /* Reset Source: PFR */
+#define PFPE_IPCONFIG0_PEIPID_S 0
+#define PFPE_IPCONFIG0_PEIPID_M ICE_M(0xFFFF, 0)
+#define PFPE_IPCONFIG0_USEENTIREIDRANGE_S 16
+#define PFPE_IPCONFIG0_USEENTIREIDRANGE_M BIT(16)
+#define PFPE_IPCONFIG0_UDP_SRC_PORT_MASK_EN_S 17
+#define PFPE_IPCONFIG0_UDP_SRC_PORT_MASK_EN_M BIT(17)
+#define PFPE_MRTEIDXMASK 0x0050A300 /* Reset Source: PFR */
+#define PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_S 0
+#define PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_M ICE_M(0x1F, 0)
+#define PFPE_RCVUNEXPECTEDERROR 0x0050A380 /* Reset Source: PFR */
+#define PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_S 0
+#define PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_M ICE_M(0xFFFFFF, 0)
+#define PFPE_TCPNOWTIMER 0x0050A280 /* Reset Source: PFR */
+#define PFPE_TCPNOWTIMER_TCP_NOW_S 0
+#define PFPE_TCPNOWTIMER_TCP_NOW_M ICE_M(0xFFFFFFFF, 0)
+#define PFPE_WQEALLOC 0x00504400 /* Reset Source: PFR */
+#define PFPE_WQEALLOC_PEQPID_S 0
+#define PFPE_WQEALLOC_PEQPID_M ICE_M(0x3FFFF, 0)
+#define PFPE_WQEALLOC_WQE_DESC_INDEX_S 20
+#define PFPE_WQEALLOC_WQE_DESC_INDEX_M ICE_M(0xFFF, 20)
+#define PRT_PEPM_COUNT(_i) (0x0050C040 + ((_i) * 4)) /* _i=0...511 */ /* Reset Source: PERST */
+#define PRT_PEPM_COUNT_MAX_INDEX 511
+#define PRT_PEPM_COUNT_PEPM_PSQ_COUNT_S 0
+#define PRT_PEPM_COUNT_PEPM_PSQ_COUNT_M ICE_M(0x1F, 0)
+#define PRT_PEPM_COUNT_PEPM_MDQ_COUNT_S 16
+#define PRT_PEPM_COUNT_PEPM_MDQ_COUNT_M ICE_M(0x3FFF, 16)
+#define VFPE_AEQALLOC(_VF) (0x00502800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_AEQALLOC_MAX_INDEX 255
+#define VFPE_AEQALLOC_AECOUNT_S 0
+#define VFPE_AEQALLOC_AECOUNT_M ICE_M(0xFFFFFFFF, 0)
+#define VFPE_CCQPHIGH(_VF) (0x00508800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_CCQPHIGH_MAX_INDEX 255
+#define VFPE_CCQPHIGH_PECCQPHIGH_S 0
+#define VFPE_CCQPHIGH_PECCQPHIGH_M ICE_M(0xFFFFFFFF, 0)
+#define VFPE_CCQPLOW(_VF) (0x00508400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_CCQPLOW_MAX_INDEX 255
+#define VFPE_CCQPLOW_PECCQPLOW_S 0
+#define VFPE_CCQPLOW_PECCQPLOW_M ICE_M(0xFFFFFFFF, 0)
+#define VFPE_CCQPSTATUS(_VF) (0x00508000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_CCQPSTATUS_MAX_INDEX 255
+#define VFPE_CCQPSTATUS_CCQP_DONE_S 0
+#define VFPE_CCQPSTATUS_CCQP_DONE_M BIT(0)
+#define VFPE_CCQPSTATUS_HMC_PROFILE_S 4
+#define VFPE_CCQPSTATUS_HMC_PROFILE_M ICE_M(0x7, 4)
+#define VFPE_CCQPSTATUS_RDMA_EN_VFS_S 16
+#define VFPE_CCQPSTATUS_RDMA_EN_VFS_M ICE_M(0x3F, 16)
+#define VFPE_CCQPSTATUS_CCQP_ERR_S 31
+#define VFPE_CCQPSTATUS_CCQP_ERR_M BIT(31)
+#define VFPE_CQACK(_VF) (0x00502400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_CQACK_MAX_INDEX 255
+#define VFPE_CQACK_PECQID_S 0
+#define VFPE_CQACK_PECQID_M ICE_M(0x7FFFF, 0)
+#define VFPE_CQARM(_VF) (0x00502000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_CQARM_MAX_INDEX 255
+#define VFPE_CQARM_PECQID_S 0
+#define VFPE_CQARM_PECQID_M ICE_M(0x7FFFF, 0)
+#define VFPE_CQPDB(_VF) (0x00500000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_CQPDB_MAX_INDEX 255
+#define VFPE_CQPDB_WQHEAD_S 0
+#define VFPE_CQPDB_WQHEAD_M ICE_M(0x7FF, 0)
+#define VFPE_CQPERRCODES(_VF) (0x00509000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_CQPERRCODES_MAX_INDEX 255
+#define VFPE_CQPERRCODES_CQP_MINOR_CODE_S 0
+#define VFPE_CQPERRCODES_CQP_MINOR_CODE_M ICE_M(0xFFFF, 0)
+#define VFPE_CQPERRCODES_CQP_MAJOR_CODE_S 16
+#define VFPE_CQPERRCODES_CQP_MAJOR_CODE_M ICE_M(0xFFFF, 16)
+#define VFPE_CQPTAIL(_VF) (0x00500400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_CQPTAIL_MAX_INDEX 255
+#define VFPE_CQPTAIL_WQTAIL_S 0
+#define VFPE_CQPTAIL_WQTAIL_M ICE_M(0x7FF, 0)
+#define VFPE_CQPTAIL_CQP_OP_ERR_S 31
+#define VFPE_CQPTAIL_CQP_OP_ERR_M BIT(31)
+#define VFPE_IPCONFIG0(_VF) (0x00508C00 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_IPCONFIG0_MAX_INDEX 255
+#define VFPE_IPCONFIG0_PEIPID_S 0
+#define VFPE_IPCONFIG0_PEIPID_M ICE_M(0xFFFF, 0)
+#define VFPE_IPCONFIG0_USEENTIREIDRANGE_S 16
+#define VFPE_IPCONFIG0_USEENTIREIDRANGE_M BIT(16)
+#define VFPE_IPCONFIG0_UDP_SRC_PORT_MASK_EN_S 17
+#define VFPE_IPCONFIG0_UDP_SRC_PORT_MASK_EN_M BIT(17)
+#define VFPE_RCVUNEXPECTEDERROR(_VF) (0x00509C00 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_RCVUNEXPECTEDERROR_MAX_INDEX 255
+#define VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_S 0
+#define VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_M ICE_M(0xFFFFFF, 0)
+#define VFPE_TCPNOWTIMER(_VF) (0x00509400 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_TCPNOWTIMER_MAX_INDEX 255
+#define VFPE_TCPNOWTIMER_TCP_NOW_S 0
+#define VFPE_TCPNOWTIMER_TCP_NOW_M ICE_M(0xFFFFFFFF, 0)
+#define VFPE_WQEALLOC(_VF) (0x00504000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_WQEALLOC_MAX_INDEX 255
+#define VFPE_WQEALLOC_PEQPID_S 0
+#define VFPE_WQEALLOC_PEQPID_M ICE_M(0x3FFFF, 0)
+#define VFPE_WQEALLOC_WQE_DESC_INDEX_S 20
+#define VFPE_WQEALLOC_WQE_DESC_INDEX_M ICE_M(0xFFF, 20)
+#define GLPES_PFIP4RXDISCARD(_i) (0x00541400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXDISCARD_MAX_INDEX 127
+#define GLPES_PFIP4RXDISCARD_IP4RXDISCARD_S 0
+#define GLPES_PFIP4RXDISCARD_IP4RXDISCARD_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4RXFRAGSHI(_i) (0x00541C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXFRAGSHI_MAX_INDEX 127
+#define GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_S 0
+#define GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4RXFRAGSLO(_i) (0x00541C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXFRAGSLO_MAX_INDEX 127
+#define GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_S 0
+#define GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4RXMCOCTSHI(_i) (0x00542404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXMCOCTSHI_MAX_INDEX 127
+#define GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_S 0
+#define GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4RXMCOCTSLO(_i) (0x00542400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXMCOCTSLO_MAX_INDEX 127
+#define GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_S 0
+#define GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4RXMCPKTSHI(_i) (0x00542C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXMCPKTSHI_MAX_INDEX 127
+#define GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_S 0
+#define GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4RXMCPKTSLO(_i) (0x00542C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXMCPKTSLO_MAX_INDEX 127
+#define GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_S 0
+#define GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4RXOCTSHI(_i) (0x00540404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXOCTSHI_MAX_INDEX 127
+#define GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_S 0
+#define GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4RXOCTSLO(_i) (0x00540400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXOCTSLO_MAX_INDEX 127
+#define GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_S 0
+#define GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4RXPKTSHI(_i) (0x00540C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXPKTSHI_MAX_INDEX 127
+#define GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_S 0
+#define GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4RXPKTSLO(_i) (0x00540C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXPKTSLO_MAX_INDEX 127
+#define GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_S 0
+#define GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4RXTRUNC(_i) (0x00541800 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4RXTRUNC_MAX_INDEX 127
+#define GLPES_PFIP4RXTRUNC_IP4RXTRUNC_S 0
+#define GLPES_PFIP4RXTRUNC_IP4RXTRUNC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4TXFRAGSHI(_i) (0x00547404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXFRAGSHI_MAX_INDEX 127
+#define GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_S 0
+#define GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4TXFRAGSLO(_i) (0x00547400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXFRAGSLO_MAX_INDEX 127
+#define GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_S 0
+#define GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4TXMCOCTSHI(_i) (0x00547C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXMCOCTSHI_MAX_INDEX 127
+#define GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_S 0
+#define GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4TXMCOCTSLO(_i) (0x00547C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXMCOCTSLO_MAX_INDEX 127
+#define GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_S 0
+#define GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4TXMCPKTSHI(_i) (0x00548404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXMCPKTSHI_MAX_INDEX 127
+#define GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_S 0
+#define GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4TXMCPKTSLO(_i) (0x00548400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXMCPKTSLO_MAX_INDEX 127
+#define GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_S 0
+#define GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4TXNOROUTE(_i) (0x0054B400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXNOROUTE_MAX_INDEX 127
+#define GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_S 0
+#define GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_M ICE_M(0xFFFFFF, 0)
+#define GLPES_PFIP4TXOCTSHI(_i) (0x00546404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXOCTSHI_MAX_INDEX 127
+#define GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_S 0
+#define GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4TXOCTSLO(_i) (0x00546400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXOCTSLO_MAX_INDEX 127
+#define GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_S 0
+#define GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP4TXPKTSHI(_i) (0x00546C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXPKTSHI_MAX_INDEX 127
+#define GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_S 0
+#define GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP4TXPKTSLO(_i) (0x00546C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP4TXPKTSLO_MAX_INDEX 127
+#define GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_S 0
+#define GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6RXDISCARD(_i) (0x00544400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXDISCARD_MAX_INDEX 127
+#define GLPES_PFIP6RXDISCARD_IP6RXDISCARD_S 0
+#define GLPES_PFIP6RXDISCARD_IP6RXDISCARD_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6RXFRAGSHI(_i) (0x00544C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXFRAGSHI_MAX_INDEX 127
+#define GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_S 0
+#define GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6RXFRAGSLO(_i) (0x00544C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXFRAGSLO_MAX_INDEX 127
+#define GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_S 0
+#define GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6RXMCOCTSHI(_i) (0x00545404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXMCOCTSHI_MAX_INDEX 127
+#define GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_S 0
+#define GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6RXMCOCTSLO(_i) (0x00545400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXMCOCTSLO_MAX_INDEX 127
+#define GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_S 0
+#define GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6RXMCPKTSHI(_i) (0x00545C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXMCPKTSHI_MAX_INDEX 127
+#define GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_S 0
+#define GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6RXMCPKTSLO(_i) (0x00545C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXMCPKTSLO_MAX_INDEX 127
+#define GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_S 0
+#define GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6RXOCTSHI(_i) (0x00543404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXOCTSHI_MAX_INDEX 127
+#define GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_S 0
+#define GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6RXOCTSLO(_i) (0x00543400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXOCTSLO_MAX_INDEX 127
+#define GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_S 0
+#define GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6RXPKTSHI(_i) (0x00543C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXPKTSHI_MAX_INDEX 127
+#define GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_S 0
+#define GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6RXPKTSLO(_i) (0x00543C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXPKTSLO_MAX_INDEX 127
+#define GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_S 0
+#define GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6RXTRUNC(_i) (0x00544800 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6RXTRUNC_MAX_INDEX 127
+#define GLPES_PFIP6RXTRUNC_IP6RXTRUNC_S 0
+#define GLPES_PFIP6RXTRUNC_IP6RXTRUNC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6TXFRAGSHI(_i) (0x00549C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXFRAGSHI_MAX_INDEX 127
+#define GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_S 0
+#define GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6TXFRAGSLO(_i) (0x00549C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXFRAGSLO_MAX_INDEX 127
+#define GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_S 0
+#define GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6TXMCOCTSHI(_i) (0x0054A404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXMCOCTSHI_MAX_INDEX 127
+#define GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_S 0
+#define GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6TXMCOCTSLO(_i) (0x0054A400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXMCOCTSLO_MAX_INDEX 127
+#define GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_S 0
+#define GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6TXMCPKTSHI(_i) (0x0054AC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXMCPKTSHI_MAX_INDEX 127
+#define GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_S 0
+#define GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6TXMCPKTSLO(_i) (0x0054AC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXMCPKTSLO_MAX_INDEX 127
+#define GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_S 0
+#define GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6TXNOROUTE(_i) (0x0054B800 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXNOROUTE_MAX_INDEX 127
+#define GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_S 0
+#define GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_M ICE_M(0xFFFFFF, 0)
+#define GLPES_PFIP6TXOCTSHI(_i) (0x00548C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXOCTSHI_MAX_INDEX 127
+#define GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_S 0
+#define GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6TXOCTSLO(_i) (0x00548C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXOCTSLO_MAX_INDEX 127
+#define GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_S 0
+#define GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFIP6TXPKTSHI(_i) (0x00549404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXPKTSHI_MAX_INDEX 127
+#define GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_S 0
+#define GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFIP6TXPKTSLO(_i) (0x00549400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFIP6TXPKTSLO_MAX_INDEX 127
+#define GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_S 0
+#define GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRDMARXRDSHI(_i) (0x0054EC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMARXRDSHI_MAX_INDEX 127
+#define GLPES_PFRDMARXRDSHI_RDMARXRDSHI_S 0
+#define GLPES_PFRDMARXRDSHI_RDMARXRDSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFRDMARXRDSLO(_i) (0x0054EC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMARXRDSLO_MAX_INDEX 127
+#define GLPES_PFRDMARXRDSLO_RDMARXRDSLO_S 0
+#define GLPES_PFRDMARXRDSLO_RDMARXRDSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRDMARXSNDSHI(_i) (0x0054F404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMARXSNDSHI_MAX_INDEX 127
+#define GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_S 0
+#define GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFRDMARXSNDSLO(_i) (0x0054F400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMARXSNDSLO_MAX_INDEX 127
+#define GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_S 0
+#define GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRDMARXWRSHI(_i) (0x0054E404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMARXWRSHI_MAX_INDEX 127
+#define GLPES_PFRDMARXWRSHI_RDMARXWRSHI_S 0
+#define GLPES_PFRDMARXWRSHI_RDMARXWRSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFRDMARXWRSLO(_i) (0x0054E400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMARXWRSLO_MAX_INDEX 127
+#define GLPES_PFRDMARXWRSLO_RDMARXWRSLO_S 0
+#define GLPES_PFRDMARXWRSLO_RDMARXWRSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRDMATXRDSHI(_i) (0x00550404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMATXRDSHI_MAX_INDEX 127
+#define GLPES_PFRDMATXRDSHI_RDMARXRDSHI_S 0
+#define GLPES_PFRDMATXRDSHI_RDMARXRDSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFRDMATXRDSLO(_i) (0x00550400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMATXRDSLO_MAX_INDEX 127
+#define GLPES_PFRDMATXRDSLO_RDMARXRDSLO_S 0
+#define GLPES_PFRDMATXRDSLO_RDMARXRDSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRDMATXSNDSHI(_i) (0x00550C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMATXSNDSHI_MAX_INDEX 127
+#define GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_S 0
+#define GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFRDMATXSNDSLO(_i) (0x00550C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMATXSNDSLO_MAX_INDEX 127
+#define GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_S 0
+#define GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRDMATXWRSHI(_i) (0x0054FC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMATXWRSHI_MAX_INDEX 127
+#define GLPES_PFRDMATXWRSHI_RDMARXWRSHI_S 0
+#define GLPES_PFRDMATXWRSHI_RDMARXWRSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFRDMATXWRSLO(_i) (0x0054FC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMATXWRSLO_MAX_INDEX 127
+#define GLPES_PFRDMATXWRSLO_RDMARXWRSLO_S 0
+#define GLPES_PFRDMATXWRSLO_RDMARXWRSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRDMAVBNDHI(_i) (0x00551404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMAVBNDHI_MAX_INDEX 127
+#define GLPES_PFRDMAVBNDHI_RDMAVBNDHI_S 0
+#define GLPES_PFRDMAVBNDHI_RDMAVBNDHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFRDMAVBNDLO(_i) (0x00551400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMAVBNDLO_MAX_INDEX 127
+#define GLPES_PFRDMAVBNDLO_RDMAVBNDLO_S 0
+#define GLPES_PFRDMAVBNDLO_RDMAVBNDLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRDMAVINVHI(_i) (0x00551C04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMAVINVHI_MAX_INDEX 127
+#define GLPES_PFRDMAVINVHI_RDMAVINVHI_S 0
+#define GLPES_PFRDMAVINVHI_RDMAVINVHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFRDMAVINVLO(_i) (0x00551C00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRDMAVINVLO_MAX_INDEX 127
+#define GLPES_PFRDMAVINVLO_RDMAVINVLO_S 0
+#define GLPES_PFRDMAVINVLO_RDMAVINVLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFRXVLANERR(_i) (0x00540000 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFRXVLANERR_MAX_INDEX 127
+#define GLPES_PFRXVLANERR_RXVLANERR_S 0
+#define GLPES_PFRXVLANERR_RXVLANERR_M ICE_M(0xFFFFFF, 0)
+#define GLPES_PFTCPRTXSEG(_i) (0x00552400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFTCPRTXSEG_MAX_INDEX 127
+#define GLPES_PFTCPRTXSEG_TCPRTXSEG_S 0
+#define GLPES_PFTCPRTXSEG_TCPRTXSEG_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFTCPRXOPTERR(_i) (0x0054C400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFTCPRXOPTERR_MAX_INDEX 127
+#define GLPES_PFTCPRXOPTERR_TCPRXOPTERR_S 0
+#define GLPES_PFTCPRXOPTERR_TCPRXOPTERR_M ICE_M(0xFFFFFF, 0)
+#define GLPES_PFTCPRXPROTOERR(_i) (0x0054C800 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFTCPRXPROTOERR_MAX_INDEX 127
+#define GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_S 0
+#define GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_M ICE_M(0xFFFFFF, 0)
+#define GLPES_PFTCPRXSEGSHI(_i) (0x0054BC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFTCPRXSEGSHI_MAX_INDEX 127
+#define GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_S 0
+#define GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFTCPRXSEGSLO(_i) (0x0054BC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFTCPRXSEGSLO_MAX_INDEX 127
+#define GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_S 0
+#define GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFTCPTXSEGHI(_i) (0x0054CC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFTCPTXSEGHI_MAX_INDEX 127
+#define GLPES_PFTCPTXSEGHI_TCPTXSEGHI_S 0
+#define GLPES_PFTCPTXSEGHI_TCPTXSEGHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFTCPTXSEGLO(_i) (0x0054CC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFTCPTXSEGLO_MAX_INDEX 127
+#define GLPES_PFTCPTXSEGLO_TCPTXSEGLO_S 0
+#define GLPES_PFTCPTXSEGLO_TCPTXSEGLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFUDPRXPKTSHI(_i) (0x0054D404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFUDPRXPKTSHI_MAX_INDEX 127
+#define GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_S 0
+#define GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFUDPRXPKTSLO(_i) (0x0054D400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFUDPRXPKTSLO_MAX_INDEX 127
+#define GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_S 0
+#define GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_PFUDPTXPKTSHI(_i) (0x0054DC04 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFUDPTXPKTSHI_MAX_INDEX 127
+#define GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_S 0
+#define GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_M ICE_M(0xFFFF, 0)
+#define GLPES_PFUDPTXPKTSLO(_i) (0x0054DC00 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */
+#define GLPES_PFUDPTXPKTSLO_MAX_INDEX 127
+#define GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_S 0
+#define GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_RDMARXMULTFPDUSHI 0x0055E00C /* Reset Source: CORER */
+#define GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_S 0
+#define GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_RDMARXMULTFPDUSLO 0x0055E008 /* Reset Source: CORER */
+#define GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_S 0
+#define GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_RDMARXOOODDPHI 0x0055E014 /* Reset Source: CORER */
+#define GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_S 0
+#define GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_RDMARXOOODDPLO 0x0055E010 /* Reset Source: CORER */
+#define GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_S 0
+#define GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_RDMARXOOONOMARK 0x0055E004 /* Reset Source: CORER */
+#define GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_S 0
+#define GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_RDMARXUNALIGN 0x0055E000 /* Reset Source: CORER */
+#define GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_S 0
+#define GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_TCPRXFOURHOLEHI 0x0055E03C /* Reset Source: CORER */
+#define GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_S 0
+#define GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_TCPRXFOURHOLELO 0x0055E038 /* Reset Source: CORER */
+#define GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_S 0
+#define GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_TCPRXONEHOLEHI 0x0055E024 /* Reset Source: CORER */
+#define GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_S 0
+#define GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_TCPRXONEHOLELO 0x0055E020 /* Reset Source: CORER */
+#define GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_S 0
+#define GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_TCPRXPUREACKHI 0x0055E01C /* Reset Source: CORER */
+#define GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_S 0
+#define GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_TCPRXPUREACKSLO 0x0055E018 /* Reset Source: CORER */
+#define GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_S 0
+#define GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_TCPRXTHREEHOLEHI 0x0055E034 /* Reset Source: CORER */
+#define GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_S 0
+#define GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_TCPRXTHREEHOLELO 0x0055E030 /* Reset Source: CORER */
+#define GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_S 0
+#define GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_TCPRXTWOHOLEHI 0x0055E02C /* Reset Source: CORER */
+#define GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_S 0
+#define GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_TCPRXTWOHOLELO 0x0055E028 /* Reset Source: CORER */
+#define GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_S 0
+#define GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_TCPTXRETRANSFASTHI 0x0055E044 /* Reset Source: CORER */
+#define GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_S 0
+#define GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_TCPTXRETRANSFASTLO 0x0055E040 /* Reset Source: CORER */
+#define GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_S 0
+#define GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_TCPTXTOUTSFASTHI 0x0055E04C /* Reset Source: CORER */
+#define GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_S 0
+#define GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_TCPTXTOUTSFASTLO 0x0055E048 /* Reset Source: CORER */
+#define GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_S 0
+#define GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_M ICE_M(0xFFFFFFFF, 0)
+#define GLPES_TCPTXTOUTSHI 0x0055E054 /* Reset Source: CORER */
+#define GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_S 0
+#define GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_M ICE_M(0xFFFFFF, 0)
+#define GLPES_TCPTXTOUTSLO 0x0055E050 /* Reset Source: CORER */
+#define GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_S 0
+#define GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_M ICE_M(0xFFFFFFFF, 0)
+#define GL_PWR_MODE_CTL 0x000B820C /* Reset Source: POR */
+#define GL_PWR_MODE_CTL_SWITCH_PWR_MODE_EN_S 0
+#define GL_PWR_MODE_CTL_SWITCH_PWR_MODE_EN_M BIT(0)
+#define GL_PWR_MODE_CTL_NIC_PWR_MODE_EN_S 1
+#define GL_PWR_MODE_CTL_NIC_PWR_MODE_EN_M BIT(1)
+#define GL_PWR_MODE_CTL_S5_PWR_MODE_EN_S 2
+#define GL_PWR_MODE_CTL_S5_PWR_MODE_EN_M BIT(2)
+#define GL_PWR_MODE_CTL_CAR_MAX_SW_CONFIG_S 3
+#define GL_PWR_MODE_CTL_CAR_MAX_SW_CONFIG_M ICE_M(0x3, 3)
 #define GL_PWR_MODE_CTL_CAR_MAX_BW_S 30
 #define GL_PWR_MODE_CTL_CAR_MAX_BW_M ICE_M(0x3, 30)
-#define GLPRT_BPRCL(_i) (0x00381380 + ((_i) * 8))
-#define GLPRT_BPTCL(_i) (0x00381240 + ((_i) * 8))
-#define GLPRT_CRCERRS(_i) (0x00380100 + ((_i) * 8))
-#define GLPRT_GORCL(_i) (0x00380000 + ((_i) * 8))
-#define GLPRT_GOTCL(_i) (0x00380B40 + ((_i) * 8))
-#define GLPRT_ILLERRC(_i) (0x003801C0 + ((_i) * 8))
-#define GLPRT_LXOFFRXC(_i) (0x003802C0 + ((_i) * 8))
-#define GLPRT_LXOFFTXC(_i) (0x00381180 + ((_i) * 8))
-#define GLPRT_LXONRXC(_i) (0x00380280 + ((_i) * 8))
-#define GLPRT_LXONTXC(_i) (0x00381140 + ((_i) * 8))
-#define GLPRT_MLFC(_i) (0x00380040 + ((_i) * 8))
-#define GLPRT_MPRCL(_i) (0x00381340 + ((_i) * 8))
-#define GLPRT_MPTCL(_i) (0x00381200 + ((_i) * 8))
-#define GLPRT_MRFC(_i) (0x00380080 + ((_i) * 8))
-#define GLPRT_PRC1023L(_i) (0x00380A00 + ((_i) * 8))
-#define GLPRT_PRC127L(_i) (0x00380940 + ((_i) * 8))
-#define GLPRT_PRC1522L(_i) (0x00380A40 + ((_i) * 8))
-#define GLPRT_PRC255L(_i) (0x00380980 + ((_i) * 8))
-#define GLPRT_PRC511L(_i) (0x003809C0 + ((_i) * 8))
-#define GLPRT_PRC64L(_i) (0x00380900 + ((_i) * 8))
-#define GLPRT_PRC9522L(_i) (0x00380A80 + ((_i) * 8))
-#define GLPRT_PTC1023L(_i) (0x00380C80 + ((_i) * 8))
-#define GLPRT_PTC127L(_i) (0x00380BC0 + ((_i) * 8))
-#define GLPRT_PTC1522L(_i) (0x00380CC0 + ((_i) * 8))
-#define GLPRT_PTC255L(_i) (0x00380C00 + ((_i) * 8))
-#define GLPRT_PTC511L(_i) (0x00380C40 + ((_i) * 8))
-#define GLPRT_PTC64L(_i) (0x00380B80 + ((_i) * 8))
-#define GLPRT_PTC9522L(_i) (0x00380D00 + ((_i) * 8))
-#define GLPRT_PXOFFRXC(_i, _j) (0x00380500 + ((_i) * 8 + (_j) * 64))
-#define GLPRT_PXOFFTXC(_i, _j) (0x00380F40 + ((_i) * 8 + (_j) * 64))
-#define GLPRT_PXONRXC(_i, _j) (0x00380300 + ((_i) * 8 + (_j) * 64))
-#define GLPRT_PXONTXC(_i, _j) (0x00380D40 + ((_i) * 8 + (_j) * 64))
-#define GLPRT_RFC(_i) (0x00380AC0 + ((_i) * 8))
-#define GLPRT_RJC(_i) (0x00380B00 + ((_i) * 8))
-#define GLPRT_RLEC(_i) (0x00380140 + ((_i) * 8))
-#define GLPRT_ROC(_i) (0x00380240 + ((_i) * 8))
-#define GLPRT_RUC(_i) (0x00380200 + ((_i) * 8))
-#define GLPRT_RXON2OFFCNT(_i, _j) (0x00380700 + ((_i) * 8 + (_j) * 64))
-#define GLPRT_TDOLD(_i) (0x00381280 + ((_i) * 8))
-#define GLPRT_UPRCL(_i) (0x00381300 + ((_i) * 8))
-#define GLPRT_UPTCL(_i) (0x003811C0 + ((_i) * 8))
-#define GLV_BPRCL(_i) (0x003B6000 + ((_i) * 8))
-#define GLV_BPTCL(_i) (0x0030E000 + ((_i) * 8))
-#define GLV_GORCL(_i) (0x003B0000 + ((_i) * 8))
-#define GLV_GOTCL(_i) (0x00300000 + ((_i) * 8))
-#define GLV_MPRCL(_i) (0x003B4000 + ((_i) * 8))
-#define GLV_MPTCL(_i) (0x0030C000 + ((_i) * 8))
-#define GLV_RDPC(_i) (0x00294C04 + ((_i) * 4))
-#define GLV_TEPC(_VSI) (0x00312000 + ((_VSI) * 4))
-#define GLV_UPRCL(_i) (0x003B2000 + ((_i) * 8))
-#define GLV_UPTCL(_i) (0x0030A000 + ((_i) * 8))
-#define PF_VT_PFALLOC_HIF 0x0009DD80
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT 0x000B825C /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_PECLK_S 0
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_PECLK_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_UCLK_S 3
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_UCLK_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_LCLK_S 6
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_LCLK_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_PSM_S 9
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_PSM_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_RXCTL_S 12
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_RXCTL_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_UANA_S 15
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_UANA_M ICE_M(0x7, 15)
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_S5_S 18
+#define GL_PWR_MODE_DIVIDE_CTRL_H_DEFAULT_DEFAULT_DIV_VAL_S5_M ICE_M(0x7, 18)
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT 0x000B8218 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_PECLK_S 0
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_PECLK_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_UCLK_S 3
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_UCLK_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_LCLK_S 6
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_LCLK_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_PSM_S 9
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_PSM_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_RXCTL_S 12
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_RXCTL_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_UANA_S 15
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_UANA_M ICE_M(0x7, 15)
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_S5_S 18
+#define GL_PWR_MODE_DIVIDE_CTRL_L_DEFAULT_DEFAULT_DIV_VAL_S5_M ICE_M(0x7, 18)
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT 0x000B8260 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_PECLK_S 0
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_PECLK_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_UCLK_S 3
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_UCLK_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_LCLK_S 6
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_LCLK_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_PSM_S 9
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_PSM_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_RXCTL_S 12
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_RXCTL_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_UANA_S 15
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_UANA_M ICE_M(0x7, 15)
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_S5_S 18
+#define GL_PWR_MODE_DIVIDE_CTRL_M_DEFAULT_DEFAULT_DIV_VAL_S5_M ICE_M(0x7, 18)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK 0x000B8200 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_50G_H_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_25G_H_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_10G_H_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_4G_H_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_A50G_H_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_LCLK_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK 0x000B81F0 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_50G_H_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_25G_H_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_10G_H_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_4G_H_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_A50G_H_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PECLK_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM 0x000B81FC /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_50G_H_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_25G_H_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_10G_H_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_4G_H_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_A50G_H_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_PSM_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL 0x000B81F8 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_50G_H_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_25G_H_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_10G_H_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_4G_H_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_A50G_H_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_RXCTL_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA 0x000B8208 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_50G_H_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_25G_H_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_10G_H_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_4G_H_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_A50G_H_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UANA_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK 0x000B81F4 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_50G_H_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_25G_H_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_10G_H_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_4G_H_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_A50G_H_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_H_UCLK_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK 0x000B8244 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_50G_L_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_25G_L_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_10G_L_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_4G_L_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_A50G_L_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_LCLK_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK 0x000B8220 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_50G_L_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_25G_L_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_10G_L_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_4G_L_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_A50G_L_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PECLK_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM 0x000B8240 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_50G_L_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_25G_L_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_10G_L_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_4G_L_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_A50G_L_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_PSM_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL 0x000B823C /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_50G_L_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_25G_L_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_10G_L_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_4G_L_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_A50G_L_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_RXCTL_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA 0x000B8248 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_50G_L_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_25G_L_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_10G_L_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_4G_L_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_A50G_L_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UANA_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK 0x000B8238 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_50G_L_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_25G_L_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_10G_L_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_4G_L_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_A50G_L_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_L_UCLK_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK 0x000B8230 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_50G_M_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_25G_M_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_10G_M_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_4G_M_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_A50G_M_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_LCLK_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK 0x000B821C /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_50G_M_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_25G_M_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_10G_M_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_4G_M_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_A50G_M_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PECLK_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM 0x000B822C /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_50G_M_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_25G_M_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_10G_M_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_4G_M_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_A50G_M_S 12
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_PSM_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL 0x000B8228 /* Reset Source: POR */
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_50G_M_S 0
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_25G_M_S 3
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_10G_M_S 6
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_4G_M_S 9
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9)
+#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_A50G_M_S 12
GL_PWR_MODE_DIVIDE_S0_CTRL_M_RXCTL_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA 0x000B8234 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_10G_M_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UANA_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK 0x000B8224 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_10G_M_S 6 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S0_CTRL_M_UCLK_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL 0x000B81EC /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_50G_H_S 0 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_50G_H_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_25G_H_S 3 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_25G_H_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_10G_H_S 6 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_10G_H_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_4G_H_S 9 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_4G_H_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_A50G_H_S 12 +#define GL_PWR_MODE_DIVIDE_S5_H_CTRL_DIV_VAL_TBW_A50G_H_M ICE_M(0xF, 12) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL 0x000B824C /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_50G_L_S 0 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_50G_L_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_25G_L_S 3 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_25G_L_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_10G_L_S 6 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_10G_L_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_4G_L_S 9 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_4G_L_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_A50G_L_S 12 +#define GL_PWR_MODE_DIVIDE_S5_L_CTRL_DIV_VAL_TBW_A50G_L_M ICE_M(0x7, 12) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL 0x000B8250 /* Reset Source: POR */ +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_50G_M_S 0 +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_50G_M_M ICE_M(0x7, 0) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_25G_M_S 3 +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_25G_M_M ICE_M(0x7, 3) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_10G_M_S 6 +#define 
GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_10G_M_M ICE_M(0x7, 6) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_4G_M_S 9 +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_4G_M_M ICE_M(0x7, 9) +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_A50G_M_S 12 +#define GL_PWR_MODE_DIVIDE_S5_M_CTRL_DIV_VAL_TBW_A50G_M_M ICE_M(0x7, 12) +#define GL_S5_PWR_MODE_EXIT_CTL 0x000B8270 /* Reset Source: POR */ +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_AUTO_EXIT_S 0 +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_AUTO_EXIT_M BIT(0) +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_FW_EXIT_S 1 +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_FW_EXIT_M BIT(1) +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_PRST_FLOWS_ON_CORER_S 3 +#define GL_S5_PWR_MODE_EXIT_CTL_S5_PWR_MODE_PRST_FLOWS_ON_CORER_M BIT(3) +#define GLGEN_PME_TO 0x000B81BC /* Reset Source: POR */ +#define GLGEN_PME_TO_PME_TO_FOR_PE_S 0 +#define GLGEN_PME_TO_PME_TO_FOR_PE_M BIT(0) +#define PRTPM_EEE_STAT 0x001E4320 /* Reset Source: GLOBR */ +#define PRTPM_EEE_STAT_EEE_NEG_S 29 +#define PRTPM_EEE_STAT_EEE_NEG_M BIT(29) +#define PRTPM_EEE_STAT_RX_LPI_STATUS_S 30 +#define PRTPM_EEE_STAT_RX_LPI_STATUS_M BIT(30) +#define PRTPM_EEE_STAT_TX_LPI_STATUS_S 31 +#define PRTPM_EEE_STAT_TX_LPI_STATUS_M BIT(31) +#define PRTPM_EEEC 0x001E4380 /* Reset Source: GLOBR */ +#define PRTPM_EEEC_TW_WAKE_MIN_S 16 +#define PRTPM_EEEC_TW_WAKE_MIN_M ICE_M(0x3F, 16) +#define PRTPM_EEEC_TX_LU_LPI_DLY_S 24 +#define PRTPM_EEEC_TX_LU_LPI_DLY_M ICE_M(0x3, 24) +#define PRTPM_EEEC_TEEE_DLY_S 26 +#define PRTPM_EEEC_TEEE_DLY_M ICE_M(0x3F, 26) +#define PRTPM_EEEFWD 0x001E4400 /* Reset Source: GLOBR */ +#define PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_S 31 +#define PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_M BIT(31) +#define PRTPM_EEER 0x001E4360 /* Reset Source: GLOBR */ +#define PRTPM_EEER_TW_SYSTEM_S 0 +#define PRTPM_EEER_TW_SYSTEM_M ICE_M(0xFFFF, 0) +#define PRTPM_EEER_TX_LPI_EN_S 16 +#define PRTPM_EEER_TX_LPI_EN_M BIT(16) +#define PRTPM_EEETXC 0x001E43E0 /* Reset Source: GLOBR */ +#define PRTPM_EEETXC_TW_PHY_S 0 +#define PRTPM_EEETXC_TW_PHY_M ICE_M(0xFFFF, 0) +#define PRTPM_RLPIC 0x001E43A0 /* Reset Source: GLOBR */ +#define PRTPM_RLPIC_ERLPIC_S 0 +#define PRTPM_RLPIC_ERLPIC_M ICE_M(0xFFFFFFFF, 0) +#define PRTPM_TLPIC 0x001E43C0 /* Reset Source: GLOBR */ +#define PRTPM_TLPIC_ETLPIC_S 0 +#define PRTPM_TLPIC_ETLPIC_M ICE_M(0xFFFFFFFF, 0) +#define GLRPB_DHW(_i) (0x000AC000 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPB_DHW_MAX_INDEX 15 +#define GLRPB_DHW_DHW_TCN_S 0 +#define GLRPB_DHW_DHW_TCN_M ICE_M(0xFFFFF, 0) +#define GLRPB_DLW(_i) (0x000AC044 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPB_DLW_MAX_INDEX 15 +#define GLRPB_DLW_DLW_TCN_S 0 +#define GLRPB_DLW_DLW_TCN_M ICE_M(0xFFFFF, 0) +#define GLRPB_DPS(_i) (0x000AC084 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPB_DPS_MAX_INDEX 15 +#define GLRPB_DPS_DPS_TCN_S 0 +#define GLRPB_DPS_DPS_TCN_M ICE_M(0xFFFFF, 0) +#define GLRPB_DSI_EN 0x000AC324 /* Reset Source: CORER */ +#define GLRPB_DSI_EN_DSI_EN_S 0 +#define GLRPB_DSI_EN_DSI_EN_M BIT(0) +#define GLRPB_DSI_EN_DSI_L2_MAC_ERR_DROP_EN_S 1 +#define GLRPB_DSI_EN_DSI_L2_MAC_ERR_DROP_EN_M BIT(1) +#define GLRPB_SHW(_i) (0x000AC120 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPB_SHW_MAX_INDEX 7 +#define GLRPB_SHW_SHW_S 0 +#define GLRPB_SHW_SHW_M ICE_M(0xFFFFF, 0) +#define GLRPB_SLW(_i) (0x000AC140 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPB_SLW_MAX_INDEX 7 +#define GLRPB_SLW_SLW_S 0 +#define GLRPB_SLW_SLW_M 
ICE_M(0xFFFFF, 0) +#define GLRPB_SPS(_i) (0x000AC0C4 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPB_SPS_MAX_INDEX 7 +#define GLRPB_SPS_SPS_TCN_S 0 +#define GLRPB_SPS_SPS_TCN_M ICE_M(0xFFFFF, 0) +#define GLRPB_TC_CFG(_i) (0x000AC2A4 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPB_TC_CFG_MAX_INDEX 31 +#define GLRPB_TC_CFG_D_POOL_S 0 +#define GLRPB_TC_CFG_D_POOL_M ICE_M(0xFFFF, 0) +#define GLRPB_TC_CFG_S_POOL_S 16 +#define GLRPB_TC_CFG_S_POOL_M ICE_M(0xFFFF, 16) +#define GLRPB_TCHW(_i) (0x000AC330 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPB_TCHW_MAX_INDEX 31 +#define GLRPB_TCHW_TCHW_S 0 +#define GLRPB_TCHW_TCHW_M ICE_M(0xFFFFF, 0) +#define GLRPB_TCLW(_i) (0x000AC3B0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPB_TCLW_MAX_INDEX 31 +#define GLRPB_TCLW_TCLW_S 0 +#define GLRPB_TCLW_TCLW_M ICE_M(0xFFFFF, 0) +#define GLQF_APBVT(_i) (0x00450000 + ((_i) * 4)) /* _i=0...2047 */ /* Reset Source: CORER */ +#define GLQF_APBVT_MAX_INDEX 2047 +#define GLQF_APBVT_APBVT_S 0 +#define GLQF_APBVT_APBVT_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FD_CLSN_0 0x00460028 /* Reset Source: CORER */ +#define GLQF_FD_CLSN_0_HITSBCNT_S 0 +#define GLQF_FD_CLSN_0_HITSBCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FD_CLSN1 0x00460030 /* Reset Source: CORER */ +#define GLQF_FD_CLSN1_HITLBCNT_S 0 +#define GLQF_FD_CLSN1_HITLBCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FD_CNT 0x00460018 /* Reset Source: CORER */ +#define GLQF_FD_CNT_FD_GCNT_S 0 +#define GLQF_FD_CNT_FD_GCNT_M ICE_M(0x7FFF, 0) +#define GLQF_FD_CNT_FD_BCNT_S 16 +#define GLQF_FD_CNT_FD_BCNT_M ICE_M(0x7FFF, 16) +#define GLQF_FD_CTL 0x00460000 /* Reset Source: CORER */ +#define GLQF_FD_CTL_FDLONG_S 0 +#define GLQF_FD_CTL_FDLONG_M ICE_M(0xF, 0) +#define GLQF_FD_CTL_HASH_REPORT_S 4 +#define GLQF_FD_CTL_HASH_REPORT_M BIT(4) +#define GLQF_FD_CTL_FLT_ADDR_REPORT_S 5 +#define GLQF_FD_CTL_FLT_ADDR_REPORT_M BIT(5) +#define GLQF_FD_SIZE 0x00460010 /* Reset Source: CORER */ +#define GLQF_FD_SIZE_FD_GSIZE_S 0 +#define GLQF_FD_SIZE_FD_GSIZE_M ICE_M(0x7FFF, 0) +#define GLQF_FD_SIZE_FD_BSIZE_S 16 +#define GLQF_FD_SIZE_FD_BSIZE_M ICE_M(0x7FFF, 16) +#define GLQF_FDCNT_0 0x00460020 /* Reset Source: CORER */ +#define GLQF_FDCNT_0_BUCKETCNT_S 0 +#define GLQF_FDCNT_0_BUCKETCNT_M ICE_M(0x7FFF, 0) +#define GLQF_FDCNT_0_CNT_NOT_VLD_S 31 +#define GLQF_FDCNT_0_CNT_NOT_VLD_M BIT(31) +#define GLQF_FDEVICTENA(_i) (0x00452000 + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLQF_FDEVICTENA_MAX_INDEX 3 +#define GLQF_FDEVICTENA_FDEVICTENA_S 0 +#define GLQF_FDEVICTENA_FDEVICTENA_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FDINSET(_i, _j) (0x00412000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_FDINSET_MAX_INDEX 127 +#define GLQF_FDINSET_FV_WORD_INDX0_S 0 +#define GLQF_FDINSET_FV_WORD_INDX0_M ICE_M(0x1F, 0) +#define GLQF_FDINSET_FV_WORD_VAL0_S 7 +#define GLQF_FDINSET_FV_WORD_VAL0_M BIT(7) +#define GLQF_FDINSET_FV_WORD_INDX1_S 8 +#define GLQF_FDINSET_FV_WORD_INDX1_M ICE_M(0x1F, 8) +#define GLQF_FDINSET_FV_WORD_VAL1_S 15 +#define GLQF_FDINSET_FV_WORD_VAL1_M BIT(15) +#define GLQF_FDINSET_FV_WORD_INDX2_S 16 +#define GLQF_FDINSET_FV_WORD_INDX2_M ICE_M(0x1F, 16) +#define GLQF_FDINSET_FV_WORD_VAL2_S 23 +#define GLQF_FDINSET_FV_WORD_VAL2_M BIT(23) +#define GLQF_FDINSET_FV_WORD_INDX3_S 24 +#define GLQF_FDINSET_FV_WORD_INDX3_M ICE_M(0x1F, 24) +#define GLQF_FDINSET_FV_WORD_VAL3_S 31 +#define GLQF_FDINSET_FV_WORD_VAL3_M BIT(31) +#define GLQF_FDMASK(_i) (0x00410800 + ((_i) * 
4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLQF_FDMASK_MAX_INDEX 31 +#define GLQF_FDMASK_MSK_INDEX_S 0 +#define GLQF_FDMASK_MSK_INDEX_M ICE_M(0x1F, 0) +#define GLQF_FDMASK_MASK_S 16 +#define GLQF_FDMASK_MASK_M ICE_M(0xFFFF, 16) +#define GLQF_FDMASK_SEL(_i) (0x00410400 + ((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLQF_FDMASK_SEL_MAX_INDEX 127 +#define GLQF_FDMASK_SEL_MASK_SEL_S 0 +#define GLQF_FDMASK_SEL_MASK_SEL_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_FDSWAP(_i, _j) (0x00413000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_FDSWAP_MAX_INDEX 127 +#define GLQF_FDSWAP_FV_WORD_INDX0_S 0 +#define GLQF_FDSWAP_FV_WORD_INDX0_M ICE_M(0x1F, 0) +#define GLQF_FDSWAP_FV_WORD_VAL0_S 7 +#define GLQF_FDSWAP_FV_WORD_VAL0_M BIT(7) +#define GLQF_FDSWAP_FV_WORD_INDX1_S 8 +#define GLQF_FDSWAP_FV_WORD_INDX1_M ICE_M(0x1F, 8) +#define GLQF_FDSWAP_FV_WORD_VAL1_S 15 +#define GLQF_FDSWAP_FV_WORD_VAL1_M BIT(15) +#define GLQF_FDSWAP_FV_WORD_INDX2_S 16 +#define GLQF_FDSWAP_FV_WORD_INDX2_M ICE_M(0x1F, 16) +#define GLQF_FDSWAP_FV_WORD_VAL2_S 23 +#define GLQF_FDSWAP_FV_WORD_VAL2_M BIT(23) +#define GLQF_FDSWAP_FV_WORD_INDX3_S 24 +#define GLQF_FDSWAP_FV_WORD_INDX3_M ICE_M(0x1F, 24) +#define GLQF_FDSWAP_FV_WORD_VAL3_S 31 +#define GLQF_FDSWAP_FV_WORD_VAL3_M BIT(31) +#define GLQF_HINSET(_i, _j) (0x0040E000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_HINSET_MAX_INDEX 127 +#define GLQF_HINSET_FV_WORD_INDX0_S 0 +#define GLQF_HINSET_FV_WORD_INDX0_M ICE_M(0x1F, 0) +#define GLQF_HINSET_FV_WORD_VAL0_S 7 +#define GLQF_HINSET_FV_WORD_VAL0_M BIT(7) +#define GLQF_HINSET_FV_WORD_INDX1_S 8 +#define GLQF_HINSET_FV_WORD_INDX1_M ICE_M(0x1F, 8) +#define GLQF_HINSET_FV_WORD_VAL1_S 15 +#define GLQF_HINSET_FV_WORD_VAL1_M BIT(15) +#define GLQF_HINSET_FV_WORD_INDX2_S 16 +#define GLQF_HINSET_FV_WORD_INDX2_M ICE_M(0x1F, 16) +#define GLQF_HINSET_FV_WORD_VAL2_S 23 +#define GLQF_HINSET_FV_WORD_VAL2_M BIT(23) +#define GLQF_HINSET_FV_WORD_INDX3_S 24 +#define GLQF_HINSET_FV_WORD_INDX3_M ICE_M(0x1F, 24) +#define GLQF_HINSET_FV_WORD_VAL3_S 31 +#define GLQF_HINSET_FV_WORD_VAL3_M BIT(31) +#define GLQF_HKEY(_i) (0x00456000 + ((_i) * 4)) /* _i=0...12 */ /* Reset Source: CORER */ +#define GLQF_HKEY_MAX_INDEX 12 +#define GLQF_HKEY_KEY_0_S 0 +#define GLQF_HKEY_KEY_0_M ICE_M(0xFF, 0) +#define GLQF_HKEY_KEY_1_S 8 +#define GLQF_HKEY_KEY_1_M ICE_M(0xFF, 8) +#define GLQF_HKEY_KEY_2_S 16 +#define GLQF_HKEY_KEY_2_M ICE_M(0xFF, 16) +#define GLQF_HKEY_KEY_3_S 24 +#define GLQF_HKEY_KEY_3_M ICE_M(0xFF, 24) +#define GLQF_HLUT(_i, _j) (0x00438000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...15 */ /* Reset Source: CORER */ +#define GLQF_HLUT_MAX_INDEX 127 +#define GLQF_HLUT_LUT0_S 0 +#define GLQF_HLUT_LUT0_M ICE_M(0x3F, 0) +#define GLQF_HLUT_LUT1_S 8 +#define GLQF_HLUT_LUT1_M ICE_M(0x3F, 8) +#define GLQF_HLUT_LUT2_S 16 +#define GLQF_HLUT_LUT2_M ICE_M(0x3F, 16) +#define GLQF_HLUT_LUT3_S 24 +#define GLQF_HLUT_LUT3_M ICE_M(0x3F, 24) +#define GLQF_HLUT_SIZE(_i) (0x00455400 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLQF_HLUT_SIZE_MAX_INDEX 15 +#define GLQF_HLUT_SIZE_HSIZE_S 0 +#define GLQF_HLUT_SIZE_HSIZE_M BIT(0) +#define GLQF_HMASK(_i) (0x0040FC00 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLQF_HMASK_MAX_INDEX 31 +#define GLQF_HMASK_MSK_INDEX_S 0 +#define GLQF_HMASK_MSK_INDEX_M ICE_M(0x1F, 0) +#define GLQF_HMASK_MASK_S 16 +#define GLQF_HMASK_MASK_M ICE_M(0xFFFF, 16) +#define GLQF_HMASK_SEL(_i) (0x00410000 + 
((_i) * 4)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GLQF_HMASK_SEL_MAX_INDEX 127 +#define GLQF_HMASK_SEL_MASK_SEL_S 0 +#define GLQF_HMASK_SEL_MASK_SEL_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_HSYMM(_i, _j) (0x0040F000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_HSYMM_MAX_INDEX 127 +#define GLQF_HSYMM_FV_SYMM_INDX0_S 0 +#define GLQF_HSYMM_FV_SYMM_INDX0_M ICE_M(0x1F, 0) +#define GLQF_HSYMM_SYMM0_ENA_S 7 +#define GLQF_HSYMM_SYMM0_ENA_M BIT(7) +#define GLQF_HSYMM_FV_SYMM_INDX1_S 8 +#define GLQF_HSYMM_FV_SYMM_INDX1_M ICE_M(0x1F, 8) +#define GLQF_HSYMM_SYMM1_ENA_S 15 +#define GLQF_HSYMM_SYMM1_ENA_M BIT(15) +#define GLQF_HSYMM_FV_SYMM_INDX2_S 16 +#define GLQF_HSYMM_FV_SYMM_INDX2_M ICE_M(0x1F, 16) +#define GLQF_HSYMM_SYMM2_ENA_S 23 +#define GLQF_HSYMM_SYMM2_ENA_M BIT(23) +#define GLQF_HSYMM_FV_SYMM_INDX3_S 24 +#define GLQF_HSYMM_FV_SYMM_INDX3_M ICE_M(0x1F, 24) +#define GLQF_HSYMM_SYMM3_ENA_S 31 +#define GLQF_HSYMM_SYMM3_ENA_M BIT(31) +#define GLQF_PE_APBVT_CNT 0x00455500 /* Reset Source: CORER */ +#define GLQF_PE_APBVT_CNT_APBVT_LAN_S 0 +#define GLQF_PE_APBVT_CNT_APBVT_LAN_M ICE_M(0xFFFFFFFF, 0) +#define GLQF_PE_CMD 0x00471080 /* Reset Source: CORER */ +#define GLQF_PE_CMD_ADDREM_STS_S 0 +#define GLQF_PE_CMD_ADDREM_STS_M ICE_M(0xFFFFFF, 0) +#define GLQF_PE_CMD_ADDREM_ID_S 28 +#define GLQF_PE_CMD_ADDREM_ID_M ICE_M(0xF, 28) +#define GLQF_PE_CTL 0x004710C0 /* Reset Source: CORER */ +#define GLQF_PE_CTL_PELONG_S 0 +#define GLQF_PE_CTL_PELONG_M ICE_M(0xF, 0) +#define GLQF_PE_CTL2(_i) (0x00455200 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLQF_PE_CTL2_MAX_INDEX 31 +#define GLQF_PE_CTL2_TO_QH_S 0 +#define GLQF_PE_CTL2_TO_QH_M ICE_M(0x3, 0) +#define GLQF_PE_CTL2_APBVT_ENA_S 2 +#define GLQF_PE_CTL2_APBVT_ENA_M BIT(2) +#define GLQF_PE_FVE 0x0020E514 /* Reset Source: CORER */ +#define GLQF_PE_FVE_W_ENA_S 0 +#define GLQF_PE_FVE_W_ENA_M ICE_M(0xFFFFFF, 0) +#define GLQF_PE_OSR_STS 0x00471040 /* Reset Source: CORER */ +#define GLQF_PE_OSR_STS_QH_SRCH_MAXOSR_S 0 +#define GLQF_PE_OSR_STS_QH_SRCH_MAXOSR_M ICE_M(0x3FF, 0) +#define GLQF_PE_OSR_STS_QH_CMD_MAXOSR_S 16 +#define GLQF_PE_OSR_STS_QH_CMD_MAXOSR_M ICE_M(0x3FF, 16) +#define GLQF_PEINSET(_i, _j) (0x00415000 + ((_i) * 4 + (_j) * 128)) /* _i=0...31, _j=0...5 */ /* Reset Source: CORER */ +#define GLQF_PEINSET_MAX_INDEX 31 +#define GLQF_PEINSET_FV_WORD_INDX0_S 0 +#define GLQF_PEINSET_FV_WORD_INDX0_M ICE_M(0x1F, 0) +#define GLQF_PEINSET_FV_WORD_VAL0_S 7 +#define GLQF_PEINSET_FV_WORD_VAL0_M BIT(7) +#define GLQF_PEINSET_FV_WORD_INDX1_S 8 +#define GLQF_PEINSET_FV_WORD_INDX1_M ICE_M(0x1F, 8) +#define GLQF_PEINSET_FV_WORD_VAL1_S 15 +#define GLQF_PEINSET_FV_WORD_VAL1_M BIT(15) +#define GLQF_PEINSET_FV_WORD_INDX2_S 16 +#define GLQF_PEINSET_FV_WORD_INDX2_M ICE_M(0x1F, 16) +#define GLQF_PEINSET_FV_WORD_VAL2_S 23 +#define GLQF_PEINSET_FV_WORD_VAL2_M BIT(23) +#define GLQF_PEINSET_FV_WORD_INDX3_S 24 +#define GLQF_PEINSET_FV_WORD_INDX3_M ICE_M(0x1F, 24) +#define GLQF_PEINSET_FV_WORD_VAL3_S 31 +#define GLQF_PEINSET_FV_WORD_VAL3_M BIT(31) +#define GLQF_PEMASK(_i) (0x00415400 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLQF_PEMASK_MAX_INDEX 15 +#define GLQF_PEMASK_MSK_INDEX_S 0 +#define GLQF_PEMASK_MSK_INDEX_M ICE_M(0x1F, 0) +#define GLQF_PEMASK_MASK_S 16 +#define GLQF_PEMASK_MASK_M ICE_M(0xFFFF, 16) +#define GLQF_PEMASK_SEL(_i) (0x00415500 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLQF_PEMASK_SEL_MAX_INDEX 31 +#define GLQF_PEMASK_SEL_MASK_SEL_S 0 +#define 
GLQF_PEMASK_SEL_MASK_SEL_M ICE_M(0xFFFF, 0) +#define GLQF_PETABLE_CLR(_i) (0x000AA078 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLQF_PETABLE_CLR_MAX_INDEX 1 +#define GLQF_PETABLE_CLR_VM_VF_NUM_S 0 +#define GLQF_PETABLE_CLR_VM_VF_NUM_M ICE_M(0x3FF, 0) +#define GLQF_PETABLE_CLR_VM_VF_TYPE_S 10 +#define GLQF_PETABLE_CLR_VM_VF_TYPE_M ICE_M(0x3, 10) +#define GLQF_PETABLE_CLR_PF_NUM_S 12 +#define GLQF_PETABLE_CLR_PF_NUM_M ICE_M(0x7, 12) +#define GLQF_PETABLE_CLR_PE_BUSY_S 16 +#define GLQF_PETABLE_CLR_PE_BUSY_M BIT(16) +#define GLQF_PETABLE_CLR_PE_CLEAR_S 17 +#define GLQF_PETABLE_CLR_PE_CLEAR_M BIT(17) +#define GLQF_PROF2TC(_i, _j) (0x0044D000 + ((_i) * 4 + (_j) * 512)) /* _i=0...127, _j=0...3 */ /* Reset Source: CORER */ +#define GLQF_PROF2TC_MAX_INDEX 127 +#define GLQF_PROF2TC_OVERRIDE_ENA_0_S 0 +#define GLQF_PROF2TC_OVERRIDE_ENA_0_M BIT(0) +#define GLQF_PROF2TC_REGION_0_S 1 +#define GLQF_PROF2TC_REGION_0_M ICE_M(0x7, 1) +#define GLQF_PROF2TC_OVERRIDE_ENA_1_S 4 +#define GLQF_PROF2TC_OVERRIDE_ENA_1_M BIT(4) +#define GLQF_PROF2TC_REGION_1_S 5 +#define GLQF_PROF2TC_REGION_1_M ICE_M(0x7, 5) +#define GLQF_PROF2TC_OVERRIDE_ENA_2_S 8 +#define GLQF_PROF2TC_OVERRIDE_ENA_2_M BIT(8) +#define GLQF_PROF2TC_REGION_2_S 9 +#define GLQF_PROF2TC_REGION_2_M ICE_M(0x7, 9) +#define GLQF_PROF2TC_OVERRIDE_ENA_3_S 12 +#define GLQF_PROF2TC_OVERRIDE_ENA_3_M BIT(12) +#define GLQF_PROF2TC_REGION_3_S 13 +#define GLQF_PROF2TC_REGION_3_M ICE_M(0x7, 13) +#define GLQF_PROF2TC_OVERRIDE_ENA_4_S 16 +#define GLQF_PROF2TC_OVERRIDE_ENA_4_M BIT(16) +#define GLQF_PROF2TC_REGION_4_S 17 +#define GLQF_PROF2TC_REGION_4_M ICE_M(0x7, 17) +#define GLQF_PROF2TC_OVERRIDE_ENA_5_S 20 +#define GLQF_PROF2TC_OVERRIDE_ENA_5_M BIT(20) +#define GLQF_PROF2TC_REGION_5_S 21 +#define GLQF_PROF2TC_REGION_5_M ICE_M(0x7, 21) +#define GLQF_PROF2TC_OVERRIDE_ENA_6_S 24 +#define GLQF_PROF2TC_OVERRIDE_ENA_6_M BIT(24) +#define GLQF_PROF2TC_REGION_6_S 25 +#define GLQF_PROF2TC_REGION_6_M ICE_M(0x7, 25) +#define GLQF_PROF2TC_OVERRIDE_ENA_7_S 28 +#define GLQF_PROF2TC_OVERRIDE_ENA_7_M BIT(28) +#define GLQF_PROF2TC_REGION_7_S 29 +#define GLQF_PROF2TC_REGION_7_M ICE_M(0x7, 29) +#define PFQF_FD_CNT 0x00460180 /* Reset Source: CORER */ +#define PFQF_FD_CNT_FD_GCNT_S 0 +#define PFQF_FD_CNT_FD_GCNT_M ICE_M(0x7FFF, 0) +#define PFQF_FD_CNT_FD_BCNT_S 16 +#define PFQF_FD_CNT_FD_BCNT_M ICE_M(0x7FFF, 16) +#define PFQF_FD_ENA 0x0043A000 /* Reset Source: CORER */ +#define PFQF_FD_ENA_FD_ENA_S 0 +#define PFQF_FD_ENA_FD_ENA_M BIT(0) +#define PFQF_FD_SIZE 0x00460100 /* Reset Source: CORER */ +#define PFQF_FD_SIZE_FD_GSIZE_S 0 +#define PFQF_FD_SIZE_FD_GSIZE_M ICE_M(0x7FFF, 0) +#define PFQF_FD_SIZE_FD_BSIZE_S 16 +#define PFQF_FD_SIZE_FD_BSIZE_M ICE_M(0x7FFF, 16) +#define PFQF_FD_SUBTRACT 0x00460200 /* Reset Source: CORER */ +#define PFQF_FD_SUBTRACT_FD_GCNT_S 0 +#define PFQF_FD_SUBTRACT_FD_GCNT_M ICE_M(0x7FFF, 0) +#define PFQF_FD_SUBTRACT_FD_BCNT_S 16 +#define PFQF_FD_SUBTRACT_FD_BCNT_M ICE_M(0x7FFF, 16) +#define PFQF_HLUT(_i) (0x00430000 + ((_i) * 64)) /* _i=0...511 */ /* Reset Source: CORER */ +#define PFQF_HLUT_MAX_INDEX 511 +#define PFQF_HLUT_LUT0_S 0 +#define PFQF_HLUT_LUT0_M ICE_M(0xFF, 0) +#define PFQF_HLUT_LUT1_S 8 +#define PFQF_HLUT_LUT1_M ICE_M(0xFF, 8) +#define PFQF_HLUT_LUT2_S 16 +#define PFQF_HLUT_LUT2_M ICE_M(0xFF, 16) +#define PFQF_HLUT_LUT3_S 24 +#define PFQF_HLUT_LUT3_M ICE_M(0xFF, 24) +#define PFQF_HLUT_SIZE 0x00455480 /* Reset Source: CORER */ +#define PFQF_HLUT_SIZE_HSIZE_S 0 +#define PFQF_HLUT_SIZE_HSIZE_M ICE_M(0x3, 0) +#define PFQF_PE_CLSN0 
0x00470480 /* Reset Source: CORER */ +#define PFQF_PE_CLSN0_HITSBCNT_S 0 +#define PFQF_PE_CLSN0_HITSBCNT_M ICE_M(0xFFFFFFFF, 0) +#define PFQF_PE_CLSN1 0x00470500 /* Reset Source: CORER */ +#define PFQF_PE_CLSN1_HITLBCNT_S 0 +#define PFQF_PE_CLSN1_HITLBCNT_M ICE_M(0xFFFFFFFF, 0) +#define PFQF_PE_CTL1 0x00470000 /* Reset Source: CORER */ +#define PFQF_PE_CTL1_PEHSIZE_S 0 +#define PFQF_PE_CTL1_PEHSIZE_M ICE_M(0xF, 0) +#define PFQF_PE_CTL2 0x00470040 /* Reset Source: CORER */ +#define PFQF_PE_CTL2_PEDSIZE_S 0 +#define PFQF_PE_CTL2_PEDSIZE_M ICE_M(0xF, 0) +#define PFQF_PE_FILTERING_ENA 0x0043A080 /* Reset Source: CORER */ +#define PFQF_PE_FILTERING_ENA_PE_ENA_S 0 +#define PFQF_PE_FILTERING_ENA_PE_ENA_M BIT(0) +#define PFQF_PE_FLHD 0x00470100 /* Reset Source: CORER */ +#define PFQF_PE_FLHD_FLHD_S 0 +#define PFQF_PE_FLHD_FLHD_M ICE_M(0xFFFFFF, 0) +#define PFQF_PE_ST_CTL 0x00470400 /* Reset Source: CORER */ +#define PFQF_PE_ST_CTL_PF_CNT_EN_S 0 +#define PFQF_PE_ST_CTL_PF_CNT_EN_M BIT(0) +#define PFQF_PE_ST_CTL_VFS_CNT_EN_S 1 +#define PFQF_PE_ST_CTL_VFS_CNT_EN_M BIT(1) +#define PFQF_PE_ST_CTL_VF_CNT_EN_S 2 +#define PFQF_PE_ST_CTL_VF_CNT_EN_M BIT(2) +#define PFQF_PE_ST_CTL_VF_NUM_S 16 +#define PFQF_PE_ST_CTL_VF_NUM_M ICE_M(0xFF, 16) +#define PFQF_PE_TC_CTL 0x00452080 /* Reset Source: CORER */ +#define PFQF_PE_TC_CTL_TC_EN_PF_S 0 +#define PFQF_PE_TC_CTL_TC_EN_PF_M ICE_M(0xFF, 0) +#define PFQF_PE_TC_CTL_TC_EN_VF_S 16 +#define PFQF_PE_TC_CTL_TC_EN_VF_M ICE_M(0xFF, 16) +#define PFQF_PECNT_0 0x00470200 /* Reset Source: CORER */ +#define PFQF_PECNT_0_BUCKETCNT_S 0 +#define PFQF_PECNT_0_BUCKETCNT_M ICE_M(0x3FFFF, 0) +#define PFQF_PECNT_1 0x00470300 /* Reset Source: CORER */ +#define PFQF_PECNT_1_FLTCNT_S 0 +#define PFQF_PECNT_1_FLTCNT_M ICE_M(0x3FFFF, 0) +#define VPQF_PE_CTL1(_VF) (0x00474000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PE_CTL1_MAX_INDEX 255 +#define VPQF_PE_CTL1_PEHSIZE_S 0 +#define VPQF_PE_CTL1_PEHSIZE_M ICE_M(0xF, 0) +#define VPQF_PE_CTL2(_VF) (0x00474800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PE_CTL2_MAX_INDEX 255 +#define VPQF_PE_CTL2_PEDSIZE_S 0 +#define VPQF_PE_CTL2_PEDSIZE_M ICE_M(0xF, 0) +#define VPQF_PE_FILTERING_ENA(_VF) (0x00455800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PE_FILTERING_ENA_MAX_INDEX 255 +#define VPQF_PE_FILTERING_ENA_PE_ENA_S 0 +#define VPQF_PE_FILTERING_ENA_PE_ENA_M BIT(0) +#define VPQF_PE_FLHD(_VF) (0x00472000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PE_FLHD_MAX_INDEX 255 +#define VPQF_PE_FLHD_FLHD_S 0 +#define VPQF_PE_FLHD_FLHD_M ICE_M(0xFFFFFF, 0) +#define VPQF_PECNT_0(_VF) (0x00472800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PECNT_0_MAX_INDEX 255 +#define VPQF_PECNT_0_BUCKETCNT_S 0 +#define VPQF_PECNT_0_BUCKETCNT_M ICE_M(0x3FFFF, 0) +#define VPQF_PECNT_1(_VF) (0x00473000 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VPQF_PECNT_1_MAX_INDEX 255 +#define VPQF_PECNT_1_FLTCNT_S 0 +#define VPQF_PECNT_1_FLTCNT_M ICE_M(0x3FFFF, 0) +#define GLDCB_RMPMC 0x001223C8 /* Reset Source: CORER */ +#define GLDCB_RMPMC_RSPM_S 0 +#define GLDCB_RMPMC_RSPM_M ICE_M(0x3F, 0) +#define GLDCB_RMPMC_MIQ_NODROP_MODE_S 6 +#define GLDCB_RMPMC_MIQ_NODROP_MODE_M ICE_M(0x1F, 6) +#define GLDCB_RMPMC_RPM_DIS_S 31 +#define GLDCB_RMPMC_RPM_DIS_M BIT(31) +#define GLDCB_RMPMS 0x001223CC /* Reset Source: CORER */ +#define GLDCB_RMPMS_RMPM_S 0 +#define GLDCB_RMPMS_RMPM_M ICE_M(0xFFFF, 0) +#define GLDCB_RPCC 0x00122260 /* Reset 
Source: CORER */ +#define GLDCB_RPCC_EN_S 0 +#define GLDCB_RPCC_EN_M BIT(0) +#define GLDCB_RPCC_SCL_FACT_S 4 +#define GLDCB_RPCC_SCL_FACT_M ICE_M(0x1F, 4) +#define GLDCB_RPCC_THRSH_S 16 +#define GLDCB_RPCC_THRSH_M ICE_M(0xFFF, 16) +#define GLDCB_RSPMC 0x001223C4 /* Reset Source: CORER */ +#define GLDCB_RSPMC_RSPM_S 0 +#define GLDCB_RSPMC_RSPM_M ICE_M(0xFF, 0) +#define GLDCB_RSPMC_RPM_MODE_S 8 +#define GLDCB_RSPMC_RPM_MODE_M ICE_M(0x3, 8) +#define GLDCB_RSPMC_PRR_MAX_EXP_S 10 +#define GLDCB_RSPMC_PRR_MAX_EXP_M ICE_M(0xF, 10) +#define GLDCB_RSPMC_PFCTIMER_S 14 +#define GLDCB_RSPMC_PFCTIMER_M ICE_M(0x3FFF, 14) +#define GLDCB_RSPMC_RPM_DIS_S 31 +#define GLDCB_RSPMC_RPM_DIS_M BIT(31) +#define GLDCB_RSPMS 0x001223C0 /* Reset Source: CORER */ +#define GLDCB_RSPMS_RSPM_S 0 +#define GLDCB_RSPMS_RSPM_M ICE_M(0x3FFFF, 0) +#define GLDCB_RTCTI 0x001223D0 /* Reset Source: CORER */ +#define GLDCB_RTCTI_PFCTIMEOUT_TC_S 0 +#define GLDCB_RTCTI_PFCTIMEOUT_TC_M ICE_M(0xFFFFFFFF, 0) +#define GLDCB_RTCTQ(_i) (0x001222C0 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_RTCTQ_MAX_INDEX 31 +#define GLDCB_RTCTQ_RXQNUM_S 0 +#define GLDCB_RTCTQ_RXQNUM_M ICE_M(0x7FF, 0) +#define GLDCB_RTCTQ_IS_PF_Q_S 16 +#define GLDCB_RTCTQ_IS_PF_Q_M BIT(16) +#define GLDCB_RTCTS(_i) (0x00122340 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLDCB_RTCTS_MAX_INDEX 31 +#define GLDCB_RTCTS_PFCTIMER_S 0 +#define GLDCB_RTCTS_PFCTIMER_M ICE_M(0x3FFF, 0) +#define GLRCB_CFG_COTF_CNT(_i) (0x001223D4 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRCB_CFG_COTF_CNT_MAX_INDEX 7 +#define GLRCB_CFG_COTF_CNT_MRKR_COTF_CNT_S 0 +#define GLRCB_CFG_COTF_CNT_MRKR_COTF_CNT_M ICE_M(0x3F, 0) +#define GLRCB_CFG_COTF_ST 0x001223F4 /* Reset Source: CORER */ +#define GLRCB_CFG_COTF_ST_MRKR_COTF_ST_S 0 +#define GLRCB_CFG_COTF_ST_MRKR_COTF_ST_M ICE_M(0xFF, 0) +#define GLRPRS_PMCFG_DHW(_i) (0x00200388 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_DHW_MAX_INDEX 15 +#define GLRPRS_PMCFG_DHW_DHW_S 0 +#define GLRPRS_PMCFG_DHW_DHW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_DLW(_i) (0x002003C8 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_DLW_MAX_INDEX 15 +#define GLRPRS_PMCFG_DLW_DLW_S 0 +#define GLRPRS_PMCFG_DLW_DLW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_DPS(_i) (0x00200308 + ((_i) * 4)) /* _i=0...15 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_DPS_MAX_INDEX 15 +#define GLRPRS_PMCFG_DPS_DPS_S 0 +#define GLRPRS_PMCFG_DPS_DPS_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_SHW(_i) (0x00200448 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_SHW_MAX_INDEX 7 +#define GLRPRS_PMCFG_SHW_SHW_S 0 +#define GLRPRS_PMCFG_SHW_SHW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_SLW(_i) (0x00200468 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_SLW_MAX_INDEX 7 +#define GLRPRS_PMCFG_SLW_SLW_S 0 +#define GLRPRS_PMCFG_SLW_SLW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_SPS(_i) (0x00200408 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_SPS_MAX_INDEX 7 +#define GLRPRS_PMCFG_SPS_SPS_S 0 +#define GLRPRS_PMCFG_SPS_SPS_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_TC_CFG(_i) (0x00200488 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_TC_CFG_MAX_INDEX 31 +#define GLRPRS_PMCFG_TC_CFG_D_POOL_S 0 +#define GLRPRS_PMCFG_TC_CFG_D_POOL_M ICE_M(0xF, 0) +#define GLRPRS_PMCFG_TC_CFG_S_POOL_S 16 +#define GLRPRS_PMCFG_TC_CFG_S_POOL_M ICE_M(0x7, 16) +#define GLRPRS_PMCFG_TCHW(_i) 
(0x00200588 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_TCHW_MAX_INDEX 31 +#define GLRPRS_PMCFG_TCHW_TCHW_S 0 +#define GLRPRS_PMCFG_TCHW_TCHW_M ICE_M(0xFFFFF, 0) +#define GLRPRS_PMCFG_TCLW(_i) (0x00200608 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLRPRS_PMCFG_TCLW_MAX_INDEX 31 +#define GLRPRS_PMCFG_TCLW_TCLW_S 0 +#define GLRPRS_PMCFG_TCLW_TCLW_M ICE_M(0xFFFFF, 0) +#define GLSWT_PMCFG_TC_CFG(_i) (0x00204900 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GLSWT_PMCFG_TC_CFG_MAX_INDEX 31 +#define GLSWT_PMCFG_TC_CFG_D_POOL_S 0 +#define GLSWT_PMCFG_TC_CFG_D_POOL_M ICE_M(0xF, 0) +#define GLSWT_PMCFG_TC_CFG_S_POOL_S 16 +#define GLSWT_PMCFG_TC_CFG_S_POOL_M ICE_M(0x7, 16) +#define PRTDCB_RLANPMS 0x00122280 /* Reset Source: CORER */ +#define PRTDCB_RLANPMS_LANRPPM_S 0 +#define PRTDCB_RLANPMS_LANRPPM_M ICE_M(0x3FFFF, 0) +#define PRTDCB_RPPMC 0x00122240 /* Reset Source: CORER */ +#define PRTDCB_RPPMC_LANRPPM_S 0 +#define PRTDCB_RPPMC_LANRPPM_M ICE_M(0xFF, 0) +#define PRTDCB_RPPMC_RDMARPPM_S 8 +#define PRTDCB_RPPMC_RDMARPPM_M ICE_M(0xFF, 8) +#define PRTDCB_RRDMAPMS 0x00122120 /* Reset Source: CORER */ +#define PRTDCB_RRDMAPMS_RDMARPPM_S 0 +#define PRTDCB_RRDMAPMS_RDMARPPM_M ICE_M(0x3FFFF, 0) +#define GL_STAT_SWR_BPCH(_i) (0x00347804 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_BPCH_MAX_INDEX 127 +#define GL_STAT_SWR_BPCH_VLBPCH_S 0 +#define GL_STAT_SWR_BPCH_VLBPCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_BPCL(_i) (0x00347800 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_BPCL_MAX_INDEX 127 +#define GL_STAT_SWR_BPCL_VLBPCL_S 0 +#define GL_STAT_SWR_BPCL_VLBPCL_M ICE_M(0xFFFFFFFF, 0) +#define GL_STAT_SWR_GORCH(_i) (0x00342004 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_GORCH_MAX_INDEX 127 +#define GL_STAT_SWR_GORCH_VLBCH_S 0 +#define GL_STAT_SWR_GORCH_VLBCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_GORCL(_i) (0x00342000 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_GORCL_MAX_INDEX 127 +#define GL_STAT_SWR_GORCL_VLBCL_S 0 +#define GL_STAT_SWR_GORCL_VLBCL_M ICE_M(0xFFFFFFFF, 0) +#define GL_STAT_SWR_GOTCH(_i) (0x00304004 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_GOTCH_MAX_INDEX 127 +#define GL_STAT_SWR_GOTCH_VLBCH_S 0 +#define GL_STAT_SWR_GOTCH_VLBCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_GOTCL(_i) (0x00304000 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_GOTCL_MAX_INDEX 127 +#define GL_STAT_SWR_GOTCL_VLBCL_S 0 +#define GL_STAT_SWR_GOTCL_VLBCL_M ICE_M(0xFFFFFFFF, 0) +#define GL_STAT_SWR_MPCH(_i) (0x00347404 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_MPCH_MAX_INDEX 127 +#define GL_STAT_SWR_MPCH_VLMPCH_S 0 +#define GL_STAT_SWR_MPCH_VLMPCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_MPCL(_i) (0x00347400 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_MPCL_MAX_INDEX 127 +#define GL_STAT_SWR_MPCL_VLMPCL_S 0 +#define GL_STAT_SWR_MPCL_VLMPCL_M ICE_M(0xFFFFFFFF, 0) +#define GL_STAT_SWR_UPCH(_i) (0x00347004 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_UPCH_MAX_INDEX 127 +#define GL_STAT_SWR_UPCH_VLUPCH_S 0 +#define GL_STAT_SWR_UPCH_VLUPCH_M ICE_M(0xFF, 0) +#define GL_STAT_SWR_UPCL(_i) (0x00347000 + ((_i) * 8)) /* _i=0...127 */ /* Reset Source: CORER */ +#define GL_STAT_SWR_UPCL_MAX_INDEX 127 +#define GL_STAT_SWR_UPCL_VLUPCL_S 0 +#define GL_STAT_SWR_UPCL_VLUPCL_M 
ICE_M(0xFFFFFFFF, 0) +#define GLPRT_AORCL(_i) (0x003812C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_AORCL_MAX_INDEX 7 +#define GLPRT_AORCL_AORCL_S 0 +#define GLPRT_AORCL_AORCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_BPRCH(_i) (0x00381384 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_BPRCH_MAX_INDEX 7 +#define GLPRT_BPRCH_UPRCH_S 0 +#define GLPRT_BPRCH_UPRCH_M ICE_M(0xFF, 0) +#define GLPRT_BPRCL(_i) (0x00381380 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_BPRCL_MAX_INDEX 7 +#define GLPRT_BPRCL_UPRCH_S 0 +#define GLPRT_BPRCL_UPRCH_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_BPTCH(_i) (0x00381244 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_BPTCH_MAX_INDEX 7 +#define GLPRT_BPTCH_UPRCH_S 0 +#define GLPRT_BPTCH_UPRCH_M ICE_M(0xFF, 0) +#define GLPRT_BPTCL(_i) (0x00381240 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_BPTCL_MAX_INDEX 7 +#define GLPRT_BPTCL_UPRCH_S 0 +#define GLPRT_BPTCL_UPRCH_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_CRCERRS(_i) (0x00380100 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_CRCERRS_MAX_INDEX 7 +#define GLPRT_CRCERRS_CRCERRS_S 0 +#define GLPRT_CRCERRS_CRCERRS_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_CRCERRS_H(_i) (0x00380104 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_CRCERRS_H_MAX_INDEX 7 +#define GLPRT_CRCERRS_H_CRCERRS_S 0 +#define GLPRT_CRCERRS_H_CRCERRS_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_GORCH(_i) (0x00380004 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_GORCH_MAX_INDEX 7 +#define GLPRT_GORCH_GORCH_S 0 +#define GLPRT_GORCH_GORCH_M ICE_M(0xFF, 0) +#define GLPRT_GORCL(_i) (0x00380000 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_GORCL_MAX_INDEX 7 +#define GLPRT_GORCL_GORCL_S 0 +#define GLPRT_GORCL_GORCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_GOTCH(_i) (0x00380B44 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_GOTCH_MAX_INDEX 7 +#define GLPRT_GOTCH_GOTCH_S 0 +#define GLPRT_GOTCH_GOTCH_M ICE_M(0xFF, 0) +#define GLPRT_GOTCL(_i) (0x00380B40 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_GOTCL_MAX_INDEX 7 +#define GLPRT_GOTCL_GOTCL_S 0 +#define GLPRT_GOTCL_GOTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_ILLERRC(_i) (0x003801C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_ILLERRC_MAX_INDEX 7 +#define GLPRT_ILLERRC_ILLERRC_S 0 +#define GLPRT_ILLERRC_ILLERRC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_ILLERRC_H(_i) (0x003801C4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_ILLERRC_H_MAX_INDEX 7 +#define GLPRT_ILLERRC_H_ILLERRC_S 0 +#define GLPRT_ILLERRC_H_ILLERRC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXOFFRXC(_i) (0x003802C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXOFFRXC_MAX_INDEX 7 +#define GLPRT_LXOFFRXC_LXOFFRXCNT_S 0 +#define GLPRT_LXOFFRXC_LXOFFRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXOFFRXC_H(_i) (0x003802C4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXOFFRXC_H_MAX_INDEX 7 +#define GLPRT_LXOFFRXC_H_LXOFFRXCNT_S 0 +#define GLPRT_LXOFFRXC_H_LXOFFRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXOFFTXC(_i) (0x00381180 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXOFFTXC_MAX_INDEX 7 +#define GLPRT_LXOFFTXC_LXOFFTXC_S 0 +#define GLPRT_LXOFFTXC_LXOFFTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXOFFTXC_H(_i) (0x00381184 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define 
GLPRT_LXOFFTXC_H_MAX_INDEX 7 +#define GLPRT_LXOFFTXC_H_LXOFFTXC_S 0 +#define GLPRT_LXOFFTXC_H_LXOFFTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXONRXC(_i) (0x00380280 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXONRXC_MAX_INDEX 7 +#define GLPRT_LXONRXC_LXONRXCNT_S 0 +#define GLPRT_LXONRXC_LXONRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXONRXC_H(_i) (0x00380284 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXONRXC_H_MAX_INDEX 7 +#define GLPRT_LXONRXC_H_LXONRXCNT_S 0 +#define GLPRT_LXONRXC_H_LXONRXCNT_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXONTXC(_i) (0x00381140 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXONTXC_MAX_INDEX 7 +#define GLPRT_LXONTXC_LXONTXC_S 0 +#define GLPRT_LXONTXC_LXONTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_LXONTXC_H(_i) (0x00381144 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_LXONTXC_H_MAX_INDEX 7 +#define GLPRT_LXONTXC_H_LXONTXC_S 0 +#define GLPRT_LXONTXC_H_LXONTXC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MLFC(_i) (0x00380040 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MLFC_MAX_INDEX 7 +#define GLPRT_MLFC_MLFC_S 0 +#define GLPRT_MLFC_MLFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MLFC_H(_i) (0x00380044 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MLFC_H_MAX_INDEX 7 +#define GLPRT_MLFC_H_MLFC_S 0 +#define GLPRT_MLFC_H_MLFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MPRCH(_i) (0x00381344 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MPRCH_MAX_INDEX 7 +#define GLPRT_MPRCH_MPRCH_S 0 +#define GLPRT_MPRCH_MPRCH_M ICE_M(0xFF, 0) +#define GLPRT_MPRCL(_i) (0x00381340 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MPRCL_MAX_INDEX 7 +#define GLPRT_MPRCL_MPRCL_S 0 +#define GLPRT_MPRCL_MPRCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MPTCH(_i) (0x00381204 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MPTCH_MAX_INDEX 7 +#define GLPRT_MPTCH_MPTCH_S 0 +#define GLPRT_MPTCH_MPTCH_M ICE_M(0xFF, 0) +#define GLPRT_MPTCL(_i) (0x00381200 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MPTCL_MAX_INDEX 7 +#define GLPRT_MPTCL_MPTCL_S 0 +#define GLPRT_MPTCL_MPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MRFC(_i) (0x00380080 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MRFC_MAX_INDEX 7 +#define GLPRT_MRFC_MRFC_S 0 +#define GLPRT_MRFC_MRFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_MRFC_H(_i) (0x00380084 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_MRFC_H_MAX_INDEX 7 +#define GLPRT_MRFC_H_MRFC_S 0 +#define GLPRT_MRFC_H_MRFC_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC1023H(_i) (0x00380A04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC1023H_MAX_INDEX 7 +#define GLPRT_PRC1023H_PRC1023H_S 0 +#define GLPRT_PRC1023H_PRC1023H_M ICE_M(0xFF, 0) +#define GLPRT_PRC1023L(_i) (0x00380A00 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC1023L_MAX_INDEX 7 +#define GLPRT_PRC1023L_PRC1023L_S 0 +#define GLPRT_PRC1023L_PRC1023L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC127H(_i) (0x00380944 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC127H_MAX_INDEX 7 +#define GLPRT_PRC127H_PRC127H_S 0 +#define GLPRT_PRC127H_PRC127H_M ICE_M(0xFF, 0) +#define GLPRT_PRC127L(_i) (0x00380940 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC127L_MAX_INDEX 7 +#define GLPRT_PRC127L_PRC127L_S 0 +#define GLPRT_PRC127L_PRC127L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC1522H(_i) 
(0x00380A44 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC1522H_MAX_INDEX 7 +#define GLPRT_PRC1522H_PRC1522H_S 0 +#define GLPRT_PRC1522H_PRC1522H_M ICE_M(0xFF, 0) +#define GLPRT_PRC1522L(_i) (0x00380A40 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC1522L_MAX_INDEX 7 +#define GLPRT_PRC1522L_PRC1522L_S 0 +#define GLPRT_PRC1522L_PRC1522L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC255H(_i) (0x00380984 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC255H_MAX_INDEX 7 +#define GLPRT_PRC255H_PRTPRC255H_S 0 +#define GLPRT_PRC255H_PRTPRC255H_M ICE_M(0xFF, 0) +#define GLPRT_PRC255L(_i) (0x00380980 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC255L_MAX_INDEX 7 +#define GLPRT_PRC255L_PRC255L_S 0 +#define GLPRT_PRC255L_PRC255L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC511H(_i) (0x003809C4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC511H_MAX_INDEX 7 +#define GLPRT_PRC511H_PRC511H_S 0 +#define GLPRT_PRC511H_PRC511H_M ICE_M(0xFF, 0) +#define GLPRT_PRC511L(_i) (0x003809C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC511L_MAX_INDEX 7 +#define GLPRT_PRC511L_PRC511L_S 0 +#define GLPRT_PRC511L_PRC511L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC64H(_i) (0x00380904 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC64H_MAX_INDEX 7 +#define GLPRT_PRC64H_PRC64H_S 0 +#define GLPRT_PRC64H_PRC64H_M ICE_M(0xFF, 0) +#define GLPRT_PRC64L(_i) (0x00380900 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC64L_MAX_INDEX 7 +#define GLPRT_PRC64L_PRC64L_S 0 +#define GLPRT_PRC64L_PRC64L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PRC9522H(_i) (0x00380A84 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC9522H_MAX_INDEX 7 +#define GLPRT_PRC9522H_PRC1522H_S 0 +#define GLPRT_PRC9522H_PRC1522H_M ICE_M(0xFF, 0) +#define GLPRT_PRC9522L(_i) (0x00380A80 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PRC9522L_MAX_INDEX 7 +#define GLPRT_PRC9522L_PRC1522L_S 0 +#define GLPRT_PRC9522L_PRC1522L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC1023H(_i) (0x00380C84 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC1023H_MAX_INDEX 7 +#define GLPRT_PTC1023H_PTC1023H_S 0 +#define GLPRT_PTC1023H_PTC1023H_M ICE_M(0xFF, 0) +#define GLPRT_PTC1023L(_i) (0x00380C80 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC1023L_MAX_INDEX 7 +#define GLPRT_PTC1023L_PTC1023L_S 0 +#define GLPRT_PTC1023L_PTC1023L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC127H(_i) (0x00380BC4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC127H_MAX_INDEX 7 +#define GLPRT_PTC127H_PTC127H_S 0 +#define GLPRT_PTC127H_PTC127H_M ICE_M(0xFF, 0) +#define GLPRT_PTC127L(_i) (0x00380BC0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC127L_MAX_INDEX 7 +#define GLPRT_PTC127L_PTC127L_S 0 +#define GLPRT_PTC127L_PTC127L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC1522H(_i) (0x00380CC4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC1522H_MAX_INDEX 7 +#define GLPRT_PTC1522H_PTC1522H_S 0 +#define GLPRT_PTC1522H_PTC1522H_M ICE_M(0xFF, 0) +#define GLPRT_PTC1522L(_i) (0x00380CC0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ +#define GLPRT_PTC1522L_MAX_INDEX 7 +#define GLPRT_PTC1522L_PTC1522L_S 0 +#define GLPRT_PTC1522L_PTC1522L_M ICE_M(0xFFFFFFFF, 0) +#define GLPRT_PTC255H(_i) (0x00380C04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */ 
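+/*
+ * Editorial illustration, not part of the autogenerated register map: the
+ * GLPRT_* packet/byte counters in this block are 40-bit statistics split
+ * across a 32-bit low register (*_L) and an 8-bit high register (*_H,
+ * masked by its *_M define).  A minimal sketch of combining one pair,
+ * assuming a hypothetical rd32()-style MMIO read helper and a port index
+ * "port":
+ *
+ *	u64 prc1522 = (u64)rd32(hw, GLPRT_PRC1522L(port)) |
+ *		      ((u64)(rd32(hw, GLPRT_PRC1522H(port)) &
+ *			     GLPRT_PRC1522H_PRC1522H_M) << 32);
+ */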
+#define GLPRT_PTC255H(_i) (0x00380C04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PTC255H_MAX_INDEX 7
+#define GLPRT_PTC255H_PTC255H_S 0
+#define GLPRT_PTC255H_PTC255H_M ICE_M(0xFF, 0)
+#define GLPRT_PTC255L(_i) (0x00380C00 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PTC255L_MAX_INDEX 7
+#define GLPRT_PTC255L_PTC255L_S 0
+#define GLPRT_PTC255L_PTC255L_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PTC511H(_i) (0x00380C44 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PTC511H_MAX_INDEX 7
+#define GLPRT_PTC511H_PTC511H_S 0
+#define GLPRT_PTC511H_PTC511H_M ICE_M(0xFF, 0)
+#define GLPRT_PTC511L(_i) (0x00380C40 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PTC511L_MAX_INDEX 7
+#define GLPRT_PTC511L_PTC511L_S 0
+#define GLPRT_PTC511L_PTC511L_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PTC64H(_i) (0x00380B84 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PTC64H_MAX_INDEX 7
+#define GLPRT_PTC64H_PTC64H_S 0
+#define GLPRT_PTC64H_PTC64H_M ICE_M(0xFF, 0)
+#define GLPRT_PTC64L(_i) (0x00380B80 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PTC64L_MAX_INDEX 7
+#define GLPRT_PTC64L_PTC64L_S 0
+#define GLPRT_PTC64L_PTC64L_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PTC9522H(_i) (0x00380D04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PTC9522H_MAX_INDEX 7
+#define GLPRT_PTC9522H_PTC9522H_S 0
+#define GLPRT_PTC9522H_PTC9522H_M ICE_M(0xFF, 0)
+#define GLPRT_PTC9522L(_i) (0x00380D00 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PTC9522L_MAX_INDEX 7
+#define GLPRT_PTC9522L_PTC9522L_S 0
+#define GLPRT_PTC9522L_PTC9522L_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PXOFFRXC(_i, _j) (0x00380500 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PXOFFRXC_MAX_INDEX 7
+#define GLPRT_PXOFFRXC_PRPXOFFRXCNT_S 0
+#define GLPRT_PXOFFRXC_PRPXOFFRXCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PXOFFRXC_H(_i, _j) (0x00380504 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PXOFFRXC_H_MAX_INDEX 7
+#define GLPRT_PXOFFRXC_H_PRPXOFFRXCNT_S 0
+#define GLPRT_PXOFFRXC_H_PRPXOFFRXCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PXOFFTXC(_i, _j) (0x00380F40 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PXOFFTXC_MAX_INDEX 7
+#define GLPRT_PXOFFTXC_PRPXOFFTXCNT_S 0
+#define GLPRT_PXOFFTXC_PRPXOFFTXCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PXOFFTXC_H(_i, _j) (0x00380F44 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PXOFFTXC_H_MAX_INDEX 7
+#define GLPRT_PXOFFTXC_H_PRPXOFFTXCNT_S 0
+#define GLPRT_PXOFFTXC_H_PRPXOFFTXCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PXONRXC(_i, _j) (0x00380300 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PXONRXC_MAX_INDEX 7
+#define GLPRT_PXONRXC_PRPXONRXCNT_S 0
+#define GLPRT_PXONRXC_PRPXONRXCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PXONRXC_H(_i, _j) (0x00380304 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PXONRXC_H_MAX_INDEX 7
+#define GLPRT_PXONRXC_H_PRPXONRXCNT_S 0
+#define GLPRT_PXONRXC_H_PRPXONRXCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PXONTXC(_i, _j) (0x00380D40 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PXONTXC_MAX_INDEX 7
+#define GLPRT_PXONTXC_PRPXONTXC_S 0
+#define GLPRT_PXONTXC_PRPXONTXC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_PXONTXC_H(_i, _j) (0x00380D44 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_PXONTXC_H_MAX_INDEX 7
+#define GLPRT_PXONTXC_H_PRPXONTXC_S 0
+#define GLPRT_PXONTXC_H_PRPXONTXC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RFC(_i) (0x00380AC0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RFC_MAX_INDEX 7
+#define GLPRT_RFC_RFC_S 0
+#define GLPRT_RFC_RFC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RFC_H(_i) (0x00380AC4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RFC_H_MAX_INDEX 7
+#define GLPRT_RFC_H_RFC_S 0
+#define GLPRT_RFC_H_RFC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RJC(_i) (0x00380B00 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RJC_MAX_INDEX 7
+#define GLPRT_RJC_RJC_S 0
+#define GLPRT_RJC_RJC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RJC_H(_i) (0x00380B04 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RJC_H_MAX_INDEX 7
+#define GLPRT_RJC_H_RJC_S 0
+#define GLPRT_RJC_H_RJC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RLEC(_i) (0x00380140 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RLEC_MAX_INDEX 7
+#define GLPRT_RLEC_RLEC_S 0
+#define GLPRT_RLEC_RLEC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RLEC_H(_i) (0x00380144 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RLEC_H_MAX_INDEX 7
+#define GLPRT_RLEC_H_RLEC_S 0
+#define GLPRT_RLEC_H_RLEC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_ROC(_i) (0x00380240 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_ROC_MAX_INDEX 7
+#define GLPRT_ROC_ROC_S 0
+#define GLPRT_ROC_ROC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_ROC_H(_i) (0x00380244 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_ROC_H_MAX_INDEX 7
+#define GLPRT_ROC_H_ROC_S 0
+#define GLPRT_ROC_H_ROC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RUC(_i) (0x00380200 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RUC_MAX_INDEX 7
+#define GLPRT_RUC_RUC_S 0
+#define GLPRT_RUC_RUC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RUC_H(_i) (0x00380204 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RUC_H_MAX_INDEX 7
+#define GLPRT_RUC_H_RUC_S 0
+#define GLPRT_RUC_H_RUC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RXON2OFFCNT(_i, _j) (0x00380700 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RXON2OFFCNT_MAX_INDEX 7
+#define GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_S 0
+#define GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_RXON2OFFCNT_H(_i, _j) (0x00380704 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...7 */ /* Reset Source: CORER */
+#define GLPRT_RXON2OFFCNT_H_MAX_INDEX 7
+#define GLPRT_RXON2OFFCNT_H_PRRXON2OFFCNT_S 0
+#define GLPRT_RXON2OFFCNT_H_PRRXON2OFFCNT_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_STDC(_i) (0x00340000 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_STDC_MAX_INDEX 7
+#define GLPRT_STDC_STDC_S 0
+#define GLPRT_STDC_STDC_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_TDOLD(_i) (0x00381280 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_TDOLD_MAX_INDEX 7
+#define GLPRT_TDOLD_GLPRT_TDOLD_S 0
+#define GLPRT_TDOLD_GLPRT_TDOLD_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_TDOLD_H(_i) (0x00381284 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_TDOLD_H_MAX_INDEX 7
+#define GLPRT_TDOLD_H_GLPRT_TDOLD_S 0
+#define GLPRT_TDOLD_H_GLPRT_TDOLD_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_UPRCH(_i) (0x00381304 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_UPRCH_MAX_INDEX 7
+#define GLPRT_UPRCH_UPRCH_S 0
+#define GLPRT_UPRCH_UPRCH_M ICE_M(0xFF, 0)
+#define GLPRT_UPRCL(_i) (0x00381300 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_UPRCL_MAX_INDEX 7
+#define GLPRT_UPRCL_UPRCL_S 0
+#define GLPRT_UPRCL_UPRCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLPRT_UPTCH(_i) (0x003811C4 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_UPTCH_MAX_INDEX 7
+#define GLPRT_UPTCH_UPTCH_S 0
+#define GLPRT_UPTCH_UPTCH_M ICE_M(0xFF, 0)
+#define GLPRT_UPTCL(_i) (0x003811C0 + ((_i) * 8)) /* _i=0...7 */ /* Reset Source: CORER */
+#define GLPRT_UPTCL_MAX_INDEX 7
+#define GLPRT_UPTCL_VUPTCH_S 0
+#define GLPRT_UPTCL_VUPTCH_M ICE_M(0xFFFFFFFF, 0)
+#define GLSTAT_ACL_CNT_0_H(_i) (0x00388004 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLSTAT_ACL_CNT_0_H_MAX_INDEX 511
+#define GLSTAT_ACL_CNT_0_H_CNT_MSB_S 0
+#define GLSTAT_ACL_CNT_0_H_CNT_MSB_M ICE_M(0xFF, 0)
+#define GLSTAT_ACL_CNT_0_L(_i) (0x00388000 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLSTAT_ACL_CNT_0_L_MAX_INDEX 511
+#define GLSTAT_ACL_CNT_0_L_CNT_LSB_S 0
+#define GLSTAT_ACL_CNT_0_L_CNT_LSB_M ICE_M(0xFFFFFFFF, 0)
+#define GLSTAT_ACL_CNT_1_H(_i) (0x00389004 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLSTAT_ACL_CNT_1_H_MAX_INDEX 511
+#define GLSTAT_ACL_CNT_1_H_CNT_MSB_S 0
+#define GLSTAT_ACL_CNT_1_H_CNT_MSB_M ICE_M(0xFF, 0)
+#define GLSTAT_ACL_CNT_1_L(_i) (0x00389000 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLSTAT_ACL_CNT_1_L_MAX_INDEX 511
+#define GLSTAT_ACL_CNT_1_L_CNT_LSB_S 0
+#define GLSTAT_ACL_CNT_1_L_CNT_LSB_M ICE_M(0xFFFFFFFF, 0)
+#define GLSTAT_ACL_CNT_2_H(_i) (0x0038A004 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLSTAT_ACL_CNT_2_H_MAX_INDEX 511
+#define GLSTAT_ACL_CNT_2_H_CNT_MSB_S 0
+#define GLSTAT_ACL_CNT_2_H_CNT_MSB_M ICE_M(0xFF, 0)
+#define GLSTAT_ACL_CNT_2_L(_i) (0x0038A000 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLSTAT_ACL_CNT_2_L_MAX_INDEX 511
+#define GLSTAT_ACL_CNT_2_L_CNT_LSB_S 0
+#define GLSTAT_ACL_CNT_2_L_CNT_LSB_M ICE_M(0xFFFFFFFF, 0)
+#define GLSTAT_ACL_CNT_3_H(_i) (0x0038B004 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLSTAT_ACL_CNT_3_H_MAX_INDEX 511
+#define GLSTAT_ACL_CNT_3_H_CNT_MSB_S 0
+#define GLSTAT_ACL_CNT_3_H_CNT_MSB_M ICE_M(0xFF, 0)
+#define GLSTAT_ACL_CNT_3_L(_i) (0x0038B000 + ((_i) * 8)) /* _i=0...511 */ /* Reset Source: CORER */
+#define GLSTAT_ACL_CNT_3_L_MAX_INDEX 511
+#define GLSTAT_ACL_CNT_3_L_CNT_LSB_S 0
+#define GLSTAT_ACL_CNT_3_L_CNT_LSB_M ICE_M(0xFFFFFFFF, 0)
+#define GLSTAT_FD_CNT0H(_i) (0x003A0004 + ((_i) * 8)) /* _i=0...4095 */ /* Reset Source: CORER */
+#define GLSTAT_FD_CNT0H_MAX_INDEX 4095
+#define GLSTAT_FD_CNT0H_FD0_CNT_H_S 0
+#define GLSTAT_FD_CNT0H_FD0_CNT_H_M ICE_M(0xFF, 0)
+#define GLSTAT_FD_CNT0L(_i) (0x003A0000 + ((_i) * 8)) /* _i=0...4095 */ /* Reset Source: CORER */
+#define GLSTAT_FD_CNT0L_MAX_INDEX 4095
+#define GLSTAT_FD_CNT0L_FD0_CNT_L_S 0
+#define GLSTAT_FD_CNT0L_FD0_CNT_L_M ICE_M(0xFFFFFFFF, 0)
+#define GLSTAT_FD_CNT1H(_i) (0x003A8004 + ((_i) * 8)) /* _i=0...4095 */ /* Reset Source: CORER */
+#define GLSTAT_FD_CNT1H_MAX_INDEX 4095
+#define GLSTAT_FD_CNT1H_FD0_CNT_H_S 0
+#define GLSTAT_FD_CNT1H_FD0_CNT_H_M ICE_M(0xFF, 0)
+#define GLSTAT_FD_CNT1L(_i) (0x003A8000 + ((_i) * 8)) /* _i=0...4095 */ /* Reset Source: CORER */
+#define GLSTAT_FD_CNT1L_MAX_INDEX 4095
+#define GLSTAT_FD_CNT1L_FD0_CNT_L_S 0
+#define GLSTAT_FD_CNT1L_FD0_CNT_L_M ICE_M(0xFFFFFFFF, 0)
+#define GLSW_BPRCH(_i) (0x00346204 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_BPRCH_MAX_INDEX 31
+#define GLSW_BPRCH_BPRCH_S 0
+#define GLSW_BPRCH_BPRCH_M ICE_M(0xFF, 0)
+#define GLSW_BPRCL(_i) (0x00346200 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_BPRCL_MAX_INDEX 31
+#define GLSW_BPRCL_BPRCL_S 0
+#define GLSW_BPRCL_BPRCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLSW_BPTCH(_i) (0x00310204 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_BPTCH_MAX_INDEX 31
+#define GLSW_BPTCH_BPTCH_S 0
+#define GLSW_BPTCH_BPTCH_M ICE_M(0xFF, 0)
+#define GLSW_BPTCL(_i) (0x00310200 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_BPTCL_MAX_INDEX 31
+#define GLSW_BPTCL_BPTCL_S 0
+#define GLSW_BPTCL_BPTCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLSW_GORCH(_i) (0x00341004 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_GORCH_MAX_INDEX 31
+#define GLSW_GORCH_GORCH_S 0
+#define GLSW_GORCH_GORCH_M ICE_M(0xFF, 0)
+#define GLSW_GORCL(_i) (0x00341000 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_GORCL_MAX_INDEX 31
+#define GLSW_GORCL_GORCL_S 0
+#define GLSW_GORCL_GORCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLSW_GOTCH(_i) (0x00302004 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_GOTCH_MAX_INDEX 31
+#define GLSW_GOTCH_GOTCH_S 0
+#define GLSW_GOTCH_GOTCH_M ICE_M(0xFF, 0)
+#define GLSW_GOTCL(_i) (0x00302000 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_GOTCL_MAX_INDEX 31
+#define GLSW_GOTCL_GOTCL_S 0
+#define GLSW_GOTCL_GOTCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLSW_MPRCH(_i) (0x00346104 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_MPRCH_MAX_INDEX 31
+#define GLSW_MPRCH_MPRCH_S 0
+#define GLSW_MPRCH_MPRCH_M ICE_M(0xFF, 0)
+#define GLSW_MPRCL(_i) (0x00346100 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_MPRCL_MAX_INDEX 31
+#define GLSW_MPRCL_MPRCL_S 0
+#define GLSW_MPRCL_MPRCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLSW_MPTCH(_i) (0x00310104 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_MPTCH_MAX_INDEX 31
+#define GLSW_MPTCH_MPTCH_S 0
+#define GLSW_MPTCH_MPTCH_M ICE_M(0xFF, 0)
+#define GLSW_MPTCL(_i) (0x00310100 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_MPTCL_MAX_INDEX 31
+#define GLSW_MPTCL_MPTCL_S 0
+#define GLSW_MPTCL_MPTCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLSW_UPRCH(_i) (0x00346004 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_UPRCH_MAX_INDEX 31
+#define GLSW_UPRCH_UPRCH_S 0
+#define GLSW_UPRCH_UPRCH_M ICE_M(0xFF, 0)
+#define GLSW_UPRCL(_i) (0x00346000 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_UPRCL_MAX_INDEX 31
+#define GLSW_UPRCL_UPRCL_S 0
+#define GLSW_UPRCL_UPRCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLSW_UPTCH(_i) (0x00310004 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_UPTCH_MAX_INDEX 31
+#define GLSW_UPTCH_UPTCH_S 0
+#define GLSW_UPTCH_UPTCH_M ICE_M(0xFF, 0)
+#define GLSW_UPTCL(_i) (0x00310000 + ((_i) * 8)) /* _i=0...31 */ /* Reset Source: CORER */
+#define GLSW_UPTCL_MAX_INDEX 31
+#define GLSW_UPTCL_UPTCL_S 0
+#define GLSW_UPTCL_UPTCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLSWID_RUPP(_i) (0x00345000 + ((_i) * 4)) /* _i=0...255 */ /* Reset Source: CORER */
+#define GLSWID_RUPP_MAX_INDEX 255
+#define GLSWID_RUPP_RUPP_S 0
+#define GLSWID_RUPP_RUPP_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_BPRCH(_i) (0x003B6004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_BPRCH_MAX_INDEX 767
+#define GLV_BPRCH_BPRCH_S 0
+#define GLV_BPRCH_BPRCH_M ICE_M(0xFF, 0)
+#define GLV_BPRCL(_i) (0x003B6000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_BPRCL_MAX_INDEX 767
+#define GLV_BPRCL_BPRCL_S 0
+#define GLV_BPRCL_BPRCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_BPTCH(_i) (0x0030E004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_BPTCH_MAX_INDEX 767
+#define GLV_BPTCH_BPTCH_S 0
+#define GLV_BPTCH_BPTCH_M ICE_M(0xFF, 0)
+#define GLV_BPTCL(_i) (0x0030E000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_BPTCL_MAX_INDEX 767
+#define GLV_BPTCL_BPTCL_S 0
+#define GLV_BPTCL_BPTCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_GORCH(_i) (0x003B0004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_GORCH_MAX_INDEX 767
+#define GLV_GORCH_GORCH_S 0
+#define GLV_GORCH_GORCH_M ICE_M(0xFF, 0)
+#define GLV_GORCL(_i) (0x003B0000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_GORCL_MAX_INDEX 767
+#define GLV_GORCL_GORCL_S 0
+#define GLV_GORCL_GORCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_GOTCH(_i) (0x00300004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_GOTCH_MAX_INDEX 767
+#define GLV_GOTCH_GOTCH_S 0
+#define GLV_GOTCH_GOTCH_M ICE_M(0xFF, 0)
+#define GLV_GOTCL(_i) (0x00300000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_GOTCL_MAX_INDEX 767
+#define GLV_GOTCL_GOTCL_S 0
+#define GLV_GOTCL_GOTCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_MPRCH(_i) (0x003B4004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_MPRCH_MAX_INDEX 767
+#define GLV_MPRCH_MPRCH_S 0
+#define GLV_MPRCH_MPRCH_M ICE_M(0xFF, 0)
+#define GLV_MPRCL(_i) (0x003B4000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_MPRCL_MAX_INDEX 767
+#define GLV_MPRCL_MPRCL_S 0
+#define GLV_MPRCL_MPRCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_MPTCH(_i) (0x0030C004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_MPTCH_MAX_INDEX 767
+#define GLV_MPTCH_MPTCH_S 0
+#define GLV_MPTCH_MPTCH_M ICE_M(0xFF, 0)
+#define GLV_MPTCL(_i) (0x0030C000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_MPTCL_MAX_INDEX 767
+#define GLV_MPTCL_MPTCL_S 0
+#define GLV_MPTCL_MPTCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_RDPC(_i) (0x00294C04 + ((_i) * 4)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_RDPC_MAX_INDEX 767
+#define GLV_RDPC_RDPC_S 0
+#define GLV_RDPC_RDPC_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_REPC(_i) (0x00295804 + ((_i) * 4)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_REPC_MAX_INDEX 767
+#define GLV_REPC_NO_DESC_CNT_S 0
+#define GLV_REPC_NO_DESC_CNT_M ICE_M(0xFFFF, 0)
+#define GLV_REPC_ERROR_CNT_S 16
+#define GLV_REPC_ERROR_CNT_M ICE_M(0xFFFF, 16)
+#define GLV_TEPC(_VSI) (0x00312000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_TEPC_MAX_INDEX 767
+#define GLV_TEPC_TEPC_S 0
+#define GLV_TEPC_TEPC_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_UPRCH(_i) (0x003B2004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_UPRCH_MAX_INDEX 767
+#define GLV_UPRCH_UPRCH_S 0
+#define GLV_UPRCH_UPRCH_M ICE_M(0xFF, 0)
+#define GLV_UPRCL(_i) (0x003B2000 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_UPRCL_MAX_INDEX 767
+#define GLV_UPRCL_UPRCL_S 0
+#define GLV_UPRCL_UPRCL_M ICE_M(0xFFFFFFFF, 0)
+#define GLV_UPTCH(_i) (0x0030A004 + ((_i) * 8)) /* _i=0...767 */ /* Reset Source: CORER */
+#define GLV_UPTCH_MAX_INDEX 767
+#define GLV_UPTCH_GLVUPTCH_S 0
+#define GLV_UPTCH_GLVUPTCH_M ICE_M(0xFF, 0)
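+/*
+ * Editorial illustration, not part of the autogenerated register map:
+ * every field in this file is described by a *_S (bit shift) and *_M
+ * (bit mask, built with ICE_M(mask, shift)) pair.  A minimal sketch of
+ * decoding the two 16-bit counters packed into GLV_REPC above, again
+ * assuming a hypothetical rd32()-style MMIO read helper and a VSI index
+ * "vsi":
+ *
+ *	u32 repc = rd32(hw, GLV_REPC(vsi));
+ *	u16 no_desc = (repc & GLV_REPC_NO_DESC_CNT_M) >>
+ *		      GLV_REPC_NO_DESC_CNT_S;
+ *	u16 errors = (repc & GLV_REPC_ERROR_CNT_M) >> GLV_REPC_ERROR_CNT_S;
+ */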
+#define GLV_UPTCL_UPTCL_S 0 +#define GLV_UPTCL_UPTCL_M ICE_M(0xFFFFFFFF, 0) +#define GLVEBUP_RBCH(_i, _j) (0x00343004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_RBCH_MAX_INDEX 7 +#define GLVEBUP_RBCH_UPBCH_S 0 +#define GLVEBUP_RBCH_UPBCH_M ICE_M(0xFF, 0) +#define GLVEBUP_RBCL(_i, _j) (0x00343000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_RBCL_MAX_INDEX 7 +#define GLVEBUP_RBCL_UPBCL_S 0 +#define GLVEBUP_RBCL_UPBCL_M ICE_M(0xFFFFFFFF, 0) +#define GLVEBUP_RPCH(_i, _j) (0x00344004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_RPCH_MAX_INDEX 7 +#define GLVEBUP_RPCH_UPPCH_S 0 +#define GLVEBUP_RPCH_UPPCH_M ICE_M(0xFF, 0) +#define GLVEBUP_RPCL(_i, _j) (0x00344000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_RPCL_MAX_INDEX 7 +#define GLVEBUP_RPCL_UPPCL_S 0 +#define GLVEBUP_RPCL_UPPCL_M ICE_M(0xFFFFFFFF, 0) +#define GLVEBUP_TBCH(_i, _j) (0x00306004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_TBCH_MAX_INDEX 7 +#define GLVEBUP_TBCH_UPBCH_S 0 +#define GLVEBUP_TBCH_UPBCH_M ICE_M(0xFF, 0) +#define GLVEBUP_TBCL(_i, _j) (0x00306000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_TBCL_MAX_INDEX 7 +#define GLVEBUP_TBCL_UPBCL_S 0 +#define GLVEBUP_TBCL_UPBCL_M ICE_M(0xFFFFFFFF, 0) +#define GLVEBUP_TPCH(_i, _j) (0x00308004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_TPCH_MAX_INDEX 7 +#define GLVEBUP_TPCH_UPPCH_S 0 +#define GLVEBUP_TPCH_UPPCH_M ICE_M(0xFF, 0) +#define GLVEBUP_TPCL(_i, _j) (0x00308000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...31 */ /* Reset Source: CORER */ +#define GLVEBUP_TPCL_MAX_INDEX 7 +#define GLVEBUP_TPCL_UPPCL_S 0 +#define GLVEBUP_TPCL_UPPCL_M ICE_M(0xFFFFFFFF, 0) +#define PRTRPB_LDPC 0x000AC280 /* Reset Source: CORER */ +#define PRTRPB_LDPC_CRCERRS_S 0 +#define PRTRPB_LDPC_CRCERRS_M ICE_M(0xFFFFFFFF, 0) +#define PRTRPB_RDPC 0x000AC260 /* Reset Source: CORER */ +#define PRTRPB_RDPC_CRCERRS_S 0 +#define PRTRPB_RDPC_CRCERRS_M ICE_M(0xFFFFFFFF, 0) +#define PRTTPB_STAT_TC_BYTES_SENTL(_i) (0x00098200 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define PRTTPB_STAT_TC_BYTES_SENTL_MAX_INDEX 63 +#define PRTTPB_STAT_TC_BYTES_SENTL_TCCNT_S 0 +#define PRTTPB_STAT_TC_BYTES_SENTL_TCCNT_M ICE_M(0xFFFFFFFF, 0) +#define TPB_PRTTPB_STAT_PKT_SENT(_i) (0x00099470 + ((_i) * 4)) /* _i=0...7 */ /* Reset Source: CORER */ +#define TPB_PRTTPB_STAT_PKT_SENT_MAX_INDEX 7 +#define TPB_PRTTPB_STAT_PKT_SENT_PKTCNT_S 0 +#define TPB_PRTTPB_STAT_PKT_SENT_PKTCNT_M ICE_M(0xFFFFFFFF, 0) +#define TPB_PRTTPB_STAT_TC_BYTES_SENT(_i) (0x00099094 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define TPB_PRTTPB_STAT_TC_BYTES_SENT_MAX_INDEX 63 +#define TPB_PRTTPB_STAT_TC_BYTES_SENT_TCCNT_S 0 +#define TPB_PRTTPB_STAT_TC_BYTES_SENT_TCCNT_M ICE_M(0xFFFFFFFF, 0) +#define EMP_SWT_PRUNIND 0x00204020 /* Reset Source: CORER */ +#define EMP_SWT_PRUNIND_OPCODE_S 0 +#define EMP_SWT_PRUNIND_OPCODE_M ICE_M(0xF, 0) +#define EMP_SWT_PRUNIND_LIST_INDEX_NUM_S 4 +#define EMP_SWT_PRUNIND_LIST_INDEX_NUM_M ICE_M(0x3FF, 4) +#define EMP_SWT_PRUNIND_VSI_NUM_S 16 +#define EMP_SWT_PRUNIND_VSI_NUM_M ICE_M(0x3FF, 16) +#define EMP_SWT_PRUNIND_BIT_VALUE_S 31 +#define EMP_SWT_PRUNIND_BIT_VALUE_M BIT(31) +#define EMP_SWT_REPIND 0x0020401C /* Reset Source: CORER */ +#define 
EMP_SWT_REPIND_OPCODE_S 0 +#define EMP_SWT_REPIND_OPCODE_M ICE_M(0xF, 0) +#define EMP_SWT_REPIND_LIST_INDEX_NUMBER_S 4 +#define EMP_SWT_REPIND_LIST_INDEX_NUMBER_M ICE_M(0x3FF, 4) +#define EMP_SWT_REPIND_VSI_NUM_S 16 +#define EMP_SWT_REPIND_VSI_NUM_M ICE_M(0x3FF, 16) +#define EMP_SWT_REPIND_BIT_VALUE_S 31 +#define EMP_SWT_REPIND_BIT_VALUE_M BIT(31) +#define GL_OVERRIDEC 0x002040A4 /* Reset Source: CORER */ +#define GL_OVERRIDEC_OVERRIDE_ATTEMPTC_S 0 +#define GL_OVERRIDEC_OVERRIDE_ATTEMPTC_M ICE_M(0xFFFF, 0) +#define GL_OVERRIDEC_LAST_VSI_S 16 +#define GL_OVERRIDEC_LAST_VSI_M ICE_M(0x3FF, 16) +#define GL_PLG_AVG_CALC_CFG 0x0020A5AC /* Reset Source: CORER */ +#define GL_PLG_AVG_CALC_CFG_CYCLE_LEN_S 0 +#define GL_PLG_AVG_CALC_CFG_CYCLE_LEN_M ICE_M(0x7FFFFFFF, 0) +#define GL_PLG_AVG_CALC_CFG_MODE_S 31 +#define GL_PLG_AVG_CALC_CFG_MODE_M BIT(31) +#define GL_PLG_AVG_CALC_ST 0x0020A5B0 /* Reset Source: CORER */ +#define GL_PLG_AVG_CALC_ST_IN_DATA_S 0 +#define GL_PLG_AVG_CALC_ST_IN_DATA_M ICE_M(0x7FFF, 0) +#define GL_PLG_AVG_CALC_ST_OUT_DATA_S 16 +#define GL_PLG_AVG_CALC_ST_OUT_DATA_M ICE_M(0x7FFF, 16) +#define GL_PLG_AVG_CALC_ST_VALID_S 31 +#define GL_PLG_AVG_CALC_ST_VALID_M BIT(31) +#define GL_PRE_CFG_CMD 0x00214090 /* Reset Source: CORER */ +#define GL_PRE_CFG_CMD_ADDR_S 0 +#define GL_PRE_CFG_CMD_ADDR_M ICE_M(0x1FFF, 0) +#define GL_PRE_CFG_CMD_TBLIDX_S 16 +#define GL_PRE_CFG_CMD_TBLIDX_M ICE_M(0x7, 16) +#define GL_PRE_CFG_CMD_CMD_S 29 +#define GL_PRE_CFG_CMD_CMD_M BIT(29) +#define GL_PRE_CFG_CMD_DONE_S 31 +#define GL_PRE_CFG_CMD_DONE_M BIT(31) +#define GL_PRE_CFG_DATA(_i) (0x00214074 + ((_i) * 4)) /* _i=0...6 */ /* Reset Source: CORER */ +#define GL_PRE_CFG_DATA_MAX_INDEX 6 +#define GL_PRE_CFG_DATA_GL_PRE_RCP_DATA_S 0 +#define GL_PRE_CFG_DATA_GL_PRE_RCP_DATA_M ICE_M(0xFFFFFFFF, 0) +#define GL_SWT_FUNCFILT 0x001D2698 /* Reset Source: CORER */ +#define GL_SWT_FUNCFILT_FUNCFILT_S 0 +#define GL_SWT_FUNCFILT_FUNCFILT_M BIT(0) +#define GL_SWT_FW_STS(_i) (0x00216000 + ((_i) * 4)) /* _i=0...5 */ /* Reset Source: CORER */ +#define GL_SWT_FW_STS_MAX_INDEX 5 +#define GL_SWT_FW_STS_GL_SWT_FW_STS_S 0 +#define GL_SWT_FW_STS_GL_SWT_FW_STS_M ICE_M(0xFFFFFFFF, 0) +#define GL_SWT_LAT_DOUBLE 0x00204004 /* Reset Source: CORER */ +#define GL_SWT_LAT_DOUBLE_BASE_S 0 +#define GL_SWT_LAT_DOUBLE_BASE_M ICE_M(0x7FF, 0) +#define GL_SWT_LAT_DOUBLE_SIZE_S 16 +#define GL_SWT_LAT_DOUBLE_SIZE_M ICE_M(0x7FF, 16) +#define GL_SWT_LAT_QUAD 0x00204008 /* Reset Source: CORER */ +#define GL_SWT_LAT_QUAD_BASE_S 0 +#define GL_SWT_LAT_QUAD_BASE_M ICE_M(0x7FF, 0) +#define GL_SWT_LAT_QUAD_SIZE_S 16 +#define GL_SWT_LAT_QUAD_SIZE_M ICE_M(0x7FF, 16) +#define GL_SWT_LAT_SINGLE 0x00204000 /* Reset Source: CORER */ +#define GL_SWT_LAT_SINGLE_BASE_S 0 +#define GL_SWT_LAT_SINGLE_BASE_M ICE_M(0x7FF, 0) +#define GL_SWT_LAT_SINGLE_SIZE_S 16 +#define GL_SWT_LAT_SINGLE_SIZE_M ICE_M(0x7FF, 16) +#define GL_SWT_MD_PRI 0x002040AC /* Reset Source: CORER */ +#define GL_SWT_MD_PRI_VSI_PRI_S 0 +#define GL_SWT_MD_PRI_VSI_PRI_M ICE_M(0x7, 0) +#define GL_SWT_MD_PRI_LB_PRI_S 4 +#define GL_SWT_MD_PRI_LB_PRI_M ICE_M(0x7, 4) +#define GL_SWT_MD_PRI_LAN_EN_PRI_S 8 +#define GL_SWT_MD_PRI_LAN_EN_PRI_M ICE_M(0x7, 8) +#define GL_SWT_MD_PRI_QH_PRI_S 12 +#define GL_SWT_MD_PRI_QH_PRI_M ICE_M(0x7, 12) +#define GL_SWT_MD_PRI_QL_PRI_S 16 +#define GL_SWT_MD_PRI_QL_PRI_M ICE_M(0x7, 16) +#define GL_SWT_MIRTARVSI(_i) (0x00204500 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */ +#define GL_SWT_MIRTARVSI_MAX_INDEX 63 +#define GL_SWT_MIRTARVSI_VFVMNUMBER_S 0 +#define 
GL_SWT_MIRTARVSI_VFVMNUMBER_M ICE_M(0x3FF, 0) +#define GL_SWT_MIRTARVSI_FUNCTIONTYPE_S 10 +#define GL_SWT_MIRTARVSI_FUNCTIONTYPE_M ICE_M(0x3, 10) +#define GL_SWT_MIRTARVSI_PFNUMBER_S 12 +#define GL_SWT_MIRTARVSI_PFNUMBER_M ICE_M(0x7, 12) +#define GL_SWT_MIRTARVSI_TARGETVSI_S 20 +#define GL_SWT_MIRTARVSI_TARGETVSI_M ICE_M(0x3FF, 20) +#define GL_SWT_MIRTARVSI_RULEENABLE_S 31 +#define GL_SWT_MIRTARVSI_RULEENABLE_M BIT(31) +#define GL_SWT_SWIDFVIDX 0x00214114 /* Reset Source: CORER */ +#define GL_SWT_SWIDFVIDX_SWIDFVIDX_S 0 +#define GL_SWT_SWIDFVIDX_SWIDFVIDX_M ICE_M(0x3F, 0) +#define GL_SWT_SWIDFVIDX_PORT_TYPE_S 31 +#define GL_SWT_SWIDFVIDX_PORT_TYPE_M BIT(31) +#define GL_VP_SWITCHID(_i) (0x00214094 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define GL_VP_SWITCHID_MAX_INDEX 31 +#define GL_VP_SWITCHID_SWITCHID_S 0 +#define GL_VP_SWITCHID_SWITCHID_M ICE_M(0xFF, 0) +#define GLSWID_STAT_BLOCK(_i) (0x0020A1A4 + ((_i) * 4)) /* _i=0...255 */ /* Reset Source: PFR */ +#define GLSWID_STAT_BLOCK_MAX_INDEX 255 +#define GLSWID_STAT_BLOCK_VEBID_S 0 +#define GLSWID_STAT_BLOCK_VEBID_M ICE_M(0x1F, 0) +#define GLSWID_STAT_BLOCK_VEBID_VALID_S 31 +#define GLSWID_STAT_BLOCK_VEBID_VALID_M BIT(31) +#define GLSWT_ACT_RESP_0 0x0020A5A4 /* Reset Source: CORER */ +#define GLSWT_ACT_RESP_0_GLSWT_ACT_RESP_S 0 +#define GLSWT_ACT_RESP_0_GLSWT_ACT_RESP_M ICE_M(0xFFFFFFFF, 0) +#define GLSWT_ACT_RESP_1 0x0020A5A8 /* Reset Source: CORER */ +#define GLSWT_ACT_RESP_1_GLSWT_ACT_RESP_S 0 +#define GLSWT_ACT_RESP_1_GLSWT_ACT_RESP_M ICE_M(0xFFFFFFFF, 0) +#define GLSWT_ARB_MODE 0x0020A674 /* Reset Source: CORER */ +#define GLSWT_ARB_MODE_FLU_PRI_SHM_S 0 +#define GLSWT_ARB_MODE_FLU_PRI_SHM_M BIT(0) +#define GLSWT_ARB_MODE_TX_RX_FWD_PRI_S 1 +#define GLSWT_ARB_MODE_TX_RX_FWD_PRI_M BIT(1) +#define PRT_SBPVSI 0x00204120 /* Reset Source: CORER */ +#define PRT_SBPVSI_BAD_FRAMES_VSI_S 0 +#define PRT_SBPVSI_BAD_FRAMES_VSI_M ICE_M(0x3FF, 0) +#define PRT_SBPVSI_SBP_S 31 +#define PRT_SBPVSI_SBP_M BIT(31) +#define PRT_SCSTS 0x00204140 /* Reset Source: CORER */ +#define PRT_SCSTS_BSCA_S 0 +#define PRT_SCSTS_BSCA_M BIT(0) +#define PRT_SCSTS_BSCAP_S 1 +#define PRT_SCSTS_BSCAP_M BIT(1) +#define PRT_SCSTS_MSCA_S 2 +#define PRT_SCSTS_MSCA_M BIT(2) +#define PRT_SCSTS_MSCAP_S 3 +#define PRT_SCSTS_MSCAP_M BIT(3) +#define PRT_SWT_BSCCNT 0x00204160 /* Reset Source: CORER */ +#define PRT_SWT_BSCCNT_CCOUNT_S 0 +#define PRT_SWT_BSCCNT_CCOUNT_M ICE_M(0x1FFFFFF, 0) +#define PRT_SWT_BSCTRH 0x00204180 /* Reset Source: CORER */ +#define PRT_SWT_BSCTRH_UTRESH_S 0 +#define PRT_SWT_BSCTRH_UTRESH_M ICE_M(0x7FFFF, 0) +#define PRT_SWT_MIREG 0x002042A0 /* Reset Source: CORER */ +#define PRT_SWT_MIREG_MIRRULE_S 0 +#define PRT_SWT_MIREG_MIRRULE_M ICE_M(0x3F, 0) +#define PRT_SWT_MIREG_MIRENA_S 7 +#define PRT_SWT_MIREG_MIRENA_M BIT(7) +#define PRT_SWT_MIRIG 0x00204280 /* Reset Source: CORER */ +#define PRT_SWT_MIRIG_MIRRULE_S 0 +#define PRT_SWT_MIRIG_MIRRULE_M ICE_M(0x3F, 0) +#define PRT_SWT_MIRIG_MIRENA_S 7 +#define PRT_SWT_MIRIG_MIRENA_M BIT(7) +#define PRT_SWT_MSCCNT 0x00204100 /* Reset Source: CORER */ +#define PRT_SWT_MSCCNT_CCOUNT_S 0 +#define PRT_SWT_MSCCNT_CCOUNT_M ICE_M(0x1FFFFFF, 0) +#define PRT_SWT_MSCTRH 0x002041C0 /* Reset Source: CORER */ +#define PRT_SWT_MSCTRH_UTRESH_S 0 +#define PRT_SWT_MSCTRH_UTRESH_M ICE_M(0x7FFFF, 0) +#define PRT_SWT_SCBI 0x002041E0 /* Reset Source: CORER */ +#define PRT_SWT_SCBI_BI_S 0 +#define PRT_SWT_SCBI_BI_M ICE_M(0x1FFFFFF, 0) +#define PRT_SWT_SCCRL 0x00204200 /* Reset Source: CORER */ +#define PRT_SWT_SCCRL_MDIPW_S 0 
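+/*
+ * Usage sketch for the mirroring registers above: a GL_SWT_MIRTARVSI
+ * entry is composed by shifting each field into place and masking, then
+ * marking the rule valid via the RULEENABLE bit.  Hypothetical helper,
+ * assuming the driver's wr32() MMIO accessor:
+ */
+static inline void
+ice_set_mirror_target(struct ice_hw *hw, u8 rule, u16 target_vsi)
+{
+	u32 val = ((u32)target_vsi << GL_SWT_MIRTARVSI_TARGETVSI_S) &
+		  GL_SWT_MIRTARVSI_TARGETVSI_M;
+
+	val |= GL_SWT_MIRTARVSI_RULEENABLE_M;	/* bit 31: rule is valid */
+	wr32(hw, GL_SWT_MIRTARVSI(rule), val);
+}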
+#define PRT_SWT_SCCRL_MDIPW_M BIT(0) +#define PRT_SWT_SCCRL_MDICW_S 1 +#define PRT_SWT_SCCRL_MDICW_M BIT(1) +#define PRT_SWT_SCCRL_BDIPW_S 2 +#define PRT_SWT_SCCRL_BDIPW_M BIT(2) +#define PRT_SWT_SCCRL_BDICW_S 3 +#define PRT_SWT_SCCRL_BDICW_M BIT(3) +#define PRT_SWT_SCCRL_INTERVAL_S 8 +#define PRT_SWT_SCCRL_INTERVAL_M ICE_M(0xFFFFF, 8) +#define PRT_TCTUPR(_i) (0x00040840 + ((_i) * 4)) /* _i=0...31 */ /* Reset Source: CORER */ +#define PRT_TCTUPR_MAX_INDEX 31 +#define PRT_TCTUPR_UP0_S 0 +#define PRT_TCTUPR_UP0_M ICE_M(0x7, 0) +#define PRT_TCTUPR_UP1_S 4 +#define PRT_TCTUPR_UP1_M ICE_M(0x7, 4) +#define PRT_TCTUPR_UP2_S 8 +#define PRT_TCTUPR_UP2_M ICE_M(0x7, 8) +#define PRT_TCTUPR_UP3_S 12 +#define PRT_TCTUPR_UP3_M ICE_M(0x7, 12) +#define PRT_TCTUPR_UP4_S 16 +#define PRT_TCTUPR_UP4_M ICE_M(0x7, 16) +#define PRT_TCTUPR_UP5_S 20 +#define PRT_TCTUPR_UP5_M ICE_M(0x7, 20) +#define PRT_TCTUPR_UP6_S 24 +#define PRT_TCTUPR_UP6_M ICE_M(0x7, 24) +#define PRT_TCTUPR_UP7_S 28 +#define PRT_TCTUPR_UP7_M ICE_M(0x7, 28) +#define GLHH_ART_CTL 0x000A41D4 /* Reset Source: POR */ +#define GLHH_ART_CTL_ACTIVE_S 0 +#define GLHH_ART_CTL_ACTIVE_M BIT(0) +#define GLHH_ART_CTL_TIME_OUT1_S 1 +#define GLHH_ART_CTL_TIME_OUT1_M BIT(1) +#define GLHH_ART_CTL_TIME_OUT2_S 2 +#define GLHH_ART_CTL_TIME_OUT2_M BIT(2) +#define GLHH_ART_CTL_RESET_HH_S 31 +#define GLHH_ART_CTL_RESET_HH_M BIT(31) +#define GLHH_ART_DATA 0x000A41E0 /* Reset Source: POR */ +#define GLHH_ART_DATA_AGENT_TYPE_S 0 +#define GLHH_ART_DATA_AGENT_TYPE_M ICE_M(0x7, 0) +#define GLHH_ART_DATA_SYNC_TYPE_S 3 +#define GLHH_ART_DATA_SYNC_TYPE_M BIT(3) +#define GLHH_ART_DATA_MAX_DELAY_S 4 +#define GLHH_ART_DATA_MAX_DELAY_M ICE_M(0xF, 4) +#define GLHH_ART_DATA_TIME_BASE_S 8 +#define GLHH_ART_DATA_TIME_BASE_M ICE_M(0xF, 8) +#define GLHH_ART_DATA_RSV_DATA_S 12 +#define GLHH_ART_DATA_RSV_DATA_M ICE_M(0xFFFFF, 12) +#define GLHH_ART_TIME_H 0x000A41D8 /* Reset Source: POR */ +#define GLHH_ART_TIME_H_ART_TIME_H_S 0 +#define GLHH_ART_TIME_H_ART_TIME_H_M ICE_M(0xFFFFFFFF, 0) +#define GLHH_ART_TIME_L 0x000A41DC /* Reset Source: POR */ +#define GLHH_ART_TIME_L_ART_TIME_L_S 0 +#define GLHH_ART_TIME_L_ART_TIME_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_AUX_IN_0(_i) (0x000889D8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_IN_0_MAX_INDEX 1 +#define GLTSYN_AUX_IN_0_EVNTLVL_S 0 +#define GLTSYN_AUX_IN_0_EVNTLVL_M ICE_M(0x3, 0) +#define GLTSYN_AUX_IN_0_INT_ENA_S 4 +#define GLTSYN_AUX_IN_0_INT_ENA_M BIT(4) +#define GLTSYN_AUX_IN_1(_i) (0x000889E0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_IN_1_MAX_INDEX 1 +#define GLTSYN_AUX_IN_1_EVNTLVL_S 0 +#define GLTSYN_AUX_IN_1_EVNTLVL_M ICE_M(0x3, 0) +#define GLTSYN_AUX_IN_1_INT_ENA_S 4 +#define GLTSYN_AUX_IN_1_INT_ENA_M BIT(4) +#define GLTSYN_AUX_IN_2(_i) (0x000889E8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_IN_2_MAX_INDEX 1 +#define GLTSYN_AUX_IN_2_EVNTLVL_S 0 +#define GLTSYN_AUX_IN_2_EVNTLVL_M ICE_M(0x3, 0) +#define GLTSYN_AUX_IN_2_INT_ENA_S 4 +#define GLTSYN_AUX_IN_2_INT_ENA_M BIT(4) +#define GLTSYN_AUX_OUT_0(_i) (0x00088998 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_OUT_0_MAX_INDEX 1 +#define GLTSYN_AUX_OUT_0_OUT_ENA_S 0 +#define GLTSYN_AUX_OUT_0_OUT_ENA_M BIT(0) +#define GLTSYN_AUX_OUT_0_OUTMOD_S 1 +#define GLTSYN_AUX_OUT_0_OUTMOD_M ICE_M(0x3, 1) +#define GLTSYN_AUX_OUT_0_OUTLVL_S 3 +#define GLTSYN_AUX_OUT_0_OUTLVL_M BIT(3) +#define GLTSYN_AUX_OUT_0_INT_ENA_S 4 +#define GLTSYN_AUX_OUT_0_INT_ENA_M BIT(4) +#define 
GLTSYN_AUX_OUT_0_PULSEW_S 8 +#define GLTSYN_AUX_OUT_0_PULSEW_M ICE_M(0xF, 8) +#define GLTSYN_AUX_OUT_1(_i) (0x000889A0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_OUT_1_MAX_INDEX 1 +#define GLTSYN_AUX_OUT_1_OUT_ENA_S 0 +#define GLTSYN_AUX_OUT_1_OUT_ENA_M BIT(0) +#define GLTSYN_AUX_OUT_1_OUTMOD_S 1 +#define GLTSYN_AUX_OUT_1_OUTMOD_M ICE_M(0x3, 1) +#define GLTSYN_AUX_OUT_1_OUTLVL_S 3 +#define GLTSYN_AUX_OUT_1_OUTLVL_M BIT(3) +#define GLTSYN_AUX_OUT_1_INT_ENA_S 4 +#define GLTSYN_AUX_OUT_1_INT_ENA_M BIT(4) +#define GLTSYN_AUX_OUT_1_PULSEW_S 8 +#define GLTSYN_AUX_OUT_1_PULSEW_M ICE_M(0xF, 8) +#define GLTSYN_AUX_OUT_2(_i) (0x000889A8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_OUT_2_MAX_INDEX 1 +#define GLTSYN_AUX_OUT_2_OUT_ENA_S 0 +#define GLTSYN_AUX_OUT_2_OUT_ENA_M BIT(0) +#define GLTSYN_AUX_OUT_2_OUTMOD_S 1 +#define GLTSYN_AUX_OUT_2_OUTMOD_M ICE_M(0x3, 1) +#define GLTSYN_AUX_OUT_2_OUTLVL_S 3 +#define GLTSYN_AUX_OUT_2_OUTLVL_M BIT(3) +#define GLTSYN_AUX_OUT_2_INT_ENA_S 4 +#define GLTSYN_AUX_OUT_2_INT_ENA_M BIT(4) +#define GLTSYN_AUX_OUT_2_PULSEW_S 8 +#define GLTSYN_AUX_OUT_2_PULSEW_M ICE_M(0xF, 8) +#define GLTSYN_AUX_OUT_3(_i) (0x000889B0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_AUX_OUT_3_MAX_INDEX 1 +#define GLTSYN_AUX_OUT_3_OUT_ENA_S 0 +#define GLTSYN_AUX_OUT_3_OUT_ENA_M BIT(0) +#define GLTSYN_AUX_OUT_3_OUTMOD_S 1 +#define GLTSYN_AUX_OUT_3_OUTMOD_M ICE_M(0x3, 1) +#define GLTSYN_AUX_OUT_3_OUTLVL_S 3 +#define GLTSYN_AUX_OUT_3_OUTLVL_M BIT(3) +#define GLTSYN_AUX_OUT_3_INT_ENA_S 4 +#define GLTSYN_AUX_OUT_3_INT_ENA_M BIT(4) +#define GLTSYN_AUX_OUT_3_PULSEW_S 8 +#define GLTSYN_AUX_OUT_3_PULSEW_M ICE_M(0xF, 8) +#define GLTSYN_CLKO_0(_i) (0x000889B8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_CLKO_0_MAX_INDEX 1 +#define GLTSYN_CLKO_0_TSYNCLKO_S 0 +#define GLTSYN_CLKO_0_TSYNCLKO_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_CLKO_1(_i) (0x000889C0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_CLKO_1_MAX_INDEX 1 +#define GLTSYN_CLKO_1_TSYNCLKO_S 0 +#define GLTSYN_CLKO_1_TSYNCLKO_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_CLKO_2(_i) (0x000889C8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_CLKO_2_MAX_INDEX 1 +#define GLTSYN_CLKO_2_TSYNCLKO_S 0 +#define GLTSYN_CLKO_2_TSYNCLKO_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_CLKO_3(_i) (0x000889D0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_CLKO_3_MAX_INDEX 1 +#define GLTSYN_CLKO_3_TSYNCLKO_S 0 +#define GLTSYN_CLKO_3_TSYNCLKO_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_CMD 0x00088810 /* Reset Source: CORER */ +#define GLTSYN_CMD_CMD_S 0 +#define GLTSYN_CMD_CMD_M ICE_M(0xFF, 0) +#define GLTSYN_CMD_SEL_MASTER_S 8 +#define GLTSYN_CMD_SEL_MASTER_M BIT(8) +#define GLTSYN_CMD_SYNC 0x00088814 /* Reset Source: CORER */ +#define GLTSYN_CMD_SYNC_SYNC_S 0 +#define GLTSYN_CMD_SYNC_SYNC_M ICE_M(0x3, 0) +#define GLTSYN_ENA(_i) (0x00088808 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_ENA_MAX_INDEX 1 +#define GLTSYN_ENA_TSYN_ENA_S 0 +#define GLTSYN_ENA_TSYN_ENA_M BIT(0) +#define GLTSYN_EVNT_H_0(_i) (0x00088970 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_H_0_MAX_INDEX 1 +#define GLTSYN_EVNT_H_0_TSYNEVNT_H_S 0 +#define GLTSYN_EVNT_H_0_TSYNEVNT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_H_1(_i) (0x00088980 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_H_1_MAX_INDEX 1 +#define GLTSYN_EVNT_H_1_TSYNEVNT_H_S 0 +#define 
GLTSYN_EVNT_H_1_TSYNEVNT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_H_2(_i) (0x00088990 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_H_2_MAX_INDEX 1 +#define GLTSYN_EVNT_H_2_TSYNEVNT_H_S 0 +#define GLTSYN_EVNT_H_2_TSYNEVNT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_L_0(_i) (0x00088968 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_L_0_MAX_INDEX 1 +#define GLTSYN_EVNT_L_0_TSYNEVNT_L_S 0 +#define GLTSYN_EVNT_L_0_TSYNEVNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_L_1(_i) (0x00088978 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_L_1_MAX_INDEX 1 +#define GLTSYN_EVNT_L_1_TSYNEVNT_L_S 0 +#define GLTSYN_EVNT_L_1_TSYNEVNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_EVNT_L_2(_i) (0x00088988 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_EVNT_L_2_MAX_INDEX 1 +#define GLTSYN_EVNT_L_2_TSYNEVNT_L_S 0 +#define GLTSYN_EVNT_L_2_TSYNEVNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_HHTIME_H(_i) (0x00088900 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_HHTIME_H_MAX_INDEX 1 +#define GLTSYN_HHTIME_H_TSYNEVNT_H_S 0 +#define GLTSYN_HHTIME_H_TSYNEVNT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_HHTIME_L(_i) (0x000888F8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_HHTIME_L_MAX_INDEX 1 +#define GLTSYN_HHTIME_L_TSYNEVNT_L_S 0 +#define GLTSYN_HHTIME_L_TSYNEVNT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_INCVAL_H(_i) (0x00088920 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_INCVAL_H_MAX_INDEX 1 +#define GLTSYN_INCVAL_H_INCVAL_H_S 0 +#define GLTSYN_INCVAL_H_INCVAL_H_M ICE_M(0xFF, 0) +#define GLTSYN_INCVAL_L(_i) (0x00088918 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_INCVAL_L_MAX_INDEX 1 +#define GLTSYN_INCVAL_L_INCVAL_L_S 0 +#define GLTSYN_INCVAL_L_INCVAL_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHADJ_H(_i) (0x00088910 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHADJ_H_MAX_INDEX 1 +#define GLTSYN_SHADJ_H_ADJUST_H_S 0 +#define GLTSYN_SHADJ_H_ADJUST_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHADJ_L(_i) (0x00088908 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHADJ_L_MAX_INDEX 1 +#define GLTSYN_SHADJ_L_ADJUST_L_S 0 +#define GLTSYN_SHADJ_L_ADJUST_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHTIME_0(_i) (0x000888E0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHTIME_0_MAX_INDEX 1 +#define GLTSYN_SHTIME_0_TSYNTIME_0_S 0 +#define GLTSYN_SHTIME_0_TSYNTIME_0_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHTIME_H(_i) (0x000888F0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHTIME_H_MAX_INDEX 1 +#define GLTSYN_SHTIME_H_TSYNTIME_H_S 0 +#define GLTSYN_SHTIME_H_TSYNTIME_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_SHTIME_L(_i) (0x000888E8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_SHTIME_L_MAX_INDEX 1 +#define GLTSYN_SHTIME_L_TSYNTIME_L_S 0 +#define GLTSYN_SHTIME_L_TSYNTIME_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_STAT(_i) (0x000888C0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_STAT_MAX_INDEX 1 +#define GLTSYN_STAT_EVENT0_S 0 +#define GLTSYN_STAT_EVENT0_M BIT(0) +#define GLTSYN_STAT_EVENT1_S 1 +#define GLTSYN_STAT_EVENT1_M BIT(1) +#define GLTSYN_STAT_EVENT2_S 2 +#define GLTSYN_STAT_EVENT2_M BIT(2) +#define GLTSYN_STAT_TGT0_S 4 +#define GLTSYN_STAT_TGT0_M BIT(4) +#define GLTSYN_STAT_TGT1_S 5 +#define GLTSYN_STAT_TGT1_M BIT(5) +#define GLTSYN_STAT_TGT2_S 6 +#define 
GLTSYN_STAT_TGT2_M BIT(6) +#define GLTSYN_STAT_TGT3_S 7 +#define GLTSYN_STAT_TGT3_M BIT(7) +#define GLTSYN_SYNC_DLAY 0x00088818 /* Reset Source: CORER */ +#define GLTSYN_SYNC_DLAY_SYNC_DELAY_S 0 +#define GLTSYN_SYNC_DLAY_SYNC_DELAY_M ICE_M(0x1F, 0) +#define GLTSYN_TGT_H_0(_i) (0x00088930 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_H_0_MAX_INDEX 1 +#define GLTSYN_TGT_H_0_TSYNTGTT_H_S 0 +#define GLTSYN_TGT_H_0_TSYNTGTT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_H_1(_i) (0x00088940 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_H_1_MAX_INDEX 1 +#define GLTSYN_TGT_H_1_TSYNTGTT_H_S 0 +#define GLTSYN_TGT_H_1_TSYNTGTT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_H_2(_i) (0x00088950 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_H_2_MAX_INDEX 1 +#define GLTSYN_TGT_H_2_TSYNTGTT_H_S 0 +#define GLTSYN_TGT_H_2_TSYNTGTT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_H_3(_i) (0x00088960 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_H_3_MAX_INDEX 1 +#define GLTSYN_TGT_H_3_TSYNTGTT_H_S 0 +#define GLTSYN_TGT_H_3_TSYNTGTT_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_L_0(_i) (0x00088928 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_L_0_MAX_INDEX 1 +#define GLTSYN_TGT_L_0_TSYNTGTT_L_S 0 +#define GLTSYN_TGT_L_0_TSYNTGTT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_L_1(_i) (0x00088938 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_L_1_MAX_INDEX 1 +#define GLTSYN_TGT_L_1_TSYNTGTT_L_S 0 +#define GLTSYN_TGT_L_1_TSYNTGTT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_L_2(_i) (0x00088948 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_L_2_MAX_INDEX 1 +#define GLTSYN_TGT_L_2_TSYNTGTT_L_S 0 +#define GLTSYN_TGT_L_2_TSYNTGTT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TGT_L_3(_i) (0x00088958 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TGT_L_3_MAX_INDEX 1 +#define GLTSYN_TGT_L_3_TSYNTGTT_L_S 0 +#define GLTSYN_TGT_L_3_TSYNTGTT_L_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TIME_0(_i) (0x000888C8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TIME_0_MAX_INDEX 1 +#define GLTSYN_TIME_0_TSYNTIME_0_S 0 +#define GLTSYN_TIME_0_TSYNTIME_0_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TIME_H(_i) (0x000888D8 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TIME_H_MAX_INDEX 1 +#define GLTSYN_TIME_H_TSYNTIME_H_S 0 +#define GLTSYN_TIME_H_TSYNTIME_H_M ICE_M(0xFFFFFFFF, 0) +#define GLTSYN_TIME_L(_i) (0x000888D0 + ((_i) * 4)) /* _i=0...1 */ /* Reset Source: CORER */ +#define GLTSYN_TIME_L_MAX_INDEX 1 +#define GLTSYN_TIME_L_TSYNTIME_L_S 0 +#define GLTSYN_TIME_L_TSYNTIME_L_M ICE_M(0xFFFFFFFF, 0) +#define PFHH_SEM 0x000A4200 /* Reset Source: PFR */ +#define PFHH_SEM_BUSY_S 0 +#define PFHH_SEM_BUSY_M BIT(0) +#define PFHH_SEM_PF_OWNER_S 4 +#define PFHH_SEM_PF_OWNER_M ICE_M(0x7, 4) +#define PFTSYN_SEM 0x00088880 /* Reset Source: PFR */ +#define PFTSYN_SEM_BUSY_S 0 +#define PFTSYN_SEM_BUSY_M BIT(0) +#define PFTSYN_SEM_PF_OWNER_S 4 +#define PFTSYN_SEM_PF_OWNER_M ICE_M(0x7, 4) +#define GLPE_TSCD_FLR(_i) (0x0051E24C + ((_i) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define GLPE_TSCD_FLR_MAX_INDEX 3 +#define GLPE_TSCD_FLR_DRAIN_VCTR_ID_S 0 +#define GLPE_TSCD_FLR_DRAIN_VCTR_ID_M ICE_M(0x3, 0) +#define GLPE_TSCD_FLR_PORT_S 2 +#define GLPE_TSCD_FLR_PORT_M ICE_M(0x7, 2) +#define GLPE_TSCD_FLR_PF_NUM_S 5 +#define GLPE_TSCD_FLR_PF_NUM_M ICE_M(0x7, 5) +#define GLPE_TSCD_FLR_VM_VF_TYPE_S 8 +#define 
GLPE_TSCD_FLR_VM_VF_TYPE_M ICE_M(0x3, 8) +#define GLPE_TSCD_FLR_VM_VF_NUM_S 16 +#define GLPE_TSCD_FLR_VM_VF_NUM_M ICE_M(0x3FF, 16) +#define GLPE_TSCD_FLR_VLD_S 31 +#define GLPE_TSCD_FLR_VLD_M BIT(31) +#define GLPE_TSCD_PEPM 0x0051E228 /* Reset Source: CORER */ +#define GLPE_TSCD_PEPM_MDQ_CREDITS_S 0 +#define GLPE_TSCD_PEPM_MDQ_CREDITS_M ICE_M(0xFF, 0) +#define PF_VIRT_VSTATUS 0x0009E680 /* Reset Source: PFR */ +#define PF_VIRT_VSTATUS_NUM_VFS_S 0 +#define PF_VIRT_VSTATUS_NUM_VFS_M ICE_M(0xFF, 0) +#define PF_VIRT_VSTATUS_TOTAL_VFS_S 8 +#define PF_VIRT_VSTATUS_TOTAL_VFS_M ICE_M(0xFF, 8) +#define PF_VIRT_VSTATUS_IOV_ACTIVE_S 16 +#define PF_VIRT_VSTATUS_IOV_ACTIVE_M BIT(16) +#define PF_VT_PFALLOC 0x001D2480 /* Reset Source: CORER */ +#define PF_VT_PFALLOC_FIRSTVF_S 0 +#define PF_VT_PFALLOC_FIRSTVF_M ICE_M(0xFF, 0) +#define PF_VT_PFALLOC_LASTVF_S 8 +#define PF_VT_PFALLOC_LASTVF_M ICE_M(0xFF, 8) +#define PF_VT_PFALLOC_VALID_S 31 +#define PF_VT_PFALLOC_VALID_M BIT(31) +#define PF_VT_PFALLOC_HIF 0x0009DD80 /* Reset Source: PCIR */ +#define PF_VT_PFALLOC_HIF_FIRSTVF_S 0 +#define PF_VT_PFALLOC_HIF_FIRSTVF_M ICE_M(0xFF, 0) +#define PF_VT_PFALLOC_HIF_LASTVF_S 8 +#define PF_VT_PFALLOC_HIF_LASTVF_M ICE_M(0xFF, 8) +#define PF_VT_PFALLOC_HIF_VALID_S 31 +#define PF_VT_PFALLOC_HIF_VALID_M BIT(31) +#define PF_VT_PFALLOC_PCIE 0x000BE080 /* Reset Source: PCIR */ +#define PF_VT_PFALLOC_PCIE_FIRSTVF_S 0 +#define PF_VT_PFALLOC_PCIE_FIRSTVF_M ICE_M(0xFF, 0) +#define PF_VT_PFALLOC_PCIE_LASTVF_S 8 +#define PF_VT_PFALLOC_PCIE_LASTVF_M ICE_M(0xFF, 8) +#define PF_VT_PFALLOC_PCIE_VALID_S 31 +#define PF_VT_PFALLOC_PCIE_VALID_M BIT(31) +#define VSI_L2TAGSTXVALID(_VSI) (0x00046000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_L2TAGSTXVALID_MAX_INDEX 767 +#define VSI_L2TAGSTXVALID_L2TAG1INSERTID_S 0 +#define VSI_L2TAGSTXVALID_L2TAG1INSERTID_M ICE_M(0x7, 0) +#define VSI_L2TAGSTXVALID_L2TAG1INSERTID_VALID_S 3 +#define VSI_L2TAGSTXVALID_L2TAG1INSERTID_VALID_M BIT(3) +#define VSI_L2TAGSTXVALID_L2TAG2INSERTID_S 4 +#define VSI_L2TAGSTXVALID_L2TAG2INSERTID_M ICE_M(0x7, 4) +#define VSI_L2TAGSTXVALID_L2TAG2INSERTID_VALID_S 7 +#define VSI_L2TAGSTXVALID_L2TAG2INSERTID_VALID_M BIT(7) +#define VSI_L2TAGSTXVALID_TIR0INSERTID_S 16 +#define VSI_L2TAGSTXVALID_TIR0INSERTID_M ICE_M(0x7, 16) +#define VSI_L2TAGSTXVALID_TIR0_INSERT_S 19 +#define VSI_L2TAGSTXVALID_TIR0_INSERT_M BIT(19) +#define VSI_L2TAGSTXVALID_TIR1INSERTID_S 20 +#define VSI_L2TAGSTXVALID_TIR1INSERTID_M ICE_M(0x7, 20) +#define VSI_L2TAGSTXVALID_TIR1_INSERT_S 23 +#define VSI_L2TAGSTXVALID_TIR1_INSERT_M BIT(23) +#define VSI_L2TAGSTXVALID_TIR2INSERTID_S 24 +#define VSI_L2TAGSTXVALID_TIR2INSERTID_M ICE_M(0x7, 24) +#define VSI_L2TAGSTXVALID_TIR2_INSERT_S 27 +#define VSI_L2TAGSTXVALID_TIR2_INSERT_M BIT(27) +#define VSI_PASID(_VSI) (0x0009C000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSI_PASID_MAX_INDEX 767 +#define VSI_PASID_PASID_S 0 +#define VSI_PASID_PASID_M ICE_M(0xFFFFF, 0) +#define VSI_PASID_EN_S 31 +#define VSI_PASID_EN_M BIT(31) +#define VSI_RUPR(_VSI) (0x00050000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_RUPR_MAX_INDEX 767 +#define VSI_RUPR_UP0_S 0 +#define VSI_RUPR_UP0_M ICE_M(0x7, 0) +#define VSI_RUPR_UP1_S 3 +#define VSI_RUPR_UP1_M ICE_M(0x7, 3) +#define VSI_RUPR_UP2_S 6 +#define VSI_RUPR_UP2_M ICE_M(0x7, 6) +#define VSI_RUPR_UP3_S 9 +#define VSI_RUPR_UP3_M ICE_M(0x7, 9) +#define VSI_RUPR_UP4_S 12 +#define VSI_RUPR_UP4_M ICE_M(0x7, 12) +#define VSI_RUPR_UP5_S 15 +#define VSI_RUPR_UP5_M 
ICE_M(0x7, 15) +#define VSI_RUPR_UP6_S 18 +#define VSI_RUPR_UP6_M ICE_M(0x7, 18) +#define VSI_RUPR_UP7_S 21 +#define VSI_RUPR_UP7_M ICE_M(0x7, 21) +#define VSI_RXSWCTRL(_VSI) (0x00205000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_RXSWCTRL_MAX_INDEX 767 +#define VSI_RXSWCTRL_MACVSIPRUNEENABLE_S 8 +#define VSI_RXSWCTRL_MACVSIPRUNEENABLE_M BIT(8) +#define VSI_RXSWCTRL_PRUNEENABLE_S 9 +#define VSI_RXSWCTRL_PRUNEENABLE_M ICE_M(0xF, 9) +#define VSI_RXSWCTRL_SRCPRUNEENABLE_S 13 +#define VSI_RXSWCTRL_SRCPRUNEENABLE_M BIT(13) +#define VSI_SRCSWCTRL(_VSI) (0x00209000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_SRCSWCTRL_MAX_INDEX 767 +#define VSI_SRCSWCTRL_ALLOWDESTOVERRIDE_S 0 +#define VSI_SRCSWCTRL_ALLOWDESTOVERRIDE_M BIT(0) +#define VSI_SRCSWCTRL_ALLOWLOOPBACK_S 1 +#define VSI_SRCSWCTRL_ALLOWLOOPBACK_M BIT(1) +#define VSI_SRCSWCTRL_LANENABLE_S 2 +#define VSI_SRCSWCTRL_LANENABLE_M BIT(2) +#define VSI_SRCSWCTRL_MACAS_S 3 +#define VSI_SRCSWCTRL_MACAS_M BIT(3) +#define VSI_SRCSWCTRL_PRUNEENABLE_S 4 +#define VSI_SRCSWCTRL_PRUNEENABLE_M ICE_M(0xF, 4) +#define VSI_SWITCHID(_VSI) (0x00215000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_SWITCHID_MAX_INDEX 767 +#define VSI_SWITCHID_SWITCHID_S 0 +#define VSI_SWITCHID_SWITCHID_M ICE_M(0xFF, 0) +#define VSI_SWT_MIREG(_VSI) (0x00207000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_SWT_MIREG_MAX_INDEX 767 +#define VSI_SWT_MIREG_MIRRULE_S 0 +#define VSI_SWT_MIREG_MIRRULE_M ICE_M(0x3F, 0) +#define VSI_SWT_MIREG_MIRENA_S 7 +#define VSI_SWT_MIREG_MIRENA_M BIT(7) +#define VSI_SWT_MIRIG(_VSI) (0x00208000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_SWT_MIRIG_MAX_INDEX 767 +#define VSI_SWT_MIRIG_MIRRULE_S 0 +#define VSI_SWT_MIRIG_MIRRULE_M ICE_M(0x3F, 0) +#define VSI_SWT_MIRIG_MIRENA_S 7 +#define VSI_SWT_MIRIG_MIRENA_M BIT(7) +#define VSI_TAIR(_VSI) (0x00044000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSI_TAIR_MAX_INDEX 767 +#define VSI_TAIR_PORT_TAG_ID_S 0 +#define VSI_TAIR_PORT_TAG_ID_M ICE_M(0xFFFF, 0) +#define VSI_TAR(_VSI) (0x00045000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TAR_MAX_INDEX 767 +#define VSI_TAR_ACCEPTTAGGED_S 0 +#define VSI_TAR_ACCEPTTAGGED_M ICE_M(0x3FF, 0) +#define VSI_TAR_ACCEPTUNTAGGED_S 16 +#define VSI_TAR_ACCEPTUNTAGGED_M ICE_M(0x3FF, 16) +#define VSI_TIR_0(_VSI) (0x00041000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TIR_0_MAX_INDEX 767 +#define VSI_TIR_0_PORT_TAG_ID_S 0 +#define VSI_TIR_0_PORT_TAG_ID_M ICE_M(0xFFFF, 0) +#define VSI_TIR_1(_VSI) (0x00042000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TIR_1_MAX_INDEX 767 +#define VSI_TIR_1_PORT_TAG_ID_S 0 +#define VSI_TIR_1_PORT_TAG_ID_M ICE_M(0xFFFFFFFF, 0) +#define VSI_TIR_2(_VSI) (0x00043000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TIR_2_MAX_INDEX 767 +#define VSI_TIR_2_PORT_TAG_ID_S 0 +#define VSI_TIR_2_PORT_TAG_ID_M ICE_M(0xFFFF, 0) +#define VSI_TSR(_VSI) (0x00051000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TSR_MAX_INDEX 767 +#define VSI_TSR_STRIPTAG_S 0 +#define VSI_TSR_STRIPTAG_M ICE_M(0x3FF, 0) +#define VSI_TSR_SHOWTAG_S 10 +#define VSI_TSR_SHOWTAG_M ICE_M(0x3FF, 10) +#define VSI_TSR_SHOWPRIONLY_S 20 +#define VSI_TSR_SHOWPRIONLY_M ICE_M(0x3FF, 20) +#define VSI_TUPIOM(_VSI) (0x00048000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TUPIOM_MAX_INDEX 767 
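+/*
+ * Usage sketch for the per-VSI tag registers above: each register is
+ * indexed by VSI number, and VSI_TIR_0 carries the 16-bit tag inserted
+ * on transmit.  Hypothetical helper, assuming wr32():
+ */
+static inline void ice_set_vsi_tir0_tag(struct ice_hw *hw, u16 vsi, u16 tag)
+{
+	wr32(hw, VSI_TIR_0(vsi), ((u32)tag << VSI_TIR_0_PORT_TAG_ID_S) &
+				 VSI_TIR_0_PORT_TAG_ID_M);
+}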
+#define VSI_TUPIOM_UP0_S 0 +#define VSI_TUPIOM_UP0_M ICE_M(0x7, 0) +#define VSI_TUPIOM_UP1_S 3 +#define VSI_TUPIOM_UP1_M ICE_M(0x7, 3) +#define VSI_TUPIOM_UP2_S 6 +#define VSI_TUPIOM_UP2_M ICE_M(0x7, 6) +#define VSI_TUPIOM_UP3_S 9 +#define VSI_TUPIOM_UP3_M ICE_M(0x7, 9) +#define VSI_TUPIOM_UP4_S 12 +#define VSI_TUPIOM_UP4_M ICE_M(0x7, 12) +#define VSI_TUPIOM_UP5_S 15 +#define VSI_TUPIOM_UP5_M ICE_M(0x7, 15) +#define VSI_TUPIOM_UP6_S 18 +#define VSI_TUPIOM_UP6_M ICE_M(0x7, 18) +#define VSI_TUPIOM_UP7_S 21 +#define VSI_TUPIOM_UP7_M ICE_M(0x7, 21) +#define VSI_TUPR(_VSI) (0x00047000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSI_TUPR_MAX_INDEX 767 +#define VSI_TUPR_UP0_S 0 +#define VSI_TUPR_UP0_M ICE_M(0x7, 0) +#define VSI_TUPR_UP1_S 3 +#define VSI_TUPR_UP1_M ICE_M(0x7, 3) +#define VSI_TUPR_UP2_S 6 +#define VSI_TUPR_UP2_M ICE_M(0x7, 6) +#define VSI_TUPR_UP3_S 9 +#define VSI_TUPR_UP3_M ICE_M(0x7, 9) +#define VSI_TUPR_UP4_S 12 +#define VSI_TUPR_UP4_M ICE_M(0x7, 12) +#define VSI_TUPR_UP5_S 15 +#define VSI_TUPR_UP5_M ICE_M(0x7, 15) +#define VSI_TUPR_UP6_S 18 +#define VSI_TUPR_UP6_M ICE_M(0x7, 18) +#define VSI_TUPR_UP7_S 21 +#define VSI_TUPR_UP7_M ICE_M(0x7, 21) +#define VSI_VSI2F(_VSI) (0x001D0000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSI_VSI2F_MAX_INDEX 767 +#define VSI_VSI2F_VFVMNUMBER_S 0 +#define VSI_VSI2F_VFVMNUMBER_M ICE_M(0x3FF, 0) +#define VSI_VSI2F_FUNCTIONTYPE_S 10 +#define VSI_VSI2F_FUNCTIONTYPE_M ICE_M(0x3, 10) +#define VSI_VSI2F_PFNUMBER_S 12 +#define VSI_VSI2F_PFNUMBER_M ICE_M(0x7, 12) +#define VSI_VSI2F_BUFFERNUMBER_S 16 +#define VSI_VSI2F_BUFFERNUMBER_M ICE_M(0x7, 16) +#define VSI_VSI2F_VSI_NUMBER_S 20 +#define VSI_VSI2F_VSI_NUMBER_M ICE_M(0x3FF, 20) +#define VSI_VSI2F_VSI_ENABLE_S 31 +#define VSI_VSI2F_VSI_ENABLE_M BIT(31) +#define VSIQF_FD_CNT(_VSI) (0x00464000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: PFR */ +#define VSIQF_FD_CNT_MAX_INDEX 767 +#define VSIQF_FD_CNT_FD_GCNT_S 0 +#define VSIQF_FD_CNT_FD_GCNT_M ICE_M(0x3FFF, 0) +#define VSIQF_FD_CNT_FD_BCNT_S 16 +#define VSIQF_FD_CNT_FD_BCNT_M ICE_M(0x3FFF, 16) +#define VSIQF_FD_CTL1(_VSI) (0x00411000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_FD_CTL1_MAX_INDEX 767 +#define VSIQF_FD_CTL1_FLT_ENA_S 0 +#define VSIQF_FD_CTL1_FLT_ENA_M BIT(0) +#define VSIQF_FD_CTL1_CFG_ENA_S 1 +#define VSIQF_FD_CTL1_CFG_ENA_M BIT(1) +#define VSIQF_FD_CTL1_EVICT_ENA_S 2 +#define VSIQF_FD_CTL1_EVICT_ENA_M BIT(2) +#define VSIQF_FD_DFLT(_VSI) (0x00457000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_FD_DFLT_MAX_INDEX 767 +#define VSIQF_FD_DFLT_DEFLT_QINDX_S 0 +#define VSIQF_FD_DFLT_DEFLT_QINDX_M ICE_M(0x7FF, 0) +#define VSIQF_FD_DFLT_DEFLT_TOQUEUE_S 12 +#define VSIQF_FD_DFLT_DEFLT_TOQUEUE_M ICE_M(0x7, 12) +#define VSIQF_FD_DFLT_COMP_QINDX_S 16 +#define VSIQF_FD_DFLT_COMP_QINDX_M ICE_M(0x7FF, 16) +#define VSIQF_FD_DFLT_DEFLT_QINDX_PRIO_S 28 +#define VSIQF_FD_DFLT_DEFLT_QINDX_PRIO_M ICE_M(0x7, 28) +#define VSIQF_FD_DFLT_DEFLT_DROP_S 31 +#define VSIQF_FD_DFLT_DEFLT_DROP_M BIT(31) +#define VSIQF_FD_SIZE(_VSI) (0x00462000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_FD_SIZE_MAX_INDEX 767 +#define VSIQF_FD_SIZE_FD_GSIZE_S 0 +#define VSIQF_FD_SIZE_FD_GSIZE_M ICE_M(0x3FFF, 0) +#define VSIQF_FD_SIZE_FD_BSIZE_S 16 +#define VSIQF_FD_SIZE_FD_BSIZE_M ICE_M(0x3FFF, 16) +#define VSIQF_HASH_CTL(_VSI) (0x0040D000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */ +#define VSIQF_HASH_CTL_MAX_INDEX 767 +#define 
VSIQF_HASH_CTL_HASH_LUT_SEL_S 0
+#define VSIQF_HASH_CTL_HASH_LUT_SEL_M ICE_M(0x3, 0)
+#define VSIQF_HASH_CTL_GLOB_LUT_S 2
+#define VSIQF_HASH_CTL_GLOB_LUT_M ICE_M(0xF, 2)
+#define VSIQF_HASH_CTL_HASH_SCHEME_S 6
+#define VSIQF_HASH_CTL_HASH_SCHEME_M ICE_M(0x3, 6)
+#define VSIQF_HASH_CTL_TC_OVER_SEL_S 8
+#define VSIQF_HASH_CTL_TC_OVER_SEL_M ICE_M(0x1F, 8)
+#define VSIQF_HASH_CTL_TC_OVER_ENA_S 15
+#define VSIQF_HASH_CTL_TC_OVER_ENA_M BIT(15)
+#define VSIQF_HKEY(_i, _VSI) (0x00400000 + ((_i) * 4096 + (_VSI) * 4)) /* _i=0...12, _VSI=0...767 */ /* Reset Source: PFR */
 #define VSIQF_HKEY_MAX_INDEX 12
+#define VSIQF_HKEY_KEY_0_S 0
+#define VSIQF_HKEY_KEY_0_M ICE_M(0xFF, 0)
+#define VSIQF_HKEY_KEY_1_S 8
+#define VSIQF_HKEY_KEY_1_M ICE_M(0xFF, 8)
+#define VSIQF_HKEY_KEY_2_S 16
+#define VSIQF_HKEY_KEY_2_M ICE_M(0xFF, 16)
+#define VSIQF_HKEY_KEY_3_S 24
+#define VSIQF_HKEY_KEY_3_M ICE_M(0xFF, 24)
+#define VSIQF_HLUT(_i, _VSI) (0x00420000 + ((_i) * 4096 + (_VSI) * 4)) /* _i=0...15, _VSI=0...767 */ /* Reset Source: PFR */
 #define VSIQF_HLUT_MAX_INDEX 15
-#define VFINT_DYN_CTLN(_i) (0x00003800 + ((_i) * 4))
+#define VSIQF_HLUT_LUT0_S 0
+#define VSIQF_HLUT_LUT0_M ICE_M(0xF, 0)
+#define VSIQF_HLUT_LUT1_S 8
+#define VSIQF_HLUT_LUT1_M ICE_M(0xF, 8)
+#define VSIQF_HLUT_LUT2_S 16
+#define VSIQF_HLUT_LUT2_M ICE_M(0xF, 16)
+#define VSIQF_HLUT_LUT3_S 24
+#define VSIQF_HLUT_LUT3_M ICE_M(0xF, 24)
+#define VSIQF_PE_CTL1(_VSI) (0x00414000 + ((_VSI) * 4)) /* _i=0...767 */ /* Reset Source: CORER */
+#define VSIQF_PE_CTL1_MAX_INDEX 767
+#define VSIQF_PE_CTL1_PE_FLTENA_S 0
+#define VSIQF_PE_CTL1_PE_FLTENA_M BIT(0)
+#define VSIQF_TC_REGION(_i, _VSI) (0x00448000 + ((_i) * 4096 + (_VSI) * 4)) /* _i=0...3, _VSI=0...767 */ /* Reset Source: CORER */
+#define VSIQF_TC_REGION_MAX_INDEX 3
+#define VSIQF_TC_REGION_TC_BASE0_S 0
+#define VSIQF_TC_REGION_TC_BASE0_M ICE_M(0x7FF, 0)
+#define VSIQF_TC_REGION_TC_SIZE0_S 11
+#define VSIQF_TC_REGION_TC_SIZE0_M ICE_M(0xF, 11)
+#define VSIQF_TC_REGION_TC_BASE1_S 16
+#define VSIQF_TC_REGION_TC_BASE1_M ICE_M(0x7FF, 16)
+#define VSIQF_TC_REGION_TC_SIZE1_S 27
+#define VSIQF_TC_REGION_TC_SIZE1_M ICE_M(0xF, 27)
+#define GLPM_WUMC 0x0009DEE4 /* Reset Source: POR */
+#define GLPM_WUMC_MNG_WU_PF_S 16
+#define GLPM_WUMC_MNG_WU_PF_M ICE_M(0xFF, 16)
+#define PFPM_APM 0x000B8080 /* Reset Source: POR */
+#define PFPM_APM_APME_S 0
+#define PFPM_APM_APME_M BIT(0)
+#define PFPM_WUC 0x0009DC80 /* Reset Source: POR */
+#define PFPM_WUC_EN_APM_D0_S 5
+#define PFPM_WUC_EN_APM_D0_M BIT(5)
+#define PFPM_WUFC 0x0009DC00 /* Reset Source: POR */
+#define PFPM_WUFC_LNKC_S 0
+#define PFPM_WUFC_LNKC_M BIT(0)
+#define PFPM_WUFC_MAG_S 1
+#define PFPM_WUFC_MAG_M BIT(1)
+#define PFPM_WUFC_MNG_S 3
+#define PFPM_WUFC_MNG_M BIT(3)
+#define PFPM_WUFC_FLX0_ACT_S 4
+#define PFPM_WUFC_FLX0_ACT_M BIT(4)
+#define PFPM_WUFC_FLX1_ACT_S 5
+#define PFPM_WUFC_FLX1_ACT_M BIT(5)
+#define PFPM_WUFC_FLX2_ACT_S 6
+#define PFPM_WUFC_FLX2_ACT_M BIT(6)
+#define PFPM_WUFC_FLX3_ACT_S 7
+#define PFPM_WUFC_FLX3_ACT_M BIT(7)
+#define PFPM_WUFC_FLX4_ACT_S 8
+#define PFPM_WUFC_FLX4_ACT_M BIT(8)
+#define PFPM_WUFC_FLX5_ACT_S 9
+#define PFPM_WUFC_FLX5_ACT_M BIT(9)
+#define PFPM_WUFC_FLX6_ACT_S 10
+#define PFPM_WUFC_FLX6_ACT_M BIT(10)
+#define PFPM_WUFC_FLX7_ACT_S 11
+#define PFPM_WUFC_FLX7_ACT_M BIT(11)
+#define PFPM_WUFC_FLX0_S 16
+#define PFPM_WUFC_FLX0_M BIT(16)
+#define PFPM_WUFC_FLX1_S 17
+#define PFPM_WUFC_FLX1_M BIT(17)
+#define PFPM_WUFC_FLX2_S 18
+#define PFPM_WUFC_FLX2_M BIT(18)
+#define PFPM_WUFC_FLX3_S 19
+#define PFPM_WUFC_FLX3_M BIT(19)
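+/*
+ * Usage sketch for the VSIQF_HKEY layout above: each of the 13 key
+ * registers (MAX_INDEX 12) packs four key bytes (KEY_0..KEY_3), so a
+ * 52-byte RSS key is written 4 bytes at a time.  Hypothetical helper,
+ * assuming wr32() and a caller-supplied 52-byte key:
+ */
+static inline void
+ice_write_vsi_rss_key(struct ice_hw *hw, u16 vsi, const u8 *key)
+{
+	int i;
+
+	for (i = 0; i <= VSIQF_HKEY_MAX_INDEX; i++) {
+		const u8 *k = &key[i * 4];
+		u32 val = k[0] | ((u32)k[1] << VSIQF_HKEY_KEY_1_S) |
+			  ((u32)k[2] << VSIQF_HKEY_KEY_2_S) |
+			  ((u32)k[3] << VSIQF_HKEY_KEY_3_S);
+
+		wr32(hw, VSIQF_HKEY(i, vsi), val);
+	}
+}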
+#define PFPM_WUFC_FLX4_S 20 +#define PFPM_WUFC_FLX4_M BIT(20) +#define PFPM_WUFC_FLX5_S 21 +#define PFPM_WUFC_FLX5_M BIT(21) +#define PFPM_WUFC_FLX6_S 22 +#define PFPM_WUFC_FLX6_M BIT(22) +#define PFPM_WUFC_FLX7_S 23 +#define PFPM_WUFC_FLX7_M BIT(23) +#define PFPM_WUFC_FW_RST_WK_S 31 +#define PFPM_WUFC_FW_RST_WK_M BIT(31) +#define PFPM_WUS 0x0009DB80 /* Reset Source: POR */ +#define PFPM_WUS_LNKC_S 0 +#define PFPM_WUS_LNKC_M BIT(0) +#define PFPM_WUS_MAG_S 1 +#define PFPM_WUS_MAG_M BIT(1) +#define PFPM_WUS_PME_STATUS_S 2 +#define PFPM_WUS_PME_STATUS_M BIT(2) +#define PFPM_WUS_MNG_S 3 +#define PFPM_WUS_MNG_M BIT(3) +#define PFPM_WUS_FLX0_S 16 +#define PFPM_WUS_FLX0_M BIT(16) +#define PFPM_WUS_FLX1_S 17 +#define PFPM_WUS_FLX1_M BIT(17) +#define PFPM_WUS_FLX2_S 18 +#define PFPM_WUS_FLX2_M BIT(18) +#define PFPM_WUS_FLX3_S 19 +#define PFPM_WUS_FLX3_M BIT(19) +#define PFPM_WUS_FLX4_S 20 +#define PFPM_WUS_FLX4_M BIT(20) +#define PFPM_WUS_FLX5_S 21 +#define PFPM_WUS_FLX5_M BIT(21) +#define PFPM_WUS_FLX6_S 22 +#define PFPM_WUS_FLX6_M BIT(22) +#define PFPM_WUS_FLX7_S 23 +#define PFPM_WUS_FLX7_M BIT(23) +#define PFPM_WUS_FW_RST_WK_S 31 +#define PFPM_WUS_FW_RST_WK_M BIT(31) +#define PRTPM_SAH(_i) (0x001E3BA0 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: PFR */ +#define PRTPM_SAH_MAX_INDEX 3 +#define PRTPM_SAH_PFPM_SAH_S 0 +#define PRTPM_SAH_PFPM_SAH_M ICE_M(0xFFFF, 0) +#define PRTPM_SAH_PF_NUM_S 26 +#define PRTPM_SAH_PF_NUM_M ICE_M(0xF, 26) +#define PRTPM_SAH_MC_MAG_EN_S 30 +#define PRTPM_SAH_MC_MAG_EN_M BIT(30) +#define PRTPM_SAH_AV_S 31 +#define PRTPM_SAH_AV_M BIT(31) +#define PRTPM_SAL(_i) (0x001E3B20 + ((_i) * 32)) /* _i=0...3 */ /* Reset Source: PFR */ +#define PRTPM_SAL_MAX_INDEX 3 +#define PRTPM_SAL_PFPM_SAL_S 0 +#define PRTPM_SAL_PFPM_SAL_M ICE_M(0xFFFFFFFF, 0) +#define GLPE_CQM_FUNC_INVALIDATE 0x00503300 /* Reset Source: CORER */ +#define GLPE_CQM_FUNC_INVALIDATE_PF_NUM_S 0 +#define GLPE_CQM_FUNC_INVALIDATE_PF_NUM_M ICE_M(0x7, 0) +#define GLPE_CQM_FUNC_INVALIDATE_VM_VF_NUM_S 3 +#define GLPE_CQM_FUNC_INVALIDATE_VM_VF_NUM_M ICE_M(0x3FF, 3) +#define GLPE_CQM_FUNC_INVALIDATE_VM_VF_TYPE_S 13 +#define GLPE_CQM_FUNC_INVALIDATE_VM_VF_TYPE_M ICE_M(0x3, 13) +#define GLPE_CQM_FUNC_INVALIDATE_ENABLE_S 31 +#define GLPE_CQM_FUNC_INVALIDATE_ENABLE_M BIT(31) +#define VFPE_MRTEIDXMASK 0x00009000 /* Reset Source: PFR */ +#define VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_S 0 +#define VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_M ICE_M(0x1F, 0) +#define GLTSYN_HH_DLAY 0x0008881C /* Reset Source: CORER */ +#define GLTSYN_HH_DLAY_SYNC_DELAY_S 0 +#define GLTSYN_HH_DLAY_SYNC_DELAY_M ICE_M(0xF, 0) +#define VF_MBX_ARQBAH1 0x00006000 /* Reset Source: CORER */ +#define VF_MBX_ARQBAH1_ARQBAH_S 0 +#define VF_MBX_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_ARQBAL1 0x00006C00 /* Reset Source: CORER */ +#define VF_MBX_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_MBX_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_ARQBAL1_ARQBAL_S 6 +#define VF_MBX_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_ARQH1 0x00007400 /* Reset Source: CORER */ +#define VF_MBX_ARQH1_ARQH_S 0 +#define VF_MBX_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_ARQLEN1 0x00008000 /* Reset Source: PFR */ +#define VF_MBX_ARQLEN1_ARQLEN_S 0 +#define VF_MBX_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_ARQLEN1_ARQVFE_S 28 +#define VF_MBX_ARQLEN1_ARQVFE_M BIT(28) +#define VF_MBX_ARQLEN1_ARQOVFL_S 29 +#define VF_MBX_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_MBX_ARQLEN1_ARQCRIT_S 30 +#define VF_MBX_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_MBX_ARQLEN1_ARQENABLE_S 31 +#define 
VF_MBX_ARQLEN1_ARQENABLE_M BIT(31)
+#define VF_MBX_ARQT1 0x00007000 /* Reset Source: CORER */
+#define VF_MBX_ARQT1_ARQT_S 0
+#define VF_MBX_ARQT1_ARQT_M ICE_M(0x3FF, 0)
+#define VF_MBX_ATQBAH1 0x00007800 /* Reset Source: CORER */
+#define VF_MBX_ATQBAH1_ATQBAH_S 0
+#define VF_MBX_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0)
+#define VF_MBX_ATQBAL1 0x00007C00 /* Reset Source: CORER */
+#define VF_MBX_ATQBAL1_ATQBAL_S 6
+#define VF_MBX_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6)
+#define VF_MBX_ATQH1 0x00006400 /* Reset Source: CORER */
+#define VF_MBX_ATQH1_ATQH_S 0
+#define VF_MBX_ATQH1_ATQH_M ICE_M(0x3FF, 0)
+#define VF_MBX_ATQLEN1 0x00006800 /* Reset Source: PFR */
+#define VF_MBX_ATQLEN1_ATQLEN_S 0
+#define VF_MBX_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0)
+#define VF_MBX_ATQLEN1_ATQVFE_S 28
+#define VF_MBX_ATQLEN1_ATQVFE_M BIT(28)
+#define VF_MBX_ATQLEN1_ATQOVFL_S 29
+#define VF_MBX_ATQLEN1_ATQOVFL_M BIT(29)
+#define VF_MBX_ATQLEN1_ATQCRIT_S 30
+#define VF_MBX_ATQLEN1_ATQCRIT_M BIT(30)
+#define VF_MBX_ATQLEN1_ATQENABLE_S 31
+#define VF_MBX_ATQLEN1_ATQENABLE_M BIT(31)
+#define VF_MBX_ATQT1 0x00008400 /* Reset Source: CORER */
+#define VF_MBX_ATQT1_ATQT_S 0
+#define VF_MBX_ATQT1_ATQT_M ICE_M(0x3FF, 0)
+#define PFPCI_VF_FLUSH_DONE1 0x0000E400 /* Reset Source: PCIR */
+#define PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_S 0
+#define PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_M BIT(0)
+#define VFGEN_RSTAT1 0x00008800 /* Reset Source: VFR */
+#define VFGEN_RSTAT1_VFR_STATE_S 0
+#define VFGEN_RSTAT1_VFR_STATE_M ICE_M(0x3, 0)
+#define VFINT_DYN_CTL0 0x00005C00 /* Reset Source: CORER */
+#define VFINT_DYN_CTL0_INTENA_S 0
+#define VFINT_DYN_CTL0_INTENA_M BIT(0)
+#define VFINT_DYN_CTL0_CLEARPBA_S 1
+#define VFINT_DYN_CTL0_CLEARPBA_M BIT(1)
+#define VFINT_DYN_CTL0_SWINT_TRIG_S 2
+#define VFINT_DYN_CTL0_SWINT_TRIG_M BIT(2)
+#define VFINT_DYN_CTL0_ITR_INDX_S 3
+#define VFINT_DYN_CTL0_ITR_INDX_M ICE_M(0x3, 3)
+#define VFINT_DYN_CTL0_INTERVAL_S 5
+#define VFINT_DYN_CTL0_INTERVAL_M ICE_M(0xFFF, 5)
+#define VFINT_DYN_CTL0_SW_ITR_INDX_ENA_S 24
+#define VFINT_DYN_CTL0_SW_ITR_INDX_ENA_M BIT(24)
+#define VFINT_DYN_CTL0_SW_ITR_INDX_S 25
+#define VFINT_DYN_CTL0_SW_ITR_INDX_M ICE_M(0x3, 25)
+#define VFINT_DYN_CTL0_WB_ON_ITR_S 30
+#define VFINT_DYN_CTL0_WB_ON_ITR_M BIT(30)
+#define VFINT_DYN_CTL0_INTENA_MSK_S 31
+#define VFINT_DYN_CTL0_INTENA_MSK_M BIT(31)
+#define VFINT_DYN_CTLN(_i) (0x00003800 + ((_i) * 4)) /* _i=0...63 */ /* Reset Source: CORER */
+#define VFINT_DYN_CTLN_MAX_INDEX 63
+#define VFINT_DYN_CTLN_INTENA_S 0
+#define VFINT_DYN_CTLN_INTENA_M BIT(0)
+#define VFINT_DYN_CTLN_CLEARPBA_S 1
 #define VFINT_DYN_CTLN_CLEARPBA_M BIT(1)
-#define PRTRPB_RDPC 0x000AC260
+#define VFINT_DYN_CTLN_SWINT_TRIG_S 2
+#define VFINT_DYN_CTLN_SWINT_TRIG_M BIT(2)
+#define VFINT_DYN_CTLN_ITR_INDX_S 3
+#define VFINT_DYN_CTLN_ITR_INDX_M ICE_M(0x3, 3)
+#define VFINT_DYN_CTLN_INTERVAL_S 5
+#define VFINT_DYN_CTLN_INTERVAL_M ICE_M(0xFFF, 5)
+#define VFINT_DYN_CTLN_SW_ITR_INDX_ENA_S 24
+#define VFINT_DYN_CTLN_SW_ITR_INDX_ENA_M BIT(24)
+#define VFINT_DYN_CTLN_SW_ITR_INDX_S 25
+#define VFINT_DYN_CTLN_SW_ITR_INDX_M ICE_M(0x3, 25)
+#define VFINT_DYN_CTLN_WB_ON_ITR_S 30
+#define VFINT_DYN_CTLN_WB_ON_ITR_M BIT(30)
+#define VFINT_DYN_CTLN_INTENA_MSK_S 31
+#define VFINT_DYN_CTLN_INTENA_MSK_M BIT(31)
+#define VFINT_ITR0(_i) (0x00004C00 + ((_i) * 4)) /* _i=0...2 */ /* Reset Source: CORER */
+#define VFINT_ITR0_MAX_INDEX 2
+#define VFINT_ITR0_INTERVAL_S 0
+#define VFINT_ITR0_INTERVAL_M ICE_M(0xFFF, 0)
+#define VFINT_ITRN(_i, _j) (0x00002800 + ((_i) * 4 + (_j) * 12)) /* _i=0...2, _j=0...63
*/ /* Reset Source: CORER */ +#define VFINT_ITRN_MAX_INDEX 2 +#define VFINT_ITRN_INTERVAL_S 0 +#define VFINT_ITRN_INTERVAL_M ICE_M(0xFFF, 0) +#define QRX_TAIL1(_QRX) (0x00002000 + ((_QRX) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define QRX_TAIL1_MAX_INDEX 255 +#define QRX_TAIL1_TAIL_S 0 +#define QRX_TAIL1_TAIL_M ICE_M(0x1FFF, 0) +#define QTX_TAIL(_DBQM) (0x00000000 + ((_DBQM) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define QTX_TAIL_MAX_INDEX 255 +#define QTX_TAIL_QTX_COMM_DBELL_S 0 +#define QTX_TAIL_QTX_COMM_DBELL_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ARQBAH1 0x0000F060 /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQBAH1_ARQBAH_S 0 +#define VF_MBX_CPM_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ARQBAL1 0x0000F050 /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_MBX_CPM_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_CPM_ARQBAL1_ARQBAL_S 6 +#define VF_MBX_CPM_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_CPM_ARQH1 0x0000F080 /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQH1_ARQH_S 0 +#define VF_MBX_CPM_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQLEN1 0x0000F070 /* Reset Source: PFR */ +#define VF_MBX_CPM_ARQLEN1_ARQLEN_S 0 +#define VF_MBX_CPM_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ARQLEN1_ARQVFE_S 28 +#define VF_MBX_CPM_ARQLEN1_ARQVFE_M BIT(28) +#define VF_MBX_CPM_ARQLEN1_ARQOVFL_S 29 +#define VF_MBX_CPM_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_MBX_CPM_ARQLEN1_ARQCRIT_S 30 +#define VF_MBX_CPM_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_MBX_CPM_ARQLEN1_ARQENABLE_S 31 +#define VF_MBX_CPM_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_MBX_CPM_ARQT1 0x0000F090 /* Reset Source: CORER */ +#define VF_MBX_CPM_ARQT1_ARQT_S 0 +#define VF_MBX_CPM_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQBAH1 0x0000F010 /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQBAH1_ATQBAH_S 0 +#define VF_MBX_CPM_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_CPM_ATQBAL1 0x0000F000 /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQBAL1_ATQBAL_S 6 +#define VF_MBX_CPM_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_CPM_ATQH1 0x0000F030 /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQH1_ATQH_S 0 +#define VF_MBX_CPM_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQLEN1 0x0000F020 /* Reset Source: PFR */ +#define VF_MBX_CPM_ATQLEN1_ATQLEN_S 0 +#define VF_MBX_CPM_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_CPM_ATQLEN1_ATQVFE_S 28 +#define VF_MBX_CPM_ATQLEN1_ATQVFE_M BIT(28) +#define VF_MBX_CPM_ATQLEN1_ATQOVFL_S 29 +#define VF_MBX_CPM_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_MBX_CPM_ATQLEN1_ATQCRIT_S 30 +#define VF_MBX_CPM_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_MBX_CPM_ATQLEN1_ATQENABLE_S 31 +#define VF_MBX_CPM_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_MBX_CPM_ATQT1 0x0000F040 /* Reset Source: CORER */ +#define VF_MBX_CPM_ATQT1_ATQT_S 0 +#define VF_MBX_CPM_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQBAH1 0x00020060 /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQBAH1_ARQBAH_S 0 +#define VF_MBX_HLP_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_HLP_ARQBAL1 0x00020050 /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_MBX_HLP_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_HLP_ARQBAL1_ARQBAL_S 6 +#define VF_MBX_HLP_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_HLP_ARQH1 0x00020080 /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQH1_ARQH_S 0 +#define VF_MBX_HLP_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQLEN1 0x00020070 /* Reset Source: PFR */ +#define 
VF_MBX_HLP_ARQLEN1_ARQLEN_S 0 +#define VF_MBX_HLP_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ARQLEN1_ARQVFE_S 28 +#define VF_MBX_HLP_ARQLEN1_ARQVFE_M BIT(28) +#define VF_MBX_HLP_ARQLEN1_ARQOVFL_S 29 +#define VF_MBX_HLP_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_MBX_HLP_ARQLEN1_ARQCRIT_S 30 +#define VF_MBX_HLP_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_MBX_HLP_ARQLEN1_ARQENABLE_S 31 +#define VF_MBX_HLP_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_MBX_HLP_ARQT1 0x00020090 /* Reset Source: CORER */ +#define VF_MBX_HLP_ARQT1_ARQT_S 0 +#define VF_MBX_HLP_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQBAH1 0x00020010 /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQBAH1_ATQBAH_S 0 +#define VF_MBX_HLP_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_HLP_ATQBAL1 0x00020000 /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQBAL1_ATQBAL_S 6 +#define VF_MBX_HLP_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_HLP_ATQH1 0x00020030 /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQH1_ATQH_S 0 +#define VF_MBX_HLP_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQLEN1 0x00020020 /* Reset Source: PFR */ +#define VF_MBX_HLP_ATQLEN1_ATQLEN_S 0 +#define VF_MBX_HLP_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_HLP_ATQLEN1_ATQVFE_S 28 +#define VF_MBX_HLP_ATQLEN1_ATQVFE_M BIT(28) +#define VF_MBX_HLP_ATQLEN1_ATQOVFL_S 29 +#define VF_MBX_HLP_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_MBX_HLP_ATQLEN1_ATQCRIT_S 30 +#define VF_MBX_HLP_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_MBX_HLP_ATQLEN1_ATQENABLE_S 31 +#define VF_MBX_HLP_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_MBX_HLP_ATQT1 0x00020040 /* Reset Source: CORER */ +#define VF_MBX_HLP_ATQT1_ATQT_S 0 +#define VF_MBX_HLP_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQBAH1 0x00021060 /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQBAH1_ARQBAH_S 0 +#define VF_MBX_PSM_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_PSM_ARQBAL1 0x00021050 /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_MBX_PSM_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_MBX_PSM_ARQBAL1_ARQBAL_S 6 +#define VF_MBX_PSM_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_PSM_ARQH1 0x00021080 /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQH1_ARQH_S 0 +#define VF_MBX_PSM_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQLEN1 0x00021070 /* Reset Source: PFR */ +#define VF_MBX_PSM_ARQLEN1_ARQLEN_S 0 +#define VF_MBX_PSM_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ARQLEN1_ARQVFE_S 28 +#define VF_MBX_PSM_ARQLEN1_ARQVFE_M BIT(28) +#define VF_MBX_PSM_ARQLEN1_ARQOVFL_S 29 +#define VF_MBX_PSM_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_MBX_PSM_ARQLEN1_ARQCRIT_S 30 +#define VF_MBX_PSM_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_MBX_PSM_ARQLEN1_ARQENABLE_S 31 +#define VF_MBX_PSM_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_MBX_PSM_ARQT1 0x00021090 /* Reset Source: CORER */ +#define VF_MBX_PSM_ARQT1_ARQT_S 0 +#define VF_MBX_PSM_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQBAH1 0x00021010 /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQBAH1_ATQBAH_S 0 +#define VF_MBX_PSM_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_MBX_PSM_ATQBAL1 0x00021000 /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQBAL1_ATQBAL_S 6 +#define VF_MBX_PSM_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_MBX_PSM_ATQH1 0x00021030 /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQH1_ATQH_S 0 +#define VF_MBX_PSM_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_MBX_PSM_ATQLEN1 0x00021020 /* Reset Source: PFR */ +#define VF_MBX_PSM_ATQLEN1_ATQLEN_S 0 +#define VF_MBX_PSM_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) 
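+/*
+ * Usage sketch for the mailbox queue register sets above: every queue
+ * (the plain VF_MBX_* set and the CPM/HLP/PSM variants alike) is
+ * programmed the same way -- the 64-byte-aligned descriptor base is
+ * split across BAL/BAH, then the ring size is written together with the
+ * ENABLE bit.  Hypothetical helper for the VF mailbox receive queue,
+ * assuming wr32() and a DMA-mapped descriptor ring; handing the full
+ * ring to firmware via the tail write is an assumed convention:
+ */
+static inline void
+ice_vf_mbx_arq_enable(struct ice_hw *hw, u64 ring_dma, u16 num_desc)
+{
+	wr32(hw, VF_MBX_ARQBAL1, lower_32_bits(ring_dma));
+	wr32(hw, VF_MBX_ARQBAH1, upper_32_bits(ring_dma));
+	wr32(hw, VF_MBX_ARQLEN1, (num_desc & VF_MBX_ARQLEN1_ARQLEN_M) |
+				 VF_MBX_ARQLEN1_ARQENABLE_M);
+	wr32(hw, VF_MBX_ARQH1, 0);
+	/* hand all descriptors to firmware by moving the tail */
+	wr32(hw, VF_MBX_ARQT1, num_desc - 1);
+}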
+#define VF_MBX_PSM_ATQLEN1_ATQVFE_S 28 +#define VF_MBX_PSM_ATQLEN1_ATQVFE_M BIT(28) +#define VF_MBX_PSM_ATQLEN1_ATQOVFL_S 29 +#define VF_MBX_PSM_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_MBX_PSM_ATQLEN1_ATQCRIT_S 30 +#define VF_MBX_PSM_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_MBX_PSM_ATQLEN1_ATQENABLE_S 31 +#define VF_MBX_PSM_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_MBX_PSM_ATQT1 0x00021040 /* Reset Source: CORER */ +#define VF_MBX_PSM_ATQT1_ATQT_S 0 +#define VF_MBX_PSM_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQBAH1 0x0000F160 /* Reset Source: CORER */ +#define VF_SB_CPM_ARQBAH1_ARQBAH_S 0 +#define VF_SB_CPM_ARQBAH1_ARQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_SB_CPM_ARQBAL1 0x0000F150 /* Reset Source: CORER */ +#define VF_SB_CPM_ARQBAL1_ARQBAL_LSB_S 0 +#define VF_SB_CPM_ARQBAL1_ARQBAL_LSB_M ICE_M(0x3F, 0) +#define VF_SB_CPM_ARQBAL1_ARQBAL_S 6 +#define VF_SB_CPM_ARQBAL1_ARQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_SB_CPM_ARQH1 0x0000F180 /* Reset Source: CORER */ +#define VF_SB_CPM_ARQH1_ARQH_S 0 +#define VF_SB_CPM_ARQH1_ARQH_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQLEN1 0x0000F170 /* Reset Source: PFR */ +#define VF_SB_CPM_ARQLEN1_ARQLEN_S 0 +#define VF_SB_CPM_ARQLEN1_ARQLEN_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ARQLEN1_ARQVFE_S 28 +#define VF_SB_CPM_ARQLEN1_ARQVFE_M BIT(28) +#define VF_SB_CPM_ARQLEN1_ARQOVFL_S 29 +#define VF_SB_CPM_ARQLEN1_ARQOVFL_M BIT(29) +#define VF_SB_CPM_ARQLEN1_ARQCRIT_S 30 +#define VF_SB_CPM_ARQLEN1_ARQCRIT_M BIT(30) +#define VF_SB_CPM_ARQLEN1_ARQENABLE_S 31 +#define VF_SB_CPM_ARQLEN1_ARQENABLE_M BIT(31) +#define VF_SB_CPM_ARQT1 0x0000F190 /* Reset Source: CORER */ +#define VF_SB_CPM_ARQT1_ARQT_S 0 +#define VF_SB_CPM_ARQT1_ARQT_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQBAH1 0x0000F110 /* Reset Source: CORER */ +#define VF_SB_CPM_ATQBAH1_ATQBAH_S 0 +#define VF_SB_CPM_ATQBAH1_ATQBAH_M ICE_M(0xFFFFFFFF, 0) +#define VF_SB_CPM_ATQBAL1 0x0000F100 /* Reset Source: CORER */ +#define VF_SB_CPM_ATQBAL1_ATQBAL_S 6 +#define VF_SB_CPM_ATQBAL1_ATQBAL_M ICE_M(0x3FFFFFF, 6) +#define VF_SB_CPM_ATQH1 0x0000F130 /* Reset Source: CORER */ +#define VF_SB_CPM_ATQH1_ATQH_S 0 +#define VF_SB_CPM_ATQH1_ATQH_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQLEN1 0x0000F120 /* Reset Source: PFR */ +#define VF_SB_CPM_ATQLEN1_ATQLEN_S 0 +#define VF_SB_CPM_ATQLEN1_ATQLEN_M ICE_M(0x3FF, 0) +#define VF_SB_CPM_ATQLEN1_ATQVFE_S 28 +#define VF_SB_CPM_ATQLEN1_ATQVFE_M BIT(28) +#define VF_SB_CPM_ATQLEN1_ATQOVFL_S 29 +#define VF_SB_CPM_ATQLEN1_ATQOVFL_M BIT(29) +#define VF_SB_CPM_ATQLEN1_ATQCRIT_S 30 +#define VF_SB_CPM_ATQLEN1_ATQCRIT_M BIT(30) +#define VF_SB_CPM_ATQLEN1_ATQENABLE_S 31 +#define VF_SB_CPM_ATQLEN1_ATQENABLE_M BIT(31) +#define VF_SB_CPM_ATQT1 0x0000F140 /* Reset Source: CORER */ +#define VF_SB_CPM_ATQT1_ATQT_S 0 +#define VF_SB_CPM_ATQT1_ATQT_M ICE_M(0x3FF, 0) +#define VFINT_DYN_CTL(_i) (0x00023000 + ((_i) * 4096)) /* _i=0...7 */ /* Reset Source: CORER */ +#define VFINT_DYN_CTL_MAX_INDEX 7 +#define VFINT_DYN_CTL_INTENA_S 0 +#define VFINT_DYN_CTL_INTENA_M BIT(0) +#define VFINT_DYN_CTL_CLEARPBA_S 1 +#define VFINT_DYN_CTL_CLEARPBA_M BIT(1) +#define VFINT_DYN_CTL_SWINT_TRIG_S 2 +#define VFINT_DYN_CTL_SWINT_TRIG_M BIT(2) +#define VFINT_DYN_CTL_ITR_INDX_S 3 +#define VFINT_DYN_CTL_ITR_INDX_M ICE_M(0x3, 3) +#define VFINT_DYN_CTL_INTERVAL_S 5 +#define VFINT_DYN_CTL_INTERVAL_M ICE_M(0xFFF, 5) +#define VFINT_DYN_CTL_SW_ITR_INDX_ENA_S 24 +#define VFINT_DYN_CTL_SW_ITR_INDX_ENA_M BIT(24) +#define VFINT_DYN_CTL_SW_ITR_INDX_S 25 +#define VFINT_DYN_CTL_SW_ITR_INDX_M ICE_M(0x3, 25) +#define VFINT_DYN_CTL_WB_ON_ITR_S 30 
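+/*
+ * Usage sketch for the dynamic interrupt control registers above: a
+ * vector is re-enabled by setting INTENA together with CLEARPBA, with
+ * the ITR index field selecting which throttle timer applies (an index
+ * of 3 conventionally means "no ITR" -- an assumption here).
+ * Hypothetical helper, assuming wr32():
+ */
+static inline void ice_vf_irq_enable(struct ice_hw *hw, int vector)
+{
+	u32 val = VFINT_DYN_CTL_INTENA_M | VFINT_DYN_CTL_CLEARPBA_M |
+		  (3 << VFINT_DYN_CTL_ITR_INDX_S);
+
+	wr32(hw, VFINT_DYN_CTL(vector), val);
+}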
+#define VFINT_DYN_CTL_WB_ON_ITR_M BIT(30) +#define VFINT_DYN_CTL_INTENA_MSK_S 31 +#define VFINT_DYN_CTL_INTENA_MSK_M BIT(31) +#define VFINT_ITR_0(_i) (0x00023004 + ((_i) * 4096)) /* _i=0...7 */ /* Reset Source: CORER */ +#define VFINT_ITR_0_MAX_INDEX 7 +#define VFINT_ITR_0_INTERVAL_S 0 +#define VFINT_ITR_0_INTERVAL_M ICE_M(0xFFF, 0) +#define VFINT_ITR_1(_i) (0x00023008 + ((_i) * 4096)) /* _i=0...7 */ /* Reset Source: CORER */ +#define VFINT_ITR_1_MAX_INDEX 7 +#define VFINT_ITR_1_INTERVAL_S 0 +#define VFINT_ITR_1_INTERVAL_M ICE_M(0xFFF, 0) +#define VFINT_ITR_2(_i) (0x0002300C + ((_i) * 4096)) /* _i=0...7 */ /* Reset Source: CORER */ +#define VFINT_ITR_2_MAX_INDEX 7 +#define VFINT_ITR_2_INTERVAL_S 0 +#define VFINT_ITR_2_INTERVAL_M ICE_M(0xFFF, 0) +#define VFQRX_TAIL(_QRX) (0x0002E000 + ((_QRX) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VFQRX_TAIL_MAX_INDEX 255 +#define VFQRX_TAIL_TAIL_S 0 +#define VFQRX_TAIL_TAIL_M ICE_M(0x1FFF, 0) +#define VFQTX_COMM_DBELL(_DBQM) (0x00030000 + ((_DBQM) * 4)) /* _i=0...255 */ /* Reset Source: CORER */ +#define VFQTX_COMM_DBELL_MAX_INDEX 255 +#define VFQTX_COMM_DBELL_QTX_COMM_DBELL_S 0 +#define VFQTX_COMM_DBELL_QTX_COMM_DBELL_M ICE_M(0xFFFFFFFF, 0) +#define VFQTX_COMM_DBLQ_DBELL(_DBLQ) (0x00022000 + ((_DBLQ) * 4)) /* _i=0...3 */ /* Reset Source: CORER */ +#define VFQTX_COMM_DBLQ_DBELL_MAX_INDEX 3 +#define VFQTX_COMM_DBLQ_DBELL_TAIL_S 0 +#define VFQTX_COMM_DBLQ_DBELL_TAIL_M ICE_M(0x1FFF, 0) +#define MSIX_TMSG1(_i) (0x00000008 + ((_i) * 16)) /* _i=0...64 */ /* Reset Source: FLR */ +#define MSIX_TMSG1_MAX_INDEX 64 +#define MSIX_TMSG1_MSIXTMSG_S 0 +#define MSIX_TMSG1_MSIXTMSG_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_AEQALLOC1 0x0000A400 /* Reset Source: VFR */ +#define VFPE_AEQALLOC1_AECOUNT_S 0 +#define VFPE_AEQALLOC1_AECOUNT_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPHIGH1 0x00009800 /* Reset Source: VFR */ +#define VFPE_CCQPHIGH1_PECCQPHIGH_S 0 +#define VFPE_CCQPHIGH1_PECCQPHIGH_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPLOW1 0x0000AC00 /* Reset Source: VFR */ +#define VFPE_CCQPLOW1_PECCQPLOW_S 0 +#define VFPE_CCQPLOW1_PECCQPLOW_M ICE_M(0xFFFFFFFF, 0) +#define VFPE_CCQPSTATUS1 0x0000B800 /* Reset Source: VFR */ +#define VFPE_CCQPSTATUS1_CCQP_DONE_S 0 +#define VFPE_CCQPSTATUS1_CCQP_DONE_M BIT(0) +#define VFPE_CCQPSTATUS1_HMC_PROFILE_S 4 +#define VFPE_CCQPSTATUS1_HMC_PROFILE_M ICE_M(0x7, 4) +#define VFPE_CCQPSTATUS1_RDMA_EN_VFS_S 16 +#define VFPE_CCQPSTATUS1_RDMA_EN_VFS_M ICE_M(0x3F, 16) +#define VFPE_CCQPSTATUS1_CCQP_ERR_S 31 +#define VFPE_CCQPSTATUS1_CCQP_ERR_M BIT(31) +#define VFPE_CQACK1 0x0000B000 /* Reset Source: VFR */ +#define VFPE_CQACK1_PECQID_S 0 +#define VFPE_CQACK1_PECQID_M ICE_M(0x7FFFF, 0) +#define VFPE_CQARM1 0x0000B400 /* Reset Source: VFR */ +#define VFPE_CQARM1_PECQID_S 0 +#define VFPE_CQARM1_PECQID_M ICE_M(0x7FFFF, 0) +#define VFPE_CQPDB1 0x0000BC00 /* Reset Source: VFR */ +#define VFPE_CQPDB1_WQHEAD_S 0 +#define VFPE_CQPDB1_WQHEAD_M ICE_M(0x7FF, 0) +#define VFPE_CQPERRCODES1 0x00009C00 /* Reset Source: VFR */ +#define VFPE_CQPERRCODES1_CQP_MINOR_CODE_S 0 +#define VFPE_CQPERRCODES1_CQP_MINOR_CODE_M ICE_M(0xFFFF, 0) +#define VFPE_CQPERRCODES1_CQP_MAJOR_CODE_S 16 +#define VFPE_CQPERRCODES1_CQP_MAJOR_CODE_M ICE_M(0xFFFF, 16) +#define VFPE_CQPTAIL1 0x0000A000 /* Reset Source: VFR */ +#define VFPE_CQPTAIL1_WQTAIL_S 0 +#define VFPE_CQPTAIL1_WQTAIL_M ICE_M(0x7FF, 0) +#define VFPE_CQPTAIL1_CQP_OP_ERR_S 31 +#define VFPE_CQPTAIL1_CQP_OP_ERR_M BIT(31) +#define VFPE_IPCONFIG01 0x00008C00 /* Reset Source: VFR */ +#define VFPE_IPCONFIG01_PEIPID_S 0 
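+/*
+ * Usage sketch for the queue doorbells above: posting receive buffers is
+ * a single 32-bit tail write, and the tail must stay within the 13-bit
+ * TAIL field.  Hypothetical helper, assuming wr32():
+ */
+static inline void ice_vf_bump_rx_tail(struct ice_hw *hw, u16 q, u16 tail)
+{
+	wr32(hw, VFQRX_TAIL(q), tail & VFQRX_TAIL_TAIL_M);
+}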
+#define VFPE_IPCONFIG01_PEIPID_M ICE_M(0xFFFF, 0)
+#define VFPE_IPCONFIG01_USEENTIREIDRANGE_S 16
+#define VFPE_IPCONFIG01_USEENTIREIDRANGE_M BIT(16)
+#define VFPE_IPCONFIG01_UDP_SRC_PORT_MASK_EN_S 17
+#define VFPE_IPCONFIG01_UDP_SRC_PORT_MASK_EN_M BIT(17)
+#define VFPE_MRTEIDXMASK1(_VF) (0x00509800 + ((_VF) * 4)) /* _i=0...255 */ /* Reset Source: PFR */
+#define VFPE_MRTEIDXMASK1_MAX_INDEX 255
+#define VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_S 0
+#define VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_M ICE_M(0x1F, 0)
+#define VFPE_RCVUNEXPECTEDERROR1 0x00009400 /* Reset Source: VFR */
+#define VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_S 0
+#define VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_M ICE_M(0xFFFFFF, 0)
+#define VFPE_TCPNOWTIMER1 0x0000A800 /* Reset Source: VFR */
+#define VFPE_TCPNOWTIMER1_TCP_NOW_S 0
+#define VFPE_TCPNOWTIMER1_TCP_NOW_M ICE_M(0xFFFFFFFF, 0)
+#define VFPE_WQEALLOC1 0x0000C000 /* Reset Source: VFR */
+#define VFPE_WQEALLOC1_PEQPID_S 0
+#define VFPE_WQEALLOC1_PEQPID_M ICE_M(0x3FFFF, 0)
+#define VFPE_WQEALLOC1_WQE_DESC_INDEX_S 20
+#define VFPE_WQEALLOC1_WQE_DESC_INDEX_M ICE_M(0xFFF, 20)
-#endif /* _ICE_HW_AUTOGEN_H_ */
+#endif /* _ICE_HW_AUTOGEN_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c
new file mode 100644
index 0000000000000000000000000000000000000000..512bca2f98d0c84d5fcf36614c0e3f7437591250
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_idc.c
@@ -0,0 +1,1488 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+/* Inter-Driver Communication */
+#include "ice.h"
+#include "ice_lib.h"
+#include "ice_fltr.h"
+#include "ice_dcb_lib.h"
+#include "ice_ptp.h"
+
+DEFINE_IDA(ice_peer_index_ida);
+
+static struct mfd_cell ice_mfd_cells[] = ASSIGN_PEER_INFO;
+
+/**
+ * ice_is_vsi_state_nominal - check if VSI state is nominal
+ * @vsi: pointer to the VSI struct
+ *
+ * Returns true if VSI state is nominal, false otherwise
+ */
+static bool ice_is_vsi_state_nominal(struct ice_vsi *vsi)
+{
+	if (!vsi)
+		return false;
+
+	if (test_bit(ICE_VSI_DOWN, vsi->state) ||
+	    test_bit(ICE_VSI_NEEDS_RESTART, vsi->state))
+		return false;
+
+	return true;
+}
+
+/**
+ * ice_peer_state_change - manage state machine for peer
+ * @peer_obj: pointer to peer's configuration
+ * @new_state: the state requested to transition into
+ * @locked: boolean to determine if call made with mutex held
+ *
+ * This function handles all state transitions for peer objects.
+ *
+ * The state machine is as follows:
+ *
+ *	+<-----------------------+<-----------------------------+
+ *	|<-------+<----------+   +                              +
+ *	\/       +           +   +                              +
+ *	INIT --------------> PROBED --> OPENING        CLOSED --> REMOVED
+ *	                       +          +
+ *	                     OPENED --> CLOSING
+ *	                       +          +
+ *	                    PREP_RST      +
+ *	                       +          +
+ *	                     PREPPED      +
+ *	                       +--------->+
+ *
+ * NOTE: there is an error condition that can take a peer from OPENING
+ * to REMOVED.
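+ *
+ * A minimal sketch of the locking convention callers follow (based on
+ * ice_peer_close(); "locked" true means the caller already holds
+ * peer_obj_state_mutex):
+ *
+ *	mutex_lock(&peer_obj->peer_obj_state_mutex);
+ *	ice_peer_state_change(peer_obj, ICE_PEER_OBJ_STATE_CLOSING, true);
+ *	...
+ *	mutex_unlock(&peer_obj->peer_obj_state_mutex);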
+ */ +static void +ice_peer_state_change(struct ice_peer_obj_int *peer_obj, long new_state, + bool locked) +{ + struct device *dev; + + dev = bus_find_device_by_name(&platform_bus_type, NULL, + peer_obj->plat_name); + + if (!locked) + mutex_lock(&peer_obj->peer_obj_state_mutex); + + switch (new_state) { + case ICE_PEER_OBJ_STATE_INIT: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_REMOVED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_INIT, peer_obj->state); + dev_dbg(dev, "state transition from _REMOVED to _INIT\n"); + } else { + set_bit(ICE_PEER_OBJ_STATE_INIT, peer_obj->state); + if (dev) + dev_dbg(dev, "state set to _INIT\n"); + } + break; + case ICE_PEER_OBJ_STATE_PROBED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_INIT, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj->state); + dev_dbg(dev, "state transition from _INIT to _PROBED\n"); + } else if (test_and_clear_bit(ICE_PEER_OBJ_STATE_REMOVED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj->state); + dev_dbg(dev, "state transition from _REMOVED to _PROBED\n"); + } else if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENING, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj->state); + dev_dbg(dev, "state transition from _OPENING to _PROBED\n"); + } + break; + case ICE_PEER_OBJ_STATE_OPENING: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_PROBED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_OPENING, peer_obj->state); + dev_dbg(dev, "state transition from _PROBED to _OPENING\n"); + } else if (test_and_clear_bit(ICE_PEER_OBJ_STATE_CLOSED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_OPENING, peer_obj->state); + dev_dbg(dev, "state transition from _CLOSED to _OPENING\n"); + } + break; + case ICE_PEER_OBJ_STATE_OPENED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENING, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj->state); + dev_dbg(dev, "state transition from _OPENING to _OPENED\n"); + } + break; + case ICE_PEER_OBJ_STATE_PREP_RST: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PREP_RST, peer_obj->state); + dev_dbg(dev, "state transition from _OPENED to _PREP_RST\n"); + } + break; + case ICE_PEER_OBJ_STATE_PREPPED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_PREP_RST, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj->state); + dev_dbg(dev, "state transition _PREP_RST to _PREPPED\n"); + } + break; + case ICE_PEER_OBJ_STATE_CLOSING: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_CLOSING, peer_obj->state); + dev_dbg(dev, "state transition from _OPENED to _CLOSING\n"); + } + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_PREPPED, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_CLOSING, peer_obj->state); + dev_dbg(dev, "state transition _PREPPED to _CLOSING\n"); + } + /* NOTE - up to peer to handle this situation correctly */ + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_PREP_RST, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_CLOSING, peer_obj->state); + dev_warn(dev, + "WARN: Peer state _PREP_RST to _CLOSING\n"); + } + break; + case ICE_PEER_OBJ_STATE_CLOSED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_CLOSING, + peer_obj->state)) { + set_bit(ICE_PEER_OBJ_STATE_CLOSED, peer_obj->state); + dev_dbg(dev, "state transition from _CLOSING to _CLOSED\n"); + } + break; + case ICE_PEER_OBJ_STATE_REMOVED: + if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENED, + peer_obj->state) || + test_and_clear_bit(ICE_PEER_OBJ_STATE_CLOSED, + 
peer_obj->state)) {
+			set_bit(ICE_PEER_OBJ_STATE_REMOVED, peer_obj->state);
+			dev_dbg(dev, "state from _OPENED/_CLOSED to _REMOVED\n");
+			/* Clear registration for events when peer removed */
+			bitmap_zero(peer_obj->events, ICE_EVENT_NBITS);
+		}
+		if (test_and_clear_bit(ICE_PEER_OBJ_STATE_OPENING,
+				       peer_obj->state)) {
+			set_bit(ICE_PEER_OBJ_STATE_REMOVED, peer_obj->state);
+			dev_warn(dev, "Peer failed to open, set to _REMOVED\n");
+		}
+		break;
+	default:
+		break;
+	}
+
+	if (!locked)
+		mutex_unlock(&peer_obj->peer_obj_state_mutex);
+
+	put_device(dev);
+}
+
+/**
+ * ice_peer_close - close a peer object
+ * @peer_obj_int: peer object to close
+ * @data: pointer to opaque data
+ *
+ * This function will also set the state bit for the peer to CLOSED. This
+ * function is meant to be called from ice_for_each_peer().
+ */
+int ice_peer_close(struct ice_peer_obj_int *peer_obj_int, void *data)
+{
+	enum ice_close_reason reason = *(enum ice_close_reason *)(data);
+	struct ice_peer_obj *peer_obj;
+	struct ice_pf *pf;
+	int i;
+
+	peer_obj = ice_get_peer_obj(peer_obj_int);
+	/* return 0 so ice_for_each_peer will continue closing other peers */
+	if (!ice_validate_peer_obj(peer_obj))
+		return 0;
+	pf = pci_get_drvdata(peer_obj->pdev);
+
+	if (test_bit(ICE_DOWN, pf->state) ||
+	    test_bit(ICE_SUSPENDED, pf->state) ||
+	    test_bit(ICE_NEEDS_RESTART, pf->state))
+		return 0;
+
+	mutex_lock(&peer_obj_int->peer_obj_state_mutex);
+
+	/* no peer driver, already closed, closing, or opening: nothing to do */
+	if (test_bit(ICE_PEER_OBJ_STATE_CLOSED, peer_obj_int->state) ||
+	    test_bit(ICE_PEER_OBJ_STATE_CLOSING, peer_obj_int->state) ||
+	    test_bit(ICE_PEER_OBJ_STATE_OPENING, peer_obj_int->state) ||
+	    test_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj_int->state) ||
+	    test_bit(ICE_PEER_OBJ_STATE_REMOVED, peer_obj_int->state))
+		goto peer_close_out;
+
+	/* Set the peer state to CLOSING */
+	ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSING, true);
+
+	for (i = 0; i < ICE_EVENT_NBITS; i++)
+		bitmap_zero(peer_obj_int->current_events[i].type,
+			    ICE_EVENT_NBITS);
+
+	if (peer_obj->peer_ops && peer_obj->peer_ops->close)
+		peer_obj->peer_ops->close(peer_obj, reason);
+
+	/* Set the peer state to CLOSED */
+	ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSED, true);
+
+peer_close_out:
+	mutex_unlock(&peer_obj_int->peer_obj_state_mutex);
+
+	return 0;
+}
+
+/**
+ * ice_close_peer_for_reset - queue work to close peer for reset
+ * @peer_obj_int: pointer to peer object internal struct
+ * @data: pointer to opaque data used for reset type
+ */
+int ice_close_peer_for_reset(struct ice_peer_obj_int *peer_obj_int, void *data)
+{
+	struct ice_peer_obj *peer_obj;
+	enum ice_reset_req reset;
+
+	peer_obj = ice_get_peer_obj(peer_obj_int);
+	if (!ice_validate_peer_obj(peer_obj) ||
+	    (!test_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj_int->state) &&
+	     !test_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj_int->state)))
+		return 0;
+
+	reset = *(enum ice_reset_req *)data;
+
+	switch (reset) {
+	case ICE_RESET_EMPR:
+		peer_obj_int->rst_type = ICE_REASON_EMPR_REQ;
+		break;
+	case ICE_RESET_GLOBR:
+		peer_obj_int->rst_type = ICE_REASON_GLOBR_REQ;
+		break;
+	case ICE_RESET_CORER:
+		peer_obj_int->rst_type = ICE_REASON_CORER_REQ;
+		break;
+	case ICE_RESET_PFR:
+		peer_obj_int->rst_type = ICE_REASON_PFR_REQ;
+		break;
+	default:
+		/* reset type is invalid */
+		return 1;
+	}
+	queue_work(peer_obj_int->ice_peer_wq, &peer_obj_int->peer_close_task);
+	return 0;
+}
+
+/**
+ * ice_check_peer_drv_for_events - check peer_drv for events to report
+ * @peer_obj: peer object to report to + */ +static void ice_check_peer_drv_for_events(struct ice_peer_obj *peer_obj) +{ + const struct ice_peer_ops *p_ops = peer_obj->peer_ops; + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_drv_int *peer_drv_int; + int i; + + peer_obj_int = peer_to_ice_obj_int(peer_obj); + if (!peer_obj_int) + return; + peer_drv_int = peer_obj_int->peer_drv_int; + + for_each_set_bit(i, peer_obj_int->events, ICE_EVENT_NBITS) { + struct ice_event *curr = &peer_drv_int->current_events[i]; + + if (!bitmap_empty(curr->type, ICE_EVENT_NBITS) && + p_ops->event_handler) + p_ops->event_handler(peer_obj, curr); + } +} + +/** + * ice_check_peer_for_events - check peer_objs for events new peer reg'd for + * @src_peer_int: peer to check for events + * @data: ptr to opaque data, to be used for the peer struct that opened + * + * This function is to be called when a peer object is opened. + * + * Since a new peer opening would have missed any events that would + * have happened before its opening, we need to walk the peers and see + * if any of them have events that the new peer cares about + * + * This function is meant to be called by a ice_for_each_peer. + */ +static int +ice_check_peer_for_events(struct ice_peer_obj_int *src_peer_int, void *data) +{ + struct ice_peer_obj *new_peer = (struct ice_peer_obj *)data; + const struct ice_peer_ops *p_ops = new_peer->peer_ops; + struct ice_peer_obj_int *new_peer_int; + struct ice_peer_obj *src_peer; + unsigned long i; + + src_peer = ice_get_peer_obj(src_peer_int); + if (!ice_validate_peer_obj(new_peer) || + !ice_validate_peer_obj(src_peer)) + return 0; + + new_peer_int = peer_to_ice_obj_int(new_peer); + + for_each_set_bit(i, new_peer_int->events, ICE_EVENT_NBITS) { + struct ice_event *curr = &src_peer_int->current_events[i]; + + if (!bitmap_empty(curr->type, ICE_EVENT_NBITS) && + new_peer->peer_obj_id != src_peer->peer_obj_id && + p_ops->event_handler) + p_ops->event_handler(new_peer, curr); + } + + return 0; +} + +/** + * ice_for_each_peer - iterate across and call function for each peer obj + * @pf: pointer to private board struct + * @data: data to pass to function on each call + * @fn: pointer to function to call for each peer + */ +int +ice_for_each_peer(struct ice_pf *pf, void *data, + int (*fn)(struct ice_peer_obj_int *, void *)) +{ + unsigned int i; + + if (!pf->peers) + return 0; + + for (i = 0; i < ARRAY_SIZE(ice_mfd_cells); i++) { + struct ice_peer_obj_int *peer_obj_int; + + peer_obj_int = pf->peers[i]; + if (peer_obj_int) { + int ret = fn(peer_obj_int, data); + + if (ret) + return ret; + } + } + + return 0; +} + +/** + * ice_finish_init_peer_obj - complete peer object initialization + * @peer_obj_int: ptr to peer object internal struct + * @data: ptr to opaque data + * + * This function completes remaining initialization of peer objects + */ +int +ice_finish_init_peer_obj(struct ice_peer_obj_int *peer_obj_int, + void __always_unused *data) +{ + struct ice_peer_obj *peer_obj; + struct ice_peer_drv *peer_drv; + struct device *dev; + struct ice_pf *pf; + int ret = 0; + + peer_obj = ice_get_peer_obj(peer_obj_int); + /* peer_obj will not always be populated at the time of this check */ + if (!ice_validate_peer_obj(peer_obj)) + return ret; + + peer_drv = peer_obj->peer_drv; + pf = pci_get_drvdata(peer_obj->pdev); + dev = ice_pf_to_dev(pf); + /* There will be several assessments of the peer_obj's state in this + * chunk of logic. 
We need to hold the peer_obj_int's state mutex + * for the entire part so that the flow progresses without another + * context changing things mid-flow + */ + mutex_lock(&peer_obj_int->peer_obj_state_mutex); + + if (!peer_obj->peer_ops) { + dev_err(dev, "peer_ops not defined in peer obj\n"); + goto init_unlock; + } + + if (!peer_obj->peer_ops->open) { + dev_err(dev, "peer_ops:open not defined in peer obj\n"); + goto init_unlock; + } + + if (!peer_obj->peer_ops->close) { + dev_err(dev, "peer_ops:close not defined in peer obj\n"); + goto init_unlock; + } + + /* Peer driver expected to set driver_id during registration */ + if (!peer_drv->driver_id) { + dev_err(dev, "Peer driver did not set driver_id\n"); + goto init_unlock; + } + + if ((test_bit(ICE_PEER_OBJ_STATE_CLOSED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PROBED, peer_obj_int->state)) && + ice_pf_state_is_nominal(pf)) { + /* If the RTNL is locked, we defer opening the peer + * until the next time this function is called by the + * service task. + */ + if (rtnl_is_locked()) + goto init_unlock; + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_OPENING, + true); + ret = peer_obj->peer_ops->open(peer_obj); + if (ret == -EAGAIN) { + dev_err(dev, "Peer %d failed to open\n", + peer_obj->peer_obj_id); + ice_peer_state_change(peer_obj_int, + ICE_PEER_OBJ_STATE_PROBED, true); + goto init_unlock; + } else if (ret) { + ice_peer_state_change(peer_obj_int, + ICE_PEER_OBJ_STATE_REMOVED, true); + peer_obj->peer_ops = NULL; + goto init_unlock; + } + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_OPENED, + true); + ret = ice_for_each_peer(pf, peer_obj, + ice_check_peer_for_events); + ice_check_peer_drv_for_events(peer_obj); + } + + if (test_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj_int->state)) { + enum ice_close_reason reason = ICE_REASON_CORER_REQ; + int i; + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSING, + true); + for (i = 0; i < ICE_EVENT_NBITS; i++) + bitmap_zero(peer_obj_int->current_events[i].type, + ICE_EVENT_NBITS); + + peer_obj->peer_ops->close(peer_obj, reason); + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSED, + true); + } + +init_unlock: + mutex_unlock(&peer_obj_int->peer_obj_state_mutex); + + return ret; +} + +/** + * ice_unreg_peer_obj - unregister specified peer object + * @peer_obj_int: ptr to peer object internal + * @data: ptr to opaque data + * + * This function invokes object unregistration, removes ID associated with + * the specified object. 
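+ *
+ * A minimal sketch of the expected call site (assuming a driver-teardown
+ * path with a valid pf; @data is unused by this callback):
+ *
+ *	ice_for_each_peer(pf, NULL, ice_unreg_peer_obj);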
+ */ +int ice_unreg_peer_obj(struct ice_peer_obj_int *peer_obj_int, void __always_unused *data) +{ + struct ice_peer_drv_int *peer_drv_int; + struct ice_peer_obj *peer_obj; + struct pci_dev *pdev; + struct device *dev; + struct ice_pf *pf; + + if (!peer_obj_int) + return 0; + + peer_obj = ice_get_peer_obj(peer_obj_int); + pdev = peer_obj->pdev; + if (!pdev) + return 0; + + pf = pci_get_drvdata(pdev); + if (!pf) + return 0; + dev = ice_pf_to_dev(pf); + + mfd_remove_devices(&pdev->dev); + + peer_drv_int = peer_obj_int->peer_drv_int; + + if (peer_obj_int->ice_peer_wq) { + if (peer_obj_int->peer_prep_task.func) + cancel_work_sync(&peer_obj_int->peer_prep_task); + + if (peer_obj_int->peer_close_task.func) + cancel_work_sync(&peer_obj_int->peer_close_task); + destroy_workqueue(peer_obj_int->ice_peer_wq); + } + + devm_kfree(dev, peer_drv_int); + + devm_kfree(dev, peer_obj_int); + + return 0; +} + +/** + * ice_unroll_peer - destroy peers and peer_wq in case of error + * @peer_obj_int: ptr to peer object internal struct + * @data: ptr to opaque data + * + * This function releases resources in the event of a failure in creating + * peer objects or their individual work_queues. Meant to be called from + * a ice_for_each_peer invocation + */ +int ice_unroll_peer(struct ice_peer_obj_int *peer_obj_int, void __always_unused *data) +{ + struct ice_peer_obj *peer_obj; + struct ice_pf *pf; + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!peer_obj || !peer_obj->pdev) + return 0; + + pf = pci_get_drvdata(peer_obj->pdev); + if (!pf) + return 0; + + if (peer_obj_int->ice_peer_wq) + destroy_workqueue(peer_obj_int->ice_peer_wq); + + if (peer_obj_int->peer_drv_int) + devm_kfree(ice_pf_to_dev(pf), peer_obj_int->peer_drv_int); + + devm_kfree(ice_pf_to_dev(pf), peer_obj_int); + + return 0; +} + +#ifdef CONFIG_PM +/** + * ice_peer_refresh_msix - load new values into ice_peer_obj structs + * @pf: pointer to private board struct + */ +void ice_peer_refresh_msix(struct ice_pf *pf) +{ + struct ice_peer_obj *peer; + unsigned int i; + + if (!pf->peers) + return; + + for (i = 0; i < ARRAY_SIZE(ice_mfd_cells); i++) { + if (!pf->peers[i]) + continue; + + peer = ice_get_peer_obj(pf->peers[i]); + if (!peer) + continue; + + switch (peer->peer_obj_id) { + case ICE_PEER_RDMA_ID: + peer->msix_count = pf->num_rdma_msix; + peer->msix_entries = &pf->msix_entries[pf->rdma_base_vector]; + break; + default: + break; + } + } +} + +#endif /* CONFIG_PM */ +/** + * ice_find_vsi - Find the VSI from VSI ID + * @pf: The PF pointer to search in + * @vsi_num: The VSI ID to search for + */ +static struct ice_vsi *ice_find_vsi(struct ice_pf *pf, u16 vsi_num) +{ + int i; + + ice_for_each_vsi(pf, i) + if (pf->vsi[i] && pf->vsi[i]->vsi_num == vsi_num) + return pf->vsi[i]; + return NULL; +} + +/** + * ice_peer_alloc_rdma_qsets - Allocate Leaf Nodes for RDMA Qset + * @peer_obj: peer that is requesting the Leaf Nodes + * @res: Resources to be allocated + * @partial_acceptable: If partial allocation is acceptable to the peer + * + * This function allocates Leaf Nodes for given RDMA Qset resources + * for the peer object. 
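+ *
+ * A sketch of the request a peer might build before calling alloc_res
+ * (single qset; the qs_handle and TC values are illustrative only):
+ *
+ *	struct ice_res res = { 0 };
+ *
+ *	res.res_type = ICE_RDMA_QSETS_TXSCHED;
+ *	res.cnt_req = 1;
+ *	res.res[0].res.qsets.vsi_id = peer_obj->pf_vsi_num;
+ *	res.res[0].res.qsets.qs_handle = 0;
+ *	res.res[0].res.qsets.tc = 0;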
+ */ +static int +ice_peer_alloc_rdma_qsets(struct ice_peer_obj *peer_obj, struct ice_res *res, + int __always_unused partial_acceptable) +{ + u16 max_rdmaqs[ICE_MAX_TRAFFIC_CLASS]; + enum ice_status status; + struct ice_vsi *vsi; + struct device *dev; + struct ice_pf *pf; + int i, ret = 0; + u32 *qset_teid; + u16 *qs_handle; + + if (!ice_validate_peer_obj(peer_obj) || !res) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) + return -EINVAL; + + if (res->cnt_req > ICE_MAX_TXQ_PER_TXQG) + return -EINVAL; + + qset_teid = kcalloc(res->cnt_req, sizeof(*qset_teid), GFP_KERNEL); + if (!qset_teid) + return -ENOMEM; + + qs_handle = kcalloc(res->cnt_req, sizeof(*qs_handle), GFP_KERNEL); + if (!qs_handle) { + kfree(qset_teid); + return -ENOMEM; + } + + ice_for_each_traffic_class(i) + max_rdmaqs[i] = 0; + + for (i = 0; i < res->cnt_req; i++) { + struct ice_rdma_qset_params *qset; + + qset = &res->res[i].res.qsets; + if (qset->vsi_id != peer_obj->pf_vsi_num) { + dev_err(dev, "RDMA QSet invalid VSI requested\n"); + ret = -EINVAL; + goto out; + } + max_rdmaqs[qset->tc]++; + qs_handle[i] = qset->qs_handle; + } + + vsi = ice_find_vsi(pf, peer_obj->pf_vsi_num); + if (!vsi) { + dev_err(dev, "RDMA QSet invalid VSI\n"); + ret = -EINVAL; + goto out; + } + + status = ice_cfg_vsi_rdma(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, + max_rdmaqs); + if (status) { + dev_err(dev, "Failed VSI RDMA qset config\n"); + ret = -EINVAL; + goto out; + } + + for (i = 0; i < res->cnt_req; i++) { + struct ice_rdma_qset_params *qset; + + qset = &res->res[i].res.qsets; + status = ice_ena_vsi_rdma_qset(vsi->port_info, vsi->idx, + qset->tc, &qs_handle[i], 1, + &qset_teid[i]); + if (status) { + dev_err(dev, "Failed VSI RDMA qset enable\n"); + ret = -EINVAL; + goto out; + } + vsi->qset_handle[qset->tc] = qset->qs_handle; + qset->teid = qset_teid[i]; + } + +out: + kfree(qset_teid); + kfree(qs_handle); + return ret; +} + +/** + * ice_peer_free_rdma_qsets - Free leaf nodes for RDMA Qset + * @peer_obj: peer that requested qsets to be freed + * @res: Resource to be freed + */ +static int +ice_peer_free_rdma_qsets(struct ice_peer_obj *peer_obj, struct ice_res *res) +{ + enum ice_status status; + int count, i, ret = 0; + struct ice_vsi *vsi; + struct device *dev; + struct ice_pf *pf; + u16 vsi_id; + u32 *teid; + u16 *q_id; + + if (!ice_validate_peer_obj(peer_obj) || !res) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + dev = ice_pf_to_dev(pf); + + count = res->res_allocated; + if (count > ICE_MAX_TXQ_PER_TXQG) + return -EINVAL; + + teid = kcalloc(count, sizeof(*teid), GFP_KERNEL); + if (!teid) + return -ENOMEM; + + q_id = kcalloc(count, sizeof(*q_id), GFP_KERNEL); + if (!q_id) { + kfree(teid); + return -ENOMEM; + } + + vsi_id = res->res[0].res.qsets.vsi_id; + vsi = ice_find_vsi(pf, vsi_id); + if (!vsi) { + dev_err(dev, "RDMA Invalid VSI\n"); + ret = -EINVAL; + goto rdma_free_out; + } + + for (i = 0; i < count; i++) { + struct ice_rdma_qset_params *qset; + + qset = &res->res[i].res.qsets; + if (qset->vsi_id != vsi_id) { + dev_err(dev, "RDMA Invalid VSI ID\n"); + ret = -EINVAL; + goto rdma_free_out; + } + q_id[i] = qset->qs_handle; + teid[i] = qset->teid; + + vsi->qset_handle[qset->tc] = 0; + } + + status = ice_dis_vsi_rdma_qset(vsi->port_info, count, teid, q_id); + if (status) + ret = -EINVAL; + +rdma_free_out: + kfree(teid); + kfree(q_id); + + return ret; +} + +/** + * ice_peer_alloc_res - Allocate requested resources for peer objects + * 
@peer_obj: peer that is requesting resources
+ * @res: Resources to be allocated
+ * @partial_acceptable: If partial allocation is acceptable to the peer
+ *
+ * This function allocates requested resources for the peer object.
+ */
+static int
+ice_peer_alloc_res(struct ice_peer_obj *peer_obj, struct ice_res *res,
+		   int partial_acceptable)
+{
+	struct ice_pf *pf;
+	int ret;
+
+	if (!ice_validate_peer_obj(peer_obj) || !res)
+		return -EINVAL;
+
+	pf = pci_get_drvdata(peer_obj->pdev);
+	if (!ice_pf_state_is_nominal(pf))
+		return -EBUSY;
+
+	switch (res->res_type) {
+	case ICE_RDMA_QSETS_TXSCHED:
+		ret = ice_peer_alloc_rdma_qsets(peer_obj, res,
+						partial_acceptable);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/**
+ * ice_peer_free_res - Free given resources
+ * @peer_obj: peer that is requesting freeing of resources
+ * @res: Resources to be freed
+ *
+ * Free/Release resources allocated to given peer objects.
+ */
+static int
+ice_peer_free_res(struct ice_peer_obj *peer_obj, struct ice_res *res)
+{
+	int ret;
+
+	if (!ice_validate_peer_obj(peer_obj) || !res)
+		return -EINVAL;
+
+	switch (res->res_type) {
+	case ICE_RDMA_QSETS_TXSCHED:
+		ret = ice_peer_free_rdma_qsets(peer_obj, res);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/**
+ * ice_peer_reg_for_notif - register a peer to receive specific notifications
+ * @peer_obj: peer that is registering for event notifications
+ * @events: mask of event types peer is registering for
+ */
+static void
+ice_peer_reg_for_notif(struct ice_peer_obj *peer_obj, struct ice_event *events)
+{
+	struct ice_peer_obj_int *peer_obj_int;
+	struct ice_pf *pf;
+
+	if (!ice_validate_peer_obj(peer_obj) || !events)
+		return;
+
+	peer_obj_int = peer_to_ice_obj_int(peer_obj);
+	pf = pci_get_drvdata(peer_obj->pdev);
+
+	bitmap_or(peer_obj_int->events, peer_obj_int->events, events->type,
+		  ICE_EVENT_NBITS);
+
+	/* Check to see if any events happened prior to the peer registering */
+	ice_for_each_peer(pf, peer_obj, ice_check_peer_for_events);
+	ice_check_peer_drv_for_events(peer_obj);
+}
+
+/**
+ * ice_peer_unreg_for_notif - unreg a peer from receiving certain notifications
+ * @peer_obj: peer that is unregistering from event notifications
+ * @events: mask of event types peer is unregistering for
+ */
+static void
+ice_peer_unreg_for_notif(struct ice_peer_obj *peer_obj,
+			 struct ice_event *events)
+{
+	struct ice_peer_obj_int *peer_obj_int;
+
+	if (!ice_validate_peer_obj(peer_obj) || !events)
+		return;
+
+	peer_obj_int = peer_to_ice_obj_int(peer_obj);
+
+	bitmap_andnot(peer_obj_int->events, peer_obj_int->events, events->type,
+		      ICE_EVENT_NBITS);
+}
+
+/**
+ * ice_peer_check_for_reg - check to see if any peers are reg'd for event
+ * @peer_obj_int: ptr to peer object internal struct
+ * @data: ptr to opaque data, to be used for ice_event to report
+ *
+ * This function is to be called from ice_for_each_peer() to handle an
+ * event reported by a peer or the ice driver.
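+ *
+ * Sketch of how an event is fanned out to registered peers (mirrors
+ * ice_peer_report_state_change(); "new_mtu" is a hypothetical value):
+ *
+ *	struct ice_event event = { 0 };
+ *
+ *	set_bit(ICE_EVENT_MTU_CHANGE, event.type);
+ *	event.info.mtu = new_mtu;
+ *	ice_for_each_peer(pf, &event, ice_peer_check_for_reg);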
+ */ +int ice_peer_check_for_reg(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + struct ice_event *event = (struct ice_event *)data; + DECLARE_BITMAP(comp_events, ICE_EVENT_NBITS); + struct ice_peer_obj *peer_obj; + bool check = true; + + peer_obj = ice_get_peer_obj(peer_obj_int); + + if (!ice_validate_peer_obj(peer_obj) || !data) + /* If invalid obj, in this case return 0 instead of error + * because caller ignores this return value + */ + return 0; + + if (event->reporter) + check = event->reporter->peer_obj_id != peer_obj->peer_obj_id; + + if (bitmap_and(comp_events, event->type, peer_obj_int->events, + ICE_EVENT_NBITS) && + (test_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PREP_RST, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj_int->state)) && + check && + peer_obj->peer_ops->event_handler) + peer_obj->peer_ops->event_handler(peer_obj, event); + + return 0; +} + +/** + * ice_peer_report_state_change - accept report of a peer state change + * @peer_obj: peer that is sending notification about state change + * @event: ice_event holding info on what the state change is + * + * We also need to parse the list of peers to see if anyone is registered + * for notifications about this state change event, and if so, notify them. + */ +static void +ice_peer_report_state_change(struct ice_peer_obj *peer_obj, + struct ice_event *event) +{ + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_drv_int *peer_drv_int; + unsigned int e_type; + int drv_event = 0; + struct ice_pf *pf; + + if (!ice_validate_peer_obj(peer_obj) || !event) + return; + + pf = pci_get_drvdata(peer_obj->pdev); + peer_obj_int = peer_to_ice_obj_int(peer_obj); + peer_drv_int = peer_obj_int->peer_drv_int; + + e_type = find_first_bit(event->type, ICE_EVENT_NBITS); + if (!e_type) + return; + + switch (e_type) { + /* Check for peer_drv events */ + case ICE_EVENT_MBX_CHANGE: + drv_event = 1; + if (event->info.mbx_rdy) + set_bit(ICE_PEER_DRV_STATE_MBX_RDY, + peer_drv_int->state); + else + clear_bit(ICE_PEER_DRV_STATE_MBX_RDY, + peer_drv_int->state); + break; + + /* Check for peer_obj events */ + case ICE_EVENT_API_CHANGE: + if (event->info.api_rdy) { + set_bit(ICE_PEER_OBJ_STATE_API_RDY, + peer_obj_int->state); + } else { + clear_bit(ICE_PEER_OBJ_STATE_API_RDY, + peer_obj_int->state); + } + break; + + default: + return; + } + + /* store the event and state to notify any new peers opening */ + if (drv_event) + memcpy(&peer_drv_int->current_events[e_type], event, + sizeof(*event)); + else + memcpy(&peer_obj_int->current_events[e_type], event, + sizeof(*event)); + + ice_for_each_peer(pf, event, ice_peer_check_for_reg); +} + +/** + * ice_peer_unregister - request to unregister peer + * @peer_obj: peer object + * + * This function triggers close/remove on peer_obj allowing peer + * to unregister. 
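+ *
+ * From the peer side this is reached through the ops table, e.g.
+ * (hypothetical peer teardown path):
+ *
+ *	peer_obj->ops->peer_unregister(peer_obj);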
+ */
+static int ice_peer_unregister(struct ice_peer_obj *peer_obj)
+{
+	enum ice_close_reason reason = ICE_REASON_PEER_DRV_UNREG;
+	struct ice_peer_obj_int *peer_obj_int;
+	struct ice_pf *pf;
+	int ret;
+
+	if (!ice_validate_peer_obj(peer_obj))
+		return -EINVAL;
+
+	pf = pci_get_drvdata(peer_obj->pdev);
+	if (ice_is_reset_in_progress(pf->state))
+		return -EBUSY;
+
+	peer_obj_int = peer_to_ice_obj_int(peer_obj);
+
+	ret = ice_peer_close(peer_obj_int, &reason);
+	if (ret)
+		return ret;
+
+	switch (peer_obj->peer_obj_id) {
+	case ICE_PEER_RDMA_ID:
+		pf->rdma_peer = NULL;
+		break;
+	default:
+		break;
+	}
+
+	peer_obj->peer_ops = NULL;
+
+	ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_REMOVED, false);
+	return 0;
+}
+
+/**
+ * ice_peer_register - Called by peer to open communication with LAN
+ * @peer_obj: ptr to peer object
+ *
+ * The registering peer is expected to populate the ice_peer_drv->name field
+ * before calling this function.
+ */
+static int ice_peer_register(struct ice_peer_obj *peer_obj)
+{
+	struct ice_peer_drv_int *peer_drv_int;
+	struct ice_peer_obj_int *peer_obj_int;
+	struct ice_peer_drv *peer_drv;
+
+	if (!peer_obj) {
+		pr_err("Failed to reg peer_obj: peer_obj ptr NULL\n");
+		return -EINVAL;
+	}
+
+	if (!peer_obj->pdev) {
+		pr_err("Failed to reg peer_obj: peer_obj pdev NULL\n");
+		return -EINVAL;
+	}
+
+	if (!peer_obj->peer_ops || !peer_obj->ops) {
+		pr_err("Failed to reg peer_obj: peer_obj peer_ops/ops NULL\n");
+		return -EINVAL;
+	}
+
+	peer_drv = peer_obj->peer_drv;
+	if (!peer_drv) {
+		pr_err("Failed to reg peer_obj: peer drv NULL\n");
+		return -EINVAL;
+	}
+
+	if (peer_drv->ver.major != ICE_PEER_MAJOR_VER ||
+	    peer_drv->ver.minor != ICE_PEER_MINOR_VER) {
+		pr_err("Failed to register due to version mismatch:\n");
+		pr_err("expected major ver %d, caller specified major ver %d\n",
+		       ICE_PEER_MAJOR_VER, peer_drv->ver.major);
+		pr_err("expected minor ver %d, caller specified minor ver %d\n",
+		       ICE_PEER_MINOR_VER, peer_drv->ver.minor);
+		return -EINVAL;
+	}
+
+	peer_obj_int = peer_to_ice_obj_int(peer_obj);
+	peer_drv_int = peer_obj_int->peer_drv_int;
+	if (!peer_drv_int) {
+		pr_err("Failed to match peer_drv_int to peer_obj\n");
+		return -EINVAL;
+	}
+
+	peer_drv_int->peer_drv = peer_drv;
+
+	ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_PROBED, false);
+
+	return 0;
+}
+
+/**
+ * ice_peer_request_reset - accept request from peer to perform a reset
+ * @peer_obj: peer object that is requesting a reset
+ * @reset_type: type of reset the peer is requesting
+ */
+static int
+ice_peer_request_reset(struct ice_peer_obj *peer_obj, enum ice_peer_reset_type reset_type)
+{
+	enum ice_reset_req reset;
+	struct ice_pf *pf;
+
+	if (!ice_validate_peer_obj(peer_obj))
+		return -EINVAL;
+
+	pf = pci_get_drvdata(peer_obj->pdev);
+
+	switch (reset_type) {
+	case ICE_PEER_PFR:
+		reset = ICE_RESET_PFR;
+		break;
+	case ICE_PEER_CORER:
+		reset = ICE_RESET_CORER;
+		break;
+	case ICE_PEER_GLOBR:
+		reset = ICE_RESET_GLOBR;
+		break;
+	default:
+		dev_err(ice_pf_to_dev(pf), "incorrect reset request from peer\n");
+		return -EINVAL;
+	}
+
+	return ice_schedule_reset(pf, reset);
+}
+
+/**
+ * ice_peer_is_vsi_ready - query if VSI in nominal state
+ * @peer_obj: pointer to ice_peer_obj struct
+ */
+static int ice_peer_is_vsi_ready(struct ice_peer_obj *peer_obj)
+{
+	struct ice_netdev_priv *np;
+	struct ice_vsi *vsi;
+
+	/* If the peer_obj or associated values are not valid, then return
+	 * 0 as there is no ready port associated with the values passed in
+	 * as parameters.
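+	 *
+	 * Peers are expected to gate their bring-up on this result, e.g.
+	 * (hypothetical caller in a peer driver):
+	 *
+	 *	if (!peer_obj->ops->is_vsi_ready(peer_obj))
+	 *		return -EBUSY;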
+ */ + + if (!peer_obj || !peer_obj->pdev || !pci_get_drvdata(peer_obj->pdev) || + !peer_to_ice_obj_int(peer_obj)) + return 0; + + if (!peer_obj->netdev) + return 0; + + np = netdev_priv(peer_obj->netdev); + vsi = np->vsi; + + return ice_is_vsi_state_nominal(vsi); +} + +/** + * ice_peer_update_vsi_filter - update main VSI filters for RDMA + * @peer_obj: pointer to RDMA peer object + * @filter: selection of filters to enable or disable + * @enable: bool whether to enable or disable filters + */ +static int +ice_peer_update_vsi_filter(struct ice_peer_obj *peer_obj, + enum ice_rdma_filter __maybe_unused filter, + bool enable) +{ + struct ice_vsi *vsi; + struct ice_pf *pf; + int ret; + + if (!ice_validate_peer_obj(peer_obj)) + return -EINVAL; + + pf = pci_get_drvdata(peer_obj->pdev); + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return -EINVAL; + + ret = ice_cfg_iwarp_fltr(&pf->hw, vsi->idx, enable); + + if (ret) { + dev_err(ice_pf_to_dev(pf), "Failed to %sable iWARP filtering\n", + enable ? "en" : "dis"); + } else { + if (enable) + vsi->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + else + vsi->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + } + + return ret; +} + +/** + * ice_peer_vc_send - send a virt channel message from a peer + * @peer_obj: pointer to a peer object + * @vf_id: the absolute VF ID of recipient of message + * @msg: pointer to message contents + * @len: len of message + */ +static int +ice_peer_vc_send(struct ice_peer_obj *peer_obj, u32 vf_id, u8 *msg, u16 len) +{ + enum ice_status status; + struct ice_pf *pf; + + if (!ice_validate_peer_obj(peer_obj)) + return -EINVAL; + if (!msg || !len) + return -ENOMEM; + + pf = pci_get_drvdata(peer_obj->pdev); + if (len > ICE_AQ_MAX_BUF_LEN) + return -EINVAL; + + if (ice_is_reset_in_progress(pf->state)) + return -EBUSY; + + switch (peer_obj->peer_drv->driver_id) { + case ICE_PEER_RDMA_DRIVER: + if (vf_id >= pf->num_alloc_vfs) + return -ENODEV; + + /* VIRTCHNL_OP_RDMA is being used for RoCEv2 msg also */ + status = ice_aq_send_msg_to_vf(&pf->hw, vf_id, VIRTCHNL_OP_RDMA, + 0, msg, len, NULL); + break; + default: + dev_err(ice_pf_to_dev(pf), + "Peer driver (%u) not supported!", (u32)peer_obj->peer_drv->driver_id); + return -ENODEV; + } + + if (status) + dev_err(ice_pf_to_dev(pf), "Unable to send msg to VF, error %s\n", + ice_stat_str(status)); + return ice_status_to_errno(status); +} + +/** + * ice_reserve_peer_qvector - Reserve vector resources for peer drivers + * @pf: board private structure to initialize + */ +static int ice_reserve_peer_qvector(struct ice_pf *pf) +{ + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) { + int index; + + index = ice_get_res(pf, pf->irq_tracker, pf->num_rdma_msix, ICE_RES_RDMA_VEC_ID); + if (index < 0) + return index; + pf->num_avail_sw_msix -= pf->num_rdma_msix; + pf->rdma_base_vector = (u16)index; + } + return 0; +} + +/** + * ice_peer_close_task - call peer's close asynchronously + * @work: pointer to work_struct contained by the peer_obj_int struct + * + * This method (asynchronous) of calling a peer's close function is + * meant to be used in the reset path. 
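+ *
+ * It is scheduled from the reset flow roughly as follows (see
+ * ice_close_peer_for_reset()):
+ *
+ *	peer_obj_int->rst_type = ICE_REASON_PFR_REQ;
+ *	queue_work(peer_obj_int->ice_peer_wq, &peer_obj_int->peer_close_task);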
+ */ +static void ice_peer_close_task(struct work_struct *work) +{ + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_obj *peer_obj; + + peer_obj_int = container_of(work, struct ice_peer_obj_int, peer_close_task); + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!peer_obj || !peer_obj->peer_ops) + return; + + /* If this peer_obj is going to close, we do not want any state changes + * to happen until after we successfully finish or abort the close. + * Grab the peer_obj_state_mutex to protect this flow + */ + mutex_lock(&peer_obj_int->peer_obj_state_mutex); + + /* Only allow a close to go to the peer if they are in a state + * to accept it. The last state of PREP_RST is a special case + * that will not normally happen, but it is up to the peer + * to handle it correctly. + */ + if (test_bit(ICE_PEER_OBJ_STATE_OPENED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PREPPED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_PREP_RST, peer_obj_int->state)) { + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSING, true); + + if (peer_obj->peer_ops->close) + peer_obj->peer_ops->close(peer_obj, peer_obj_int->rst_type); + + ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_CLOSED, true); + } + + mutex_unlock(&peer_obj_int->peer_obj_state_mutex); +} + +/** + * ice_peer_update_vsi - update the pf_vsi info in peer_obj struct + * @peer_obj_int: pointer to peer_obj internal struct + * @data: opaque pointer - VSI to be updated + */ +int ice_peer_update_vsi(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + struct ice_vsi *vsi = (struct ice_vsi *)data; + struct ice_peer_obj *peer_obj; + + peer_obj = ice_get_peer_obj(peer_obj_int); + if (!peer_obj) + return 0; + + peer_obj->pf_vsi_num = vsi->vsi_num; + return 0; +} + +/* Initialize the ice_ops struct, which is used in 'ice_init_peer_devices' */ +static const struct ice_ops ops = { + .alloc_res = ice_peer_alloc_res, + .free_res = ice_peer_free_res, + .is_vsi_ready = ice_peer_is_vsi_ready, + .reg_for_notification = ice_peer_reg_for_notif, + .unreg_for_notification = ice_peer_unreg_for_notif, + .notify_state_change = ice_peer_report_state_change, + .request_reset = ice_peer_request_reset, + .peer_register = ice_peer_register, + .peer_unregister = ice_peer_unregister, + .update_vsi_filter = ice_peer_update_vsi_filter, + .vc_send = ice_peer_vc_send, + +}; + +/** + * ice_init_peer_devices - initializes peer objects and aux devices + * @pf: ptr to ice_pf + * + * This function initializes peer objects and auxiliary device, then + * associates them with specified pci_dev as their parent. 
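+ *
+ * A sketch of the expected probe-time call site (hypothetical error
+ * handling, mirroring the ice_unroll_peer() cleanup helper):
+ *
+ *	err = ice_init_peer_devices(pf);
+ *	if (err)
+ *		ice_for_each_peer(pf, NULL, ice_unroll_peer);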
+ */ +int ice_init_peer_devices(struct ice_pf *pf) +{ + struct ice_port_info *port_info = pf->hw.port_info; + struct ice_vsi *vsi = pf->vsi[0]; + struct pci_dev *pdev = pf->pdev; + struct device *dev = &pdev->dev; + int status = 0; + unsigned int i; + + /* Reserve vector resources */ + status = ice_reserve_peer_qvector(pf); + if (status < 0) { + dev_err(dev, "failed to reserve vectors for peer drivers\n"); + return status; + } + for (i = 0; i < ARRAY_SIZE(ice_mfd_cells); i++) { + struct ice_peer_obj_platform_data *platform_data; + struct ice_peer_obj_int *peer_obj_int; + struct ice_peer_drv_int *peer_drv_int; + struct msix_entry *entry = NULL; + struct ice_qos_params *qos_info; + struct ice_peer_obj *peer_obj; + int j; + + peer_obj_int = devm_kzalloc(dev, sizeof(*peer_obj_int), + GFP_KERNEL); + if (!peer_obj_int) + return -ENOMEM; + pf->peers[i] = peer_obj_int; + + peer_drv_int = devm_kzalloc(dev, sizeof(*peer_drv_int), + GFP_KERNEL); + if (!peer_drv_int) + return -ENOMEM; + + peer_obj_int->peer_drv_int = peer_drv_int; + + /* Initialize driver values */ + for (j = 0; j < ICE_EVENT_NBITS; j++) + bitmap_zero(peer_drv_int->current_events[j].type, + ICE_EVENT_NBITS); + + mutex_init(&peer_obj_int->peer_obj_state_mutex); + + peer_obj = ice_get_peer_obj(peer_obj_int); + peer_obj_int->plat_data.peer_obj = peer_obj; + platform_data = &peer_obj_int->plat_data; + peer_obj->peer_ops = NULL; + peer_obj->hw_addr = (u8 __iomem *)pf->hw.hw_addr; + peer_obj->ver.major = ICE_PEER_MAJOR_VER; + peer_obj->ver.minor = ICE_PEER_MINOR_VER; + peer_obj->ver.support = ICE_IDC_FEATURES; + peer_obj->peer_obj_id = ice_mfd_cells[i].id; + peer_obj->pf_vsi_num = vsi->vsi_num; + peer_obj->netdev = vsi->netdev; + peer_obj->initial_mtu = vsi->netdev->mtu; + ether_addr_copy(peer_obj->lan_addr, port_info->mac.lan_addr); + + ice_mfd_cells[i].platform_data = platform_data; + ice_mfd_cells[i].pdata_size = sizeof(*platform_data); + + peer_obj_int->ice_peer_wq = + alloc_ordered_workqueue("ice_peer_wq_%d", WQ_UNBOUND, + i); + if (!peer_obj_int->ice_peer_wq) + return -ENOMEM; + INIT_WORK(&peer_obj_int->peer_close_task, ice_peer_close_task); + + peer_obj->pdev = pdev; + peer_obj->ari_ena = pci_ari_enabled(pdev->bus); + peer_obj->bus_num = PCI_BUS_NUM(pdev->devfn); + if (!peer_obj->ari_ena) { + peer_obj->dev_num = PCI_SLOT(pdev->devfn); + peer_obj->fn_num = PCI_FUNC(pdev->devfn); + } else { + peer_obj->dev_num = 0; + peer_obj->fn_num = pdev->devfn & 0xff; + } + + qos_info = &peer_obj->initial_qos_info; + + /* setup qos_info fields with defaults */ + qos_info->num_apps = 0; + qos_info->num_tc = 1; + + for (j = 0; j < ICE_IDC_MAX_USER_PRIORITY; j++) + qos_info->up2tc[j] = 0; + + qos_info->tc_info[0].rel_bw = 100; + for (j = 1; j < IEEE_8021QAZ_MAX_TCS; j++) + qos_info->tc_info[j].rel_bw = 0; + + /* for DCB, override the qos_info defaults. 
 */
+	ice_setup_dcb_qos_info(pf, qos_info);
+	/* Initialize ice_ops */
+	peer_obj->ops = &ops;
+
+	/* make sure peer specific resources such as msix_count and
+	 * msix_entries are initialized
+	 */
+	switch (ice_mfd_cells[i].id) {
+	case ICE_PEER_RDMA_ID:
+		if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) {
+			peer_obj->msix_count = pf->num_rdma_msix;
+			entry = &pf->msix_entries[pf->rdma_base_vector];
+		}
+		pf->rdma_peer = peer_obj;
+		break;
+	default:
+		break;
+	}
+
+	peer_obj->msix_entries = entry;
+	ice_peer_state_change(peer_obj_int, ICE_PEER_OBJ_STATE_INIT,
+			      false);
+	}
+
+	status = ida_simple_get(&ice_peer_index_ida, 0, 0, GFP_KERNEL);
+	if (status < 0) {
+		dev_err(&pdev->dev, "failed to get unique index for device\n");
+		return status;
+	}
+
+	pf->peer_idx = status;
+
+	status = mfd_add_devices(dev, pf->peer_idx, ice_mfd_cells,
+				 ARRAY_SIZE(ice_mfd_cells), NULL, 0, NULL);
+	if (status) {
+		dev_err(dev, "Failure adding MFD devs for peers: %d\n", status);
+		return status;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ice_mfd_cells); i++) {
+		snprintf(pf->peers[i]->plat_name, ICE_MAX_PEER_NAME, "%s.%d",
+			 ice_mfd_cells[i].name,
+			 pf->peer_idx + ice_mfd_cells[i].id);
+		dev = bus_find_device_by_name(&platform_bus_type, NULL,
+					      pf->peers[i]->plat_name);
+		if (dev) {
+			dev_dbg(dev, "Peer Created: %s %d\n",
+				pf->peers[i]->plat_name, pf->peer_idx);
+			put_device(dev);
+		}
+	}
+
+	return status;
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_idc.h b/drivers/net/ethernet/intel/ice/ice_idc.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2b74e0c539cd9bb420e48a75b7253fc2aa9c3bc
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_idc.h
@@ -0,0 +1,422 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_IDC_H_
+#define _ICE_IDC_H_
+
+#include <linux/bits.h>
+#include <linux/dcbnl.h>
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+#include <linux/pci.h>
+
+/* This major and minor version represent IDC API version information.
+ * During peer driver registration, the peer driver specifies major and minor
+ * version information (via peer_driver:ver_info). It gets checked against the
+ * following defines and, on a mismatch, peer driver registration
+ * fails and an appropriate message gets logged.
+ */
+#define ICE_PEER_MAJOR_VER 7
+#define ICE_PEER_MINOR_VER 1
+
+enum ice_peer_features {
+	ICE_PEER_FEATURE_ADK_SUPPORT,
+	ICE_PEER_FEATURE_PTP_SUPPORT,
+	ICE_PEER_FEATURE_SRIOV_SUPPORT,
+	ICE_PEER_FEATURE_PCIIOV_SUPPORT,
+	ICE_PEER_FEATURE_NBITS
+};
+
+#define ICE_ADK_SUP 0
+
+#define ICE_PTP_SUP BIT(ICE_PEER_FEATURE_PTP_SUPPORT)
+
+#define ICE_SRIOV_SUP BIT(ICE_PEER_FEATURE_SRIOV_SUPPORT)
+
+#ifdef CONFIG_PCI_IOV
+#define ICE_PCIIOV_SUP BIT(ICE_PEER_FEATURE_PCIIOV_SUPPORT)
+#else
+#define ICE_PCIIOV_SUP 0
+#endif /* CONFIG_PCI_IOV */
+
+#define ICE_IDC_FEATURES (ICE_ADK_SUP | ICE_PTP_SUP | ICE_SRIOV_SUP |\
+			  ICE_PCIIOV_SUP)
+
+enum ice_event_type {
+	ICE_EVENT_LINK_CHANGE = 0x0,
+	ICE_EVENT_MTU_CHANGE,
+	ICE_EVENT_TC_CHANGE,
+	ICE_EVENT_API_CHANGE,
+	ICE_EVENT_MBX_CHANGE,
+	ICE_EVENT_CRIT_ERR,
+	ICE_EVENT_NBITS	/* must be last */
+};
+
+enum ice_res_type {
+	ICE_INVAL_RES = 0x0,
+	ICE_VSI,
+	ICE_VEB,
+	ICE_EVENT_Q,
+	ICE_EGRESS_CMPL_Q,
+	ICE_CMPL_EVENT_Q,
+	ICE_ASYNC_EVENT_Q,
+	ICE_DOORBELL_Q,
+	ICE_RDMA_QSETS_TXSCHED,
+};
+
+enum ice_peer_reset_type {
+	ICE_PEER_PFR = 0,
+	ICE_PEER_CORER,
+	ICE_PEER_CORER_SW_CORE,
+	ICE_PEER_CORER_SW_FULL,
+	ICE_PEER_GLOBR,
+};
+
+/* reason notified to peer driver as part of event handling */
+enum ice_close_reason {
+	ICE_REASON_INVAL = 0x0,
+	ICE_REASON_HW_UNRESPONSIVE,
+	ICE_REASON_INTERFACE_DOWN, /* Administrative down */
+	ICE_REASON_PEER_DRV_UNREG, /* peer driver getting unregistered */
+	ICE_REASON_PEER_OBJ_UNINIT,
+	ICE_REASON_GLOBR_REQ,
+	ICE_REASON_CORER_REQ,
+	ICE_REASON_EMPR_REQ,
+	ICE_REASON_PFR_REQ,
+	ICE_REASON_HW_RESET_PENDING,
+	ICE_REASON_RECOVERY_MODE,
+	ICE_REASON_PARAM_CHANGE,
+};
+
+enum ice_rdma_filter {
+	ICE_RDMA_FILTER_INVAL = 0x0,
+	ICE_RDMA_FILTER_IWARP,
+	ICE_RDMA_FILTER_ROCEV2,
+	ICE_RDMA_FILTER_BOTH,
+};
+
+/* This information is needed to handle peer driver registration.
+ * Instead of adding more parameters to the peer driver registration
+ * function, it is passed through the peer_drv object.
+ */
+struct ice_ver_info {
+	u16 major;
+	u16 minor;
+	u64 support;
+};
+
+/* Struct to hold per DCB APP info */
+struct ice_dcb_app_info {
+	u8 priority;
+	u8 selector;
+	u16 prot_id;
+};
+
+struct ice_peer_obj;
+struct ice_peer_obj_int;
+
+#define ICE_IDC_MAX_USER_PRIORITY 8
+#define ICE_IDC_MAX_APPS 64
+#define ICE_IDC_DSCP_NUM_VAL 64
+
+/* Source timer mode */
+enum ice_src_tmr_mode {
+	ICE_SRC_TMR_MODE_NANOSECONDS,
+	ICE_SRC_TMR_MODE_LOCKED,
+
+	NUM_ICE_SRC_TMR_MODE
+};
+
+/* Struct to hold per RDMA Qset info */
+struct ice_rdma_qset_params {
+	u32 teid;	/* qset TEID */
+	u16 qs_handle; /* RDMA driver provides this */
+	u16 vsi_id; /* VSI index */
+	u8 tc; /* TC branch the QSet should belong to */
+	u8 reserved[3];
+};
+
+struct ice_res_base {
+	/* Union for future provision e.g. other res_type */
+	union {
+		struct ice_rdma_qset_params qsets;
+	} res;
+};
+
+struct ice_res {
+	/* Type of resource. Filled by peer driver */
+	enum ice_res_type res_type;
+	/* Count requested by peer driver */
+	u16 cnt_req;
+
+	/* Number of resources allocated. Filled in by callee.
+	 * Based on this value, caller to fill up "resources"
+	 */
+	u16 res_allocated;
+
+	/* Unique handle to resources allocated. Zero if call fails.
+	 * Allocated by callee and for now used by caller for internal
+	 * tracking purpose.
+	 */
+	u32 res_handle;
+
+	/* The peer driver has to allocate sufficient memory to accommodate
+	 * cnt_req entries before calling alloc_res.
+	 * Memory has to be zero initialized. It is an input/output
+	 * parameter; as a result of the alloc_res API, this structure
+	 * will be populated.
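+	 *
+	 * A sizing sketch for a request of n qsets (res[] is declared as a
+	 * one-element array, so n - 1 extra entries are appended; "n" is
+	 * illustrative):
+	 *
+	 *	res = kzalloc(sizeof(*res) + (n - 1) * sizeof(res->res[0]),
+	 *		      GFP_KERNEL);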
+	 */
+	struct ice_res_base res[1];
+};
+
+struct ice_qos_info {
+	u64 tc_ctx;
+	u8 rel_bw;
+	u8 prio_type;
+	u8 egress_virt_up;
+	u8 ingress_virt_up;
+};
+
+#define IDC_QOS_MODE_VLAN 0x0
+#define IDC_QOS_MODE_DSCP 0x1
+
+/* Struct to hold QoS info */
+struct ice_qos_params {
+	struct ice_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
+	u8 up2tc[ICE_IDC_MAX_USER_PRIORITY];
+	u8 vsi_relative_bw;
+	u8 vsi_priority_type;
+	u32 num_apps;
+	u8 pfc_mode;
+	u8 dscp_map[ICE_IDC_DSCP_NUM_VAL];
+	struct ice_dcb_app_info apps[ICE_IDC_MAX_APPS];
+	u8 num_tc;
+};
+
+union ice_event_info {
+	/* ICE_EVENT_LINK_CHANGE */
+	struct {
+		struct net_device *lwr_nd;
+		u16 vsi_num; /* HW index of VSI corresponding to lwr ndev */
+		u8 new_link_state;
+		u8 lport;
+	} link_info;
+	/* ICE_EVENT_MTU_CHANGE */
+	u16 mtu;
+	/* ICE_EVENT_TC_CHANGE */
+	struct ice_qos_params port_qos;
+	/* ICE_EVENT_API_CHANGE */
+	u8 api_rdy;
+	/* ICE_EVENT_MBX_CHANGE */
+	u8 mbx_rdy;
+	/* ICE_EVENT_CRIT_ERR */
+	u32 reg;
+};
+
+/* ice_event elements are to be passed back and forth between the ice driver
+ * and the peer drivers. They are to be used to both register/unregister
+ * for event reporting and to report an event (events can be either ice
+ * generated or peer generated).
+ *
+ * For (un)registering for events, the structure needs to be populated with:
+ *   reporter - pointer to the ice_peer_obj struct of the peer (un)registering
+ *   type - bitmap with bits set for event types to (un)register for
+ *
+ * For reporting events, the structure needs to be populated with:
+ *   reporter - pointer to peer that generated the event (NULL for ice)
+ *   type - bitmap with single bit set for this event type
+ *   info - union containing data relevant to this event type
+ */
+struct ice_event {
+	struct ice_peer_obj *reporter;
+	DECLARE_BITMAP(type, ICE_EVENT_NBITS);
+	union ice_event_info info;
+};
+
+/* Following APIs are implemented by ICE driver and invoked by peer drivers */
+struct ice_ops {
+	/* APIs to allocate resources such as VEB, VSI, Doorbell queues,
+	 * completion queues, Tx/Rx queues, etc...
+	 */
+	int (*alloc_res)(struct ice_peer_obj *peer_obj,
+			 struct ice_res *res,
+			 int partial_acceptable);
+	int (*free_res)(struct ice_peer_obj *peer_obj,
+			struct ice_res *res);
+
+	int (*is_vsi_ready)(struct ice_peer_obj *peer_obj);
+	int (*peer_register)(struct ice_peer_obj *peer_obj);
+	int (*peer_unregister)(struct ice_peer_obj *peer_obj);
+	int (*request_reset)(struct ice_peer_obj *obj,
+			     enum ice_peer_reset_type reset_type);
+
+	void (*notify_state_change)(struct ice_peer_obj *obj,
+				    struct ice_event *event);
+
+	/* Notification APIs */
+	void (*reg_for_notification)(struct ice_peer_obj *obj,
+				     struct ice_event *event);
+	void (*unreg_for_notification)(struct ice_peer_obj *obj,
+				       struct ice_event *event);
+	int (*update_vsi_filter)(struct ice_peer_obj *peer_obj,
+				 enum ice_rdma_filter filter, bool enable);
+	int (*vc_send)(struct ice_peer_obj *peer_obj, u32 vf_id, u8 *msg,
+		       u16 len);
+};
+
+/* Following APIs are implemented by peer drivers and invoked by ICE driver */
+struct ice_peer_ops {
+	void (*event_handler)(struct ice_peer_obj *peer_obj,
+			      struct ice_event *event);
+
+	/* Why we have 'open' and when it is expected to be called:
+	 * 1. symmetric set of API w.r.t. close
+	 * 2. To be invoked from driver initialization path
+	 *    - call peer_driver:open once ice driver is fully initialized
+	 * 3. To be invoked upon RESET complete
+	 *
+	 * Calls to open are performed from ice_finish_init_peer_obj
+	 * which is invoked from the service task. This helps keep objects
+	 * from having their open called until the ice driver is ready and
+	 * has scheduled its service task.
+	 */
+	int (*open)(struct ice_peer_obj *peer_obj);
+
+	/* Peer's close function is to be called when the peer needs to be
+	 * quiesced. This can be for a variety of reasons (enumerated in the
+	 * ice_close_reason enum). A call to close will only be
+	 * followed by a call to either remove or open. No IDC calls from the
+	 * peer should be accepted until it is re-opened.
+	 *
+	 * The *reason* parameter is the reason for the call to close. This
+	 * can be for any reason enumerated in the ice_close_reason enum.
+	 * Its primary purpose is the peer's bookkeeping, in case the peer
+	 * wants to perform any different tasks dictated by the reason.
+	 */
+	void (*close)(struct ice_peer_obj *peer_obj,
+		      enum ice_close_reason reason);
+
+	int (*vc_receive)(struct ice_peer_obj *peer_obj, u32 vf_id, u8 *msg,
+			  u16 len);
+	/* tell RDMA peer to prepare for TC change in a blocking call
+	 * that will directly precede the change event
+	 */
+	void (*prep_tc_change)(struct ice_peer_obj *peer_obj);
+};
+
+#define ICE_PEER_RDMA_NAME "ice_rdma"
+#define ICE_PEER_RDMA_ID 0x00000010
+#define ICE_MAX_NUM_PEERS 4
+
+/* The const struct that instantiates peer_obj_id needs to be initialized
+ * in the .c with the macro ASSIGN_PEER_INFO.
+ * For example:
+ * static const struct peer_obj_id peer_obj_ids[] = ASSIGN_PEER_INFO;
+ */
+struct peer_obj_id {
+	char *name;
+	int id;
+};
+
+#define IDC_RDMA_INFO { .name = ICE_PEER_RDMA_NAME, .id = ICE_PEER_RDMA_ID },
+#define IDC_AE_INFO
+#define IDC_IPSEC_INFO
+#define IDC_SWITCH_INFO
+#define IDC_ADK_INFO
+/* this is a list of all possible peers, some are unused but left for clarity */
+#define ASSIGN_PEER_INFO	\
+{				\
+	IDC_RDMA_INFO		\
+	IDC_AE_INFO		\
+	IDC_IPSEC_INFO		\
+	IDC_SWITCH_INFO		\
+	IDC_ADK_INFO		\
+}
+
+#define ice_peer_priv(x) ((x)->peer_priv)
+
+/* structure representing peer_object */
+struct ice_peer_obj {
+	struct ice_ver_info ver;
+	struct pci_dev *pdev; /* PCI device corresponding to main function */
+	/* KVA / Linear address corresponding to BAR0 of underlying
+	 * pci_device.
+	 */
+	u8 __iomem *hw_addr;
+	int peer_obj_id;
+
+	int index;
+
+	/* Opaque pointer for peer specific data tracking. This memory will
+	 * be alloc'd and freed by the peer driver and used for private data
+	 * accessible only to the specific peer. It is stored here so that
+	 * when this struct is passed to the peer via an IDC call, the data
+	 * can be accessed by the peer at that time.
+	 * The peers should only retrieve the pointer by the macro:
+	 * ice_peer_priv(struct ice_peer_obj *)
+	 */
+	void *peer_priv;
+
+	u8 ftype; /* PF(false) or VF (true) */
+
+	/* Data VSI created by driver */
+	u16 pf_vsi_num;
+
+	u8 lan_addr[ETH_ALEN]; /* default MAC address of main netdev */
+	u16 initial_mtu; /* Initial MTU of main netdev */
+	struct ice_qos_params initial_qos_info;
+	struct net_device *netdev;
+	/* PCI info */
+	u8 ari_ena;
+	u16 bus_num;
+	u16 dev_num;
+	u16 fn_num;
+
+	/* Based on peer driver type, this shall point to corresponding MSIx
+	 * entries in pf->msix_entries (which were allocated as part of driver
+	 * initialization) e.g. for RDMA driver, msix_entries reserved will be
+	 * num_online_cpus + 1.
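+	 *
+	 * e.g. the RDMA peer's vectors are (re)loaded this way (see
+	 * ice_peer_refresh_msix() in ice_idc.c):
+	 *
+	 *	peer->msix_count = pf->num_rdma_msix;
+	 *	peer->msix_entries = &pf->msix_entries[pf->rdma_base_vector];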
+ */ + u16 msix_count; /* How many vectors are reserved for this device */ + struct msix_entry *msix_entries; + + /* Following struct contains function pointers to be initialized + * by ICE driver and called by peer driver + */ + const struct ice_ops *ops; + + /* Following struct contains function pointers to be initialized + * by peer driver and called by ICE driver + */ + const struct ice_peer_ops *peer_ops; + + /* Pointer to peer_drv struct to be populated by peer driver */ + struct ice_peer_drv *peer_drv; +}; + +struct ice_peer_obj_platform_data { + struct ice_peer_obj *peer_obj; +}; + +/* structure representing peer driver + * Peer driver to initialize those function ptrs and + * it will be invoked by ICE as part of driver_registration + * via bus infrastructure + */ +struct ice_peer_drv { + u16 driver_id; +#define ICE_PEER_LAN_DRIVER 0 +#define ICE_PEER_RDMA_DRIVER 4 +#define ICE_PEER_ADK_DRIVER 5 + + struct ice_ver_info ver; + const char *name; + +}; + +#endif /* _ICE_IDC_H_*/ diff --git a/drivers/net/ethernet/intel/ice/ice_idc_int.h b/drivers/net/ethernet/intel/ice/ice_idc_int.h new file mode 100644 index 0000000000000000000000000000000000000000..6939681f1498f59831f0d1d4fa02b2f9017829ba --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_idc_int.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_IDC_INT_H_ +#define _ICE_IDC_INT_H_ + +#include "ice.h" +#include "ice_idc.h" + + +enum ice_peer_obj_state { + ICE_PEER_OBJ_STATE_INIT, + ICE_PEER_OBJ_STATE_PROBED, + ICE_PEER_OBJ_STATE_OPENING, + ICE_PEER_OBJ_STATE_OPENED, + ICE_PEER_OBJ_STATE_PREP_RST, + ICE_PEER_OBJ_STATE_PREPPED, + ICE_PEER_OBJ_STATE_CLOSING, + ICE_PEER_OBJ_STATE_CLOSED, + ICE_PEER_OBJ_STATE_REMOVED, + ICE_PEER_OBJ_STATE_API_RDY, + ICE_PEER_OBJ_STATE_NBITS, /* must be last */ +}; + +enum ice_peer_drv_state { + ICE_PEER_DRV_STATE_MBX_RDY, + ICE_PEER_DRV_STATE_NBITS, /* must be last */ +}; + +struct ice_peer_drv_int { + struct ice_peer_drv *peer_drv; + + /* States associated with peer driver */ + DECLARE_BITMAP(state, ICE_PEER_DRV_STATE_NBITS); + + /* if this peer_obj is the originator of an event, these are the + * most recent events of each type + */ + struct ice_event current_events[ICE_EVENT_NBITS]; +}; + +#define ICE_MAX_PEER_NAME 64 + +struct ice_peer_obj_int { + struct ice_peer_obj peer_obj; + struct ice_peer_drv_int *peer_drv_int; /* driver private structure */ + char plat_name[ICE_MAX_PEER_NAME]; + struct ice_peer_obj_platform_data plat_data; + + /* if this peer_obj is the originator of an event, these are the + * most recent events of each type + */ + struct ice_event current_events[ICE_EVENT_NBITS]; + /* Events a peer has registered to be notified about */ + DECLARE_BITMAP(events, ICE_EVENT_NBITS); + + /* States associated with peer_obj */ + DECLARE_BITMAP(state, ICE_PEER_OBJ_STATE_NBITS); + struct mutex peer_obj_state_mutex; /* peer_obj state mutex */ + + /* per peer workqueue */ + struct workqueue_struct *ice_peer_wq; + + struct work_struct peer_prep_task; + struct work_struct peer_close_task; + + enum ice_close_reason rst_type; +}; + +static inline struct +ice_peer_obj_int *peer_to_ice_obj_int(struct ice_peer_obj *peer_obj) +{ + return peer_obj ? 
container_of(peer_obj, struct ice_peer_obj_int, + peer_obj) : NULL; +} + +static inline struct +ice_peer_obj *ice_get_peer_obj(struct ice_peer_obj_int *peer_obj_int) +{ + if (peer_obj_int) + return &peer_obj_int->peer_obj; + else + return NULL; +} + +#if IS_ENABLED(CONFIG_MFD_CORE) +int ice_peer_update_vsi(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_close_peer_for_reset(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_unroll_peer(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_unreg_peer_obj(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_peer_close(struct ice_peer_obj_int *peer_obj_int, void *data); +int ice_peer_check_for_reg(struct ice_peer_obj_int *peer_obj_int, void *data); +int +ice_finish_init_peer_obj(struct ice_peer_obj_int *peer_obj_int, void *data); +static inline bool ice_validate_peer_obj(struct ice_peer_obj *peer_obj) +{ + struct ice_peer_obj_int *peer_obj_int; + struct ice_pf *pf; + + if (!peer_obj || !peer_obj->pdev) + return false; + + if (!peer_obj->peer_ops) + return false; + + pf = pci_get_drvdata(peer_obj->pdev); + if (!pf) + return false; + + peer_obj_int = peer_to_ice_obj_int(peer_obj); + if (!peer_obj_int) + return false; + + if (test_bit(ICE_PEER_OBJ_STATE_REMOVED, peer_obj_int->state) || + test_bit(ICE_PEER_OBJ_STATE_INIT, peer_obj_int->state)) + return false; + + return true; +} +#else /* !CONFIG_MFD_CORE */ +static inline int +ice_peer_update_vsi(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_close_peer_for_reset(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_unroll_peer(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_unreg_peer_obj(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_peer_close(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_peer_check_for_reg(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline int +ice_finish_init_peer_obj(struct ice_peer_obj_int *peer_obj_int, void *data) +{ + return 0; +} + +static inline bool ice_validate_peer_obj(struct ice_peer_obj *peer) +{ + return true; +} + +#endif /* !CONFIG_MFD_CORE */ + +#endif /* !_ICE_IDC_INT_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c new file mode 100644 index 0000000000000000000000000000000000000000..ef36be8fb8678312cb575bd1a12b9a91b4daa763 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +/* Link Aggregation code */ + +#include "ice.h" +#ifdef HAVE_NETDEV_UPPER_INFO +#include "ice_lag.h" + +/** + * ice_lag_set_primary - set PF LAG state as Primary + * @lag: LAG info struct + */ +static void ice_lag_set_primary(struct ice_lag *lag) +{ + struct ice_pf *pf = lag->pf; + + if (!pf) + return; + + if (lag->role != ICE_LAG_UNSET && lag->role != ICE_LAG_BACKUP) { + dev_warn(ice_pf_to_dev(pf), "%s: Attempt to be Primary, but incompatible state.\n", + netdev_name(lag->netdev)); + return; + } + + lag->role = ICE_LAG_PRIMARY; +} + +/** + * ice_lag_set_backup - set PF LAG state to Backup + * @lag: LAG info struct + */ +static void ice_lag_set_backup(struct ice_lag *lag) +{ + struct ice_pf *pf = lag->pf; + + if (!pf) + return; + + if (lag->role != ICE_LAG_UNSET && lag->role != ICE_LAG_PRIMARY) { + dev_dbg(ice_pf_to_dev(pf), "%s: Attempt to be Backup, but incompatible state\n", + netdev_name(lag->netdev)); + return; + } + + lag->role = ICE_LAG_BACKUP; +} + +/** + * ice_display_lag_info - print LAG info + * @lag: LAG info struct + */ +static void ice_display_lag_info(struct ice_lag *lag) +{ + const char *name, *peer, *upper, *role, *bonded, *master; + struct device *dev = &lag->pf->pdev->dev; + + name = lag->netdev ? netdev_name(lag->netdev) : "unset"; + peer = lag->peer_netdev ? netdev_name(lag->peer_netdev) : "unset"; + upper = lag->upper_netdev ? netdev_name(lag->upper_netdev) : "unset"; + master = lag->master ? "TRUE" : "FALSE"; + bonded = lag->bonded ? "BONDED" : "UNBONDED"; + + switch (lag->role) { + case ICE_LAG_NONE: + role = "NONE"; + break; + case ICE_LAG_PRIMARY: + role = "PRIMARY"; + break; + case ICE_LAG_BACKUP: + role = "BACKUP"; + break; + case ICE_LAG_UNSET: + role = "UNSET"; + break; + default: + role = "ERROR"; + } + + dev_dbg(dev, "%s %s, peer:%s, upper:%s, role:%s, master:%s\n", name, + bonded, peer, upper, role, master); +} + +/** + * ice_lag_info_event - handle NETDEV_BONDING_INFO event + * @lag: LAG info struct + * @ptr: opaque data pointer + * + * ptr is to be cast to (netdev_notifier_bonding_info *) + */ +static void ice_lag_info_event(struct ice_lag *lag, void *ptr) +{ + struct net_device *event_netdev, *netdev_tmp; + struct netdev_notifier_bonding_info *info; + struct netdev_bonding_info *bonding_info; + const char *lag_netdev_name; + + event_netdev = netdev_notifier_info_to_dev(ptr); + info = ptr; + lag_netdev_name = netdev_name(lag->netdev); + bonding_info = &info->bonding_info; + + if (event_netdev != lag->netdev || !lag->bonded || !lag->upper_netdev) + return; + + if (bonding_info->master.bond_mode != BOND_MODE_ACTIVEBACKUP) { + netdev_dbg(lag->netdev, "Bonding event recv, but mode not active/backup\n"); + goto lag_out; + } + + if (strcmp(bonding_info->slave.slave_name, lag_netdev_name)) { + netdev_dbg(lag->netdev, "Bonding event recv, but slave info not for us\n"); + goto lag_out; + } + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(lag->upper_netdev, netdev_tmp) { + if (!netif_is_ice(netdev_tmp)) + continue; + + if (netdev_tmp && netdev_tmp != lag->netdev && + lag->peer_netdev != netdev_tmp) { + dev_hold(netdev_tmp); + lag->peer_netdev = netdev_tmp; + } + } + rcu_read_unlock(); + + if (bonding_info->slave.state) + ice_lag_set_backup(lag); + else + ice_lag_set_primary(lag); + +lag_out: + ice_display_lag_info(lag); +} + +/** + * ice_lag_link - handle LAG link event + * @lag: LAG info struct + * @info: info from the netdev notifier + */ +static void +ice_lag_link(struct ice_lag *lag, struct netdev_notifier_changeupper_info *info) +{ + struct net_device 
*netdev_tmp, *upper = info->upper_dev; + struct ice_pf *pf = lag->pf; + int peers = 0; + + + if (lag->bonded) + dev_warn(ice_pf_to_dev(pf), "%s Already part of a bond\n", + netdev_name(lag->netdev)); + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper, netdev_tmp) + peers++; + rcu_read_unlock(); + + if (lag->upper_netdev != upper) { + dev_hold(upper); + lag->upper_netdev = upper; + } + + lag->bonded = true; + lag->role = ICE_LAG_UNSET; + + /* if this is the first element in an LAG mark as master */ + lag->master = !!(peers == 1); +} + +/** + * ice_lag_unlink - handle unlink event + * @lag: LAG info struct + * @info: info from netdev notification + */ +static void +ice_lag_unlink(struct ice_lag *lag, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *netdev_tmp, *upper = info->upper_dev; + bool found = false; + + if (!lag->bonded) { + netdev_dbg(lag->netdev, "bonding unlink event on non-LAG netdev\n"); + return; + } + + /* determine if we are in the new LAG config or not */ + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper, netdev_tmp) { + if (netdev_tmp == lag->netdev) { + found = true; + break; + } + } + rcu_read_unlock(); + + if (found) + return; + + if (lag->upper_netdev) { + dev_put(lag->upper_netdev); + lag->upper_netdev = NULL; + } + + if (lag->peer_netdev) { + dev_put(lag->peer_netdev); + lag->peer_netdev = NULL; + } + + lag->bonded = false; + lag->role = ICE_LAG_NONE; +} + +/** + * ice_lag_changeupper_event - handle LAG changeupper event + * @lag: LAG info struct + * @ptr: opaque pointer data + * + * ptr is to be cast into netdev_notifier_changeupper_info + */ +static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr) +{ + struct netdev_notifier_changeupper_info *info; + struct net_device *netdev; + + info = ptr; + netdev = netdev_notifier_info_to_dev(ptr); + + /* not for this netdev */ + if (netdev != lag->netdev) + return; + + if (!info->upper_dev) { + netdev_dbg(netdev, "changeupper rcvd, but no upper defined\n"); + return; + } + + netdev_dbg(netdev, "bonding %s\n", info->linking ? "LINK" : "UNLINK"); + + if (!netif_is_lag_master(info->upper_dev)) { + netdev_dbg(netdev, "changeupper rcvd, but not master. bail\n"); + return; + } + + if (info->linking) + ice_lag_link(lag, info); + else + ice_lag_unlink(lag, info); + + ice_display_lag_info(lag); +} + +/** + * ice_lag_changelower_event - handle LAG changelower event + * @lag: LAG info struct + * @ptr: opaque data pointer + * + * ptr to be cast to netdev_notifier_changelowerstate_info + */ +static void ice_lag_changelower_event(struct ice_lag *lag, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (netdev != lag->netdev) + return; + + netdev_dbg(netdev, "bonding info\n"); + + if (!netif_is_lag_port(netdev)) { + netdev_dbg(netdev, "CHANGELOWER rcvd, but netdev not in LAG. 
Bail\n"); + return; + } + +} + +/** + * ice_lag_event_handler - handle LAG events from netdev + * @notif_blk: notifier block registered by this netdev + * @event: event type + * @ptr: opaque data containing notifier event + */ +static int +ice_lag_event_handler(struct notifier_block *notif_blk, unsigned long event, + void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct ice_lag *lag; + + lag = container_of(notif_blk, struct ice_lag, notif_block); + + if (!lag->netdev) + return NOTIFY_DONE; + + /* Check that the netdev is in the working namespace */ + if (!net_eq(dev_net(netdev), &init_net)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_CHANGEUPPER: + ice_lag_changeupper_event(lag, ptr); + break; + case NETDEV_CHANGELOWERSTATE: + ice_lag_changelower_event(lag, ptr); + break; + case NETDEV_BONDING_INFO: + ice_lag_info_event(lag, ptr); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +/** + * ice_register_lag_handler - register LAG handler on netdev + * @lag: LAG struct + */ +static int ice_register_lag_handler(struct ice_lag *lag) +{ + struct device *dev = ice_pf_to_dev(lag->pf); + struct notifier_block *notif_blk; + + notif_blk = &lag->notif_block; + + if (!notif_blk->notifier_call) { + notif_blk->notifier_call = ice_lag_event_handler; + if (register_netdevice_notifier(notif_blk)) { + notif_blk->notifier_call = NULL; + dev_err(dev, "FAIL register LAG event handler!\n"); + return -EINVAL; + } + dev_dbg(dev, "LAG event handler registered\n"); + } + return 0; +} + +/** + * ice_unregister_lag_handler - unregister LAG handler on netdev + * @lag: LAG struct + */ +static void ice_unregister_lag_handler(struct ice_lag *lag) +{ + struct device *dev = ice_pf_to_dev(lag->pf); + struct notifier_block *notif_blk; + + notif_blk = &lag->notif_block; + if (notif_blk->notifier_call) { + unregister_netdevice_notifier(notif_blk); + dev_dbg(dev, "LAG event handler unregistered\n"); + } +} + +/** + * ice_init_lag - initialize support for LAG + * @pf: PF struct + * + * Alloc memory for LAG structs and initialize the elements. 
+ * Memory will be freed in ice_deinit_lag
+ */
+int ice_init_lag(struct ice_pf *pf)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_lag *lag;
+	struct ice_vsi *vsi;
+	int err;
+
+	pf->lag = kzalloc(sizeof(*lag), GFP_KERNEL);
+	if (!pf->lag)
+		return -ENOMEM;
+	lag = pf->lag;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi) {
+		dev_err(dev, "couldn't get main vsi, link aggregation init fail\n");
+		err = -EIO;
+		goto lag_error;
+	}
+
+	lag->pf = pf;
+	lag->netdev = vsi->netdev;
+	lag->role = ICE_LAG_NONE;
+	lag->bonded = false;
+	lag->peer_netdev = NULL;
+	lag->upper_netdev = NULL;
+	lag->notif_block.notifier_call = NULL;
+
+	err = ice_register_lag_handler(lag);
+	if (err) {
+		dev_warn(dev, "INIT LAG: Failed to register event handler\n");
+		goto lag_error;
+	}
+
+	ice_display_lag_info(lag);
+
+	dev_dbg(dev, "INIT LAG complete\n");
+	return 0;
+
+lag_error:
+	kfree(lag);
+	pf->lag = NULL;
+	return err;
+}
+
+/**
+ * ice_deinit_lag - Clean up LAG
+ * @pf: PF struct
+ *
+ * Clean up kernel LAG info and free memory.
+ * This function is meant to only be called on driver remove/shutdown.
+ */
+void ice_deinit_lag(struct ice_pf *pf)
+{
+	struct ice_lag *lag;
+
+	lag = pf->lag;
+
+	if (!lag)
+		return;
+
+	if (lag->pf)
+		ice_unregister_lag_handler(lag);
+
+	if (lag->upper_netdev)
+		dev_put(lag->upper_netdev);
+
+	if (lag->peer_netdev)
+		dev_put(lag->peer_netdev);
+
+	kfree(lag);
+
+	pf->lag = NULL;
+}
+#endif /* HAVE_NETDEV_UPPER_INFO */
diff --git a/drivers/net/ethernet/intel/ice/ice_lag.h b/drivers/net/ethernet/intel/ice/ice_lag.h
new file mode 100644
index 0000000000000000000000000000000000000000..1603c0f973c9bf43f49ebbae65d03a854a0375a4
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_lag.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_LAG_H_
+#define _ICE_LAG_H_
+#ifdef HAVE_NETDEV_UPPER_INFO
+
+#include <linux/netdevice.h>
+
+/* LAG roles for netdev */
+enum ice_lag_role {
+	ICE_LAG_NONE,
+	ICE_LAG_PRIMARY,
+	ICE_LAG_BACKUP,
+	ICE_LAG_UNSET
+};
+
+struct ice_pf;
+
+/* LAG info struct */
+struct ice_lag {
+	struct ice_pf *pf; /* backlink to PF struct */
+	struct net_device *netdev; /* this PF's netdev */
+	struct net_device *peer_netdev;
+	struct net_device *upper_netdev; /* upper bonding netdev */
+	struct notifier_block notif_block;
+	u8 bonded:1; /* currently bonded */
+	u8 master:1; /* this is a master */
+	u8 handler:1; /* did we register a rx_netdev_handler */
+	/* each thing blocking bonding will increment this value by one.
+	 * If this value is zero, then bonding is allowed.
+	 */
+	u16 dis_lag;
+	u8 role;
+};
+
+int ice_init_lag(struct ice_pf *pf);
+void ice_deinit_lag(struct ice_pf *pf);
+#endif /* HAVE_NETDEV_UPPER_INFO */
+#endif /* _ICE_LAG_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
index 2aac8f13daeba31aeb0cc38301558cd6ed3b303c..72adcf27882474412f5c0e9c8ee5b2e5fb41ba75 100644
--- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
+++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
@@ -1,8 +1,33 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2018, Intel Corporation. */
+/* Copyright (C) 2018-2021, Intel Corporation.
*/ #ifndef _ICE_LAN_TX_RX_H_ #define _ICE_LAN_TX_RX_H_ +#include "ice_osdep.h" + +/* Rx Descriptors */ +union ice_16byte_rx_desc { + struct { + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + } read; + struct { + struct { + struct { + __le16 mirroring_status; + __le16 l2tag1; + } lo_dword; + union { + __le32 rss; /* RSS Hash */ + __le32 fd_id; /* Flow Director filter ID */ + } hi_dword; + } qword0; + struct { + /* ext status/error/PTYPE/length */ + __le64 status_error_len; + } qword1; + } wb; /* writeback */ +}; union ice_32byte_rx_desc { struct { @@ -40,8 +65,240 @@ union ice_32byte_rx_desc { } wb; /* writeback */ }; +struct ice_fltr_desc { + __le64 qidx_compq_space_stat; + __le64 dtype_cmd_vsi_fdid; +}; + +#define ICE_FXD_FLTR_QW0_QINDEX_S 0 +#define ICE_FXD_FLTR_QW0_QINDEX_M (0x7FFULL << ICE_FXD_FLTR_QW0_QINDEX_S) +#define ICE_FXD_FLTR_QW0_COMP_Q_S 11 +#define ICE_FXD_FLTR_QW0_COMP_Q_M BIT_ULL(ICE_FXD_FLTR_QW0_COMP_Q_S) +#define ICE_FXD_FLTR_QW0_COMP_Q_ZERO 0x0ULL +#define ICE_FXD_FLTR_QW0_COMP_Q_QINDX 0x1ULL + +#define ICE_FXD_FLTR_QW0_COMP_REPORT_S 12 +#define ICE_FXD_FLTR_QW0_COMP_REPORT_M \ + (0x3ULL << ICE_FXD_FLTR_QW0_COMP_REPORT_S) +#define ICE_FXD_FLTR_QW0_COMP_REPORT_NONE 0x0ULL +#define ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL 0x1ULL +#define ICE_FXD_FLTR_QW0_COMP_REPORT_SW 0x2ULL + +#define ICE_FXD_FLTR_QW0_FD_SPACE_S 14 +#define ICE_FXD_FLTR_QW0_FD_SPACE_M (0x3ULL << ICE_FXD_FLTR_QW0_FD_SPACE_S) +#define ICE_FXD_FLTR_QW0_FD_SPACE_GUAR 0x0ULL +#define ICE_FXD_FLTR_QW0_FD_SPACE_BEST_EFFORT 0x1ULL +#define ICE_FXD_FLTR_QW0_FD_SPACE_GUAR_BEST 0x2ULL +#define ICE_FXD_FLTR_QW0_FD_SPACE_BEST_GUAR 0x3ULL + +#define ICE_FXD_FLTR_QW0_STAT_CNT_S 16 +#define ICE_FXD_FLTR_QW0_STAT_CNT_M \ + (0x1FFFULL << ICE_FXD_FLTR_QW0_STAT_CNT_S) +#define ICE_FXD_FLTR_QW0_STAT_ENA_S 29 +#define ICE_FXD_FLTR_QW0_STAT_ENA_M (0x3ULL << ICE_FXD_FLTR_QW0_STAT_ENA_S) +#define ICE_FXD_FLTR_QW0_STAT_ENA_NONE 0x0ULL +#define ICE_FXD_FLTR_QW0_STAT_ENA_PKTS 0x1ULL +#define ICE_FXD_FLTR_QW0_STAT_ENA_BYTES 0x2ULL +#define ICE_FXD_FLTR_QW0_STAT_ENA_PKTS_BYTES 0x3ULL + +#define ICE_FXD_FLTR_QW0_EVICT_ENA_S 31 +#define ICE_FXD_FLTR_QW0_EVICT_ENA_M BIT_ULL(ICE_FXD_FLTR_QW0_EVICT_ENA_S) +#define ICE_FXD_FLTR_QW0_EVICT_ENA_FALSE 0x0ULL +#define ICE_FXD_FLTR_QW0_EVICT_ENA_TRUE 0x1ULL + +#define ICE_FXD_FLTR_QW0_TO_Q_S 32 +#define ICE_FXD_FLTR_QW0_TO_Q_M (0x7ULL << ICE_FXD_FLTR_QW0_TO_Q_S) +#define ICE_FXD_FLTR_QW0_TO_Q_EQUALS_QINDEX 0x0ULL + +#define ICE_FXD_FLTR_QW0_TO_Q_PRI_S 35 +#define ICE_FXD_FLTR_QW0_TO_Q_PRI_M (0x7ULL << ICE_FXD_FLTR_QW0_TO_Q_PRI_S) +#define ICE_FXD_FLTR_QW0_TO_Q_PRIO1 0x1ULL + +#define ICE_FXD_FLTR_QW0_DPU_RECIPE_S 38 +#define ICE_FXD_FLTR_QW0_DPU_RECIPE_M \ + (0x3ULL << ICE_FXD_FLTR_QW0_DPU_RECIPE_S) +#define ICE_FXD_FLTR_QW0_DPU_RECIPE_DFLT 0x0ULL + +#define ICE_FXD_FLTR_QW0_DROP_S 40 +#define ICE_FXD_FLTR_QW0_DROP_M BIT_ULL(ICE_FXD_FLTR_QW0_DROP_S) +#define ICE_FXD_FLTR_QW0_DROP_NO 0x0ULL +#define ICE_FXD_FLTR_QW0_DROP_YES 0x1ULL + +#define ICE_FXD_FLTR_QW0_FLEX_PRI_S 41 +#define ICE_FXD_FLTR_QW0_FLEX_PRI_M (0x7ULL << ICE_FXD_FLTR_QW0_FLEX_PRI_S) +#define ICE_FXD_FLTR_QW0_FLEX_PRI_NONE 0x0ULL + +#define ICE_FXD_FLTR_QW0_FLEX_MDID_S 44 +#define ICE_FXD_FLTR_QW0_FLEX_MDID_M (0xFULL << ICE_FXD_FLTR_QW0_FLEX_MDID_S) +#define ICE_FXD_FLTR_QW0_FLEX_MDID0 0x0ULL + +#define ICE_FXD_FLTR_QW0_FLEX_VAL_S 48 +#define ICE_FXD_FLTR_QW0_FLEX_VAL_M \ + (0xFFFFULL << ICE_FXD_FLTR_QW0_FLEX_VAL_S) +#define ICE_FXD_FLTR_QW0_FLEX_VAL0 0x0ULL + +#define 
ICE_FXD_FLTR_QW1_DTYPE_S 0
+#define ICE_FXD_FLTR_QW1_DTYPE_M (0xFULL << ICE_FXD_FLTR_QW1_DTYPE_S)
+#define ICE_FXD_FLTR_QW1_PCMD_S 4
+#define ICE_FXD_FLTR_QW1_PCMD_M BIT_ULL(ICE_FXD_FLTR_QW1_PCMD_S)
+#define ICE_FXD_FLTR_QW1_PCMD_ADD 0x0ULL
+#define ICE_FXD_FLTR_QW1_PCMD_REMOVE 0x1ULL
+
+#define ICE_FXD_FLTR_QW1_PROF_PRI_S 5
+#define ICE_FXD_FLTR_QW1_PROF_PRI_M (0x7ULL << ICE_FXD_FLTR_QW1_PROF_PRI_S)
+#define ICE_FXD_FLTR_QW1_PROF_PRIO_ZERO 0x0ULL
+
+#define ICE_FXD_FLTR_QW1_PROF_S 8
+#define ICE_FXD_FLTR_QW1_PROF_M (0x3FULL << ICE_FXD_FLTR_QW1_PROF_S)
+#define ICE_FXD_FLTR_QW1_PROF_ZERO 0x0ULL
+
+#define ICE_FXD_FLTR_QW1_FD_VSI_S 14
+#define ICE_FXD_FLTR_QW1_FD_VSI_M (0x3FFULL << ICE_FXD_FLTR_QW1_FD_VSI_S)
+#define ICE_FXD_FLTR_QW1_SWAP_S 24
+#define ICE_FXD_FLTR_QW1_SWAP_M BIT_ULL(ICE_FXD_FLTR_QW1_SWAP_S)
+#define ICE_FXD_FLTR_QW1_SWAP_NOT_SET 0x0ULL
+#define ICE_FXD_FLTR_QW1_SWAP_SET 0x1ULL
+
+#define ICE_FXD_FLTR_QW1_FDID_PRI_S 25
+#define ICE_FXD_FLTR_QW1_FDID_PRI_M (0x7ULL << ICE_FXD_FLTR_QW1_FDID_PRI_S)
+#define ICE_FXD_FLTR_QW1_FDID_PRI_ZERO 0x0ULL
+#define ICE_FXD_FLTR_QW1_FDID_PRI_ONE 0x1ULL
+#define ICE_FXD_FLTR_QW1_FDID_PRI_THREE 0x3ULL
+
+#define ICE_FXD_FLTR_QW1_FDID_MDID_S 28
+#define ICE_FXD_FLTR_QW1_FDID_MDID_M (0xFULL << ICE_FXD_FLTR_QW1_FDID_MDID_S)
+#define ICE_FXD_FLTR_QW1_FDID_MDID_FD 0x05ULL
+
+#define ICE_FXD_FLTR_QW1_FDID_S 32
+#define ICE_FXD_FLTR_QW1_FDID_M \
+	(0xFFFFFFFFULL << ICE_FXD_FLTR_QW1_FDID_S)
+#define ICE_FXD_FLTR_QW1_FDID_ZERO 0x0ULL
+
+/* definition for FD filter programming status descriptor WB format */
+#define ICE_FXD_FLTR_WB_QW0_BUKT_LEN_S 28
+#define ICE_FXD_FLTR_WB_QW0_BUKT_LEN_M \
+	(0xFULL << ICE_FXD_FLTR_WB_QW0_BUKT_LEN_S)
+
+#define ICE_FXD_FLTR_WB_QW0_FLTR_STAT_S 32
+#define ICE_FXD_FLTR_WB_QW0_FLTR_STAT_M \
+	(0xFFFFFFFFULL << ICE_FXD_FLTR_WB_QW0_FLTR_STAT_S)
+
+#define ICE_FXD_FLTR_WB_QW1_DD_S 0
+#define ICE_FXD_FLTR_WB_QW1_DD_M (0x1ULL << ICE_FXD_FLTR_WB_QW1_DD_S)
+#define ICE_FXD_FLTR_WB_QW1_DD_YES 0x1ULL
+
+#define ICE_FXD_FLTR_WB_QW1_PROG_ID_S 1
+#define ICE_FXD_FLTR_WB_QW1_PROG_ID_M \
+	(0x3ULL << ICE_FXD_FLTR_WB_QW1_PROG_ID_S)
+#define ICE_FXD_FLTR_WB_QW1_PROG_ADD 0x0ULL
+#define ICE_FXD_FLTR_WB_QW1_PROG_DEL 0x1ULL
+
+#define ICE_FXD_FLTR_WB_QW1_FAIL_S 4
+#define ICE_FXD_FLTR_WB_QW1_FAIL_M (0x1ULL << ICE_FXD_FLTR_WB_QW1_FAIL_S)
+#define ICE_FXD_FLTR_WB_QW1_FAIL_YES 0x1ULL
+
+#define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S 5
+#define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_M \
+	(0x1ULL << ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S)
+#define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_YES 0x1ULL
+
+#define ICE_FXD_FLTR_WB_QW1_FLT_ADDR_S 8
+#define ICE_FXD_FLTR_WB_QW1_FLT_ADDR_M \
+	(0x3FFFULL << ICE_FXD_FLTR_WB_QW1_FLT_ADDR_S)
+
+#define ICE_FXD_FLTR_WB_QW1_PKT_PROF_S 28
+#define ICE_FXD_FLTR_WB_QW1_PKT_PROF_M \
+	(0x7FULL << ICE_FXD_FLTR_WB_QW1_PKT_PROF_S)
+
+#define ICE_FXD_FLTR_WB_QW1_BUKT_HASH_S 38
+#define ICE_FXD_FLTR_WB_QW1_BUKT_HASH_M \
+	(0x3FFFFFFULL << ICE_FXD_FLTR_WB_QW1_BUKT_HASH_S)
+
+enum ice_rx_desc_status_bits {
+	/* Note: These are predefined bit offsets */
+	ICE_RX_DESC_STATUS_DD_S = 0,
+	ICE_RX_DESC_STATUS_EOF_S = 1,
+	ICE_RX_DESC_STATUS_L2TAG1P_S = 2,
+	ICE_RX_DESC_STATUS_L3L4P_S = 3,
+	ICE_RX_DESC_STATUS_CRCP_S = 4,
+	ICE_RX_DESC_STATUS_TSYNINDX_S = 5,
+	ICE_RX_DESC_STATUS_TSYNVALID_S = 7,
+	ICE_RX_DESC_STATUS_EXT_UDP_0_S = 8,
+	ICE_RX_DESC_STATUS_UMBCAST_S = 9,
+	ICE_RX_DESC_STATUS_FLM_S = 11,
+
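+	/* FLTSTAT is a 2-bit field (bits 12-13 of the status/error qword)
+	 * telling whether the hi_dword of receive qword0 holds an RSS hash
+	 * or a Flow Director filter ID; see ice_rx_desc_fltstat_values.
+	 */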
ICE_RX_DESC_STATUS_FLTSTAT_S = 12, + ICE_RX_DESC_STATUS_LPBK_S = 14, + ICE_RX_DESC_STATUS_IPV6EXADD_S = 15, + ICE_RX_DESC_STATUS_RESERVED2_S = 16, + ICE_RX_DESC_STATUS_INT_UDP_0_S = 18, + ICE_RX_DESC_STATUS_LAST /* this entry must be last!!! */ +}; + +#define ICE_RXD_QW1_STATUS_S 0 +#define ICE_RXD_QW1_STATUS_M ((BIT(ICE_RX_DESC_STATUS_LAST) - 1) << \ + ICE_RXD_QW1_STATUS_S) + +#define ICE_RXD_QW1_STATUS_TSYNINDX_S ICE_RX_DESC_STATUS_TSYNINDX_S +#define ICE_RXD_QW1_STATUS_TSYNINDX_M (0x3UL << ICE_RXD_QW1_STATUS_TSYNINDX_S) + +#define ICE_RXD_QW1_STATUS_TSYNVALID_S ICE_RX_DESC_STATUS_TSYNVALID_S +#define ICE_RXD_QW1_STATUS_TSYNVALID_M BIT_ULL(ICE_RXD_QW1_STATUS_TSYNVALID_S) + + +enum ice_rx_desc_fltstat_values { + ICE_RX_DESC_FLTSTAT_NO_DATA = 0, + ICE_RX_DESC_FLTSTAT_RSV_FD_ID = 1, /* 16byte desc? FD_ID : RSV */ + ICE_RX_DESC_FLTSTAT_RSV = 2, + ICE_RX_DESC_FLTSTAT_RSS_HASH = 3, +}; + + +#define ICE_RXD_QW1_ERROR_S 19 +#define ICE_RXD_QW1_ERROR_M (0xFFUL << ICE_RXD_QW1_ERROR_S) + +enum ice_rx_desc_error_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_DESC_ERROR_RXE_S = 0, + ICE_RX_DESC_ERROR_RECIPE_S = 1, + ICE_RX_DESC_ERROR_HBO_S = 2, + ICE_RX_DESC_ERROR_L3L4E_S = 3, /* 3 BITS */ + ICE_RX_DESC_ERROR_IPE_S = 3, + ICE_RX_DESC_ERROR_L4E_S = 4, + ICE_RX_DESC_ERROR_EIPE_S = 5, + ICE_RX_DESC_ERROR_OVERSIZE_S = 6, + ICE_RX_DESC_ERROR_PPRS_S = 7 +}; + +enum ice_rx_desc_error_l3l4e_masks { + ICE_RX_DESC_ERROR_L3L4E_NONE = 0, + ICE_RX_DESC_ERROR_L3L4E_PROT = 1, +}; + +#define ICE_RXD_QW1_PTYPE_S 30 +#define ICE_RXD_QW1_PTYPE_M (0xFFULL << ICE_RXD_QW1_PTYPE_S) + +/* Packet type non-ip values */ +enum ice_rx_l2_ptype { + ICE_RX_PTYPE_L2_RESERVED = 0, + ICE_RX_PTYPE_L2_MAC_PAY2 = 1, + ICE_RX_PTYPE_L2_TIMESYNC_PAY2 = 2, + ICE_RX_PTYPE_L2_FIP_PAY2 = 3, + ICE_RX_PTYPE_L2_OUI_PAY2 = 4, + ICE_RX_PTYPE_L2_MACCNTRL_PAY2 = 5, + ICE_RX_PTYPE_L2_LLDP_PAY2 = 6, + ICE_RX_PTYPE_L2_ECP_PAY2 = 7, + ICE_RX_PTYPE_L2_EVB_PAY2 = 8, + ICE_RX_PTYPE_L2_QCN_PAY2 = 9, + ICE_RX_PTYPE_L2_EAPOL_PAY2 = 10, + ICE_RX_PTYPE_L2_ARP = 11, +}; + struct ice_rx_ptype_decoded { - u32 ptype:10; u32 known:1; u32 outer_ip:1; u32 outer_ip_ver:2; @@ -99,10 +356,74 @@ enum ice_rx_ptype_payload_layer { ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, }; -/* Rx Flex Descriptor - * This descriptor is used instead of the legacy version descriptor when + +#define ICE_RXD_QW1_LEN_PBUF_S 38 +#define ICE_RXD_QW1_LEN_PBUF_M (0x3FFFULL << ICE_RXD_QW1_LEN_PBUF_S) + +#define ICE_RXD_QW1_LEN_HBUF_S 52 +#define ICE_RXD_QW1_LEN_HBUF_M (0x7FFULL << ICE_RXD_QW1_LEN_HBUF_S) + +#define ICE_RXD_QW1_LEN_SPH_S 63 +#define ICE_RXD_QW1_LEN_SPH_M BIT_ULL(ICE_RXD_QW1_LEN_SPH_S) + + +enum ice_rx_desc_ext_status_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_DESC_EXT_STATUS_L2TAG2P_S = 0, + ICE_RX_DESC_EXT_STATUS_L2TAG3P_S = 1, + ICE_RX_DESC_EXT_STATUS_FLEXBL_S = 2, + ICE_RX_DESC_EXT_STATUS_FLEXBH_S = 4, + ICE_RX_DESC_EXT_STATUS_FDLONGB_S = 9, + ICE_RX_DESC_EXT_STATUS_PELONGB_S = 11, +}; + + +enum ice_rx_desc_pe_status_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_DESC_PE_STATUS_QPID_S = 0, /* 18 BITS */ + ICE_RX_DESC_PE_STATUS_L4PORT_S = 0, /* 16 BITS */ + ICE_RX_DESC_PE_STATUS_IPINDEX_S = 16, /* 8 BITS */ + ICE_RX_DESC_PE_STATUS_QPIDHIT_S = 24, + ICE_RX_DESC_PE_STATUS_APBVTHIT_S = 25, + ICE_RX_DESC_PE_STATUS_PORTV_S = 26, + ICE_RX_DESC_PE_STATUS_URG_S = 27, + ICE_RX_DESC_PE_STATUS_IPFRAG_S = 28, + ICE_RX_DESC_PE_STATUS_IPOPT_S = 29 +}; + +#define ICE_RX_PROG_STATUS_DESC_LEN_S 38 +#define ICE_RX_PROG_STATUS_DESC_LEN 0x2000000 + 
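For orientation, the shift/mask pairs above are meant to be OR-ed into the two quad words of struct ice_fltr_desc. The following is a minimal illustrative sketch, not part of the patch itself; the helper name and the qindex/vsi_num/prof parameters are hypothetical:

static inline void
ice_fdir_fill_desc_sketch(struct ice_fltr_desc *fdesc, u16 qindex,
			  u16 vsi_num, u8 prof)
{
	u64 qw0 = 0, qw1 = 0;

	/* QW0: steer matching packets to qindex and request a SW-visible
	 * completion report on failure
	 */
	qw0 |= ((u64)qindex << ICE_FXD_FLTR_QW0_QINDEX_S) &
	       ICE_FXD_FLTR_QW0_QINDEX_M;
	qw0 |= (ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL <<
		ICE_FXD_FLTR_QW0_COMP_REPORT_S) &
	       ICE_FXD_FLTR_QW0_COMP_REPORT_M;

	/* QW1: descriptor type 0x8 (ICE_TX_DESC_DTYPE_FLTR_PROG, defined
	 * later in this header), add-filter command, owning VSI and profile
	 */
	qw1 |= (0x8ULL << ICE_FXD_FLTR_QW1_DTYPE_S) & ICE_FXD_FLTR_QW1_DTYPE_M;
	qw1 |= (ICE_FXD_FLTR_QW1_PCMD_ADD << ICE_FXD_FLTR_QW1_PCMD_S) &
	       ICE_FXD_FLTR_QW1_PCMD_M;
	qw1 |= ((u64)vsi_num << ICE_FXD_FLTR_QW1_FD_VSI_S) &
	       ICE_FXD_FLTR_QW1_FD_VSI_M;
	qw1 |= ((u64)prof << ICE_FXD_FLTR_QW1_PROF_S) & ICE_FXD_FLTR_QW1_PROF_M;

	fdesc->qidx_compq_space_stat = cpu_to_le64(qw0);
	fdesc->dtype_cmd_vsi_fdid = cpu_to_le64(qw1);
}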
+#define ICE_RX_PROG_STATUS_DESC_QW1_PROGID_S 2 +#define ICE_RX_PROG_STATUS_DESC_QW1_PROGID_M \ + (0x7UL << ICE_RX_PROG_STATUS_DESC_QW1_PROGID_S) + + +#define ICE_RX_PROG_STATUS_DESC_QW1_ERROR_S 19 +#define ICE_RX_PROG_STATUS_DESC_QW1_ERROR_M \ + (0x3FUL << ICE_RX_PROG_STATUS_DESC_QW1_ERROR_S) + +enum ice_rx_prog_status_desc_status_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_PROG_STATUS_DESC_DD_S = 0, + ICE_RX_PROG_STATUS_DESC_PROG_ID_S = 2 /* 3 BITS */ +}; + +enum ice_rx_prog_status_desc_prog_id_masks { + ICE_RX_PROG_STATUS_DESC_FD_FLTR_STATUS = 1, +}; + +enum ice_rx_prog_status_desc_error_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_PROG_STATUS_DESC_FD_TBL_FULL_S = 0, + ICE_RX_PROG_STATUS_DESC_NO_FD_ENTRY_S = 1, +}; + +/* Rx Flex Descriptors + * These descriptors are used instead of the legacy version descriptors when * ice_rlan_ctx.adv_desc is set */ + union ice_32b_rx_flex_desc { struct { __le64 pkt_addr; /* Packet buffer address */ @@ -148,8 +469,12 @@ union ice_32b_rx_flex_desc { }; /* Rx Flex Descriptor NIC Profile - * This descriptor corresponds to RxDID 2 which contains - * metadata fields for RSS, flow ID and timestamp info + * RxDID Profile ID 2 + * Flex-field 0: RSS hash lower 16-bits + * Flex-field 1: RSS hash upper 16-bits + * Flex-field 2: Flow ID lower 16-bits + * Flex-field 3: Flow ID higher 16-bits + * Flex-field 4: reserved, VLAN ID taken from L2Tag */ struct ice_32b_rx_flex_desc_nic { /* Qword 0 */ @@ -175,13 +500,149 @@ struct ice_32b_rx_flex_desc_nic { __le32 flow_id; union { struct { - __le16 vlan_id; + __le16 rsvd; __le16 flow_id_ipv6; } flex; __le32 ts_high; } flex_ts; }; +/* Rx Flex Descriptor Switch Profile + * RxDID Profile ID 3 + * Flex-field 0: Source VSI + */ +struct ice_32b_rx_flex_desc_sw { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 src_vsi; /* [10:15] are reserved */ + __le16 flex_md1_rsvd; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC VEB Profile + * RxDID Profile ID 4 + * Flex-field 0: Destination VSI + */ +struct ice_32b_rx_flex_desc_nic_veb_dbg { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 dst_vsi; /* [0:12]: destination VSI */ + /* 13: VSI valid bit */ + /* [14:15] are reserved */ + __le16 flex_field_1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC ACL Profile + * RxDID Profile ID 5 + * Flex-field 0: ACL Counter 0 + * Flex-field 1: ACL Counter 1 + * Flex-field 2: ACL Counter 2 + */ +struct ice_32b_rx_flex_desc_nic_acl_dbg { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 acl_ctr0; + __le16 acl_ctr1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 acl_ctr2; + __le16 rsvd; /* flex words 2-3 are reserved 
 */
+	__le32 ts_high;
+};
+
+/* Rx Flex Descriptor NIC Profile
+ * RxDID Profile ID 6
+ * Flex-field 0: RSS hash lower 16-bits
+ * Flex-field 1: RSS hash upper 16-bits
+ * Flex-field 2: Flow ID lower 16-bits
+ * Flex-field 3: Source VSI
+ * Flex-field 4: reserved, VLAN ID taken from L2Tag
+ */
+struct ice_32b_rx_flex_desc_nic_2 {
+	/* Qword 0 */
+	u8 rxdid;
+	u8 mir_id_umb_cast;
+	__le16 ptype_flexi_flags0;
+	__le16 pkt_len;
+	__le16 hdr_len_sph_flex_flags1;
+
+	/* Qword 1 */
+	__le16 status_error0;
+	__le16 l2tag1;
+	__le32 rss_hash;
+
+	/* Qword 2 */
+	__le16 status_error1;
+	u8 flexi_flags2;
+	u8 ts_low;
+	__le16 l2tag2_1st;
+	__le16 l2tag2_2nd;
+
+	/* Qword 3 */
+	__le16 flow_id;
+	__le16 src_vsi;
+	union {
+		struct {
+			__le16 rsvd;
+			__le16 flow_id_ipv6;
+		} flex;
+		__le32 ts_high;
+	} flex_ts;
+};
+
 /* Receive Flex Descriptor profile IDs: There are a total
  * of 64 profiles where profile IDs 0/1 are for legacy; and
  * profiles 2-63 are flex profiles that can be programmed
@@ -191,27 +652,72 @@ enum ice_rxdid {
 	ICE_RXDID_LEGACY_0 = 0,
 	ICE_RXDID_LEGACY_1 = 1,
 	ICE_RXDID_FLEX_NIC = 2,
+	ICE_RXDID_FLEX_NIC_VEB_DBG = 4,
+	ICE_RXDID_FLEX_NIC_ACL_DBG = 5,
 	ICE_RXDID_FLEX_NIC_2 = 6,
 	ICE_RXDID_HW = 7,
 	ICE_RXDID_LAST = 63,
 };
+/* Receive Flex descriptor Dword Index */
+enum ice_flex_word {
+	ICE_RX_FLEX_DWORD_0 = 0,
+	ICE_RX_FLEX_DWORD_1,
+	ICE_RX_FLEX_DWORD_2,
+	ICE_RX_FLEX_DWORD_3,
+	ICE_RX_FLEX_DWORD_4,
+	ICE_RX_FLEX_DWORD_5
+};
+
 /* Receive Flex Descriptor Rx opcode values */
-#define ICE_RX_OPC_MDID 0x01
+enum ice_flex_opcode {
+	ICE_RX_OPC_DEBUG = 0,
+	ICE_RX_OPC_MDID,
+	ICE_RX_OPC_EXTRACT,
+	ICE_RX_OPC_PROTID
+};
+
+/* Receive Descriptor MDID values that access packet flags */
+enum ice_flex_mdid_pkt_flags {
+	ICE_RX_MDID_PKT_FLAGS_15_0 = 20,
+	ICE_RX_MDID_PKT_FLAGS_31_16,
+	ICE_RX_MDID_PKT_FLAGS_47_32,
+	ICE_RX_MDID_PKT_FLAGS_63_48,
+};
-/* Receive Descriptor MDID values */
-enum ice_flex_rx_mdid {
-	ICE_RX_MDID_FLOW_ID_LOWER = 5,
-	ICE_RX_MDID_FLOW_ID_HIGH,
-	ICE_RX_MDID_SRC_VSI = 19,
-	ICE_RX_MDID_HASH_LOW = 56,
-	ICE_RX_MDID_HASH_HIGH,
+/* Generic descriptor MDID values */
+enum ice_flex_mdid {
+	ICE_MDID_GENERIC_WORD_0,
+	ICE_MDID_GENERIC_WORD_1,
+	ICE_MDID_GENERIC_WORD_2,
+	ICE_MDID_GENERIC_WORD_3,
+	ICE_MDID_GENERIC_WORD_4,
+	ICE_MDID_FLOW_ID_LOWER,
+	ICE_MDID_FLOW_ID_HIGH,
+	ICE_MDID_RX_DESCR_PROF_IDX,
+	ICE_MDID_RX_PKT_DROP,
+	ICE_MDID_RX_DST_Q = 12,
+	ICE_MDID_RX_DST_VSI,
+	ICE_MDID_SRC_VSI = 19,
+	ICE_MDID_ACL_NOP = 55,
+	/* Entry 56 */
+	ICE_MDID_RX_HASH_LOW,
+	ICE_MDID_ACL_CNTR_PKT = ICE_MDID_RX_HASH_LOW,
+	/* Entry 57 */
+	ICE_MDID_RX_HASH_HIGH,
+	ICE_MDID_ACL_CNTR_BYTES = ICE_MDID_RX_HASH_HIGH,
+	ICE_MDID_ACL_CNTR_PKT_BYTES
 };
+/* for ice_32byte_rx_flex_desc.mir_id_umb_cast member */
+#define ICE_RX_FLEX_DESC_MIRROR_M (0x3F) /* 6-bits */
+
 /* Rx/Tx Flag64 packet flag bits */
 enum ice_flg64_bits {
 	ICE_FLG_PKT_DSI = 0,
-	ICE_FLG_EVLAN_x8100 = 15,
+	/* A 1 in this bit position means the packet is an Rx packet */
+	ICE_FLG_PKT_DIR = 4,
+	ICE_FLG_EVLAN_x8100 = 14,
 	ICE_FLG_EVLAN_x9100,
 	ICE_FLG_VLAN_x8100,
 	ICE_FLG_TNL_MAC = 22,
@@ -227,12 +733,50 @@ enum ice_flg64_bits {
 	ICE_FLG_RSVD = 63
 };
+enum ice_rx_flex_desc_umb_cast_bits { /* field is 2 bits long */
+	ICE_RX_FLEX_DESC_UMB_CAST_S = 6,
+	ICE_RX_FLEX_DESC_UMB_CAST_LAST /* this entry must be last!!!
*/ +}; + +enum ice_umbcast_dest_addr_types { + ICE_DEST_UNICAST = 0, + ICE_DEST_MULTICAST, + ICE_DEST_BROADCAST, + ICE_DEST_MIRRORED, +}; + /* for ice_32byte_rx_flex_desc.ptype_flexi_flags0 member */ #define ICE_RX_FLEX_DESC_PTYPE_M (0x3FF) /* 10-bits */ +enum ice_rx_flex_desc_flexi_flags0_bits { /* field is 6 bits long */ + ICE_RX_FLEX_DESC_FLEXI_FLAGS0_S = 10, + ICE_RX_FLEX_DESC_FLEXI_FLAGS0_LAST /* this entry must be last!!! */ +}; + /* for ice_32byte_rx_flex_desc.pkt_length member */ #define ICE_RX_FLX_DESC_PKT_LEN_M (0x3FFF) /* 14-bits */ +/* for ice_32byte_rx_flex_desc.header_length_sph_flexi_flags1 member */ +#define ICE_RX_FLEX_DESC_HEADER_LEN_M (0x7FF) /* 11-bits */ + +enum ice_rx_flex_desc_sph_bits { /* field is 1 bit long */ + ICE_RX_FLEX_DESC_SPH_S = 11, + ICE_RX_FLEX_DESC_SPH_LAST /* this entry must be last!!! */ +}; + +enum ice_rx_flex_desc_flexi_flags1_bits { /* field is 4 bits long */ + ICE_RX_FLEX_DESC_FLEXI_FLAGS1_S = 12, + ICE_RX_FLEX_DESC_FLEXI_FLAGS1_LAST /* this entry must be last!!! */ +}; + +enum ice_rx_flex_desc_ext_status_bits { /* field is 4 bits long */ + ICE_RX_FLEX_DESC_EXT_STATUS_EXT_UDP_S = 12, + ICE_RX_FLEX_DESC_EXT_STATUS_INT_UDP_S = 13, + ICE_RX_FLEX_DESC_EXT_STATUS_RECIPE_S = 14, + ICE_RX_FLEX_DESC_EXT_STATUS_OVERSIZE_S = 15, + ICE_RX_FLEX_DESC_EXT_STATUS_LAST /* entry must be last!!! */ +}; + enum ice_rx_flex_desc_status_error_0_bits { /* Note: These are predefined bit offsets */ ICE_RX_FLEX_DESC_STATUS0_DD_S = 0, @@ -254,6 +798,29 @@ enum ice_rx_flex_desc_status_error_0_bits { ICE_RX_FLEX_DESC_STATUS0_LAST /* this entry must be last!!! */ }; +enum ice_rx_flex_desc_status_error_1_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_FLEX_DESC_STATUS1_CPM_S = 0, /* 4 bits */ + ICE_RX_FLEX_DESC_STATUS1_NAT_S = 4, + ICE_RX_FLEX_DESC_STATUS1_CRYPTO_S = 5, + /* [10:6] reserved */ + ICE_RX_FLEX_DESC_STATUS1_L2TAG2P_S = 11, + ICE_RX_FLEX_DESC_STATUS1_XTRMD2_VALID_S = 12, + ICE_RX_FLEX_DESC_STATUS1_XTRMD3_VALID_S = 13, + ICE_RX_FLEX_DESC_STATUS1_XTRMD4_VALID_S = 14, + ICE_RX_FLEX_DESC_STATUS1_XTRMD5_VALID_S = 15, + ICE_RX_FLEX_DESC_STATUS1_LAST /* this entry must be last!!! 
*/ +}; + +enum ice_rx_flex_desc_exstat_bits { + /* Note: These are predefined bit offsets */ + ICE_RX_FLEX_DESC_EXSTAT_EXTUDP_S = 0, + ICE_RX_FLEX_DESC_EXSTAT_INTUDP_S = 1, + ICE_RX_FLEX_DESC_EXSTAT_RECIPE_S = 2, + ICE_RX_FLEX_DESC_EXSTAT_OVERSIZE_S = 3, +}; + + #define ICE_RXQ_CTX_SIZE_DWORDS 8 #define ICE_RXQ_CTX_SZ (ICE_RXQ_CTX_SIZE_DWORDS * sizeof(u32)) #define ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS 22 @@ -302,7 +869,7 @@ struct ice_ctx_ele { #define ICE_CTX_STORE(_struct, _ele, _width, _lsb) { \ .offset = offsetof(struct _struct, _ele), \ - .size_of = FIELD_SIZEOF(struct _struct, _ele), \ + .size_of = sizeof_field(struct _struct, _ele), \ .width = _width, \ .lsb = _lsb, \ } @@ -329,9 +896,15 @@ struct ice_tx_desc { __le64 cmd_type_offset_bsz; }; +#define ICE_TXD_QW1_DTYPE_S 0 +#define ICE_TXD_QW1_DTYPE_M (0xFUL << ICE_TXD_QW1_DTYPE_S) + enum ice_tx_desc_dtype_value { ICE_TX_DESC_DTYPE_DATA = 0x0, ICE_TX_DESC_DTYPE_CTX = 0x1, + ICE_TX_DESC_DTYPE_IPSEC = 0x3, + ICE_TX_DESC_DTYPE_FLTR_PROG = 0x8, + ICE_TX_DESC_DTYPE_HLP_META = 0x9, /* DESC_DONE - HW has completed write-back of descriptor */ ICE_TX_DESC_DTYPE_DESC_DONE = 0xF, }; @@ -342,13 +915,20 @@ enum ice_tx_desc_dtype_value { enum ice_tx_desc_cmd_bits { ICE_TX_DESC_CMD_EOP = 0x0001, ICE_TX_DESC_CMD_RS = 0x0002, + ICE_TX_DESC_CMD_RSVD = 0x0004, ICE_TX_DESC_CMD_IL2TAG1 = 0x0008, + ICE_TX_DESC_CMD_DUMMY = 0x0010, + ICE_TX_DESC_CMD_IIPT_NONIP = 0x0000, ICE_TX_DESC_CMD_IIPT_IPV6 = 0x0020, ICE_TX_DESC_CMD_IIPT_IPV4 = 0x0040, ICE_TX_DESC_CMD_IIPT_IPV4_CSUM = 0x0060, + ICE_TX_DESC_CMD_RSVD2 = 0x0080, + ICE_TX_DESC_CMD_L4T_EOFT_UNK = 0x0000, ICE_TX_DESC_CMD_L4T_EOFT_TCP = 0x0100, ICE_TX_DESC_CMD_L4T_EOFT_SCTP = 0x0200, ICE_TX_DESC_CMD_L4T_EOFT_UDP = 0x0300, + ICE_TX_DESC_CMD_RE = 0x0400, + ICE_TX_DESC_CMD_RSVD3 = 0x0800, }; #define ICE_TXD_QW1_OFFSET_S 16 @@ -374,7 +954,10 @@ enum ice_tx_desc_len_fields { ICE_TX_DESC_LEN_L4_LEN_S) * ICE_BYTES_PER_DWORD) #define ICE_TXD_QW1_TX_BUF_SZ_S 34 +#define ICE_TXD_QW1_TX_BUF_SZ_M (0x3FFFULL << ICE_TXD_QW1_TX_BUF_SZ_S) + #define ICE_TXD_QW1_L2TAG1_S 48 +#define ICE_TXD_QW1_L2TAG1_M (0xFFFFULL << ICE_TXD_QW1_L2TAG1_S) /* Context descriptors */ struct ice_tx_ctx_desc { @@ -384,14 +967,29 @@ struct ice_tx_ctx_desc { __le64 qw1; }; +#define ICE_TXD_CTX_QW1_DTYPE_S 0 +#define ICE_TXD_CTX_QW1_DTYPE_M (0xFUL << ICE_TXD_CTX_QW1_DTYPE_S) + #define ICE_TXD_CTX_QW1_CMD_S 4 #define ICE_TXD_CTX_QW1_CMD_M (0x7FUL << ICE_TXD_CTX_QW1_CMD_S) +#define ICE_TXD_CTX_QW1_IPSEC_S 11 +#define ICE_TXD_CTX_QW1_IPSEC_M (0x7FUL << ICE_TXD_CTX_QW1_IPSEC_S) + #define ICE_TXD_CTX_QW1_TSO_LEN_S 30 #define ICE_TXD_CTX_QW1_TSO_LEN_M \ (0x3FFFFULL << ICE_TXD_CTX_QW1_TSO_LEN_S) +#define ICE_TXD_CTX_QW1_TSYN_S ICE_TXD_CTX_QW1_TSO_LEN_S +#define ICE_TXD_CTX_QW1_TSYN_M ICE_TXD_CTX_QW1_TSO_LEN_M + #define ICE_TXD_CTX_QW1_MSS_S 50 +#define ICE_TXD_CTX_QW1_MSS_M (0x3FFFULL << ICE_TXD_CTX_QW1_MSS_S) +#define ICE_TXD_CTX_MIN_MSS 64 +#define ICE_TXD_CTX_MAX_MSS 9668 + +#define ICE_TXD_CTX_QW1_VSI_S 50 +#define ICE_TXD_CTX_QW1_VSI_M (0x3FFULL << ICE_TXD_CTX_QW1_VSI_S) enum ice_tx_ctx_desc_cmd_bits { ICE_TX_CTX_DESC_TSO = 0x01, @@ -405,6 +1003,40 @@ enum ice_tx_ctx_desc_cmd_bits { ICE_TX_CTX_DESC_RESERVED = 0x40 }; +enum ice_tx_ctx_desc_eipt_offload { + ICE_TX_CTX_EIPT_NONE = 0x0, + ICE_TX_CTX_EIPT_IPV6 = 0x1, + ICE_TX_CTX_EIPT_IPV4_NO_CSUM = 0x2, + ICE_TX_CTX_EIPT_IPV4 = 0x3 +}; + +#define ICE_TXD_CTX_QW0_EIPT_S 0 +#define ICE_TXD_CTX_QW0_EIPT_M (0x3ULL << ICE_TXD_CTX_QW0_EIPT_S) + +#define ICE_TXD_CTX_QW0_EIPLEN_S 2 +#define ICE_TXD_CTX_QW0_EIPLEN_M 
(0x7FUL << ICE_TXD_CTX_QW0_EIPLEN_S) + +#define ICE_TXD_CTX_QW0_L4TUNT_S 9 +#define ICE_TXD_CTX_QW0_L4TUNT_M (0x3ULL << ICE_TXD_CTX_QW0_L4TUNT_S) + +#define ICE_TXD_CTX_UDP_TUNNELING BIT_ULL(ICE_TXD_CTX_QW0_L4TUNT_S) +#define ICE_TXD_CTX_GRE_TUNNELING (0x2ULL << ICE_TXD_CTX_QW0_L4TUNT_S) + +#define ICE_TXD_CTX_QW0_EIP_NOINC_S 11 +#define ICE_TXD_CTX_QW0_EIP_NOINC_M BIT_ULL(ICE_TXD_CTX_QW0_EIP_NOINC_S) + +#define ICE_TXD_CTX_EIP_NOINC_IPID_CONST ICE_TXD_CTX_QW0_EIP_NOINC_M + +#define ICE_TXD_CTX_QW0_NATLEN_S 12 +#define ICE_TXD_CTX_QW0_NATLEN_M (0X7FULL << ICE_TXD_CTX_QW0_NATLEN_S) + +#define ICE_TXD_CTX_QW0_DECTTL_S 19 +#define ICE_TXD_CTX_QW0_DECTTL_M (0xFULL << ICE_TXD_CTX_QW0_DECTTL_S) + +#define ICE_TXD_CTX_QW0_L4T_CS_S 23 +#define ICE_TXD_CTX_QW0_L4T_CS_M BIT_ULL(ICE_TXD_CTX_QW0_L4T_CS_S) + + #define ICE_LAN_TXQ_MAX_QGRPS 127 #define ICE_LAN_TXQ_MAX_QDIS 1023 @@ -447,12 +1079,92 @@ struct ice_tlan_ctx { u8 drop_ena; u8 cache_prof_idx; u8 pkt_shaper_prof_idx; - u8 int_q_state; /* width not needed - internal do not write */ + u8 int_q_state; /* width not needed - internal - DO NOT WRITE!!! */ }; -/* macro to make the table lines short */ +/* LAN Tx Completion Queue data */ +struct ice_tx_cmpltnq { + u16 txq_id; + u8 generation; + u16 tx_head; + u8 cmpl_type; +} __packed; + + +/* LAN Tx Completion Queue Context */ +struct ice_tx_cmpltnq_ctx { + u64 base; +#define ICE_TX_CMPLTNQ_CTX_BASE_S 7 + u32 q_len; +#define ICE_TX_CMPLTNQ_CTX_Q_LEN_S 4 + u8 generation; + u32 wrt_ptr; + u8 pf_num; + u16 vmvf_num; + u8 vmvf_type; +#define ICE_TX_CMPLTNQ_CTX_VMVF_TYPE_VF 0 +#define ICE_TX_CMPLTNQ_CTX_VMVF_TYPE_VMQ 1 +#define ICE_TX_CMPLTNQ_CTX_VMVF_TYPE_PF 2 + u8 tph_desc_wr; + u8 cpuid; + u32 cmpltn_cache[16]; +} __packed; + +/* LAN Tx Doorbell Descriptor Format */ +struct ice_tx_drbell_fmt { + u16 txq_id; + u8 dd; + u8 rs; + u32 db; +}; + + +/* LAN Tx Doorbell Queue Context */ +struct ice_tx_drbell_q_ctx { + u64 base; +#define ICE_TX_DRBELL_Q_CTX_BASE_S 7 + u16 ring_len; +#define ICE_TX_DRBELL_Q_CTX_RING_LEN_S 4 + u8 pf_num; + u16 vf_num; + u8 vmvf_type; +#define ICE_TX_DRBELL_Q_CTX_VMVF_TYPE_VF 0 +#define ICE_TX_DRBELL_Q_CTX_VMVF_TYPE_VMQ 1 +#define ICE_TX_DRBELL_Q_CTX_VMVF_TYPE_PF 2 + u8 cpuid; + u8 tph_desc_rd; + u8 tph_desc_wr; + u8 db_q_en; + u16 rd_head; + u16 rd_tail; +} __packed; + +/* The ice_ptype_lkup table is used to convert from the 10-bit ptype in the + * hardware to a bit-field that can be used by SW to more easily determine the + * packet type. + * + * Macros are used to shorten the table lines and make this table human + * readable. + * + * We store the PTYPE in the top byte of the bit field - this is just so that + * we can check that the table doesn't have a row missing, as the index into + * the table should be the PTYPE. 
+ * + * Typical work flow: + * + * IF NOT ice_ptype_lkup[ptype].known + * THEN + * Packet is unknown + * ELSE IF ice_ptype_lkup[ptype].outer_ip == ICE_RX_PTYPE_OUTER_IP + * Use the rest of the fields to look at the tunnels, inner protocols, etc + * ELSE + * Use the enum ice_rx_l2_ptype to decode the packet type + * ENDIF + */ + +/* macro to make the table lines short, use explicit indexing with [PTYPE] */ #define ICE_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ - { PTYPE, \ + [PTYPE] = { \ 1, \ ICE_RX_PTYPE_OUTER_##OUTER_IP, \ ICE_RX_PTYPE_OUTER_##OUTER_IP_VER, \ @@ -463,17 +1175,220 @@ struct ice_tlan_ctx { ICE_RX_PTYPE_INNER_PROT_##I, \ ICE_RX_PTYPE_PAYLOAD_LAYER_##PL } -#define ICE_PTT_UNUSED_ENTRY(PTYPE) { PTYPE, 0, 0, 0, 0, 0, 0, 0, 0, 0 } +#define ICE_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* shorter macros makes the table fit but are terse */ #define ICE_RX_PTYPE_NOF ICE_RX_PTYPE_NOT_FRAG +#define ICE_RX_PTYPE_FRG ICE_RX_PTYPE_FRAG +#define ICE_RX_PTYPE_INNER_PROT_TS ICE_RX_PTYPE_INNER_PROT_TIMESYNC -/* Lookup table mapping the HW PTYPE to the bit field for decoding */ -static const struct ice_rx_ptype_decoded ice_ptype_lkup[] = { +/* Lookup table mapping the 10-bit HW PTYPE to the bit field for decoding */ +static const struct ice_rx_ptype_decoded ice_ptype_lkup[1024] = { /* L2 Packet types */ ICE_PTT_UNUSED_ENTRY(0), ICE_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - ICE_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT_UNUSED_ENTRY(2), + ICE_PTT_UNUSED_ENTRY(3), + ICE_PTT_UNUSED_ENTRY(4), + ICE_PTT_UNUSED_ENTRY(5), + ICE_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT_UNUSED_ENTRY(8), + ICE_PTT_UNUSED_ENTRY(9), + ICE_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + ICE_PTT_UNUSED_ENTRY(12), + ICE_PTT_UNUSED_ENTRY(13), + ICE_PTT_UNUSED_ENTRY(14), + ICE_PTT_UNUSED_ENTRY(15), + ICE_PTT_UNUSED_ENTRY(16), + ICE_PTT_UNUSED_ENTRY(17), + ICE_PTT_UNUSED_ENTRY(18), + ICE_PTT_UNUSED_ENTRY(19), + ICE_PTT_UNUSED_ENTRY(20), + ICE_PTT_UNUSED_ENTRY(21), + + /* Non Tunneled IPv4 */ + ICE_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), + ICE_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), + ICE_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(25), + ICE_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), + ICE_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), + ICE_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv4 --> IPv4 */ + ICE_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + ICE_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + ICE_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(32), + ICE_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + ICE_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + ICE_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> IPv6 */ + ICE_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + ICE_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + ICE_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(39), + ICE_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + ICE_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + ICE_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT */ + ICE_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> IPv4 */ + ICE_PTT(44, 
IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + ICE_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + ICE_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(47), + ICE_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + ICE_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + ICE_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> IPv6 */ + ICE_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + ICE_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + ICE_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(54), + ICE_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + ICE_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + ICE_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC */ + ICE_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ + ICE_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + ICE_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + ICE_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(62), + ICE_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + ICE_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + ICE_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ + ICE_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + ICE_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + ICE_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(69), + ICE_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + ICE_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + ICE_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC/VLAN */ + ICE_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ + ICE_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + ICE_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + ICE_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(77), + ICE_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + ICE_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + ICE_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ + ICE_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + ICE_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + ICE_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(84), + ICE_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + ICE_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + ICE_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* Non Tunneled IPv6 */ + ICE_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), + ICE_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), + ICE_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(91), + ICE_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), + ICE_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), + ICE_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv6 --> IPv4 */ + ICE_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + ICE_PTT(96, IP, IPV6, 
NOF, IP_IP, IPV4, NOF, NONE, PAY3), + ICE_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(98), + ICE_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + ICE_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + ICE_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> IPv6 */ + ICE_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + ICE_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + ICE_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(105), + ICE_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + ICE_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + ICE_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT */ + ICE_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> IPv4 */ + ICE_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + ICE_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + ICE_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(113), + ICE_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + ICE_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + ICE_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> IPv6 */ + ICE_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + ICE_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + ICE_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(120), + ICE_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + ICE_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + ICE_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC */ + ICE_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ + ICE_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + ICE_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + ICE_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(128), + ICE_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + ICE_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + ICE_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ + ICE_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + ICE_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + ICE_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(135), + ICE_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + ICE_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + ICE_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN */ + ICE_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ + ICE_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + ICE_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + ICE_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(143), + ICE_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + ICE_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + ICE_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ + ICE_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + ICE_PTT(148, 
IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + ICE_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + ICE_PTT_UNUSED_ENTRY(150), + ICE_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + ICE_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + ICE_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + [154 ... 1023] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } }; static inline struct ice_rx_ptype_decoded ice_decode_rx_desc_ptype(u16 ptype) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index cc755382df256ad47d73abd5990b8ec318bdcf62..820dcab6d7745467eeab38eaefd3f3cde549a1ba 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1,246 +1,64 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice.h" +#include "ice_base.h" #include "ice_lib.h" +#include "ice_fltr.h" #include "ice_dcb_lib.h" +#include "ice_devlink.h" +#include "ice_vsi_vlan_ops.h" /** - * ice_setup_rx_ctx - Configure a receive ring context - * @ring: The Rx ring to configure - * - * Configure the Rx descriptor ring in RLAN context. - */ -static int ice_setup_rx_ctx(struct ice_ring *ring) -{ - struct ice_vsi *vsi = ring->vsi; - struct ice_hw *hw = &vsi->back->hw; - u32 rxdid = ICE_RXDID_FLEX_NIC; - struct ice_rlan_ctx rlan_ctx; - u32 regval; - u16 pf_q; - int err; - - /* what is Rx queue number in global space of 2K Rx queues */ - pf_q = vsi->rxq_map[ring->q_index]; - - /* clear the context structure first */ - memset(&rlan_ctx, 0, sizeof(rlan_ctx)); - - rlan_ctx.base = ring->dma >> 7; - - rlan_ctx.qlen = ring->count; - - /* Receive Packet Data Buffer Size. - * The Packet Data Buffer Size is defined in 128 byte units. - */ - rlan_ctx.dbuf = vsi->rx_buf_len >> ICE_RLAN_CTX_DBUF_S; - - /* use 32 byte descriptors */ - rlan_ctx.dsize = 1; - - /* Strip the Ethernet CRC bytes before the packet is posted to host - * memory. - */ - rlan_ctx.crcstrip = 1; - - /* L2TSEL flag defines the reported L2 Tags in the receive descriptor */ - rlan_ctx.l2tsel = 1; - - rlan_ctx.dtype = ICE_RX_DTYPE_NO_SPLIT; - rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_NO_SPLIT; - rlan_ctx.hsplit_1 = ICE_RLAN_RX_HSPLIT_1_NO_SPLIT; - - /* This controls whether VLAN is stripped from inner headers - * The VLAN in the inner L2 header is stripped to the receive - * descriptor if enabled by this flag. 
- */ - rlan_ctx.showiv = 0; - - /* Max packet size for this queue - must not be set to a larger value - * than 5 x DBUF - */ - rlan_ctx.rxmax = min_t(u16, vsi->max_frame, - ICE_MAX_CHAINED_RX_BUFS * vsi->rx_buf_len); - - /* Rx queue threshold in units of 64 */ - rlan_ctx.lrxqthresh = 1; - - /* Enable Flexible Descriptors in the queue context which - * allows this driver to select a specific receive descriptor format - */ - if (vsi->type != ICE_VSI_VF) { - regval = rd32(hw, QRXFLXP_CNTXT(pf_q)); - regval |= (rxdid << QRXFLXP_CNTXT_RXDID_IDX_S) & - QRXFLXP_CNTXT_RXDID_IDX_M; - - /* increasing context priority to pick up profile ID; - * default is 0x01; setting to 0x03 to ensure profile - * is programming if prev context is of same priority - */ - regval |= (0x03 << QRXFLXP_CNTXT_RXDID_PRIO_S) & - QRXFLXP_CNTXT_RXDID_PRIO_M; - - wr32(hw, QRXFLXP_CNTXT(pf_q), regval); - } - - /* Absolute queue number out of 2K needs to be passed */ - err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q); - if (err) { - dev_err(&vsi->back->pdev->dev, - "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", - pf_q, err); - return -EIO; - } - - if (vsi->type == ICE_VSI_VF) - return 0; - - /* init queue specific tail register */ - ring->tail = hw->hw_addr + QRX_TAIL(pf_q); - writel(0, ring->tail); - ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring)); - - return 0; -} - -/** - * ice_setup_tx_ctx - setup a struct ice_tlan_ctx instance - * @ring: The Tx ring to configure - * @tlan_ctx: Pointer to the Tx LAN queue context structure to be initialized - * @pf_q: queue index in the PF space - * - * Configure the Tx descriptor ring in TLAN context. + * ice_vsi_type_str - maps VSI type enum to string equivalents + * @vsi_type: VSI type enum */ -static void -ice_setup_tx_ctx(struct ice_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q) +const char *ice_vsi_type_str(enum ice_vsi_type vsi_type) { - struct ice_vsi *vsi = ring->vsi; - struct ice_hw *hw = &vsi->back->hw; - - tlan_ctx->base = ring->dma >> ICE_TLAN_CTX_BASE_S; - - tlan_ctx->port_num = vsi->port_info->lport; - - /* Transmit Queue Length */ - tlan_ctx->qlen = ring->count; - - ice_set_cgd_num(tlan_ctx, ring); - - /* PF number */ - tlan_ctx->pf_num = hw->pf_id; - - /* queue belongs to a specific VSI type - * VF / VM index should be programmed per vmvf_type setting: - * for vmvf_type = VF, it is VF number between 0-256 - * for vmvf_type = VM, it is VM number between 0-767 - * for PF or EMP this field should be set to zero - */ - switch (vsi->type) { - case ICE_VSI_LB: - /* fall through */ + switch (vsi_type) { case ICE_VSI_PF: - tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_PF; - break; + return "ICE_VSI_PF"; case ICE_VSI_VF: - /* Firmware expects vmvf_num to be absolute VF ID */ - tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf_id; - tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF; - break; + return "ICE_VSI_VF"; + case ICE_VSI_VMDQ2: + return "ICE_VSI_VMDQ2"; + case ICE_VSI_CTRL: + return "ICE_VSI_CTRL"; + case ICE_VSI_CHNL: + return "ICE_VSI_CHNL"; + case ICE_VSI_OFFLOAD_MACVLAN: + return "ICE_VSI_OFFLOAD_MACVLAN"; + case ICE_VSI_LB: + return "ICE_VSI_LB"; + case ICE_VSI_SWITCHDEV_CTRL: + return "ICE_VSI_SWITCHDEV_CTRL"; default: - return; - } - - /* make sure the context is associated with the right VSI */ - tlan_ctx->src_vsi = ice_get_hw_vsi_num(hw, vsi->idx); - - tlan_ctx->tso_ena = ICE_TX_LEGACY; - tlan_ctx->tso_qnum = pf_q; - - /* Legacy or Advanced Host Interface: - * 0: Advanced Host Interface - * 1: Legacy Host Interface - */ - 
tlan_ctx->legacy_int = ICE_TX_LEGACY; -} - -/** - * ice_pf_rxq_wait - Wait for a PF's Rx queue to be enabled or disabled - * @pf: the PF being configured - * @pf_q: the PF queue - * @ena: enable or disable state of the queue - * - * This routine will wait for the given Rx queue of the PF to reach the - * enabled or disabled state. - * Returns -ETIMEDOUT in case of failing to reach the requested state after - * multiple retries; else will return 0 in case of success. - */ -static int ice_pf_rxq_wait(struct ice_pf *pf, int pf_q, bool ena) -{ - int i; - - for (i = 0; i < ICE_Q_WAIT_MAX_RETRY; i++) { - if (ena == !!(rd32(&pf->hw, QRX_CTRL(pf_q)) & - QRX_CTRL_QENA_STAT_M)) - return 0; - - usleep_range(20, 40); + return "unknown"; } - - return -ETIMEDOUT; } /** - * ice_vsi_ctrl_rx_ring - Start or stop a VSI's Rx ring + * ice_vsi_ctrl_all_rx_rings - Start or stop a VSI's Rx rings * @vsi: the VSI being configured * @ena: start or stop the Rx rings - * @rxq_idx: Rx queue index + * + * First enable/disable all of the Rx rings, flush any remaining writes, and + * then verify that they have all been enabled/disabled successfully. This will + * let all of the register writes complete when enabling/disabling the Rx rings + * before waiting for the change in hardware to complete. */ -#ifndef CONFIG_PCI_IOV -static -#endif /* !CONFIG_PCI_IOV */ -int ice_vsi_ctrl_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx) +static int ice_vsi_ctrl_all_rx_rings(struct ice_vsi *vsi, bool ena) { - int pf_q = vsi->rxq_map[rxq_idx]; - struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; int ret = 0; - u32 rx_reg; - - rx_reg = rd32(hw, QRX_CTRL(pf_q)); - - /* Skip if the queue is already in the requested state */ - if (ena == !!(rx_reg & QRX_CTRL_QENA_STAT_M)) - return 0; - - /* turn on/off the queue */ - if (ena) - rx_reg |= QRX_CTRL_QENA_REQ_M; - else - rx_reg &= ~QRX_CTRL_QENA_REQ_M; - wr32(hw, QRX_CTRL(pf_q), rx_reg); - - /* wait for the change to finish */ - ret = ice_pf_rxq_wait(pf, pf_q, ena); - if (ret) - dev_err(&pf->pdev->dev, - "VSI idx %d Rx ring %d %sable timeout\n", - vsi->idx, pf_q, (ena ? 
"en" : "dis")); + u16 i; - return ret; -} + for (i = 0; i < vsi->num_rxq; i++) + ice_vsi_ctrl_one_rx_ring(vsi, ena, i, false); -/** - * ice_vsi_ctrl_rx_rings - Start or stop a VSI's Rx rings - * @vsi: the VSI being configured - * @ena: start or stop the Rx rings - */ -static int ice_vsi_ctrl_rx_rings(struct ice_vsi *vsi, bool ena) -{ - int i, ret = 0; + ice_flush(&vsi->back->hw); for (i = 0; i < vsi->num_rxq; i++) { - ret = ice_vsi_ctrl_rx_ring(vsi, ena, i); + ret = ice_vsi_wait_one_rx_ring(vsi, ena, i); if (ret) break; } @@ -258,36 +76,46 @@ static int ice_vsi_ctrl_rx_rings(struct ice_vsi *vsi, bool ena) static int ice_vsi_alloc_arrays(struct ice_vsi *vsi) { struct ice_pf *pf = vsi->back; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (vsi->type == ICE_VSI_CHNL) + return 0; /* allocate memory for both Tx and Rx ring pointers */ - vsi->tx_rings = devm_kcalloc(&pf->pdev->dev, vsi->alloc_txq, + vsi->tx_rings = devm_kcalloc(dev, vsi->alloc_txq, sizeof(*vsi->tx_rings), GFP_KERNEL); if (!vsi->tx_rings) return -ENOMEM; - vsi->rx_rings = devm_kcalloc(&pf->pdev->dev, vsi->alloc_rxq, + vsi->rx_rings = devm_kcalloc(dev, vsi->alloc_rxq, sizeof(*vsi->rx_rings), GFP_KERNEL); if (!vsi->rx_rings) goto err_rings; - vsi->txq_map = devm_kcalloc(&pf->pdev->dev, vsi->alloc_txq, +#ifdef HAVE_XDP_SUPPORT + /* XDP will have vsi->alloc_txq Tx queues as well, so double the size */ + vsi->txq_map = devm_kcalloc(dev, (2 * vsi->alloc_txq), + sizeof(*vsi->txq_map), GFP_KERNEL); +#else + vsi->txq_map = devm_kcalloc(dev, vsi->alloc_txq, sizeof(*vsi->txq_map), GFP_KERNEL); +#endif /* HAVE_XDP_SUPPORT */ if (!vsi->txq_map) goto err_txq_map; - vsi->rxq_map = devm_kcalloc(&pf->pdev->dev, vsi->alloc_rxq, + vsi->rxq_map = devm_kcalloc(dev, vsi->alloc_rxq, sizeof(*vsi->rxq_map), GFP_KERNEL); if (!vsi->rxq_map) goto err_rxq_map; - /* There is no need to allocate q_vectors for a loopback VSI. */ if (vsi->type == ICE_VSI_LB) return 0; /* allocate memory for q_vector pointers */ - vsi->q_vectors = devm_kcalloc(&pf->pdev->dev, vsi->num_q_vectors, + vsi->q_vectors = devm_kcalloc(dev, vsi->num_q_vectors, sizeof(*vsi->q_vectors), GFP_KERNEL); if (!vsi->q_vectors) goto err_vectors; @@ -295,13 +123,13 @@ static int ice_vsi_alloc_arrays(struct ice_vsi *vsi) return 0; err_vectors: - devm_kfree(&pf->pdev->dev, vsi->rxq_map); + devm_kfree(dev, vsi->rxq_map); err_rxq_map: - devm_kfree(&pf->pdev->dev, vsi->txq_map); + devm_kfree(dev, vsi->txq_map); err_txq_map: - devm_kfree(&pf->pdev->dev, vsi->rx_rings); + devm_kfree(dev, vsi->rx_rings); err_rings: - devm_kfree(&pf->pdev->dev, vsi->tx_rings); + devm_kfree(dev, vsi->tx_rings); return -ENOMEM; } @@ -313,14 +141,22 @@ static void ice_vsi_set_num_desc(struct ice_vsi *vsi) { switch (vsi->type) { case ICE_VSI_PF: - /* fall through */ + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + case ICE_VSI_CTRL: case ICE_VSI_LB: - vsi->num_rx_desc = ICE_DFLT_NUM_RX_DESC; - vsi->num_tx_desc = ICE_DFLT_NUM_TX_DESC; + /* a user could change the values of num_[tr]x_desc using + * ethtool -G so we should keep those values instead of + * overwriting them with the defaults. 
+ */ + if (!vsi->num_rx_desc) + vsi->num_rx_desc = ICE_DFLT_NUM_RX_DESC; + if (!vsi->num_tx_desc) + vsi->num_tx_desc = ICE_DFLT_NUM_TX_DESC; break; default: - dev_dbg(&vsi->back->pdev->dev, - "Not setting number of Tx/Rx descriptors for VSI type %d\n", + dev_dbg(ice_pf_to_dev(vsi->back), "Not setting number of Tx/Rx descriptors for VSI type %d\n", vsi->type); break; } @@ -340,42 +176,92 @@ static void ice_vsi_set_num_qs(struct ice_vsi *vsi, u16 vf_id) if (vsi->type == ICE_VSI_VF) vsi->vf_id = vf_id; - + else + vsi->vf_id = ICE_INVAL_VFID; switch (vsi->type) { case ICE_VSI_PF: - vsi->alloc_txq = min_t(int, ice_get_avail_txq_count(pf), - num_online_cpus()); + /* default to 1 Tx queue per MSI-X to not hurt our performance */ + vsi->alloc_txq = min3(pf->num_lan_msix, + ice_get_avail_txq_count(pf), + (u16)num_online_cpus()); + if (vsi->req_txq) { + vsi->alloc_txq = vsi->req_txq; + vsi->num_txq = vsi->req_txq; + } pf->num_lan_tx = vsi->alloc_txq; /* only 1 Rx queue unless RSS is enabled */ - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { vsi->alloc_rxq = 1; - else - vsi->alloc_rxq = min_t(int, ice_get_avail_rxq_count(pf), - num_online_cpus()); + } else { + /* default to 1 Rx queue per MSI-X to not hurt our performance */ + vsi->alloc_rxq = min3(pf->num_lan_msix, + ice_get_avail_rxq_count(pf), + (u16)num_online_cpus()); + if (vsi->req_rxq) { + vsi->alloc_rxq = vsi->req_rxq; + vsi->num_rxq = vsi->req_rxq; + } + } pf->num_lan_rx = vsi->alloc_rxq; - vsi->num_q_vectors = max_t(int, vsi->alloc_rxq, vsi->alloc_txq); + vsi->num_q_vectors = min_t(int, pf->num_lan_msix, + max_t(int, vsi->alloc_rxq, vsi->alloc_txq)); + break; + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + vsi->alloc_txq = ICE_DFLT_TXQ_VMDQ_VSI; + vsi->alloc_rxq = ICE_DFLT_RXQ_VMDQ_VSI; + vsi->num_q_vectors = ICE_DFLT_VEC_VMDQ_VSI; + break; + case ICE_VSI_SWITCHDEV_CTRL: + /* The number of queues for the ctrl VSI is equal to the number + * of VFs. Each ring is associated with the corresponding VF_PR + * netdev. + */ + vsi->alloc_txq = pf->num_alloc_vfs; + vsi->alloc_rxq = pf->num_alloc_vfs; + vsi->num_q_vectors = 1; break; case ICE_VSI_VF: vf = &pf->vf[vsi->vf_id]; - vsi->alloc_txq = vf->num_vf_qs; - vsi->alloc_rxq = vf->num_vf_qs; - /* pf->num_vf_msix includes (VF miscellaneous vector + + /* pf->num_msix_per_vf includes (VF miscellaneous vector + * data queue interrupts). 
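+ * (e.g. an illustrative 17 MSI-X per VF splits into 1 miscellaneous + + * 16 data queue interrupts). 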
Since vsi->num_q_vectors is the number * of queue vectors, subtract 1 (ICE_NONQ_VECS_VF) from the * original vector count */ - vsi->num_q_vectors = pf->num_vf_msix - ICE_NONQ_VECS_VF; + if (vf->adq_enabled && vf->num_tc) { + u8 tc = vsi->vf_adq_tc; + + vsi->alloc_txq = vf->ch[tc].num_qps; + vsi->alloc_rxq = vf->ch[tc].num_qps; + vsi->num_q_vectors = vf->ch[tc].num_qps; + } else { + if (vf->num_req_qs) + vf->num_vf_qs = vf->num_req_qs; + + vsi->alloc_txq = vf->num_vf_qs; + vsi->alloc_rxq = vf->num_vf_qs; + vsi->num_q_vectors = pf->num_msix_per_vf - + ICE_NONQ_VECS_VF; + } + break; + case ICE_VSI_CTRL: + vsi->alloc_txq = 1; + vsi->alloc_rxq = 1; + vsi->num_q_vectors = 1; + break; + case ICE_VSI_CHNL: + vsi->alloc_txq = 0; + vsi->alloc_rxq = 0; break; case ICE_VSI_LB: vsi->alloc_txq = 1; vsi->alloc_rxq = 1; break; default: - dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", vsi->type); + dev_warn(ice_pf_to_dev(pf), "Unknown VSI type %d\n", vsi->type); break; } @@ -383,7 +269,7 @@ static void ice_vsi_set_num_qs(struct ice_vsi *vsi, u16 vf_id) } /** - * ice_get_free_slot - get the next non-NULL location index in array + * ice_get_free_slot - get the next available free slot in array * @array: array to search * @size: size of the array * @curr: last known occupied index to be used as a search hint @@ -421,7 +307,7 @@ void ice_vsi_delete(struct ice_vsi *vsi) struct ice_vsi_ctx *ctxt; enum ice_status status; - ctxt = devm_kzalloc(&pf->pdev->dev, sizeof(*ctxt), GFP_KERNEL); + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (!ctxt) return; @@ -433,10 +319,10 @@ void ice_vsi_delete(struct ice_vsi *vsi) status = ice_free_vsi(&pf->hw, vsi->idx, ctxt, false, NULL); if (status) - dev_err(&pf->pdev->dev, "Failed to delete VSI %i in FW\n", - vsi->vsi_num); + dev_err(ice_pf_to_dev(pf), "Failed to delete VSI %i in FW - error: %s\n", + vsi->vsi_num, ice_stat_str(status)); - devm_kfree(&pf->pdev->dev, ctxt); + kfree(ctxt); } /** @@ -446,30 +332,99 @@ void ice_vsi_delete(struct ice_vsi *vsi) static void ice_vsi_free_arrays(struct ice_vsi *vsi) { struct ice_pf *pf = vsi->back; + struct device *dev; + + dev = ice_pf_to_dev(pf); /* free the ring and vector containers */ if (vsi->q_vectors) { - devm_kfree(&pf->pdev->dev, vsi->q_vectors); + devm_kfree(dev, vsi->q_vectors); vsi->q_vectors = NULL; } if (vsi->tx_rings) { - devm_kfree(&pf->pdev->dev, vsi->tx_rings); + devm_kfree(dev, vsi->tx_rings); vsi->tx_rings = NULL; } if (vsi->rx_rings) { - devm_kfree(&pf->pdev->dev, vsi->rx_rings); + devm_kfree(dev, vsi->rx_rings); vsi->rx_rings = NULL; } if (vsi->txq_map) { - devm_kfree(&pf->pdev->dev, vsi->txq_map); + devm_kfree(dev, vsi->txq_map); vsi->txq_map = NULL; } if (vsi->rxq_map) { - devm_kfree(&pf->pdev->dev, vsi->rxq_map); + devm_kfree(dev, vsi->rxq_map); vsi->rxq_map = NULL; } } +static void ice_vsi_free_rss_global_lut_memory(struct ice_vsi *vsi) +{ + if (vsi->global_lut_id) { + devm_kfree(ice_pf_to_dev(vsi->back), vsi->global_lut_id); + vsi->global_lut_id = NULL; + } +} + +/** + * ice_vsi_free_rss_global_lut - free the VSI's global RSS LUT + * @vsi: VSI to free global RSS LUT for + * + * If the VSI didn't allocate a global RSS LUT, then there is nothing to do. The vsi->global_lut_id + * will always be cleared and freed regardless of the result of freeing the global RSS LUT. 
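+ * A failed hardware free is only reported via dev_dbg(); the + * devm-allocated ID memory is released either way.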
+ */ +static void ice_vsi_free_rss_global_lut(struct ice_vsi *vsi) +{ + enum ice_status status; + + if (!vsi->global_lut_id) + return; + + status = ice_free_rss_global_lut(&vsi->back->hw, *vsi->global_lut_id); + if (status) + dev_dbg(ice_pf_to_dev(vsi->back), + "Failed to free RSS global LUT ID %d for %s %d, status %d\n", + *vsi->global_lut_id, ice_vsi_type_str(vsi->type), vsi->idx, status); + + ice_vsi_free_rss_global_lut_memory(vsi); +} + +/** + * ice_vsi_alloc_rss_global_lut - allocate a global RSS LUT for this VSI + * @vsi: VSI to allocate global RSS LUT for + * + * Allocate a global RSS LUT if the VSI supports it. The caller must take care if allocating the + * global RSS LUT fails since vsi->global_lut_id will be NULL, which means there are no global RSS + * LUT resources available. There might be other causes of failure, but they can all be treated as + * the device having no more global RSS LUT resources available. + */ +static void ice_vsi_alloc_rss_global_lut(struct ice_vsi *vsi) +{ + enum ice_status status; + struct device *dev; + + if (vsi->type != ICE_VSI_VF) + return; + + /* VSI LUT is wide enough for queue groups up to ICE_MAX_SMALL_RSS_QS */ + if (vsi->alloc_rxq <= ICE_MAX_SMALL_RSS_QS) + return; + + dev = ice_pf_to_dev(vsi->back); + vsi->global_lut_id = devm_kzalloc(dev, sizeof(*vsi->global_lut_id), + GFP_KERNEL); + if (!vsi->global_lut_id) + return; + + status = ice_alloc_rss_global_lut(&vsi->back->hw, false, vsi->global_lut_id); + if (status) { + dev_dbg(dev, "failed to allocate RSS global LUT for %s %d, status %d\n", + ice_vsi_type_str(vsi->type), vsi->idx, status); + ice_vsi_free_rss_global_lut_memory(vsi); + } +} + /** * ice_vsi_clear - clean up and deallocate the provided VSI * @vsi: pointer to VSI being cleared @@ -482,6 +437,7 @@ static void ice_vsi_free_arrays(struct ice_vsi *vsi) int ice_vsi_clear(struct ice_vsi *vsi) { struct ice_pf *pf = NULL; + struct device *dev; if (!vsi) return 0; @@ -490,10 +446,10 @@ int ice_vsi_clear(struct ice_vsi *vsi) return -EINVAL; pf = vsi->back; + dev = ice_pf_to_dev(pf); if (!pf->vsi[vsi->idx] || pf->vsi[vsi->idx] != vsi) { - dev_dbg(&pf->pdev->dev, "vsi does not exist at pf->vsi[%d]\n", - vsi->idx); + dev_dbg(dev, "vsi does not exist at pf->vsi[%d]\n", vsi->idx); return -EINVAL; } @@ -501,44 +457,92 @@ int ice_vsi_clear(struct ice_vsi *vsi) /* updates the PF for this cleared VSI */ pf->vsi[vsi->idx] = NULL; - if (vsi->idx < pf->next_vsi) + if (vsi->idx < pf->next_vsi && vsi->type != ICE_VSI_CTRL) + pf->next_vsi = vsi->idx; + if (vsi->idx < pf->next_vsi && vsi->type == ICE_VSI_CTRL && + vsi->vf_id != ICE_INVAL_VFID) pf->next_vsi = vsi->idx; ice_vsi_free_arrays(vsi); + ice_vsi_free_rss_global_lut(vsi); mutex_unlock(&pf->sw_mutex); - devm_kfree(&pf->pdev->dev, vsi); + devm_kfree(dev, vsi); return 0; } +/** + * ice_msix_clean_ctrl_vsi - MSIX mode interrupt handler for ctrl VSI + * @irq: interrupt number + * @data: pointer to a q_vector + */ +static irqreturn_t ice_msix_clean_ctrl_vsi(int __always_unused irq, void *data) +{ + struct ice_q_vector *q_vector = (struct ice_q_vector *)data; + + if (!q_vector->tx.ring) + return IRQ_HANDLED; + +#define FDIR_RX_DESC_CLEAN_BUDGET 64 + ice_clean_rx_irq(q_vector->rx.ring, FDIR_RX_DESC_CLEAN_BUDGET); + ice_clean_ctrl_tx_irq(q_vector->tx.ring); + + return IRQ_HANDLED; +} + /** * ice_msix_clean_rings - MSIX mode Interrupt Handler * @irq: interrupt number * @data: pointer to a q_vector */ +#ifdef HAVE_NETPOLL_CONTROLLER +irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data) +#else 
static irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data) +#endif /* HAVE_NETPOLL_CONTROLLER */ { struct ice_q_vector *q_vector = (struct ice_q_vector *)data; if (!q_vector->tx.ring && !q_vector->rx.ring) return IRQ_HANDLED; + q_vector->total_events++; + napi_schedule(&q_vector->napi); return IRQ_HANDLED; } +static irqreturn_t ice_eswitch_msix_clean_rings(int __always_unused irq, void *data) +{ + struct ice_q_vector *q_vector = (struct ice_q_vector *)data; + struct ice_pf *pf = q_vector->vsi->back; + int i; + + if (!q_vector->tx.ring && !q_vector->rx.ring) + return IRQ_HANDLED; + + ice_for_each_vf(pf, i) + napi_schedule(&pf->vf[i].repr->q_vector->napi); + + return IRQ_HANDLED; +} + /** * ice_vsi_alloc - Allocates the next available struct VSI in the PF * @pf: board private structure - * @type: type of VSI + * @vsi_type: type of VSI + * @ch: ptr to channel + * @tc: traffic class number for VF ADQ * @vf_id: ID of the VF being configured * * returns a pointer to a VSI on success, NULL on failure. */ static struct ice_vsi * -ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type, u16 vf_id) +ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type vsi_type, + struct ice_channel *ch, u16 vf_id, u8 tc) { + struct device *dev = ice_pf_to_dev(pf); struct ice_vsi *vsi = NULL; /* Need to protect the allocation of the VSIs at the PF level */ @@ -549,26 +553,28 @@ ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type, u16 vf_id) * is available to be populated */ if (pf->next_vsi == ICE_NO_VSI) { - dev_dbg(&pf->pdev->dev, "out of VSI slots!\n"); + dev_dbg(dev, "out of VSI slots!\n"); goto unlock_pf; } - vsi = devm_kzalloc(&pf->pdev->dev, sizeof(*vsi), GFP_KERNEL); + vsi = devm_kzalloc(dev, sizeof(*vsi), GFP_KERNEL); if (!vsi) goto unlock_pf; - vsi->type = type; + vsi->type = vsi_type; vsi->back = pf; - set_bit(__ICE_DOWN, vsi->state); - - vsi->idx = pf->next_vsi; + if (vsi_type == ICE_VSI_VF) + vsi->vf_adq_tc = tc; + set_bit(ICE_VSI_DOWN, vsi->state); - if (type == ICE_VSI_VF) + if (vsi_type == ICE_VSI_VF) ice_vsi_set_num_qs(vsi, vf_id); - else + else if (vsi_type != ICE_VSI_CHNL) ice_vsi_set_num_qs(vsi, ICE_INVAL_VFID); switch (vsi->type) { + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: case ICE_VSI_PF: if (ice_vsi_alloc_arrays(vsi)) goto err_rings; @@ -576,29 +582,61 @@ ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type, u16 vf_id) /* Setup default MSIX irq handler for VSI */ vsi->irq_handler = ice_msix_clean_rings; break; + case ICE_VSI_SWITCHDEV_CTRL: + if (ice_vsi_alloc_arrays(vsi)) + goto err_rings; + + /* Setup eswitch MSIX irq handler for VSI */ + vsi->irq_handler = ice_eswitch_msix_clean_rings; + break; + case ICE_VSI_CTRL: + if (ice_vsi_alloc_arrays(vsi)) + goto err_rings; + + /* Setup ctrl VSI MSIX irq handler */ + vsi->irq_handler = ice_msix_clean_ctrl_vsi; + break; case ICE_VSI_VF: if (ice_vsi_alloc_arrays(vsi)) goto err_rings; break; + case ICE_VSI_CHNL: + if (!ch) + goto err_rings; + vsi->num_rxq = ch->num_rxq; + vsi->num_txq = ch->num_txq; + vsi->next_base_q = ch->base_q; + break; case ICE_VSI_LB: if (ice_vsi_alloc_arrays(vsi)) goto err_rings; break; default: - dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", vsi->type); + dev_warn(dev, "Unknown VSI type %d\n", vsi->type); goto unlock_pf; } - /* fill VSI slot in the PF struct */ - pf->vsi[pf->next_vsi] = vsi; + if (vsi->type == ICE_VSI_CTRL && vf_id == ICE_INVAL_VFID) { + /* Use the last VSI slot as the index for PF control VSI */ + vsi->idx = pf->num_alloc_vsi - 1; + pf->ctrl_vsi_idx = vsi->idx; + 
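+ /* publish the control VSI like any other; pf->next_vsi is not + * advanced here because the regular free-slot bookkeeping was not + * used to pick this reserved last slot + */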
pf->vsi[vsi->idx] = vsi; + } else { + /* fill slot and make note of the index */ + vsi->idx = pf->next_vsi; + pf->vsi[pf->next_vsi] = vsi; + + /* prepare pf->next_vsi for next use */ + pf->next_vsi = ice_get_free_slot(pf->vsi, pf->num_alloc_vsi, + pf->next_vsi); + } - /* prepare pf->next_vsi for next use */ - pf->next_vsi = ice_get_free_slot(pf->vsi, pf->num_alloc_vsi, - pf->next_vsi); + if (vsi->type == ICE_VSI_CTRL && vf_id != ICE_INVAL_VFID) + pf->vf[vf_id].ctrl_vsi_idx = vsi->idx; goto unlock_pf; err_rings: - devm_kfree(&pf->pdev->dev, vsi); + devm_kfree(dev, vsi); vsi = NULL; unlock_pf: mutex_unlock(&pf->sw_mutex); @@ -606,85 +644,99 @@ ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type, u16 vf_id) } /** - * __ice_vsi_get_qs_contig - Assign a contiguous chunk of queues to VSI - * @qs_cfg: gathered variables needed for PF->VSI queues assignment + * ice_alloc_fd_res - Allocate FD resource for a VSI + * @vsi: pointer to the ice_vsi * - * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap + * This allocates the FD resources. + * + * Returns 0 on success, -EPERM on no-op and -EINVAL on an invalid channel config */ -static int __ice_vsi_get_qs_contig(struct ice_qs_cfg *qs_cfg) +static int ice_alloc_fd_res(struct ice_vsi *vsi) { - int offset, i; + struct ice_pf *pf = vsi->back; + u32 g_val, b_val; - mutex_lock(qs_cfg->qs_mutex); - offset = bitmap_find_next_zero_area(qs_cfg->pf_map, qs_cfg->pf_map_size, - 0, qs_cfg->q_count, 0); - if (offset >= qs_cfg->pf_map_size) { - mutex_unlock(qs_cfg->qs_mutex); - return -ENOMEM; - } + /* Flow Director filters are only allocated/assigned to the PF VSI or + * CHNL VSI which passes the traffic. The CTRL VSI is only used to + * add/delete filters so we don't allocate resources to it + */ - bitmap_set(qs_cfg->pf_map, offset, qs_cfg->q_count); - for (i = 0; i < qs_cfg->q_count; i++) - qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = i + offset; - mutex_unlock(qs_cfg->qs_mutex); + /* FD filters from guaranteed pool per VSI */ + g_val = pf->hw.func_caps.fd_fltr_guar; + if (!g_val) + return -EPERM; - return 0; -} + /* FD filters from best effort pool */ + b_val = pf->hw.func_caps.fd_fltr_best_effort; + if (!b_val) + return -EPERM; -/** - * __ice_vsi_get_qs_sc - Assign a scattered queues from PF to VSI - * @qs_cfg: gathered variables needed for pf->vsi queues assignment - * - * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap - */ -static int __ice_vsi_get_qs_sc(struct ice_qs_cfg *qs_cfg) -{ - int i, index = 0; + if (!(vsi->type == ICE_VSI_PF || vsi->type == ICE_VSI_CHNL || + vsi->type == ICE_VSI_VF)) + return -EPERM; - mutex_lock(qs_cfg->qs_mutex); - for (i = 0; i < qs_cfg->q_count; i++) { - index = find_next_zero_bit(qs_cfg->pf_map, - qs_cfg->pf_map_size, index); - if (index >= qs_cfg->pf_map_size) - goto err_scatter; - set_bit(index, qs_cfg->pf_map); - qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = index; - } - mutex_unlock(qs_cfg->qs_mutex); + if (!test_bit(ICE_FLAG_FD_ENA, pf->flags)) + return -EPERM; - return 0; -err_scatter: - for (index = 0; index < i; index++) { - clear_bit(qs_cfg->vsi_map[index], qs_cfg->pf_map); - qs_cfg->vsi_map[index + qs_cfg->vsi_map_offset] = 0; - } - mutex_unlock(qs_cfg->qs_mutex); + /* PF main VSI gets only 64 FD resources from guaranteed pool + * when ADQ is configured. 
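+ * For example, fd_fltr_guar = 1024 with two ADQ channel TCs leaves each + * channel VSI (1024 - 64) / 2 = 480 guaranteed entries (illustrative + * numbers). 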
This is the current policy; change as needed + */ +#define ICE_PF_VSI_GFLTR 64 - return -ENOMEM; -} + /* determines FD filter resources per VSI from shared (best effort) and + * dedicated pool + */ + if (vsi->type == ICE_VSI_PF) { + vsi->num_gfltr = g_val; +#ifdef NETIF_F_HW_TC + /* if MQPRIO is configured, main VSI doesn't get all + * FD resources from guaranteed pool. The current policy is that + * the PF VSI gets 64 FD resources + */ + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) { + if (g_val < ICE_PF_VSI_GFLTR) + return -EPERM; + /* allow bare minimum entries for PF VSI */ + vsi->num_gfltr = ICE_PF_VSI_GFLTR; + } +#endif /* NETIF_F_HW_TC */ -/** - * __ice_vsi_get_qs - helper function for assigning queues from PF to VSI - * @qs_cfg: gathered variables needed for pf->vsi queues assignment - * - * This function first tries to find contiguous space. If it is not successful, - * it tries with the scatter approach. - * - * Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap - */ -static int __ice_vsi_get_qs(struct ice_qs_cfg *qs_cfg) -{ - int ret = 0; + /* each VSI gets the same "best_effort" quota */ + vsi->num_bfltr = b_val; + } else if (vsi->type == ICE_VSI_VF) { + vsi->num_gfltr = 0; + + /* each VSI gets the same "best_effort" quota */ + vsi->num_bfltr = b_val; + } else { + struct ice_vsi *main_vsi; + int numtc; + + main_vsi = ice_get_main_vsi(pf); + if (!main_vsi) + return -EPERM; + + if (!main_vsi->all_numtc) + return -EINVAL; + + /* figure out ADQ numtc */ + numtc = main_vsi->all_numtc - ICE_CHNL_START_TC; + + /* only one TC but still asking for channel resources is an + * invalid config + */ + if (numtc < ICE_CHNL_START_TC) + return -EPERM; - ret = __ice_vsi_get_qs_contig(qs_cfg); - if (ret) { - /* contig failed, so try with scatter approach */ - qs_cfg->mapping_mode = ICE_VSI_MAP_SCATTER; - qs_cfg->q_count = min_t(u16, qs_cfg->q_count, - qs_cfg->scatter_count); - ret = __ice_vsi_get_qs_sc(qs_cfg); + g_val -= ICE_PF_VSI_GFLTR; + /* channel VSIs get an equal share from guaranteed pool */ + vsi->num_gfltr = g_val / numtc; + + /* each VSI gets the same "best_effort" quota */ + vsi->num_bfltr = b_val; } - return ret; + + return 0; }
ice_pf *pf) { - struct ice_pf *pf; - - pf = vsi->back; - - if (vsi->rss_hkey_user) - devm_kfree(&pf->pdev->dev, vsi->rss_hkey_user); - if (vsi->rss_lut_user) - devm_kfree(&pf->pdev->dev, vsi->rss_lut_user); + return test_bit(ICE_FLAG_PEER_ENA, pf->flags); } /** - * ice_vsi_set_rss_params - Setup RSS capabilities per VSI type - * @vsi: the VSI being configured + * ice_vsi_clean_rss_flow_fld - Delete RSS configuration + * @vsi: the VSI being cleaned up + * + * This function deletes RSS input set for all flows that were configured + * for this VSI */ -static void ice_vsi_set_rss_params(struct ice_vsi *vsi) +static void ice_vsi_clean_rss_flow_fld(struct ice_vsi *vsi) { - struct ice_hw_common_caps *cap; struct ice_pf *pf = vsi->back; + enum ice_status status; - if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { - vsi->rss_size = 1; + if (ice_is_safe_mode(pf)) return; - } - cap = &pf->hw.func_caps.common_cap; + status = ice_rem_vsi_rss_cfg(&pf->hw, vsi->idx); + if (status) + dev_dbg(ice_pf_to_dev(pf), "ice_rem_vsi_rss_cfg failed for vsi = %d, error = %s\n", + vsi->vsi_num, ice_stat_str(status)); +} + +/** + * ice_rss_clean - Delete RSS related VSI structures and configuration + * @vsi: the VSI being removed + * + * This function deletes RSS related VSI structures that hold user inputs + * and removes RSS configuration + */ +static void ice_rss_clean(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + + dev = ice_pf_to_dev(pf); + + if (vsi->rss_hkey_user) + devm_kfree(dev, vsi->rss_hkey_user); + if (vsi->rss_lut_user) + devm_kfree(dev, vsi->rss_lut_user); + + ice_vsi_clean_rss_flow_fld(vsi); + /* remove RSS replay list */ + if (!ice_is_safe_mode(pf)) + ice_rem_vsi_rss_list(&pf->hw, vsi->idx); +} + +/** + * ice_vsi_set_rss_params - Setup RSS capabilities per VSI type + * @vsi: the VSI being configured + */ +static void ice_vsi_set_rss_params(struct ice_vsi *vsi) +{ + struct ice_hw_common_caps *cap; + struct ice_pf *pf = vsi->back; + + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + vsi->rss_size = 1; + return; + } + + cap = &pf->hw.func_caps.common_cap; switch (vsi->type) { + case ICE_VSI_CHNL: case ICE_VSI_PF: /* PF VSI will inherit RSS instance of PF */ - vsi->rss_table_size = cap->rss_table_size; - vsi->rss_size = min_t(int, num_online_cpus(), - BIT(cap->rss_table_entry_width)); + vsi->rss_table_size = (u16)cap->rss_table_size; + if (vsi->type == ICE_VSI_CHNL) + vsi->rss_size = min_t(u16, vsi->num_rxq, + BIT(cap->rss_table_entry_width)); + else + vsi->rss_size = min_t(u16, num_online_cpus(), + BIT(cap->rss_table_entry_width)); vsi->rss_lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_PF; break; + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + vsi->rss_table_size = ICE_VSIQF_HLUT_ARRAY_SIZE; + vsi->rss_size = min_t(u16, num_online_cpus(), + BIT(cap->rss_table_entry_width)); + vsi->rss_lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI; + break; case ICE_VSI_VF: - /* VF VSI will gets a small RSS table - * For VSI_LUT, LUT size should be set to 64 bytes + /* VF VSI will get a small RSS table. + * For VSI_LUT, LUT size should be set to 64 bytes. 
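+ * When a global LUT was reserved for this VF, the code below switches + * to the 512-entry global table instead.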
*/ vsi->rss_table_size = ICE_VSIQF_HLUT_ARRAY_SIZE; - vsi->rss_size = min_t(int, num_online_cpus(), - BIT(cap->rss_table_entry_width)); + vsi->rss_size = ICE_MAX_RSS_QS_PER_VF; vsi->rss_lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_VSI; + if (vsi->global_lut_id) { + vsi->rss_table_size = ICE_AQC_GSET_RSS_LUT_TABLE_SIZE_512; + vsi->rss_size = ICE_MAX_MEDIUM_RSS_QS; + vsi->rss_lut_type = ICE_AQC_GSET_RSS_LUT_TABLE_TYPE_GLOBAL; + } break; case ICE_VSI_LB: break; default: - dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", - vsi->type); + dev_dbg(ice_pf_to_dev(pf), "Unsupported VSI type %s\n", + ice_vsi_type_str(vsi->type)); break; } } /** * ice_set_dflt_vsi_ctx - Set default VSI context before adding a VSI + * @hw: HW structure used to determine the VLAN mode of the device * @ctxt: the VSI context being set * * This initializes a default VSI context for all sections except the Queues. */ -static void ice_set_dflt_vsi_ctx(struct ice_vsi_ctx *ctxt) +static void ice_set_dflt_vsi_ctx(struct ice_hw *hw, struct ice_vsi_ctx *ctxt) { u32 table = 0; @@ -837,13 +954,27 @@ static void ice_set_dflt_vsi_ctx(struct ice_vsi_ctx *ctxt) ctxt->info.sw_flags = ICE_AQ_VSI_SW_FLAG_SRC_PRUNE; /* Traffic from VSI can be sent to LAN */ ctxt->info.sw_flags2 = ICE_AQ_VSI_SW_FLAG_LAN_ENA; - /* By default bits 3 and 4 in vlan_flags are 0's which results in legacy - * behavior (show VLAN, DEI, and UP) in descriptor. Also, allow all - * packets untagged/tagged. + /* allow all untagged/tagged packets by default on Tx */ + ctxt->info.inner_vlan_flags = ((ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL & + ICE_AQ_VSI_INNER_VLAN_TX_MODE_M) >> + ICE_AQ_VSI_INNER_VLAN_TX_MODE_S); + /* SVM - by default bits 3 and 4 in inner_vlan_flags are 0's which + * results in legacy behavior (show VLAN, DEI, and UP) in descriptor. 
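+ * (SVM and DVM are the device's single and double VLAN modes, as + * reported by ice_is_dvm_ena() below.)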
+ * + * DVM - leave inner VLAN in packet by default */ - ctxt->info.vlan_flags = ((ICE_AQ_VSI_VLAN_MODE_ALL & - ICE_AQ_VSI_VLAN_MODE_M) >> - ICE_AQ_VSI_VLAN_MODE_S); + if (ice_is_dvm_ena(hw)) { + ctxt->info.inner_vlan_flags |= + ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING; + ctxt->info.outer_vlan_flags = + (ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL << + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) & + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M; + ctxt->info.outer_vlan_flags |= + (ICE_AQ_VSI_OUTER_TAG_VLAN_8100 << + ICE_AQ_VSI_OUTER_TAG_TYPE_S) & + ICE_AQ_VSI_OUTER_TAG_TYPE_M; + } /* Have 1:1 UP mapping for both ingress/egress tables */ table |= ICE_UP_TABLE_TRANSLATE(0, 0); table |= ICE_UP_TABLE_TRANSLATE(1, 1); @@ -867,34 +998,28 @@ static void ice_set_dflt_vsi_ctx(struct ice_vsi_ctx *ctxt) */ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) { - u16 offset = 0, qmap = 0, tx_count = 0; + u16 offset = 0, qmap = 0, tx_count = 0, pow = 0; + u16 num_txq_per_tc, num_rxq_per_tc; u16 qcount_tx = vsi->alloc_txq; u16 qcount_rx = vsi->alloc_rxq; - u16 tx_numq_tc, rx_numq_tc; - u16 pow = 0, max_rss = 0; - bool ena_tc0 = false; u8 netdev_tc = 0; int i; - /* at least TC0 should be enabled by default */ - if (vsi->tc_cfg.numtc) { - if (!(vsi->tc_cfg.ena_tc & BIT(0))) - ena_tc0 = true; - } else { - ena_tc0 = true; + if (!vsi->tc_cfg.numtc) { + /* at least TC0 should be enabled by default */ + vsi->tc_cfg.numtc = 1; + vsi->tc_cfg.ena_tc = 1; } - if (ena_tc0) { - vsi->tc_cfg.numtc++; - vsi->tc_cfg.ena_tc |= 1; - } + num_rxq_per_tc = min_t(u16, qcount_rx / vsi->tc_cfg.numtc, ICE_MAX_RXQS_PER_TC); + if (!num_rxq_per_tc) + num_rxq_per_tc = 1; + num_txq_per_tc = qcount_tx / vsi->tc_cfg.numtc; + if (!num_txq_per_tc) + num_txq_per_tc = 1; - rx_numq_tc = qcount_rx / vsi->tc_cfg.numtc; - if (!rx_numq_tc) - rx_numq_tc = 1; - tx_numq_tc = qcount_tx / vsi->tc_cfg.numtc; - if (!tx_numq_tc) - tx_numq_tc = 1; + /* find the (rounded up) power-of-2 of qcount */ + pow = (u16)order_base_2(num_rxq_per_tc); /* TC mapping is a function of the number of Rx queues assigned to the * VSI for each traffic class and the offset of these queues. 
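+ * For example, an illustrative VSI with 5 Rx queues on one TC gets + * num_rxq_per_tc = 5 and pow = order_base_2(5) = 3, so the TC advertises + * 2^3 = 8 queue slots; the qmap word carries the queue offset in its low + * bits and that power-of-2 exponent in the ICE_AQ_VSI_TC_Q_NUM field.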
@@ -907,24 +1032,6 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) * * Setup number and offset of Rx queues for all TCs for the VSI */ - - qcount_rx = rx_numq_tc; - - /* qcount will change if RSS is enabled */ - if (test_bit(ICE_FLAG_RSS_ENA, vsi->back->flags)) { - if (vsi->type == ICE_VSI_PF || vsi->type == ICE_VSI_VF) { - if (vsi->type == ICE_VSI_PF) - max_rss = ICE_MAX_LG_RSS_QS; - else - max_rss = ICE_MAX_SMALL_RSS_QS; - qcount_rx = min_t(int, rx_numq_tc, max_rss); - qcount_rx = min_t(int, qcount_rx, vsi->rss_size); - } - } - - /* find the (rounded up) power-of-2 of qcount */ - pow = order_base_2(qcount_rx); - ice_for_each_traffic_class(i) { if (!(vsi->tc_cfg.ena_tc & BIT(i))) { /* TC is not enabled */ @@ -938,16 +1045,16 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) /* TC is enabled */ vsi->tc_cfg.tc_info[i].qoffset = offset; - vsi->tc_cfg.tc_info[i].qcount_rx = qcount_rx; - vsi->tc_cfg.tc_info[i].qcount_tx = tx_numq_tc; + vsi->tc_cfg.tc_info[i].qcount_rx = num_rxq_per_tc; + vsi->tc_cfg.tc_info[i].qcount_tx = num_txq_per_tc; vsi->tc_cfg.tc_info[i].netdev_tc = netdev_tc++; qmap = ((offset << ICE_AQ_VSI_TC_Q_OFFSET_S) & ICE_AQ_VSI_TC_Q_OFFSET_M) | ((pow << ICE_AQ_VSI_TC_Q_NUM_S) & ICE_AQ_VSI_TC_Q_NUM_M); - offset += qcount_rx; - tx_count += tx_numq_tc; + offset += num_rxq_per_tc; + tx_count += num_txq_per_tc; ctxt->info.tc_mapping[i] = cpu_to_le16(qmap); } @@ -960,12 +1067,12 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) if (offset) vsi->num_rxq = offset; else - vsi->num_rxq = qcount_rx; + vsi->num_rxq = num_rxq_per_tc; vsi->num_txq = tx_count; if (vsi->type == ICE_VSI_VF && vsi->num_txq != vsi->num_rxq) { - dev_dbg(&vsi->back->pdev->dev, "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n"); + dev_dbg(ice_pf_to_dev(vsi->back), "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n"); /* since there is a chance that num_rxq could have been changed * in the above for loop, make num_txq equal to num_rxq. 
*/ @@ -982,6 +1089,60 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) ctxt->info.q_mapping[1] = cpu_to_le16(vsi->num_rxq); } +/** + * ice_set_fd_vsi_ctx - Set FD VSI context before adding a VSI + * @ctxt: the VSI context being set + * @vsi: the VSI being configured + */ +static void ice_set_fd_vsi_ctx(struct ice_vsi_ctx *ctxt, struct ice_vsi *vsi) +{ + u8 dflt_q_group, dflt_q_prio; + u16 dflt_q, report_q, val; + + if (vsi->type != ICE_VSI_PF && vsi->type != ICE_VSI_CTRL && + vsi->type != ICE_VSI_VF && vsi->type != ICE_VSI_CHNL) + return; + + val = ICE_AQ_VSI_PROP_FLOW_DIR_VALID | ICE_AQ_VSI_PROP_ACL_VALID; + ctxt->info.valid_sections |= cpu_to_le16(val); + dflt_q = 0; + dflt_q_group = 0; + report_q = 0; + dflt_q_prio = 0; + + /* enable flow director filtering/programming */ + val = ICE_AQ_VSI_FD_ENABLE | ICE_AQ_VSI_FD_PROG_ENABLE; + ctxt->info.fd_options = cpu_to_le16(val); + /* max of allocated flow director filters */ + ctxt->info.max_fd_fltr_dedicated = + cpu_to_le16(vsi->num_gfltr); + /* max of shared flow director filters any VSI may program */ + ctxt->info.max_fd_fltr_shared = + cpu_to_le16(vsi->num_bfltr); + /* default queue index within the VSI of the default FD */ + val = ((dflt_q << ICE_AQ_VSI_FD_DEF_Q_S) & + ICE_AQ_VSI_FD_DEF_Q_M); + /* target queue or queue group to the FD filter */ + val |= ((dflt_q_group << ICE_AQ_VSI_FD_DEF_GRP_S) & + ICE_AQ_VSI_FD_DEF_GRP_M); + ctxt->info.fd_def_q = cpu_to_le16(val); + /* queue index on which FD filter completion is reported */ + val = ((report_q << ICE_AQ_VSI_FD_REPORT_Q_S) & + ICE_AQ_VSI_FD_REPORT_Q_M); + /* priority of the default qindex action */ + val |= ((dflt_q_prio << ICE_AQ_VSI_FD_DEF_PRIORITY_S) & + ICE_AQ_VSI_FD_DEF_PRIORITY_M); + ctxt->info.fd_report_opt = cpu_to_le16(val); + +#define ICE_ACL_RX_PROF_MISS_CNTR ((2 << ICE_AQ_VSI_ACL_DEF_RX_PROF_S) & \ + ICE_AQ_VSI_ACL_DEF_RX_PROF_M) +#define ICE_ACL_RX_TBL_MISS_CNTR ((3 << ICE_AQ_VSI_ACL_DEF_RX_TABLE_S) & \ + ICE_AQ_VSI_ACL_DEF_RX_TABLE_M) + + val = ICE_ACL_RX_PROF_MISS_CNTR | ICE_ACL_RX_TBL_MISS_CNTR; + ctxt->info.acl_def_act = cpu_to_le16(val); +} + /** * ice_set_rss_vsi_ctx - Set RSS VSI context before adding a VSI * @ctxt: the VSI context being set @@ -989,88 +1150,153 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) */ static void ice_set_rss_vsi_ctx(struct ice_vsi_ctx *ctxt, struct ice_vsi *vsi) { - u8 lut_type, hash_type; + u8 lut_type, hash_type, global_lut_id = 0; + struct device *dev; struct ice_pf *pf; pf = vsi->back; + dev = ice_pf_to_dev(pf); switch (vsi->type) { + case ICE_VSI_CHNL: case ICE_VSI_PF: /* PF VSI will inherit RSS instance of PF */ lut_type = ICE_AQ_VSI_Q_OPT_RSS_LUT_PF; hash_type = ICE_AQ_VSI_Q_OPT_RSS_TPLZ; break; case ICE_VSI_VF: - /* VF VSI will gets a small RSS table which is a VSI LUT type */ lut_type = ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI; + if (vsi->global_lut_id) { + lut_type = ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL; + global_lut_id = *vsi->global_lut_id; + } hash_type = ICE_AQ_VSI_Q_OPT_RSS_TPLZ; break; - case ICE_VSI_LB: - dev_dbg(&pf->pdev->dev, "Unsupported VSI type %d\n", vsi->type); - return; default: - dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", vsi->type); + dev_dbg(dev, "Unsupported VSI type %s\n", + ice_vsi_type_str(vsi->type)); return; } ctxt->info.q_opt_rss = ((lut_type << ICE_AQ_VSI_Q_OPT_RSS_LUT_S) & ICE_AQ_VSI_Q_OPT_RSS_LUT_M) | - ((hash_type << ICE_AQ_VSI_Q_OPT_RSS_HASH_S) & - ICE_AQ_VSI_Q_OPT_RSS_HASH_M); + (hash_type & ICE_AQ_VSI_Q_OPT_RSS_HASH_M) | + ((global_lut_id << 
ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S) & + ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M); +} + +static void +ice_chnl_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) +{ + struct ice_pf *pf = vsi->back; + u16 qcount, qmap; + u8 offset = 0; + int pow; + + qcount = min_t(int, vsi->num_rxq, pf->num_lan_msix); + + pow = order_base_2(qcount); + qmap = ((offset << ICE_AQ_VSI_TC_Q_OFFSET_S) & + ICE_AQ_VSI_TC_Q_OFFSET_M) | + ((pow << ICE_AQ_VSI_TC_Q_NUM_S) & + ICE_AQ_VSI_TC_Q_NUM_M); + + ctxt->info.tc_mapping[0] = cpu_to_le16(qmap); + ctxt->info.mapping_flags |= cpu_to_le16(ICE_AQ_VSI_Q_MAP_CONTIG); + ctxt->info.q_mapping[0] = cpu_to_le16(vsi->next_base_q); + ctxt->info.q_mapping[1] = cpu_to_le16(qcount); } /** * ice_vsi_init - Create and initialize a VSI * @vsi: the VSI being configured + * @init_vsi: is this call creating a VSI * * This initializes a VSI context depending on the VSI type to be added and * passes it down to the add_vsi aq command to create a new VSI. */ -static int ice_vsi_init(struct ice_vsi *vsi) +static int ice_vsi_init(struct ice_vsi *vsi, bool init_vsi) { struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; struct ice_vsi_ctx *ctxt; + struct device *dev; int ret = 0; - ctxt = devm_kzalloc(&pf->pdev->dev, sizeof(*ctxt), GFP_KERNEL); + dev = ice_pf_to_dev(pf); + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (!ctxt) return -ENOMEM; - ctxt->info = vsi->info; switch (vsi->type) { + case ICE_VSI_CTRL: case ICE_VSI_LB: - /* fall through */ case ICE_VSI_PF: ctxt->flags = ICE_AQ_VSI_TYPE_PF; break; + case ICE_VSI_CHNL: + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: + ctxt->flags = ICE_AQ_VSI_TYPE_VMDQ2; + break; case ICE_VSI_VF: ctxt->flags = ICE_AQ_VSI_TYPE_VF; /* VF number here is the absolute VF number (0-255) */ ctxt->vf_num = vsi->vf_id + hw->func_caps.vf_base_id; break; default: - return -ENODEV; + ret = -ENODEV; + goto out; + } + + /* Handle VLAN pruning for channel VSI if main VSI has VLAN + * prune enabled + */ + if (vsi->type == ICE_VSI_CHNL) { + struct ice_vsi *main_vsi; + + main_vsi = ice_get_main_vsi(pf); + if (main_vsi && ice_vsi_is_vlan_pruning_ena(main_vsi)) + ctxt->info.sw_flags2 |= + ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + else + ctxt->info.sw_flags2 &= + ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; } - ice_set_dflt_vsi_ctx(ctxt); + ice_set_dflt_vsi_ctx(hw, ctxt); + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) + ice_set_fd_vsi_ctx(ctxt, vsi); /* if the switch is in VEB mode, allow VSI loopback */ if (vsi->vsw->bridge_mode == BRIDGE_MODE_VEB) - ctxt->info.sw_flags |= ICE_AQ_VSI_SW_FLAG_ALLOW_LB; + ctxt->info.sw_flags |= (ICE_AQ_VSI_SW_FLAG_ALLOW_LB | + ICE_AQ_VSI_SW_FLAG_LOCAL_LB); /* Set LUT type and HASH type if RSS is enabled */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags) && + vsi->type != ICE_VSI_CTRL) { ice_set_rss_vsi_ctx(ctxt, vsi); + /* if updating VSI context, make sure to set valid_section: + * to indicate which section of VSI context being updated + */ + if (!init_vsi) + ctxt->info.valid_sections |= + cpu_to_le16(ICE_AQ_VSI_PROP_Q_OPT_VALID); + } ctxt->info.sw_id = vsi->port_info->sw_id; - ice_vsi_setup_q_map(vsi, ctxt); - /* Enable MAC Antispoof with new VSI being initialized or updated */ - if (vsi->type == ICE_VSI_VF && pf->vf[vsi->vf_id].spoofchk) { - ctxt->info.valid_sections |= - cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); - ctxt->info.sec_flags |= - ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; + if (vsi->type == ICE_VSI_CHNL) { + ice_chnl_vsi_setup_q_map(vsi, ctxt); + } else { + 
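+ /* non-channel VSIs build a full per-TC queue map */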
ice_vsi_setup_q_map(vsi, ctxt); + if (!init_vsi) /* means the VSI is being updated */ + /* we must indicate which sections of the VSI context + * are being modified + */ + ctxt->info.valid_sections |= + cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID); } /* Allow control frames out of main VSI */ @@ -1080,11 +1306,20 @@ static int ice_vsi_init(struct ice_vsi *vsi) cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); } - ret = ice_add_vsi(hw, vsi->idx, ctxt, NULL); - if (ret) { - dev_err(&pf->pdev->dev, - "Add VSI failed, err %d\n", ret); - return -EIO; + if (init_vsi) { + ret = ice_add_vsi(hw, vsi->idx, ctxt, NULL); + if (ret) { + dev_err(dev, "Add VSI failed, err %d\n", ret); + ret = -EIO; + goto out; + } + } else { + ret = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (ret) { + dev_err(dev, "Update VSI failed, err %d\n", ret); + ret = -EIO; + goto out; + } } /* keep context for update VSI operations */ @@ -1093,131 +1328,121 @@ static int ice_vsi_init(struct ice_vsi *vsi) /* record VSI number returned */ vsi->vsi_num = ctxt->vsi_num; - devm_kfree(&pf->pdev->dev, ctxt); +out: + kfree(ctxt); return ret; } /** - * ice_free_q_vector - Free memory allocated for a specific interrupt vector - * @vsi: VSI having the memory freed - * @v_idx: index of the vector to be freed + * ice_free_res - free a block of resources + * @res: pointer to the resource + * @index: starting index previously returned by ice_get_res + * @id: identifier to track owner + * + * Returns number of resources freed */ -static void ice_free_q_vector(struct ice_vsi *vsi, int v_idx) +int ice_free_res(struct ice_res_tracker *res, u16 index, u16 id) { - struct ice_q_vector *q_vector; - struct ice_pf *pf = vsi->back; - struct ice_ring *ring; - - if (!vsi->q_vectors[v_idx]) { - dev_dbg(&pf->pdev->dev, "Queue vector at index %d not found\n", - v_idx); - return; - } - q_vector = vsi->q_vectors[v_idx]; - - ice_for_each_ring(ring, q_vector->tx) - ring->q_vector = NULL; - ice_for_each_ring(ring, q_vector->rx) - ring->q_vector = NULL; - - /* only VSI with an associated netdev is set up with NAPI */ - if (vsi->netdev) - netif_napi_del(&q_vector->napi); + int count = 0; + int i; - devm_kfree(&pf->pdev->dev, q_vector); - vsi->q_vectors[v_idx] = NULL; -} + if (!res || index >= res->end) + return -EINVAL; -/** - * ice_vsi_free_q_vectors - Free memory allocated for interrupt vectors - * @vsi: the VSI having memory freed - */ -void ice_vsi_free_q_vectors(struct ice_vsi *vsi) -{ - int v_idx; + id |= ICE_RES_VALID_BIT; + for (i = index; i < res->end && res->list[i] == id; i++) { + res->list[i] = 0; + count++; + } - ice_for_each_q_vector(vsi, v_idx) - ice_free_q_vector(vsi, v_idx); + return count; } /** - * ice_vsi_alloc_q_vector - Allocate memory for a single interrupt vector - * @vsi: the VSI being configured - * @v_idx: index of the vector in the VSI struct + * ice_search_res - Search the tracker for a block of resources + * @res: pointer to the resource + * @needed: size of the block needed + * @id: identifier to track owner * - * We allocate one q_vector. If allocation fails we return -ENOMEM. 
+ * Returns the base item index of the block, or -ENOMEM for error */ -static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, int v_idx) +static int ice_search_res(struct ice_res_tracker *res, u16 needed, u16 id) { - struct ice_pf *pf = vsi->back; - struct ice_q_vector *q_vector; + u16 start = 0, end = 0; - /* allocate q_vector */ - q_vector = devm_kzalloc(&pf->pdev->dev, sizeof(*q_vector), GFP_KERNEL); - if (!q_vector) + if (needed > res->end) return -ENOMEM; - q_vector->vsi = vsi; - q_vector->v_idx = v_idx; - if (vsi->type == ICE_VSI_VF) - goto out; - /* only set affinity_mask if the CPU is online */ - if (cpu_online(v_idx)) - cpumask_set_cpu(v_idx, &q_vector->affinity_mask); + id |= ICE_RES_VALID_BIT; - /* This will not be called in the driver load path because the netdev - * will not be created yet. All other cases with register the NAPI - * handler here (i.e. resume, reset/rebuild, etc.) - */ - if (vsi->netdev) - netif_napi_add(vsi->netdev, &q_vector->napi, ice_napi_poll, - NAPI_POLL_WEIGHT); + do { + /* skip already allocated entries */ + if (res->list[end++] & ICE_RES_VALID_BIT) { + start = end; + if ((start + needed) > res->end) + break; + } -out: - /* tie q_vector and VSI together */ - vsi->q_vectors[v_idx] = q_vector; + if (end == (start + needed)) { + int i = start; - return 0; + /* there was enough, so assign it to the requestor */ + while (i != end) + res->list[i++] = id; + + return start; + } + } while (end < res->end); + + return -ENOMEM; } /** - * ice_vsi_alloc_q_vectors - Allocate memory for interrupt vectors - * @vsi: the VSI being configured - * - * We allocate one q_vector per queue interrupt. If allocation fails we - * return -ENOMEM. + * ice_get_free_res_count - Get free count from a resource tracker + * @res: Resource tracker instance */ -static int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi) +static u16 ice_get_free_res_count(struct ice_res_tracker *res) { - struct ice_pf *pf = vsi->back; - int v_idx = 0, num_q_vectors; - int err; + u16 i, count = 0; - if (vsi->q_vectors[0]) { - dev_dbg(&pf->pdev->dev, "VSI %d has existing q_vectors\n", - vsi->vsi_num); - return -EEXIST; - } + for (i = 0; i < res->end; i++) + if (!(res->list[i] & ICE_RES_VALID_BIT)) + count++; - num_q_vectors = vsi->num_q_vectors; + return count; +} - for (v_idx = 0; v_idx < num_q_vectors; v_idx++) { - err = ice_vsi_alloc_q_vector(vsi, v_idx); - if (err) - goto err_out; - } +/** + * ice_get_valid_res_count - Get in-use count from a resource tracker + * @res: Resource tracker instance + */ +u16 ice_get_valid_res_count(struct ice_res_tracker *res) +{ + return res->end - ice_get_free_res_count(res); +} - return 0; +/** + * ice_get_res - get a block of resources + * @pf: board private structure + * @res: pointer to the resource + * @needed: size of the block needed + * @id: identifier to track owner + * + * Returns the base item index of the block, or negative for error + */ +int +ice_get_res(struct ice_pf *pf, struct ice_res_tracker *res, u16 needed, u16 id) +{ + if (!res || !pf) + return -EINVAL; -err_out: - while (v_idx--) - ice_free_q_vector(vsi, v_idx); + if (!needed || needed > res->num_entries || id >= ICE_RES_VALID_BIT) { + dev_err(ice_pf_to_dev(pf), "param err: needed=%d, num_entries = %d id=0x%04x\n", + needed, res->num_entries, id); + return -EINVAL; + } - dev_err(&pf->pdev->dev, - "Failed to allocate %d q_vector for VSI %d, ret=%d\n", - vsi->num_q_vectors, vsi->vsi_num, err); - vsi->num_q_vectors = 0; - return err; + return ice_search_res(res, needed, id); } /** @@ -1233,28 +1458,51 @@ 
static int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi) static int ice_vsi_setup_vector_base(struct ice_vsi *vsi) { struct ice_pf *pf = vsi->back; + struct device *dev; u16 num_q_vectors; + int base = 0; + dev = ice_pf_to_dev(pf); /* SRIOV doesn't grab irq_tracker entries for each VSI */ if (vsi->type == ICE_VSI_VF) return 0; + if (vsi->type == ICE_VSI_CHNL) + return 0; if (vsi->base_vector) { - dev_dbg(&pf->pdev->dev, "VSI %d has non-zero base vector %d\n", + dev_dbg(dev, "VSI %d has non-zero base vector %d\n", vsi->vsi_num, vsi->base_vector); return -EEXIST; } num_q_vectors = vsi->num_q_vectors; /* reserve slots from OS requested IRQs */ - vsi->base_vector = ice_get_res(pf, pf->irq_tracker, num_q_vectors, - vsi->idx); - if (vsi->base_vector < 0) { - dev_err(&pf->pdev->dev, - "Failed to get tracking for %d vectors for VSI %d, err=%d\n", - num_q_vectors, vsi->vsi_num, vsi->base_vector); + if (vsi->type == ICE_VSI_CTRL && vsi->vf_id != ICE_INVAL_VFID) { + int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + if (i != vsi->vf_id && vf->ctrl_vsi_idx != ICE_NO_VSI) { + base = pf->vsi[vf->ctrl_vsi_idx]->base_vector; + break; + } + } + if (i == pf->num_alloc_vfs) + base = ice_get_res(pf, pf->irq_tracker, num_q_vectors, + ICE_RES_VF_CTRL_VEC_ID); + } else { + base = ice_get_res(pf, pf->irq_tracker, num_q_vectors, + vsi->idx); + } + + if (base < 0) { + dev_err(dev, "%d MSI-X interrupts available. %s %d failed to get %d MSI-X vectors\n", + ice_get_free_res_count(pf->irq_tracker), + ice_vsi_type_str(vsi->type), vsi->idx, num_q_vectors); return -ENOENT; } + vsi->base_vector = (u16)base; pf->num_avail_sw_msix -= num_q_vectors; return 0; @@ -1268,11 +1516,23 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi) { int i; + /* Avoid stale references by clearing map from vector to ring */ + if (vsi->q_vectors) { + ice_for_each_q_vector(vsi, i) { + struct ice_q_vector *q_vector = vsi->q_vectors[i]; + + if (q_vector) { + q_vector->tx.ring = NULL; + q_vector->rx.ring = NULL; + } + } + } + if (vsi->tx_rings) { for (i = 0; i < vsi->alloc_txq; i++) { if (vsi->tx_rings[i]) { kfree_rcu(vsi->tx_rings[i], rcu); - vsi->tx_rings[i] = NULL; + WRITE_ONCE(vsi->tx_rings[i], NULL); } } } @@ -1280,7 +1540,7 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi) for (i = 0; i < vsi->alloc_rxq; i++) { if (vsi->rx_rings[i]) { kfree_rcu(vsi->rx_rings[i], rcu); - vsi->rx_rings[i] = NULL; + WRITE_ONCE(vsi->rx_rings[i], NULL); } } } @@ -1292,9 +1552,12 @@ static void ice_vsi_clear_rings(struct ice_vsi *vsi) */ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) { + bool dvm_ena = ice_is_dvm_ena(&vsi->back->hw); struct ice_pf *pf = vsi->back; - int i; + struct device *dev; + u16 i; + dev = ice_pf_to_dev(pf); /* Allocate Tx rings */ for (i = 0; i < vsi->alloc_txq; i++) { struct ice_ring *ring; @@ -1307,11 +1570,14 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) ring->q_index = i; ring->reg_idx = vsi->txq_map[i]; - ring->ring_active = false; ring->vsi = vsi; - ring->dev = &pf->pdev->dev; + ring->dev = dev; ring->count = vsi->num_tx_desc; - vsi->tx_rings[i] = ring; + if (dvm_ena) + ring->flags |= ICE_TX_FLAGS_VLAN_TAG_LOC_L2TAG2; + else + ring->flags |= ICE_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + WRITE_ONCE(vsi->tx_rings[i], ring); } /* Allocate Rx rings */ @@ -1325,12 +1591,11 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) ring->q_index = i; ring->reg_idx = vsi->rxq_map[i]; - ring->ring_active = false; ring->vsi = vsi; ring->netdev = vsi->netdev; - ring->dev = &pf->pdev->dev; + ring->dev = dev; 
ring->count = vsi->num_rx_desc; - vsi->rx_rings[i] = ring; + WRITE_ONCE(vsi->rx_rings[i], ring); } return 0; @@ -1341,63 +1606,41 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi) } /** - * ice_vsi_map_rings_to_vectors - Map VSI rings to interrupt vectors - * @vsi: the VSI being configured - * - * This function maps descriptor rings to the queue-specific vectors allotted - * through the MSI-X enabling code. On a constrained vector budget, we map Tx - * and Rx rings to the vector as "efficiently" as possible. + * ice_vsi_reset_stats - Reset all stats of a given VSI + * @vsi: the VSI whose stats need to be cleared */ -#ifdef CONFIG_DCB -void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) -#else -static void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) -#endif /* CONFIG_DCB */ -{ - int q_vectors = vsi->num_q_vectors; - int tx_rings_rem, rx_rings_rem; - int v_id; - - /* initially assigning remaining rings count to VSIs num queue value */ - tx_rings_rem = vsi->num_txq; - rx_rings_rem = vsi->num_rxq; - - for (v_id = 0; v_id < q_vectors; v_id++) { - struct ice_q_vector *q_vector = vsi->q_vectors[v_id]; - int tx_rings_per_v, rx_rings_per_v, q_id, q_base; - - /* Tx rings mapping to vector */ - tx_rings_per_v = DIV_ROUND_UP(tx_rings_rem, q_vectors - v_id); - q_vector->num_ring_tx = tx_rings_per_v; - q_vector->tx.ring = NULL; - q_vector->tx.itr_idx = ICE_TX_ITR; - q_base = vsi->num_txq - tx_rings_rem; - - for (q_id = q_base; q_id < (q_base + tx_rings_per_v); q_id++) { - struct ice_ring *tx_ring = vsi->tx_rings[q_id]; - - tx_ring->q_vector = q_vector; - tx_ring->next = q_vector->tx.ring; - q_vector->tx.ring = tx_ring; - } - tx_rings_rem -= tx_rings_per_v; +static void ice_vsi_reset_stats(struct ice_vsi *vsi) +{ + int i; - /* Rx rings mapping to vector */ - rx_rings_per_v = DIV_ROUND_UP(rx_rings_rem, q_vectors - v_id); - q_vector->num_ring_rx = rx_rings_per_v; - q_vector->rx.ring = NULL; - q_vector->rx.itr_idx = ICE_RX_ITR; - q_base = vsi->num_rxq - rx_rings_rem; + if (!vsi) + return; - for (q_id = q_base; q_id < (q_base + rx_rings_per_v); q_id++) { - struct ice_ring *rx_ring = vsi->rx_rings[q_id]; + memset(&vsi->net_stats, 0, sizeof(vsi->net_stats)); + memset(&vsi->eth_stats, 0, sizeof(vsi->eth_stats)); + memset(&vsi->eth_stats_prev, 0, sizeof(vsi->eth_stats_prev)); - rx_ring->q_vector = q_vector; - rx_ring->next = q_vector->rx.ring; - q_vector->rx.ring = rx_ring; + if (vsi->tx_rings) { + ice_for_each_txq(vsi, i) { + if (vsi->tx_rings[i]) { + memset(&vsi->tx_rings[i]->stats, 0, + sizeof(vsi->tx_rings[i]->stats)); + memset(&vsi->tx_rings[i]->tx_stats, 0, + sizeof(vsi->tx_rings[i]->tx_stats)); + } + } + } + if (vsi->rx_rings) { + ice_for_each_rxq(vsi, i) { + if (vsi->rx_rings[i]) { + memset(&vsi->rx_rings[i]->stats, 0, + sizeof(vsi->rx_rings[i]->stats)); + memset(&vsi->rx_rings[i]->rx_stats, 0, + sizeof(vsi->rx_rings[i]->rx_stats)); + } } - rx_rings_rem -= rx_rings_per_v; } + vsi->stat_offsets_loaded = false; } /** @@ -1409,15 +1652,13 @@ static void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi) * LUT, while in the event of enable request for RSS, it will reconfigure RSS * LUT. 
*/ -int ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena) +void ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena) { - int err = 0; u8 *lut; - lut = devm_kzalloc(&vsi->back->pdev->dev, vsi->rss_table_size, - GFP_KERNEL); + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); if (!lut) - return -ENOMEM; + return; if (ena) { if (vsi->rss_lut_user) @@ -1427,26 +1668,45 @@ int ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena) vsi->rss_size); } - err = ice_set_rss(vsi, NULL, lut, vsi->rss_table_size); - devm_kfree(&vsi->back->pdev->dev, lut); - return err; + ice_set_rss_lut(vsi, lut, vsi->rss_table_size); + kfree(lut); } /** * ice_vsi_cfg_rss_lut_key - Configure RSS params for a VSI * @vsi: VSI to be configured */ -static int ice_vsi_cfg_rss_lut_key(struct ice_vsi *vsi) +int ice_vsi_cfg_rss_lut_key(struct ice_vsi *vsi) { - struct ice_aqc_get_set_rss_keys *key; struct ice_pf *pf = vsi->back; - enum ice_status status; - int err = 0; - u8 *lut; + struct device *dev; + u8 *lut, *key; + int err; + + dev = ice_pf_to_dev(pf); +#ifdef NETIF_F_HW_TC + if (vsi->type == ICE_VSI_PF && vsi->ch_rss_size && + (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags))) { + vsi->rss_size = min_t(u16, vsi->rss_size, vsi->ch_rss_size); + } else { + vsi->rss_size = min_t(u16, vsi->rss_size, vsi->num_rxq); - vsi->rss_size = min_t(int, vsi->rss_size, vsi->num_rxq); + /* If orig_rss_size is valid and it is less than determined + * main VSI's rss_size, update main VSI's rss_size to be + * orig_rss_size so that when tc-qdisc is deleted, main VSI + * RSS table gets programmed to be correct (whatever it was + * to begin with (prior to setup-tc for ADQ config) + */ + if (vsi->orig_rss_size && vsi->rss_size < vsi->orig_rss_size && + vsi->orig_rss_size <= vsi->num_rxq) { + vsi->rss_size = vsi->orig_rss_size; + /* now orig_rss_size is used, reset it to zero */ + vsi->orig_rss_size = 0; + } + } +#endif /* NETIF_F_HW_TC */ - lut = devm_kzalloc(&pf->pdev->dev, vsi->rss_table_size, GFP_KERNEL); + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); if (!lut) return -ENOMEM; @@ -1455,75 +1715,146 @@ static int ice_vsi_cfg_rss_lut_key(struct ice_vsi *vsi) else ice_fill_rss_lut(lut, vsi->rss_table_size, vsi->rss_size); - status = ice_aq_set_rss_lut(&pf->hw, vsi->idx, vsi->rss_lut_type, lut, - vsi->rss_table_size); - - if (status) { - dev_err(&pf->pdev->dev, - "set_rss_lut failed, error %d\n", status); - err = -EIO; + err = ice_set_rss_lut(vsi, lut, vsi->rss_table_size); + if (err) { + dev_err(dev, "set_rss_lut failed, error %d\n", err); goto ice_vsi_cfg_rss_exit; } - key = devm_kzalloc(&pf->pdev->dev, sizeof(*key), GFP_KERNEL); + key = kzalloc(ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE, GFP_KERNEL); if (!key) { err = -ENOMEM; goto ice_vsi_cfg_rss_exit; } if (vsi->rss_hkey_user) - memcpy(key, - (struct ice_aqc_get_set_rss_keys *)vsi->rss_hkey_user, - ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE); + memcpy(key, vsi->rss_hkey_user, ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE); else - netdev_rss_key_fill((void *)key, - ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE); - - status = ice_aq_set_rss_key(&pf->hw, vsi->idx, key); + netdev_rss_key_fill((void *)key, ICE_GET_SET_RSS_KEY_EXTEND_KEY_SIZE); - if (status) { - dev_err(&pf->pdev->dev, "set_rss_key failed, error %d\n", - status); - err = -EIO; - } + err = ice_set_rss_key(vsi, key); + if (err) + dev_err(dev, "set_rss_key failed, error %d\n", err); - devm_kfree(&pf->pdev->dev, key); + kfree(key); ice_vsi_cfg_rss_exit: - devm_kfree(&pf->pdev->dev, lut); + kfree(lut); return err; } /** - * ice_add_mac_to_list - Add a MAC address filter 
entry to the list - * @vsi: the VSI to be forwarded to - * @add_list: pointer to the list which contains MAC filter entries - * @macaddr: the MAC address to be added. + * ice_vsi_set_vf_rss_flow_fld - Sets VF VSI RSS input set for different flows + * @vsi: VSI to be configured * - * Adds MAC address filter entry to the temp list + * This function will only be called during the VF VSI setup. Upon successful + * completion of package download, this function will configure default RSS + * input sets for VF VSI. + */ +static void ice_vsi_set_vf_rss_flow_fld(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. Package download failed, vsi num = %d\n", + vsi->vsi_num); + return; + } + + status = ice_add_avf_rss_cfg(&pf->hw, vsi->idx, ICE_DEFAULT_RSS_HENA); + if (status) + dev_dbg(dev, "ice_add_avf_rss_cfg failed for vsi = %d, error = %s\n", + vsi->vsi_num, ice_stat_str(status)); +} + + +static const struct ice_rss_hash_cfg default_rss_cfgs[] = { + /* configure RSS for IPv4 with input set IP src/dst */ + {ICE_FLOW_SEG_HDR_IPV4, ICE_FLOW_HASH_IPV4, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for IPv6 with input set IPv6 src/dst */ + {ICE_FLOW_SEG_HDR_IPV6, ICE_FLOW_HASH_IPV6, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for tcp4 with input set IP src/dst, TCP src/dst */ + {ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV4, + ICE_HASH_TCP_IPV4, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for udp4 with input set IP src/dst, UDP src/dst */ + {ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV4, + ICE_HASH_UDP_IPV4, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for sctp4 with input set IP src/dst */ + {ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV4, + ICE_HASH_SCTP_IPV4, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for tcp6 with input set IPv6 src/dst, TCP src/dst */ + {ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV6, + ICE_HASH_TCP_IPV6, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for udp6 with input set IPv6 src/dst, UDP src/dst */ + {ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV6, + ICE_HASH_UDP_IPV6, ICE_RSS_ANY_HEADERS, false}, + /* configure RSS for sctp6 with input set IPv6 src/dst */ + {ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV6, + ICE_HASH_SCTP_IPV6, ICE_RSS_ANY_HEADERS, false}, +}; + +/** + * ice_vsi_set_rss_flow_fld - Sets RSS input set for different flows + * @vsi: VSI to be configured * - * Returns 0 on success or ENOMEM on failure. + * This function will only be called after successful download package call + * during initialization of PF. Since the downloaded package will erase the + * RSS section, this function will configure RSS input sets for different + * flow types. The last profile added has the highest priority, therefore 2 + * tuple profiles (i.e. IPv4 src/dst) are added before 4 tuple profiles + * (i.e. IPv4 src/dst TCP src/dst port). */ -int ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list, - const u8 *macaddr) +static void ice_vsi_set_rss_flow_fld(struct ice_vsi *vsi) { - struct ice_fltr_list_entry *tmp; + u16 vsi_handle = vsi->idx, vsi_num = vsi->vsi_num; struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct device *dev; + u32 i; - tmp = devm_kzalloc(&pf->pdev->dev, sizeof(*tmp), GFP_ATOMIC); - if (!tmp) - return -ENOMEM; + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. 
Package download failed, vsi num = %d\n", + vsi_num); + return; + } - tmp->fltr_info.flag = ICE_FLTR_TX; - tmp->fltr_info.src_id = ICE_SRC_ID_VSI; - tmp->fltr_info.lkup_type = ICE_SW_LKUP_MAC; - tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI; - tmp->fltr_info.vsi_handle = vsi->idx; - ether_addr_copy(tmp->fltr_info.l_data.mac.mac_addr, macaddr); + for (i = 0; i < ARRAY_SIZE(default_rss_cfgs); i++) { + const struct ice_rss_hash_cfg *cfg = &default_rss_cfgs[i]; - INIT_LIST_HEAD(&tmp->list_entry); - list_add(&tmp->list_entry, add_list); + status = ice_add_rss_cfg(hw, vsi_handle, cfg); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed, addl_hdrs = %x, hash_flds = %llx, hdr_type = %d, symm = %d\n", + cfg->addl_hdrs, cfg->hash_flds, cfg->hdr_type, + cfg->symm); + } +} - return 0; +/** + * ice_pf_state_is_nominal - checks the PF for nominal state + * @pf: pointer to PF to check + * + * Check the PF's state for a collection of bits that would indicate + * the PF is in a state that would inhibit normal operation for + * driver functionality. + * + * Returns true if PF is in a nominal state, false otherwise + */ +bool ice_pf_state_is_nominal(struct ice_pf *pf) +{ + DECLARE_BITMAP(check_bits, ICE_STATE_NBITS) = { 0 }; + + if (!pf) + return false; + + bitmap_set(check_bits, 0, ICE_STATE_NOMINAL_CHECK_BITS); + if (bitmap_intersects(pf->state, check_bits, ICE_STATE_NBITS)) + return false; + + return true; } /** @@ -1573,103 +1904,86 @@ void ice_update_eth_stats(struct ice_vsi *vsi) } /** - * ice_free_fltr_list - free filter lists helper - * @dev: pointer to the device struct - * @h: pointer to the list head to be freed - * - * Helper function to free filter lists previously created using - * ice_add_mac_to_list + * ice_vsi_cfg_frame_size - setup max frame size and Rx buffer length + * @vsi: VSI */ -void ice_free_fltr_list(struct device *dev, struct list_head *h) +void ice_vsi_cfg_frame_size(struct ice_vsi *vsi) { - struct ice_fltr_list_entry *e, *tmp; - - list_for_each_entry_safe(e, tmp, h, list_entry) { - list_del(&e->list_entry); - devm_kfree(dev, e); + if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) { + vsi->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX; + vsi->rx_buf_len = ICE_RXBUF_2048; +#if (PAGE_SIZE < 8192) + } else if (!ICE_2K_TOO_SMALL_WITH_PADDING && + (vsi->netdev->mtu <= ETH_DATA_LEN)) { + vsi->max_frame = ICE_RXBUF_1536 - NET_IP_ALIGN; + vsi->rx_buf_len = ICE_RXBUF_1536 - NET_IP_ALIGN; +#endif + } else { + vsi->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX; +#if (PAGE_SIZE < 8192) + vsi->rx_buf_len = ICE_RXBUF_3072; +#else + vsi->rx_buf_len = ICE_RXBUF_2048; +#endif } } /** - * ice_vsi_add_vlan - Add VSI membership for given VLAN - * @vsi: the VSI being configured - * @vid: VLAN ID to be added + * ice_write_qrxflxp_cntxt - write/configure QRXFLXP_CNTXT register + * @hw: HW pointer + * @pf_q: index of the Rx queue in the PF's queue space + * @rxdid: flexible descriptor RXDID + * @prio: priority for the RXDID for this queue + * @ena_ts: true to enable timestamp and false to disable timestamp */ -int ice_vsi_add_vlan(struct ice_vsi *vsi, u16 vid) +void +ice_write_qrxflxp_cntxt(struct ice_hw *hw, u16 pf_q, u32 rxdid, u32 prio, + bool ena_ts) { - struct ice_fltr_list_entry *tmp; - struct ice_pf *pf = vsi->back; - LIST_HEAD(tmp_add_list); - enum ice_status status; - int err = 0; + int regval = rd32(hw, QRXFLXP_CNTXT(pf_q)); - tmp = devm_kzalloc(&pf->pdev->dev, sizeof(*tmp), GFP_KERNEL); - if (!tmp) - return -ENOMEM; + /* clear any previous values */ + regval &= 
~(QRXFLXP_CNTXT_RXDID_IDX_M | + QRXFLXP_CNTXT_RXDID_PRIO_M | + QRXFLXP_CNTXT_TS_M); - tmp->fltr_info.lkup_type = ICE_SW_LKUP_VLAN; - tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI; - tmp->fltr_info.flag = ICE_FLTR_TX; - tmp->fltr_info.src_id = ICE_SRC_ID_VSI; - tmp->fltr_info.vsi_handle = vsi->idx; - tmp->fltr_info.l_data.vlan.vlan_id = vid; + regval |= (rxdid << QRXFLXP_CNTXT_RXDID_IDX_S) & + QRXFLXP_CNTXT_RXDID_IDX_M; - INIT_LIST_HEAD(&tmp->list_entry); - list_add(&tmp->list_entry, &tmp_add_list); + regval |= (prio << QRXFLXP_CNTXT_RXDID_PRIO_S) & + QRXFLXP_CNTXT_RXDID_PRIO_M; - status = ice_add_vlan(&pf->hw, &tmp_add_list); - if (status) { - err = -ENODEV; - dev_err(&pf->pdev->dev, "Failure Adding VLAN %d on VSI %i\n", - vid, vsi->vsi_num); - } + if (ena_ts) + /* Enable TimeSync on this queue */ + regval |= QRXFLXP_CNTXT_TS_M; - ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list); - return err; + wr32(hw, QRXFLXP_CNTXT(pf_q), regval); } -/** - * ice_vsi_kill_vlan - Remove VSI membership for a given VLAN - * @vsi: the VSI being configured - * @vid: VLAN ID to be removed - * - * Returns 0 on success and negative on failure - */ -int ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid) +int ice_vsi_cfg_single_rxq(struct ice_vsi *vsi, u16 q_idx) { - struct ice_fltr_list_entry *list; - struct ice_pf *pf = vsi->back; - LIST_HEAD(tmp_add_list); - enum ice_status status; - int err = 0; + if (q_idx >= vsi->num_rxq) + return -EINVAL; + + return ice_vsi_cfg_rxq(vsi->rx_rings[q_idx]); +} + +int ice_vsi_cfg_single_txq(struct ice_vsi *vsi, struct ice_ring **tx_rings, u16 q_idx) +{ + struct ice_aqc_add_tx_qgrp *qg_buf; + int err; + + if (q_idx >= vsi->alloc_txq || !tx_rings || !tx_rings[q_idx]) + return -EINVAL; - list = devm_kzalloc(&pf->pdev->dev, sizeof(*list), GFP_KERNEL); - if (!list) + qg_buf = kzalloc(struct_size(qg_buf, txqs, 1), GFP_KERNEL); + if (!qg_buf) return -ENOMEM; - list->fltr_info.lkup_type = ICE_SW_LKUP_VLAN; - list->fltr_info.vsi_handle = vsi->idx; - list->fltr_info.fltr_act = ICE_FWD_TO_VSI; - list->fltr_info.l_data.vlan.vlan_id = vid; - list->fltr_info.flag = ICE_FLTR_TX; - list->fltr_info.src_id = ICE_SRC_ID_VSI; - - INIT_LIST_HEAD(&list->list_entry); - list_add(&list->list_entry, &tmp_add_list); - - status = ice_remove_vlan(&pf->hw, &tmp_add_list); - if (status == ICE_ERR_DOES_NOT_EXIST) { - dev_dbg(&pf->pdev->dev, - "Failed to remove VLAN %d on VSI %i, it does not exist, status: %d\n", - vid, vsi->vsi_num, status); - } else if (status) { - dev_err(&pf->pdev->dev, - "Error removing VLAN %d on vsi %i error: %d\n", - vid, vsi->vsi_num, status); - err = -EIO; - } + qg_buf->num_txqs = 1; - ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list); + err = ice_vsi_cfg_txq(vsi, tx_rings[q_idx], qg_buf); + kfree(qg_buf); return err; } @@ -1687,83 +2001,16 @@ int ice_vsi_cfg_rxqs(struct ice_vsi *vsi) if (vsi->type == ICE_VSI_VF) goto setup_rings; - if (vsi->netdev && vsi->netdev->mtu > ETH_DATA_LEN) - vsi->max_frame = vsi->netdev->mtu + - ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; - else - vsi->max_frame = ICE_RXBUF_2048; - - vsi->rx_buf_len = ICE_RXBUF_2048; + ice_vsi_cfg_frame_size(vsi); setup_rings: /* set up individual rings */ - for (i = 0; i < vsi->num_rxq; i++) { - int err; + ice_for_each_rxq(vsi, i) { + int err = ice_vsi_cfg_rxq(vsi->rx_rings[i]); - err = ice_setup_rx_ctx(vsi->rx_rings[i]); - if (err) { - dev_err(&vsi->back->pdev->dev, - "ice_setup_rx_ctx failed for RxQ %d, err %d\n", - i, err); + if (err) return err; - } - } - - return 0; -} - -/** - * ice_vsi_cfg_txq - Configure single Tx queue - * 
@vsi: the VSI that queue belongs to - * @ring: Tx ring to be configured - * @tc_q_idx: queue index within given TC - * @qg_buf: queue group buffer - * @tc: TC that Tx ring belongs to - */ -static int -ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, u16 tc_q_idx, - struct ice_aqc_add_tx_qgrp *qg_buf, u8 tc) -{ - struct ice_tlan_ctx tlan_ctx = { 0 }; - struct ice_aqc_add_txqs_perq *txq; - struct ice_pf *pf = vsi->back; - u8 buf_len = sizeof(*qg_buf); - enum ice_status status; - u16 pf_q; - - pf_q = ring->reg_idx; - ice_setup_tx_ctx(ring, &tlan_ctx, pf_q); - /* copy context contents into the qg_buf */ - qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q); - ice_set_ctx((u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx, - ice_tlan_ctx_info); - - /* init queue specific tail reg. It is referred as - * transmit comm scheduler queue doorbell. - */ - ring->tail = pf->hw.hw_addr + QTX_COMM_DBELL(pf_q); - - /* Add unique software queue handle of the Tx queue per - * TC into the VSI Tx ring - */ - ring->q_handle = tc_q_idx; - - status = ice_ena_vsi_txq(vsi->port_info, vsi->idx, tc, ring->q_handle, - 1, qg_buf, buf_len, NULL); - if (status) { - dev_err(&pf->pdev->dev, - "Failed to set LAN Tx queue context, error: %d\n", - status); - return -ENODEV; } - /* Add Tx Queue TEID into the VSI Tx ring from the - * response. This will complete configuring and - * enabling the queue. - */ - txq = &qg_buf->txqs[0]; - if (pf_q == le16_to_cpu(txq->txq_id)) - ring->txq_teid = le32_to_cpu(txq->q_teid); - return 0; } @@ -1771,42 +2018,31 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, u16 tc_q_idx, * ice_vsi_cfg_txqs - Configure the VSI for Tx * @vsi: the VSI being configured * @rings: Tx ring array to be configured - * @offset: offset within vsi->txq_map * * Return 0 on success and a negative value on error * Configure the Tx VSI for operation. */ static int -ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings, int offset) +ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings) { struct ice_aqc_add_tx_qgrp *qg_buf; - struct ice_pf *pf = vsi->back; - u16 q_idx = 0, i; + u16 q_idx = 0; int err = 0; - u8 tc; - qg_buf = devm_kzalloc(&pf->pdev->dev, sizeof(*qg_buf), GFP_KERNEL); + qg_buf = kzalloc(struct_size(qg_buf, txqs, 1), GFP_KERNEL); if (!qg_buf) return -ENOMEM; qg_buf->num_txqs = 1; - /* set up and configure the Tx queues for each enabled TC */ - ice_for_each_traffic_class(tc) { - if (!(vsi->tc_cfg.ena_tc & BIT(tc))) - break; - - for (i = 0; i < vsi->tc_cfg.tc_info[tc].qcount_tx; i++) { - err = ice_vsi_cfg_txq(vsi, rings[q_idx], i + offset, - qg_buf, tc); - if (err) - goto err_cfg_txqs; - - q_idx++; - } + for (q_idx = 0; q_idx < vsi->num_txq; q_idx++) { + err = ice_vsi_cfg_txq(vsi, rings[q_idx], qg_buf); + if (err) + goto err_cfg_txqs; } + err_cfg_txqs: - devm_kfree(&pf->pdev->dev, qg_buf); + kfree(qg_buf); return err; } @@ -1819,8 +2055,36 @@ ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings, int offset) */ int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi) { - return ice_vsi_cfg_txqs(vsi, vsi->tx_rings, 0); + return ice_vsi_cfg_txqs(vsi, vsi->tx_rings); +} + +#ifdef HAVE_XDP_SUPPORT +/** + * ice_vsi_cfg_xdp_txqs - Configure Tx queues dedicated for XDP in given VSI + * @vsi: the VSI being configured + * + * Return 0 on success and a negative value on error + * Configure the Tx queues dedicated for XDP in given VSI for operation. 
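+ *
+ * A rough usage sketch (hypothetical caller; ice_is_xdp_ena_vsi() is used
+ * the same way elsewhere in this patch), assuming the XDP Tx rings have
+ * already been allocated and mapped:
+ *
+ *	if (ice_is_xdp_ena_vsi(vsi)) {
+ *		err = ice_vsi_cfg_xdp_txqs(vsi);
+ *		if (err)
+ *			return err;
+ *	}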
+ */ +int ice_vsi_cfg_xdp_txqs(struct ice_vsi *vsi) +{ + int ret; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + int i; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + + ret = ice_vsi_cfg_txqs(vsi, vsi->xdp_rings); +#ifdef HAVE_AF_XDP_ZC_SUPPORT + if (ret) + return ret; + + for (i = 0; i < vsi->num_xdp_txq; i++) + vsi->xdp_rings[i]->xsk_pool = ice_xsk_umem(vsi->xdp_rings[i]); +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + + return ret; } +#endif /* HAVE_XDP_SUPPORT */ /** * ice_intrl_usec_to_reg - convert interrupt rate limit to register value @@ -1830,7 +2094,7 @@ int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi) * This function converts a decimal interrupt rate limit in usecs to the format * expected by firmware. */ -u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran) +static u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran) { u32 val = intrl / gran; @@ -1840,138 +2104,73 @@ u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran) } /** - * ice_cfg_itr_gran - set the ITR granularity to 2 usecs if not already set - * @hw: board specific structure + * ice_write_intrl - write throttle rate limit to interrupt specific register + * @q_vector: pointer to interrupt specific structure + * @intrl: throttle rate limit in microseconds to write */ -static void ice_cfg_itr_gran(struct ice_hw *hw) +void ice_write_intrl(struct ice_q_vector *q_vector, u8 intrl) { - u32 regval = rd32(hw, GLINT_CTL); - - /* no need to update global register if ITR gran is already set */ - if (!(regval & GLINT_CTL_DIS_AUTOMASK_M) && - (((regval & GLINT_CTL_ITR_GRAN_200_M) >> - GLINT_CTL_ITR_GRAN_200_S) == ICE_ITR_GRAN_US) && - (((regval & GLINT_CTL_ITR_GRAN_100_M) >> - GLINT_CTL_ITR_GRAN_100_S) == ICE_ITR_GRAN_US) && - (((regval & GLINT_CTL_ITR_GRAN_50_M) >> - GLINT_CTL_ITR_GRAN_50_S) == ICE_ITR_GRAN_US) && - (((regval & GLINT_CTL_ITR_GRAN_25_M) >> - GLINT_CTL_ITR_GRAN_25_S) == ICE_ITR_GRAN_US)) - return; + struct ice_hw *hw = &q_vector->vsi->back->hw; - regval = ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_200_S) & - GLINT_CTL_ITR_GRAN_200_M) | - ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_100_S) & - GLINT_CTL_ITR_GRAN_100_M) | - ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_50_S) & - GLINT_CTL_ITR_GRAN_50_M) | - ((ICE_ITR_GRAN_US << GLINT_CTL_ITR_GRAN_25_S) & - GLINT_CTL_ITR_GRAN_25_M); - wr32(hw, GLINT_CTL, regval); + wr32(hw, GLINT_RATE(q_vector->reg_idx), + ice_intrl_usec_to_reg(intrl, ICE_INTRL_GRAN_ABOVE_25)); } /** - * ice_cfg_itr - configure the initial interrupt throttle values - * @hw: pointer to the HW structure - * @q_vector: interrupt vector that's being configured - * - * Configure interrupt throttling values for the ring containers that are - * associated with the interrupt vector passed in. 
+ * __ice_write_itr - write throttle rate to register
+ * @q_vector: pointer to interrupt data structure
+ * @rc: pointer to ring container
+ * @itr: throttle rate in microseconds to write
 */
-static void
-ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector)
+static void __ice_write_itr(struct ice_q_vector *q_vector,
+			    struct ice_ring_container *rc, u16 itr)
 {
-	ice_cfg_itr_gran(hw);
-
-	if (q_vector->num_ring_rx) {
-		struct ice_ring_container *rc = &q_vector->rx;
-
-		/* if this value is set then don't overwrite with default */
-		if (!rc->itr_setting)
-			rc->itr_setting = ICE_DFLT_RX_ITR;
-
-		rc->target_itr = ITR_TO_REG(rc->itr_setting);
-		rc->next_update = jiffies + 1;
-		rc->current_itr = rc->target_itr;
-		wr32(hw, GLINT_ITR(rc->itr_idx, q_vector->reg_idx),
-		     ITR_REG_ALIGN(rc->current_itr) >> ICE_ITR_GRAN_S);
-	}
-
-	if (q_vector->num_ring_tx) {
-		struct ice_ring_container *rc = &q_vector->tx;
+	struct ice_hw *hw = &q_vector->vsi->back->hw;
-		/* if this value is set then don't overwrite with default */
-		if (!rc->itr_setting)
-			rc->itr_setting = ICE_DFLT_TX_ITR;
-
-		rc->target_itr = ITR_TO_REG(rc->itr_setting);
-		rc->next_update = jiffies + 1;
-		rc->current_itr = rc->target_itr;
-		wr32(hw, GLINT_ITR(rc->itr_idx, q_vector->reg_idx),
-		     ITR_REG_ALIGN(rc->current_itr) >> ICE_ITR_GRAN_S);
-	}
+	wr32(hw, GLINT_ITR(rc->itr_idx, q_vector->reg_idx),
+	     ITR_REG_ALIGN(itr) >> ICE_ITR_GRAN_S);
 }

 /**
- * ice_cfg_txq_interrupt - configure interrupt on Tx queue
- * @vsi: the VSI being configured
- * @txq: Tx queue being mapped to MSI-X vector
- * @msix_idx: MSI-X vector index within the function
- * @itr_idx: ITR index of the interrupt cause
- *
- * Configure interrupt on Tx queue by associating Tx queue to MSI-X vector
- * within the function space.
+ * ice_write_itr - write throttle rate to queue specific register
+ * @rc: pointer to ring container
+ * @itr: throttle rate in microseconds to write
 */
-#ifdef CONFIG_PCI_IOV
-void
-ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx)
-#else
-static void
-ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx)
-#endif /* CONFIG_PCI_IOV */
+void ice_write_itr(struct ice_ring_container *rc, u16 itr)
 {
-	struct ice_pf *pf = vsi->back;
-	struct ice_hw *hw = &pf->hw;
-	u32 val;
+	struct ice_q_vector *q_vector;

-	itr_idx = (itr_idx << QINT_TQCTL_ITR_INDX_S) & QINT_TQCTL_ITR_INDX_M;
+	if (!rc->ring)
+		return;

-	val = QINT_TQCTL_CAUSE_ENA_M | itr_idx |
-	      ((msix_idx << QINT_TQCTL_MSIX_INDX_S) & QINT_TQCTL_MSIX_INDX_M);
+	q_vector = rc->ring->q_vector;

-	wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), val);
+	__ice_write_itr(q_vector, rc, itr);
 }

 /**
- * ice_cfg_rxq_interrupt - configure interrupt on Rx queue
- * @vsi: the VSI being configured
- * @rxq: Rx queue being mapped to MSI-X vector
- * @msix_idx: MSI-X vector index within the function
- * @itr_idx: ITR index of the interrupt cause
+ * ice_set_q_vector_intrl - set up interrupt rate limiting
+ * @q_vector: the vector to be configured
  *
- * Configure interrupt on Rx queue by associating Rx queue to MSI-X vector
- * within the function space.
+ * Interrupt rate limiting is local to the vector, not per-queue, so we must
+ * detect whether either ring container has dynamic moderation enabled to
+ * decide what to set the interrupt rate limit to via the INTRL settings. If
+ * dynamic moderation is disabled on both, write the cached value to make
+ * sure INTRL matches the user-visible value.
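+ *
+ * A sketch of the assumed caller flow (the ethtool coalesce path is not part
+ * of this patch; the ethtool_coalesce field name is real, the wiring shown is
+ * illustrative):
+ *
+ *	q_vector->intrl = ec->rx_coalesce_usecs_high;
+ *	ice_set_q_vector_intrl(q_vector);
+ *
+ * With a granularity of 4 us (the value ICE_INTRL_GRAN_ABOVE_25 is assumed to
+ * carry), a cached intrl of 20 us reaches GLINT_RATE as 20 / 4 = 5 via
+ * ice_intrl_usec_to_reg() above.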
*/ -#ifdef CONFIG_PCI_IOV -void -ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx) -#else -static void -ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx) -#endif /* CONFIG_PCI_IOV */ +void ice_set_q_vector_intrl(struct ice_q_vector *q_vector) { - struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; - u32 val; - - itr_idx = (itr_idx << QINT_RQCTL_ITR_INDX_S) & QINT_RQCTL_ITR_INDX_M; - - val = QINT_RQCTL_CAUSE_ENA_M | itr_idx | - ((msix_idx << QINT_RQCTL_MSIX_INDX_S) & QINT_RQCTL_MSIX_INDX_M); - - wr32(hw, QINT_RQCTL(vsi->rxq_map[rxq]), val); - - ice_flush(hw); + if (ITR_IS_DYNAMIC(&q_vector->tx) || ITR_IS_DYNAMIC(&q_vector->rx)) { + /* in the case of dynamic enabled, cap each vector to no more + * than (4 us) 250,000 ints/sec, which allows low latency + * but still less than 500,000 interrupts per second, which + * reduces CPU a bit in the case of the lowest latency + * setting. The 4 here is a value in microseconds. + */ + ice_write_intrl(q_vector, 4); + } else { + ice_write_intrl(q_vector, q_vector->intrl); + } } /** @@ -1985,7 +2184,7 @@ void ice_vsi_cfg_msix(struct ice_vsi *vsi) { struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; - u32 txq = 0, rxq = 0; + u16 txq = 0, rxq = 0; int i, q; for (i = 0; i < vsi->num_q_vectors; i++) { @@ -1994,9 +2193,6 @@ void ice_vsi_cfg_msix(struct ice_vsi *vsi) ice_cfg_itr(hw, q_vector); - wr32(hw, GLINT_RATE(reg_idx), - ice_intrl_usec_to_reg(q_vector->intrl, hw->intrl_gran)); - /* Both Transmit Queue Interrupt Cause Control register * and Receive Queue Interrupt Cause control register * expects MSIX_INDX field to be the vector index @@ -2023,217 +2219,25 @@ void ice_vsi_cfg_msix(struct ice_vsi *vsi) } /** - * ice_vsi_manage_vlan_insertion - Manage VLAN insertion for the VSI for Tx - * @vsi: the VSI being changed - */ -int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi) -{ - struct device *dev = &vsi->back->pdev->dev; - struct ice_hw *hw = &vsi->back->hw; - struct ice_vsi_ctx *ctxt; - enum ice_status status; - int ret = 0; - - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; - - /* Here we are configuring the VSI to let the driver add VLAN tags by - * setting vlan_flags to ICE_AQ_VSI_VLAN_MODE_ALL. The actual VLAN tag - * insertion happens in the Tx hot path, in ice_tx_map. - */ - ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_MODE_ALL; - - /* Preserve existing VLAN strip setting */ - ctxt->info.vlan_flags |= (vsi->info.vlan_flags & - ICE_AQ_VSI_VLAN_EMOD_M); - - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID); - - status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); - if (status) { - dev_err(dev, "update VSI for VLAN insert failed, err %d aq_err %d\n", - status, hw->adminq.sq_last_status); - ret = -EIO; - goto out; - } - - vsi->info.vlan_flags = ctxt->info.vlan_flags; -out: - devm_kfree(dev, ctxt); - return ret; -} - -/** - * ice_vsi_manage_vlan_stripping - Manage VLAN stripping for the VSI for Rx - * @vsi: the VSI being changed - * @ena: boolean value indicating if this is a enable or disable request - */ -int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena) -{ - struct device *dev = &vsi->back->pdev->dev; - struct ice_hw *hw = &vsi->back->hw; - struct ice_vsi_ctx *ctxt; - enum ice_status status; - int ret = 0; - - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; - - /* Here we are configuring what the VSI should do with the VLAN tag in - * the Rx packet. 
We can either leave the tag in the packet or put it in - * the Rx descriptor. - */ - if (ena) - /* Strip VLAN tag from Rx packet and put it in the desc */ - ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_EMOD_STR_BOTH; - else - /* Disable stripping. Leave tag in packet */ - ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_EMOD_NOTHING; - - /* Allow all packets untagged/tagged */ - ctxt->info.vlan_flags |= ICE_AQ_VSI_VLAN_MODE_ALL; - - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID); - - status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); - if (status) { - dev_err(dev, "update VSI for VLAN strip failed, ena = %d err %d aq_err %d\n", - ena, status, hw->adminq.sq_last_status); - ret = -EIO; - goto out; - } - - vsi->info.vlan_flags = ctxt->info.vlan_flags; -out: - devm_kfree(dev, ctxt); - return ret; -} - -/** - * ice_vsi_start_rx_rings - start VSI's Rx rings - * @vsi: the VSI whose rings are to be started + * ice_vsi_start_all_rx_rings - start/enable all of a VSI's Rx rings + * @vsi: the VSI whose rings are to be enabled * * Returns 0 on success and a negative value on error */ -int ice_vsi_start_rx_rings(struct ice_vsi *vsi) +int ice_vsi_start_all_rx_rings(struct ice_vsi *vsi) { - return ice_vsi_ctrl_rx_rings(vsi, true); + return ice_vsi_ctrl_all_rx_rings(vsi, true); } /** - * ice_vsi_stop_rx_rings - stop VSI's Rx rings - * @vsi: the VSI + * ice_vsi_stop_all_rx_rings - stop/disable all of a VSI's Rx rings + * @vsi: the VSI whose rings are to be disabled * * Returns 0 on success and a negative value on error */ -int ice_vsi_stop_rx_rings(struct ice_vsi *vsi) -{ - return ice_vsi_ctrl_rx_rings(vsi, false); -} - -/** - * ice_trigger_sw_intr - trigger a software interrupt - * @hw: pointer to the HW structure - * @q_vector: interrupt vector to trigger the software interrupt for - */ -void ice_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector) -{ - wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx), - (ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) | - GLINT_DYN_CTL_SWINT_TRIG_M | - GLINT_DYN_CTL_INTENA_M); -} - -/** - * ice_vsi_stop_tx_ring - Disable single Tx ring - * @vsi: the VSI being configured - * @rst_src: reset source - * @rel_vmvf_num: Relative ID of VF/VM - * @ring: Tx ring to be stopped - * @txq_meta: Meta data of Tx ring to be stopped - */ -#ifndef CONFIG_PCI_IOV -static -#endif /* !CONFIG_PCI_IOV */ -int -ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, - u16 rel_vmvf_num, struct ice_ring *ring, - struct ice_txq_meta *txq_meta) -{ - struct ice_pf *pf = vsi->back; - struct ice_q_vector *q_vector; - struct ice_hw *hw = &pf->hw; - enum ice_status status; - u32 val; - - /* clear cause_ena bit for disabled queues */ - val = rd32(hw, QINT_TQCTL(ring->reg_idx)); - val &= ~QINT_TQCTL_CAUSE_ENA_M; - wr32(hw, QINT_TQCTL(ring->reg_idx), val); - - /* software is expected to wait for 100 ns */ - ndelay(100); - - /* trigger a software interrupt for the vector - * associated to the queue to schedule NAPI handler - */ - q_vector = ring->q_vector; - if (q_vector) - ice_trigger_sw_intr(hw, q_vector); - - status = ice_dis_vsi_txq(vsi->port_info, txq_meta->vsi_idx, - txq_meta->tc, 1, &txq_meta->q_handle, - &txq_meta->q_id, &txq_meta->q_teid, rst_src, - rel_vmvf_num, NULL); - - /* if the disable queue command was exercised during an - * active reset flow, ICE_ERR_RESET_ONGOING is returned. - * This is not an error as the reset operation disables - * queues at the hardware level anyway. 
- */ - if (status == ICE_ERR_RESET_ONGOING) { - dev_dbg(&vsi->back->pdev->dev, - "Reset in progress. LAN Tx queues already disabled\n"); - } else if (status == ICE_ERR_DOES_NOT_EXIST) { - dev_dbg(&vsi->back->pdev->dev, - "LAN Tx queues do not exist, nothing to disable\n"); - } else if (status) { - dev_err(&vsi->back->pdev->dev, - "Failed to disable LAN Tx queues, error: %d\n", status); - return -ENODEV; - } - - return 0; -} - -/** - * ice_fill_txq_meta - Prepare the Tx queue's meta data - * @vsi: VSI that ring belongs to - * @ring: ring that txq_meta will be based on - * @txq_meta: a helper struct that wraps Tx queue's information - * - * Set up a helper struct that will contain all the necessary fields that - * are needed for stopping Tx queue - */ -#ifndef CONFIG_PCI_IOV -static -#endif /* !CONFIG_PCI_IOV */ -void -ice_fill_txq_meta(struct ice_vsi *vsi, struct ice_ring *ring, - struct ice_txq_meta *txq_meta) +int ice_vsi_stop_all_rx_rings(struct ice_vsi *vsi) { - u8 tc = 0; - -#ifdef CONFIG_DCB - tc = ring->dcb_tc; -#endif /* CONFIG_DCB */ - txq_meta->q_id = ring->reg_idx; - txq_meta->q_teid = ring->txq_teid; - txq_meta->q_handle = ring->q_handle; - txq_meta->vsi_idx = vsi->idx; - txq_meta->tc = tc; + return ice_vsi_ctrl_all_rx_rings(vsi, false); } /** @@ -2247,34 +2251,24 @@ static int ice_vsi_stop_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, u16 rel_vmvf_num, struct ice_ring **rings) { - u16 i, q_idx = 0; - int status; - u8 tc; + u16 q_idx; if (vsi->num_txq > ICE_LAN_TXQ_MAX_QDIS) return -EINVAL; - /* set up the Tx queue list to be disabled for each enabled TC */ - ice_for_each_traffic_class(tc) { - if (!(vsi->tc_cfg.ena_tc & BIT(tc))) - break; - - for (i = 0; i < vsi->tc_cfg.tc_info[tc].qcount_tx; i++) { - struct ice_txq_meta txq_meta = { }; + for (q_idx = 0; q_idx < vsi->num_txq; q_idx++) { + struct ice_txq_meta txq_meta = { }; + int status; - if (!rings || !rings[q_idx]) - return -EINVAL; + if (!rings || !rings[q_idx]) + return -EINVAL; - ice_fill_txq_meta(vsi, rings[q_idx], &txq_meta); - status = ice_vsi_stop_tx_ring(vsi, rst_src, - rel_vmvf_num, - rings[q_idx], &txq_meta); + ice_fill_txq_meta(vsi, rings[q_idx], &txq_meta); + status = ice_vsi_stop_tx_ring(vsi, rst_src, rel_vmvf_num, + rings[q_idx], &txq_meta); - if (status) - return status; - - q_idx++; - } + if (status) + return status; } return 0; @@ -2293,82 +2287,53 @@ ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, return ice_vsi_stop_tx_rings(vsi, rst_src, rel_vmvf_num, vsi->tx_rings); } +#ifdef HAVE_XDP_SUPPORT /** - * ice_cfg_vlan_pruning - enable or disable VLAN pruning on the VSI - * @vsi: VSI to enable or disable VLAN pruning on - * @ena: set to true to enable VLAN pruning and false to disable it - * @vlan_promisc: enable valid security flags if not in VLAN promiscuous mode - * - * returns 0 if VSI is updated, negative otherwise + * ice_vsi_stop_xdp_tx_rings - Disable XDP Tx rings + * @vsi: the VSI being configured */ -int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena, bool vlan_promisc) +int ice_vsi_stop_xdp_tx_rings(struct ice_vsi *vsi) { - struct ice_vsi_ctx *ctxt; - struct device *dev; - struct ice_pf *pf; - int status; + return ice_vsi_stop_tx_rings(vsi, ICE_NO_RESET, 0, vsi->xdp_rings); +} - if (!vsi) - return -EINVAL; +#endif /* HAVE_XDP_SUPPORT */ - pf = vsi->back; - dev = &pf->pdev->dev; - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; - - ctxt->info = vsi->info; - - if (ena) { - ctxt->info.sec_flags |= - 
ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << - ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S; - ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - } else { - ctxt->info.sec_flags &= - ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << - ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); - ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - } - - if (!vlan_promisc) - ctxt->info.valid_sections = - cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID | - ICE_AQ_VSI_PROP_SW_VALID); - - status = ice_update_vsi(&pf->hw, vsi->idx, ctxt, NULL); - if (status) { - netdev_err(vsi->netdev, "%sabling VLAN pruning on VSI handle: %d, VSI HW ID: %d failed, err = %d, aq_err = %d\n", - ena ? "En" : "Dis", vsi->idx, vsi->vsi_num, status, - pf->hw.adminq.sq_last_status); - goto err_out; - } +/** + * ice_vsi_is_vlan_pruning_ena - check if VLAN pruning is enabled or not + * @vsi: VSI to check whether or not VLAN pruning is enabled. + * + * returns true if Rx VLAN pruning is enabled and false otherwise. + */ +bool ice_vsi_is_vlan_pruning_ena(struct ice_vsi *vsi) +{ + if (!vsi) + return false; - vsi->info.sec_flags = ctxt->info.sec_flags; - vsi->info.sw_flags2 = ctxt->info.sw_flags2; + return (vsi->info.sw_flags2 & ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA); +} - devm_kfree(dev, ctxt); - return 0; -err_out: - devm_kfree(dev, ctxt); - return -EIO; -} static void ice_vsi_set_tc_cfg(struct ice_vsi *vsi) { - struct ice_dcbx_cfg *cfg = &vsi->port_info->local_dcbx_cfg; + if (!test_bit(ICE_FLAG_DCB_ENA, vsi->back->flags)) { + vsi->tc_cfg.ena_tc = ICE_DFLT_TRAFFIC_CLASS; + vsi->tc_cfg.numtc = 1; + return; + } - vsi->tc_cfg.ena_tc = ice_dcb_get_ena_tc(cfg); - vsi->tc_cfg.numtc = ice_dcb_get_num_tc(cfg); + /* set VSI TC information based on DCB config */ + ice_vsi_set_dcb_tc_cfg(vsi); } /** * ice_vsi_set_q_vectors_reg_idx - set the HW register index for all q_vectors * @vsi: VSI to set the q_vectors register index on + * @tc: traffic class for VF ADQ */ -static int -ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) +static int ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi, + u8 __maybe_unused tc) { u16 i; @@ -2379,8 +2344,7 @@ ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) struct ice_q_vector *q_vector = vsi->q_vectors[i]; if (!q_vector) { - dev_err(&vsi->back->pdev->dev, - "Failed to set reg_idx on q_vector %d VSI %d\n", + dev_err(ice_pf_to_dev(vsi->back), "Failed to set reg_idx on q_vector %d VSI %d\n", i, vsi->vsi_num); goto clear_reg_idx; } @@ -2388,7 +2352,8 @@ ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) if (vsi->type == ICE_VSI_VF) { struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; - q_vector->reg_idx = ice_calc_vf_reg_idx(vf, q_vector); + q_vector->reg_idx = + ice_calc_vf_reg_idx(vf, q_vector, tc); } else { q_vector->reg_idx = q_vector->v_idx + vsi->base_vector; @@ -2409,101 +2374,185 @@ ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) } /** - * ice_vsi_add_rem_eth_mac - Program VSI ethertype based filter with rule + * ice_cfg_sw_lldp - Config switch rules for LLDP packet handling * @vsi: the VSI being configured - * @add_rule: boolean value to add or remove ethertype filter rule + * @tx: bool to determine Tx or Rx rule + * @create: bool to determine create or remove Rule */ -static void -ice_vsi_add_rem_eth_mac(struct ice_vsi *vsi, bool add_rule) +void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) { - struct ice_fltr_list_entry *list; + enum ice_status (*eth_fltr)(struct ice_vsi *v, u16 type, u16 flag, + enum ice_sw_fwd_act_type act); struct ice_pf *pf = vsi->back; - LIST_HEAD(tmp_add_list); enum ice_status status; + struct device 
*dev;

-	list = devm_kzalloc(&pf->pdev->dev, sizeof(*list), GFP_KERNEL);
-	if (!list)
-		return;
-
-	list->fltr_info.lkup_type = ICE_SW_LKUP_ETHERTYPE;
-	list->fltr_info.fltr_act = ICE_DROP_PACKET;
-	list->fltr_info.flag = ICE_FLTR_TX;
-	list->fltr_info.src_id = ICE_SRC_ID_VSI;
-	list->fltr_info.vsi_handle = vsi->idx;
-	list->fltr_info.l_data.ethertype_mac.ethertype = vsi->ethtype;
-
-	INIT_LIST_HEAD(&list->list_entry);
-	list_add(&list->list_entry, &tmp_add_list);
+	dev = ice_pf_to_dev(pf);
+	eth_fltr = create ? ice_fltr_add_eth : ice_fltr_remove_eth;

-	if (add_rule)
-		status = ice_add_eth_mac(&pf->hw, &tmp_add_list);
-	else
-		status = ice_remove_eth_mac(&pf->hw, &tmp_add_list);
+	if (tx) {
+		status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_TX,
+				  ICE_DROP_PACKET);
+	} else {
+		if (ice_fw_supports_lldp_fltr_ctrl(&pf->hw)) {
+			status = ice_lldp_fltr_add_remove(&pf->hw, vsi->vsi_num,
+							  create);
+		} else {
+			status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_RX,
+					  ICE_FWD_TO_VSI);
+		}
+	}

 	if (status)
-		dev_err(&pf->pdev->dev,
-			"Failure Adding or Removing Ethertype on VSI %i error: %d\n",
-			vsi->vsi_num, status);
-
-	ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list);
+		dev_dbg(dev, "Fail %s %s LLDP rule on VSI %i error: %s\n",
+			create ? "adding" : "removing", tx ? "TX" : "RX",
+			vsi->vsi_num, ice_stat_str(status));
 }

 /**
- * ice_cfg_sw_lldp - Config switch rules for LLDP packet handling
- * @vsi: the VSI being configured
- * @tx: bool to determine Tx or Rx rule
- * @create: bool to determine create or remove Rule
+ * ice_set_agg_vsi - sets up scheduler aggregator node and moves VSI into it
+ * @vsi: pointer to the VSI
+ *
+ * This function will allocate a new scheduler aggregator node if needed and
+ * will move the specified VSI into it.
 */
-void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create)
+static void ice_set_agg_vsi(struct ice_vsi *vsi)
 {
-	struct ice_fltr_list_entry *list;
+	struct device *dev = ice_pf_to_dev(vsi->back);
+	struct ice_agg_node *agg_node_iter = NULL;
+	u32 agg_id = ICE_INVALID_AGG_NODE_ID;
+	struct ice_agg_node *agg_node = NULL;
+	int node_offset, max_agg_nodes = 0;
+	struct ice_port_info *port_info;
 	struct ice_pf *pf = vsi->back;
-	LIST_HEAD(tmp_add_list);
+	u32 agg_node_id_start = 0;
 	enum ice_status status;

-	list = devm_kzalloc(&pf->pdev->dev, sizeof(*list), GFP_KERNEL);
-	if (!list)
+	/* create (as needed) scheduler aggregator node and move VSI into
+	 * the corresponding aggregator node
+	 * - PF aggregator node contains VSIs of type _PF, _CTRL, _CHNL
+	 * - MACVLAN aggregator node contains MACVLAN VSIs only
+	 * - VF aggregator nodes will contain VF VSIs, including VF ADQ VSIs
+	 */
+	port_info = pf->hw.port_info;
+	if (!port_info)
 		return;

-	list->fltr_info.lkup_type = ICE_SW_LKUP_ETHERTYPE;
-	list->fltr_info.vsi_handle = vsi->idx;
-	list->fltr_info.l_data.ethertype_mac.ethertype = ETH_P_LLDP;
+	switch (vsi->type) {
+	case ICE_VSI_CTRL:
+	case ICE_VSI_CHNL:
+	case ICE_VSI_LB:
+	case ICE_VSI_PF:
+	case ICE_VSI_VMDQ2:
+	case ICE_VSI_SWITCHDEV_CTRL:
+		max_agg_nodes = ICE_MAX_PF_AGG_NODES;
+		agg_node_id_start = ICE_PF_AGG_NODE_ID_START;
+		agg_node_iter = &pf->pf_agg_node[0];
+		break;
+	case ICE_VSI_VF:
+		/* a user can create 'n' VFs on a given PF, but at most 64
+		 * children can be placed under one aggregator node.
+		 * The following code handles the aggregator(s) for VF VSIs:
+		 * it either selects an agg_node which was already created
+		 * (provided num_vsis < 64), or it selects the next available
+		 * node, which will be created
+		 */
+		max_agg_nodes = ICE_MAX_VF_AGG_NODES;
+		agg_node_id_start = ICE_VF_AGG_NODE_ID_START;
+		agg_node_iter = &pf->vf_agg_node[0];
+		break;
+#ifdef HAVE_NETDEV_SB_DEV
+	case ICE_VSI_OFFLOAD_MACVLAN:
+		/* there can be 'n' offloaded MACVLANs, hence select the
+		 * desired aggregator node for the offloaded MACVLAN VSI
+		 */
+		max_agg_nodes = ICE_MAX_MACVLAN_AGG_NODES;
+		agg_node_id_start = ICE_MACVLAN_AGG_NODE_ID_START;
+		agg_node_iter = &pf->macvlan_agg_node[0];
+		break;
+#endif
+	default:
+		/* other VSI type, handle later if needed */
+		dev_dbg(dev, "unexpected VSI type %s\n",
+			ice_vsi_type_str(vsi->type));
+		return;
+	}

-	if (tx) {
-		list->fltr_info.fltr_act = ICE_DROP_PACKET;
-		list->fltr_info.flag = ICE_FLTR_TX;
-		list->fltr_info.src_id = ICE_SRC_ID_VSI;
-	} else {
-		list->fltr_info.fltr_act = ICE_FWD_TO_VSI;
-		list->fltr_info.flag = ICE_FLTR_RX;
-		list->fltr_info.src_id = ICE_SRC_ID_LPORT;
+	/* find the appropriate aggregator node */
+	for (node_offset = 0; node_offset < max_agg_nodes; node_offset++) {
+		/* see if we can find space in previously created
+		 * node if num_vsis < 64, otherwise skip
+		 */
+		if (agg_node_iter->num_vsis &&
+		    agg_node_iter->num_vsis == ICE_MAX_VSIS_IN_AGG_NODE) {
+			agg_node_iter++;
+			continue;
+		}
+
+		if (agg_node_iter->valid &&
+		    agg_node_iter->agg_id != ICE_INVALID_AGG_NODE_ID) {
+			agg_id = agg_node_iter->agg_id;
+			agg_node = agg_node_iter;
+			break;
+		}
+
+		/* find unclaimed agg_id */
+		if (agg_node_iter->agg_id == ICE_INVALID_AGG_NODE_ID) {
+			agg_id = node_offset + agg_node_id_start;
+			agg_node = agg_node_iter;
+			break;
+		}
+		/* move to next agg_node */
+		agg_node_iter++;
 	}

-	INIT_LIST_HEAD(&list->list_entry);
-	list_add(&list->list_entry, &tmp_add_list);
+	if (!agg_node)
+		return;

-	if (create)
-		status = ice_add_eth_mac(&pf->hw, &tmp_add_list);
-	else
-		status = ice_remove_eth_mac(&pf->hw, &tmp_add_list);
+	/* if selected aggregator node was not created, create it */
+	if (!agg_node->valid) {
+		status = ice_cfg_agg(port_info, agg_id, ICE_AGG_TYPE_AGG,
+				     (u8)vsi->tc_cfg.ena_tc);
+		if (status) {
+			dev_err(dev, "unable to create aggregator node with agg_id %u\n",
+				agg_id);
+			return;
+		}
+		/* aggregator node is created, store the needed info */
+		agg_node->valid = true;
+		agg_node->agg_id = agg_id;
+	}

-	if (status)
-		dev_err(&pf->pdev->dev,
-			"Fail %s %s LLDP rule on VSI %i error: %d\n",
-			create ? "adding" : "removing", tx ? "TX" : "RX",
-			vsi->vsi_num, status);
+	/* move VSI to the corresponding aggregator node */
+	status = ice_move_vsi_to_agg(port_info, agg_id, vsi->idx,
+				     (u8)vsi->tc_cfg.ena_tc);
+	if (status) {
+		dev_err(dev, "unable to move VSI idx %u into aggregator node %u",
+			vsi->idx, agg_id);
+		return;
+	}
+
+	/* keep active children count for the aggregator node */
+	agg_node->num_vsis++;

-	ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list);
+	/* cache the 'agg_id' in VSI, so that after a reset the VSI will be
+	 * moved back to the aggregator node
+	 */
+	vsi->agg_node = agg_node;
+	dev_dbg(dev, "successfully moved VSI idx %u tc_bitmap 0x%x into aggregator node %d which has num_vsis %u\n",
+		vsi->idx, vsi->tc_cfg.ena_tc, vsi->agg_node->agg_id,
+		vsi->agg_node->num_vsis);
 }

 /**
  * ice_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
  * @pi: pointer to the port_info instance
- * @type: VSI type
+ * @vsi_type: VSI type
  * @vf_id: defines VF ID to which this VSI connects. This field is meant to be
  *	   used only for ICE_VSI_VF VSI type. For other VSI types, should
  *	   fill-in ICE_INVAL_VFID as input.
+ * @ch: ptr to channel
+ * @tc: traffic class for VF ADQ
  *
  * This allocates the sw VSI structure and its queue resources.
  *
@@ -2512,18 +2561,22 @@ void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create)
  */
 struct ice_vsi *
 ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
-	      enum ice_vsi_type type, u16 vf_id)
+	      enum ice_vsi_type vsi_type, u16 vf_id, struct ice_channel *ch,
+	      u8 tc)
 {
 	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
-	struct device *dev = &pf->pdev->dev;
+	struct device *dev = ice_pf_to_dev(pf);
 	enum ice_status status;
 	struct ice_vsi *vsi;
 	int ret, i;

-	if (type == ICE_VSI_VF)
-		vsi = ice_vsi_alloc(pf, type, vf_id);
-	else
-		vsi = ice_vsi_alloc(pf, type, ICE_INVAL_VFID);
+	if (vsi_type == ICE_VSI_CHNL)
+		vsi = ice_vsi_alloc(pf, vsi_type, ch, ICE_INVAL_VFID, 0);
+	else if (vsi_type == ICE_VSI_VF || vsi_type == ICE_VSI_CTRL) {
+		vsi = ice_vsi_alloc(pf, vsi_type, NULL, vf_id, tc);
+	} else {
+		vsi = ice_vsi_alloc(pf, vsi_type, NULL, ICE_INVAL_VFID, tc);
+	}

 	if (!vsi) {
 		dev_err(dev, "could not allocate VSI\n");
@@ -2535,15 +2588,21 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
 	if (vsi->type == ICE_VSI_PF)
 		vsi->ethtype = ETH_P_PAUSE;

-	if (vsi->type == ICE_VSI_VF)
+	if (vsi->type == ICE_VSI_VF || vsi->type == ICE_VSI_CTRL)
 		vsi->vf_id = vf_id;

-	if (ice_vsi_get_qs(vsi)) {
-		dev_err(dev, "Failed to allocate queues. vsi->idx = %d\n",
-			vsi->idx);
-		goto unroll_get_qs;
+	ice_alloc_fd_res(vsi);
+
+	if (vsi_type != ICE_VSI_CHNL) {
+		if (ice_vsi_get_qs(vsi)) {
+			dev_err(dev, "Failed to allocate queues. 
vsi->idx = %d\n", + vsi->idx); + goto unroll_vsi_alloc; + } } + ice_vsi_alloc_rss_global_lut(vsi); + /* set RSS capabilities */ ice_vsi_set_rss_params(vsi); @@ -2551,11 +2610,17 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, ice_vsi_set_tc_cfg(vsi); /* create the VSI */ - ret = ice_vsi_init(vsi); + ret = ice_vsi_init(vsi, true); if (ret) goto unroll_get_qs; + ice_vsi_init_vlan_ops(vsi); + switch (vsi->type) { + case ICE_VSI_CTRL: + case ICE_VSI_OFFLOAD_MACVLAN: + case ICE_VSI_VMDQ2: + case ICE_VSI_SWITCHDEV_CTRL: case ICE_VSI_PF: ret = ice_vsi_alloc_q_vectors(vsi); if (ret) @@ -2565,7 +2630,8 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, if (ret) goto unroll_alloc_q_vector; - ret = ice_vsi_set_q_vectors_reg_idx(vsi); + ret = ice_vsi_set_q_vectors_reg_idx(vsi, 0); + if (ret) goto unroll_vector_base; @@ -2574,13 +2640,29 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, goto unroll_vector_base; ice_vsi_map_rings_to_vectors(vsi); - - /* Do not exit if configuring RSS had an issue, at least - * receive traffic on first queue. Hence no need to capture - * return value + ice_vsi_reset_stats(vsi); + /* Perform an initial read of the statistics registers now to + * set the baseline before the VSI becomes operational. */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + ice_update_eth_stats(vsi); + + /* ICE_VSI_CTRL does not need RSS so skip RSS processing */ + if (vsi->type != ICE_VSI_CTRL) + /* Do not exit if configuring RSS had an issue, at + * least receive traffic on first queue. Hence no + * need to capture return value + */ + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_rss_flow_fld(vsi); + } + ice_init_arfs(vsi); + break; + case ICE_VSI_CHNL: + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_rss_flow_fld(vsi); + } break; case ICE_VSI_VF: /* VF driver will take care of creating netdev for this type and @@ -2596,7 +2678,7 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, if (ret) goto unroll_alloc_q_vector; - ret = ice_vsi_set_q_vectors_reg_idx(vsi); + ret = ice_vsi_set_q_vectors_reg_idx(vsi, tc); if (ret) goto unroll_vector_base; @@ -2604,8 +2686,10 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, * receive traffic on first queue. 
Hence no need to capture * return value */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_vf_rss_flow_fld(vsi); + } break; case ICE_VSI_LB: ret = ice_vsi_alloc_rings(vsi); @@ -2618,16 +2702,27 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, } /* configure VSI nodes based on number of queues and TC's */ - for (i = 0; i < vsi->tc_cfg.numtc; i++) - max_txqs[i] = vsi->alloc_txq; + ice_for_each_traffic_class(i) { + if (!(vsi->tc_cfg.ena_tc & BIT(i))) + continue; + + if (vsi->type == ICE_VSI_CHNL) { + if (!vsi->alloc_txq && vsi->num_txq) + max_txqs[i] = vsi->num_txq; + else + max_txqs[i] = pf->num_lan_tx; + } else { + max_txqs[i] = vsi->alloc_txq; + } + } + dev_dbg(dev, "vsi->tc_cfg.ena_tc = %d\n", vsi->tc_cfg.ena_tc); status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, max_txqs); if (status) { - dev_err(&pf->pdev->dev, - "VSI %d failed lan queue config, error %d\n", - vsi->vsi_num, status); - goto unroll_vector_base; + dev_err(dev, "VSI %d failed lan queue config, error %s\n", + vsi->vsi_num, ice_stat_str(status)); + goto unroll_clear_rings; } /* Add switch rule to drop all Tx Flow Control Frames, of look up @@ -2635,26 +2730,25 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, * out PAUSE or PFC frames. If enabled, FW can still send FC frames. * The rule is added once for PF VSI in order to create appropriate * recipe, since VSI/VSI list is ignored with drop action... - * Also add rules to handle LLDP Tx and Rx packets. Tx LLDP packets - * need to be dropped so that VFs cannot send LLDP packets to reconfig - * DCB settings in the HW. Also, if the FW DCBX engine is not running - * then Rx LLDP packets need to be redirected up the stack. + * Also add rules to handle LLDP Tx packets. Tx LLDP packets need to + * be dropped so that VFs cannot send LLDP packets to reconfig DCB + * settings in the HW. 
 */
-	if (!ice_is_safe_mode(pf)) {
+	if (!ice_is_safe_mode(pf))
 		if (vsi->type == ICE_VSI_PF) {
-			ice_vsi_add_rem_eth_mac(vsi, true);
-
-			/* Tx LLDP packets */
+			ice_fltr_add_eth(vsi, ETH_P_PAUSE, ICE_FLTR_TX,
+					 ICE_DROP_PACKET);
 			ice_cfg_sw_lldp(vsi, true, true);
-
-			/* Rx LLDP packets */
-			if (!test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags))
-				ice_cfg_sw_lldp(vsi, false, true);
 		}
-	}
+
+	if (!vsi->agg_node)
+		ice_set_agg_vsi(vsi);

 	return vsi;

+unroll_clear_rings:
+	ice_vsi_clear_rings(vsi);
 unroll_vector_base:
 	/* reclaim SW interrupts back to the common pool */
 	ice_free_res(pf->irq_tracker, vsi->base_vector, vsi->idx);
@@ -2665,6 +2759,7 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
 	ice_vsi_delete(vsi);
 unroll_get_qs:
 	ice_vsi_put_qs(vsi);
+unroll_vsi_alloc:
 	ice_vsi_clear(vsi);

 	return NULL;
@@ -2684,16 +2779,23 @@ static void ice_vsi_release_msix(struct ice_vsi *vsi)
 	for (i = 0; i < vsi->num_q_vectors; i++) {
 		struct ice_q_vector *q_vector = vsi->q_vectors[i];
-		u16 reg_idx = q_vector->reg_idx;

-		wr32(hw, GLINT_ITR(ICE_IDX_ITR0, reg_idx), 0);
-		wr32(hw, GLINT_ITR(ICE_IDX_ITR1, reg_idx), 0);
+		ice_write_intrl(q_vector, 0);
 		for (q = 0; q < q_vector->num_ring_tx; q++) {
+			ice_write_itr(&q_vector->tx, 0);
 			wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), 0);
+#ifdef HAVE_XDP_SUPPORT
+			if (ice_is_xdp_ena_vsi(vsi)) {
+				u32 xdp_txq = txq + vsi->num_xdp_txq;
+
+				wr32(hw, QINT_TQCTL(vsi->txq_map[xdp_txq]), 0);
+			}
+#endif /* HAVE_XDP_SUPPORT */
 			txq++;
 		}

 		for (q = 0; q < q_vector->num_ring_rx; q++) {
+			ice_write_itr(&q_vector->rx, 0);
 			wr32(hw, QINT_RQCTL(vsi->rxq_map[rxq]), 0);
 			rxq++;
 		}
@@ -2738,8 +2840,7 @@ void ice_vsi_free_irq(struct ice_vsi *vsi)
 			/* clear the affinity_mask in the IRQ descriptor */
 			irq_set_affinity_hint(irq_num, NULL);
 			synchronize_irq(irq_num);
-			devm_free_irq(&pf->pdev->dev, irq_num,
-				      vsi->q_vectors[i]);
+			devm_free_irq(ice_pf_to_dev(pf), irq_num, vsi->q_vectors[i]);
 		}
 	}
@@ -2781,101 +2882,104 @@ void ice_vsi_free_rx_rings(struct ice_vsi *vsi)
 */
 void ice_vsi_close(struct ice_vsi *vsi)
 {
-	if (!test_and_set_bit(__ICE_DOWN, vsi->state))
-		ice_down(vsi);
+	enum ice_close_reason reason = ICE_REASON_INTERFACE_DOWN;
+
+	if (test_bit(ICE_CORER_REQ, vsi->back->state))
+		reason = ICE_REASON_CORER_REQ;
+	if (test_bit(ICE_GLOBR_REQ, vsi->back->state))
+		reason = ICE_REASON_GLOBR_REQ;
+	if (test_bit(ICE_PFR_REQ, vsi->back->state))
+		reason = ICE_REASON_PFR_REQ;
+	if (!ice_is_safe_mode(vsi->back) && vsi->type == ICE_VSI_PF) {
+		int ret = ice_for_each_peer(vsi->back, &reason, ice_peer_close);
+
+		if (ret)
+			dev_dbg(ice_pf_to_dev(vsi->back), "Peer device did not implement close function\n");
+	}
+
+	if (!test_and_set_bit(ICE_VSI_DOWN, vsi->state))
+		ice_down(vsi);

 	ice_vsi_free_irq(vsi);
 	ice_vsi_free_tx_rings(vsi);
 	ice_vsi_free_rx_rings(vsi);
 }

 /**
- * ice_free_res - free a block of resources
- * @res: pointer to the resource
- * @index: starting index previously returned by ice_get_res
- * @id: identifier to track owner
- *
- * Returns number of resources freed
+ * ice_ena_vsi - resume a VSI
+ * @vsi: the VSI being resumed
+ * @locked: is the rtnl_lock already held
 */
-int ice_free_res(struct ice_res_tracker *res, u16 index, u16 id)
+int ice_ena_vsi(struct ice_vsi *vsi, bool locked)
 {
-	int count = 0;
-	int i;
-
-	if (!res || index >= res->end)
-		return -EINVAL;
-
-	id |= ICE_RES_VALID_BIT;
-	for (i = index; i < res->end && res->list[i] == id; i++) {
-		res->list[i] = 0;
-		count++;
-	}
+	int err = 0;

-	return count;
-}
+	if (!test_bit(ICE_VSI_NEEDS_RESTART, vsi->state))
+		return 0;

-/**
- * 
ice_search_res - Search the tracker for a block of resources - * @res: pointer to the resource - * @needed: size of the block needed - * @id: identifier to track owner - * - * Returns the base item index of the block, or -ENOMEM for error - */ -static int ice_search_res(struct ice_res_tracker *res, u16 needed, u16 id) -{ - int start = 0, end = 0; + clear_bit(ICE_VSI_NEEDS_RESTART, vsi->state); - if (needed > res->end) - return -ENOMEM; + if (vsi->netdev && vsi->type == ICE_VSI_PF) { + if (netif_running(vsi->netdev)) { + if (!locked) + rtnl_lock(); - id |= ICE_RES_VALID_BIT; + err = ice_open_internal(vsi->netdev); - do { - /* skip already allocated entries */ - if (res->list[end++] & ICE_RES_VALID_BIT) { - start = end; - if ((start + needed) > res->end) - break; + if (!locked) + rtnl_unlock(); } +#ifdef HAVE_NETDEV_SB_DEV - if (end == (start + needed)) { - int i = start; - - /* there was enough, so assign it to the requestor */ - while (i != end) - res->list[i++] = id; - - return start; - } - } while (end < res->end); + if (err) + return err; + if (test_bit(ICE_FLAG_MACVLAN_ENA, vsi->back->flags) && + !ice_is_adq_active(vsi->back)) + err = ice_vsi_cfg_netdev_tc0(vsi); +#endif /* HAVE_NETDEV_SB_DEV */ + } else if (vsi->type == ICE_VSI_CTRL) { + err = ice_vsi_open_ctrl(vsi); +#ifdef HAVE_NETDEV_SB_DEV + } else if (vsi->type == ICE_VSI_OFFLOAD_MACVLAN) { + err = ice_vsi_open(vsi); +#endif /* HAVE_NETDEV_SB_DEV */ + } - return -ENOMEM; + return err; } /** - * ice_get_res - get a block of resources - * @pf: board private structure - * @res: pointer to the resource - * @needed: size of the block needed - * @id: identifier to track owner - * - * Returns the base item index of the block, or negative for error + * ice_dis_vsi - pause a VSI + * @vsi: the VSI being paused + * @locked: is the rtnl_lock already held */ -int -ice_get_res(struct ice_pf *pf, struct ice_res_tracker *res, u16 needed, u16 id) +void ice_dis_vsi(struct ice_vsi *vsi, bool locked) { - if (!res || !pf) - return -EINVAL; + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return; - if (!needed || needed > res->num_entries || id >= ICE_RES_VALID_BIT) { - dev_err(&pf->pdev->dev, - "param err: needed=%d, num_entries = %d id=0x%04x\n", - needed, res->num_entries, id); - return -EINVAL; - } + set_bit(ICE_VSI_NEEDS_RESTART, vsi->state); - return ice_search_res(res, needed, id); + if (vsi->type == ICE_VSI_PF && vsi->netdev) { + if (netif_running(vsi->netdev)) { + if (!locked) + rtnl_lock(); + + ice_vsi_close(vsi); + + if (!locked) + rtnl_unlock(); + } else { + ice_vsi_close(vsi); + } + } else if (vsi->type == ICE_VSI_CTRL) { + ice_vsi_close(vsi); +#ifdef HAVE_NETDEV_SB_DEV + } else if (vsi->type == ICE_VSI_OFFLOAD_MACVLAN) { + ice_vsi_close(vsi); +#endif /* HAVE_NETDEV_SB_DEV */ + } else if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) { + ice_vsi_close(vsi); + } } /** @@ -2921,6 +3025,7 @@ void ice_vsi_dis_irq(struct ice_vsi *vsi) ice_for_each_q_vector(vsi, i) { if (!vsi->q_vectors[i]) continue; + wr32(hw, GLINT_DYN_CTL(vsi->q_vectors[i]->reg_idx), 0); } @@ -2969,8 +3074,14 @@ int ice_vsi_release(struct ice_vsi *vsi) * PF that is running the work queue items currently. 
This is done to * avoid check_flush_dependency() warning on this wq */ - if (vsi->netdev && !ice_is_reset_in_progress(pf->state)) + if (vsi->netdev && !ice_is_reset_in_progress(pf->state) && + (test_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state))) { unregister_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + + if (vsi->type == ICE_VSI_PF) + ice_devlink_destroy_pf_port(pf); + } if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) ice_rss_clean(vsi); @@ -2985,7 +3096,24 @@ int ice_vsi_release(struct ice_vsi *vsi) * many interrupts each VF needs. SR-IOV MSIX resources are also * cleared in the same manner. */ - if (vsi->type != ICE_VSI_VF) { + if (vsi->type == ICE_VSI_CTRL && vsi->vf_id != ICE_INVAL_VFID) { + int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + if (i != vsi->vf_id && vf->ctrl_vsi_idx != ICE_NO_VSI) + break; + } + if (i == pf->num_alloc_vfs) { + /* No other VFs left that have control VSI, reclaim SW + * interrupts back to the common pool + */ + ice_free_res(pf->irq_tracker, vsi->base_vector, + ICE_RES_VF_CTRL_VEC_ID); + pf->num_avail_sw_msix += vsi->num_q_vectors; + } + } else if (vsi->type != ICE_VSI_VF) { /* reclaim SW interrupts back to the common pool */ ice_free_res(pf->irq_tracker, vsi->base_vector, vsi->idx); pf->num_avail_sw_msix += vsi->num_q_vectors; @@ -2993,7 +3121,8 @@ int ice_vsi_release(struct ice_vsi *vsi) if (!ice_is_safe_mode(pf)) { if (vsi->type == ICE_VSI_PF) { - ice_vsi_add_rem_eth_mac(vsi, false); + ice_fltr_remove_eth(vsi, ETH_P_PAUSE, ICE_FLTR_TX, + ICE_DROP_PACKET); ice_cfg_sw_lldp(vsi, true, false); /* The Rx rule will only exist to remove if the LLDP FW * engine is currently stopped @@ -3003,17 +3132,26 @@ int ice_vsi_release(struct ice_vsi *vsi) } } - ice_remove_vsi_fltr(&pf->hw, vsi->idx); + ice_fltr_remove_all(vsi); ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); ice_vsi_delete(vsi); ice_vsi_free_q_vectors(vsi); - /* make sure unregister_netdev() was called by checking __ICE_DOWN */ - if (vsi->netdev && test_bit(__ICE_DOWN, vsi->state)) { - free_netdev(vsi->netdev); - vsi->netdev = NULL; + if (vsi->netdev) { + if (test_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state)) { + unregister_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + } + if (test_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state)) { + free_netdev(vsi->netdev); + vsi->netdev = NULL; + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + } } + if (vsi->type == ICE_VSI_VF && + vsi->agg_node && vsi->agg_node->valid) + vsi->agg_node->num_vsis--; ice_vsi_clear_rings(vsi); ice_vsi_put_qs(vsi); @@ -3028,16 +3166,134 @@ int ice_vsi_release(struct ice_vsi *vsi) return 0; } +/** + * ice_vsi_rebuild_get_coalesce - get coalesce from all q_vectors + * @vsi: VSI connected with q_vectors + * @coalesce: array of struct with stored coalesce + * + * Returns array size. 
+ */
+static int
+ice_vsi_rebuild_get_coalesce(struct ice_vsi *vsi,
+			     struct ice_coalesce_stored *coalesce)
+{
+	int i;
+
+	ice_for_each_q_vector(vsi, i) {
+		struct ice_q_vector *q_vector = vsi->q_vectors[i];
+
+		coalesce[i].itr_tx = q_vector->tx.itr_setting;
+		coalesce[i].itr_rx = q_vector->rx.itr_setting;
+		coalesce[i].intrl = q_vector->intrl;
+
+		if (i < vsi->num_txq)
+			coalesce[i].tx_valid = true;
+		if (i < vsi->num_rxq)
+			coalesce[i].rx_valid = true;
+	}
+
+	return vsi->num_q_vectors;
+}
+
+/**
+ * ice_vsi_rebuild_set_coalesce - set coalesce from earlier saved arrays
+ * @vsi: VSI connected with q_vectors
+ * @coalesce: pointer to array of struct with stored coalesce
+ * @size: size of coalesce array
+ *
+ * Before this function, ice_vsi_rebuild_get_coalesce should be called to save
+ * ITR params in arrays. If size is 0 or coalesce wasn't stored, set coalesce
+ * to the default values.
+ */
+static void
+ice_vsi_rebuild_set_coalesce(struct ice_vsi *vsi,
+			     struct ice_coalesce_stored *coalesce, int size)
+{
+	struct ice_ring_container *rc;
+	int i;
+
+	if ((size && !coalesce) || !vsi)
+		return;
+
+	/* there are a couple of cases that have to be handled here:
+	 *   1. The case where the number of queue vectors stays the same, but
+	 *      the number of tx or rx rings changes (the first for loop)
+	 *   2. The case where the number of queue vectors increased (the
+	 *      second for loop)
+	 */
+	for (i = 0; i < size && i < vsi->num_q_vectors; i++) {
+		/* there are 2 cases to handle here and they are the same for
+		 * both TX and RX:
+		 *   if the entry was valid previously (coalesce[i].[tr]x_valid)
+		 *   and the loop variable is less than the number of rings
+		 *   allocated, then write the previous values
+		 *
+		 *   if the entry was not valid previously, but the number of
+		 *   rings is less than the number allocated (this means the
+		 *   number of rings increased from previously), then write out
+		 *   the values in the first element
+		 *
+		 *   Also, always write the ITR, even if ITR_IS_DYNAMIC, as
+		 *   there is no harm because the dynamic algorithm will just
+		 *   overwrite.
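+		 *
+		 *   A concrete example (counts assumed): if the VSI shrinks
+		 *   from 8 vectors to 4, entries 0-3 restore their own saved
+		 *   ITR values; if it grows from 4 to 8, vectors 4-7 fall
+		 *   back to coalesce[0] in the second loop below.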
+		 */
+		if (i < vsi->alloc_rxq && coalesce[i].rx_valid) {
+			rc = &vsi->q_vectors[i]->rx;
+			rc->itr_setting = coalesce[i].itr_rx;
+			ice_write_itr(rc, rc->itr_setting);
+		} else if (i < vsi->alloc_rxq) {
+			rc = &vsi->q_vectors[i]->rx;
+			rc->itr_setting = coalesce[0].itr_rx;
+			ice_write_itr(rc, rc->itr_setting);
+		}
+
+		if (i < vsi->alloc_txq && coalesce[i].tx_valid) {
+			rc = &vsi->q_vectors[i]->tx;
+			rc->itr_setting = coalesce[i].itr_tx;
+			ice_write_itr(rc, rc->itr_setting);
+		} else if (i < vsi->alloc_txq) {
+			rc = &vsi->q_vectors[i]->tx;
+			rc->itr_setting = coalesce[0].itr_tx;
+			ice_write_itr(rc, rc->itr_setting);
+		}
+
+		vsi->q_vectors[i]->intrl = coalesce[i].intrl;
+		ice_write_intrl(vsi->q_vectors[i], coalesce[i].intrl);
+	}
+
+	/* the number of queue vectors increased so write whatever is in
+	 * the first element
+	 */
+	for (; i < vsi->num_q_vectors; i++) {
+		/* transmit */
+		rc = &vsi->q_vectors[i]->tx;
+		rc->itr_setting = coalesce[0].itr_tx;
+		ice_write_itr(rc, rc->itr_setting);
+
+		/* receive */
+		rc = &vsi->q_vectors[i]->rx;
+		rc->itr_setting = coalesce[0].itr_rx;
+		ice_write_itr(rc, rc->itr_setting);
+
+		vsi->q_vectors[i]->intrl = coalesce[0].intrl;
+		ice_write_intrl(vsi->q_vectors[i], coalesce[0].intrl);
+	}
+}
+
 /**
  * ice_vsi_rebuild - Rebuild VSI after reset
- * @vsi: VSI to be rebuild
+ * @vsi: VSI to be rebuilt
+ * @init_vsi: is this an initialization or a reconfigure of the VSI
  *
  * Returns 0 on success and negative value on failure
  */
-int ice_vsi_rebuild(struct ice_vsi *vsi)
+int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi)
 {
 	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
+	struct ice_coalesce_stored *coalesce;
+	int prev_num_q_vectors = 0;
 	struct ice_vf *vf = NULL;
+	enum ice_vsi_type vtype;
 	enum ice_status status;
 	struct ice_pf *pf;
 	int ret, i;
@@ -3046,10 +3302,20 @@ int ice_vsi_rebuild(struct ice_vsi *vsi)
 		return -EINVAL;

 	pf = vsi->back;
-	if (vsi->type == ICE_VSI_VF)
+	vtype = vsi->type;
+	if (vtype == ICE_VSI_VF)
 		vf = &pf->vf[vsi->vf_id];

-	ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx);
+	ice_vsi_init_vlan_ops(vsi);
+
+	coalesce = kcalloc(vsi->num_q_vectors,
+			   sizeof(struct ice_coalesce_stored), GFP_KERNEL);
+	if (!coalesce)
+		return -ENOMEM;
+
+	prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce);
+
+	ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx);
 	ice_vsi_free_q_vectors(vsi);

 	/* SR-IOV determines needed MSIX resources all at once instead of per
@@ -3057,18 +3323,25 @@ int ice_vsi_rebuild(struct ice_vsi *vsi)
 	 * many interrupts each VF needs. SR-IOV MSIX resources are also
 	 * cleared in the same manner.
 	 */
-	if (vsi->type != ICE_VSI_VF) {
+	if (vtype != ICE_VSI_VF) {
 		/* reclaim SW interrupts back to the common pool */
 		ice_free_res(pf->irq_tracker, vsi->base_vector, vsi->idx);
 		pf->num_avail_sw_msix += vsi->num_q_vectors;
 		vsi->base_vector = 0;
 	}

+#ifdef HAVE_XDP_SUPPORT
+	if (ice_is_xdp_ena_vsi(vsi))
+		/* return value check can be skipped here, it always returns
+		 * 0 if reset is in progress
+		 */
+		ice_destroy_xdp_rings(vsi);
+#endif /* HAVE_XDP_SUPPORT */
 	ice_vsi_put_qs(vsi);
 	ice_vsi_clear_rings(vsi);
 	ice_vsi_free_arrays(vsi);
-	ice_dev_onetime_setup(&pf->hw);
-	if (vsi->type == ICE_VSI_VF)
+	ice_vsi_free_rss_global_lut(vsi);
+	if (vtype == ICE_VSI_VF)
 		ice_vsi_set_num_qs(vsi, vf->vf_id);
 	else
 		ice_vsi_set_num_qs(vsi, ICE_INVAL_VFID);
@@ -3078,15 +3351,21 @@ int ice_vsi_rebuild(struct ice_vsi *vsi)
 		goto err_vsi;

 	ice_vsi_get_qs(vsi);
+	ice_vsi_alloc_rss_global_lut(vsi);
+
+	ice_alloc_fd_res(vsi);
 	ice_vsi_set_tc_cfg(vsi);

 	/* Initialize VSI struct elements and create VSI in FW */
-	ret = ice_vsi_init(vsi);
+	ret = ice_vsi_init(vsi, init_vsi);
 	if (ret < 0)
 		goto err_vsi;

-
-	switch (vsi->type) {
+	switch (vtype) {
+	case ICE_VSI_CTRL:
+	case ICE_VSI_OFFLOAD_MACVLAN:
+	case ICE_VSI_VMDQ2:
+	case ICE_VSI_SWITCHDEV_CTRL:
 	case ICE_VSI_PF:
 		ret = ice_vsi_alloc_q_vectors(vsi);
 		if (ret)
@@ -3096,7 +3375,7 @@ int ice_vsi_rebuild(struct ice_vsi *vsi)
 		if (ret)
 			goto err_vectors;

-		ret = ice_vsi_set_q_vectors_reg_idx(vsi);
+		ret = ice_vsi_set_q_vectors_reg_idx(vsi, 0);
 		if (ret)
 			goto err_vectors;

@@ -3105,19 +3384,34 @@ int ice_vsi_rebuild(struct ice_vsi *vsi)
 			goto err_vectors;

 		ice_vsi_map_rings_to_vectors(vsi);
-		/* Do not exit if configuring RSS had an issue, at least
-		 * receive traffic on first queue. Hence no need to capture
-		 * return value
+		ice_vsi_reset_stats(vsi);
+		/* Perform an initial read of the statistics registers now to
+		 * set the baseline before the VSI becomes operational.
 		 */
-		if (test_bit(ICE_FLAG_RSS_ENA, pf->flags))
-			ice_vsi_cfg_rss_lut_key(vsi);
+		ice_update_eth_stats(vsi);
+#ifdef HAVE_XDP_SUPPORT
+		if (ice_is_xdp_ena_vsi(vsi)) {
+			vsi->num_xdp_txq = vsi->alloc_rxq;
+			ret = ice_prepare_xdp_rings(vsi, vsi->xdp_prog);
+			if (ret)
+				goto err_vectors;
+		}
+#endif /* HAVE_XDP_SUPPORT */
+		/* ICE_VSI_CTRL does not need RSS so skip RSS processing */
+		if (vtype != ICE_VSI_CTRL)
+			/* Do not exit if configuring RSS had an issue, at
+			 * least receive traffic on first queue. Hence no
+			 * need to capture return value
+			 */
+			if (test_bit(ICE_FLAG_RSS_ENA, pf->flags))
+				ice_vsi_cfg_rss_lut_key(vsi);
 		break;
 	case ICE_VSI_VF:
 		ret = ice_vsi_alloc_q_vectors(vsi);
 		if (ret)
 			goto err_rings;

-		ret = ice_vsi_set_q_vectors_reg_idx(vsi);
+		ret = ice_vsi_set_q_vectors_reg_idx(vsi, 0);
 		if (ret)
 			goto err_vectors;

@@ -3125,37 +3419,73 @@ int ice_vsi_rebuild(struct ice_vsi *vsi)
 		if (ret)
 			goto err_vectors;

+		ice_vsi_reset_stats(vsi);
+		break;
+	case ICE_VSI_CHNL:
+		if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) {
+			ice_vsi_cfg_rss_lut_key(vsi);
+			ice_vsi_set_rss_flow_fld(vsi);
+		}
 		break;
 	default:
 		break;
 	}

-	/* configure VSI nodes based on number of queues and TC's */
-	for (i = 0; i < vsi->tc_cfg.numtc; i++)
-		max_txqs[i] = vsi->alloc_txq;
+	for (i = 0; i < vsi->tc_cfg.numtc; i++) {
+		/* configure VSI nodes based on number of queues and TC's.
+		 * ADQ creates VSIs for each TC/Channel but doesn't
+		 * allocate queues; instead, it reconfigures the PF queues
+		 * as per the TC command. So max_txqs should point to the
+		 * PF Tx queues.
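+		 *
+		 * Example (numbers assumed): a CHNL VSI on a PF with
+		 * pf->num_lan_tx == 64 reports max_txqs[i] == 64 for each
+		 * enabled TC, while a PF VSI with alloc_txq == 16 reports 16
+		 * (plus num_xdp_txq when XDP rings are attached).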
+	 */
+		if (vtype == ICE_VSI_CHNL)
+			max_txqs[i] = pf->num_lan_tx;
+		else
+			max_txqs[i] = vsi->alloc_txq;
+
+#ifdef HAVE_XDP_SUPPORT
+		if (ice_is_xdp_ena_vsi(vsi))
+			max_txqs[i] += vsi->num_xdp_txq;
+#endif /* HAVE_XDP_SUPPORT */
+	}
+
+#ifdef NETIF_F_HW_TC
+	if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags))
+		/* If MQPRIO is set, we are on the channel code path, so for
+		 * the main VSI use a TC count of 1
+		 */
+		status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, 1, max_txqs);
+	else
+#endif /* NETIF_F_HW_TC */
+		status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx,
+					 vsi->tc_cfg.ena_tc, max_txqs);

-	status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc,
-				 max_txqs);
 	if (status) {
-		dev_err(&pf->pdev->dev,
-			"VSI %d failed lan queue config, error %d\n",
-			vsi->vsi_num, status);
-		goto err_vectors;
+		dev_err(ice_pf_to_dev(pf), "VSI %d failed lan queue config, error %s\n",
+			vsi->vsi_num, ice_stat_str(status));
+		if (init_vsi) {
+			ret = -EIO;
+			goto err_vectors;
+		} else {
+			return ice_schedule_reset(pf, ICE_RESET_PFR);
+		}
 	}
+
+	ice_vsi_rebuild_set_coalesce(vsi, coalesce, prev_num_q_vectors);
+	kfree(coalesce);
+	return 0;

 err_vectors:
 	ice_vsi_free_q_vectors(vsi);
 err_rings:
-	if (vsi->netdev) {
-		vsi->current_netdev_flags = 0;
-		unregister_netdev(vsi->netdev);
-		free_netdev(vsi->netdev);
-		vsi->netdev = NULL;
-	}
+	ice_vsi_clear_rings(vsi);
 err_vsi:
-	ice_vsi_clear(vsi);
-	set_bit(__ICE_RESET_FAILED, pf->state);
+	if (init_vsi)
+		ice_vsi_delete(vsi);
+	ice_vsi_put_qs(vsi);
+	set_bit(ICE_RESET_FAILED, pf->state);
+	kfree(coalesce);
 	return ret;
 }

@@ -3165,13 +3495,47 @@ int ice_vsi_rebuild(struct ice_vsi *vsi)
  */
 bool ice_is_reset_in_progress(unsigned long *state)
 {
-	return test_bit(__ICE_RESET_OICR_RECV, state) ||
-	       test_bit(__ICE_PFR_REQ, state) ||
-	       test_bit(__ICE_CORER_REQ, state) ||
-	       test_bit(__ICE_GLOBR_REQ, state);
+	return test_bit(ICE_RESET_OICR_RECV, state) ||
+	       test_bit(ICE_PFR_REQ, state) ||
+	       test_bit(ICE_CORER_REQ, state) ||
+	       test_bit(ICE_GLOBR_REQ, state);
+}
+
+/**
+ * ice_wait_for_reset - Wait for driver to finish reset and rebuild
+ * @pf: pointer to the PF structure
+ * @timeout: length of time to wait, in jiffies
+ *
+ * Wait (sleep) for a short time until the driver finishes cleaning up from
+ * a device reset. The caller must be able to sleep. Use this to delay
+ * operations that could fail while the driver is cleaning up after a device
+ * reset.
+ *
+ * Returns 0 on success, -EBUSY if the reset is not finished within the
+ * timeout, and -ERESTARTSYS if the thread was interrupted.
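+ *
+ * A hypothetical caller-side sketch of this return contract (the 10 second
+ * timeout value is illustrative only, not taken from this patch): map
+ * -EBUSY to -EAGAIN so the operation can be retried, and abort on a signal:
+ *
+ *	err = ice_wait_for_reset(pf, 10 * HZ);
+ *	if (err == -EBUSY)
+ *		return -EAGAIN;
+ *	if (err)
+ *		return err;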
+ */ +int ice_wait_for_reset(struct ice_pf *pf, unsigned long timeout) +{ + long ret; + +#ifdef __CHECKER__ + /* Suppress sparse warning from kernel macro: + * warning: symbol '__ret' shadows an earlier one + */ + ret = timeout; +#else + ret = wait_event_interruptible_timeout(pf->reset_wait_queue, + !ice_is_reset_in_progress(pf->state), + timeout); +#endif + if (ret < 0) + return ret; + else if (!ret) + return -EBUSY; + else + return 0; } -#ifdef CONFIG_DCB /** * ice_vsi_update_q_map - update our copy of the VSI info with new queue map * @vsi: VSI being configured @@ -3186,6 +3550,156 @@ static void ice_vsi_update_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctx) sizeof(vsi->info.tc_mapping)); } +/** + * ice_vsi_cfg_netdev_tc - Setup the netdev TC configuration + * @vsi: the VSI being configured + * @ena_tc: TC map to be enabled + */ +void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc) +{ + struct net_device *netdev = vsi->netdev; + struct ice_pf *pf = vsi->back; + int numtc = vsi->tc_cfg.numtc; + struct ice_dcbx_cfg *dcbcfg; + u8 netdev_tc; + int i; + + if (!netdev) + return; + +#ifdef NETIF_F_HW_TC + /* CHNL VSI doesn't have it's own netdev, hence, no netdev_tc */ + if (vsi->type == ICE_VSI_CHNL) + return; +#endif /* NETIF_F_HW_TC */ + + if (!ena_tc) { + netdev_reset_tc(netdev); + return; + } + +#ifdef NETIF_F_HW_TC + if (vsi->type == ICE_VSI_PF && ice_is_adq_active(pf)) + numtc = vsi->all_numtc; +#endif /* NETIF_F_HW_TC */ + + if (netdev_set_num_tc(netdev, numtc)) + return; + + dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + + ice_for_each_traffic_class(i) + if (vsi->tc_cfg.ena_tc & BIT(i)) + netdev_set_tc_queue(netdev, + vsi->tc_cfg.tc_info[i].netdev_tc, + vsi->tc_cfg.tc_info[i].qcount_tx, + vsi->tc_cfg.tc_info[i].qoffset); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* setup TC queue map for CHNL TCs */ + ice_for_each_chnl_tc(i) { + if (!(vsi->all_enatc & BIT(i))) + break; + if (!vsi->mqprio_qopt.qopt.count[i]) + break; + netdev_set_tc_queue(netdev, i, + vsi->mqprio_qopt.qopt.count[i], + vsi->mqprio_qopt.qopt.offset[i]); + } + +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#ifdef NETIF_F_HW_TC + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + return; +#endif /* NETIF_F_HW_TC */ + + for (i = 0; i < ICE_MAX_USER_PRIORITY; i++) { + u8 ets_tc = dcbcfg->etscfg.prio_table[i]; + + /* Get the mapped netdev TC# for the UP */ + netdev_tc = vsi->tc_cfg.tc_info[ets_tc].netdev_tc; + netdev_set_prio_tc_map(netdev, i, netdev_tc); + } +} + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +/** + * ice_vsi_setup_q_map_mqprio - Prepares mqprio based tc_config + * @vsi: the VSI being configured, + * @ctxt: VSI context structure + * @ena_tc: number of traffic classes to enable + * + * Prepares VSI tc_config to have queue configurations based on MQPRIO options. + */ +static void +ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt, + u8 ena_tc) +{ + u16 pow, offset = 0, qcount_tx = 0, qcount_rx = 0, qmap; + u16 tc0_offset = vsi->mqprio_qopt.qopt.offset[0]; + int tc0_qcount = vsi->mqprio_qopt.qopt.count[0]; + u8 netdev_tc = 0; + int i; + + vsi->tc_cfg.ena_tc = ena_tc ? 
ena_tc : 1; + + pow = order_base_2(tc0_qcount); + qmap = ((tc0_offset << ICE_AQ_VSI_TC_Q_OFFSET_S) & + ICE_AQ_VSI_TC_Q_OFFSET_M) | + ((pow << ICE_AQ_VSI_TC_Q_NUM_S) & ICE_AQ_VSI_TC_Q_NUM_M); + + ice_for_each_traffic_class(i) { + if (!(vsi->tc_cfg.ena_tc & BIT(i))) { + /* TC is not enabled */ + vsi->tc_cfg.tc_info[i].qoffset = 0; + vsi->tc_cfg.tc_info[i].qcount_rx = 1; + vsi->tc_cfg.tc_info[i].qcount_tx = 1; + vsi->tc_cfg.tc_info[i].netdev_tc = 0; + ctxt->info.tc_mapping[i] = 0; + continue; + } + + offset = vsi->mqprio_qopt.qopt.offset[i]; + qcount_rx = vsi->mqprio_qopt.qopt.count[i]; + qcount_tx = vsi->mqprio_qopt.qopt.count[i]; + vsi->tc_cfg.tc_info[i].qoffset = offset; + vsi->tc_cfg.tc_info[i].qcount_rx = qcount_rx; + vsi->tc_cfg.tc_info[i].qcount_tx = qcount_tx; + vsi->tc_cfg.tc_info[i].netdev_tc = netdev_tc++; + } + + if (vsi->all_numtc && vsi->all_numtc != vsi->tc_cfg.numtc) { + ice_for_each_chnl_tc(i) { + if (!(vsi->all_enatc & BIT(i))) + continue; + offset = vsi->mqprio_qopt.qopt.offset[i]; + qcount_rx = vsi->mqprio_qopt.qopt.count[i]; + qcount_tx = vsi->mqprio_qopt.qopt.count[i]; + } + } + + /* Set actual Tx/Rx queue pairs */ + vsi->num_txq = offset + qcount_tx; + vsi->num_rxq = offset + qcount_rx; + + /* Setup queue TC[0].qmap for given VSI context */ + ctxt->info.tc_mapping[0] = cpu_to_le16(qmap); + ctxt->info.q_mapping[0] = cpu_to_le16(vsi->rxq_map[0]); + ctxt->info.q_mapping[1] = cpu_to_le16(tc0_qcount); + + /* Find queue count available for channel VSIs and starting offset + * for channel VSIs + */ + if (tc0_qcount && tc0_qcount < vsi->num_rxq) { + vsi->cnt_q_avail = vsi->num_rxq - tc0_qcount; + vsi->next_base_q = tc0_qcount; + } + dev_dbg(ice_pf_to_dev(vsi->back), "vsi->num_txq = %d\n", vsi->num_txq); + dev_dbg(ice_pf_to_dev(vsi->back), "vsi->num_rxq = %d\n", vsi->num_rxq); + dev_dbg(ice_pf_to_dev(vsi->back), "%s: all_numtc %u, all_enatc: 0x%04x, tc_cfg.numtc %u\n", + __func__, vsi->all_numtc, vsi->all_enatc, vsi->tc_cfg.numtc); +} +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + /** * ice_vsi_cfg_tc - Configure VSI Tx Sched for given TC map * @vsi: VSI to be configured @@ -3196,48 +3710,81 @@ static void ice_vsi_update_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctx) int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; - struct ice_vsi_ctx *ctx; struct ice_pf *pf = vsi->back; + struct ice_vsi_ctx *ctx; enum ice_status status; + struct device *dev; int i, ret = 0; u8 num_tc = 0; + dev = ice_pf_to_dev(pf); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (vsi->tc_cfg.ena_tc == ena_tc && + vsi->mqprio_qopt.mode != TC_MQPRIO_MODE_CHANNEL) + return ret; +#else + if (vsi->tc_cfg.ena_tc == ena_tc) + return 0; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + ice_for_each_traffic_class(i) { /* build bitmap of enabled TCs */ if (ena_tc & BIT(i)) num_tc++; /* populate max_txqs per TC */ max_txqs[i] = vsi->alloc_txq; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* Update max_txqs if it is CHNL VSI, because alloc_t[r]xq are + * zero for CHNL VSI, hence use num_txq instead as max_txqs + */ + if (vsi->type == ICE_VSI_CHNL && + test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + max_txqs[i] = vsi->num_txq; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ } vsi->tc_cfg.ena_tc = ena_tc; vsi->tc_cfg.numtc = num_tc; - ctx = devm_kzalloc(&pf->pdev->dev, sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->vf_num = 0; ctx->info = vsi->info; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (vsi->type == ICE_VSI_PF 
&&
+	    test_bit(ICE_FLAG_TC_MQPRIO, pf->flags))
+		ice_vsi_setup_q_map_mqprio(vsi, ctx, ena_tc);
+	else
+		ice_vsi_setup_q_map(vsi, ctx);
+#else
 	ice_vsi_setup_q_map(vsi, ctx);
+#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */

 	/* must indicate which sections of the VSI context are being modified */
 	ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID);
 	status = ice_update_vsi(&pf->hw, vsi->idx, ctx, NULL);
 	if (status) {
-		dev_info(&pf->pdev->dev, "Failed VSI Update\n");
+		dev_info(dev, "Failed VSI Update\n");
 		ret = -EIO;
 		goto out;
 	}

-	status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc,
-				 max_txqs);
+#ifdef NETIF_F_HW_TC
+	if (vsi->type == ICE_VSI_PF &&
+	    test_bit(ICE_FLAG_TC_MQPRIO, pf->flags))
+		status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, 1,
+					 max_txqs);
+	else
+#endif /* NETIF_F_HW_TC */
+		status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx,
+					 vsi->tc_cfg.ena_tc, max_txqs);
 	if (status) {
-		dev_err(&pf->pdev->dev,
-			"VSI %d failed TC config, error %d\n",
-			vsi->vsi_num, status);
+		dev_err(dev, "VSI %d failed TC config, error %s\n",
+			vsi->vsi_num, ice_stat_str(status));
 		ret = -EIO;
 		goto out;
 	}

@@ -3246,56 +3793,697 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc)
 	ice_vsi_cfg_netdev_tc(vsi, ena_tc);

 out:
-	devm_kfree(&pf->pdev->dev, ctx);
+	kfree(ctx);
 	return ret;
 }
-#endif /* CONFIG_DCB */

 /**
- * ice_nvm_version_str - format the NVM version strings
- * @hw: ptr to the hardware info
+ * ice_update_ring_stats - Update ring statistics
+ * @ring: ring to update
+ * @pkts: number of processed packets
+ * @bytes: number of processed bytes
+ *
+ * This function assumes that the caller has acquired a u64_stats_sync lock.
+ */
+static void ice_update_ring_stats(struct ice_ring *ring, u64 pkts, u64 bytes)
+{
+	ring->stats.bytes += bytes;
+	ring->stats.pkts += pkts;
+}
+
+#ifdef ADQ_PERF_COUNTERS
+/**
+ * ice_update_adq_stats - Update channel's counters for busy poll and NAPI
+ * @ring: ring to update
+ * @pkts: number of processed packets
+ *
+ * This function assumes that the caller has acquired a u64_stats_sync lock.
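+ *
+ * For reference, a minimal sketch of the expected call-site pattern (this
+ * is exactly what the Tx/Rx wrappers below do):
+ *
+ *	u64_stats_update_begin(&ring->syncp);
+ *	ice_update_ring_stats(ring, pkts, bytes);
+ *	ice_update_adq_stats(ring, pkts);
+ *	u64_stats_update_end(&ring->syncp);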
+ */
+static void ice_update_adq_stats(struct ice_ring *ring, u64 pkts)
+{
+	/* separate accounting of packets (either from busy_poll or
+	 * napi_poll, depending upon the state of the vector-specific
+	 * flags 'in_bp' and 'prev_in_bp')
+	 */
+	if (ring->q_vector->state_flags & ICE_CHNL_IN_BP) {
+		ring->ch_q_stats.poll.bp_packets += pkts;
+	} else {
+		if (ring->q_vector->state_flags & ICE_CHNL_PREV_IN_BP)
+			ring->ch_q_stats.poll.bp_packets += pkts;
+		else
+			ring->ch_q_stats.poll.np_packets += pkts;
+	}
+}
+#endif /* ADQ_PERF_COUNTERS */
+
+/**
+ * ice_update_tx_ring_stats - Update Tx ring specific counters
+ * @tx_ring: ring to update
+ * @pkts: number of processed packets
+ * @bytes: number of processed bytes
+ */
+void ice_update_tx_ring_stats(struct ice_ring *tx_ring, u64 pkts, u64 bytes)
+{
+	u64_stats_update_begin(&tx_ring->syncp);
+	ice_update_ring_stats(tx_ring, pkts, bytes);
+#ifdef ADQ_PERF_COUNTERS
+	ice_update_adq_stats(tx_ring, pkts);
+#endif /* ADQ_PERF_COUNTERS */
+	u64_stats_update_end(&tx_ring->syncp);
+}
+
+/**
+ * ice_update_rx_ring_stats - Update Rx ring specific counters
+ * @rx_ring: ring to update
+ * @pkts: number of processed packets
+ * @bytes: number of processed bytes
+ */
+void ice_update_rx_ring_stats(struct ice_ring *rx_ring, u64 pkts, u64 bytes)
+{
+	u64_stats_update_begin(&rx_ring->syncp);
+	ice_update_ring_stats(rx_ring, pkts, bytes);
+#ifdef ADQ_PERF_COUNTERS
+	ice_update_adq_stats(rx_ring, pkts);
+#endif /* ADQ_PERF_COUNTERS */
+
+	/* if the vector is transitioning from BP->INT (due to
+	 * busy_poll_stop()) and we find no packets, then to avoid entering
+	 * INTR mode (which happens from napi_poll - enabling the interrupt
+	 * if unlikely_comeback_to_bp gets set), set "prev_data_pkt_recv" to
+	 * be non-zero, so that interrupts won't be enabled.
+	 * This is to address the issue where num_force_wb on some queues is
+	 * 2 to 3 times higher than on other queues and those queues also see
+	 * a lot of interrupts.
+	 */
+	if (ice_ring_ch_enabled(rx_ring) &&
+	    ice_vector_busypoll_intr(rx_ring->q_vector)) {
+		if (!pkts)
+			rx_ring->q_vector->state_flags |=
+				ICE_CHNL_PREV_DATA_PKT_RECV;
+#ifdef ADQ_PERF_COUNTERS
+	} else if (ice_ring_ch_enabled(rx_ring)) {
+		struct ice_q_vector *q_vector = rx_ring->q_vector;
+
+		if (pkts &&
+		    !(q_vector->state_flags & ICE_CHNL_PREV_DATA_PKT_RECV))
+			rx_ring->ch_q_stats.rx.num_only_ctrl_pkts++;
+		if (q_vector->state_flags & ICE_CHNL_IN_BP &&
+		    !(q_vector->state_flags & ICE_CHNL_PREV_DATA_PKT_RECV))
+			rx_ring->ch_q_stats.rx.num_no_data_pkt_bp++;
+	}
+#else
+	}
+#endif /* ADQ_PERF_COUNTERS */
+
+	u64_stats_update_end(&rx_ring->syncp);
+}
+
+/**
+ * ice_status_to_errno - convert from enum ice_status to Linux errno
+ * @err: ice_status value to convert
  */
-char *ice_nvm_version_str(struct ice_hw *hw)
+int ice_status_to_errno(enum ice_status err)
 {
-	u8 oem_ver, oem_patch, ver_hi, ver_lo;
-	static char buf[ICE_NVM_VER_LEN];
-	u16 oem_build;
+	switch (err) {
+	case ICE_SUCCESS:
+		return 0;
+	case ICE_ERR_DOES_NOT_EXIST:
+		return -ENOENT;
+	case ICE_ERR_OUT_OF_RANGE:
+		return -ENOTTY;
+	case ICE_ERR_PARAM:
+		return -EINVAL;
+	case ICE_ERR_NO_MEMORY:
+		return -ENOMEM;
+	case ICE_ERR_MAX_LIMIT:
+		return -EAGAIN;
+	default:
+		return -EINVAL;
+	}
+}

-	ice_get_nvm_version(hw, &oem_ver, &oem_build, &oem_patch, &ver_hi,
-			    &ver_lo);
-	snprintf(buf, sizeof(buf), "%x.%02x 0x%x %d.%d.%d", ver_hi, ver_lo,
-		 hw->nvm.eetrack, oem_ver, oem_build, oem_patch);
+/**
+ * ice_is_dflt_vsi_in_use - check if the default forwarding VSI is being used
+ * @sw: switch to check if its default forwarding VSI is free
+ *
+ * Returns true if the default forwarding VSI is already being used, otherwise
+ * false, signalling that it's available to use.
+ */
+bool ice_is_dflt_vsi_in_use(struct ice_sw *sw)
+{
+	return (sw->dflt_vsi && sw->dflt_vsi_ena);
+}

-	return buf;
+/**
+ * ice_is_vsi_dflt_vsi - check if the VSI passed in is the default VSI
+ * @sw: switch for the default forwarding VSI to compare against
+ * @vsi: VSI to compare against default forwarding VSI
+ *
+ * If the VSI passed in is the default forwarding VSI then return true, else
+ * return false
+ */
+bool ice_is_vsi_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi)
+{
+	return (sw->dflt_vsi == vsi && sw->dflt_vsi_ena);
 }

 /**
- * ice_vsi_cfg_mac_fltr - Add or remove a MAC address filter for a VSI
- * @vsi: the VSI being configured MAC filter
- * @macaddr: the MAC address to be added.
- * @set: Add or delete a MAC filter
+ * ice_set_dflt_vsi - set the default forwarding VSI
+ * @sw: switch used to assign the default forwarding VSI
+ * @vsi: VSI getting set as the default forwarding VSI on the switch
  *
- * Adds or removes MAC address filter entry for VF VSI
+ * If the VSI passed in is already the default VSI and it's enabled just return
+ * success.
+ *
+ * If there is already a default VSI on the switch and it's enabled then return
+ * -EEXIST since there can only be one default VSI per switch.
+ *
+ * Otherwise try to set the VSI passed in as the switch's default VSI and
+ * return the result.
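+ *
+ * A hypothetical caller-side sketch of this contract, mirroring how the
+ * promiscuous-mode path in ice_main.c consumes it (-EEXIST is benign there
+ * because a default VSI is already in place):
+ *
+ *	err = ice_set_dflt_vsi(pf->first_sw, vsi);
+ *	if (err && err != -EEXIST)
+ *		return err;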
*/ -enum ice_status -ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set) +int ice_set_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi) { - LIST_HEAD(tmp_add_list); enum ice_status status; + struct device *dev; - /* Update MAC filter list to be added or removed for a VSI */ - if (ice_add_mac_to_list(vsi, &tmp_add_list, macaddr)) { - status = ICE_ERR_NO_MEMORY; - goto cfg_mac_fltr_exit; + if (!sw || !vsi) + return -EINVAL; + + dev = ice_pf_to_dev(vsi->back); + + /* the VSI passed in is already the default VSI */ + if (ice_is_vsi_dflt_vsi(sw, vsi)) { + dev_dbg(dev, "VSI %d passed in is already the default forwarding VSI, nothing to do\n", + vsi->vsi_num); + return 0; + } + + /* another VSI is already the default VSI for this switch */ + if (ice_is_dflt_vsi_in_use(sw)) { + dev_err(dev, "Default forwarding VSI %d already in use, disable it and try again\n", + sw->dflt_vsi->vsi_num); + return -EEXIST; } - if (set) - status = ice_add_mac(&vsi->back->hw, &tmp_add_list); + status = ice_cfg_dflt_vsi(vsi->port_info, vsi->idx, true, ICE_FLTR_RX); + if (status) { + dev_err(dev, "Failed to set VSI %d as the default forwarding VSI, error %s\n", + vsi->vsi_num, ice_stat_str(status)); + return -EIO; + } + + sw->dflt_vsi = vsi; + sw->dflt_vsi_ena = true; + + return 0; +} + +/** + * ice_clear_dflt_vsi - clear the default forwarding VSI + * @sw: switch used to clear the default VSI + * + * If the switch has no default VSI or it's not enabled then return error. + * + * Otherwise try to clear the default VSI and return the result. + */ +int ice_clear_dflt_vsi(struct ice_sw *sw) +{ + struct ice_vsi *dflt_vsi; + enum ice_status status; + struct device *dev; + + if (!sw) + return -EINVAL; + + dev = ice_pf_to_dev(sw->pf); + + dflt_vsi = sw->dflt_vsi; + + /* there is no default VSI configured */ + if (!ice_is_dflt_vsi_in_use(sw)) + return -ENODEV; + + status = ice_cfg_dflt_vsi(dflt_vsi->port_info, dflt_vsi->idx, false, + ICE_FLTR_RX); + if (status) { + dev_err(dev, "Failed to clear the default forwarding VSI %d, error %s\n", + dflt_vsi->vsi_num, ice_stat_str(status)); + return -EIO; + } + + sw->dflt_vsi = NULL; + sw->dflt_vsi_ena = false; + + return 0; +} + + +/** + * ice_get_link_speed_mbps - get link speed in Mbps + * @vsi: the VSI whose link speed is being queried + * + * Return current VSI link speed, else ICE_LINK_SPEED_UNKNOWN (0) is + * returned. + */ +int ice_get_link_speed_mbps(struct ice_vsi *vsi) +{ + switch (vsi->port_info->phy.link_info.link_speed) { + case ICE_AQ_LINK_SPEED_100GB: + return ICE_LINK_SPEED_100000MBPS; + case ICE_AQ_LINK_SPEED_50GB: + return ICE_LINK_SPEED_50000MBPS; + case ICE_AQ_LINK_SPEED_40GB: + return ICE_LINK_SPEED_40000MBPS; + case ICE_AQ_LINK_SPEED_25GB: + return ICE_LINK_SPEED_25000MBPS; + case ICE_AQ_LINK_SPEED_20GB: + return ICE_LINK_SPEED_20000MBPS; + case ICE_AQ_LINK_SPEED_10GB: + return ICE_LINK_SPEED_10000MBPS; + case ICE_AQ_LINK_SPEED_5GB: + return ICE_LINK_SPEED_5000MBPS; + case ICE_AQ_LINK_SPEED_2500MB: + return ICE_LINK_SPEED_2500MBPS; + case ICE_AQ_LINK_SPEED_1000MB: + return ICE_LINK_SPEED_1000MBPS; + case ICE_AQ_LINK_SPEED_100MB: + return ICE_LINK_SPEED_100MBPS; + case ICE_AQ_LINK_SPEED_10MB: + return ICE_LINK_SPEED_10MBPS; + case ICE_AQ_LINK_SPEED_UNKNOWN: + default: + return ICE_LINK_SPEED_UNKNOWN; + } +} + + +/** + * ice_get_link_speed_kbps - get link speed in Kbps + * @vsi: the VSI whose link speed is being queried + * + * Return current VSI link speed, else ICE_LINK_SPEED_UNKNOWN (0) is + * returned. 
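+ *
+ * This is simply the Mbps lookup above scaled by 1000; e.g. a 25GB link
+ * (25000 Mbps) is reported as 25000000 Kbps, and an unknown link speed
+ * stays 0.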
+ */ +int ice_get_link_speed_kbps(struct ice_vsi *vsi) +{ + int speed_mbps; + + speed_mbps = ice_get_link_speed_mbps(vsi); + + return speed_mbps * 1000; +} + +/** + * ice_set_min_bw_limit - setup minimum BW limit for Tx based on min_tx_rate + * @vsi: VSI to be configured + * @min_tx_rate: min Tx rate in Kbps to be configured as BW limit + * + * If the min_tx_rate is specified as 0 that means to clear the minimum BW limit + * profile, otherwise a non-zero value will force a minimum BW limit for the VSI + * on TC 0. + */ +int ice_set_min_bw_limit(struct ice_vsi *vsi, u64 min_tx_rate) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + int speed; + + dev = ice_pf_to_dev(pf); + if (!vsi->port_info) { + dev_dbg(dev, "VSI %d, type %u specified doesn't have valid port_info\n", + vsi->idx, vsi->type); + return -EINVAL; + } + + speed = ice_get_link_speed_kbps(vsi); + if (min_tx_rate > (u64)speed) { + dev_err(dev, "invalid min Tx rate %llu Kbps specified for %s %d is greater than current link speed %u Kbps\n", + min_tx_rate, ice_vsi_type_str(vsi->type), vsi->idx, + speed); + return -EINVAL; + } + + /* Configure min BW for VSI limit */ + if (min_tx_rate) { + status = ice_cfg_vsi_bw_lmt_per_tc(vsi->port_info, vsi->idx, 0, + ICE_MIN_BW, min_tx_rate); + if (status) { + dev_err(dev, "failed to set min Tx rate(%llu Kbps) for %s %d\n", + min_tx_rate, ice_vsi_type_str(vsi->type), + vsi->idx); + return -EIO; + } + + dev_dbg(dev, "set min Tx rate(%llu Kbps) for %s\n", + min_tx_rate, ice_vsi_type_str(vsi->type)); + } else { + status = ice_cfg_vsi_bw_dflt_lmt_per_tc(vsi->port_info, + vsi->idx, 0, + ICE_MIN_BW); + if (status) { + dev_err(dev, "failed to clear min Tx rate configuration for %s %d\n", + ice_vsi_type_str(vsi->type), vsi->idx); + return -EIO; + } + + dev_dbg(dev, "cleared min Tx rate configuration for %s %d\n", + ice_vsi_type_str(vsi->type), vsi->idx); + } + + return 0; +} + +/** + * ice_set_max_bw_limit - setup maximum BW limit for Tx based on max_tx_rate + * @vsi: VSI to be configured + * @max_tx_rate: max Tx rate in Kbps to be configured as BW limit + * + * If the max_tx_rate is specified as 0 that means to clear the maximum BW limit + * profile, otherwise a non-zero value will force a maximum BW limit for the VSI + * on TC 0. 
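+ *
+ * Hypothetical usage sketch (the rate value is illustrative only): a call
+ * with 1000000 caps TC 0 of the VSI at roughly 1 Gbps, and a later call
+ * with 0 clears the cap and restores the default BW profile:
+ *
+ *	err = ice_set_max_bw_limit(vsi, 1000000);
+ *	...
+ *	err = ice_set_max_bw_limit(vsi, 0);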
+ */ +int ice_set_max_bw_limit(struct ice_vsi *vsi, u64 max_tx_rate) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + int speed; + + dev = ice_pf_to_dev(pf); + if (!vsi->port_info) { + dev_dbg(dev, "VSI %d, type %u specified doesn't have valid port_info\n", + vsi->idx, vsi->type); + return -EINVAL; + } + + speed = ice_get_link_speed_kbps(vsi); + if (max_tx_rate > (u64)speed) { + dev_err(dev, "invalid max Tx rate %llu Kbps specified for %s %d is greater than current link speed %u Kbps\n", + max_tx_rate, ice_vsi_type_str(vsi->type), vsi->idx, + speed); + return -EINVAL; + } + + /* Configure max BW for VSI limit */ + if (max_tx_rate) { + status = ice_cfg_vsi_bw_lmt_per_tc(vsi->port_info, vsi->idx, 0, + ICE_MAX_BW, max_tx_rate); + if (status) { + dev_err(dev, "failed setting max Tx rate(%llu Kbps) for %s %d\n", + max_tx_rate, ice_vsi_type_str(vsi->type), + vsi->idx); + return -EIO; + } + + dev_dbg(dev, "set max Tx rate(%llu Kbps) for %s %d\n", + max_tx_rate, ice_vsi_type_str(vsi->type), vsi->idx); + } else { + status = ice_cfg_vsi_bw_dflt_lmt_per_tc(vsi->port_info, + vsi->idx, 0, + ICE_MAX_BW); + if (status) { + dev_err(dev, "failed clearing max Tx rate configuration for %s %d\n", + ice_vsi_type_str(vsi->type), vsi->idx); + return -EIO; + } + + dev_dbg(dev, "cleared max Tx rate configuration for %s %d\n", + ice_vsi_type_str(vsi->type), vsi->idx); + } + + return 0; +} + +/** + * ice_set_link - turn on/off physical link + * @vsi: VSI to modify physical link on + * @ena: turn on/off physical link + */ +int ice_set_link(struct ice_vsi *vsi, bool ena) +{ + struct device *dev = ice_pf_to_dev(vsi->back); + struct ice_port_info *pi = vsi->port_info; + struct ice_hw *hw = pi->hw; + enum ice_status status; + + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + + status = ice_aq_set_link_restart_an(pi, ena, NULL); + + /* if link is owned by manageability, FW will return ICE_AQ_RC_EMODE. + * this is not a fatal error, so print a warning message and return + * a success code. Return an error if FW returns an error code other + * than ICE_AQ_RC_EMODE + */ + if (status == ICE_ERR_AQ_ERROR) { + if (hw->adminq.sq_last_status == ICE_AQ_RC_EMODE) + dev_warn(dev, "can't set link to %s, err %s aq_err %s. not fatal, continuing\n", + (ena ? "ON" : "OFF"), ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } else if (status) { + dev_err(dev, "can't set link to %s, err %s aq_err %s\n", + (ena ? 
"ON" : "OFF"), ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_vsi_update_security - update security block in VSI + * @vsi: pointer to VSI structure + * @fill: function pointer to fill ctx + */ +int ice_vsi_update_security(struct ice_vsi *vsi, + void (*fill)(struct ice_vsi_ctx *)) +{ + struct ice_vsi_ctx ctx = { 0 }; + + ctx.info = vsi->info; + ctx.info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + fill(&ctx); + + if (ice_update_vsi(&vsi->back->hw, vsi->idx, &ctx, NULL)) + return -ENODEV; + + vsi->info = ctx.info; + return 0; +} + +#ifdef HAVE_METADATA_PORT_INFO +/** + * ice_vsi_ctx_set_antispoof - set antispoof function in VSI ctx + * @ctx: pointer to VSI ctx structure + */ +void ice_vsi_ctx_set_antispoof(struct ice_vsi_ctx *ctx) +{ + ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF | + (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); +} + +/** + * ice_vsi_ctx_clear_antispoof - clear antispoof function in VSI ctx + * @ctx: pointer to VSI ctx structure + */ +void ice_vsi_ctx_clear_antispoof(struct ice_vsi_ctx *ctx) +{ + ctx->info.sec_flags &= ~ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF & + ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); +} +#endif /* HAVE_METADATA_PORT_INFO */ + +/** + * ice_vsi_ctx_set_allow_override - allow destination override on VSI + * @ctx: pointer to VSI ctx structure + */ +void ice_vsi_ctx_set_allow_override(struct ice_vsi_ctx *ctx) +{ + ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD; +} + +/** + * ice_vsi_ctx_clear_allow_override - turn off destination override on VSI + * @ctx: pointer to VSI ctx structure + */ +void ice_vsi_ctx_clear_allow_override(struct ice_vsi_ctx *ctx) +{ + ctx->info.sec_flags &= ~ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD; +} + +/** + * ice_vsi_add_vlan_zero - add VLAN 0 filter(s) for this VSI + * @vsi: VSI used to add VLAN filters + * + * In Single VLAN Mode (SVM), single VLAN filters via ICE_SW_LKUP_VLAN are based + * on the inner VLAN ID, so the VLAN TPID (i.e. 0x8100 or 0x888a8) doesn't + * matter. In Double VLAN Mode (DVM), outer/single VLAN filters via + * ICE_SW_LKUP_VLAN are based on the outer/single VLAN ID + VLAN TPID. + * + * For both modes add a VLAN 0 + no VLAN TPID filter to handle untagged traffic + * when VLAN pruning is enabled. Also, this handles VLAN 0 priority tagged + * traffic in SVM, since the VLAN TPID isn't part of filtering. + * + * If DVM is enabled then an explicit VLAN 0 + VLAN TPID filter needs to be + * added to allow VLAN 0 priority tagged traffic in DVM, since the VLAN TPID is + * part of filtering. + */ +int ice_vsi_add_vlan_zero(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + struct ice_vlan vlan; + int err; + + vlan = ICE_VLAN(0, 0, 0, ICE_FWD_TO_VSI); + err = vlan_ops->add_vlan(vsi, &vlan); + if (err && err != -EEXIST) + return err; + + /* in SVM both VLAN 0 filters are identical */ + if (!ice_is_dvm_ena(&vsi->back->hw)) + return 0; + + vlan = ICE_VLAN(ETH_P_8021Q, 0, 0, ICE_FWD_TO_VSI); + err = vlan_ops->add_vlan(vsi, &vlan); + if (err && err != -EEXIST) + return err; + + return 0; +} + +/** + * ice_vsi_del_vlan_zero - delete VLAN 0 filter(s) for this VSI + * @vsi: VSI used to add VLAN filters + * + * Delete the VLAN 0 filters in the same manner that they were added in + * ice_vsi_add_vlan_zero. 
+ */
+int ice_vsi_del_vlan_zero(struct ice_vsi *vsi)
+{
+	struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
+	struct ice_vlan vlan;
+	int err;
+
+	vlan = ICE_VLAN(0, 0, 0, ICE_FWD_TO_VSI);
+	err = vlan_ops->del_vlan(vsi, &vlan);
+	if (err && err != -EEXIST)
+		return err;
+
+	/* in SVM both VLAN 0 filters are identical */
+	if (!ice_is_dvm_ena(&vsi->back->hw))
+		return 0;
+
+	vlan = ICE_VLAN(ETH_P_8021Q, 0, 0, ICE_FWD_TO_VSI);
+	err = vlan_ops->del_vlan(vsi, &vlan);
+	if (err && err != -EEXIST)
+		return err;
+
+	return 0;
+}
+
+/**
+ * ice_vsi_num_zero_vlans - get number of VLAN 0 filters based on VLAN mode
+ * @vsi: VSI used to get the VLAN mode
+ *
+ * If DVM is enabled then 2 VLAN 0 filters are added, else if SVM is enabled
+ * then 1 VLAN 0 filter is added. See ice_vsi_add_vlan_zero for more details.
+ */
+static u16 ice_vsi_num_zero_vlans(struct ice_vsi *vsi)
+{
+#define ICE_DVM_NUM_ZERO_VLAN_FLTRS	2
+#define ICE_SVM_NUM_ZERO_VLAN_FLTRS	1
+	/* no VLAN 0 filter is created when a port VLAN is active */
+	if (vsi->type == ICE_VSI_VF &&
+	    ice_vf_is_port_vlan_ena(&vsi->back->vf[vsi->vf_id]))
+		return 0;
+	if (ice_is_dvm_ena(&vsi->back->hw))
+		return ICE_DVM_NUM_ZERO_VLAN_FLTRS;
 	else
-		status = ice_remove_mac(&vsi->back->hw, &tmp_add_list);
+		return ICE_SVM_NUM_ZERO_VLAN_FLTRS;
+}
+
+/**
+ * ice_vsi_has_non_zero_vlans - check if VSI has any non-zero VLANs
+ * @vsi: VSI used to determine if any non-zero VLANs have been added
+ */
+bool ice_vsi_has_non_zero_vlans(struct ice_vsi *vsi)
+{
+	return (vsi->num_vlan > ice_vsi_num_zero_vlans(vsi));
+}
+
+/**
+ * ice_vsi_num_non_zero_vlans - get the number of non-zero VLANs for this VSI
+ * @vsi: VSI used to get the number of non-zero VLANs added
+ */
+u16 ice_vsi_num_non_zero_vlans(struct ice_vsi *vsi)
+{
+	return (vsi->num_vlan - ice_vsi_num_zero_vlans(vsi));
+}
+
+/**
+ * ice_is_feature_supported
+ * @pf: pointer to the struct ice_pf instance
+ * @f: feature enum to be checked
+ *
+ * returns true if feature is supported, false otherwise
+ */
+bool ice_is_feature_supported(struct ice_pf *pf, enum ice_feature f)
+{
+	if (f < 0 || f >= ICE_F_MAX)
+		return false;
+
+	return test_bit(f, pf->features);
+}

-cfg_mac_fltr_exit:
-	ice_free_fltr_list(&vsi->back->pdev->dev, &tmp_add_list);
-	return status;
+/**
+ * ice_set_feature_support
+ * @pf: pointer to the struct ice_pf instance
+ * @f: feature enum to set
+ */
+void ice_set_feature_support(struct ice_pf *pf, enum ice_feature f)
+{
+	if (f < 0 || f >= ICE_F_MAX)
+		return;
+
+	set_bit(f, pf->features);
+}
+
+/**
+ * ice_clear_feature_support
+ * @pf: pointer to the struct ice_pf instance
+ * @f: feature enum to clear
+ */
+void ice_clear_feature_support(struct ice_pf *pf, enum ice_feature f)
+{
+	if (f < 0 || f >= ICE_F_MAX)
+		return;
+
+	clear_bit(f, pf->features);
+}
+
+/**
+ * ice_init_feature_support
+ * @pf: pointer to the struct ice_pf instance
+ *
+ * called during init to set up supported features
+ */
+void ice_init_feature_support(struct ice_pf *pf)
+{
+	switch (pf->hw.device_id) {
+	case ICE_DEV_ID_E810C_BACKPLANE:
+	case ICE_DEV_ID_E810C_QSFP:
+	case ICE_DEV_ID_E810C_SFP:
+	case ICE_DEV_ID_E810_XXV_BACKPLANE:
+	case ICE_DEV_ID_E810_XXV_QSFP:
+	case ICE_DEV_ID_E810_XXV_SFP:
+		ice_set_feature_support(pf, ICE_F_DSCP);
+		ice_set_feature_support(pf, ICE_F_PTP_EXTTS);
+		break;
+	default:
+		break;
+	}
 }

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h
index
47bc033fff20e471e683ac846bb92523aedfdee3..4331d2786d75a1e13acd484f423e6fd6c40c75b7 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_lib.h @@ -1,87 +1,62 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_LIB_H_ #define _ICE_LIB_H_ #include "ice.h" -struct ice_txq_meta { - /* Tx-scheduler element identifier */ - u32 q_teid; - /* Entry in VSI's txq_map bitmap */ - u16 q_id; - /* Relative index of Tx queue within TC */ - u16 q_handle; - /* VSI index that Tx queue belongs to */ - u16 vsi_idx; - /* TC number that Tx queue belongs to */ - u8 tc; -}; +const char *ice_vsi_type_str(enum ice_vsi_type vsi_type); -int -ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list, - const u8 *macaddr); - -void ice_free_fltr_list(struct device *dev, struct list_head *h); +bool ice_pf_state_is_nominal(struct ice_pf *pf); void ice_update_eth_stats(struct ice_vsi *vsi); +int ice_vsi_cfg_single_rxq(struct ice_vsi *vsi, u16 q_idx); + +int ice_vsi_cfg_single_txq(struct ice_vsi *vsi, struct ice_ring **tx_rings, u16 q_idx); + int ice_vsi_cfg_rxqs(struct ice_vsi *vsi); int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi); void ice_vsi_cfg_msix(struct ice_vsi *vsi); -#ifdef CONFIG_PCI_IOV -void -ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx); +int ice_vsi_start_all_rx_rings(struct ice_vsi *vsi); -void -ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx); +int ice_vsi_stop_all_rx_rings(struct ice_vsi *vsi); int -ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, - u16 rel_vmvf_num, struct ice_ring *ring, - struct ice_txq_meta *txq_meta); - -void ice_fill_txq_meta(struct ice_vsi *vsi, struct ice_ring *ring, - struct ice_txq_meta *txq_meta); - -int ice_vsi_ctrl_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx); -#endif /* CONFIG_PCI_IOV */ - -int ice_vsi_add_vlan(struct ice_vsi *vsi, u16 vid); - -int ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid); +ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, + u16 rel_vmvf_num); +#ifdef HAVE_XDP_SUPPORT -int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi); +int ice_vsi_cfg_xdp_txqs(struct ice_vsi *vsi); -int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena); +int ice_vsi_stop_xdp_tx_rings(struct ice_vsi *vsi); -int ice_vsi_start_rx_rings(struct ice_vsi *vsi); +#endif /* HAVE_XDP_SUPPORT */ -int ice_vsi_stop_rx_rings(struct ice_vsi *vsi); - -int -ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, - u16 rel_vmvf_num); - -int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena, bool vlan_promisc); +bool ice_vsi_is_vlan_pruning_ena(struct ice_vsi *vsi); void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create); -void ice_vsi_delete(struct ice_vsi *vsi); +int ice_set_link(struct ice_vsi *vsi, bool ena); +void ice_vsi_delete(struct ice_vsi *vsi); int ice_vsi_clear(struct ice_vsi *vsi); +void ice_vsi_put_qs(struct ice_vsi *vsi); + +void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc); -#ifdef CONFIG_DCB int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc); -#endif /* CONFIG_DCB */ + +int ice_vsi_cfg_rss_lut_key(struct ice_vsi *vsi); struct ice_vsi * ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, - enum ice_vsi_type type, u16 vf_id); + enum ice_vsi_type vsi_type, u16 vf_id, struct ice_channel *ch, + u8 tc); void ice_napi_del(struct ice_vsi *vsi); @@ -89,24 +64,21 @@ int 
ice_vsi_release(struct ice_vsi *vsi); void ice_vsi_close(struct ice_vsi *vsi); +int ice_ena_vsi(struct ice_vsi *vsi, bool locked); + +void ice_dis_vsi(struct ice_vsi *vsi, bool locked); + int ice_free_res(struct ice_res_tracker *res, u16 index, u16 id); +u16 ice_get_valid_res_count(struct ice_res_tracker *res); + int ice_get_res(struct ice_pf *pf, struct ice_res_tracker *res, u16 needed, u16 id); -int ice_vsi_rebuild(struct ice_vsi *vsi); +int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi); bool ice_is_reset_in_progress(unsigned long *state); - -void ice_vsi_free_q_vectors(struct ice_vsi *vsi); - -void ice_trigger_sw_intr(struct ice_hw *hw, struct ice_q_vector *q_vector); - -void ice_vsi_put_qs(struct ice_vsi *vsi); - -#ifdef CONFIG_DCB -void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi); -#endif /* CONFIG_DCB */ +int ice_wait_for_reset(struct ice_pf *pf, unsigned long timeout); void ice_vsi_dis_irq(struct ice_vsi *vsi); @@ -116,14 +88,54 @@ void ice_vsi_free_rx_rings(struct ice_vsi *vsi); void ice_vsi_free_tx_rings(struct ice_vsi *vsi); -int ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena); +void ice_vsi_manage_rss_lut(struct ice_vsi *vsi, bool ena); + +void ice_update_tx_ring_stats(struct ice_ring *ring, u64 pkts, u64 bytes); -u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran); +void ice_update_rx_ring_stats(struct ice_ring *ring, u64 pkts, u64 bytes); -char *ice_nvm_version_str(struct ice_hw *hw); +void ice_vsi_cfg_frame_size(struct ice_vsi *vsi); + +int ice_status_to_errno(enum ice_status err); + +void +ice_write_qrxflxp_cntxt(struct ice_hw *hw, u16 pf_q, u32 rxdid, u32 prio, + bool __maybe_unused ena_ts); + +#ifdef HAVE_NETPOLL_CONTROLLER +irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data); +#endif /* HAVE_NETPOLL_CONTROLLER */ + +void ice_write_intrl(struct ice_q_vector *q_vector, u8 intrl); +void ice_write_itr(struct ice_ring_container *rc, u16 itr); +void ice_set_q_vector_intrl(struct ice_q_vector *q_vector); enum ice_status ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set); - bool ice_is_safe_mode(struct ice_pf *pf); +bool ice_is_peer_ena(struct ice_pf *pf); +bool ice_is_dflt_vsi_in_use(struct ice_sw *sw); +bool ice_is_vsi_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi); +int ice_set_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi); +int ice_clear_dflt_vsi(struct ice_sw *sw); +int ice_set_min_bw_limit(struct ice_vsi *vsi, u64 min_tx_rate); +int ice_set_max_bw_limit(struct ice_vsi *vsi, u64 max_tx_rate); +int ice_get_link_speed_kbps(struct ice_vsi *vsi); +int ice_get_link_speed_mbps(struct ice_vsi *vsi); +int ice_vsi_update_security(struct ice_vsi *vsi, + void (*fill)(struct ice_vsi_ctx *)); +#ifdef HAVE_METADATA_PORT_INFO +void ice_vsi_ctx_set_antispoof(struct ice_vsi_ctx *ctx); +void ice_vsi_ctx_clear_antispoof(struct ice_vsi_ctx *ctx); +#endif /* HAVE_METADATA_PORT_INFO */ +void ice_vsi_ctx_set_allow_override(struct ice_vsi_ctx *ctx); +void ice_vsi_ctx_clear_allow_override(struct ice_vsi_ctx *ctx); +int ice_vsi_add_vlan_zero(struct ice_vsi *vsi); +int ice_vsi_del_vlan_zero(struct ice_vsi *vsi); +bool ice_vsi_has_non_zero_vlans(struct ice_vsi *vsi); +u16 ice_vsi_num_non_zero_vlans(struct ice_vsi *vsi); +bool ice_is_feature_supported(struct ice_pf *pf, enum ice_feature f); +void ice_set_feature_support(struct ice_pf *pf, enum ice_feature f); +void ice_clear_feature_support(struct ice_pf *pf, enum ice_feature f); +void ice_init_feature_support(struct ice_pf *pf); #endif /* !_ICE_LIB_H_ */ diff --git 
a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d0ccb7ad447b194a849663770b05ad310d1040e4..0461100e5c9d089a3fc9d8805dd0c18b84f504f7 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1,34 +1,57 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ /* Intel(R) Ethernet Connection E800 Series Linux Driver */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "ice.h" +#include "ice_base.h" #include "ice_lib.h" +#include "ice_fltr.h" #include "ice_dcb_lib.h" - -#define DRV_VERSION_MAJOR 0 -#define DRV_VERSION_MINOR 8 -#define DRV_VERSION_BUILD 1 - -#define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \ - __stringify(DRV_VERSION_MINOR) "." \ - __stringify(DRV_VERSION_BUILD) "-k" +#include "ice_dcb_nl.h" +#include "ice_devlink.h" +#include "ice_eswitch.h" +/* Including ice_trace.h with CREATE_TRACE_POINTS defined will generate the + * ice tracepoint functions. This must be done exactly once across the + * ice driver. + */ +#define CREATE_TRACE_POINTS +#include "ice_trace.h" +#undef CREATE_TRACE_POINTS +#include "ice_tc_lib.h" +#include "ice_vsi_vlan_ops.h" +#include "ice_fwlog.h" + +#define DRV_VERSION_MAJOR 1 +#define DRV_VERSION_MINOR 6 +#define DRV_VERSION_BUILD 7 + +#define DRV_VERSION "1.6.7.1.1" #define DRV_SUMMARY "Intel(R) Ethernet Connection E800 Series Linux Driver" -const char ice_drv_ver[] = DRV_VERSION; +#ifdef ICE_ADD_PROBES +#define DRV_VERSION_EXTRA "_probes" +#else +#define DRV_VERSION_EXTRA "" +#endif /* ICE_ADD_PROBES */ + +const char ice_drv_ver[] = DRV_VERSION DRV_VERSION_EXTRA; static const char ice_driver_string[] = DRV_SUMMARY; -static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation."; +static const char ice_copyright[] = "Copyright (C) 2018-2021, Intel Corporation."; /* DDP Package file located in firmware search paths (e.g. /lib/firmware/) */ +#if UTS_UBUNTU_RELEASE_ABI +#define ICE_DDP_PKG_PATH "updates/intel/ice/ddp/" +#else /* UTS_UBUNTU_RELEASE_ABI */ #define ICE_DDP_PKG_PATH "intel/ice/ddp/" +#endif /* UTS_UBUNTU_RELEASE_ABI */ #define ICE_DDP_PKG_FILE ICE_DDP_PKG_PATH "ice.pkg" MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION(DRV_SUMMARY); MODULE_LICENSE("GPL v2"); -MODULE_VERSION(DRV_VERSION); +MODULE_VERSION(DRV_VERSION DRV_VERSION_EXTRA); MODULE_FIRMWARE(ICE_DDP_PKG_FILE); static int debug = -1; @@ -39,14 +62,323 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all), hw debug_mask (0x8XXXX MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)"); #endif /* !CONFIG_DYNAMIC_DEBUG */ + + +static ushort fwlog_level = ICE_FWLOG_LEVEL_NONE; +module_param(fwlog_level, ushort, 0644); +MODULE_PARM_DESC(fwlog_level, "FW event level to log. All levels <= to the specified value are enabled. Values: 0=none, 1=error, 2=warning, 3=normal, 4=verbose. 
Invalid values: >=5\n"); +/* Apply a bitmask to control which categories of FW logging is recorded + * 00000001 - General/Minor + * 00000002 - Control (Resets/Autoload) + * 00000004 - Link Management + * 00000008 - Link Topology Detection + * 00000010 - Dreadnought Lake + * 00000020 - I2C + * 00000040 - SDP + * 00000080 - MDIO + * 00000100 - Admin Queue + * 00000200 - HDMA + * 00000400 - LLDP + * 00000800 - DCBx + * 00001000 - DCB + * 00002000 - XLR + * 00004000 - NVM + * 00008000 - Authentication + * 00010000 - VPD + * 00020000 - IOSF + * 00040000 - Parser + * 00080000 - Switch + * 00100000 - Scheduler + * 00200000 - TX Queue Management + * 00400000 - ACL + * 00800000 - Post + * 01000000 - Watchdog + * 02000000 - Task Dispatcher + * 04000000 - Manageability + * 08000000 - Synce + * 10000000 - Health + * 20000000 - TimeSync + * 40000000 - PF Registration + * 80000000 - Module Version + */ +static unsigned long fwlog_events; /* no enabled events by default */ +module_param(fwlog_events, ulong, 0644); +MODULE_PARM_DESC(fwlog_events, "FW events to log (32-bit mask)\n"); + static struct workqueue_struct *ice_wq; + +static const struct net_device_ops ice_netdev_recovery_ops; static const struct net_device_ops ice_netdev_safe_mode_ops; static const struct net_device_ops ice_netdev_ops; - static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type); +static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type); static void ice_vsi_release_all(struct ice_pf *pf); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +static int ice_rebuild_channels(struct ice_pf *pf); +static void ice_remove_q_channels(struct ice_vsi *vsi, bool rem_adv_fltr); +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + + +bool netif_is_ice(struct net_device *dev) +{ + return dev && (dev->netdev_ops == &ice_netdev_ops); +} + +#ifdef HAVE_NETDEV_SB_DEV +static void ice_deinit_macvlan(struct ice_vsi *vsi); + +#endif /* HAVE_NETDEV_SB_DEV */ +#ifdef HAVE_TC_INDIR_BLOCK +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) +static int +ice_indr_setup_tc_cb(struct net_device *netdev, struct Qdisc *sch, + void *cb_priv, enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) +static int +ice_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) +static int ice_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data); +static int ice_indr_setup_block_cb(enum tc_setup_type type, void *type_data, + void *indr_priv); +#else /* !HAVE_TC_FLOW_INDIR_DEV */ +static int +ice_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr); +static void ice_indr_clean_block_privs(struct ice_netdev_priv *np); +#endif /* HAVE_TC_INDIR_BLOCK */ +#endif +/** + * ice_chnl_subtask_handle_interrupt - if needed, trigger SW interrupt on + * channel enabled vector + * @pf: pointer to PF struct + * + * This function process all channel enabled vectors and based on jiffy + * and delta between jiffies, decides and triggers software initiated + * interrupt on each of such vectors. Logic used: + * if on given vector jiffies delta is greated than 1 second and old + * snapshot of jiffies is valid, then trigger software interrupt. 
+ * The jiffies snapshot is stored/updated in the vector whenever the vector
+ * is serviced through busy-poll.
+ */
+static void ice_chnl_subtask_handle_interrupt(struct ice_pf *pf)
+{
+	struct ice_vsi *vsi;
+	unsigned long end;
+	unsigned int i;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi || test_bit(ICE_VSI_DOWN, vsi->state))
+		return;
+
+	if (!(vsi->netdev && netif_carrier_ok(vsi->netdev)))
+		return;
+
+	for (i = 0; i < vsi->num_txq; i++) {
+		struct ice_ring *tx_ring = vsi->tx_rings[i];
+		struct ice_ring *rx_ring = vsi->rx_rings[i];
+		struct ice_q_vector *q_vector;
+
+		if (!(tx_ring && tx_ring->desc && rx_ring))
+			continue;
+		q_vector = tx_ring->q_vector;
+		if (!q_vector || !ice_vector_ch_enabled(q_vector))
+			continue;
+
+		end = tx_ring->q_vector->jiffy;
+		if (!end)
+			continue;
+
+		/* trigger software interrupt (to revive queue processing) if
+		 * the vector is channel enabled and only if the current
+		 * jiffies is at least 1 sec (worth of jiffies, hence
+		 * multiplying by HZ) more than old_jiffies
+		 */
+#define ICE_JIFFY_DELTA_IN_SEC	(1 * HZ)
+		end += ICE_JIFFY_DELTA_IN_SEC;
+		if (time_is_before_jiffies(end) &&
+		    (q_vector->state_flags & ICE_CHNL_ONCE_IN_BP)) {
+#ifdef ADQ_PERF_COUNTERS
+			ice_sw_intr_cntr(q_vector, false);
+#endif /* ADQ_PERF_COUNTERS */
+			ice_adq_trigger_sw_intr(&pf->hw, q_vector);
+		}
+	}
+}
+
+/**
+ * ice_flush_vsi_fd_fltrs - flush VSI specific FD entries
+ * @vsi: ptr to VSI
+ *
+ * This function flushes all FD entries specific to the VSI from the
+ * HW FD table
+ */
+static inline void ice_flush_vsi_fd_fltrs(struct ice_vsi *vsi)
+{
+	struct device *dev = ice_pf_to_dev(vsi->back);
+	enum ice_status status;
+
+	status = ice_clear_vsi_fd_table(&vsi->back->hw, vsi->vsi_num);
+	if (status)
+		dev_err(dev, "Failed to clear FD table for %s, vsi_num: %u, status: %s\n",
+			ice_vsi_type_str(vsi->type), vsi->vsi_num,
+			ice_stat_str(status));
+}
+
+/**
+ * ice_chnl_handle_fd_transition - handle VSI specific FD transition
+ * @main_vsi: ptr to main VSI (ICE_VSI_PF)
+ * @ch: ptr to channel
+ * @hw_fd_cnt: HW FD count specific to VSI
+ * @fd_pkt_cnt: packets serviced through the inline-FD filter
+ * @sw_fd_cnt: SW tracking of how many inline-FD filters were programmed per VSI
+ *
+ * This function determines whether the given VSI should continue to use
+ * inline-FD resources or not and sets the bit accordingly. It also flushes
+ * the VSI's FD entries from the HW table if the table-full condition is
+ * detected 'n' times and no more packets are serviced through the inline-FD
+ * filter
+ */
+static void
+ice_chnl_handle_fd_transition(struct ice_vsi *main_vsi, struct ice_channel *ch,
+			      u32 hw_fd_cnt, u64 fd_pkt_cnt, int sw_fd_cnt)
+{
+	struct ice_vsi *vsi;
+
+	if (!ch || !main_vsi)
+		return;
+
+	vsi = ch->ch_vsi;
+	if (!vsi)
+		return;
+
+	/* if we reached the table-full condition and saw no inline-FD filter
+	 * activity in the HW table during the last 'n' runs of the service
+	 * task, then it is safe to "drop HW table entries"
+	 */
+	/* check to see if given VSI reached max limit of FD entries */
+	if (ice_is_vsi_fd_table_full(vsi, hw_fd_cnt)) {
+		/* check to see if there are any hits using inline-FD filters,
+		 * if not start "table_full" counter
+		 */
+		if (!ch->fd_pkt_cnt && !fd_pkt_cnt &&
+		    ch->fd_pkt_cnt == fd_pkt_cnt) {
+			/* HW table is FULL and no more packets are being
+			 * serviced through inline-FD filters (judged by
+			 * looking at the current and prev packets serviced).
+			 * If the current and prev packet counts are not
+			 * changing, it is safe to assume that even though
+			 * inline-FD filters exist in the HW table, the flows
+			 * associated with those filters have ended via the
+			 * RST code path
+			 */
+			vsi->cnt_tbl_full++;
+			main_vsi->cnt_tbl_full++;
+		} else {
+			vsi->cnt_tbl_full = 0;
+		}
+
+		/* the HW table remained full during the last 'n' runs (the
+		 * HW FD table full condition is detected based on SW counter
+		 * based heuristics); allow around 4 seconds in the same
+		 * condition, otherwise proceed with purging HW table
+		 * entries
+		 */
+		if (vsi->cnt_tbl_full < ICE_TBL_FULL_TIMES)
+			return;
+
+		/* if we are here, then it is safe to flush HW inline-FD filters */
+		ice_flush_vsi_fd_fltrs(vsi);
+		/* stats to keep track of how many times the HW table is flushed */
+		vsi->cnt_table_flushed++;
+		main_vsi->cnt_table_flushed++;
+
+		/* reset VSI specific counters */
+		atomic_set(&vsi->inline_fd_active_cnt, 0);
+		vsi->cnt_tbl_full = 0;
+		/* clear the feature flag for inline-FD/RSS */
+		clear_bit(ICE_SWITCH_TO_RSS, vsi->adv_state);
+	} else if ((u32)sw_fd_cnt > hw_fd_cnt) {
+		/* HW table (inline-FD filters) is not full and the SW count
+		 * is higher than the actual number of entries in the HW
+		 * table; time to sync the SW counter with the HW counter
+		 * (tracking inline-FD filter count) and transition back to
+		 * using inline-FD filters
+		 */
+		atomic_set(&vsi->inline_fd_active_cnt, hw_fd_cnt);
+		vsi->cnt_tbl_full = 0;
+		/* stats to keep track of how many times we transitioned into
+		 * inline-FD from RSS
+		 */
+		vsi->cnt_inline_fd_transition++;
+		main_vsi->cnt_inline_fd_transition++;
+		/* clear the feature flag for inline-FD/RSS */
+		clear_bit(ICE_SWITCH_TO_RSS, vsi->adv_state);
+	}
+}
+
+/**
+ * ice_channel_sync_global_cntrs - sync SW and HW FD specific counters
+ * @pf: ptr to PF
+ *
+ * This function iterates through all channel VSIs and handles the transition
+ * of FD (Flow-director) -> RSS and vice versa; if needed it also flushes the
+ * VSI specific FD entries from the HW table
+ */
+static void ice_channel_sync_global_cntrs(struct ice_pf *pf)
+{
+	struct ice_vsi *main_vsi;
+	struct ice_channel *ch;
+
+	main_vsi = ice_get_main_vsi(pf);
+	if (!main_vsi)
+		return;
+
+	list_for_each_entry(ch, &main_vsi->ch_list, list) {
+		struct ice_vsi *ch_vsi;
+		u64 fd_pkt_cnt;
+		int sw_fd_cnt;
+		u32 hw_fd_cnt;
+
+		ch_vsi = ch->ch_vsi;
+		if (!ch_vsi)
+			continue;
+		if (!ice_vsi_fd_ena(ch_vsi))
+			continue;
+		/* bail out if SWITCH_TO_RSS is not set */
+		if (!test_bit(ICE_SWITCH_TO_RSS, ch_vsi->adv_state))
+			continue;
+		/* the first counter index is always taken by the sideband
+		 * flow director, hence the channel specific counter index
+		 * has to be non-zero, otherwise skip...
+ */ + if (!ch->fd_cnt_index) + continue; + + /* read SW count */ + sw_fd_cnt = atomic_read(&ch_vsi->inline_fd_active_cnt); + /* Read HW count */ + hw_fd_cnt = ice_get_current_fd_cnt(ch_vsi); + /* Read the HW counter which was associated with inline-FD */ + fd_pkt_cnt = ice_read_cntr(pf, ch->fd_cnt_index); + + /* handle VSI specific transition: inline-FD/RSS + * if needed flush FD entries specific to VSI + */ + ice_chnl_handle_fd_transition(main_vsi, ch, hw_fd_cnt, + fd_pkt_cnt, sw_fd_cnt); + /* store the value of fd_pkt_cnt per channel */ + ch->fd_pkt_cnt = fd_pkt_cnt; + } +} + /** * ice_get_tx_pending - returns number of Tx descriptors not processed * @ring: the ring of descriptors @@ -82,7 +414,7 @@ static void ice_check_for_hang_subtask(struct ice_pf *pf) break; } - if (!vsi || test_bit(__ICE_DOWN, vsi->state)) + if (!vsi || test_bit(ICE_VSI_DOWN, vsi->state)) return; if (!(vsi->netdev && netif_carrier_ok(vsi->netdev))) @@ -93,7 +425,12 @@ static void ice_check_for_hang_subtask(struct ice_pf *pf) for (i = 0; i < vsi->num_txq; i++) { struct ice_ring *tx_ring = vsi->tx_rings[i]; - if (tx_ring && tx_ring->desc) { + if (!tx_ring) + continue; + if (ice_ring_ch_enabled(tx_ring)) + continue; + + if (tx_ring->desc) { /* If packet counter has not changed the queue is * likely stalled, so force an interrupt for this * queue. @@ -129,45 +466,19 @@ static void ice_check_for_hang_subtask(struct ice_pf *pf) static int ice_init_mac_fltr(struct ice_pf *pf) { enum ice_status status; - u8 broadcast[ETH_ALEN]; struct ice_vsi *vsi; + u8 *perm_addr; vsi = ice_get_main_vsi(pf); if (!vsi) return -EINVAL; - /* To add a MAC filter, first add the MAC to a list and then - * pass the list to ice_add_mac. - */ - - /* Add a unicast MAC filter so the VSI can get its packets */ - status = ice_vsi_cfg_mac_fltr(vsi, vsi->port_info->mac.perm_addr, true); + perm_addr = vsi->port_info->mac.perm_addr; + status = ice_fltr_add_mac_and_broadcast(vsi, perm_addr, ICE_FWD_TO_VSI); if (status) - goto unregister; - - /* VSI needs to receive broadcast traffic, so add the broadcast - * MAC address to the list as well. - */ - eth_broadcast_addr(broadcast); - status = ice_vsi_cfg_mac_fltr(vsi, broadcast, true); - if (status) - goto unregister; + return -EIO; return 0; -unregister: - /* We aren't useful with no MAC filters, so unregister if we - * had an error - */ - if (status && vsi->netdev->reg_state == NETREG_REGISTERED) { - dev_err(&pf->pdev->dev, - "Could not add MAC filters error %d. 
Unregistering device\n", - status); - unregister_netdev(vsi->netdev); - free_netdev(vsi->netdev); - vsi->netdev = NULL; - } - - return -EIO; } /** @@ -185,7 +496,8 @@ static int ice_add_mac_to_sync_list(struct net_device *netdev, const u8 *addr) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; - if (ice_add_mac_to_list(vsi, &vsi->tmp_sync_list, addr)) + if (ice_fltr_add_mac_to_list(vsi, &vsi->tmp_sync_list, addr, + ICE_FWD_TO_VSI)) return -EINVAL; return 0; @@ -206,7 +518,8 @@ static int ice_add_mac_to_unsync_list(struct net_device *netdev, const u8 *addr) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; - if (ice_add_mac_to_list(vsi, &vsi->tmp_unsync_list, addr)) + if (ice_fltr_add_mac_to_list(vsi, &vsi->tmp_unsync_list, addr, + ICE_FWD_TO_VSI)) return -EINVAL; return 0; @@ -220,37 +533,54 @@ static int ice_add_mac_to_unsync_list(struct net_device *netdev, const u8 *addr) */ static bool ice_vsi_fltr_changed(struct ice_vsi *vsi) { - return test_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags) || - test_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags) || - test_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags); + return test_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state) || + test_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state) || + test_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); } /** - * ice_cfg_promisc - Enable or disable promiscuous mode for a given PF + * ice_set_promisc - Enable promiscuous mode for a given PF * @vsi: the VSI being configured * @promisc_m: mask of promiscuous config bits - * @set_promisc: enable or disable promisc flag request * */ -static int ice_cfg_promisc(struct ice_vsi *vsi, u8 promisc_m, bool set_promisc) +static int ice_set_promisc(struct ice_vsi *vsi, u8 promisc_m) { - struct ice_hw *hw = &vsi->back->hw; - enum ice_status status = 0; + enum ice_status status; if (vsi->type != ICE_VSI_PF) return 0; - if (vsi->vlan_ena) { - status = ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_m, - set_promisc); - } else { - if (set_promisc) - status = ice_set_vsi_promisc(hw, vsi->idx, promisc_m, - 0); - else - status = ice_clear_vsi_promisc(hw, vsi->idx, promisc_m, - 0); - } + if (ice_vsi_has_non_zero_vlans(vsi)) + status = ice_fltr_set_vlan_vsi_promisc(&vsi->back->hw, vsi, promisc_m); + else + status = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, 0, + vsi->port_info->lport); + + if (status) + return -EIO; + + return 0; +} + +/** + * ice_clear_promisc - Disable promiscuous mode for a given PF + * @vsi: the VSI being configured + * @promisc_m: mask of promiscuous config bits + * + */ +static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m) +{ + enum ice_status status; + + if (vsi->type != ICE_VSI_PF) + return 0; + + if (ice_vsi_has_non_zero_vlans(vsi)) + status = ice_fltr_clear_vlan_vsi_promisc(&vsi->back->hw, vsi, promisc_m); + else + status = ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, 0, + vsi->port_info->lport); if (status) return -EIO; @@ -258,6 +588,7 @@ static int ice_cfg_promisc(struct ice_vsi *vsi, u8 promisc_m, bool set_promisc) return 0; } + /** * ice_vsi_sync_fltr - Update the VSI filter list to the HW * @vsi: ptr to the VSI @@ -266,7 +597,8 @@ static int ice_cfg_promisc(struct ice_vsi *vsi, u8 promisc_m, bool set_promisc) */ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) { - struct device *dev = &vsi->back->pdev->dev; + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + struct device *dev = ice_pf_to_dev(vsi->back); struct net_device *netdev = 
vsi->netdev; bool promisc_forced_on = false; struct ice_pf *pf = vsi->back; @@ -279,7 +611,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) if (!vsi->netdev) return -EINVAL; - while (test_and_set_bit(__ICE_CFG_BUSY, vsi->state)) + while (test_and_set_bit(ICE_CFG_BUSY, vsi->state)) usleep_range(1000, 2000); changed_flags = vsi->current_netdev_flags ^ vsi->netdev->flags; @@ -289,9 +621,9 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) INIT_LIST_HEAD(&vsi->tmp_unsync_list); if (ice_vsi_fltr_changed(vsi)) { - clear_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags); - clear_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags); - clear_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags); + clear_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state); + clear_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state); + clear_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state); /* grab the netdev's addr_list_lock */ netif_addr_lock_bh(netdev); @@ -304,8 +636,8 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } /* Remove MAC addresses in the unsync list */ - status = ice_remove_mac(hw, &vsi->tmp_unsync_list); - ice_free_fltr_list(dev, &vsi->tmp_unsync_list); + status = ice_fltr_remove_mac_list(vsi, &vsi->tmp_unsync_list); + ice_fltr_free_list(dev, &vsi->tmp_unsync_list); if (status) { netdev_err(netdev, "Failed to delete MAC filters\n"); /* if we failed because of alloc failures, just bail */ @@ -316,8 +648,8 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } /* Add MAC addresses in the sync list */ - status = ice_add_mac(hw, &vsi->tmp_sync_list); - ice_free_fltr_list(dev, &vsi->tmp_sync_list); + status = ice_fltr_add_mac_list(vsi, &vsi->tmp_sync_list); + ice_fltr_free_list(dev, &vsi->tmp_sync_list); /* If filter is added successfully or already exists, do not go into * 'if' condition and report it as error. Instead continue processing * rest of the function. @@ -329,11 +661,10 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) * space reserved for promiscuous filters. 
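* Forcing promiscuous mode here keeps the interface receiving traffic instead * of silently dropping frames for addresses that no longer fit in the table.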
*/ if (hw->adminq.sq_last_status == ICE_AQ_RC_ENOSPC && - !test_and_set_bit(__ICE_FLTR_OVERFLOW_PROMISC, + !test_and_set_bit(ICE_FLTR_OVERFLOW_PROMISC, vsi->state)) { promisc_forced_on = true; - netdev_warn(netdev, - "Reached MAC filter limit, forcing promisc mode on VSI %d\n", + netdev_warn(netdev, "Reached MAC filter limit, forcing promisc mode on VSI %d\n", vsi->vsi_num); } else { err = -EIO; @@ -343,25 +674,26 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) /* check for changes in promiscuous modes */ if (changed_flags & IFF_ALLMULTI) { if (vsi->current_netdev_flags & IFF_ALLMULTI) { - if (vsi->vlan_ena) + if (ice_vsi_has_non_zero_vlans(vsi)) promisc_m = ICE_MCAST_VLAN_PROMISC_BITS; else promisc_m = ICE_MCAST_PROMISC_BITS; - err = ice_cfg_promisc(vsi, promisc_m, true); + err = ice_set_promisc(vsi, promisc_m); if (err) { netdev_err(netdev, "Error setting Multicast promiscuous mode on VSI %i\n", vsi->vsi_num); vsi->current_netdev_flags &= ~IFF_ALLMULTI; goto out_promisc; } - } else if (!(vsi->current_netdev_flags & IFF_ALLMULTI)) { - if (vsi->vlan_ena) + } else { + /* !(vsi->current_netdev_flags & IFF_ALLMULTI) */ + if (ice_vsi_has_non_zero_vlans(vsi)) promisc_m = ICE_MCAST_VLAN_PROMISC_BITS; else promisc_m = ICE_MCAST_PROMISC_BITS; - err = ice_cfg_promisc(vsi, promisc_m, false); + err = ice_clear_promisc(vsi, promisc_m); if (err) { netdev_err(netdev, "Error clearing Multicast promiscuous mode on VSI %i\n", vsi->vsi_num); @@ -372,43 +704,49 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } if (((changed_flags & IFF_PROMISC) || promisc_forced_on) || - test_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags)) { - clear_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags); + test_bit(ICE_VSI_PROMISC_CHANGED, vsi->state)) { + clear_bit(ICE_VSI_PROMISC_CHANGED, vsi->state); if (vsi->current_netdev_flags & IFF_PROMISC) { /* Apply Rx filter rule to get traffic from wire */ - status = ice_cfg_dflt_vsi(hw, vsi->idx, true, - ICE_FLTR_RX); - if (status) { - netdev_err(netdev, "Error setting default VSI %i Rx rule\n", - vsi->vsi_num); - vsi->current_netdev_flags &= ~IFF_PROMISC; - err = -EIO; - goto out_promisc; + if (!ice_is_dflt_vsi_in_use(pf->first_sw)) { + err = ice_set_dflt_vsi(pf->first_sw, vsi); + if (err && err != -EEXIST) { + netdev_err(netdev, "Error %d setting default VSI %i Rx rule\n", + err, vsi->vsi_num); + vsi->current_netdev_flags &= + ~IFF_PROMISC; + goto out_promisc; + } + vlan_ops->dis_rx_filtering(vsi); } } else { /* Clear Rx filter to remove traffic from wire */ - status = ice_cfg_dflt_vsi(hw, vsi->idx, false, - ICE_FLTR_RX); - if (status) { - netdev_err(netdev, "Error clearing default VSI %i Rx rule\n", - vsi->vsi_num); - vsi->current_netdev_flags |= IFF_PROMISC; - err = -EIO; - goto out_promisc; + if (ice_is_vsi_dflt_vsi(pf->first_sw, vsi)) { + err = ice_clear_dflt_vsi(pf->first_sw); + if (err) { + netdev_err(netdev, "Error %d clearing default VSI %i Rx rule\n", + err, vsi->vsi_num); + vsi->current_netdev_flags |= + IFF_PROMISC; + goto out_promisc; + } + if (vsi->current_netdev_flags & + NETIF_F_HW_VLAN_CTAG_FILTER) + vlan_ops->ena_rx_filtering(vsi); } } } goto exit; out_promisc: - set_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags); + set_bit(ICE_VSI_PROMISC_CHANGED, vsi->state); goto exit; out: /* if something went wrong then set the changed flag so we try again */ - set_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags); - set_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags); + set_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state); + set_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state); exit: - 
clear_bit(__ICE_CFG_BUSY, vsi->state); + clear_bit(ICE_CFG_BUSY, vsi->state); return err; } @@ -435,64 +773,119 @@ static void ice_sync_fltr_subtask(struct ice_pf *pf) } /** - * ice_dis_vsi - pause a VSI - * @vsi: the VSI being paused + * ice_pf_dis_all_vsi - Pause all VSIs on a PF + * @pf: the PF * @locked: is the rtnl_lock already held */ -static void ice_dis_vsi(struct ice_vsi *vsi, bool locked) +static void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked) { - if (test_bit(__ICE_DOWN, vsi->state)) - return; + int node; + int v; - set_bit(__ICE_NEEDS_RESTART, vsi->state); + ice_for_each_vsi(pf, v) + if (pf->vsi[v]) + ice_dis_vsi(pf->vsi[v], locked); - if (vsi->type == ICE_VSI_PF && vsi->netdev) { - if (netif_running(vsi->netdev)) { - if (!locked) - rtnl_lock(); + for (node = 0; node < ICE_MAX_PF_AGG_NODES; node++) + pf->pf_agg_node[node].num_vsis = 0; - ice_stop(vsi->netdev); + for (node = 0; node < ICE_MAX_VF_AGG_NODES; node++) + pf->vf_agg_node[node].num_vsis = 0; - if (!locked) - rtnl_unlock(); - } else { - ice_vsi_close(vsi); +#ifdef HAVE_NETDEV_SB_DEV + for (node = 0; node < ICE_MAX_MACVLAN_AGG_NODES; node++) + pf->macvlan_agg_node[node].num_vsis = 0; +#endif +} + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_remove_tc_fltrs - clear TC filters configuration + * @pf: ptr to PF; TC-flower based filters are tracked at the PF level + * @remove_from_list: true if filters are to be removed from the tc_fltr list + * + * Remove all advanced TC flower switch filters from software bookkeeping. + * If the 'remove_from_list' parameter is set to true, all filters are also + * removed from the PF's tc_flower_fltr_list and it's not possible to + * restore them (after a reset, for example). + */ +static void ice_remove_tc_fltrs(struct ice_pf *pf, bool remove_from_list) +{ + struct ice_tc_flower_fltr *fltr; + struct hlist_node *node2; + + hlist_for_each_entry_safe(fltr, node2, + &pf->tc_flower_fltr_list, + tc_flower_node) { + struct ice_adv_fltr_mgmt_list_entry *entry, *tmp; + struct list_head *list_head; + struct mutex *rule_lock; + struct ice_switch_info *sw; + + sw = pf->hw.switch_info; + if (!sw->recp_list[fltr->rid].recp_created) + continue; + rule_lock = &sw->recp_list[fltr->rid].filt_rule_lock; + list_head = &sw->recp_list[fltr->rid].filt_rules; + + list_for_each_entry_safe(entry, tmp, list_head, list_entry) { + if (entry->rule_info.fltr_rule_id == fltr->rule_id) { + mutex_lock(rule_lock); + list_del(&entry->list_entry); + devm_kfree(ice_pf_to_dev(pf), entry->lkups); + devm_kfree(ice_pf_to_dev(pf), entry); + mutex_unlock(rule_lock); + break; + } + } + + if (remove_from_list) { + hlist_del(&fltr->tc_flower_node); + kfree(fltr); } } } +#endif /* HAVE_TC_SETUP_CLSFLOWER */ /** - * ice_pf_dis_all_vsi - Pause all VSIs on a PF - * @pf: the PF - * @locked: is the rtnl_lock already held + * ice_clear_sw_switch_recipes - clear switch recipes + * @pf: board private structure + * + * Mark switch recipes as not created in sw structures. There are cases where + * rules (especially advanced rules) need to be restored, either re-read from + * hardware or added again, for example after a reset. The 'recp_created' flag + * prevents that and needs to be cleared upfront. 
*/ -#ifdef CONFIG_DCB -void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked) -#else -static void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked) -#endif /* CONFIG_DCB */ +static void ice_clear_sw_switch_recipes(struct ice_pf *pf) { - int v; + struct ice_sw_recipe *recp; + u8 i; - ice_for_each_vsi(pf, v) - if (pf->vsi[v]) - ice_dis_vsi(pf->vsi[v], locked); + recp = pf->hw.switch_info->recp_list; + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) + recp[i].recp_created = false; } /** - * ice_prepare_for_reset - prep for the core to reset + * ice_prepare_for_reset - prep for reset * @pf: board private structure + * @reset_type: reset type requested * * Inform or close all dependent features in prep for reset. */ static void -ice_prepare_for_reset(struct ice_pf *pf) +ice_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type) { struct ice_hw *hw = &pf->hw; - int i; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + struct ice_vsi *vsi; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + unsigned int i; + + dev_dbg(ice_pf_to_dev(pf), "reset_type=%d\n", reset_type); /* already prepared for reset */ - if (test_bit(__ICE_PREPARED_FOR_RESET, pf->state)) + if (test_bit(ICE_PREPARED_FOR_RESET, pf->state)) return; /* Notify VFs of impending reset */ @@ -500,133 +893,335 @@ ice_prepare_for_reset(struct ice_pf *pf) ice_vc_notify_reset(pf); /* Disable VFs until reset is completed */ - for (i = 0; i < pf->num_alloc_vfs; i++) + ice_for_each_vf(pf, i) ice_set_vf_state_qs_dis(&pf->vf[i]); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* release ADQ specific HW and SW resources */ + vsi = ice_get_main_vsi(pf); + if (!vsi) + goto skip; + + /* to be on the safe side, reset orig_rss_size so that the normal + * flow of deciding rss_size can take precedence + */ + vsi->orig_rss_size = 0; + + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) { + if (reset_type == ICE_RESET_PFR) { + vsi->old_ena_tc = vsi->all_enatc; + vsi->old_numtc = vsi->all_numtc; + } else { + ice_remove_q_channels(vsi, true); + + /* for other reset types, channel rebuild is not + * supported, hence reset the needed info + */ + vsi->old_ena_tc = 0; + vsi->all_enatc = 0; + vsi->old_numtc = 0; + vsi->all_numtc = 0; + vsi->req_txq = 0; + vsi->req_rxq = 0; + clear_bit(ICE_FLAG_TC_MQPRIO, pf->flags); + memset(&vsi->mqprio_qopt, 0, sizeof(vsi->mqprio_qopt)); + } + } +skip: +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + + if (ice_is_eswitch_mode_switchdev(pf)) { +#ifdef HAVE_TC_SETUP_CLSFLOWER + ice_remove_tc_fltrs(pf, false); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + if (reset_type != ICE_RESET_PFR) + ice_clear_sw_switch_recipes(pf); + } + /* clear SW filtering DB */ ice_clear_hw_tbls(hw); /* disable the VSIs and their queues that are not already DOWN */ ice_pf_dis_all_vsi(pf, false); + if (test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + ice_ptp_release(pf); + if (hw->port_info) ice_sched_clear_port(hw->port_info); ice_shutdown_all_ctrlq(hw); - set_bit(__ICE_PREPARED_FOR_RESET, pf->state); + set_bit(ICE_PREPARED_FOR_RESET, pf->state); } + /** - * ice_do_reset - Initiate one of many types of resets - * @pf: board private structure - * @reset_type: reset type requested - * before this function was called. + * ice_print_recovery_msg - print recovery mode message + * @dev: pointer to the device instance */ -static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type) +static void ice_print_recovery_msg(struct device *dev) { - struct device *dev = &pf->pdev->dev; - struct ice_hw *hw = &pf->hw; + dev_err(dev, "Firmware recovery mode detected. Limiting functionality. 
Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode\n"); +} - dev_dbg(dev, "reset_type 0x%x requested\n", reset_type); - WARN_ON(in_interrupt()); +/** + * ice_prepare_for_recovery_mode - prepare the driver for FW recovery mode + * @pf: pointer to the PF instance + */ +static void ice_prepare_for_recovery_mode(struct ice_pf *pf) +{ + struct ice_vsi *vsi; - ice_prepare_for_reset(pf); + ice_print_recovery_msg(ice_pf_to_dev(pf)); + set_bit(ICE_RECOVERY_MODE, pf->state); - /* trigger the reset */ - if (ice_reset(hw, reset_type)) { - dev_err(dev, "reset %d failed\n", reset_type); - set_bit(__ICE_RESET_FAILED, pf->state); - clear_bit(__ICE_RESET_OICR_RECV, pf->state); - clear_bit(__ICE_PREPARED_FOR_RESET, pf->state); - clear_bit(__ICE_PFR_REQ, pf->state); - clear_bit(__ICE_CORER_REQ, pf->state); - clear_bit(__ICE_GLOBR_REQ, pf->state); - return; + vsi = ice_get_main_vsi(pf); + if (vsi && vsi->netdev) { + ice_set_ethtool_recovery_ops(vsi->netdev); + netif_carrier_off(vsi->netdev); + netif_tx_stop_all_queues(vsi->netdev); } - /* PFR is a bit of a special case because it doesn't result in an OICR - * interrupt. So for PFR, rebuild after the reset and clear the reset- - * associated state bits. - */ - if (reset_type == ICE_RESET_PFR) { - pf->pfr_count++; - ice_rebuild(pf, reset_type); - clear_bit(__ICE_PREPARED_FOR_RESET, pf->state); - clear_bit(__ICE_PFR_REQ, pf->state); - ice_reset_all_vfs(pf, true); + if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) + if (!pci_vfs_assigned(pf->pdev)) + pci_disable_sriov(pf->pdev); + + if (pf->ptp.clock) + ptp_clock_unregister(pf->ptp.clock); + + set_bit(ICE_PREPPED_RECOVERY_MODE, pf->state); +} + +/** + * ice_remove_recovery_mode - Unload helper when in FW recovery mode + * @pf: pointer to the PF instance + */ +static void ice_remove_recovery_mode(struct ice_pf *pf) +{ + struct ice_vsi *vsi = ice_get_main_vsi(pf); + struct device *dev = ice_pf_to_dev(pf); + + if (vsi && vsi->netdev) { + unregister_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + free_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + devm_kfree(dev, vsi); } + + ice_reset(&pf->hw, ICE_RESET_PFR); + pci_disable_pcie_error_reporting(pf->pdev); + ice_devlink_unregister(pf); } /** - * ice_reset_subtask - Set up for resetting the device and driver - * @pf: board private structure + * ice_probe_recovery_mode - Load helper when in FW recovery mode + * @pf: pointer to the PF instance */ -static void ice_reset_subtask(struct ice_pf *pf) +static int ice_probe_recovery_mode(struct ice_pf *pf) { - enum ice_reset_req reset_type = ICE_RESET_INVAL; + struct device *dev = ice_pf_to_dev(pf); + struct ice_netdev_priv *np; + struct net_device *netdev; + struct ice_vsi *vsi; + int err; - /* When a CORER/GLOBR/EMPR is about to happen, the hardware triggers an + ice_print_recovery_msg(dev); + set_bit(ICE_RECOVERY_MODE, pf->state); + + /* create one single VSI instance and netdev to allow for ethtool + * recovery ops. This VSI cannot be backed by a VSI in the HW as + * the FW is in recovery mode. 
Thus, no traffic is possible on this + * VSI/netdev + */ + pf->vsi = devm_kcalloc(dev, 1, sizeof(*pf->vsi), GFP_KERNEL); + if (!pf->vsi) + return -ENOMEM; + + vsi = devm_kzalloc(dev, sizeof(*vsi), GFP_KERNEL); + if (!vsi) { + err = -ENOMEM; + goto err_vsi; + } + + pf->vsi[0] = vsi; + vsi->back = pf; + + /* allocate an etherdev with 1 queue pair */ + netdev = alloc_etherdev(sizeof(*np)); + if (!netdev) { + err = -ENOMEM; + goto err_netdev; + } + + set_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + vsi->netdev = netdev; + np = netdev_priv(netdev); + np->vsi = vsi; + SET_NETDEV_DEV(netdev, dev); + eth_hw_addr_random(netdev); + + netdev->netdev_ops = &ice_netdev_recovery_ops; + ice_set_ethtool_recovery_ops(netdev); + + err = register_netdev(netdev); + if (err) + goto err_register; + + set_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + + return 0; + +err_register: + free_netdev(netdev); + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); +err_netdev: + devm_kfree(dev, vsi); +err_vsi: + devm_kfree(dev, pf->vsi); + return err; +} + +/** + * ice_do_reset - Initiate one of many types of resets + * @pf: board private structure + * @reset_type: reset type requested before this function was called. + */ +static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + + dev_dbg(dev, "reset_type 0x%x requested\n", reset_type); + + ice_prepare_for_reset(pf, reset_type); + + /* trigger the reset */ + if (ice_reset(hw, reset_type)) { + dev_err(dev, "reset %d failed\n", reset_type); + set_bit(ICE_RESET_FAILED, pf->state); + clear_bit(ICE_RESET_OICR_RECV, pf->state); + clear_bit(ICE_PREPARED_FOR_RESET, pf->state); + clear_bit(ICE_PFR_REQ, pf->state); + clear_bit(ICE_CORER_REQ, pf->state); + clear_bit(ICE_GLOBR_REQ, pf->state); + wake_up(&pf->reset_wait_queue); + return; + } + + /* PFR is a bit of a special case because it doesn't result in an OICR + * interrupt. So for PFR, rebuild after the reset and clear the reset- + * associated state bits. + */ + if (reset_type == ICE_RESET_PFR) { + pf->pfr_count++; + ice_rebuild(pf, reset_type); + clear_bit(ICE_PREPARED_FOR_RESET, pf->state); + clear_bit(ICE_PFR_REQ, pf->state); + wake_up(&pf->reset_wait_queue); + ice_reset_all_vfs(pf, true); + } +} + +/** + * ice_reset_subtask - Set up for resetting the device and driver + * @pf: board private structure + */ +static void ice_reset_subtask(struct ice_pf *pf) +{ + enum ice_reset_req reset_type = ICE_RESET_INVAL; + + /* When a CORER/GLOBR/EMPR is about to happen, the hardware triggers an * OICR interrupt. The OICR handler (ice_misc_intr) determines what type * of reset is pending and sets bits in pf->state indicating the reset - * type and __ICE_RESET_OICR_RECV. So, if the latter bit is set + * type and ICE_RESET_OICR_RECV. So, if the latter bit is set * prepare for pending reset if not already (for PF software-initiated * global resets the software should already be prepared for it as - * indicated by __ICE_PREPARED_FOR_RESET; for global resets initiated + * indicated by ICE_PREPARED_FOR_RESET; for global resets initiated * by firmware or software on other PFs, that bit is not set so prepare * for the reset now), poll for reset done, rebuild and return. 
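* Should the readiness check fail, ICE_RESET_FAILED is latched, the request * bits are cleared and, when the firmware reports recovery mode, the recovery * path is entered instead of a rebuild.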
*/ - if (test_bit(__ICE_RESET_OICR_RECV, pf->state)) { + if (test_bit(ICE_RESET_OICR_RECV, pf->state)) { /* Perform the largest reset requested */ - if (test_and_clear_bit(__ICE_CORER_RECV, pf->state)) + if (test_and_clear_bit(ICE_CORER_RECV, pf->state)) reset_type = ICE_RESET_CORER; - if (test_and_clear_bit(__ICE_GLOBR_RECV, pf->state)) + if (test_and_clear_bit(ICE_GLOBR_RECV, pf->state)) reset_type = ICE_RESET_GLOBR; - if (test_and_clear_bit(__ICE_EMPR_RECV, pf->state)) + if (test_and_clear_bit(ICE_EMPR_RECV, pf->state)) reset_type = ICE_RESET_EMPR; /* return if no valid reset type requested */ if (reset_type == ICE_RESET_INVAL) return; - ice_prepare_for_reset(pf); + if (ice_is_peer_ena(pf)) + ice_for_each_peer(pf, &reset_type, + ice_close_peer_for_reset); + ice_prepare_for_reset(pf, reset_type); /* make sure we are ready to rebuild */ if (ice_check_reset(&pf->hw)) { - set_bit(__ICE_RESET_FAILED, pf->state); - } else { - /* done with reset. start rebuild */ - pf->hw.reset_ongoing = false; - ice_rebuild(pf, reset_type); - /* clear bit to resume normal operations, but - * ICE_NEEDS_RESTART bit is set in case rebuild failed - */ - clear_bit(__ICE_RESET_OICR_RECV, pf->state); - clear_bit(__ICE_PREPARED_FOR_RESET, pf->state); - clear_bit(__ICE_PFR_REQ, pf->state); - clear_bit(__ICE_CORER_REQ, pf->state); - clear_bit(__ICE_GLOBR_REQ, pf->state); - ice_reset_all_vfs(pf, true); + set_bit(ICE_RESET_FAILED, pf->state); + clear_bit(ICE_RESET_OICR_RECV, pf->state); + clear_bit(ICE_PREPARED_FOR_RESET, pf->state); + clear_bit(ICE_PFR_REQ, pf->state); + clear_bit(ICE_CORER_REQ, pf->state); + clear_bit(ICE_GLOBR_REQ, pf->state); + wake_up(&pf->reset_wait_queue); + if (ice_get_fw_mode(&pf->hw) == ICE_FW_MODE_REC) + ice_prepare_for_recovery_mode(pf); + return; } + + + /* came out of reset. check if an NVM rollback happened */ + if (ice_get_fw_mode(&pf->hw) == ICE_FW_MODE_ROLLBACK) + ice_print_rollback_msg(&pf->hw); + + /* done with reset. start rebuild */ + pf->hw.reset_ongoing = false; + ice_rebuild(pf, reset_type); + /* clear bit to resume normal operations, but + * ICE_NEEDS_RESTART bit is set in case rebuild failed + */ + clear_bit(ICE_RESET_OICR_RECV, pf->state); + clear_bit(ICE_PREPARED_FOR_RESET, pf->state); + clear_bit(ICE_PFR_REQ, pf->state); + clear_bit(ICE_CORER_REQ, pf->state); + clear_bit(ICE_GLOBR_REQ, pf->state); + wake_up(&pf->reset_wait_queue); + ice_reset_all_vfs(pf, true); return; } /* No pending resets to finish processing. 
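Requests are latched as ICE_PFR_REQ/ICE_CORER_REQ/ICE_GLOBR_REQ in pf->state and checked in ascending order, so the widest reset requested wins.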
Check for new resets */ - if (test_bit(__ICE_PFR_REQ, pf->state)) + if (test_bit(ICE_PFR_REQ, pf->state)) reset_type = ICE_RESET_PFR; - if (test_bit(__ICE_CORER_REQ, pf->state)) + if (test_bit(ICE_CORER_REQ, pf->state)) reset_type = ICE_RESET_CORER; - if (test_bit(__ICE_GLOBR_REQ, pf->state)) + if (test_bit(ICE_GLOBR_REQ, pf->state)) reset_type = ICE_RESET_GLOBR; /* If no valid reset type requested just return */ if (reset_type == ICE_RESET_INVAL) return; /* reset if not already down or busy */ - if (!test_bit(__ICE_DOWN, pf->state) && - !test_bit(__ICE_CFG_BUSY, pf->state)) { + if (!test_bit(ICE_DOWN, pf->state) && + !test_bit(ICE_CFG_BUSY, pf->state)) { ice_do_reset(pf, reset_type); } } + +/** + * ice_sync_udp_fltr_subtask - sync the VSI filter list with HW + * @pf: board private structure + */ +static void ice_sync_udp_fltr_subtask(struct ice_pf __always_unused *pf) +{ +} + /** * ice_print_topo_conflict - print topology conflict message * @vsi: the VSI whose topology status is being checked @@ -636,7 +1231,16 @@ static void ice_print_topo_conflict(struct ice_vsi *vsi) switch (vsi->port_info->phy.link_info.topo_media_conflict) { case ICE_AQ_LINK_TOPO_CONFLICT: case ICE_AQ_LINK_MEDIA_CONFLICT: - netdev_info(vsi->netdev, "Possible mis-configuration of the Ethernet port detected, please use the Intel(R) Ethernet Port Configuration Tool application to address the issue.\n"); + case ICE_AQ_LINK_TOPO_UNREACH_PRT: + case ICE_AQ_LINK_TOPO_UNDRUTIL_PRT: + case ICE_AQ_LINK_TOPO_UNDRUTIL_MEDIA: + netdev_info(vsi->netdev, "Potential misconfiguration of the Ethernet port detected. If it was not intended, please use the Intel(R) Ethernet Port Configuration Tool to address the issue.\n"); + break; + case ICE_AQ_LINK_TOPO_UNSUPP_MEDIA: + if (test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, vsi->back->flags)) + netdev_warn(vsi->netdev, "An unsupported module type was detected. Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n"); + else + netdev_err(vsi->netdev, "Rx/Tx is disabled on this device because an unsupported module type was detected. 
Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n"); break; default: break; @@ -651,6 +1255,7 @@ static void ice_print_topo_conflict(struct ice_vsi *vsi) void ice_print_link_msg(struct ice_vsi *vsi, bool isup) { struct ice_aqc_get_phy_caps_data *caps; + const char *an_advertised; enum ice_status status; const char *fec_req; const char *speed; @@ -703,7 +1308,7 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) speed = "100 M"; break; default: - speed = "Unknown"; + speed = "Unknown "; break; } @@ -728,7 +1333,6 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) /* Get FEC mode based on negotiated link info */ switch (vsi->port_info->phy.link_info.fec_info) { case ICE_AQ_LINK_25G_RS_528_FEC_EN: - /* fall through */ case ICE_AQ_LINK_25G_RS_544_FEC_EN: fec = "RS-FEC"; break; @@ -747,17 +1351,20 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) an = "False"; /* Get FEC mode requested based on PHY caps last SW configuration */ - caps = devm_kzalloc(&vsi->back->pdev->dev, sizeof(*caps), GFP_KERNEL); + caps = kzalloc(sizeof(*caps), GFP_KERNEL); if (!caps) { fec_req = "Unknown"; + an_advertised = "Unknown"; goto done; } status = ice_aq_get_phy_caps(vsi->port_info, false, - ICE_AQC_REPORT_SW_CFG, caps, NULL); + ICE_AQC_REPORT_ACTIVE_CFG, caps, NULL); if (status) netdev_info(vsi->netdev, "Get phy capability failed.\n"); + an_advertised = ice_is_phy_caps_an_enabled(caps) ? "On" : "Off"; + if (caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_528_REQ || caps->link_fec_options & ICE_AQC_PHY_FEC_25G_RS_544_REQ) fec_req = "RS-FEC"; @@ -767,11 +1374,10 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) else fec_req = "NONE"; - devm_kfree(&vsi->back->pdev->dev, caps); - + kfree(caps); done: - netdev_info(vsi->netdev, "NIC Link is up %sbps, Requested FEC: %s, FEC: %s, Autoneg: %s, Flow Control: %s\n", - speed, fec_req, fec, an, fc); + netdev_info(vsi->netdev, "NIC Link is up %sbps Full Duplex, Requested FEC: %s, Negotiated FEC: %s, Autoneg Advertised: %s, Autoneg Negotiated: %s, Flow Control: %s\n", + speed, fec_req, fec, an_advertised, an, fc); ice_print_topo_conflict(vsi); } @@ -785,7 +1391,7 @@ static void ice_vsi_link_event(struct ice_vsi *vsi, bool link_up) if (!vsi) return; - if (test_bit(__ICE_DOWN, vsi->state) || !vsi->netdev) + if (test_bit(ICE_VSI_DOWN, vsi->state) || !vsi->netdev) return; if (vsi->type == ICE_VSI_PF) { @@ -802,6 +1408,127 @@ static void ice_vsi_link_event(struct ice_vsi *vsi, bool link_up) } } + +/** + * ice_set_dflt_mib - send a default config MIB to the FW + * @pf: private PF struct + * + * This function sends a default configuration MIB to the FW. + * + * If this function errors out at any point, the driver is still able to + * function. The main impact is that LFC may not operate as expected. + * Therefore an error state in this function should be treated with a DBG + * message and continue on with driver rebuild/reenable. 
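+ * The MIB is assembled as ETS CFG, ETS REC and PFC CFG TLVs in a single + * LLDPDU buffer and handed to ice_aq_set_lldp_mib().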
+ */ +static void ice_set_dflt_mib(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + u8 mib_type, *buf, *lldpmib = NULL; + u16 len, typelen, offset = 0; + struct ice_lldp_org_tlv *tlv; + struct ice_hw *hw = &pf->hw; + u32 ouisubtype; + + mib_type = SET_LOCAL_MIB_TYPE_LOCAL_MIB; + lldpmib = kzalloc(ICE_LLDPDU_SIZE, GFP_KERNEL); + if (!lldpmib) { + dev_dbg(dev, "%s Failed to allocate MIB memory\n", + __func__); + return; + } + + /* Add ETS CFG TLV */ + tlv = (struct ice_lldp_org_tlv *)lldpmib; + typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) | + ICE_IEEE_ETS_TLV_LEN); + tlv->typelen = htons(typelen); + ouisubtype = ((ICE_IEEE_8021QAZ_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_IEEE_SUBTYPE_ETS_CFG); + tlv->ouisubtype = htonl(ouisubtype); + + buf = tlv->tlvinfo; + buf[0] = 0; + + /* ETS CFG all UPs map to TC 0. Next 4 (1 - 4) Octets = 0. + * Octets 5 - 12 are BW values, set octet 5 to 100% BW. + * Octets 13 - 20 are TSA values - leave as zeros + */ + buf[5] = 0x64; + len = (typelen & ICE_LLDP_TLV_LEN_M) >> ICE_LLDP_TLV_LEN_S; + offset += len + 2; + tlv = (struct ice_lldp_org_tlv *) + ((char *)tlv + sizeof(tlv->typelen) + len); + + /* Add ETS REC TLV */ + buf = tlv->tlvinfo; + tlv->typelen = htons(typelen); + + ouisubtype = ((ICE_IEEE_8021QAZ_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_IEEE_SUBTYPE_ETS_REC); + tlv->ouisubtype = htonl(ouisubtype); + + /* First octet of buf is reserved + * Octets 1 - 4 map UP to TC - all UPs map to zero + * Octets 5 - 12 are BW values - set TC 0 to 100%. + * Octets 13 - 20 are TSA value - leave as zeros + */ + buf[5] = 0x64; + offset += len + 2; + tlv = (struct ice_lldp_org_tlv *) + ((char *)tlv + sizeof(tlv->typelen) + len); + + /* Add PFC CFG TLV */ + typelen = ((ICE_TLV_TYPE_ORG << ICE_LLDP_TLV_TYPE_S) | + ICE_IEEE_PFC_TLV_LEN); + tlv->typelen = htons(typelen); + + ouisubtype = ((ICE_IEEE_8021QAZ_OUI << ICE_LLDP_TLV_OUI_S) | + ICE_IEEE_SUBTYPE_PFC_CFG); + tlv->ouisubtype = htonl(ouisubtype); + + /* Octet 1 left as all zeros - PFC disabled */ + buf[0] = 0x08; + len = (typelen & ICE_LLDP_TLV_LEN_M) >> ICE_LLDP_TLV_LEN_S; + offset += len + 2; + + if (ice_aq_set_lldp_mib(hw, mib_type, (void *)lldpmib, offset, NULL)) + dev_dbg(dev, "%s Failed to set default LLDP MIB\n", __func__); + + kfree(lldpmib); +} + +/** + * ice_check_module_power + * @pf: pointer to PF struct + * @link_cfg_err: bitmap from the link info structure + * + * check module power level returned by a previous call to aq_get_link_info + * and print error messages if module power level is not supported + */ +static void ice_check_module_power(struct ice_pf *pf, u8 link_cfg_err) +{ + /* if module power level is supported, clear the flag */ + if (!(link_cfg_err & (ICE_AQ_LINK_INVAL_MAX_POWER_LIMIT | + ICE_AQ_LINK_MODULE_POWER_UNSUPPORTED))) { + clear_bit(ICE_FLAG_MOD_POWER_UNSUPPORTED, pf->flags); + return; + } + + /* if ICE_FLAG_MOD_POWER_UNSUPPORTED was previously set and the + * above block didn't clear this bit, there's nothing to do + */ + if (test_bit(ICE_FLAG_MOD_POWER_UNSUPPORTED, pf->flags)) + return; + + if (link_cfg_err & ICE_AQ_LINK_INVAL_MAX_POWER_LIMIT) { + dev_err(ice_pf_to_dev(pf), "The installed module is incompatible with the device's NVM image. Cannot start link\n"); + set_bit(ICE_FLAG_MOD_POWER_UNSUPPORTED, pf->flags); + } else if (link_cfg_err & ICE_AQ_LINK_MODULE_POWER_UNSUPPORTED) { + dev_err(ice_pf_to_dev(pf), "The module's power requirements exceed the device's power supply. 
Cannot start link\n"); + set_bit(ICE_FLAG_MOD_POWER_UNSUPPORTED, pf->flags); + } +} + /** * ice_link_event - process the link event * @pf: PF that the link event is associated with @@ -815,11 +1542,12 @@ static int ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, u16 link_speed) { + struct device *dev = ice_pf_to_dev(pf); struct ice_phy_info *phy_info; + enum ice_status status; struct ice_vsi *vsi; u16 old_link_speed; bool old_link; - int result; phy_info = &pi->phy; phy_info->link_info_old = phy_info->link_info; @@ -830,15 +1558,19 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, /* update the link info structures and re-enable link events, * don't bail on failure due to other book keeping needed */ - result = ice_update_link_info(pi); - if (result) - dev_dbg(&pf->pdev->dev, - "Failed to update link status and re-enable link events for port %d\n", - pi->lport); + status = ice_update_link_info(pi); + if (status) + dev_dbg(dev, "Failed to update link status on port %d, err %s aq_err %s\n", + pi->lport, ice_stat_str(status), + ice_aq_str(pi->hw->adminq.sq_last_status)); - /* if the old link up/down and speed is the same as the new */ - if (link_up == old_link && link_speed == old_link_speed) - return result; + ice_check_module_power(pf, pi->phy.link_info.link_cfg_err); + + /* Check if the link state is up after updating link info, and treat + * this event as an UP event since the link is actually UP now. + */ + if (phy_info->link_info.link_info & ICE_AQ_LINK_UP) + link_up = true; vsi = ice_get_main_vsi(pf); if (!vsi || !vsi->port_info) @@ -848,23 +1580,31 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, if (!test_bit(ICE_FLAG_NO_MEDIA, pf->flags) && !(pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE)) { set_bit(ICE_FLAG_NO_MEDIA, pf->flags); - - result = ice_aq_set_link_restart_an(pi, false, NULL); - if (result) { - dev_dbg(&pf->pdev->dev, - "Failed to set link down, VSI %d error %d\n", - vsi->vsi_num, result); - return result; - } + ice_set_link(vsi, false); } + + /* if the old link up/down and speed is the same as the new */ + if (link_up == old_link && link_speed == old_link_speed) + return 0; + + if (!ice_is_e810(&pf->hw)) + ice_ptp_link_change(pf, pf->hw.pf_id, link_up); + + if (ice_is_dcb_active(pf)) { + if (test_bit(ICE_FLAG_DCB_ENA, pf->flags)) + ice_dcb_rebuild(pf); + } else { + if (link_up) + ice_set_dflt_mib(pf); + } ice_vsi_link_event(vsi, link_up); ice_print_link_msg(vsi, link_up); - if (pf->num_alloc_vfs) - ice_vc_notify_link_state(pf); + ice_vc_notify_link_state(pf); - return result; + + return 0; } /** @@ -876,8 +1616,8 @@ static void ice_watchdog_subtask(struct ice_pf *pf) int i; /* if interface is down do nothing */ - if (test_bit(__ICE_DOWN, pf->state) || - test_bit(__ICE_CFG_BUSY, pf->state)) + if (test_bit(ICE_DOWN, pf->state) || + test_bit(ICE_CFG_BUSY, pf->state)) return; /* make sure we don't do these things too often */ @@ -886,14 +1626,16 @@ static void ice_watchdog_subtask(struct ice_pf *pf) return; pf->serv_tmr_prev = jiffies; - + if (!ice_is_e810(&pf->hw)) + ice_ptp_set_timestamp_offsets(pf); /* Update the stats for active netdevs so the network stack * can look at updated numbers whenever it cares to */ ice_update_pf_stats(pf); - ice_for_each_vsi(pf, i) + ice_for_each_vsi(pf, i) { if (pf->vsi[i] && pf->vsi[i]->netdev) ice_update_vsi_stats(pf->vsi[i]); + } } /** @@ -906,19 +1648,23 @@ static int ice_init_link_events(struct ice_port_info *pi) { u16 mask; + if 
(test_bit(ICE_BAD_EEPROM, + ((struct ice_pf *)pi->hw->back)->state)) { + dev_err(ice_hw_to_dev(pi->hw), "Link events disabled due to corrupted eeprom\n"); + return 0; + } + mask = ~((u16)(ICE_AQ_LINK_EVENT_UPDOWN | ICE_AQ_LINK_EVENT_MEDIA_NA | ICE_AQ_LINK_EVENT_MODULE_QUAL_FAIL)); if (ice_aq_set_event_mask(pi->hw, pi->lport, mask, NULL)) { - dev_dbg(ice_hw_to_dev(pi->hw), - "Failed to set link event mask for port %d\n", + dev_dbg(ice_hw_to_dev(pi->hw), "Failed to set link event mask for port %d\n", pi->lport); return -EIO; } if (ice_aq_get_link_info(pi, true, NULL, NULL)) { - dev_dbg(ice_hw_to_dev(pi->hw), - "Failed to enable link events for port %d\n", + dev_dbg(ice_hw_to_dev(pi->hw), "Failed to enable link events for port %d\n", pi->lport); return -EIO; } @@ -947,138 +1693,522 @@ ice_handle_link_event(struct ice_pf *pf, struct ice_rq_event_info *event) !!(link_data->link_info & ICE_AQ_LINK_UP), le16_to_cpu(link_data->link_speed)); if (status) - dev_dbg(&pf->pdev->dev, - "Could not process link event, error %d\n", status); + dev_dbg(ice_pf_to_dev(pf), "Could not process link event, error %d\n", + status); return status; } + /** - * __ice_clean_ctrlq - helper function to clean controlq rings - * @pf: ptr to struct ice_pf - * @q_type: specific Control queue type + * ice_print_health_status_string - Print message for given FW health event + * @pf: pointer to the PF structure + * @hse: pointer to the health status element containing the health status code + * + * Print the error/diagnostic string in response to the given Health Status + * Event code. */ -static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) +static void +ice_print_health_status_string(struct ice_pf *pf, + struct ice_aqc_health_status_elem *hse) { - struct ice_rq_event_info event; - struct ice_hw *hw = &pf->hw; - struct ice_ctl_q_info *cq; - u16 pending, i = 0; - const char *qtype; - u32 oldval, val; + struct ice_vsi *vsi = ice_get_main_vsi(pf); + u32 internal_data1, internal_data2; + struct net_device *netdev; + u16 status_code; - /* Do not clean control queue if/when PF reset fails */ - if (test_bit(__ICE_RESET_FAILED, pf->state)) - return 0; + if (!hse || !vsi) + return; - switch (q_type) { - case ICE_CTL_Q_ADMIN: - cq = &hw->adminq; - qtype = "Admin"; + netdev = vsi->netdev; + status_code = le16_to_cpu(hse->health_status_code); + internal_data1 = le32_to_cpu(hse->internal_data1); + internal_data2 = le32_to_cpu(hse->internal_data2); + + switch (status_code) { + case ICE_AQC_HEALTH_STATUS_INFO_RECOVERY: + netdev_info(netdev, "The device is in firmware recovery mode.\n"); + netdev_info(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_info(netdev, "Extended Error: 0x%08x.\n", + internal_data1); break; - case ICE_CTL_Q_MAILBOX: - cq = &hw->mailboxq; - qtype = "Mailbox"; + case ICE_AQC_HEALTH_STATUS_ERR_FLASH_ACCESS: + netdev_err(netdev, "The flash chip cannot be accessed.\n"); + netdev_err(netdev, "Possible Solution: If issue persists, call customer support.\n"); + netdev_err(netdev, "Access Type: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_NVM_AUTH: + netdev_err(netdev, "NVM authentication failed.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + break; + case ICE_AQC_HEALTH_STATUS_ERR_OROM_AUTH: + netdev_err(netdev, "Option ROM authentication failed.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + break; + case ICE_AQC_HEALTH_STATUS_ERR_DDP_AUTH: + netdev_err(netdev, "DDP package 
failed.\n"); + netdev_err(netdev, "Possible Solution: Update to latest base driver and DDP package.\n"); + break; + case ICE_AQC_HEALTH_STATUS_ERR_NVM_COMPAT: + netdev_err(netdev, "NVM image is incompatible.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + break; + case ICE_AQC_HEALTH_STATUS_ERR_OROM_COMPAT: + netdev_err(netdev, "Option ROM is incompatible.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Expected PCI Device Id: 0x%08x and Expected Module Id: 0x%08x.\n", + internal_data1, internal_data2); + break; + case ICE_AQC_HEALTH_STATUS_ERR_DCB_MIB: + netdev_err(netdev, "Supplied MIB file is invalid. DCB reverted to default configuration.\n"); + netdev_err(netdev, "Possible Solution: Disable FW-LLDP and check DCBx system configuration.\n"); + netdev_err(netdev, "Port Number: %d and MIB Id: %d.\n", + internal_data1, internal_data2); + break; + case ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_STRICT: + case ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_LENIENT: + netdev_err(netdev, "An unsupported module was detected.\n"); + netdev_err(netdev, "Possible Solution 1: Check your cable connection.\n"); + netdev_err(netdev, "Possible Solution 2: Change or replace the module or cable.\n"); + netdev_err(netdev, "Possible Solution 3: Manually set speed and duplex.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_TYPE: + netdev_err(netdev, "Module type is not supported.\n"); + netdev_err(netdev, "Possible Solution: Change or replace the module or cable.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_QUAL: + netdev_err(netdev, "Module is not qualified.\n"); + netdev_err(netdev, "Possible Solution 1: Check your cable connection.\n"); + netdev_err(netdev, "Possible Solution 2: Change or replace the module or cable.\n"); + netdev_err(netdev, "Possible Solution 3: Manually set speed and duplex.\n"); + netdev_err(netdev, "Port Number: %d\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_COMM: + netdev_err(netdev, "Device cannot communicate with the module.\n"); + netdev_err(netdev, "Possible Solution 1: Check your cable connection.\n"); + netdev_err(netdev, "Possible Solution 2: Change or replace the module or cable.\n"); + netdev_err(netdev, "Possible Solution 3: Manually set speed and duplex.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_CONFLICT: + netdev_err(netdev, "Unresolved module conflict.\n"); + netdev_err(netdev, "Possible Solution 1: Manually set speed/duplex or use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_err(netdev, "Possible Solution 2: If the problem persists, use a cable/module that is found in the supported modules and cables list for this device.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_MOD_NOT_PRESENT: + netdev_err(netdev, "Module is not present.\n"); + netdev_err(netdev, "Possible Solution 1: Check that the module is inserted correctly.\n"); + netdev_err(netdev, "Possible Solution 2: If the problem persists, use a cable/module that is found in the supported modules and cables list for this device.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_INFO_MOD_UNDERUTILIZED: + netdev_info(netdev, "Underutilized module.\n"); + netdev_info(netdev, 
"Possible Solution 1: Change or replace the module or cable.\n"); + netdev_info(netdev, "Possible Solution 2: Use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_info(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_INVALID_LINK_CFG: + netdev_err(netdev, "Invalid link configuration.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_PORT_ACCESS: + netdev_err(netdev, "Port hardware access error.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_PORT_UNREACHABLE: + netdev_err(netdev, "A port is unreachable.\n"); + netdev_err(netdev, "Possible Solution 1: Use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_err(netdev, "Possible Solution 2: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_MOD_LIMITED: + netdev_info(netdev, "Port speed is limited due to module.\n"); + netdev_info(netdev, "Possible Solution: Change the module or use Intel(R) Ethernet Port Configuration Tool to configure the port option to match the current module speed.\n"); + netdev_info(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_PARALLEL_FAULT: + netdev_err(netdev, "A parallel fault was detected.\n"); + netdev_err(netdev, "Possible Solution: Check link partner connection and configuration.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_PHY_LIMITED: + netdev_info(netdev, "Port speed is limited by PHY capabilities.\n"); + netdev_info(netdev, "Possible Solution 1: Change the module to align to port option.\n"); + netdev_info(netdev, "Possible Solution 2: Use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_info(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_NETLIST_TOPO: + netdev_err(netdev, "LOM topology netlist is corrupted.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_NETLIST: + netdev_err(netdev, "Unrecoverable netlist error.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_TOPO_CONFLICT: + netdev_err(netdev, "Port topology conflict.\n"); + netdev_err(netdev, "Possible Solution 1: Use Intel(R) Ethernet Port Configuration Tool to change the port option.\n"); + netdev_err(netdev, "Possible Solution 2: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_LINK_HW_ACCESS: + netdev_err(netdev, "Unrecoverable hardware access error.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_LINK_RUNTIME: + netdev_err(netdev, "Unrecoverable runtime error.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); + break; + case ICE_AQC_HEALTH_STATUS_ERR_DNL_INIT: + netdev_err(netdev, 
"Link management engine failed to initialize.\n"); + netdev_err(netdev, "Possible Solution: Update to the latest NVM image.\n"); + netdev_err(netdev, "Port Number: %d.\n", internal_data1); break; default: - dev_warn(&pf->pdev->dev, "Unknown control queue type 0x%x\n", - q_type); - return 0; + break; } +} - /* check for error indications - PF_xx_AxQLEN register layout for - * FW/MBX/SB are identical so just use defines for PF_FW_AxQLEN. +/** + * ice_process_health_status_event - Process the health status event from FW + * @pf: pointer to the PF structure + * @event: event structure containing the Health Status Event opcode + * + * Decode the Health Status Events and print the associated messages + */ +static void ice_process_health_status_event(struct ice_pf *pf, + struct ice_rq_event_info *event) +{ + struct ice_aqc_health_status_elem *health_info; + u16 count; + int i; + + health_info = (struct ice_aqc_health_status_elem *)event->msg_buf; + count = le16_to_cpu(event->desc.params.get_health_status.health_status_count); + + /* In practice it's rare to encounter this event, and even more uncommon + * to observe multiple health status elements in a single message, but + * there's no boundary defined that separates an unlikely scenario from + * an erroneous one. If the count reported by the firmware is clearly + * incorrect then don't process the message and return. */ - val = rd32(hw, cq->rq.len); - if (val & (PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | - PF_FW_ARQLEN_ARQCRIT_M)) { - oldval = val; - if (val & PF_FW_ARQLEN_ARQVFE_M) - dev_dbg(&pf->pdev->dev, - "%s Receive Queue VF Error detected\n", qtype); - if (val & PF_FW_ARQLEN_ARQOVFL_M) { - dev_dbg(&pf->pdev->dev, - "%s Receive Queue Overflow Error detected\n", - qtype); - } - if (val & PF_FW_ARQLEN_ARQCRIT_M) - dev_dbg(&pf->pdev->dev, - "%s Receive Queue Critical Error detected\n", - qtype); - val &= ~(PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | - PF_FW_ARQLEN_ARQCRIT_M); - if (oldval != val) - wr32(hw, cq->rq.len, val); + if (count > (event->buf_len / sizeof(*health_info))) { + dev_err(ice_pf_to_dev(pf), "Received a health status event with invalid element count\n"); + return; } - val = rd32(hw, cq->sq.len); - if (val & (PF_FW_ATQLEN_ATQVFE_M | PF_FW_ATQLEN_ATQOVFL_M | - PF_FW_ATQLEN_ATQCRIT_M)) { - oldval = val; - if (val & PF_FW_ATQLEN_ATQVFE_M) - dev_dbg(&pf->pdev->dev, - "%s Send Queue VF Error detected\n", qtype); - if (val & PF_FW_ATQLEN_ATQOVFL_M) { - dev_dbg(&pf->pdev->dev, - "%s Send Queue Overflow Error detected\n", - qtype); - } - if (val & PF_FW_ATQLEN_ATQCRIT_M) - dev_dbg(&pf->pdev->dev, - "%s Send Queue Critical Error detected\n", - qtype); - val &= ~(PF_FW_ATQLEN_ATQVFE_M | PF_FW_ATQLEN_ATQOVFL_M | - PF_FW_ATQLEN_ATQCRIT_M); - if (oldval != val) - wr32(hw, cq->sq.len, val); + for (i = 0; i < count; i++) { + ice_print_health_status_string(pf, health_info); + health_info++; } +} - event.buf_len = cq->rq_buf_size; - event.msg_buf = devm_kzalloc(&pf->pdev->dev, event.buf_len, - GFP_KERNEL); - if (!event.msg_buf) - return 0; - do { - enum ice_status ret; - u16 opcode; +enum ice_aq_task_state { + ICE_AQ_TASK_WAITING = 0, + ICE_AQ_TASK_COMPLETE, + ICE_AQ_TASK_CANCELED, +}; - ret = ice_clean_rq_elem(hw, cq, &event, &pending); - if (ret == ICE_ERR_AQ_NO_WORK) - break; - if (ret) { - dev_err(&pf->pdev->dev, - "%s Receive Queue event error %d\n", qtype, - ret); - break; +struct ice_aq_task { + struct hlist_node entry; + + u16 opcode; + struct ice_rq_event_info *event; + enum ice_aq_task_state state; +}; + +/** + * 
ice_aq_wait_for_event - Wait for an AdminQ event from firmware + * @pf: pointer to the PF private structure + * @opcode: the opcode to wait for + * @timeout: how long to wait, in jiffies + * @event: storage for the event info + * + * Waits for a specific AdminQ completion event on the ARQ for a given PF. The + * current thread will be put to sleep until the specified event occurs or + * until the given timeout is reached. + * + * To obtain only the descriptor contents, pass an event without an allocated + * msg_buf. If the complete data buffer is desired, allocate the + * event->msg_buf with enough space ahead of time. + * + * Returns: zero on success, or a negative error code on failure. + */ +int ice_aq_wait_for_event(struct ice_pf *pf, u16 opcode, unsigned long timeout, + struct ice_rq_event_info *event) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_aq_task *task; + unsigned long start; + long ret; + int err; + + task = kzalloc(sizeof(*task), GFP_KERNEL); + if (!task) + return -ENOMEM; + + INIT_HLIST_NODE(&task->entry); + task->opcode = opcode; + task->event = event; + task->state = ICE_AQ_TASK_WAITING; + + spin_lock_bh(&pf->aq_wait_lock); + hlist_add_head(&task->entry, &pf->aq_wait_list); + spin_unlock_bh(&pf->aq_wait_lock); + + start = jiffies; + +#ifdef __CHECKER__ + /* Suppress sparse warning from kernel macro: + * warning: symbol '__ret' shadows an earlier one + */ + ret = timeout; +#else + ret = wait_event_interruptible_timeout(pf->aq_wait_queue, task->state, + timeout); +#endif + switch (task->state) { + case ICE_AQ_TASK_WAITING: + err = ret < 0 ? ret : -ETIMEDOUT; + break; + case ICE_AQ_TASK_CANCELED: + err = ret < 0 ? ret : -ECANCELED; + break; + case ICE_AQ_TASK_COMPLETE: + err = ret < 0 ? ret : 0; + break; + default: + WARN(1, "Unexpected AdminQ wait task state %u", task->state); + err = -EINVAL; + break; + } + + dev_dbg(dev, "Waited %u msecs (max %u msecs) for firmware response to op 0x%04x\n", + jiffies_to_msecs(jiffies - start), + jiffies_to_msecs(timeout), + opcode); + + spin_lock_bh(&pf->aq_wait_lock); + hlist_del(&task->entry); + spin_unlock_bh(&pf->aq_wait_lock); + kfree(task); + + return err; +} + +/** + * ice_aq_check_events - Check if any thread is waiting for an AdminQ event + * @pf: pointer to the PF private structure + * @opcode: the opcode of the event + * @event: the event to check + * + * Loops over the current list of pending threads waiting for an AdminQ event. + * For each matching task, copy the contents of the event into the task + * structure and wake up the thread. + * + * If multiple threads wait for the same opcode, they will all be woken up. + * + * Note that event->msg_buf will only be duplicated if the event has a buffer + * with enough space already allocated. Otherwise, only the descriptor and + * message length will be copied. 
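+ * + * Matching is done under pf->aq_wait_lock; completed tasks are woken up + * through the shared pf->aq_wait_queue.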
+ */ +static void ice_aq_check_events(struct ice_pf *pf, u16 opcode, + struct ice_rq_event_info *event) +{ + struct ice_aq_task *task; + bool found = false; + + spin_lock_bh(&pf->aq_wait_lock); + hlist_for_each_entry(task, &pf->aq_wait_list, entry) { + if (task->state || task->opcode != opcode) + continue; + + memcpy(&task->event->desc, &event->desc, sizeof(event->desc)); + task->event->msg_len = event->msg_len; + + /* Only copy the data buffer if a destination was set */ + if (task->event->msg_buf && + task->event->buf_len > event->buf_len) { + memcpy(task->event->msg_buf, event->msg_buf, + event->buf_len); + task->event->buf_len = event->buf_len; + } + + task->state = ICE_AQ_TASK_COMPLETE; + found = true; + } + spin_unlock_bh(&pf->aq_wait_lock); + + if (found) + wake_up(&pf->aq_wait_queue); +} + +/** + * ice_aq_cancel_waiting_tasks - Immediately cancel all waiting tasks + * @pf: the PF private structure + * + * Set all waiting tasks to ICE_AQ_TASK_CANCELED, and wake up their threads. + * This will then cause ice_aq_wait_for_event to exit with -ECANCELED. + */ +static void ice_aq_cancel_waiting_tasks(struct ice_pf *pf) +{ + struct ice_aq_task *task; + + spin_lock_bh(&pf->aq_wait_lock); + hlist_for_each_entry(task, &pf->aq_wait_list, entry) + task->state = ICE_AQ_TASK_CANCELED; + spin_unlock_bh(&pf->aq_wait_lock); + + wake_up(&pf->aq_wait_queue); +} + +/** + * __ice_clean_ctrlq - helper function to clean controlq rings + * @pf: ptr to struct ice_pf + * @q_type: specific Control queue type + */ +static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_rq_event_info event; + struct ice_hw *hw = &pf->hw; + struct ice_ctl_q_info *cq; + u16 pending, i = 0; + const char *qtype; + u32 oldval, val; + + /* Do not clean control queue if/when PF reset fails */ + if (test_bit(ICE_RESET_FAILED, pf->state)) + return 0; + + switch (q_type) { + case ICE_CTL_Q_ADMIN: + cq = &hw->adminq; + qtype = "Admin"; + break; + case ICE_CTL_Q_SB: + cq = &hw->sbq; + qtype = "Sideband"; + break; + case ICE_CTL_Q_MAILBOX: + cq = &hw->mailboxq; + qtype = "Mailbox"; + /* we are going to try to detect a malicious VF, so set the + * state to begin detection + */ + hw->mbx_snapshot.mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; + break; + default: + dev_warn(dev, "Unknown control queue type 0x%x\n", q_type); + return 0; + } + + /* check for error indications - PF_xx_AxQLEN register layout for + * FW/MBX/SB are identical so just use defines for PF_FW_AxQLEN. 
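+ * Any VF, overflow or critical error bits found set are logged at debug + * level and then cleared so the queue can keep operating.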
+ */ + val = rd32(hw, cq->rq.len); + if (val & (PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | + PF_FW_ARQLEN_ARQCRIT_M)) { + oldval = val; + if (val & PF_FW_ARQLEN_ARQVFE_M) + dev_dbg(dev, "%s Receive Queue VF Error detected\n", + qtype); + if (val & PF_FW_ARQLEN_ARQOVFL_M) { + dev_dbg(dev, "%s Receive Queue Overflow Error detected\n", + qtype); + } + if (val & PF_FW_ARQLEN_ARQCRIT_M) + dev_dbg(dev, "%s Receive Queue Critical Error detected\n", + qtype); + val &= ~(PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | + PF_FW_ARQLEN_ARQCRIT_M); + if (oldval != val) + wr32(hw, cq->rq.len, val); + } + + val = rd32(hw, cq->sq.len); + if (val & (PF_FW_ATQLEN_ATQVFE_M | PF_FW_ATQLEN_ATQOVFL_M | + PF_FW_ATQLEN_ATQCRIT_M)) { + oldval = val; + if (val & PF_FW_ATQLEN_ATQVFE_M) + dev_dbg(dev, "%s Send Queue VF Error detected\n", + qtype); + if (val & PF_FW_ATQLEN_ATQOVFL_M) { + dev_dbg(dev, "%s Send Queue Overflow Error detected\n", + qtype); + } + if (val & PF_FW_ATQLEN_ATQCRIT_M) + dev_dbg(dev, "%s Send Queue Critical Error detected\n", + qtype); + val &= ~(PF_FW_ATQLEN_ATQVFE_M | PF_FW_ATQLEN_ATQOVFL_M | + PF_FW_ATQLEN_ATQCRIT_M); + if (oldval != val) + wr32(hw, cq->sq.len, val); + } + + event.buf_len = cq->rq_buf_size; + event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL); + if (!event.msg_buf) + return 0; + + do { + enum ice_status ret; + u16 opcode; + + ret = ice_clean_rq_elem(hw, cq, &event, &pending); + if (ret == ICE_ERR_AQ_NO_WORK) + break; + if (ret) { + dev_err(dev, "%s Receive Queue event error %s\n", qtype, + ice_stat_str(ret)); + break; } opcode = le16_to_cpu(event.desc.opcode); + /* Notify any thread that might be waiting for this event */ + ice_aq_check_events(pf, opcode, &event); + switch (opcode) { case ice_aqc_opc_get_link_status: if (ice_handle_link_event(pf, &event)) - dev_err(&pf->pdev->dev, - "Could not handle link event\n"); + dev_err(dev, "Could not handle link event\n"); + break; + case ice_aqc_opc_event_lan_overflow: + ice_vf_lan_overflow_event(pf, &event); break; case ice_mbx_opc_send_msg_to_pf: - ice_vc_process_vf_msg(pf, &event); + if (!ice_is_malicious_vf(pf, &event, i, pending)) + ice_vc_process_vf_msg(pf, &event); break; - case ice_aqc_opc_fw_logging: - ice_output_fw_log(hw, &event.desc, event.msg_buf); + case ice_aqc_opc_fw_logs_event: + ice_fwlog_event_dump(hw, &event.desc, event.msg_buf); break; case ice_aqc_opc_lldp_set_mib_change: ice_dcb_process_lldp_set_mib_change(pf, &event); break; + case ice_aqc_opc_get_health_status: + ice_process_health_status_event(pf, &event); + break; default: - dev_dbg(&pf->pdev->dev, - "%s Receive Queue unknown event 0x%04x ignored\n", + dev_dbg(dev, "%s Receive Queue unknown event 0x%04x ignored\n", qtype, opcode); break; } } while (pending && (i++ < ICE_DFLT_IRQ_WORK)); - devm_kfree(&pf->pdev->dev, event.msg_buf); + kfree(event.msg_buf); return pending && (i == ICE_DFLT_IRQ_WORK); } @@ -1106,13 +2236,13 @@ static void ice_clean_adminq_subtask(struct ice_pf *pf) { struct ice_hw *hw = &pf->hw; - if (!test_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state)) + if (!test_bit(ICE_ADMINQ_EVENT_PENDING, pf->state)) return; if (__ice_clean_ctrlq(pf, ICE_CTL_Q_ADMIN)) return; - clear_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state); + clear_bit(ICE_ADMINQ_EVENT_PENDING, pf->state); /* There might be a situation where new messages arrive to a control * queue between processing the last message and clearing the @@ -1133,13 +2263,13 @@ static void ice_clean_mailboxq_subtask(struct ice_pf *pf) { struct ice_hw *hw = &pf->hw; - if 
(!test_bit(__ICE_MAILBOXQ_EVENT_PENDING, pf->state)) + if (!test_bit(ICE_MAILBOXQ_EVENT_PENDING, pf->state)) return; if (__ice_clean_ctrlq(pf, ICE_CTL_Q_MAILBOX)) return; - clear_bit(__ICE_MAILBOXQ_EVENT_PENDING, pf->state); + clear_bit(ICE_MAILBOXQ_EVENT_PENDING, pf->state); if (ice_ctrlq_pending(hw, &hw->mailboxq)) __ice_clean_ctrlq(pf, ICE_CTL_Q_MAILBOX); @@ -1147,17 +2277,48 @@ static void ice_clean_mailboxq_subtask(struct ice_pf *pf) ice_flush(hw); } +/** + * ice_clean_sbq_subtask - clean the Sideband Queue rings + * @pf: board private structure + */ +static void ice_clean_sbq_subtask(struct ice_pf *pf) +{ + struct ice_hw *hw = &pf->hw; + + /* if mac_type is not generic, sideband is not supported + * and there's nothing to do here + */ + if (!ice_is_generic_mac(hw)) { + clear_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state); + return; + } + + if (!test_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state)) + return; + + if (__ice_clean_ctrlq(pf, ICE_CTL_Q_SB)) + return; + + clear_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state); + + if (ice_ctrlq_pending(hw, &hw->sbq)) + __ice_clean_ctrlq(pf, ICE_CTL_Q_SB); + + ice_flush(hw); +} + /** * ice_service_task_schedule - schedule the service task to wake up * @pf: board private structure * * If not already scheduled, this puts the task into the work queue. */ -static void ice_service_task_schedule(struct ice_pf *pf) +void ice_service_task_schedule(struct ice_pf *pf) { - if (!test_bit(__ICE_SERVICE_DIS, pf->state) && - !test_and_set_bit(__ICE_SERVICE_SCHED, pf->state) && - !test_bit(__ICE_NEEDS_RESTART, pf->state)) + if (!test_bit(ICE_SERVICE_DIS, pf->state) && + !test_and_set_bit(ICE_SERVICE_SCHED, pf->state) && + !test_bit(ICE_RECOVERY_MODE, pf->state) && + !test_bit(ICE_NEEDS_RESTART, pf->state)) queue_work(ice_wq, &pf->serv_task); } @@ -1167,27 +2328,33 @@ static void ice_service_task_schedule(struct ice_pf *pf) */ static void ice_service_task_complete(struct ice_pf *pf) { - WARN_ON(!test_bit(__ICE_SERVICE_SCHED, pf->state)); + WARN_ON(!test_bit(ICE_SERVICE_SCHED, pf->state)); /* force memory (pf->state) to sync before next service task */ smp_mb__before_atomic(); - clear_bit(__ICE_SERVICE_SCHED, pf->state); + clear_bit(ICE_SERVICE_SCHED, pf->state); } /** * ice_service_task_stop - stop service task and cancel works * @pf: board private structure + * + * Return 0 if the ICE_SERVICE_DIS bit was not already set, + * 1 otherwise. */ -static void ice_service_task_stop(struct ice_pf *pf) +static int ice_service_task_stop(struct ice_pf *pf) { - set_bit(__ICE_SERVICE_DIS, pf->state); + int ret; + + ret = test_and_set_bit(ICE_SERVICE_DIS, pf->state); if (pf->serv_tmr.function) del_timer_sync(&pf->serv_tmr); if (pf->serv_task.func) cancel_work_sync(&pf->serv_task); - clear_bit(__ICE_SERVICE_SCHED, pf->state); + clear_bit(ICE_SERVICE_SCHED, pf->state); + return ret; } /** @@ -1198,7 +2365,7 @@ static void ice_service_task_stop(struct ice_pf *pf) */ static void ice_service_task_restart(struct ice_pf *pf) { - clear_bit(__ICE_SERVICE_DIS, pf->state); + clear_bit(ICE_SERVICE_DIS, pf->state); ice_service_task_schedule(pf); } @@ -1218,19 +2385,28 @@ static void ice_service_timer(struct timer_list *t) * ice_handle_mdd_event - handle malicious driver detect event * @pf: pointer to the PF structure * - * Called from service task. OICR interrupt handler indicates MDD event + * Called from service task. OICR interrupt handler indicates MDD event. + * VF MDD logging is guarded by net_ratelimit. Additional PF and VF log + * messages are wrapped by netif_msg_[rx|tx]_err. 
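The global GL_MDET_TX_PQM, GL_MDET_TX_TCLAN and GL_MDET_RX registers + * identify the offending PF/VF and queue, while the PF_MDET_* registers show + * whether this PF itself was the source.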
@@ -1218,19 +2385,28 @@ static void ice_service_timer(struct timer_list *t)
  * ice_handle_mdd_event - handle malicious driver detect event
  * @pf: pointer to the PF structure
  *
- * Called from service task. OICR interrupt handler indicates MDD event
+ * Called from service task. OICR interrupt handler indicates MDD event.
+ * VF MDD logging is guarded by net_ratelimit. Additional PF and VF log
+ * messages are wrapped by netif_msg_[rx|tx]_err. Since VF Rx MDD events
+ * disable the queue, the PF can be configured to reset the VF using ethtool
+ * private flag mdd-auto-reset-vf.
  */
 static void ice_handle_mdd_event(struct ice_pf *pf)
 {
+	struct device *dev = ice_pf_to_dev(pf);
 	struct ice_hw *hw = &pf->hw;
-	bool mdd_detected = false;
+	unsigned int i;
 	u32 reg;
-	int i;
 
-	if (!test_and_clear_bit(__ICE_MDD_EVENT_PENDING, pf->state))
+	if (!test_and_clear_bit(ICE_MDD_EVENT_PENDING, pf->state)) {
+		/* Since the VF MDD event logging is rate limited, check if
+		 * there are pending MDD events.
+		 */
+		ice_print_vfs_mdd_events(pf);
 		return;
+	}
 
-	/* find what triggered the MDD event */
+	/* find what triggered an MDD event */
 	reg = rd32(hw, GL_MDET_TX_PQM);
 	if (reg & GL_MDET_TX_PQM_VALID_M) {
 		u8 pf_num = (reg & GL_MDET_TX_PQM_PF_NUM_M) >>
@@ -1243,10 +2419,9 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
 				GL_MDET_TX_PQM_QNUM_S);
 
 		if (netif_msg_tx_err(pf))
-			dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
+			dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
 				 event, queue, pf_num, vf_num);
 		wr32(hw, GL_MDET_TX_PQM, 0xffffffff);
-		mdd_detected = true;
 	}
 
 	reg = rd32(hw, GL_MDET_TX_TCLAN);
@@ -1260,11 +2435,10 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
 		u16 queue = ((reg & GL_MDET_TX_TCLAN_QNUM_M) >>
 				GL_MDET_TX_TCLAN_QNUM_S);
 
-		if (netif_msg_rx_err(pf))
-			dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
+		if (netif_msg_tx_err(pf))
+			dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
 				 event, queue, pf_num, vf_num);
 		wr32(hw, GL_MDET_TX_TCLAN, 0xffffffff);
-		mdd_detected = true;
 	}
 
 	reg = rd32(hw, GL_MDET_RX);
@@ -1279,89 +2453,93 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
 				GL_MDET_RX_QNUM_S);
 
 		if (netif_msg_rx_err(pf))
-			dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n",
+			dev_info(dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n",
 				 event, queue, pf_num, vf_num);
 		wr32(hw, GL_MDET_RX, 0xffffffff);
-		mdd_detected = true;
 	}
 
-	if (mdd_detected) {
-		bool pf_mdd_detected = false;
-
-		reg = rd32(hw, PF_MDET_TX_PQM);
-		if (reg & PF_MDET_TX_PQM_VALID_M) {
-			wr32(hw, PF_MDET_TX_PQM, 0xFFFF);
-			dev_info(&pf->pdev->dev, "TX driver issue detected, PF reset issued\n");
-			pf_mdd_detected = true;
-		}
+	/* check to see if this PF caused an MDD event */
+	reg = rd32(hw, PF_MDET_TX_PQM);
+	if (reg & PF_MDET_TX_PQM_VALID_M) {
+		wr32(hw, PF_MDET_TX_PQM, 0xFFFF);
+		if (netif_msg_tx_err(pf))
+			dev_info(dev, "Malicious Driver Detection event TX_PQM detected on PF\n");
+	}
 
-		reg = rd32(hw, PF_MDET_TX_TCLAN);
-		if (reg & PF_MDET_TX_TCLAN_VALID_M) {
-			wr32(hw, PF_MDET_TX_TCLAN, 0xFFFF);
-			dev_info(&pf->pdev->dev, "TX driver issue detected, PF reset issued\n");
-			pf_mdd_detected = true;
-		}
+	reg = rd32(hw, PF_MDET_TX_TCLAN);
+	if (reg & PF_MDET_TX_TCLAN_VALID_M) {
+		wr32(hw, PF_MDET_TX_TCLAN, 0xFFFF);
+		if (netif_msg_tx_err(pf))
+			dev_info(dev, "Malicious Driver Detection event TX_TCLAN detected on PF\n");
+	}
 
-		reg = rd32(hw, PF_MDET_RX);
-		if (reg & PF_MDET_RX_VALID_M) {
-			wr32(hw, PF_MDET_RX, 0xFFFF);
-			dev_info(&pf->pdev->dev, "RX driver issue detected, PF reset issued\n");
-			pf_mdd_detected = true;
-		}
-		/* Queue belongs to the PF initiate a reset */
-		if (pf_mdd_detected) {
-			set_bit(__ICE_NEEDS_RESTART, pf->state);
-			ice_service_task_schedule(pf);
-		}
+	reg = rd32(hw, PF_MDET_RX);
+	if (reg & PF_MDET_RX_VALID_M) {
+		wr32(hw, PF_MDET_RX, 0xFFFF);
+		if (netif_msg_rx_err(pf))
+			dev_info(dev, "Malicious Driver Detection event RX detected on PF\n");
 	}
 
-	/* check to see if one of the VFs caused the MDD */
-	for (i = 0; i < pf->num_alloc_vfs; i++) {
+	/* Check to see if one of the VFs caused an MDD event, and then
+	 * increment counters and set print pending
+	 */
+	ice_for_each_vf(pf, i) {
 		struct ice_vf *vf = &pf->vf[i];
 
-		bool vf_mdd_detected = false;
-
 		reg = rd32(hw, VP_MDET_TX_PQM(i));
 		if (reg & VP_MDET_TX_PQM_VALID_M) {
 			wr32(hw, VP_MDET_TX_PQM(i), 0xFFFF);
-			vf_mdd_detected = true;
-			dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n",
-				 i);
+			vf->mdd_tx_events.count++;
+			set_bit(ICE_MDD_VF_PRINT_PENDING, pf->state);
+			if (netif_msg_tx_err(pf))
+				dev_info(dev, "Malicious Driver Detection event TX_PQM detected on VF %d\n",
+					 i);
 		}
 
 		reg = rd32(hw, VP_MDET_TX_TCLAN(i));
 		if (reg & VP_MDET_TX_TCLAN_VALID_M) {
 			wr32(hw, VP_MDET_TX_TCLAN(i), 0xFFFF);
-			vf_mdd_detected = true;
-			dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n",
-				 i);
+			vf->mdd_tx_events.count++;
+			set_bit(ICE_MDD_VF_PRINT_PENDING, pf->state);
+			if (netif_msg_tx_err(pf))
+				dev_info(dev, "Malicious Driver Detection event TX_TCLAN detected on VF %d\n",
+					 i);
 		}
 
 		reg = rd32(hw, VP_MDET_TX_TDPU(i));
 		if (reg & VP_MDET_TX_TDPU_VALID_M) {
 			wr32(hw, VP_MDET_TX_TDPU(i), 0xFFFF);
-			vf_mdd_detected = true;
-			dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n",
-				 i);
+			vf->mdd_tx_events.count++;
+			set_bit(ICE_MDD_VF_PRINT_PENDING, pf->state);
+			if (netif_msg_tx_err(pf))
+				dev_info(dev, "Malicious Driver Detection event TX_TDPU detected on VF %d\n",
+					 i);
 		}
 
 		reg = rd32(hw, VP_MDET_RX(i));
 		if (reg & VP_MDET_RX_VALID_M) {
 			wr32(hw, VP_MDET_RX(i), 0xFFFF);
-			vf_mdd_detected = true;
-			dev_info(&pf->pdev->dev, "RX driver issue detected on VF %d\n",
-				 i);
-		}
-
-		if (vf_mdd_detected) {
-			vf->num_mdd_events++;
-			if (vf->num_mdd_events &&
-			    vf->num_mdd_events <= ICE_MDD_EVENTS_THRESHOLD)
-				dev_info(&pf->pdev->dev,
-					 "VF %d has had %llu MDD events since last boot, Admin might need to reload AVF driver with this number of events\n",
-					 i, vf->num_mdd_events);
+			vf->mdd_rx_events.count++;
+			set_bit(ICE_MDD_VF_PRINT_PENDING, pf->state);
+			if (netif_msg_rx_err(pf))
+				dev_info(dev, "Malicious Driver Detection event RX detected on VF %d\n",
+					 i);
+
+			/* Since the queue is disabled on VF Rx MDD events, the
+			 * PF can be configured to reset the VF through ethtool
+			 * private flag mdd-auto-reset-vf.
+			 */
+			if (test_bit(ICE_FLAG_MDD_AUTO_RESET_VF, pf->flags)) {
+				/* VF MDD event counters will be cleared by
+				 * reset, so print the event prior to reset.
+				 */
+				ice_print_vf_rx_mdd_event(vf);
+				ice_reset_vf(&pf->vf[i], false);
+			}
 		}
 	}
+
+	ice_print_vfs_mdd_events(pf);
 }
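The doc comment above says VF MDD logging is guarded by net_ratelimit and the
ICE_MDD_VF_PRINT_PENDING bit; the real ice_print_vfs_mdd_events() is defined
elsewhere in the driver, so the body below is only a sketch of the behavior
that comment implies, not the actual implementation:

	/* illustrative shape of the rate-limited VF MDD print path */
	static void example_print_vfs_mdd_events(struct ice_pf *pf)
	{
		unsigned int i;

		/* only print when a print is pending and the ratelimit allows */
		if (!test_and_clear_bit(ICE_MDD_VF_PRINT_PENDING, pf->state))
			return;
		if (!net_ratelimit())
			return;

		ice_for_each_vf(pf, i) {
			struct ice_vf *vf = &pf->vf[i];

			if (vf->mdd_rx_events.count || vf->mdd_tx_events.count)
				dev_info(ice_pf_to_dev(pf),
					 "VF %u has %u Rx and %u Tx MDD events\n",
					 i, vf->mdd_rx_events.count,
					 vf->mdd_tx_events.count);
		}
	}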
 
 /**
@@ -1389,19 +2567,18 @@ static int ice_force_phys_link_state(struct ice_vsi *vsi, bool link_up)
 	if (vsi->type != ICE_VSI_PF)
 		return 0;
 
-	dev = &vsi->back->pdev->dev;
+	dev = ice_pf_to_dev(vsi->back);
 	pi = vsi->port_info;
 
-	pcaps = devm_kzalloc(dev, sizeof(*pcaps), GFP_KERNEL);
+	pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL);
 	if (!pcaps)
 		return -ENOMEM;
 
-	retcode = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps,
+	retcode = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps,
 				      NULL);
 	if (retcode) {
-		dev_err(dev,
-			"Failed to get phy capabilities, VSI %d error %d\n",
+		dev_err(dev, "Failed to get phy capabilities, VSI %d error %d\n",
 			vsi->vsi_num, retcode);
 		retcode = -EIO;
 		goto out;
@@ -1412,125 +2589,576 @@ static int ice_force_phys_link_state(struct ice_vsi *vsi, bool link_up)
 	    link_up == !!(pi->phy.link_info.link_info & ICE_AQ_LINK_UP))
 		goto out;
 
-	cfg = devm_kzalloc(dev, sizeof(*cfg), GFP_KERNEL);
+	/* Use the current user PHY configuration. The current user PHY
+	 * configuration is initialized during probe from PHY capabilities
+	 * software mode, and updated on set PHY configuration.
+	 */
+	cfg = kmemdup(&pi->phy.curr_user_phy_cfg, sizeof(*cfg), GFP_KERNEL);
 	if (!cfg) {
 		retcode = -ENOMEM;
 		goto out;
 	}
 
-	cfg->phy_type_low = pcaps->phy_type_low;
-	cfg->phy_type_high = pcaps->phy_type_high;
-	cfg->caps = pcaps->caps | ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
-	cfg->low_power_ctrl = pcaps->low_power_ctrl;
-	cfg->eee_cap = pcaps->eee_cap;
-	cfg->eeer_value = pcaps->eeer_value;
-	cfg->link_fec_opt = pcaps->link_fec_options;
+	cfg->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
 
 	if (link_up)
 		cfg->caps |= ICE_AQ_PHY_ENA_LINK;
 	else
 		cfg->caps &= ~ICE_AQ_PHY_ENA_LINK;
 
-	retcode = ice_aq_set_phy_cfg(&vsi->back->hw, pi->lport, cfg, NULL);
+	retcode = ice_aq_set_phy_cfg(&vsi->back->hw, pi, cfg, NULL);
 	if (retcode) {
 		dev_err(dev, "Failed to set phy config, VSI %d error %d\n",
 			vsi->vsi_num, retcode);
 		retcode = -EIO;
 	}
 
-	devm_kfree(dev, cfg);
+	kfree(cfg);
 out:
-	devm_kfree(dev, pcaps);
+	kfree(pcaps);
 	return retcode;
 }
 
 /**
- * ice_check_media_subtask - Check for media; bring link up if detected.
- * @pf: pointer to PF struct
+ * ice_init_nvm_phy_type - Initialize the NVM PHY type
+ * @pi: port info structure
+ *
+ * Initialize nvm_phy_type_[low|high] for link lenient mode support
 */
-static void ice_check_media_subtask(struct ice_pf *pf)
+static int ice_init_nvm_phy_type(struct ice_port_info *pi)
 {
-	struct ice_port_info *pi;
-	struct ice_vsi *vsi;
-	int err;
+	struct ice_aqc_get_phy_caps_data *pcaps;
+	struct ice_pf *pf = pi->hw->back;
+	enum ice_status status;
+	int err = 0;
 
-	vsi = ice_get_main_vsi(pf);
-	if (!vsi)
-		return;
+	pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL);
+	if (!pcaps)
+		return -ENOMEM;
 
-	/* No need to check for media if it's already present or the interface
-	 * is down
-	 */
-	if (!test_bit(ICE_FLAG_NO_MEDIA, pf->flags) ||
-	    test_bit(__ICE_DOWN, vsi->state))
-		return;
+	status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_NO_MEDIA, pcaps,
+				     NULL);
 
-	/* Refresh link info and check if media is present */
-	pi = vsi->port_info;
-	err = ice_update_link_info(pi);
-	if (err)
-		return;
+	if (status) {
+		dev_err(ice_pf_to_dev(pf), "Get PHY capability failed.\n");
+		err = -EIO;
+		goto out;
+	}
 
-	if (pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE) {
-		err = ice_force_phys_link_state(vsi, true);
-		if (err)
-			return;
-		clear_bit(ICE_FLAG_NO_MEDIA, pf->flags);
+	pf->nvm_phy_type_hi = pcaps->phy_type_high;
+	pf->nvm_phy_type_lo = pcaps->phy_type_low;
 
-		/* A Link Status Event will be generated; the event handler
-		 * will complete bringing the interface up
-		 */
-	}
+out:
+	kfree(pcaps);
+	return err;
 }
 
 /**
- * ice_service_task - manage and run subtasks
- * @work: pointer to work_struct contained by the PF struct
+ * ice_init_link_dflt_override - Initialize link default override
+ * @pi: port info structure
+ *
+ * Initialize link default override and PHY total port shutdown during probe
 */
-static void ice_service_task(struct work_struct *work)
+static void ice_init_link_dflt_override(struct ice_port_info *pi)
 {
-	struct ice_pf *pf = container_of(work, struct ice_pf, serv_task);
-	unsigned long start_time = jiffies;
-
-	/* subtasks */
-
-	/* process reset requests first */
-	ice_reset_subtask(pf);
+	struct ice_link_default_override_tlv *ldo;
+	struct ice_pf *pf = pi->hw->back;
 
-	/* bail if a reset/recovery cycle is pending or rebuild failed */
-	if (ice_is_reset_in_progress(pf->state) ||
-	    test_bit(__ICE_SUSPENDED, pf->state) ||
-	    test_bit(__ICE_NEEDS_RESTART, pf->state)) {
-		ice_service_task_complete(pf);
+	ldo = &pf->link_dflt_override;
+	if (ice_get_link_default_override(ldo, pi))
 		return;
-	}
 
-	ice_clean_adminq_subtask(pf);
-	ice_check_media_subtask(pf);
-	ice_check_for_hang_subtask(pf);
-	ice_sync_fltr_subtask(pf);
-	ice_handle_mdd_event(pf);
-	ice_watchdog_subtask(pf);
-
-	if (ice_is_safe_mode(pf)) {
-		ice_service_task_complete(pf);
+	if (!(ldo->options & ICE_LINK_OVERRIDE_PORT_DIS))
 		return;
-	}
-
-	ice_process_vflr_event(pf);
-	ice_clean_mailboxq_subtask(pf);
 
-	/* Clear __ICE_SERVICE_SCHED flag to allow scheduling next event */
-	ice_service_task_complete(pf);
+	/* Enable Total Port Shutdown (override/replace link-down-on-close
+	 * ethtool private flag) for ports with Port Disable bit set.
+	 */
+	set_bit(ICE_FLAG_TOTAL_PORT_SHUTDOWN_ENA, pf->flags);
+	set_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags);
+}
+
+/**
+ * ice_init_phy_cfg_dflt_override - Initialize PHY cfg default override settings
+ * @pi: port info structure
+ *
+ * If default override is enabled, initialize the user PHY cfg speed and FEC
+ * settings using the default override mask from the NVM.
+ *
+ * The PHY should only be configured with the default override settings the
+ * first time media is available. The ICE_LINK_DEFAULT_OVERRIDE_PENDING state
+ * is used to indicate that the user PHY cfg default override is initialized
+ * and the PHY has not been configured with the default override settings. The
+ * state is set here, and cleared in ice_configure_phy the first time the PHY is
+ * configured.
+ *
+ * This function should be called only if the FW doesn't support default
+ * configuration mode, as reported by ice_fw_supports_report_dflt_cfg.
+ */
+static void ice_init_phy_cfg_dflt_override(struct ice_port_info *pi)
+{
+	struct ice_link_default_override_tlv *ldo;
+	struct ice_aqc_set_phy_cfg_data *cfg;
+	struct ice_phy_info *phy = &pi->phy;
+	struct ice_pf *pf = pi->hw->back;
+
+	ldo = &pf->link_dflt_override;
+
+	/* If link default override is enabled, use it to mask the NVM PHY
+	 * capabilities for the speed and FEC default configuration.
+	 */
+	cfg = &phy->curr_user_phy_cfg;
+
+	if (ldo->phy_type_low || ldo->phy_type_high) {
+		cfg->phy_type_low = pf->nvm_phy_type_lo &
+				    cpu_to_le64(ldo->phy_type_low);
+		cfg->phy_type_high = pf->nvm_phy_type_hi &
+				     cpu_to_le64(ldo->phy_type_high);
+	}
+	cfg->link_fec_opt = ldo->fec_options;
+	phy->curr_user_fec_req = ICE_FEC_AUTO;
+
+	set_bit(ICE_LINK_DEFAULT_OVERRIDE_PENDING, pf->state);
+}
+
+/**
+ * ice_init_phy_user_cfg - Initialize the PHY user configuration
+ * @pi: port info structure
+ *
+ * Initialize the current user PHY configuration, speed, FEC, and FC requested
+ * mode to default. The PHY defaults are from get PHY capabilities topology
+ * with media so call when media is first available. An error is returned if
+ * called when media is not available. The PHY initialization completed state is
+ * set here.
+ *
+ * These configurations are used when setting PHY
+ * configuration. The user PHY configuration is updated on set PHY
+ * configuration. Returns 0 on success, negative on failure
+ */
+static int ice_init_phy_user_cfg(struct ice_port_info *pi)
+{
+	struct ice_aqc_get_phy_caps_data *pcaps;
+	struct ice_phy_info *phy = &pi->phy;
+	struct ice_pf *pf = pi->hw->back;
+	enum ice_status status;
+	int err = 0;
+
+	if (!(phy->link_info.link_info & ICE_AQ_MEDIA_AVAILABLE))
+		return -EIO;
+
+	pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL);
+	if (!pcaps)
+		return -ENOMEM;
+
+	if (ice_fw_supports_report_dflt_cfg(pi->hw))
+		status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_DFLT_CFG,
+					     pcaps, NULL);
+	else
+		status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA,
+					     pcaps, NULL);
+	if (status) {
+		dev_err(ice_pf_to_dev(pf), "Get PHY capability failed.\n");
+		err = -EIO;
+		goto err_out;
+	}
+
+	ice_copy_phy_caps_to_cfg(pi, pcaps, &pi->phy.curr_user_phy_cfg);
+
+	/* check if lenient mode is supported and enabled */
+	if (ice_fw_supports_link_override(pi->hw) &&
+	    !(pcaps->module_compliance_enforcement &
+	      ICE_AQC_MOD_ENFORCE_STRICT_MODE)) {
+		set_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags);
+
+		/* if the FW supports default PHY configuration mode, then the
+		 * driver does not have to apply link override settings.
+		 * If not, initialize the user PHY configuration with the link
+		 * override values.
+		 */
+		if (!ice_fw_supports_report_dflt_cfg(pi->hw) &&
+		    (pf->link_dflt_override.options & ICE_LINK_OVERRIDE_EN)) {
+			ice_init_phy_cfg_dflt_override(pi);
+			goto out;
+		}
+	}
+
+	/* if link default override is not enabled, set user flow control and
+	 * FEC settings based on what get_phy_caps returned
+	 */
+	phy->curr_user_fec_req = ice_caps_to_fec_mode(pcaps->caps,
+						      pcaps->link_fec_options);
+	phy->curr_user_fc_req = ice_caps_to_fc_mode(pcaps->caps);
+
+out:
+	phy->curr_user_speed_req = ICE_AQ_LINK_SPEED_M;
+	set_bit(ICE_PHY_INIT_COMPLETE, pf->state);
+err_out:
+	kfree(pcaps);
+	return err;
+}
+
+/**
+ * ice_configure_phy - configure PHY
+ * @vsi: VSI of PHY
+ *
+ * Set the PHY configuration. If the current PHY configuration is the same as
+ * the curr_user_phy_cfg, then do nothing to avoid link flap. Otherwise,
+ * configure the PHY based on the get PHY capabilities for the topology with
+ * media.
+ */
+static int ice_configure_phy(struct ice_vsi *vsi)
+{
+	struct device *dev = ice_pf_to_dev(vsi->back);
+	struct ice_port_info *pi = vsi->port_info;
+	struct ice_aqc_get_phy_caps_data *pcaps;
+	struct ice_aqc_set_phy_cfg_data *cfg;
+	struct ice_phy_info *phy = &pi->phy;
+	struct ice_pf *pf = vsi->back;
+	enum ice_status status;
+	int err = 0;
+
+	/* Ensure we have media as we cannot configure a medialess port */
+	if (!(phy->link_info.link_info & ICE_AQ_MEDIA_AVAILABLE))
+		return -EPERM;
+
+	ice_print_topo_conflict(vsi);
+
+	if (!test_bit(ICE_FLAG_LINK_LENIENT_MODE_ENA, pf->flags) &&
+	    phy->link_info.topo_media_conflict == ICE_AQ_LINK_TOPO_UNSUPP_MEDIA)
+		return -EPERM;
+
+	if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags))
+		return ice_force_phys_link_state(vsi, true);
+
+	pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL);
+	if (!pcaps)
+		return -ENOMEM;
+
+	/* Get current PHY config */
+	status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps,
+				     NULL);
+	if (status) {
+		dev_err(dev, "Failed to get PHY configuration, VSI %d error %s\n",
+			vsi->vsi_num, ice_stat_str(status));
+		err = -EIO;
+		goto done;
+	}
+
+	/* If PHY enable link is configured and configuration has not changed,
+	 * there's nothing to do
+	 */
+	if (pcaps->caps & ICE_AQC_PHY_EN_LINK &&
+	    ice_phy_caps_equals_cfg(pcaps, &phy->curr_user_phy_cfg))
+		goto done;
+
+	/* Use PHY topology as baseline for configuration */
+	memset(pcaps, 0, sizeof(*pcaps));
+	if (ice_fw_supports_report_dflt_cfg(pi->hw))
+		status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_DFLT_CFG,
+					     pcaps, NULL);
+	else
+		status = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_TOPO_CAP_MEDIA,
+					     pcaps, NULL);
+	if (status) {
+		dev_err(dev, "Failed to get PHY caps, VSI %d error %s\n",
+			vsi->vsi_num, ice_stat_str(status));
+		err = -EIO;
+		goto done;
+	}
+
+	cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+	if (!cfg) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	ice_copy_phy_caps_to_cfg(pi, pcaps, cfg);
+
+	/* Speed - If default override pending, use curr_user_phy_cfg set in
+	 * ice_init_phy_user_cfg_ldo.
+	 */
+	if (test_and_clear_bit(ICE_LINK_DEFAULT_OVERRIDE_PENDING, pf->state)) {
+		cfg->phy_type_low = phy->curr_user_phy_cfg.phy_type_low;
+		cfg->phy_type_high = phy->curr_user_phy_cfg.phy_type_high;
+	} else {
+		u64 phy_low = 0, phy_high = 0;
+
+		ice_update_phy_type(&phy_low, &phy_high,
+				    phy->curr_user_speed_req);
+		cfg->phy_type_low = pcaps->phy_type_low & cpu_to_le64(phy_low);
+		cfg->phy_type_high = pcaps->phy_type_high &
+				     cpu_to_le64(phy_high);
+	}
+
+	/* Can't provide what was requested; use PHY capabilities */
+	if (!cfg->phy_type_low && !cfg->phy_type_high) {
+		cfg->phy_type_low = pcaps->phy_type_low;
+		cfg->phy_type_high = pcaps->phy_type_high;
+	}
+
+	/* FEC */
+	ice_cfg_phy_fec(pi, cfg, phy->curr_user_fec_req);
+
+	/* Can't provide what was requested; use PHY capabilities */
+	if (cfg->link_fec_opt !=
+	    (cfg->link_fec_opt & pcaps->link_fec_options)) {
+		cfg->caps |= pcaps->caps & ICE_AQC_PHY_EN_AUTO_FEC;
+		cfg->link_fec_opt = pcaps->link_fec_options;
+	}
+
+	/* Flow Control - always supported; no need to check against
+	 * capabilities
+	 */
+	ice_cfg_phy_fc(pi, cfg, phy->curr_user_fc_req);
+
+	/* Enable link and link update */
+	cfg->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT | ICE_AQ_PHY_ENA_LINK;
+
+	status = ice_aq_set_phy_cfg(&pf->hw, pi, cfg, NULL);
+	if (status) {
+		dev_err(dev, "Failed to set phy config, VSI %d error %s\n",
+			vsi->vsi_num, ice_stat_str(status));
+		err = -EIO;
+	}
+
+	kfree(cfg);
+done:
+	kfree(pcaps);
+	return err;
+}
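The "skip if unchanged" early exit above is what avoids the link flap the doc
comment mentions. ice_phy_caps_equals_cfg() is the real helper; the sketch
below only illustrates the kind of field-by-field comparison it implies
(eliding caps-bit translation), so treat it as an assumption:

	/* illustrative comparison of reported caps vs. requested cfg */
	static bool
	example_phy_caps_match_cfg(struct ice_aqc_get_phy_caps_data *caps,
				   struct ice_aqc_set_phy_cfg_data *cfg)
	{
		/* identical PHY types and FEC options imply no reconfig */
		return caps->phy_type_low == cfg->phy_type_low &&
		       caps->phy_type_high == cfg->phy_type_high &&
		       caps->link_fec_options == cfg->link_fec_opt;
	}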
+
+/**
+ * ice_check_media_subtask - Check for media
+ * @pf: pointer to PF struct
+ *
+ * If media is available, then initialize the PHY user configuration if it has
+ * not been done, and configure the PHY if the interface is up.
+ */
+static void ice_check_media_subtask(struct ice_pf *pf)
+{
+	struct ice_port_info *pi;
+	struct ice_vsi *vsi;
+	int err;
+
+	/* No need to check for media if it's already present */
+	if (!test_bit(ICE_FLAG_NO_MEDIA, pf->flags))
+		return;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi)
+		return;
+
+	/* Refresh link info and check if media is present */
+	pi = vsi->port_info;
+	err = ice_update_link_info(pi);
+	if (err)
+		return;
+
+	ice_check_module_power(pf, pi->phy.link_info.link_cfg_err);
+
+	if (pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE) {
+		if (!test_bit(ICE_PHY_INIT_COMPLETE, pf->state))
+			ice_init_phy_user_cfg(pi);
+
+		/* PHY settings are reset on media insertion, reconfigure
+		 * PHY to preserve settings.
+		 */
+		if (test_bit(ICE_VSI_DOWN, vsi->state) &&
+		    test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags))
+			return;
+
+		err = ice_configure_phy(vsi);
+		if (!err)
+			clear_bit(ICE_FLAG_NO_MEDIA, pf->flags);
+
+		/* A Link Status Event will be generated; the event handler
+		 * will complete bringing the interface up
+		 */
+	}
+}
+
+
+/**
+ * ice_find_tnl - return the matching tunnel entry if it exists
+ * @pf: pointer to PF struct
+ * @tnl_type: tunnel type
+ * @port: tunnel port
+ */
+static struct ice_tnl_entry *
+ice_find_tnl(struct ice_pf *pf, enum ice_tunnel_type tnl_type, u16 port)
+{
+	struct ice_tnl_entry *entry, *tmp;
+
+	list_for_each_entry_safe(entry, tmp, &pf->tnl_list, node)
+		if (entry->port == port && entry->type == tnl_type)
+			return entry;
+	return NULL;
+}
+
+/**
+ * ice_handle_tmp_tnl_list - duplicate the tunnel entry and add it to tmp_list
+ * @pf: pointer to PF struct
+ * @entry: tunnel entry to duplicate
+ * @tmp_list: list to add the entry to
+ */
+static void
+ice_handle_tmp_tnl_list(struct ice_pf *pf, struct ice_tnl_entry *entry,
+			struct list_head *tmp_list)
+{
+	struct ice_tnl_entry *tnl;
+
+	tnl = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*tnl), GFP_ATOMIC);
+	if (!tnl)
+		return;
+
+	tnl->type = entry->type;
+	tnl->port = entry->port;
+	INIT_LIST_HEAD(&tnl->node);
+	list_add_tail(&tnl->node, tmp_list);
+}
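Since the service task mutates pf->tnl_list concurrently, any lookup through
ice_find_tnl() has to hold tnl_lock, as ice_handle_tunnel() below does. A
minimal usage sketch with a hypothetical caller:

	/* hypothetical helper: is a given UDP tunnel port offloaded? */
	static bool example_tnl_is_active(struct ice_pf *pf,
					  enum ice_tunnel_type type, u16 port)
	{
		struct ice_tnl_entry *tnl;
		bool active = false;

		spin_lock(&pf->tnl_lock);
		tnl = ice_find_tnl(pf, type, port);
		if (tnl && (tnl->state & ICE_TNL_STATE_ACTIVE))
			active = true;
		spin_unlock(&pf->tnl_lock);

		return active;
	}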
+
+/**
+ * ice_handle_tunnel - update tunnel entries in hardware
+ * @pf: pointer to PF struct
+ *
+ * Check the list of tunnel entries and add or remove any that have changed.
+ */
+static void ice_handle_tunnel(struct ice_pf *pf)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_tnl_entry *entry, *tmp;
+	LIST_HEAD(tmp_del_list);
+	LIST_HEAD(tmp_add_list);
+	enum ice_status status;
+
+	if (list_empty(&pf->tnl_list))
+		return;
+
+	spin_lock(&pf->tnl_lock);
+	list_for_each_entry_safe(entry, tmp, &pf->tnl_list, node) {
+		if (entry->state & ICE_TNL_STATE_TO_DEL &&
+		    entry->state & ICE_TNL_STATE_ACTIVE)
+			list_move(&entry->node, &tmp_del_list);
+		else if (entry->state & ICE_TNL_STATE_TO_ADD)
+			ice_handle_tmp_tnl_list(pf, entry, &tmp_add_list);
+	}
+	spin_unlock(&pf->tnl_lock);
+
+	/* Process tmp_del_list and call ice_destroy_tunnel for each entry */
+	list_for_each_entry_safe(entry, tmp, &tmp_del_list, node) {
+		status = ice_destroy_tunnel(&pf->hw, entry->port, false);
+		if (status)
+			dev_err(dev, "error deleting port %d from UDP tunnels list\n",
+				entry->port);
+
+		list_del(&entry->node);
+		devm_kfree(dev, entry);
+	}
+
+	/* Process tmp_add_list and call ice_create_tunnel; if that fails,
+	 * delete the element from the original tunnel list, otherwise set its
+	 * state to ACTIVE. Purge all entries from tmp_add_list.
+	 */
+	list_for_each_entry_safe(entry, tmp, &tmp_add_list, node) {
+		struct ice_tnl_entry *tnl;
+
+		status = ice_create_tunnel(&pf->hw, entry->type, entry->port);
+		if (status == ICE_ERR_OUT_OF_RANGE)
+			dev_dbg(dev, "Max tunneled UDP ports reached, port %d not added\n",
+				entry->port);
+		else if (status)
+			dev_err(dev, "Error adding UDP tunnel - %s for tnl port %u\n",
+				ice_stat_str(status), entry->port);
+
+		/* delete entry from original tunnel list if failed to add,
+		 * otherwise set state to ACTIVE
+		 */
+		spin_lock(&pf->tnl_lock);
+		tnl = ice_find_tnl(pf, entry->type, entry->port);
+		if (tnl) {
+			if (status) {
+				list_del(&tnl->node);
+				devm_kfree(dev, tnl);
+			} else {
+				/* The tunnel was created successfully, mark
+				 * state of tunnel as ACTIVE, indicating it
+				 * was offloaded in HW.
+				 */
+				tnl->state = ICE_TNL_STATE_ACTIVE;
+			}
+		}
+		spin_unlock(&pf->tnl_lock);
+
+		list_del(&entry->node);
+		devm_kfree(dev, entry);
+	}
+}
+
+/**
+ * ice_service_task - manage and run subtasks
+ * @work: pointer to work_struct contained by the PF struct
+ */
+static void ice_service_task(struct work_struct *work)
+{
+	struct ice_pf *pf = container_of(work, struct ice_pf, serv_task);
+	unsigned long start_time = jiffies;
+
+	/* subtasks */
+
+	/* process reset requests first */
+	ice_reset_subtask(pf);
+
+	/* bail if a reset/recovery cycle is pending or rebuild failed */
+	if (ice_is_reset_in_progress(pf->state) ||
+	    test_bit(ICE_SUSPENDED, pf->state) ||
+	    test_bit(ICE_NEEDS_RESTART, pf->state)) {
+		ice_service_task_complete(pf);
+		return;
+	}
+
+	/* If we are in FW recovery mode, we need to exit the service tasks here */
+	if (test_bit(ICE_RECOVERY_MODE, pf->state))
+		return;
+
+	ice_clean_adminq_subtask(pf);
+	ice_check_media_subtask(pf);
+	ice_check_for_hang_subtask(pf);
+	ice_sync_fltr_subtask(pf);
+	ice_handle_mdd_event(pf);
+	ice_watchdog_subtask(pf);
+
+	if (ice_is_safe_mode(pf)) {
+		ice_service_task_complete(pf);
+		return;
+	}
+
+	/* Invoke remaining initialization of peer_objs */
+	ice_for_each_peer(pf, NULL, ice_finish_init_peer_obj);
+
+	ice_chnl_subtask_handle_interrupt(pf);
+	ice_channel_sync_global_cntrs(pf);
+	ice_process_vflr_event(pf);
+	ice_sync_udp_fltr_subtask(pf);
+	ice_clean_mailboxq_subtask(pf);
+	ice_clean_sbq_subtask(pf);
+	ice_clean_ptp_subtask(pf);
+	ice_sync_arfs_fltrs(pf);
+	ice_flush_fdir_ctx(pf);
+
+	ice_handle_tunnel(pf);
+
+	/* Clear ICE_SERVICE_SCHED flag to allow scheduling next event */
+	ice_service_task_complete(pf);
 
 	/* If the tasks have taken longer than one service timer period
 	 * or there is more work to be done, reset the service timer to
 	 * schedule the service task now.
 	 */
 	if (time_after(jiffies, (start_time + pf->serv_tmr_period)) ||
-	    test_bit(__ICE_MDD_EVENT_PENDING, pf->state) ||
-	    test_bit(__ICE_VFLR_EVENT_PENDING, pf->state) ||
-	    test_bit(__ICE_MAILBOXQ_EVENT_PENDING, pf->state) ||
-	    test_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state))
+	    test_bit(ICE_MDD_EVENT_PENDING, pf->state) ||
+	    test_bit(ICE_VFLR_EVENT_PENDING, pf->state) ||
+	    test_bit(ICE_MAILBOXQ_EVENT_PENDING, pf->state) ||
+	    test_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state) ||
+	    test_bit(ICE_FD_VF_FLUSH_CTX, pf->state) ||
+	    test_bit(ICE_ADMINQ_EVENT_PENDING, pf->state))
 		mod_timer(&pf->serv_tmr, jiffies);
 }
 
@@ -1544,10 +3172,52 @@ static void ice_set_ctrlq_len(struct ice_hw *hw)
 	hw->adminq.num_sq_entries = ICE_AQ_LEN;
 	hw->adminq.rq_buf_size = ICE_AQ_MAX_BUF_LEN;
 	hw->adminq.sq_buf_size = ICE_AQ_MAX_BUF_LEN;
-	hw->mailboxq.num_rq_entries = ICE_MBXRQ_LEN;
+	hw->mailboxq.num_rq_entries = PF_MBX_ARQLEN_ARQLEN_M;
 	hw->mailboxq.num_sq_entries = ICE_MBXSQ_LEN;
 	hw->mailboxq.rq_buf_size = ICE_MBXQ_MAX_BUF_LEN;
 	hw->mailboxq.sq_buf_size = ICE_MBXQ_MAX_BUF_LEN;
+	hw->sbq.num_rq_entries = ICE_SBQ_LEN;
+	hw->sbq.num_sq_entries = ICE_SBQ_LEN;
+	hw->sbq.rq_buf_size = ICE_SBQ_MAX_BUF_LEN;
+	hw->sbq.sq_buf_size = ICE_SBQ_MAX_BUF_LEN;
+}
+
+/**
+ * ice_schedule_reset - schedule a reset
+ * @pf: board private structure
+ * @reset: reset being requested
+ */
+int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+
+	/* bail out if earlier reset has failed */
+	if (test_bit(ICE_RESET_FAILED, pf->state)) {
+		dev_dbg(dev, "earlier reset has failed\n");
+		return -EIO;
+	}
+	/* bail if reset/recovery already in progress */
+	if (ice_is_reset_in_progress(pf->state)) {
+		dev_dbg(dev, "Reset already in progress\n");
+		return -EBUSY;
+	}
+
+	switch (reset) {
+	case ICE_RESET_PFR:
+		set_bit(ICE_PFR_REQ, pf->state);
+		break;
+	case ICE_RESET_CORER:
+		set_bit(ICE_CORER_REQ, pf->state);
+		break;
+	case ICE_RESET_GLOBR:
+		set_bit(ICE_GLOBR_REQ, pf->state);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ice_service_task_schedule(pf);
+	return 0;
 }
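ice_schedule_reset() only sets a request bit and kicks the service task; the
actual reset work happens asynchronously. A minimal usage sketch, with a
hypothetical caller name:

	/* hypothetical error path requesting a PF reset */
	static void example_request_pfr(struct ice_pf *pf)
	{
		int err = ice_schedule_reset(pf, ICE_RESET_PFR);

		if (err == -EBUSY)
			dev_dbg(ice_pf_to_dev(pf), "reset already in progress\n");
		else if (err == -EIO)
			dev_dbg(ice_pf_to_dev(pf), "earlier reset has failed\n");
		/* on success the service task performs the reset later */
	}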
 
 /**
@@ -1604,11 +3274,13 @@ static int ice_vsi_req_irq_msix(struct ice_vsi *vsi, char *basename)
 	int q_vectors = vsi->num_q_vectors;
 	struct ice_pf *pf = vsi->back;
 	int base = vsi->base_vector;
+	struct device *dev;
 	int rx_int_idx = 0;
 	int tx_int_idx = 0;
 	int vector, err;
 	int irq_num;
 
+	dev = ice_pf_to_dev(pf);
 	for (vector = 0; vector < q_vectors; vector++) {
 		struct ice_q_vector *q_vector = vsi->q_vectors[vector];
 
@@ -1628,19 +3300,28 @@ static int ice_vsi_req_irq_msix(struct ice_vsi *vsi, char *basename)
 			/* skip this unused q_vector */
 			continue;
 		}
-		err = devm_request_irq(&pf->pdev->dev, irq_num,
-				       vsi->irq_handler, 0,
-				       q_vector->name, q_vector);
+		if (vsi->type == ICE_VSI_CTRL && vsi->vf_id != ICE_INVAL_VFID)
+			err = devm_request_irq(dev, irq_num, vsi->irq_handler,
+					       IRQF_SHARED, q_vector->name,
+					       q_vector);
+		else
+			err = devm_request_irq(dev, irq_num, vsi->irq_handler,
+					       0, q_vector->name, q_vector);
 		if (err) {
-			netdev_err(vsi->netdev,
-				   "MSIX request_irq failed, error: %d\n", err);
+			netdev_err(vsi->netdev, "MSIX request_irq failed, error: %d\n",
+				   err);
 			goto free_q_irqs;
 		}
 
 		/* register for affinity change notifications */
-		q_vector->affinity_notify.notify = ice_irq_affinity_notify;
-		q_vector->affinity_notify.release = ice_irq_affinity_release;
-		irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify);
+		if (!IS_ENABLED(CONFIG_RFS_ACCEL)) {
+			struct irq_affinity_notify *affinity_notify;
+
+			affinity_notify = &q_vector->affinity_notify;
+			affinity_notify->notify = ice_irq_affinity_notify;
+			affinity_notify->release = ice_irq_affinity_release;
+			irq_set_affinity_notifier(irq_num, affinity_notify);
+		}
 
 		/* assign the mask for this irq */
 		irq_set_affinity_hint(irq_num, &q_vector->affinity_mask);
@@ -1652,23 +3333,415 @@ static int ice_vsi_req_irq_msix(struct ice_vsi *vsi, char *basename)
 free_q_irqs:
 	while (vector) {
 		vector--;
-		irq_num = pf->msix_entries[base + vector].vector,
-		irq_set_affinity_notifier(irq_num, NULL);
+		irq_num = pf->msix_entries[base + vector].vector;
+		if (!IS_ENABLED(CONFIG_RFS_ACCEL))
+			irq_set_affinity_notifier(irq_num, NULL);
 		irq_set_affinity_hint(irq_num, NULL);
-		devm_free_irq(&pf->pdev->dev, irq_num, &vsi->q_vectors[vector]);
+		devm_free_irq(dev, irq_num, &vsi->q_vectors[vector]);
 	}
 	return err;
 }
 
+#ifdef HAVE_XDP_SUPPORT
 /**
- * ice_ena_misc_vector - enable the non-queue interrupts
- * @pf: board private structure
+ * ice_xdp_alloc_setup_rings - Allocate and setup Tx rings for XDP
+ * @vsi: VSI to setup Tx rings used by XDP
+ *
+ * Return 0 on success and negative value on error
 */
-static void ice_ena_misc_vector(struct ice_pf *pf)
+static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
 {
-	struct ice_hw *hw = &pf->hw;
+	struct device *dev = ice_pf_to_dev(vsi->back);
+	int i;
+
+	for (i = 0; i < vsi->num_xdp_txq; i++) {
+		u16 xdp_q_idx = vsi->alloc_txq + i;
+		struct ice_ring *xdp_ring;
+
+		xdp_ring = kzalloc(sizeof(*xdp_ring), GFP_KERNEL);
+
+		if (!xdp_ring)
+			goto free_xdp_rings;
+
+		xdp_ring->q_index = xdp_q_idx;
+		xdp_ring->reg_idx = vsi->txq_map[xdp_q_idx];
+		xdp_ring->vsi = vsi;
+		xdp_ring->netdev = NULL;
+		xdp_ring->dev = dev;
+		xdp_ring->count = vsi->num_tx_desc;
+		WRITE_ONCE(vsi->xdp_rings[i], xdp_ring);
+		if (ice_setup_tx_ring(xdp_ring))
+			goto free_xdp_rings;
+		ice_set_ring_xdp(xdp_ring);
+#ifdef HAVE_AF_XDP_ZC_SUPPORT
+		xdp_ring->xsk_pool = ice_xsk_umem(xdp_ring);
+#endif /* HAVE_AF_XDP_ZC_SUPPORT */
+	}
+
+	return 0;
+
+free_xdp_rings:
+	for (; i >= 0; i--)
+		if (vsi->xdp_rings[i] && vsi->xdp_rings[i]->desc)
+			ice_free_tx_ring(vsi->xdp_rings[i]);
+	return -ENOMEM;
+}
+
+/**
+ * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
+ * @vsi: VSI to set the bpf prog on
+ * @prog: the bpf prog pointer
+ */
+static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
+{
+	struct bpf_prog *old_prog;
+	int i;
+
+	old_prog = xchg(&vsi->xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	ice_for_each_rxq(vsi, i)
+		WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog);
+}
+
+#ifdef HAVE_XDP_SUPPORT
+#ifdef ICE_ADD_PROBES
+/**
+ * ice_clear_xdp_stats - clear all Rx XDP statistics on VSI
+ * @vsi: VSI to clear Rx XDP statistics on
+ */
+static void ice_clear_xdp_stats(struct ice_vsi *vsi)
+{
+	int i;
+	struct ice_ring *ring;
+
+	ice_for_each_alloc_rxq(vsi, i) {
+		ring = READ_ONCE(vsi->rx_rings[i]);
+		if (ring)
+			memset(&ring->xdp_stats, 0, sizeof(ring->xdp_stats));
+	}
+}
+#endif /* ICE_ADD_PROBES */
+#endif /* HAVE_XDP_SUPPORT */
+
+/**
+ * ice_prepare_xdp_rings - Allocate, configure and setup Tx rings for XDP
+ * @vsi: VSI to bring up Tx rings used by XDP
+ * @prog: bpf program that will be assigned to VSI
+ *
+ * Return 0 on success and negative value on error
+ */
+int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog)
+{
+	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
+	int xdp_rings_rem = vsi->num_xdp_txq;
+	struct ice_pf *pf = vsi->back;
+	struct ice_qs_cfg xdp_qs_cfg = {
+		.qs_mutex = &pf->avail_q_mutex,
+		.pf_map = pf->avail_txqs,
+		.pf_map_size = pf->max_pf_txqs,
+		.q_count = vsi->num_xdp_txq,
+		.scatter_count = ICE_MAX_SCATTER_TXQS,
+		.vsi_map = vsi->txq_map,
+		.vsi_map_offset = vsi->alloc_txq,
+		.mapping_mode = ICE_VSI_MAP_CONTIG
+	};
+	enum ice_status status;
+	struct device *dev;
+	int i, v_idx;
+
+	dev = ice_pf_to_dev(pf);
+	vsi->xdp_rings = devm_kcalloc(dev, vsi->num_xdp_txq,
+				      sizeof(*vsi->xdp_rings), GFP_KERNEL);
+	if (!vsi->xdp_rings)
+		return -ENOMEM;
+
+	if (__ice_vsi_get_qs(&xdp_qs_cfg))
+		goto err_map_xdp;
+
+	vsi->xdp_mapping_mode = xdp_qs_cfg.mapping_mode;
+	if (ice_xdp_alloc_setup_rings(vsi))
+		goto clear_xdp_rings;
+
+	/* follow the logic from ice_vsi_map_rings_to_vectors */
+	ice_for_each_q_vector(vsi, v_idx) {
+		struct ice_q_vector *q_vector = vsi->q_vectors[v_idx];
+		int xdp_rings_per_v, q_id, q_base;
+
+		xdp_rings_per_v = DIV_ROUND_UP(xdp_rings_rem,
+					       vsi->num_q_vectors - v_idx);
+		q_base = vsi->num_xdp_txq - xdp_rings_rem;
+
+		for (q_id = q_base; q_id < (q_base + xdp_rings_per_v); q_id++) {
+			struct ice_ring *xdp_ring = vsi->xdp_rings[q_id];
+
+			xdp_ring->q_vector = q_vector;
+			xdp_ring->next = q_vector->tx.ring;
+			q_vector->tx.ring = xdp_ring;
+		}
+		xdp_rings_rem -= xdp_rings_per_v;
+	}
+
+	/* omit the scheduler update if in reset path; XDP queues will be
+	 * taken into account at the end of ice_vsi_rebuild, where
+	 * ice_cfg_vsi_lan is being called
+	 */
+	if (ice_is_reset_in_progress(pf->state))
+		return 0;
+
+	/* tell the Tx scheduler that right now we have
+	 * additional queues
+	 */
+	for (i = 0; i < vsi->tc_cfg.numtc; i++)
+		max_txqs[i] = vsi->num_txq + vsi->num_xdp_txq;
+
+	status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc,
+				 max_txqs);
+	if (status) {
+		dev_err(dev, "Failed VSI LAN queue config for XDP, error: %s\n",
+			ice_stat_str(status));
+		goto clear_xdp_rings;
+	}
+	ice_vsi_assign_bpf_prog(vsi, prog);
+
+	return 0;
+clear_xdp_rings:
+	for (i = 0; i < vsi->num_xdp_txq; i++)
+		if (vsi->xdp_rings[i]) {
+			kfree_rcu(vsi->xdp_rings[i], rcu);
+			vsi->xdp_rings[i] = NULL;
+		}
+
+err_map_xdp:
+	mutex_lock(&pf->avail_q_mutex);
+	for (i = 0; i < vsi->num_xdp_txq; i++) {
+		clear_bit(vsi->txq_map[i + vsi->alloc_txq], pf->avail_txqs);
+		vsi->txq_map[i + vsi->alloc_txq] = ICE_INVAL_Q_INDEX;
+	}
+	mutex_unlock(&pf->avail_q_mutex);
+
+	devm_kfree(dev, vsi->xdp_rings);
+	return -ENOMEM;
+}
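The DIV_ROUND_UP loop above spreads the XDP rings as evenly as possible over
the remaining vectors. A worked example of that arithmetic, in a standalone
sketch (DIV_ROUND_UP comes from linux/kernel.h; the function name is
hypothetical):

	/* 10 XDP rings over 4 vectors yields a 3/3/2/2 split */
	static void example_distribute_rings(void)
	{
		int rings_rem = 10, num_vectors = 4, v_idx;

		for (v_idx = 0; v_idx < num_vectors; v_idx++) {
			int per_v = DIV_ROUND_UP(rings_rem,
						 num_vectors - v_idx);
			/* v_idx 0: DIV_ROUND_UP(10, 4) = 3, 7 remain
			 * v_idx 1: DIV_ROUND_UP(7, 3)  = 3, 4 remain
			 * v_idx 2: DIV_ROUND_UP(4, 2)  = 2, 2 remain
			 * v_idx 3: DIV_ROUND_UP(2, 1)  = 2, 0 remain
			 */
			rings_rem -= per_v;
		}
	}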
+
+/**
+ * ice_destroy_xdp_rings - undo the configuration made by ice_prepare_xdp_rings
+ * @vsi: VSI to remove XDP rings
+ *
+ * Detach XDP rings from irq vectors, clean up the PF bitmap and free
+ * resources
+ */
+int ice_destroy_xdp_rings(struct ice_vsi *vsi)
+{
+	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
+	struct ice_pf *pf = vsi->back;
+	int i, v_idx;
+
+	/* q_vectors are freed in reset path so there's no point in detaching
+	 * rings; if a rebuild is triggered from somewhere other than reset,
+	 * the reset bits in pf->state won't be set, so additionally check the
+	 * first q_vector against NULL
+	 */
+	if (ice_is_reset_in_progress(pf->state) || !vsi->q_vectors[0])
+		goto free_qmap;
+
+	ice_for_each_q_vector(vsi, v_idx) {
+		struct ice_q_vector *q_vector = vsi->q_vectors[v_idx];
+		struct ice_ring *ring;
+
+		ice_for_each_ring(ring, q_vector->tx)
+			if (!ring->tx_buf || !ice_ring_is_xdp(ring))
+				break;
+
+		/* restore the value of last node prior to XDP setup */
+		q_vector->tx.ring = ring;
+	}
+
+free_qmap:
+	mutex_lock(&pf->avail_q_mutex);
+	for (i = 0; i < vsi->num_xdp_txq; i++) {
+		clear_bit(vsi->txq_map[i + vsi->alloc_txq], pf->avail_txqs);
+		vsi->txq_map[i + vsi->alloc_txq] = ICE_INVAL_Q_INDEX;
+	}
+	mutex_unlock(&pf->avail_q_mutex);
+
+	for (i = 0; i < vsi->num_xdp_txq; i++)
+		if (vsi->xdp_rings[i]) {
+			if (vsi->xdp_rings[i]->desc)
+				ice_free_tx_ring(vsi->xdp_rings[i]);
+			kfree_rcu(vsi->xdp_rings[i], rcu);
+			vsi->xdp_rings[i] = NULL;
+		}
+
+	devm_kfree(ice_pf_to_dev(pf), vsi->xdp_rings);
+	vsi->xdp_rings = NULL;
+
+	if (ice_is_reset_in_progress(pf->state) || !vsi->q_vectors[0])
+		return 0;
+
+	ice_vsi_assign_bpf_prog(vsi, NULL);
+
+	/* notify Tx scheduler that we destroyed XDP queues and bring
+	 * back the old number of child nodes
+	 */
+	for (i = 0; i < vsi->tc_cfg.numtc; i++)
+		max_txqs[i] = vsi->num_txq;
+
+	/* change number of XDP Tx queues to 0 */
+	vsi->num_xdp_txq = 0;
+
+	return ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc,
+			       max_txqs);
+}
+
+#ifdef HAVE_AF_XDP_ZC_SUPPORT
+/**
+ * ice_vsi_rx_napi_schedule - Schedule napi on RX queues from VSI
+ * @vsi: VSI to schedule napi on
+ */
+static void ice_vsi_rx_napi_schedule(struct ice_vsi *vsi)
+{
+	int i;
+
+	ice_for_each_rxq(vsi, i) {
+		struct ice_ring *rx_ring = vsi->rx_rings[i];
+
+		if (rx_ring->xsk_pool)
+			napi_schedule(&rx_ring->q_vector->napi);
+	}
+}
+#endif /* HAVE_AF_XDP_ZC_SUPPORT */
+
+/**
+ * ice_xdp_setup_prog - Add or remove XDP eBPF program
+ * @vsi: VSI to setup XDP for
+ * @prog: XDP program
+ * @extack: netlink extended ack
+ */
+static int
+ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog,
+		   struct netlink_ext_ack *extack)
+{
+	int frame_size = vsi->netdev->mtu + ICE_ETH_PKT_HDR_PAD;
+	bool if_running = netif_running(vsi->netdev);
+	int ret = 0, xdp_ring_err = 0;
+
+	if (frame_size > vsi->rx_buf_len) {
+		NL_SET_ERR_MSG_MOD(extack, "MTU too large for loading XDP");
+		return -EOPNOTSUPP;
+	}
+
+	/* need to stop netdev while setting up the program for Rx rings */
+	if (if_running && !test_and_set_bit(ICE_VSI_DOWN, vsi->state)) {
+		ret = ice_down(vsi);
+		if (ret) {
+			NL_SET_ERR_MSG_MOD(extack, "Preparing device for XDP attach failed");
+			return ret;
+		}
+	}
+
+	if (!ice_is_xdp_ena_vsi(vsi) && prog) {
+		vsi->num_xdp_txq = vsi->alloc_rxq;
+		xdp_ring_err = ice_prepare_xdp_rings(vsi, prog);
+		if (xdp_ring_err)
+			NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Tx resources failed");
+#ifdef HAVE_XDP_SUPPORT
+#ifdef ICE_ADD_PROBES
+		ice_clear_xdp_stats(vsi);
+#endif /* ICE_ADD_PROBES */
+#endif /* HAVE_XDP_SUPPORT */
+	} else if (ice_is_xdp_ena_vsi(vsi) && !prog) {
+		xdp_ring_err = ice_destroy_xdp_rings(vsi);
+		if (xdp_ring_err)
+			NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Tx resources failed");
+	} else {
+		ice_vsi_assign_bpf_prog(vsi, prog);
+	}
+
+	if (if_running)
+		ret = ice_up(vsi);
+
+#ifdef HAVE_AF_XDP_ZC_SUPPORT
+#ifndef HAVE_AF_XDP_NETDEV_UMEM
+	if (!ret && prog && vsi->xsk_umems)
+		ice_vsi_rx_napi_schedule(vsi);
+#else
+	if (!ret && prog)
+		ice_vsi_rx_napi_schedule(vsi);
+#endif /* !HAVE_AF_XDP_NETDEV_UMEM */
+#endif /* HAVE_AF_XDP_ZC_SUPPORT */
+
+	return (ret || xdp_ring_err) ? -ENOMEM : 0;
+}
+
+/**
+ * ice_xdp - implements XDP handler
+ * @dev: netdevice
+ * @xdp: XDP command
+ */
+#ifdef HAVE_NDO_BPF
+static int ice_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+#else
+static int ice_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+#endif
+{
+	struct ice_netdev_priv *np = netdev_priv(dev);
+	struct ice_vsi *vsi = np->vsi;
+
+	if (vsi->type != ICE_VSI_PF) {
+		NL_SET_ERR_MSG_MOD(xdp->extack, "XDP can be loaded only on PF VSI");
+		return -EINVAL;
+	}
+
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack);
+#ifdef HAVE_XDP_QUERY_PROG
+	case XDP_QUERY_PROG:
+#ifndef NO_NETDEV_BPF_PROG_ATTACHED
+		xdp->prog_attached = ice_is_xdp_ena_vsi(vsi);
+#endif /* !NO_NETDEV_BPF_PROG_ATTACHED */
+		xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0;
+		return 0;
+#endif /* HAVE_XDP_QUERY_PROG */
+#ifdef HAVE_AF_XDP_ZC_SUPPORT
+	case XDP_SETUP_XSK_POOL:
+#ifdef HAVE_NETDEV_BPF_XSK_POOL
+		return ice_xsk_umem_setup(vsi, xdp->xsk.pool,
+#else
+		return ice_xsk_umem_setup(vsi, xdp->xsk.umem,
+#endif /* HAVE_NETDEV_BPF_XSK_POOL */
+					  xdp->xsk.queue_id);
+#ifndef NO_XDP_QUERY_XSK_UMEM
+	case XDP_QUERY_XSK_UMEM:
+		return ice_xsk_umem_query(vsi, &xdp->xsk.umem,
+					  xdp->xsk.queue_id);
+#endif /* !NO_XDP_QUERY_XSK_UMEM */
+#endif /* HAVE_AF_XDP_ZC_SUPPORT */
+	default:
+		return -EINVAL;
+	}
+}
+#endif /* HAVE_XDP_SUPPORT */
+
+/**
+ * ice_ena_misc_vector - enable the non-queue interrupts
+ * @pf: board private structure
+ */
+static void ice_ena_misc_vector(struct ice_pf *pf)
+{
+	struct ice_hw *hw = &pf->hw;
 	u32 val;
 
+	/* Disable anti-spoof detection interrupt to prevent spurious event
+	 * interrupts during a function reset. Anti-spoof functionality is
+	 * still supported.
+	 */
+	val = rd32(hw, GL_MDCK_TX_TDPU);
+	val |= GL_MDCK_TX_TDPU_RCU_ANTISPOOF_ITR_DIS_M;
+	wr32(hw, GL_MDCK_TX_TDPU, val);
+
 	/* clear things first */
 	wr32(hw, PFINT_OICR_ENA, 0);	/* disable all */
 	rd32(hw, PFINT_OICR);		/* read to clear */
@@ -1679,7 +3752,9 @@ static void ice_ena_misc_vector(struct ice_pf *pf)
 	       PFINT_OICR_PCI_EXCEPTION_M |
 	       PFINT_OICR_VFLR_M |
 	       PFINT_OICR_HMC_ERR_M |
-	       PFINT_OICR_PE_CRITERR_M);
+	       PFINT_OICR_PE_PUSH_M |
+	       PFINT_OICR_PE_CRITERR_M |
+	       PFINT_OICR_SWINT_M);
 
 	wr32(hw, PFINT_OICR_ENA, val);
 
@@ -1698,10 +3773,13 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 	struct ice_pf *pf = (struct ice_pf *)data;
 	struct ice_hw *hw = &pf->hw;
 	irqreturn_t ret = IRQ_NONE;
+	struct device *dev;
 	u32 oicr, ena_mask;
 
-	set_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state);
-	set_bit(__ICE_MAILBOXQ_EVENT_PENDING, pf->state);
+	dev = ice_pf_to_dev(pf);
+	set_bit(ICE_ADMINQ_EVENT_PENDING, pf->state);
+	set_bit(ICE_MAILBOXQ_EVENT_PENDING, pf->state);
+	set_bit(ICE_SIDEBANDQ_EVENT_PENDING, pf->state);
 
 	oicr = rd32(hw, PFINT_OICR);
 	ena_mask = rd32(hw, PFINT_OICR_ENA);
@@ -1713,11 +3791,19 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 
 	if (oicr & PFINT_OICR_MAL_DETECT_M) {
 		ena_mask &= ~PFINT_OICR_MAL_DETECT_M;
-		set_bit(__ICE_MDD_EVENT_PENDING, pf->state);
+		set_bit(ICE_MDD_EVENT_PENDING, pf->state);
 	}
 
 	if (oicr & PFINT_OICR_VFLR_M) {
-		ena_mask &= ~PFINT_OICR_VFLR_M;
-		set_bit(__ICE_VFLR_EVENT_PENDING, pf->state);
+		/* disable any further VFLR event notifications */
+		if (test_bit(ICE_VF_RESETS_DISABLED, pf->state)) {
+			u32 reg = rd32(hw, PFINT_OICR_ENA);
+
+			reg &= ~PFINT_OICR_VFLR_M;
+			wr32(hw, PFINT_OICR_ENA, reg);
+		} else {
+			ena_mask &= ~PFINT_OICR_VFLR_M;
+			set_bit(ICE_VFLR_EVENT_PENDING, pf->state);
+		}
 	}
 
 	if (oicr & PFINT_OICR_GRST_M) {
@@ -1735,21 +3821,20 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 		else if (reset == ICE_RESET_EMPR)
 			pf->empr_count++;
 		else
-			dev_dbg(&pf->pdev->dev, "Invalid reset type %d\n",
-				reset);
+			dev_dbg(dev, "Invalid reset type %d\n", reset);
 
 		/* If a reset cycle isn't already in progress, we set a bit in
 		 * pf->state so that the service task can start a reset/rebuild.
 		 * We also make note of which reset happened so that peer
 		 * devices/drivers can be informed.
 		 */
-		if (!test_and_set_bit(__ICE_RESET_OICR_RECV, pf->state)) {
+		if (!test_and_set_bit(ICE_RESET_OICR_RECV, pf->state)) {
 			if (reset == ICE_RESET_CORER)
-				set_bit(__ICE_CORER_RECV, pf->state);
+				set_bit(ICE_CORER_RECV, pf->state);
 			else if (reset == ICE_RESET_GLOBR)
-				set_bit(__ICE_GLOBR_RECV, pf->state);
+				set_bit(ICE_GLOBR_RECV, pf->state);
 			else
-				set_bit(__ICE_EMPR_RECV, pf->state);
+				set_bit(ICE_EMPR_RECV, pf->state);
 
 			/* There are couple of different bits at play here.
 			 * hw->reset_ongoing indicates whether the hardware is
@@ -1757,7 +3842,7 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 			 * is received and set back to false after the driver
 			 * has determined that the hardware is out of reset.
 			 *
-			 * __ICE_RESET_OICR_RECV in pf->state indicates
+			 * ICE_RESET_OICR_RECV in pf->state indicates
 			 * that a post reset rebuild is required before the
 			 * driver is operational again. This is set above.
 			 *
@@ -1768,35 +3853,56 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 		}
 	}
 
-	if (oicr & PFINT_OICR_HMC_ERR_M) {
+	if (oicr & PFINT_OICR_TSYN_TX_M) {
+		ena_mask &= ~PFINT_OICR_TSYN_TX_M;
+		set_bit(ICE_PTP_TX_TS_READY, pf->state);
+	}
+
+	if (oicr & PFINT_OICR_TSYN_EVNT_M) {
+		u8 tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+		u32 gltsyn_stat = rd32(hw, GLTSYN_STAT(tmr_idx));
+
+		/* Save EVENTs from GTSYN register */
+		pf->ptp.ext_ts_irq |= gltsyn_stat & (GLTSYN_STAT_EVENT0_M |
+						     GLTSYN_STAT_EVENT1_M |
+						     GLTSYN_STAT_EVENT2_M);
+		ena_mask &= ~PFINT_OICR_TSYN_EVNT_M;
+		set_bit(ICE_PTP_EXT_TS_READY, pf->state);
+	}
+
+	if (oicr & (PFINT_OICR_PE_CRITERR_M | PFINT_OICR_HMC_ERR_M |
+		    PFINT_OICR_PE_PUSH_M)) {
+		struct ice_event *event;
+
 		ena_mask &= ~PFINT_OICR_HMC_ERR_M;
-		dev_dbg(&pf->pdev->dev,
-			"HMC Error interrupt - info 0x%x, data 0x%x\n",
-			rd32(hw, PFHMC_ERRORINFO),
-			rd32(hw, PFHMC_ERRORDATA));
+		ena_mask &= ~PFINT_OICR_PE_CRITERR_M;
+		ena_mask &= ~PFINT_OICR_PE_PUSH_M;
+		event = kzalloc(sizeof(*event), GFP_ATOMIC);
+		if (event) {
+			set_bit(ICE_EVENT_CRIT_ERR, event->type);
+			event->reporter = NULL;
+			/* report the entire OICR value to peer */
+			event->info.reg = oicr;
+			ice_for_each_peer(pf, event, ice_peer_check_for_reg);
+			kfree(event);
+		}
 	}
 
 	/* Report any remaining unexpected interrupts */
 	oicr &= ena_mask;
 	if (oicr) {
-		dev_dbg(&pf->pdev->dev, "unhandled interrupt oicr=0x%08x\n",
-			oicr);
+		dev_dbg(dev, "unhandled interrupt oicr=0x%08x\n", oicr);
 		/* If a critical error is pending there is no choice but to
 		 * reset the device.
 		 */
-		if (oicr & (PFINT_OICR_PE_CRITERR_M |
-			    PFINT_OICR_PCI_EXCEPTION_M |
+		if (oicr & (PFINT_OICR_PCI_EXCEPTION_M |
 			    PFINT_OICR_ECC_ERR_M)) {
-			set_bit(__ICE_PFR_REQ, pf->state);
+			set_bit(ICE_PFR_REQ, pf->state);
 			ice_service_task_schedule(pf);
 		}
 	}
 	ret = IRQ_HANDLED;
 
-	if (!test_bit(__ICE_DOWN, pf->state)) {
-		ice_service_task_schedule(pf);
-		ice_irq_dynamic_ena(hw, NULL, NULL);
-	}
+	ice_service_task_schedule(pf);
+	ice_irq_dynamic_ena(hw, NULL, NULL);
 
 	return ret;
 }
 
@@ -1815,6 +3921,9 @@ static void ice_dis_ctrlq_interrupts(struct ice_hw *hw)
 	wr32(hw, PFINT_MBX_CTL,
 	     rd32(hw, PFINT_MBX_CTL) & ~PFINT_MBX_CTL_CAUSE_ENA_M);
 
+	wr32(hw, PFINT_SB_CTL,
+	     rd32(hw, PFINT_SB_CTL) & ~PFINT_SB_CTL_CAUSE_ENA_M);
+
 	/* disable Control queue Interrupt causes */
 	wr32(hw, PFINT_OICR_CTL,
 	     rd32(hw, PFINT_OICR_CTL) & ~PFINT_OICR_CTL_CAUSE_ENA_M);
@@ -1838,7 +3947,7 @@ static void ice_free_irq_msix_misc(struct ice_pf *pf)
 
 	if (pf->msix_entries) {
 		synchronize_irq(pf->msix_entries[pf->oicr_idx].vector);
-		devm_free_irq(&pf->pdev->dev,
+		devm_free_irq(ice_pf_to_dev(pf),
 			      pf->msix_entries[pf->oicr_idx].vector, pf);
 	}
 
@@ -1869,6 +3978,11 @@ static void ice_ena_ctrlq_interrupts(struct ice_hw *hw, u16 reg_idx)
 	       PFINT_MBX_CTL_CAUSE_ENA_M);
 	wr32(hw, PFINT_MBX_CTL, val);
 
+	/* This enables Sideband queue Interrupt causes */
+	val = ((reg_idx & PFINT_SB_CTL_MSIX_INDX_M) |
+	       PFINT_SB_CTL_CAUSE_ENA_M);
+	wr32(hw, PFINT_SB_CTL, val);
+
 	ice_flush(hw);
 }
 
@@ -1882,13 +3996,13 @@ static void ice_ena_ctrlq_interrupts(struct ice_hw *hw, u16 reg_idx)
 */
 static int ice_req_irq_msix_misc(struct ice_pf *pf)
 {
+	struct device *dev = ice_pf_to_dev(pf);
 	struct ice_hw *hw = &pf->hw;
 	int oicr_idx, err = 0;
 
 	if (!pf->int_name[0])
 		snprintf(pf->int_name, sizeof(pf->int_name) - 1, "%s-%s:misc",
-			 dev_driver_string(&pf->pdev->dev),
-			 dev_name(&pf->pdev->dev));
+			 dev_driver_string(dev), dev_name(dev));
 
 	/* Do not request IRQ but do enable OICR interrupt since settings are
 	 * lost during reset. Note that this function is called only during
@@ -1903,14 +4017,12 @@ static int ice_req_irq_msix_misc(struct ice_pf *pf)
 		return oicr_idx;
 
 	pf->num_avail_sw_msix -= 1;
-	pf->oicr_idx = oicr_idx;
+	pf->oicr_idx = (u16)oicr_idx;
 
-	err = devm_request_irq(&pf->pdev->dev,
-			       pf->msix_entries[pf->oicr_idx].vector,
+	err = devm_request_irq(dev, pf->msix_entries[pf->oicr_idx].vector,
 			       ice_misc_intr, 0, pf->int_name, pf);
 	if (err) {
-		dev_err(&pf->pdev->dev,
-			"devm_request_irq for %s failed: %d\n",
+		dev_err(dev, "devm_request_irq for %s failed: %d\n",
 			pf->int_name, err);
 		ice_free_res(pf->irq_tracker, 1, ICE_RES_MISC_VEC_ID);
 		pf->num_avail_sw_msix += 1;
@@ -1963,7 +4075,6 @@ static void ice_set_ops(struct net_device *netdev)
 		ice_set_ethtool_safe_mode_ops(netdev);
 		return;
 	}
-
 	netdev->netdev_ops = &ice_netdev_ops;
 	ice_set_ethtool_ops(netdev);
 }
 
@@ -1975,6 +4086,7 @@ static void ice_set_ops(struct net_device *netdev)
 static void ice_set_netdev_features(struct net_device *netdev)
 {
 	struct ice_pf *pf = ice_netdev_to_pf(netdev);
+	bool is_dvm_ena = ice_is_dvm_ena(&pf->hw);
 	netdev_features_t csumo_features;
 	netdev_features_t vlano_features;
 	netdev_features_t dflt_features;
@@ -1989,6 +4101,7 @@ static void ice_set_netdev_features(struct net_device *netdev)
 
 	dflt_features = NETIF_F_SG	|
 			NETIF_F_HIGHDMA	|
+			NETIF_F_NTUPLE	|
 			NETIF_F_RXHASH;
 
 	csumo_features = NETIF_F_RXCSUM	  |
@@ -2000,12 +4113,51 @@ static void ice_set_netdev_features(struct net_device *netdev)
 			 NETIF_F_HW_VLAN_CTAG_TX     |
 			 NETIF_F_HW_VLAN_CTAG_RX;
 
-	tso_features = NETIF_F_TSO;
-
+	/* Enable CTAG/STAG filtering by default in Double VLAN Mode (DVM) */
+	if (is_dvm_ena)
+		vlano_features |= NETIF_F_HW_VLAN_STAG_FILTER;
+
+	tso_features = NETIF_F_TSO			|
+		       NETIF_F_TSO_ECN			|
+		       NETIF_F_TSO6			|
+		       NETIF_F_GSO_GRE			|
+		       NETIF_F_GSO_UDP_TUNNEL		|
+#ifdef NETIF_F_GSO_GRE_CSUM
+		       NETIF_F_GSO_GRE_CSUM		|
+		       NETIF_F_GSO_UDP_TUNNEL_CSUM	|
+#endif
+#ifdef NETIF_F_GSO_PARTIAL
+		       NETIF_F_GSO_PARTIAL		|
+#endif
+#ifdef NETIF_F_GSO_IPXIP4
+		       NETIF_F_GSO_IPXIP4		|
+		       NETIF_F_GSO_IPXIP6		|
+#else
+#ifdef NETIF_F_GSO_IPIP
+		       NETIF_F_GSO_IPIP		|
+		       NETIF_F_GSO_SIT		|
+#endif
+#endif /* NETIF_F_GSO_IPXIP4 */
+#ifdef NETIF_F_GSO_UDP_L4
+		       NETIF_F_GSO_UDP_L4	|
+#endif /* NETIF_F_GSO_UDP_L4 */
+		       0;
+
+#ifndef NETIF_F_GSO_PARTIAL
+	tso_features ^= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+#else
+	netdev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM |
+					NETIF_F_GSO_GRE_CSUM;
+#endif
 	/* set features that user can change */
 	netdev->hw_features = dflt_features | csumo_features |
 			      vlano_features | tso_features;
 
+#ifdef HAVE_MPLS_FEATURES
+	/* add support for HW_CSUM on packets with MPLS header */
+	netdev->mpls_features = NETIF_F_HW_CSUM;
+#endif /* HAVE_MPLS_FEATURES */
+
 	/* enable features */
 	netdev->features |= netdev->hw_features;
 	/* encap and VLAN devices inherit default, csumo and tso features */
@@ -2013,6 +4165,26 @@ static void ice_set_netdev_features(struct net_device *netdev)
 			     tso_features;
 	netdev->vlan_features |= dflt_features | csumo_features |
 				 tso_features;
+
+#ifdef NETIF_F_HW_TC
+	netdev->hw_features |= NETIF_F_HW_TC;
+#endif /* NETIF_F_HW_TC */
+
+	/* advertise support but don't enable by default since only one type of
+	 * VLAN offload can be enabled at a time (i.e. CTAG or STAG). When one
+	 * type is turned on, the other has to be turned off. This is enforced
+	 * by the ice_fix_features() ndo callback.
+	 */
+	if (is_dvm_ena) {
+		netdev->hw_features |= NETIF_F_HW_VLAN_STAG_RX |
+				       NETIF_F_HW_VLAN_STAG_TX;
+	}
+
+#ifdef HAVE_NETDEV_SB_DEV
+	/* Enable macvlan offloads */
+	if (test_bit(ICE_FLAG_VMDQ_ENA, pf->flags))
+		netdev->hw_features |= NETIF_F_HW_L2FW_DOFFLOAD;
+#endif /* HAVE_NETDEV_SB_DEV */
 }
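The comment above defers CTAG/STAG exclusivity to ice_fix_features(), which is
added elsewhere in this series; the sketch below only illustrates the shape of
such an ndo_fix_features handler, so treat the body and name as assumptions:

	/* illustrative fix_features rule: offload only one VLAN TPID at a time */
	static netdev_features_t
	example_fix_features(struct net_device *netdev, netdev_features_t features)
	{
		/* if both CTAG and STAG Rx offload are requested, keep STAG
		 * and drop the CTAG request (one of many possible policies)
		 */
		if ((features & NETIF_F_HW_VLAN_STAG_RX) &&
		    (features & NETIF_F_HW_VLAN_CTAG_RX))
			features &= ~NETIF_F_HW_VLAN_CTAG_RX;

		return features;
	}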
 
 /**
@@ -2023,17 +4195,25 @@ static void ice_set_netdev_features(struct net_device *netdev)
 */
 static int ice_cfg_netdev(struct ice_vsi *vsi)
 {
-	struct ice_pf *pf = vsi->back;
 	struct ice_netdev_priv *np;
 	struct net_device *netdev;
 	u8 mac_addr[ETH_ALEN];
-	int err;
 
+#ifdef HAVE_NETDEV_SB_DEV
+	/* Inform the kernel beforehand about the max number of MACVLAN queues
+	 * supported.
+	 */
+	netdev = alloc_etherdev_mqs(sizeof(*np),
+				    ICE_MAX_MACVLANS + vsi->alloc_txq,
+				    ICE_MAX_MACVLANS + vsi->alloc_rxq);
+#else /* !HAVE_NETDEV_SB_DEV */
 	netdev = alloc_etherdev_mqs(sizeof(*np), vsi->alloc_txq,
 				    vsi->alloc_rxq);
+#endif /* !HAVE_NETDEV_SB_DEV */
 	if (!netdev)
 		return -ENOMEM;
 
+	set_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state);
 	vsi->netdev = netdev;
 	np = netdev_priv(netdev);
 	np->vsi = vsi;
@@ -2043,7 +4223,7 @@ static int ice_cfg_netdev(struct ice_vsi *vsi)
 	ice_set_ops(netdev);
 
 	if (vsi->type == ICE_VSI_PF) {
-		SET_NETDEV_DEV(netdev, &pf->pdev->dev);
+		SET_NETDEV_DEV(netdev, ice_pf_to_dev(vsi->back));
 		ether_addr_copy(mac_addr, vsi->port_info->mac.perm_addr);
 		ether_addr_copy(netdev->dev_addr, mac_addr);
 		ether_addr_copy(netdev->perm_addr, mac_addr);
@@ -2057,17 +4237,15 @@ static int ice_cfg_netdev(struct ice_vsi *vsi)
 	/* setup watchdog timeout value to be 5 second */
 	netdev->watchdog_timeo = 5 * HZ;
 
+#ifdef HAVE_NETDEVICE_MIN_MAX_MTU
+#ifdef HAVE_RHEL7_EXTENDED_MIN_MAX_MTU
+	netdev->extended->min_mtu = ETH_MIN_MTU;
+	netdev->extended->max_mtu = ICE_MAX_MTU;
+#else
 	netdev->min_mtu = ETH_MIN_MTU;
 	netdev->max_mtu = ICE_MAX_MTU;
-
-	err = register_netdev(vsi->netdev);
-	if (err)
-		return err;
-
-	netif_carrier_off(vsi->netdev);
-
-	/* make sure transmit queues start off as stopped */
-	netif_tx_stop_all_queues(vsi->netdev);
+#endif /* HAVE_RHEL7_EXTENDED_MIN_MAX_MTU */
+#endif /* HAVE_NETDEVICE_MIN_MAX_MTU */
 
 	return 0;
 }
 
@@ -2097,7 +4275,47 @@ void ice_fill_rss_lut(u8 *lut, u16 rss_table_size, u16 rss_size)
 static struct ice_vsi *
 ice_pf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
 {
-	return ice_vsi_setup(pf, pi, ICE_VSI_PF, ICE_INVAL_VFID);
+	return ice_vsi_setup(pf, pi, ICE_VSI_PF, ICE_INVAL_VFID, NULL, 0);
+}
+
+#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO
+static struct ice_vsi *
+ice_chnl_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
+		   struct ice_channel *ch)
+{
+	return ice_vsi_setup(pf, pi, ICE_VSI_CHNL, ICE_INVAL_VFID, ch, 0);
+}
+#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */
+
+#ifdef HAVE_NETDEV_SB_DEV
+/**
+ * ice_macvlan_vsi_setup - Set up a MACVLAN VSI
+ * @pf: board private structure
+ * @pi: pointer to the port_info instance
+ *
+ * Returns pointer to the successfully allocated VSI software struct
+ * on success, otherwise returns NULL on failure.
+ */
+static struct ice_vsi *
+ice_macvlan_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
+{
+	return ice_vsi_setup(pf, pi, ICE_VSI_OFFLOAD_MACVLAN, ICE_INVAL_VFID,
+			     NULL, 0);
+}
+#endif /* HAVE_NETDEV_SB_DEV */
+
+/**
+ * ice_ctrl_vsi_setup - Set up a control VSI
+ * @pf: board private structure
+ * @pi: pointer to the port_info instance
+ *
+ * Returns pointer to the successfully allocated VSI software struct
+ * on success, otherwise returns NULL on failure.
+ */
+static struct ice_vsi *
+ice_ctrl_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
+{
+	return ice_vsi_setup(pf, pi, ICE_VSI_CTRL, ICE_INVAL_VFID, NULL, 0);
 }
 
 /**
@@ -2111,50 +4329,38 @@ ice_pf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
 struct ice_vsi *
 ice_lb_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
 {
-	return ice_vsi_setup(pf, pi, ICE_VSI_LB, ICE_INVAL_VFID);
+	return ice_vsi_setup(pf, pi, ICE_VSI_LB, ICE_INVAL_VFID, NULL, 0);
 }
 
 /**
 * ice_vlan_rx_add_vid - Add a VLAN ID filter to HW offload
 * @netdev: network interface to be adjusted
- * @proto: unused protocol
+ * @proto: VLAN TPID
 * @vid: VLAN ID to be added
 *
 * net_device_ops implementation for adding VLAN IDs
 */
 static int
-ice_vlan_rx_add_vid(struct net_device *netdev, __always_unused __be16 proto,
-		    u16 vid)
+ice_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid)
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ice_vsi_vlan_ops *vlan_ops;
 	struct ice_vsi *vsi = np->vsi;
+	struct ice_vlan vlan;
 	int ret;
 
-	if (vid >= VLAN_N_VID) {
-		netdev_err(netdev, "VLAN id requested %d is out of range %d\n",
-			   vid, VLAN_N_VID);
-		return -EINVAL;
-	}
-
-	if (vsi->info.pvid)
-		return -EINVAL;
+	if (!vid)
+		return 0;
 
-	/* Enable VLAN pruning when VLAN 0 is added */
-	if (unlikely(!vid)) {
-		ret = ice_cfg_vlan_pruning(vsi, true, false);
-		if (ret)
-			return ret;
-	}
+	vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
 
-	/* Add all VLAN IDs including 0 to the switch filter. VLAN ID 0 is
-	 * needed to continue allowing all untagged packets since VLAN prune
-	 * list is applied to all packets by the switch
+	/* Add a switch rule for this VLAN ID so its corresponding VLAN tagged
+	 * packets aren't pruned by the device's internal switch on Rx
 	 */
-	ret = ice_vsi_add_vlan(vsi, vid);
-	if (!ret) {
-		vsi->vlan_ena = true;
-		set_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags);
-	}
+	vlan = ICE_VLAN(be16_to_cpu(proto), vid, 0, ICE_FWD_TO_VSI);
+	ret = vlan_ops->add_vlan(vsi, &vlan);
+	if (!ret)
+		set_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state);
 
 	return ret;
 }
@@ -2162,66 +4368,200 @@ ice_vlan_rx_add_vid(struct net_device *netdev, __always_unused __be16 proto,
 /**
 * ice_vlan_rx_kill_vid - Remove a VLAN ID filter from HW offload
 * @netdev: network interface to be adjusted
- * @proto: unused protocol
+ * @proto: VLAN TPID
 * @vid: VLAN ID to be removed
 *
 * net_device_ops implementation for removing VLAN IDs
 */
 static int
-ice_vlan_rx_kill_vid(struct net_device *netdev, __always_unused __be16 proto,
-		     u16 vid)
+ice_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid)
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ice_vsi_vlan_ops *vlan_ops;
 	struct ice_vsi *vsi = np->vsi;
+	struct ice_vlan vlan;
 	int ret;
 
-	if (vsi->info.pvid)
-		return -EINVAL;
+	if (!vid)
+		return 0;
+
+	vlan_ops = ice_get_compat_vsi_vlan_ops(vsi);
 
-	/* Make sure ice_vsi_kill_vlan is successful before updating VLAN
+	/* Make sure VLAN delete is successful before updating VLAN
 	 * information
 	 */
-	ret = ice_vsi_kill_vlan(vsi, vid);
+	vlan = ICE_VLAN(be16_to_cpu(proto), vid, 0, ICE_FWD_TO_VSI);
+	ret = vlan_ops->del_vlan(vsi, &vlan);
 	if (ret)
 		return ret;
 
-	/* Disable VLAN pruning when VLAN 0 is removed */
-	if (unlikely(!vid))
-		ret = ice_cfg_vlan_pruning(vsi, false, false);
-
-	vsi->vlan_ena = false;
-	set_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags);
-	return ret;
+	set_bit(ICE_VSI_VLAN_FLTR_CHANGED, vsi->state);
+	return 0;
 }
 
 /**
- * ice_setup_pf_sw - Setup the HW switch on startup or after reset
ice_pf_reset_stats - Reset all of the stats for the given PF * @pf: board private structure - * - * Returns 0 on success, negative value on failure */ -static int ice_setup_pf_sw(struct ice_pf *pf) +static void ice_pf_reset_stats(struct ice_pf *pf) { - struct ice_vsi *vsi; - int status = 0; + memset(&pf->stats, 0, sizeof(pf->stats)); + memset(&pf->stats_prev, 0, sizeof(pf->stats_prev)); + pf->stat_prev_loaded = false; + + pf->hw_csum_rx_error = 0; +#ifdef ICE_ADD_PROBES + pf->tcp_segs = 0; + pf->udp_segs = 0; + pf->tx_tcp_cso = 0; + pf->tx_udp_cso = 0; + pf->tx_sctp_cso = 0; + pf->tx_ip4_cso = 0; + pf->tx_l3_cso_err = 0; + pf->tx_l4_cso_err = 0; + pf->rx_tcp_cso = 0; + pf->rx_udp_cso = 0; + pf->rx_sctp_cso = 0; + pf->rx_ip4_cso = 0; + pf->rx_ip4_cso_err = 0; + pf->rx_tcp_cso_err = 0; + pf->rx_udp_cso_err = 0; + pf->rx_sctp_cso_err = 0; + pf->tx_q_vlano = 0; + pf->rx_q_vlano = 0; + pf->tx_ad_vlano = 0; + pf->rx_ad_vlano = 0; +#endif +} - if (ice_is_reset_in_progress(pf->state)) - return -EBUSY; - vsi = ice_pf_vsi_setup(pf, pf->hw.port_info); - if (!vsi) { - status = -ENOMEM; - goto unroll_vsi_setup; - } +#ifdef HAVE_TC_INDIR_BLOCK +#ifdef HAVE_FLOW_BLOCK_API +/** + * ice_rep_indr_tc_block_unbind + * @cb_priv: indirection block private data + */ +static void ice_rep_indr_tc_block_unbind(void *cb_priv) +{ + struct ice_indr_block_priv *indr_priv = cb_priv; - status = ice_cfg_netdev(vsi); - if (status) { - status = -ENODEV; - goto unroll_vsi_setup; - } + list_del(&indr_priv->list); + devm_kfree(&indr_priv->netdev->dev, indr_priv); +} +#endif /* HAVE_FLOW_BLOCK_API */ - /* registering the NAPI handler requires both the queues and - * netdev to be created, which are done in ice_pf_vsi_setup() +/** + * ice_tc_indir_block_unregister - Unregister TC indirect block notifications + * @vsi: VSI struct which has the netdev + */ +static void ice_tc_indir_block_unregister(struct ice_vsi *vsi) +{ + struct ice_netdev_priv *np = netdev_priv(vsi->netdev); +#ifndef HAVE_TC_FLOW_INDIR_DEV + /* clean indirect TC block notifications */ + unregister_netdevice_notifier(&np->netdevice_nb); + ice_indr_clean_block_privs(np); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && ((defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) && defined(HAVE_FLOW_BLOCK_API)) || defined(HAVE_FLOW_INDIR_BLOCK_QDISC)) + flow_indr_dev_unregister(ice_indr_setup_tc_cb, np, + ice_rep_indr_tc_block_unbind); +#else + flow_indr_dev_unregister(ice_indr_setup_tc_cb, np, + ice_indr_setup_block_cb); +#endif /* HAVE_TC_FLOW_INDIR_DEV */ +} + +/** + * ice_tc_indir_block_remove - clean indirect TC block notifications + * @pf: PF structure + */ +static void ice_tc_indir_block_remove(struct ice_pf *pf) +{ + struct ice_vsi *pf_vsi = ice_get_main_vsi(pf); + + if (!pf_vsi) + return; + + ice_tc_indir_block_unregister(pf_vsi); +} + +/** + * ice_tc_indir_block_register - Register TC indirect block notifications + * @vsi: VSI struct which has the netdev + * + * Returns 0 on success, negative value on failure + */ +static int ice_tc_indir_block_register(struct ice_vsi *vsi) +{ + struct ice_netdev_priv *np; + + if (!vsi || !vsi->netdev) + return -EINVAL; + + np = netdev_priv(vsi->netdev); + + INIT_LIST_HEAD(&np->tc_indr_block_priv_list); +#ifndef HAVE_TC_FLOW_INDIR_DEV + np->netdevice_nb.notifier_call = ice_netdevice_event; + return register_netdevice_notifier(&np->netdevice_nb); +#else + return flow_indr_dev_register(ice_indr_setup_tc_cb, np); +#endif /* HAVE_TC_FLOW_INDIR_DEV */ +} + +#endif /* HAVE_TC_INDIR_BLOCK */ + +/** + * ice_setup_pf_sw - Setup the HW switch on startup 
or after reset + * @pf: board private structure + * + * Returns 0 on success, negative value on failure + */ +static int ice_setup_pf_sw(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + bool dvm = ice_is_dvm_ena(&pf->hw); + struct ice_vsi *vsi; + int status = 0; + + if (ice_is_reset_in_progress(pf->state)) + return -EBUSY; + + + status = ice_aq_set_port_params(pf->hw.port_info, 0, false, false, dvm, + NULL); + if (status) + return -EIO; + + vsi = ice_pf_vsi_setup(pf, pf->hw.port_info); + if (!vsi) + return -ENOMEM; + + /* init channel list */ + INIT_LIST_HEAD(&vsi->ch_list); + + status = ice_cfg_netdev(vsi); + if (status) { + status = -ENODEV; + goto unroll_vsi_setup; + } + + /* netdev has to be configured before setting frame size */ + ice_vsi_cfg_frame_size(vsi); + +#ifdef HAVE_TC_INDIR_BLOCK + /* init indirect block notifications */ + status = ice_tc_indir_block_register(vsi); + if (status) { + dev_err(dev, "Failed to register netdev notifier\n"); + goto unroll_cfg_netdev; + } +#endif /* HAVE_TC_INDIR_BLOCK */ + + /* Setup DCB netlink interface */ + ice_dcbnl_setup(vsi); + + /* registering the NAPI handler requires both the queues and + * netdev to be created, which are done in ice_pf_vsi_setup() * and ice_cfg_netdev() respectively */ ice_napi_add(vsi); @@ -2230,26 +4570,30 @@ static int ice_setup_pf_sw(struct ice_pf *pf) if (status) goto unroll_napi_add; + status = ice_set_cpu_rx_rmap(vsi); + if (status) { + dev_err(dev, "Failed to set CPU Rx map VSI %d error %d\n", + vsi->vsi_num, status); + status = -EINVAL; + goto unroll_napi_add; + } + return status; unroll_napi_add: - if (vsi) { - ice_napi_del(vsi); - if (vsi->netdev) { - if (vsi->netdev->reg_state == NETREG_REGISTERED) - unregister_netdev(vsi->netdev); - free_netdev(vsi->netdev); - vsi->netdev = NULL; - } + ice_napi_del(vsi); +#ifdef HAVE_TC_INDIR_BLOCK + ice_tc_indir_block_unregister(vsi); +unroll_cfg_netdev: +#endif /* HAVE_TC_INDIR_BLOCK */ + if (vsi->netdev) { + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + free_netdev(vsi->netdev); + vsi->netdev = NULL; } - unroll_vsi_setup: - if (vsi) { - ice_vsi_free_q_vectors(vsi); - ice_vsi_delete(vsi); - ice_vsi_put_qs(vsi); - ice_vsi_clear(vsi); - } + ice_vsi_release(vsi); + return status; } @@ -2262,7 +4606,8 @@ static int ice_setup_pf_sw(struct ice_pf *pf) static u16 ice_get_avail_q_count(unsigned long *pf_qmap, struct mutex *lock, u16 size) { - u16 count = 0, bit; + unsigned long bit; + u16 count = 0; mutex_lock(lock); for_each_clear_bit(bit, pf_qmap, size) @@ -2300,6 +4645,7 @@ static void ice_deinit_pf(struct ice_pf *pf) { ice_service_task_stop(pf); mutex_destroy(&pf->sw_mutex); + mutex_destroy(&pf->tc_mutex); mutex_destroy(&pf->avail_q_mutex); if (pf->avail_txqs) { @@ -2321,21 +4667,50 @@ static void ice_set_pf_caps(struct ice_pf *pf) { struct ice_hw_func_caps *func_caps = &pf->hw.func_caps; + clear_bit(ICE_FLAG_VMDQ_ENA, pf->flags); + if (func_caps->common_cap.vmdq) + set_bit(ICE_FLAG_VMDQ_ENA, pf->flags); + clear_bit(ICE_FLAG_IWARP_ENA, pf->flags); + clear_bit(ICE_FLAG_PEER_ENA, pf->flags); + if (func_caps->common_cap.iwarp && IS_ENABLED(CONFIG_MFD_CORE)) { + set_bit(ICE_FLAG_IWARP_ENA, pf->flags); + set_bit(ICE_FLAG_PEER_ENA, pf->flags); + } clear_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); if (func_caps->common_cap.dcb) set_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); -#ifdef CONFIG_PCI_IOV clear_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags); + clear_bit(ICE_FLAG_ESWITCH_CAPABLE, pf->flags); if (func_caps->common_cap.sr_iov_1_1) { set_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags); + 
set_bit(ICE_FLAG_ESWITCH_CAPABLE, pf->flags); pf->num_vfs_supported = min_t(int, func_caps->num_allocd_vfs, ICE_MAX_VF_COUNT); } -#endif /* CONFIG_PCI_IOV */ clear_bit(ICE_FLAG_RSS_ENA, pf->flags); if (func_caps->common_cap.rss_table_size) set_bit(ICE_FLAG_RSS_ENA, pf->flags); + clear_bit(ICE_FLAG_FD_ENA, pf->flags); + if (func_caps->fd_fltr_guar > 0 || func_caps->fd_fltr_best_effort > 0) { + u16 unused; + + /* ctrl_vsi_idx will be set to a valid value when flow director + * is setup by ice_init_fdir + */ + pf->ctrl_vsi_idx = ICE_NO_VSI; + set_bit(ICE_FLAG_FD_ENA, pf->flags); + /* force guaranteed filter pool for PF */ + ice_alloc_fd_guar_item(&pf->hw, &unused, + func_caps->fd_fltr_guar); + /* force shared filter pool for PF */ + ice_alloc_fd_shrd_item(&pf->hw, &unused, + func_caps->fd_fltr_best_effort); + } + clear_bit(ICE_FLAG_PTP_ENA, pf->flags); + if (func_caps->common_cap.ieee_1588) + set_bit(ICE_FLAG_PTP_ENA, pf->flags); + pf->max_pf_txqs = func_caps->common_cap.num_txq; pf->max_pf_rxqs = func_caps->common_cap.num_rxq; } @@ -2349,12 +4724,19 @@ static int ice_init_pf(struct ice_pf *pf) ice_set_pf_caps(pf); mutex_init(&pf->sw_mutex); + mutex_init(&pf->tc_mutex); + + INIT_HLIST_HEAD(&pf->aq_wait_list); + spin_lock_init(&pf->aq_wait_lock); + init_waitqueue_head(&pf->aq_wait_queue); + + init_waitqueue_head(&pf->reset_wait_queue); /* setup service timer and periodic service task */ timer_setup(&pf->serv_tmr, ice_service_timer, 0); pf->serv_tmr_period = HZ; INIT_WORK(&pf->serv_task, ice_service_task); - clear_bit(__ICE_SERVICE_SCHED, pf->state); + clear_bit(ICE_SERVICE_SCHED, pf->state); mutex_init(&pf->avail_q_mutex); pf->avail_txqs = bitmap_zalloc(pf->max_pf_txqs, GFP_KERNEL); @@ -2363,94 +4745,173 @@ static int ice_init_pf(struct ice_pf *pf) pf->avail_rxqs = bitmap_zalloc(pf->max_pf_rxqs, GFP_KERNEL); if (!pf->avail_rxqs) { - devm_kfree(&pf->pdev->dev, pf->avail_txqs); + devm_kfree(ice_pf_to_dev(pf), pf->avail_txqs); pf->avail_txqs = NULL; return -ENOMEM; } + /* init tunnel list and lock */ + spin_lock_init(&pf->tnl_lock); + INIT_LIST_HEAD(&pf->tnl_list); + + return 0; +} + +static int ice_alloc_msix_entries(struct ice_pf *pf, u16 num_entries) +{ + u16 i; + + pf->msix_entries = devm_kcalloc(ice_pf_to_dev(pf), num_entries, + sizeof(*pf->msix_entries), GFP_KERNEL); + if (!pf->msix_entries) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) + pf->msix_entries[i].entry = i; + + return 0; +} + +static void ice_free_msix_entries(struct ice_pf *pf) +{ + devm_kfree(ice_pf_to_dev(pf), pf->msix_entries); + pf->msix_entries = NULL; +} +
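The divisor-driven fall-back implemented by ice_ena_msix_range() below is easier to see in isolation. The following standalone sketch is illustrative only, not part of this patch: the MIN/OICR/AEQ constants are invented stand-ins for the driver's ICE_MIN_LAN_MSIX, ICE_OICR_MSIX, ICE_MIN_RDMA_MSIX and ICE_RDMA_NUM_AEQ_MSIX, and the eswitch/MACVLAN reservations of attempts [0]-[2] and the final bare-minimum attempt are omitted. Assuming every enable attempt fails, it shows how doubling adjusted_msix_divisor shrinks the LAN and RDMA budgets toward their floors:

#include <stdio.h>

#define MIN_LAN_MSIX 1	/* stand-in for ICE_MIN_LAN_MSIX */
#define OICR_MSIX 1	/* stand-in for ICE_OICR_MSIX */
#define MIN_RDMA_MSIX 2	/* stand-in for ICE_MIN_RDMA_MSIX */
#define RDMA_AEQ_MSIX 4	/* stand-in for ICE_RDMA_NUM_AEQ_MSIX */

/* vectors requested by one attempt, given the current divisor */
static int attempt_budget(int num_cpus, int divisor, int rdma_ena)
{
	int base = num_cpus / divisor;
	int lan = base > MIN_LAN_MSIX ? base : MIN_LAN_MSIX;
	int needed = lan + OICR_MSIX;

	if (rdma_ena)
		needed += base > MIN_RDMA_MSIX ? base + RDMA_AEQ_MSIX
					       : MIN_RDMA_MSIX;
	return needed;
}

int main(void)
{
	int divisor = 1;

	/* model repeated failure: the divisor only starts doubling once
	 * the feature-trimming attempts (0 and 1) are out of the way
	 */
	for (int attempt = 0; attempt <= 4; attempt++) {
		printf("attempt %d: request %d vectors\n",
		       attempt, attempt_budget(16, divisor, 1));
		if (attempt > 1)
			divisor *= 2;
	}
	return 0;
}

With 16 online CPUs this prints a 37-vector request for attempts 0-2, then 21 and 13 as the divisor reaches 2 and 4, mirroring the halving described in attempts [3] and [4].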
 /** - * ice_ena_msix_range - Request a range of MSIX vectors from the OS + * ice_ena_msix_range - request a range of MSI-X vectors from the OS * @pf: board private structure * - * compute the number of MSIX vectors required (v_budget) and request from - * the OS. Return the number of vectors reserved or negative on failure + * The driver first tries to enable best-case scenario MSI-X vectors. If that + * doesn't succeed, then a fall-back method is employed. + * + * The fall-back logic is described below with each [#] being an attempt at + * enabling a certain number of MSI-X. If any of the steps succeed, then return + * the number of MSI-X enabled from pci_enable_msix_exact(). If any of the + * attempts fail, then go to the next step. + * + * Attempt [0]: Enable the best-case scenario MSI-X vectors. + * + * Attempt [1]: Enable MSI-X vectors with eswitch support disabled + * + * Attempt [2]: Enable MSI-X vectors with MACVLAN support disabled, which + * reduces the request by the MSI-X vectors needed for MACVLAN. + * + * Attempt [3]: Enable MSI-X vectors with the number of pf->num_lan_msix reduced + * by a factor of 2 from the previous attempts (i.e. num_online_cpus() / 2). + * Also, with the number of pf->num_rdma_msix reduced by a factor of ~2 from the + * previous attempts (i.e. num_online_cpus() / 2 + ICE_RDMA_NUM_AEQ_MSIX). + * + * Attempt [4]: Same as attempt [3], except reduce both by a factor of 4. + * + * Attempt [5]: Enable the bare-minimum MSI-X vectors. + * + * Also, if the adjusted_base_msix ever hits the minimum required for LAN or + * RDMA, then just set the needed MSI-X for that feature to the minimum (similar + * to attempt [5]). + */ static int ice_ena_msix_range(struct ice_pf *pf) { - int v_left, v_actual, v_budget = 0; - int needed, err, i; + int err = -ENOSPC, num_cpus, attempt, adjusted_msix_divisor = 1, needed; + struct device *dev = ice_pf_to_dev(pf); - v_left = pf->hw.func_caps.common_cap.num_msix_vectors; + num_cpus = num_online_cpus(); - /* reserve one vector for miscellaneous handler */ - needed = 1; - if (v_left < needed) - goto no_hw_vecs_left_err; - v_budget += needed; - v_left -= needed; +#define ICE_MAX_ENABLE_MSIX_ATTEMPTS 5 + /* make multiple passes at enabling MSI-X vectors in case there aren't + * enough available for the best-case scenario + */ + for (attempt = 0; attempt <= ICE_MAX_ENABLE_MSIX_ATTEMPTS; attempt++) { + int adjusted_base_msix = num_cpus / adjusted_msix_divisor; - /* reserve vectors for LAN traffic */ - needed = min_t(int, num_online_cpus(), v_left); - if (v_left < needed) - goto no_hw_vecs_left_err; - pf->num_lan_msix = needed; - v_budget += needed; - v_left -= needed; + /* attempt to enable minimum MSI-X range */ + if (attempt == ICE_MAX_ENABLE_MSIX_ATTEMPTS) { + needed = ICE_MIN_MSIX; + pf->num_lan_msix = ICE_MIN_LAN_MSIX; - pf->msix_entries = devm_kcalloc(&pf->pdev->dev, v_budget, - sizeof(*pf->msix_entries), GFP_KERNEL); + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) { + needed += ICE_MIN_RDMA_MSIX; + pf->num_rdma_msix = ICE_MIN_RDMA_MSIX; + } + } else { + if (adjusted_base_msix > ICE_MIN_LAN_MSIX) + pf->num_lan_msix = adjusted_base_msix; + else + pf->num_lan_msix = ICE_MIN_LAN_MSIX; - if (!pf->msix_entries) { - err = -ENOMEM; - goto exit_err; - } + needed = pf->num_lan_msix + ICE_OICR_MSIX; - for (i = 0; i < v_budget; i++) - pf->msix_entries[i].entry = i; + if (attempt == 0 && + test_bit(ICE_FLAG_ESWITCH_CAPABLE, pf->flags)) { + needed += ICE_ESWITCH_MSIX; + } else if (attempt == 1) { + dev_warn(dev, "Not enough MSI-X for eswitch support, disabling feature\n"); + clear_bit(ICE_FLAG_ESWITCH_CAPABLE, pf->flags); + } +#ifdef HAVE_NETDEV_SB_DEV + + /* only reserve MACVLAN MSI-X on the first and second + * attempt + */ + if ((attempt == 0 || attempt == 1) && + test_bit(ICE_FLAG_VMDQ_ENA, pf->flags)) { + needed += ICE_MAX_MACVLANS * ICE_DFLT_VEC_VMDQ_VSI; + } else if (attempt == 2) { + dev_warn(dev, "Not enough MSI-X for hardware MACVLAN support, disabling feature.\n"); + clear_bit(ICE_FLAG_VMDQ_ENA, pf->flags); + } +#endif /* HAVE_NETDEV_SB_DEV */ - /* actually reserve the vectors */ - v_actual = pci_enable_msix_range(pf->pdev, pf->msix_entries, - ICE_MIN_MSIX, v_budget); + /* reserve vectors for RDMA peer driver */ + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags)) { + if (adjusted_base_msix > ICE_MIN_RDMA_MSIX) + pf->num_rdma_msix =
adjusted_base_msix + + ICE_RDMA_NUM_AEQ_MSIX; + else + pf->num_rdma_msix = ICE_MIN_RDMA_MSIX; - if (v_actual < 0) { - dev_err(&pf->pdev->dev, "unable to reserve MSI-X vectors\n"); - err = v_actual; - goto msix_err; - } + needed += pf->num_rdma_msix; + } + } - if (v_actual < v_budget) { - dev_warn(&pf->pdev->dev, - "not enough OS MSI-X vectors. requested = %d, obtained = %d\n", - v_budget, v_actual); -/* 2 vectors for LAN (traffic + OICR) */ -#define ICE_MIN_LAN_VECS 2 + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) + needed += ICE_FDIR_MSIX; - if (v_actual < ICE_MIN_LAN_VECS) { - /* error if we can't get minimum vectors */ - pci_disable_msix(pf->pdev); - err = -ERANGE; - goto msix_err; + err = ice_alloc_msix_entries(pf, needed); + if (err) + goto err_out; + + dev_dbg(dev, "attempting to enable %d MSI-X vectors\n", needed); + err = pci_enable_msix_exact(pf->pdev, pf->msix_entries, needed); + if (err < 0) { + ice_free_msix_entries(pf); + dev_notice(dev, "Couldn't get %d MSI-X vectors due to OS, Platform, and/or PCI-function limitations. Reducing request and retrying.", + needed); + + /* MACVLAN support already disabled and we still failed + * to enable MSI-X, so make another attempt at enabling + * MSI-X by reducing the needed amount + */ + if (attempt > 1) + adjusted_msix_divisor *= 2; } else { - pf->num_lan_msix = ICE_MIN_LAN_VECS; - } - } + if (pf->num_lan_msix != num_cpus) + dev_notice(dev, "Enabled %d MSI-X vectors for LAN traffic.\n", + pf->num_lan_msix); - return v_actual; + if (test_bit(ICE_FLAG_IWARP_ENA, pf->flags) && + pf->num_rdma_msix != (num_cpus + ICE_RDMA_NUM_AEQ_MSIX)) + dev_notice(dev, "Enabled %d MSI-X vectors for RDMA.\n", + pf->num_rdma_msix); -msix_err: - devm_kfree(&pf->pdev->dev, pf->msix_entries); - goto exit_err; + return needed; + } + } -no_hw_vecs_left_err: - dev_err(&pf->pdev->dev, - "not enough device MSI-X vectors. requested = %d, available = %d\n", - needed, v_left); - err = -ERANGE; -exit_err: +err_out: + dev_err(dev, "failed to enable MSI-X vectors\n"); pf->num_lan_msix = 0; + pf->num_rdma_msix = 0; return err; } @@ -2461,8 +4922,7 @@ static int ice_ena_msix_range(struct ice_pf *pf) static void ice_dis_msix(struct ice_pf *pf) { pci_disable_msix(pf->pdev); - devm_kfree(&pf->pdev->dev, pf->msix_entries); - pf->msix_entries = NULL; + ice_free_msix_entries(pf); } /** @@ -2474,7 +4934,7 @@ static void ice_clear_interrupt_scheme(struct ice_pf *pf) ice_dis_msix(pf); if (pf->irq_tracker) { - devm_kfree(&pf->pdev->dev, pf->irq_tracker); + devm_kfree(ice_pf_to_dev(pf), pf->irq_tracker); pf->irq_tracker = NULL; } } @@ -2494,21 +4954,142 @@ static int ice_init_interrupt_scheme(struct ice_pf *pf) /* set up vector assignment tracking */ pf->irq_tracker = - devm_kzalloc(&pf->pdev->dev, sizeof(*pf->irq_tracker) + - (sizeof(u16) * vectors), GFP_KERNEL); + devm_kzalloc(ice_pf_to_dev(pf), + struct_size(pf->irq_tracker, list, vectors), + GFP_KERNEL); if (!pf->irq_tracker) { ice_dis_msix(pf); return -ENOMEM; } /* populate SW interrupts pool with number of OS granted IRQs. */ - pf->num_avail_sw_msix = vectors; - pf->irq_tracker->num_entries = vectors; + pf->num_avail_sw_msix = (u16)vectors; + pf->irq_tracker->num_entries = (u16)vectors; pf->irq_tracker->end = pf->irq_tracker->num_entries; return 0; } +/** + * ice_is_wol_supported - check if WoL is supported + * @hw: pointer to hardware info + * + * Check if WoL is supported based on the HW configuration. 
+ * Returns true if NVM supports and enables WoL for this port, false otherwise + */ +bool ice_is_wol_supported(struct ice_hw *hw) +{ + u16 wol_ctrl; + + /* A bit set to 1 in the NVM Software Reserved Word 2 (WoL control + * word) indicates WoL is not supported on the corresponding PF ID. + */ + if (ice_read_sr_word(hw, ICE_SR_NVM_WOL_CFG, &wol_ctrl)) + return false; + + return !(BIT(hw->port_info->lport) & wol_ctrl); +} + +/** + * ice_vsi_recfg_qs - Change the number of queues on a VSI + * @vsi: VSI being changed + * @new_rx: new number of Rx queues + * @new_tx: new number of Tx queues + * + * Only change the number of queues if new_tx or new_rx is non-zero. + * + * Returns 0 on success. + */ +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx) +{ + struct ice_pf *pf = vsi->back; + int err = 0, timeout = 50; + + if (!new_rx && !new_tx) + return -EINVAL; + + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) { + timeout--; + if (!timeout) + return -EBUSY; + usleep_range(1000, 2000); + } + + if (new_tx) + vsi->req_txq = (u16)new_tx; + if (new_rx) + vsi->req_rxq = (u16)new_rx; + + /* set for the next time the netdev is started */ + if (!netif_running(vsi->netdev)) { + ice_vsi_rebuild(vsi, false); + dev_dbg(ice_pf_to_dev(pf), "Link is down, queue count change happens when link is brought up\n"); + goto done; + } + + ice_vsi_close(vsi); + ice_vsi_rebuild(vsi, false); + ice_pf_dcb_recfg(pf); + ice_vsi_open(vsi); +done: + clear_bit(ICE_CFG_BUSY, pf->state); + return err; +} + +/** + * ice_set_safe_mode_vlan_cfg - configure PF VSI to allow all VLANs in safe mode + * @pf: PF to configure + * + * No VLAN offloads/filtering are advertised in safe mode so make sure the PF + * VSI can still Tx/Rx VLAN tagged packets. + */ +static void ice_set_safe_mode_vlan_cfg(struct ice_pf *pf) +{ + struct ice_vsi *vsi = ice_get_main_vsi(pf); + struct ice_vsi_ctx *ctxt; + enum ice_status status; + struct ice_hw *hw; + + if (!vsi) + return; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return; + + hw = &pf->hw; + ctxt->info = vsi->info; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID | + ICE_AQ_VSI_PROP_SECURITY_VALID | + ICE_AQ_VSI_PROP_SW_VALID); + + /* disable VLAN anti-spoof */ + ctxt->info.sec_flags &= ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); + + /* disable VLAN pruning and keep all other settings */ + ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + + /* allow all VLANs on Tx and don't strip on Rx */ + ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL | + ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING; + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to update VSI for safe mode VLANs, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } else { + vsi->info.sec_flags = ctxt->info.sec_flags; + vsi->info.sw_flags2 = ctxt->info.sw_flags2; + vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags; + } + + kfree(ctxt); +} + /** * ice_log_pkg_init - log result of DDP package load * @hw: pointer to hardware info @@ -2517,9 +5098,10 @@ static int ice_init_interrupt_scheme(struct ice_pf *pf) static void ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) { - struct ice_pf *pf = (struct ice_pf *)hw->back; - struct device *dev = &pf->pdev->dev; + struct ice_pf *pf = hw->back; + struct device *dev; + dev = ice_pf_to_dev(pf); switch (*status) { case ICE_SUCCESS: /* The package download AdminQ command returned
success because @@ -2533,16 +5115,14 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) !memcmp(hw->pkg_name, hw->active_pkg_name, sizeof(hw->pkg_name))) { if (hw->pkg_dwnld_status == ICE_AQ_RC_EEXIST) - dev_info(dev, - "DDP package already present on device: %s version %d.%d.%d.%d\n", + dev_info(dev, "DDP package already present on device: %s version %d.%d.%d.%d\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, hw->active_pkg_ver.update, hw->active_pkg_ver.draft); else - dev_info(dev, - "The DDP package was successfully loaded: %s version %d.%d.%d.%d\n", + dev_info(dev, "The DDP package was successfully loaded: %s version %d.%d.%d.%d\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2550,8 +5130,7 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) hw->active_pkg_ver.draft); } else if (hw->active_pkg_ver.major != ICE_PKG_SUPP_VER_MAJ || hw->active_pkg_ver.minor != ICE_PKG_SUPP_VER_MNR) { - dev_err(dev, - "The device has a DDP package that is not supported by the driver. The device has package '%s' version %d.%d.x.x. The driver requires version %d.%d.x.x. Entering Safe Mode.\n", + dev_err(dev, "The device has a DDP package that is not supported by the driver. The device has package '%s' version %d.%d.x.x. The driver requires version %d.%d.x.x. Entering Safe Mode.\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2559,8 +5138,7 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) *status = ICE_ERR_NOT_SUPPORTED; } else if (hw->active_pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->active_pkg_ver.minor == ICE_PKG_SUPP_VER_MNR) { - dev_info(dev, - "The driver could not load the DDP package file because a compatible DDP package is already present on the device. The device has package '%s' version %d.%d.%d.%d. The package file found by the driver: '%s' version %d.%d.%d.%d.\n", + dev_info(dev, "The driver could not load the DDP package file because a compatible DDP package is already present on the device. The device has package '%s' version %d.%d.%d.%d. The package file found by the driver: '%s' version %d.%d.%d.%d.\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2572,54 +5150,51 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) hw->pkg_ver.update, hw->pkg_ver.draft); } else { - dev_err(dev, - "An unknown error occurred when loading the DDP package, please reboot the system. If the problem persists, update the NVM. Entering Safe Mode.\n"); + dev_err(dev, "An unknown error occurred when loading the DDP package, please reboot the system. If the problem persists, update the NVM. Entering Safe Mode.\n"); *status = ICE_ERR_NOT_SUPPORTED; } break; + case ICE_ERR_FW_DDP_MISMATCH: + dev_err(dev, "The firmware loaded on the device is not compatible with the DDP package. Please update the device's NVM. Entering safe mode.\n"); + break; case ICE_ERR_BUF_TOO_SHORT: - /* fall-through */ case ICE_ERR_CFG: - dev_err(dev, - "The DDP package file is invalid. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file is invalid. Entering Safe Mode.\n"); break; case ICE_ERR_NOT_SUPPORTED: /* Package File version not supported */ if (hw->pkg_ver.major > ICE_PKG_SUPP_VER_MAJ || (hw->pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->pkg_ver.minor > ICE_PKG_SUPP_VER_MNR)) - dev_err(dev, - "The DDP package file version is higher than the driver supports. Please use an updated driver. 
Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file version is higher than the driver supports. Please use an updated driver. Entering Safe Mode.\n"); else if (hw->pkg_ver.major < ICE_PKG_SUPP_VER_MAJ || (hw->pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->pkg_ver.minor < ICE_PKG_SUPP_VER_MNR)) - dev_err(dev, - "The DDP package file version is lower than the driver supports. The driver requires version %d.%d.x.x. Please use an updated DDP Package file. Entering Safe Mode.\n", + dev_err(dev, "The DDP package file version is lower than the driver supports. The driver requires version %d.%d.x.x. Please use an updated DDP Package file. Entering Safe Mode.\n", ICE_PKG_SUPP_VER_MAJ, ICE_PKG_SUPP_VER_MNR); break; case ICE_ERR_AQ_ERROR: - switch (hw->adminq.sq_last_status) { + switch (hw->pkg_dwnld_status) { case ICE_AQ_RC_ENOSEC: case ICE_AQ_RC_EBADSIG: - dev_err(dev, - "The DDP package could not be loaded because its signature is not valid. Please use a valid DDP Package. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package could not be loaded because its signature is not valid. Please use a valid DDP Package. Entering Safe Mode.\n"); return; case ICE_AQ_RC_ESVN: - dev_err(dev, - "The DDP Package could not be loaded because its security revision is too low. Please use an updated DDP Package. Entering Safe Mode.\n"); + dev_err(dev, "The DDP Package could not be loaded because its security revision is too low. Please use an updated DDP Package. Entering Safe Mode.\n"); return; case ICE_AQ_RC_EBADMAN: case ICE_AQ_RC_EBADBUF: - dev_err(dev, - "An error occurred on the device while loading the DDP package. The device will be reset.\n"); + dev_err(dev, "An error occurred on the device while loading the DDP package. The device will be reset.\n"); + /* poll for reset to complete */ + if (ice_check_reset(hw)) + dev_err(dev, "Error resetting device. Please reload the driver\n"); return; default: break; } /* fall-through */ default: - dev_err(dev, - "An unknown error (%d) occurred when loading the DDP package. Entering Safe Mode.\n", + dev_err(dev, "An unknown error (%d) occurred when loading the DDP package. Entering Safe Mode.\n", *status); break; } @@ -2637,7 +5212,7 @@ static void ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) { enum ice_status status = ICE_ERR_PARAM; - struct device *dev = &pf->pdev->dev; + struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; /* Load DDP Package */ @@ -2650,8 +5225,7 @@ ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) status = ice_init_pkg(hw, hw->pkg_copy, hw->pkg_size); ice_log_pkg_init(hw, &status); } else { - dev_err(dev, - "The DDP package file failed to load. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file failed to load. Entering Safe Mode.\n"); } if (status) { @@ -2666,6 +5240,83 @@ ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) set_bit(ICE_FLAG_ADV_FEATURES, pf->flags); } +/** + * ice_prepare_for_safe_mode - Disable advanced features + * @pf: board private structure + * + * If package download failed during reset, then the driver clears the + * ICE_FLAG_ADV_FEATURES PF flag bit, and the device is officially in safe + * mode. So, all advanced features have to be disabled.
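+ * + * Returns 0 on success, negative error code on failure.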
+ */ +static int ice_prepare_for_safe_mode(struct ice_pf *pf) +{ + struct ice_vsi *pf_vsi; + u16 val; + int err; + + /* Device not in Safe Mode, so bail out here */ + if (!ice_is_safe_mode(pf)) + return 0; + + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) + return -EINVAL; + + /* only one queue pair in safe mode */ + pf_vsi->req_txq = 1; + pf_vsi->req_rxq = 1; + + /* remove RSS configuration */ + ice_rem_vsi_rss_list(&pf_vsi->back->hw, pf_vsi->idx); + + /* if the PF VSI was flow director enabled, disable it + * in the VSI context as we won't be doing flow director + * in safe mode. Not doing this causes the add VSI in + * ice_rebuild to fail. + */ + val = le16_to_cpu(pf_vsi->info.fd_options); + val &= ~(ICE_AQ_VSI_FD_ENABLE | ICE_AQ_VSI_FD_PROG_ENABLE); + pf_vsi->info.fd_options = cpu_to_le16(val); + if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) + ice_free_vfs(pf); + +#ifdef HAVE_NETDEV_SB_DEV + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) { + int v; + + ice_for_each_vsi(pf, v) { + struct ice_vsi *vsi = pf->vsi[v]; + + if (vsi && vsi->type == ICE_VSI_OFFLOAD_MACVLAN) + ice_deinit_macvlan(vsi); + } + } +#endif /* HAVE_NETDEV_SB_DEV */ + + ice_set_safe_mode_vlan_cfg(pf); + + /* Need to update netdev features, netdev ops and ethtool ops + * for safe mode, so free the PF netdev and setup a new one + */ + unregister_netdev(pf_vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, pf_vsi->state); + free_netdev(pf_vsi->netdev); + clear_bit(ICE_VSI_NETDEV_ALLOCD, pf_vsi->state); + pf_vsi->netdev = NULL; + + ice_set_safe_mode_caps(&pf->hw); + ice_set_pf_caps(pf); + err = ice_cfg_netdev(pf_vsi); + if (err) { + dev_err(ice_pf_to_dev(pf), "could not allocate netdev, err %d\n", + err); + return err; + } + + return 0; +} + + /** * ice_verify_cacheline_size - verify driver's assumption of 64 Byte cache lines * @pf: pointer to the PF structure @@ -2677,8 +5328,7 @@ ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) static void ice_verify_cacheline_size(struct ice_pf *pf) { if (rd32(&pf->hw, GLPCI_CNF2) & GLPCI_CNF2_CACHELINE_SIZE_M) - dev_warn(&pf->pdev->dev, - "%d Byte cache line assumption is invalid, driver may have Tx timeouts!\n", + dev_warn(ice_pf_to_dev(pf), "%d Byte cache line assumption is invalid, driver may have Tx timeouts!\n", ICE_CACHE_LINE_BYTES); } @@ -2696,61 +5346,152 @@ static enum ice_status ice_send_version(struct ice_pf *pf) dv.minor_ver = DRV_VERSION_MINOR; dv.build_ver = DRV_VERSION_BUILD; dv.subbuild_ver = 0; - strscpy((char *)dv.driver_string, DRV_VERSION, + strscpy((char *)dv.driver_string, DRV_VERSION DRV_VERSION_EXTRA, sizeof(dv.driver_string)); return ice_aq_send_driver_ver(&pf->hw, &dv, NULL); } /** - * ice_get_opt_fw_name - return optional firmware file name or NULL - * @pf: pointer to the PF instance + * ice_init_acl - Initializes the ACL block + * @pf: ptr to PF device + * + * returns 0 on success, negative on error */ -static char *ice_get_opt_fw_name(struct ice_pf *pf) +int ice_init_acl(struct ice_pf *pf) { - /* Optional firmware name same as default with additional dash - * followed by a EUI-64 identifier (PCIe Device Serial Number) + struct ice_acl_tbl_params params; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + int divider; + u16 scen_id; + + /* Creates a single ACL table that consist of src_ip(4 byte), + * dest_ip(4 byte), src_port(2 byte) and dst_port(2 byte) for a total + * of 12 bytes (96 bits), hence 120 bit wide keys, i.e. 3 TCAM slices. 
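+ * (120 bits / 3 slices = 40 bits, i.e. ICE_AQC_ACL_KEY_WIDTH_BYTES is 5 bytes + * per slice; the 96 bits of match data are rounded up to whole slices.)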
+ * If the given hardware card contains less than 8 PFs (ports) then + * each PF will have its own TCAM slices. For 8 PFs, a given slice will + * be shared by 2 different PFs. */ - struct pci_dev *pdev = pf->pdev; - char *opt_fw_filename = NULL; - u32 dword; - u8 dsn[8]; - int pos; + if (hw->dev_caps.num_funcs < 8) + divider = ICE_ACL_ENTIRE_SLICE; + else + divider = ICE_ACL_HALF_SLICE; - /* Determine the name of the optional file using the DSN (two - * dwords following the start of the DSN Capability). - */ - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DSN); - if (pos) { - opt_fw_filename = kzalloc(NAME_MAX, GFP_KERNEL); - if (!opt_fw_filename) - return NULL; + memset(¶ms, 0, sizeof(params)); + params.width = ICE_AQC_ACL_KEY_WIDTH_BYTES * 3; + params.depth = ICE_AQC_ACL_TCAM_DEPTH / divider; + params.entry_act_pairs = 1; + params.concurr = false; - pci_read_config_dword(pdev, pos + 4, &dword); - put_unaligned_le32(dword, &dsn[0]); - pci_read_config_dword(pdev, pos + 8, &dword); - put_unaligned_le32(dword, &dsn[4]); - snprintf(opt_fw_filename, NAME_MAX, - "%sice-%02x%02x%02x%02x%02x%02x%02x%02x.pkg", - ICE_DDP_PKG_PATH, - dsn[7], dsn[6], dsn[5], dsn[4], - dsn[3], dsn[2], dsn[1], dsn[0]); - } - return opt_fw_filename; + status = ice_acl_create_tbl(hw, ¶ms); + if (status) + return ice_status_to_errno(status); + + return ice_status_to_errno(ice_acl_create_scen(hw, params.width, + params.depth, &scen_id)); } /** - * ice_request_fw - Device initialization routine + * ice_deinit_acl - Unroll the initialization of the ACL block + * @pf: ptr to PF device + */ +static void ice_deinit_acl(struct ice_pf *pf) +{ + ice_acl_destroy_tbl(&pf->hw); +} + +/** + * ice_init_fdir - Initialize flow director VSI and configuration * @pf: pointer to the PF instance + * + * returns 0 on success, negative on error */ -static void ice_request_fw(struct ice_pf *pf) +static int ice_init_fdir(struct ice_pf *pf) { - char *opt_fw_filename = ice_get_opt_fw_name(pf); - const struct firmware *firmware = NULL; - struct device *dev = &pf->pdev->dev; - int err = 0; + struct device *dev = ice_pf_to_dev(pf); + struct ice_vsi *ctrl_vsi; + int err; - /* optional device-specific DDP (if present) overrides the default DDP + /* Side Band Flow Director needs to have a control VSI. + * Allocate it and store it in the PF. + */ + ctrl_vsi = ice_ctrl_vsi_setup(pf, pf->hw.port_info); + if (!ctrl_vsi) { + dev_dbg(dev, "could not create control VSI\n"); + return -ENOMEM; + } + + err = ice_vsi_open_ctrl(ctrl_vsi); + if (err) { + dev_dbg(dev, "could not open control VSI\n"); + goto err_vsi_open; + } + + mutex_init(&pf->hw.fdir_fltr_lock); + + err = ice_fdir_create_dflt_rules(pf); + if (err) + goto err_fdir_rule; + + return 0; + +err_fdir_rule: + ice_fdir_release_flows(&pf->hw); + ice_vsi_close(ctrl_vsi); +err_vsi_open: + ice_vsi_release(ctrl_vsi); + if (pf->ctrl_vsi_idx != ICE_NO_VSI) { + pf->vsi[pf->ctrl_vsi_idx] = NULL; + pf->ctrl_vsi_idx = ICE_NO_VSI; + } + return err; +} + +/** + * ice_get_opt_fw_name - return optional firmware file name or NULL + * @pf: pointer to the PF instance + */ +static char *ice_get_opt_fw_name(struct ice_pf *pf) +{ + /* Optional firmware name same as default with additional dash + * followed by a EUI-64 identifier (PCIe Device Serial Number) + */ + struct pci_dev *pdev = pf->pdev; + char *opt_fw_filename; + u64 dsn; + + /* Determine the name of the optional file using the DSN (two + * dwords following the start of the DSN Capability). 
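+ * For example, a DSN of 0x1122334455667788 selects + * "ice-1122334455667788.pkg" under ICE_DDP_PKG_PATH.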
+ */ + dsn = pci_get_dsn(pdev); + if (!dsn) + return NULL; + + opt_fw_filename = kzalloc(NAME_MAX, GFP_KERNEL); + if (!opt_fw_filename) + return NULL; + + snprintf(opt_fw_filename, NAME_MAX, "%sice-%016llx.pkg", + ICE_DDP_PKG_PATH, dsn); + + memcpy(pf->dcf.dsn, &dsn, sizeof(pf->dcf.dsn)); + + return opt_fw_filename; +} + +/** + * ice_request_fw - Device initialization routine + * @pf: pointer to the PF instance + */ +static void ice_request_fw(struct ice_pf *pf) +{ + char *opt_fw_filename = ice_get_opt_fw_name(pf); + const struct firmware *firmware = NULL; + struct device *dev = ice_pf_to_dev(pf); + int err = 0; + + /* optional device-specific DDP (if present) overrides the default DDP * package file. kernel logs a debug message if the file doesn't exist, * and warning messages for other errors. */ @@ -2771,8 +5512,7 @@ static void ice_request_fw(struct ice_pf *pf) dflt_pkg_load: err = request_firmware(&firmware, ICE_DDP_PKG_FILE, dev); if (err) { - dev_err(dev, - "The DDP package file was not found or could not be read. Entering Safe Mode\n"); + dev_err(dev, "The DDP package file was not found or could not be read. Entering Safe Mode\n"); return; } @@ -2781,6 +5521,240 @@ static void ice_request_fw(struct ice_pf *pf) ice_load_pkg(firmware, pf); release_firmware(firmware); } +/** + * ice_verify_eeprom - make sure EEPROM is good to use + * @pf: board private structure + */ +static void ice_verify_eeprom(struct ice_pf *pf) +{ + int err; + + err = ice_nvm_validate_checksum(&pf->hw); + if (err) { + set_bit(ICE_BAD_EEPROM, pf->state); + dev_err(ice_pf_to_dev(pf), "Bad EEPROM checksum detected, err %d, please update your NVM.\n", + err); + } else { + clear_bit(ICE_BAD_EEPROM, pf->state); + } +} + +/** + * ice_print_wake_reason - show the wake-up cause in the log + * @pf: pointer to the PF struct + */ +static void ice_print_wake_reason(struct ice_pf *pf) +{ + u32 wus = pf->wakeup_reason; + const char *wake_str; + + /* if no wake event, nothing to print */ + if (!wus) + return; + + if (wus & PFPM_WUS_LNKC_M) + wake_str = "Link\n"; + else if (wus & PFPM_WUS_MAG_M) + wake_str = "Magic Packet\n"; + else if (wus & PFPM_WUS_MNG_M) + wake_str = "Management\n"; + else if (wus & PFPM_WUS_FW_RST_WK_M) + wake_str = "Firmware Reset\n"; + else + wake_str = "Unknown\n"; + + dev_info(ice_pf_to_dev(pf), "Wake reason: %s", wake_str); +}
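The decode just above is first-match-wins: when several PFPM_WUS bits are set at once, only the highest-priority cause is logged. A minimal standalone model of that priority, illustrative only — the bit positions are invented stand-ins for the PFPM_WUS_*_M masks, whose real values this patch does not show:

#include <stdio.h>

/* stand-ins for PFPM_WUS_LNKC_M, _MAG_M, _MNG_M, _FW_RST_WK_M */
#define WUS_LNKC	(1u << 0)
#define WUS_MAG		(1u << 1)
#define WUS_MNG		(1u << 3)
#define WUS_FW_RST	(1u << 31)

static const char *wake_reason(unsigned int wus)
{
	/* same first-match-wins ordering as ice_print_wake_reason() */
	if (wus & WUS_LNKC)
		return "Link";
	if (wus & WUS_MAG)
		return "Magic Packet";
	if (wus & WUS_MNG)
		return "Management";
	if (wus & WUS_FW_RST)
		return "Firmware Reset";
	return "Unknown";
}

int main(void)
{
	/* both magic-packet and management bits set: "Magic Packet" wins */
	printf("Wake reason: %s\n", wake_reason(WUS_MAG | WUS_MNG));
	return 0;
}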
"en" : "dis", + ice_stat_str(ret), + ice_aq_str(pf->hw.adminq.sq_last_status)); +} + +/* + * ice_register_netdev - register netdev and devlink port + * @pf: pointer to the PF struct + */ +static int ice_register_netdev(struct ice_pf *pf) +{ + struct ice_vsi *vsi; + int err = 0; + vsi = ice_get_main_vsi(pf); + if (!vsi || !vsi->netdev) + return -EIO; + + err = register_netdev(vsi->netdev); + if (err) + goto err_register_netdev; + + set_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); + netif_carrier_off(vsi->netdev); + netif_tx_stop_all_queues(vsi->netdev); +#if IS_ENABLED(CONFIG_NET_DEVLINK) + err = ice_devlink_create_pf_port(pf); + if (err) + goto err_devlink_create; + + devlink_port_type_eth_set(&pf->devlink_port, vsi->netdev); +#endif /* CONFIG_NET_DEVLINK */ + + return 0; +#if IS_ENABLED(CONFIG_NET_DEVLINK) +err_devlink_create: + unregister_netdev(vsi->netdev); + clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); +#endif /* CONFIG_NET_DEVLINK */ +err_register_netdev: + free_netdev(vsi->netdev); + vsi->netdev = NULL; + clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); + return err; +} + + +/** + * ice_pf_fwlog_is_input_valid - validate user input level/events + * @pf: pointer to the PF struct + * @user_input: input parameters to validate + */ +static bool +ice_pf_fwlog_is_input_valid(struct ice_pf *pf, + struct ice_fwlog_user_input *user_input) +{ + unsigned long events = user_input->events; + u8 log_level = user_input->log_level; + + if (log_level >= ICE_FWLOG_LEVEL_INVALID) { + dev_err(ice_pf_to_dev(pf), "Invalid FW log level %u, all level(s) >= %u are invalid\n", + log_level, ICE_FWLOG_LEVEL_INVALID); + return false; + } + + if (events >= BIT(ICE_AQC_FW_LOG_ID_MAX)) { + dev_err(ice_pf_to_dev(pf), "Invalid FW log events 0x%lx, all FW log event bits >= 0x%lx are invalid\n", + events, BIT(ICE_AQC_FW_LOG_ID_MAX)); + return false; + } + + + return true; +} + + +/** + * ice_pf_fwlog_populate_cfg - populate FW log configuration + * @cfg: configuration to populate + * @user_input: input parameters to validate + * + * For each set event, set the @cfg's log_level to the @log_level. For all + * cleared events set ICE_FWLOG_LEVEL_NONE. + */ +static void +ice_pf_fwlog_populate_cfg(struct ice_fwlog_cfg *cfg, + struct ice_fwlog_user_input *user_input) +{ + u16 module_id; + +#define ICE_FWLOG_DFLT_LOG_RESOLUTION 10 + cfg->log_resolution = ICE_FWLOG_DFLT_LOG_RESOLUTION; + cfg->options = ICE_FWLOG_OPTION_ARQ_ENA; + + for (module_id = 0; module_id < ICE_AQC_FW_LOG_ID_MAX; module_id++) { + struct ice_fwlog_module_entry *entry = + &cfg->module_entries[module_id]; + + entry->module_id = module_id; + if (test_bit(module_id, &user_input->events)) + entry->log_level = user_input->log_level; + else + entry->log_level = ICE_FWLOG_LEVEL_NONE; + } +} + +/** + * ice_pf_fwlog_set - set FW logging configuration + * @pf: pointer to the PF struct + * @user_input: input parameters to validate + * + * After calling this function the @events at the specified @log_level will be + * enabled. However, the PF must still register for FW logging events if it has + * not yet done so. Otherwise no events will be received. + */ +static int +ice_pf_fwlog_set(struct ice_pf *pf, struct ice_fwlog_user_input *user_input) +{ + struct ice_fwlog_cfg cfg = {}; + enum ice_status status; + + if (!ice_pf_fwlog_is_input_valid(pf, user_input)) + return -EINVAL; + + ice_pf_fwlog_populate_cfg(&cfg, user_input); + + status = ice_fwlog_set(&pf->hw, &cfg); + if (status) { + dev_err(ice_pf_to_dev(pf), "Failed to set FW log configuration. 
fwlog_events: 0x%lx fwlog_level: %u\n", + user_input->events, user_input->log_level); + return ice_status_to_errno(status); + } + + return 0; +} + +/** + * ice_pf_fwlog_init - initialize FW logging configuration on device init + * @pf: pointer to the PF struct + * @user_input: input parameters to validate + * + * This should always be called before ice_init_hw() as this will enable FW + * logging @events at the specified @log_level to be enabled/registered as soon + * as the driver can communicate with FW. + */ +static int +ice_pf_fwlog_init(struct ice_pf *pf, struct ice_fwlog_user_input *user_input) +{ + struct ice_fwlog_cfg cfg = {}; + enum ice_status status; + + if (!ice_pf_fwlog_is_input_valid(pf, user_input)) + return -EINVAL; + + ice_pf_fwlog_populate_cfg(&cfg, user_input); + if (hweight_long(user_input->events)) + cfg.options |= ICE_FWLOG_OPTION_REGISTER_ON_INIT; + + status = ice_fwlog_init(&pf->hw, &cfg); + if (status) { + dev_err(ice_pf_to_dev(pf), "Failed to init FW log configuration. fwlog_events: 0x%lx fwlog_level: %u\n", + user_input->events, user_input->log_level); + return ice_status_to_errno(status); + } + + return 0; +} + /** * ice_probe - Device initialization routine * @pdev: PCI device information struct @@ -2791,23 +5765,27 @@ static void ice_request_fw(struct ice_pf *pf) static int ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) { + struct ice_fwlog_user_input user_input = { 0 }; struct device *dev = &pdev->dev; struct ice_pf *pf; struct ice_hw *hw; int err; - /* this driver uses devres, see Documentation/driver-api/driver-model/devres.rst */ + /* this driver uses devres, see + * Documentation/driver-api/driver-model/devres.rst + */ err = pcim_enable_device(pdev); if (err) return err; - err = pcim_iomap_regions(pdev, BIT(ICE_BAR0), pci_name(pdev)); + err = pcim_iomap_regions(pdev, BIT(ICE_BAR0), dev_driver_string(dev)); if (err) { dev_err(dev, "BAR0 I/O map error %d\n", err); return err; } - pf = devm_kzalloc(dev, sizeof(*pf), GFP_KERNEL); + + pf = ice_allocate_pf(dev); if (!pf) return -ENOMEM; @@ -2825,12 +5803,14 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) pf->pdev = pdev; pci_set_drvdata(pdev, pf); - set_bit(__ICE_DOWN, pf->state); + set_bit(ICE_DOWN, pf->state); /* Disable service task until DOWN bit is cleared */ - set_bit(__ICE_SERVICE_DIS, pf->state); + set_bit(ICE_SERVICE_DIS, pf->state); hw = &pf->hw; hw->hw_addr = pcim_iomap_table(pdev)[ICE_BAR0]; + pci_save_state(pdev); + hw->back = pf; hw->vendor_id = pdev->vendor; hw->device_id = pdev->device; @@ -2843,11 +5823,36 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) pf->msg_enable = netif_msg_init(debug, ICE_DFLT_NETIF_M); + err = ice_devlink_register(pf); + if (err) { + dev_err(dev, "ice_devlink_register failed: %d\n", err); + goto err_exit_unroll; + } + #ifndef CONFIG_DYNAMIC_DEBUG if (debug < -1) hw->debug_mask = debug; #endif + /* check if device FW is in recovery mode */ + if (ice_get_fw_mode(hw) == ICE_FW_MODE_REC) { + err = ice_probe_recovery_mode(pf); + if (err) + goto err_rec_mode; + + return 0; + } + + ice_debugfs_pf_init(pf); + + user_input.log_level = fwlog_level; + user_input.events = fwlog_events; + err = ice_pf_fwlog_init(pf, &user_input); + if (err) { + dev_err(dev, "failed to initialize FW logging: %d\n", err); + err = -EIO; + goto err_exit_unroll; + } + err = ice_init_hw(hw); if (err) { dev_err(dev, "ice_init_hw failed: %d\n", err); @@ -2855,10 +5860,7 @@ ice_probe(struct pci_dev *pdev, const
struct pci_device_id __always_unused *ent) goto err_exit_unroll; } - dev_info(dev, "firmware %d.%d.%d api %d.%d.%d nvm %s build 0x%08x\n", - hw->fw_maj_ver, hw->fw_min_ver, hw->fw_patch, - hw->api_maj_ver, hw->api_min_ver, hw->api_patch, - ice_nvm_version_str(hw), hw->fw_build); + ice_init_feature_support(pf); ice_request_fw(pf); @@ -2867,8 +5869,6 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) * true */ if (ice_is_safe_mode(pf)) { - dev_err(dev, - "Package download failed. Advanced features disabled - Device now in Safe Mode\n"); /* we already got function/device capabilities but these don't * reflect what the driver needs to do in safe mode. Instead of * adding conditional logic everywhere to ignore these @@ -2877,11 +5877,36 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) ice_set_safe_mode_caps(hw); } + ice_set_umac_shared(hw); + + err = ice_init_pf(pf); if (err) { dev_err(dev, "ice_init_pf failed: %d\n", err); goto err_init_pf_unroll; } + ice_verify_eeprom(pf); +#ifndef ETHTOOL_GFECPARAM + switch (pf->hw.port_info->phy.link_info.fec_info) { + case (ICE_AQ_LINK_25G_RS_528_FEC_EN | ICE_AQ_LINK_25G_KR_FEC_EN): + case (ICE_AQ_LINK_25G_RS_544_FEC_EN | ICE_AQ_LINK_25G_KR_FEC_EN): + set_bit(ICE_FLAG_RS_FEC, pf->flags); + set_bit(ICE_FLAG_BASE_R_FEC, pf->flags); + break; + case ICE_AQ_LINK_25G_RS_528_FEC_EN: + case ICE_AQ_LINK_25G_RS_544_FEC_EN: + set_bit(ICE_FLAG_RS_FEC, pf->flags); + break; + case ICE_AQ_LINK_25G_KR_FEC_EN: + set_bit(ICE_FLAG_BASE_R_FEC, pf->flags); + break; + default: + break; + } +#endif /* ETHTOOL_GFECPARAM */ + + ice_devlink_init_regions(pf); + ice_devlink_params_publish(pf); pf->num_alloc_vsi = hw->func_caps.guar_num_vsi; if (!pf->num_alloc_vsi) { @@ -2903,9 +5928,6 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) goto err_init_vsi_unroll; } - /* Driver is mostly up */ - clear_bit(__ICE_DOWN, pf->state); - /* In case of MSIX we are going to setup the misc vector right here * to handle admin queue events etc. In case of legacy and MSI * the misc functionality and queue processing is combined in @@ -2917,6 +5939,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) goto err_init_interrupt_unroll; } + /* create switch struct for the switch element created by FW on boot */ pf->first_sw = devm_kzalloc(dev, sizeof(*pf->first_sw), GFP_KERNEL); if (!pf->first_sw) { @@ -2936,19 +5959,21 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) err = ice_setup_pf_sw(pf); if (err) { - dev_err(dev, "probe failed due to setup PF switch:%d\n", err); + dev_err(dev, "probe failed due to setup PF switch: %d\n", err); goto err_alloc_sw_unroll; } - clear_bit(__ICE_SERVICE_DIS, pf->state); + clear_bit(ICE_SERVICE_DIS, pf->state); + + /* by default, set the PF level feature flags to be ON */ + set_bit(ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA, pf->flags); /* tell the firmware we are up */ err = ice_send_version(pf); if (err) { - dev_err(dev, - "probe failed sending driver version %s. error: %d\n", + dev_err(dev, "probe failed sending driver version %s. 
error: %d\n", ice_drv_ver, err); - goto err_alloc_sw_unroll; + goto err_send_version_unroll; } /* since everything is good, start the service timer */ @@ -2957,17 +5982,100 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) err = ice_init_link_events(pf->hw.port_info); if (err) { dev_err(dev, "ice_init_link_events failed: %d\n", err); - goto err_alloc_sw_unroll; + goto err_send_version_unroll; + } + + /* not a fatal error if this fails */ + err = ice_init_nvm_phy_type(pf->hw.port_info); + if (err) + dev_err(dev, "ice_init_nvm_phy_type failed: %d\n", err); + + ice_init_link_dflt_override(pf->hw.port_info); + + /* not a fatal error if this fails */ + err = ice_update_link_info(pf->hw.port_info); + if (err) + dev_err(dev, "ice_update_link_info failed: %d\n", err); + + ice_check_module_power(pf, pf->hw.port_info->phy.link_info.link_cfg_err); + + /* if media available, initialize PHY settings */ + if (pf->hw.port_info->phy.link_info.link_info & + ICE_AQ_MEDIA_AVAILABLE) { + /* not a fatal error if this fails */ + err = ice_init_phy_user_cfg(pf->hw.port_info); + if (err) + dev_err(dev, "ice_init_phy_user_cfg failed: %d\n", err); + + if (!test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags)) { + struct ice_vsi *vsi = ice_get_main_vsi(pf); + + if (vsi) + ice_configure_phy(vsi); + } + } else { + set_bit(ICE_FLAG_NO_MEDIA, pf->flags); } ice_verify_cacheline_size(pf); - /* If no DDP driven features have to be setup, return here */ - if (ice_is_safe_mode(pf)) - return 0; + /* Save wakeup reason register for later use */ + pf->wakeup_reason = rd32(hw, PFPM_WUS); + + /* check for a power management event */ + ice_print_wake_reason(pf); + + /* clear wake status, all bits */ + wr32(hw, PFPM_WUS, U32_MAX); + + /* Disable WoL at init, wait for user to enable */ + device_set_wakeup_enable(dev, false); + + /* init peers only if supported */ + if (ice_is_peer_ena(pf)) { + pf->peers = devm_kcalloc(dev, ICE_MAX_NUM_PEERS, + sizeof(*pf->peers), GFP_KERNEL); + if (!pf->peers) { + err = -ENOMEM; + goto err_init_peer_unroll; + } + + err = ice_init_peer_devices(pf); + if (err) { + dev_err(dev, "Failed to initialize peer_objs: 0x%x\n", + err); + err = -EIO; + goto err_init_peer_unroll; + } + } else { + dev_warn(dev, "RDMA is not supported on this device\n"); + } + + if (ice_is_safe_mode(pf)) { + ice_set_safe_mode_vlan_cfg(pf); + goto probe_done; + } /* initialize DDP driven features */ + if (test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + ice_ptp_init(pf); + + /* Note: Flow director init failure is non-fatal to load */ + if (ice_init_fdir(pf)) + dev_err(dev, "could not initialize flow director\n"); + + /* set DCF ACL enable flag as false by default */ + hw->dcf_caps &= ~DCF_ACL_CAP; + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) { + /* Note: ACL init failure is non-fatal to load */ + err = ice_init_acl(pf); + if (err) + dev_err(&pf->pdev->dev, + "Failed to initialize ACL: %d\n", err); + } + /* set DCF UDP tunnel enable flag as false by default */ + hw->dcf_caps &= ~DCF_UDP_TUNNEL_CAP; /* Note: DCB init failure is non-fatal to load */ if (ice_init_pf_dcb(pf, false)) { clear_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); @@ -2976,12 +6084,42 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) ice_cfg_lldp_mib_change(&pf->hw, true); } +#ifdef HAVE_NETDEV_UPPER_INFO + if (ice_init_lag(pf)) + dev_warn(dev, "Failed to init link aggregation support\n"); + +#endif /* HAVE_NETDEV_UPPER_INFO */ + /* print PCI link speed and width */ + pcie_print_link_status(pf->pdev); + +probe_done: 
+ err = ice_register_netdev(pf); + if (err) + goto err_netdev_reg; + + ice_config_health_events(pf, true); + + /* ready to go, so clear down state bit */ + clear_bit(ICE_DOWN, pf->state); + return 0; + /* Unwind non-managed device resources, etc. if something failed */ +err_netdev_reg: +err_init_peer_unroll: + if (ice_is_peer_ena(pf)) { + ice_for_each_peer(pf, NULL, ice_unroll_peer); + if (pf->peers) { + devm_kfree(dev, pf->peers); + pf->peers = NULL; + } + } +err_send_version_unroll: + ice_vsi_release_all(pf); err_alloc_sw_unroll: - set_bit(__ICE_SERVICE_DIS, pf->state); - set_bit(__ICE_DOWN, pf->state); - devm_kfree(&pf->pdev->dev, pf->first_sw); + set_bit(ICE_SERVICE_DIS, pf->state); + set_bit(ICE_DOWN, pf->state); + devm_kfree(dev, pf->first_sw); err_msix_misc_unroll: ice_free_irq_msix_misc(pf); err_init_interrupt_unroll: @@ -2990,52 +6128,412 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) devm_kfree(dev, pf->vsi); err_init_pf_unroll: ice_deinit_pf(pf); + ice_devlink_params_unpublish(pf); + ice_devlink_destroy_regions(pf); ice_deinit_hw(hw); err_exit_unroll: + ice_debugfs_pf_exit(pf); +err_rec_mode: + ice_devlink_unregister(pf); pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); return err; } + /** - * ice_remove - Device removal routine - * @pdev: PCI device information struct + * ice_set_wake - enable or disable Wake on LAN + * @pf: pointer to the PF struct + * + * Simple helper for WoL control */ -static void ice_remove(struct pci_dev *pdev) +static void ice_set_wake(struct ice_pf *pf) { - struct ice_pf *pf = pci_get_drvdata(pdev); - int i; + struct ice_hw *hw = &pf->hw; + bool wol = pf->wol_ena; - if (!pf) - return; + /* clear wake state, otherwise new wake events won't fire */ + wr32(hw, PFPM_WUS, U32_MAX); - for (i = 0; i < ICE_MAX_RESET_WAIT; i++) { - if (!ice_is_reset_in_progress(pf->state)) - break; - msleep(100); - } + /* enable / disable APM wake up, no RMW needed */ + wr32(hw, PFPM_APM, wol ? PFPM_APM_APME_M : 0); - set_bit(__ICE_DOWN, pf->state); - ice_service_task_stop(pf); + /* set magic packet filter enabled */ + wr32(hw, PFPM_WUFC, wol ? PFPM_WUFC_MAG_M : 0); +} - if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) - ice_free_vfs(pf); - ice_vsi_release_all(pf); - ice_free_irq_msix_misc(pf); - ice_for_each_vsi(pf, i) { +/** + * ice_setup_mc_magic_wake - setup device to wake on multicast magic packet + * @pf: pointer to the PF struct + * + * Issue firmware command to enable multicast magic wake, making + * sure that any locally administered address (LAA) is used for + * wake, and that PF reset doesn't undo the LAA. 
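+ * (ICE_AQC_MAN_MAC_WR_WOL_LAA_PFR_KEEP is the flag that makes the LAA + * survive a PF reset.)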
+ */ +static void ice_setup_mc_magic_wake(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 mac_addr[ETH_ALEN]; + struct ice_vsi *vsi; + u8 flags; + + if (!pf->wol_ena) + return; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + + /* Get current MAC address in case it's an LAA */ + if (vsi->netdev) + ether_addr_copy(mac_addr, vsi->netdev->dev_addr); + else + ether_addr_copy(mac_addr, vsi->port_info->mac.perm_addr); + + flags = ICE_AQC_MAN_MAC_WR_MC_MAG_EN | + ICE_AQC_MAN_MAC_UPDATE_LAA_WOL | + ICE_AQC_MAN_MAC_WR_WOL_LAA_PFR_KEEP; + + status = ice_aq_manage_mac_write(hw, mac_addr, flags, NULL); + if (status) + dev_err(dev, "Failed to enable Multicast Magic Packet wake, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); +} + +/** + * ice_remove - Device removal routine + * @pdev: PCI device information struct + */ +static void ice_remove(struct pci_dev *pdev) +{ + struct ice_pf *pf = pci_get_drvdata(pdev); + int i; + + if (!pf) + return; + + /* ICE_PREPPED_RECOVERY_MODE is set when the up and running + * driver transitions to recovery mode. If this is not set + * it means that the driver went into recovery mode on load. + * For the former case, go through the usual flow for module + * unload. For the latter, call ice_remove_recovery_mode + * and return. + */ + if (!test_bit(ICE_PREPPED_RECOVERY_MODE, pf->state) && + test_bit(ICE_RECOVERY_MODE, pf->state)) { + ice_remove_recovery_mode(pf); + return; + } + + for (i = 0; i < ICE_MAX_RESET_WAIT; i++) { + if (!ice_is_reset_in_progress(pf->state)) + break; + msleep(100); + } +#ifdef HAVE_TC_INDIR_BLOCK + /* clear indirect block notification before cleaning up of ADQ + * resources + */ + ice_tc_indir_block_remove(pf); +#endif /* HAVE_TC_INDIR_BLOCK */ + + + if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) { + set_bit(ICE_VF_RESETS_DISABLED, pf->state); + ice_free_vfs(pf); + } + + ice_service_task_stop(pf); + + ice_aq_cancel_waiting_tasks(pf); + + if (ice_is_peer_ena(pf)) { + enum ice_close_reason reason; + + reason = ICE_REASON_INTERFACE_DOWN; + ice_for_each_peer(pf, &reason, ice_peer_close); + } + set_bit(ICE_DOWN, pf->state); + + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) + ice_deinit_acl(pf); + mutex_destroy(&(&pf->hw)->fdir_fltr_lock); +#ifdef HAVE_NETDEV_UPPER_INFO + ice_deinit_lag(pf); +#endif /* HAVE_NETDEV_UPPER_INFO */ + if (test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + ice_ptp_release(pf); + if (!ice_is_safe_mode(pf)) + ice_remove_arfs(pf); + ice_setup_mc_magic_wake(pf); + ice_vsi_release_all(pf); + if (ice_is_peer_ena(pf)) { +#if IS_ENABLED(CONFIG_MFD_CORE) + ida_simple_remove(&ice_peer_index_ida, pf->peer_idx); +#endif + ice_for_each_peer(pf, NULL, ice_unreg_peer_obj); + devm_kfree(&pdev->dev, pf->peers); + } + ice_set_wake(pf); + ice_free_irq_msix_misc(pf); + ice_for_each_vsi(pf, i) { if (!pf->vsi[i]) continue; ice_vsi_free_q_vectors(pf->vsi[i]); } + ice_deinit_pf(pf); + ice_devlink_params_unpublish(pf); + ice_devlink_destroy_regions(pf); + if (ice_fwlog_unregister(&pf->hw)) + dev_dbg(&pdev->dev, "failed to unregister from FW logging\n"); ice_deinit_hw(&pf->hw); - ice_clear_interrupt_scheme(pf); + ice_devlink_unregister(pf); + ice_debugfs_pf_exit(pf); /* Issue a PFR as part of the prescribed driver unload flow. Do not * do it via ice_schedule_reset() since there is no need to rebuild * and the service task is already stopped. 
 */
	ice_reset(&pf->hw, ICE_RESET_PFR);
+	pci_wait_for_pending_transaction(pdev);
+	ice_clear_interrupt_scheme(pf);
 	pci_disable_pcie_error_reporting(pdev);
+	pci_disable_device(pdev);
+}
+
+/**
+ * ice_shutdown - PCI callback for shutting down device
+ * @pdev: PCI device information struct
+ */
+static void ice_shutdown(struct pci_dev *pdev)
+{
+	struct ice_pf *pf = pci_get_drvdata(pdev);
+
+	ice_remove(pdev);
+
+	if (system_state == SYSTEM_POWER_OFF) {
+		pci_wake_from_d3(pdev, pf->wol_ena);
+		pci_set_power_state(pdev, PCI_D3hot);
+	}
+}
+
+#ifdef CONFIG_PM
+/**
+ * ice_prepare_for_shutdown - prep for PCI shutdown
+ * @pf: board private structure
+ *
+ * Inform or close all dependent features in prep for PCI device shutdown
+ */
+static void ice_prepare_for_shutdown(struct ice_pf *pf)
+{
+	struct ice_hw *hw = &pf->hw;
+	u32 v;
+
+	/* Notify VFs of impending reset */
+	if (ice_check_sq_alive(hw, &hw->mailboxq))
+		ice_vc_notify_reset(pf);
+
+	dev_dbg(ice_pf_to_dev(pf), "Tearing down internal switch for shutdown\n");
+
+	/* disable the VSIs and their queues that are not already DOWN */
+	ice_pf_dis_all_vsi(pf, false);
+
+	ice_for_each_vsi(pf, v)
+		if (pf->vsi[v])
+			pf->vsi[v]->vsi_num = 0;
+
+	ice_shutdown_all_ctrlq(hw);
+}
+
+/**
+ * ice_reinit_interrupt_scheme - Reinitialize interrupt scheme
+ * @pf: board private structure to reinitialize
+ *
+ * This routine reinitializes the interrupt scheme that was cleared during
+ * the power management suspend callback.
+ *
+ * This should be called during resume to re-allocate the q_vectors
+ * and reacquire interrupts.
+ */
+static int ice_reinit_interrupt_scheme(struct ice_pf *pf)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	int ret, v;
+
+	/* Since we clear MSIX flag during suspend, we need to
+	 * set it back during resume...
+	 */
+
+	ret = ice_init_interrupt_scheme(pf);
+	if (ret) {
+		dev_err(dev, "Failed to re-initialize interrupt %d\n", ret);
+		return ret;
+	}
+
+	/* Remap vectors and rings, after successful re-init interrupts */
+	ice_for_each_vsi(pf, v) {
+		if (!pf->vsi[v])
+			continue;
+
+		ret = ice_vsi_alloc_q_vectors(pf->vsi[v]);
+		if (ret)
+			goto err_reinit;
+		ice_vsi_map_rings_to_vectors(pf->vsi[v]);
+	}
+
+	ret = ice_req_irq_msix_misc(pf);
+	if (ret) {
+		dev_err(dev, "Setting up misc vector failed after device suspend %d\n",
+			ret);
+		goto err_reinit;
+	}
+
+	return 0;
+
+err_reinit:
+	while (v--)
+		if (pf->vsi[v])
+			ice_vsi_free_q_vectors(pf->vsi[v]);
+
+	return ret;
+}
+
+/**
+ * ice_suspend - PM callback to quiesce the device and prepare for D3
+ * @dev: generic device information structure
+ *
+ * Power Management callback to quiesce the device and prepare
+ * for D3 transition.
+ */
+static int __maybe_unused ice_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct ice_pf *pf;
+	int disabled, v;
+
+	pf = pci_get_drvdata(pdev);
+
+	if (!ice_pf_state_is_nominal(pf)) {
+		dev_err(dev, "Device is not ready, no need to suspend it\n");
+		return -EBUSY;
+	}
+
+	/* Stop watchdog tasks until resume completion.
+	 * Even though it is most likely that the service task is
+	 * disabled if the device is suspended or down, the service task's
+	 * state is controlled by a different state bit, and we should
+	 * store and honor whatever state that bit is in at this point.
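+	 * (ice_service_task_stop() presumably returns whether the task was
+	 * already disabled, which is what lets the early-exit paths below
+	 * restart it only if it was actually running on entry.)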
+	 */
+	disabled = ice_service_task_stop(pf);
+
+	if (ice_is_peer_ena(pf)) {
+		enum ice_close_reason reason;
+
+		reason = ICE_REASON_INTERFACE_DOWN;
+		ice_for_each_peer(pf, &reason, ice_peer_close);
+	}
+
+	/* Already suspended? Then there is nothing to do */
+	if (test_and_set_bit(ICE_SUSPENDED, pf->state)) {
+		if (!disabled)
+			ice_service_task_restart(pf);
+		return 0;
+	}
+
+	if (test_bit(ICE_DOWN, pf->state) ||
+	    ice_is_reset_in_progress(pf->state)) {
+		dev_err(dev, "can't suspend device in reset or already down\n");
+		if (!disabled)
+			ice_service_task_restart(pf);
+		return 0;
+	}
+
+	ice_setup_mc_magic_wake(pf);
+
+	ice_prepare_for_shutdown(pf);
+
+	ice_set_wake(pf);
+
+	/* Free vectors, clear the interrupt scheme and release IRQs
+	 * for proper hibernation, especially with large number of CPUs.
+	 * Otherwise hibernation might fail when mapping all the vectors back
+	 * to CPU0.
+	 */
+	ice_free_irq_msix_misc(pf);
+	ice_for_each_vsi(pf, v) {
+		if (!pf->vsi[v])
+			continue;
+		ice_vsi_free_q_vectors(pf->vsi[v]);
+	}
+	ice_clear_interrupt_scheme(pf);
+
+	pci_save_state(pdev);
+	pci_wake_from_d3(pdev, pf->wol_ena);
+	pci_set_power_state(pdev, PCI_D3hot);
+	return 0;
+}
+
+/**
+ * ice_resume - PM callback for waking up from D3
+ * @dev: generic device information structure
+ */
+static int __maybe_unused ice_resume(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	enum ice_reset_req reset_type;
+	struct ice_pf *pf;
+	struct ice_hw *hw;
+	int ret;
+
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+
+	if (!pci_device_is_present(pdev))
+		return -ENODEV;
+
+	ret = pci_enable_device_mem(pdev);
+	if (ret) {
+		dev_err(dev, "Cannot enable device after suspend\n");
+		return ret;
+	}
+
+	pf = pci_get_drvdata(pdev);
+	hw = &pf->hw;
+
+	pf->wakeup_reason = rd32(hw, PFPM_WUS);
+	ice_print_wake_reason(pf);
+
+	/* We cleared the interrupt scheme when we suspended, so we need to
+	 * restore it now to resume device functionality.
+	 */
+	ret = ice_reinit_interrupt_scheme(pf);
+	if (ret)
+		dev_err(dev, "Cannot restore interrupt scheme: %d\n", ret);
+
+	ice_peer_refresh_msix(pf);
+
+	clear_bit(ICE_DOWN, pf->state);
+	/* Now perform PF reset and rebuild */
+	reset_type = ICE_RESET_PFR;
+	/* re-enable service task for reset, but allow reset to schedule it */
+	clear_bit(ICE_SERVICE_DIS, pf->state);
+
+	if (ice_schedule_reset(pf, reset_type))
+		dev_err(dev, "Reset during resume failed.\n");
+
+	clear_bit(ICE_SUSPENDED, pf->state);
+	ice_service_task_restart(pf);
+
+	/* Restart the periodic service timer */
+	mod_timer(&pf->serv_tmr, round_jiffies(jiffies + pf->serv_tmr_period));
+
+	return 0;
+}
+#endif /* CONFIG_PM */
 
 /**
  * ice_pci_err_detected - warning that PCI error has been detected
@@ -3046,7 +6544,7 @@ static void ice_remove(struct pci_dev *pdev)
  * is in progress. Allows the driver to gracefully prepare/handle PCI errors.
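 * A PCI_ERS_RESULT_DISCONNECT return reports the device as unrecoverable;
 * for any other result the PCI core typically proceeds to the slot_reset
 * callback once the link has been reset.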
*/ static pci_ers_result_t -ice_pci_err_detected(struct pci_dev *pdev, enum pci_channel_state err) +ice_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t err) { struct ice_pf *pf = pci_get_drvdata(pdev); @@ -3056,12 +6554,12 @@ ice_pci_err_detected(struct pci_dev *pdev, enum pci_channel_state err) return PCI_ERS_RESULT_DISCONNECT; } - if (!test_bit(__ICE_SUSPENDED, pf->state)) { + if (!test_bit(ICE_SUSPENDED, pf->state)) { ice_service_task_stop(pf); - if (!test_bit(__ICE_PREPARED_FOR_RESET, pf->state)) { - set_bit(__ICE_PFR_REQ, pf->state); - ice_prepare_for_reset(pf); + if (!test_bit(ICE_PREPARED_FOR_RESET, pf->state)) { + set_bit(ICE_PFR_REQ, pf->state); + ice_prepare_for_reset(pf, ICE_RESET_PFR); } } @@ -3084,8 +6582,7 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev) err = pci_enable_device_mem(pdev); if (err) { - dev_err(&pdev->dev, - "Cannot re-enable PCI device after reset, error %d\n", + dev_err(&pdev->dev, "Cannot re-enable PCI device after reset, error %d\n", err); result = PCI_ERS_RESULT_DISCONNECT; } else { @@ -3102,10 +6599,9 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev) result = PCI_ERS_RESULT_DISCONNECT; } - err = pci_cleanup_aer_uncorrect_error_status(pdev); + err = pci_aer_clear_nonfatal_status(pdev); if (err) - dev_dbg(&pdev->dev, - "pci_cleanup_aer_uncorrect_error_status failed, error %d\n", + dev_dbg(&pdev->dev, "pci_aer_clear_nonfatal_status() failed, error %d\n", err); /* non-fatal, continue */ @@ -3124,22 +6620,25 @@ static void ice_pci_err_resume(struct pci_dev *pdev) struct ice_pf *pf = pci_get_drvdata(pdev); if (!pf) { - dev_err(&pdev->dev, - "%s failed, device is unrecoverable\n", __func__); + dev_err(&pdev->dev, "%s failed, device is unrecoverable\n", + __func__); return; } - if (test_bit(__ICE_SUSPENDED, pf->state)) { + if (test_bit(ICE_SUSPENDED, pf->state)) { dev_dbg(&pdev->dev, "%s failed to resume normal operations!\n", __func__); return; } + ice_restore_all_vfs_msi_state(pdev); + ice_do_reset(pf, ICE_RESET_PFR); ice_service_task_restart(pf); mod_timer(&pf->serv_tmr, round_jiffies(jiffies + pf->serv_tmr_period)); } +#if defined(HAVE_PCI_ERROR_HANDLER_RESET_PREPARE) || defined(HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY) || defined(HAVE_RHEL7_PCI_RESET_NOTIFY) /** * ice_pci_err_reset_prepare - prepare device driver for PCI reset * @pdev: PCI device information struct @@ -3148,12 +6647,12 @@ static void ice_pci_err_reset_prepare(struct pci_dev *pdev) { struct ice_pf *pf = pci_get_drvdata(pdev); - if (!test_bit(__ICE_SUSPENDED, pf->state)) { + if (!test_bit(ICE_SUSPENDED, pf->state)) { ice_service_task_stop(pf); - if (!test_bit(__ICE_PREPARED_FOR_RESET, pf->state)) { - set_bit(__ICE_PFR_REQ, pf->state); - ice_prepare_for_reset(pf); + if (!test_bit(ICE_PREPARED_FOR_RESET, pf->state)) { + set_bit(ICE_PFR_REQ, pf->state); + ice_prepare_for_reset(pf, ICE_RESET_PFR); } } } @@ -3166,6 +6665,24 @@ static void ice_pci_err_reset_done(struct pci_dev *pdev) { ice_pci_err_resume(pdev); } +#endif /* HAVE_PCI_ERROR_HANDLER_RESET_PREPARE || HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY || HAVE_RHEL7_PCI_RESET_NOTIFY */ + +#if defined(HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY) || (defined(HAVE_RHEL7_PCI_RESET_NOTIFY) && defined(HAVE_RHEL7_PCI_DRIVER_RH)) +/** + * ice_pci_err_reset_notify - notify device driver of pci reset + * @pdev: PCI device information struct + * @prepare: whether or not to prepare for reset or reset is complete + * + * Called to perform PF reset when a PCI function level reset is triggered + */ +static void 
ice_pci_err_reset_notify(struct pci_dev *pdev, bool prepare) +{ + if (prepare) + ice_pci_err_reset_prepare(pdev); + else + ice_pci_err_reset_done(pdev); +} +#endif /* HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY || (HAVE_RHEL7_PCI_RESET_NOTIFY && HAVE_RHEL7_PCI_DRIVER_RH) */ /* ice_pci_tbl - PCI Device ID Table * @@ -3179,25 +6696,73 @@ static const struct pci_device_id ice_pci_tbl[] = { { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_BACKPLANE), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_QSFP), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_QSFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810_XXV_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_QSFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823C_SGMII), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_QSFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_SGMII), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_SGMII), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_1GBE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E823L_QSFP), 0 }, /* required last entry */ { 0, } }; MODULE_DEVICE_TABLE(pci, ice_pci_tbl); +static __maybe_unused SIMPLE_DEV_PM_OPS(ice_pm_ops, ice_suspend, ice_resume); + +#ifdef HAVE_CONST_STRUCT_PCI_ERROR_HANDLERS static const struct pci_error_handlers ice_pci_err_handler = { +#else +static struct pci_error_handlers ice_pci_err_handler = { +#endif /* HAVE_CONST_STRUCT_PCI_ERROR_HANDLERS */ .error_detected = ice_pci_err_detected, .slot_reset = ice_pci_err_slot_reset, +#ifdef HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY + .reset_notify = ice_pci_err_reset_notify, +#endif /* HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY */ +#ifdef HAVE_PCI_ERROR_HANDLER_RESET_PREPARE .reset_prepare = ice_pci_err_reset_prepare, .reset_done = ice_pci_err_reset_done, +#endif /* HAVE_PCI_ERROR_HANDLER_RESET_PREPARE */ .resume = ice_pci_err_resume }; +#ifdef HAVE_RHEL7_PCI_DRIVER_RH +static struct pci_driver_rh ice_driver_rh = { +#ifdef HAVE_RHEL7_PCI_RESET_NOTIFY + .reset_notify = ice_pci_err_reset_notify, +#endif /* HAVE_RHEL7_PCI_RESET_NOTIFY */ +}; +#endif /* HAVE_RHEL7_PCI_DRIVER_RH */ + static struct pci_driver ice_driver = { .name = KBUILD_MODNAME, .id_table = ice_pci_tbl, .probe = ice_probe, .remove = ice_remove, +#ifdef CONFIG_PM + .driver.pm = &ice_pm_ops, +#endif /* CONFIG_PM */ + .shutdown = ice_shutdown, .sriov_configure = ice_sriov_configure, +#ifdef HAVE_RHEL7_PCI_DRIVER_RH + .pci_driver_rh = &ice_driver_rh, +#endif /* HAVE_RHEL7_PCI_DRIVER_RH */ .err_handler = &ice_pci_err_handler }; @@ -3214,16 +6779,31 @@ static int __init ice_module_init(void) pr_info("%s - version %s\n", ice_driver_string, ice_drv_ver); pr_info("%s\n", ice_copyright); - ice_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, KBUILD_MODNAME); + + ice_wq = alloc_workqueue("%s", 0, 0, KBUILD_MODNAME); if (!ice_wq) { pr_err("Failed to create workqueue\n"); return 
-ENOMEM;
 	}
 
+#ifdef HAVE_RHEL7_PCI_DRIVER_RH
+	/* The size member must be initialized in the driver via a call to
+	 * set_pci_driver_rh_size before pci_register_driver is called
+	 */
+	set_pci_driver_rh_size(ice_driver_rh);
+
+#endif /* HAVE_RHEL7_PCI_DRIVER_RH */
+
+	ice_debugfs_init();
+
 	status = pci_register_driver(&ice_driver);
 	if (status) {
 		pr_err("failed to register PCI driver, err %d\n", status);
 		destroy_workqueue(ice_wq);
+		ice_debugfs_exit();
+#if IS_ENABLED(CONFIG_MFD_CORE)
+		ida_destroy(&ice_peer_index_ida);
+#endif
 	}
 
 	return status;
@@ -3240,6 +6820,13 @@ static void __exit ice_module_exit(void)
 {
 	pci_unregister_driver(&ice_driver);
 	destroy_workqueue(ice_wq);
+	ice_debugfs_exit();
+	/* Release all cached layers within the IDA tree associated with
+	 * the ice_peer_index_ida object
+	 */
+#if IS_ENABLED(CONFIG_MFD_CORE)
+	ida_destroy(&ice_peer_index_ida);
+#endif
 	pr_info("module unloaded\n");
 }
 module_exit(ice_module_exit);
@@ -3273,32 +6860,46 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi)
 		return 0;
 	}
 
-	if (test_bit(__ICE_DOWN, pf->state) ||
+	if (test_bit(ICE_DOWN, pf->state) ||
 	    ice_is_reset_in_progress(pf->state)) {
 		netdev_err(netdev, "can't set mac %pM. device not ready\n",
 			   mac);
 		return -EBUSY;
 	}
 
-	/* When we change the MAC address we also have to change the MAC address
-	 * based filter rules that were created previously for the old MAC
-	 * address. So first, we remove the old filter rule using ice_remove_mac
-	 * and then create a new filter rule using ice_add_mac via
-	 * ice_vsi_cfg_mac_fltr function call for both add and/or remove
-	 * filters.
-	 */
-	status = ice_vsi_cfg_mac_fltr(vsi, netdev->dev_addr, false);
-	if (status) {
-		err = -EADDRNOTAVAIL;
-		goto err_update_filters;
+#ifdef HAVE_TC_SETUP_CLSFLOWER
+	if (ice_chnl_dmac_fltr_cnt(pf)) {
+		netdev_err(netdev,
+			   "can't set mac %pM. Device has tc-flower filters, delete all of them and try again\n",
			   mac);
+		return -EAGAIN;
 	}
+#endif /* HAVE_TC_SETUP_CLSFLOWER */
 
-	status = ice_vsi_cfg_mac_fltr(vsi, mac, true);
-	if (status) {
+	/* Clean up old MAC filter. Not an error if old filter doesn't exist */
+	status = ice_fltr_remove_mac(vsi, netdev->dev_addr, ICE_FWD_TO_VSI);
+	if (status && status != ICE_ERR_DOES_NOT_EXIST) {
 		err = -EADDRNOTAVAIL;
 		goto err_update_filters;
 	}
 
+	/* Add filter for new MAC. If filter exists, return success */
+	status = ice_fltr_add_mac(vsi, mac, ICE_FWD_TO_VSI);
+	if (status == ICE_ERR_ALREADY_EXISTS) {
+		/* Although this MAC filter is already present in hardware it's
+		 * possible in some cases (e.g. bonding) that dev_addr was
+		 * modified outside of the driver and needs to be restored back
+		 * to this value.
+		 */
+		memcpy(netdev->dev_addr, mac, netdev->addr_len);
+		netdev_dbg(netdev, "filter for MAC %pM already exists\n", mac);
+		return 0;
+	}
+
+	/* error if the new filter addition failed */
+	if (status)
+		err = -EADDRNOTAVAIL;
+
 err_update_filters:
 	if (err) {
 		netdev_err(netdev, "can't set MAC %pM. filter update failed\n",
@@ -3315,8 +6916,8 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi)
 	flags = ICE_AQC_MAN_MAC_UPDATE_LAA_WOL;
 	status = ice_aq_manage_mac_write(hw, mac, flags, NULL);
 	if (status) {
-		netdev_err(netdev, "can't set MAC %pM. write to firmware failed error %d\n",
-			   mac, status);
+		netdev_err(netdev, "can't set MAC %pM. write to firmware failed error %s\n",
+			   mac, ice_stat_str(status));
 	}
 
 	return 0;
 }
@@ -3337,8 +6938,8 @@ static void ice_set_rx_mode(struct net_device *netdev)
 	 * ndo_set_rx_mode may be triggered even without a change in netdev
 	 * flags
 	 */
-	set_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags);
-	set_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags);
+	set_bit(ICE_VSI_UMAC_FLTR_CHANGED, vsi->state);
+	set_bit(ICE_VSI_MMAC_FLTR_CHANGED, vsi->state);
 	set_bit(ICE_FLAG_FLTR_SYNC, vsi->back->flags);
 
 	/* schedule our worker thread which will take care of
@@ -3347,6 +6948,50 @@ static void ice_set_rx_mode(struct net_device *netdev)
 	ice_service_task_schedule(vsi->back);
 }
 
+#ifdef HAVE_NDO_SET_TX_MAXRATE
+/**
+ * ice_set_tx_maxrate - NDO callback to set the maximum per-queue bitrate
+ * @netdev: network interface device structure
+ * @queue_index: Queue ID
+ * @maxrate: maximum bandwidth in Mbps
+ */
+static int
+ice_set_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate)
+{
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ice_vsi *vsi = np->vsi;
+	enum ice_status status;
+	u16 q_handle;
+	u8 tc;
+
+	/* Validate that the requested maxrate is within the permitted range;
+	 * maxrate is given in Mbps while the scheduler limits are programmed
+	 * in Kbps, hence the conversions by 1000 on both sides below.
+	 */
+	if (maxrate && (maxrate > (ICE_SCHED_MAX_BW / 1000))) {
+		netdev_err(netdev, "Invalid max rate %d specified for the queue %d\n",
+			   maxrate, queue_index);
+		return -EINVAL;
+	}
+
+	q_handle = vsi->tx_rings[queue_index]->q_handle;
+	tc = ice_dcb_get_tc(vsi, queue_index);
+
+	/* Set BW back to default when the user sets maxrate to 0 */
+	if (!maxrate)
+		status = ice_cfg_q_bw_dflt_lmt(vsi->port_info, vsi->idx, tc,
+					       q_handle, ICE_MAX_BW);
+	else
+		status = ice_cfg_q_bw_lmt(vsi->port_info, vsi->idx, tc,
+					  q_handle, ICE_MAX_BW, maxrate * 1000);
+	if (status) {
+		netdev_err(netdev, "Unable to set Tx max rate, error %s\n",
+			   ice_stat_str(status));
+		return -EIO;
+	}
+
+	return 0;
+}
+#endif /* HAVE_NDO_SET_TX_MAXRATE */
+
+#ifdef HAVE_NDO_FDB_ADD_EXTACK
 /**
  * ice_fdb_add - add an entry to the hardware database
  * @ndm: the input from the stack
@@ -3361,13 +7006,25 @@ static int
 ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[],
 	    struct net_device *dev, const unsigned char *addr, u16 vid,
 	    u16 flags, struct netlink_ext_ack __always_unused *extack)
+#elif defined(HAVE_NDO_FDB_ADD_VID)
+static int
+ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[],
+	    struct net_device *dev, const unsigned char *addr, u16 vid,
+	    u16 flags)
+#else
+static int
+ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[],
+	    struct net_device *dev, const unsigned char *addr, u16 flags)
+#endif /* HAVE_NDO_FDB_ADD_VID */
 {
 	int err;
 
+#ifdef HAVE_NDO_FDB_ADD_VID
 	if (vid) {
 		netdev_err(dev, "VLANs aren't supported yet for dev_uc|mc_add()\n");
 		return -EINVAL;
 	}
+#endif
 	if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
 		netdev_err(dev, "FDB only supports static addresses\n");
 		return -EINVAL;
@@ -3387,6 +7044,7 @@ ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[],
 	return err;
 }
 
+#ifdef HAVE_NDO_FDB_ADD_VID
 /**
  * ice_fdb_del - delete an entry from the hardware database
  * @ndm: the input from the stack
@@ -3399,6 +7057,11 @@ static int
 ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[],
 	    struct net_device *dev, const unsigned char *addr,
 	    __always_unused u16 vid)
+#else
+static int
+ice_fdb_del(struct ndmsg *ndm, __always_unused
struct nlattr *tb[], return err; } +#ifdef HAVE_NETDEV_SB_DEV /** - * ice_set_features - set the netdev feature flags - * @netdev: ptr to the netdev being adjusted - * @features: the feature set that the stack is suggesting + * ice_vsi_cfg_netdev_tc0 - Setup the netdev TC 0 configuration + * @vsi: the VSI being configured + * + * This function configures netdev parameters for traffic class 0 */ -static int -ice_set_features(struct net_device *netdev, netdev_features_t features) +int ice_vsi_cfg_netdev_tc0(struct ice_vsi *vsi) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - int ret = 0; + struct net_device *netdev = vsi->netdev; + int ret; - /* Don't set any netdev advanced features with device in Safe Mode */ - if (ice_is_safe_mode(vsi->back)) { - dev_err(&vsi->back->pdev->dev, - "Device is in Safe Mode - not enabling advanced netdev features\n"); + if (!netdev) + return -EINVAL; + + ret = netdev_set_num_tc(netdev, 1); + if (ret) { + netdev_err(netdev, "Error setting num TC\n"); return ret; } - /* Multiple features can be changed in one call so keep features in - * separate if/else statements to guarantee each feature is checked - */ - if (features & NETIF_F_RXHASH && !(netdev->features & NETIF_F_RXHASH)) - ret = ice_vsi_manage_rss_lut(vsi, true); - else if (!(features & NETIF_F_RXHASH) && - netdev->features & NETIF_F_RXHASH) - ret = ice_vsi_manage_rss_lut(vsi, false); - - if ((features & NETIF_F_HW_VLAN_CTAG_RX) && - !(netdev->features & NETIF_F_HW_VLAN_CTAG_RX)) - ret = ice_vsi_manage_vlan_stripping(vsi, true); - else if (!(features & NETIF_F_HW_VLAN_CTAG_RX) && - (netdev->features & NETIF_F_HW_VLAN_CTAG_RX)) - ret = ice_vsi_manage_vlan_stripping(vsi, false); - - if ((features & NETIF_F_HW_VLAN_CTAG_TX) && - !(netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) - ret = ice_vsi_manage_vlan_insertion(vsi); - else if (!(features & NETIF_F_HW_VLAN_CTAG_TX) && - (netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) - ret = ice_vsi_manage_vlan_insertion(vsi); - - if ((features & NETIF_F_HW_VLAN_CTAG_FILTER) && - !(netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) - ret = ice_cfg_vlan_pruning(vsi, true, false); - else if (!(features & NETIF_F_HW_VLAN_CTAG_FILTER) && - (netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) - ret = ice_cfg_vlan_pruning(vsi, false, false); + /* Set queue information for lowerdev */ + ret = netdev_set_tc_queue(netdev, 0, vsi->num_txq, 0); + if (ret) { + netdev_err(netdev, "Error setting TC queue\n"); + goto set_tc_queue_err; + } + return 0; +set_tc_queue_err: + netdev_set_num_tc(netdev, 0); return ret; } /** - * ice_vsi_vlan_setup - Setup VLAN offload properties on a VSI - * @vsi: VSI to setup VLAN properties for + * ice_fwd_add_macvlan - Configure MACVLAN interface + * @netdev: Main net device to configure + * @vdev: MACVLAN subordinate device */ -static int ice_vsi_vlan_setup(struct ice_vsi *vsi) +static void * +ice_fwd_add_macvlan(struct net_device *netdev, struct net_device *vdev) { - int ret = 0; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *parent_vsi = np->vsi, *vsi; + struct ice_pf *pf = parent_vsi->back; + struct ice_macvlan *mv = NULL; + int avail_id, ret, offset, i; + enum ice_status status; + struct device *dev; + u8 mac[ETH_ALEN]; - if (vsi->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) - ret = ice_vsi_manage_vlan_stripping(vsi, true); - if (vsi->netdev->features & NETIF_F_HW_VLAN_CTAG_TX) - ret = ice_vsi_manage_vlan_insertion(vsi); + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + netdev_err(netdev, 
"Can't do MACVLAN offload. Device is in Safe Mode\n"); + return ERR_PTR(-EOPNOTSUPP); + } - return ret; -} + if (pf->num_macvlan == pf->max_num_macvlan) { + netdev_err(netdev, "MACVLAN offload limit reached\n"); + return ERR_PTR(-ENOSPC); + } -/** - * ice_vsi_cfg - Setup the VSI - * @vsi: the VSI being configured - * - * Return 0 on success and negative value on error - */ -int ice_vsi_cfg(struct ice_vsi *vsi) -{ - int err; + if (vdev->num_rx_queues != 1 || vdev->num_tx_queues != 1) { + netdev_err(netdev, "Can't do MACVLAN offload. %s has multiple queues\n", + vdev->name); + return ERR_PTR(-EOPNOTSUPP); + } - if (vsi->netdev) { - ice_set_rx_mode(vsi->netdev); + if (ice_get_avail_txq_count(pf) < ICE_DFLT_TXQ_VMDQ_VSI || + ice_get_avail_rxq_count(pf) < ICE_DFLT_RXQ_VMDQ_VSI) { + netdev_err(netdev, "Can't do MACVLAN offload. Not enough queues\n"); + return ERR_PTR(-ENOSPC); + } - err = ice_vsi_vlan_setup(vsi); + avail_id = find_first_zero_bit(pf->avail_macvlan, pf->max_num_macvlan); - if (err) - return err; + vsi = ice_macvlan_vsi_setup(pf, pf->hw.port_info); + if (!vsi) { + netdev_err(netdev, "Failed to create MACVLAN offload (VMDQ) VSI\n"); + return ERR_PTR(-EIO); } - ice_vsi_cfg_dcb_rings(vsi); - err = ice_vsi_cfg_lan_txqs(vsi); - if (!err) - err = ice_vsi_cfg_rxqs(vsi); - return err; + pf->num_macvlan++; + offset = parent_vsi->alloc_txq + avail_id; + + ret = netdev_set_sb_channel(vdev, avail_id + 1); + if (ret) { + netdev_err(netdev, "Error setting netdev_set_sb_channel %d\n", + ret); + goto set_sb_channel_err; + } + + /* configure sbdev with the number of queues and offset within PF + * queues range + */ + ret = netdev_bind_sb_channel_queue(netdev, vdev, 0, vsi->num_txq, + offset); + if (ret) { + netdev_err(netdev, "Error setting netdev_bind_sb_channel_queue %d\n", + ret); + goto bind_sb_channel_err; + } + + vsi->netdev = vdev; + /* Set MACVLAN ring in root device Tx rings */ + ice_for_each_txq(vsi, i) + parent_vsi->tx_rings[offset + i] = vsi->tx_rings[i]; + + ice_napi_add(vsi); + + ret = ice_vsi_open(vsi); + if (ret) + goto vsi_open_err; + + ether_addr_copy(mac, vdev->dev_addr); + status = ice_fltr_add_mac(vsi, mac, ICE_FWD_TO_VSI); + if (status == ICE_ERR_ALREADY_EXISTS) { + dev_info(dev, "can't add MAC filters %pM for VSI %d, error %s\n", + mac, vsi->idx, ice_stat_str(status)); + } else if (status) { + dev_err(dev, "can't add MAC filters %pM for VSI %d, error %s\n", + mac, vsi->idx, ice_stat_str(status)); + ret = -ENOMEM; + goto add_mac_err; + } + + mv = devm_kzalloc(dev, sizeof(*mv), GFP_KERNEL); + if (!mv) { + ret = -ENOMEM; + goto mv_init_err; + } + INIT_LIST_HEAD(&mv->list); + mv->parent_vsi = parent_vsi; + mv->vsi = vsi; + mv->id = avail_id; + mv->vdev = vdev; + ether_addr_copy(mv->mac, mac); + list_add(&mv->list, &pf->macvlan_list); + + set_bit(avail_id, pf->avail_macvlan); + netdev_info(netdev, "MACVLAN offloads for %s are on\n", vdev->name); + return mv; + +mv_init_err: + ice_fltr_remove_all(vsi); +add_mac_err: + ice_vsi_close(vsi); +vsi_open_err: + ice_napi_del(vsi); + vsi->netdev = NULL; + netdev_unbind_sb_channel(netdev, vdev); +bind_sb_channel_err: + netdev_set_sb_channel(vdev, 0); +set_sb_channel_err: + pf->num_macvlan--; + ice_vsi_release(vsi); + return ERR_PTR(ret); } /** - * ice_napi_enable_all - Enable NAPI for all q_vectors in the VSI - * @vsi: the VSI being configured + * ice_fwd_del_macvlan - Delete MACVLAN interface resources + * @netdev: Main net device + * @accel_priv: MACVLAN sub ordinate device */ -static void ice_napi_enable_all(struct ice_vsi *vsi) +static 
void ice_fwd_del_macvlan(struct net_device *netdev, void *accel_priv) { - int q_idx; + struct ice_macvlan *mv = (struct ice_macvlan *)accel_priv; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *parent_vsi = np->vsi; + struct ice_pf *pf = parent_vsi->back; + struct net_device *vdev = mv->vdev; - if (!vsi->netdev) - return; + netdev_unbind_sb_channel(netdev, vdev); + netdev_set_sb_channel(vdev, 0); - ice_for_each_q_vector(vsi, q_idx) { - struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; + ice_vsi_release(mv->vsi); + parent_vsi->tx_rings[parent_vsi->num_txq + mv->id] = NULL; - if (q_vector->rx.ring || q_vector->tx.ring) - napi_enable(&q_vector->napi); - } + pf->num_macvlan--; + + clear_bit(mv->id, pf->avail_macvlan); + list_del(&mv->list); + devm_kfree(ice_pf_to_dev(pf), mv); + + netdev_info(netdev, "MACVLAN offloads for %s are off\n", vdev->name); } /** - * ice_up_complete - Finish the last steps of bringing up a connection - * @vsi: The VSI being configured - * - * Return 0 on success and negative value on error + * ice_init_macvlan - Configure PF VSI to be able to offload MACVLAN + * @vsi: Main VSI pointer where sb_dev is attached to + * @init: Set to false when called in replay path otherwise true */ -static int ice_up_complete(struct ice_vsi *vsi) +static int ice_init_macvlan(struct ice_vsi *vsi, bool init) { + struct net_device *netdev = vsi->netdev; struct ice_pf *pf = vsi->back; - int err; + struct ice_ring **tmp_rings; + unsigned int total_rings; + struct device *dev; + int i, ret; - ice_vsi_cfg_msix(vsi); + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_FLAG_VMDQ_ENA, pf->flags)) { + dev_err(dev, "MACVLAN offload cannot be supported - VMDQ is disabled\n"); + return -EPERM; + } - /* Enable only Rx rings, Tx rings were enabled by the FW when the - * Tx queue group list was configured and the context bits were - * programmed using ice_vsi_cfg_txqs - */ - err = ice_vsi_start_rx_rings(vsi); - if (err) - return err; + if (ice_is_safe_mode(pf)) { + dev_err(dev, "MACVLAN offload cannot be configured - Device is in Safe Mode\n"); + return -EOPNOTSUPP; + } - clear_bit(__ICE_DOWN, vsi->state); - ice_napi_enable_all(vsi); - ice_vsi_ena_irq(vsi); +#ifdef NETIF_F_HW_TC + if (ice_is_adq_active(pf)) { + dev_err(dev, "MACVLAN offload cannot be configured - ADQ is active. Delete ADQ configs using TC and try again\n"); + return -EOPNOTSUPP; + } +#endif /* NETIF_F_HW_TC */ - if (vsi->port_info && - (vsi->port_info->phy.link_info.link_info & ICE_AQ_LINK_UP) && - vsi->netdev) { - ice_print_link_msg(vsi, true); - netif_tx_start_all_queues(vsi->netdev); - netif_carrier_on(vsi->netdev); + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. 
L2 Forwarding Offload cannot be enabled.\n"); + return -EOPNOTSUPP; } - ice_service_task_schedule(pf); + if (ice_is_eswitch_mode_switchdev(pf)) { + dev_err(dev, "MACVLAN offload cannot be configured - switchdev is enabled\n"); + return -EOPNOTSUPP; + } + + pf->max_num_macvlan = min3(ice_get_avail_txq_count(pf), + ice_get_avail_rxq_count(pf), + (u16)ICE_MAX_MACVLANS); + + total_rings = vsi->alloc_txq + pf->max_num_macvlan; + + /* Allocate memory for Tx and MACVLAN ring pointers */ + tmp_rings = devm_kcalloc(dev, total_rings, sizeof(*tmp_rings), + GFP_KERNEL); + if (!tmp_rings) { + ret = -ENOMEM; + goto alloc_ring_err; + } + + /* Copy existing ring pointers to new temporary ones */ + for (i = 0; i < vsi->alloc_txq; i++) + tmp_rings[i] = vsi->tx_rings[i]; + vsi->base_tx_rings = vsi->tx_rings; + vsi->tx_rings = tmp_rings; + + if (!init) + return 0; + + ret = netif_set_real_num_tx_queues(netdev, total_rings); + if (ret) { + netdev_err(netdev, "Error setting real num queue\n"); + goto set_num_real_txq_err; + } + +#ifdef NETIF_F_HW_TC + if (!ice_is_adq_active(pf)) { + ret = ice_vsi_cfg_netdev_tc0(vsi); + if (ret) + goto set_num_tc_err; + } +#else + ret = ice_vsi_cfg_netdev_tc0(vsi); + if (ret) + goto set_num_tc_err; +#endif /* NETIF_F_HW_TC */ + + INIT_LIST_HEAD(&pf->macvlan_list); + set_bit(ICE_FLAG_MACVLAN_ENA, pf->flags); return 0; + +set_num_tc_err: + netif_set_real_num_tx_queues(netdev, vsi->num_txq); +set_num_real_txq_err: + vsi->tx_rings = vsi->base_tx_rings; + vsi->base_tx_rings = NULL; + devm_kfree(dev, tmp_rings); +alloc_ring_err: + pf->max_num_macvlan = 0; + return ret; } /** - * ice_up - Bring the connection back up after being down - * @vsi: VSI being configured + * ice_deinit_macvlan - Release and cleanup MACVLAN resources + * @vsi: Main VSI pointer where sb_dev is attached to */ -int ice_up(struct ice_vsi *vsi) +static void ice_deinit_macvlan(struct ice_vsi *vsi) { - int err; + struct ice_macvlan *mv, *mv_tmp; + struct ice_pf *pf = vsi->back; + struct ice_ring **tmp_rings; - err = ice_vsi_cfg(vsi); - if (!err) - err = ice_up_complete(vsi); + clear_bit(ICE_FLAG_MACVLAN_ENA, pf->flags); - return err; + /* Remove offload from existing MACVLANs; clear software book-keeping + * structures and reclaim hardware resources + */ + list_for_each_entry_safe(mv, mv_tmp, &pf->macvlan_list, list) { + ice_fltr_remove_mac(mv->vsi, mv->mac, ICE_FWD_TO_VSI); + macvlan_release_l2fw_offload(mv->vdev); + ice_fwd_del_macvlan(mv->parent_vsi->netdev, mv); + } + +#ifdef NETIF_F_HW_TC + if (!ice_is_adq_active(pf)) + netdev_set_num_tc(vsi->netdev, 0); +#else + netdev_set_num_tc(vsi->netdev, 0); +#endif /* NETIF_F_HW_TC */ + netif_set_real_num_tx_queues(vsi->netdev, vsi->num_txq); + pf->max_num_macvlan = 0; + + /* Restore original Tx ring pointers */ + tmp_rings = vsi->tx_rings; + vsi->tx_rings = vsi->base_tx_rings; + devm_kfree(ice_pf_to_dev(pf), tmp_rings); } /** - * ice_fetch_u64_stats_per_ring - get packets and bytes stats per ring - * @ring: Tx or Rx ring to read stats from - * @pkts: packets stats counter - * @bytes: bytes stats counter - * - * This function fetches stats from the ring considering the atomic operations - * that needs to be performed to read u64 values in 32 bit machine. 
+ * ice_vsi_replay_macvlan - Configure MACVLAN netdev settings after reset
+ * @pf: board private structure
  */
-static void
-ice_fetch_u64_stats_per_ring(struct ice_ring *ring, u64 *pkts, u64 *bytes)
+static void ice_vsi_replay_macvlan(struct ice_pf *pf)
 {
-	unsigned int start;
-	*pkts = 0;
-	*bytes = 0;
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_macvlan *mv, *mv_temp;
 
-	if (!ring)
-		return;
-	do {
-		start = u64_stats_fetch_begin_irq(&ring->syncp);
-		*pkts = ring->stats.pkts;
-		*bytes = ring->stats.bytes;
-	} while (u64_stats_fetch_retry_irq(&ring->syncp, start));
+	list_for_each_entry_safe(mv, mv_temp, &pf->macvlan_list, list) {
+		struct ice_vsi *vsi = mv->parent_vsi;
+		int offset = vsi->alloc_txq + mv->id;
+		int ret = 0, i;
+
+		ice_for_each_txq(mv->vsi, i)
+			vsi->tx_rings[offset + i] = mv->vsi->tx_rings[i];
+
+		ret = netdev_set_sb_channel(mv->vdev, mv->id + 1);
+		if (ret) {
+			dev_dbg(dev, "Error setting netdev_set_sb_channel %d\n",
+				ret);
+			/* Do not return error, try to configure as many as
+			 * possible
+			 */
+			ice_fltr_remove_mac(mv->vsi, mv->mac, ICE_FWD_TO_VSI);
+			macvlan_release_l2fw_offload(mv->vdev);
+			ice_fwd_del_macvlan(mv->parent_vsi->netdev, mv);
+			continue;
+		}
+
+		ret = netdev_bind_sb_channel_queue(vsi->netdev, mv->vdev, 0,
+						   mv->vsi->num_txq, offset);
+		if (ret) {
+			dev_dbg(dev, "Error setting netdev_bind_sb_channel_queue %d\n",
+				ret);
+			/* Do not return error, try to configure as many as
+			 * possible
+			 */
+			ice_fltr_remove_mac(mv->vsi, mv->mac, ICE_FWD_TO_VSI);
+			macvlan_release_l2fw_offload(mv->vdev);
+			ice_fwd_del_macvlan(mv->parent_vsi->netdev, mv);
+			continue;
+		}
+	}
 }
+#endif /* HAVE_NETDEV_SB_DEV */
+
+#define NETIF_VLAN_OFFLOAD_FEATURES	(NETIF_F_HW_VLAN_CTAG_RX | \
+					 NETIF_F_HW_VLAN_CTAG_TX | \
+					 NETIF_F_HW_VLAN_STAG_RX | \
+					 NETIF_F_HW_VLAN_STAG_TX)
+
+#define NETIF_VLAN_FILTERING_FEATURES	(NETIF_F_HW_VLAN_CTAG_FILTER | \
+					 NETIF_F_HW_VLAN_STAG_FILTER)
 
 /**
- * ice_update_vsi_ring_stats - Update VSI stats counters
- * @vsi: the VSI to be updated
+ * ice_fix_features - fix the netdev features flags based on device limitations
+ * @netdev: ptr to the netdev that flags are being fixed on
+ * @features: features that need to be checked and possibly fixed
+ *
+ * Make sure any fixups are made to features in this callback. This enables the
+ * driver to not have to check unsupported configurations throughout the driver
+ * because that's the responsibility of this callback.
+ *
+ * Single VLAN Mode (SVM) Supported Features:
+ *	NETIF_F_HW_VLAN_CTAG_FILTER
+ *	NETIF_F_HW_VLAN_CTAG_RX
+ *	NETIF_F_HW_VLAN_CTAG_TX
+ *
+ * Double VLAN Mode (DVM) Supported Features:
+ *	NETIF_F_HW_VLAN_CTAG_FILTER
+ *	NETIF_F_HW_VLAN_CTAG_RX
+ *	NETIF_F_HW_VLAN_CTAG_TX
+ *
+ *	NETIF_F_HW_VLAN_STAG_FILTER
+ *	NETIF_F_HW_VLAN_STAG_RX
+ *	NETIF_F_HW_VLAN_STAG_TX
+ *
+ * Features that need fixing:
+ *	Cannot simultaneously enable CTAG and STAG stripping and/or insertion.
+ *	These are mutually exclusive as the VSI context cannot support multiple
+ *	VLAN ethertypes simultaneously for stripping and/or insertion. If this
+ *	is not done, then default to clearing the requested STAG offload
+ *	settings.
+ *
+ *	All supported filtering has to be enabled or disabled together. For
+ *	example, in DVM, CTAG and STAG filtering have to be enabled and disabled
+ *	together. If this is not done, then default to VLAN filtering disabled.
+ *	These are mutually exclusive as there is currently no way to
+ *	enable/disable VLAN filtering based on VLAN ethertype when using VLAN
+ *	prune rules.
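+ *
+ * As a worked example of the first rule: a request for CTAG_RX | CTAG_TX |
+ * STAG_RX should come back out of this callback as CTAG_RX | CTAG_TX, with
+ * the requested STAG stripping bit cleared.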
*/ -static void ice_update_vsi_ring_stats(struct ice_vsi *vsi) +static netdev_features_t +ice_fix_features(struct net_device *netdev, netdev_features_t features) { - struct rtnl_link_stats64 *vsi_stats = &vsi->net_stats; - struct ice_ring *ring; - u64 pkts, bytes; - int i; - - /* reset netdev stats */ - vsi_stats->tx_packets = 0; - vsi_stats->tx_bytes = 0; - vsi_stats->rx_packets = 0; - vsi_stats->rx_bytes = 0; + struct ice_netdev_priv *np = netdev_priv(netdev); + netdev_features_t supported_vlan_filtering; + netdev_features_t requested_vlan_filtering; + struct ice_vsi *vsi = np->vsi; - /* reset non-netdev (extended) stats */ - vsi->tx_restart = 0; - vsi->tx_busy = 0; - vsi->tx_linearize = 0; - vsi->rx_buf_failed = 0; - vsi->rx_page_failed = 0; + requested_vlan_filtering = features & NETIF_VLAN_FILTERING_FEATURES; - rcu_read_lock(); + /* make sure supported_vlan_filtering works for both SVM and DVM */ + supported_vlan_filtering = NETIF_F_HW_VLAN_CTAG_FILTER; + if (ice_is_dvm_ena(&vsi->back->hw)) + supported_vlan_filtering |= NETIF_F_HW_VLAN_STAG_FILTER; - /* update Tx rings counters */ - ice_for_each_txq(vsi, i) { - ring = READ_ONCE(vsi->tx_rings[i]); - ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); - vsi_stats->tx_packets += pkts; - vsi_stats->tx_bytes += bytes; - vsi->tx_restart += ring->tx_stats.restart_q; - vsi->tx_busy += ring->tx_stats.tx_busy; - vsi->tx_linearize += ring->tx_stats.tx_linearize; + if (requested_vlan_filtering && + requested_vlan_filtering != supported_vlan_filtering) { + if (requested_vlan_filtering & NETIF_F_HW_VLAN_CTAG_FILTER) { + netdev_warn(netdev, "cannot support requested VLAN filtering settings, enabling all supported VLAN filtering settings\n"); + features |= supported_vlan_filtering; + } else { + netdev_warn(netdev, "cannot support requested VLAN filtering settings, clearing all supported VLAN filtering settings\n"); + features &= ~supported_vlan_filtering; + } } - /* update Rx rings counters */ - ice_for_each_rxq(vsi, i) { - ring = READ_ONCE(vsi->rx_rings[i]); - ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); - vsi_stats->rx_packets += pkts; - vsi_stats->rx_bytes += bytes; - vsi->rx_buf_failed += ring->rx_stats.alloc_buf_failed; - vsi->rx_page_failed += ring->rx_stats.alloc_page_failed; + if ((features & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_TX)) && + (features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_TX))) { + netdev_warn(netdev, "cannot support CTAG and STAG VLAN stripping and/or insertion simultaneously since CTAG and STAG offloads are mutually exclusive, clearing STAG offload settings\n"); + features &= ~(NETIF_F_HW_VLAN_STAG_RX | + NETIF_F_HW_VLAN_STAG_TX); } - rcu_read_unlock(); + return features; } /** - * ice_update_vsi_stats - Update VSI stats counters - * @vsi: the VSI to be updated + * ice_set_vlan_offload_features - set VLAN offload features for the PF VSI + * @vsi: PF's VSI + * @features: features used to determine VLAN offload settings + * + * First, determine the vlan_ethertype based on the VLAN offload bits in + * features. Then determine if stripping and insertion should be enabled or + * disabled. Finally enable or disable VLAN stripping and insertion. 
*/ -void ice_update_vsi_stats(struct ice_vsi *vsi) +static int +ice_set_vlan_offload_features(struct ice_vsi *vsi, netdev_features_t features) { - struct rtnl_link_stats64 *cur_ns = &vsi->net_stats; - struct ice_eth_stats *cur_es = &vsi->eth_stats; - struct ice_pf *pf = vsi->back; + bool enable_stripping = true, enable_insertion = true; + struct ice_vsi_vlan_ops *vlan_ops; + int strip_err = 0, insert_err = 0; + u16 vlan_ethertype = 0; - if (test_bit(__ICE_DOWN, vsi->state) || - test_bit(__ICE_CFG_BUSY, pf->state)) - return; + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); - /* get stats as recorded by Tx/Rx rings */ - ice_update_vsi_ring_stats(vsi); + if (features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_TX)) + vlan_ethertype = ETH_P_8021AD; + else if (features & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_TX)) + vlan_ethertype = ETH_P_8021Q; - /* get VSI stats as recorded by the hardware */ - ice_update_eth_stats(vsi); + if (!(features & (NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_CTAG_RX))) + enable_stripping = false; + if (!(features & (NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_CTAG_TX))) + enable_insertion = false; - cur_ns->tx_errors = cur_es->tx_errors; - cur_ns->rx_dropped = cur_es->rx_discards; - cur_ns->tx_dropped = cur_es->tx_discards; - cur_ns->multicast = cur_es->rx_multicast; + if (enable_stripping) + strip_err = vlan_ops->ena_stripping(vsi, vlan_ethertype); + else + strip_err = vlan_ops->dis_stripping(vsi); - /* update some more netdev stats if this is main VSI */ - if (vsi->type == ICE_VSI_PF) { - cur_ns->rx_crc_errors = pf->stats.crc_errors; - cur_ns->rx_errors = pf->stats.crc_errors + - pf->stats.illegal_bytes; - cur_ns->rx_length_errors = pf->stats.rx_len_errors; - /* record drops from the port level */ - cur_ns->rx_missed_errors = pf->stats.eth.rx_discards; - } + if (enable_insertion) + insert_err = vlan_ops->ena_insertion(vsi, vlan_ethertype); + else + insert_err = vlan_ops->dis_insertion(vsi); + + if (strip_err || insert_err) + return -EIO; + + return 0; } /** - * ice_update_pf_stats - Update PF port stats counters - * @pf: PF whose stats needs to be updated + * ice_set_vlan_filtering_features - set VLAN filtering features for the PF VSI + * @vsi: PF's VSI + * @features: features used to determine VLAN filtering settings + * + * Enable or disable Rx VLAN filtering based on the VLAN filtering bits in the + * features. 
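+ *
+ * Rx VLAN filtering is treated as a single on/off control here: either
+ * filtering bit enables it, mirroring ice_fix_features() above, which keeps
+ * the CTAG and STAG filtering bits in lockstep in DVM.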
*/ -void ice_update_pf_stats(struct ice_pf *pf) +static int +ice_set_vlan_filtering_features(struct ice_vsi *vsi, netdev_features_t features) { - struct ice_hw_port_stats *prev_ps, *cur_ps; - struct ice_hw *hw = &pf->hw; - u8 port; + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + int err = 0; - port = hw->port_info->lport; - prev_ps = &pf->stats_prev; - cur_ps = &pf->stats; + /* support Single VLAN Mode (SVM) and Double VLAN Mode (DVM) by checking + * if either bit is set + */ + if (features & + (NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)) + err = vlan_ops->ena_rx_filtering(vsi); + else + err = vlan_ops->dis_rx_filtering(vsi); - ice_stat_update40(hw, GLPRT_GORCL(port), pf->stat_prev_loaded, - &prev_ps->eth.rx_bytes, - &cur_ps->eth.rx_bytes); + return err; +} - ice_stat_update40(hw, GLPRT_UPRCL(port), pf->stat_prev_loaded, - &prev_ps->eth.rx_unicast, - &cur_ps->eth.rx_unicast); +/** + * ice_set_vlan_features - set VLAN settings based on suggested feature set + * @netdev: ptr to the netdev being adjusted + * @features: the feature set that the stack is suggesting + * + * Only update VLAN settings if the requested_vlan_features are different than + * the current_vlan_features. + */ +static int +ice_set_vlan_features(struct net_device *netdev, netdev_features_t features) +{ + netdev_features_t current_vlan_features, requested_vlan_features; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + int err; - ice_stat_update40(hw, GLPRT_MPRCL(port), pf->stat_prev_loaded, - &prev_ps->eth.rx_multicast, - &cur_ps->eth.rx_multicast); + current_vlan_features = netdev->features & NETIF_VLAN_OFFLOAD_FEATURES; + requested_vlan_features = features & NETIF_VLAN_OFFLOAD_FEATURES; + if (current_vlan_features ^ requested_vlan_features) { + err = ice_set_vlan_offload_features(vsi, features); + if (err) + return err; + } - ice_stat_update40(hw, GLPRT_BPRCL(port), pf->stat_prev_loaded, - &prev_ps->eth.rx_broadcast, - &cur_ps->eth.rx_broadcast); + current_vlan_features = netdev->features & + NETIF_VLAN_FILTERING_FEATURES; + requested_vlan_features = features & NETIF_VLAN_FILTERING_FEATURES; + if (current_vlan_features ^ requested_vlan_features) { + err = ice_set_vlan_filtering_features(vsi, features); + if (err) + return err; + } - ice_stat_update32(hw, PRTRPB_RDPC, pf->stat_prev_loaded, - &prev_ps->eth.rx_discards, - &cur_ps->eth.rx_discards); + return 0; +} - ice_stat_update40(hw, GLPRT_GOTCL(port), pf->stat_prev_loaded, - &prev_ps->eth.tx_bytes, - &cur_ps->eth.tx_bytes); +/** + * ice_set_features - set the netdev feature flags + * @netdev: ptr to the netdev being adjusted + * @features: the feature set that the stack is suggesting + */ +static int +ice_set_features(struct net_device *netdev, netdev_features_t features) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int ret = 0; - ice_stat_update40(hw, GLPRT_UPTCL(port), pf->stat_prev_loaded, - &prev_ps->eth.tx_unicast, - &cur_ps->eth.tx_unicast); + /* Don't set any netdev advanced features with device in Safe Mode */ + if (ice_is_safe_mode(vsi->back)) { + dev_err(ice_pf_to_dev(vsi->back), "Device is in Safe Mode - not enabling advanced netdev features\n"); + return ret; + } - ice_stat_update40(hw, GLPRT_MPTCL(port), pf->stat_prev_loaded, - &prev_ps->eth.tx_multicast, - &cur_ps->eth.tx_multicast); + /* Do not change setting during reset */ + if (ice_is_reset_in_progress(pf->state)) { + 
dev_err(ice_pf_to_dev(vsi->back), "Device is resetting, changing advanced netdev features temporarily unavailable.\n"); + return -EBUSY; + } - ice_stat_update40(hw, GLPRT_BPTCL(port), pf->stat_prev_loaded, - &prev_ps->eth.tx_broadcast, - &cur_ps->eth.tx_broadcast); + /* Multiple features can be changed in one call so keep features in + * separate if/else statements to guarantee each feature is checked + */ + if (features & NETIF_F_RXHASH && !(netdev->features & NETIF_F_RXHASH)) + ice_vsi_manage_rss_lut(vsi, true); + else if (!(features & NETIF_F_RXHASH) && + netdev->features & NETIF_F_RXHASH) + ice_vsi_manage_rss_lut(vsi, false); - ice_stat_update32(hw, GLPRT_TDOLD(port), pf->stat_prev_loaded, - &prev_ps->tx_dropped_link_down, - &cur_ps->tx_dropped_link_down); + ret = ice_set_vlan_features(netdev, features); + if (ret) + return ret; - ice_stat_update40(hw, GLPRT_PRC64L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_64, &cur_ps->rx_size_64); +#ifdef HAVE_NETDEV_SB_DEV - ice_stat_update40(hw, GLPRT_PRC127L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_127, &cur_ps->rx_size_127); + if ((features & NETIF_F_HW_L2FW_DOFFLOAD) && + !(netdev->features & NETIF_F_HW_L2FW_DOFFLOAD)) { + ret = ice_init_macvlan(vsi, true); + if (ret) + return ret; + } else if (!(features & NETIF_F_HW_L2FW_DOFFLOAD) && + (netdev->features & NETIF_F_HW_L2FW_DOFFLOAD)) { + ice_deinit_macvlan(vsi); + } +#endif /* HAVE_NETDEV_SB_DEV */ - ice_stat_update40(hw, GLPRT_PRC255L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_255, &cur_ps->rx_size_255); + if ((features & NETIF_F_NTUPLE) && + !(netdev->features & NETIF_F_NTUPLE)) { + ice_vsi_manage_fdir(vsi, true); + ice_init_arfs(vsi); + } else if (!(features & NETIF_F_NTUPLE) && + (netdev->features & NETIF_F_NTUPLE)) { + ice_vsi_manage_fdir(vsi, false); + ice_clear_arfs(vsi); + } - ice_stat_update40(hw, GLPRT_PRC511L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_511, &cur_ps->rx_size_511); +#ifdef NETIF_F_HW_TC + /* don't turn off hw_tc_offload when ADQ is already enabled */ + if (!(features & NETIF_F_HW_TC) && ice_is_adq_active(pf)) { + dev_err(ice_pf_to_dev(pf), "ADQ is active, can't turn hw_tc_offload off\n"); + return -EACCES; + } - ice_stat_update40(hw, GLPRT_PRC1023L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_1023, &cur_ps->rx_size_1023); + if ((features & NETIF_F_HW_TC) && + !(netdev->features & NETIF_F_HW_TC)) + set_bit(ICE_FLAG_CLS_FLOWER, pf->flags); + else + clear_bit(ICE_FLAG_CLS_FLOWER, pf->flags); +#endif /* NETIF_F_HW_TC */ - ice_stat_update40(hw, GLPRT_PRC1522L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_1522, &cur_ps->rx_size_1522); + return 0; +} - ice_stat_update40(hw, GLPRT_PRC9522L(port), pf->stat_prev_loaded, - &prev_ps->rx_size_big, &cur_ps->rx_size_big); +/** + * ice_vsi_vlan_setup - Setup VLAN offload properties on a PF VSI + * @vsi: VSI to setup VLAN properties for + */ +static int ice_vsi_vlan_setup(struct ice_vsi *vsi) +{ + int err; - ice_stat_update40(hw, GLPRT_PTC64L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_64, &cur_ps->tx_size_64); + err = ice_set_vlan_offload_features(vsi, vsi->netdev->features); + if (err) + return err; - ice_stat_update40(hw, GLPRT_PTC127L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_127, &cur_ps->tx_size_127); + err = ice_set_vlan_filtering_features(vsi, vsi->netdev->features); + if (err) + return err; - ice_stat_update40(hw, GLPRT_PTC255L(port), pf->stat_prev_loaded, - &prev_ps->tx_size_255, &cur_ps->tx_size_255); + err = ice_vsi_add_vlan_zero(vsi); + if (err) + return err; - 
ice_stat_update40(hw, GLPRT_PTC511L(port), pf->stat_prev_loaded,
-			  &prev_ps->tx_size_511, &cur_ps->tx_size_511);
+	return 0;
+}
 
-	ice_stat_update40(hw, GLPRT_PTC1023L(port), pf->stat_prev_loaded,
-			  &prev_ps->tx_size_1023, &cur_ps->tx_size_1023);
+/**
+ * ice_vsi_cfg - Setup the VSI
+ * @vsi: the VSI being configured
+ *
+ * Return 0 on success and negative value on error
+ */
+int ice_vsi_cfg(struct ice_vsi *vsi)
+{
+	int err;
 
-	ice_stat_update40(hw, GLPRT_PTC1522L(port), pf->stat_prev_loaded,
-			  &prev_ps->tx_size_1522, &cur_ps->tx_size_1522);
+	if (vsi->netdev && vsi->type == ICE_VSI_PF) {
+		ice_set_rx_mode(vsi->netdev);
 
-	ice_stat_update40(hw, GLPRT_PTC9522L(port), pf->stat_prev_loaded,
-			  &prev_ps->tx_size_big, &cur_ps->tx_size_big);
+		err = ice_vsi_vlan_setup(vsi);
 
-	ice_stat_update32(hw, GLPRT_LXONRXC(port), pf->stat_prev_loaded,
-			  &prev_ps->link_xon_rx, &cur_ps->link_xon_rx);
+		if (err)
+			return err;
+	}
+	ice_vsi_cfg_dcb_rings(vsi);
 
-	ice_stat_update32(hw, GLPRT_LXOFFRXC(port), pf->stat_prev_loaded,
-			  &prev_ps->link_xoff_rx, &cur_ps->link_xoff_rx);
+	err = ice_vsi_cfg_lan_txqs(vsi);
+#ifdef HAVE_XDP_SUPPORT
+	if (!err && ice_is_xdp_ena_vsi(vsi))
+		err = ice_vsi_cfg_xdp_txqs(vsi);
+#endif /* HAVE_XDP_SUPPORT */
+	if (!err)
+		err = ice_vsi_cfg_rxqs(vsi);
 
-	ice_stat_update32(hw, GLPRT_LXONTXC(port), pf->stat_prev_loaded,
-			  &prev_ps->link_xon_tx, &cur_ps->link_xon_tx);
+	return err;
+}
 
-	ice_stat_update32(hw, GLPRT_LXOFFTXC(port), pf->stat_prev_loaded,
-			  &prev_ps->link_xoff_tx, &cur_ps->link_xoff_tx);
+/* THEORY OF MODERATION:
+ * The ice driver hardware works differently than the hardware that DIMLIB was
+ * originally made for. ice hardware doesn't have packet count limits that
+ * can trigger an interrupt, but it *does* have interrupt rate limit support,
+ * which is hard-coded to limit to 250,000 ints/second.
+ * If not using dynamic moderation, the INTRL value can be modified
+ * by ethtool rx-usecs-high.
+ */
+struct ice_dim {
+	/* the throttle rate for interrupts, basically worst case delay before
+	 * an initial interrupt fires, value is stored in microseconds.
+	 */
+	u16 itr;
+};
 
-	ice_update_dcb_stats(pf);
+/* Make a different profile for RX that doesn't allow quite so aggressive
+ * moderation at the high end (it maxes out at 126 us, or about 8k interrupts
+ * a second).
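+ * As a rough rule of thumb, assuming one interrupt per ITR window, the
+ * rate works out to ints/s = 1,000,000 / itr_usecs: 2us -> 500,000/s,
+ * 16us -> 62,500/s and 126us -> ~7,936/s, which is where the per-entry
+ * figures below come from.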
+ */ +static const struct ice_dim rx_profile[] = { + {2}, /* 500,000 ints/s, capped at 250K by INTRL */ + {8}, /* 125,000 ints/s */ + {16}, /* 62,500 ints/s */ + {62}, /* 16,129 ints/s */ + {126} /* 7,936 ints/s */ +}; - ice_stat_update32(hw, GLPRT_CRCERRS(port), pf->stat_prev_loaded, - &prev_ps->crc_errors, &cur_ps->crc_errors); +/* The transmit profile, which has the same sorts of values + * as the previous struct + */ +static const struct ice_dim tx_profile[] = { + {2}, /* 500,000 ints/s, capped at 250K by INTRL */ + {8}, /* 125,000 ints/s */ + {40}, /* 16,125 ints/s */ + {128}, /* 7,812 ints/s */ + {256} /* 3,906 ints/s */ +}; - ice_stat_update32(hw, GLPRT_ILLERRC(port), pf->stat_prev_loaded, - &prev_ps->illegal_bytes, &cur_ps->illegal_bytes); +static void ice_tx_dim_work(struct work_struct *work) +{ + struct ice_ring_container *rc; + struct dim *dim; + u16 itr; - ice_stat_update32(hw, GLPRT_MLFC(port), pf->stat_prev_loaded, - &prev_ps->mac_local_faults, - &cur_ps->mac_local_faults); + dim = container_of(work, struct dim, work); + rc = (struct ice_ring_container *)dim->priv; - ice_stat_update32(hw, GLPRT_MRFC(port), pf->stat_prev_loaded, - &prev_ps->mac_remote_faults, - &cur_ps->mac_remote_faults); + WARN_ON(dim->profile_ix >= ARRAY_SIZE(tx_profile)); - ice_stat_update32(hw, GLPRT_RLEC(port), pf->stat_prev_loaded, - &prev_ps->rx_len_errors, &cur_ps->rx_len_errors); + /* look up the values in our local table */ + itr = tx_profile[dim->profile_ix].itr; - ice_stat_update32(hw, GLPRT_RUC(port), pf->stat_prev_loaded, - &prev_ps->rx_undersize, &cur_ps->rx_undersize); + ice_trace(tx_dim_work, container_of(rc, struct ice_q_vector, tx), dim); + ice_write_itr(rc, itr); - ice_stat_update32(hw, GLPRT_RFC(port), pf->stat_prev_loaded, - &prev_ps->rx_fragments, &cur_ps->rx_fragments); + dim->state = DIM_START_MEASURE; +} - ice_stat_update32(hw, GLPRT_ROC(port), pf->stat_prev_loaded, - &prev_ps->rx_oversize, &cur_ps->rx_oversize); +static void ice_rx_dim_work(struct work_struct *work) +{ + struct ice_ring_container *rc; + struct dim *dim; + u16 itr; - ice_stat_update32(hw, GLPRT_RJC(port), pf->stat_prev_loaded, - &prev_ps->rx_jabber, &cur_ps->rx_jabber); + dim = container_of(work, struct dim, work); + rc = (struct ice_ring_container *)dim->priv; - pf->stat_prev_loaded = true; -} + WARN_ON(dim->profile_ix >= ARRAY_SIZE(rx_profile)); -/** - * ice_get_stats64 - get statistics for network device structure - * @netdev: network interface device structure - * @stats: main device statistics structure - */ -static -void ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) -{ - struct ice_netdev_priv *np = netdev_priv(netdev); - struct rtnl_link_stats64 *vsi_stats; - struct ice_vsi *vsi = np->vsi; + /* look up the values in our local table */ + itr = rx_profile[dim->profile_ix].itr; - vsi_stats = &vsi->net_stats; + ice_trace(rx_dim_work, container_of(rc, struct ice_q_vector, rx), dim); + ice_write_itr(rc, itr); - if (!vsi->num_txq || !vsi->num_rxq) - return; + dim->state = DIM_START_MEASURE; +} - /* netdev packet/byte stats come from ring counter. These are obtained - * by summing up ring counters (done by ice_update_vsi_ring_stats). - * But, only call the update routine and read the registers if VSI is - * not down. 
- */
-	if (!test_bit(__ICE_DOWN, vsi->state))
-		ice_update_vsi_ring_stats(vsi);
-	stats->tx_packets = vsi_stats->tx_packets;
-	stats->tx_bytes = vsi_stats->tx_bytes;
-	stats->rx_packets = vsi_stats->rx_packets;
-	stats->rx_bytes = vsi_stats->rx_bytes;
+#define ICE_DIM_DEFAULT_PROFILE_IX 1
 
-	/* The rest of the stats can be read from the hardware but instead we
-	 * just return values that the watchdog task has already obtained from
-	 * the hardware.
-	 */
-	stats->multicast = vsi_stats->multicast;
-	stats->tx_errors = vsi_stats->tx_errors;
-	stats->tx_dropped = vsi_stats->tx_dropped;
-	stats->rx_errors = vsi_stats->rx_errors;
-	stats->rx_dropped = vsi_stats->rx_dropped;
-	stats->rx_crc_errors = vsi_stats->rx_crc_errors;
-	stats->rx_length_errors = vsi_stats->rx_length_errors;
+/**
+ * ice_init_moderation - set up interrupt moderation
+ * @q_vector: the vector containing rings to be configured
+ *
+ * Set up interrupt moderation registers, with the intent to do the right
+ * thing whether called from reset or from probe, and whether or not dynamic
+ * moderation is enabled. Take special care to write all the registers in
+ * both the dynamic and non-dynamic cases so that hardware ends up in a
+ * known state.
+ */
+static void ice_init_moderation(struct ice_q_vector *q_vector)
+{
+	struct ice_ring_container *rc;
+	bool tx_dynamic, rx_dynamic;
+
+	rc = &q_vector->tx;
+	INIT_WORK(&rc->dim.work, ice_tx_dim_work);
+	rc->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	rc->dim.profile_ix = ICE_DIM_DEFAULT_PROFILE_IX;
+	rc->dim.priv = rc;
+	tx_dynamic = ITR_IS_DYNAMIC(rc);
+
+	/* set the initial TX ITR to match the above */
+	ice_write_itr(rc, tx_dynamic ?
+		      tx_profile[rc->dim.profile_ix].itr : rc->itr_setting);
+
+	rc = &q_vector->rx;
+	INIT_WORK(&rc->dim.work, ice_rx_dim_work);
+	rc->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	rc->dim.profile_ix = ICE_DIM_DEFAULT_PROFILE_IX;
+	rc->dim.priv = rc;
+	rx_dynamic = ITR_IS_DYNAMIC(rc);
+
+	/* set the initial RX ITR to match the above */
+	ice_write_itr(rc, rx_dynamic ?
rx_profile[rc->dim.profile_ix].itr : + rc->itr_setting); + + ice_set_q_vector_intrl(q_vector); } /** - * ice_napi_disable_all - Disable NAPI for all q_vectors in the VSI - * @vsi: VSI having NAPI disabled + * ice_napi_enable_all - Enable NAPI for all q_vectors in the VSI + * @vsi: the VSI being configured */ -static void ice_napi_disable_all(struct ice_vsi *vsi) +static void ice_napi_enable_all(struct ice_vsi *vsi) { int q_idx; @@ -3893,8 +7903,495 @@ static void ice_napi_disable_all(struct ice_vsi *vsi) ice_for_each_q_vector(vsi, q_idx) { struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; + ice_init_moderation(q_vector); + if (q_vector->rx.ring || q_vector->tx.ring) - napi_disable(&q_vector->napi); + napi_enable(&q_vector->napi); + } +} + +/** + * ice_up_complete - Finish the last steps of bringing up a connection + * @vsi: The VSI being configured + * + * Return 0 on success and negative value on error + */ +static int ice_up_complete(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + int err; + + ice_vsi_cfg_msix(vsi); + + /* Enable only Rx rings, Tx rings were enabled by the FW when the + * Tx queue group list was configured and the context bits were + * programmed using ice_vsi_cfg_txqs + */ + err = ice_vsi_start_all_rx_rings(vsi); + if (err) + return err; + + + clear_bit(ICE_VSI_DOWN, vsi->state); + ice_napi_enable_all(vsi); + ice_vsi_ena_irq(vsi); + + if (vsi->port_info && + (vsi->port_info->phy.link_info.link_info & ICE_AQ_LINK_UP) && + vsi->netdev && vsi->type == ICE_VSI_PF) { + ice_print_link_msg(vsi, true); + netif_tx_start_all_queues(vsi->netdev); + netif_carrier_on(vsi->netdev); + if (!ice_is_e810(&pf->hw)) + ice_ptp_link_change(pf, pf->hw.pf_id, true); + } + + + + if (vsi->type == ICE_VSI_PF) + ice_service_task_schedule(pf); + + return 0; +} + +/** + * ice_up - Bring the connection back up after being down + * @vsi: VSI being configured + */ +int ice_up(struct ice_vsi *vsi) +{ + int err; + + err = ice_vsi_cfg(vsi); + if (!err) + err = ice_up_complete(vsi); + + return err; +} + +/** + * ice_fetch_u64_stats_per_ring - get packets and bytes stats per ring + * @ring: Tx or Rx ring to read stats from + * @pkts: packets stats counter + * @bytes: bytes stats counter + * + * This function fetches stats from the ring, taking care of the atomic + * operations needed to read u64 values on a 32-bit machine. 
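+ * On 64-bit kernels the u64_stats_sync seqcount is compiled out, so the + * begin/retry loop below reduces to a pair of plain loads.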
+ */ +static void +ice_fetch_u64_stats_per_ring(struct ice_ring *ring, u64 *pkts, u64 *bytes) +{ + unsigned int start; + *pkts = 0; + *bytes = 0; + + if (!ring) + return; + do { + start = u64_stats_fetch_begin_irq(&ring->syncp); + *pkts = ring->stats.pkts; + *bytes = ring->stats.bytes; + } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); +} + +/** + * ice_update_vsi_tx_ring_stats - Update VSI Tx ring stats counters + * @vsi: the VSI to be updated + * @rings: rings to work on + * @count: number of rings + */ +static void +ice_update_vsi_tx_ring_stats(struct ice_vsi *vsi, struct ice_ring **rings, + u16 count) +{ + struct rtnl_link_stats64 *vsi_stats = &vsi->net_stats; + u16 i; + + for (i = 0; i < count; i++) { + struct ice_ring *ring; + u64 pkts, bytes; + + ring = READ_ONCE(rings[i]); + ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); + vsi_stats->tx_packets += pkts; + vsi_stats->tx_bytes += bytes; + vsi->tx_restart += ring->tx_stats.restart_q; + vsi->tx_busy += ring->tx_stats.tx_busy; + vsi->tx_linearize += ring->tx_stats.tx_linearize; + } +} + +/** + * ice_update_vsi_ring_stats - Update VSI stats counters + * @vsi: the VSI to be updated + */ +static void ice_update_vsi_ring_stats(struct ice_vsi *vsi) +{ + struct rtnl_link_stats64 *vsi_stats = &vsi->net_stats; + u64 pkts, bytes; + int i; + + /* reset netdev stats */ + vsi_stats->tx_packets = 0; + vsi_stats->tx_bytes = 0; + vsi_stats->rx_packets = 0; + vsi_stats->rx_bytes = 0; + + /* reset non-netdev (extended) stats */ + vsi->tx_restart = 0; + vsi->tx_busy = 0; + vsi->tx_linearize = 0; + vsi->rx_buf_failed = 0; + vsi->rx_page_failed = 0; +#ifdef ICE_ADD_PROBES + vsi->rx_page_reuse = 0; +#endif /* ICE_ADD_PROBES */ + + rcu_read_lock(); + + /* update Tx rings counters */ + ice_update_vsi_tx_ring_stats(vsi, vsi->tx_rings, vsi->num_txq); + + /* update Rx rings counters */ + ice_for_each_rxq(vsi, i) { + struct ice_ring *ring = READ_ONCE(vsi->rx_rings[i]); + + ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); + vsi_stats->rx_packets += pkts; + vsi_stats->rx_bytes += bytes; + vsi->rx_buf_failed += ring->rx_stats.alloc_buf_failed; + vsi->rx_page_failed += ring->rx_stats.alloc_page_failed; +#ifdef ICE_ADD_PROBES + vsi->rx_page_reuse += ring->rx_stats.page_reuse; +#endif /* ICE_ADD_PROBES */ + } + +#ifdef HAVE_XDP_SUPPORT + /* update XDP Tx rings counters */ + if (ice_is_xdp_ena_vsi(vsi)) + ice_update_vsi_tx_ring_stats(vsi, vsi->xdp_rings, + vsi->num_xdp_txq); + +#endif /* HAVE_XDP_SUPPORT */ + rcu_read_unlock(); +} + +/** + * ice_update_vsi_stats - Update VSI stats counters + * @vsi: the VSI to be updated + */ +void ice_update_vsi_stats(struct ice_vsi *vsi) +{ + struct rtnl_link_stats64 *cur_ns = &vsi->net_stats; + struct ice_eth_stats *cur_es = &vsi->eth_stats; + struct ice_pf *pf = vsi->back; + + if (test_bit(ICE_VSI_DOWN, vsi->state) || + test_bit(ICE_CFG_BUSY, pf->state)) + return; + + /* get stats as recorded by Tx/Rx rings */ + ice_update_vsi_ring_stats(vsi); + + /* get VSI stats as recorded by the hardware */ + ice_update_eth_stats(vsi); + + cur_ns->tx_errors = cur_es->tx_errors; + cur_ns->rx_dropped = cur_es->rx_discards; + cur_ns->tx_dropped = cur_es->tx_discards; + cur_ns->multicast = cur_es->rx_multicast; + + /* update some more netdev stats if this is main VSI */ + if (vsi->type == ICE_VSI_PF) { + cur_ns->rx_crc_errors = pf->stats.crc_errors; + cur_ns->rx_errors = pf->stats.crc_errors + + pf->stats.illegal_bytes + + pf->stats.rx_len_errors + + pf->stats.rx_undersize + + pf->hw_csum_rx_error + + pf->stats.rx_jabber + + 
pf->stats.rx_fragments + + pf->stats.rx_oversize; + cur_ns->rx_length_errors = pf->stats.rx_len_errors; + /* record drops from the port level */ + cur_ns->rx_missed_errors = pf->stats.eth.rx_discards; + } +} + +/** + * ice_update_pf_stats - Update PF port stats counters + * @pf: PF whose stats needs to be updated + */ +void ice_update_pf_stats(struct ice_pf *pf) +{ + struct ice_hw_port_stats *prev_ps, *cur_ps; + struct ice_hw *hw = &pf->hw; + u16 fd_ctr_base; + u8 port; + + port = hw->port_info->lport; + prev_ps = &pf->stats_prev; + cur_ps = &pf->stats; + + ice_stat_update40(hw, GLPRT_GORCL(port), pf->stat_prev_loaded, + &prev_ps->eth.rx_bytes, + &cur_ps->eth.rx_bytes); + + ice_stat_update40(hw, GLPRT_UPRCL(port), pf->stat_prev_loaded, + &prev_ps->eth.rx_unicast, + &cur_ps->eth.rx_unicast); + + ice_stat_update40(hw, GLPRT_MPRCL(port), pf->stat_prev_loaded, + &prev_ps->eth.rx_multicast, + &cur_ps->eth.rx_multicast); + + ice_stat_update40(hw, GLPRT_BPRCL(port), pf->stat_prev_loaded, + &prev_ps->eth.rx_broadcast, + &cur_ps->eth.rx_broadcast); + + ice_stat_update32(hw, PRTRPB_RDPC, pf->stat_prev_loaded, + &prev_ps->eth.rx_discards, + &cur_ps->eth.rx_discards); + + ice_stat_update40(hw, GLPRT_GOTCL(port), pf->stat_prev_loaded, + &prev_ps->eth.tx_bytes, + &cur_ps->eth.tx_bytes); + + ice_stat_update40(hw, GLPRT_UPTCL(port), pf->stat_prev_loaded, + &prev_ps->eth.tx_unicast, + &cur_ps->eth.tx_unicast); + + ice_stat_update40(hw, GLPRT_MPTCL(port), pf->stat_prev_loaded, + &prev_ps->eth.tx_multicast, + &cur_ps->eth.tx_multicast); + + ice_stat_update40(hw, GLPRT_BPTCL(port), pf->stat_prev_loaded, + &prev_ps->eth.tx_broadcast, + &cur_ps->eth.tx_broadcast); + + ice_stat_update32(hw, GLPRT_TDOLD(port), pf->stat_prev_loaded, + &prev_ps->tx_dropped_link_down, + &cur_ps->tx_dropped_link_down); + + ice_stat_update40(hw, GLPRT_PRC64L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_64, &cur_ps->rx_size_64); + + ice_stat_update40(hw, GLPRT_PRC127L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_127, &cur_ps->rx_size_127); + + ice_stat_update40(hw, GLPRT_PRC255L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_255, &cur_ps->rx_size_255); + + ice_stat_update40(hw, GLPRT_PRC511L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_511, &cur_ps->rx_size_511); + + ice_stat_update40(hw, GLPRT_PRC1023L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_1023, &cur_ps->rx_size_1023); + + ice_stat_update40(hw, GLPRT_PRC1522L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_1522, &cur_ps->rx_size_1522); + + ice_stat_update40(hw, GLPRT_PRC9522L(port), pf->stat_prev_loaded, + &prev_ps->rx_size_big, &cur_ps->rx_size_big); + + ice_stat_update40(hw, GLPRT_PTC64L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_64, &cur_ps->tx_size_64); + + ice_stat_update40(hw, GLPRT_PTC127L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_127, &cur_ps->tx_size_127); + + ice_stat_update40(hw, GLPRT_PTC255L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_255, &cur_ps->tx_size_255); + + ice_stat_update40(hw, GLPRT_PTC511L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_511, &cur_ps->tx_size_511); + + ice_stat_update40(hw, GLPRT_PTC1023L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_1023, &cur_ps->tx_size_1023); + + ice_stat_update40(hw, GLPRT_PTC1522L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_1522, &cur_ps->tx_size_1522); + + ice_stat_update40(hw, GLPRT_PTC9522L(port), pf->stat_prev_loaded, + &prev_ps->tx_size_big, &cur_ps->tx_size_big); + + fd_ctr_base = hw->fd_ctr_base; + + ice_stat_update40(hw, + 
GLSTAT_FD_CNT0L(ICE_FD_SB_STAT_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->fd_sb_match, + &cur_ps->fd_sb_match); +#ifdef ADQ_PERF_COUNTERS + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_FD_CH_STAT_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->ch_atr_match, + &cur_ps->ch_atr_match); +#endif /* ADQ_PERF_COUNTERS */ +#ifdef ICE_ADD_PROBES + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_ARFS_STAT_TCPV4_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->arfs_tcpv4_match, + &cur_ps->arfs_tcpv4_match); + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_ARFS_STAT_TCPV6_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->arfs_tcpv6_match, + &cur_ps->arfs_tcpv6_match); + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_ARFS_STAT_UDPV4_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->arfs_udpv4_match, + &cur_ps->arfs_udpv4_match); + ice_stat_update40(hw, + GLSTAT_FD_CNT0L(ICE_ARFS_STAT_UDPV6_IDX(fd_ctr_base)), + pf->stat_prev_loaded, &prev_ps->arfs_udpv6_match, + &cur_ps->arfs_udpv6_match); +#endif /* ICE_ADD_PROBES */ + ice_stat_update32(hw, GLPRT_LXONRXC(port), pf->stat_prev_loaded, + &prev_ps->link_xon_rx, &cur_ps->link_xon_rx); + + ice_stat_update32(hw, GLPRT_LXOFFRXC(port), pf->stat_prev_loaded, + &prev_ps->link_xoff_rx, &cur_ps->link_xoff_rx); + + ice_stat_update32(hw, GLPRT_LXONTXC(port), pf->stat_prev_loaded, + &prev_ps->link_xon_tx, &cur_ps->link_xon_tx); + + ice_stat_update32(hw, GLPRT_LXOFFTXC(port), pf->stat_prev_loaded, + &prev_ps->link_xoff_tx, &cur_ps->link_xoff_tx); + + ice_update_dcb_stats(pf); + + ice_stat_update32(hw, GLPRT_CRCERRS(port), pf->stat_prev_loaded, + &prev_ps->crc_errors, &cur_ps->crc_errors); + + ice_stat_update32(hw, GLPRT_ILLERRC(port), pf->stat_prev_loaded, + &prev_ps->illegal_bytes, &cur_ps->illegal_bytes); + + ice_stat_update32(hw, GLPRT_MLFC(port), pf->stat_prev_loaded, + &prev_ps->mac_local_faults, + &cur_ps->mac_local_faults); + + ice_stat_update32(hw, GLPRT_MRFC(port), pf->stat_prev_loaded, + &prev_ps->mac_remote_faults, + &cur_ps->mac_remote_faults); + + ice_stat_update32(hw, GLPRT_RLEC(port), pf->stat_prev_loaded, + &prev_ps->rx_len_errors, &cur_ps->rx_len_errors); + + ice_stat_update32(hw, GLPRT_RUC(port), pf->stat_prev_loaded, + &prev_ps->rx_undersize, &cur_ps->rx_undersize); + + ice_stat_update32(hw, GLPRT_RFC(port), pf->stat_prev_loaded, + &prev_ps->rx_fragments, &cur_ps->rx_fragments); + + ice_stat_update32(hw, GLPRT_ROC(port), pf->stat_prev_loaded, + &prev_ps->rx_oversize, &cur_ps->rx_oversize); + + ice_stat_update32(hw, GLPRT_RJC(port), pf->stat_prev_loaded, + &prev_ps->rx_jabber, &cur_ps->rx_jabber); + + + cur_ps->fd_sb_status = test_bit(ICE_FLAG_FD_ENA, pf->flags) ? 1 : 0; + + pf->stat_prev_loaded = true; +} + +/** + * ice_get_stats64 - get statistics for network device structure + * @netdev: network interface device structure + * @stats: main device statistics structure + */ +static +#ifdef HAVE_VOID_NDO_GET_STATS64 +void ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) +#else /* HAVE_VOID_NDO_GET_STATS64 */ +struct rtnl_link_stats64 * +ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) +#endif /* !HAVE_VOID_NDO_GET_STATS64 */ +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct rtnl_link_stats64 *vsi_stats; + struct ice_vsi *vsi = np->vsi; + + vsi_stats = &vsi->net_stats; + + if (!vsi->num_txq || !vsi->num_rxq) +#ifdef HAVE_VOID_NDO_GET_STATS64 + return; +#else + return stats; +#endif + + /* netdev packet/byte stats come from ring counter. 
These are obtained + * by summing up ring counters (done by ice_update_vsi_ring_stats). + * But, only call the update routine and read the registers if VSI is + * not down. + */ + if (!test_bit(ICE_VSI_DOWN, vsi->state)) + ice_update_vsi_ring_stats(vsi); + stats->tx_packets = vsi_stats->tx_packets; + stats->tx_bytes = vsi_stats->tx_bytes; + stats->rx_packets = vsi_stats->rx_packets; + stats->rx_bytes = vsi_stats->rx_bytes; + + /* The rest of the stats can be read from the hardware but instead we + * just return values that the watchdog task has already obtained from + * the hardware. + */ + stats->multicast = vsi_stats->multicast; + stats->tx_errors = vsi_stats->tx_errors; + stats->tx_dropped = vsi_stats->tx_dropped; + stats->rx_errors = vsi_stats->rx_errors; + stats->rx_dropped = vsi_stats->rx_dropped; + stats->rx_crc_errors = vsi_stats->rx_crc_errors; + stats->rx_length_errors = vsi_stats->rx_length_errors; +#ifndef HAVE_VOID_NDO_GET_STATS64 + + return stats; +#endif +} + +#ifdef HAVE_NETPOLL_CONTROLLER +#ifdef CONFIG_NET_POLL_CONTROLLER +/** + * ice_netpoll - polling "interrupt" handler + * @netdev: network interface device structure + * + * Used by netconsole to send skbs without having to re-enable interrupts. + * This is not called in the normal interrupt path. + */ +static void ice_netpoll(struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + int i; + + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return; + + ice_for_each_q_vector(vsi, i) + ice_msix_clean_rings(0, vsi->q_vectors[i]); +} +#endif /* CONFIG_NET_POLL_CONTROLLER */ +#endif /* HAVE_NETPOLL_CONTROLLER */ + +/** + * ice_napi_disable_all - Disable NAPI for all q_vectors in the VSI + * @vsi: VSI having NAPI disabled + */ +static void ice_napi_disable_all(struct ice_vsi *vsi) +{ + int q_idx; + + if (!vsi->netdev) + return; + + ice_for_each_q_vector(vsi, q_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; + + if (q_vector->rx.ring || q_vector->tx.ring) + napi_disable(&q_vector->napi); + + cancel_work_sync(&q_vector->tx.dim.work); + cancel_work_sync(&q_vector->rx.dim.work); } } @@ -3902,827 +8399,2967 @@ static void ice_napi_disable_all(struct ice_vsi *vsi) * ice_down - Shutdown the connection * @vsi: The VSI being stopped */ -int ice_down(struct ice_vsi *vsi) +int ice_down(struct ice_vsi *vsi) +{ + int link_err = 0, vlan_err = 0; + int i, tx_err, rx_err; + + /* Caller of this function is expected to set the + * vsi->state ICE_DOWN bit + */ + if (vsi->netdev && vsi->type == ICE_VSI_PF) { + vlan_err = ice_vsi_del_vlan_zero(vsi); + if (!ice_is_e810(&vsi->back->hw)) + ice_ptp_link_change(vsi->back, vsi->back->hw.pf_id, false); + netif_carrier_off(vsi->netdev); + netif_tx_disable(vsi->netdev); + } else if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) { + ice_eswitch_stop_all_tx_queues(vsi->back); + } + + + ice_vsi_dis_irq(vsi); + + tx_err = ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, 0); + if (tx_err) + netdev_err(vsi->netdev, "Failed stop Tx rings, VSI %d error %d\n", + vsi->vsi_num, tx_err); +#ifdef HAVE_XDP_SUPPORT + if (!tx_err && ice_is_xdp_ena_vsi(vsi)) { + tx_err = ice_vsi_stop_xdp_tx_rings(vsi); + if (tx_err) + netdev_err(vsi->netdev, "Failed stop XDP rings, VSI %d error %d\n", + vsi->vsi_num, tx_err); + } +#endif /* HAVE_XDP_SUPPORT */ + + rx_err = ice_vsi_stop_all_rx_rings(vsi); + if (rx_err) + netdev_err(vsi->netdev, "Failed stop Rx rings, VSI %d error %d\n", + vsi->vsi_num, rx_err); + + ice_napi_disable_all(vsi); + + if 
(test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) { + link_err = ice_force_phys_link_state(vsi, false); + if (link_err) + netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n", + vsi->vsi_num, link_err); + } + + ice_for_each_txq(vsi, i) + ice_clean_tx_ring(vsi->tx_rings[i]); + + ice_for_each_rxq(vsi, i) + ice_clean_rx_ring(vsi->rx_rings[i]); + + if (tx_err || rx_err || link_err || vlan_err) { + netdev_err(vsi->netdev, "Failed to close VSI 0x%04X on switch 0x%04X\n", + vsi->vsi_num, vsi->vsw->sw_id); + return -EIO; + } + + return 0; +} + +/** + * ice_vsi_setup_tx_rings - Allocate VSI Tx queue resources + * @vsi: VSI having resources allocated + * + * Return 0 on success, negative on failure + */ +int ice_vsi_setup_tx_rings(struct ice_vsi *vsi) +{ + int i, err = 0; + + if (!vsi->num_txq) { + dev_err(ice_pf_to_dev(vsi->back), "VSI %d has 0 Tx queues\n", + vsi->vsi_num); + return -EINVAL; + } + + ice_for_each_txq(vsi, i) { + struct ice_ring *ring = vsi->tx_rings[i]; + + if (!ring) + return -EINVAL; + if (vsi->netdev) + ring->netdev = vsi->netdev; + err = ice_setup_tx_ring(ring); + if (err) + break; + } + + return err; +} + +/** + * ice_vsi_setup_rx_rings - Allocate VSI Rx queue resources + * @vsi: VSI having resources allocated + * + * Return 0 on success, negative on failure + */ +int ice_vsi_setup_rx_rings(struct ice_vsi *vsi) +{ + int i, err = 0; + + if (!vsi->num_rxq) { + dev_err(ice_pf_to_dev(vsi->back), "VSI %d has 0 Rx queues\n", + vsi->vsi_num); + return -EINVAL; + } + + ice_for_each_rxq(vsi, i) { + struct ice_ring *ring = vsi->rx_rings[i]; + + if (!ring) + return -EINVAL; + if (vsi->netdev) + ring->netdev = vsi->netdev; + err = ice_setup_rx_ring(ring); + if (err) + break; + } + + return err; +} + +/** + * ice_vsi_open_ctrl - open control VSI for use + * @vsi: the VSI to open + * + * Initialization of the Control VSI + * + * Returns 0 on success, negative value on error + */ +int ice_vsi_open_ctrl(struct ice_vsi *vsi) +{ + char int_name[ICE_INT_NAME_STR_LEN]; + struct ice_pf *pf = vsi->back; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + /* allocate descriptors */ + err = ice_vsi_setup_tx_rings(vsi); + if (err) + goto err_setup_tx; + + err = ice_vsi_setup_rx_rings(vsi); + if (err) + goto err_setup_rx; + + err = ice_vsi_cfg(vsi); + if (err) + goto err_setup_rx; + + snprintf(int_name, sizeof(int_name) - 1, "%s-%s:ctrl", + dev_driver_string(dev), dev_name(dev)); + err = ice_vsi_req_irq_msix(vsi, int_name); + if (err) + goto err_setup_rx; + + ice_vsi_cfg_msix(vsi); + + err = ice_vsi_start_all_rx_rings(vsi); + if (err) + goto err_up_complete; + + clear_bit(ICE_VSI_DOWN, vsi->state); + ice_vsi_ena_irq(vsi); + + return 0; + +err_up_complete: + ice_down(vsi); +err_setup_rx: + ice_vsi_free_rx_rings(vsi); +err_setup_tx: + ice_vsi_free_tx_rings(vsi); + + return err; +} + +/** + * ice_vsi_open - Called when a network interface is made active + * @vsi: the VSI to open + * + * Initialization of the VSI + * + * Returns 0 on success, negative value on error + */ +int ice_vsi_open(struct ice_vsi *vsi) +{ + char int_name[ICE_INT_NAME_STR_LEN]; + struct ice_pf *pf = vsi->back; + int err; + + /* allocate descriptors */ + err = ice_vsi_setup_tx_rings(vsi); + if (err) + goto err_setup_tx; + + err = ice_vsi_setup_rx_rings(vsi); + if (err) + goto err_setup_rx; + + err = ice_vsi_cfg(vsi); + if (err) + goto err_setup_rx; + + snprintf(int_name, sizeof(int_name) - 1, "%s-%s", + dev_driver_string(ice_pf_to_dev(pf)), vsi->netdev->name); + err = 
ice_vsi_req_irq_msix(vsi, int_name); + if (err) + goto err_setup_rx; + + if (vsi->type == ICE_VSI_PF) { +#ifdef HAVE_NETDEV_SB_DEV + unsigned int total_qs = vsi->num_txq; + + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) + total_qs = vsi->alloc_txq + pf->max_num_macvlan; + + /* Notify the stack of the actual queue counts. */ + err = netif_set_real_num_tx_queues(vsi->netdev, total_qs); +#else + /* Notify the stack of the actual queue counts. */ + err = netif_set_real_num_tx_queues(vsi->netdev, vsi->num_txq); +#endif /* HAVE_NETDEV_SB_DEV */ + if (err) + goto err_set_qs; + + err = netif_set_real_num_rx_queues(vsi->netdev, vsi->num_rxq); + if (err) + goto err_set_qs; + } + + err = ice_up_complete(vsi); + if (err) + goto err_up_complete; + return 0; + +err_up_complete: + ice_down(vsi); +err_set_qs: + ice_vsi_free_irq(vsi); +err_setup_rx: + ice_vsi_free_rx_rings(vsi); +err_setup_tx: + ice_vsi_free_tx_rings(vsi); + + return err; +} + +/** + * ice_vsi_release_all - Delete all VSIs + * @pf: PF from which all VSIs are being removed + */ +static void ice_vsi_release_all(struct ice_pf *pf) +{ + int err, i; + + if (!pf->vsi) + return; + + ice_for_each_vsi(pf, i) { + if (!pf->vsi[i]) + continue; + + if (pf->vsi[i]->type == ICE_VSI_CHNL) + continue; + + err = ice_vsi_release(pf->vsi[i]); + if (err) + dev_dbg(ice_pf_to_dev(pf), "Failed to release pf->vsi[%d], err %d, vsi_num = %d\n", + i, err, pf->vsi[i]->vsi_num); + } +} + +/** + * ice_vsi_rebuild_by_type - Rebuild VSI of a given type + * @pf: pointer to the PF instance + * @type: VSI type to rebuild + * + * Iterates through the pf->vsi array and rebuilds VSIs of the requested type + */ +static int ice_vsi_rebuild_by_type(struct ice_pf *pf, enum ice_vsi_type type) +{ + struct device *dev = ice_pf_to_dev(pf); + enum ice_status status; + int i, err; + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (!vsi || vsi->type != type) + continue; + + /* rebuild the VSI */ + err = ice_vsi_rebuild(vsi, true); + if (err) { + dev_err(dev, "rebuild VSI failed, err %d, VSI index %d, type %s\n", + err, vsi->idx, ice_vsi_type_str(type)); + return err; + } + + /* replay filters for the VSI */ + status = ice_replay_vsi(&pf->hw, vsi->idx); + if (status) { + dev_err(dev, "replay VSI failed, status %s, VSI index %d, type %s\n", + ice_stat_str(status), vsi->idx, + ice_vsi_type_str(type)); + return -EIO; + } + + /* Re-map HW VSI number, using VSI handle that has been + * previously validated in ice_replay_vsi() call above + */ + vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + + /* enable the VSI */ + err = ice_ena_vsi(vsi, false); + if (err) { + dev_err(dev, "enable VSI failed, err %d, VSI index %d, type %s\n", + err, vsi->idx, ice_vsi_type_str(type)); + return err; + } + + dev_info(dev, "VSI rebuilt. 
VSI index %d, type %s\n", vsi->idx, + ice_vsi_type_str(type)); + } + + return 0; +} + +/** + * ice_update_pf_netdev_link - Update PF netdev link status + * @pf: pointer to the PF instance + */ +static void ice_update_pf_netdev_link(struct ice_pf *pf) +{ + bool link_up; + int i; + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + + if (!vsi || vsi->type != ICE_VSI_PF) + return; + + ice_get_link_status(pf->vsi[i]->port_info, &link_up); + if (link_up) { + netif_carrier_on(pf->vsi[i]->netdev); + netif_tx_wake_all_queues(pf->vsi[i]->netdev); + } else { + netif_carrier_off(pf->vsi[i]->netdev); + netif_tx_stop_all_queues(pf->vsi[i]->netdev); + } + } +} + +/** + * ice_rebuild - rebuild after reset + * @pf: PF to rebuild + * @reset_type: type of reset + * + * Do not rebuild VF VSI in this flow because that is already handled via + * ice_reset_all_vfs(). This is because requirements for resetting a VF after a + * PFR/CORER/GLOBR/etc. are different from the normal flow. Also, we don't want + * to reset/rebuild all the VF VSI twice. + */ +static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) +{ + struct ice_fwlog_user_input user_input = { 0 }; + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status ret; + bool dvm; + int err; + + if (test_bit(ICE_DOWN, pf->state)) + goto clear_recovery; + + dev_dbg(dev, "rebuilding PF after reset_type=%d\n", reset_type); + +#define ICE_EMP_RESET_SLEEP 5000 + if (reset_type == ICE_RESET_EMPR) + msleep(ICE_EMP_RESET_SLEEP); + + + ret = ice_init_all_ctrlq(hw); + if (ret) { + dev_err(dev, "control queues init failed %s\n", + ice_stat_str(ret)); + goto err_init_ctrlq; + } + + user_input.log_level = fwlog_level; + user_input.events = fwlog_events; + if (!ice_pf_fwlog_set(pf, &user_input)) { + if (ice_fwlog_register(hw)) + dev_dbg(dev, "Failed to register for FW logging events\n"); + } else { + dev_dbg(dev, "Failed to re-enable FW logging\n"); + } + /* if DDP was previously loaded successfully */ + if (!ice_is_safe_mode(pf)) { + /* reload the SW DB of filter tables */ + if (reset_type == ICE_RESET_PFR) { + ice_fill_blk_tbls(hw); + } else { + /* Reload DDP Package after CORER/GLOBR reset */ + ice_load_pkg(NULL, pf); + + /* check if package reloaded */ + if (ice_is_safe_mode(pf)) { + dev_err(dev, "failed to reload DDP Package\n"); + if (ice_prepare_for_safe_mode(pf)) { + dev_err(dev, "could not transition to safe mode\n"); + goto err_init_ctrlq; + } + } + } + } + + ret = ice_clear_pf_cfg(hw); + if (ret) { + dev_err(dev, "clear PF configuration failed %s\n", + ice_stat_str(ret)); + goto err_init_ctrlq; + } + + if (pf->first_sw->dflt_vsi_ena) + dev_info(dev, "Clearing default VSI, re-enable after reset completes\n"); + /* clear the default VSI configuration if it exists */ + pf->first_sw->dflt_vsi = NULL; + pf->first_sw->dflt_vsi_ena = false; + + ice_clear_pxe_mode(hw); + + ret = ice_init_nvm(hw); + if (ret) { + dev_err(dev, "ice_init_nvm failed %s\n", ice_stat_str(ret)); + goto err_init_ctrlq; + } + + ret = ice_get_caps(hw); + if (ret) { + dev_err(dev, "ice_get_caps failed %s\n", ice_stat_str(ret)); + goto err_init_ctrlq; + } + + ret = ice_aq_set_mac_cfg(hw, ICE_AQ_SET_MAC_FRAME_SIZE_MAX, NULL); + if (ret) { + dev_err(dev, "set_mac_cfg failed %s\n", ice_stat_str(ret)); + goto err_init_ctrlq; + } + + dvm = ice_is_dvm_ena(hw); + + err = ice_aq_set_port_params(pf->hw.port_info, 0, false, false, dvm, + NULL); + if (err) + goto err_init_ctrlq; + + err = ice_sched_init_port(hw->port_info); + if (err) + goto 
err_sched_init_port; + + + ice_pf_reset_stats(pf); + + /* start misc vector */ + err = ice_req_irq_msix_misc(pf); + if (err) { + dev_err(dev, "misc vector setup failed: %d\n", err); + goto err_sched_init_port; + } + + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) { + wr32(hw, PFQF_FD_ENA, PFQF_FD_ENA_FD_ENA_M); + if (!rd32(hw, PFQF_FD_SIZE)) { + u16 unused, guar, b_effort; + + guar = hw->func_caps.fd_fltr_guar; + b_effort = hw->func_caps.fd_fltr_best_effort; + + /* force guaranteed filter pool for PF */ + ice_alloc_fd_guar_item(hw, &unused, guar); + /* force shared filter pool for PF */ + ice_alloc_fd_shrd_item(hw, &unused, b_effort); + } + } + + if (test_bit(ICE_FLAG_DCB_ENA, pf->flags)) + ice_dcb_rebuild(pf); + + /* If the PF previously had enabled PTP, PTP init needs to happen before + * the VSI rebuild. If not, this causes the PTP link status events to + * fail. + */ + if (test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + ice_ptp_init(pf); + + + /* rebuild PF VSI */ + err = ice_vsi_rebuild_by_type(pf, ICE_VSI_PF); + if (err) { + dev_err(dev, "PF VSI rebuild failed: %d\n", err); + goto err_vsi_rebuild; + } + if (ice_is_peer_ena(pf)) { + struct ice_vsi *vsi = ice_get_main_vsi(pf); + + if (!vsi) { + dev_err(dev, "No PF_VSI to update peer\n"); + goto err_vsi_rebuild; + } + ice_for_each_peer(pf, vsi, ice_peer_update_vsi); + } +#ifdef HAVE_NETDEV_SB_DEV + if (test_bit(ICE_FLAG_MACVLAN_ENA, pf->flags)) { + struct ice_vsi *vsi; + + err = ice_vsi_rebuild_by_type(pf, ICE_VSI_OFFLOAD_MACVLAN); + if (err) { + dev_err(dev, "MACVLAN VSI rebuild failed: %d\n", err); + goto err_vsi_rebuild; + } + + vsi = ice_get_main_vsi(pf); + if (!vsi) { + dev_err(dev, "main VSI doesn't exist\n"); + goto err_vsi_rebuild; + } + + err = ice_init_macvlan(vsi, false); + if (err) { + dev_err(dev, "Failed to init macvlan\n"); + goto err_vsi_rebuild; + } + + ice_vsi_replay_macvlan(pf); + } +#endif /* HAVE_NETDEV_SB_DEV */ + + err = ice_vsi_rebuild_by_type(pf, ICE_VSI_SWITCHDEV_CTRL); + if (err) { + dev_err(dev, "Switchdev CTRL VSI rebuild failed: %d\n", err); + goto err_vsi_rebuild; + } + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (reset_type == ICE_RESET_PFR) { + err = ice_rebuild_channels(pf); + if (err) { + dev_err(dev, "failed to rebuild and replay ADQ VSIs, err %d\n", + err); + goto err_vsi_rebuild; + } + } +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + + /* If Flow Director is active */ + if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) { + err = ice_vsi_rebuild_by_type(pf, ICE_VSI_CTRL); + if (err) { + dev_err(dev, "control VSI rebuild failed: %d\n", err); + goto err_vsi_rebuild; + } + + /* replay HW Flow Director recipes */ + if (hw->fdir_prof) + ice_fdir_replay_flows(hw); + + /* replay Flow Director filters */ + ice_fdir_replay_fltrs(pf); + + ice_rebuild_arfs(pf); + } + + + ice_update_pf_netdev_link(pf); + + ice_config_health_events(pf, true); + + /* tell the firmware we are up */ + ret = ice_send_version(pf); + if (ret) { + dev_err(dev, "Rebuild failed due to error sending driver version: %s\n", + ice_stat_str(ret)); + goto err_vsi_rebuild; + } + + ice_replay_post(hw); + /* if we get here, reset flow is successful */ + clear_bit(ICE_RESET_FAILED, pf->state); + return; + +err_vsi_rebuild: + +err_sched_init_port: + ice_sched_cleanup_all(hw); +err_init_ctrlq: + ice_shutdown_all_ctrlq(hw); + set_bit(ICE_RESET_FAILED, pf->state); +clear_recovery: + /* set this bit in PF state to control service task scheduling */ + set_bit(ICE_NEEDS_RESTART, pf->state); + dev_err(dev, "Rebuild failed, unload and reload driver\n"); +} + +#ifdef 
HAVE_XDP_SUPPORT +/** + * ice_max_xdp_frame_size - returns the maximum allowed frame size for XDP + * @vsi: Pointer to VSI structure + */ +static int ice_max_xdp_frame_size(struct ice_vsi *vsi) +{ + if (PAGE_SIZE >= 8192 || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) + return ICE_RXBUF_2048 - XDP_PACKET_HEADROOM; + else + return ICE_RXBUF_3072; +} +#endif /* HAVE_XDP_SUPPORT */ + +/** + * ice_change_mtu - NDO callback to change the MTU + * @netdev: network interface device structure + * @new_mtu: new value for maximum frame size + * + * Returns 0 on success, negative on failure + */ +static int ice_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + struct ice_event *event; + u8 count = 0; + int err = 0; + + if (new_mtu == (int)netdev->mtu) { + netdev_warn(netdev, "MTU is already %u\n", netdev->mtu); + return 0; + } +#ifdef HAVE_XDP_SUPPORT + + if (ice_is_xdp_ena_vsi(vsi)) { + int frame_size = ice_max_xdp_frame_size(vsi); + + if (new_mtu + ICE_ETH_PKT_HDR_PAD > frame_size) { + netdev_err(netdev, "max MTU for XDP usage is %d\n", + frame_size - ICE_ETH_PKT_HDR_PAD); + return -EINVAL; + } + } + +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_NETDEVICE_MIN_MAX_MTU +#ifdef HAVE_RHEL7_EXTENDED_MIN_MAX_MTU + if (new_mtu < netdev->extended->min_mtu) { + netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", + netdev->extended->min_mtu); + return -EINVAL; + } else if (new_mtu > netdev->extended->max_mtu) { + netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", + netdev->extended->max_mtu); + return -EINVAL; + } +#else /* HAVE_RHEL7_EXTENDED_MIN_MAX_MTU */ + if (new_mtu < (int)netdev->min_mtu) { + netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", + netdev->min_mtu); + return -EINVAL; + } else if (new_mtu > (int)netdev->max_mtu) { + netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", + netdev->max_mtu); + return -EINVAL; + } +#endif /* HAVE_RHEL7_EXTENDED_MIN_MAX_MTU */ +#else /* HAVE_NETDEVICE_MIN_MAX_MTU */ + if (new_mtu < ETH_MIN_MTU) { + netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", + ETH_MIN_MTU); + return -EINVAL; + } else if (new_mtu > ICE_MAX_MTU) { + netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", + ICE_MAX_MTU); + return -EINVAL; + } +#endif /* HAVE_NETDEVICE_MIN_MAX_MTU */ + /* if a reset is in progress, wait for some time for it to complete */ + do { + if (ice_is_reset_in_progress(pf->state)) { + count++; + usleep_range(1000, 2000); + } else { + break; + } + + } while (count < 100); + + if (count == 100) { + netdev_err(netdev, "can't change MTU. 
Device is busy\n"); + return -EBUSY; + } + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + netdev->mtu = (unsigned int)new_mtu; + + /* if VSI is up, bring it down and then back up */ + if (!test_and_set_bit(ICE_VSI_DOWN, vsi->state)) { + err = ice_down(vsi); + if (err) { + netdev_err(netdev, "change MTU if_down err %d\n", err); + goto free_event; + } + + err = ice_up(vsi); + if (err) { + netdev_err(netdev, "change MTU if_up err %d\n", err); + goto free_event; + } + } + + if (ice_is_safe_mode(pf)) + goto out; + + set_bit(ICE_EVENT_MTU_CHANGE, event->type); + event->reporter = NULL; + event->info.mtu = (u16)new_mtu; + ice_for_each_peer(pf, event, ice_peer_check_for_reg); + +out: + netdev_dbg(netdev, "changed MTU to %d\n", new_mtu); +free_event: + kfree(event); + return err; +} + +/** + * ice_do_ioctl - Access the hwtstamp interface + * @netdev: network interface device structure + * @ifr: interface request data + * @cmd: ioctl command + */ +static int ice_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_pf *pf = np->vsi->back; + + switch (cmd) { +#ifdef SIOCGHWTSTAMP + case SIOCGHWTSTAMP: + return ice_ptp_get_ts_config(pf, ifr); +#endif + case SIOCSHWTSTAMP: + return ice_ptp_set_ts_config(pf, ifr); + default: + return -EOPNOTSUPP; + } +} + +/** + * ice_aq_str - convert AQ err code to a string + * @aq_err: the AQ error code to convert + */ +const char *ice_aq_str(enum ice_aq_err aq_err) +{ + switch (aq_err) { + case ICE_AQ_RC_OK: + return "OK"; + case ICE_AQ_RC_EPERM: + return "ICE_AQ_RC_EPERM"; + case ICE_AQ_RC_ENOENT: + return "ICE_AQ_RC_ENOENT"; + case ICE_AQ_RC_ESRCH: + return "ICE_AQ_RC_ESRCH"; + case ICE_AQ_RC_EINTR: + return "ICE_AQ_RC_EINTR"; + case ICE_AQ_RC_EIO: + return "ICE_AQ_RC_EIO"; + case ICE_AQ_RC_ENXIO: + return "ICE_AQ_RC_ENXIO"; + case ICE_AQ_RC_E2BIG: + return "ICE_AQ_RC_E2BIG"; + case ICE_AQ_RC_EAGAIN: + return "ICE_AQ_RC_EAGAIN"; + case ICE_AQ_RC_ENOMEM: + return "ICE_AQ_RC_ENOMEM"; + case ICE_AQ_RC_EACCES: + return "ICE_AQ_RC_EACCES"; + case ICE_AQ_RC_EFAULT: + return "ICE_AQ_RC_EFAULT"; + case ICE_AQ_RC_EBUSY: + return "ICE_AQ_RC_EBUSY"; + case ICE_AQ_RC_EEXIST: + return "ICE_AQ_RC_EEXIST"; + case ICE_AQ_RC_EINVAL: + return "ICE_AQ_RC_EINVAL"; + case ICE_AQ_RC_ENOTTY: + return "ICE_AQ_RC_ENOTTY"; + case ICE_AQ_RC_ENOSPC: + return "ICE_AQ_RC_ENOSPC"; + case ICE_AQ_RC_ENOSYS: + return "ICE_AQ_RC_ENOSYS"; + case ICE_AQ_RC_ERANGE: + return "ICE_AQ_RC_ERANGE"; + case ICE_AQ_RC_EFLUSHED: + return "ICE_AQ_RC_EFLUSHED"; + case ICE_AQ_RC_BAD_ADDR: + return "ICE_AQ_RC_BAD_ADDR"; + case ICE_AQ_RC_EMODE: + return "ICE_AQ_RC_EMODE"; + case ICE_AQ_RC_EFBIG: + return "ICE_AQ_RC_EFBIG"; + case ICE_AQ_RC_ESBCOMP: + return "ICE_AQ_RC_ESBCOMP"; + case ICE_AQ_RC_ENOSEC: + return "ICE_AQ_RC_ENOSEC"; + case ICE_AQ_RC_EBADSIG: + return "ICE_AQ_RC_EBADSIG"; + case ICE_AQ_RC_ESVN: + return "ICE_AQ_RC_ESVN"; + case ICE_AQ_RC_EBADMAN: + return "ICE_AQ_RC_EBADMAN"; + case ICE_AQ_RC_EBADBUF: + return "ICE_AQ_RC_EBADBUF"; + case ICE_AQ_RC_EACCES_BMCU: + return "ICE_AQ_RC_EACCES_BMCU"; + } + + return "ICE_AQ_RC_UNKNOWN"; +} + +/** + * ice_stat_str - convert status err code to a string + * @stat_err: the status error code to convert + */ +const char *ice_stat_str(enum ice_status stat_err) +{ + switch (stat_err) { + case ICE_SUCCESS: + return "OK"; + case ICE_ERR_PARAM: + return "ICE_ERR_PARAM"; + case ICE_ERR_NOT_IMPL: + return "ICE_ERR_NOT_IMPL"; + case ICE_ERR_NOT_READY: + return 
"ICE_ERR_NOT_READY"; + case ICE_ERR_NOT_SUPPORTED: + return "ICE_ERR_NOT_SUPPORTED"; + case ICE_ERR_BAD_PTR: + return "ICE_ERR_BAD_PTR"; + case ICE_ERR_INVAL_SIZE: + return "ICE_ERR_INVAL_SIZE"; + case ICE_ERR_DEVICE_NOT_SUPPORTED: + return "ICE_ERR_DEVICE_NOT_SUPPORTED"; + case ICE_ERR_RESET_FAILED: + return "ICE_ERR_RESET_FAILED"; + case ICE_ERR_FW_API_VER: + return "ICE_ERR_FW_API_VER"; + case ICE_ERR_NO_MEMORY: + return "ICE_ERR_NO_MEMORY"; + case ICE_ERR_CFG: + return "ICE_ERR_CFG"; + case ICE_ERR_OUT_OF_RANGE: + return "ICE_ERR_OUT_OF_RANGE"; + case ICE_ERR_ALREADY_EXISTS: + return "ICE_ERR_ALREADY_EXISTS"; + case ICE_ERR_NVM: + return "ICE_ERR_NVM"; + case ICE_ERR_NVM_CHECKSUM: + return "ICE_ERR_NVM_CHECKSUM"; + case ICE_ERR_BUF_TOO_SHORT: + return "ICE_ERR_BUF_TOO_SHORT"; + case ICE_ERR_NVM_BLANK_MODE: + return "ICE_ERR_NVM_BLANK_MODE"; + case ICE_ERR_IN_USE: + return "ICE_ERR_IN_USE"; + case ICE_ERR_MAX_LIMIT: + return "ICE_ERR_MAX_LIMIT"; + case ICE_ERR_RESET_ONGOING: + return "ICE_ERR_RESET_ONGOING"; + case ICE_ERR_HW_TABLE: + return "ICE_ERR_HW_TABLE"; + case ICE_ERR_DOES_NOT_EXIST: + return "ICE_ERR_DOES_NOT_EXIST"; + case ICE_ERR_FW_DDP_MISMATCH: + return "ICE_ERR_FW_DDP_MISMATCH"; + case ICE_ERR_AQ_ERROR: + return "ICE_ERR_AQ_ERROR"; + case ICE_ERR_AQ_TIMEOUT: + return "ICE_ERR_AQ_TIMEOUT"; + case ICE_ERR_AQ_FULL: + return "ICE_ERR_AQ_FULL"; + case ICE_ERR_AQ_NO_WORK: + return "ICE_ERR_AQ_NO_WORK"; + case ICE_ERR_AQ_EMPTY: + return "ICE_ERR_AQ_EMPTY"; + case ICE_ERR_AQ_FW_CRITICAL: + return "ICE_ERR_AQ_FW_CRITICAL"; + } + + return "ICE_ERR_UNKNOWN"; +} + +/** + * ice_set_rss_lut - Set RSS LUT + * @vsi: Pointer to VSI structure + * @lut: Lookup table + * @lut_size: Lookup table size + * + * Returns 0 on success, negative on failure + */ +int ice_set_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size) +{ + struct ice_aq_get_set_rss_lut_params params = {}; + struct ice_hw *hw = &vsi->back->hw; + enum ice_status status; + + if (!lut) + return -EINVAL; + + params.vsi_handle = vsi->idx; + params.lut_size = lut_size; + params.lut_type = vsi->rss_lut_type; + params.lut = lut; + if (vsi->global_lut_id) + params.global_lut_id = *vsi->global_lut_id; + + status = ice_aq_set_rss_lut(hw, ¶ms); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Cannot set RSS lut, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_set_rss_key - Set RSS key + * @vsi: Pointer to the VSI structure + * @seed: RSS hash seed + * + * Returns 0 on success, negative on failure + */ +int ice_set_rss_key(struct ice_vsi *vsi, u8 *seed) +{ + struct ice_hw *hw = &vsi->back->hw; + enum ice_status status; + + if (!seed) + return -EINVAL; + + status = ice_aq_set_rss_key(hw, vsi->idx, (struct ice_aqc_get_set_rss_keys *)seed); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Cannot set RSS key, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_get_rss_lut - Get RSS LUT + * @vsi: Pointer to VSI structure + * @lut: Buffer to store the lookup table entries + * @lut_size: Size of buffer to store the lookup table entries + * + * Returns 0 on success, negative on failure + */ +int ice_get_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size) +{ + struct ice_aq_get_set_rss_lut_params params = {}; + struct ice_hw *hw = &vsi->back->hw; + enum ice_status status; + + if (!lut) + return -EINVAL; + + params.vsi_handle = vsi->idx; + params.lut_size = lut_size; + 
params.lut_type = vsi->rss_lut_type; + params.lut = lut; + if (vsi->global_lut_id) + params.global_lut_id = *vsi->global_lut_id; + + status = ice_aq_get_rss_lut(hw, &params); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Cannot get RSS lut, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_get_rss_key - Get RSS key + * @vsi: Pointer to VSI structure + * @seed: Buffer to store the key in + * + * Returns 0 on success, negative on failure + */ +int ice_get_rss_key(struct ice_vsi *vsi, u8 *seed) +{ + struct ice_hw *hw = &vsi->back->hw; + enum ice_status status; + + if (!seed) + return -EINVAL; + + status = ice_aq_get_rss_key(hw, vsi->idx, (struct ice_aqc_get_set_rss_keys *)seed); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Cannot get RSS key, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * ice_bridge_getlink - Get the hardware bridge mode + * @skb: skb buff + * @pid: process ID + * @seq: RTNL message seq + * @dev: the netdev being configured + * @filter_mask: filter mask passed in + * @nlflags: netlink flags passed in + * + * Return the bridge mode (VEB/VEPA) + */ +static int +#ifdef HAVE_NDO_DFLT_BRIDGE_GETLINK_VLAN_SUPPORT +ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, int nlflags) +#elif defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) +ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 __always_unused filter_mask, + int nlflags) +#else +ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 __always_unused filter_mask) +#endif +{ + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + u16 bmode; + + bmode = pf->first_sw->bridge_mode; + +#ifdef HAVE_NDO_DFLT_BRIDGE_GETLINK_VLAN_SUPPORT + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0, nlflags, + filter_mask, NULL); +#elif defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0, + nlflags); +#elif defined(HAVE_NDO_FDB_ADD_VID) || defined(NDO_DFLT_BRIDGE_GETLINK_HAS_BRFLAGS) + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0); +#else + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode); +#endif +} + +/** + * ice_vsi_update_bridge_mode - Update VSI for switching bridge mode (VEB/VEPA) + * @vsi: Pointer to VSI structure + * @bmode: Hardware bridge mode (VEB/VEPA) + * + * Returns 0 on success, negative on failure + */ +static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode) +{ + struct ice_aqc_vsi_props *vsi_props; + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int ret = 0; + + vsi_props = &vsi->info; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info = vsi->info; + + if (bmode == BRIDGE_MODE_VEB) + /* change from VEPA to VEB mode */ + ctxt->info.sw_flags |= (ICE_AQ_VSI_SW_FLAG_ALLOW_LB | + ICE_AQ_VSI_SW_FLAG_LOCAL_LB); + else + /* change from VEB to VEPA mode */ + ctxt->info.sw_flags &= ~(ICE_AQ_VSI_SW_FLAG_ALLOW_LB | + ICE_AQ_VSI_SW_FLAG_LOCAL_LB); + + ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for bridge mode failed, bmode = %d err %s aq_err %s\n", + bmode, 
ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + ret = -EIO; + goto out; + } + /* Update sw flags for book keeping */ + vsi_props->sw_flags = ctxt->info.sw_flags; + +out: + kfree(ctxt); + return ret; +} + +#ifdef HAVE_NDO_BRIDGE_SETLINK_EXTACK +/** + * ice_bridge_setlink - Set the hardware bridge mode + * @dev: the netdev being configured + * @nlh: RTNL message + * @flags: bridge setlink flags + * @extack: netlink extended ack + * + * Sets the bridge mode (VEB/VEPA) of the switch to which the netdev (VSI) is + * hooked up. Iterates through the PF VSI list and sets the loopback mode (if + * not already set) for all VSIs connected to this switch, and also updates the + * unicast switch filter rules for the corresponding switch of the netdev. + */ +static int +ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 __always_unused flags, + struct netlink_ext_ack __always_unused *extack) +#elif defined(HAVE_NDO_BRIDGE_SET_DEL_LINK_FLAGS) +static int +ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 __always_unused flags) +#else +static int ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh) +#endif { - int i, tx_err, rx_err, link_err = 0; + struct ice_netdev_priv *np = netdev_priv(dev); + struct ice_pf *pf = np->vsi->back; + struct nlattr *attr, *br_spec; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct ice_sw *pf_sw; + int rem, v, err = 0; - /* Caller of this function is expected to set the - * vsi->state __ICE_DOWN bit - */ - if (vsi->netdev) { - netif_carrier_off(vsi->netdev); - netif_tx_disable(vsi->netdev); + pf_sw = pf->first_sw; + /* find the attribute in the netlink message */ + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + + nla_for_each_nested(attr, br_spec, rem) { + __u16 mode; + + if (nla_type(attr) != IFLA_BRIDGE_MODE) + continue; + mode = nla_get_u16(attr); + if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB) + return -EINVAL; + /* Continue if bridge mode is not being flipped */ + if (mode == pf_sw->bridge_mode) + continue; + /* Iterate through the PF VSI list and update the loopback + * mode of each VSI + */ + ice_for_each_vsi(pf, v) { + if (!pf->vsi[v]) + continue; + err = ice_vsi_update_bridge_mode(pf->vsi[v], mode); + if (err) + return err; + } + + hw->evb_veb = (mode == BRIDGE_MODE_VEB); + /* Update the unicast switch filter rules for the corresponding + * switch of the netdev + */ + status = ice_update_sw_rule_bridge_mode(hw); + if (status) { + netdev_err(dev, "switch rule update failed, mode = %d err %s aq_err %s\n", + mode, ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + /* revert hw->evb_veb */ + hw->evb_veb = (pf_sw->bridge_mode == BRIDGE_MODE_VEB); + return -EIO; + } + + pf_sw->bridge_mode = mode; } - ice_vsi_dis_irq(vsi); + return 0; +} - tx_err = ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, 0); - if (tx_err) - netdev_err(vsi->netdev, - "Failed stop Tx rings, VSI %d error %d\n", - vsi->vsi_num, tx_err); +#ifdef HAVE_TX_TIMEOUT_TXQUEUE +/** + * ice_tx_timeout - Respond to a Tx Hang + * @netdev: network interface device structure + * @txqueue: Tx queue + */ +static void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) +#else +static void ice_tx_timeout(struct net_device *netdev) +#endif +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_ring *tx_ring = NULL; + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; +#ifndef HAVE_TX_TIMEOUT_TXQUEUE + unsigned int txqueue = 0; +#endif /* 
!HAVE_TX_TIMEOUT_TXQUEUE */ + unsigned int i; - rx_err = ice_vsi_stop_rx_rings(vsi); - if (rx_err) - netdev_err(vsi->netdev, - "Failed stop Rx rings, VSI %d error %d\n", - vsi->vsi_num, rx_err); + pf->tx_timeout_count++; - ice_napi_disable_all(vsi); +#ifndef HAVE_TX_TIMEOUT_TXQUEUE + /* find the stopped queue the same way dev_watchdog() does */ + for (i = 0; i < netdev->num_tx_queues; i++) { + unsigned long trans_start; + struct netdev_queue *q; - if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) { - link_err = ice_force_phys_link_state(vsi, false); - if (link_err) - netdev_err(vsi->netdev, - "Failed to set physical link down, VSI %d error %d\n", - vsi->vsi_num, link_err); + q = netdev_get_tx_queue(netdev, i); + trans_start = q->trans_start; + if (netif_xmit_stopped(q) && + time_after(jiffies, + trans_start + netdev->watchdog_timeo)) { + txqueue = i; + break; + } } - ice_for_each_txq(vsi, i) - ice_clean_tx_ring(vsi->tx_rings[i]); + if (i == netdev->num_tx_queues) { + netdev_info(netdev, "tx_timeout: no netdev hung queue found\n"); + return; + } +#endif /* !HAVE_TX_TIMEOUT_TXQUEUE */ - ice_for_each_rxq(vsi, i) - ice_clean_rx_ring(vsi->rx_rings[i]); + /* Check if PFC is enabled for the TC to which the queue belongs. + * If yes then the Tx timeout is not caused by a hung queue and + * there is no need to reset and rebuild + */ + if (ice_is_pfc_causing_hung_q(pf, txqueue)) { + dev_info(ice_pf_to_dev(pf), "Fake Tx hang detected on queue %u, timeout caused by PFC storm\n", + txqueue); + return; + } - if (tx_err || rx_err || link_err) { - netdev_err(vsi->netdev, - "Failed to close VSI 0x%04X on switch 0x%04X\n", - vsi->vsi_num, vsi->vsw->sw_id); - return -EIO; + /* now that we have an index, find the tx_ring struct */ + for (i = 0; i < vsi->num_txq; i++) + if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) + if (txqueue == vsi->tx_rings[i]->q_index) { + tx_ring = vsi->tx_rings[i]; + break; + } + + /* Reset recovery level if enough time has elapsed after last timeout. + * Also ensure no new reset action happens before next timeout period. 
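+ * The recovery level escalates one step per timeout: PFR first, then + * CORER, then GLOBR; past that the device is declared unrecoverable.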
+ */ + if (time_after(jiffies, (pf->tx_timeout_last_recovery + HZ * 20))) + pf->tx_timeout_recovery_level = 1; + else if (time_before(jiffies, (pf->tx_timeout_last_recovery + + netdev->watchdog_timeo))) + return; + + if (tx_ring) { + struct ice_hw *hw = &pf->hw; + u32 head, val = 0; + + head = (rd32(hw, QTX_COMM_HEAD(vsi->txq_map[txqueue])) & + QTX_COMM_HEAD_HEAD_M) >> QTX_COMM_HEAD_HEAD_S; + /* Read interrupt register */ + val = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); + + netdev_info(netdev, "tx_timeout: VSI_num: %d, Q %u, NTC: 0x%x, HW_HEAD: 0x%x, NTU: 0x%x, INT: 0x%x\n", + vsi->vsi_num, tx_ring->q_index, + tx_ring->next_to_clean, head, tx_ring->next_to_use, + val); } - return 0; + pf->tx_timeout_last_recovery = jiffies; + netdev_info(netdev, "tx_timeout recovery level %d, txqueue %u\n", + pf->tx_timeout_recovery_level, txqueue); + + switch (pf->tx_timeout_recovery_level) { + case 1: + set_bit(ICE_PFR_REQ, pf->state); + break; + case 2: + set_bit(ICE_CORER_REQ, pf->state); + break; + case 3: + set_bit(ICE_GLOBR_REQ, pf->state); + break; + default: + netdev_err(netdev, "tx_timeout recovery unsuccessful, device is in unrecoverable state.\n"); + set_bit(ICE_DOWN, pf->state); + set_bit(ICE_VSI_NEEDS_RESTART, vsi->state); + set_bit(ICE_SERVICE_DIS, pf->state); + break; + } + + ice_service_task_schedule(pf); + pf->tx_timeout_recovery_level++; } /** - * ice_vsi_setup_tx_rings - Allocate VSI Tx queue resources - * @vsi: VSI having resources allocated - * - * Return 0 on success, negative on failure + * ice_udp_tunnel_add - Get notifications about UDP tunnel ports that come up + * @netdev: This physical port's netdev + * @ti: Tunnel endpoint information */ -int ice_vsi_setup_tx_rings(struct ice_vsi *vsi) +static void __maybe_unused +ice_udp_tunnel_add(struct net_device *netdev, struct udp_tunnel_info *ti) { - int i, err = 0; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_tnl_entry *tnl_entry; + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + enum ice_tunnel_type tnl_type; + u16 port = ntohs(ti->port); + enum ice_status status; - if (!vsi->num_txq) { - dev_err(&vsi->back->pdev->dev, "VSI %d has 0 Tx queues\n", - vsi->vsi_num); - return -EINVAL; + if (ice_dcf_is_udp_tunnel_capable(&pf->hw)) { + netdev_info(netdev, "Cannot config tunnel, the capability is used by DCF\n"); + return; } - ice_for_each_txq(vsi, i) { - struct ice_ring *ring = vsi->tx_rings[i]; + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + tnl_type = TNL_VXLAN; + break; + case UDP_TUNNEL_TYPE_GENEVE: + tnl_type = TNL_GENEVE; + break; + default: + netdev_err(netdev, "Unknown tunnel type\n"); + return; + } - if (!ring) - return -EINVAL; + status = ice_is_create_tunnel_possible(&pf->hw, tnl_type, port); + if (status == ICE_ERR_OUT_OF_RANGE) { + netdev_info(netdev, "Max tunneled UDP ports reached, port %d not added\n", + port); + return; + } - ring->netdev = vsi->netdev; - err = ice_setup_tx_ring(ring); - if (err) - break; + spin_lock(&pf->tnl_lock); + tnl_entry = ice_find_tnl(pf, tnl_type, port); + + if (tnl_entry) { + tnl_entry->ref_cnt++; + /* if the entry is scheduled for deletion, cancel this */ + tnl_entry->state &= ~ICE_TNL_STATE_TO_DEL; + } else { + tnl_entry = devm_kzalloc(ice_pf_to_dev(pf), + sizeof(*tnl_entry), GFP_ATOMIC); + if (!tnl_entry) { + spin_unlock(&pf->tnl_lock); + return; + } + tnl_entry->type = tnl_type; + tnl_entry->port = port; + tnl_entry->state = ICE_TNL_STATE_TO_ADD; + tnl_entry->ref_cnt = 1; + INIT_LIST_HEAD(&tnl_entry->node); + 
list_add_tail(&tnl_entry->node, &pf->tnl_list); } + spin_unlock(&pf->tnl_lock); - return err; + /* kick the service_task so that it can create the tunnel */ + ice_service_task_schedule(vsi->back); } /** - * ice_vsi_setup_rx_rings - Allocate VSI Rx queue resources - * @vsi: VSI having resources allocated - * - * Return 0 on success, negative on failure + * ice_udp_tunnel_del - Get notifications about UDP tunnel ports that go away + * @netdev: This physical port's netdev + * @ti: Tunnel endpoint information */ -int ice_vsi_setup_rx_rings(struct ice_vsi *vsi) +static void __maybe_unused +ice_udp_tunnel_del(struct net_device *netdev, struct udp_tunnel_info *ti) { - int i, err = 0; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + enum ice_tunnel_type tnl_type; + struct ice_tnl_entry *entry; + u16 port = ntohs(ti->port); - if (!vsi->num_rxq) { - dev_err(&vsi->back->pdev->dev, "VSI %d has 0 Rx queues\n", - vsi->vsi_num); - return -EINVAL; + if (ice_dcf_is_udp_tunnel_capable(&pf->hw)) { + netdev_info(netdev, "Cannot config tunnel, the capability is used by DCF\n"); + return; } - ice_for_each_rxq(vsi, i) { - struct ice_ring *ring = vsi->rx_rings[i]; - - if (!ring) - return -EINVAL; + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + tnl_type = TNL_VXLAN; + break; + case UDP_TUNNEL_TYPE_GENEVE: + tnl_type = TNL_GENEVE; + break; + default: + netdev_err(netdev, "Unknown tunnel type\n"); + return; + } - ring->netdev = vsi->netdev; - err = ice_setup_rx_ring(ring); - if (err) - break; + spin_lock(&pf->tnl_lock); + entry = ice_find_tnl(pf, tnl_type, port); + if (entry) { + if (entry->ref_cnt > 1) { + entry->ref_cnt--; + } else if (entry->state & ICE_TNL_STATE_ACTIVE) { + entry->ref_cnt = 0; + entry->state |= ICE_TNL_STATE_TO_DEL; + } else { + list_del(&entry->node); + devm_kfree(ice_pf_to_dev(pf), entry); + } + } else { + netdev_err(netdev, "Unable to find Tunnel, port %u, tnl_type %u\n", + port, tnl_type); } + spin_unlock(&pf->tnl_lock); - return err; + /* kick the service_task so that it can destroy the tunnel */ + ice_service_task_schedule(vsi->back); } +#if defined(HAVE_VXLAN_RX_OFFLOAD) && !defined(HAVE_UDP_ENC_RX_OFFLOAD) +#if IS_ENABLED(CONFIG_VXLAN) /** - * ice_vsi_open - Called when a network interface is made active - * @vsi: the VSI to open - * - * Initialization of the VSI - * - * Returns 0 on success, negative value on error + * ice_add_vxlan_port - Get notifications about VxLAN ports that come up + * @netdev: This physical port's netdev + * @sa_family: Socket Family that VxLAN is notifying us about + * @port: New UDP port number that VxLAN started listening to */ -static int ice_vsi_open(struct ice_vsi *vsi) +static void +ice_add_vxlan_port(struct net_device *netdev, sa_family_t sa_family, + __be16 port) { - char int_name[ICE_INT_NAME_STR_LEN]; - struct ice_pf *pf = vsi->back; - int err; + struct udp_tunnel_info ti = { + .type = UDP_TUNNEL_TYPE_VXLAN, + .sa_family = sa_family, + .port = port, + }; - /* allocate descriptors */ - err = ice_vsi_setup_tx_rings(vsi); - if (err) - goto err_setup_tx; + ice_udp_tunnel_add(netdev, &ti); +} - err = ice_vsi_setup_rx_rings(vsi); - if (err) - goto err_setup_rx; +/** + * ice_del_vxlan_port - Get notifications about VxLAN ports that go away + * @netdev: This physical port's netdev + * @sa_family: Socket Family that VxLAN is notifying us about + * @port: UDP port number that VxLAN stopped listening to + */ +static void +ice_del_vxlan_port(struct net_device *netdev, sa_family_t 
sa_family, + __be16 port) +{ + struct udp_tunnel_info ti = { + .type = UDP_TUNNEL_TYPE_VXLAN, + .sa_family = sa_family, + .port = port, + }; - err = ice_vsi_cfg(vsi); - if (err) - goto err_setup_rx; + ice_udp_tunnel_del(netdev, &ti); +} +#endif /* CONFIG_VXLAN */ +#endif /* HAVE_VXLAN_RX_OFFLOAD && !HAVE_UDP_ENC_RX_OFFLOAD */ - snprintf(int_name, sizeof(int_name) - 1, "%s-%s", - dev_driver_string(&pf->pdev->dev), vsi->netdev->name); - err = ice_vsi_req_irq_msix(vsi, int_name); - if (err) - goto err_setup_rx; +#if defined(HAVE_GENEVE_RX_OFFLOAD) && !defined(HAVE_UDP_ENC_RX_OFFLOAD) +#if IS_ENABLED(CONFIG_GENEVE) +/** + * ice_add_geneve_port - Get notifications about GENEVE ports that come up + * @netdev: This physical port's netdev + * @sa_family: Socket Family that GENEVE is notifying us about + * @port: New UDP port number that GENEVE started listening to + */ +static void +ice_add_geneve_port(struct net_device *netdev, sa_family_t sa_family, + __be16 port) +{ + struct udp_tunnel_info ti = { + .type = UDP_TUNNEL_TYPE_GENEVE, + .sa_family = sa_family, + .port = port, + }; + + ice_udp_tunnel_add(netdev, &ti); +} + +/** + * ice_del_geneve_port - Get notifications about GENEVE ports that go away + * @netdev: This physical port's netdev + * @sa_family: Socket Family that GENEVE is notifying us about + * @port: UDP port number that GENEVE stopped listening to + */ +static void +ice_del_geneve_port(struct net_device *netdev, sa_family_t sa_family, + __be16 port) +{ + struct udp_tunnel_info ti = { + .type = UDP_TUNNEL_TYPE_GENEVE, + .sa_family = sa_family, + .port = port, + }; - /* Notify the stack of the actual queue counts. */ - err = netif_set_real_num_tx_queues(vsi->netdev, vsi->num_txq); - if (err) - goto err_set_qs; + ice_udp_tunnel_del(netdev, &ti); +} - err = netif_set_real_num_rx_queues(vsi->netdev, vsi->num_rxq); - if (err) - goto err_set_qs; +#endif /* CONFIG_GENEVE */ +#endif /* HAVE_GENEVE_RX_OFFLOAD && !HAVE_UDP_ENC_RX_OFFLOAD */ - err = ice_up_complete(vsi); - if (err) - goto err_up_complete; +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_setup_tc_cls_flower - flower classifier offloads + * @np: net device to configure + * @filter_dev: device on which filter is added + * @cls_flower: offload data + */ +#ifdef HAVE_TC_INDIR_BLOCK +static int +ice_setup_tc_cls_flower(struct ice_netdev_priv *np, + struct net_device *filter_dev, + struct flow_cls_offload *cls_flower) +#else +static int +ice_setup_tc_cls_flower(struct ice_netdev_priv *np, + struct net_device __always_unused *filter_dev, + struct tc_cls_flower_offload *cls_flower) +#endif /* HAVE_TC_INDIR_BLOCK */ +{ + struct ice_vsi *vsi = np->vsi; - return 0; +#ifdef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV + if (cls_flower->common.chain_index) + return -EOPNOTSUPP; +#endif /* HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV */ -err_up_complete: - ice_down(vsi); -err_set_qs: - ice_vsi_free_irq(vsi); -err_setup_rx: - ice_vsi_free_rx_rings(vsi); -err_setup_tx: - ice_vsi_free_tx_rings(vsi); + if (ice_is_dcf_enabled(vsi->back)) + return -EOPNOTSUPP; - return err; + switch (cls_flower->command) { + case FLOW_CLS_REPLACE: + return ice_add_cls_flower(filter_dev, vsi, cls_flower); + case FLOW_CLS_DESTROY: + return ice_del_cls_flower(vsi, cls_flower); + default: + return -EINVAL; + } } +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO /** - * ice_vsi_release_all - Delete all VSIs - * @pf: PF from which all VSIs are being removed + * ice_setup_tc_block_cb - callback handler registered for TC block + * @type: TC SETUP type + * @type_data: TC flower offload data that 
contains user input + * @cb_priv: netdev private data */ -static void ice_vsi_release_all(struct ice_pf *pf) +static int +ice_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { - int err, i; - - if (!pf->vsi) - return; - - ice_for_each_vsi(pf, i) { - if (!pf->vsi[i]) - continue; + struct ice_netdev_priv *np = (struct ice_netdev_priv *)cb_priv; - err = ice_vsi_release(pf->vsi[i]); - if (err) - dev_dbg(&pf->pdev->dev, - "Failed to release pf->vsi[%d], err %d, vsi_num = %d\n", - i, err, pf->vsi[i]->vsi_num); + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_setup_tc_cls_flower(np, np->vsi->netdev, + (struct flow_cls_offload *) + type_data); + default: + return -EOPNOTSUPP; } } /** - * ice_ena_vsi - resume a VSI - * @vsi: the VSI being resume - * @locked: is the rtnl_lock already held + * ice_validate_mqprio_qopt - Validate TCF input parameters + * @vsi: Pointer to VSI + * @mqprio_qopt: input parameters for mqprio queue configuration + * + * This function validates MQPRIO params, such as qcount (power of 2 wherever + * needed), and make sure user doesn't specify qcount and BW rate limit + * for TCs, which are more than "num_tc" */ -static int ice_ena_vsi(struct ice_vsi *vsi, bool locked) +static int +ice_validate_mqprio_qopt(struct ice_vsi *vsi, + struct tc_mqprio_qopt_offload *mqprio_qopt) { - int err = 0; + u64 sum_max_rate = 0, sum_min_rate = 0; + int non_power_of_2_qcount = 0; + struct ice_pf *pf = vsi->back; + int max_rss_q_cnt = 0; + struct device *dev; + int i, speed; + u8 num_tc; - if (!test_bit(__ICE_NEEDS_RESTART, vsi->state)) - return 0; + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + + if (mqprio_qopt->qopt.offset[0] != 0 || + mqprio_qopt->qopt.num_tc < 1 || + mqprio_qopt->qopt.num_tc > ICE_CHNL_MAX_TC) + return -EINVAL; - clear_bit(__ICE_NEEDS_RESTART, vsi->state); + dev = ice_pf_to_dev(pf); + vsi->ch_rss_size = 0; + num_tc = mqprio_qopt->qopt.num_tc; - if (vsi->netdev && vsi->type == ICE_VSI_PF) { - if (netif_running(vsi->netdev)) { - if (!locked) - rtnl_lock(); + for (i = 0; num_tc; i++) { + int qcount = mqprio_qopt->qopt.count[i]; + u64 max_rate, min_rate; + + max_rate = mqprio_qopt->max_rate[i]; + + if (!qcount) + return -EINVAL; + + if (!i && !is_power_of_2(qcount)) { + dev_err(dev, "TC0:qcount[%d] must be a power of 2\n", + qcount); + return -EINVAL; + } else if (non_power_of_2_qcount) { + if (qcount > non_power_of_2_qcount) { + dev_err(dev, "TC%d:qcount[%d] > non_power_of_2_qcount [%d]\n", + i, qcount, non_power_of_2_qcount); + return -EINVAL; + } else if (qcount < non_power_of_2_qcount) { + /* it must be power of 2, otherwise fail */ + if (!is_power_of_2(qcount)) { + dev_err(dev, "qcount must be a power of 2, TC%d: qcnt[%d] < non_power_of_2_qcount [%d]\n", + i, qcount, + non_power_of_2_qcount); + return -EINVAL; + } + } + } else if (!is_power_of_2(qcount)) { + /* after tc0, next TCs qcount can be non-power of 2, + * if so, set channel RSS size to be the count of that + * TC + */ + non_power_of_2_qcount = qcount; + max_rss_q_cnt = qcount; + dev_dbg(dev, "TC%d:count[%d] non power of 2\n", i, + qcount); + } + + /* figure out max_rss_q_cnt based on TC's qcount */ + if (max_rss_q_cnt) { + if (qcount > max_rss_q_cnt) + max_rss_q_cnt = qcount; + } else { + max_rss_q_cnt = qcount; + } - err = ice_open(vsi->netdev); + /* Convert input bandwidth from Bytes/s to Kbps */ + /* TC tool converts the bandwidth rate limit into Bytes/s when + * passing it down to the driver whereas the TC command can + * take bandwidth inputs in Kbps, Mbps or Gbps + */ + 
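/* Worked example, assuming ICE_BW_KBPS_DIVISOR is 125 (Bytes/s -> + * Kbps is a multiply by 8 and a divide by 1000): a rate passed down + * as 1,250,000 Bytes/s becomes 1,250,000 / 125 = 10,000 Kbps, + * i.e. 10 Mbps. + */ + 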
do_div(max_rate, ICE_BW_KBPS_DIVISOR); + sum_max_rate += max_rate; + + /* min_rate is minimum guaranteed rate and it can't be zero */ + min_rate = mqprio_qopt->min_rate[i]; + do_div(min_rate, ICE_BW_KBPS_DIVISOR); + if (min_rate && min_rate < ICE_MIN_BW_LIMIT) { + dev_err(dev, "TC%d: min_rate(%llu Kbps) < %u Kbps\n", i, + min_rate, ICE_MIN_BW_LIMIT); + return -EINVAL; + } + if (min_rate % ICE_MIN_BW_LIMIT != 0) { + dev_err(dev, "TC%d: Min Rate not in increment of %u Kbps", + i, ICE_MIN_BW_LIMIT); + return -EINVAL; + } + if (max_rate % ICE_MIN_BW_LIMIT != 0) { + dev_err(dev, "TC%d: Max Rate not in increment of %u Kbps", + i, ICE_MIN_BW_LIMIT); + return -EINVAL; + } + sum_min_rate += min_rate; - if (!locked) - rtnl_unlock(); + /* min_rate can't be more than max_rate, except when max_rate + * is zero (which is valid and it means bandwidth is sought for + * max line rate). In such a case min_rate can be more than max. + */ + if (max_rate && min_rate > max_rate) { + dev_err(dev, "min_rate %llu Kbps can't be more than max_rate %llu Kbps\n", + min_rate, max_rate); + return -EINVAL; } + + if (i >= mqprio_qopt->qopt.num_tc - 1) + break; + if (mqprio_qopt->qopt.offset[i + 1] != + (mqprio_qopt->qopt.offset[i] + qcount)) + return -EINVAL; + } + if (vsi->num_rxq < + (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i])) + return -EINVAL; + if (vsi->num_txq < + (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i])) + return -EINVAL; + + speed = ice_get_link_speed_kbps(vsi); + if (sum_min_rate && sum_min_rate > (u64)speed) { + dev_err(dev, "Invalid min Tx rate(%llu) Kbps > speed (%u) Kbps specified\n", + sum_min_rate, speed); + return -EINVAL; + } + if (sum_max_rate && sum_max_rate > (u64)speed) { + dev_err(dev, "Invalid max Tx rate(%llu) Kbps > speed(%u) Kbps specified\n", + sum_max_rate, speed); + return -EINVAL; } - return err; + /* make sure vsi->ch_rss_size is set correctly based on TC's qcount */ + vsi->ch_rss_size = max_rss_q_cnt; + + return 0; } /** - * ice_pf_ena_all_vsi - Resume all VSIs on a PF - * @pf: the PF - * @locked: is the rtnl_lock already held + * ice_add_vsi_to_fdir - add a VSI to the flow director group for PF + * @pf: ptr to PF device + * @vsi: ptr to VSI */ -#ifdef CONFIG_DCB -int ice_pf_ena_all_vsi(struct ice_pf *pf, bool locked) +static int ice_add_vsi_to_fdir(struct ice_pf *pf, struct ice_vsi *vsi) { - int v; + struct device *dev = ice_pf_to_dev(pf); + bool added = false; + struct ice_hw *hw; + int flow; - ice_for_each_vsi(pf, v) - if (pf->vsi[v]) - if (ice_ena_vsi(pf->vsi[v], locked)) - return -EIO; + if (!(vsi->num_gfltr || vsi->num_bfltr)) + return -EINVAL; + + hw = &pf->hw; + for (flow = 0; flow < ICE_FLTR_PTYPE_MAX; flow++) { + enum ice_block blk = ICE_BLK_FD; + struct ice_fd_hw_prof *prof; + enum ice_status status; + u64 entry_h; + int tun; + + if (!(hw->fdir_prof && hw->fdir_prof[flow] && + hw->fdir_prof[flow]->cnt)) + continue; + + for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) { + enum ice_flow_priority prio; + u64 prof_id; + + /* add this VSI to FDir profile for this flow */ + prio = ICE_FLOW_PRIO_NORMAL; + prof = hw->fdir_prof[flow]; + prof_id = flow + tun * ICE_FLTR_PTYPE_MAX; + status = ice_flow_add_entry(hw, blk, prof_id, + prof->vsi_h[0], vsi->idx, + prio, prof->fdir_seg[tun], + NULL, 0, &entry_h); + if (status) { + dev_err(dev, "channel VSI idx %d, not able to add to group %d\n", + vsi->idx, flow); + continue; + } + + prof->entry_h[prof->cnt][tun] = entry_h; + } + /* store VSI for filter replay and delete */ + prof->vsi_h[prof->cnt] = vsi->idx; + 
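/* bump cnt only after both the entry handles and the VSI handle + * for this slot have been recorded, so prof->cnt always counts + * fully initialized slots + */ + 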
prof->cnt++; + + /* loop bookkeeping */ + added = true; + dev_dbg(dev, "VSI idx %d added to fdir group %d\n", vsi->idx, + flow); + } + + if (!added) + dev_dbg(dev, "VSI idx %d not added to fdir groups\n", vsi->idx); + else + set_bit(ICE_CHNL_FEATURE_FD_ENA, vsi->features); return 0; } -#endif /* CONFIG_DCB */ /** - * ice_vsi_rebuild_by_type - Rebuild VSI of a given type - * @pf: pointer to the PF instance - * @type: VSI type to rebuild + * ice_add_channel - add a channel by adding VSI + * @pf: ptr to PF device + * @sw_id: underlying HW switching element ID + * @ch: ptr to channel structure * - * Iterates through the pf->vsi array and rebuilds VSIs of the requested type + * Add a channel (VSI) using add_vsi and queue_map */ -static int ice_vsi_rebuild_by_type(struct ice_pf *pf, enum ice_vsi_type type) +static int ice_add_channel(struct ice_pf *pf, u16 sw_id, struct ice_channel *ch) { - enum ice_status status; - int i, err; + struct device *dev = ice_pf_to_dev(pf); + struct ice_vsi *vsi; - ice_for_each_vsi(pf, i) { - struct ice_vsi *vsi = pf->vsi[i]; + if (ch->type != ICE_VSI_CHNL) { + dev_err(dev, "add new VSI failed, ch->type %d\n", ch->type); + return -EINVAL; + } - if (!vsi || vsi->type != type) - continue; + vsi = ice_chnl_vsi_setup(pf, pf->hw.port_info, ch); + if (!vsi || vsi->type != ICE_VSI_CHNL) { + dev_err(dev, "create chnl VSI failure\n"); + return -EINVAL; + } - /* rebuild the VSI */ - err = ice_vsi_rebuild(vsi); - if (err) { - dev_err(&pf->pdev->dev, - "rebuild VSI failed, err %d, VSI index %d, type %d\n", - err, vsi->idx, type); - return err; - } + /* set/clear VSI level feature flag for packet based optimization + * (this is related to SW triggered interrupt from napi_poll - which is + * generally based off data packets or not) + */ + if (test_bit(ICE_FLAG_CHNL_PKT_INSPECT_OPT_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_PKT_INSPECT_OPT_ENA, vsi->features); - /* replay filters for the VSI */ - status = ice_replay_vsi(&pf->hw, vsi->idx); - if (status) { - dev_err(&pf->pdev->dev, - "replay VSI failed, status %d, VSI index %d, type %d\n", - status, vsi->idx, type); - return -EIO; - } + /* set/clear VSI level feature flag for ADQ (aka channel) VSIs + * based on PF level private flags: this flag is meant to trigger a + * clean of the Rx queue upon busy_poll stop and after that to clean + * once only. 
+ */ + if (test_bit(ICE_FLAG_CHNL_PKT_CLEAN_BP_STOP_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_PKT_CLEAN_BP_STOP_ENA, + vsi->features); - /* Re-map HW VSI number, using VSI handle that has been - * previously validated in ice_replay_vsi() call above - */ - vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + /* set/clear inline flow-director bits for newly created VSI based + * on PF level private flags + */ + if (test_bit(ICE_FLAG_CHNL_INLINE_FD_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA, vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_INLINE_FD_ENA, vsi->features); - /* enable the VSI */ - err = ice_ena_vsi(vsi, false); - if (err) { - dev_err(&pf->pdev->dev, - "enable VSI failed, err %d, VSI index %d, type %d\n", - err, vsi->idx, type); - return err; - } + if (test_bit(ICE_FLAG_CHNL_INLINE_FD_MARK_ENA, pf->flags)) + set_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, vsi->features); + else + clear_bit(ICE_CHNL_FEATURE_INLINE_FD_MARK_ENA, vsi->features); + + /* if VSI has some FD resources reserved (either from guaranteed or + * best-effort quota), add VSI into VSI group which has FD + * input set defined so that the newly created VSI can use FD + * resources (side-band flow director type filters and/or + * inline flow-director type of filters which are typically + * setup during normal transmit path if packet being transmitted + * has SYN, SYN+ACK, RST, FIN flags set) + */ + clear_bit(ICE_CHNL_FEATURE_FD_ENA, vsi->features); - dev_info(&pf->pdev->dev, "VSI rebuilt. VSI index %d, type %d\n", - vsi->idx, type); - } + ice_add_vsi_to_fdir(pf, vsi); + + ch->sw_id = sw_id; + ch->vsi_num = vsi->vsi_num; + ch->info.mapping_flags = vsi->info.mapping_flags; + ch->ch_vsi = vsi; + /* initialize filter type to be INVALID */ + ch->fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; + /* set the back pointer of channel for newly created VSI */ + vsi->ch = ch; + + memcpy(&ch->info.q_mapping, &vsi->info.q_mapping, + sizeof(vsi->info.q_mapping)); + memcpy(&ch->info.tc_mapping, vsi->info.tc_mapping, + sizeof(vsi->info.tc_mapping)); return 0; } /** - * ice_update_pf_netdev_link - Update PF netdev link status - * @pf: pointer to the PF instance + * ice_chnl_cfg_res - configure rings and vectors for a channel + * @vsi: the VSI being setup + * @ch: ptr to channel structure + * + * Configure channel specific resources such as rings and vectors. 
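+ * Also writes the per-vector Tx/Rx ITR registers when DIM is not in + * use (the !ITR_IS_DYNAMIC() paths below), followed by one register + * flush for the whole channel.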
*/ -static void ice_update_pf_netdev_link(struct ice_pf *pf) +static void ice_chnl_cfg_res(struct ice_vsi *vsi, struct ice_channel *ch) { - bool link_up; int i; - ice_for_each_vsi(pf, i) { - struct ice_vsi *vsi = pf->vsi[i]; - if (!vsi || vsi->type != ICE_VSI_PF) - return; + for (i = 0; i < ch->num_txq; i++) { + struct ice_q_vector *tx_q_vector, *rx_q_vector; + struct ice_ring *tx_ring, *rx_ring; + struct ice_ring_container *rc; - ice_get_link_status(pf->vsi[i]->port_info, &link_up); - if (link_up) { - netif_carrier_on(pf->vsi[i]->netdev); - netif_tx_wake_all_queues(pf->vsi[i]->netdev); - } else { - netif_carrier_off(pf->vsi[i]->netdev); - netif_tx_stop_all_queues(pf->vsi[i]->netdev); + tx_ring = vsi->tx_rings[ch->base_q + i]; + rx_ring = vsi->rx_rings[ch->base_q + i]; + if (!tx_ring || !rx_ring) + continue; + + /* set up the ring as channel enabled */ + tx_ring->ch = ch; + rx_ring->ch = ch; + + tx_ring->ch_inline_fd_cnt_index = ch->fd_cnt_index; + + /* following code block sets up vector specific attributes */ + tx_q_vector = tx_ring->q_vector; + rx_q_vector = rx_ring->q_vector; + if (!tx_q_vector && !rx_q_vector) + continue; + + if (tx_q_vector) { + tx_q_vector->ch = ch; + tx_q_vector->state_flags = 0; + tx_q_vector->max_limit_process_rx_queues = + ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT; + /* setup Tx and Rx ITR setting if DIM is off */ + rc = &tx_q_vector->tx; + if (!ITR_IS_DYNAMIC(rc)) + ice_write_itr(rc, rc->itr_setting); + } + if (rx_q_vector) { + rx_q_vector->ch = ch; + rx_q_vector->state_flags = 0; + rx_q_vector->max_limit_process_rx_queues = + ICE_MAX_LIMIT_PROCESS_RX_PKTS_DFLT; + /* setup Tx and Rx ITR setting if DIM is off */ + rc = &rx_q_vector->rx; + if (!ITR_IS_DYNAMIC(rc)) + ice_write_itr(rc, rc->itr_setting); } } + + /* it is safe to assume that, if channel has non-zero num_t[r]xq, then + * GLINT_ITR register would have been written to perform in-context + * update, hence perform flush + */ + if (ch->num_txq || ch->num_rxq) + ice_flush(&vsi->back->hw); } /** - * ice_rebuild - rebuild after reset - * @pf: PF to rebuild - * @reset_type: type of reset + * ice_cfg_chnl_all_res - configure channel resources + * @vsi: ptr to main VSI + * @ch: ptr to channel structure + * + * This function configures channel specific resources such as flow-director + * counter index, and other resources such as queues, vectors, ITR settings */ -static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) +static void +ice_cfg_chnl_all_res(struct ice_vsi *vsi, struct ice_channel *ch) { - struct device *dev = &pf->pdev->dev; - struct ice_hw *hw = &pf->hw; - enum ice_status ret; - int err; - - if (test_bit(__ICE_DOWN, pf->state)) - goto clear_recovery; - - dev_dbg(dev, "rebuilding PF after reset_type=%d\n", reset_type); + struct ice_pf *pf = vsi->back; - ret = ice_init_all_ctrlq(hw); - if (ret) { - dev_err(dev, "control queues init failed %d\n", ret); - goto err_init_ctrlq; - } + /* setup inline-FD counter index per channel, eventually + * using a separate counter index per channel, to offer + * better granularity and QoS per channel for RSS and FD + */ + ch->fd_cnt_index = ICE_FD_CH_STAT_IDX(pf->hw.fd_ctr_base); + /* reset source for all counters is CORER, typically upon + * driver load, those counters may have stale value, hence + * initialize counter to zero, access type for counters is RWC + */ + ice_clear_cntr(pf, ch->fd_cnt_index); - /* if DDP was previously loaded successfully */ - if (!ice_is_safe_mode(pf)) { - /* reload the SW DB of filter tables */ - if (reset_type == ICE_RESET_PFR) - 
ice_fill_blk_tbls(hw); - else - /* Reload DDP Package after CORER/GLOBR reset */ - ice_load_pkg(NULL, pf); - } + /* configure channel (aka ADQ) resources such as queues, vectors, + * ITR settings for channel specific vectors and anything else + */ + ice_chnl_cfg_res(vsi, ch); +} - ret = ice_clear_pf_cfg(hw); - if (ret) { - dev_err(dev, "clear PF configuration failed %d\n", ret); - goto err_init_ctrlq; - } +/** + * ice_setup_hw_channel - setup new channel + * @pf: ptr to PF device + * @vsi: the VSI being setup + * @ch: ptr to channel structure + * @sw_id: underlying HW switching element ID + * @type: type of channel to be created (VMDq2/VF) + * + * Setup new channel (VSI) based on specified type (VMDq2/VF) + * and configures Tx rings accordingly + */ +static int +ice_setup_hw_channel(struct ice_pf *pf, struct ice_vsi *vsi, + struct ice_channel *ch, u16 sw_id, u8 type) +{ + struct device *dev = ice_pf_to_dev(pf); + int ret; - ice_clear_pxe_mode(hw); + ch->base_q = vsi->next_base_q; + ch->type = type; - ret = ice_get_caps(hw); + ret = ice_add_channel(pf, sw_id, ch); if (ret) { - dev_err(dev, "ice_get_caps failed %d\n", ret); - goto err_init_ctrlq; + dev_err(dev, "failed to add_channel using sw_id %u\n", sw_id); + return ret; } - err = ice_sched_init_port(hw->port_info); - if (err) - goto err_sched_init_port; - - err = ice_update_link_info(hw->port_info); - if (err) - dev_err(&pf->pdev->dev, "Get link status error %d\n", err); + /* configure/setup ADQ specific resources */ + ice_cfg_chnl_all_res(vsi, ch); - /* start misc vector */ - err = ice_req_irq_msix_misc(pf); - if (err) { - dev_err(dev, "misc vector setup failed: %d\n", err); - goto err_sched_init_port; - } + /* make sure to update the next_base_q so that subsequent channel's + * (aka ADQ) VSI queue map is correct + */ + vsi->next_base_q = vsi->next_base_q + ch->num_rxq; + dev_dbg(dev, "added channel: vsi_num %u, num_rxq %u\n", ch->vsi_num, + ch->num_rxq); - if (test_bit(ICE_FLAG_DCB_ENA, pf->flags)) - ice_dcb_rebuild(pf); + return 0; +} - /* rebuild PF VSI */ - err = ice_vsi_rebuild_by_type(pf, ICE_VSI_PF); - if (err) { - dev_err(dev, "PF VSI rebuild failed: %d\n", err); - goto err_vsi_rebuild; - } +/** + * ice_setup_channel - setup new channel using uplink element + * @pf: ptr to PF device + * @vsi: the VSI being setup + * @ch: ptr to channel structure + * + * Setup new channel (VSI) based on specified type (VMDq2/VF) + * and uplink switching element + */ +static bool +ice_setup_channel(struct ice_pf *pf, struct ice_vsi *vsi, + struct ice_channel *ch) +{ + struct device *dev = ice_pf_to_dev(pf); + u16 sw_id; + int ret; - if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) { - err = ice_vsi_rebuild_by_type(pf, ICE_VSI_VF); - if (err) { - dev_err(dev, "VF VSI rebuild failed: %d\n", err); - goto err_vsi_rebuild; - } + if (vsi->type != ICE_VSI_PF) { + dev_err(dev, "unsupported parent VSI type(%d)\n", vsi->type); + return false; } - ice_update_pf_netdev_link(pf); + sw_id = pf->first_sw->sw_id; - /* tell the firmware we are up */ - ret = ice_send_version(pf); + /* create channel (VSI) */ + ret = ice_setup_hw_channel(pf, vsi, ch, sw_id, ICE_VSI_CHNL); if (ret) { - dev_err(dev, - "Rebuild failed due to error sending driver version: %d\n", - ret); - goto err_vsi_rebuild; + dev_err(dev, "failed to setup hw_channel\n"); + return false; } + dev_dbg(dev, "successfully created channel()\n"); - ice_replay_post(hw); + return ch->ch_vsi ? 
true : false; +} - /* if we get here, reset flow is successful */ - clear_bit(__ICE_RESET_FAILED, pf->state); - return; +/** + * ice_set_bw_limit - setup min and max BW limits for Tx traffic based on + * min_tx_rate and max_tx_rate + * @vsi: VSI to be configured + * @max_tx_rate: max Tx rate in Kbps to be configured as maximum BW limit + * @min_tx_rate: min Tx rate in Kbps to be configured as minimum BW limit + */ +static int +ice_set_bw_limit(struct ice_vsi *vsi, u64 max_tx_rate, u64 min_tx_rate) +{ + int err; -err_vsi_rebuild: -err_sched_init_port: - ice_sched_cleanup_all(hw); -err_init_ctrlq: - ice_shutdown_all_ctrlq(hw); - set_bit(__ICE_RESET_FAILED, pf->state); -clear_recovery: - /* set this bit in PF state to control service task scheduling */ - set_bit(__ICE_NEEDS_RESTART, pf->state); - dev_err(dev, "Rebuild failed, unload and reload driver\n"); + err = ice_set_min_bw_limit(vsi, min_tx_rate); + if (err) + return err; + + err = ice_set_max_bw_limit(vsi, max_tx_rate); + if (err) + return err; + + return 0; } /** - * ice_change_mtu - NDO callback to change the MTU - * @netdev: network interface device structure - * @new_mtu: new value for maximum frame size + * ice_create_q_channel - function to create channel + * @vsi: VSI to be configured + * @ch: ptr to channel (it contains channel specific params) * - * Returns 0 on success, negative on failure + * This function creates channel (VSI) using num_queues specified by user, + * reconfigures RSS if needed. */ -static int ice_change_mtu(struct net_device *netdev, int new_mtu) +static int ice_create_q_channel(struct ice_vsi *vsi, struct ice_channel *ch) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - u8 count = 0; + struct device *dev; - if (new_mtu == netdev->mtu) { - netdev_warn(netdev, "MTU is already %u\n", netdev->mtu); - return 0; - } + if (!ch) + return -EINVAL; - if (new_mtu < netdev->min_mtu) { - netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", - netdev->min_mtu); + dev = ice_pf_to_dev(pf); + if (!ch->num_txq || !ch->num_rxq) { + dev_err(dev, "Invalid num_queues requested: %d\n", ch->num_rxq); return -EINVAL; - } else if (new_mtu > netdev->max_mtu) { - netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", - netdev->min_mtu); + } + + if (!vsi->cnt_q_avail || vsi->cnt_q_avail < ch->num_txq) { + dev_err(dev, "Error: cnt_q_avail (%u) less than num_queues %d\n", + vsi->cnt_q_avail, ch->num_txq); return -EINVAL; } - /* if a reset is in progress, wait for some time for it to complete */ - do { - if (ice_is_reset_in_progress(pf->state)) { - count++; - usleep_range(1000, 2000); - } else { - break; - } - } while (count < 100); + if (!ice_setup_channel(pf, vsi, ch)) { + dev_info(dev, "Failed to setup channel\n"); + return -EINVAL; + } + /* configure BW rate limit */ + if (ch->ch_vsi && (ch->max_tx_rate || ch->min_tx_rate)) { + int ret; - if (count == 100) { - netdev_err(netdev, "can't change MTU. 
Device is busy\n"); - return -EBUSY; + ret = ice_set_bw_limit(ch->ch_vsi, ch->max_tx_rate, + ch->min_tx_rate); + if (ret) + dev_err(dev, "failed to set Tx rate of %llu Kbps for VSI(%u)\n", + ch->max_tx_rate, ch->ch_vsi->vsi_num); + else + dev_dbg(dev, "set Tx rate of %llu Kbps for VSI(%u)\n", + ch->max_tx_rate, ch->ch_vsi->vsi_num); } - netdev->mtu = new_mtu; + vsi->cnt_q_avail -= ch->num_txq; - /* if VSI is up, bring it down and then back up */ - if (!test_and_set_bit(__ICE_DOWN, vsi->state)) { - int err; + return 0; +} - err = ice_down(vsi); - if (err) { - netdev_err(netdev, "change MTU if_up err %d\n", err); - return err; - } +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_rem_all_chnl_fltrs - removes all channel filters + * @pf: ptr to PF, TC-flower based filter are tracked at PF level + * + * Remove all advanced switch filters only if they are channel specific + * tc-flower based filter + */ +static void ice_rem_all_chnl_fltrs(struct ice_pf *pf) +{ + struct ice_tc_flower_fltr *fltr; + struct hlist_node *node2; + + /* to remove all channel filters, iterate an ordered list of filters */ + hlist_for_each_entry_safe(fltr, node2, + &pf->tc_flower_fltr_list, + tc_flower_node) { + struct ice_rule_query_data rule; + enum ice_status status; + + /* for now process only channel specific filters */ + if (!ice_is_chnl_fltr(fltr)) + continue; - err = ice_up(vsi); - if (err) { - netdev_err(netdev, "change MTU if_up err %d\n", err); - return err; + rule.rid = fltr->rid; + rule.rule_id = fltr->rule_id; + rule.vsi_handle = fltr->dest_id; + status = ice_rem_adv_rule_by_id(&pf->hw, &rule); + if (status) { + if (status == ICE_ERR_DOES_NOT_EXIST) + dev_dbg(ice_pf_to_dev(pf), + "TC flower filter (rule_id %u) does not exist\n", + rule.rule_id); + else + dev_err(ice_pf_to_dev(pf), + "failed to delete TC flower filter, status %d\n", + status); + } else if (fltr->dest_vsi) { + /* update advanced switch filter count */ + if (fltr->dest_vsi->type == ICE_VSI_CHNL) { + u32 flags = fltr->flags; + + fltr->dest_vsi->num_chnl_fltr--; + if (flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_ENC_DST_MAC)) + pf->num_dmac_chnl_fltrs--; + } } - } - netdev_info(netdev, "changed MTU to %d\n", new_mtu); - return 0; + hlist_del(&fltr->tc_flower_node); + kfree(fltr); + } } +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + /** - * ice_set_rss - Set RSS keys and lut - * @vsi: Pointer to VSI structure - * @seed: RSS hash seed - * @lut: Lookup table - * @lut_size: Lookup table size + * ice_remove_q_channels - Remove queue channels for the TCs + * @vsi: VSI to be configured + * @rem_fltr: delete advanced switch filter or not * - * Returns 0 on success, negative on failure + * Remove queue channels for the TCs */ -int ice_set_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size) +#ifdef HAVE_TC_SETUP_CLSFLOWER +static void ice_remove_q_channels(struct ice_vsi *vsi, bool rem_fltr) +#else +static void ice_remove_q_channels(struct ice_vsi *vsi, + bool __always_unused rem_fltr) +#endif { + struct ice_channel *ch, *ch_tmp; struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; - enum ice_status status; + int i; - if (seed) { - struct ice_aqc_get_set_rss_keys *buf = - (struct ice_aqc_get_set_rss_keys *)seed; +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* remove all tc-flower based filter if they are channel filters only */ + if (rem_fltr) + ice_rem_all_chnl_fltrs(pf); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ - status = ice_aq_set_rss_key(hw, vsi->idx, buf); + /* perform cleanup for channels if they exist */ + list_for_each_entry_safe(ch, ch_tmp, 
&vsi->ch_list, list) { + struct ice_vsi *ch_vsi; - if (status) { - dev_err(&pf->pdev->dev, - "Cannot set RSS key, err %d aq_err %d\n", - status, hw->adminq.rq_last_status); - return -EIO; + list_del(&ch->list); + ch_vsi = ch->ch_vsi; + if (!ch_vsi) { + kfree(ch); + continue; } - } - if (lut) { - status = ice_aq_set_rss_lut(hw, vsi->idx, vsi->rss_lut_type, - lut, lut_size); - if (status) { - dev_err(&pf->pdev->dev, - "Cannot set RSS lut, err %d aq_err %d\n", - status, hw->adminq.rq_last_status); - return -EIO; + /* Reset queue contexts */ + for (i = 0; i < ch->num_rxq; i++) { + struct ice_ring *tx_ring, *rx_ring; + + tx_ring = vsi->tx_rings[ch->base_q + i]; + rx_ring = vsi->rx_rings[ch->base_q + i]; + if (tx_ring) { + tx_ring->ch = NULL; + if (tx_ring->q_vector) + tx_ring->q_vector->ch = NULL; + } + if (rx_ring) { + rx_ring->ch = NULL; + if (rx_ring->q_vector) + rx_ring->q_vector->ch = NULL; + } } + + /* Release FD resources for the channel VSI */ + ice_fdir_rem_adq_chnl(&pf->hw, ch->ch_vsi->idx); + + /* clear the VSI from schedular tree */ + ice_rm_vsi_lan_cfg(ch->ch_vsi->port_info, ch->ch_vsi->idx); + + /* Delete VSI from FW */ + ice_vsi_delete(ch->ch_vsi); + + /* Delete VSI from PF and HW VSI arrays */ + ice_vsi_clear(ch->ch_vsi); + + /* free the channel */ + kfree(ch); } - return 0; + /* clear the channel VSI map which is stored in main VSI */ + ice_for_each_chnl_tc(i) + vsi->tc_map_vsi[i] = NULL; + + /* reset main VSI's all TC information */ + vsi->all_enatc = 0; + vsi->all_numtc = 0; } /** - * ice_get_rss - Get RSS keys and lut - * @vsi: Pointer to VSI structure - * @seed: Buffer to store the keys - * @lut: Buffer to store the lookup table entries - * @lut_size: Size of buffer to store the lookup table entries + * ice_rebuild_channels - rebuild channel + * @pf: ptr to PF * - * Returns 0 on success, negative on failure + * Recreate channel VSIs and replay filters */ -int ice_get_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size) +static int ice_rebuild_channels(struct ice_pf *pf) { - struct ice_pf *pf = vsi->back; - struct ice_hw *hw = &pf->hw; - enum ice_status status; + struct device *dev = ice_pf_to_dev(pf); + struct ice_vsi *main_vsi; + bool rem_adv_fltr = true; + struct ice_channel *ch; + struct ice_vsi *vsi; + int tc_idx = 1; + int i, err; - if (seed) { - struct ice_aqc_get_set_rss_keys *buf = - (struct ice_aqc_get_set_rss_keys *)seed; + main_vsi = ice_get_main_vsi(pf); + if (!main_vsi) + return 0; - status = ice_aq_get_rss_key(hw, vsi->idx, buf); - if (status) { - dev_err(&pf->pdev->dev, - "Cannot get RSS key, err %d aq_err %d\n", - status, hw->adminq.rq_last_status); - return -EIO; - } + if (!test_bit(ICE_FLAG_TC_MQPRIO, pf->flags) || + main_vsi->old_numtc == 1) + return 0; /* nothing to be done */ + + + /* reconfigure main VSI based on old value of TC and cached values + * for MQPRIO opts + */ + err = ice_vsi_cfg_tc(main_vsi, main_vsi->old_ena_tc); + if (err) { + dev_err(dev, "failed configuring TC(ena_tc:0x%02x) for HW VSI=%u\n", + main_vsi->old_ena_tc, main_vsi->vsi_num); + return err; } - if (lut) { - status = ice_aq_get_rss_lut(hw, vsi->idx, vsi->rss_lut_type, - lut, lut_size); - if (status) { - dev_err(&pf->pdev->dev, - "Cannot get RSS lut, err %d aq_err %d\n", - status, hw->adminq.rq_last_status); - return -EIO; + /* rebuild ADQ VSIs */ + ice_for_each_vsi(pf, i) { + enum ice_vsi_type type; + + vsi = pf->vsi[i]; + if (!vsi || vsi->type != ICE_VSI_CHNL) + continue; + + type = vsi->type; + + /* rebuild ADQ VSI */ + err = ice_vsi_rebuild(vsi, true); + if (err) { + 
dev_err(dev, "VSI (type:%s) at index %d rebuild failed, err %d\n", + ice_vsi_type_str(type), vsi->idx, err); + goto cleanup; } + + /* Re-map HW VSI number, using VSI handle that has been + * previously validated in ice_replay_vsi() call above + */ + vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + + /* replay filters for the VSI */ + err = ice_replay_vsi(&pf->hw, vsi->idx); + if (err) { + dev_err(dev, "VSI (type:%s) replay failed, err %d, VSI index %d\n", + ice_vsi_type_str(type), err, vsi->idx); + rem_adv_fltr = false; + goto cleanup; + } + dev_info(dev, "VSI (type:%s) at index %d rebuilt successfully\n", + ice_vsi_type_str(type), vsi->idx); + + /* store ADQ VSI at correct TC index in main VSI's + * map of TC to VSI + */ + main_vsi->tc_map_vsi[tc_idx++] = vsi; + } + + /* ADQ VSI(s) has been rebuild successfully, so setup + * channel for main VSI's Tx and Rx rings + */ + list_for_each_entry(ch, &main_vsi->ch_list, list) { + struct ice_vsi *ch_vsi; + + ch_vsi = ch->ch_vsi; + if (!ch_vsi) + continue; + + /* reconfig channel resources */ + ice_cfg_chnl_all_res(main_vsi, ch); + + /* replay BW rate limit it it is non-zero */ + if (!ch->max_tx_rate && !ch->min_tx_rate) + continue; + + err = ice_set_bw_limit(ch_vsi, ch->max_tx_rate, + ch->min_tx_rate); + if (err) + dev_err(dev, "failed (err:%d) to rebuild BW rate limit, max_tx_rate: %llu Kbps, min_tx_rate: %llu Kbps for VSI(%u)\n", + err, ch->max_tx_rate, ch->min_tx_rate, + ch_vsi->vsi_num); + else + dev_dbg(dev, "successfully rebuild BW rate limit, max_tx_rate: %llu Kbps, min_tx_rate: %llu Kbps for VSI(%u)\n", + ch->max_tx_rate, ch->min_tx_rate, + ch_vsi->vsi_num); } + /* reconfig RSS for main VSI */ + if (main_vsi->ch_rss_size) + ice_vsi_cfg_rss_lut_key(main_vsi); + return 0; + +cleanup: + ice_remove_q_channels(main_vsi, rem_adv_fltr); + return err; } /** - * ice_bridge_getlink - Get the hardware bridge mode - * @skb: skb buff - * @pid: process ID - * @seq: RTNL message seq - * @dev: the netdev being configured - * @filter_mask: filter mask passed in - * @nlflags: netlink flags passed in + * ice_cfg_q_channels - Add queue channel for the given TCs + * @vsi: VSI to be configured * - * Return the bridge mode (VEB/VEPA) + * Configures queue channel mapping to the given TCs */ -static int -ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u32 filter_mask, int nlflags) +static int ice_cfg_q_channels(struct ice_vsi *vsi) { - struct ice_netdev_priv *np = netdev_priv(dev); - struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - u16 bmode; + struct ice_channel *ch; + int ret = 0, i; - bmode = pf->first_sw->bridge_mode; + ice_for_each_chnl_tc(i) { + if (!(vsi->all_enatc & BIT(i))) + continue; - return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0, nlflags, - filter_mask, NULL); + ch = kzalloc(sizeof(*ch), GFP_KERNEL); + if (!ch) { + ret = -ENOMEM; + goto err_free; + } + INIT_LIST_HEAD(&ch->list); + ch->num_rxq = vsi->mqprio_qopt.qopt.count[i]; + ch->num_txq = vsi->mqprio_qopt.qopt.count[i]; + ch->base_q = vsi->mqprio_qopt.qopt.offset[i]; + ch->max_tx_rate = vsi->mqprio_qopt.max_rate[i]; + ch->min_tx_rate = vsi->mqprio_qopt.min_rate[i]; + + /* convert to Kbits/s */ + if (ch->max_tx_rate) + do_div(ch->max_tx_rate, ICE_BW_KBPS_DIVISOR); + if (ch->min_tx_rate) + do_div(ch->min_tx_rate, ICE_BW_KBPS_DIVISOR); + + ret = ice_create_q_channel(vsi, ch); + if (ret) { + dev_err(ice_pf_to_dev(pf), + "failed creating channel TC:%d\n", i); + kfree(ch); + goto err_free; + } + list_add_tail(&ch->list, 
&vsi->ch_list); + vsi->tc_map_vsi[i] = ch->ch_vsi; + dev_dbg(ice_pf_to_dev(pf), + "successfully created channel: VSI %pK\n", ch->ch_vsi); + } + return ret; + +err_free: + ice_remove_q_channels(vsi, false); + + return ret; } /** - * ice_vsi_update_bridge_mode - Update VSI for switching bridge mode (VEB/VEPA) - * @vsi: Pointer to VSI structure - * @bmode: Hardware bridge mode (VEB/VEPA) - * - * Returns 0 on success, negative on failure + * ice_setup_tc_qdisc - configure multiple traffic classes + * @netdev: net device to configure + * @type_data: TC offload data */ -static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode) +static int ice_setup_tc_qdisc(struct net_device *netdev, void *type_data) { - struct device *dev = &vsi->back->pdev->dev; - struct ice_aqc_vsi_props *vsi_props; - struct ice_hw *hw = &vsi->back->hw; - struct ice_vsi_ctx *ctxt; - enum ice_status status; - int ret = 0; + struct tc_mqprio_qopt_offload *mqprio_qopt = type_data; + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + u16 mode, ena_tc_qdisc = 0; + int cur_txq, cur_rxq; + u8 hw = 0, num_tcf; + struct device *dev; + int ret, i; + + dev = ice_pf_to_dev(pf); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + num_tcf = mqprio_qopt->qopt.num_tc; + hw = mqprio_qopt->qopt.hw; + mode = mqprio_qopt->mode; + if (!hw) { + clear_bit(ICE_FLAG_TC_MQPRIO, pf->flags); + vsi->ch_rss_size = 0; + memcpy(&vsi->mqprio_qopt, mqprio_qopt, sizeof(*mqprio_qopt)); + goto config_tcf; + } + + /* Generate queue region map for number of TCF requested */ + for (i = 0; i < num_tcf; i++) + ena_tc_qdisc |= BIT(i); + + switch (mode) { + case TC_MQPRIO_MODE_DCB: + netdev_err(netdev, "TC_MQPRIO_MODE_DCB not supported yet\n"); + return -EINVAL; + case TC_MQPRIO_MODE_CHANNEL: + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) { + netdev_err(netdev, "TC MQPRIO offload not supported,FW LLDP is enabled\n"); + return -EINVAL; + } - vsi_props = &vsi->info; + ret = ice_validate_mqprio_qopt(vsi, mqprio_qopt); + if (ret) { + netdev_err(netdev, "failed to validate_mqprio_qopt(), ret %d\n", + ret); + return ret; + } + memcpy(&vsi->mqprio_qopt, mqprio_qopt, sizeof(*mqprio_qopt)); + set_bit(ICE_FLAG_TC_MQPRIO, pf->flags); + /* don't assume state of hw_tc_offload during driver load + * and set the flag for TC flower filter if hw_tc_offload + * already ON + */ + if (vsi->netdev->features & NETIF_F_HW_TC) + set_bit(ICE_FLAG_CLS_FLOWER, pf->flags); + break; + default: + return -EINVAL; + } - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; +config_tcf: +#else + num_tcf = tc; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ - ctxt->info = vsi->info; +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + /* Requesting same TCF configuration as already enabled */ + if (ena_tc_qdisc == vsi->tc_cfg.ena_tc && + mode != TC_MQPRIO_MODE_CHANNEL) + return 0; - if (bmode == BRIDGE_MODE_VEB) - /* change from VEPA to VEB mode */ - ctxt->info.sw_flags |= ICE_AQ_VSI_SW_FLAG_ALLOW_LB; - else - /* change from VEB to VEPA mode */ - ctxt->info.sw_flags &= ~ICE_AQ_VSI_SW_FLAG_ALLOW_LB; - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID); + /* Pause VSI queues */ + ice_dis_vsi(vsi, true); + + if (!hw && !test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) + ice_remove_q_channels(vsi, true); +#else + if (ena_tc_qdisc == vsi->tc_cfg.ena_tc) + return 0; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + + if (!hw && !test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) { + vsi->req_txq = min_t(int, 
ice_get_avail_txq_count(pf), + num_online_cpus()); + vsi->req_rxq = min_t(int, ice_get_avail_rxq_count(pf), + num_online_cpus()); + } else { + /* logic to rebuild VSI, same as ethtool -L */ + u16 offset = 0, qcount_tx = 0, qcount_rx = 0; + + for (i = 0; i < num_tcf; i++) { + if (!(ena_tc_qdisc & BIT(i))) + continue; + + offset = vsi->mqprio_qopt.qopt.offset[i]; + qcount_rx = vsi->mqprio_qopt.qopt.count[i]; + qcount_tx = vsi->mqprio_qopt.qopt.count[i]; + } + vsi->req_txq = offset + qcount_tx; + vsi->req_rxq = offset + qcount_rx; + + /* store away original rss_size info, so that it gets reused + * from ice_vsi_rebuild during tc-qdisc delete stage - to + * determine what the rss_size for the main VSI should be + */ + vsi->orig_rss_size = vsi->rss_size; + } + + /* save current values of Tx and Rx queues before calling VSI rebuild + * for fallback option + */ + cur_txq = vsi->num_txq; + cur_rxq = vsi->num_rxq; + + /* proceed with rebuild main VSI using correct number of queues */ + ret = ice_vsi_rebuild(vsi, false); + if (ret) { + /* fallback to current number of queues */ + dev_info(dev, "Rebuild failed with new queues, try with current number of queues\n"); + vsi->req_txq = cur_txq; + vsi->req_rxq = cur_rxq; + clear_bit(ICE_RESET_FAILED, pf->state); + if (ice_vsi_rebuild(vsi, false)) { + dev_err(dev, "Rebuild of main VSI failed again\n"); + return ret; + } + } + + vsi->all_numtc = num_tcf; + vsi->all_enatc = ena_tc_qdisc; + ret = ice_vsi_cfg_tc(vsi, ena_tc_qdisc); + if (ret) { + netdev_err(netdev, "failed configuring TC for VSI id=%d\n", + vsi->vsi_num); + goto exit; + } + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (test_bit(ICE_FLAG_TC_MQPRIO, pf->flags)) { + u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; + u64 min_tx_rate = vsi->mqprio_qopt.min_rate[0]; + + /* set TC0 rate limit if specified */ + if (max_tx_rate || min_tx_rate) { + /* convert to Kbits/s */ + if (max_tx_rate) + do_div(max_tx_rate, ICE_BW_KBPS_DIVISOR); + if (min_tx_rate) + do_div(min_tx_rate, ICE_BW_KBPS_DIVISOR); + + ret = ice_set_bw_limit(vsi, max_tx_rate, min_tx_rate); + if (!ret) { + dev_dbg(dev, "set Tx rate max %llu min %llu for VSI(%u)\n", + max_tx_rate, min_tx_rate, vsi->vsi_num); + } else { + dev_err(dev, "failed to set Tx rate max %llu min %llu for VSI(%u)\n", + max_tx_rate, min_tx_rate, vsi->vsi_num); + goto exit; + } + } + ret = ice_cfg_q_channels(vsi); + if (ret) { + netdev_err(netdev, "failed configuring queue channels\n"); + goto exit; + } else { + netdev_dbg(netdev, "successfully configured channels\n"); + } + } +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ - status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); - if (status) { - dev_err(dev, "update VSI for bridge mode failed, bmode = %d err %d aq_err %d\n", - bmode, status, hw->adminq.sq_last_status); - ret = -EIO; - goto out; + if (vsi->ch_rss_size) + ice_vsi_cfg_rss_lut_key(vsi); + +exit: + /* if error, reset the all_numtc and all_enatc */ + if (ret) { + vsi->all_numtc = 0; + vsi->all_enatc = 0; } - /* Update sw flags for book keeping */ - vsi_props->sw_flags = ctxt->info.sw_flags; + /* resume VSI */ + ice_ena_vsi(vsi, true); -out: - devm_kfree(dev, ctxt); return ret; } -/** - * ice_bridge_setlink - Set the hardware bridge mode - * @dev: the netdev being configured - * @nlh: RTNL message - * @flags: bridge setlink flags - * @extack: netlink extended ack - * - * Sets the bridge mode (VEB/VEPA) of the switch to which the netdev (VSI) is - * hooked up to. 
Iterates through the PF VSI list and sets the loopback mode (if - * not already set for all VSIs connected to this switch. And also update the - * unicast switch filter rules for the corresponding switch of the netdev. - */ +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +static LIST_HEAD(ice_block_cb_list); +#endif + static int -ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, - u16 __always_unused flags, - struct netlink_ext_ack __always_unused *extack) +#ifdef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, + void *type_data) +#elif defined(HAVE_NDO_SETUP_TC_CHAIN_INDEX) +ice_setup_tc(struct net_device *netdev, u32 __always_unused handle, + u32 __always_unused chain_index, __be16 proto, + struct tc_to_netdev *tc) +#else +ice_setup_tc(struct net_device *netdev, u32 __always_unused handle, + __be16 __always_unused proto, struct tc_to_netdev *tc) +#endif { - struct ice_netdev_priv *np = netdev_priv(dev); +#ifndef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV + struct tc_cls_flower_offload *cls_flower = tc->cls_flower; + unsigned int type = tc->type; +#elif !defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO) + struct tc_cls_flower_offload *cls_flower = (struct + tc_cls_flower_offload *) + type_data; +#endif /* HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV */ + struct ice_netdev_priv *np = netdev_priv(netdev); +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO struct ice_pf *pf = np->vsi->back; - struct nlattr *attr, *br_spec; - struct ice_hw *hw = &pf->hw; - enum ice_status status; - struct ice_sw *pf_sw; - int rem, v, err = 0; - - pf_sw = pf->first_sw; - /* find the attribute in the netlink message */ - br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); - - nla_for_each_nested(attr, br_spec, rem) { - __u16 mode; + int err; - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - mode = nla_get_u16(attr); - if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB) - return -EINVAL; - /* Continue if bridge mode is not being flipped */ - if (mode == pf_sw->bridge_mode) - continue; - /* Iterates through the PF VSI list and update the loopback - * mode of the VSI - */ - ice_for_each_vsi(pf, v) { - if (!pf->vsi[v]) - continue; - err = ice_vsi_update_bridge_mode(pf->vsi[v], mode); - if (err) - return err; + switch (type) { + case TC_SETUP_QDISC_MQPRIO: + /* setup traffic classifier for receive side */ + mutex_lock(&pf->tc_mutex); + if (ice_is_dcb_active(pf)) { + if (pf->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) { + netdev_err(netdev, + "TC_SETUP_QDISC_MQPRIO not supported when DCB is active, managed by FW LLDP agent\n"); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; + } else if (pf->dcbx_cap & DCB_CAP_DCBX_HOST) { + /* if SW LLDP is running and if numtc is more + * than 1, then SW LLDP must have enabled + * multi TC mode - only in that scenario + * stop ADQ config. This change will allow + * co-existence of lldpad (SW LLDP) running + * and ADQ. 
Once ADQ is configured, subsequent + * SW LLDP requests via netlink are handled (not + * supported while ADQ is active) + */ + if (np->vsi->tc_cfg.numtc > 1) { + netdev_err(netdev, + "TC_SETUP_QDISC_MQPRIO not supported when DCB is active, managed by SW LLDP agent, num_tc %u\n", + np->vsi->tc_cfg.numtc); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; + } + } } - - hw->evb_veb = (mode == BRIDGE_MODE_VEB); - /* Update the unicast switch filter rules for the corresponding - * switch of the netdev - */ - status = ice_update_sw_rule_bridge_mode(hw); - if (status) { - netdev_err(dev, "switch rule update failed, mode = %d err %d aq_err %d\n", - mode, status, hw->adminq.sq_last_status); - /* revert hw->evb_veb */ - hw->evb_veb = (pf_sw->bridge_mode == BRIDGE_MODE_VEB); - return -EIO; +#ifdef HAVE_NETDEV_SB_DEV + if (ice_is_offloaded_macvlan_ena(pf)) { + netdev_err(netdev, "TC_SETUP_QDISC_MQPRIO not supported when MACVLAN offload support is ON. Turn off MACVLAN offload support through ethtool and try again\n"); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; + } +#endif /* HAVE_NETDEV_SB_DEV */ + if (ice_is_dcf_enabled(pf)) { + netdev_err(netdev, "TC_SETUP_QDISC_MQPRIO not supported when Device Control Functionality is enabled.\n"); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; } + if (ice_is_eswitch_mode_switchdev(pf)) { + netdev_err(netdev, "TC MQPRIO offload not supported, switchdev is enabled\n"); + mutex_unlock(&pf->tc_mutex); + return -EOPNOTSUPP; + } + err = ice_setup_tc_qdisc(netdev, type_data); + mutex_unlock(&pf->tc_mutex); + return err; + case TC_SETUP_BLOCK: + return flow_block_cb_setup_simple(type_data, + &ice_block_cb_list, + ice_setup_tc_block_cb, + np, np, true); + default: + return -EOPNOTSUPP; + } +#elif !defined(HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV) || !defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO) - pf_sw->bridge_mode = mode; + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_setup_tc_cls_flower(np, np->vsi->netdev, cls_flower); + default: + return -EOPNOTSUPP; } +#endif + return -EOPNOTSUPP; +} - return 0; +#ifdef HAVE_TC_INDIR_BLOCK +static struct ice_indr_block_priv * +ice_indr_block_priv_lookup(struct ice_netdev_priv *np, + struct net_device *netdev) +{ + struct ice_indr_block_priv *cb_priv; + + /* All callback list access should be protected by RTNL. 
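+ * Both the netdevice notifier and the block bind/unbind callbacks + * that walk this list are expected to run under rtnl_lock; the + * ASSERT_RTNL() below documents that assumption.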
*/ + ASSERT_RTNL(); + + list_for_each_entry(cb_priv, &np->tc_indr_block_priv_list, list) { + if (!cb_priv->netdev) + return NULL; + if (cb_priv->netdev == netdev) + return cb_priv; + } + return NULL; } -/** - * ice_tx_timeout - Respond to a Tx Hang - * @netdev: network interface device structure - */ -static void ice_tx_timeout(struct net_device *netdev) +static int +ice_indr_setup_block_cb(enum tc_setup_type type, void *type_data, + void *indr_priv) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_ring *tx_ring = NULL; - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - int hung_queue = -1; - u32 i; + struct ice_indr_block_priv *priv = indr_priv; + struct ice_netdev_priv *np = priv->np; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_setup_tc_cls_flower(np, priv->netdev, + (struct flow_cls_offload *) + type_data); + default: + return -EOPNOTSUPP; + } +} - pf->tx_timeout_count++; +static int +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) +ice_indr_setup_tc_block(struct net_device *netdev, struct Qdisc *sch, + struct ice_netdev_priv *np, + struct flow_block_offload *f, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) +ice_indr_setup_tc_block(struct net_device *netdev, struct ice_netdev_priv *np, + struct flow_block_offload *f, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +#else +ice_indr_setup_tc_block(struct net_device *netdev, struct ice_netdev_priv *np, + struct flow_block_offload *f) +#endif +{ + struct ice_indr_block_priv *indr_priv; +#ifdef HAVE_FLOW_BLOCK_API + struct flow_block_cb *block_cb; +#else + int err = 0; +#endif +#ifdef HAVE_TC_FLOW_INDIR_DEV + int tunnel_type = ice_tc_tun_get_type(netdev, NULL); - /* find the stopped queue the same way dev_watchdog() does */ - for (i = 0; i < netdev->num_tx_queues; i++) { - unsigned long trans_start; - struct netdev_queue *q; + if (tunnel_type != TNL_VXLAN && tunnel_type != TNL_GENEVE && + !(is_vlan_dev(netdev) && + vlan_dev_real_dev(netdev) == np->vsi->netdev)) + return -EOPNOTSUPP; +#endif - q = netdev_get_tx_queue(netdev, i); - trans_start = q->trans_start; - if (netif_xmit_stopped(q) && - time_after(jiffies, - trans_start + netdev->watchdog_timeo)) { - hung_queue = i; - break; + if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + switch (f->command) { + case FLOW_BLOCK_BIND: + indr_priv = ice_indr_block_priv_lookup(np, netdev); + if (indr_priv) + return -EEXIST; + + indr_priv = devm_kzalloc(&netdev->dev, sizeof(*indr_priv), + GFP_KERNEL); + if (!indr_priv) + return -ENOMEM; + + indr_priv->netdev = netdev; + indr_priv->np = np; + list_add(&indr_priv->list, &np->tc_indr_block_priv_list); + +#ifdef HAVE_FLOW_BLOCK_API +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) + block_cb = + flow_indr_block_cb_alloc(ice_indr_setup_block_cb, + indr_priv, indr_priv, + ice_rep_indr_tc_block_unbind, + f, netdev, sch, data, np, + cleanup); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) + block_cb = + flow_indr_block_cb_alloc(ice_indr_setup_block_cb, + indr_priv, indr_priv, + ice_rep_indr_tc_block_unbind, + f, netdev, data, np, cleanup); +#else + block_cb = flow_block_cb_alloc(ice_indr_setup_block_cb, + indr_priv, indr_priv, + ice_rep_indr_tc_block_unbind); +#endif + if (IS_ERR(block_cb)) { + list_del(&indr_priv->list); + devm_kfree(&netdev->dev, indr_priv); + return 
PTR_ERR(block_cb); + } + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, &ice_block_cb_list); + return 0; +#else /* !HAVE_FLOW_BLOCK_API */ + err = tcf_block_cb_register(f->block, ice_indr_setup_block_cb, + indr_priv, indr_priv, f->extack); + if (err) { + list_del(&indr_priv->list); + devm_kfree(&netdev->dev, indr_priv); } + return err; +#endif /* !HAVE_FLOW_BLOCK_API */ + case FLOW_BLOCK_UNBIND: + indr_priv = ice_indr_block_priv_lookup(np, netdev); + if (!indr_priv) + return -ENOENT; + +#ifdef HAVE_FLOW_BLOCK_API + block_cb = flow_block_cb_lookup(f->block, + ice_indr_setup_block_cb, + indr_priv); + if (!block_cb) + return -ENOENT; + +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) + flow_indr_block_cb_remove(block_cb, f); +#else + flow_block_cb_remove(block_cb, f); +#endif /* HAVE_TC_FLOW_INDIR_DEV && HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP */ + list_del(&block_cb->driver_list); + return 0; +#else + tcf_block_cb_unregister(f->block, ice_indr_setup_block_cb, + indr_priv); + list_del(&indr_priv->list); + devm_kfree(&netdev->dev, indr_priv); + return 0; +#endif + default: + return -EOPNOTSUPP; } + return 0; +} - if (i == netdev->num_tx_queues) - netdev_info(netdev, "tx_timeout: no netdev hung queue found\n"); - else - /* now that we have an index, find the tx_ring struct */ - for (i = 0; i < vsi->num_txq; i++) - if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) - if (hung_queue == vsi->tx_rings[i]->q_index) { - tx_ring = vsi->tx_rings[i]; - break; - } - - /* Reset recovery level if enough time has elapsed after last timeout. - * Also ensure no new reset action happens before next timeout period. - */ - if (time_after(jiffies, (pf->tx_timeout_last_recovery + HZ * 20))) - pf->tx_timeout_recovery_level = 1; - else if (time_before(jiffies, (pf->tx_timeout_last_recovery + - netdev->watchdog_timeo))) - return; +static int +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) +ice_indr_setup_tc_cb(struct net_device *netdev, struct Qdisc *sch, + void *cb_priv, enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) +ice_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +#else +ice_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data) +#endif +{ + switch (type) { + case TC_SETUP_BLOCK: +#if defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_FLOW_INDIR_BLOCK_QDISC) + return ice_indr_setup_tc_block(netdev, sch, cb_priv, type_data, + data, cleanup); +#elif defined(HAVE_TC_FLOW_INDIR_DEV) && defined(HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP) + return ice_indr_setup_tc_block(netdev, cb_priv, type_data, data, + cleanup); +#else + return ice_indr_setup_tc_block(netdev, cb_priv, type_data); +#endif - if (tx_ring) { - struct ice_hw *hw = &pf->hw; - u32 head, val = 0; + default: + return -EOPNOTSUPP; + } +} - head = (rd32(hw, QTX_COMM_HEAD(vsi->txq_map[hung_queue])) & - QTX_COMM_HEAD_HEAD_M) >> QTX_COMM_HEAD_HEAD_S; - /* Read interrupt register */ - val = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); +#ifndef HAVE_TC_FLOW_INDIR_DEV +static int +ice_indr_register_block(struct ice_netdev_priv *np, struct net_device *netdev) +{ + struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int err; - netdev_info(netdev, "tx_timeout: VSI_num: %d, Q 
%d, NTC: 0x%x, HW_HEAD: 0x%x, NTU: 0x%x, INT: 0x%x\n", - vsi->vsi_num, hung_queue, tx_ring->next_to_clean, - head, tx_ring->next_to_use, val); + err = __flow_indr_block_cb_register(netdev, np, ice_indr_setup_tc_cb, + np); + if (err) { + dev_err(ice_pf_to_dev(pf), "Failed to register remote block notifier for %s err=%d\n", + netdev_name(netdev), err); } + return err; +} - pf->tx_timeout_last_recovery = jiffies; - netdev_info(netdev, "tx_timeout recovery level %d, hung_queue %d\n", - pf->tx_timeout_recovery_level, hung_queue); +static void +ice_indr_unregister_block(struct ice_netdev_priv *np, struct net_device *netdev) +{ + __flow_indr_block_cb_unregister(netdev, ice_indr_setup_tc_cb, np); +} - switch (pf->tx_timeout_recovery_level) { - case 1: - set_bit(__ICE_PFR_REQ, pf->state); - break; - case 2: - set_bit(__ICE_CORER_REQ, pf->state); - break; - case 3: - set_bit(__ICE_GLOBR_REQ, pf->state); +static void ice_indr_clean_block_privs(struct ice_netdev_priv *np) +{ + struct ice_indr_block_priv *cb_priv, *temp; + struct list_head *head = &np->tc_indr_block_priv_list; + + list_for_each_entry_safe(cb_priv, temp, head, list) { + ice_indr_unregister_block(np, cb_priv->netdev); + devm_kfree(&cb_priv->netdev->dev, cb_priv); + } +} + +static int +ice_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) +{ + struct ice_netdev_priv *np = container_of(nb, struct ice_netdev_priv, + netdevice_nb); + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + int tunnel_type = ice_tc_tun_get_type(netdev, NULL); + + if (tunnel_type != TNL_VXLAN && tunnel_type != TNL_GENEVE && + !(is_vlan_dev(netdev) && + vlan_dev_real_dev(netdev) == np->vsi->netdev)) + return NOTIFY_OK; + + switch (event) { + case NETDEV_REGISTER: + ice_indr_register_block(np, netdev); break; - default: - netdev_err(netdev, "tx_timeout recovery unsuccessful, device is in unrecoverable state.\n"); - set_bit(__ICE_DOWN, pf->state); - set_bit(__ICE_NEEDS_RESTART, vsi->state); - set_bit(__ICE_SERVICE_DIS, pf->state); + case NETDEV_UNREGISTER: + ice_indr_unregister_block(np, netdev); break; } - - ice_service_task_schedule(pf); - pf->tx_timeout_recovery_level++; + return NOTIFY_OK; } +#endif /* HAVE_TC_FLOW_INDIR_DEV */ +#endif /* HAVE_TC_INDIR_BLOCK */ +#endif /* HAVE_TC_SETUP_CLSFLOWER */ /** * ice_open - Called when a network interface becomes active @@ -4737,13 +11374,41 @@ static void ice_tx_timeout(struct net_device *netdev) * Returns 0 on success, negative value on failure */ int ice_open(struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_pf *pf = np->vsi->back; + + if (ice_is_reset_in_progress(pf->state)) { + netdev_err(netdev, "can't open net device while reset is in progress"); + return -EBUSY; + } + + return ice_open_internal(netdev); +} + +/** + * ice_open_internal - Called when a network interface becomes active + * @netdev: network interface device structure + * + * Internal ice_open implementation. 
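Unlike ice_open(), it performs no reset-in-progress check of its own.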
Should not be used directly except for ice_open and reset + * handling routine + * + * Returns 0 on success, negative value on failure + */ +int ice_open_internal(struct net_device *netdev) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; struct ice_port_info *pi; + enum ice_status status; int err; - if (test_bit(__ICE_NEEDS_RESTART, vsi->back->state)) { + /* disallow open if eeprom is corrupted */ + if (test_bit(ICE_BAD_EEPROM, pf->state)) + return -EOPNOTSUPP; + + if (test_bit(ICE_NEEDS_RESTART, pf->state)) { netdev_err(netdev, "driver needs to be unloaded and reloaded\n"); return -EIO; } @@ -4751,36 +11416,59 @@ int ice_open(struct net_device *netdev) netif_carrier_off(netdev); pi = vsi->port_info; - err = ice_update_link_info(pi); - if (err) { - netdev_err(netdev, "Failed to get link info, error %d\n", - err); - return err; + status = ice_update_link_info(pi); + if (status) { + netdev_err(netdev, "Failed to get link info, error %s\n", + ice_stat_str(status)); + return -EIO; } + ice_check_module_power(pf, pi->phy.link_info.link_cfg_err); + /* Set PHY if there is media, otherwise, turn off PHY */ if (pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE) { - err = ice_force_phys_link_state(vsi, true); + clear_bit(ICE_FLAG_NO_MEDIA, pf->flags); + if (!test_bit(ICE_PHY_INIT_COMPLETE, pf->state)) { + err = ice_init_phy_user_cfg(pi); + if (err) { + netdev_err(netdev, "Failed to initialize PHY settings, error %d\n", + err); + return err; + } + } + + err = ice_configure_phy(vsi); if (err) { - netdev_err(netdev, - "Failed to set physical link up, error %d\n", + netdev_err(netdev, "Failed to set physical link up, error %d\n", err); return err; } } else { - err = ice_aq_set_link_restart_an(pi, false, NULL); - if (err) { - netdev_err(netdev, "Failed to set PHY state, VSI %d error %d\n", - vsi->vsi_num, err); - return err; - } - set_bit(ICE_FLAG_NO_MEDIA, vsi->back->flags); + set_bit(ICE_FLAG_NO_MEDIA, pf->flags); + ice_set_link(vsi, false); } err = ice_vsi_open(vsi); if (err) netdev_err(netdev, "Failed to open VSI 0x%04X on switch 0x%04X\n", vsi->vsi_num, vsi->vsw->sw_id); + + /* Update existing tunnels information */ +#ifdef HAVE_UDP_ENC_RX_OFFLOAD + udp_tunnel_get_rx_info(netdev); +#else /* HAVE_UDP_ENC_RX_OFFLOAD */ +#ifdef HAVE_VXLAN_RX_OFFLOAD +#if IS_ENABLED(CONFIG_VXLAN) + vxlan_get_rx_port(netdev); +#endif +#endif /* HAVE_VXLAN_RX_OFFLOAD */ +#ifdef HAVE_GENEVE_RX_OFFLOAD +#if IS_ENABLED(CONFIG_GENEVE) + geneve_get_rx_port(netdev); +#endif +#endif /* HAVE_GENEVE_RX_OFFLOAD */ +#endif /* HAVE_UDP_ENC_RX_OFFLOAD */ + return err; } @@ -4798,12 +11486,19 @@ int ice_stop(struct net_device *netdev) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + + if (ice_is_reset_in_progress(pf->state)) { + netdev_err(netdev, "can't stop net device while reset is in progress"); + return -EBUSY; + } ice_vsi_close(vsi); return 0; } +#ifdef HAVE_NDO_FEATURES_CHECK /** * ice_features_check - Validate encapsulated packet conforms to limits * @skb: skb buffer @@ -4831,21 +11526,21 @@ ice_features_check(struct sk_buff *skb, features &= ~NETIF_F_GSO_MASK; len = skb_network_header(skb) - skb->data; - if (len & ~(ICE_TXD_MACLEN_MAX)) + if (len > ICE_TXD_MACLEN_MAX || len & 0x1) goto out_rm_features; len = skb_transport_header(skb) - skb_network_header(skb); - if (len & ~(ICE_TXD_IPLEN_MAX)) + if (len > ICE_TXD_IPLEN_MAX || len & 0x1) goto out_rm_features; if (skb->encapsulation) { len = 
skb_inner_network_header(skb) - skb_transport_header(skb); - if (len & ~(ICE_TXD_L4LEN_MAX)) + if (len > ICE_TXD_L4LEN_MAX || len & 0x1) goto out_rm_features; len = skb_inner_transport_header(skb) - skb_inner_network_header(skb); - if (len & ~(ICE_TXD_IPLEN_MAX)) + if (len > ICE_TXD_IPLEN_MAX || len & 0x1) goto out_rm_features; } @@ -4853,6 +11548,7 @@ ice_features_check(struct sk_buff *skb, out_rm_features: return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } +#endif /* HAVE_NDO_FEATURES_CHECK */ static const struct net_device_ops ice_netdev_safe_mode_ops = { .ndo_open = ice_open, @@ -4860,7 +11556,11 @@ static const struct net_device_ops ice_netdev_safe_mode_ops = { .ndo_start_xmit = ice_start_xmit, .ndo_set_mac_address = ice_set_mac_address, .ndo_validate_addr = eth_validate_addr, +#ifdef HAVE_RHEL7_EXTENDED_MIN_MAX_MTU + .extended.ndo_change_mtu = ice_change_mtu, +#else .ndo_change_mtu = ice_change_mtu, +#endif .ndo_get_stats64 = ice_get_stats64, .ndo_tx_timeout = ice_tx_timeout, }; @@ -4869,24 +11569,127 @@ static const struct net_device_ops ice_netdev_ops = { .ndo_open = ice_open, .ndo_stop = ice_stop, .ndo_start_xmit = ice_start_xmit, +#ifdef HAVE_NDO_FEATURES_CHECK .ndo_features_check = ice_features_check, +#endif /* HAVE_NDO_FEATURES_CHECK */ + .ndo_fix_features = ice_fix_features, .ndo_set_rx_mode = ice_set_rx_mode, .ndo_set_mac_address = ice_set_mac_address, .ndo_validate_addr = eth_validate_addr, +#ifdef HAVE_RHEL7_EXTENDED_MIN_MAX_MTU + .extended.ndo_change_mtu = ice_change_mtu, +#else .ndo_change_mtu = ice_change_mtu, +#endif .ndo_get_stats64 = ice_get_stats64, +#ifdef HAVE_NETPOLL_CONTROLLER +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = ice_netpoll, +#endif /* CONFIG_NET_POLL_CONTROLLER */ +#endif /* HAVE_NETPOLL_CONTROLLER */ +#ifdef HAVE_NDO_SET_TX_MAXRATE +#ifdef HAVE_RHEL7_EXTENDED_NDO_SET_TX_MAXRATE + .extended.ndo_set_tx_maxrate = ice_set_tx_maxrate, +#else + .ndo_set_tx_maxrate = ice_set_tx_maxrate, +#endif /* HAVE_RHEL7_EXTENDED_NDO_SET_TX_MAXRATE */ +#endif /* HAVE_NDO_SET_TX_MAXRATE */ + .ndo_do_ioctl = ice_do_ioctl, .ndo_set_vf_spoofchk = ice_set_vf_spoofchk, +#ifdef HAVE_NDO_SET_VF_TRUST .ndo_set_vf_mac = ice_set_vf_mac, .ndo_get_vf_config = ice_get_vf_cfg, +#ifdef HAVE_RHEL7_NET_DEVICE_OPS_EXT + /* RHEL7 requires ndo_size to be defined to enable extended ops */ + .ndo_size = sizeof(const struct net_device_ops), + .extended.ndo_set_vf_trust = ice_set_vf_trust, +#else .ndo_set_vf_trust = ice_set_vf_trust, +#endif /* HAVE_RHEL7_NET_DEVICE_OPS_EXT */ +#endif /* HAVE_NDO_SET_VF_TRUST */ +#ifdef HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SET_VF_VLAN + .extended.ndo_set_vf_vlan = ice_set_vf_port_vlan, +#else .ndo_set_vf_vlan = ice_set_vf_port_vlan, +#endif /* HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SET_VF_VLAN */ +#ifdef HAVE_NDO_SET_VF_LINK_STATE .ndo_set_vf_link_state = ice_set_vf_link_state, +#endif +#ifdef HAVE_VF_STATS + .ndo_get_vf_stats = ice_get_vf_stats, +#endif /* HAVE_VF_STATS */ +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE + .ndo_set_vf_rate = ice_set_vf_bw, +#else + .ndo_set_vf_tx_rate = ice_set_vf_bw, +#endif .ndo_vlan_rx_add_vid = ice_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ice_vlan_rx_kill_vid, +#ifdef HAVE_TC_SETUP_CLSFLOWER +#ifdef HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC + .extended.ndo_setup_tc_rh = ice_setup_tc, +#else + .ndo_setup_tc = ice_setup_tc, +#endif /* HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC */ +#endif /* HAVE_TC_SETUP_CLSFLOWER */ .ndo_set_features = ice_set_features, .ndo_bridge_getlink = ice_bridge_getlink, .ndo_bridge_setlink = 
ice_bridge_setlink, .ndo_fdb_add = ice_fdb_add, .ndo_fdb_del = ice_fdb_del, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = ice_rx_flow_steer, +#endif .ndo_tx_timeout = ice_tx_timeout, +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_NDO_BPF + .ndo_bpf = ice_xdp, +#else + .ndo_xdp = ice_xdp, +#endif /* HAVE_NDO_BPF */ + .ndo_xdp_xmit = ice_xdp_xmit, +#ifndef NO_NDO_XDP_FLUSH + .ndo_xdp_flush = ice_xdp_flush, +#endif /* !NO_NDO_XDP_FLUSH */ +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifdef HAVE_NDO_XSK_WAKEUP + .ndo_xsk_wakeup = ice_xsk_wakeup, +#else + .ndo_xsk_async_xmit = ice_xsk_async_xmit, +#endif /* HAVE_NDO_XSK_WAKEUP */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_UDP_ENC_RX_OFFLOAD +#ifdef HAVE_RHEL7_NETDEV_OPS_EXT_NDO_UDP_TUNNEL + .extended.ndo_udp_tunnel_add = ice_udp_tunnel_add, + .extended.ndo_udp_tunnel_del = ice_udp_tunnel_del, +#else +#ifndef HAVE_UDP_TUNNEL_NIC_INFO + .ndo_udp_tunnel_add = ice_udp_tunnel_add, + .ndo_udp_tunnel_del = ice_udp_tunnel_del, +#endif /* !HAVE_UDP_TUNNEL_NIC_INFO */ +#endif +#else /* !HAVE_UDP_ENC_RX_OFFLOAD */ +#ifdef HAVE_VXLAN_RX_OFFLOAD +#if IS_ENABLED(CONFIG_VXLAN) + .ndo_add_vxlan_port = ice_add_vxlan_port, + .ndo_del_vxlan_port = ice_del_vxlan_port, +#endif +#endif /* HAVE_VXLAN_RX_OFFLOAD */ +#ifdef HAVE_GENEVE_RX_OFFLOAD +#if IS_ENABLED(CONFIG_GENEVE) + .ndo_add_geneve_port = ice_add_geneve_port, + .ndo_del_geneve_port = ice_del_geneve_port, +#endif +#endif /* HAVE_GENEVE_RX_OFFLOAD */ +#endif /* HAVE_UDP_ENC_RX_OFFLOAD */ +#ifdef HAVE_NETDEV_SB_DEV +#ifdef HAVE_RHEL7_NET_DEVICE_OPS_EXT + .extended.ndo_dfwd_add_station = ice_fwd_add_macvlan, + .extended.ndo_dfwd_del_station = ice_fwd_del_macvlan, +#else + .ndo_dfwd_add_station = ice_fwd_add_macvlan, + .ndo_dfwd_del_station = ice_fwd_del_macvlan, +#endif /* HAVE_RHEL7_NET_DEVICE_OPS_EXT */ +#endif /* HAVE_NETDEV_SB_DEV */ }; diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c index bcb431f1bd92b150609ded35e074f53f47cb4267..51ebb17dac854ff98096fdc4d724db087eb99dc2 100644 --- a/drivers/net/ethernet/intel/ice/ice_nvm.c +++ b/drivers/net/ethernet/intel/ice/ice_nvm.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_common.h" + /** * ice_aq_read_nvm * @hw: pointer to the HW struct @@ -11,25 +12,29 @@ * @length: length of the section to be read (in bytes from the offset) * @data: command buffer (size [bytes] = length) * @last_command: tells if this is the last command in a series + * @read_shadow_ram: tell if this is a shadow RAM read * @cd: pointer to command details structure or NULL * * Read the NVM using the admin queue commands (0x0701) */ static enum ice_status ice_aq_read_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, u16 length, - void *data, bool last_command, struct ice_sq_cd *cd) + void *data, bool last_command, bool read_shadow_ram, + struct ice_sq_cd *cd) { struct ice_aq_desc desc; struct ice_aqc_nvm *cmd; cmd = &desc.params.nvm; - /* In offset the highest byte must be zeroed. */ - if (offset & 0xFF000000) + if (offset > ICE_AQC_NVM_MAX_OFFSET) return ICE_ERR_PARAM; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_read); + if (!read_shadow_ram && module_typeid == ICE_AQC_NVM_START_POINT) + cmd->cmd_flags |= ICE_AQC_NVM_FLASH_ONLY; + /* If this is the last command in a series, set the proper flag. 
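 *
 * For illustration (buf is an assumed caller-provided 512 byte buffer,
 * not part of this patch): a flat read that bypasses the Shadow RAM
 * copy passes ICE_AQC_NVM_START_POINT as the module and sets
 * read_shadow_ram to false:
 *
 *   status = ice_aq_read_nvm(hw, ICE_AQC_NVM_START_POINT, 0, 512,
 *                            buf, true, false, NULL);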
*/ if (last_command) cmd->cmd_flags |= ICE_AQC_NVM_LAST_CMD; @@ -42,173 +47,406 @@ ice_aq_read_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, u16 length, } /** - * ice_check_sr_access_params - verify params for Shadow RAM R/W operations. - * @hw: pointer to the HW structure - * @offset: offset in words from module start - * @words: number of words to access + * ice_read_flat_nvm - Read portion of NVM by flat offset + * @hw: pointer to the HW struct + * @offset: offset from beginning of NVM + * @length: (in) number of bytes to read; (out) number of bytes actually read + * @data: buffer to return data in (sized to fit the specified length) + * @read_shadow_ram: if true, read from shadow RAM instead of NVM + * + * Reads a portion of the NVM, as a flat memory space. This function correctly + * breaks read requests across Shadow RAM sectors and ensures that no single + * read request exceeds the maximum 4KB read for a single AdminQ command. + * + * Returns a status code on failure. Note that the data pointer may be + * partially updated if some reads succeed before a failure. */ -static enum ice_status -ice_check_sr_access_params(struct ice_hw *hw, u32 offset, u16 words) +enum ice_status +ice_read_flat_nvm(struct ice_hw *hw, u32 offset, u32 *length, u8 *data, + bool read_shadow_ram) { - if ((offset + words) > hw->nvm.sr_words) { - ice_debug(hw, ICE_DBG_NVM, - "NVM error: offset beyond SR lmt.\n"); - return ICE_ERR_PARAM; - } + enum ice_status status; + u32 inlen = *length; + u32 bytes_read = 0; + bool last_cmd; + + *length = 0; - if (words > ICE_SR_SECTOR_SIZE_IN_WORDS) { - /* We can access only up to 4KB (one sector), in one AQ write */ - ice_debug(hw, ICE_DBG_NVM, - "NVM error: tried to access %d words, limit is %d.\n", - words, ICE_SR_SECTOR_SIZE_IN_WORDS); + /* Verify the length of the read if this is for the Shadow RAM */ + if (read_shadow_ram && ((offset + inlen) > (hw->flash.sr_words * 2u))) { + ice_debug(hw, ICE_DBG_NVM, "NVM error: requested data is beyond Shadow RAM limit\n"); return ICE_ERR_PARAM; } - if (((offset + (words - 1)) / ICE_SR_SECTOR_SIZE_IN_WORDS) != - (offset / ICE_SR_SECTOR_SIZE_IN_WORDS)) { - /* A single access cannot spread over two sectors */ - ice_debug(hw, ICE_DBG_NVM, - "NVM error: cannot spread over two sectors.\n"); + do { + u32 read_size, sector_offset; + + /* ice_aq_read_nvm cannot read more than 4KB at a time. + * Additionally, a read from the Shadow RAM may not cross over + * a sector boundary. Conveniently, the sector size is also + * 4KB. + */ + sector_offset = offset % ICE_AQ_MAX_BUF_LEN; + read_size = min_t(u32, ICE_AQ_MAX_BUF_LEN - sector_offset, + inlen - bytes_read); + + last_cmd = !(bytes_read + read_size < inlen); + + /* ice_aq_read_nvm takes the length as a u16. Our read_size is + * calculated using a u32, but the ICE_AQ_MAX_BUF_LEN maximum + * size guarantees that it will fit within the 2 bytes. 
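+ *
+ * Worked example (assuming ICE_AQ_MAX_BUF_LEN is the 4KB noted above):
+ * a 6000 byte read starting at offset 3000 is issued as three
+ * commands of 1096 bytes (up to the sector boundary), 4096 bytes, and
+ * a final 800 bytes with last_cmd set.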
+ */ + status = ice_aq_read_nvm(hw, ICE_AQC_NVM_START_POINT, + offset, (u16)read_size, + data + bytes_read, last_cmd, + read_shadow_ram, NULL); + if (status) + break; + + bytes_read += read_size; + offset += read_size; + } while (!last_cmd); + + *length = bytes_read; + return status; +} + +/** + * ice_aq_update_nvm + * @hw: pointer to the HW struct + * @module_typeid: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be written (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @command_flags: command parameters + * @cd: pointer to command details structure or NULL + * + * Update the NVM using the admin queue commands (0x0703) + */ +enum ice_status +ice_aq_update_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, + u16 length, void *data, bool last_command, u8 command_flags, + struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + struct ice_aqc_nvm *cmd; + + cmd = &desc.params.nvm; + + /* The highest byte of the offset must be zero. */ + if (offset & 0xFF000000) return ICE_ERR_PARAM; - } - return 0; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_write); + + cmd->cmd_flags |= command_flags; + + /* If this is the last command in a series, set the proper flag. */ + if (last_command) + cmd->cmd_flags |= ICE_AQC_NVM_LAST_CMD; + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->offset_low = cpu_to_le16(offset & 0xFFFF); + cmd->offset_high = (offset >> 16) & 0xFF; + cmd->length = cpu_to_le16(length); + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + return ice_aq_send_cmd(hw, &desc, data, length, cd); } /** - * ice_read_sr_aq - Read Shadow RAM. - * @hw: pointer to the HW structure - * @offset: offset in words from module start - * @words: number of words to read - * @data: buffer for words reads from Shadow RAM - * @last_command: tells the AdminQ that this is the last command + * ice_aq_erase_nvm + * @hw: pointer to the HW struct + * @module_typeid: module pointer location in words from the NVM beginning + * @cd: pointer to command details structure or NULL * - * Reads 16-bit word buffers from the Shadow RAM using the admin command. + * Erase the NVM sector using the admin queue commands (0x0702) */ -static enum ice_status -ice_read_sr_aq(struct ice_hw *hw, u32 offset, u16 words, u16 *data, - bool last_command) +enum ice_status +ice_aq_erase_nvm(struct ice_hw *hw, u16 module_typeid, struct ice_sq_cd *cd) { + struct ice_aq_desc desc; + struct ice_aqc_nvm *cmd; enum ice_status status; + __le16 len; - status = ice_check_sr_access_params(hw, offset, words); + /* Read the module's length word from the Shadow RAM: pass 0 as the */ + /* module_typeid for this read, convert the size word's location to a */ + /* byte offset (2 * module_typeid + 2), and set last_command and */ + /* read_shadow_ram to true */ + status = ice_aq_read_nvm(hw, 0, 2 * module_typeid + 2, 2, &len, true, + true, NULL); + if (status) + return status; - /* values in "offset" and "words" parameters are sized as words - * (16 bits) but ice_aq_read_nvm expects these values in bytes. - * So do this conversion while calling ice_aq_read_nvm. 
- */ - if (!status) - status = ice_aq_read_nvm(hw, 0, 2 * offset, 2 * words, data, - last_command, NULL); + cmd = &desc.params.nvm; - return status; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_erase); + + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->length = len; + cmd->offset_low = 0; + cmd->offset_high = 0; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } + /** * ice_read_sr_word_aq - Reads Shadow RAM via AQ * @hw: pointer to the HW structure * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) * @data: word read from the Shadow RAM * - * Reads one 16 bit word from the Shadow RAM using the ice_read_sr_aq method. + * Reads one 16 bit word from the Shadow RAM using ice_read_flat_nvm. */ static enum ice_status ice_read_sr_word_aq(struct ice_hw *hw, u16 offset, u16 *data) { + u32 bytes = sizeof(u16); enum ice_status status; + __le16 data_local; - status = ice_read_sr_aq(hw, offset, 1, data, true); - if (!status) - *data = le16_to_cpu(*(__force __le16 *)data); + /* Note that ice_read_flat_nvm checks if the read is past the Shadow + * RAM size, and ensures we don't read across a Shadow RAM sector + * boundary + */ + status = ice_read_flat_nvm(hw, offset * sizeof(u16), &bytes, + (__force u8 *)&data_local, true); + if (status) + return status; - return status; + *data = le16_to_cpu(data_local); + return 0; } + /** - * ice_read_sr_buf_aq - Reads Shadow RAM buf via AQ + * ice_acquire_nvm - Generic request for acquiring the NVM ownership * @hw: pointer to the HW structure - * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) - * @words: (in) number of words to read; (out) number of words actually read - * @data: words read from the Shadow RAM + * @access: NVM access type (read or write) + * + * This function will request NVM ownership. + */ +enum ice_status +ice_acquire_nvm(struct ice_hw *hw, enum ice_aq_res_access_type access) +{ + if (hw->flash.blank_nvm_mode) + return 0; + + return ice_acquire_res(hw, ICE_NVM_RES_ID, access, ICE_NVM_TIMEOUT); +} + +/** + * ice_release_nvm - Generic request for releasing the NVM ownership + * @hw: pointer to the HW structure + * + * This function will release NVM ownership. + */ +void ice_release_nvm(struct ice_hw *hw) +{ + if (hw->flash.blank_nvm_mode) + return; + + ice_release_res(hw, ICE_NVM_RES_ID); +} + +/** + * ice_get_flash_bank_offset - Get offset into requested flash bank + * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive flash bank + * @module: the module to read from + * + * Based on the module, lookup the module offset from the beginning of the + * flash. + * + * Returns the flash offset. Note that a value of zero is invalid and must be + * treated as an error. 
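+ *
+ * For example (illustrative values): with an NVM module pointer of
+ * 0x8000, a bank size of 0x10000, and the second bank active, the
+ * active bank resolves to 0x8000 + 0x10000 = 0x18000, while the
+ * inactive bank resolves to 0x8000.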
+ */ +static u32 ice_get_flash_bank_offset(struct ice_hw *hw, enum ice_bank_select bank, u16 module) +{ + struct ice_bank_info *banks = &hw->flash.banks; + enum ice_flash_bank active_bank; + bool second_bank_active; + u32 offset, size; + + switch (module) { + case ICE_SR_1ST_NVM_BANK_PTR: + offset = banks->nvm_ptr; + size = banks->nvm_size; + active_bank = banks->nvm_bank; + break; + case ICE_SR_1ST_OROM_BANK_PTR: + offset = banks->orom_ptr; + size = banks->orom_size; + active_bank = banks->orom_bank; + break; + case ICE_SR_NETLIST_BANK_PTR: + offset = banks->netlist_ptr; + size = banks->netlist_size; + active_bank = banks->netlist_bank; + break; + default: + ice_debug(hw, ICE_DBG_NVM, "Unexpected value for flash module: 0x%04x\n", module); + return 0; + } + + switch (active_bank) { + case ICE_1ST_FLASH_BANK: + second_bank_active = false; + break; + case ICE_2ND_FLASH_BANK: + second_bank_active = true; + break; + default: + ice_debug(hw, ICE_DBG_NVM, "Unexpected value for active flash bank: %u\n", + active_bank); + return 0; + } + + /* The second flash bank is stored immediately following the first + * bank. Based on whether the 1st or 2nd bank is active, and whether + * we want the active or inactive bank, calculate the desired offset. + */ + switch (bank) { + case ICE_ACTIVE_FLASH_BANK: + return offset + (second_bank_active ? size : 0); + case ICE_INACTIVE_FLASH_BANK: + return offset + (second_bank_active ? 0 : size); + } + + ice_debug(hw, ICE_DBG_NVM, "Unexpected value for flash bank selection: %u\n", bank); + return 0; +} + +/** + * ice_read_flash_module - Read a word from one of the main NVM modules + * @hw: pointer to the HW structure + * @bank: which bank of the module to read + * @module: the module to read + * @offset: the offset into the module in bytes + * @data: storage for the word read from the flash + * @length: bytes of data to read + * + * Read data from the specified flash module. The bank parameter indicates + * whether or not to read from the active bank or the inactive bank of that + * module. * - * Reads 16 bit words (data buf) from the SR using the ice_read_sr_aq - * method. Ownership of the NVM is taken before reading the buffer and later - * released. + * The word will be read using flat NVM access, and relies on the + * hw->flash.banks data being setup by ice_determine_active_flash_banks() + * during initialization. */ static enum ice_status -ice_read_sr_buf_aq(struct ice_hw *hw, u16 offset, u16 *words, u16 *data) +ice_read_flash_module(struct ice_hw *hw, enum ice_bank_select bank, u16 module, + u32 offset, u8 *data, u32 length) { enum ice_status status; - bool last_cmd = false; - u16 words_read = 0; - u16 i = 0; + u32 start; - do { - u16 read_size, off_w; + start = ice_get_flash_bank_offset(hw, bank, module); + if (!start) { + ice_debug(hw, ICE_DBG_NVM, "Unable to calculate flash bank offset for module 0x%04x\n", + module); + return ICE_ERR_PARAM; + } - /* Calculate number of bytes we should read in this step. - * It's not allowed to read more than one page at a time or - * to cross page boundaries. - */ - off_w = offset % ICE_SR_SECTOR_SIZE_IN_WORDS; - read_size = off_w ? 
- min_t(u16, *words, - (ICE_SR_SECTOR_SIZE_IN_WORDS - off_w)) : - min_t(u16, (*words - words_read), - ICE_SR_SECTOR_SIZE_IN_WORDS); - - /* Check if this is last command, if so set proper flag */ - if ((words_read + read_size) >= *words) - last_cmd = true; - - status = ice_read_sr_aq(hw, offset, read_size, - data + words_read, last_cmd); - if (status) - goto read_nvm_buf_aq_exit; + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; - /* Increment counter for words already read and move offset to - * new read location - */ - words_read += read_size; - offset += read_size; - } while (words_read < *words); + status = ice_read_flat_nvm(hw, start + offset, &length, data, false); - for (i = 0; i < *words; i++) - data[i] = le16_to_cpu(((__force __le16 *)data)[i]); + ice_release_nvm(hw); -read_nvm_buf_aq_exit: - *words = words_read; return status; } /** - * ice_acquire_nvm - Generic request for acquiring the NVM ownership + * ice_read_nvm_module - Read from the active main NVM module * @hw: pointer to the HW structure - * @access: NVM access type (read or write) + * @bank: whether to read from active or inactive NVM module + * @offset: offset into the NVM module to read, in words + * @data: storage for returned word value * - * This function will request NVM ownership. + * Read the specified word from the active NVM module. This includes the CSS + * header at the start of the NVM module. */ static enum ice_status -ice_acquire_nvm(struct ice_hw *hw, enum ice_aq_res_access_type access) +ice_read_nvm_module(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) { - if (hw->nvm.blank_nvm_mode) - return 0; + enum ice_status status; + __le16 data_local; - return ice_acquire_res(hw, ICE_NVM_RES_ID, access, ICE_NVM_TIMEOUT); + status = ice_read_flash_module(hw, bank, ICE_SR_1ST_NVM_BANK_PTR, offset * sizeof(u16), + (__force u8 *)&data_local, sizeof(u16)); + if (!status) + *data = le16_to_cpu(data_local); + + return status; } /** - * ice_release_nvm - Generic request for releasing the NVM ownership + * ice_read_nvm_sr_copy - Read a word from the Shadow RAM copy in the NVM bank * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive NVM module + * @offset: offset into the Shadow RAM copy to read, in words + * @data: storage for returned word value * - * This function will release NVM ownership. + * Read the specified word from the copy of the Shadow RAM found in the + * specified NVM module. */ -static void ice_release_nvm(struct ice_hw *hw) +static enum ice_status +ice_read_nvm_sr_copy(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) { - if (hw->nvm.blank_nvm_mode) - return; + return ice_read_nvm_module(hw, bank, ICE_NVM_SR_COPY_WORD_OFFSET + offset, data); +} - ice_release_res(hw, ICE_NVM_RES_ID); +/** + * ice_read_orom_module - Read from the active Option ROM module + * @hw: pointer to the HW structure + * @bank: whether to read from active or inactive OROM module + * @offset: offset into the OROM module to read, in words + * @data: storage for returned word value + * + * Read the specified word from the active Option ROM module of the flash. + * Note that unlike the NVM module, the CSS data is stored at the end of the + * module instead of at the beginning. 
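+ *
+ * Because @offset here is in words while the underlying flat read
+ * takes bytes, a read of OROM word 10, for example, becomes a two
+ * byte read at byte offset 20 within the module.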
+ */ +static enum ice_status +ice_read_orom_module(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) +{ + enum ice_status status; + __le16 data_local; + + status = ice_read_flash_module(hw, bank, ICE_SR_1ST_OROM_BANK_PTR, offset * sizeof(u16), + (__force u8 *)&data_local, sizeof(u16)); + if (!status) + *data = le16_to_cpu(data_local); + + return status; +} + +/** + * ice_read_netlist_module - Read data from the netlist module area + * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive module + * @offset: offset into the netlist to read from + * @data: storage for returned word value + * + * Read a word from the specified netlist bank. + */ +static enum ice_status +ice_read_netlist_module(struct ice_hw *hw, enum ice_bank_select bank, u32 offset, u16 *data) +{ + enum ice_status status; + __le16 data_local; + + status = ice_read_flash_module(hw, bank, ICE_SR_NETLIST_BANK_PTR, offset * sizeof(u16), + (__force u8 *)&data_local, sizeof(u16)); + if (!status) + *data = le16_to_cpu(data_local); + + return status; } /** @@ -219,8 +457,7 @@ static void ice_release_nvm(struct ice_hw *hw) * * Reads one 16 bit word from the Shadow RAM using the ice_read_sr_word_aq. */ -static enum ice_status -ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) +enum ice_status ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) { enum ice_status status; @@ -234,111 +471,734 @@ ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) } /** - * ice_init_nvm - initializes NVM setting - * @hw: pointer to the HW struct + * ice_get_pfa_module_tlv - Reads sub module TLV from NVM PFA + * @hw: pointer to hardware structure + * @module_tlv: pointer to module TLV to return + * @module_tlv_len: pointer to module TLV length to return + * @module_type: module type requested * - * This function reads and populates NVM settings such as Shadow RAM size, - * max_timeout, and blank_nvm_mode + * Finds the requested sub module TLV type from the Preserved Field + * Area (PFA) and returns the TLV pointer and length. The caller can + * use these to read the variable length TLV value. */ -enum ice_status ice_init_nvm(struct ice_hw *hw) +enum ice_status +ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, + u16 module_type) { - struct ice_nvm_info *nvm = &hw->nvm; - u16 eetrack_lo, eetrack_hi; - enum ice_status status = 0; - u32 fla, gens_stat; - u8 sr_size; - - /* The SR size is stored regardless of the NVM programming mode - * as the blank mode may be used in the factory line. 
- */ - gens_stat = rd32(hw, GLNVM_GENS); - sr_size = (gens_stat & GLNVM_GENS_SR_SIZE_M) >> GLNVM_GENS_SR_SIZE_S; - - /* Switching to words (sr_size contains power of 2) */ - nvm->sr_words = BIT(sr_size) * ICE_SR_WORDS_IN_1KB; + enum ice_status status; + u16 pfa_len, pfa_ptr; + u16 next_tlv; - /* Check if we are in the normal or blank NVM programming mode */ - fla = rd32(hw, GLNVM_FLA); - if (fla & GLNVM_FLA_LOCKED_M) { /* Normal programming mode */ - nvm->blank_nvm_mode = false; - } else { /* Blank programming mode */ - nvm->blank_nvm_mode = true; - status = ICE_ERR_NVM_BLANK_MODE; - ice_debug(hw, ICE_DBG_NVM, - "NVM init error: unsupported blank mode.\n"); + status = ice_read_sr_word(hw, ICE_SR_PFA_PTR, &pfa_ptr); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read Preserved Field Array pointer.\n"); return status; } - - status = ice_read_sr_word(hw, ICE_SR_NVM_DEV_STARTER_VER, &hw->nvm.ver); + status = ice_read_sr_word(hw, pfa_ptr, &pfa_len); if (status) { - ice_debug(hw, ICE_DBG_INIT, - "Failed to read DEV starter version.\n"); + ice_debug(hw, ICE_DBG_INIT, "Failed to read PFA length.\n"); return status; } + /* Starting with first TLV after PFA length, iterate through the list + * of TLVs to find the requested one. + */ + next_tlv = pfa_ptr + 1; + while (next_tlv < pfa_ptr + pfa_len) { + u16 tlv_sub_module_type; + u16 tlv_len; + + /* Read TLV type */ + status = ice_read_sr_word(hw, next_tlv, &tlv_sub_module_type); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read TLV type.\n"); + break; + } + /* Read TLV length */ + status = ice_read_sr_word(hw, next_tlv + 1, &tlv_len); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read TLV length.\n"); + break; + } + if (tlv_sub_module_type == module_type) { + if (tlv_len) { + *module_tlv = next_tlv; + *module_tlv_len = tlv_len; + return 0; + } + return ICE_ERR_INVAL_SIZE; + } + /* Check next TLV, i.e. current TLV pointer + length + 2 words + * (for current TLV's type and length) + */ + next_tlv = next_tlv + tlv_len + 2; + } + /* Module does not exist */ + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_read_pba_string - Reads part number string from NVM + * @hw: pointer to hardware structure + * @pba_num: stores the part number string from the NVM + * @pba_num_size: part number string buffer length + * + * Reads the part number string from the NVM. 
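+ *
+ * Sizing note (illustrative): a PBA section size word of 6 leaves 5
+ * data words once the size word itself is excluded, so the caller
+ * must supply at least 5 * 2 + 1 = 11 bytes to hold the ten ASCII
+ * characters plus the terminating NUL.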
+ */ +enum ice_status +ice_read_pba_string(struct ice_hw *hw, u8 *pba_num, u32 pba_num_size) +{ + u16 pba_tlv, pba_tlv_len; + enum ice_status status; + u16 pba_word, pba_size; + u16 i; - status = ice_read_sr_word(hw, ICE_SR_NVM_EETRACK_LO, &eetrack_lo); + status = ice_get_pfa_module_tlv(hw, &pba_tlv, &pba_tlv_len, + ICE_SR_PBA_BLOCK_PTR); if (status) { - ice_debug(hw, ICE_DBG_INIT, "Failed to read EETRACK lo.\n"); + ice_debug(hw, ICE_DBG_INIT, "Failed to read PBA Block TLV.\n"); return status; } - status = ice_read_sr_word(hw, ICE_SR_NVM_EETRACK_HI, &eetrack_hi); + + /* pba_size is the next word */ + status = ice_read_sr_word(hw, (pba_tlv + 2), &pba_size); if (status) { - ice_debug(hw, ICE_DBG_INIT, "Failed to read EETRACK hi.\n"); + ice_debug(hw, ICE_DBG_INIT, "Failed to read PBA Section size.\n"); return status; } - hw->nvm.eetrack = (eetrack_hi << 16) | eetrack_lo; + if (pba_tlv_len < pba_size) { + ice_debug(hw, ICE_DBG_INIT, "Invalid PBA Block TLV size.\n"); + return ICE_ERR_INVAL_SIZE; + } + + /* Subtract one to get PBA word count (PBA Size word is included in + * total size) + */ + pba_size--; + if (pba_num_size < (((u32)pba_size * 2) + 1)) { + ice_debug(hw, ICE_DBG_INIT, "Buffer too small for PBA data.\n"); + return ICE_ERR_PARAM; + } + + for (i = 0; i < pba_size; i++) { + status = ice_read_sr_word(hw, (pba_tlv + 2 + 1) + i, &pba_word); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read PBA Block word %d.\n", i); + return status; + } + + pba_num[(i * 2)] = (pba_word >> 8) & 0xFF; + pba_num[(i * 2) + 1] = pba_word & 0xFF; + } + pba_num[(pba_size * 2)] = '\0'; return status; } /** - * ice_read_sr_buf - Reads Shadow RAM buf and acquire lock if necessary - * @hw: pointer to the HW structure - * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) - * @words: (in) number of words to read; (out) number of words actually read - * @data: words read from the Shadow RAM + * ice_get_nvm_srev - Read the security revision from the NVM CSS header + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @srev: storage for security revision * - * Reads 16 bit words (data buf) from the SR using the ice_read_nvm_buf_aq - * method. The buf read is preceded by the NVM ownership take - * and followed by the release. + * Read the security revision out of the CSS header of the active NVM module + * bank. */ -enum ice_status -ice_read_sr_buf(struct ice_hw *hw, u16 offset, u16 *words, u16 *data) +static enum ice_status ice_get_nvm_srev(struct ice_hw *hw, enum ice_bank_select bank, u32 *srev) { enum ice_status status; + u16 srev_l, srev_h; - status = ice_acquire_nvm(hw, ICE_RES_READ); - if (!status) { - status = ice_read_sr_buf_aq(hw, offset, words, data); - ice_release_nvm(hw); - } + status = ice_read_nvm_module(hw, bank, ICE_NVM_CSS_SREV_L, &srev_l); + if (status) + return status; - return status; + status = ice_read_nvm_module(hw, bank, ICE_NVM_CSS_SREV_H, &srev_h); + if (status) + return status; + + *srev = srev_h << 16 | srev_l; + + return 0; } /** - * ice_nvm_validate_checksum + * ice_get_nvm_ver_info - Read NVM version information * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @nvm: pointer to NVM info structure * - * Verify NVM PFA checksum validity (0x0706) + * Read the NVM EETRACK ID and map version of the main NVM image bank, filling + * in the nvm info structure. 
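+ *
+ * Decode sketch (the mask and shift values are assumed here, not
+ * shown in this hunk): with a 4 bit major in the high nibble and an
+ * 8 bit minor in the low byte, a dev starter version word of 0x3042
+ * would yield major 3 and minor 0x42; the EETRACK ID is assembled as
+ * (eetrack_hi << 16) | eetrack_lo.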
*/ -enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw) +static enum ice_status +ice_get_nvm_ver_info(struct ice_hw *hw, enum ice_bank_select bank, struct ice_nvm_info *nvm) { - struct ice_aqc_nvm_checksum *cmd; - struct ice_aq_desc desc; + u16 eetrack_lo, eetrack_hi, ver; enum ice_status status; - status = ice_acquire_nvm(hw, ICE_RES_READ); - if (status) + status = ice_read_nvm_sr_copy(hw, bank, ICE_SR_NVM_DEV_STARTER_VER, &ver); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read DEV starter version.\n"); return status; + } - cmd = &desc.params.nvm_checksum; + nvm->major = (ver & ICE_NVM_VER_HI_MASK) >> ICE_NVM_VER_HI_SHIFT; + nvm->minor = (ver & ICE_NVM_VER_LO_MASK) >> ICE_NVM_VER_LO_SHIFT; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_checksum); - cmd->flags = ICE_AQC_NVM_CHECKSUM_VERIFY; + status = ice_read_nvm_sr_copy(hw, bank, ICE_SR_NVM_EETRACK_LO, &eetrack_lo); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read EETRACK lo.\n"); + return status; + } + status = ice_read_nvm_sr_copy(hw, bank, ICE_SR_NVM_EETRACK_HI, &eetrack_hi); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read EETRACK hi.\n"); + return status; + } + + nvm->eetrack = (eetrack_hi << 16) | eetrack_lo; + + status = ice_get_nvm_srev(hw, bank, &nvm->srev); + if (status) + ice_debug(hw, ICE_DBG_NVM, "Failed to read NVM security revision.\n"); + + return 0; +} + +/** + * ice_get_inactive_nvm_ver - Read Option ROM version from the inactive bank + * @hw: pointer to the HW structure + * @nvm: storage for Option ROM version information + * + * Reads the NVM EETRACK ID, Map version, and security revision of the + * inactive NVM bank. Used to access version data for a pending update that + * has not yet been activated. + */ +enum ice_status ice_get_inactive_nvm_ver(struct ice_hw *hw, struct ice_nvm_info *nvm) +{ + return ice_get_nvm_ver_info(hw, ICE_INACTIVE_FLASH_BANK, nvm); +} + +/** + * ice_get_orom_srev - Read the security revision from the OROM CSS header + * @hw: pointer to the HW struct + * @bank: whether to read from active or inactive flash module + * @srev: storage for security revision + * + * Read the security revision out of the CSS header of the active OROM module + * bank. + */ +static enum ice_status ice_get_orom_srev(struct ice_hw *hw, enum ice_bank_select bank, u32 *srev) +{ + enum ice_status status; + u16 srev_l, srev_h; + u32 css_start; + + if (hw->flash.banks.orom_size < ICE_NVM_OROM_TRAILER_LENGTH) { + ice_debug(hw, ICE_DBG_NVM, "Unexpected Option ROM Size of %u\n", + hw->flash.banks.orom_size); + return ICE_ERR_CFG; + } + + /* calculate how far into the Option ROM the CSS header starts. Note + * that ice_read_orom_module takes a word offset so we need to + * divide by 2 here. + */ + css_start = (hw->flash.banks.orom_size - ICE_NVM_OROM_TRAILER_LENGTH) / 2; + + status = ice_read_orom_module(hw, bank, css_start + ICE_NVM_CSS_SREV_L, &srev_l); + if (status) + return status; + + status = ice_read_orom_module(hw, bank, css_start + ICE_NVM_CSS_SREV_H, &srev_h); + if (status) + return status; + + *srev = srev_h << 16 | srev_l; + + return 0; +} + +/** + * ice_get_orom_civd_data - Get the combo version information from Option ROM + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash module + * @civd: storage for the Option ROM CIVD data. + * + * Searches through the Option ROM flash contents to locate the CIVD data for + * the image. 
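+ *
+ * Checksum note (illustrative): the sum below is accumulated in a u8,
+ * which implements the modulo 256 arithmetic, so a structure whose
+ * bytes total 0x300 sums to 0 and passes, while a total of 0x301
+ * sums to 1 and is rejected.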
+ */ +static enum ice_status +ice_get_orom_civd_data(struct ice_hw *hw, enum ice_bank_select bank, + struct ice_orom_civd_info *civd) +{ + struct ice_orom_civd_info tmp; + enum ice_status status; + u32 offset; + + /* The CIVD section is located in the Option ROM aligned to 512 bytes. + * The first 4 bytes must contain the ASCII characters "$CIV". + * A simple modulo 256 sum of all of the bytes of the structure must + * equal 0. + */ + for (offset = 0; (offset + 512) <= hw->flash.banks.orom_size; offset += 512) { + u8 sum = 0, i; + + status = ice_read_flash_module(hw, bank, ICE_SR_1ST_OROM_BANK_PTR, + offset, (u8 *)&tmp, sizeof(tmp)); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Unable to read Option ROM CIVD data\n"); + return status; + } + + /* Skip forward until we find a matching signature */ + if (memcmp("$CIV", tmp.signature, sizeof(tmp.signature)) != 0) + continue; + + /* Verify that the simple checksum is zero */ + for (i = 0; i < sizeof(tmp); i++) + /* cppcheck-suppress objectIndex */ + sum += ((u8 *)&tmp)[i]; + + if (sum) { + ice_debug(hw, ICE_DBG_NVM, "Found CIVD data with invalid checksum of %u\n", + sum); + return ICE_ERR_NVM; + } + + *civd = tmp; + return 0; + } + + return ICE_ERR_NVM; +} + +/** + * ice_get_orom_ver_info - Read Option ROM version information + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash module + * @orom: pointer to Option ROM info structure + * + * Read Option ROM version and security revision from the Option ROM flash + * section. + */ +static enum ice_status +ice_get_orom_ver_info(struct ice_hw *hw, enum ice_bank_select bank, struct ice_orom_info *orom) +{ + struct ice_orom_civd_info civd; + enum ice_status status; + u32 combo_ver; + + status = ice_get_orom_civd_data(hw, bank, &civd); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to locate valid Option ROM CIVD data\n"); + return status; + } + + combo_ver = le32_to_cpu(civd.combo_ver); + + orom->major = (u8)((combo_ver & ICE_OROM_VER_MASK) >> ICE_OROM_VER_SHIFT); + orom->patch = (u8)(combo_ver & ICE_OROM_VER_PATCH_MASK); + orom->build = (u16)((combo_ver & ICE_OROM_VER_BUILD_MASK) >> ICE_OROM_VER_BUILD_SHIFT); + + status = ice_get_orom_srev(hw, bank, &orom->srev); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read Option ROM security revision.\n"); + return status; + } + + return 0; +} + +/** + * ice_get_inactive_orom_ver - Read Option ROM version from the inactive bank + * @hw: pointer to the HW structure + * @orom: storage for Option ROM version information + * + * Reads the Option ROM version and security revision data for the inactive + * section of flash. Used to access version data for a pending update that has + * not yet been activated. + */ +enum ice_status ice_get_inactive_orom_ver(struct ice_hw *hw, struct ice_orom_info *orom) +{ + return ice_get_orom_ver_info(hw, ICE_INACTIVE_FLASH_BANK, orom); +} + +/** + * ice_get_netlist_info + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @netlist: pointer to netlist version info structure + * + * Get the netlist version information from the requested bank. Reads the Link + * Topology section to find the Netlist ID block and extract the relevant + * information into the netlist version structure. 
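+ *
+ * Layout note (illustrative): each 32 bit version field is stored as
+ * a high/low word pair, so a major version stored as words 0x0001
+ * and 0x0002 is reported as 0x00010002.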
+ */ +static enum ice_status +ice_get_netlist_info(struct ice_hw *hw, enum ice_bank_select bank, + struct ice_netlist_info *netlist) +{ + u16 module_id, length, node_count, i; + enum ice_status status; + u16 *id_blk; + + status = ice_read_netlist_module(hw, bank, ICE_NETLIST_TYPE_OFFSET, &module_id); + if (status) + return status; + + if (module_id != ICE_NETLIST_LINK_TOPO_MOD_ID) { + ice_debug(hw, ICE_DBG_NVM, "Expected netlist module_id of 0x%04x, but got 0x%04x\n", + ICE_NETLIST_LINK_TOPO_MOD_ID, module_id); + return ICE_ERR_NVM; + } + + status = ice_read_netlist_module(hw, bank, ICE_LINK_TOPO_MODULE_LEN, &length); + if (status) + return status; + + /* sanity check that we have at least enough words to store the netlist ID block */ + if (length < ICE_NETLIST_ID_BLK_SIZE) { + ice_debug(hw, ICE_DBG_NVM, "Netlist Link Topology module too small. Expected at least %u words, but got %u words.\n", + ICE_NETLIST_ID_BLK_SIZE, length); + return ICE_ERR_NVM; + } + + status = ice_read_netlist_module(hw, bank, ICE_LINK_TOPO_NODE_COUNT, &node_count); + if (status) + return status; + node_count &= ICE_LINK_TOPO_NODE_COUNT_M; + + id_blk = devm_kcalloc(ice_hw_to_dev(hw), ICE_NETLIST_ID_BLK_SIZE, + sizeof(*id_blk), GFP_KERNEL); + if (!id_blk) + return ICE_ERR_NO_MEMORY; + + /* Read out the entire Netlist ID Block at once. */ + status = ice_read_flash_module(hw, bank, ICE_SR_NETLIST_BANK_PTR, + ICE_NETLIST_ID_BLK_OFFSET(node_count) * sizeof(u16), + (u8 *)id_blk, ICE_NETLIST_ID_BLK_SIZE * sizeof(u16)); + if (status) + goto exit_error; + + for (i = 0; i < ICE_NETLIST_ID_BLK_SIZE; i++) + id_blk[i] = le16_to_cpu(((__force __le16 *)id_blk)[i]); + + netlist->major = id_blk[ICE_NETLIST_ID_BLK_MAJOR_VER_HIGH] << 16 | + id_blk[ICE_NETLIST_ID_BLK_MAJOR_VER_LOW]; + netlist->minor = id_blk[ICE_NETLIST_ID_BLK_MINOR_VER_HIGH] << 16 | + id_blk[ICE_NETLIST_ID_BLK_MINOR_VER_LOW]; + netlist->type = id_blk[ICE_NETLIST_ID_BLK_TYPE_HIGH] << 16 | + id_blk[ICE_NETLIST_ID_BLK_TYPE_LOW]; + netlist->rev = id_blk[ICE_NETLIST_ID_BLK_REV_HIGH] << 16 | + id_blk[ICE_NETLIST_ID_BLK_REV_LOW]; + netlist->cust_ver = id_blk[ICE_NETLIST_ID_BLK_CUST_VER]; + /* Read the leftmost 4 bytes of the SHA hash */ + netlist->hash = id_blk[ICE_NETLIST_ID_BLK_SHA_HASH_WORD(15)] << 16 | + id_blk[ICE_NETLIST_ID_BLK_SHA_HASH_WORD(14)]; + +exit_error: + devm_kfree(ice_hw_to_dev(hw), id_blk); + + return status; +} + + +/** + * ice_get_inactive_netlist_ver + * @hw: pointer to the HW struct + * @netlist: pointer to netlist version info structure + * + * Read the netlist version data from the inactive netlist bank. Used to + * extract version data of a pending flash update in order to display it. + */ +enum ice_status ice_get_inactive_netlist_ver(struct ice_hw *hw, struct ice_netlist_info *netlist) +{ + return ice_get_netlist_info(hw, ICE_INACTIVE_FLASH_BANK, netlist); +} + +/** + * ice_discover_flash_size - Discover the available flash size. + * @hw: pointer to the HW struct + * + * The device flash could be up to 16MB in size. However, it is possible that + * the actual size is smaller. Use bisection to determine the accessible size + * of flash memory. 
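+ *
+ * Worked example (assuming an 8MB part in the 16MB address space):
+ * the first probe at offset 0x800000 fails with EINVAL and lowers
+ * the upper bound, later probes succeed and raise the lower bound,
+ * and the search terminates with max_size = 0x800000 reported as
+ * the flash size.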
+ */ +static enum ice_status ice_discover_flash_size(struct ice_hw *hw) +{ + u32 min_size = 0, max_size = ICE_AQC_NVM_MAX_OFFSET + 1; + enum ice_status status; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; + + while ((max_size - min_size) > 1) { + u32 offset = (max_size + min_size) / 2; + u32 len = 1; + u8 data; + + status = ice_read_flat_nvm(hw, offset, &len, &data, false); + if (status == ICE_ERR_AQ_ERROR && + hw->adminq.sq_last_status == ICE_AQ_RC_EINVAL) { + ice_debug(hw, ICE_DBG_NVM, "%s: New upper bound of %u bytes\n", + __func__, offset); + status = 0; + max_size = offset; + } else if (!status) { + ice_debug(hw, ICE_DBG_NVM, "%s: New lower bound of %u bytes\n", + __func__, offset); + min_size = offset; + } else { + /* an unexpected error occurred */ + goto err_read_flat_nvm; + } + } + + ice_debug(hw, ICE_DBG_NVM, "Predicted flash size is %u bytes\n", max_size); + + hw->flash.flash_size = max_size; + +err_read_flat_nvm: + ice_release_nvm(hw); + + return status; +} + +/** + * ice_read_sr_pointer - Read the value of a Shadow RAM pointer word + * @hw: pointer to the HW structure + * @offset: the word offset of the Shadow RAM word to read + * @pointer: pointer value read from Shadow RAM + * + * Read the given Shadow RAM word, and convert it to a pointer value specified + * in bytes. This function assumes the specified offset is a valid pointer + * word. + * + * Each pointer word specifies whether it is stored in word size or 4KB + * sector size by using the highest bit. The reported pointer value will be in + * bytes, intended for flat NVM reads. + */ +static enum ice_status +ice_read_sr_pointer(struct ice_hw *hw, u16 offset, u32 *pointer) +{ + enum ice_status status; + u16 value; + + status = ice_read_sr_word(hw, offset, &value); + if (status) + return status; + + /* Determine if the pointer is in 4KB or word units */ + if (value & ICE_SR_NVM_PTR_4KB_UNITS) + *pointer = (value & ~ICE_SR_NVM_PTR_4KB_UNITS) * 4 * 1024; + else + *pointer = value * 2; + + return 0; +} + +/** + * ice_read_sr_area_size - Read an area size from a Shadow RAM word + * @hw: pointer to the HW structure + * @offset: the word offset of the Shadow RAM to read + * @size: size value read from the Shadow RAM + * + * Read the given Shadow RAM word, and convert it to an area size value + * specified in bytes. This function assumes the specified offset is a valid + * area size word. + * + * Each area size word is specified in 4KB sector units. This function reports + * the size in bytes, intended for flat NVM reads. + */ +static enum ice_status +ice_read_sr_area_size(struct ice_hw *hw, u16 offset, u32 *size) +{ + enum ice_status status; + u16 value; + + status = ice_read_sr_word(hw, offset, &value); + if (status) + return status; + + /* Area sizes are always specified in 4KB units */ + *size = value * 4 * 1024; + + return 0; +} + +/** + * ice_determine_active_flash_banks - Discover active bank for each module + * @hw: pointer to the HW struct + * + * Read the Shadow RAM control word and determine which banks are active for + * the NVM, OROM, and Netlist modules. Also read and calculate the associated + * pointer and size. These values are then cached into the ice_flash_info + * structure for later use in order to calculate the correct offset to read + * from the active module. 
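+ *
+ * Pointer decoding follows ice_read_sr_pointer above: for example
+ * (illustrative), a pointer word of 0x8004 with the 4KB units bit
+ * set decodes to 4 * 4096 = 16384 bytes, while 0x0004 without the
+ * bit decodes to 4 * 2 = 8 bytes.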
+ */ +static enum ice_status +ice_determine_active_flash_banks(struct ice_hw *hw) +{ + struct ice_bank_info *banks = &hw->flash.banks; + enum ice_status status; + u16 ctrl_word; + + status = ice_read_sr_word(hw, ICE_SR_NVM_CTRL_WORD, &ctrl_word); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read the Shadow RAM control word\n"); + return status; + } + + /* Check that the control word indicates validity */ + if ((ctrl_word & ICE_SR_CTRL_WORD_1_M) >> ICE_SR_CTRL_WORD_1_S != ICE_SR_CTRL_WORD_VALID) { + ice_debug(hw, ICE_DBG_NVM, "Shadow RAM control word is invalid\n"); + return ICE_ERR_CFG; + } + + if (!(ctrl_word & ICE_SR_CTRL_WORD_NVM_BANK)) + banks->nvm_bank = ICE_1ST_FLASH_BANK; + else + banks->nvm_bank = ICE_2ND_FLASH_BANK; + + if (!(ctrl_word & ICE_SR_CTRL_WORD_OROM_BANK)) + banks->orom_bank = ICE_1ST_FLASH_BANK; + else + banks->orom_bank = ICE_2ND_FLASH_BANK; + + if (!(ctrl_word & ICE_SR_CTRL_WORD_NETLIST_BANK)) + banks->netlist_bank = ICE_1ST_FLASH_BANK; + else + banks->netlist_bank = ICE_2ND_FLASH_BANK; + + status = ice_read_sr_pointer(hw, ICE_SR_1ST_NVM_BANK_PTR, &banks->nvm_ptr); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read NVM bank pointer\n"); + return status; + } + + status = ice_read_sr_area_size(hw, ICE_SR_NVM_BANK_SIZE, &banks->nvm_size); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read NVM bank area size\n"); + return status; + } + + status = ice_read_sr_pointer(hw, ICE_SR_1ST_OROM_BANK_PTR, &banks->orom_ptr); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read OROM bank pointer\n"); + return status; + } + + status = ice_read_sr_area_size(hw, ICE_SR_OROM_BANK_SIZE, &banks->orom_size); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read OROM bank area size\n"); + return status; + } + + status = ice_read_sr_pointer(hw, ICE_SR_NETLIST_BANK_PTR, &banks->netlist_ptr); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read Netlist bank pointer\n"); + return status; + } + + status = ice_read_sr_area_size(hw, ICE_SR_NETLIST_BANK_SIZE, &banks->netlist_size); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to read Netlist bank area size\n"); + return status; + } + + return 0; +} + +/** + * ice_init_nvm - initializes NVM setting + * @hw: pointer to the HW struct + * + * This function reads and populates NVM settings such as Shadow RAM size, + * max_timeout, and blank_nvm_mode + */ +enum ice_status ice_init_nvm(struct ice_hw *hw) +{ + struct ice_flash_info *flash = &hw->flash; + enum ice_status status; + u32 fla, gens_stat; + u8 sr_size; + + /* The SR size is stored regardless of the NVM programming mode + * as the blank mode may be used in the factory line. 
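+ *
+ * For example (assuming ICE_SR_WORDS_IN_1KB is 512, i.e. 1KB holds
+ * 512 16 bit words), a GLNVM_GENS size field of 7 yields
+ * BIT(7) * 512 = 65536 words, or 128KB of Shadow RAM.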
+ */ + gens_stat = rd32(hw, GLNVM_GENS); + sr_size = (gens_stat & GLNVM_GENS_SR_SIZE_M) >> GLNVM_GENS_SR_SIZE_S; + + /* Switching to words (sr_size contains power of 2) */ + flash->sr_words = BIT(sr_size) * ICE_SR_WORDS_IN_1KB; + + /* Check if we are in the normal or blank NVM programming mode */ + fla = rd32(hw, GLNVM_FLA); + if (fla & GLNVM_FLA_LOCKED_M) { /* Normal programming mode */ + flash->blank_nvm_mode = false; + } else { + /* Blank programming mode */ + flash->blank_nvm_mode = true; + ice_debug(hw, ICE_DBG_NVM, "NVM init error: unsupported blank mode.\n"); + return ICE_ERR_NVM_BLANK_MODE; + } + + status = ice_discover_flash_size(hw); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "NVM init error: failed to discover flash size.\n"); + return status; + } + + status = ice_determine_active_flash_banks(hw); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to determine active flash banks.\n"); + return status; + } + + status = ice_get_nvm_ver_info(hw, ICE_ACTIVE_FLASH_BANK, &flash->nvm); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to read NVM info.\n"); + return status; + } + + status = ice_get_orom_ver_info(hw, ICE_ACTIVE_FLASH_BANK, &flash->orom); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to read Option ROM info.\n"); + + /* read the netlist version information */ + status = ice_get_netlist_info(hw, ICE_ACTIVE_FLASH_BANK, &flash->netlist); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to read netlist info.\n"); + return 0; +} + + +/** + * ice_nvm_validate_checksum + * @hw: pointer to the HW struct + * + * Verify NVM PFA checksum validity (0x0706) + */ +enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw) +{ + struct ice_aqc_nvm_checksum *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; + + cmd = &desc.params.nvm_checksum; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_checksum); + cmd->flags = ICE_AQC_NVM_CHECKSUM_VERIFY; status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); + ice_release_nvm(hw); if (!status) @@ -347,3 +1207,496 @@ enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw) return status; } + +/** + * ice_nvm_recalculate_checksum + * @hw: pointer to the HW struct + * + * Recalculate NVM PFA checksum (0x0706) + */ +enum ice_status ice_nvm_recalculate_checksum(struct ice_hw *hw) +{ + struct ice_aqc_nvm_checksum *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; + + cmd = &desc.params.nvm_checksum; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_checksum); + cmd->flags = ICE_AQC_NVM_CHECKSUM_RECALC; + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); + + ice_release_nvm(hw); + + return status; +} + +/** + * ice_nvm_write_activate + * @hw: pointer to the HW struct + * @cmd_flags: NVM activate admin command bits (banks to be validated) + * + * Update the control word with the required banks' validity bits + * and dumps the Shadow RAM to flash (0x0707) + */ +enum ice_status ice_nvm_write_activate(struct ice_hw *hw, u8 cmd_flags) +{ + struct ice_aqc_nvm *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.nvm; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_write_activate); + + cmd->cmd_flags = cmd_flags; + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} + +/** + * ice_get_nvm_minsrevs - Get the Minimum Security Revision values from flash + * @hw: pointer to the HW struct + * @minsrevs: structure to store NVM and OROM minsrev 
values + * + * Read the Minimum Security Revision TLV and extract the revision values from + * the flash image into a readable structure for processing. + */ +enum ice_status +ice_get_nvm_minsrevs(struct ice_hw *hw, struct ice_minsrev_info *minsrevs) +{ + struct ice_aqc_nvm_minsrev data; + enum ice_status status; + u16 valid; + + status = ice_acquire_nvm(hw, ICE_RES_READ); + if (status) + return status; + + status = ice_aq_read_nvm(hw, ICE_AQC_NVM_MINSREV_MOD_ID, 0, sizeof(data), + &data, true, false, NULL); + + ice_release_nvm(hw); + + if (status) + return status; + + valid = le16_to_cpu(data.validity); + + /* Extract NVM minimum security revision */ + if (valid & ICE_AQC_NVM_MINSREV_NVM_VALID) { + u16 minsrev_l, minsrev_h; + + minsrev_l = le16_to_cpu(data.nvm_minsrev_l); + minsrev_h = le16_to_cpu(data.nvm_minsrev_h); + + minsrevs->nvm = minsrev_h << 16 | minsrev_l; + minsrevs->nvm_valid = true; + } + + /* Extract the OROM minimum security revision */ + if (valid & ICE_AQC_NVM_MINSREV_OROM_VALID) { + u16 minsrev_l, minsrev_h; + + minsrev_l = le16_to_cpu(data.orom_minsrev_l); + minsrev_h = le16_to_cpu(data.orom_minsrev_h); + + minsrevs->orom = minsrev_h << 16 | minsrev_l; + minsrevs->orom_valid = true; + } + + return 0; +} + +/** + * ice_update_nvm_minsrevs - Update minimum security revision TLV data in flash + * @hw: pointer to the HW struct + * @minsrevs: minimum security revision information + * + * Update the NVM or Option ROM minimum security revision fields in the PFA + * area of the flash. Reads the minsrevs->nvm_valid and minsrevs->orom_valid + * fields to determine what update is being requested. If the valid bit is not + * set for that module, then the associated minsrev will be left as is. + */ +enum ice_status +ice_update_nvm_minsrevs(struct ice_hw *hw, struct ice_minsrev_info *minsrevs) +{ + struct ice_aqc_nvm_minsrev data; + enum ice_status status; + + if (!minsrevs->nvm_valid && !minsrevs->orom_valid) { + ice_debug(hw, ICE_DBG_NVM, "At least one of NVM and OROM MinSrev must be valid"); + return ICE_ERR_PARAM; + } + + status = ice_acquire_nvm(hw, ICE_RES_WRITE); + if (status) + return status; + + /* Get current data */ + status = ice_aq_read_nvm(hw, ICE_AQC_NVM_MINSREV_MOD_ID, 0, sizeof(data), + &data, true, false, NULL); + if (status) + goto exit_release_res; + + if (minsrevs->nvm_valid) { + data.nvm_minsrev_l = cpu_to_le16(minsrevs->nvm & 0xFFFF); + data.nvm_minsrev_h = cpu_to_le16(minsrevs->nvm >> 16); + data.validity |= cpu_to_le16(ICE_AQC_NVM_MINSREV_NVM_VALID); + } + + if (minsrevs->orom_valid) { + data.orom_minsrev_l = cpu_to_le16(minsrevs->orom & 0xFFFF); + data.orom_minsrev_h = cpu_to_le16(minsrevs->orom >> 16); + data.validity |= cpu_to_le16(ICE_AQC_NVM_MINSREV_OROM_VALID); + } + + /* Update flash data */ + status = ice_aq_update_nvm(hw, ICE_AQC_NVM_MINSREV_MOD_ID, 0, sizeof(data), &data, + true, ICE_AQC_NVM_SPECIAL_UPDATE, NULL); + if (status) + goto exit_release_res; + + /* Dump the Shadow RAM to the flash */ + status = ice_nvm_write_activate(hw, 0); + +exit_release_res: + ice_release_nvm(hw); + + return status; +} + +/** + * ice_nvm_access_get_features - Return the NVM access features structure + * @cmd: NVM access command to process + * @data: storage for the driver NVM features + * + * Fill in the data section of the NVM access request with a copy of the NVM + * features structure. 
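+ *
+ * Illustration: a caller passing a data_size larger than
+ * sizeof(struct ice_nvm_features) still succeeds; the whole buffer
+ * is zeroed before the fields are filled in, so the extra bytes read
+ * back as zero and newer tools can probe older drivers safely.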
+ */ +static enum ice_status +ice_nvm_access_get_features(struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data) +{ + /* The provided data_size must be at least as large as our NVM + * features structure. A larger size should not be treated as an + * error, to allow future extensions to the features structure to + * work on older drivers. + */ + if (cmd->data_size < sizeof(struct ice_nvm_features)) + return ICE_ERR_NO_MEMORY; + + /* Initialize the data buffer to zeros */ + memset(data, 0, cmd->data_size); + + /* Fill in the features data */ + data->drv_features.major = ICE_NVM_ACCESS_MAJOR_VER; + data->drv_features.minor = ICE_NVM_ACCESS_MINOR_VER; + data->drv_features.size = sizeof(struct ice_nvm_features); + data->drv_features.features[0] = ICE_NVM_FEATURES_0_REG_ACCESS; + + return 0; +} + +/** + * ice_nvm_access_get_module - Helper function to read module value + * @cmd: NVM access command structure + * + * Reads the module value out of the NVM access config field. + */ +static u32 ice_nvm_access_get_module(struct ice_nvm_access_cmd *cmd) +{ + return ((cmd->config & ICE_NVM_CFG_MODULE_M) >> ICE_NVM_CFG_MODULE_S); +} + +/** + * ice_nvm_access_get_flags - Helper function to read flags value + * @cmd: NVM access command structure + * + * Reads the flags value out of the NVM access config field. + */ +static u32 ice_nvm_access_get_flags(struct ice_nvm_access_cmd *cmd) +{ + return ((cmd->config & ICE_NVM_CFG_FLAGS_M) >> ICE_NVM_CFG_FLAGS_S); +} + +/** + * ice_nvm_access_get_adapter - Helper function to read adapter info + * @cmd: NVM access command structure + * + * Read the adapter info value out of the NVM access config field. + */ +static u32 ice_nvm_access_get_adapter(struct ice_nvm_access_cmd *cmd) +{ + return ((cmd->config & ICE_NVM_CFG_ADAPTER_INFO_M) >> + ICE_NVM_CFG_ADAPTER_INFO_S); +} + +/** + * ice_validate_nvm_rw_reg - Check that an NVM access request is valid + * @cmd: NVM access command structure + * + * Validates that an NVM access structure is a request to read or write a valid + * register offset. First validates that the module and flags are correct, and + * then ensures that the register offset is one of the accepted registers. + */ +static enum ice_status +ice_validate_nvm_rw_reg(struct ice_nvm_access_cmd *cmd) +{ + u32 module, flags, offset; + u16 i; + + module = ice_nvm_access_get_module(cmd); + flags = ice_nvm_access_get_flags(cmd); + offset = cmd->offset; + + /* Make sure the module and flags indicate a read/write request */ + if (module != ICE_NVM_REG_RW_MODULE || + flags != ICE_NVM_REG_RW_FLAGS || + cmd->data_size != sizeof_field(union ice_nvm_access_data, regval)) + return ICE_ERR_PARAM; + + switch (offset) { + case GL_HICR: + case GL_HICR_EN: /* Note, this register is read only */ + case GL_FWSTS: + case GL_MNG_FWSM: + case GLGEN_CSR_DEBUG_C: + case GLGEN_RSTAT: + case GLPCI_LBARCTRL: + case GLNVM_GENS: + case GLNVM_FLA: + case PF_FUNC_RID: + return 0; + default: + break; + } + + for (i = 0; i <= ICE_NVM_ACCESS_GL_HIDA_MAX; i++) + if (offset == (u32)GL_HIDA(i)) + return 0; + + for (i = 0; i <= ICE_NVM_ACCESS_GL_HIBA_MAX; i++) + if (offset == (u32)GL_HIBA(i)) + return 0; + + /* All other register offsets are not valid */ + return ICE_ERR_OUT_OF_RANGE; +} + +/** + * ice_nvm_access_read - Handle an NVM read request + * @hw: pointer to the HW struct + * @cmd: NVM access command to process + * @data: storage for the register value read + * + * Process an NVM access request to read a register. 
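+ *
+ * For example (illustrative): a request whose config decodes to
+ * ICE_NVM_REG_RW_MODULE and ICE_NVM_REG_RW_FLAGS, with offset set
+ * to GLNVM_FLA and data_size equal to sizeof(data->regval), passes
+ * validation and returns the raw register contents in data->regval.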
+ */ +static enum ice_status +ice_nvm_access_read(struct ice_hw *hw, struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data) +{ + enum ice_status status; + + /* Always initialize the output data, even on failure */ + memset(data, 0, cmd->data_size); + + /* Make sure this is a valid read/write access request */ + status = ice_validate_nvm_rw_reg(cmd); + if (status) + return status; + + ice_debug(hw, ICE_DBG_NVM, "NVM access: reading register %08x\n", + cmd->offset); + + /* Read the register and store the contents in the data field */ + data->regval = rd32(hw, cmd->offset); + + return 0; +} + +/** + * ice_nvm_access_write - Handle an NVM write request + * @hw: pointer to the HW struct + * @cmd: NVM access command to process + * @data: NVM access data to write + * + * Process an NVM access request to write a register. + */ +static enum ice_status +ice_nvm_access_write(struct ice_hw *hw, struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data) +{ + enum ice_status status; + + /* Make sure this is a valid read/write access request */ + status = ice_validate_nvm_rw_reg(cmd); + if (status) + return status; + + /* Reject requests to write to read-only registers */ + switch (cmd->offset) { + case GL_HICR_EN: + case GLGEN_RSTAT: + return ICE_ERR_OUT_OF_RANGE; + default: + break; + } + + ice_debug(hw, ICE_DBG_NVM, "NVM access: writing register %08x with value %08x\n", + cmd->offset, data->regval); + + /* Write the data field to the specified register */ + wr32(hw, cmd->offset, data->regval); + + return 0; +} + +/** + * ice_handle_nvm_access - Handle an NVM access request + * @hw: pointer to the HW struct + * @cmd: NVM access command info + * @data: pointer to read or return data + * + * Process an NVM access request. Read the command structure information and + * determine if it is valid. If not, report an error indicating the command + * was invalid. + * + * For valid commands, perform the necessary function, copying the data into + * the provided data buffer. + */ +enum ice_status +ice_handle_nvm_access(struct ice_hw *hw, struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data) +{ + u32 module, flags, adapter_info; + + /* Extended flags are currently reserved and must be zero */ + if ((cmd->config & ICE_NVM_CFG_EXT_FLAGS_M) != 0) + return ICE_ERR_PARAM; + + /* Adapter info must match the HW device ID */ + adapter_info = ice_nvm_access_get_adapter(cmd); + if (adapter_info != hw->device_id) + return ICE_ERR_PARAM; + + switch (cmd->command) { + case ICE_NVM_CMD_READ: + module = ice_nvm_access_get_module(cmd); + flags = ice_nvm_access_get_flags(cmd); + + /* Getting the driver's NVM features structure shares the same + * command type as reading a register. Read the config field + * to determine if this is a request to get features. + */ + if (module == ICE_NVM_GET_FEATURES_MODULE && + flags == ICE_NVM_GET_FEATURES_FLAGS && + cmd->offset == 0) + return ice_nvm_access_get_features(cmd, data); + else + return ice_nvm_access_read(hw, cmd, data); + case ICE_NVM_CMD_WRITE: + return ice_nvm_access_write(hw, cmd, data); + default: + return ICE_ERR_PARAM; + } +} + +/** + * ice_aq_nvm_update_empr + * @hw: pointer to the HW struct + * + * Update empr (0x0709). This command allows SW to + * request an EMPR to activate new FW. 
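+ * + * In the overall update flow sketched by the comments in this file, this is + * typically the final step: ice_nvm_set_pkg_data() opens the flow, + * ice_nvm_pass_component_tbl() is sent once per component, the module + * contents are written with ice_aq_update_nvm(), and an EMPR then + * activates the new firmware image.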
+ */ +enum ice_status ice_aq_nvm_update_empr(struct ice_hw *hw) +{ + struct ice_aq_desc desc; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_update_empr); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); +} + +/** + * ice_nvm_set_pkg_data - Set package data + * @hw: pointer to the HW struct + * @del_pkg_data_flag: if set, the current pkg_data stored by FW + * is deleted. + * When this bit is set to 1, the buffer should have size 0. + * @data: pointer to buffer + * @length: length of the buffer + * @cd: pointer to command details structure or NULL + * + * Set package data (0x070A). This command is equivalent to the reception + * of a PLDM FW Update GetPackageData cmd. This command should be sent + * as part of the NVM update as the first cmd in the flow. + */ +enum ice_status +ice_nvm_set_pkg_data(struct ice_hw *hw, bool del_pkg_data_flag, u8 *data, + u16 length, struct ice_sq_cd *cd) +{ + struct ice_aqc_nvm_pkg_data *cmd; + struct ice_aq_desc desc; + + if (length != 0 && !data) + return ICE_ERR_PARAM; + + cmd = &desc.params.pkg_data; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_pkg_data); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + if (del_pkg_data_flag) + cmd->cmd_flags |= ICE_AQC_NVM_PKG_DELETE; + + return ice_aq_send_cmd(hw, &desc, data, length, cd); +} + +/** + * ice_nvm_pass_component_tbl - Pass component table + * @hw: pointer to the HW struct + * @data: pointer to buffer + * @length: length of the buffer + * @transfer_flag: parameter for determining stage of the update + * @comp_response: a pointer to the response from the 0x070B AQC. + * @comp_response_code: a pointer to the response code from the 0x070B AQC. + * @cd: pointer to command details structure or NULL + * + * Pass component table (0x070B). This command is equivalent to the reception + * of a PLDM FW Update PassComponentTable cmd. This command should be sent once + * per component. It can only be sent after the Set Package Data cmd and before + * the actual update. FW will assume these commands are going to be sent until + * the TransferFlag is set to End or StartAndEnd. + */ +enum ice_status +ice_nvm_pass_component_tbl(struct ice_hw *hw, u8 *data, u16 length, + u8 transfer_flag, u8 *comp_response, + u8 *comp_response_code, struct ice_sq_cd *cd) +{ + struct ice_aqc_nvm_pass_comp_tbl *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (!data || !comp_response || !comp_response_code) + return ICE_ERR_PARAM; + + cmd = &desc.params.pass_comp_tbl; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_nvm_pass_component_tbl); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->transfer_flag = transfer_flag; + status = ice_aq_send_cmd(hw, &desc, data, length, cd); + + if (!status) { + *comp_response = cmd->component_response; + *comp_response_code = cmd->component_response_code; + } + return status; +} + diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.h b/drivers/net/ethernet/intel/ice/ice_nvm.h new file mode 100644 index 0000000000000000000000000000000000000000..c37481f62786d0fad89144228a87074b714651fc --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_nvm.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation.
*/ + +#ifndef _ICE_NVM_H_ +#define _ICE_NVM_H_ + +#define ICE_NVM_CMD_READ 0x0000000B +#define ICE_NVM_CMD_WRITE 0x0000000C + +/* NVM Access config bits */ +#define ICE_NVM_CFG_MODULE_M ICE_M(0xFF, 0) +#define ICE_NVM_CFG_MODULE_S 0 +#define ICE_NVM_CFG_FLAGS_M ICE_M(0xF, 8) +#define ICE_NVM_CFG_FLAGS_S 8 +#define ICE_NVM_CFG_EXT_FLAGS_M ICE_M(0xF, 12) +#define ICE_NVM_CFG_EXT_FLAGS_S 12 +#define ICE_NVM_CFG_ADAPTER_INFO_M ICE_M(0xFFFF, 16) +#define ICE_NVM_CFG_ADAPTER_INFO_S 16 + +/* NVM Read Get Driver Features */ +#define ICE_NVM_GET_FEATURES_MODULE 0xE +#define ICE_NVM_GET_FEATURES_FLAGS 0xF + +/* NVM Read/Write Mapped Space */ +#define ICE_NVM_REG_RW_MODULE 0x0 +#define ICE_NVM_REG_RW_FLAGS 0x1 + +struct ice_orom_civd_info { + u8 signature[4]; /* Must match ASCII '$CIV' characters */ + u8 checksum; /* Simple modulo 256 sum of all structure bytes must equal 0 */ + __le32 combo_ver; /* Combo Image Version number */ + u8 combo_name_len; /* Length of the unicode combo image version string, max of 32 */ + __le16 combo_name[32]; /* Unicode string representing the Combo Image version */ +} __packed; + +#define ICE_NVM_ACCESS_MAJOR_VER 0 +#define ICE_NVM_ACCESS_MINOR_VER 5 + +/* NVM Access feature flags. Other bits in the features field are reserved and + * should be set to zero when reporting the ice_nvm_features structure. + */ +#define ICE_NVM_FEATURES_0_REG_ACCESS BIT(1) + +/* NVM Access Features */ +struct ice_nvm_features { + u8 major; /* Major version (informational only) */ + u8 minor; /* Minor version (informational only) */ + u16 size; /* size of ice_nvm_features structure */ + u8 features[12]; /* Array of feature bits */ +}; + +/* NVM Access command */ +struct ice_nvm_access_cmd { + u32 command; /* NVM command: READ or WRITE */ + u32 config; /* NVM command configuration */ + u32 offset; /* offset to read/write, in bytes */ + u32 data_size; /* size of data field, in bytes */ +}; + +/* NVM Access data */ +union ice_nvm_access_data { + u32 regval; /* Storage for register value */ + struct ice_nvm_features drv_features; /* NVM features */ +}; + +/* NVM Access registers */ +#define GL_HIDA(_i) (0x00082000 + ((_i) * 4)) +#define GL_HIBA(_i) (0x00081000 + ((_i) * 4)) +#define GL_HICR 0x00082040 +#define GL_HICR_EN 0x00082044 +#define GLGEN_CSR_DEBUG_C 0x00075750 +#define GLPCI_LBARCTRL 0x0009DE74 +#define GLNVM_GENS 0x000B6100 +#define GLNVM_FLA 0x000B6108 + +#define ICE_NVM_ACCESS_GL_HIDA_MAX 15 +#define ICE_NVM_ACCESS_GL_HIBA_MAX 1023 + +enum ice_status +ice_handle_nvm_access(struct ice_hw *hw, struct ice_nvm_access_cmd *cmd, + union ice_nvm_access_data *data); +enum ice_status +ice_acquire_nvm(struct ice_hw *hw, enum ice_aq_res_access_type access); +void ice_release_nvm(struct ice_hw *hw); +enum ice_status +ice_read_flat_nvm(struct ice_hw *hw, u32 offset, u32 *length, u8 *data, + bool read_shadow_ram); +enum ice_status +ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, + u16 module_type); +enum ice_status +ice_get_nvm_minsrevs(struct ice_hw *hw, struct ice_minsrev_info *minsrevs); +enum ice_status +ice_update_nvm_minsrevs(struct ice_hw *hw, struct ice_minsrev_info *minsrevs); +enum ice_status +ice_get_inactive_orom_ver(struct ice_hw *hw, struct ice_orom_info *orom); +enum ice_status +ice_get_inactive_nvm_ver(struct ice_hw *hw, struct ice_nvm_info *nvm); +enum ice_status +ice_get_inactive_netlist_ver(struct ice_hw *hw, struct ice_netlist_info *netlist); +enum ice_status +ice_read_pba_string(struct ice_hw *hw, u8 *pba_num, u32 pba_num_size); +enum ice_status 
ice_init_nvm(struct ice_hw *hw); +enum ice_status ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data); +enum ice_status +ice_aq_erase_nvm(struct ice_hw *hw, u16 module_typeid, struct ice_sq_cd *cd); +enum ice_status +ice_aq_update_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, + u16 length, void *data, bool last_command, u8 command_flags, + struct ice_sq_cd *cd); +enum ice_status ice_nvm_validate_checksum(struct ice_hw *hw); +enum ice_status ice_nvm_recalculate_checksum(struct ice_hw *hw); +enum ice_status ice_nvm_write_activate(struct ice_hw *hw, u8 cmd_flags); +enum ice_status ice_aq_nvm_update_empr(struct ice_hw *hw); +enum ice_status +ice_nvm_set_pkg_data(struct ice_hw *hw, bool del_pkg_data_flag, u8 *data, + u16 length, struct ice_sq_cd *cd); +enum ice_status +ice_nvm_pass_component_tbl(struct ice_hw *hw, u8 *data, u16 length, + u8 transfer_flag, u8 *comp_response, + u8 *comp_response_code, struct ice_sq_cd *cd); +#endif /* _ICE_NVM_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_osdep.h b/drivers/net/ethernet/intel/ice/ice_osdep.h index f57c414bc0a9b75abb7989a41978057e6c20b9fa..3dacbda6597b958711b7f7dc758f630e091e64c1 100644 --- a/drivers/net/ethernet/intel/ice/ice_osdep.h +++ b/drivers/net/ethernet/intel/ice/ice_osdep.h @@ -1,14 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_OSDEP_H_ #define _ICE_OSDEP_H_ #include #include -#ifndef CONFIG_64BIT -#include -#endif +#include +#include +#include "kcompat.h" #define wr32(a, reg, value) writel((value), ((a)->hw_addr + (reg))) #define rd32(a, reg) readl((a)->hw_addr + (reg)) @@ -16,6 +16,7 @@ #define rd64(a, reg) readq((a)->hw_addr + (reg)) #define ice_flush(a) rd32((a), GLGEN_STAT) + #define ICE_M(m, s) ((m) << (s)) struct ice_dma_mem { @@ -27,10 +28,18 @@ struct ice_dma_mem { #define ice_hw_to_dev(ptr) \ (&(container_of((ptr), struct ice_pf, hw))->pdev->dev) +#define ice_info_fwlog(hw, rowsize, groupsize, buf, len) \ + print_hex_dump(KERN_INFO, " FWLOG: ", \ + DUMP_PREFIX_NONE, \ + rowsize, groupsize, buf, \ + len, false) + + #ifdef CONFIG_DYNAMIC_DEBUG #define ice_debug(hw, type, fmt, args...) \ dev_dbg(ice_hw_to_dev(hw), fmt, ##args) + #define ice_debug_array(hw, type, rowsize, groupsize, buf, len) \ print_hex_dump_debug(KBUILD_MODNAME " ", \ DUMP_PREFIX_OFFSET, rowsize, \ @@ -51,6 +60,7 @@ do { \ rowsize, groupsize, buf, \ len, false); \ } while (0) + #else #define ice_debug_array(hw, type, rowsize, groupsize, buf, len) \ do { \ @@ -67,6 +77,7 @@ do { \ i, ((len_l) - i), ((buf_l) + i));\ } \ } while (0) + #endif /* DEBUG */ #endif /* CONFIG_DYNAMIC_DEBUG */ diff --git a/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c new file mode 100644 index 0000000000000000000000000000000000000000..4c9581da9cccca2ad1407144fc1076e4c50dfcf0 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_vsi_vlan_ops.h" +#include "ice_vsi_vlan_lib.h" +#include "ice_vlan_mode.h" +#include "ice.h" +#include "ice_pf_vsi_vlan_ops.h" + +void ice_pf_vsi_init_vlan_ops(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops; + + if (ice_is_dvm_ena(&vsi->back->hw)) { + vlan_ops = &vsi->outer_vlan_ops; + + vlan_ops->add_vlan = ice_vsi_add_vlan; + vlan_ops->del_vlan = ice_vsi_del_vlan; + vlan_ops->ena_stripping = ice_vsi_ena_outer_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_outer_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion; + vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering; + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + } else { + vlan_ops = &vsi->inner_vlan_ops; + + vlan_ops->add_vlan = ice_vsi_add_vlan; + vlan_ops->del_vlan = ice_vsi_del_vlan; + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; + vlan_ops->ena_rx_filtering = ice_vsi_ena_rx_vlan_filtering; + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + } +} + diff --git a/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..14f8b7ecac7e82b2e081691e5f103648743bc774 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_pf_vsi_vlan_ops.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_PF_VSI_VLAN_OPS_H_ +#define _ICE_PF_VSI_VLAN_OPS_H_ + +#include "ice_vsi_vlan_ops.h" + +struct ice_vsi; + +void ice_pf_vsi_init_vlan_ops(struct ice_vsi *vsi); + +#endif /* _ICE_PF_VSI_VLAN_OPS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_protocol_type.h b/drivers/net/ethernet/intel/ice/ice_protocol_type.h new file mode 100644 index 0000000000000000000000000000000000000000..8b5861138096e5d8a4854e1b072386c9ec436883 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_protocol_type.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_PROTOCOL_TYPE_H_ +#define _ICE_PROTOCOL_TYPE_H_ +#include "ice_flex_type.h" +#define ICE_IPV6_ADDR_LENGTH 16 + +/* Each recipe can match up to 5 different fields. Fields to match can be meta- + * data, values extracted from packet headers, or results from other recipes. + * One of the 5 fields is reserved for matching the switch ID. So, up to 4 + * recipes can provide intermediate results to another one through chaining, + * e.g. recipes 0, 1, 2, and 3 can provide intermediate results to recipe 4. + */ +#define ICE_NUM_WORDS_RECIPE 4 + +/* Max recipes that can be chained */ +#define ICE_MAX_CHAIN_RECIPE 5 + +/* 1 word reserved for switch ID from allowed 5 words. + * So a recipe can have max 4 words. And you can chain 5 such recipes + * together. So maximum words that can be programmed for look up is 5 * 4. 
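+ * With ICE_NUM_WORDS_RECIPE of 4 and ICE_MAX_CHAIN_RECIPE of 5, the + * ICE_MAX_CHAIN_WORDS value below works out to 20 words.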
+ */ +#define ICE_MAX_CHAIN_WORDS (ICE_NUM_WORDS_RECIPE * ICE_MAX_CHAIN_RECIPE) + +/* Field vector index corresponding to chaining */ +#define ICE_CHAIN_FV_INDEX_START 47 + +enum ice_protocol_type { + ICE_MAC_OFOS = 0, + ICE_MAC_IL, + ICE_ETYPE_OL, + ICE_VLAN_OFOS, + ICE_IPV4_OFOS, + ICE_IPV4_IL, + ICE_IPV6_OFOS, + ICE_IPV6_IL, + ICE_TCP_IL, + ICE_UDP_OF, + ICE_UDP_ILOS, + ICE_SCTP_IL, + ICE_VXLAN, + ICE_GENEVE, + ICE_VXLAN_GPE, + ICE_NVGRE, + ICE_GTP, + ICE_PPPOE, + ICE_PFCP, + ICE_L2TPV3, + ICE_ESP, + ICE_AH, + ICE_NAT_T, + ICE_GTP_NO_PAY, + ICE_VLAN_EX, + ICE_VLAN_IN, + ICE_PROTOCOL_LAST +}; + +enum ice_sw_tunnel_type { + ICE_NON_TUN = 0, + ICE_SW_TUN_AND_NON_TUN, + ICE_SW_TUN_VXLAN_GPE, + ICE_SW_TUN_GENEVE, /* GENEVE matches only non-VLAN pkts */ + ICE_SW_TUN_GENEVE_VLAN, /* GENEVE matches both VLAN and non-VLAN pkts */ + ICE_SW_TUN_VXLAN, /* VXLAN matches only non-VLAN pkts */ + ICE_SW_TUN_VXLAN_VLAN, /* VXLAN matches both VLAN and non-VLAN pkts */ + ICE_SW_TUN_NVGRE, + ICE_SW_TUN_UDP, /* This means all "UDP" tunnel types: VXLAN-GPE, VXLAN + * and GENEVE + */ + ICE_SW_TUN_IPV4_GTP_IPV4_TCP, + ICE_SW_TUN_IPV4_GTP_IPV4_UDP, + ICE_SW_TUN_IPV4_GTP_IPV6_TCP, + ICE_SW_TUN_IPV4_GTP_IPV6_UDP, + ICE_SW_TUN_IPV6_GTP_IPV4_TCP, + ICE_SW_TUN_IPV6_GTP_IPV4_UDP, + ICE_SW_TUN_IPV6_GTP_IPV6_TCP, + ICE_SW_TUN_IPV6_GTP_IPV6_UDP, + + /* following adds support for GTP, just using inner protocols, + * outer L3 and L4 protocols can be anything + */ + ICE_SW_TUN_GTP_IPV4_TCP, + ICE_SW_TUN_GTP_IPV4_UDP, + ICE_SW_TUN_GTP_IPV6_TCP, + ICE_SW_TUN_GTP_IPV6_UDP, + ICE_SW_TUN_IPV4_GTPU_IPV4, + ICE_SW_TUN_IPV4_GTPU_IPV6, + ICE_SW_TUN_IPV6_GTPU_IPV4, + ICE_SW_TUN_IPV6_GTPU_IPV6, + ICE_SW_TUN_GTP_IPV4, + ICE_SW_TUN_GTP_IPV6, + ICE_ALL_TUNNELS /* All tunnel types including NVGRE */ +}; + +/* Decoders for ice_prot_id: + * - F: First + * - I: Inner + * - L: Last + * - O: Outer + * - S: Single + */ +enum ice_prot_id { + ICE_PROT_ID_INVAL = 0, + ICE_PROT_MAC_OF_OR_S = 1, + ICE_PROT_MAC_O2 = 2, + ICE_PROT_MAC_IL = 4, + ICE_PROT_MAC_IN_MAC = 7, + ICE_PROT_ETYPE_OL = 9, + ICE_PROT_ETYPE_IL = 10, + ICE_PROT_PAY = 15, + ICE_PROT_EVLAN_O = 16, + ICE_PROT_VLAN_O = 17, + ICE_PROT_VLAN_IF = 18, + ICE_PROT_MPLS_OL_MINUS_1 = 27, + ICE_PROT_MPLS_OL_OR_OS = 28, + ICE_PROT_MPLS_IL = 29, + ICE_PROT_IPV4_OF_OR_S = 32, + ICE_PROT_IPV4_IL = 33, + ICE_PROT_IPV6_OF_OR_S = 40, + ICE_PROT_IPV6_IL = 41, + ICE_PROT_IPV6_FRAG = 47, + ICE_PROT_TCP_IL = 49, + ICE_PROT_UDP_OF = 52, + ICE_PROT_UDP_IL_OR_S = 53, + ICE_PROT_GRE_OF = 64, + ICE_PROT_NSH_F = 84, + ICE_PROT_ESP_F = 88, + ICE_PROT_ESP_2 = 89, + ICE_PROT_SCTP_IL = 96, + ICE_PROT_ICMP_IL = 98, + ICE_PROT_ICMPV6_IL = 100, + ICE_PROT_VRRP_F = 101, + ICE_PROT_OSPF = 102, + ICE_PROT_PPPOE = 103, + ICE_PROT_L2TPV3 = 104, + ICE_PROT_ECPRI = 105, + ICE_PROT_PPP = 106, + ICE_PROT_ATAOE_OF = 114, + ICE_PROT_CTRL_OF = 116, + ICE_PROT_LLDP_OF = 117, + ICE_PROT_ARP_OF = 118, + ICE_PROT_EAPOL_OF = 120, + ICE_PROT_META_ID = 255, /* when offset == metaddata */ + ICE_PROT_INVALID = 255 /* when offset == ICE_FV_OFFSET_INVAL */ +}; + +#define ICE_VNI_OFFSET 12 /* offset of VNI from ICE_PROT_UDP_OF */ + + +#define ICE_MAC_OFOS_HW 1 +#define ICE_MAC_IL_HW 4 +#define ICE_ETYPE_OL_HW 9 +#define ICE_VLAN_OF_HW 16 +#define ICE_VLAN_OL_HW 17 +#define ICE_IPV4_OFOS_HW 32 +#define ICE_IPV4_IL_HW 33 +#define ICE_IPV6_OFOS_HW 40 +#define ICE_IPV6_IL_HW 41 +#define ICE_TCP_IL_HW 49 +#define ICE_UDP_ILOS_HW 53 +#define ICE_ESP_HW 88 +#define ICE_AH_HW 89 +#define ICE_SCTP_IL_HW 96 +#define ICE_PPPOE_HW 103 +#define 
ICE_L2TPV3_HW 104 + +/* ICE_UDP_OF is used to identify all 3 tunnel types + * VXLAN, GENEVE and VXLAN_GPE. To differentiate further + * need to use flags from the field vector + */ +#define ICE_UDP_OF_HW 52 /* UDP Tunnels */ +#define ICE_GRE_OF_HW 64 /* NVGRE */ +#define ICE_META_DATA_ID_HW 255 /* this is used for tunnel type */ + +#define ICE_MDID_SIZE 2 +#define ICE_TUN_FLAG_MDID 21 +#define ICE_TUN_FLAG_MDID_OFF (ICE_MDID_SIZE * ICE_TUN_FLAG_MDID) +#define ICE_TUN_FLAG_MASK 0xFF +#define ICE_TUN_FLAG_VLAN_MASK 0x01 +#define ICE_TUN_FLAG_FV_IND 2 + +#define ICE_PROTOCOL_MAX_ENTRIES 16 + +/* Mapping of software defined protocol ID to hardware defined protocol ID */ +struct ice_protocol_entry { + enum ice_protocol_type type; + u8 protocol_id; +}; + + +struct ice_ether_hdr { + u8 dst_addr[ETH_ALEN]; + u8 src_addr[ETH_ALEN]; +}; + +struct ice_ethtype_hdr { + __be16 ethtype_id; +}; + +struct ice_ether_vlan_hdr { + u8 dst_addr[ETH_ALEN]; + u8 src_addr[ETH_ALEN]; + __be32 vlan_id; +}; + +struct ice_vlan_hdr { + __be16 vlan; + __be16 type; +}; + +struct ice_ipv4_hdr { + u8 version; + u8 tos; + __be16 total_length; + __be16 id; + __be16 frag_off; + u8 time_to_live; + u8 protocol; + __be16 check; + __be32 src_addr; + __be32 dst_addr; +}; + +struct ice_le_ver_tc_flow { + union { + struct { + u32 flow_label : 20; + u32 tc : 8; + u32 version : 4; + } fld; + u32 val; + } u; +}; + +struct ice_ipv6_hdr { + __be32 be_ver_tc_flow; + __be16 payload_len; + u8 next_hdr; + u8 hop_limit; + u8 src_addr[ICE_IPV6_ADDR_LENGTH]; + u8 dst_addr[ICE_IPV6_ADDR_LENGTH]; +}; + +struct ice_sctp_hdr { + __be16 src_port; + __be16 dst_port; + __be32 verification_tag; + __be32 check; +}; + +struct ice_l4_hdr { + __be16 src_port; + __be16 dst_port; + __be16 len; + __be16 check; +}; + +struct ice_udp_tnl_hdr { + __be16 field; + __be16 proto_type; + __be32 vni; /* only use lower 24-bits */ +}; + +struct ice_udp_gtp_hdr { + u8 flags; + u8 msg_type; + __be16 rsrvd_len; + __be32 teid; + __be16 rsrvd_seq_nbr; + u8 rsrvd_n_pdu_nbr; + u8 rsrvd_next_ext; + u8 rsvrd_ext_len; + u8 pdu_type; + u8 qfi; + u8 rsvrd; +}; + +struct ice_pppoe_hdr { + u8 rsrvd_ver_type; + u8 rsrvd_code; + __be16 session_id; + __be16 length; + __be16 ppp_prot_id; /* control and data only */ +}; + +struct ice_pfcp_hdr { + u8 flags; + u8 msg_type; + __be16 length; + __be64 seid; + __be32 seq; + u8 spare; +}; + +struct ice_l2tpv3_sess_hdr { + __be32 session_id; + __be64 cookie; +}; + +struct ice_esp_hdr { + __be32 spi; + __be32 seq; +}; + +struct ice_ah_hdr { + u8 next_hdr; + u8 paylen; + __be16 rsrvd; + __be32 spi; + __be32 seq; +}; + +struct ice_nat_t_hdr { + struct ice_esp_hdr esp; +}; + + +struct ice_nvgre { + __be16 flags; + __be16 protocol; + __be32 tni_flow; +}; + +union ice_prot_hdr { + struct ice_ether_hdr eth_hdr; + struct ice_ethtype_hdr ethertype; + struct ice_vlan_hdr vlan_hdr; + struct ice_ipv4_hdr ipv4_hdr; + struct ice_ipv6_hdr ipv6_hdr; + struct ice_l4_hdr l4_hdr; + struct ice_sctp_hdr sctp_hdr; + struct ice_udp_tnl_hdr tnl_hdr; + struct ice_nvgre nvgre_hdr; + struct ice_udp_gtp_hdr gtp_hdr; + struct ice_pppoe_hdr pppoe_hdr; + struct ice_pfcp_hdr pfcp_hdr; + struct ice_l2tpv3_sess_hdr l2tpv3_sess_hdr; + struct ice_esp_hdr esp_hdr; + struct ice_ah_hdr ah_hdr; + struct ice_nat_t_hdr nat_t_hdr; +}; + +/* This is mapping table entry that maps every word within a given protocol + * structure to the real byte offset as per the specification of that + * protocol header. + * for e.g. 
dst address is 3 words in the ether header, with the corresponding bytes + * at 0, 2 and 4 in the actual packet header, and the src address at + * bytes 6, 8 and 10 + */ +struct ice_prot_ext_tbl_entry { + enum ice_protocol_type prot_type; + /* Byte offset into header of given protocol type */ + u8 offs[sizeof(union ice_prot_hdr)]; +}; + +/* Extractions to be looked up for a given recipe */ +struct ice_prot_lkup_ext { + u16 prot_type; + u8 n_val_words; + /* create a buffer to hold max words per recipe */ + u16 field_off[ICE_MAX_CHAIN_WORDS]; + u16 field_mask[ICE_MAX_CHAIN_WORDS]; + + struct ice_fv_word fv_words[ICE_MAX_CHAIN_WORDS]; + + /* Indicate field offsets that have field vector indices assigned */ + DECLARE_BITMAP(done, ICE_MAX_CHAIN_WORDS); +}; + +struct ice_pref_recipe_group { + u8 n_val_pairs; /* Number of valid pairs */ + struct ice_fv_word pairs[ICE_NUM_WORDS_RECIPE]; + u16 mask[ICE_NUM_WORDS_RECIPE]; +}; + +struct ice_recp_grp_entry { + struct list_head l_entry; + +#define ICE_INVAL_CHAIN_IND 0xFF + u16 rid; + u8 chain_idx; + u16 fv_idx[ICE_NUM_WORDS_RECIPE]; + u16 fv_mask[ICE_NUM_WORDS_RECIPE]; + struct ice_pref_recipe_group r_group; +}; +#endif /* _ICE_PROTOCOL_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c new file mode 100644 index 0000000000000000000000000000000000000000..7198f55cdb0df8b6efa91eb020b059df9dad06fc --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -0,0 +1,3648 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_lib.h" + +#define E810_OUT_PROP_DELAY_NS 1 + +#define LOCKED_INCVAL_E822 0x100000000ULL + +static const struct ptp_pin_desc ice_e810t_pin_desc[] = { + /* name idx func chan */ + { "GNSS", GNSS, PTP_PF_EXTTS, 0, { 0, } }, + { "SMA1", SMA1, PTP_PF_NONE, 1, { 0, } }, + { "U.FL1", UFL1, PTP_PF_NONE, 1, { 0, } }, + { "SMA2", SMA2, PTP_PF_NONE, 2, { 0, } }, + { "U.FL2", UFL2, PTP_PF_NONE, 2, { 0, } }, +}; + +/** + * ice_enable_e810t_sma_ctrl + * @hw: pointer to the hw struct + * @ena: set true to enable and false to disable + * + * Enables or disables the SMA control logic + */ +static int ice_enable_e810t_sma_ctrl(struct ice_hw *hw, bool ena) +{ + int err; + u8 data; + + /* Set expander bits as outputs */ + err = ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P1_CFG, &data); + if (err) + return err; + + if (ena) + data &= (~ICE_E810T_SMA_CTRL_MASK); + else + data |= ICE_E810T_SMA_CTRL_MASK; + + return ice_write_e810t_pca9575_reg(hw, ICE_PCA9575_P1_CFG, data); +} + +/** + * ice_get_e810t_sma_config + * @hw: pointer to the hw struct + * @ptp_pins: pointer to the ptp_pin_desc structure + * + * Read the configuration of the SMA control logic and put it into the + * ptp_pin_desc structure + */ +static int +ice_get_e810t_sma_config(struct ice_hw *hw, struct ptp_pin_desc *ptp_pins) +{ + enum ice_status status; + u8 data, i; + + /* Read initial pin state */ + status = ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P1_OUT, &data); + if (status) + return ice_status_to_errno(status); + + /* initialize with defaults */ + for (i = 0; i < NUM_E810T_PTP_PINS; i++) { + snprintf(ptp_pins[i].name, sizeof(ptp_pins[i].name), + "%s", ice_e810t_pin_desc[i].name); + ptp_pins[i].index = ice_e810t_pin_desc[i].index; + ptp_pins[i].func = ice_e810t_pin_desc[i].func; + ptp_pins[i].chan = ice_e810t_pin_desc[i].chan; + } + + /* Parse SMA1/UFL1 */ + switch (data & ICE_E810T_SMA1_CTRL_MASK) { + case ICE_E810T_SMA1_CTRL_MASK: + default: + ptp_pins[SMA1].func = PTP_PF_NONE; + ptp_pins[UFL1].func = PTP_PF_NONE; + break; + case ICE_E810T_P1_SMA1_DIR_EN: + ptp_pins[SMA1].func = PTP_PF_PEROUT; + ptp_pins[UFL1].func = PTP_PF_NONE; + break; + case ICE_E810T_P1_SMA1_TX_EN: + ptp_pins[SMA1].func = PTP_PF_EXTTS; + ptp_pins[UFL1].func = PTP_PF_NONE; + break; + case 0: + ptp_pins[SMA1].func = PTP_PF_EXTTS; + ptp_pins[UFL1].func = PTP_PF_PEROUT; + break; + } + + /* Parse SMA2/UFL2 */ + switch (data & ICE_E810T_SMA2_CTRL_MASK) { + case ICE_E810T_SMA2_CTRL_MASK: + default: + ptp_pins[SMA2].func = PTP_PF_NONE; + ptp_pins[UFL2].func = PTP_PF_NONE; + break; + case (ICE_E810T_P1_SMA2_TX_EN | ICE_E810T_P1_SMA2_UFL2_RX_DIS): + ptp_pins[SMA2].func = PTP_PF_EXTTS; + ptp_pins[UFL2].func = PTP_PF_NONE; + break; + case (ICE_E810T_P1_SMA2_DIR_EN | ICE_E810T_P1_SMA2_UFL2_RX_DIS): + ptp_pins[SMA2].func = PTP_PF_PEROUT; + ptp_pins[UFL2].func = PTP_PF_NONE; + break; + case (ICE_E810T_P1_SMA2_DIR_EN | ICE_E810T_P1_SMA2_TX_EN): + ptp_pins[SMA2].func = PTP_PF_NONE; + ptp_pins[UFL2].func = PTP_PF_EXTTS; + break; + case ICE_E810T_P1_SMA2_DIR_EN: + ptp_pins[SMA2].func = PTP_PF_PEROUT; + ptp_pins[UFL2].func = PTP_PF_EXTTS; + break; + } + + return 0; +} + +/** + * ice_ptp_set_e810t_sma_state + * @hw: pointer to the hw struct + * @ptp_pins: pointer to the ptp_pin_desc structure + * + * Set the configuration of the SMA control logic based on the configuration in + * the ptp_pins parameter + */ +static int +ice_ptp_set_e810t_sma_state(struct ice_hw *hw, + const struct ptp_pin_desc *ptp_pins) +{ + enum ice_status status; + u8 data; + + /* SMA1 and UFL1 cannot be set to TX at the same time */ + if (ptp_pins[SMA1].func == PTP_PF_PEROUT && + ptp_pins[UFL1].func == PTP_PF_PEROUT) + return ICE_ERR_PARAM; + + /* SMA2 and UFL2 cannot be set to RX at the same time */ + if (ptp_pins[SMA2].func == PTP_PF_EXTTS && + ptp_pins[UFL2].func == PTP_PF_EXTTS) + return ICE_ERR_PARAM; + + /* Read initial pin state value */ + status = ice_read_e810t_pca9575_reg(hw, ICE_PCA9575_P1_OUT, &data); + if (status) + return ice_status_to_errno(status); + + /* Set the right state based on the desired configuration */ + data &= ~ICE_E810T_SMA1_CTRL_MASK; + if (ptp_pins[SMA1].func == PTP_PF_NONE && + ptp_pins[UFL1].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA1 + U.FL1 disabled"); + data |= ICE_E810T_SMA1_CTRL_MASK; + } else if (ptp_pins[SMA1].func == PTP_PF_EXTTS && + ptp_pins[UFL1].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA1 RX"); + data |= ICE_E810T_P1_SMA1_TX_EN; + } else if (ptp_pins[SMA1].func == PTP_PF_NONE && + ptp_pins[UFL1].func == PTP_PF_PEROUT) { + /* U.FL 1 TX will always enable SMA 1 RX */ + dev_info(ice_hw_to_dev(hw), "SMA1 RX + U.FL1 TX"); + } else if (ptp_pins[SMA1].func == PTP_PF_EXTTS && + ptp_pins[UFL1].func == PTP_PF_PEROUT) { + dev_info(ice_hw_to_dev(hw), "SMA1 RX + U.FL1 TX"); + } else if (ptp_pins[SMA1].func == PTP_PF_PEROUT && + ptp_pins[UFL1].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA1 TX"); + data |= ICE_E810T_P1_SMA1_DIR_EN; + } + + data &= (~ICE_E810T_SMA2_CTRL_MASK); + if (ptp_pins[SMA2].func == PTP_PF_NONE && + ptp_pins[UFL2].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA2 + U.FL2 disabled"); + data |= ICE_E810T_SMA2_CTRL_MASK; + } else if (ptp_pins[SMA2].func == PTP_PF_EXTTS && + ptp_pins[UFL2].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA2 RX"); + data |= (ICE_E810T_P1_SMA2_TX_EN | + ICE_E810T_P1_SMA2_UFL2_RX_DIS); + } else if (ptp_pins[SMA2].func == PTP_PF_NONE && + ptp_pins[UFL2].func == PTP_PF_EXTTS) { + dev_info(ice_hw_to_dev(hw), "UFL2 RX"); + data |= (ICE_E810T_P1_SMA2_DIR_EN | ICE_E810T_P1_SMA2_TX_EN); + } else if (ptp_pins[SMA2].func == PTP_PF_PEROUT && + ptp_pins[UFL2].func == PTP_PF_NONE) { + dev_info(ice_hw_to_dev(hw), "SMA2 TX"); + data |= (ICE_E810T_P1_SMA2_DIR_EN | + ICE_E810T_P1_SMA2_UFL2_RX_DIS); + } else if (ptp_pins[SMA2].func == PTP_PF_PEROUT && + ptp_pins[UFL2].func == PTP_PF_EXTTS) { + dev_info(ice_hw_to_dev(hw), "SMA2 TX + U.FL2 RX"); + data |= ICE_E810T_P1_SMA2_DIR_EN; + } + + status = ice_write_e810t_pca9575_reg(hw, ICE_PCA9575_P1_OUT, data); + if (status) + return ice_status_to_errno(status); + + return 0; +} + +/** + * ice_ptp_set_e810t_sma + * @info: the driver's PTP info structure + * @pin: pin index in kernel structure + * @func: Pin function to be set (PTP_PF_NONE, PTP_PF_EXTTS or PTP_PF_PEROUT) + * + * Set the configuration of a single SMA pin + */ +static int +ice_ptp_set_e810t_sma(struct ptp_clock_info *info, unsigned int pin, + enum ptp_pin_function func) +{ + struct ptp_pin_desc ptp_pins[NUM_E810T_PTP_PINS]; + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_hw *hw = &pf->hw; + int err; + + if (pin < SMA1 || func > PTP_PF_PEROUT) + return -EOPNOTSUPP; + + err = ice_get_e810t_sma_config(hw, ptp_pins); + if (err) + return err; + + /* Disable the same function on the other pin sharing the channel */ + if (pin == SMA1 && ptp_pins[UFL1].func == func) + ptp_pins[UFL1].func = PTP_PF_NONE; + if (pin == UFL1 && ptp_pins[SMA1].func == func) + ptp_pins[SMA1].func = PTP_PF_NONE; + + if (pin == SMA2 && ptp_pins[UFL2].func == func) + ptp_pins[UFL2].func = PTP_PF_NONE; + if (pin == UFL2 && ptp_pins[SMA2].func == func) + ptp_pins[SMA2].func = PTP_PF_NONE; + + /* Set up new pin function in the temp table */ + ptp_pins[pin].func = func; + + return ice_ptp_set_e810t_sma_state(hw, ptp_pins); +} + +/** + * ice_e810t_verify_pin + * @info: the driver's PTP info structure + * @pin: Pin index + * @func: Assigned function + * @chan: Assigned channel + * + * Verify if the pin supports the requested pin function and check pin + * consistency. Reconfigure the SMA logic attached to the given pin to + * enable its desired functionality + */ +static int +ice_e810t_verify_pin(struct ptp_clock_info *info, unsigned int pin, + enum ptp_pin_function func, unsigned int chan) +{ + /* Don't allow channel reassignment */ + if (chan != ice_e810t_pin_desc[pin].chan) + return -EOPNOTSUPP; + + /* Check if functions are properly assigned */ + switch (func) { + case PTP_PF_NONE: + break; + case PTP_PF_EXTTS: + if (pin == UFL1) + return -EOPNOTSUPP; + break; + case PTP_PF_PEROUT: + if (pin == UFL2 || pin == GNSS) + return -EOPNOTSUPP; + break; + case PTP_PF_PHYSYNC: + return -EOPNOTSUPP; + } + + return ice_ptp_set_e810t_sma(info, pin, func); +} + +/** + * mul_u128_u64_fac - Multiply two 64bit factors into a 128b result + * @a: First factor to multiply + * @b: Second factor to multiply + * @hi: Pointer for higher part of 128b result + * @lo: Pointer for lower part of 128b result + * + * This function performs multiplication of two 64 bit factors with a 128b + * output.
+ */ +static inline void mul_u128_u64_fac(u64 a, u64 b, u64 *hi, u64 *lo) +{ + u64 mask = GENMASK_ULL(31, 0); + u64 a_lo = a & mask; + u64 b_lo = b & mask; + + a >>= 32; + b >>= 32; + + *hi = (a * b) + (((a * b_lo) + ((a_lo * b_lo) >> 32)) >> 32) + + (((a_lo * b) + (((a * b_lo) + ((a_lo * b_lo) >> 32)) & mask)) >> 32); + *lo = (((a_lo * b) + (((a * b_lo) + ((a_lo * b_lo) >> 32)) & mask)) << 32) + + ((a_lo * b_lo) & mask); +} + + +/** + * ice_set_tx_tstamp - Enable or disable Tx timestamping + * @pf: The PF pointer to search in + * @on: bool value for whether timestamps are enabled or disabled + */ +static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) +{ + struct ice_vsi *vsi; + u32 val; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + + vsi->ptp_tx = on; + + /* Enable/disable the TX timestamp interrupt */ + val = rd32(&pf->hw, PFINT_OICR_ENA); + if (on) + val |= PFINT_OICR_TSYN_TX_M; + else + val &= ~PFINT_OICR_TSYN_TX_M; + wr32(&pf->hw, PFINT_OICR_ENA, val); + + if (on) + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_ON; + else + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_OFF; +} + +/** + * ice_set_rx_tstamp - Enable or disable Rx timestamping + * @pf: The PF pointer to search in + * @on: bool value for whether timestamps are enabled or disabled + */ +static void ice_set_rx_tstamp(struct ice_pf *pf, bool on) +{ + struct ice_vsi *vsi; + u16 i; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + + ice_for_each_rxq(vsi, i) { + if (!vsi->rx_rings[i]) + continue; + vsi->rx_rings[i]->ptp_rx = on; + } + + if (on) + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_ALL; + else + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; +} + + +/** + * ice_ptp_cfg_timestamp - Configure timestamp for init/deinit + * @pf: Board private structure + * @ena: bool value to enable or disable time stamp + * + * This function will configure timestamping during PTP initialization + * and deinitialization + */ +static void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena) +{ + ice_set_tx_tstamp(pf, ena); + ice_set_rx_tstamp(pf, ena); + +} + +/** + * ice_get_ptp_clock_index - Get the PTP clock index + * @pf: the PF pointer + * + * Determine the clock index of the PTP clock associated with this device. If + * this is the PF controlling the clock, just use the local access to the + * clock device pointer. + * + * Otherwise, read from the driver shared parameters to determine the clock + * index value. + * + * Returns: the index of the PTP clock associated with this device, or -1 if + * there is no associated clock. + */ +int ice_get_ptp_clock_index(struct ice_pf *pf) +{ + enum ice_aqc_driver_params param_idx; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 tmr_idx; + u32 value; + + /* Use the ptp_clock structure if we're the main PF */ + if (pf->ptp.clock) + return ptp_clock_index(pf->ptp.clock); + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_assoc; + if (!tmr_idx) + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR0; + else + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR1; + + status = ice_aq_get_driver_param(hw, param_idx, &value, NULL); + if (status) { + dev_err(ice_pf_to_dev(pf), + "Failed to read PTP clock index parameter, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + return -1; + } + + /* The PTP clock index is an integer, and will be between 0 and + * INT_MAX. The highest bit of the driver shared parameter is used to + * indicate whether or not the currently stored clock index is valid. 
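+ * For example, assuming PTP_SHARED_CLK_IDX_VALID is the top bit as + * described, a stored value of 0x80000003 decodes to clock index 3, + * while 0x00000003 is reported as having no valid clock (-1).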
+ */ + if (!(value & PTP_SHARED_CLK_IDX_VALID)) + return -1; + + return value & ~PTP_SHARED_CLK_IDX_VALID; +} + +/** + * ice_set_ptp_clock_index - Set the PTP clock index + * @pf: the PF pointer + * + * Set the PTP clock index for this device into the shared driver parameters, + * so that other PFs associated with this device can read it. + * + * If the PF is unable to store the clock index, it will log an error, but + * will continue operating PTP. + */ +static void ice_set_ptp_clock_index(struct ice_pf *pf) +{ + enum ice_aqc_driver_params param_idx; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 tmr_idx; + u32 value; + + if (!pf->ptp.clock) + return; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_assoc; + if (!tmr_idx) + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR0; + else + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR1; + + value = (u32)ptp_clock_index(pf->ptp.clock); + if (value > INT_MAX) { + dev_err(ice_pf_to_dev(pf), "PTP Clock index is too large to store\n"); + return; + } + value |= PTP_SHARED_CLK_IDX_VALID; + + status = ice_aq_set_driver_param(hw, param_idx, value, NULL); + if (status) { + dev_err(ice_pf_to_dev(pf), + "Failed to set PTP clock index parameter, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } +} + +/** + * ice_clear_ptp_clock_index - Clear the PTP clock index + * @pf: the PF pointer + * + * Clear the PTP clock index for this device. Must be called when + * unregistering the PTP clock, in order to ensure other PFs stop reporting + * a clock object that no longer exists. + */ +static void ice_clear_ptp_clock_index(struct ice_pf *pf) +{ + enum ice_aqc_driver_params param_idx; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u8 tmr_idx; + + /* Do not clear the index if we don't own the timer */ + if (!hw->func_caps.ts_func_info.src_tmr_owned) + return; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_assoc; + if (!tmr_idx) + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR0; + else + param_idx = ICE_AQC_DRIVER_PARAM_CLK_IDX_TMR1; + + status = ice_aq_set_driver_param(hw, param_idx, 0, NULL); + if (status) { + dev_dbg(ice_pf_to_dev(pf), + "Failed to clear PTP clock index parameter, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + } +} + +/** + * ice_ptp_read_src_clk_reg - Read the source clock register + * @pf: Board private structure + * @sts: Optional parameter for holding a pair of system timestamps from + * the system clock. Will be ignored if NULL is given. 
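+ * + * The register pair is sampled as lo, hi, lo2; if lo2 is smaller than lo, + * the lower register rolled over between the two reads, so both registers + * are read once more to return a consistent 64b value.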
+ */ +u64 +ice_ptp_read_src_clk_reg(struct ice_pf *pf, struct ptp_system_timestamp *sts) +{ + struct ice_hw *hw = &pf->hw; + u32 hi, lo, lo2; + u8 tmr_idx; + + tmr_idx = ice_get_ptp_src_clock_index(hw); + /* Read the system timestamp pre PHC read */ + if (sts) + ptp_read_system_prets(sts); + + lo = rd32(hw, GLTSYN_TIME_L(tmr_idx)); + + /* Read the system timestamp post PHC read */ + if (sts) + ptp_read_system_postts(sts); + + hi = rd32(hw, GLTSYN_TIME_H(tmr_idx)); + lo2 = rd32(hw, GLTSYN_TIME_L(tmr_idx)); + + if (lo2 < lo) { + /* if TIME_L rolled over read TIME_L again and update + *system timestamps + */ + if (sts) + ptp_read_system_prets(sts); + lo = rd32(hw, GLTSYN_TIME_L(tmr_idx)); + if (sts) + ptp_read_system_postts(sts); + hi = rd32(hw, GLTSYN_TIME_H(tmr_idx)); + } + + return ((u64)hi << 32) | lo; +} + + +/** + * ice_ptp_update_cached_systime - Update the cached system time values + * @pf: Board specific private structure + * + * This function updates the system time values which are cached in the PF + * structure and the Rx rings. + * + * This should be called periodically at least once a second, and whenever the + * system time has been adjusted. + */ +static void ice_ptp_update_cached_systime(struct ice_pf *pf) +{ + u64 systime; + int i; + + /* Read the current system time */ + systime = ice_ptp_read_src_clk_reg(pf, NULL); + + /* Update the cached system time stored in the PF structure */ + WRITE_ONCE(pf->ptp.cached_phc_time, systime); + + ice_for_each_vsi(pf, i) { + struct ice_vsi *vsi = pf->vsi[i]; + int j; + + if (!vsi) + continue; + +#ifdef HAVE_NETDEV_SB_DEV + if (vsi->type != ICE_VSI_PF && + vsi->type != ICE_VSI_OFFLOAD_MACVLAN) + continue; +#else + if (vsi->type != ICE_VSI_PF) + continue; +#endif /* HAVE_NETDEV_SB_DEV */ + + ice_for_each_rxq(vsi, j) { + if (!vsi->rx_rings[j]) + continue; + WRITE_ONCE(vsi->rx_rings[j]->cached_systime, systime); + } + } +} + +/** + * ice_ptp_extend_32b_ts - Convert a 32b nanoseconds timestamp to 64b + * @cached_phc_time: recently cached copy of PHC time + * @in_tstamp: Ingress/egress 32b nanoseconds timestamp value + * + * Hardware captures timestamps which contain only 32 bits of nominal + * nanoseconds, as opposed to the 64bit timestamps that the stack expects. + * Note that the captured timestamp values may be 40 bits, but the lower + * 8 bits are sub-nanoseconds and generally discarded. + * + * Extend the 32bit nanosecond timestamp using the following algorithm and + * assumptions: + * + * 1) have a recently cached copy of the PHC time + * 2) assume that the in_tstamp was captured 2^31 nanoseconds (~2.1 + * seconds) before or after the PHC time was captured. + * 3) calculate the delta between the cached time and the timestamp + * 4) if the delta is smaller than 2^31 nanoseconds, then the timestamp was + * captured after the PHC time. In this case, the full timestamp is just + * the cached PHC time plus the delta. + * 5) otherwise, if the delta is larger than 2^31 nanoseconds, then the + * timestamp was captured *before* the PHC time, i.e. because the PHC + * cache was updated after the timestamp was captured by hardware. In this + * case, the full timestamp is the cached time minus the inverse delta. + * + * This algorithm works even if the PHC time was updated after a Tx timestamp + * was requested, but before the Tx timestamp event was reported from + * hardware. + * + * This calculation primarily relies on keeping the cached PHC time up to + * date. 
If the timestamp was captured more than 2^31 nanoseconds after the + * PHC time, it is possible that the lower 32bits of PHC time have + * overflowed more than once, and we might generate an incorrect timestamp. + * + * This is prevented by (a) periodically updating the cached PHC time once + * a second, and (b) discarding any Tx timestamp packet if it has waited for + * a timestamp for more than one second. + */ +static u64 ice_ptp_extend_32b_ts(u64 cached_phc_time, u32 in_tstamp) +{ + u32 delta, phc_time_lo; + u64 ns; + + /* Extract the lower 32 bits of the PHC time */ + phc_time_lo = (u32)cached_phc_time; + + /* Calculate the delta between the lower 32bits of the cached PHC + * time and the in_tstamp value + */ + delta = (in_tstamp - phc_time_lo); + + /* Do not assume that the in_tstamp is always more recent than the + * cached PHC time. If the delta is large, it indicates that the + * in_tstamp was taken in the past, and should be converted + * forward. + */ + if (delta > (U32_MAX / 2)) { + /* reverse the delta calculation here */ + delta = (phc_time_lo - in_tstamp); + ns = cached_phc_time - delta; + } else { + ns = cached_phc_time + delta; + } + + return ns; +} + +/** + * ice_ptp_extend_40b_ts - Convert a 40b timestamp to 64b nanoseconds + * @pf: Board private structure + * @in_tstamp: Ingress/egress 40b timestamp value + * + * The Tx and Rx timestamps are 40 bits wide, including 32 bits of nominal + * nanoseconds, 7 bits of sub-nanoseconds, and a valid bit. + * + * *--------------------------------------------------------------* + * | 32 bits of nanoseconds | 7 high bits of sub ns underflow | v | + * *--------------------------------------------------------------* + * + * The low bit is an indicator of whether the timestamp is valid. The next + * 7 bits are a capture of the upper 7 bits of the sub-nanosecond underflow, + * and the remaining 32 bits are the lower 32 bits of the PHC timer. + * + * It is assumed that the caller verifies the timestamp is valid prior to + * calling this function. + * + * Extract the 32bit nominal nanoseconds and extend them. Use the cached PHC + * time stored in the device private PTP structure as the basis for timestamp + * extension. + * + * See ice_ptp_extend_32b_ts for a detailed explanation of the extension + * algorithm. 
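+ * + * As a worked example, a raw 40b value of 0x12345678FF has the valid bit + * (bit 0) set, sub-nanosecond bits of 0x7F, and nominal nanoseconds of + * 0x12345678; only the latter 32 bits are extended against the cached + * PHC time.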
+ */ +static u64 ice_ptp_extend_40b_ts(struct ice_pf *pf, u64 in_tstamp) +{ + const u64 mask = GENMASK_ULL(31, 0); + return ice_ptp_extend_32b_ts(pf->ptp.cached_phc_time, + (in_tstamp >> 8) & mask); +} + +/** + * ice_ptp_get_ts_idx - Find the free Tx index based on current logical port + * @vsi: lport corresponding VSI + */ +int ice_ptp_get_ts_idx(struct ice_vsi *vsi) +{ + u8 own_idx_start, own_idx_end, lport, qport; + int i; + + lport = vsi->port_info->lport; + qport = lport % ICE_PORTS_PER_QUAD; + /* Check on own idx window */ + own_idx_start = qport * INDEX_PER_PORT; + own_idx_end = own_idx_start + INDEX_PER_PORT; + + for (i = own_idx_start; i < own_idx_end; i++) { + if (!test_and_set_bit(i, vsi->ptp_tx_idx)) + return i; + } + + return -1; +} + +/** + * ice_ptp_rel_all_skb - Free all pending skb waiting for timestamp + * @pf: The PF private structure + */ +static void ice_ptp_rel_all_skb(struct ice_pf *pf) +{ + struct ice_vsi *vsi; + int idx; + + vsi = ice_get_main_vsi(pf); + if (!vsi) + return; + for (idx = 0; idx < INDEX_PER_QUAD; idx++) { + if (vsi->ptp_tx_skb[idx]) { + dev_kfree_skb_any(vsi->ptp_tx_skb[idx]); + vsi->ptp_tx_skb[idx] = NULL; + } + } +} + +static const u64 txrx_lane_par_clk[NUM_ICE_PTP_LNK_SPD] = { + 31250000, /* 1G */ + 257812500, /* 10G */ + 644531250, /* 25G */ + 161132812, /* 25G RS */ + 257812500, /* 40G */ + 644531250, /* 50G */ + 644531250, /* 50G RS */ + 644531250, /* 100G RS */ +}; + +static const u64 txrx_lane_pcs_clk[NUM_ICE_PTP_LNK_SPD] = { + 125000000, /* 1G */ + 156250000, /* 10G */ + 390625000, /* 25G */ + 97656250, /* 25G RS */ + 156250000, /* 40G */ + 390625000, /* 50G */ + 644531250, /* 50G RS */ + 644531250, /* 100G RS */ +}; + +static const u64 txrx_rsgb_par_clk[NUM_ICE_PTP_LNK_SPD] = { + 0, /* 1G */ + 0, /* 10G */ + 0, /* 25G */ + 322265625, /* 25G RS */ + 0, /* 40G */ + 0, /* 50G */ + 644531250, /* 50G RS */ + 1289062500, /* 100G RS */ +}; + +static const u64 txrx_rsgb_pcs_clk[NUM_ICE_PTP_LNK_SPD] = { + 0, 0, 0, 97656250, 0, 0, 195312500, 390625000 +}; + +static const u64 rx_desk_par_pcs_clk[NUM_ICE_PTP_LNK_SPD] = { + 0, /* 1G */ + 0, /* 10G */ + 0, /* 25G */ + 0, /* 25G RS */ + 156250000, /* 40G */ + 19531250, /* 50G */ + 644531250, /* 50G RS */ + 644531250, /* 100G RS */ +}; + +/** + * ice_ptp_port_phy_set_parpcs_incval - Set PAR/PCS PHY cycle count + * @pf: Board private struct + * @port: Port we are configuring PHY for + * + * Note that this function is only expected to be called during port up and + * during a link event. + */ +static void ice_ptp_port_phy_set_parpcs_incval(struct ice_pf *pf, int port) +{ + u64 cur_freq, clk_incval, uix, phy_tus; + enum ice_ptp_link_spd link_spd; + enum ice_ptp_fec_mode fec_mode; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + + cur_freq = ice_e822_pll_freq(pf->ptp.time_ref_freq); + clk_incval = ice_ptp_read_src_incval(hw); + + status = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, &fec_mode); + if (status) + goto exit; + + /* UIX programming */ + /* We split a 'divide by 1e11' operation into a 'divide by 256' and a + * 'divide by 390625000' operation to be able to do the calculation + * using fixed-point math. 
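+ * This split is exact because 256 * 390625000 == 100000000000 (1e11), so + * the shift right by 8 followed by the divide by 390625000 implements the + * divide by 1e11.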
+ */ + if (link_spd == ICE_PTP_LNK_SPD_10G || + link_spd == ICE_PTP_LNK_SPD_40G) { +#define LINE_UI_10G_40G 640 /* 6600 UI at 10Gb line rate */ + uix = (cur_freq * LINE_UI_10G_40G) >> 8; + uix *= clk_incval; + uix /= 390625000; + + val = TS_LOW_M & uix; + status = ice_write_phy_reg_e822(hw, port, P_REG_UIX66_10G_40G_L, + val); + if (status) + goto exit; + val = (uix >> 32) & TS_LOW_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_UIX66_10G_40G_U, + val); + if (status) + goto exit; + } else if (link_spd == ICE_PTP_LNK_SPD_25G || + link_spd == ICE_PTP_LNK_SPD_100G_RS) { +#define LINE_UI_25G_100G 256 /* 6600 UI at 25Gb line rate */ + uix = (cur_freq * LINE_UI_25G_100G) >> 8; + uix *= clk_incval; + uix /= 390625000; + + val = TS_LOW_M & uix; + status = ice_write_phy_reg_e822(hw, port, + P_REG_UIX66_25G_100G_L, val); + if (status) + goto exit; + val = (uix >> 32) & TS_LOW_M; + status = ice_write_phy_reg_e822(hw, port, + P_REG_UIX66_25G_100G_U, val); + if (status) + goto exit; + } + + if (link_spd == ICE_PTP_LNK_SPD_25G_RS) { + phy_tus = (cur_freq * clk_incval * 2) / + txrx_rsgb_par_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PAR_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PAR_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PAR_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PAR_TX_TUS_U, val); + + phy_tus = (cur_freq * clk_incval) / + txrx_rsgb_pcs_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PCS_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PCS_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PCS_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, P_REG_DESK_PCS_TX_TUS_U, val); + } else { + phy_tus = (cur_freq * clk_incval) / + txrx_lane_par_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_PAR_RX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_PAR_RX_TUS_U, val); + + if (link_spd != ICE_PTP_LNK_SPD_50G_RS && + link_spd != ICE_PTP_LNK_SPD_100G_RS) { + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, + P_REG_PAR_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, + P_REG_PAR_TX_TUS_U, val); + } else { + phy_tus = (cur_freq * clk_incval * 2) / + txrx_rsgb_par_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_TX_TUS_U, val); + } + + phy_tus = (cur_freq * clk_incval) / + txrx_lane_pcs_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_PCS_RX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_PCS_RX_TUS_U, val); + + if (link_spd != ICE_PTP_LNK_SPD_50G_RS && + link_spd != ICE_PTP_LNK_SPD_100G_RS) { + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, P_REG_PCS_TX_TUS_L, + val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, P_REG_PCS_TX_TUS_U, + val); + } else { + phy_tus = (cur_freq * clk_incval) / + txrx_rsgb_pcs_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, + 
P_REG_DESK_PCS_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_TX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_TX_TUS_U, val); + } + + if (link_spd == ICE_PTP_LNK_SPD_40G || + link_spd == ICE_PTP_LNK_SPD_50G) { + phy_tus = (cur_freq * clk_incval) / + rx_desk_par_pcs_clk[link_spd]; + val = phy_tus & TS_PHY_LOW_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_RX_TUS_L, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_RX_TUS_L, val); + val = (phy_tus >> 8) & TS_PHY_HIGH_M; + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PAR_RX_TUS_U, val); + ice_write_phy_reg_e822(hw, port, + P_REG_DESK_PCS_RX_TUS_U, val); + } + } + +exit: + if (status) + dev_err(ice_pf_to_dev(pf), "PTP Vernier configuration failed on port %d, status %s\n", + port, ice_stat_str(status)); +} + +/* Values of tx_offset_delay in units of 1/100th of a nanosecond */ +static const u64 tx_offset_delay[NUM_ICE_PTP_LNK_SPD] = { + 25140, /* 1G */ + 6938, /* 10G */ + 2778, /* 25G */ + 3928, /* 25G RS */ + 5666, /* 40G */ + 2778, /* 50G */ + 2095, /* 50G RS */ + 1620, /* 100G RS */ +}; + +/** + * ice_ptp_port_phy_set_tx_offset - Set PHY clock Tx timestamp offset + * @ptp_port: the PTP port we are configuring the PHY for + */ +static int ice_ptp_port_phy_set_tx_offset(struct ice_ptp_port *ptp_port) +{ + u64 cur_freq, clk_incval, offset; + enum ice_ptp_link_spd link_spd; + enum ice_status status; + struct ice_pf *pf; + struct ice_hw *hw; + int port; + u32 val; + + pf = ptp_port_to_pf(ptp_port); + port = ptp_port->port_num; + hw = &pf->hw; + + /* Get the PTP HW lock */ + if (!ice_ptp_lock(hw)) + return -EBUSY; + + clk_incval = ice_ptp_read_src_incval(hw); + ice_ptp_unlock(hw); + + cur_freq = ice_e822_pll_freq(pf->ptp.time_ref_freq); + + status = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, NULL); + if (status) + goto exit; + + offset = cur_freq * clk_incval; + offset /= 10000; + offset *= tx_offset_delay[link_spd]; + offset /= 10000000; + + if (link_spd == ICE_PTP_LNK_SPD_1G || + link_spd == ICE_PTP_LNK_SPD_10G || + link_spd == ICE_PTP_LNK_SPD_25G || + link_spd == ICE_PTP_LNK_SPD_25G_RS || + link_spd == ICE_PTP_LNK_SPD_40G || + link_spd == ICE_PTP_LNK_SPD_50G) { + status = ice_read_phy_reg_e822(hw, port, + P_REG_PAR_PCS_TX_OFFSET_L, + &val); + if (status) + goto exit; + offset += val; + status = ice_read_phy_reg_e822(hw, port, + P_REG_PAR_PCS_TX_OFFSET_U, + &val); + if (status) + goto exit; + offset += (u64)val << 32; + } + + if (link_spd == ICE_PTP_LNK_SPD_50G_RS || + link_spd == ICE_PTP_LNK_SPD_100G_RS) { + status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_TX_TIME_L, + &val); + if (status) + goto exit; + offset += val; + status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_TX_TIME_U, + &val); + if (status) + goto exit; + offset += (u64)val << 32; + } + + val = (u32)offset; + status = ice_write_phy_reg_e822(hw, port, P_REG_TOTAL_TX_OFFSET_L, val); + if (status) + goto exit; + val = (u32)(offset >> 32); + status = ice_write_phy_reg_e822(hw, port, P_REG_TOTAL_TX_OFFSET_U, val); + if (status) + goto exit; + + status = ice_write_phy_reg_e822(hw, port, P_REG_TX_OR, 1); + if (status) + goto exit; + + atomic_set(&ptp_port->tx_offset_ready, 1); +exit: + if (status) + dev_err(ice_pf_to_dev(pf), + "PTP tx offset configuration failed on port %d status=%s\n", + port, ice_stat_str(status)); + return ice_status_to_errno(status); +} + +/** + * ice_ptp_calc_pmd_adj - Calculate PMD 
adjustment using integers + * @cur_freq: PHY clock frequency + * @clk_incval: Source clock incval + * @calc_numerator: Value to divide + * @calc_denominator: Remainder of the division + * + * This is the integer math calculation which attempts to avoid overflowing + * a u64. The division (in this case 1/25.78125e9) is split into two parts 125 + * and the remainder, which is the stored in calc_denominator. + */ +static u64 +ice_ptp_calc_pmd_adj(u64 cur_freq, u64 clk_incval, u64 calc_numerator, + u64 calc_denominator) +{ + u64 pmd_adj = calc_numerator; + + pmd_adj *= cur_freq; + pmd_adj /= 125; + pmd_adj *= clk_incval; + pmd_adj /= calc_denominator; + return pmd_adj; +} + +/** + * ice_ptp_get_pmd_adj - Calculate total PMD adjustment + * @pf: Board private struct + * @port: Port we are configuring PHY for + * @cur_freq: PHY clock frequency + * @link_spd: PHY link speed + * @clk_incval: source clock incval + * @mode: FEC mode + * @pmd_adj: PMD adjustment to be calculated + */ +static int ice_ptp_get_pmd_adj(struct ice_pf *pf, int port, u64 cur_freq, + enum ice_ptp_link_spd link_spd, u64 clk_incval, + enum ice_ptp_fec_mode mode, u64 *pmd_adj) +{ + u64 calc_numerator, calc_denominator; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + u8 pmd; + + status = ice_read_phy_reg_e822(hw, port, P_REG_PMD_ALIGNMENT, &val); + if (status) + return -EIO; + + pmd = (u8)val; + + /* RS mode overrides all the other pmd_alignment calculations. */ + if (link_spd == ICE_PTP_LNK_SPD_25G_RS || + link_spd == ICE_PTP_LNK_SPD_50G_RS || + link_spd == ICE_PTP_LNK_SPD_100G_RS) { + u64 pmd_cycle_adj = 0; + u8 rx_cycle; + + if (link_spd == ICE_PTP_LNK_SPD_50G || + link_spd == ICE_PTP_LNK_SPD_50G_RS) { + ice_read_phy_reg_e822(hw, port, P_REG_RX_80_TO_160_CNT, + &val); + rx_cycle = val & P_REG_RX_80_TO_160_CNT_RXCYC_M; + } else { + ice_read_phy_reg_e822(hw, port, P_REG_RX_40_TO_160_CNT, + &val); + rx_cycle = val & P_REG_RX_40_TO_160_CNT_RXCYC_M; + } + calc_numerator = pmd; + if (pmd < 17) + calc_numerator += 40; + calc_denominator = 206250000; + + *pmd_adj = ice_ptp_calc_pmd_adj(cur_freq, clk_incval, + calc_numerator, + calc_denominator); + + if (rx_cycle != 0) { + if (link_spd == ICE_PTP_LNK_SPD_25G_RS) + calc_numerator = 4 - rx_cycle; + else if (link_spd == ICE_PTP_LNK_SPD_50G_RS) + calc_numerator = rx_cycle; + else + calc_numerator = 0; + calc_numerator *= 40; + pmd_cycle_adj = ice_ptp_calc_pmd_adj(cur_freq, + clk_incval, + calc_numerator, + calc_denominator); + } + *pmd_adj += pmd_cycle_adj; + } else { + calc_numerator = 0; + calc_denominator = 1; + if (link_spd == ICE_PTP_LNK_SPD_1G) { + if (pmd == 4) + calc_numerator = 10; + else + calc_numerator = (pmd + 6) % 10; + calc_denominator = 10000000; + } else if (link_spd == ICE_PTP_LNK_SPD_10G || + link_spd == ICE_PTP_LNK_SPD_40G) { + if (pmd != 65 || mode == ICE_PTP_FEC_MODE_CLAUSE74) { + calc_numerator = pmd; + calc_denominator = 82500000; + } + } else if (link_spd == ICE_PTP_LNK_SPD_25G) { + if (pmd != 65 || mode == ICE_PTP_FEC_MODE_CLAUSE74) { + calc_numerator = pmd; + calc_denominator = 206250000; + } + } else if (link_spd == ICE_PTP_LNK_SPD_50G) { + if (pmd != 65 || mode == ICE_PTP_FEC_MODE_CLAUSE74) { + calc_numerator = pmd * 2; + calc_denominator = 206250000; + } + } + *pmd_adj = ice_ptp_calc_pmd_adj(cur_freq, clk_incval, + calc_numerator, + calc_denominator); + } + + return 0; +} + +/* Values of rx_offset_delay in units of 1/100th of a nanosecond */ +static const u64 rx_offset_delay[NUM_ICE_PTP_LNK_SPD] = { + 17372, /* 1G */ + 6212, /* 10G */ + 
+
+/* Values of rx_offset_delay in units of 1/100th of a nanosecond */
+static const u64 rx_offset_delay[NUM_ICE_PTP_LNK_SPD] = {
+	17372, /* 1G */
+	6212, /* 10G */
+	2491, /* 25G */
+	29535, /* 25G RS */
+	4244, /* 40G */
+	2868, /* 50G */
+	14524, /* 50G RS */
+	7775, /* 100G RS */
+};
+
+/**
+ * ice_ptp_port_phy_set_rx_offset - Set PHY clock Rx timestamp offset
+ * @ptp_port: PTP port we are configuring PHY for
+ */
+static int ice_ptp_port_phy_set_rx_offset(struct ice_ptp_port *ptp_port)
+{
+	u64 cur_freq, clk_incval, offset, pmd_adj;
+	enum ice_ptp_link_spd link_spd;
+	enum ice_ptp_fec_mode fec_mode;
+	enum ice_status status;
+	struct ice_pf *pf;
+	struct ice_hw *hw;
+	int err, port;
+	u32 val;
+
+	pf = ptp_port_to_pf(ptp_port);
+	port = ptp_port->port_num;
+	hw = &pf->hw;
+
+	/* Get the PTP HW lock */
+	if (!ice_ptp_lock(hw)) {
+		err = -EBUSY;
+		goto exit;
+	}
+
+	clk_incval = ice_ptp_read_src_incval(hw);
+	ice_ptp_unlock(hw);
+
+	cur_freq = ice_e822_pll_freq(pf->ptp.time_ref_freq);
+
+	status = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, &fec_mode);
+	if (status) {
+		err = ice_status_to_errno(status);
+		goto exit;
+	}
+
+	offset = cur_freq * clk_incval;
+	offset /= 10000;
+	offset *= rx_offset_delay[link_spd];
+	offset /= 10000000;
+
+	status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_PCS_RX_OFFSET_L,
+				       &val);
+	if (status) {
+		err = ice_status_to_errno(status);
+		goto exit;
+	}
+	offset += val;
+	status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_PCS_RX_OFFSET_U,
+				       &val);
+	if (status) {
+		err = ice_status_to_errno(status);
+		goto exit;
+	}
+	offset += (u64)val << 32;
+
+	if (link_spd == ICE_PTP_LNK_SPD_40G ||
+	    link_spd == ICE_PTP_LNK_SPD_50G ||
+	    link_spd == ICE_PTP_LNK_SPD_50G_RS ||
+	    link_spd == ICE_PTP_LNK_SPD_100G_RS) {
+		status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_RX_TIME_L,
+					       &val);
+		if (status) {
+			err = ice_status_to_errno(status);
+			goto exit;
+		}
+		offset += val;
+		status = ice_read_phy_reg_e822(hw, port, P_REG_PAR_RX_TIME_U,
+					       &val);
+		if (status) {
+			err = ice_status_to_errno(status);
+			goto exit;
+		}
+		offset += (u64)val << 32;
+	}
+
+	err = ice_ptp_get_pmd_adj(pf, port, cur_freq, link_spd, clk_incval,
+				  fec_mode, &pmd_adj);
+	if (err)
+		goto exit;
+
+	if (fec_mode == ICE_PTP_FEC_MODE_RS_FEC)
+		offset += pmd_adj;
+	else
+		offset -= pmd_adj;
+
+	val = (u32)offset;
+	status = ice_write_phy_reg_e822(hw, port, P_REG_TOTAL_RX_OFFSET_L, val);
+	if (status) {
+		err = ice_status_to_errno(status);
+		goto exit;
+	}
+	val = (u32)(offset >> 32);
+	status = ice_write_phy_reg_e822(hw, port, P_REG_TOTAL_RX_OFFSET_U, val);
+	if (status) {
+		err = ice_status_to_errno(status);
+		goto exit;
+	}
+
+	status = ice_write_phy_reg_e822(hw, port, P_REG_RX_OR, 1);
+	if (status) {
+		err = ice_status_to_errno(status);
+		goto exit;
+	}
+
+	atomic_set(&ptp_port->rx_offset_ready, 1);
+exit:
+	if (err)
+		dev_err(ice_pf_to_dev(pf),
+			"PTP rx offset configuration failed on port %d, err=%d\n",
+			port, err);
+	return err;
+}
+
+/**
+ * ice_ptp_port_sync_src_timer - Sync PHY timer with source timer
+ * @pf: Board private structure
+ * @port: Port whose PHY timer is synchronized to the source timer
+ *
+ * Sync PHY timer with source timer after calculating and setting Tx/Rx
+ * Vernier offset.
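+ *
+ * Note on the adjustment math (illustrative): the delta between the source
+ * and port timers is computed as a u64 and may not fit the signed ADJ_TIME
+ * argument. When bit 63 of the delta is set, the code below halves it and
+ * applies the same signed adjustment twice:
+ *
+ *	delta = src_time - tx_time;	(u64, assume src_time > tx_time)
+ *	adj = delta >> 1;		(s64, applied on both passes)
+ *
+ * so an odd delta is corrected to within 1 ns.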
+ */ +static enum ice_status ice_ptp_port_sync_src_timer(struct ice_pf *pf, int port) +{ + u64 src_time = 0x0, tx_time, rx_time, temp_adj; + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + s64 time_adj; + u32 zo, lo; + u8 tmr_idx; + + /* Get the PTP HW lock */ + if (!ice_ptp_lock(hw)) { + dev_err(dev, "PTP failed to acquire semaphore\n"); + return ICE_ERR_NOT_READY; + } + + /* Program cmd to source timer */ + ice_ptp_src_cmd(hw, READ_TIME); + + /* Program cmd to PHY port */ + status = ice_ptp_one_port_cmd(hw, port, READ_TIME, true); + if (status) + goto unlock; + + /* Issue sync to activate commands */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + + tmr_idx = ice_get_ptp_src_clock_index(hw); + + /* Read source timer SHTIME_0 and SHTIME_L */ + zo = rd32(hw, GLTSYN_SHTIME_0(tmr_idx)); + lo = rd32(hw, GLTSYN_SHTIME_L(tmr_idx)); + src_time |= (u64)lo; + src_time = (src_time << 32) | (u64)zo; + + /* Read Tx and Rx capture from PHY */ + status = ice_ptp_read_port_capture(hw, port, &tx_time, &rx_time); + if (status) + goto unlock; + + if (tx_time != rx_time) + dev_info(dev, "Port %d Rx and Tx times do not match\n", port); + + /* Calculate amount to adjust port timer and account for case where + * delta is larger/smaller than S64_MAX/S64_MIN + */ + if (src_time > tx_time) { + temp_adj = src_time - tx_time; + if (temp_adj & BIT_ULL(63)) { + time_adj = temp_adj >> 1; + } else { + time_adj = temp_adj; + /* Set to zero to indicate adjustment done */ + temp_adj = 0x0; + } + } else { + temp_adj = tx_time - src_time; + if (temp_adj & BIT_ULL(63)) { + time_adj = -(temp_adj >> 1); + } else { + time_adj = -temp_adj; + /* Set to zero to indicate adjustment done */ + temp_adj = 0x0; + } + } + + status = ice_ptp_prep_port_adj_e822(hw, port, time_adj, true); + if (status) + goto unlock; + + status = ice_ptp_one_port_cmd(hw, port, ADJ_TIME, true); + if (status) + goto unlock; + + /* Issue sync to activate commands */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + + /* Do a second adjustment if original was too large/small to fit into + * a S64 + */ + if (temp_adj) { + status = ice_ptp_prep_port_adj_e822(hw, port, time_adj, true); + if (status) + goto unlock; + + status = ice_ptp_one_port_cmd(hw, port, ADJ_TIME, true); + if (!status) + /* Issue sync to activate commands */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + } + + /* This second register read is to flush out the port and source + * command registers. 
Multiple successive calls to this function + * require this + */ + + /* Program cmd to source timer */ + ice_ptp_src_cmd(hw, READ_TIME); + + /* Program cmd to PHY port */ + status = ice_ptp_one_port_cmd(hw, port, READ_TIME, true); + if (status) + goto unlock; + + /* Issue sync to activate commands */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + + /* Read source timer SHTIME_0 and SHTIME_L */ + zo = rd32(hw, GLTSYN_SHTIME_0(tmr_idx)); + lo = rd32(hw, GLTSYN_SHTIME_L(tmr_idx)); + src_time = (u64)lo; + src_time = (src_time << 32) | (u64)zo; + + /* Read Tx and Rx capture from PHY */ + status = ice_ptp_read_port_capture(hw, port, &tx_time, &rx_time); + + if (status) + goto unlock; + dev_info(dev, "Port %d PTP synced to source 0x%016llX, 0x%016llX\n", + port, src_time, tx_time); +unlock: + ice_ptp_unlock(hw); + + if (status) + dev_err(dev, "PTP failed to sync port %d PHY time, status %s\n", + port, ice_stat_str(status)); + + return status; +} + +/** + * ice_ptp_read_time - Read the time from the device + * @pf: Board private structure + * @ts: timespec structure to hold the current time value + * @sts: Optional parameter for holding a pair of system timestamps from + * the system clock. Will be ignored if NULL is given. + * + * This function reads the source clock registers and stores them in a timespec. + * However, since the registers are 64 bits of nanoseconds, we must convert the + * result to a timespec before we can return. + */ +static void ice_ptp_read_time(struct ice_pf *pf, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + u64 time_ns; + + if (pf->ptp.src_tmr_mode != ICE_SRC_TMR_MODE_NANOSECONDS) { + dev_err(ice_pf_to_dev(pf), + "PTP Locked mode is not supported!\n"); + return; + } + time_ns = ice_ptp_read_src_clk_reg(pf, sts); + + *ts = ns_to_timespec64(time_ns); +} + +/** + * ice_ptp_write_init - Set PHC time to provided value + * @pf: Board private structure + * @ts: timespec structure that holds the new time value + * + * Set the PHC time to the specified time provided in the timespec. + */ +static int ice_ptp_write_init(struct ice_pf *pf, struct timespec64 *ts) +{ + u64 ns = timespec64_to_ns(ts); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u64 val; + + if (pf->ptp.src_tmr_mode != ICE_SRC_TMR_MODE_NANOSECONDS) { + dev_err(ice_pf_to_dev(pf), + "PTP Locked mode is not supported!\n"); + return ICE_ERR_NOT_SUPPORTED; + } + val = ns; + + status = ice_ptp_init_time(hw, val); + if (status) + return ice_status_to_errno(status); + + return 0; +} + +/** + * ice_ptp_write_adj - Adjust PHC clock time atomically + * @pf: Board private structure + * @adj: Adjustment in nanoseconds + * @lock_sbq: true to lock the sbq sq_lock (the usual case); false if the + * sq_lock has already been locked at a higher level + * + * Perform an atomic adjustment of the PHC time by the specified number of + * nanoseconds. 
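+ *
+ * A minimal usage sketch (hypothetical caller, error handling elided),
+ * nudging the PHC forward by 500 ns:
+ *
+ *	err = ice_ptp_write_adj(pf, 500, true);
+ *
+ * Deltas outside the s32 range cannot use this path; see the
+ * get->adjust->set fallback in ice_ptp_adjtime().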
+ */ +static int +ice_ptp_write_adj(struct ice_pf *pf, s32 adj, bool lock_sbq) +{ + struct ice_hw *hw = &pf->hw; + enum ice_status status; + + + status = ice_ptp_adj_clock(hw, adj, lock_sbq); + if (status) + return ice_status_to_errno(status); + + return 0; +} + + +/** + * ice_ptp_get_incval - Get clock increment params + * @pf: Board private structure + * @time_ref_freq: TIME_REF frequency + * @src_tmr_mode: Source timer mode (nanoseconds or locked) + */ +int ice_ptp_get_incval(struct ice_pf *pf, enum ice_time_ref_freq *time_ref_freq, + enum ice_src_tmr_mode *src_tmr_mode) +{ + *time_ref_freq = pf->ptp.time_ref_freq; + *src_tmr_mode = pf->ptp.src_tmr_mode; + + return 0; +} + +/** + * ice_base_incval - Get base timer increment value + * @pf: Board private structure + * + * Look up the base timer increment value for this device. The base increment + * value is used to define the nominal clock tick rate. This increment value + * is programmed during device initialization. It is also used as the basis + * for calculating adjustments using scaled_ppm. + */ +static u64 ice_base_incval(struct ice_pf *pf) +{ + u64 incval; + + if (ice_is_e810(&pf->hw)) + incval = ICE_PTP_NOMINAL_INCVAL_E810; + else if (pf->ptp.time_ref_freq < NUM_ICE_TIME_REF_FREQ) + incval = ice_e822_nominal_incval(pf->ptp.time_ref_freq); + else + incval = LOCKED_INCVAL_E822; + + dev_dbg(ice_pf_to_dev(pf), "PTP: using base increment value of 0x%016llx\n", + incval); + + return incval; +} + +/** + * ice_ptp_reset_ts_memory_quad - Reset timestamp memory for one quad + * @pf: The PF private data structure + * @quad: The quad (0-4) + */ +static void ice_ptp_reset_ts_memory_quad(struct ice_pf *pf, int quad) +{ + struct ice_hw *hw = &pf->hw; + + ice_write_quad_reg_e822(hw, quad, Q_REG_TS_CTRL, Q_REG_TS_CTRL_M); + ice_write_quad_reg_e822(hw, quad, Q_REG_TS_CTRL, ~(u32)Q_REG_TS_CTRL_M); +} + +/** + * ice_ptp_check_tx_fifo - Check whether Tx FIFO is in an OK state + * @port: PTP port for which Tx FIFO is checked + */ +static int ice_ptp_check_tx_fifo(struct ice_ptp_port *port) +{ + int quad = port->port_num / ICE_PORTS_PER_QUAD; + int offs = port->port_num % ICE_PORTS_PER_QUAD; + enum ice_status status; + struct ice_pf *pf; + struct ice_hw *hw; + u32 val, phy_sts; + + pf = ptp_port_to_pf(port); + hw = &pf->hw; + + + if (port->tx_fifo_busy_cnt == FIFO_OK) + return 0; + + /* need to read FIFO state */ + if (offs == 0 || offs == 1) + status = ice_read_quad_reg_e822(hw, quad, Q_REG_FIFO01_STATUS, + &val); + else + status = ice_read_quad_reg_e822(hw, quad, Q_REG_FIFO23_STATUS, + &val); + + if (status) { + dev_err(ice_pf_to_dev(pf), "PTP failed to check port %d Tx FIFO, status %s\n", + port->port_num, ice_stat_str(status)); + return ice_status_to_errno(status); + } + + if (offs & 0x1) + phy_sts = (val & Q_REG_FIFO13_M) >> Q_REG_FIFO13_S; + else + phy_sts = (val & Q_REG_FIFO02_M) >> Q_REG_FIFO02_S; + + if (phy_sts & FIFO_EMPTY) { + port->tx_fifo_busy_cnt = FIFO_OK; + return 0; + } + + port->tx_fifo_busy_cnt++; + + dev_dbg(ice_pf_to_dev(pf), "Try %d, port %d FIFO not empty\n", + port->tx_fifo_busy_cnt, port->port_num); + + if (port->tx_fifo_busy_cnt == ICE_PTP_FIFO_NUM_CHECKS) { + dev_dbg(ice_pf_to_dev(pf), + "Port %d Tx FIFO still not empty; resetting quad %d\n", + port->port_num, quad); + ice_ptp_reset_ts_memory_quad(pf, quad); + port->tx_fifo_busy_cnt = FIFO_OK; + return 0; + } + + return -EAGAIN; +} + +/** + * ice_ptp_check_tx_offset_valid - Check if the Tx PHY offset is valid + * @port: the PTP port to check + * + * Checks whether the Tx 
offset for the PHY associated with this port is
+ * valid. Returns 0 if the offset is valid, and a non-zero error code if it is
+ * not.
+ */
+static int ice_ptp_check_tx_offset_valid(struct ice_ptp_port *port)
+{
+	struct ice_pf *pf = ptp_port_to_pf(port);
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	u32 val;
+	int err;
+
+	/* Check if the offset is already valid */
+	if (atomic_read(&port->tx_offset_ready))
+		return 0;
+
+	/* Take the bit lock to prevent cross thread interaction */
+	if (atomic_cmpxchg(&port->tx_offset_lock, false, true))
+		return -EBUSY;
+
+	err = ice_ptp_check_tx_fifo(port);
+	if (err)
+		goto out_unlock;
+
+	status = ice_read_phy_reg_e822(hw, port->port_num, P_REG_TX_OV_STATUS,
+				       &val);
+	if (status) {
+		dev_err(dev, "Failed to read TX_OV_STATUS for port %d, status %s\n",
+			port->port_num, ice_stat_str(status));
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	if (!(val & P_REG_TX_OV_STATUS_OV_M)) {
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	err = ice_ptp_port_phy_set_tx_offset(port);
+	if (err) {
+		dev_err(dev, "Failed to set PHY Tx offset for port %d, err %d\n",
+			port->port_num, err);
+		goto out_unlock;
+	}
+
+	dev_info(dev, "Port %d Tx calibration complete\n", port->port_num);
+
+out_unlock:
+	atomic_set(&port->tx_offset_lock, false);
+
+	return err;
+}
+
+/**
+ * ice_ptp_check_rx_offset_valid - Check if the Rx PHY offset is valid
+ * @port: the PTP port to check
+ *
+ * Checks whether the Rx offset for the PHY associated with this port is
+ * valid. Returns 0 if the offset is valid, and a non-zero error code if it is
+ * not.
+ */
+static int ice_ptp_check_rx_offset_valid(struct ice_ptp_port *port)
+{
+	struct ice_pf *pf = ptp_port_to_pf(port);
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	u32 val;
+	int err;
+
+	/* Check if the offset is already valid */
+	if (atomic_read(&port->rx_offset_ready))
+		return 0;
+
+	/* Take the bit lock to prevent cross thread interaction */
+	if (atomic_cmpxchg(&port->rx_offset_lock, false, true))
+		return -EBUSY;
+
+	status = ice_read_phy_reg_e822(hw, port->port_num, P_REG_RX_OV_STATUS,
+				       &val);
+	if (status) {
+		dev_err(dev, "Failed to read RX_OV_STATUS for port %d, status %s\n",
+			port->port_num, ice_stat_str(status));
+		err = ice_status_to_errno(status);
+		goto out_unlock;
+	}
+
+	if (!(val & P_REG_RX_OV_STATUS_OV_M)) {
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	err = ice_ptp_port_phy_set_rx_offset(port);
+	if (err) {
+		dev_err(dev, "Failed to set PHY Rx offset for port %d, err %d\n",
+			port->port_num, err);
+		goto out_unlock;
+	}
+
+	dev_info(dev, "Port %d Rx calibration complete\n", port->port_num);
+
+out_unlock:
+	atomic_set(&port->rx_offset_lock, false);
+
+	return err;
+}
+
+/**
+ * ice_ptp_check_offset_valid - Check port offset valid bit
+ * @port: Port for which offset valid bit is checked
+ *
+ * Returns 0 if both Tx and Rx offsets are valid, and -EAGAIN if either
+ * offset is not ready.
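+ *
+ * Callers are expected to poll, as ice_ptp_wait_for_offset_valid() below
+ * does. Simplified sketch of that pattern (constants as used there):
+ *
+ *	for (i = 0; i < 20; i++) {
+ *		if (!ice_ptp_check_offset_valid(port))
+ *			return;
+ *		msleep(10);
+ *	}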
+ */ +static int ice_ptp_check_offset_valid(struct ice_ptp_port *port) +{ + int tx_err, rx_err; + + /* always check both Tx and Rx offset validity */ + tx_err = ice_ptp_check_tx_offset_valid(port); + rx_err = ice_ptp_check_rx_offset_valid(port); + + if (tx_err || rx_err) + return -EAGAIN; + + return 0; +} + +/** + * ice_ptp_wait_for_offset_valid - Poll offset valid reg until set or timeout + * @work: Pointer to struct work_struct + */ +static void ice_ptp_wait_for_offset_valid(struct work_struct *work) +{ + struct ice_ptp_port *port; + struct ice_pf *pf; + int i; + + port = container_of(work, struct ice_ptp_port, ov_task); + pf = ptp_port_to_pf(port); + +#define OV_POLL_PERIOD_MS 10 +#define OV_POLL_ATTEMPTS 20 + for (i = 0; i < OV_POLL_ATTEMPTS; i++) { + if (atomic_read(&pf->ptp.phy_reset_lock)) + return; + + if (!ice_ptp_check_offset_valid(port)) + return; + + msleep(OV_POLL_PERIOD_MS); + } +} + +/** + * ice_ptp_port_phy_start - Set or clear PHY start for port timestamping + * @ptp_port: PTP port for which the PHY start is set + * @phy_start: Value to be set + */ +static int +ice_ptp_port_phy_start(struct ice_ptp_port *ptp_port, bool phy_start) +{ + struct ice_pf *pf = ptp_port_to_pf(ptp_port); + u8 port = ptp_port->port_num; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u32 val; + + mutex_lock(&ptp_port->ps_lock); + + atomic_set(&ptp_port->tx_offset_ready, 0); + atomic_set(&ptp_port->rx_offset_ready, 0); + ptp_port->tx_fifo_busy_cnt = 0; + + status = ice_write_phy_reg_e822(hw, port, P_REG_TX_OR, 0); + if (status) + goto out_unlock; + + status = ice_write_phy_reg_e822(hw, port, P_REG_RX_OR, 0); + if (status) + goto out_unlock; + + status = ice_read_phy_reg_e822(hw, port, P_REG_PS, &val); + if (status) + goto out_unlock; + + val &= ~P_REG_PS_START_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + val &= ~P_REG_PS_ENA_CLK_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + + if (phy_start && ptp_port->link_up) { + ice_phy_cfg_lane_e822(hw, port); + ice_ptp_port_phy_set_parpcs_incval(pf, port); + + status = ice_ptp_write_incval_locked(hw, ice_base_incval(pf)); + if (status) + goto out_unlock; + + + status = ice_read_phy_reg_e822(hw, port, P_REG_PS, &val); + if (status) + goto out_unlock; + + val |= P_REG_PS_SFT_RESET_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + val |= P_REG_PS_START_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + val &= ~P_REG_PS_SFT_RESET_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + status = ice_ptp_write_incval_locked(hw, ice_base_incval(pf)); + if (status) + goto out_unlock; + + val |= P_REG_PS_ENA_CLK_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + val |= P_REG_PS_LOAD_OFFSET_M; + status = ice_write_phy_reg_e822(hw, port, P_REG_PS, val); + if (status) + goto out_unlock; + + wr32(&pf->hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + status = ice_ptp_port_sync_src_timer(pf, port); + if (status) + goto out_unlock; + + queue_work(pf->ptp.ov_wq, &ptp_port->ov_task); + } + +out_unlock: + if (status) + dev_err(ice_pf_to_dev(pf), "PTP failed to set PHY port %d %s, status=%s\n", + port, phy_start ? 
"up" : "down", ice_stat_str(status)); + + mutex_unlock(&ptp_port->ps_lock); + + return ice_status_to_errno(status); +} + +/** + * ice_ptp_link_change - Set or clear port registers for timestamping + * @pf: Board private structure + * @port: Port for which the PHY start is set + * @linkup: Link is up or down + */ +int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup) +{ + /* If PTP is not supported on this function, nothing to do */ + if (!test_bit(ICE_FLAG_PTP_ENA, pf->flags)) + return 0; + + if (linkup && !test_bit(ICE_FLAG_PTP, pf->flags)) { + dev_err(ice_pf_to_dev(pf), "PTP not ready, failed to prepare port %d\n", + port); + return -EAGAIN; + } + + if (port >= ICE_NUM_EXTERNAL_PORTS) + return -EINVAL; + + pf->ptp.port.link_up = linkup; + + return ice_ptp_port_phy_start(&pf->ptp.port, linkup); +} + + +/** + * ice_ptp_reset_ts_memory - Reset timestamp memory for all quads + * @pf: The PF private data structure + */ +static void ice_ptp_reset_ts_memory(struct ice_pf *pf) +{ + int quad; + + quad = pf->hw.port_info->lport / ICE_PORTS_PER_QUAD; + ice_ptp_reset_ts_memory_quad(pf, quad); +} + +/** + * ice_ptp_tx_ena_intr - Enable or disable the Tx timestamp interrupt + * @pf: PF private structure + * @ena: bool value to enable or disable interrupt + * @threshold: Minimum number of packets at which intr is triggered + * + * Utility function to enable or disable Tx timestamp interrupt and threshold + */ +static int ice_ptp_tx_ena_intr(struct ice_pf *pf, bool ena, u32 threshold) +{ + enum ice_status status = 0; + struct ice_hw *hw = &pf->hw; + int quad; + u32 val; + + ice_ptp_reset_ts_memory(pf); + + for (quad = 0; quad < ICE_MAX_QUAD; quad++) { + status = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG, + &val); + if (status) + break; + + if (ena) { + val |= Q_REG_TX_MEM_GBL_CFG_INTR_ENA_M; + val &= ~Q_REG_TX_MEM_GBL_CFG_INTR_THR_M; + val |= ((threshold << Q_REG_TX_MEM_GBL_CFG_INTR_THR_S) & + Q_REG_TX_MEM_GBL_CFG_INTR_THR_M); + } else { + val &= ~Q_REG_TX_MEM_GBL_CFG_INTR_ENA_M; + } + + status = ice_write_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG, + val); + if (status) + break; + } + + if (status) + dev_err(ice_pf_to_dev(pf), "PTP failed in intr ena, status %s\n", + ice_stat_str(status)); + return ice_status_to_errno(status); +} + +/** + * ice_ptp_reset_phy_timestamping - Reset PHY timestamp registers values + * @pf: Board private structure + */ +static void ice_ptp_reset_phy_timestamping(struct ice_pf *pf) +{ + int i; + +#define PHY_RESET_TRIES 5 +#define PHY_RESET_SLEEP_MS 5 + + for (i = 0; i < PHY_RESET_TRIES; i++) { + if (atomic_cmpxchg(&pf->ptp.phy_reset_lock, false, true)) + goto reset; + + msleep(PHY_RESET_SLEEP_MS); + } + return; + +reset: + flush_workqueue(pf->ptp.ov_wq); + ice_ptp_port_phy_start(&pf->ptp.port, false); + if (pf->ptp.port.link_up) + ice_ptp_port_phy_start(&pf->ptp.port, true); + + ice_ptp_reset_ts_memory(pf); + atomic_set(&pf->ptp.phy_reset_lock, false); +} + +/** + * ice_ptp_update_incval - Update clock increment rate + * @pf: Board private structure + * @time_ref_freq: TIME_REF frequency to use + * @src_tmr_mode: Src timer mode (nanoseconds or locked) + */ +int +ice_ptp_update_incval(struct ice_pf *pf, enum ice_time_ref_freq time_ref_freq, + enum ice_src_tmr_mode src_tmr_mode) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct timespec64 ts; + s64 incval; + int err; + + if (!test_bit(ICE_FLAG_PTP, pf->flags)) { + dev_err(dev, "PTP not ready, failed to update incval\n"); + return -EINVAL; + } 
+ + if ((time_ref_freq >= NUM_ICE_TIME_REF_FREQ || + src_tmr_mode >= NUM_ICE_SRC_TMR_MODE)) + return -EINVAL; + + if (src_tmr_mode == ICE_SRC_TMR_MODE_NANOSECONDS) + incval = ice_e822_nominal_incval(time_ref_freq); + else + incval = LOCKED_INCVAL_E822; + + if (!ice_ptp_lock(hw)) + return -EBUSY; + + status = ice_ptp_write_incval(hw, incval); + if (status) { + dev_err(dev, "PTP failed to update incval, status %s\n", + ice_stat_str(status)); + err = ice_status_to_errno(status); + goto err_unlock; + } + + pf->ptp.time_ref_freq = time_ref_freq; + pf->ptp.src_tmr_mode = src_tmr_mode; + + ts = ktime_to_timespec64(ktime_get_real()); + err = ice_ptp_write_init(pf, &ts); + if (err) { + dev_err(dev, "PTP failed to program time registers, err %d\n", + err); + goto err_unlock; + } + + /* unlock PTP semaphore first before resetting PHY timestamping */ + ice_ptp_unlock(hw); + ice_ptp_reset_phy_timestamping(pf); + + return 0; + +err_unlock: + ice_ptp_unlock(hw); + + return err; +} + +#ifdef HAVE_PTP_CLOCK_INFO_ADJFINE +/** + * ice_ptp_adjfine - Adjust clock increment rate + * @info: the driver's PTP info structure + * @scaled_ppm: Parts per million with 16-bit fractional field + * + * Adjust the frequency of the clock by the indicated scaled ppm from the + * base frequency. + */ +static int ice_ptp_adjfine(struct ptp_clock_info *info, long scaled_ppm) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + u64 freq, divisor = 1000000ULL; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + s64 incval, diff; + int neg_adj = 0; + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(ice_pf_to_dev(pf), + "adjfreq not supported in locked mode\n"); + return -EPERM; + } + + incval = ice_base_incval(pf); + + if (scaled_ppm < 0) { + neg_adj = 1; + scaled_ppm = -scaled_ppm; + } + + while ((u64)scaled_ppm > div_u64(U64_MAX, incval)) { + /* handle overflow by scaling down the scaled_ppm and + * the divisor, losing some precision + */ + scaled_ppm >>= 2; + divisor >>= 2; + } + + freq = (incval * (u64)scaled_ppm) >> 16; + diff = div_u64(freq, divisor); + + if (neg_adj) + incval -= diff; + else + incval += diff; + + status = ice_ptp_write_incval_locked(hw, incval); + if (status) { + dev_err(ice_pf_to_dev(pf), "PTP failed to set incval, status %s\n", + ice_stat_str(status)); + return -EIO; + } + + return 0; +} + +#else +/** + * ice_ptp_adjfreq - Adjust the frequency of the clock + * @info: the driver's PTP info structure + * @ppb: Parts per billion adjustment from the base + * + * Adjust the frequency of the clock by the indicated parts per billion from the + * base frequency. 
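+ *
+ * The correction applied below is incval += incval * ppb / 1e9. As a worked
+ * example with a hypothetical incval of 2^32 (one nanosecond per tick in
+ * 32.32 fixed point) and ppb = 10, the increment grows by
+ * 2^32 * 10 / 1000000000 = 42 units, i.e. the clock is sped up by roughly
+ * 10 ns per second.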
+ */ +static int ice_ptp_adjfreq(struct ptp_clock_info *info, s32 ppb) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + s64 incval, freq, diff; + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(ice_pf_to_dev(pf), + "adjfreq not supported in locked mode\n"); + return -EPERM; + } + + incval = ice_base_incval(pf); + + freq = incval * ppb; + diff = div_s64(freq, 1000000000ULL); + incval += diff; + + status = ice_ptp_write_incval_locked(hw, incval); + if (status) { + dev_err(ice_pf_to_dev(pf), "PTP failed to set incval, status %s\n", + ice_stat_str(status)); + return -EIO; + } + + return 0; +} + +#endif +/** + * ice_ptp_extts_work - Workqueue task function + * @pf: Board private structure + * + * Service for PTP external clock event + */ +static void ice_ptp_extts_work(struct ice_pf *pf) +{ + struct ptp_clock_event event; + struct ice_hw *hw = &pf->hw; + u8 chan, tmr_idx; + u32 hi, lo; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + /* Event time is captured by one of the two matched registers + * GLTSYN_EVNT_L: 32 LSB of sampled time event + * GLTSYN_EVNT_H: 32 MSB of sampled time event + * Event is defined in GLTSYN_EVNT_0 register + */ + for (chan = 0; chan < GLTSYN_EVNT_H_IDX_MAX; chan++) { + /* Check if channel is enabled */ + if (pf->ptp.ext_ts_irq & (1 << chan)) { + lo = rd32(hw, GLTSYN_EVNT_L(chan, tmr_idx)); + hi = rd32(hw, GLTSYN_EVNT_H(chan, tmr_idx)); + event.timestamp = (((u64)hi) << 32) | lo; + event.type = PTP_CLOCK_EXTTS; + event.index = chan; + + /* Fire event */ + ptp_clock_event(pf->ptp.clock, &event); + pf->ptp.ext_ts_irq &= ~(1 << chan); + } + } +} + +/** + * ice_ptp_cfg_extts - Configure EXTTS pin and channel + * @pf: Board private structure + * @ena: true to enable; false to disable + * @chan: GPIO channel (0-3) + * @gpio_pin: GPIO pin + * @extts_flags: request flags from the ptp_extts_request.flags + */ +static int +ice_ptp_cfg_extts(struct ice_pf *pf, bool ena, unsigned int chan, u32 gpio_pin, + unsigned int extts_flags) +{ + u32 func, aux_reg, gpio_reg, irq_reg; + struct ice_hw *hw = &pf->hw; + u8 tmr_idx; + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(ice_pf_to_dev(pf), "Locked mode EXTTS not supported\n"); + return -EOPNOTSUPP; + } + + if (chan > (unsigned int)pf->ptp.info.n_ext_ts) + return -EINVAL; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + irq_reg = rd32(hw, PFINT_OICR_ENA); + + if (ena) { + /* Enable the interrupt */ + irq_reg |= PFINT_OICR_TSYN_EVNT_M; + aux_reg = GLTSYN_AUX_IN_0_INT_ENA_M; + +#define GLTSYN_AUX_IN_0_EVNTLVL_RISING_EDGE BIT(0) +#define GLTSYN_AUX_IN_0_EVNTLVL_FALLING_EDGE BIT(1) + + /* set event level to requested edge */ + if (extts_flags & PTP_FALLING_EDGE) + aux_reg |= GLTSYN_AUX_IN_0_EVNTLVL_FALLING_EDGE; + if (extts_flags & PTP_RISING_EDGE) + aux_reg |= GLTSYN_AUX_IN_0_EVNTLVL_RISING_EDGE; + + /* Write GPIO CTL reg. 
+ * 0x1 is input sampled by EVENT register(channel) + * + num_in_channels * tmr_idx + */ + func = 1 + chan + (tmr_idx * 3); + gpio_reg = ((func << GLGEN_GPIO_CTL_PIN_FUNC_S) & + GLGEN_GPIO_CTL_PIN_FUNC_M); + pf->ptp.ext_ts_chan |= (1 << chan); + } else { + /* clear the values we set to reset defaults */ + aux_reg = 0; + gpio_reg = 0; + pf->ptp.ext_ts_chan &= ~(1 << chan); + if (!pf->ptp.ext_ts_chan) + irq_reg &= ~PFINT_OICR_TSYN_EVNT_M; + } + + wr32(hw, PFINT_OICR_ENA, irq_reg); + wr32(hw, GLTSYN_AUX_IN(chan, tmr_idx), aux_reg); + wr32(hw, GLGEN_GPIO_CTL(gpio_pin), gpio_reg); + + return 0; +} + +/** + * ice_ptp_cfg_clkout - Configure clock to generate periodic wave + * @pf: Board private structure + * @chan: GPIO channel (0-3) + * @config: desired periodic clk configuration. NULL will disable channel + * @store: If set to true the values will be stored + * + * Configure the internal clock generator modules to generate the clock wave of + * specified period. + */ +int ice_ptp_cfg_clkout(struct ice_pf *pf, unsigned int chan, + struct ice_perout_channel *config, bool store) +{ + struct ice_hw *hw = &pf->hw; + u64 current_time, period, start_time; + u32 func, val, gpio_pin; + u8 tmr_idx; + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(ice_pf_to_dev(pf), + "locked mode PPS/PEROUT not supported\n"); + return -EIO; + } + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* 0. Reset mode & out_en in AUX_OUT */ + wr32(hw, GLTSYN_AUX_OUT(chan, tmr_idx), 0); + + /* If we're disabling the output, clear out CLKO and TGT and keep + * output level low + */ + if (!config || !config->ena) { + wr32(hw, GLTSYN_CLKO(chan, tmr_idx), 0); + wr32(hw, GLTSYN_TGT_L(chan, tmr_idx), 0); + wr32(hw, GLTSYN_TGT_H(chan, tmr_idx), 0); + + val = GLGEN_GPIO_CTL_PIN_DIR_M; + gpio_pin = pf->ptp.perout_channels[chan].gpio_pin; + wr32(hw, GLGEN_GPIO_CTL(gpio_pin), val); + + /* Store the value if requested */ + if (store) + memset(&pf->ptp.perout_channels[chan], 0, + sizeof(struct ice_perout_channel)); + + return 0; + } + period = config->period; + start_time = config->start_time; + gpio_pin = config->gpio_pin; + + /* 1. Write clkout with half of required period value */ + if (period & 0x1) { + dev_err(ice_pf_to_dev(pf), "CLK Period must be an even value\n"); + goto err; + } + + period >>= 1; + + /* For proper operation, the GLTSYN_CLKO must be larger than clock tick + */ +#define MIN_PULSE 3 + if (period <= MIN_PULSE || period > U32_MAX) { + dev_err(ice_pf_to_dev(pf), "CLK Period must be > %d && < 2^33", + MIN_PULSE * 2); + goto err; + } + + wr32(hw, GLTSYN_CLKO(chan, tmr_idx), lower_32_bits(period)); + + /* Allow time for programming before start_time is hit */ + current_time = ice_ptp_read_src_clk_reg(pf, NULL); + + /* if start time is in the past start the timer at the nearest second + * maintaining phase + */ + if (start_time < current_time) + start_time = roundup(current_time + NSEC_PER_MSEC, + NSEC_PER_SEC) + start_time % NSEC_PER_SEC; + + if (ice_is_e810(hw)) + start_time -= E810_OUT_PROP_DELAY_NS; + else + start_time -= ice_e822_pps_delay(pf->ptp.time_ref_freq); + + /* 2. Write TARGET time */ + wr32(hw, GLTSYN_TGT_L(chan, tmr_idx), lower_32_bits(start_time)); + wr32(hw, GLTSYN_TGT_H(chan, tmr_idx), upper_32_bits(start_time)); + + /* 3. Write AUX_OUT register */ + val = GLTSYN_AUX_OUT_0_OUT_ENA_M | GLTSYN_AUX_OUT_0_OUTMOD_M; + wr32(hw, GLTSYN_AUX_OUT(chan, tmr_idx), val); + + /* 4. 
Write GPIO CTL reg */
+	func = 8 + chan + (tmr_idx * 4);
+	val = GLGEN_GPIO_CTL_PIN_DIR_M |
+	      ((func << GLGEN_GPIO_CTL_PIN_FUNC_S) & GLGEN_GPIO_CTL_PIN_FUNC_M);
+	wr32(hw, GLGEN_GPIO_CTL(gpio_pin), val);
+
+	/* Store the value if requested */
+	if (store) {
+		memcpy(&pf->ptp.perout_channels[chan], config,
+		       sizeof(struct ice_perout_channel));
+		pf->ptp.perout_channels[chan].start_time %= NSEC_PER_SEC;
+	}
+
+	return 0;
+err:
+	dev_err(ice_pf_to_dev(pf), "PTP failed to cfg per_clk\n");
+	return -EFAULT;
+}
+
+/**
+ * ice_ptp_gettimex64 - Get the time of the clock
+ * @info: the driver's PTP info structure
+ * @ts: timespec64 structure to hold the current time value
+ * @sts: Optional parameter for holding a pair of system timestamps from
+ *       the system clock. Will be ignored if NULL is given.
+ *
+ * Read the device clock and return the correct value in ns, after converting
+ * it into a timespec struct.
+ */
+static int
+ice_ptp_gettimex64(struct ptp_clock_info *info, struct timespec64 *ts,
+		   struct ptp_system_timestamp *sts)
+{
+	struct ice_pf *pf = ptp_info_to_pf(info);
+	struct ice_hw *hw = &pf->hw;
+
+	if (!ice_ptp_lock(hw)) {
+		dev_err(ice_pf_to_dev(pf), "PTP failed to get time\n");
+		return -EBUSY;
+	}
+
+	ice_ptp_read_time(pf, ts, sts);
+	ice_ptp_unlock(hw);
+
+	return 0;
+}
+
+#ifndef HAVE_PTP_CLOCK_INFO_GETTIMEX64
+/**
+ * ice_ptp_gettime64 - Get the time of the clock
+ * @info: the driver's PTP info structure
+ * @ts: timespec64 structure to hold the current time value
+ *
+ * Read the device clock and return the correct value in ns, after converting
+ * it into a timespec struct.
+ */
+static int ice_ptp_gettime64(struct ptp_clock_info *info, struct timespec64 *ts)
+{
+	return ice_ptp_gettimex64(info, ts, NULL);
+}
+
+#ifndef HAVE_PTP_CLOCK_INFO_GETTIME64
+/**
+ * ice_ptp_gettime32 - Get the time of the clock
+ * @info: the driver's PTP info structure
+ * @ts: timespec structure to hold the current time value
+ *
+ * Read the device clock and return the correct value in ns, after converting
+ * it into a timespec struct.
+ */
+static int ice_ptp_gettime32(struct ptp_clock_info *info, struct timespec *ts)
+{
+	struct timespec64 ts64;
+
+	if (ice_ptp_gettime64(info, &ts64))
+		return -EFAULT;
+
+	*ts = timespec64_to_timespec(ts64);
+	return 0;
+}
+
+#endif /* !HAVE_PTP_CLOCK_INFO_GETTIME64 */
+#endif /* !HAVE_PTP_CLOCK_INFO_GETTIMEX64 */
+/**
+ * ice_ptp_settime64 - Set the time of the clock
+ * @info: the driver's PTP info structure
+ * @ts: timespec64 structure that holds the new time value
+ *
+ * Set the device clock to the user input value. The conversion from timespec
+ * to ns happens in the write function.
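+ *
+ * That conversion is the usual ns = tv_sec * NSEC_PER_SEC + tv_nsec; for
+ * example, { .tv_sec = 2, .tv_nsec = 500000000 } becomes 2500000000 ns.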
+ */ +static int +ice_ptp_settime64(struct ptp_clock_info *info, const struct timespec64 *ts) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct timespec64 ts64 = *ts; + struct ice_hw *hw = &pf->hw; + u8 i; + int err; + + /* For Vernier mode, we need to recalibrate after new settime + * Start with disabling timestamp block + */ + if (pf->ptp.port.link_up) + ice_ptp_port_phy_start(&pf->ptp.port, false); + + if (!ice_ptp_lock(hw)) { + err = -EBUSY; + goto exit; + } + + /* Disable periodic outputs */ + for (i = 0; i < info->n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, NULL, false); + + err = ice_ptp_write_init(pf, &ts64); + ice_ptp_unlock(hw); + + if (!err) + ice_ptp_update_cached_systime(pf); + + /* Reenable periodic outputs */ + for (i = 0; i < info->n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, &pf->ptp.perout_channels[i], + false); + + /* Recalibrate and re-enable timestamp block */ + if (pf->ptp.port.link_up) + ice_ptp_port_phy_start(&pf->ptp.port, true); +exit: + if (err) { + dev_err(ice_pf_to_dev(pf), "PTP failed to set time %d\n", err); + return err; + } + + return 0; +} + +#ifndef HAVE_PTP_CLOCK_INFO_GETTIME64 +/** + * ice_ptp_settime32 - Set the time of the clock + * @info: the driver's PTP info structure + * @ts: timespec structure that holds the new time value + * + * Set the device clock to the user input value. The conversion from timespec + * to ns happens in the write function. + */ +static int +ice_ptp_settime32(struct ptp_clock_info *info, const struct timespec *ts) +{ + struct timespec64 ts64 = timespec_to_timespec64(*ts); + + return ice_ptp_settime64(info, &ts64); +} +#endif /* !HAVE_PTP_CLOCK_INFO_GETTIME64 */ + +/** + * ice_ptp_adjtime_nonatomic - Do a non-atomic clock adjustment + * @info: the driver's PTP info structure + * @delta: Offset in nanoseconds to adjust the time by + */ +static int ice_ptp_adjtime_nonatomic(struct ptp_clock_info *info, s64 delta) +{ + struct timespec64 now, then; + + then = ns_to_timespec64(delta); + ice_ptp_gettimex64(info, &now, NULL); + now = timespec64_add(now, then); + + return ice_ptp_settime64(info, (const struct timespec64 *)&now); +} + + +/** + * ice_ptp_adjtime - Adjust the time of the clock by the indicated delta + * @info: the driver's PTP info structure + * @delta: Offset in nanoseconds to adjust the time by + */ +static int ice_ptp_adjtime(struct ptp_clock_info *info, s64 delta) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_hw *hw = &pf->hw; + struct device *dev; + int err; + u8 i; + + dev = ice_pf_to_dev(pf); + + if (pf->ptp.src_tmr_mode == ICE_SRC_TMR_MODE_LOCKED) { + dev_err(dev, "Locked Mode adjtime not supported\n"); + return -EIO; + } + + /* Hardware only supports atomic adjustments using signed 32-bit + * integers. For any adjustment outside this range, perform + * a non-atomic get->adjust->set flow. 
+ */ + if (delta > S32_MAX || delta < S32_MIN) { + dev_dbg(dev, "delta = %lld, adjtime non-atomic\n", delta); + return ice_ptp_adjtime_nonatomic(info, delta); + } + + if (!ice_ptp_lock(hw)) { + dev_err(dev, "PTP failed to acquire semaphore in adjtime\n"); + return -EBUSY; + } + + /* Disable periodic outputs */ + for (i = 0; i < info->n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, NULL, false); + + err = ice_ptp_write_adj(pf, delta, true); + + /* Reenable periodic outputs */ + for (i = 0; i < info->n_per_out; i++) + if (pf->ptp.perout_channels[i].ena) + ice_ptp_cfg_clkout(pf, i, &pf->ptp.perout_channels[i], + false); + + ice_ptp_unlock(hw); + + /* Check error after restarting periodic outputs and releasing the PTP + * hardware lock. + */ + if (err) { + dev_err(dev, "PTP failed to adjust time, err %d\n", err); + return err; + } + + ice_ptp_update_cached_systime(pf); + + return 0; +} + +/** + * ice_ptp_gpio_enable_e822 - Enable/disable ancillary features of PHC + * @info: the driver's PTP info structure + * @rq: The requested feature to change + * @on: Enable/disable flag + */ +static int +ice_ptp_gpio_enable_e822(struct ptp_clock_info *info, + struct ptp_clock_request *rq, int on) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_perout_channel clk_cfg = {0}; + int err; + + switch (rq->type) { + case PTP_CLK_REQ_PEROUT: + clk_cfg.gpio_pin = PPS_PIN_INDEX; + clk_cfg.period = ((rq->perout.period.sec * NSEC_PER_SEC) + + rq->perout.period.nsec); + clk_cfg.start_time = ((rq->perout.start.sec * NSEC_PER_SEC) + + rq->perout.start.nsec); + clk_cfg.ena = !!on; + + err = ice_ptp_cfg_clkout(pf, rq->perout.index, &clk_cfg, true); + break; + case PTP_CLK_REQ_EXTTS: + err = ice_ptp_cfg_extts(pf, !!on, rq->extts.index, + TIME_SYNC_PIN_INDEX, rq->extts.flags); + break; + default: + return -EOPNOTSUPP; + } + + return err; +} + +/** + * ice_ptp_gpio_enable_e810 - Enable/disable ancillary features of PHC + * @info: the driver's PTP info structure + * @rq: The requested feature to change + * @on: Enable/disable flag + */ +static int +ice_ptp_gpio_enable_e810(struct ptp_clock_info *info, + struct ptp_clock_request *rq, int on) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + struct ice_perout_channel clk_cfg = {0}; + unsigned int chan; + u32 gpio_pin; + int err; + + switch (rq->type) { + case PTP_CLK_REQ_PEROUT: + chan = rq->perout.index; + if (ice_is_e810t(&pf->hw)) { + if (chan == ice_e810t_pin_desc[SMA1].chan) + clk_cfg.gpio_pin = GPIO_20; + else if (chan == ice_e810t_pin_desc[SMA2].chan) + clk_cfg.gpio_pin = GPIO_22; + else + return -1; + } else if (chan == PPS_CLK_GEN_CHAN) { + clk_cfg.gpio_pin = PPS_PIN_INDEX; + } else { + clk_cfg.gpio_pin = chan; + } + + clk_cfg.period = ((rq->perout.period.sec * NSEC_PER_SEC) + + rq->perout.period.nsec); + clk_cfg.start_time = ((rq->perout.start.sec * NSEC_PER_SEC) + + rq->perout.start.nsec); + clk_cfg.ena = !!on; + + err = ice_ptp_cfg_clkout(pf, chan, &clk_cfg, true); + break; + case PTP_CLK_REQ_EXTTS: + chan = rq->extts.index; + if (ice_is_e810t(&pf->hw)) { + if (chan < 2) + gpio_pin = GPIO_21; + else + gpio_pin = GPIO_23; + } else { + gpio_pin = chan; + } + + err = ice_ptp_cfg_extts(pf, !!on, chan, gpio_pin, + rq->extts.flags); + break; + default: + return -EOPNOTSUPP; + } + + return err; +} + +#ifdef HAVE_PTP_CROSSTIMESTAMP +/** + * ice_ptp_get_syncdevicetime - Get the cross time stamp info + * @device: Current device time + * @system: System counter value read synchronously with device time + * @ctx: Context provided by 
timekeeping code + * + * Read device and system (ART) clock simultaneously and return the corrected + * clock values in ns. + */ +static int +ice_ptp_get_syncdevicetime(ktime_t *device, + struct system_counterval_t *system, + void *ctx) +{ + struct ice_pf *pf = (struct ice_pf *)ctx; + struct ice_hw *hw = &pf->hw; + u32 hh_lock, hh_art_ctl; + int i; + + /* Get the HW lock */ + hh_lock = rd32(hw, PFHH_SEM + (PFTSYN_SEM_BYTES * hw->pf_id)); + if (hh_lock & PFHH_SEM_BUSY_M) { + dev_err(ice_pf_to_dev(pf), "PTP failed to get hh lock\n"); + return -EFAULT; + } + + /* Start the ART and device clock sync sequence */ + hh_art_ctl = rd32(hw, GLHH_ART_CTL); + hh_art_ctl = hh_art_ctl | GLHH_ART_CTL_ACTIVE_M; + wr32(hw, GLHH_ART_CTL, hh_art_ctl); + +#define MAX_HH_LOCK_TRIES 100 + + for (i = 0; i < MAX_HH_LOCK_TRIES; i++) { + /* Wait for sync to complete */ + hh_art_ctl = rd32(hw, GLHH_ART_CTL); + if (hh_art_ctl & GLHH_ART_CTL_ACTIVE_M) { + udelay(1); + continue; + } else { + u32 hh_ts_lo, hh_ts_hi, tmr_idx; + u64 hh_ts; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_assoc; + /* Read ART time */ + hh_ts_lo = rd32(hw, GLHH_ART_TIME_L); + hh_ts_hi = rd32(hw, GLHH_ART_TIME_H); + hh_ts = ((u64)hh_ts_hi << 32) | hh_ts_lo; + *system = convert_art_ns_to_tsc(hh_ts); + /* Read Device source clock time */ + hh_ts_lo = rd32(hw, GLTSYN_HHTIME_L(tmr_idx)); + hh_ts_hi = rd32(hw, GLTSYN_HHTIME_H(tmr_idx)); + hh_ts = ((u64)hh_ts_hi << 32) | hh_ts_lo; + *device = ns_to_ktime(hh_ts); + break; + } + } + /* Release HW lock */ + hh_lock = rd32(hw, PFHH_SEM + (PFTSYN_SEM_BYTES * hw->pf_id)); + hh_lock = hh_lock & ~PFHH_SEM_BUSY_M; + wr32(hw, PFHH_SEM + (PFTSYN_SEM_BYTES * hw->pf_id), hh_lock); + + if (i == MAX_HH_LOCK_TRIES) + return -ETIMEDOUT; + + return 0; +} + +/** + * ice_ptp_getcrosststamp_e822 - Capture a device cross timestamp + * @info: the driver's PTP info structure + * @cts: The memory to fill the cross timestamp info + * + * Capture a cross timestamp between the ART and the device PTP hardware + * clock. Fill the cross timestamp information and report it back to the + * caller. + * + * This is only valid for E822 devices which have support for generating the + * cross timestamp via PCIe PTM. + * + * In order to correctly correlate the ART timestamp back to the TSC time, the + * CPU must have X86_FEATURE_TSC_KNOWN_FREQ. + */ +static int +ice_ptp_getcrosststamp_e822(struct ptp_clock_info *info, + struct system_device_crosststamp *cts) +{ + struct ice_pf *pf = ptp_info_to_pf(info); + return get_device_system_crosststamp(ice_ptp_get_syncdevicetime, + pf, NULL, cts); +} +#endif /* HAVE_PTP_CROSSTIMESTAMP */ + +/** + * ice_ptp_set_timestamp_mode - Setup driver for requested timestamp mode + * @pf: Board private structure + * @config: hwtstamp settings requested or saved + */ +static int +ice_ptp_set_timestamp_mode(struct ice_pf *pf, struct hwtstamp_config *config) +{ + /* Reserved for future extensions. 
+ */
+	if (config->flags)
+		return -EINVAL;
+
+	switch (config->tx_type) {
+	case HWTSTAMP_TX_OFF:
+		ice_set_tx_tstamp(pf, false);
+		break;
+	case HWTSTAMP_TX_ON:
+		ice_set_tx_tstamp(pf, true);
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	switch (config->rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		ice_set_rx_tstamp(pf, false);
+		break;
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+#ifdef HAVE_HWTSTAMP_FILTER_NTP_ALL
+	case HWTSTAMP_FILTER_NTP_ALL:
+#endif /* HAVE_HWTSTAMP_FILTER_NTP_ALL */
+	case HWTSTAMP_FILTER_ALL:
+		ice_set_rx_tstamp(pf, true);
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_get_ts_config - ioctl interface to read the timestamping config
+ * @pf: Board private structure
+ * @ifr: ioctl data
+ *
+ * Copy the timestamping config to user buffer
+ */
+int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr)
+{
+	struct hwtstamp_config *config;
+
+	if (!test_bit(ICE_FLAG_PTP, pf->flags))
+		return -EIO;
+
+	config = &pf->ptp.tstamp_config;
+
+	return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ?
+		-EFAULT : 0;
+}
+
+/**
+ * ice_ptp_set_ts_config - ioctl interface to control the timestamping
+ * @pf: Board private structure
+ * @ifr: ioctl data
+ *
+ * Get the user config and store it
+ */
+int ice_ptp_set_ts_config(struct ice_pf *pf, struct ifreq *ifr)
+{
+	struct hwtstamp_config config;
+	int err;
+
+	if (!test_bit(ICE_FLAG_PTP, pf->flags))
+		return -EAGAIN;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	err = ice_ptp_set_timestamp_mode(pf, &config);
+	if (err)
+		return err;
+
+	/* Save these settings for future reference */
+	pf->ptp.tstamp_config = config;
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
+		-EFAULT : 0;
+}
+
+/**
+ * ice_ptp_get_tx_hwtstamp_ver - Returns the Tx timestamp and valid bits
+ * @pf: Board specific private structure
+ * @tx_idx_req: Bitmap of timestamp indices to read
+ * @quad: Quad to read
+ * @ts: Timestamps read from PHY
+ * @ts_read: On return, if non-NULL: bitmap of read timestamp indices
+ *
+ * Read the value of the Tx timestamps from the registers and build a bitmap
+ * of successfully read indices.
+ *
+ * There are three possible return values:
+ *
+ * 0 = success
+ *
+ * -EIO = unable to read a register. This could be due to a variety of issues
+ * but should be very rare. It is up to the caller how to respond (retry,
+ * abandon, etc). Once this situation occurs, stop reading, as we cannot
+ * guarantee what state the PHY or Timestamp Unit is in.
+ *
+ * -EINVAL = (at least) one of the timestamps that was read did not have the
+ * TS_VALID bit set, and is probably zero. Such an index is still marked as
+ * read in ts_read (its TS_READY bit was cleared even though no valid
+ * timestamp was retrieved), so expect at least one set ts_read index whose
+ * ts[] entry is zero.
+ */ +static int ice_ptp_get_tx_hwtstamp_ver(struct ice_pf *pf, u64 tx_idx_req, + u8 quad, u64 *ts, u64 *ts_read) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + unsigned long i; + u64 ts_ns; + + + for_each_set_bit(i, (unsigned long *)&tx_idx_req, INDEX_PER_QUAD) { + ts[i] = 0x0; + + status = ice_read_phy_tstamp(hw, quad, i, &ts_ns); + if (status) { + dev_dbg(dev, "PTP Tx read failed, status %s\n", + ice_stat_str(status)); + return ice_status_to_errno(status); + } + + if (ts_read) + *ts_read |= BIT(i); + + if (!(ts_ns & ICE_PTP_TS_VALID)) { + dev_dbg(dev, "PTP tx invalid\n"); + continue; + } + + ts_ns = ice_ptp_extend_40b_ts(pf, ts_ns); + /* Each timestamp will be offset in the array of + * timestamps by the index's value. So the timestamp + * from index n will be in ts[n] position. + */ + ts[i] = ts_ns; + } + + return 0; +} + + +/** + * ice_ptp_get_tx_hwtstamp_ready - Get the Tx timestamp ready bitmap + * @pf: The PF private data structure + * @quad: Quad to read (0-4) + * @ts_ready: Bitmap where each bit set indicates that the corresponding + * timestamp register is ready to read + * + * Read the PHY timestamp ready registers for a particular bank. + */ +static void +ice_ptp_get_tx_hwtstamp_ready(struct ice_pf *pf, u8 quad, u64 *ts_ready) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + enum ice_status status; + u64 bitmap; + u32 val; + + status = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEMORY_STATUS_U, + &val); + if (status) { + dev_dbg(dev, "TX_MEMORY_STATUS_U read failed for quad %u\n", + quad); + return; + } + + bitmap = val; + + status = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEMORY_STATUS_L, + &val); + if (status) { + dev_dbg(dev, "TX_MEMORY_STATUS_L read failed for quad %u\n", + quad); + return; + } + + bitmap = (bitmap << 32) | val; + + *ts_ready = bitmap; + +} + +/** + * ice_ptp_tx_hwtstamp_vsi - Return the Tx timestamp for a specified VSI + * @vsi: lport corresponding VSI + * @idx: Index of timestamp read from QUAD memory + * @hwtstamp: Timestamps read from PHY + * + * Helper function for ice_ptp_tx_hwtstamp. + */ +static void +ice_ptp_tx_hwtstamp_vsi(struct ice_vsi *vsi, int idx, u64 hwtstamp) +{ + struct skb_shared_hwtstamps shhwtstamps = {}; + struct sk_buff *skb; + + skb = vsi->ptp_tx_skb[idx]; + if (!skb) + return; + + shhwtstamps.hwtstamp = ns_to_ktime(hwtstamp); + + vsi->ptp_tx_skb[idx] = NULL; + + /* Notify the stack and free the skb after we've unlocked */ + skb_tstamp_tx(skb, &shhwtstamps); + dev_kfree_skb_any(skb); + clear_bit(idx, vsi->ptp_tx_idx); +} + +/** + * ice_ptp_tx_hwtstamp - Return the Tx timestamps + * @pf: Board private structure + * + * Read the tx_memory_status registers for the PHY timestamp block. Determine + * which entries contain a valid ready timestamp. Read out the timestamp from + * the table. Convert the 40b timestamp value into the 64b nanosecond value + * consumed by the stack, and then report it as part of the related skb's + * shhwtstamps structure. + * + * Note that new timestamps might come in while we're reading the timestamp + * block. However, no interrupts will be triggered until the intr_threshold is + * crossed again. Thus we read the status registers in a loop until no more + * timestamps are ready. 
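+ *
+ * For reference, the index math below (assuming 4 ports per quad and 64
+ * timestamp slots per quad, so INDEX_PER_PORT == 16): logical port 5 maps
+ * to quad = 5 / 4 = 1 and qport = 5 % 4 = 1, so its slice of the ready
+ * bitmap is GENMASK_ULL(31, 16).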
+ */
+static void ice_ptp_tx_hwtstamp(struct ice_pf *pf)
+{
+	u8 quad, lport, qport;
+	struct ice_vsi *vsi;
+	int msk_shft;
+	u64 rdy_msk;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi)
+		return;
+
+	lport = vsi->port_info->lport;
+	qport = lport % ICE_PORTS_PER_QUAD;
+	quad = lport / ICE_PORTS_PER_QUAD;
+	msk_shft = qport * INDEX_PER_PORT;
+	rdy_msk = GENMASK_ULL(msk_shft + INDEX_PER_PORT - 1, msk_shft);
+
+	while (true) {
+		u64 ready_map = 0, valid_map = 0;
+		u64 hwtstamps[INDEX_PER_QUAD];
+		int i, ret;
+
+		ice_ptp_get_tx_hwtstamp_ready(pf, quad, &ready_map);
+		ready_map &= rdy_msk;
+		if (!ready_map)
+			break;
+
+		ret = ice_ptp_get_tx_hwtstamp_ver(pf, ready_map, quad,
+						  hwtstamps, &valid_map);
+		if (ret == -EIO)
+			break;
+
+		for_each_set_bit(i, (unsigned long *)&valid_map, INDEX_PER_QUAD)
+			if (test_bit(i, vsi->ptp_tx_idx))
+				ice_ptp_tx_hwtstamp_vsi(vsi, i, hwtstamps[i]);
+	}
+}
+
+/**
+ * ice_ptp_tx_hwtstamp_ext - Return the Tx timestamp
+ * @pf: Board private structure
+ *
+ * Read the value of the Tx timestamp from the registers, convert it into
+ * a value consumable by the stack, and store that result into the shhwtstamps
+ * struct before returning it up the stack.
+ */
+static void ice_ptp_tx_hwtstamp_ext(struct ice_pf *pf)
+{
+	struct ice_hw *hw = &pf->hw;
+	struct ice_vsi *vsi;
+	u8 lport;
+	int idx;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi || !vsi->ptp_tx)
+		return;
+	lport = hw->port_info->lport;
+
+	/* Don't attempt to timestamp if we don't have an skb */
+	for (idx = 0; idx < INDEX_PER_QUAD; idx++) {
+		struct skb_shared_hwtstamps shhwtstamps = {};
+		enum ice_status status;
+		struct sk_buff *skb;
+		u64 ts_ns;
+
+		skb = vsi->ptp_tx_skb[idx];
+		if (!skb)
+			continue;
+
+		status = ice_read_phy_tstamp(hw, lport, idx, &ts_ns);
+		if (status) {
+			dev_err(ice_pf_to_dev(pf), "PTP tx rd failed, status %s\n",
+				ice_stat_str(status));
+			vsi->ptp_tx_skb[idx] = NULL;
+			dev_kfree_skb_any(skb);
+			clear_bit(idx, vsi->ptp_tx_idx);
+			/* skb has been dropped; do not report a timestamp
+			 * for this index
+			 */
+			continue;
+		}
+
+		ts_ns = ice_ptp_extend_40b_ts(pf, ts_ns);
+
+		shhwtstamps.hwtstamp = ns_to_ktime(ts_ns);
+
+		vsi->ptp_tx_skb[idx] = NULL;
+
+		/* Notify the stack and free the skb after
+		 * we've unlocked
+		 */
+		skb_tstamp_tx(skb, &shhwtstamps);
+		dev_kfree_skb_any(skb);
+		clear_bit(idx, vsi->ptp_tx_idx);
+	}
+}
+
+/**
+ * ice_ptp_rx_hwtstamp - Check for an Rx timestamp
+ * @rx_ring: Ring to get the VSI info
+ * @rx_desc: Receive descriptor
+ * @skb: Particular skb to send timestamp with
+ *
+ * The driver receives a notification in the receive descriptor with timestamp.
+ * The timestamp is in ns, so we must convert the result first.
+ */
+void ice_ptp_rx_hwtstamp(struct ice_ring *rx_ring,
+			 union ice_32b_rx_flex_desc *rx_desc,
+			 struct sk_buff *skb)
+{
+	u32 ts_high;
+	u64 ts_ns;
+
+	/* Populate timesync data into skb */
+	if (rx_desc->wb.time_stamp_low & ICE_PTP_TS_VALID) {
+		struct skb_shared_hwtstamps *hwtstamps;
+
+		/* Use ice_ptp_extend_32b_ts directly, using the ring-specific
+		 * cached PHC value, rather than accessing the PF. This also
+		 * allows us to simply pass the upper 32bits of nanoseconds
+		 * directly. Calling ice_ptp_extend_40b_ts is unnecessary as
+		 * it would just discard these bits itself.
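+		 *
+		 * The extension idea, sketched (this is not the exact
+		 * helper, and it assumes the capture happened within one
+		 * ~4.29 s low-word window after the cached reading):
+		 *
+		 *	ext = (cached & ~(u64)0xFFFFFFFF) | ts_high;
+		 *	if (ts_high < (u32)cached)
+		 *		ext += BIT_ULL(32);
+		 *
+		 * i.e. splice the captured 32 bits onto the cached PHC
+		 * value, bumping one rollover window if the low word has
+		 * already wrapped.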
+ */ + ts_high = le32_to_cpu(rx_desc->wb.flex_ts.ts_high); + ts_ns = ice_ptp_extend_32b_ts(rx_ring->cached_systime, ts_high); + + hwtstamps = skb_hwtstamps(skb); + memset(hwtstamps, 0, sizeof(*hwtstamps)); + hwtstamps->hwtstamp = ns_to_ktime(ts_ns); + } +} + +/** + * ice_ptp_setup_pins_e810t - Setup PTP pins in sysfs + * @pf: pointer to the PF instance + * @info: PTP clock capabilities + */ +static void +ice_ptp_setup_pins_e810t(struct ice_pf *pf, struct ptp_clock_info *info) +{ + info->n_per_out = E810T_N_PER_OUT; + + if (!ice_is_feature_supported(pf, ICE_F_PTP_EXTTS)) + return; + + info->n_ext_ts = E810_N_EXT_TS; + info->n_pins = NUM_E810T_PTP_PINS; + info->verify = ice_e810t_verify_pin; +} + +/** + * ice_ptp_setup_pins_e810 - Setup PTP pins in sysfs + * @pf: pointer to the PF instance + * @info: PTP clock capabilities + */ +static void +ice_ptp_setup_pins_e810(struct ice_pf *pf, struct ptp_clock_info *info) +{ + info->n_per_out = E810_N_PER_OUT; + + if (!ice_is_feature_supported(pf, ICE_F_PTP_EXTTS)) + return; + + info->n_ext_ts = E810_N_EXT_TS; +} + +/** + * ice_ptp_setup_pins_e822 - Setup PTP pins in sysfs + * @pf: pointer to the PF instance + * @info: PTP clock capabilities + */ +static void +ice_ptp_setup_pins_e822(struct ice_pf *pf, struct ptp_clock_info *info) +{ + info->pps = 1; + info->n_per_out = 1; + if (!ice_is_feature_supported(pf, ICE_F_PTP_EXTTS)) + return; + info->n_ext_ts = 1; +} + +/** + * ice_ptp_set_funcs_e822 - Set specialized functions for E822 support + * @pf: Board private structure + * @info: PTP info to fill + * + * Assign functions to the PTP capabiltiies structure for E822 devices. + * Functions which operate across all device families should be set directly + * in ice_ptp_set_caps. Only add functions here which are distinct for E822 + * devices. + */ +static void +ice_ptp_set_funcs_e822(struct ice_pf *pf, struct ptp_clock_info *info) +{ +#ifdef HAVE_PTP_CROSSTIMESTAMP + if (boot_cpu_has(X86_FEATURE_ART) && + boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) + info->getcrosststamp = ice_ptp_getcrosststamp_e822; +#endif /* HAVE_PTP_CROSSTIMESTAMP */ + info->enable = ice_ptp_gpio_enable_e822; + + ice_ptp_setup_pins_e822(pf, info); +} + +/** + * ice_ptp_set_funcs_e810 - Set specialized functions for E810 support + * @pf: Board private structure + * @info: PTP info to fill + * + * Assign functions to the PTP capabiltiies structure for E810 devices. + * Functions which operate across all device families should be set directly + * in ice_ptp_set_caps. Only add functions here which are distinct for e810 + * devices. 
+ */ +static void +ice_ptp_set_funcs_e810(struct ice_pf *pf, struct ptp_clock_info *info) +{ + info->enable = ice_ptp_gpio_enable_e810; + + if (ice_is_e810t(&pf->hw)) + ice_ptp_setup_pins_e810t(pf, info); + else + ice_ptp_setup_pins_e810(pf, info); +} + +/** + * ice_ptp_set_caps - Set PTP capabilities + * @pf: Board private structure + */ +static void ice_ptp_set_caps(struct ice_pf *pf) +{ + struct ptp_clock_info *info = &pf->ptp.info; + struct device *dev = ice_pf_to_dev(pf); + + snprintf(info->name, sizeof(info->name) - 1, "%s-%s-clk", + dev_driver_string(dev), dev_name(dev)); + info->owner = THIS_MODULE; + info->max_adj = 999999999; + info->adjtime = ice_ptp_adjtime; +#ifdef HAVE_PTP_CLOCK_INFO_ADJFINE + info->adjfine = ice_ptp_adjfine; +#else + info->adjfreq = ice_ptp_adjfreq; +#endif +#if defined(HAVE_PTP_CLOCK_INFO_GETTIMEX64) + info->gettimex64 = ice_ptp_gettimex64; +#elif defined(HAVE_PTP_CLOCK_INFO_GETTIME64) + info->gettime64 = ice_ptp_gettime64; +#else + info->gettime = ice_ptp_gettime32; +#endif +#ifdef HAVE_PTP_CLOCK_INFO_GETTIME64 + info->settime64 = ice_ptp_settime64; +#else + info->settime = ice_ptp_settime32; +#endif /* HAVE_PTP_CLOCK_INFO_GETTIME64 */ + + if (ice_is_e810(&pf->hw)) + ice_ptp_set_funcs_e810(pf, info); + else + ice_ptp_set_funcs_e822(pf, info); +} + +/** + * ice_ptp_create_clock - Create PTP clock device for userspace + * @pf: Board private structure + * + * This function creates a new PTP clock device. It only creates one if we + * don't already have one. Will return error if it can't create one, but success + * if we already have a device. Should be used by ice_ptp_init to create clock + * initially, and prevent global resets from creating new clock devices. + */ +static long ice_ptp_create_clock(struct ice_pf *pf) +{ + struct ptp_clock_info *info; + struct ptp_clock *clock; + struct device *dev; + + /* No need to create a clock device if we already have one */ + if (pf->ptp.clock) + return 0; + + ice_ptp_set_caps(pf); + + info = &pf->ptp.info; + dev = ice_pf_to_dev(pf); + + /* Allocate memory for kernel pins interface */ + if (info->n_pins) { + info->pin_config = devm_kcalloc(dev, info->n_pins, + sizeof(*info->pin_config), + GFP_KERNEL); + if (!info->pin_config) { + info->n_pins = 0; + return ICE_ERR_NO_MEMORY; + } + } + + if (ice_is_e810t(&pf->hw)) { + /* Enable SMA controller */ + int err = ice_enable_e810t_sma_ctrl(&pf->hw, true); + + if (err) + return err; + + /* Read current SMA status */ + err = ice_get_e810t_sma_config(&pf->hw, info->pin_config); + if (err) + return err; + } + + /* Attempt to register the clock before enabling the hardware. */ + clock = ptp_clock_register(info, dev); + if (IS_ERR(clock)) + return PTR_ERR(clock); + + pf->ptp.clock = clock; + + return 0; +} + +/** + * ice_ptp_init_owner - Initialize PTP_1588_CLOCK device + * @pf: Board private structure + * + * Setup and initialize a PTP clock device that represents the device hardware + * clock. Save the clock index for other functions connected to the same + * hardware resource. 
+ */
+static int ice_ptp_init_owner(struct ice_pf *pf)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	struct timespec64 ts;
+	int err, itr = 1;
+	u8 src_idx;
+	u32 regval;
+
+	if (ice_is_e810(hw))
+		wr32(hw, GLTSYN_SYNC_DLAY, 0);
+
+	/* Clear some HW residue and enable source clock */
+	src_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+
+	/* Enable source clocks */
+	wr32(hw, GLTSYN_ENA(src_idx), GLTSYN_ENA_TSYN_ENA_M);
+
+	if (ice_is_e810(hw)) {
+		/* Enable PHY time sync */
+		status = ice_ptp_init_phy_e810(hw);
+		if (status) {
+			err = ice_status_to_errno(status);
+			goto err_exit;
+		}
+	}
+
+	/* Clear event status indications for auxiliary pins */
+	(void)rd32(hw, GLTSYN_STAT(src_idx));
+
+#define PF_SB_REM_DEV_CTL_PHY0 BIT(2)
+	if (!ice_is_e810(hw)) {
+		regval = rd32(hw, PF_SB_REM_DEV_CTL);
+		regval |= PF_SB_REM_DEV_CTL_PHY0;
+		wr32(hw, PF_SB_REM_DEV_CTL, regval);
+	}
+
+	/* Acquire the global hardware lock */
+	if (!ice_ptp_lock(hw)) {
+		err = -EBUSY;
+		goto err_exit;
+	}
+
+	/* Write the increment time value to PHY and LAN */
+	status = ice_ptp_write_incval(hw, ice_base_incval(pf));
+	if (status) {
+		err = ice_status_to_errno(status);
+		ice_ptp_unlock(hw);
+		goto err_exit;
+	}
+
+	ts = ktime_to_timespec64(ktime_get_real());
+	/* Write the initial Time value to PHY and LAN */
+	err = ice_ptp_write_init(pf, &ts);
+	if (err) {
+		ice_ptp_unlock(hw);
+		goto err_exit;
+	}
+
+	/* Release the global hardware lock */
+	ice_ptp_unlock(hw);
+
+	if (!ice_is_e810(hw)) {
+		/* Set window length for all the ports */
+		status = ice_ptp_set_vernier_wl(hw);
+		if (status) {
+			err = ice_status_to_errno(status);
+			goto err_exit;
+		}
+
+		/* Enable quad interrupts */
+		err = ice_ptp_tx_ena_intr(pf, true, itr);
+		if (err)
+			goto err_exit;
+
+		/* Reset timestamping memory in QUADs */
+		ice_ptp_reset_ts_memory(pf);
+	}
+
+	/* Ensure we have a clock device */
+	err = ice_ptp_create_clock(pf);
+	if (err)
+		goto err_clk;
+
+	/* Store the PTP clock index for other PFs */
+	ice_set_ptp_clock_index(pf);
+
+	return 0;
+
+err_clk:
+	pf->ptp.clock = NULL;
+err_exit:
+	dev_err(dev, "PTP failed to register clock, err %d\n", err);
+
+	return err;
+}
+
+/**
+ * ice_ptp_init - Initialize PTP hardware clock support
+ * @pf: Board private structure
+ *
+ * Setup the device for interacting with the PTP hardware clock for all
+ * functions, both the function that owns the clock hardware, and the
+ * functions connected to the clock hardware.
+ *
+ * The clock owner will allocate and register a ptp_clock with the
+ * PTP_1588_CLOCK infrastructure. All functions allocate a kthread and work
+ * items used for asynchronous work such as Tx timestamps and periodic work.
+ */
+void ice_ptp_init(struct ice_pf *pf)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_hw *hw = &pf->hw;
+	int err;
+
+	/* If this function owns the clock hardware, it must allocate and
+	 * configure the PTP clock device to represent it.
+	 */
+	if (hw->func_caps.ts_func_info.src_tmr_owned) {
+		err = ice_ptp_init_owner(pf);
+		if (err)
+			return;
+	}
+
+	/* Disable timestamping for both Tx and Rx */
+	ice_ptp_cfg_timestamp(pf, false);
+
+	/* Initialize work structures */
+	mutex_init(&pf->ptp.port.ps_lock);
+	pf->ptp.port.link_up = false;
+	pf->ptp.port.port_num = pf->hw.pf_id;
+	INIT_WORK(&pf->ptp.port.ov_task, ice_ptp_wait_for_offset_valid);
+
+	/* Allocate workqueue for 2nd part of Vernier calibration */
+	pf->ptp.ov_wq = alloc_workqueue("%s_ov", WQ_MEM_RECLAIM, 0,
+					KBUILD_MODNAME);
+	if (!pf->ptp.ov_wq) {
+		err = -ENOMEM;
+		goto err_wq;
+	}
+
+	set_bit(ICE_FLAG_PTP, pf->flags);
+	dev_info(dev, "PTP init successful\n");
+
+	if (hw->func_caps.ts_func_info.src_tmr_owned && !ice_is_e810(hw))
+		ice_cgu_init_state(pf);
+	return;
+
+err_wq:
+	/* If we registered a PTP clock, release it */
+	if (pf->ptp.clock) {
+		ptp_clock_unregister(pf->ptp.clock);
+		pf->ptp.clock = NULL;
+	}
+	dev_err(dev, "PTP failed %d\n", err);
+}
+
+/**
+ * ice_ptp_release - Disable the driver/HW support and unregister the clock
+ * @pf: Board private structure
+ *
+ * This function handles the cleanup work required after initialization,
+ * clearing out the important information and unregistering the clock.
+ */
+void ice_ptp_release(struct ice_pf *pf)
+{
+	struct ice_vsi *vsi;
+	char *dev_name;
+	u8 quad, i;
+
+	if (!pf)
+		return;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi || !test_bit(ICE_FLAG_PTP, pf->flags))
+		return;
+
+	dev_name = vsi->netdev->name;
+
+	/* Disable timestamping for both Tx and Rx */
+	ice_ptp_cfg_timestamp(pf, false);
+	/* Clear PHY bank residues if any */
+	quad = vsi->port_info->lport / ICE_PORTS_PER_QUAD;
+
+	if (!ice_is_e810(&pf->hw) && !pf->hw.reset_ongoing) {
+		u64 tx_idx = ~((u64)0);
+		u64 ts[INDEX_PER_QUAD];
+
+		ice_ptp_get_tx_hwtstamp_ver(pf, tx_idx, quad, ts, NULL);
+	} else {
+		ice_ptp_tx_hwtstamp_ext(pf);
+	}
+
+	/* Release any pending skb */
+	ice_ptp_rel_all_skb(pf);
+
+	clear_bit(ICE_FLAG_PTP, pf->flags);
+
+	pf->ptp.port.link_up = false;
+	if (pf->ptp.ov_wq) {
+		destroy_workqueue(pf->ptp.ov_wq);
+		pf->ptp.ov_wq = NULL;
+	}
+
+	if (!pf->ptp.clock)
+		return;
+
+	/* Disable periodic outputs */
+	for (i = 0; i < pf->ptp.info.n_per_out; i++)
+		if (pf->ptp.perout_channels[i].ena)
+			ice_ptp_cfg_clkout(pf, i, NULL, false);
+
+	ice_clear_ptp_clock_index(pf);
+	ptp_clock_unregister(pf->ptp.clock);
+	pf->ptp.clock = NULL;
+
+	/* Free pin config */
+	if (pf->ptp.info.pin_config) {
+		devm_kfree(ice_pf_to_dev(pf), pf->ptp.info.pin_config);
+		pf->ptp.info.pin_config = NULL;
+	}
+
+	dev_info(ice_pf_to_dev(pf), "removed Clock from %s\n", dev_name);
+}
+
+/**
+ * ice_ptp_set_timestamp_offsets - Calculate timestamp offsets on each port
+ * @pf: Board private structure
+ *
+ * This function calculates the timestamp Tx/Rx offset on each port after at
+ * least one packet was sent/received by the PHY.
+ */
+void ice_ptp_set_timestamp_offsets(struct ice_pf *pf)
+{
+	if (!test_bit(ICE_FLAG_PTP, pf->flags))
+		return;
+
+	if (atomic_read(&pf->ptp.phy_reset_lock))
+		return;
+
+	ice_ptp_check_offset_valid(&pf->ptp.port);
+}
+
+/**
+ * ice_clean_ptp_subtask - Handle the service task events
+ * @pf: Board private structure
+ */
+void ice_clean_ptp_subtask(struct ice_pf *pf)
+{
+	if (!test_bit(ICE_FLAG_PTP, pf->flags))
+		return;
+
+	ice_ptp_update_cached_systime(pf);
+	if (test_and_clear_bit(ICE_PTP_EXT_TS_READY, pf->state))
+		ice_ptp_extts_work(pf);
+	if (test_and_clear_bit(ICE_PTP_TX_TS_READY, pf->state)) {
+		struct ice_hw *hw = &pf->hw;
+
+		if (ice_is_e810(hw))
+			ice_ptp_tx_hwtstamp_ext(pf);
+		else
+			ice_ptp_tx_hwtstamp(pf);
+	}
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.h b/drivers/net/ethernet/intel/ice/ice_ptp.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4ac0fa76523f96f91279efb8c82f5ab084dad60
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_ptp.h
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_PTP_H_
+#define _ICE_PTP_H_
+
+#include
+#include
+#include
+#include
+#include
+
+#include "ice_ptp_hw.h"
+
+enum ice_ptp_pin {
+	GPIO_20 = 0,
+	GPIO_21,
+	GPIO_22,
+	GPIO_23,
+	NUM_ICE_PTP_PIN
+};
+
+#define ICE_E810T_SMA1_CTRL_MASK	(ICE_E810T_P1_SMA1_DIR_EN | \
+					 ICE_E810T_P1_SMA1_TX_EN)
+#define ICE_E810T_SMA2_CTRL_MASK	(ICE_E810T_P1_SMA2_UFL2_RX_DIS | \
+					 ICE_E810T_P1_SMA2_DIR_EN | \
+					 ICE_E810T_P1_SMA2_TX_EN)
+#define ICE_E810T_SMA_CTRL_MASK		(ICE_E810T_SMA1_CTRL_MASK | \
+					 ICE_E810T_SMA2_CTRL_MASK)
+
+enum ice_e810t_ptp_pins {
+	GNSS = 0,
+	SMA1,
+	UFL1,
+	SMA2,
+	UFL2,
+	NUM_E810T_PTP_PINS
+};
+
+#define ICE_SUBDEV_ID_E810_T	0x000E
+
+static inline bool ice_is_e810t(struct ice_hw *hw)
+{
+	return (hw->device_id == ICE_DEV_ID_E810C_SFP &&
+		hw->subsystem_device_id == ICE_SUBDEV_ID_E810_T);
+}
+
+struct ice_perout_channel {
+	bool ena;
+	u32 gpio_pin;
+	u64 period;
+	u64 start_time;
+};
+
+/**
+ * struct ice_ptp_port - data used to initialize an external port for PTP
+ *
+ * This structure contains data indicating whether a single external port is
+ * ready for PTP functionality. It is used to track the port initialization
+ * and determine when the port's PHY offset is valid.
+ *
+ * @ov_task: work task for tracking when PHY offset is valid
+ * @tx_offset_ready: indicates the Tx offset for the port is ready
+ * @rx_offset_ready: indicates the Rx offset for the port is ready
+ * @tx_offset_lock: lock used to protect the tx_offset_ready field
+ * @rx_offset_lock: lock used to protect the rx_offset_ready field
+ * @ps_lock: mutex used to protect the overall PTP PHY start procedure
+ * @link_up: indicates whether the link is up
+ * @tx_fifo_busy_cnt: number of times the Tx FIFO was busy
+ * @port_num: the port number this structure represents
+ */
+struct ice_ptp_port {
+	struct work_struct ov_task;
+	atomic_t tx_offset_ready;
+	atomic_t rx_offset_ready;
+	atomic_t tx_offset_lock;
+	atomic_t rx_offset_lock;
+	struct mutex ps_lock; /* protects overall PTP PHY start procedure */
+	bool link_up;
+	u8 tx_fifo_busy_cnt;
+	u8 port_num;
+};
+
+#define GLTSYN_TGT_H_IDX_MAX	4
+
+/**
+ * struct ice_ptp - data used for integrating with CONFIG_PTP_1588_CLOCK
+ * @port: data for the PHY port initialization procedure
+ * @cached_phc_time: a cached copy of the PHC time for timestamp extension
+ * @ext_ts_chan: the external timestamp channel in use
+ * @ext_ts_irq: the external timestamp IRQ in use
+ * @phy_reset_lock: bit lock for preventing PHY start while resetting
+ * @ov_wq: work queue for the offset validity task
+ * @perout_channels: periodic output data
+ * @info: structure defining PTP hardware capabilities
+ * @clock: pointer to registered PTP clock device
+ * @tstamp_config: hardware timestamping configuration
+ * @time_ref_freq: current device timer frequency (for E822 devices)
+ * @src_tmr_mode: current device timer mode (locked or nanoseconds)
+ */
+struct ice_ptp {
+	struct ice_ptp_port port;
+	u64 cached_phc_time;
+	u8 ext_ts_chan;
+	u8 ext_ts_irq;
+	atomic_t phy_reset_lock;
+	struct workqueue_struct *ov_wq;
+	struct ice_perout_channel perout_channels[GLTSYN_TGT_H_IDX_MAX];
+	struct ptp_clock_info info;
+	struct ptp_clock *clock;
+	struct hwtstamp_config tstamp_config;
+	enum ice_time_ref_freq time_ref_freq;
+	enum ice_src_tmr_mode src_tmr_mode;
+};
+
+#define __ptp_port_to_ptp(p) \
+	container_of((p), struct ice_ptp, port)
+#define ptp_port_to_pf(p) \
+	container_of(__ptp_port_to_ptp((p)), struct ice_pf, ptp)
+
+#define __ptp_info_to_ptp(i) \
+	container_of((i), struct ice_ptp, info)
+#define ptp_info_to_pf(i) \
+	container_of(__ptp_info_to_ptp((i)), struct ice_pf, ptp)
+
+#define MAC_RX_LINK_COUNTER(_port)	(0x600090 + 0x1000 * (_port))
+#define PFTSYN_SEM_BYTES		4
+#define PTP_SHARED_CLK_IDX_VALID	BIT(31)
+#define PHY_TIMER_SELECT_VALID_BIT	0
+#define PHY_TIMER_SELECT_BIT		1
+#define PHY_TIMER_SELECT_MASK		0xFFFFFFFC
+#define TS_CMD_MASK_EXT			0xFF
+#define TS_CMD_MASK			0xF
+#define SYNC_EXEC_CMD			0x3
+#define ICE_PTP_TS_VALID		BIT(0)
+#define FIFO_EMPTY			BIT(2)
+#define FIFO_OK				0xFF
+#define ICE_PTP_FIFO_NUM_CHECKS		5
+/* PHY, quad and port definitions */
+#define INDEX_PER_QUAD			64
+#define INDEX_PER_PORT			(INDEX_PER_QUAD / ICE_PORTS_PER_QUAD)
+#define TX_INTR_QUAD_MASK		0x03
+/* Per-channel register definitions */
+#define GLTSYN_AUX_OUT(_chan, _idx)	(GLTSYN_AUX_OUT_0(_idx) + ((_chan) * 8))
+#define GLTSYN_AUX_IN(_chan, _idx)	(GLTSYN_AUX_IN_0(_idx) + ((_chan) * 8))
+#define GLTSYN_CLKO(_chan, _idx)	(GLTSYN_CLKO_0(_idx) + ((_chan) * 8))
+#define GLTSYN_TGT_L(_chan, _idx)	(GLTSYN_TGT_L_0(_idx) + ((_chan) * 16))
+#define GLTSYN_TGT_H(_chan, _idx)	(GLTSYN_TGT_H_0(_idx) + ((_chan) * 16))
+#define GLTSYN_EVNT_L(_chan, _idx)	(GLTSYN_EVNT_L_0(_idx) + ((_chan) * 16))
+#define GLTSYN_EVNT_H(_chan, _idx)	(GLTSYN_EVNT_H_0(_idx) + ((_chan) * 16))
+#define GLTSYN_EVNT_H_IDX_MAX		3
+
+/* Pin definitions for PTP PPS out */
+#define PPS_CLK_GEN_CHAN		3
+#define PPS_CLK_SRC_CHAN		2
+#define PPS_PIN_INDEX			5
+#define TIME_SYNC_PIN_INDEX		4
+#define E810_N_EXT_TS			3
+#define E810_N_PER_OUT			4
+#define E810T_N_PER_OUT			3
+/* Macros to derive the low and high addresses for PHY */
+#define LOWER_ADDR_SIZE			16
+/* Macros to derive offsets for TimeStampLow and TimeStampHigh */
+#define PORT_TIMER_ASSOC(_i)		(0x0300102C + ((_i) * 256))
+#define ETH_GLTSYN_ENA(_i)		(0x03000348 + ((_i) * 4))
+
+/* Time allowed for programming periodic clock output */
+#define START_OFFS_NS			100000000
+
+#if IS_ENABLED(CONFIG_PTP_1588_CLOCK)
+struct ice_pf;
+int ice_ptp_set_ts_config(struct ice_pf *pf, struct ifreq *ifr);
+int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr);
+int ice_ptp_get_ts_idx(struct ice_vsi *vsi);
+int ice_get_ptp_clock_index(struct ice_pf *pf);
+
+void ice_clean_ptp_subtask(struct ice_pf *pf);
+void ice_ptp_set_timestamp_offsets(struct ice_pf *pf);
+u64
+ice_ptp_read_src_clk_reg(struct ice_pf *pf, struct ptp_system_timestamp *sts);
+void ice_ptp_rx_hwtstamp(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc,
+			 struct sk_buff *skb);
+void ice_ptp_init(struct ice_pf *pf);
+void ice_ptp_release(struct ice_pf *pf);
+int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup);
+int ice_ptp_check_rx_fifo(struct ice_pf *pf, u8 port);
+int ptp_ts_enable(struct ice_pf *pf, u8 port, bool enable);
+int ice_ptp_cfg_clkout(struct ice_pf *pf, unsigned int chan,
+		       struct ice_perout_channel *config, bool store);
+int ice_ptp_update_incval(struct ice_pf *pf, enum ice_time_ref_freq time_ref_freq,
+			  enum ice_src_tmr_mode src_tmr_mode);
+int ice_ptp_get_incval(struct ice_pf *pf, enum ice_time_ref_freq *time_ref_freq,
+		       enum ice_src_tmr_mode *src_tmr_mode);
+#else /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */
+static inline int ice_ptp_set_ts_config(struct ice_pf __always_unused *pf,
+					struct ifreq __always_unused *ifr)
+{
+	return 0;
+}
+
+static inline int ice_ptp_get_ts_config(struct ice_pf __always_unused *pf,
+					struct ifreq __always_unused *ifr)
+{
+	return 0;
+}
+
+static inline int
+ice_ptp_check_rx_fifo(struct ice_pf __always_unused *pf,
+		      u8 __always_unused port)
+{
+	return 0;
+}
+
+static inline int ice_ptp_get_ts_idx(struct ice_vsi __always_unused *vsi)
+{
+	return 0;
+}
+
+static inline int ice_get_ptp_clock_index(struct ice_pf __always_unused *pf)
+{
+	return 0;
+}
+static inline void ice_clean_ptp_subtask(struct ice_pf *pf) { }
+static inline void ice_ptp_set_timestamp_offsets(struct ice_pf *pf) { }
+static inline void ice_ptp_rx_hwtstamp(struct ice_ring *rx_ring,
+				       union ice_32b_rx_flex_desc *rx_desc,
+				       struct sk_buff *skb) { }
+static inline void ice_ptp_init(struct ice_pf *pf) { }
+static inline void ice_ptp_release(struct ice_pf *pf) { }
+static inline int ice_ptp_link_change(struct ice_pf *pf, u8 port, bool linkup)
+{ return 0; }
+#endif /* IS_ENABLED(CONFIG_PTP_1588_CLOCK) */
+#endif /* _ICE_PTP_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c0d390e62764ae2a19c576b9b439eebd1aa1f82
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_PTP_CONSTS_H_
+#define _ICE_PTP_CONSTS_H_
+
+/* Constant definitions related to the hardware clock used for PTP 1588
+ * features and functionality.
+ */
+/* Constants defined for the PTP 1588 clock hardware. */
+
+/*
+ * struct ice_time_ref_info_e822
+ *
+ * E822 hardware can use different sources as the reference for the PTP
+ * hardware clock. Each clock has different characteristics such as a slightly
+ * different frequency, etc.
+ *
+ * This lookup table defines several constants that depend on the current time
+ * reference. See the struct ice_time_ref_info_e822 for information about the
+ * meaning of each constant.
+ */
+const struct ice_time_ref_info_e822 e822_time_ref[NUM_ICE_TIME_REF_FREQ] = {
+	/* ICE_TIME_REF_FREQ_25_000 -> 25 MHz */
+	{
+		/* pll_freq */
+		823437500, /* 823.4375 MHz PLL */
+		/* nominal_incval */
+		0x136e44fabULL,
+		/* pps_delay */
+		11,
+	},
+
+	/* ICE_TIME_REF_FREQ_122_880 -> 122.88 MHz */
+	{
+		/* pll_freq */
+		783360000, /* 783.36 MHz */
+		/* nominal_incval */
+		0x146cc2177ULL,
+		/* pps_delay */
+		12,
+	},
+
+	/* ICE_TIME_REF_FREQ_125_000 -> 125 MHz */
+	{
+		/* pll_freq */
+		796875000, /* 796.875 MHz */
+		/* nominal_incval */
+		0x141414141ULL,
+		/* pps_delay */
+		12,
+	},
+
+	/* ICE_TIME_REF_FREQ_153_600 -> 153.6 MHz */
+	{
+		/* pll_freq */
+		816000000, /* 816 MHz */
+		/* nominal_incval */
+		0x139b9b9baULL,
+		/* pps_delay */
+		12,
+	},
+
+	/* ICE_TIME_REF_FREQ_156_250 -> 156.25 MHz */
+	{
+		/* pll_freq */
+		830078125, /* 830.078125 MHz */
+		/* nominal_incval */
+		0x134679aceULL,
+		/* pps_delay */
+		11,
+	},
+
+	/* ICE_TIME_REF_FREQ_245_760 -> 245.76 MHz */
+	{
+		/* pll_freq */
+		783360000, /* 783.36 MHz */
+		/* nominal_incval */
+		0x146cc2177ULL,
+		/* pps_delay */
+		12,
+	},
+};
+
+#endif /* _ICE_PTP_CONSTS_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
new file mode 100644
index 0000000000000000000000000000000000000000..7e1c33bed88d752cb2da2a6514ff008094061acd
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
@@ -0,0 +1,1774 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#include "ice_type.h"
+#include "ice_common.h"
+#include "ice_ptp_hw.h"
+#include "ice_ptp_consts.h"
+
+/* Low level functions for interacting with and managing the device clock used
+ * for the Precision Time Protocol.
+ *
+ * The ice hardware represents the current time using three registers:
+ *
+ *    GLTSYN_TIME_H     GLTSYN_TIME_L     GLTSYN_TIME_R
+ *  +---------------+ +---------------+ +---------------+
+ *  |    32 bits    | |    32 bits    | |    32 bits    |
+ *  +---------------+ +---------------+ +---------------+
+ *
+ * The registers are incremented every clock tick using a 40bit increment
+ * value defined over two registers:
+ *
+ *                     GLTSYN_INCVAL_H   GLTSYN_INCVAL_L
+ *                    +---------------+ +---------------+
+ *                    |    8 bits     | |    32 bits    |
+ *                    +---------------+ +---------------+
+ *
+ * The increment value is added to the GLTSYN_TIME_R and GLTSYN_TIME_L
+ * registers every clock source tick. Depending on the specific device
+ * configuration, the clock source frequency could be one of a number of
+ * values.
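+ *
+ * As an illustrative note (inferred from the e822_time_ref table in
+ * ice_ptp_consts.h rather than stated by the hardware spec): the increment
+ * value appears to be the clock period expressed in fixed-point units of
+ * 2^-32 nanoseconds, so a nominal incval can be computed as
+ *
+ *   incval = round(2^32 * 10^9 / pll_freq)
+ *
+ * e.g. round(2^32 * 10^9 / 823437500) = 0x136e44fab, which matches the
+ * nominal_incval entry for the 25 MHz TIME_REF.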
+ *
+ * For E810 devices, the increment frequency is 812.5 MHz.
+ *
+ * For E822 devices the clock can be derived from different sources, and the
+ * increment has an effective frequency of one of the following:
+ * - 823.4375 MHz
+ * - 783.36 MHz
+ * - 796.875 MHz
+ * - 816 MHz
+ * - 830.078125 MHz
+ * - 783.36 MHz
+ *
+ * The hardware captures timestamps in the PHY for incoming packets, and for
+ * outgoing packets on request. To support this, the PHY maintains a timer
+ * that matches the lower 64 bits of the global source timer.
+ *
+ * In order to ensure that the PHY timers and the source timer are equivalent,
+ * shadow registers are used to prepare the desired initial values. A special
+ * sync command is issued to trigger copying from the shadow registers into
+ * the appropriate source and PHY registers simultaneously.
+ *
+ * The driver supports devices which have different PHYs with subtly different
+ * mechanisms to program and control the timers. We divide the devices into
+ * families named after the first major device, E810 and similar devices, and
+ * E822 and similar devices.
+ *
+ * - E822 based devices have additional support for fine grained Vernier
+ *   calibration which requires significant setup
+ * - The layout of timestamp data in the PHY register blocks is different
+ * - The way timer synchronization commands are issued is different.
+ *
+ * To support this, very low level functions have an e810 or e822 suffix
+ * indicating what type of device they work on. Higher level abstractions for
+ * tasks that can be done on both devices do not have the suffix and will
+ * correctly look up the appropriate low level function when running.
+ *
+ * Functions which only make sense on a single device family may not have
+ * a suitable generic implementation.
+ */
+
+/**
+ * ice_get_ptp_src_clock_index - determine source clock index
+ * @hw: pointer to HW struct
+ *
+ * Determine the source clock index currently in use, based on device
+ * capabilities reported during initialization.
+ */
+u8 ice_get_ptp_src_clock_index(struct ice_hw *hw)
+{
+	return hw->func_caps.ts_func_info.tmr_index_assoc;
+}
+
+/**
+ * ice_ptp_read_src_incval - Read source timer increment value
+ * @hw: pointer to HW struct
+ *
+ * Read the increment value of the source timer and return it.
+ */
+u64 ice_ptp_read_src_incval(struct ice_hw *hw)
+{
+	u32 lo, hi;
+	u8 tmr_idx;
+
+	tmr_idx = ice_get_ptp_src_clock_index(hw);
+
+	lo = rd32(hw, GLTSYN_INCVAL_L(tmr_idx));
+	hi = rd32(hw, GLTSYN_INCVAL_H(tmr_idx));
+
+	return ((u64)(hi & INCVAL_HIGH_M) << 32) | lo;
+}
+
+/* E822 family functions
+ *
+ * The following functions operate on the E822 family of devices.
+ */
+
+/**
+ * ice_fill_phy_msg_e822 - Fill message data for a PHY register access
+ * @msg: the PHY message buffer to fill in
+ * @port: the port to access
+ * @offset: the register offset
+ */
+static void
+ice_fill_phy_msg_e822(struct ice_sbq_msg_input *msg, u8 port, u16 offset)
+{
+	int phy_port, phy, quadtype;
+
+	phy_port = port % ICE_PORTS_PER_PHY;
+	phy = port / ICE_PORTS_PER_PHY;
+	quadtype = (port / ICE_PORTS_PER_QUAD) % ICE_NUM_QUAD_TYPE;
+
+	if (quadtype == 0) {
+		msg->msg_addr_low = P_Q0_L(P_0_BASE + offset, phy_port);
+		msg->msg_addr_high = P_Q0_H(P_0_BASE + offset, phy_port);
+	} else {
+		msg->msg_addr_low = P_Q1_L(P_4_BASE + offset, phy_port);
+		msg->msg_addr_high = P_Q1_H(P_4_BASE + offset, phy_port);
+	}
+
+	if (phy == 0)
+		msg->dest_dev = rmn_0;
+	else if (phy == 1)
+		msg->dest_dev = rmn_1;
+	else
+		msg->dest_dev = rmn_2;
+}
+
+/**
+ * ice_read_phy_reg_e822_lp - Read a PHY register
+ * @hw: pointer to the HW struct
+ * @port: PHY port to read from
+ * @offset: PHY register offset to read
+ * @val: on return, the contents read from the PHY
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Read a PHY register for the given port over the device sideband queue.
+ */
+static enum ice_status
+ice_read_phy_reg_e822_lp(struct ice_hw *hw, u8 port, u16 offset, u32 *val,
+			 bool lock_sbq)
+{
+	struct ice_sbq_msg_input msg = {0};
+	enum ice_status status;
+
+	ice_fill_phy_msg_e822(&msg, port, offset);
+	msg.opcode = ice_sbq_msg_rd;
+
+	status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n",
+			  status);
+		return status;
+	}
+
+	*val = msg.data;
+
+	return 0;
+}
+
+enum ice_status
+ice_read_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 *val)
+{
+	return ice_read_phy_reg_e822_lp(hw, port, offset, val, true);
+}
+
+/**
+ * ice_write_phy_reg_e822_lp - Write a PHY register
+ * @hw: pointer to the HW struct
+ * @port: PHY port to write to
+ * @offset: PHY register offset to write
+ * @val: The value to write to the register
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Write a PHY register for the given port over the device sideband queue.
+ */
+static enum ice_status
+ice_write_phy_reg_e822_lp(struct ice_hw *hw, u8 port, u16 offset, u32 val,
+			  bool lock_sbq)
+{
+	struct ice_sbq_msg_input msg = {0};
+	enum ice_status status;
+
+	ice_fill_phy_msg_e822(&msg, port, offset);
+	msg.opcode = ice_sbq_msg_wr;
+	msg.data = val;
+
+	status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+enum ice_status
+ice_write_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 val)
+{
+	return ice_write_phy_reg_e822_lp(hw, port, offset, val, true);
+}
+
+/**
+ * ice_fill_quad_msg_e822 - Fill message data for quad register access
+ * @msg: the PHY message buffer to fill in
+ * @quad: the quad to access
+ * @offset: the register offset
+ *
+ * Fill a message buffer for accessing a register in a quad shared between
+ * multiple PHYs.
+ */
+static void
+ice_fill_quad_msg_e822(struct ice_sbq_msg_input *msg, u8 quad, u16 offset)
+{
+	u32 addr;
+
+	msg->dest_dev = rmn_0;
+
+	if ((quad % ICE_NUM_QUAD_TYPE) == 0)
+		addr = Q_0_BASE + offset;
+	else
+		addr = Q_1_BASE + offset;
+
+	msg->msg_addr_low = ICE_LO_WORD(addr);
+	msg->msg_addr_high = ICE_HI_WORD(addr);
+}
+
+/**
+ * ice_read_quad_reg_e822_lp - Read a PHY quad register
+ * @hw: pointer to the HW struct
+ * @quad: quad to read from
+ * @offset: quad register offset to read
+ * @val: on return, the contents read from the quad
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Read a quad register over the device sideband queue. Quad registers are
+ * shared between multiple PHYs.
+ */
+static enum ice_status
+ice_read_quad_reg_e822_lp(struct ice_hw *hw, u8 quad, u16 offset, u32 *val,
+			  bool lock_sbq)
+{
+	struct ice_sbq_msg_input msg = {0};
+	enum ice_status status;
+
+	if (quad >= ICE_MAX_QUAD)
+		return ICE_ERR_PARAM;
+
+	ice_fill_quad_msg_e822(&msg, quad, offset);
+	msg.opcode = ice_sbq_msg_rd;
+
+	status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n",
+			  status);
+		return status;
+	}
+
+	*val = msg.data;
+
+	return 0;
+}
+
+enum ice_status
+ice_read_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 *val)
+{
+	return ice_read_quad_reg_e822_lp(hw, quad, offset, val, true);
+}
+
+/**
+ * ice_write_quad_reg_e822_lp - Write a PHY quad register
+ * @hw: pointer to the HW struct
+ * @quad: quad to write to
+ * @offset: quad register offset to write
+ * @val: The value to write to the register
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Write a quad register over the device sideband queue. Quad registers are
+ * shared between multiple PHYs.
+ */
+static enum ice_status
+ice_write_quad_reg_e822_lp(struct ice_hw *hw, u8 quad, u16 offset, u32 val,
+			   bool lock_sbq)
+{
+	struct ice_sbq_msg_input msg = {0};
+	enum ice_status status;
+
+	if (quad >= ICE_MAX_QUAD)
+		return ICE_ERR_PARAM;
+
+	ice_fill_quad_msg_e822(&msg, quad, offset);
+	msg.opcode = ice_sbq_msg_wr;
+	msg.data = val;
+
+	status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+enum ice_status
+ice_write_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 val)
+{
+	return ice_write_quad_reg_e822_lp(hw, quad, offset, val, true);
+}
+
+/**
+ * ice_read_phy_tstamp_e822 - Read a PHY timestamp out of the quad block
+ * @hw: pointer to the HW struct
+ * @quad: the quad to read from
+ * @idx: the timestamp index to read
+ * @tstamp: on return, the 40bit timestamp value
+ *
+ * Read a 40bit timestamp value out of the two associated registers in the
+ * quad memory block that is shared between the internal PHYs of the E822
+ * family of devices.
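+ *
+ * As a hedged illustration (assuming TS_PHY_HIGH_S is 8 and TS_PHY_LOW_M is
+ * 0xFF, matching the register layout described in the function body): with
+ * hi = 0x12345678 and lo = 0xAB, the reconstructed 40bit timestamp is
+ *
+ *	((u64)0x12345678 << 8) | 0xAB = 0x12345678AB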
+ */
+static enum ice_status
+ice_read_phy_tstamp_e822(struct ice_hw *hw, u8 quad, u8 idx, u64 *tstamp)
+{
+	enum ice_status status;
+	u16 lo_addr, hi_addr;
+	u32 lo, hi;
+
+	lo_addr = (u16)TS_L(Q_REG_TX_MEMORY_BANK_START, idx);
+	hi_addr = (u16)TS_H(Q_REG_TX_MEMORY_BANK_START, idx);
+
+	status = ice_read_quad_reg_e822(hw, quad, lo_addr, &lo);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read low PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_read_quad_reg_e822(hw, quad, hi_addr, &hi);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read high PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	/* For E822 based internal PHYs, the timestamp is reported with the
+	 * lower 8 bits in the low register, and the upper 32 bits in the high
+	 * register.
+	 */
+	*tstamp = ((u64)hi) << TS_PHY_HIGH_S | ((u64)lo & TS_PHY_LOW_M);
+
+	return 0;
+}
+
+/**
+ * ice_clear_phy_tstamp_e822 - Clear a timestamp from the quad block
+ * @hw: pointer to the HW struct
+ * @quad: the quad to read from
+ * @idx: the timestamp index to reset
+ *
+ * Clear a timestamp, resetting its valid bit, from the PHY quad block that is
+ * shared between the internal PHYs on the E822 devices.
+ */
+static enum ice_status
+ice_clear_phy_tstamp_e822(struct ice_hw *hw, u8 quad, u8 idx)
+{
+	enum ice_status status;
+	u16 lo_addr, hi_addr;
+
+	lo_addr = (u16)TS_L(Q_REG_TX_MEMORY_BANK_START, idx);
+	hi_addr = (u16)TS_H(Q_REG_TX_MEMORY_BANK_START, idx);
+
+	status = ice_write_quad_reg_e822(hw, quad, lo_addr, 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to clear low PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_quad_reg_e822(hw, quad, hi_addr, 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to clear high PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_prep_phy_time_e822 - Prepare PHY port with initial time
+ * @hw: pointer to the HW struct
+ * @time: Time to initialize the PHY port clocks to
+ *
+ * Program the PHY port registers with a new initial time value. The port
+ * clock will be initialized once the driver issues an INIT_TIME sync
+ * command. The time value is the upper 32 bits of the PHY timer, usually in
+ * units of nominal nanoseconds.
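+ *
+ * For example (illustrative only): to start the port clocks at 1 second,
+ * the caller would pass time = 1000000000 (0x3B9ACA00); the sub-nanosecond
+ * portion of the timer is cleared to zero by this function.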
+ */
+static enum ice_status
+ice_ptp_prep_phy_time_e822(struct ice_hw *hw, u32 time)
+{
+	enum ice_status status;
+	u8 port;
+
+	for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) {
+		/* Tx case */
+		status = ice_write_phy_reg_e822_lp(hw, port,
+						   P_REG_TX_TIMER_INC_PRE_L,
+						   0, true);
+		if (status)
+			goto exit_err;
+
+		status = ice_write_phy_reg_e822_lp(hw, port,
+						   P_REG_TX_TIMER_INC_PRE_U,
+						   time, true);
+		if (status)
+			goto exit_err;
+
+		/* Rx case */
+		status = ice_write_phy_reg_e822_lp(hw, port,
+						   P_REG_RX_TIMER_INC_PRE_L,
+						   0, true);
+		if (status)
+			goto exit_err;
+
+		status = ice_write_phy_reg_e822_lp(hw, port,
+						   P_REG_RX_TIMER_INC_PRE_U,
+						   time, true);
+		if (status)
+			goto exit_err;
+	}
+
+	return 0;
+
+exit_err:
+	ice_debug(hw, ICE_DBG_PTP, "Failed to write init time for port %u, status %d\n",
+		  port, status);
+
+	return status;
+}
+
+/**
+ * ice_ptp_prep_port_adj_e822 - Prepare a single port for time adjust
+ * @hw: pointer to HW struct
+ * @port: Port number to be programmed
+ * @time: time in cycles to adjust the port Tx and Rx clocks
+ * @lock_sbq: true to lock the sbq sq_lock (the usual case); false if the
+ *            sq_lock has already been locked at a higher level
+ *
+ * Program the port for an atomic adjustment by writing the Tx and Rx timer
+ * registers. The atomic adjustment won't be completed until the driver issues
+ * an ADJ_TIME command.
+ *
+ * Note that time is not in units of nanoseconds. It is in clock time
+ * including the lower sub-nanosecond portion of the port timer.
+ *
+ * Negative adjustments are supported using 2s complement arithmetic.
+ */
+enum ice_status
+ice_ptp_prep_port_adj_e822(struct ice_hw *hw, u8 port, s64 time,
+			   bool lock_sbq)
+{
+	enum ice_status status;
+	u32 l_time, u_time;
+
+	l_time = lower_32_bits(time);
+	u_time = upper_32_bits(time);
+
+	/* Tx case */
+	status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TX_TIMER_INC_PRE_L,
+					   l_time, lock_sbq);
+	if (status)
+		goto exit_err;
+
+	status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TX_TIMER_INC_PRE_U,
+					   u_time, lock_sbq);
+	if (status)
+		goto exit_err;
+
+	/* Rx case */
+	status = ice_write_phy_reg_e822_lp(hw, port, P_REG_RX_TIMER_INC_PRE_L,
+					   l_time, lock_sbq);
+	if (status)
+		goto exit_err;
+
+	status = ice_write_phy_reg_e822_lp(hw, port, P_REG_RX_TIMER_INC_PRE_U,
+					   u_time, lock_sbq);
+	if (status)
+		goto exit_err;
+
+	return 0;
+
+exit_err:
+	ice_debug(hw, ICE_DBG_PTP, "Failed to write time adjust for port %u, status %d\n",
+		  port, status);
+	return status;
+}
+
+/**
+ * ice_ptp_prep_phy_adj_e822 - Prep PHY ports for a time adjustment
+ * @hw: pointer to HW struct
+ * @adj: adjustment in nanoseconds
+ * @lock_sbq: true to lock the sbq sq_lock (the usual case); false if the
+ *            sq_lock has already been locked at a higher level
+ *
+ * Prepare the PHY ports for an atomic time adjustment by programming the PHY
+ * Tx and Rx port registers. The actual adjustment is completed by issuing an
+ * ADJ_TIME or ADJ_TIME_AT_TIME sync command.
+ */
+static enum ice_status
+ice_ptp_prep_phy_adj_e822(struct ice_hw *hw, s32 adj, bool lock_sbq)
+{
+	s64 cycles;
+	u8 port;
+
+	/* The port clock supports adjustment of the sub-nanosecond portion of
+	 * the clock. We shift the provided adjustment in nanoseconds to
+	 * calculate the appropriate adjustment to program into the PHY ports.
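+	 *
+	 * For example (illustrative only): an adjustment of -5 ns becomes
+	 * cycles = -((s64)5 << 32), i.e. 0xFFFFFFFB00000000 in the port
+	 * timer's 32.32 fixed-point clock units.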
+	 */
+	if (adj > 0)
+		cycles = (s64)adj << 32;
+	else
+		cycles = -(((s64)-adj) << 32);
+
+	for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) {
+		enum ice_status status;
+
+		status = ice_ptp_prep_port_adj_e822(hw, port, cycles,
+						    lock_sbq);
+		if (status)
+			return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_prep_phy_incval_e822 - Prepare PHY ports for time adjustment
+ * @hw: pointer to HW struct
+ * @incval: new increment value to prepare
+ *
+ * Prepare each of the PHY ports for a new increment value by programming each
+ * port's TIMETUS registers. The new increment value will be updated after
+ * issuing an INIT_INCVAL command.
+ */
+static enum ice_status
+ice_ptp_prep_phy_incval_e822(struct ice_hw *hw, u64 incval)
+{
+	enum ice_status status;
+	u32 high, low;
+	u8 port;
+
+	/* The PHY registers for the increment value divide the lower 8 bits
+	 * into the first low register, and the next 32 bits into the second
+	 * high register.
+	 */
+	low = (u32)(incval & P_REG_TIMETUS_LOW_M);
+	high = (u32)(incval >> P_REG_TIMETUS_HIGH_S);
+
+	for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) {
+		status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TIMETUS_L,
+						   low, true);
+		if (status)
+			goto exit_err;
+
+		status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TIMETUS_U,
+						   high, true);
+		if (status)
+			goto exit_err;
+	}
+
+	return 0;
+
+exit_err:
+	ice_debug(hw, ICE_DBG_PTP, "Failed to write incval for port %u, status %d\n",
+		  port, status);
+
+	return status;
+}
+
+/**
+ * ice_ptp_read_phy_incval_e822 - Read a PHY port's current incval
+ * @hw: pointer to the HW struct
+ * @port: the port to read
+ * @incval: on return, the time_clk_cyc incval for this port
+ *
+ * Read the time_clk_cyc increment value for a given PHY port.
+ */
+enum ice_status
+ice_ptp_read_phy_incval_e822(struct ice_hw *hw, u8 port, u64 *incval)
+{
+	enum ice_status status;
+	u32 high, low;
+
+	status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TIMETUS_L,
+					  &low, true);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read TIMETUS_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TIMETUS_U,
+					  &high, true);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read TIMETUS_U, status %d\n",
+			  status);
+		return status;
+	}
+
+	/* Widen to 64 bits before shifting so the upper bits of the incval
+	 * are not truncated.
+	 */
+	*incval = (u64)high << P_REG_TIMETUS_HIGH_S | (low & P_REG_TIMETUS_LOW_M);
+	ice_debug(hw, ICE_DBG_PTP, "read INCVAL = 0x%08x%08x\n", high, low);
+
+	return 0;
+}
+
+/**
+ * ice_ptp_prep_phy_adj_target_e822 - Prepare PHY for adjust at target time
+ * @hw: pointer to HW struct
+ * @target_time: target time to program
+ *
+ * Program the PHY port Tx and Rx TIMER_CNT_ADJ registers used for the
+ * ADJ_TIME_AT_TIME command. This should be used in conjunction with
+ * ice_ptp_prep_phy_adj_e822 to program an atomic adjustment that is
+ * delayed until a specified target time.
+ *
+ * Note that a target time adjustment is not currently supported on E810
+ * devices.
+ */
+static enum ice_status
+ice_ptp_prep_phy_adj_target_e822(struct ice_hw *hw, u32 target_time)
+{
+	enum ice_status status;
+	u8 port;
+
+	for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) {
+		/* Tx case */
+		/* No sub-nanoseconds data */
+		status = ice_write_phy_reg_e822_lp(hw, port,
+						   P_REG_TX_TIMER_CNT_ADJ_L,
+						   0, true);
+		if (status)
+			goto exit_err;
+
+		status = ice_write_phy_reg_e822_lp(hw, port,
+						   P_REG_TX_TIMER_CNT_ADJ_U,
+						   target_time, true);
+		if (status)
+			goto exit_err;
+
+		/* Rx case */
+		/* No sub-nanoseconds data */
+		status = ice_write_phy_reg_e822_lp(hw, port,
+						   P_REG_RX_TIMER_CNT_ADJ_L,
+						   0, true);
+		if (status)
+			goto exit_err;
+
+		status = ice_write_phy_reg_e822_lp(hw, port,
+						   P_REG_RX_TIMER_CNT_ADJ_U,
+						   target_time, true);
+		if (status)
+			goto exit_err;
+	}
+
+	return 0;
+
+exit_err:
+	ice_debug(hw, ICE_DBG_PTP, "Failed to write target time for port %u, status %d\n",
+		  port, status);
+
+	return status;
+}
+
+/**
+ * ice_ptp_read_port_capture - Read a port's local time capture
+ * @hw: pointer to HW struct
+ * @port: Port number to read
+ * @tx_ts: on return, the Tx port time capture
+ * @rx_ts: on return, the Rx port time capture
+ *
+ * Read the port's Tx and Rx local time capture values.
+ *
+ * Note this has no equivalent for the E810 devices.
+ */
+enum ice_status
+ice_ptp_read_port_capture(struct ice_hw *hw, u8 port, u64 *tx_ts, u64 *rx_ts)
+{
+	enum ice_status status;
+	u32 high, low;
+
+	/* Tx case */
+	status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TX_CAPTURE_L,
+					  &low, true);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read REG_TX_CAPTURE_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TX_CAPTURE_U,
+					  &high, true);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_CAPTURE_U, status %d\n",
+			  status);
+		return status;
+	}
+
+	*tx_ts = (u64)high << 32 | low;
+	ice_debug(hw, ICE_DBG_PTP, "tx_init = 0x%016llx\n", *tx_ts);
+
+	/* Rx case */
+	status = ice_read_phy_reg_e822_lp(hw, port, P_REG_RX_CAPTURE_L,
+					  &low, true);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_CAPTURE_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_read_phy_reg_e822_lp(hw, port, P_REG_RX_CAPTURE_U,
+					  &high, true);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_CAPTURE_U, status %d\n",
+			  status);
+		return status;
+	}
+
+	*rx_ts = (u64)high << 32 | low;
+	ice_debug(hw, ICE_DBG_PTP, "rx_init = 0x%016llx\n", *rx_ts);
+
+	return 0;
+}
+
+/**
+ * ice_ptp_one_port_cmd - Prepare a single PHY port for a timer command
+ * @hw: pointer to HW struct
+ * @port: Port to which cmd has to be sent
+ * @cmd: Command to be sent to the port
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Prepare the requested port for an upcoming timer sync command.
+ *
+ * Note there is no equivalent of this operation on E810, as that device
+ * always handles all external PHYs internally.
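+ *
+ * As a rough usage sketch (assuming the flow used elsewhere in this file):
+ * a full timer command sequence prepares the source timer and every PHY
+ * port, then triggers both synchronously:
+ *
+ *	ice_ptp_src_cmd(hw, INIT_TIME);
+ *	for each port: ice_ptp_one_port_cmd(hw, port, INIT_TIME, true);
+ *	wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD);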
+ */
+enum ice_status
+ice_ptp_one_port_cmd(struct ice_hw *hw, u8 port, enum ice_ptp_tmr_cmd cmd,
+		     bool lock_sbq)
+{
+	enum ice_status status;
+	u32 cmd_val, val;
+	u8 tmr_idx;
+
+	tmr_idx = ice_get_ptp_src_clock_index(hw);
+	cmd_val = tmr_idx << SEL_PHY_SRC;
+	switch (cmd) {
+	case INIT_TIME:
+		cmd_val |= PHY_CMD_INIT_TIME;
+		break;
+	case INIT_INCVAL:
+		cmd_val |= PHY_CMD_INIT_INCVAL;
+		break;
+	case ADJ_TIME:
+		cmd_val |= PHY_CMD_ADJ_TIME;
+		break;
+	case ADJ_TIME_AT_TIME:
+		cmd_val |= PHY_CMD_ADJ_TIME_AT_TIME;
+		break;
+	case READ_TIME:
+		cmd_val |= PHY_CMD_READ_TIME;
+		break;
+	default:
+		dev_warn(ice_hw_to_dev(hw), "Unknown timer command %u\n", cmd);
+		return ICE_ERR_PARAM;
+	}
+
+	/* Tx case */
+	/* Read, modify, write */
+	status = ice_read_phy_reg_e822_lp(hw, port, P_REG_TX_TMR_CMD, &val,
+					  lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_TMR_CMD, status %d\n",
+			  status);
+		return status;
+	}
+
+	/* Modify necessary bits only and perform write */
+	val &= ~TS_CMD_MASK;
+	val |= cmd_val;
+
+	status = ice_write_phy_reg_e822_lp(hw, port, P_REG_TX_TMR_CMD, val,
+					   lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write back TX_TMR_CMD, status %d\n",
+			  status);
+		return status;
+	}
+
+	/* Rx case */
+	/* Read, modify, write */
+	status = ice_read_phy_reg_e822_lp(hw, port, P_REG_RX_TMR_CMD, &val,
+					  lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read RX_TMR_CMD, status %d\n",
+			  status);
+		return status;
+	}
+
+	/* Modify necessary bits only and perform write */
+	val &= ~TS_CMD_MASK;
+	val |= cmd_val;
+
+	status = ice_write_phy_reg_e822_lp(hw, port, P_REG_RX_TMR_CMD, val,
+					   lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write back RX_TMR_CMD, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_port_cmd_e822 - Prepare all ports for a timer command
+ * @hw: pointer to the HW struct
+ * @cmd: timer command to prepare
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Prepare all ports connected to this device for an upcoming timer sync
+ * command.
+ */
+static enum ice_status
+ice_ptp_port_cmd_e822(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd,
+		      bool lock_sbq)
+{
+	u8 port;
+
+	for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) {
+		enum ice_status status;
+
+		status = ice_ptp_one_port_cmd(hw, port, cmd, lock_sbq);
+		if (status)
+			return status;
+	}
+
+	return 0;
+}
+
+/* E822 Vernier calibration functions
+ *
+ * The following functions are used as part of the vernier calibration of
+ * a port. This calibration increases the precision of the timestamps on the
+ * port.
+ */
+
+/**
+ * ice_ptp_set_vernier_wl - Set the window length for vernier calibration
+ * @hw: pointer to the HW struct
+ *
+ * Set the window length used for the vernier port calibration process.
+ */
+enum ice_status ice_ptp_set_vernier_wl(struct ice_hw *hw)
+{
+	u8 port;
+
+	for (port = 0; port < ICE_NUM_EXTERNAL_PORTS; port++) {
+		enum ice_status status;
+
+		status = ice_write_phy_reg_e822_lp(hw, port, P_REG_WL,
+						   PTP_VERNIER_WL, true);
+		if (status) {
+			ice_debug(hw, ICE_DBG_PTP, "Failed to set vernier window length for port %u, status %d\n",
+				  port, status);
+			return status;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ice_phy_get_speed_and_fec_e822 - Get link speed and FEC based on serdes mode
+ * @hw: pointer to HW struct
+ * @port: the port to read from
+ * @link_out: if non-NULL, holds link speed on success
+ * @fec_out: if non-NULL, holds FEC algorithm on success
+ *
+ * Read the serdes data for the PHY port and extract the link speed and FEC
+ * algorithm.
+ */
+enum ice_status
+ice_phy_get_speed_and_fec_e822(struct ice_hw *hw, u8 port,
+			       enum ice_ptp_link_spd *link_out,
+			       enum ice_ptp_fec_mode *fec_out)
+{
+	enum ice_ptp_link_spd link;
+	enum ice_ptp_fec_mode fec;
+	enum ice_status status;
+	u32 serdes;
+
+	status = ice_read_phy_reg_e822(hw, port, P_REG_LINK_SPEED, &serdes);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read serdes info\n");
+		return status;
+	}
+
+	/* Determine the FEC algorithm */
+	fec = (enum ice_ptp_fec_mode)P_REG_LINK_SPEED_FEC_MODE(serdes);
+
+	serdes &= P_REG_LINK_SPEED_SERDES_M;
+
+	/* Determine the link speed */
+	if (fec == ICE_PTP_FEC_MODE_RS_FEC) {
+		switch (serdes) {
+		case ICE_PTP_SERDES_25G:
+			link = ICE_PTP_LNK_SPD_25G_RS;
+			break;
+		case ICE_PTP_SERDES_50G:
+			link = ICE_PTP_LNK_SPD_50G_RS;
+			break;
+		case ICE_PTP_SERDES_100G:
+			link = ICE_PTP_LNK_SPD_100G_RS;
+			break;
+		default:
+			return ICE_ERR_OUT_OF_RANGE;
+		}
+	} else {
+		switch (serdes) {
+		case ICE_PTP_SERDES_1G:
+			link = ICE_PTP_LNK_SPD_1G;
+			break;
+		case ICE_PTP_SERDES_10G:
+			link = ICE_PTP_LNK_SPD_10G;
+			break;
+		case ICE_PTP_SERDES_25G:
+			link = ICE_PTP_LNK_SPD_25G;
+			break;
+		case ICE_PTP_SERDES_40G:
+			link = ICE_PTP_LNK_SPD_40G;
+			break;
+		case ICE_PTP_SERDES_50G:
+			link = ICE_PTP_LNK_SPD_50G;
+			break;
+		default:
+			return ICE_ERR_OUT_OF_RANGE;
+		}
+	}
+
+	if (link_out)
+		*link_out = link;
+	if (fec_out)
+		*fec_out = fec;
+
+	return 0;
+}
+
+/**
+ * ice_phy_cfg_lane_e822 - Configure PHY quad for single/multi-lane timestamp
+ * @hw: pointer to HW struct
+ * @port: the port to configure the quad for
+ */
+void ice_phy_cfg_lane_e822(struct ice_hw *hw, u8 port)
+{
+	enum ice_ptp_link_spd link_spd;
+	enum ice_status status;
+	int quad;
+	u32 val;
+
+	quad = port / ICE_PORTS_PER_QUAD;
+
+	status = ice_phy_get_speed_and_fec_e822(hw, port, &link_spd, NULL);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to get PHY link speed, status %d\n",
+			  status);
+		return;
+	}
+
+	status = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG, &val);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read TX_MEM_GBL_CFG, status %d\n",
+			  status);
+		return;
+	}
+
+	if (link_spd >= ICE_PTP_LNK_SPD_40G)
+		val &= ~Q_REG_TX_MEM_GBL_CFG_LANE_TYPE_M;
+	else
+		val |= Q_REG_TX_MEM_GBL_CFG_LANE_TYPE_M;
+
+	status = ice_write_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG, val);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write back TX_MEM_GBL_CFG, status %d\n",
+			  status);
+		return;
+	}
+}
+
+/* E810 functions
+ *
+ * The following functions operate on the E810 series devices which use
+ * a separate external PHY.
+ */
+
+/**
+ * ice_read_phy_reg_e810_lp - Read register from external PHY on E810
+ * @hw: pointer to the HW struct
+ * @addr: the address to read from
+ * @val: On return, the value read from the PHY
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Read a register from the external PHY on the E810 device.
+ */
+static enum ice_status
+ice_read_phy_reg_e810_lp(struct ice_hw *hw, u32 addr, u32 *val, bool lock_sbq)
+{
+	struct ice_sbq_msg_input msg = {0};
+	enum ice_status status;
+
+	msg.msg_addr_low = ICE_LO_WORD(addr);
+	msg.msg_addr_high = ICE_HI_WORD(addr);
+	msg.opcode = ice_sbq_msg_rd;
+	msg.dest_dev = rmn_0;
+
+	status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n",
+			  status);
+		return status;
+	}
+
+	*val = msg.data;
+
+	return 0;
+}
+
+static enum ice_status
+ice_read_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 *val)
+{
+	return ice_read_phy_reg_e810_lp(hw, addr, val, true);
+}
+
+/**
+ * ice_write_phy_reg_e810_lp - Write register on external PHY on E810
+ * @hw: pointer to the HW struct
+ * @addr: the address to write to
+ * @val: the value to write to the PHY
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Write a value to a register of the external PHY on the E810 device.
+ */
+static enum ice_status
+ice_write_phy_reg_e810_lp(struct ice_hw *hw, u32 addr, u32 val, bool lock_sbq)
+{
+	struct ice_sbq_msg_input msg = {0};
+	enum ice_status status;
+
+	msg.msg_addr_low = ICE_LO_WORD(addr);
+	msg.msg_addr_high = ICE_HI_WORD(addr);
+	msg.opcode = ice_sbq_msg_wr;
+	msg.dest_dev = rmn_0;
+	msg.data = val;
+
+	status = ice_sbq_rw_reg_lp(hw, &msg, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to phy, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+static enum ice_status
+ice_write_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 val)
+{
+	return ice_write_phy_reg_e810_lp(hw, addr, val, true);
+}
+
+/**
+ * ice_read_phy_tstamp_e810 - Read a PHY timestamp out of the external PHY
+ * @hw: pointer to the HW struct
+ * @lport: the lport to read from
+ * @idx: the timestamp index to read
+ * @tstamp: on return, the 40bit timestamp value
+ *
+ * Read a 40bit timestamp value out of the timestamp block of the external PHY
+ * on the E810 device.
+ */
+static enum ice_status
+ice_read_phy_tstamp_e810(struct ice_hw *hw, u8 lport, u8 idx, u64 *tstamp)
+{
+	enum ice_status status;
+	u32 lo_addr, hi_addr, lo, hi;
+
+	lo_addr = TS_EXT(LOW_TX_MEMORY_BANK_START, lport, idx);
+	hi_addr = TS_EXT(HIGH_TX_MEMORY_BANK_START, lport, idx);
+
+	status = ice_read_phy_reg_e810(hw, lo_addr, &lo);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read low PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_read_phy_reg_e810(hw, hi_addr, &hi);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read high PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	/* For E810 devices, the timestamp is reported with the lower 32 bits
+	 * in the low register, and the upper 8 bits in the high register.
+	 */
+	*tstamp = ((u64)hi) << TS_HIGH_S | ((u64)lo & TS_LOW_M);
+
+	return 0;
+}
+
+/**
+ * ice_clear_phy_tstamp_e810 - Clear a timestamp from the external PHY
+ * @hw: pointer to the HW struct
+ * @lport: the lport to read from
+ * @idx: the timestamp index to reset
+ *
+ * Clear a timestamp, resetting its valid bit, from the timestamp block of the
+ * external PHY on the E810 device.
+ */
+static enum ice_status
+ice_clear_phy_tstamp_e810(struct ice_hw *hw, u8 lport, u8 idx)
+{
+	enum ice_status status;
+	u32 lo_addr, hi_addr;
+
+	lo_addr = TS_EXT(LOW_TX_MEMORY_BANK_START, lport, idx);
+	hi_addr = TS_EXT(HIGH_TX_MEMORY_BANK_START, lport, idx);
+
+	status = ice_write_phy_reg_e810(hw, lo_addr, 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to clear low PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810(hw, hi_addr, 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to clear high PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_init_phy_e810 - Enable PTP function on the external PHY
+ * @hw: pointer to HW struct
+ *
+ * Enable the timesync PTP functionality for the external PHY connected to
+ * this function.
+ *
+ * Note there is no equivalent function needed on E822 based devices.
+ */
+enum ice_status ice_ptp_init_phy_e810(struct ice_hw *hw)
+{
+	enum ice_status status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_ENA(tmr_idx),
+					GLTSYN_ENA_TSYN_ENA_M);
+	if (status)
+		ice_debug(hw, ICE_DBG_PTP, "PTP failed in ena_phy_time_syn %d\n",
+			  status);
+
+	return status;
+}
+
+/**
+ * ice_ptp_prep_phy_time_e810 - Prepare PHY port with initial time
+ * @hw: Board private structure
+ * @time: Time to initialize the PHY port clock to
+ *
+ * Program the PHY port ETH_GLTSYN_SHTIME registers in preparation for setting
+ * the initial clock time. The time will not actually be programmed until the
+ * driver issues an INIT_TIME command.
+ *
+ * The time value is the upper 32 bits of the PHY timer, usually in units of
+ * nominal nanoseconds.
+ */
+static enum ice_status ice_ptp_prep_phy_time_e810(struct ice_hw *hw, u32 time)
+{
+	enum ice_status status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_0(tmr_idx), 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write SHTIME_0, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_L(tmr_idx), time);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write SHTIME_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_prep_phy_adj_e810 - Prep PHY port for a time adjustment
+ * @hw: pointer to HW struct
+ * @adj: adjustment value to program
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Prepare the PHY port for an atomic adjustment by programming the PHY
+ * ETH_GLTSYN_SHADJ_L and ETH_GLTSYN_SHADJ_H registers. The actual adjustment
+ * is completed by issuing an ADJ_TIME sync command.
+ *
+ * The adjustment value only contains the portion used for the upper 32bits of
+ * the PHY timer, usually in units of nominal nanoseconds. Negative
+ * adjustments are supported using 2s complement arithmetic.
+ */
+static enum ice_status
+ice_ptp_prep_phy_adj_e810(struct ice_hw *hw, s32 adj, bool lock_sbq)
+{
+	enum ice_status status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+
+	/* Adjustments are represented as signed 2's complement values in
+	 * nanoseconds. Sub-nanosecond adjustment is not supported.
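+	 *
+	 * For example (illustrative only): adj = -10 is written to SHADJ_H
+	 * as 0xFFFFFFF6, and SHADJ_L is cleared since it carries no
+	 * sub-nanosecond data here.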
+	 */
+	status = ice_write_phy_reg_e810_lp(hw, ETH_GLTSYN_SHADJ_L(tmr_idx),
+					   0, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write adj to PHY SHADJ_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810_lp(hw, ETH_GLTSYN_SHADJ_H(tmr_idx),
+					   adj, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write adj to PHY SHADJ_H, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_prep_phy_incval_e810 - Prep PHY port increment value change
+ * @hw: pointer to HW struct
+ * @incval: The new 40bit increment value to prepare
+ *
+ * Prepare the PHY port for a new increment value by programming the PHY
+ * ETH_GLTSYN_SHADJ_L and ETH_GLTSYN_SHADJ_H registers. The actual change is
+ * completed by issuing an INIT_INCVAL command.
+ */
+static enum ice_status
+ice_ptp_prep_phy_incval_e810(struct ice_hw *hw, u64 incval)
+{
+	enum ice_status status;
+	u32 high, low;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+	low = lower_32_bits(incval);
+	high = upper_32_bits(incval);
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHADJ_L(tmr_idx), low);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write incval to PHY SHADJ_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHADJ_H(tmr_idx), high);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write incval to PHY SHADJ_H, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_prep_phy_adj_target_e810 - Prepare PHY port with adjust target
+ * @hw: Board private structure
+ * @target_time: Time to trigger the clock adjustment at
+ *
+ * Program the PHY port ETH_GLTSYN_SHTIME registers in preparation for
+ * a target time adjust, which will trigger an adjustment of the clock in the
+ * future. The actual adjustment will occur the next time the PHY port timer
+ * crosses over the provided value after the driver issues an ADJ_TIME_AT_TIME
+ * command.
+ *
+ * The time value is the upper 32 bits of the PHY timer, usually in units of
+ * nominal nanoseconds.
+ */
+static enum ice_status
+ice_ptp_prep_phy_adj_target_e810(struct ice_hw *hw, u32 target_time)
+{
+	enum ice_status status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_0(tmr_idx), 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write target time to SHTIME_0, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_L(tmr_idx),
+					target_time);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write target time to SHTIME_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_port_cmd_e810 - Prepare all external PHYs for a timer command
+ * @hw: pointer to HW struct
+ * @cmd: Command to be sent to the port
+ * @lock_sbq: true if the sideband queue lock must be acquired
+ *
+ * Prepare the external PHYs connected to this device for a timer sync
+ * command.
+ */
+static enum ice_status
+ice_ptp_port_cmd_e810(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd,
+		      bool lock_sbq)
+{
+	enum ice_status status;
+	u32 cmd_val, val;
+
+	switch (cmd) {
+	case INIT_TIME:
+		cmd_val = GLTSYN_CMD_INIT_TIME;
+		break;
+	case INIT_INCVAL:
+		cmd_val = GLTSYN_CMD_INIT_INCVAL;
+		break;
+	case ADJ_TIME:
+		cmd_val = GLTSYN_CMD_ADJ_TIME;
+		break;
+	case ADJ_TIME_AT_TIME:
+		cmd_val = GLTSYN_CMD_ADJ_INIT_TIME;
+		break;
+	case READ_TIME:
+		cmd_val = GLTSYN_CMD_READ_TIME;
+		break;
+	default:
+		dev_warn(ice_hw_to_dev(hw), "Unknown timer command %u\n", cmd);
+		return ICE_ERR_PARAM;
+	}
+
+	/* Read, modify, write */
+	status = ice_read_phy_reg_e810_lp(hw, ETH_GLTSYN_CMD, &val, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read GLTSYN_CMD, status %d\n",
+			  status);
+		return status;
+	}
+
+	/* Modify necessary bits only and perform write */
+	val &= ~TS_CMD_MASK_E810;
+	val |= cmd_val;
+
+	status = ice_write_phy_reg_e810_lp(hw, ETH_GLTSYN_CMD, val, lock_sbq);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write back GLTSYN_CMD, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/* Device agnostic functions
+ *
+ * The following functions implement shared behavior common to both E822 and
+ * E810 devices, possibly calling a device specific implementation where
+ * necessary.
+ */
+
+/**
+ * ice_ptp_lock - Acquire PTP global semaphore register lock
+ * @hw: pointer to the HW struct
+ *
+ * Acquire the global PTP hardware semaphore lock. Returns true if the lock
+ * was acquired, false otherwise.
+ *
+ * The PFTSYN_SEM register sets the busy bit on read, returning the previous
+ * value. If software sees the busy bit cleared, this means that this function
+ * acquired the lock (and the busy bit is now set). If software sees the busy
+ * bit set, it means that another function acquired the lock.
+ *
+ * Software must clear the busy bit with a write to release the lock for other
+ * functions when done.
+ */
+bool ice_ptp_lock(struct ice_hw *hw)
+{
+	u32 hw_lock;
+	int i;
+
+#define MAX_TRIES 5
+
+	for (i = 0; i < MAX_TRIES; i++) {
+		hw_lock = rd32(hw, PFTSYN_SEM + (PFTSYN_SEM_BYTES * hw->pf_id));
+		hw_lock = hw_lock & PFTSYN_SEM_BUSY_M;
+		if (hw_lock) {
+			/* Somebody is holding the lock */
+			msleep(10);
+			continue;
+		} else {
+			break;
+		}
+	}
+
+	return !hw_lock;
+}
+
+/**
+ * ice_ptp_unlock - Release PTP global semaphore register lock
+ * @hw: pointer to the HW struct
+ *
+ * Release the global PTP hardware semaphore lock. This is done by writing to
+ * the PFTSYN_SEM register.
+ */
+void ice_ptp_unlock(struct ice_hw *hw)
+{
+	wr32(hw, PFTSYN_SEM + (PFTSYN_SEM_BYTES * hw->pf_id), 0);
+}
+
+/**
+ * ice_ptp_src_cmd - Prepare source timer for a timer command
+ * @hw: pointer to HW structure
+ * @cmd: Timer command
+ *
+ * Prepare the source timer for an upcoming timer sync command.
+ */ +void ice_ptp_src_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd) +{ + u32 cmd_val; + u8 tmr_idx; + + tmr_idx = ice_get_ptp_src_clock_index(hw); + cmd_val = tmr_idx << SEL_CPK_SRC; + + switch (cmd) { + case INIT_TIME: + cmd_val |= GLTSYN_CMD_INIT_TIME; + break; + case INIT_INCVAL: + cmd_val |= GLTSYN_CMD_INIT_INCVAL; + break; + case ADJ_TIME: + cmd_val |= GLTSYN_CMD_ADJ_TIME; + break; + case ADJ_TIME_AT_TIME: + cmd_val |= GLTSYN_CMD_ADJ_INIT_TIME; + break; + case READ_TIME: + cmd_val |= GLTSYN_CMD_READ_TIME; + break; + default: + dev_warn(ice_hw_to_dev(hw), "Unknown timer command %u\n", cmd); + return; + } + + wr32(hw, GLTSYN_CMD, cmd_val); +} + +/** + * ice_ptp_tmr_cmd - Prepare and trigger a timer sync command + * @hw: pointer to HW struct + * @cmd: the command to issue + * @lock_sbq: true if the sideband queue lock must be acquired + * + * Prepare the source timer and PHY timers and then trigger the requested + * command. This causes the shadow registers previously written in preparation + * for the command to be synchronously applied to both the source and PHY + * timers. + */ +static enum ice_status +ice_ptp_tmr_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd, bool lock_sbq) +{ + enum ice_status status; + + /* First, prepare the source timer */ + ice_ptp_src_cmd(hw, cmd); + + /* Next, prepare the ports */ + if (ice_is_e810(hw)) + status = ice_ptp_port_cmd_e810(hw, cmd, lock_sbq); + else + status = ice_ptp_port_cmd_e822(hw, cmd, lock_sbq); + if (status) { + ice_debug(hw, ICE_DBG_PTP, "Failed to prepare PHY ports for timer command %u, status %d\n", + cmd, status); + return status; + } + + /* Write the sync command register to drive both source and PHY timer + * commands synchronously + */ + wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); + + return 0; +} + +/** + * ice_ptp_init_time - Initialize device time to provided value + * @hw: pointer to HW struct + * @time: 64bits of time (GLTSYN_TIME_L and GLTSYN_TIME_H) + * + * Initialize the device to the specified time provided. This requires a three + * step process: + * + * 1) write the new init time to the source timer shadow registers + * 2) write the new init time to the phy timer shadow registers + * 3) issue an init_time timer command to synchronously switch both the source + * and port timers to the new init time value at the next clock cycle. + */ +enum ice_status ice_ptp_init_time(struct ice_hw *hw, u64 time) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* Source timers */ + wr32(hw, GLTSYN_SHTIME_L(tmr_idx), lower_32_bits(time)); + wr32(hw, GLTSYN_SHTIME_H(tmr_idx), upper_32_bits(time)); + wr32(hw, GLTSYN_SHTIME_0(tmr_idx), 0); + + /* PHY Clks */ + /* Fill Rx and Tx ports and send msg to PHY */ + if (ice_is_e810(hw)) + status = ice_ptp_prep_phy_time_e810(hw, time & 0xFFFFFFFF); + else + status = ice_ptp_prep_phy_time_e822(hw, time & 0xFFFFFFFF); + if (status) + return status; + + return ice_ptp_tmr_cmd(hw, INIT_TIME, true); +} + +/** + * ice_ptp_write_incval - Program PHC with new increment value + * @hw: pointer to HW struct + * @incval: Source timer increment value per clock cycle + * + * Program the PHC with a new increment value. This requires a three-step + * process: + * + * 1) Write the increment value to the source timer shadow registers + * 2) Write the increment value to the PHY timer shadow registers + * 3) Issue an INIT_INCVAL timer command to synchronously switch both the + * source and port timers to the new increment value at the next clock + * cycle. 
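+ *
+ * As an illustrative sketch only (the scaling policy is up to the caller),
+ * a +10 ppm frequency adjustment on an E810 device could be programmed by
+ * scaling the nominal increment value while holding the PTP semaphore:
+ *
+ *	u64 incval = ICE_PTP_NOMINAL_INCVAL_E810;
+ *
+ *	incval += div_u64(incval * 10, 1000000);
+ *	ice_ptp_write_incval_locked(hw, incval);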
+ */ +enum ice_status ice_ptp_write_incval(struct ice_hw *hw, u64 incval) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* Shadow Adjust */ + wr32(hw, GLTSYN_SHADJ_L(tmr_idx), lower_32_bits(incval)); + wr32(hw, GLTSYN_SHADJ_H(tmr_idx), upper_32_bits(incval)); + + if (ice_is_e810(hw)) + status = ice_ptp_prep_phy_incval_e810(hw, incval); + else + status = ice_ptp_prep_phy_incval_e822(hw, incval); + if (status) + return status; + + return ice_ptp_tmr_cmd(hw, INIT_INCVAL, true); +} + +/** + * ice_ptp_write_incval_locked - Program new incval while holding semaphore + * @hw: pointer to HW struct + * @incval: Source timer increment value per clock cycle + * + * Program a new PHC incval while holding the PTP semaphore. + */ +enum ice_status ice_ptp_write_incval_locked(struct ice_hw *hw, u64 incval) +{ + enum ice_status status; + + if (!ice_ptp_lock(hw)) + return ICE_ERR_NOT_READY; + + status = ice_ptp_write_incval(hw, incval); + + ice_ptp_unlock(hw); + + return status; +} + +/** + * ice_ptp_adj_clock - Adjust PHC clock time atomically + * @hw: pointer to HW struct + * @adj: Adjustment in nanoseconds + * @lock_sbq: true to lock the sbq sq_lock (the usual case); false if the + * sq_lock has already been locked at a higher level + * + * Perform an atomic adjustment of the PHC time by the specified number of + * nanoseconds. This requires a three-step process: + * + * 1) Write the adjustment to the source timer shadow registers + * 2) Write the adjustment to the PHY timer shadow registers + * 3) Issue an ADJ_TIME timer command to synchronously apply the adjustment to + * both the source and port timers at the next clock cycle. + */ +enum ice_status ice_ptp_adj_clock(struct ice_hw *hw, s32 adj, bool lock_sbq) +{ + enum ice_status status; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + + /* Write the desired clock adjustment into the GLTSYN_SHADJ register. + * For an ADJ_TIME command, this set of registers represents the value + * to add to the clock time. It supports subtraction by interpreting + * the value as a 2's complement integer. + */ + wr32(hw, GLTSYN_SHADJ_L(tmr_idx), 0); + wr32(hw, GLTSYN_SHADJ_H(tmr_idx), adj); + + if (ice_is_e810(hw)) + status = ice_ptp_prep_phy_adj_e810(hw, adj, lock_sbq); + else + status = ice_ptp_prep_phy_adj_e822(hw, adj, lock_sbq); + if (status) + return status; + + return ice_ptp_tmr_cmd(hw, ADJ_TIME, lock_sbq); +} + +/** + * ice_ptp_adj_clock_at_time - Adjust PHC atomically at specified time + * @hw: pointer to HW struct + * @at_time: Time in nanoseconds at which to perform the adjustment + * @adj: Adjustment in nanoseconds + * + * Perform an atomic adjustment to the PHC clock at the specified time. This + * requires a five-step process: + * + * 1) Write the adjustment to the source timer shadow adjust registers + * 2) Write the target time to the source timer shadow time registers + * 3) Write the adjustment to the PHY timers shadow adjust registers + * 4) Write the target time to the PHY timers shadow adjust registers + * 5) Issue an ADJ_TIME_AT_TIME command to initiate the atomic adjustment. + */ +enum ice_status +ice_ptp_adj_clock_at_time(struct ice_hw *hw, u64 at_time, s32 adj) +{ + enum ice_status status; + u32 time_lo, time_hi; + u8 tmr_idx; + + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; + time_lo = lower_32_bits(at_time); + time_hi = upper_32_bits(at_time); + + /* Write the desired clock adjustment into the GLTSYN_SHADJ register. 
+	 * For an ADJ_TIME_AT_TIME command, this set of registers represents
+	 * the value to add to the clock time. It supports subtraction by
+	 * interpreting the value as a 2's complement integer.
+	 */
+	wr32(hw, GLTSYN_SHADJ_L(tmr_idx), 0);
+	wr32(hw, GLTSYN_SHADJ_H(tmr_idx), adj);
+
+	/* Write the target time to trigger the adjustment for source clock */
+	wr32(hw, GLTSYN_SHTIME_0(tmr_idx), 0);
+	wr32(hw, GLTSYN_SHTIME_L(tmr_idx), time_lo);
+	wr32(hw, GLTSYN_SHTIME_H(tmr_idx), time_hi);
+
+	/* Prepare PHY port adjustments */
+	if (ice_is_e810(hw))
+		status = ice_ptp_prep_phy_adj_e810(hw, adj, true);
+	else
+		status = ice_ptp_prep_phy_adj_e822(hw, adj, true);
+	if (status)
+		return status;
+
+	/* Set target time for each PHY port */
+	if (ice_is_e810(hw))
+		status = ice_ptp_prep_phy_adj_target_e810(hw, time_lo);
+	else
+		status = ice_ptp_prep_phy_adj_target_e822(hw, time_lo);
+	if (status)
+		return status;
+
+	return ice_ptp_tmr_cmd(hw, ADJ_TIME_AT_TIME, true);
+}
+
+/**
+ * ice_read_phy_tstamp - Read a PHY timestamp from the timestamp block
+ * @hw: pointer to the HW struct
+ * @block: the block to read from
+ * @idx: the timestamp index to read
+ * @tstamp: on return, the 40bit timestamp value
+ *
+ * Read a 40bit timestamp value out of the timestamp block. For E822 devices,
+ * the block is the quad to read from. For E810 devices, the block is the
+ * logical port to read from.
+ */
+enum ice_status
+ice_read_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx, u64 *tstamp)
+{
+	if (ice_is_e810(hw))
+		return ice_read_phy_tstamp_e810(hw, block, idx, tstamp);
+	else
+		return ice_read_phy_tstamp_e822(hw, block, idx, tstamp);
+}
+
+/**
+ * ice_clear_phy_tstamp - Clear a timestamp from the timestamp block
+ * @hw: pointer to the HW struct
+ * @block: the block to clear from
+ * @idx: the timestamp index to reset
+ *
+ * Clear a timestamp, resetting its valid bit, from the timestamp block. For
+ * E822 devices, the block is the quad to clear from. For E810 devices, the
+ * block is the logical port to clear from.
+ */
+enum ice_status
+ice_clear_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx)
+{
+	if (ice_is_e810(hw))
+		return ice_clear_phy_tstamp_e810(hw, block, idx);
+	else
+		return ice_clear_phy_tstamp_e822(hw, block, idx);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h
new file mode 100644
index 0000000000000000000000000000000000000000..e63b9ca75260e7b19291dc91ef614a683126d1a8
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation.
*/ + +#ifndef _ICE_PTP_HW_H_ +#define _ICE_PTP_HW_H_ + +enum ice_ptp_tmr_cmd { + INIT_TIME, + INIT_INCVAL, + ADJ_TIME, + ADJ_TIME_AT_TIME, + READ_TIME +}; + +enum ice_ptp_serdes { + ICE_PTP_SERDES_1G, + ICE_PTP_SERDES_10G, + ICE_PTP_SERDES_25G, + ICE_PTP_SERDES_40G, + ICE_PTP_SERDES_50G, + ICE_PTP_SERDES_100G +}; + +enum ice_ptp_link_spd { + ICE_PTP_LNK_SPD_1G, + ICE_PTP_LNK_SPD_10G, + ICE_PTP_LNK_SPD_25G, + ICE_PTP_LNK_SPD_25G_RS, + ICE_PTP_LNK_SPD_40G, + ICE_PTP_LNK_SPD_50G, + ICE_PTP_LNK_SPD_50G_RS, + ICE_PTP_LNK_SPD_100G_RS, + NUM_ICE_PTP_LNK_SPD /* Must be last */ +}; + +enum ice_ptp_fec_mode { + ICE_PTP_FEC_MODE_NONE, + ICE_PTP_FEC_MODE_CLAUSE74, + ICE_PTP_FEC_MODE_RS_FEC +}; + +/** + * struct ice_time_ref_info_e822 + * @pll_freq: Frequency of PLL that drives timer ticks in Hz + * @nominal_incval: increment to generate nanoseconds in GLTSYN_TIME_L + * @pps_delay: propagation delay of the PPS output signal + * + * Characteristic information for the various TIME_REF sources possible in the + * E822 devices + */ +struct ice_time_ref_info_e822 { + u64 pll_freq; + u64 nominal_incval; + u8 pps_delay; +}; + +/* Table of constants related to possible TIME_REF sources */ +extern const struct ice_time_ref_info_e822 e822_time_ref[NUM_ICE_TIME_REF_FREQ]; + +/* Increment value to generate nanoseconds in the GLTSYN_TIME_L register for + * the E810 devices. Based off of a PLL with an 812.5 MHz frequency. + */ +#define ICE_PTP_NOMINAL_INCVAL_E810 0x13b13b13bULL + +/* Device agnostic functions */ +u8 ice_get_ptp_src_clock_index(struct ice_hw *hw); +u64 ice_ptp_read_src_incval(struct ice_hw *hw); +bool ice_ptp_lock(struct ice_hw *hw); +void ice_ptp_unlock(struct ice_hw *hw); +void ice_ptp_src_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd); +enum ice_status ice_ptp_init_time(struct ice_hw *hw, u64 time); +enum ice_status ice_ptp_write_incval(struct ice_hw *hw, u64 incval); +enum ice_status ice_ptp_write_incval_locked(struct ice_hw *hw, u64 incval); +enum ice_status ice_ptp_adj_clock(struct ice_hw *hw, s32 adj, bool lock_sbq); +enum ice_status +ice_ptp_adj_clock_at_time(struct ice_hw *hw, u64 at_time, s32 adj); +enum ice_status +ice_read_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx, u64 *tstamp); +enum ice_status +ice_clear_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx); + +/* E822 family functions */ +enum ice_status +ice_read_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 *val); +enum ice_status +ice_write_phy_reg_e822(struct ice_hw *hw, u8 port, u16 offset, u32 val); +enum ice_status +ice_read_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 *val); +enum ice_status +ice_write_quad_reg_e822(struct ice_hw *hw, u8 quad, u16 offset, u32 val); +enum ice_status +ice_ptp_prep_port_adj_e822(struct ice_hw *hw, u8 port, s64 time, + bool lock_sbq); +enum ice_status +ice_ptp_read_phy_incval_e822(struct ice_hw *hw, u8 port, u64 *incval); +enum ice_status +ice_ptp_read_port_capture(struct ice_hw *hw, u8 port, u64 *tx_ts, u64 *rx_ts); +enum ice_status +ice_ptp_one_port_cmd(struct ice_hw *hw, u8 port, enum ice_ptp_tmr_cmd cmd, + bool lock_sbq); + +static inline u64 ice_e822_pll_freq(enum ice_time_ref_freq time_ref) +{ + return e822_time_ref[time_ref].pll_freq; +} + +static inline u64 ice_e822_nominal_incval(enum ice_time_ref_freq time_ref) +{ + return e822_time_ref[time_ref].nominal_incval; +} + +static inline u64 ice_e822_pps_delay(enum ice_time_ref_freq time_ref) +{ + return e822_time_ref[time_ref].pps_delay; +} + +/* E822 Vernier calibration functions */ +enum ice_status 
ice_ptp_set_vernier_wl(struct ice_hw *hw); +enum ice_status +ice_phy_get_speed_and_fec_e822(struct ice_hw *hw, u8 port, + enum ice_ptp_link_spd *link_out, + enum ice_ptp_fec_mode *fec_out); +void ice_phy_cfg_lane_e822(struct ice_hw *hw, u8 port); + +/* E810 family functions */ +enum ice_status ice_ptp_init_phy_e810(struct ice_hw *hw); + +#define PFTSYN_SEM_BYTES 4 + +#define ICE_PTP_CLOCK_INDEX_0 0x00 +#define ICE_PTP_CLOCK_INDEX_1 0x01 + +/* PHY timer commands */ +#define SEL_CPK_SRC 8 +#define SEL_PHY_SRC 3 + +/* Time Sync command Definitions */ +#define GLTSYN_CMD_INIT_TIME BIT(0) +#define GLTSYN_CMD_INIT_INCVAL BIT(1) +#define GLTSYN_CMD_INIT_TIME_INCVAL (BIT(0) | BIT(1)) +#define GLTSYN_CMD_ADJ_TIME BIT(2) +#define GLTSYN_CMD_ADJ_INIT_TIME (BIT(2) | BIT(3)) +#define GLTSYN_CMD_READ_TIME BIT(7) + +/* PHY port Time Sync command definitions */ +#define PHY_CMD_INIT_TIME BIT(0) +#define PHY_CMD_INIT_INCVAL BIT(1) +#define PHY_CMD_ADJ_TIME (BIT(0) | BIT(1)) +#define PHY_CMD_ADJ_TIME_AT_TIME (BIT(0) | BIT(2)) +#define PHY_CMD_READ_TIME (BIT(0) | BIT(1) | BIT(2)) + +#define TS_CMD_MASK_E810 0xFF +#define TS_CMD_MASK 0xF +#define SYNC_EXEC_CMD 0x3 + +/* Macros to derive port low and high addresses on both quads */ +#define P_Q0_L(a, p) ((((a) + (0x2000 * (p)))) & 0xFFFF) +#define P_Q0_H(a, p) ((((a) + (0x2000 * (p)))) >> 16) +#define P_Q1_L(a, p) ((((a) - (0x2000 * ((p) - ICE_PORTS_PER_QUAD)))) & 0xFFFF) +#define P_Q1_H(a, p) ((((a) - (0x2000 * ((p) - ICE_PORTS_PER_QUAD)))) >> 16) + +/* PHY QUAD register base addresses */ +#define Q_0_BASE 0x94000 +#define Q_1_BASE 0x114000 + +/* Timestamp memory reset registers */ +#define Q_REG_TS_CTRL 0x618 +#define Q_REG_TS_CTRL_S 0 +#define Q_REG_TS_CTRL_M BIT(0) + +/* Timestamp availability status registers */ +#define Q_REG_TX_MEMORY_STATUS_L 0xCF0 +#define Q_REG_TX_MEMORY_STATUS_U 0xCF4 + +/* Tx FIFO status registers */ +#define Q_REG_FIFO23_STATUS 0xCF8 +#define Q_REG_FIFO01_STATUS 0xCFC +#define Q_REG_FIFO02_S 0 +#define Q_REG_FIFO02_M ICE_M(0x3FF, 0) +#define Q_REG_FIFO13_S 10 +#define Q_REG_FIFO13_M ICE_M(0x3FF, 10) + +/* Interrupt control Config registers */ +#define Q_REG_TX_MEM_GBL_CFG 0xC08 +#define Q_REG_TX_MEM_GBL_CFG_LANE_TYPE_S 0 +#define Q_REG_TX_MEM_GBL_CFG_LANE_TYPE_M BIT(0) +#define Q_REG_TX_MEM_GBL_CFG_TX_TYPE_S 1 +#define Q_REG_TX_MEM_GBL_CFG_TX_TYPE_M ICE_M(0xFF, 1) +#define Q_REG_TX_MEM_GBL_CFG_INTR_THR_S 9 +#define Q_REG_TX_MEM_GBL_CFG_INTR_THR_M ICE_M(0x3F, 9) +#define Q_REG_TX_MEM_GBL_CFG_INTR_ENA_S 15 +#define Q_REG_TX_MEM_GBL_CFG_INTR_ENA_M BIT(15) + +/* Tx Timestamp data registers */ +#define Q_REG_TX_MEMORY_BANK_START 0xA00 + +/* PHY port register base addresses */ +#define P_0_BASE 0x80000 +#define P_4_BASE 0x106000 + +/* Timestamp init registers */ +#define P_REG_RX_TIMER_INC_PRE_L 0x46C +#define P_REG_RX_TIMER_INC_PRE_U 0x470 +#define P_REG_TX_TIMER_INC_PRE_L 0x44C +#define P_REG_TX_TIMER_INC_PRE_U 0x450 + +/* Timestamp match and adjust target registers */ +#define P_REG_RX_TIMER_CNT_ADJ_L 0x474 +#define P_REG_RX_TIMER_CNT_ADJ_U 0x478 +#define P_REG_TX_TIMER_CNT_ADJ_L 0x454 +#define P_REG_TX_TIMER_CNT_ADJ_U 0x458 + +/* Timestamp capture registers */ +#define P_REG_RX_CAPTURE_L 0x4D8 +#define P_REG_RX_CAPTURE_U 0x4DC +#define P_REG_TX_CAPTURE_L 0x4B4 +#define P_REG_TX_CAPTURE_U 0x4B8 + +/* Timestamp PHY incval registers */ +#define P_REG_TIMETUS_L 0x410 +#define P_REG_TIMETUS_U 0x414 + +#define P_REG_TIMETUS_LOW_M 0xFF +#define P_REG_TIMETUS_HIGH_S 8 + +/* PHY window length registers */ +#define P_REG_WL 0x40C + +#define 
PTP_VERNIER_WL 0x111ed + +/* PHY start registers */ +#define P_REG_PS 0x408 +#define P_REG_PS_START_S 0 +#define P_REG_PS_START_M BIT(0) +#define P_REG_PS_BYPASS_MODE_S 1 +#define P_REG_PS_BYPASS_MODE_M BIT(1) +#define P_REG_PS_ENA_CLK_S 2 +#define P_REG_PS_ENA_CLK_M BIT(2) +#define P_REG_PS_LOAD_OFFSET_S 3 +#define P_REG_PS_LOAD_OFFSET_M BIT(3) +#define P_REG_PS_SFT_RESET_S 11 +#define P_REG_PS_SFT_RESET_M BIT(11) + +/* PHY offset valid registers */ +#define P_REG_TX_OV_STATUS 0x4D4 +#define P_REG_TX_OV_STATUS_OV_S 0 +#define P_REG_TX_OV_STATUS_OV_M BIT(0) +#define P_REG_RX_OV_STATUS 0x4F8 +#define P_REG_RX_OV_STATUS_OV_S 0 +#define P_REG_RX_OV_STATUS_OV_M BIT(0) + +/* PHY offset ready registers */ +#define P_REG_TX_OR 0x45C +#define P_REG_RX_OR 0x47C + +/* PHY total offset registers */ +#define P_REG_TOTAL_RX_OFFSET_L 0x460 +#define P_REG_TOTAL_RX_OFFSET_U 0x464 +#define P_REG_TOTAL_TX_OFFSET_L 0x440 +#define P_REG_TOTAL_TX_OFFSET_U 0x444 + +/* Timestamp PAR/PCS registers */ +#define P_REG_UIX66_10G_40G_L 0x480 +#define P_REG_UIX66_10G_40G_U 0x484 +#define P_REG_UIX66_25G_100G_L 0x488 +#define P_REG_UIX66_25G_100G_U 0x48C +#define P_REG_DESK_PAR_RX_TUS_L 0x490 +#define P_REG_DESK_PAR_RX_TUS_U 0x494 +#define P_REG_DESK_PAR_TX_TUS_L 0x498 +#define P_REG_DESK_PAR_TX_TUS_U 0x49C +#define P_REG_DESK_PCS_RX_TUS_L 0x4A0 +#define P_REG_DESK_PCS_RX_TUS_U 0x4A4 +#define P_REG_DESK_PCS_TX_TUS_L 0x4A8 +#define P_REG_DESK_PCS_TX_TUS_U 0x4AC +#define P_REG_PAR_RX_TUS_L 0x420 +#define P_REG_PAR_RX_TUS_U 0x424 +#define P_REG_PAR_TX_TUS_L 0x428 +#define P_REG_PAR_TX_TUS_U 0x42C +#define P_REG_PCS_RX_TUS_L 0x430 +#define P_REG_PCS_RX_TUS_U 0x434 +#define P_REG_PCS_TX_TUS_L 0x438 +#define P_REG_PCS_TX_TUS_U 0x43C +#define P_REG_PAR_RX_TIME_L 0x4F0 +#define P_REG_PAR_RX_TIME_U 0x4F4 +#define P_REG_PAR_TX_TIME_L 0x4CC +#define P_REG_PAR_TX_TIME_U 0x4D0 +#define P_REG_PAR_PCS_RX_OFFSET_L 0x4E8 +#define P_REG_PAR_PCS_RX_OFFSET_U 0x4EC +#define P_REG_PAR_PCS_TX_OFFSET_L 0x4C4 +#define P_REG_PAR_PCS_TX_OFFSET_U 0x4C8 +#define P_REG_LINK_SPEED 0x4FC +#define P_REG_LINK_SPEED_SERDES_S 0 +#define P_REG_LINK_SPEED_SERDES_M ICE_M(0x7, 0) +#define P_REG_LINK_SPEED_FEC_MODE_S 3 +#define P_REG_LINK_SPEED_FEC_MODE_M ICE_M(0x3, 3) +#define P_REG_LINK_SPEED_FEC_MODE(reg) \ + (((reg) & P_REG_LINK_SPEED_FEC_MODE_M) >> \ + P_REG_LINK_SPEED_FEC_MODE_S) + +/* PHY timestamp related registers */ +#define P_REG_PMD_ALIGNMENT 0x0FC +#define P_REG_RX_80_TO_160_CNT 0x6FC +#define P_REG_RX_80_TO_160_CNT_RXCYC_S 0 +#define P_REG_RX_80_TO_160_CNT_RXCYC_M BIT(0) +#define P_REG_RX_40_TO_160_CNT 0x8FC +#define P_REG_RX_40_TO_160_CNT_RXCYC_S 0 +#define P_REG_RX_40_TO_160_CNT_RXCYC_M ICE_M(0x3, 0) + +/* Rx FIFO status registers */ +#define P_REG_RX_OV_FS 0x4F8 +#define P_REG_RX_OV_FS_FIFO_STATUS_S 2 +#define P_REG_RX_OV_FS_FIFO_STATUS_M ICE_M(0x3FF, 2) + +/* Timestamp command registers */ +#define P_REG_TX_TMR_CMD 0x448 +#define P_REG_RX_TMR_CMD 0x468 + +/* E810 timesync enable register */ +#define ETH_GLTSYN_ENA(_i) (0x03000348 + ((_i) * 4)) + +/* E810 shadow init time registers */ +#define ETH_GLTSYN_SHTIME_0(i) (0x03000368 + ((i) * 32)) +#define ETH_GLTSYN_SHTIME_L(i) (0x0300036C + ((i) * 32)) + +/* E810 shadow time adjust registers */ +#define ETH_GLTSYN_SHADJ_L(_i) (0x03000378 + ((_i) * 32)) +#define ETH_GLTSYN_SHADJ_H(_i) (0x0300037C + ((_i) * 32)) + +/* E810 timer command register */ +#define ETH_GLTSYN_CMD 0x03000344 + +/* Source timer incval macros */ +#define INCVAL_HIGH_M 0xFF + +/* Timestamp block macros */ +#define TS_LOW_M 
0xFFFFFFFF +#define TS_HIGH_M 0xFF +#define TS_HIGH_S 32 + +#define TS_PHY_LOW_M 0xFF +#define TS_PHY_HIGH_M 0xFFFFFFFF +#define TS_PHY_HIGH_S 8 + +#define BYTES_PER_IDX_ADDR_L_U 8 +#define BYTES_PER_IDX_ADDR_L 4 + +/* Internal PHY timestamp address */ +#define TS_L(a, idx) ((a) + ((idx) * BYTES_PER_IDX_ADDR_L_U)) +#define TS_H(a, idx) ((a) + ((idx) * BYTES_PER_IDX_ADDR_L_U + \ + BYTES_PER_IDX_ADDR_L)) + +/* External PHY timestamp address */ +#define TS_EXT(a, port, idx) ((a) + (0x1000 * (port)) + \ + ((idx) * BYTES_PER_IDX_ADDR_L_U)) + +#define LOW_TX_MEMORY_BANK_START 0x03090000 +#define HIGH_TX_MEMORY_BANK_START 0x03090004 + +#endif /* _ICE_PTP_HW_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c new file mode 100644 index 0000000000000000000000000000000000000000..0f4d9b6e8a3323a0e83be4692d070b8e4d232d4f --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_eswitch.h" +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#include "ice_devlink.h" +#endif /* CONFIG_NET_DEVLINK */ +#include "ice_virtchnl_pf.h" +#include "ice_tc_lib.h" + +#ifdef HAVE_NDO_GET_PHYS_PORT_NAME +/** + * ice_repr_get_sw_port_id - get port ID associated with representor + * @repr: pointer to port representor + */ +static int ice_repr_get_sw_port_id(struct ice_repr *repr) +{ + return repr->vf->pf->hw.port_info->lport; +} + +/** + * ice_repr_get_phys_port_name - get phys port name + * @netdev: pointer to port representor netdev + * @buf: write here port name + * @len: max length of buf + */ +static int +ice_repr_get_phys_port_name(struct net_device *netdev, char *buf, size_t len) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_repr *repr = np->repr; + int res; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) + /* Devlink port is registered and devlink core is taking care of name formatting. 
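+	 * Returning -EOPNOTSUPP in that case lets the devlink core supply the
+	 * name rather than the "pf%dvfr%d" format used below.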
*/ + if (repr->vf->devlink_port.registered) + return -EOPNOTSUPP; +#endif /* CONFIG_NET_DEVLINK */ + + res = snprintf(buf, len, "pf%dvfr%d", ice_repr_get_sw_port_id(repr), + repr->vf->vf_id); + if (res <= 0) + return -EOPNOTSUPP; + return 0; +} +#endif /* HAVE_NDO_GET_PHYS_PORT_NAME */ + +/** + * ice_repr_get_stats64 - get VF stats for VFPR use + * @netdev: pointer to port representor netdev + * @stats: pointer to struct where stats can be stored + */ +#ifdef HAVE_VOID_NDO_GET_STATS64 +static void +#else +static struct rtnl_link_stats64 * +#endif +ice_repr_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_eth_stats *eth_stats; + struct ice_vsi *vsi; + + if (ice_check_vf_ready_for_cfg(np->repr->vf)) +#ifdef HAVE_VOID_NDO_GET_STATS64 + return; +#else + return stats; +#endif + vsi = np->repr->src_vsi; + + ice_update_vsi_stats(vsi); + eth_stats = &vsi->eth_stats; + + stats->tx_packets = eth_stats->tx_unicast + eth_stats->tx_broadcast + + eth_stats->tx_multicast; + stats->rx_packets = eth_stats->rx_unicast + eth_stats->rx_broadcast + + eth_stats->rx_multicast; + stats->tx_bytes = eth_stats->tx_bytes; + stats->rx_bytes = eth_stats->rx_bytes; + stats->multicast = eth_stats->rx_multicast; + stats->tx_errors = eth_stats->tx_errors; + stats->tx_dropped = eth_stats->tx_discards; + stats->rx_dropped = eth_stats->rx_discards; +#ifndef HAVE_VOID_NDO_GET_STATS64 + + return stats; +#endif +} + +/** + * ice_netdev_to_repr - Get port representor for given netdevice + * @netdev: pointer to port representor netdev + */ +struct ice_repr *ice_netdev_to_repr(struct net_device *netdev) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + + return np->repr; +} + +/** + * ice_repr_open - Enable port representor's network interface + * @netdev: network interface device structure + * + * The open entry point is called when a port representor's network + * interface is made active by the system (IFF_UP). Corresponding + * VF is notified about link status change. + * + * Returns 0 on success, negative value on failure + */ +static int ice_repr_open(struct net_device *netdev) +{ + struct ice_repr *repr = ice_netdev_to_repr(netdev); + struct ice_vf *vf; + + vf = repr->vf; + vf->link_forced = true; + vf->link_up = true; + ice_vc_notify_vf_link_state(vf); + + netif_carrier_on(netdev); + netif_tx_start_all_queues(netdev); + + return 0; +} + +/** + * ice_repr_stop - Disable port representor's network interface + * @netdev: network interface device structure + * + * The stop entry point is called when a port representor's network + * interface is de-activated by the system. Corresponding + * VF is notified about link status change. 
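+ * The stop path mirrors ice_repr_open: the VF link state is forced down
+ * first, then the representor's carrier is cleared and its Tx queues are
+ * stopped.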
+ *
+ * Returns 0 on success, negative value on failure
+ */
+static int ice_repr_stop(struct net_device *netdev)
+{
+	struct ice_repr *repr = ice_netdev_to_repr(netdev);
+	struct ice_vf *vf;
+
+	vf = repr->vf;
+	vf->link_forced = true;
+	vf->link_up = false;
+	ice_vc_notify_vf_link_state(vf);
+
+	netif_carrier_off(netdev);
+	netif_tx_stop_all_queues(netdev);
+
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK) && defined(HAVE_DEVLINK_PORT_ATTR_PCI_VF)
+static struct devlink_port *
+ice_repr_get_devlink_port(struct net_device *netdev)
+{
+	struct ice_repr *repr = ice_netdev_to_repr(netdev);
+
+	return &repr->vf->devlink_port;
+}
+#endif /* CONFIG_NET_DEVLINK && HAVE_DEVLINK_PORT_ATTR_PCI_VF */
+
+#ifdef HAVE_TC_SETUP_CLSFLOWER
+static int
+ice_repr_setup_tc_cls_flower(struct ice_repr *repr,
+			     struct flow_cls_offload *flower)
+{
+	switch (flower->command) {
+	case FLOW_CLS_REPLACE:
+		return ice_add_cls_flower(repr->netdev, repr->src_vsi, flower);
+	case FLOW_CLS_DESTROY:
+		return ice_del_cls_flower(repr->src_vsi, flower);
+	default:
+		return -EINVAL;
+	}
+}
+#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO
+static int
+ice_repr_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+			   void *cb_priv)
+{
+	struct flow_cls_offload *flower = (struct flow_cls_offload *)type_data;
+	struct ice_netdev_priv *np = (struct ice_netdev_priv *)cb_priv;
+
+	switch (type) {
+	case TC_SETUP_CLSFLOWER:
+		return ice_repr_setup_tc_cls_flower(np->repr, flower);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static LIST_HEAD(ice_repr_block_cb_list);
+#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */
+
+static int
+#ifdef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV
+ice_repr_setup_tc(struct net_device *netdev, enum tc_setup_type type,
+		  void *type_data)
+#elif defined(HAVE_NDO_SETUP_TC_CHAIN_INDEX)
+ice_repr_setup_tc(struct net_device *netdev, u32 __always_unused handle,
+		  __always_unused chain_index, __be16 proto,
+		  struct tc_to_netdev *tc)
+#else
+ice_repr_setup_tc(struct net_device *netdev, u32 __always_unused handle,
+		  __be16 __always_unused proto, struct tc_to_netdev *tc)
+#endif
+{
+#ifndef HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV
+	struct tc_cls_flower_offload *cls_flower = tc->cls_flower;
+	unsigned int type = tc->type;
+#elif !defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO)
+	struct tc_cls_flower_offload *cls_flower = (struct
+						    tc_cls_flower_offload *)
+						    type_data;
+#endif /* HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV */
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+
+	switch (type) {
+#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO
+	case TC_SETUP_BLOCK:
+		return flow_block_cb_setup_simple((struct flow_block_offload *)
+						  type_data,
+						  &ice_repr_block_cb_list,
+						  ice_repr_setup_tc_block_cb,
+						  np, np, true);
+#elif !defined(HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV) || !defined(HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO)
+	case TC_SETUP_CLSFLOWER:
+		return ice_repr_setup_tc_cls_flower(np->repr, cls_flower);
+#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+#endif /* HAVE_TC_SETUP_CLSFLOWER */
+
+static const struct net_device_ops ice_repr_netdev_ops = {
+#ifdef HAVE_NDO_GET_PHYS_PORT_NAME
+	.ndo_get_phys_port_name = ice_repr_get_phys_port_name,
+#endif /* HAVE_NDO_GET_PHYS_PORT_NAME */
+	.ndo_get_stats64 = ice_repr_get_stats64,
+	.ndo_open = ice_repr_open,
+	.ndo_stop = ice_repr_stop,
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+	.ndo_start_xmit = ice_eswitch_port_start_xmit,
+#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF
+	.ndo_get_devlink_port = ice_repr_get_devlink_port,
+#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */
+#ifdef
HAVE_TC_SETUP_CLSFLOWER +#ifdef HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC + .extended.ndo_setup_tc_rh = ice_repr_setup_tc, +#else + .ndo_setup_tc = ice_repr_setup_tc, +#endif /* HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC */ +#endif /* HAVE_TC_SETUP_CLSFLOWER */ +#endif /* CONFIG_NET_DEVLINK */ +}; + +/** + * ice_is_port_repr_netdev - Check if a given netdevice is a port representor + * netdev + * @netdev: pointer to netdev + */ +bool ice_is_port_repr_netdev(struct net_device *netdev) +{ + return netdev && (netdev->netdev_ops == &ice_repr_netdev_ops); +} + +/** + * ice_repr_reg_netdev - register port representor netdev + * @netdev: pointer to port representor netdev + */ +static int +ice_repr_reg_netdev(struct net_device *netdev) +{ + eth_hw_addr_random(netdev); + netdev->netdev_ops = &ice_repr_netdev_ops; + ice_set_ethtool_repr_ops(netdev); + +#ifdef NETIF_F_HW_TC + netdev->hw_features |= NETIF_F_HW_TC; +#endif /* NETIF_F_HW_TC */ + + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + + return register_netdev(netdev); +} + +/** + * ice_repr_add - add representor for VF + * @vf: pointer to VF structure + */ +static int ice_repr_add(struct ice_vf *vf) +{ + struct ice_q_vector *q_vector; + struct ice_netdev_priv *np; + struct ice_repr *repr; + int err; + + repr = kzalloc(sizeof(*repr), GFP_KERNEL); + if (!repr) + return -ENOMEM; + + repr->netdev = alloc_etherdev(sizeof(struct ice_netdev_priv)); + if (!repr->netdev) { + err = -ENOMEM; + goto err_alloc; + } + + repr->src_vsi = ice_get_vf_vsi(vf); + repr->vf = vf; + vf->repr = repr; + np = netdev_priv(repr->netdev); + np->repr = repr; + + q_vector = kzalloc(sizeof(*q_vector), GFP_KERNEL); + if (!q_vector) { + err = -ENOMEM; + goto err_alloc_q_vector; + } + repr->q_vector = q_vector; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + err = ice_devlink_create_vf_port(vf); + if (err) + goto err_devlink; +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* CONFIG_NET_DEVLINK */ + + err = ice_repr_reg_netdev(repr->netdev); + if (err) + goto err_netdev; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + devlink_port_type_eth_set(&vf->devlink_port, repr->netdev); +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* CONFIG_NET_DEVLINK */ + + return 0; + +err_netdev: +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + ice_devlink_destroy_vf_port(vf); +err_devlink: +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* CONFIG_NET_DEVLINK */ + kfree(repr->q_vector); + vf->repr->q_vector = NULL; +err_alloc_q_vector: + free_netdev(repr->netdev); + repr->netdev = NULL; +err_alloc: + kfree(repr); + vf->repr = NULL; + return err; +} + +/** + * ice_repr_rem - remove representor from VF + * @vf: pointer to VF structure + */ +static void ice_repr_rem(struct ice_vf *vf) +{ +#if IS_ENABLED(CONFIG_NET_DEVLINK) +#ifdef HAVE_DEVLINK_PORT_ATTR_PCI_VF + ice_devlink_destroy_vf_port(vf); +#endif /* HAVE_DEVLINK_PORT_ATTR_PCI_VF */ +#endif /* CONFIG_NET_DEVLINK */ + kfree(vf->repr->q_vector); + vf->repr->q_vector = NULL; + unregister_netdev(vf->repr->netdev); + free_netdev(vf->repr->netdev); + vf->repr->netdev = NULL; + kfree(vf->repr); + vf->repr = NULL; +} + + +/** + * ice_repr_add_for_all_vfs - add port representor for all VFs + * @pf: pointer to PF structure + */ +int ice_repr_add_for_all_vfs(struct ice_pf *pf) +{ + int err; + int i; + + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + err = ice_repr_add(vf); + if (err) + goto err; + + 
ice_vc_change_ops_to_repr(&vf->vc_ops); + } + + return 0; + +err: + for (i = i - 1; i >= 0; i--) { + struct ice_vf *vf = &pf->vf[i]; + + ice_repr_rem(vf); + ice_vc_set_dflt_vf_ops(&vf->vc_ops); + } + + return err; +} + +/** + * ice_repr_rem_from_all_vfs - remove port representor for all VFs + * @pf: pointer to PF structure + */ +void ice_repr_rem_from_all_vfs(struct ice_pf *pf) +{ + int i; + + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + ice_repr_rem(vf); + ice_vc_set_dflt_vf_ops(&vf->vc_ops); + } +} + +/** + * ice_repr_start_tx_queues - start Tx queues of port representor + * @repr: pointer to repr structure + */ +void ice_repr_start_tx_queues(struct ice_repr *repr) +{ + netif_carrier_on(repr->netdev); + netif_tx_start_all_queues(repr->netdev); +} + +/** + * ice_repr_stop_tx_queues - stop Tx queues of port representor + * @repr: pointer to repr structure + */ +void ice_repr_stop_tx_queues(struct ice_repr *repr) +{ + netif_carrier_off(repr->netdev); + netif_tx_stop_all_queues(repr->netdev); +} + +#ifdef HAVE_METADATA_PORT_INFO +/** + * ice_repr_set_traffic_vsi - set traffic VSI for port representor + * @repr: repr on with VSI will be set + * @vsi: pointer to VSI that will be used by port representor to pass traffic + */ +void ice_repr_set_traffic_vsi(struct ice_repr *repr, struct ice_vsi *vsi) +{ + struct ice_netdev_priv *np = netdev_priv(repr->netdev); + + np->vsi = vsi; +} +#endif /* HAVE_METADATA_PORT_INFO */ diff --git a/drivers/net/ethernet/intel/ice/ice_repr.h b/drivers/net/ethernet/intel/ice/ice_repr.h new file mode 100644 index 0000000000000000000000000000000000000000..a20c9d5ebe4fa68d7e7be158f1bb703d03750e6a --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_repr.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_REPR_H_ +#define _ICE_REPR_H_ +#include "ice.h" + +struct ice_repr { + struct ice_vsi *src_vsi; + struct ice_vf *vf; + struct ice_q_vector *q_vector; + struct net_device *netdev; + struct metadata_dst *dst; +}; + +int ice_repr_add_for_all_vfs(struct ice_pf *pf); +void ice_repr_rem_from_all_vfs(struct ice_pf *pf); +void ice_repr_start_tx_queues(struct ice_repr *repr); +void ice_repr_stop_tx_queues(struct ice_repr *repr); +#ifdef HAVE_METADATA_PORT_INFO +void ice_repr_set_traffic_vsi(struct ice_repr *repr, struct ice_vsi *vsi); +#endif /* HAVE_METADATA_PORT_INFO */ +#if IS_ENABLED(CONFIG_NET_DEVLINK) +bool ice_is_port_repr_netdev(struct net_device *netdev); +struct ice_repr *ice_netdev_to_repr(struct net_device *netdev); +#else +static inline +bool ice_is_port_repr_netdev(struct net_device *netdev) { return false; } +static inline +struct ice_repr *ice_netdev_to_repr(struct net_device *netdev) { return NULL; } +#endif /* CONFIG_NET_DEVLINK */ +#endif diff --git a/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h b/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h new file mode 100644 index 0000000000000000000000000000000000000000..0610fd43fd533b2fd913745a7227d26e5be84f48 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_SBQ_CMD_H_ +#define _ICE_SBQ_CMD_H_ + +/* This header file defines the Sideband Queue commands, error codes and + * descriptor format. It is shared between Firmware and Software. 
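+ *
+ * As an illustrative sketch (the field values here are placeholders, not a
+ * prescribed usage), a register read request travelling over the sideband
+ * queue is described by struct ice_sbq_msg_input at the end of this file:
+ *
+ *	struct ice_sbq_msg_input msg = {
+ *		.dest_dev = cgu,
+ *		.opcode = ice_sbq_msg_rd,
+ *		.msg_addr_low = (u16)(addr & 0xFFFF),
+ *		.msg_addr_high = (u32)(addr >> 16),
+ *	};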
+ */ + +/* Sideband Queue command structure and opcodes */ +enum ice_sbq_opc { + /* Sideband Queue commands */ + ice_sbq_opc_neigh_dev_req = 0x0C00, + ice_sbq_opc_neigh_dev_ev = 0x0C01 +}; + +/* Sideband Queue descriptor. Indirect command + * and non posted + */ +struct ice_sbq_cmd_desc { + __le16 flags; + __le16 opcode; + __le16 datalen; + __le16 cmd_retval; + + /* Opaque message data */ + __le32 cookie_high; + __le32 cookie_low; + + union { + __le16 cmd_len; + __le16 cmpl_len; + } param0; + + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +struct ice_sbq_evt_desc { + __le16 flags; + __le16 opcode; + __le16 datalen; + __le16 cmd_retval; + u8 data[24]; +}; + +enum ice_sbq_msg_dev { + rmn_0 = 0x02, + rmn_1 = 0x03, + rmn_2 = 0x04, + cgu = 0x06 +}; + +enum ice_sbq_msg_opcode { + ice_sbq_msg_rd = 0x00, + ice_sbq_msg_wr = 0x01 +}; + +#define ICE_SBQ_MSG_FLAGS 0x40 +#define ICE_SBQ_MSG_SBE_FBE 0x0F + +struct ice_sbq_msg_req { + u8 dest_dev; + u8 src_dev; + u8 opcode; + u8 flags; + u8 sbe_fbe; + u8 func_id; + __le16 msg_addr_low; + __le32 msg_addr_high; + __le32 data; +}; + +struct ice_sbq_msg_cmpl { + u8 dest_dev; + u8 src_dev; + u8 opcode; + u8 flags; + __le32 data; +}; + +/* Internal struct */ +struct ice_sbq_msg_input { + u8 dest_dev; + u8 opcode; + u16 msg_addr_low; + u32 msg_addr_high; + u32 data; +}; +#endif /* _ICE_SBQ_CMD_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index 2fde9653a608fb7bc520006561e01d1335b9f24a..bcfed501e99967ed6e20eda8bcf65774ceb37df5 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #include "ice_sched.h" + + /** * ice_sched_add_root_node - Insert the Tx scheduler root node in SW DB * @pi: port information structure @@ -129,7 +131,7 @@ ice_aqc_send_sched_elem_cmd(struct ice_hw *hw, enum ice_adminq_opc cmd_opc, */ enum ice_status ice_aq_query_sched_elems(struct ice_hw *hw, u16 elems_req, - struct ice_aqc_get_elem *buf, u16 buf_size, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd) { return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_get_sched_elems, @@ -149,8 +151,8 @@ enum ice_status ice_sched_add_node(struct ice_port_info *pi, u8 layer, struct ice_aqc_txsched_elem_data *info) { + struct ice_aqc_txsched_elem_data elem; struct ice_sched_node *parent; - struct ice_aqc_get_elem elem; struct ice_sched_node *node; enum ice_status status; struct ice_hw *hw; @@ -164,19 +166,17 @@ ice_sched_add_node(struct ice_port_info *pi, u8 layer, parent = ice_sched_find_node_by_teid(pi->root, le32_to_cpu(info->parent_teid)); if (!parent) { - ice_debug(hw, ICE_DBG_SCHED, - "Parent Node not found for parent_teid=0x%x\n", + ice_debug(hw, ICE_DBG_SCHED, "Parent Node not found for parent_teid=0x%x\n", le32_to_cpu(info->parent_teid)); return ICE_ERR_PARAM; } - /* query the current node information from FW before additing it + /* query the current node information from FW before adding it * to the SW DB */ status = ice_sched_query_elem(hw, le32_to_cpu(info->node_teid), &elem); if (status) return status; - node = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*node), GFP_KERNEL); if (!node) return ICE_ERR_NO_MEMORY; @@ -195,7 +195,7 @@ ice_sched_add_node(struct ice_port_info *pi, u8 layer, node->parent = parent; node->tx_sched_layer = layer; parent->children[parent->num_children++] = node; - memcpy(&node->info, &elem.generic[0], sizeof(node->info)); + node->info = elem; return 0; } @@ -238,7 +238,7 @@ ice_sched_remove_elems(struct ice_hw *hw, struct ice_sched_node *parent, enum ice_status status; u16 buf_size; - buf_size = sizeof(*buf) + sizeof(u32) * (num_nodes - 1); + buf_size = struct_size(buf, teid, num_nodes); buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); if (!buf) return ICE_ERR_NO_MEMORY; @@ -370,7 +370,7 @@ void ice_free_sched_node(struct ice_port_info *pi, struct ice_sched_node *node) * * Get default scheduler topology (0x400) */ -static enum ice_status +enum ice_status ice_aq_get_dflt_topo(struct ice_hw *hw, u8 lport, struct ice_aqc_get_topo_elem *buf, u16 buf_size, u8 *num_branches, struct ice_sq_cd *cd) @@ -410,6 +410,48 @@ ice_aq_add_sched_elems(struct ice_hw *hw, u16 grps_req, grps_added, cd); } +/** + * ice_aq_cfg_sched_elems - configures scheduler elements + * @hw: pointer to the HW struct + * @elems_req: number of elements to configure + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @elems_cfgd: returns total number of elements configured + * @cd: pointer to command details structure or NULL + * + * Configure scheduling elements (0x0403) + */ +static enum ice_status +ice_aq_cfg_sched_elems(struct ice_hw *hw, u16 elems_req, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, + u16 *elems_cfgd, struct ice_sq_cd *cd) +{ + return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_cfg_sched_elems, + elems_req, (void *)buf, buf_size, + elems_cfgd, cd); +} + +/** + * ice_aq_move_sched_elems - move scheduler elements + * @hw: pointer to the HW struct + * @grps_req: number of groups to move + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @grps_movd: returns total number of groups moved + * @cd: 
pointer to command details structure or NULL + * + * Move scheduling elements (0x0408) + */ +static enum ice_status +ice_aq_move_sched_elems(struct ice_hw *hw, u16 grps_req, + struct ice_aqc_move_elem *buf, u16 buf_size, + u16 *grps_movd, struct ice_sq_cd *cd) +{ + return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_move_sched_elems, + grps_req, (void *)buf, buf_size, + grps_movd, cd); +} + /** * ice_aq_suspend_sched_elems - suspend scheduler elements * @hw: pointer to the HW struct @@ -422,8 +464,7 @@ ice_aq_add_sched_elems(struct ice_hw *hw, u16 grps_req, * Suspend scheduling elements (0x0409) */ static enum ice_status -ice_aq_suspend_sched_elems(struct ice_hw *hw, u16 elems_req, - struct ice_aqc_suspend_resume_elem *buf, +ice_aq_suspend_sched_elems(struct ice_hw *hw, u16 elems_req, __le32 *buf, u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd) { return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_suspend_sched_elems, @@ -443,8 +484,7 @@ ice_aq_suspend_sched_elems(struct ice_hw *hw, u16 elems_req, * resume scheduling elements (0x040A) */ static enum ice_status -ice_aq_resume_sched_elems(struct ice_hw *hw, u16 elems_req, - struct ice_aqc_suspend_resume_elem *buf, +ice_aq_resume_sched_elems(struct ice_hw *hw, u16 elems_req, __le32 *buf, u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd) { return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_resume_sched_elems, @@ -485,9 +525,9 @@ static enum ice_status ice_sched_suspend_resume_elems(struct ice_hw *hw, u8 num_nodes, u32 *node_teids, bool suspend) { - struct ice_aqc_suspend_resume_elem *buf; u16 i, buf_size, num_elem_ret = 0; enum ice_status status; + __le32 *buf; buf_size = sizeof(*buf) * num_nodes; buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); @@ -495,7 +535,7 @@ ice_sched_suspend_resume_elems(struct ice_hw *hw, u8 num_nodes, u32 *node_teids, return ICE_ERR_NO_MEMORY; for (i = 0; i < num_nodes; i++) - buf->teid[i] = cpu_to_le32(node_teids[i]); + buf[i] = cpu_to_le32(node_teids[i]); if (suspend) status = ice_aq_suspend_sched_elems(hw, num_nodes, buf, @@ -556,6 +596,208 @@ ice_alloc_lan_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 new_numqs) return 0; } +/** + * ice_alloc_rdma_q_ctx - allocate RDMA queue contexts for the given VSI and TC + * @hw: pointer to the HW struct + * @vsi_handle: VSI handle + * @tc: TC number + * @new_numqs: number of queues + */ +static enum ice_status +ice_alloc_rdma_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 new_numqs) +{ + struct ice_vsi_ctx *vsi_ctx; + struct ice_q_ctx *q_ctx; + + vsi_ctx = ice_get_vsi_ctx(hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + /* allocate RDMA queue contexts */ + if (!vsi_ctx->rdma_q_ctx[tc]) { + vsi_ctx->rdma_q_ctx[tc] = devm_kcalloc(ice_hw_to_dev(hw), + new_numqs, + sizeof(*q_ctx), + GFP_KERNEL); + if (!vsi_ctx->rdma_q_ctx[tc]) + return ICE_ERR_NO_MEMORY; + vsi_ctx->num_rdma_q_entries[tc] = new_numqs; + return 0; + } + /* num queues are increased, update the queue contexts */ + if (new_numqs > vsi_ctx->num_rdma_q_entries[tc]) { + u16 prev_num = vsi_ctx->num_rdma_q_entries[tc]; + + q_ctx = devm_kcalloc(ice_hw_to_dev(hw), new_numqs, + sizeof(*q_ctx), GFP_KERNEL); + if (!q_ctx) + return ICE_ERR_NO_MEMORY; + memcpy(q_ctx, vsi_ctx->rdma_q_ctx[tc], + prev_num * sizeof(*q_ctx)); + devm_kfree(ice_hw_to_dev(hw), vsi_ctx->rdma_q_ctx[tc]); + vsi_ctx->rdma_q_ctx[tc] = q_ctx; + vsi_ctx->num_rdma_q_entries[tc] = new_numqs; + } + return 0; +} + +/** + * ice_aq_rl_profile - performs a rate limiting task + * @hw: pointer to the HW struct + * @opcode: opcode 
for add, query, or remove profile(s)
+ * @num_profiles: the number of profiles
+ * @buf: pointer to buffer
+ * @buf_size: buffer size in bytes
+ * @num_processed: number of processed add or remove profile(s) to return
+ * @cd: pointer to command details structure
+ *
+ * RL profile function to add, query, or remove profile(s)
+ */
+static enum ice_status
+ice_aq_rl_profile(struct ice_hw *hw, enum ice_adminq_opc opcode,
+		  u16 num_profiles, struct ice_aqc_rl_profile_elem *buf,
+		  u16 buf_size, u16 *num_processed, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_rl_profile *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+
+	cmd = &desc.params.rl_profile;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, opcode);
+	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	cmd->num_profiles = cpu_to_le16(num_profiles);
+	status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
+	if (!status && num_processed)
+		*num_processed = le16_to_cpu(cmd->num_processed);
+	return status;
+}
+
+/**
+ * ice_aq_add_rl_profile - adds rate limiting profile(s)
+ * @hw: pointer to the HW struct
+ * @num_profiles: the number of profile(s) to be added
+ * @buf: pointer to buffer
+ * @buf_size: buffer size in bytes
+ * @num_profiles_added: total number of profiles added to return
+ * @cd: pointer to command details structure
+ *
+ * Add RL profile (0x0410)
+ */
+static enum ice_status
+ice_aq_add_rl_profile(struct ice_hw *hw, u16 num_profiles,
+		      struct ice_aqc_rl_profile_elem *buf, u16 buf_size,
+		      u16 *num_profiles_added, struct ice_sq_cd *cd)
+{
+	return ice_aq_rl_profile(hw, ice_aqc_opc_add_rl_profiles, num_profiles,
+				 buf, buf_size, num_profiles_added, cd);
+}
+
+/**
+ * ice_aq_query_rl_profile - query rate limiting profile(s)
+ * @hw: pointer to the HW struct
+ * @num_profiles: the number of profile(s) to query
+ * @buf: pointer to buffer
+ * @buf_size: buffer size in bytes
+ * @cd: pointer to command details structure
+ *
+ * Query RL profile (0x0411)
+ */
+enum ice_status
+ice_aq_query_rl_profile(struct ice_hw *hw, u16 num_profiles,
+			struct ice_aqc_rl_profile_elem *buf, u16 buf_size,
+			struct ice_sq_cd *cd)
+{
+	return ice_aq_rl_profile(hw, ice_aqc_opc_query_rl_profiles,
+				 num_profiles, buf, buf_size, NULL, cd);
+}
+
+/**
+ * ice_aq_remove_rl_profile - removes RL profile(s)
+ * @hw: pointer to the HW struct
+ * @num_profiles: the number of profile(s) to remove
+ * @buf: pointer to buffer
+ * @buf_size: buffer size in bytes
+ * @num_profiles_removed: total number of profiles removed to return
+ * @cd: pointer to command details structure or NULL
+ *
+ * Remove RL profile (0x0415)
+ */
+static enum ice_status
+ice_aq_remove_rl_profile(struct ice_hw *hw, u16 num_profiles,
+			 struct ice_aqc_rl_profile_elem *buf, u16 buf_size,
+			 u16 *num_profiles_removed, struct ice_sq_cd *cd)
+{
+	return ice_aq_rl_profile(hw, ice_aqc_opc_remove_rl_profiles,
+				 num_profiles, buf, buf_size,
+				 num_profiles_removed, cd);
+}
+
+/**
+ * ice_sched_del_rl_profile - remove RL profile
+ * @hw: pointer to the HW struct
+ * @rl_info: rate limit profile information
+ *
+ * If the profile ID is not referenced anymore, it removes profile ID with
+ * its associated parameters from HW DB, and locally. The caller needs to
+ * hold scheduler lock.
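+ *
+ * ICE_ERR_IN_USE is returned while prof_id_ref is still nonzero; callers
+ * such as ice_sched_clear_rl_prof() below drop the reference count to zero
+ * before invoking this function.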
+ */ +static enum ice_status +ice_sched_del_rl_profile(struct ice_hw *hw, + struct ice_aqc_rl_profile_info *rl_info) +{ + struct ice_aqc_rl_profile_elem *buf; + u16 num_profiles_removed; + enum ice_status status; + u16 num_profiles = 1; + + if (rl_info->prof_id_ref != 0) + return ICE_ERR_IN_USE; + + /* Safe to remove profile ID */ + buf = &rl_info->profile; + status = ice_aq_remove_rl_profile(hw, num_profiles, buf, sizeof(*buf), + &num_profiles_removed, NULL); + if (status || num_profiles_removed != num_profiles) + return ICE_ERR_CFG; + + /* Delete stale entry now */ + list_del(&rl_info->list_entry); + devm_kfree(ice_hw_to_dev(hw), rl_info); + return status; +} + +/** + * ice_sched_clear_rl_prof - clears RL prof entries + * @pi: port information structure + * + * This function removes all RL profile from HW as well as from SW DB. + */ +static void ice_sched_clear_rl_prof(struct ice_port_info *pi) +{ + u16 ln; + struct ice_hw *hw = pi->hw; + + + for (ln = 0; ln < hw->num_tx_sched_layers; ln++) { + struct ice_aqc_rl_profile_info *rl_prof_elem; + struct ice_aqc_rl_profile_info *rl_prof_tmp; + + list_for_each_entry_safe(rl_prof_elem, rl_prof_tmp, + &hw->rl_prof_list[ln], list_entry) { + enum ice_status status; + + rl_prof_elem->prof_id_ref = 0; + status = ice_sched_del_rl_profile(hw, rl_prof_elem); + if (status) { + ice_debug(hw, ICE_DBG_SCHED, "Remove rl profile failed\n"); + /* On error, free mem required */ + list_del(&rl_prof_elem->list_entry); + devm_kfree(ice_hw_to_dev(hw), rl_prof_elem); + } + } + } +} + /** * ice_sched_clear_agg - clears the aggregator related information * @hw: pointer to the hardware structure @@ -592,6 +834,8 @@ static void ice_sched_clear_tx_topo(struct ice_port_info *pi) { if (!pi) return; + /* remove RL profiles related lists */ + ice_sched_clear_rl_prof(pi); if (pi->root) { ice_free_sched_node(pi, pi->root); pi->root = NULL; @@ -632,8 +876,7 @@ void ice_sched_cleanup_all(struct ice_hw *hw) hw->layer_info = NULL; } - if (hw->port_info) - ice_sched_clear_port(hw->port_info); + ice_sched_clear_port(hw->port_info); hw->num_tx_sched_layers = 0; hw->num_tx_sched_phys_layers = 0; @@ -641,6 +884,33 @@ void ice_sched_cleanup_all(struct ice_hw *hw) hw->max_cgds = 0; } +/** + * ice_aq_cfg_l2_node_cgd - configures L2 node to CGD mapping + * @hw: pointer to the HW struct + * @num_l2_nodes: the number of L2 nodes whose CGDs to configure + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @cd: pointer to command details structure or NULL + * + * Configure L2 Node CGD (0x0414) + */ +enum ice_status +ice_aq_cfg_l2_node_cgd(struct ice_hw *hw, u16 num_l2_nodes, + struct ice_aqc_cfg_l2_node_cgd_elem *buf, + u16 buf_size, struct ice_sq_cd *cd) +{ + struct ice_aqc_cfg_l2_node_cgd *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.cfg_l2_node_cgd; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_cfg_l2_node_cgd); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_l2_nodes = cpu_to_le16(num_l2_nodes); + return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); +} + + /** * ice_sched_add_elems - add nodes to HW and SW DB * @pi: port information structure @@ -663,10 +933,10 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, u16 i, num_groups_added = 0; enum ice_status status = 0; struct ice_hw *hw = pi->hw; - size_t buf_size; + u16 buf_size; u32 teid; - buf_size = struct_size(buf, generic, num_nodes - 1); + buf_size = struct_size(buf, generic, num_nodes); buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL); if (!buf) return 
ICE_ERR_NO_MEMORY; @@ -704,8 +974,7 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, for (i = 0; i < num_nodes; i++) { status = ice_sched_add_node(pi, layer, &buf->generic[i]); if (status) { - ice_debug(hw, ICE_DBG_SCHED, - "add nodes in SW DB failed status =%d\n", + ice_debug(hw, ICE_DBG_SCHED, "add nodes in SW DB failed status =%d\n", status); break; } @@ -713,8 +982,7 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, teid = le32_to_cpu(buf->generic[i].node_teid); new_node = ice_sched_find_node_by_teid(parent, teid); if (!new_node) { - ice_debug(hw, ICE_DBG_SCHED, - "Node is missing for teid =%d\n", teid); + ice_debug(hw, ICE_DBG_SCHED, "Node is missing for teid =%d\n", teid); break; } @@ -743,7 +1011,7 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, } /** - * ice_sched_add_nodes_to_layer - Add nodes to a given layer + * ice_sched_add_nodes_to_hw_layer - Add nodes to hw layer * @pi: port information structure * @tc_node: pointer to TC node * @parent: pointer to parent node @@ -752,82 +1020,107 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node, * @first_node_teid: pointer to the first node TEID * @num_nodes_added: pointer to number of nodes added * - * This function add nodes to a given layer. + * Add nodes into specific hw layer. */ static enum ice_status -ice_sched_add_nodes_to_layer(struct ice_port_info *pi, - struct ice_sched_node *tc_node, - struct ice_sched_node *parent, u8 layer, - u16 num_nodes, u32 *first_node_teid, - u16 *num_nodes_added) +ice_sched_add_nodes_to_hw_layer(struct ice_port_info *pi, + struct ice_sched_node *tc_node, + struct ice_sched_node *parent, u8 layer, + u16 num_nodes, u32 *first_node_teid, + u16 *num_nodes_added) { - u32 *first_teid_ptr = first_node_teid; - u16 new_num_nodes, max_child_nodes; - enum ice_status status = 0; - struct ice_hw *hw = pi->hw; - u16 num_added = 0; - u32 temp; + u16 max_child_nodes; *num_nodes_added = 0; if (!num_nodes) - return status; + return 0; - if (!parent || layer < hw->sw_entry_point_layer) + if (!parent || layer < pi->hw->sw_entry_point_layer) return ICE_ERR_PARAM; /* max children per node per layer */ - max_child_nodes = hw->max_children[parent->tx_sched_layer]; + max_child_nodes = pi->hw->max_children[parent->tx_sched_layer]; - /* current number of children + required nodes exceed max children ? */ + /* current number of children + required nodes exceed max children */ if ((parent->num_children + num_nodes) > max_child_nodes) { /* Fail if the parent is a TC node */ if (parent == tc_node) return ICE_ERR_CFG; + return ICE_ERR_MAX_LIMIT; + } + + return ice_sched_add_elems(pi, tc_node, parent, layer, num_nodes, + num_nodes_added, first_node_teid); +} + +/** + * ice_sched_add_nodes_to_layer - Add nodes to a given layer + * @pi: port information structure + * @tc_node: pointer to TC node + * @parent: pointer to parent node + * @layer: layer number to add nodes + * @num_nodes: number of nodes to be added + * @first_node_teid: pointer to the first node TEID + * @num_nodes_added: pointer to number of nodes added + * + * This function add nodes to a given layer. 
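+ *
+ * The recursion in the previous implementation is replaced by the loop
+ * below: on ICE_ERR_MAX_LIMIT the remaining space under the current parent
+ * is used first, then the next sibling parent is tried, with a scratch TEID
+ * variable so only the very first node added reports its TEID to the caller.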
+ */ +static enum ice_status +ice_sched_add_nodes_to_layer(struct ice_port_info *pi, + struct ice_sched_node *tc_node, + struct ice_sched_node *parent, u8 layer, + u16 num_nodes, u32 *first_node_teid, + u16 *num_nodes_added) +{ + u32 *first_teid_ptr = first_node_teid; + u16 new_num_nodes = num_nodes; + enum ice_status status = 0; + *num_nodes_added = 0; + while (*num_nodes_added < num_nodes) { + u16 max_child_nodes, num_added = 0; + /* cppcheck-suppress unusedVariable */ + u32 temp; + + status = ice_sched_add_nodes_to_hw_layer(pi, tc_node, parent, + layer, new_num_nodes, + first_teid_ptr, + &num_added); + if (!status) + *num_nodes_added += num_added; + /* added more nodes than requested ? */ + if (*num_nodes_added > num_nodes) { + ice_debug(pi->hw, ICE_DBG_SCHED, "added extra nodes %d %d\n", num_nodes, + *num_nodes_added); + status = ICE_ERR_CFG; + break; + } + /* break if all the nodes are added successfully */ + if (!status && (*num_nodes_added == num_nodes)) + break; + /* break if the error is not max limit */ + if (status && status != ICE_ERR_MAX_LIMIT) + break; + /* Exceeded the max children */ + max_child_nodes = pi->hw->max_children[parent->tx_sched_layer]; /* utilize all the spaces if the parent is not full */ if (parent->num_children < max_child_nodes) { new_num_nodes = max_child_nodes - parent->num_children; - /* this recursion is intentional, and wouldn't - * go more than 2 calls + } else { + /* This parent is full, try the next sibling */ + parent = parent->sibling; + /* Don't modify the first node TEID memory if the + * first node was added already in the above call. + * Instead send some temp memory for all other + * recursive calls. */ - status = ice_sched_add_nodes_to_layer(pi, tc_node, - parent, layer, - new_num_nodes, - first_node_teid, - &num_added); - if (status) - return status; + if (num_added) + first_teid_ptr = &temp; - *num_nodes_added += num_added; + new_num_nodes = num_nodes - *num_nodes_added; } - /* Don't modify the first node TEID memory if the first node was - * added already in the above call. Instead send some temp - * memory for all other recursive calls. - */ - if (num_added) - first_teid_ptr = &temp; - - new_num_nodes = num_nodes - num_added; - - /* This parent is full, try the next sibling */ - parent = parent->sibling; - - /* this recursion is intentional, for 1024 queues - * per VSI, it goes max of 16 iterations. - * 1024 / 8 = 128 layer 8 nodes - * 128 /8 = 16 (add 8 nodes per iteration) - */ - status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, - layer, new_num_nodes, - first_teid_ptr, - &num_added); - *num_nodes_added += num_added; - return status; } - - status = ice_sched_add_elems(pi, tc_node, parent, layer, num_nodes, - num_nodes_added, first_node_teid); return status; } @@ -866,6 +1159,29 @@ static u8 ice_sched_get_vsi_layer(struct ice_hw *hw) return hw->sw_entry_point_layer; } +/** + * ice_sched_get_agg_layer - get the current aggregator layer number + * @hw: pointer to the HW struct + * + * This function returns the current aggregator layer number + */ +static u8 ice_sched_get_agg_layer(struct ice_hw *hw) +{ + /* Num Layers aggregator layer + * 9 4 + * 7 or less sw_entry_point_layer + */ + /* calculate the aggregator layer based on number of layers. 
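+	 * For the default 9-layer tree this works out to layer 4, as the
+	 * table above shows; trees of 7 or fewer layers fall back to the
+	 * SW entry point layer.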
*/ + if (hw->num_tx_sched_layers > ICE_AGG_LAYER_OFFSET + 1) { + u8 layer = hw->num_tx_sched_layers - ICE_AGG_LAYER_OFFSET; + + if (layer > hw->sw_entry_point_layer) + return layer; + } + return hw->sw_entry_point_layer; +} + + /** * ice_rm_dflt_leaf_node - remove the default leaf node in the tree * @pi: port information structure @@ -944,6 +1260,7 @@ enum ice_status ice_sched_init_port(struct ice_port_info *pi) return ICE_ERR_PARAM; hw = pi->hw; + /* Query the Default Topology from FW */ buf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); if (!buf) @@ -1014,6 +1331,8 @@ enum ice_status ice_sched_init_port(struct ice_port_info *pi) /* initialize the port for handling the scheduler tree */ pi->port_state = ICE_SCHED_PORT_STATE_READY; mutex_init(&pi->sched_lock); + for (i = 0; i < ICE_AQC_TOPO_MAX_LEVEL_NUM; i++) + INIT_LIST_HEAD(&hw->rl_prof_list[i]); err_init_port: if (status && pi->root) { @@ -1025,6 +1344,32 @@ enum ice_status ice_sched_init_port(struct ice_port_info *pi) return status; } +/** + * ice_sched_get_node - Get the struct ice_sched_node for given TEID + * @pi: port information structure + * @teid: Scheduler node TEID + * + * This function retrieves the ice_sched_node struct for given TEID from + * the SW DB and returns it to the caller. + */ +struct ice_sched_node *ice_sched_get_node(struct ice_port_info *pi, u32 teid) +{ + struct ice_sched_node *node; + + if (!pi) + return NULL; + + /* Find the node starting from root */ + mutex_lock(&pi->sched_lock); + node = ice_sched_find_node_by_teid(pi->root, teid); + mutex_unlock(&pi->sched_lock); + + if (!node) + ice_debug(pi->hw, ICE_DBG_SCHED, "Node not found for teid=0x%x\n", teid); + + return node; +} + /** * ice_sched_query_res_alloc - query the FW for num of logical sched layers * @hw: pointer to the HW struct @@ -1036,7 +1381,7 @@ enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw) struct ice_aqc_query_txsched_res_resp *buf; enum ice_status status = 0; __le16 max_sibl; - u16 i; + u8 i; if (hw->layer_info) return status; @@ -1062,25 +1407,65 @@ enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw) * and so on. This array will be populated from root (index 0) to * qgroup layer 7. Leaf node has no children. 
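+ * Note: the updated loop below reads max_children[i] from
+ * layer_props[i + 1], since a node's child count is bounded by the
+ * sibling group size of the layer beneath it; the leaf layer is
+ * skipped because leaf nodes have no children.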
*/ - for (i = 0; i < hw->num_tx_sched_layers; i++) { - max_sibl = buf->layer_props[i].max_sibl_grp_sz; + for (i = 0; i < hw->num_tx_sched_layers - 1; i++) { + max_sibl = buf->layer_props[i + 1].max_sibl_grp_sz; hw->max_children[i] = le16_to_cpu(max_sibl); } hw->layer_info = devm_kmemdup(ice_hw_to_dev(hw), buf->layer_props, - (hw->num_tx_sched_layers * - sizeof(*hw->layer_info)), + (hw->num_tx_sched_layers * sizeof(*hw->layer_info)), GFP_KERNEL); if (!hw->layer_info) { status = ICE_ERR_NO_MEMORY; goto sched_query_out; } + sched_query_out: devm_kfree(ice_hw_to_dev(hw), buf); return status; } +/** + * ice_sched_get_psm_clk_freq - determine the PSM clock frequency + * @hw: pointer to the HW struct + * + * Determine the PSM clock frequency and store in HW struct + */ +void ice_sched_get_psm_clk_freq(struct ice_hw *hw) +{ + u32 val, clk_src; + + val = rd32(hw, GLGEN_CLKSTAT_SRC); + clk_src = (val & GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_M) >> + GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_S; + +#define PSM_CLK_SRC_367_MHZ 0x0 +#define PSM_CLK_SRC_416_MHZ 0x1 +#define PSM_CLK_SRC_446_MHZ 0x2 +#define PSM_CLK_SRC_390_MHZ 0x3 + + switch (clk_src) { + case PSM_CLK_SRC_367_MHZ: + hw->psm_clk_freq = ICE_PSM_CLK_367MHZ_IN_HZ; + break; + case PSM_CLK_SRC_416_MHZ: + hw->psm_clk_freq = ICE_PSM_CLK_416MHZ_IN_HZ; + break; + case PSM_CLK_SRC_446_MHZ: + hw->psm_clk_freq = ICE_PSM_CLK_446MHZ_IN_HZ; + break; + case PSM_CLK_SRC_390_MHZ: + hw->psm_clk_freq = ICE_PSM_CLK_390MHZ_IN_HZ; + break; + default: + ice_debug(hw, ICE_DBG_SCHED, "PSM clk_src unexpected %u\n", + clk_src); + /* fall back to a safe default */ + hw->psm_clk_freq = ICE_PSM_CLK_446MHZ_IN_HZ; + } +} + /** * ice_sched_find_node_in_subtree - Find node in part of base node subtree * @hw: pointer to the HW struct @@ -1090,7 +1475,7 @@ enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw) * This function checks whether a given node is part of the base node * subtree or not */ -static bool +bool ice_sched_find_node_in_subtree(struct ice_hw *hw, struct ice_sched_node *base, struct ice_sched_node *node) { @@ -1114,6 +1499,53 @@ ice_sched_find_node_in_subtree(struct ice_hw *hw, struct ice_sched_node *base, return false; } +/** + * ice_sched_get_free_qgrp - Scan all queue group siblings and find a free node + * @pi: port information structure + * @vsi_node: software VSI handle + * @qgrp_node: first queue group node identified for scanning + * @owner: LAN or RDMA + * + * This function retrieves a free LAN or RDMA queue group node by scanning + * qgrp_node and its siblings for the queue group with the fewest number + * of queues currently assigned. + */ +static struct ice_sched_node * +ice_sched_get_free_qgrp(struct ice_port_info *pi, + struct ice_sched_node *vsi_node, + struct ice_sched_node *qgrp_node, u8 owner) +{ + struct ice_sched_node *min_qgrp; + u8 min_children; + + if (!qgrp_node) + return qgrp_node; + min_children = qgrp_node->num_children; + if (!min_children) + return qgrp_node; + min_qgrp = qgrp_node; + /* scan all queue groups until find a node which has less than the + * minimum number of children. This way all queue group nodes get + * equal number of shares and active. The bandwidth will be equally + * distributed across all queues. 
+ */ + while (qgrp_node) { + /* make sure the qgroup node is part of the VSI subtree */ + if (ice_sched_find_node_in_subtree(pi->hw, vsi_node, qgrp_node)) + if (qgrp_node->num_children < min_children && + qgrp_node->owner == owner) { + /* replace the new min queue group node */ + min_qgrp = qgrp_node; + min_children = min_qgrp->num_children; + /* break if it has no children, */ + if (!min_children) + break; + } + qgrp_node = qgrp_node->sibling; + } + return min_qgrp; +} + /** * ice_sched_get_free_qparent - Get a free LAN or RDMA queue group node * @pi: port information structure @@ -1127,7 +1559,7 @@ struct ice_sched_node * ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 owner) { - struct ice_sched_node *vsi_node, *qgrp_node = NULL; + struct ice_sched_node *vsi_node, *qgrp_node; struct ice_vsi_ctx *vsi_ctx; u16 max_children; u8 qgrp_layer; @@ -1141,7 +1573,7 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc, vsi_node = vsi_ctx->sched.vsi_node[tc]; /* validate invalid VSI ID */ if (!vsi_node) - goto lan_q_exit; + return NULL; /* get the first queue group node from VSI sub-tree */ qgrp_node = ice_sched_get_first_node(pi, vsi_node, qgrp_layer); @@ -1154,28 +1586,28 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc, qgrp_node = qgrp_node->sibling; } -lan_q_exit: - return qgrp_node; + /* Select the best queue group */ + return ice_sched_get_free_qgrp(pi, vsi_node, qgrp_node, owner); } /** * ice_sched_get_vsi_node - Get a VSI node based on VSI ID - * @hw: pointer to the HW struct + * @pi: pointer to the port information structure * @tc_node: pointer to the TC node * @vsi_handle: software VSI handle * * This function retrieves a VSI node for a given VSI ID from a given * TC branch */ -static struct ice_sched_node * -ice_sched_get_vsi_node(struct ice_hw *hw, struct ice_sched_node *tc_node, +struct ice_sched_node * +ice_sched_get_vsi_node(struct ice_port_info *pi, struct ice_sched_node *tc_node, u16 vsi_handle) { struct ice_sched_node *node; u8 vsi_layer; - vsi_layer = ice_sched_get_vsi_layer(hw); - node = ice_sched_get_first_node(hw->port_info, tc_node, vsi_layer); + vsi_layer = ice_sched_get_vsi_layer(pi->hw); + node = ice_sched_get_first_node(pi, tc_node, vsi_layer); /* Check whether it already exists */ while (node) { @@ -1187,6 +1619,65 @@ ice_sched_get_vsi_node(struct ice_hw *hw, struct ice_sched_node *tc_node, return node; } +/** + * ice_sched_get_agg_node - Get an aggregator node based on aggregator ID + * @pi: pointer to the port information structure + * @tc_node: pointer to the TC node + * @agg_id: aggregator ID + * + * This function retrieves an aggregator node for a given aggregator ID from + * a given TC branch + */ +static struct ice_sched_node * +ice_sched_get_agg_node(struct ice_port_info *pi, struct ice_sched_node *tc_node, + u32 agg_id) +{ + struct ice_sched_node *node; + struct ice_hw *hw = pi->hw; + u8 agg_layer; + + if (!hw) + return NULL; + agg_layer = ice_sched_get_agg_layer(hw); + node = ice_sched_get_first_node(pi, tc_node, agg_layer); + + /* Check whether it already exists */ + while (node) { + if (node->agg_id == agg_id) + return node; + node = node->sibling; + } + + return node; +} + +/** + * ice_sched_check_node - Compare node parameters between SW DB and HW DB + * @hw: pointer to the HW struct + * @node: pointer to the ice_sched_node struct + * + * This function queries and compares the HW element with SW DB node parameters + */ +static bool ice_sched_check_node(struct ice_hw *hw, 
struct ice_sched_node *node) +{ + struct ice_aqc_txsched_elem_data buf; + enum ice_status status; + u32 node_teid; + + node_teid = le32_to_cpu(node->info.node_teid); + status = ice_sched_query_elem(hw, node_teid, &buf); + if (status) + return false; + + if (memcmp(&buf, &node->info, sizeof(buf))) { + ice_debug(hw, ICE_DBG_SCHED, "Node mismatch for teid=0x%x\n", + node_teid); + return false; + } + + return true; +} + /** * ice_sched_calc_vsi_child_nodes - calculate number of VSI child nodes * @hw: pointer to the HW struct @@ -1240,7 +1731,7 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, qgl = ice_sched_get_qgrp_layer(hw); vsil = ice_sched_get_vsi_layer(hw); - parent = ice_sched_get_vsi_node(hw, tc_node, vsi_handle); + parent = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); for (i = vsil + 1; i <= qgl; i++) { if (!parent) return ICE_ERR_CFG; @@ -1273,7 +1764,7 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, /** * ice_sched_calc_vsi_support_nodes - calculate number of VSI support nodes - * @hw: pointer to the HW struct + * @pi: pointer to the port info structure * @tc_node: pointer to TC node * @num_nodes: pointer to num nodes array * @@ -1282,15 +1773,15 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, * layers */ static void -ice_sched_calc_vsi_support_nodes(struct ice_hw *hw, +ice_sched_calc_vsi_support_nodes(struct ice_port_info *pi, struct ice_sched_node *tc_node, u16 *num_nodes) { struct ice_sched_node *node; u8 vsil; int i; - vsil = ice_sched_get_vsi_layer(hw); - for (i = vsil; i >= hw->sw_entry_point_layer; i--) + vsil = ice_sched_get_vsi_layer(pi->hw); + for (i = vsil; i >= pi->hw->sw_entry_point_layer; i--) /* Add intermediate nodes if TC has no children and * need at least one node for VSI */ @@ -1300,11 +1791,11 @@ ice_sched_calc_vsi_support_nodes(struct ice_hw *hw, /* If intermediate nodes are reached max children * then add a new one. 
*/ - node = ice_sched_get_first_node(hw->port_info, tc_node, - (u8)i); + node = ice_sched_get_first_node(pi, tc_node, (u8)i); /* scan all the siblings */ while (node) { - if (node->num_children < hw->max_children[i]) + if (node->num_children < + pi->hw->max_children[i]) break; node = node->sibling; } @@ -1384,14 +1875,13 @@ ice_sched_add_vsi_to_topo(struct ice_port_info *pi, u16 vsi_handle, u8 tc) { u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 }; struct ice_sched_node *tc_node; - struct ice_hw *hw = pi->hw; tc_node = ice_sched_get_tc_node(pi, tc); if (!tc_node) return ICE_ERR_PARAM; /* calculate number of supported nodes needed for this VSI */ - ice_sched_calc_vsi_support_nodes(hw, tc_node, num_nodes); + ice_sched_calc_vsi_support_nodes(pi, tc_node, num_nodes); /* add VSI supported nodes to TC subtree */ return ice_sched_add_vsi_support_nodes(pi, vsi_handle, tc_node, @@ -1424,7 +1914,7 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, if (!tc_node) return ICE_ERR_CFG; - vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle); + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); if (!vsi_node) return ICE_ERR_CFG; @@ -1432,13 +1922,22 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, if (!vsi_ctx) return ICE_ERR_PARAM; - prev_numqs = vsi_ctx->sched.max_lanq[tc]; + if (owner == ICE_SCHED_NODE_OWNER_LAN) + prev_numqs = vsi_ctx->sched.max_lanq[tc]; + else + prev_numqs = vsi_ctx->sched.max_rdmaq[tc]; /* num queues are not changed or less than the previous number */ if (new_numqs <= prev_numqs) return status; - status = ice_alloc_lan_q_ctx(hw, vsi_handle, tc, new_numqs); - if (status) - return status; + if (owner == ICE_SCHED_NODE_OWNER_LAN) { + status = ice_alloc_lan_q_ctx(hw, vsi_handle, tc, new_numqs); + if (status) + return status; + } else { + status = ice_alloc_rdma_q_ctx(hw, vsi_handle, tc, new_numqs); + if (status) + return status; + } if (new_numqs) ice_sched_calc_vsi_child_nodes(hw, new_numqs, new_num_nodes); @@ -1453,7 +1952,10 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle, new_num_nodes, owner); if (status) return status; - vsi_ctx->sched.max_lanq[tc] = new_numqs; + if (owner == ICE_SCHED_NODE_OWNER_LAN) + vsi_ctx->sched.max_lanq[tc] = new_numqs; + else + vsi_ctx->sched.max_rdmaq[tc] = new_numqs; return 0; } @@ -1487,7 +1989,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, vsi_ctx = ice_get_vsi_ctx(hw, vsi_handle); if (!vsi_ctx) return ICE_ERR_PARAM; - vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle); + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); /* suspend the VSI if TC is not enabled */ if (!enable) { @@ -1508,7 +2010,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, if (status) return status; - vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle); + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); if (!vsi_node) return ICE_ERR_CFG; @@ -1519,6 +2021,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, * recreate the child nodes all the time in these cases. 
*/ vsi_ctx->sched.max_lanq[tc] = 0; + vsi_ctx->sched.max_rdmaq[tc] = 0; } /* update the VSI child nodes */ @@ -1540,15 +2043,14 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, } /** - * ice_sched_rm_agg_vsi_entry - remove aggregator related VSI info entry + * ice_sched_rm_agg_vsi_info - remove aggregator related VSI info entry * @pi: port information structure * @vsi_handle: software VSI handle * * This function removes single aggregator VSI info entry from * aggregator list. */ -static void -ice_sched_rm_agg_vsi_info(struct ice_port_info *pi, u16 vsi_handle) +static void ice_sched_rm_agg_vsi_info(struct ice_port_info *pi, u16 vsi_handle) { struct ice_sched_agg_info *agg_info; struct ice_sched_agg_info *atmp; @@ -1618,13 +2120,12 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner) if (!tc_node) continue; - vsi_node = ice_sched_get_vsi_node(pi->hw, tc_node, vsi_handle); + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); if (!vsi_node) continue; if (ice_sched_is_leaf_node_present(vsi_node)) { - ice_debug(pi->hw, ICE_DBG_SCHED, - "VSI has leaf nodes in TC %d\n", i); + ice_debug(pi->hw, ICE_DBG_SCHED, "VSI has leaf nodes in TC %d\n", i); status = ICE_ERR_IN_USE; goto exit_sched_rm_vsi_cfg; } @@ -1650,6 +2151,8 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner) } if (owner == ICE_SCHED_NODE_OWNER_LAN) vsi_ctx->sched.max_lanq[i] = 0; + else + vsi_ctx->sched.max_rdmaq[i] = 0; } status = 0; @@ -1670,3 +2173,3654 @@ enum ice_status ice_rm_vsi_lan_cfg(struct ice_port_info *pi, u16 vsi_handle) { return ice_sched_rm_vsi_cfg(pi, vsi_handle, ICE_SCHED_NODE_OWNER_LAN); } + +/** + * ice_rm_vsi_rdma_cfg - remove VSI and its RDMA children nodes + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function clears the VSI and its RDMA children nodes from scheduler tree + * for all TCs. + */ +enum ice_status ice_rm_vsi_rdma_cfg(struct ice_port_info *pi, u16 vsi_handle) +{ + return ice_sched_rm_vsi_cfg(pi, vsi_handle, ICE_SCHED_NODE_OWNER_RDMA); +} + +/** + * ice_sched_is_tree_balanced - Check tree nodes are identical or not + * @hw: pointer to the HW struct + * @node: pointer to the ice_sched_node struct + * + * This function compares all the nodes for a given tree against HW DB nodes + * This function needs to be called with the port_info->sched_lock held + */ +bool ice_sched_is_tree_balanced(struct ice_hw *hw, struct ice_sched_node *node) +{ + u8 i; + + /* start from the leaf node */ + for (i = 0; i < node->num_children; i++) + /* Fail if node doesn't match with the SW DB + * this recursion is intentional, and wouldn't + * go more than 9 calls + */ + if (!ice_sched_is_tree_balanced(hw, node->children[i])) + return false; + + return ice_sched_check_node(hw, node); +} + +/** + * ice_aq_query_node_to_root - retrieve the tree topology for a given node TEID + * @hw: pointer to the HW struct + * @node_teid: node TEID + * @buf: pointer to buffer + * @buf_size: buffer size in bytes + * @cd: pointer to command details structure or NULL + * + * This function retrieves the tree topology from the firmware for a given + * node TEID to the root node. 
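+ *
+ * A minimal usage sketch (illustrative only; sizing the buffer to one
+ * element per scheduler level is an assumption of this example):
+ *
+ *	struct ice_aqc_txsched_elem_data buf[ICE_AQC_TOPO_MAX_LEVEL_NUM];
+ *	enum ice_status status;
+ *
+ *	status = ice_aq_query_node_to_root(hw, node_teid, buf,
+ *					   sizeof(buf), NULL);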
+ */
+enum ice_status
+ice_aq_query_node_to_root(struct ice_hw *hw, u32 node_teid,
+			  struct ice_aqc_txsched_elem_data *buf, u16 buf_size,
+			  struct ice_sq_cd *cd)
+{
+	struct ice_aqc_query_node_to_root *cmd;
+	struct ice_aq_desc desc;
+
+	cmd = &desc.params.query_node_to_root;
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_query_node_to_root);
+	cmd->teid = cpu_to_le32(node_teid);
+	return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
+}
+
+/**
+ * ice_get_agg_info - get the aggregator info
+ * @hw: pointer to the hardware structure
+ * @agg_id: aggregator ID
+ *
+ * This function validates the aggregator ID. It returns the aggregator info
+ * if the aggregator ID is present in the list, otherwise it returns NULL.
+ */
+static struct ice_sched_agg_info *
+ice_get_agg_info(struct ice_hw *hw, u32 agg_id)
+{
+	struct ice_sched_agg_info *agg_info;
+
+	list_for_each_entry(agg_info, &hw->agg_list, list_entry)
+		if (agg_info->agg_id == agg_id)
+			return agg_info;
+
+	return NULL;
+}
+
+/**
+ * ice_sched_get_free_vsi_parent - Find a free parent node in aggregator subtree
+ * @hw: pointer to the HW struct
+ * @node: pointer to a child node
+ * @num_nodes: num nodes count array
+ *
+ * This function walks through the aggregator subtree to find a free parent
+ * node.
+ */
+static struct ice_sched_node *
+ice_sched_get_free_vsi_parent(struct ice_hw *hw, struct ice_sched_node *node,
+			      u16 *num_nodes)
+{
+	u8 l = node->tx_sched_layer;
+	u8 vsil, i;
+
+	vsil = ice_sched_get_vsi_layer(hw);
+
+	/* Is it VSI parent layer? */
+	if (l == vsil - 1)
+		return (node->num_children < hw->max_children[l]) ? node : NULL;
+
+	/* We have intermediate nodes. Let's walk through the subtree. If the
+	 * intermediate node has space to add a new node then clear the count
+	 */
+	if (node->num_children < hw->max_children[l])
+		num_nodes[l] = 0;
+	/* The below recursive call is intentional and wouldn't go more than
+	 * 2 or 3 iterations.
+	 */
+
+	for (i = 0; i < node->num_children; i++) {
+		struct ice_sched_node *parent;
+
+		parent = ice_sched_get_free_vsi_parent(hw, node->children[i],
+						       num_nodes);
+		if (parent)
+			return parent;
+	}
+
+	return NULL;
+}
+
+/**
+ * ice_sched_update_parent - update the new parent in SW DB
+ * @new_parent: pointer to a new parent node
+ * @node: pointer to a child node
+ *
+ * This function removes the child from the old parent and adds it to a new
+ * parent.
+ */
+static void
+ice_sched_update_parent(struct ice_sched_node *new_parent,
+			struct ice_sched_node *node)
+{
+	struct ice_sched_node *old_parent;
+	u8 i, j;
+
+	old_parent = node->parent;
+
+	/* update the old parent children */
+	for (i = 0; i < old_parent->num_children; i++)
+		if (old_parent->children[i] == node) {
+			for (j = i + 1; j < old_parent->num_children; j++)
+				old_parent->children[j - 1] =
+					old_parent->children[j];
+			old_parent->num_children--;
+			break;
+		}
+
+	/* now move the node to a new parent */
+	new_parent->children[new_parent->num_children++] = node;
+	node->parent = new_parent;
+	node->info.parent_teid = new_parent->info.node_teid;
+}
+
+/**
+ * ice_sched_move_nodes - move child nodes to a given parent
+ * @pi: port information structure
+ * @parent: pointer to parent node
+ * @num_items: number of child nodes to be moved
+ * @list: pointer to child node teids
+ *
+ * This function moves the child nodes to a given parent.
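+ *
+ * Each TEID in @list is moved with its own move-element admin queue
+ * command (hdr.num_elems is 1), so on failure the nodes moved by
+ * earlier iterations stay at the new parent.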
+ */ +static enum ice_status +ice_sched_move_nodes(struct ice_port_info *pi, struct ice_sched_node *parent, + u16 num_items, u32 *list) +{ + struct ice_aqc_move_elem *buf; + struct ice_sched_node *node; + enum ice_status status = 0; + u16 i, grps_movd = 0; + struct ice_hw *hw; + u16 buf_len; + + hw = pi->hw; + + if (!parent || !num_items) + return ICE_ERR_PARAM; + + /* Does parent have enough space */ + if (parent->num_children + num_items > + hw->max_children[parent->tx_sched_layer]) + return ICE_ERR_AQ_FULL; + + buf_len = struct_size(buf, teid, 1); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + + for (i = 0; i < num_items; i++) { + node = ice_sched_find_node_by_teid(pi->root, list[i]); + if (!node) { + status = ICE_ERR_PARAM; + goto move_err_exit; + } + + buf->hdr.src_parent_teid = node->info.parent_teid; + buf->hdr.dest_parent_teid = parent->info.node_teid; + buf->teid[0] = node->info.node_teid; + buf->hdr.num_elems = cpu_to_le16(1); + status = ice_aq_move_sched_elems(hw, 1, buf, buf_len, + &grps_movd, NULL); + if (status && grps_movd != 1) { + status = ICE_ERR_CFG; + goto move_err_exit; + } + + /* update the SW DB */ + ice_sched_update_parent(parent, node); + } + +move_err_exit: + devm_kfree(ice_hw_to_dev(hw), buf); + return status; +} + +/** + * ice_sched_move_vsi_to_agg - move VSI to aggregator node + * @pi: port information structure + * @vsi_handle: software VSI handle + * @agg_id: aggregator ID + * @tc: TC number + * + * This function moves a VSI to an aggregator node or its subtree. + * Intermediate nodes may be created if required. + */ +static enum ice_status +ice_sched_move_vsi_to_agg(struct ice_port_info *pi, u16 vsi_handle, u32 agg_id, + u8 tc) +{ + struct ice_sched_node *vsi_node, *agg_node, *tc_node, *parent; + u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 }; + u32 first_node_teid, vsi_teid; + enum ice_status status; + u16 num_nodes_added; + u8 aggl, vsil, i; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + return ICE_ERR_DOES_NOT_EXIST; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + return ICE_ERR_DOES_NOT_EXIST; + + /* Is this VSI already part of given aggregator? 
 */
+	if (ice_sched_find_node_in_subtree(pi->hw, agg_node, vsi_node))
+		return 0;
+
+	aggl = ice_sched_get_agg_layer(pi->hw);
+	vsil = ice_sched_get_vsi_layer(pi->hw);
+
+	/* set intermediate node count to 1 between aggregator and VSI layers */
+	for (i = aggl + 1; i < vsil; i++)
+		num_nodes[i] = 1;
+
+	/* Check if the aggregator subtree has any free node to add the VSI */
+	for (i = 0; i < agg_node->num_children; i++) {
+		parent = ice_sched_get_free_vsi_parent(pi->hw,
+						       agg_node->children[i],
+						       num_nodes);
+		if (parent)
+			goto move_nodes;
+	}
+
+	/* add new nodes */
+	parent = agg_node;
+	for (i = aggl + 1; i < vsil; i++) {
+		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, i,
+						      num_nodes[i],
+						      &first_node_teid,
+						      &num_nodes_added);
+		if (status || num_nodes[i] != num_nodes_added)
+			return ICE_ERR_CFG;
+
+		/* The newly added node can be a new parent for the next
+		 * layer nodes
+		 */
+		if (num_nodes_added)
+			parent = ice_sched_find_node_by_teid(tc_node,
+							     first_node_teid);
+		else
+			parent = parent->children[0];
+
+		if (!parent)
+			return ICE_ERR_CFG;
+	}
+
+move_nodes:
+	vsi_teid = le32_to_cpu(vsi_node->info.node_teid);
+	return ice_sched_move_nodes(pi, parent, 1, &vsi_teid);
+}
+
+/**
+ * ice_move_all_vsi_to_dflt_agg - move all VSI(s) to default aggregator
+ * @pi: port information structure
+ * @agg_info: aggregator info
+ * @tc: traffic class number
+ * @rm_vsi_info: remove the VSI info entry if true
+ *
+ * This function moves all the VSI(s) to the default aggregator and deletes
+ * the aggregator VSI info based on the passed in boolean parameter
+ * rm_vsi_info. The caller holds the scheduler lock.
+ */
+static enum ice_status
+ice_move_all_vsi_to_dflt_agg(struct ice_port_info *pi,
+			     struct ice_sched_agg_info *agg_info, u8 tc,
+			     bool rm_vsi_info)
+{
+	struct ice_sched_agg_vsi_info *agg_vsi_info;
+	struct ice_sched_agg_vsi_info *tmp;
+	enum ice_status status = 0;
+
+	list_for_each_entry_safe(agg_vsi_info, tmp, &agg_info->agg_vsi_list,
+				 list_entry) {
+		u16 vsi_handle = agg_vsi_info->vsi_handle;
+
+		/* Move VSI to default aggregator */
+		if (!ice_is_tc_ena(agg_vsi_info->tc_bitmap[0], tc))
+			continue;
+
+		status = ice_sched_move_vsi_to_agg(pi, vsi_handle,
+						   ICE_DFLT_AGG_ID, tc);
+		if (status)
+			break;
+
+		clear_bit(tc, agg_vsi_info->tc_bitmap);
+		if (rm_vsi_info && !agg_vsi_info->tc_bitmap[0]) {
+			list_del(&agg_vsi_info->list_entry);
+			devm_kfree(ice_hw_to_dev(pi->hw), agg_vsi_info);
+		}
+	}
+
+	return status;
+}
+
+/**
+ * ice_sched_is_agg_inuse - check whether the aggregator is in use or not
+ * @pi: port information structure
+ * @node: node pointer
+ *
+ * This function checks whether the aggregator is attached to any VSI or not.
+ */
+static bool
+ice_sched_is_agg_inuse(struct ice_port_info *pi, struct ice_sched_node *node)
+{
+	u8 vsil, i;
+
+	vsil = ice_sched_get_vsi_layer(pi->hw);
+	if (node->tx_sched_layer < vsil - 1) {
+		for (i = 0; i < node->num_children; i++)
+			if (ice_sched_is_agg_inuse(pi, node->children[i]))
+				return true;
+		return false;
+	} else {
+		return node->num_children ?
true : false; + } +} + +/** + * ice_sched_rm_agg_cfg - remove the aggregator node + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: TC number + * + * This function removes the aggregator node and intermediate nodes if any + * from the given TC + */ +static enum ice_status +ice_sched_rm_agg_cfg(struct ice_port_info *pi, u32 agg_id, u8 tc) +{ + struct ice_sched_node *tc_node, *agg_node; + struct ice_hw *hw = pi->hw; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + return ICE_ERR_DOES_NOT_EXIST; + + /* Can't remove the aggregator node if it has children */ + if (ice_sched_is_agg_inuse(pi, agg_node)) + return ICE_ERR_IN_USE; + + /* need to remove the whole subtree if aggregator node is the + * only child. + */ + while (agg_node->tx_sched_layer > hw->sw_entry_point_layer) { + struct ice_sched_node *parent = agg_node->parent; + + if (!parent) + return ICE_ERR_CFG; + + if (parent->num_children > 1) + break; + + agg_node = parent; + } + + ice_free_sched_node(pi, agg_node); + return 0; +} + +/** + * ice_rm_agg_cfg_tc - remove aggregator configuration for TC + * @pi: port information structure + * @agg_info: aggregator ID + * @tc: TC number + * @rm_vsi_info: bool value true or false + * + * This function removes aggregator reference to VSI of given TC. It removes + * the aggregator configuration completely for requested TC. The caller needs + * to hold the scheduler lock. + */ +static enum ice_status +ice_rm_agg_cfg_tc(struct ice_port_info *pi, struct ice_sched_agg_info *agg_info, + u8 tc, bool rm_vsi_info) +{ + enum ice_status status = 0; + + /* If nothing to remove - return success */ + if (!ice_is_tc_ena(agg_info->tc_bitmap[0], tc)) + goto exit_rm_agg_cfg_tc; + + status = ice_move_all_vsi_to_dflt_agg(pi, agg_info, tc, rm_vsi_info); + if (status) + goto exit_rm_agg_cfg_tc; + + /* Delete aggregator node(s) */ + status = ice_sched_rm_agg_cfg(pi, agg_info->agg_id, tc); + if (status) + goto exit_rm_agg_cfg_tc; + + clear_bit(tc, agg_info->tc_bitmap); +exit_rm_agg_cfg_tc: + return status; +} + +/** + * ice_save_agg_tc_bitmap - save aggregator TC bitmap + * @pi: port information structure + * @agg_id: aggregator ID + * @tc_bitmap: 8 bits TC bitmap + * + * Save aggregator TC bitmap. This function needs to be called with scheduler + * lock held. + */ +static enum ice_status +ice_save_agg_tc_bitmap(struct ice_port_info *pi, u32 agg_id, + unsigned long *tc_bitmap) +{ + struct ice_sched_agg_info *agg_info; + + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) + return ICE_ERR_PARAM; + bitmap_copy(agg_info->replay_tc_bitmap, tc_bitmap, + ICE_MAX_TRAFFIC_CLASS); + return 0; +} + +/** + * ice_sched_add_agg_cfg - create an aggregator node + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: TC number + * + * This function creates an aggregator node and intermediate nodes if required + * for the given TC + */ +static enum ice_status +ice_sched_add_agg_cfg(struct ice_port_info *pi, u32 agg_id, u8 tc) +{ + struct ice_sched_node *parent, *agg_node, *tc_node; + u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 }; + enum ice_status status = 0; + struct ice_hw *hw = pi->hw; + u32 first_node_teid; + u16 num_nodes_added; + u8 i, aggl; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + /* Does Agg node already exist ? 
 */
+	if (agg_node)
+		return status;
+
+	aggl = ice_sched_get_agg_layer(hw);
+
+	/* need one node in Agg layer */
+	num_nodes[aggl] = 1;
+
+	/* Check whether the intermediate nodes have space to add the
+	 * new aggregator. If they are full, then SW needs to allocate a new
+	 * intermediate node on those layers
+	 */
+	for (i = hw->sw_entry_point_layer; i < aggl; i++) {
+		parent = ice_sched_get_first_node(pi, tc_node, i);
+
+		/* scan all the siblings */
+		while (parent) {
+			if (parent->num_children < hw->max_children[i])
+				break;
+			parent = parent->sibling;
+		}
+
+		/* all the nodes are full, reserve one for this layer */
+		if (!parent)
+			num_nodes[i]++;
+	}
+
+	/* add the aggregator node */
+	parent = tc_node;
+	for (i = hw->sw_entry_point_layer; i <= aggl; i++) {
+		if (!parent)
+			return ICE_ERR_CFG;
+
+		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, i,
+						      num_nodes[i],
+						      &first_node_teid,
+						      &num_nodes_added);
+		if (status || num_nodes[i] != num_nodes_added)
+			return ICE_ERR_CFG;
+
+		/* The newly added node can be a new parent for the next
+		 * layer nodes
+		 */
+		if (num_nodes_added) {
+			parent = ice_sched_find_node_by_teid(tc_node,
+							     first_node_teid);
+			/* register aggregator ID with the aggregator node */
+			if (parent && i == aggl)
+				parent->agg_id = agg_id;
+		} else {
+			parent = parent->children[0];
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ice_sched_cfg_agg - configure aggregator node
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @agg_type: aggregator type queue, VSI, or aggregator group
+ * @tc_bitmap: TC bitmap of requested TC(s)
+ *
+ * It registers a unique aggregator node into scheduler services. It
+ * allows a user to register with a unique ID to track its resources.
+ * The aggregator type determines if this is a queue group, VSI group
+ * or aggregator group. It then creates the aggregator node(s) for requested
+ * TC(s) or removes an existing aggregator node including its configuration
+ * if indicated via tc_bitmap. Call ice_rm_agg_cfg to release aggregator
+ * resources and remove aggregator ID.
+ * This function needs to be called with the scheduler lock held.
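+ *
+ * For example, tc_bitmap 0x3 creates aggregator nodes on TC 0 and TC 1
+ * and removes any existing aggregator configuration on the remaining
+ * TCs. A minimal sketch through the lock-taking wrapper below (agg_id
+ * 100 is an arbitrary example ID):
+ *
+ *	status = ice_cfg_agg(pi, 100, ICE_AGG_TYPE_AGG, 0x3);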
+ */ +static enum ice_status +ice_sched_cfg_agg(struct ice_port_info *pi, u32 agg_id, + enum ice_agg_type agg_type, unsigned long *tc_bitmap) +{ + struct ice_sched_agg_info *agg_info; + enum ice_status status = 0; + struct ice_hw *hw = pi->hw; + u8 tc; + + agg_info = ice_get_agg_info(hw, agg_id); + if (!agg_info) { + /* Create new entry for new aggregator ID */ + agg_info = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*agg_info), + GFP_KERNEL); + if (!agg_info) + return ICE_ERR_NO_MEMORY; + + agg_info->agg_id = agg_id; + agg_info->agg_type = agg_type; + agg_info->tc_bitmap[0] = 0; + + /* Initialize the aggregator VSI list head */ + INIT_LIST_HEAD(&agg_info->agg_vsi_list); + + /* Add new entry in aggregator list */ + list_add(&agg_info->list_entry, &hw->agg_list); + } + /* Create aggregator node(s) for requested TC(s) */ + ice_for_each_traffic_class(tc) { + if (!ice_is_tc_ena(*tc_bitmap, tc)) { + /* Delete aggregator cfg TC if it exists previously */ + status = ice_rm_agg_cfg_tc(pi, agg_info, tc, false); + if (status) + break; + continue; + } + + /* Check if aggregator node for TC already exists */ + if (ice_is_tc_ena(agg_info->tc_bitmap[0], tc)) + continue; + + /* Create new aggregator node for TC */ + status = ice_sched_add_agg_cfg(pi, agg_id, tc); + if (status) + break; + + /* Save aggregator node's TC information */ + set_bit(tc, agg_info->tc_bitmap); + } + + return status; +} + +/** + * ice_cfg_agg - config aggregator node + * @pi: port information structure + * @agg_id: aggregator ID + * @agg_type: aggregator type queue, VSI, or aggregator group + * @tc_bitmap: bits TC bitmap + * + * This function configures aggregator node(s). + */ +enum ice_status +ice_cfg_agg(struct ice_port_info *pi, u32 agg_id, enum ice_agg_type agg_type, + u8 tc_bitmap) +{ + unsigned long bitmap = tc_bitmap; + enum ice_status status; + + mutex_lock(&pi->sched_lock); + status = ice_sched_cfg_agg(pi, agg_id, agg_type, + (unsigned long *)&bitmap); + if (!status) + status = ice_save_agg_tc_bitmap(pi, agg_id, + (unsigned long *)&bitmap); + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_get_agg_vsi_info - get the aggregator ID + * @agg_info: aggregator info + * @vsi_handle: software VSI handle + * + * The function returns aggregator VSI info based on VSI handle. This function + * needs to be called with scheduler lock held. + */ +static struct ice_sched_agg_vsi_info * +ice_get_agg_vsi_info(struct ice_sched_agg_info *agg_info, u16 vsi_handle) +{ + struct ice_sched_agg_vsi_info *agg_vsi_info; + + list_for_each_entry(agg_vsi_info, &agg_info->agg_vsi_list, list_entry) + if (agg_vsi_info->vsi_handle == vsi_handle) + return agg_vsi_info; + + return NULL; +} + +/** + * ice_get_vsi_agg_info - get the aggregator info of VSI + * @hw: pointer to the hardware structure + * @vsi_handle: Sw VSI handle + * + * The function returns aggregator info of VSI represented via vsi_handle. The + * VSI has in this case a different aggregator than the default one. This + * function needs to be called with scheduler lock held. 
+ */
+static struct ice_sched_agg_info *
+ice_get_vsi_agg_info(struct ice_hw *hw, u16 vsi_handle)
+{
+	struct ice_sched_agg_info *agg_info;
+
+	list_for_each_entry(agg_info, &hw->agg_list, list_entry) {
+		struct ice_sched_agg_vsi_info *agg_vsi_info;
+
+		agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle);
+		if (agg_vsi_info)
+			return agg_info;
+	}
+	return NULL;
+}
+
+/**
+ * ice_save_agg_vsi_tc_bitmap - save aggregator VSI TC bitmap
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @vsi_handle: software VSI handle
+ * @tc_bitmap: TC bitmap of enabled TC(s)
+ *
+ * Save VSI to aggregator TC bitmap. This function needs to be called with
+ * the scheduler lock held.
+ */
+static enum ice_status
+ice_save_agg_vsi_tc_bitmap(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle,
+			   unsigned long *tc_bitmap)
+{
+	struct ice_sched_agg_vsi_info *agg_vsi_info;
+	struct ice_sched_agg_info *agg_info;
+
+	agg_info = ice_get_agg_info(pi->hw, agg_id);
+	if (!agg_info)
+		return ICE_ERR_PARAM;
+	/* check if entry already exists */
+	agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle);
+	if (!agg_vsi_info)
+		return ICE_ERR_PARAM;
+	bitmap_copy(agg_vsi_info->replay_tc_bitmap, tc_bitmap,
+		    ICE_MAX_TRAFFIC_CLASS);
+	return 0;
+}
+
+/**
+ * ice_sched_assoc_vsi_to_agg - associate/move VSI to new/default aggregator
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @vsi_handle: software VSI handle
+ * @tc_bitmap: TC bitmap of enabled TC(s)
+ *
+ * This function moves VSI to a new or default aggregator node. If VSI is
+ * already associated to the aggregator node then no operation is performed on
+ * the tree. This function needs to be called with the scheduler lock held.
+ */
+static enum ice_status
+ice_sched_assoc_vsi_to_agg(struct ice_port_info *pi, u32 agg_id,
+			   u16 vsi_handle, unsigned long *tc_bitmap)
+{
+	struct ice_sched_agg_vsi_info *agg_vsi_info, *old_agg_vsi_info = NULL;
+	struct ice_sched_agg_info *agg_info, *old_agg_info;
+	enum ice_status status = 0;
+	struct ice_hw *hw = pi->hw;
+	u8 tc;
+
+	if (!ice_is_vsi_valid(pi->hw, vsi_handle))
+		return ICE_ERR_PARAM;
+	agg_info = ice_get_agg_info(hw, agg_id);
+	if (!agg_info)
+		return ICE_ERR_PARAM;
+	/* If the VSI is already part of another aggregator then update
+	 * its VSI info list
+	 */
+	old_agg_info = ice_get_vsi_agg_info(hw, vsi_handle);
+	if (old_agg_info && old_agg_info != agg_info) {
+		struct ice_sched_agg_vsi_info *vtmp;
+
+		list_for_each_entry_safe(old_agg_vsi_info, vtmp,
+					 &old_agg_info->agg_vsi_list,
+					 list_entry)
+			if (old_agg_vsi_info->vsi_handle == vsi_handle)
+				break;
+	}
+
+	/* check if entry already exists */
+	agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle);
+	if (!agg_vsi_info) {
+		/* Create new entry for VSI under aggregator list */
+		agg_vsi_info = devm_kzalloc(ice_hw_to_dev(hw),
+					    sizeof(*agg_vsi_info), GFP_KERNEL);
+		if (!agg_vsi_info)
+			return ICE_ERR_PARAM;
+
+		/* add VSI ID into the aggregator list */
+		agg_vsi_info->vsi_handle = vsi_handle;
+		list_add(&agg_vsi_info->list_entry, &agg_info->agg_vsi_list);
+	}
+	/* Move VSI node to new aggregator node for requested TC(s) */
+	ice_for_each_traffic_class(tc) {
+		if (!ice_is_tc_ena(*tc_bitmap, tc))
+			continue;
+
+		/* Move VSI to new aggregator */
+		status = ice_sched_move_vsi_to_agg(pi, vsi_handle, agg_id, tc);
+		if (status)
+			break;
+
+		set_bit(tc, agg_vsi_info->tc_bitmap);
+		if (old_agg_vsi_info)
+			clear_bit(tc, old_agg_vsi_info->tc_bitmap);
+	}
+	if (old_agg_vsi_info && !old_agg_vsi_info->tc_bitmap[0]) {
+
list_del(&old_agg_vsi_info->list_entry); + devm_kfree(ice_hw_to_dev(pi->hw), old_agg_vsi_info); + } + return status; +} + +/** + * ice_sched_rm_unused_rl_prof - remove unused RL profile + * @hw: pointer to the hardware structure + * + * This function removes unused rate limit profiles from the HW and + * SW DB. The caller needs to hold scheduler lock. + */ +static void ice_sched_rm_unused_rl_prof(struct ice_hw *hw) +{ + u16 ln; + + for (ln = 0; ln < hw->num_tx_sched_layers; ln++) { + struct ice_aqc_rl_profile_info *rl_prof_elem; + struct ice_aqc_rl_profile_info *rl_prof_tmp; + + list_for_each_entry_safe(rl_prof_elem, rl_prof_tmp, + &hw->rl_prof_list[ln], list_entry) { + if (!ice_sched_del_rl_profile(hw, rl_prof_elem)) + ice_debug(hw, ICE_DBG_SCHED, "Removed rl profile\n"); + } + } +} + +/** + * ice_sched_update_elem - update element + * @hw: pointer to the HW struct + * @node: pointer to node + * @info: node info to update + * + * Update the HW DB, and local SW DB of node. Update the scheduling + * parameters of node from argument info data buffer (Info->data buf) and + * returns success or error on config sched element failure. The caller + * needs to hold scheduler lock. + */ +static enum ice_status +ice_sched_update_elem(struct ice_hw *hw, struct ice_sched_node *node, + struct ice_aqc_txsched_elem_data *info) +{ + struct ice_aqc_txsched_elem_data buf; + enum ice_status status; + u16 elem_cfgd = 0; + u16 num_elems = 1; + + buf = *info; + /* Parent TEID is reserved field in this aq call */ + buf.parent_teid = 0; + /* Element type is reserved field in this aq call */ + buf.data.elem_type = 0; + /* Flags is reserved field in this aq call */ + buf.data.flags = 0; + + /* Update HW DB */ + /* Configure element node */ + status = ice_aq_cfg_sched_elems(hw, num_elems, &buf, sizeof(buf), + &elem_cfgd, NULL); + if (status || elem_cfgd != num_elems) { + ice_debug(hw, ICE_DBG_SCHED, "Config sched elem error\n"); + return ICE_ERR_CFG; + } + + /* Config success case */ + /* Now update local SW DB */ + /* Only copy the data portion of info buffer */ + node->info.data = info->data; + return status; +} + +/** + * ice_sched_cfg_node_bw_alloc - configure node BW weight/alloc params + * @hw: pointer to the HW struct + * @node: sched node to configure + * @rl_type: rate limit type CIR, EIR, or shared + * @bw_alloc: BW weight/allocation + * + * This function configures node element's BW allocation. + */ +static enum ice_status +ice_sched_cfg_node_bw_alloc(struct ice_hw *hw, struct ice_sched_node *node, + enum ice_rl_type rl_type, u16 bw_alloc) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + enum ice_status status; + + buf = node->info; + data = &buf.data; + if (rl_type == ICE_MIN_BW) { + data->valid_sections |= ICE_AQC_ELEM_VALID_CIR; + data->cir_bw.bw_alloc = cpu_to_le16(bw_alloc); + } else if (rl_type == ICE_MAX_BW) { + data->valid_sections |= ICE_AQC_ELEM_VALID_EIR; + data->eir_bw.bw_alloc = cpu_to_le16(bw_alloc); + } else { + return ICE_ERR_PARAM; + } + + /* Configure element */ + status = ice_sched_update_elem(hw, node, &buf); + return status; +} + +/** + * ice_move_vsi_to_agg - moves VSI to new or default aggregator + * @pi: port information structure + * @agg_id: aggregator ID + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap of enabled TC(s) + * + * Move or associate VSI to a new or default aggregator node. 
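+ *
+ * A minimal sketch (hypothetical IDs; VSI handle 5 must already be
+ * configured and aggregator 100 created via ice_cfg_agg()):
+ *
+ *	status = ice_move_vsi_to_agg(pi, 100, 5, 0x3);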
+ */ +enum ice_status +ice_move_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle, + u8 tc_bitmap) +{ + unsigned long bitmap = tc_bitmap; + enum ice_status status; + + mutex_lock(&pi->sched_lock); + status = ice_sched_assoc_vsi_to_agg(pi, agg_id, vsi_handle, + (unsigned long *)&bitmap); + if (!status) + status = ice_save_agg_vsi_tc_bitmap(pi, agg_id, vsi_handle, + (unsigned long *)&bitmap); + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_rm_agg_cfg - remove aggregator configuration + * @pi: port information structure + * @agg_id: aggregator ID + * + * This function removes aggregator reference to VSI and delete aggregator ID + * info. It removes the aggregator configuration completely. + */ +enum ice_status ice_rm_agg_cfg(struct ice_port_info *pi, u32 agg_id) +{ + struct ice_sched_agg_info *agg_info; + enum ice_status status = 0; + u8 tc; + + mutex_lock(&pi->sched_lock); + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) { + status = ICE_ERR_DOES_NOT_EXIST; + goto exit_ice_rm_agg_cfg; + } + + ice_for_each_traffic_class(tc) { + status = ice_rm_agg_cfg_tc(pi, agg_info, tc, true); + if (status) + goto exit_ice_rm_agg_cfg; + } + + if (!bitmap_empty(agg_info->tc_bitmap, ICE_MAX_TRAFFIC_CLASS)) { + status = ICE_ERR_IN_USE; + goto exit_ice_rm_agg_cfg; + } + + /* Safe to delete entry now */ + list_del(&agg_info->list_entry); + devm_kfree(ice_hw_to_dev(pi->hw), agg_info); + + /* Remove unused RL profile IDs from HW and SW DB */ + ice_sched_rm_unused_rl_prof(pi->hw); + +exit_ice_rm_agg_cfg: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_set_clear_cir_bw_alloc - set or clear CIR BW alloc information + * @bw_t_info: bandwidth type information structure + * @bw_alloc: Bandwidth allocation information + * + * Save or clear CIR BW alloc information (bw_alloc) in the passed param + * bw_t_info. + */ +static void +ice_set_clear_cir_bw_alloc(struct ice_bw_type_info *bw_t_info, u16 bw_alloc) +{ + bw_t_info->cir_bw.bw_alloc = bw_alloc; + if (bw_t_info->cir_bw.bw_alloc) + set_bit(ICE_BW_TYPE_CIR_WT, bw_t_info->bw_t_bitmap); + else + clear_bit(ICE_BW_TYPE_CIR_WT, bw_t_info->bw_t_bitmap); +} + +/** + * ice_set_clear_eir_bw_alloc - set or clear EIR BW alloc information + * @bw_t_info: bandwidth type information structure + * @bw_alloc: Bandwidth allocation information + * + * Save or clear EIR BW alloc information (bw_alloc) in the passed param + * bw_t_info. + */ +static void +ice_set_clear_eir_bw_alloc(struct ice_bw_type_info *bw_t_info, u16 bw_alloc) +{ + bw_t_info->eir_bw.bw_alloc = bw_alloc; + if (bw_t_info->eir_bw.bw_alloc) + set_bit(ICE_BW_TYPE_EIR_WT, bw_t_info->bw_t_bitmap); + else + clear_bit(ICE_BW_TYPE_EIR_WT, bw_t_info->bw_t_bitmap); +} + +/** + * ice_sched_save_vsi_bw_alloc - save VSI node's BW alloc information + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @rl_type: rate limit type min or max + * @bw_alloc: Bandwidth allocation information + * + * Save BW alloc information of VSI type node for post replay use. 
+ */ +static enum ice_status +ice_sched_save_vsi_bw_alloc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type, u16 bw_alloc) +{ + struct ice_vsi_ctx *vsi_ctx; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw_alloc(&vsi_ctx->sched.bw_t_info[tc], + bw_alloc); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw_alloc(&vsi_ctx->sched.bw_t_info[tc], + bw_alloc); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_set_clear_cir_bw - set or clear CIR BW + * @bw_t_info: bandwidth type information structure + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save or clear CIR bandwidth (BW) in the passed param bw_t_info. + */ +static void ice_set_clear_cir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +{ + if (bw == ICE_SCHED_DFLT_BW) { + clear_bit(ICE_BW_TYPE_CIR, bw_t_info->bw_t_bitmap); + bw_t_info->cir_bw.bw = 0; + } else { + /* Save type of BW information */ + set_bit(ICE_BW_TYPE_CIR, bw_t_info->bw_t_bitmap); + bw_t_info->cir_bw.bw = bw; + } +} + +/** + * ice_set_clear_eir_bw - set or clear EIR BW + * @bw_t_info: bandwidth type information structure + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save or clear EIR bandwidth (BW) in the passed param bw_t_info. + */ +static void ice_set_clear_eir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +{ + if (bw == ICE_SCHED_DFLT_BW) { + clear_bit(ICE_BW_TYPE_EIR, bw_t_info->bw_t_bitmap); + bw_t_info->eir_bw.bw = 0; + } else { + /* save EIR BW information */ + set_bit(ICE_BW_TYPE_EIR, bw_t_info->bw_t_bitmap); + bw_t_info->eir_bw.bw = bw; + } +} + +/** + * ice_set_clear_shared_bw - set or clear shared BW + * @bw_t_info: bandwidth type information structure + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save or clear shared bandwidth (BW) in the passed param bw_t_info. + */ +static void ice_set_clear_shared_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +{ + if (bw == ICE_SCHED_DFLT_BW) { + clear_bit(ICE_BW_TYPE_SHARED, bw_t_info->bw_t_bitmap); + bw_t_info->shared_bw = 0; + } else { + /* save shared BW information */ + set_bit(ICE_BW_TYPE_SHARED, bw_t_info->bw_t_bitmap); + bw_t_info->shared_bw = bw; + } +} + +/** + * ice_sched_save_vsi_bw - save VSI node's BW information + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save BW information of VSI type node for post replay use. + */ +static enum ice_status +ice_sched_save_vsi_bw(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + struct ice_vsi_ctx *vsi_ctx; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw(&vsi_ctx->sched.bw_t_info[tc], bw); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw(&vsi_ctx->sched.bw_t_info[tc], bw); + break; + case ICE_SHARED_BW: + ice_set_clear_shared_bw(&vsi_ctx->sched.bw_t_info[tc], bw); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_set_clear_prio - set or clear priority information + * @bw_t_info: bandwidth type information structure + * @prio: priority to save + * + * Save or clear priority (prio) in the passed param bw_t_info. 
+ */ +static void ice_set_clear_prio(struct ice_bw_type_info *bw_t_info, u8 prio) +{ + bw_t_info->generic = prio; + if (bw_t_info->generic) + set_bit(ICE_BW_TYPE_PRIO, bw_t_info->bw_t_bitmap); + else + clear_bit(ICE_BW_TYPE_PRIO, bw_t_info->bw_t_bitmap); +} + +/** + * ice_sched_save_vsi_prio - save VSI node's priority information + * @pi: port information structure + * @vsi_handle: Software VSI handle + * @tc: traffic class + * @prio: priority to save + * + * Save priority information of VSI type node for post replay use. + */ +static enum ice_status +ice_sched_save_vsi_prio(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u8 prio) +{ + struct ice_vsi_ctx *vsi_ctx; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return ICE_ERR_PARAM; + ice_set_clear_prio(&vsi_ctx->sched.bw_t_info[tc], prio); + return 0; +} + +/** + * ice_sched_save_agg_bw_alloc - save aggregator node's BW alloc information + * @pi: port information structure + * @agg_id: node aggregator ID + * @tc: traffic class + * @rl_type: rate limit type min or max + * @bw_alloc: bandwidth alloc information + * + * Save BW alloc information of AGG type node for post replay use. + */ +static enum ice_status +ice_sched_save_agg_bw_alloc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type, u16 bw_alloc) +{ + struct ice_sched_agg_info *agg_info; + + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) + return ICE_ERR_PARAM; + if (!ice_is_tc_ena(agg_info->tc_bitmap[0], tc)) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw_alloc(&agg_info->bw_t_info[tc], bw_alloc); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw_alloc(&agg_info->bw_t_info[tc], bw_alloc); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_sched_save_agg_bw - save aggregator node's BW information + * @pi: port information structure + * @agg_id: node aggregator ID + * @tc: traffic class + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save BW information of AGG type node for post replay use. + */ +static enum ice_status +ice_sched_save_agg_bw(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + struct ice_sched_agg_info *agg_info; + + agg_info = ice_get_agg_info(pi->hw, agg_id); + if (!agg_info) + return ICE_ERR_PARAM; + if (!ice_is_tc_ena(agg_info->tc_bitmap[0], tc)) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw(&agg_info->bw_t_info[tc], bw); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw(&agg_info->bw_t_info[tc], bw); + break; + case ICE_SHARED_BW: + ice_set_clear_shared_bw(&agg_info->bw_t_info[tc], bw); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_cfg_vsi_bw_lmt_per_tc - configure VSI BW limit per TC + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: traffic class + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function configures BW limit of VSI scheduling node based on TC + * information. 
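+ *
+ * For example, to cap VSI handle 5 on TC 0 at 100000 Kbps (the handle
+ * and rate are illustrative values):
+ *
+ *	status = ice_cfg_vsi_bw_lmt_per_tc(pi, 5, 0, ICE_MAX_BW, 100000);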
+ */ +enum ice_status +ice_cfg_vsi_bw_lmt_per_tc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status; + + status = ice_sched_set_node_bw_lmt_per_tc(pi, vsi_handle, + ICE_AGG_TYPE_VSI, + tc, rl_type, bw); + if (!status) { + mutex_lock(&pi->sched_lock); + status = ice_sched_save_vsi_bw(pi, vsi_handle, tc, rl_type, bw); + mutex_unlock(&pi->sched_lock); + } + return status; +} + +/** + * ice_cfg_vsi_bw_dflt_lmt_per_tc - configure default VSI BW limit per TC + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: traffic class + * @rl_type: min or max + * + * This function configures default BW limit of VSI scheduling node based on TC + * information. + */ +enum ice_status +ice_cfg_vsi_bw_dflt_lmt_per_tc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type) +{ + enum ice_status status; + + status = ice_sched_set_node_bw_lmt_per_tc(pi, vsi_handle, + ICE_AGG_TYPE_VSI, + tc, rl_type, + ICE_SCHED_DFLT_BW); + if (!status) { + mutex_lock(&pi->sched_lock); + status = ice_sched_save_vsi_bw(pi, vsi_handle, tc, rl_type, + ICE_SCHED_DFLT_BW); + mutex_unlock(&pi->sched_lock); + } + return status; +} + +/** + * ice_cfg_agg_bw_lmt_per_tc - configure aggregator BW limit per TC + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function applies BW limit to aggregator scheduling node based on TC + * information. + */ +enum ice_status +ice_cfg_agg_bw_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status; + + status = ice_sched_set_node_bw_lmt_per_tc(pi, agg_id, ICE_AGG_TYPE_AGG, + tc, rl_type, bw); + if (!status) { + mutex_lock(&pi->sched_lock); + status = ice_sched_save_agg_bw(pi, agg_id, tc, rl_type, bw); + mutex_unlock(&pi->sched_lock); + } + return status; +} + +/** + * ice_cfg_agg_bw_dflt_lmt_per_tc - configure aggregator BW default limit per TC + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @rl_type: min or max + * + * This function applies default BW limit to aggregator scheduling node based + * on TC information. + */ +enum ice_status +ice_cfg_agg_bw_dflt_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type) +{ + enum ice_status status; + + status = ice_sched_set_node_bw_lmt_per_tc(pi, agg_id, ICE_AGG_TYPE_AGG, + tc, rl_type, + ICE_SCHED_DFLT_BW); + if (!status) { + mutex_lock(&pi->sched_lock); + status = ice_sched_save_agg_bw(pi, agg_id, tc, rl_type, + ICE_SCHED_DFLT_BW); + mutex_unlock(&pi->sched_lock); + } + return status; +} + +/** + * ice_cfg_vsi_bw_shared_lmt - configure VSI BW shared limit + * @pi: port information structure + * @vsi_handle: software VSI handle + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of all VSI type nodes across all traffic + * classes for VSI matching handle. 
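+ *
+ * Passing ICE_SCHED_DFLT_BW for a value requests the default (no
+ * limit) for that limiter. An illustrative call with example Kbps
+ * values, leaving the minimum unconfigured:
+ *
+ *	status = ice_cfg_vsi_bw_shared_lmt(pi, 5, ICE_SCHED_DFLT_BW,
+ *					   500000, 100000);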
+ */ +enum ice_status +ice_cfg_vsi_bw_shared_lmt(struct ice_port_info *pi, u16 vsi_handle, u32 min_bw, + u32 max_bw, u32 shared_bw) +{ + return ice_sched_set_vsi_bw_shared_lmt(pi, vsi_handle, min_bw, max_bw, + shared_bw); +} + +/** + * ice_cfg_vsi_bw_no_shared_lmt - configure VSI BW for no shared limiter + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function removes the shared rate limiter(SRL) of all VSI type nodes + * across all traffic classes for VSI matching handle. + */ +enum ice_status +ice_cfg_vsi_bw_no_shared_lmt(struct ice_port_info *pi, u16 vsi_handle) +{ + return ice_sched_set_vsi_bw_shared_lmt(pi, vsi_handle, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_cfg_agg_bw_shared_lmt - configure aggregator BW shared limit + * @pi: port information structure + * @agg_id: aggregator ID + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. + */ +enum ice_status +ice_cfg_agg_bw_shared_lmt(struct ice_port_info *pi, u32 agg_id, u32 min_bw, + u32 max_bw, u32 shared_bw) +{ + return ice_sched_set_agg_bw_shared_lmt(pi, agg_id, min_bw, max_bw, + shared_bw); +} + +/** + * ice_cfg_agg_bw_no_shared_lmt - configure aggregator BW for no shared limiter + * @pi: port information structure + * @agg_id: aggregator ID + * + * This function removes the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. + */ +enum ice_status +ice_cfg_agg_bw_no_shared_lmt(struct ice_port_info *pi, u32 agg_id) +{ + return ice_sched_set_agg_bw_shared_lmt(pi, agg_id, ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_cfg_agg_bw_shared_lmt_per_tc - config aggregator BW shared limit per tc + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. + */ +enum ice_status +ice_cfg_agg_bw_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + u32 min_bw, u32 max_bw, u32 shared_bw) +{ + return ice_sched_set_agg_bw_shared_lmt_per_tc(pi, agg_id, tc, min_bw, + max_bw, shared_bw); +} + +/** + * ice_cfg_agg_bw_no_shared_lmt_per_tc - cfg aggregator BW shared limit per tc + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * + * This function configures the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. + */ +enum ice_status +ice_cfg_agg_bw_no_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc) +{ + return ice_sched_set_agg_bw_shared_lmt_per_tc(pi, agg_id, tc, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_cfg_vsi_q_priority - config VSI queue priority of node + * @pi: port information structure + * @num_qs: number of VSI queues + * @q_ids: queue IDs array + * @q_prio: queue priority array + * + * This function configures the queue node priority (Sibling Priority) of the + * passed in VSI's queue(s) for a given traffic class (TC). 
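+ *
+ * A minimal sketch, assuming the caller already knows the queue node
+ * TEIDs (q0_teid and q1_teid are placeholders):
+ *
+ *	u32 q_ids[2] = { q0_teid, q1_teid };
+ *	u8 q_prio[2] = { 0, 1 };
+ *
+ *	status = ice_cfg_vsi_q_priority(pi, 2, q_ids, q_prio);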
+ */ +enum ice_status +ice_cfg_vsi_q_priority(struct ice_port_info *pi, u16 num_qs, u32 *q_ids, + u8 *q_prio) +{ + enum ice_status status = ICE_ERR_PARAM; + u16 i; + + mutex_lock(&pi->sched_lock); + + for (i = 0; i < num_qs; i++) { + struct ice_sched_node *node; + + node = ice_sched_find_node_by_teid(pi->root, q_ids[i]); + if (!node || node->info.data.elem_type != + ICE_AQC_ELEM_TYPE_LEAF) { + status = ICE_ERR_PARAM; + break; + } + /* Configure Priority */ + status = ice_sched_cfg_sibl_node_prio(pi, node, q_prio[i]); + if (status) + break; + } + + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_agg_vsi_priority_per_tc - config aggregator's VSI priority per TC + * @pi: port information structure + * @agg_id: Aggregator ID + * @num_vsis: number of VSI(s) + * @vsi_handle_arr: array of software VSI handles + * @node_prio: pointer to node priority + * @tc: traffic class + * + * This function configures the node priority (Sibling Priority) of the + * passed-in VSIs for a given traffic class (TC) of an Aggregator ID. + */ +enum ice_status +ice_cfg_agg_vsi_priority_per_tc(struct ice_port_info *pi, u32 agg_id, + u16 num_vsis, u16 *vsi_handle_arr, + u8 *node_prio, u8 tc) +{ + struct ice_sched_agg_vsi_info *agg_vsi_info; + struct ice_sched_node *tc_node, *agg_node; + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_agg_info *agg_info; + bool agg_id_present = false; + struct ice_hw *hw = pi->hw; + u16 i; + + mutex_lock(&pi->sched_lock); + list_for_each_entry(agg_info, &hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) { + agg_id_present = true; + break; + } + if (!agg_id_present) + goto exit_agg_priority_per_tc; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + goto exit_agg_priority_per_tc; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + goto exit_agg_priority_per_tc; + + if (num_vsis > hw->max_children[agg_node->tx_sched_layer]) + goto exit_agg_priority_per_tc; + + for (i = 0; i < num_vsis; i++) { + struct ice_sched_node *vsi_node; + bool vsi_handle_valid = false; + u16 vsi_handle; + + status = ICE_ERR_PARAM; + vsi_handle = vsi_handle_arr[i]; + if (!ice_is_vsi_valid(hw, vsi_handle)) + goto exit_agg_priority_per_tc; + /* Verify child nodes before applying settings */ + list_for_each_entry(agg_vsi_info, &agg_info->agg_vsi_list, + list_entry) + if (agg_vsi_info->vsi_handle == vsi_handle) { + /* cppcheck-suppress unreadVariable */ + vsi_handle_valid = true; + break; + } + + if (!vsi_handle_valid) + goto exit_agg_priority_per_tc; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + goto exit_agg_priority_per_tc; + + if (ice_sched_find_node_in_subtree(hw, agg_node, vsi_node)) { + /* Configure Priority */ + status = ice_sched_cfg_sibl_node_prio(pi, vsi_node, + node_prio[i]); + if (status) + break; + status = ice_sched_save_vsi_prio(pi, vsi_handle, tc, + node_prio[i]); + if (status) + break; + } + } + +exit_agg_priority_per_tc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_vsi_bw_alloc - config VSI BW alloc per TC + * @pi: port information structure + * @vsi_handle: software VSI handle + * @ena_tcmap: enabled TC map + * @rl_type: Rate limit type CIR/EIR + * @bw_alloc: Array of BW alloc + * + * This function configures the BW allocation of the passed in VSI's + * node(s) for enabled traffic class.
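Illustration (editorial, not part of the patch): ice_cfg_vsi_q_priority above looks each queue up by its TEID and expects one priority per queue. A sketch; the two TEID values are placeholders, and a real caller would take them from its queue context:

static enum ice_status example_q_prio(struct ice_port_info *pi)
{
	u32 q_ids[2] = { 0x123, 0x124 };	/* queue node TEIDs */
	u8 q_prio[2] = { 1, 3 };		/* sibling priorities */

	return ice_cfg_vsi_q_priority(pi, 2, q_ids, q_prio);
}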
+ */ +enum ice_status +ice_cfg_vsi_bw_alloc(struct ice_port_info *pi, u16 vsi_handle, u8 ena_tcmap, + enum ice_rl_type rl_type, u8 *bw_alloc) +{ + enum ice_status status = 0; + u8 tc; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *vsi_node; + + if (!ice_is_tc_ena(ena_tcmap, tc)) + continue; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + continue; + + status = ice_sched_cfg_node_bw_alloc(pi->hw, vsi_node, rl_type, + bw_alloc[tc]); + if (status) + break; + status = ice_sched_save_vsi_bw_alloc(pi, vsi_handle, tc, + rl_type, bw_alloc[tc]); + if (status) + break; + } + + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_agg_bw_alloc - config aggregator BW alloc + * @pi: port information structure + * @agg_id: aggregator ID + * @ena_tcmap: enabled TC map + * @rl_type: rate limit type CIR/EIR + * @bw_alloc: array of BW alloc + * + * This function configures the BW allocation of passed in aggregator for + * enabled traffic class(s). + */ +enum ice_status +ice_cfg_agg_bw_alloc(struct ice_port_info *pi, u32 agg_id, u8 ena_tcmap, + enum ice_rl_type rl_type, u8 *bw_alloc) +{ + struct ice_sched_agg_info *agg_info; + bool agg_id_present = false; + enum ice_status status = 0; + struct ice_hw *hw = pi->hw; + u8 tc; + + mutex_lock(&pi->sched_lock); + list_for_each_entry(agg_info, &hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) { + agg_id_present = true; + break; + } + if (!agg_id_present) { + status = ICE_ERR_PARAM; + goto exit_cfg_agg_bw_alloc; + } + + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *agg_node; + + if (!ice_is_tc_ena(ena_tcmap, tc)) + continue; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + continue; + + status = ice_sched_cfg_node_bw_alloc(hw, agg_node, rl_type, + bw_alloc[tc]); + if (status) + break; + status = ice_sched_save_agg_bw_alloc(pi, agg_id, tc, rl_type, + bw_alloc[tc]); + if (status) + break; + } + +exit_cfg_agg_bw_alloc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_calc_wakeup - calculate RL profile wakeup parameter + * @hw: pointer to the HW struct + * @bw: bandwidth in Kbps + * + * This function calculates the wakeup parameter of RL profile. 
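Illustration (editorial, not part of the patch): the bw_alloc array above is indexed by TC, and only TCs set in ena_tcmap are touched. A sketch splitting EIR bandwidth 60/40 between TC 0 and TC 1 of a hypothetical VSI 5:

static enum ice_status example_vsi_bw_alloc(struct ice_port_info *pi)
{
	u8 bw_alloc[ICE_MAX_TRAFFIC_CLASS] = { 60, 40 };

	/* ena_tcmap 0x3 selects TC 0 and TC 1 */
	return ice_cfg_vsi_bw_alloc(pi, 5, 0x3, ICE_MAX_BW, bw_alloc);
}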
+ */ +static u16 ice_sched_calc_wakeup(struct ice_hw *hw, s32 bw) +{ + s64 bytes_per_sec, wakeup_int, wakeup_a, wakeup_b, wakeup_f; + s32 wakeup_f_int; + u16 wakeup = 0; + + /* Get the wakeup integer value */ + bytes_per_sec = div64_long(((s64)bw * 1000), BITS_PER_BYTE); + wakeup_int = div64_long(hw->psm_clk_freq, bytes_per_sec); + if (wakeup_int > 63) { + wakeup = (u16)((1 << 15) | wakeup_int); + } else { + /* Calculate fraction value up to 4 decimals + * Convert Integer value to a constant multiplier + */ + wakeup_b = (s64)ICE_RL_PROF_MULTIPLIER * wakeup_int; + wakeup_a = div64_long((s64)ICE_RL_PROF_MULTIPLIER * hw->psm_clk_freq, + bytes_per_sec); + + /* Get Fraction value */ + wakeup_f = wakeup_a - wakeup_b; + + /* Round up the Fractional value via Ceil(Fractional value) */ + if (wakeup_f > div64_long(ICE_RL_PROF_MULTIPLIER, 2)) + wakeup_f += 1; + + wakeup_f_int = (s32) div64_long(wakeup_f * ICE_RL_PROF_FRACTION, + ICE_RL_PROF_MULTIPLIER); + wakeup |= (u16)(wakeup_int << 9); + wakeup |= (u16)(0x1ff & wakeup_f_int); + } + + return wakeup; +} + +/** + * ice_sched_bw_to_rl_profile - convert BW to profile parameters + * @hw: pointer to the HW struct + * @bw: bandwidth in Kbps + * @profile: profile parameters to return + * + * This function converts the BW to profile structure format. + */ +static enum ice_status +ice_sched_bw_to_rl_profile(struct ice_hw *hw, u32 bw, + struct ice_aqc_rl_profile_elem *profile) +{ + enum ice_status status = ICE_ERR_PARAM; + s64 bytes_per_sec, ts_rate, mv_tmp; + bool found = false; + s32 encode = 0; + s64 mv = 0; + s32 i; + + /* BW settings range is from 0.5Mb/sec to 100Gb/sec */ + if (bw < ICE_SCHED_MIN_BW || bw > ICE_SCHED_MAX_BW) + return status; + + /* Bytes per second from Kbps */ + bytes_per_sec = div64_long(((s64)bw * 1000), BITS_PER_BYTE); + + /* encode is 6 bits, but only 5 bits are really useful */ + for (i = 0; i < 64; i++) { + u64 pow_result = BIT_ULL(i); + + ts_rate = div64_long((s64)hw->psm_clk_freq, + pow_result * ICE_RL_PROF_TS_MULTIPLIER); + if (ts_rate <= 0) + continue; + + /* Multiplier value */ + mv_tmp = div64_long(bytes_per_sec * ICE_RL_PROF_MULTIPLIER, + ts_rate); + + /* Round to the nearest ICE_RL_PROF_MULTIPLIER */ + mv = round_up_64bit(mv_tmp, ICE_RL_PROF_MULTIPLIER); + + /* First multiplier value greater than the given + * accuracy bytes + */ + if (mv > ICE_RL_PROF_ACCURACY_BYTES) { + encode = i; + found = true; + break; + } + } + if (found) { + u16 wm; + + wm = ice_sched_calc_wakeup(hw, bw); + profile->rl_multiply = cpu_to_le16(mv); + profile->wake_up_calc = cpu_to_le16(wm); + profile->rl_encode = cpu_to_le16(encode); + status = 0; + } else { + status = ICE_ERR_DOES_NOT_EXIST; + } + + return status; +} + +/** + * ice_sched_add_rl_profile - add RL profile + * @hw: pointer to the hardware structure + * @rl_type: type of rate limit BW - min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * @layer_num: specifies in which layer to create profile + * + * This function first checks the existing list for corresponding BW + * parameter. If it exists, it returns the associated profile otherwise + * it creates a new rate limit profile for requested BW, and adds it to + * the HW DB and local list. It returns the new profile or null on error. + * The caller needs to hold the scheduler lock.
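Illustration (editorial, not part of the patch): the integer part of the wakeup value above is just the PSM clock divided by the byte rate. A rough standalone model of that first step; the 500 MHz clock is an assumed figure for the arithmetic, not a value taken from this patch:

static u64 example_wakeup_int(u64 psm_clk_hz, u32 bw_kbps)
{
	u64 bytes_per_sec = (u64)bw_kbps * 1000 / 8;

	/* e.g. 500000000 Hz at 100000 Kbps -> 12500000 B/s -> 40;
	 * since 40 <= 63, the fractional path in ice_sched_calc_wakeup
	 * runs and the result becomes (40 << 9) | fraction.
	 */
	return psm_clk_hz / bytes_per_sec;
}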
+ */ +static struct ice_aqc_rl_profile_info * +ice_sched_add_rl_profile(struct ice_hw *hw, enum ice_rl_type rl_type, + u32 bw, u8 layer_num) +{ + struct ice_aqc_rl_profile_info *rl_prof_elem; + u16 profiles_added = 0, num_profiles = 1; + struct ice_aqc_rl_profile_elem *buf; + enum ice_status status; + u8 profile_type; + + if (layer_num >= ICE_AQC_TOPO_MAX_LEVEL_NUM) + return NULL; + switch (rl_type) { + case ICE_MIN_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_CIR; + break; + case ICE_MAX_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_EIR; + break; + case ICE_SHARED_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_SRL; + break; + default: + return NULL; + } + + if (!hw) + return NULL; + list_for_each_entry(rl_prof_elem, &hw->rl_prof_list[layer_num], + list_entry) + if ((rl_prof_elem->profile.flags & ICE_AQC_RL_PROFILE_TYPE_M) == + profile_type && rl_prof_elem->bw == bw) + /* Return existing profile ID info */ + return rl_prof_elem; + + /* Create new profile ID */ + rl_prof_elem = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rl_prof_elem), + GFP_KERNEL); + + if (!rl_prof_elem) + return NULL; + + status = ice_sched_bw_to_rl_profile(hw, bw, &rl_prof_elem->profile); + if (status) + goto exit_add_rl_prof; + + rl_prof_elem->bw = bw; + /* layer_num is zero relative, and fw expects level from 1 to 9 */ + rl_prof_elem->profile.level = layer_num + 1; + rl_prof_elem->profile.flags = profile_type; + rl_prof_elem->profile.max_burst_size = cpu_to_le16(hw->max_burst_size); + + /* Create new entry in HW DB */ + buf = &rl_prof_elem->profile; + status = ice_aq_add_rl_profile(hw, num_profiles, buf, sizeof(*buf), + &profiles_added, NULL); + if (status || profiles_added != num_profiles) + goto exit_add_rl_prof; + + /* Good entry - add in the list */ + rl_prof_elem->prof_id_ref = 0; + list_add(&rl_prof_elem->list_entry, &hw->rl_prof_list[layer_num]); + return rl_prof_elem; + +exit_add_rl_prof: + devm_kfree(ice_hw_to_dev(hw), rl_prof_elem); + return NULL; +} + +/** + * ice_sched_cfg_node_bw_lmt - configure node sched params + * @hw: pointer to the HW struct + * @node: sched node to configure + * @rl_type: rate limit type CIR, EIR, or shared + * @rl_prof_id: rate limit profile ID + * + * This function configures node element's BW limit. + */ +static enum ice_status +ice_sched_cfg_node_bw_lmt(struct ice_hw *hw, struct ice_sched_node *node, + enum ice_rl_type rl_type, u16 rl_prof_id) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + + buf = node->info; + data = &buf.data; + switch (rl_type) { + case ICE_MIN_BW: + data->valid_sections |= ICE_AQC_ELEM_VALID_CIR; + data->cir_bw.bw_profile_idx = cpu_to_le16(rl_prof_id); + break; + case ICE_MAX_BW: + data->valid_sections |= ICE_AQC_ELEM_VALID_EIR; + data->eir_bw.bw_profile_idx = cpu_to_le16(rl_prof_id); + break; + case ICE_SHARED_BW: + data->valid_sections |= ICE_AQC_ELEM_VALID_SHARED; + data->srl_id = cpu_to_le16(rl_prof_id); + break; + default: + /* Unknown rate limit type */ + return ICE_ERR_PARAM; + } + + /* Configure element */ + return ice_sched_update_elem(hw, node, &buf); +} + +/** + * ice_sched_get_node_rl_prof_id - get node's rate limit profile ID + * @node: sched node + * @rl_type: rate limit type + * + * If existing profile matches, it returns the corresponding rate + * limit profile ID, otherwise it returns an invalid ID as error. 
+ */ +static u16 +ice_sched_get_node_rl_prof_id(struct ice_sched_node *node, + enum ice_rl_type rl_type) +{ + u16 rl_prof_id = ICE_SCHED_INVAL_PROF_ID; + struct ice_aqc_txsched_elem *data; + + data = &node->info.data; + switch (rl_type) { + case ICE_MIN_BW: + if (data->valid_sections & ICE_AQC_ELEM_VALID_CIR) + rl_prof_id = le16_to_cpu(data->cir_bw.bw_profile_idx); + break; + case ICE_MAX_BW: + if (data->valid_sections & ICE_AQC_ELEM_VALID_EIR) + rl_prof_id = le16_to_cpu(data->eir_bw.bw_profile_idx); + break; + case ICE_SHARED_BW: + if (data->valid_sections & ICE_AQC_ELEM_VALID_SHARED) + rl_prof_id = le16_to_cpu(data->srl_id); + break; + default: + break; + } + + return rl_prof_id; +} + +/** + * ice_sched_get_rl_prof_layer - selects rate limit profile creation layer + * @pi: port information structure + * @rl_type: type of rate limit BW - min, max, or shared + * @layer_index: layer index + * + * This function returns requested profile creation layer. + */ +static u8 +ice_sched_get_rl_prof_layer(struct ice_port_info *pi, enum ice_rl_type rl_type, + u8 layer_index) +{ + struct ice_hw *hw = pi->hw; + + if (layer_index >= hw->num_tx_sched_layers) + return ICE_SCHED_INVAL_LAYER_NUM; + switch (rl_type) { + case ICE_MIN_BW: + if (hw->layer_info[layer_index].max_cir_rl_profiles) + return layer_index; + break; + case ICE_MAX_BW: + if (hw->layer_info[layer_index].max_eir_rl_profiles) + return layer_index; + break; + case ICE_SHARED_BW: + /* if current layer doesn't support SRL profile creation + * then try a layer up or down. + */ + if (hw->layer_info[layer_index].max_srl_profiles) + return layer_index; + else if (layer_index < hw->num_tx_sched_layers - 1 && + hw->layer_info[layer_index + 1].max_srl_profiles) + return layer_index + 1; + else if (layer_index > 0 && + hw->layer_info[layer_index - 1].max_srl_profiles) + return layer_index - 1; + break; + default: + break; + } + return ICE_SCHED_INVAL_LAYER_NUM; +} + +/** + * ice_sched_get_srl_node - get shared rate limit node + * @node: tree node + * @srl_layer: shared rate limit layer + * + * This function returns SRL node to be used for shared rate limit purpose. + * The caller needs to hold scheduler lock. + */ +static struct ice_sched_node * +ice_sched_get_srl_node(struct ice_sched_node *node, u8 srl_layer) +{ + if (srl_layer > node->tx_sched_layer) + return node->children[0]; + else if (srl_layer < node->tx_sched_layer) + /* Node can't be created without a parent. It will always + * have a valid parent except root node. + */ + return node->parent; + else + return node; +} + +/** + * ice_sched_rm_rl_profile - remove RL profile ID + * @hw: pointer to the hardware structure + * @layer_num: layer number where profiles are saved + * @profile_type: profile type like EIR, CIR, or SRL + * @profile_id: profile ID to remove + * + * This function removes rate limit profile from layer 'layer_num' of type + * 'profile_type' and profile ID as 'profile_id'. The caller needs to hold + * scheduler lock. 
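Illustration (editorial, not part of the patch): ice_sched_get_srl_node above resolves the SRL target relative to the requested node, which is why the later validation insists on single-child topologies for the neighbouring-layer cases. In sketch form:

static struct ice_sched_node *
example_srl_pick(struct ice_sched_node *node, u8 srl_layer)
{
	/* same layer -> the node itself; one layer up -> node->parent;
	 * one layer down -> node->children[0]
	 */
	return ice_sched_get_srl_node(node, srl_layer);
}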
+ */ +static enum ice_status +ice_sched_rm_rl_profile(struct ice_hw *hw, u8 layer_num, u8 profile_type, + u16 profile_id) +{ + struct ice_aqc_rl_profile_info *rl_prof_elem; + enum ice_status status = 0; + + if (layer_num >= ICE_AQC_TOPO_MAX_LEVEL_NUM) + return ICE_ERR_PARAM; + /* Check the existing list for RL profile */ + list_for_each_entry(rl_prof_elem, &hw->rl_prof_list[layer_num], + list_entry) + if ((rl_prof_elem->profile.flags & ICE_AQC_RL_PROFILE_TYPE_M) == + profile_type && + le16_to_cpu(rl_prof_elem->profile.profile_id) == + profile_id) { + if (rl_prof_elem->prof_id_ref) + rl_prof_elem->prof_id_ref--; + + /* Remove old profile ID from database */ + status = ice_sched_del_rl_profile(hw, rl_prof_elem); + if (status && status != ICE_ERR_IN_USE) + ice_debug(hw, ICE_DBG_SCHED, "Remove rl profile failed\n"); + break; + } + if (status == ICE_ERR_IN_USE) + status = 0; + return status; +} + +/** + * ice_sched_set_node_bw_dflt - set node's bandwidth limit to default + * @pi: port information structure + * @node: pointer to node structure + * @rl_type: rate limit type min, max, or shared + * @layer_num: layer number where RL profiles are saved + * + * This function configures node element's BW rate limit profile ID of + * type CIR, EIR, or SRL to default. This function needs to be called + * with the scheduler lock held. + */ +static enum ice_status +ice_sched_set_node_bw_dflt(struct ice_port_info *pi, + struct ice_sched_node *node, + enum ice_rl_type rl_type, u8 layer_num) +{ + enum ice_status status; + struct ice_hw *hw; + u8 profile_type; + u16 rl_prof_id; + u16 old_id; + + hw = pi->hw; + switch (rl_type) { + case ICE_MIN_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_CIR; + rl_prof_id = ICE_SCHED_DFLT_RL_PROF_ID; + break; + case ICE_MAX_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_EIR; + rl_prof_id = ICE_SCHED_DFLT_RL_PROF_ID; + break; + case ICE_SHARED_BW: + profile_type = ICE_AQC_RL_PROFILE_TYPE_SRL; + /* No SRL is configured for default case */ + rl_prof_id = ICE_SCHED_NO_SHARED_RL_PROF_ID; + break; + default: + return ICE_ERR_PARAM; + } + /* Save existing RL prof ID for later clean up */ + old_id = ice_sched_get_node_rl_prof_id(node, rl_type); + /* Configure BW scheduling parameters */ + status = ice_sched_cfg_node_bw_lmt(hw, node, rl_type, rl_prof_id); + if (status) + return status; + + /* Remove stale RL profile ID */ + if (old_id == ICE_SCHED_DFLT_RL_PROF_ID || + old_id == ICE_SCHED_INVAL_PROF_ID) + return 0; + + return ice_sched_rm_rl_profile(hw, layer_num, profile_type, old_id); +} + +/** + * ice_sched_set_node_bw - set node's bandwidth + * @pi: port information structure + * @node: tree node + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * @layer_num: layer number + * + * This function adds new profile corresponding to requested BW, configures + * node's RL profile ID of type CIR, EIR, or SRL, and removes old profile + * ID from local database. The caller needs to hold scheduler lock. 
+ */ +static enum ice_status +ice_sched_set_node_bw(struct ice_port_info *pi, struct ice_sched_node *node, + enum ice_rl_type rl_type, u32 bw, u8 layer_num) +{ + struct ice_aqc_rl_profile_info *rl_prof_info; + enum ice_status status = ICE_ERR_PARAM; + struct ice_hw *hw = pi->hw; + u16 old_id, rl_prof_id; + + rl_prof_info = ice_sched_add_rl_profile(hw, rl_type, bw, layer_num); + if (!rl_prof_info) + return status; + + rl_prof_id = le16_to_cpu(rl_prof_info->profile.profile_id); + + /* Save existing RL prof ID for later clean up */ + old_id = ice_sched_get_node_rl_prof_id(node, rl_type); + /* Configure BW scheduling parameters */ + status = ice_sched_cfg_node_bw_lmt(hw, node, rl_type, rl_prof_id); + if (status) + return status; + + /* New changes have been applied */ + /* Increment the profile ID reference count */ + rl_prof_info->prof_id_ref++; + + /* Check for old ID removal */ + if ((old_id == ICE_SCHED_DFLT_RL_PROF_ID && rl_type != ICE_SHARED_BW) || + old_id == ICE_SCHED_INVAL_PROF_ID || old_id == rl_prof_id) + return 0; + + return ice_sched_rm_rl_profile(hw, layer_num, + rl_prof_info->profile.flags & + ICE_AQC_RL_PROFILE_TYPE_M, old_id); +} + +/** + * ice_sched_set_node_bw_lmt - set node's BW limit + * @pi: port information structure + * @node: tree node + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * It updates node's BW limit parameters like BW RL profile ID of type CIR, + * EIR, or SRL. The caller needs to hold scheduler lock. + * + * NOTE: Caller provides the correct SRL node in case of shared profile + * settings. + */ +static enum ice_status +ice_sched_set_node_bw_lmt(struct ice_port_info *pi, struct ice_sched_node *node, + enum ice_rl_type rl_type, u32 bw) +{ + struct ice_hw *hw; + u8 layer_num; + + if (!pi) + return ICE_ERR_PARAM; + hw = pi->hw; + /* Remove unused RL profile IDs from HW and SW DB */ + ice_sched_rm_unused_rl_prof(hw); + + layer_num = ice_sched_get_rl_prof_layer(pi, rl_type, + node->tx_sched_layer); + if (layer_num >= hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + + if (bw == ICE_SCHED_DFLT_BW) + return ice_sched_set_node_bw_dflt(pi, node, rl_type, layer_num); + return ice_sched_set_node_bw(pi, node, rl_type, bw, layer_num); +} + + +/** + * ice_sched_set_node_bw_dflt_lmt - set node's BW limit to default + * @pi: port information structure + * @node: pointer to node structure + * @rl_type: rate limit type min, max, or shared + * + * This function configures node element's BW rate limit profile ID of + * type CIR, EIR, or SRL to default. This function needs to be called + * with the scheduler lock held. + */ +static enum ice_status +ice_sched_set_node_bw_dflt_lmt(struct ice_port_info *pi, + struct ice_sched_node *node, + enum ice_rl_type rl_type) +{ + return ice_sched_set_node_bw_lmt(pi, node, rl_type, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_sched_validate_srl_node - Check node for SRL applicability + * @node: sched node to configure + * @sel_layer: selected SRL layer + * + * This function checks if the SRL can be applied to a selected layer node on + * behalf of the requested node (first argument). This function needs to be + * called with scheduler lock held. + */ +static enum ice_status +ice_sched_validate_srl_node(struct ice_sched_node *node, u8 sel_layer) +{ + /* SRL profiles are not available on all layers. Check if the + * SRL profile can be applied to a node above or below the + * requested node. SRL configuration is possible only if the + * selected layer's node has single child.
+ */ + if (sel_layer == node->tx_sched_layer || + ((sel_layer == node->tx_sched_layer + 1) && + node->num_children == 1) || + ((sel_layer == node->tx_sched_layer - 1) && + (node->parent && node->parent->num_children == 1))) + return 0; + + return ICE_ERR_CFG; +} + +/** + * ice_sched_save_q_bw - save queue node's BW information + * @q_ctx: queue context structure + * @rl_type: rate limit type min, max, or shared + * @bw: bandwidth in Kbps - Kilo bits per sec + * + * Save BW information of queue type node for post replay use. + */ +static enum ice_status +ice_sched_save_q_bw(struct ice_q_ctx *q_ctx, enum ice_rl_type rl_type, u32 bw) +{ + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw(&q_ctx->bw_t_info, bw); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw(&q_ctx->bw_t_info, bw); + break; + case ICE_SHARED_BW: + ice_set_clear_shared_bw(&q_ctx->bw_t_info, bw); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_sched_set_q_bw_lmt - sets queue BW limit + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @q_handle: software queue handle + * @rl_type: min, max, or shared + * @bw: bandwidth in Kbps + * + * This function sets BW limit of queue scheduling node. + */ +static enum ice_status +ice_sched_set_q_bw_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_node *node; + struct ice_q_ctx *q_ctx; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + mutex_lock(&pi->sched_lock); + q_ctx = ice_get_lan_q_ctx(pi->hw, vsi_handle, tc, q_handle); + if (!q_ctx) + goto exit_q_bw_lmt; + node = ice_sched_find_node_by_teid(pi->root, q_ctx->q_teid); + if (!node) { + ice_debug(pi->hw, ICE_DBG_SCHED, "Wrong q_teid\n"); + goto exit_q_bw_lmt; + } + + /* Return error if it is not a leaf node */ + if (node->info.data.elem_type != ICE_AQC_ELEM_TYPE_LEAF) + goto exit_q_bw_lmt; + + /* SRL bandwidth layer selection */ + if (rl_type == ICE_SHARED_BW) { + u8 sel_layer; /* selected layer */ + + sel_layer = ice_sched_get_rl_prof_layer(pi, rl_type, + node->tx_sched_layer); + if (sel_layer >= pi->hw->num_tx_sched_layers) { + status = ICE_ERR_PARAM; + goto exit_q_bw_lmt; + } + status = ice_sched_validate_srl_node(node, sel_layer); + if (status) + goto exit_q_bw_lmt; + } + + if (bw == ICE_SCHED_DFLT_BW) + status = ice_sched_set_node_bw_dflt_lmt(pi, node, rl_type); + else + status = ice_sched_set_node_bw_lmt(pi, node, rl_type, bw); + + if (!status) + status = ice_sched_save_q_bw(q_ctx, rl_type, bw); + +exit_q_bw_lmt: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_q_bw_lmt - configure queue BW limit + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @q_handle: software queue handle + * @rl_type: min, max, or shared + * @bw: bandwidth in Kbps + * + * This function configures BW limit of queue scheduling node. + */ +enum ice_status +ice_cfg_q_bw_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type, u32 bw) +{ + return ice_sched_set_q_bw_lmt(pi, vsi_handle, tc, q_handle, rl_type, + bw); +} + +/** + * ice_cfg_q_bw_dflt_lmt - configure queue BW default limit + * @pi: port information structure + * @vsi_handle: sw VSI handle + * @tc: traffic class + * @q_handle: software queue handle + * @rl_type: min, max, or shared + * + * This function configures BW default limit of queue scheduling node. 
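Illustration (editorial, not part of the patch): a sketch of the queue-level API above; the VSI handle 5, TC 0, queue handle 0, and the 300 Mbps cap are invented inputs:

static enum ice_status example_q_lmt(struct ice_port_info *pi)
{
	enum ice_status status;

	/* cap the queue's max rate at 300 Mbps (bw is in Kbps) */
	status = ice_cfg_q_bw_lmt(pi, 5, 0, 0, ICE_MAX_BW, 300000);
	if (status)
		return status;
	/* restore the default, i.e. remove the cap */
	return ice_cfg_q_bw_dflt_lmt(pi, 5, 0, 0, ICE_MAX_BW);
}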
+ */ +enum ice_status +ice_cfg_q_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type) +{ + return ice_sched_set_q_bw_lmt(pi, vsi_handle, tc, q_handle, rl_type, + ICE_SCHED_DFLT_BW); +} + +/** + * ice_sched_save_tc_node_bw - save TC node BW limit + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function saves the modified values of bandwidth settings for later + * replay purpose (restore) after reset. + */ +static enum ice_status +ice_sched_save_tc_node_bw(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw(&pi->tc_node_bw_t_info[tc], bw); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw(&pi->tc_node_bw_t_info[tc], bw); + break; + case ICE_SHARED_BW: + ice_set_clear_shared_bw(&pi->tc_node_bw_t_info[tc], bw); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_sched_set_tc_node_bw_lmt - sets TC node BW limit + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function configures bandwidth limit of TC node. + */ +static enum ice_status +ice_sched_set_tc_node_bw_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_node *tc_node; + + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return status; + mutex_lock(&pi->sched_lock); + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + goto exit_set_tc_node_bw; + if (bw == ICE_SCHED_DFLT_BW) + status = ice_sched_set_node_bw_dflt_lmt(pi, tc_node, rl_type); + else + status = ice_sched_set_node_bw_lmt(pi, tc_node, rl_type, bw); + if (!status) + status = ice_sched_save_tc_node_bw(pi, tc, rl_type, bw); + +exit_set_tc_node_bw: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_tc_node_bw_lmt - configure TC node BW limit + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function configures BW limit of TC node. + * Note: The minimum guaranteed reservation is done via DCBX. + */ +enum ice_status +ice_cfg_tc_node_bw_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + return ice_sched_set_tc_node_bw_lmt(pi, tc, rl_type, bw); +} + +/** + * ice_cfg_tc_node_bw_dflt_lmt - configure TC node BW default limit + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * + * This function configures BW default limit of TC node. + */ +enum ice_status +ice_cfg_tc_node_bw_dflt_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type) +{ + return ice_sched_set_tc_node_bw_lmt(pi, tc, rl_type, ICE_SCHED_DFLT_BW); +} + +/** + * ice_sched_save_tc_node_bw_alloc - save TC node's BW alloc information + * @pi: port information structure + * @tc: traffic class + * @rl_type: rate limit type min or max + * @bw_alloc: Bandwidth allocation information + * + * Save BW alloc information of TC type node for post replay use.
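Illustration (editorial, not part of the patch): a sketch of the TC-level wrapper above; TC 2 and the 2 Gbps value are made up, and the minimum guarantee still comes from DCBX as noted:

static enum ice_status example_tc_cap(struct ice_port_info *pi)
{
	/* cap all of TC 2 on this port at 2 Gbps (bw is in Kbps) */
	return ice_cfg_tc_node_bw_lmt(pi, 2, ICE_MAX_BW, 2000000);
}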
+ */ +static enum ice_status +ice_sched_save_tc_node_bw_alloc(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u16 bw_alloc) +{ + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return ICE_ERR_PARAM; + switch (rl_type) { + case ICE_MIN_BW: + ice_set_clear_cir_bw_alloc(&pi->tc_node_bw_t_info[tc], + bw_alloc); + break; + case ICE_MAX_BW: + ice_set_clear_eir_bw_alloc(&pi->tc_node_bw_t_info[tc], + bw_alloc); + break; + default: + return ICE_ERR_PARAM; + } + return 0; +} + +/** + * ice_sched_set_tc_node_bw_alloc - set TC node BW alloc + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw_alloc: bandwidth alloc + * + * This function configures bandwidth alloc of TC node, also saves the + * changed settings for replay purpose, and returns success if it succeeds + * in modifying bandwidth alloc setting. + */ +static enum ice_status +ice_sched_set_tc_node_bw_alloc(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u8 bw_alloc) +{ + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_node *tc_node; + + if (tc >= ICE_MAX_TRAFFIC_CLASS) + return status; + mutex_lock(&pi->sched_lock); + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + goto exit_set_tc_node_bw_alloc; + status = ice_sched_cfg_node_bw_alloc(pi->hw, tc_node, rl_type, + bw_alloc); + if (status) + goto exit_set_tc_node_bw_alloc; + status = ice_sched_save_tc_node_bw_alloc(pi, tc, rl_type, bw_alloc); + +exit_set_tc_node_bw_alloc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_cfg_tc_node_bw_alloc - configure TC node BW alloc + * @pi: port information structure + * @tc: TC number + * @rl_type: min or max + * @bw_alloc: bandwidth alloc + * + * This function configures BW allocation of TC node. + * Note: The minimum guaranteed reservation is done via DCBX. + */ +enum ice_status +ice_cfg_tc_node_bw_alloc(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u8 bw_alloc) +{ + return ice_sched_set_tc_node_bw_alloc(pi, tc, rl_type, bw_alloc); +} + +/** + * ice_sched_set_agg_bw_dflt_lmt - set aggregator node's BW limit to default + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function retrieves the aggregator ID based on VSI ID and TC, + * and sets node's BW limit to default. This function needs to be + * called with the scheduler lock held. + */ +enum ice_status +ice_sched_set_agg_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle) +{ + struct ice_vsi_ctx *vsi_ctx; + enum ice_status status = 0; + u8 tc; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + + ice_for_each_traffic_class(tc) { + struct ice_sched_node *node; + + node = vsi_ctx->sched.ag_node[tc]; + if (!node) + continue; + + /* Set min profile to default */ + status = ice_sched_set_node_bw_dflt_lmt(pi, node, ICE_MIN_BW); + if (status) + break; + + /* Set max profile to default */ + status = ice_sched_set_node_bw_dflt_lmt(pi, node, ICE_MAX_BW); + if (status) + break; + + /* Remove shared profile, if there is one */ + status = ice_sched_set_node_bw_dflt_lmt(pi, node, + ICE_SHARED_BW); + if (status) + break; + } + + return status; +} + +/** + * ice_sched_get_node_by_id_type - get node from ID type + * @pi: port information structure + * @id: identifier + * @agg_type: type of aggregator + * @tc: traffic class + * + * This function returns node identified by ID of type aggregator, and + * based on traffic class (TC).
This function needs to be called with + the scheduler lock held. + */ +static struct ice_sched_node * +ice_sched_get_node_by_id_type(struct ice_port_info *pi, u32 id, + enum ice_agg_type agg_type, u8 tc) +{ + struct ice_sched_node *node = NULL; + struct ice_sched_node *child_node; + + switch (agg_type) { + case ICE_AGG_TYPE_VSI: { + struct ice_vsi_ctx *vsi_ctx; + u16 vsi_handle = (u16)id; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + break; + /* Get sched_vsi_info */ + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + break; + node = vsi_ctx->sched.vsi_node[tc]; + break; + } + + case ICE_AGG_TYPE_AGG: { + struct ice_sched_node *tc_node; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (tc_node) + node = ice_sched_get_agg_node(pi, tc_node, id); + break; + } + + case ICE_AGG_TYPE_Q: + /* The current implementation allows single queue to modify */ + node = ice_sched_get_node(pi, id); + break; + + case ICE_AGG_TYPE_QG: + /* The current implementation allows single qg to modify */ + child_node = ice_sched_get_node(pi, id); + if (!child_node) + break; + node = child_node->parent; + break; + + default: + break; + } + + return node; +} + +/** + * ice_sched_set_node_bw_lmt_per_tc - set node BW limit per TC + * @pi: port information structure + * @id: ID (software VSI handle or AGG ID) + * @agg_type: aggregator type (VSI or AGG type node) + * @tc: traffic class + * @rl_type: min or max + * @bw: bandwidth in Kbps + * + * This function sets BW limit of VSI or Aggregator scheduling node + * based on TC information from passed in argument BW. + */ +enum ice_status +ice_sched_set_node_bw_lmt_per_tc(struct ice_port_info *pi, u32 id, + enum ice_agg_type agg_type, u8 tc, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status = ICE_ERR_PARAM; + struct ice_sched_node *node; + + if (!pi) + return status; + + if (rl_type == ICE_UNKNOWN_BW) + return status; + + mutex_lock(&pi->sched_lock); + node = ice_sched_get_node_by_id_type(pi, id, agg_type, tc); + if (!node) { + ice_debug(pi->hw, ICE_DBG_SCHED, "Wrong id, agg type, or tc\n"); + goto exit_set_node_bw_lmt_per_tc; + } + if (bw == ICE_SCHED_DFLT_BW) + status = ice_sched_set_node_bw_dflt_lmt(pi, node, rl_type); + else + status = ice_sched_set_node_bw_lmt(pi, node, rl_type, bw); + +exit_set_node_bw_lmt_per_tc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_validate_vsi_srl_node - validate VSI SRL node + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function validates SRL node of the VSI node if available SRL layer is + * different than the VSI node layer on all TC(s). This function needs to be + * called with scheduler lock held.
+ */ +static enum ice_status +ice_sched_validate_vsi_srl_node(struct ice_port_info *pi, u16 vsi_handle) +{ + u8 sel_layer = ICE_SCHED_INVAL_LAYER_NUM; + u8 tc; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *vsi_node; + enum ice_rl_type rl_type = ICE_SHARED_BW; + enum ice_status status; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + continue; + + /* SRL bandwidth layer selection */ + if (sel_layer == ICE_SCHED_INVAL_LAYER_NUM) { + u8 node_layer = vsi_node->tx_sched_layer; + u8 layer_num; + + layer_num = ice_sched_get_rl_prof_layer(pi, rl_type, + node_layer); + if (layer_num >= pi->hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + sel_layer = layer_num; + } + + status = ice_sched_validate_srl_node(vsi_node, sel_layer); + if (status) + return status; + } + return 0; +} + +/** + * ice_sched_set_save_vsi_srl_node_bw - set VSI shared limit values + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: traffic class + * @srl_node: sched node to configure + * @rl_type: rate limit type minimum, maximum, or shared + * @bw: minimum, maximum, or shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of VSI type nodes across the given + * traffic class, and saves those values for later use for replaying purposes. + * The caller holds the scheduler lock. + */ +static enum ice_status +ice_sched_set_save_vsi_srl_node_bw(struct ice_port_info *pi, u16 vsi_handle, + u8 tc, struct ice_sched_node *srl_node, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status; + + if (bw == ICE_SCHED_DFLT_BW) { + status = ice_sched_set_node_bw_dflt_lmt(pi, srl_node, rl_type); + } else { + status = ice_sched_set_node_bw_lmt(pi, srl_node, rl_type, bw); + if (status) + return status; + status = ice_sched_save_vsi_bw(pi, vsi_handle, tc, rl_type, bw); + } + return status; +} + +/** + * ice_sched_set_vsi_node_srl_per_tc - set VSI node BW shared limit for tc + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: traffic class + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of VSI type nodes across requested + * traffic class for VSI matching handle. When BW value of ICE_SCHED_DFLT_BW + * is passed, it removes the corresponding bw from the node. The caller + * holds scheduler lock.
+ */ +static enum ice_status +ice_sched_set_vsi_node_srl_per_tc(struct ice_port_info *pi, u16 vsi_handle, + u8 tc, u32 min_bw, u32 max_bw, u32 shared_bw) +{ + struct ice_sched_node *tc_node, *vsi_node, *cfg_node; + enum ice_status status; + u8 layer_num; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + return ICE_ERR_CFG; + + layer_num = ice_sched_get_rl_prof_layer(pi, ICE_SHARED_BW, + vsi_node->tx_sched_layer); + if (layer_num >= pi->hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + + /* SRL node may be different */ + cfg_node = ice_sched_get_srl_node(vsi_node, layer_num); + if (!cfg_node) + return ICE_ERR_CFG; + + status = ice_sched_set_save_vsi_srl_node_bw(pi, vsi_handle, tc, + cfg_node, ICE_MIN_BW, + min_bw); + if (status) + return status; + + status = ice_sched_set_save_vsi_srl_node_bw(pi, vsi_handle, tc, + cfg_node, ICE_MAX_BW, + max_bw); + if (status) + return status; + + return ice_sched_set_save_vsi_srl_node_bw(pi, vsi_handle, tc, cfg_node, + ICE_SHARED_BW, shared_bw); +} + +/** + * ice_sched_set_vsi_bw_shared_lmt - set VSI BW shared limit + * @pi: port information structure + * @vsi_handle: software VSI handle + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of all VSI type nodes across all traffic + * classes for VSI matching handle. When BW value of ICE_SCHED_DFLT_BW is + * passed, it removes those value(s) from the node. + */ +enum ice_status +ice_sched_set_vsi_bw_shared_lmt(struct ice_port_info *pi, u16 vsi_handle, + u32 min_bw, u32 max_bw, u32 shared_bw) +{ + enum ice_status status = 0; + u8 tc; + + if (!pi) + return ICE_ERR_PARAM; + + if (!ice_is_vsi_valid(pi->hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + status = ice_sched_validate_vsi_srl_node(pi, vsi_handle); + if (status) + goto exit_set_vsi_bw_shared_lmt; + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *vsi_node; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + continue; + + status = ice_sched_set_vsi_node_srl_per_tc(pi, vsi_handle, tc, + min_bw, max_bw, + shared_bw); + if (status) + break; + } + +exit_set_vsi_bw_shared_lmt: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_validate_agg_srl_node - validate AGG SRL node + * @pi: port information structure + * @agg_id: aggregator ID + * + * This function validates SRL node of the AGG node if available SRL layer is + * different than the AGG node layer on all TC(s). This function needs to be + * called with scheduler lock held.
+ */ +static enum ice_status +ice_sched_validate_agg_srl_node(struct ice_port_info *pi, u32 agg_id) +{ + u8 sel_layer = ICE_SCHED_INVAL_LAYER_NUM; + struct ice_sched_agg_info *agg_info; + bool agg_id_present = false; + enum ice_status status = 0; + u8 tc; + + list_for_each_entry(agg_info, &pi->hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) { + agg_id_present = true; + break; + } + if (!agg_id_present) + return ICE_ERR_PARAM; + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *agg_node; + enum ice_rl_type rl_type = ICE_SHARED_BW; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + continue; + /* SRL bandwidth layer selection */ + if (sel_layer == ICE_SCHED_INVAL_LAYER_NUM) { + u8 node_layer = agg_node->tx_sched_layer; + u8 layer_num; + + layer_num = ice_sched_get_rl_prof_layer(pi, rl_type, + node_layer); + if (layer_num >= pi->hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + sel_layer = layer_num; + } + + status = ice_sched_validate_srl_node(agg_node, sel_layer); + if (status) + break; + } + return status; +} + +/** + * ice_sched_validate_agg_id - Validate aggregator ID + * @pi: port information structure + * @agg_id: aggregator ID + * + * This function validates the aggregator ID. Caller holds the scheduler lock. + */ +static enum ice_status +ice_sched_validate_agg_id(struct ice_port_info *pi, u32 agg_id) +{ + struct ice_sched_agg_info *agg_info; + struct ice_sched_agg_info *tmp; + bool agg_id_present = false; + enum ice_status status; + + status = ice_sched_validate_agg_srl_node(pi, agg_id); + if (status) + return status; + + list_for_each_entry_safe(agg_info, tmp, &pi->hw->agg_list, list_entry) + if (agg_info->agg_id == agg_id) { + agg_id_present = true; + break; + } + + if (!agg_id_present) + return ICE_ERR_PARAM; + + return 0; +} + +/** + * ice_sched_set_save_agg_srl_node_bw - set aggregator shared limit values + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @srl_node: sched node to configure + * @rl_type: rate limit type minimum, maximum, or shared + * @bw: minimum, maximum, or shared bandwidth in Kbps + * + * Configure shared rate limiter(SRL) of aggregator type nodes across + * requested traffic class, and saves those values for later use for + * replaying purposes. The caller holds the scheduler lock. + */ +static enum ice_status +ice_sched_set_save_agg_srl_node_bw(struct ice_port_info *pi, u32 agg_id, u8 tc, + struct ice_sched_node *srl_node, + enum ice_rl_type rl_type, u32 bw) +{ + enum ice_status status; + + if (bw == ICE_SCHED_DFLT_BW) { + status = ice_sched_set_node_bw_dflt_lmt(pi, srl_node, rl_type); + } else { + status = ice_sched_set_node_bw_lmt(pi, srl_node, rl_type, bw); + if (status) + return status; + status = ice_sched_save_agg_bw(pi, agg_id, tc, rl_type, bw); + } + return status; +} + +/** + * ice_sched_set_agg_node_srl_per_tc - set aggregator SRL per tc + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of aggregator type + * node for a given traffic class for aggregator matching agg_id. When BW + * value of ICE_SCHED_DFLT_BW is passed, it removes SRL from the node. Caller + * holds the scheduler lock.
+ */ +static enum ice_status +ice_sched_set_agg_node_srl_per_tc(struct ice_port_info *pi, u32 agg_id, + u8 tc, u32 min_bw, u32 max_bw, u32 shared_bw) +{ + struct ice_sched_node *tc_node, *agg_node, *cfg_node; + enum ice_rl_type rl_type = ICE_SHARED_BW; + enum ice_status status = ICE_ERR_CFG; + u8 layer_num; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + return ICE_ERR_CFG; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + return ICE_ERR_CFG; + + layer_num = ice_sched_get_rl_prof_layer(pi, rl_type, + agg_node->tx_sched_layer); + if (layer_num >= pi->hw->num_tx_sched_layers) + return ICE_ERR_PARAM; + + /* SRL node may be different */ + cfg_node = ice_sched_get_srl_node(agg_node, layer_num); + if (!cfg_node) + return ICE_ERR_CFG; + + status = ice_sched_set_save_agg_srl_node_bw(pi, agg_id, tc, cfg_node, + ICE_MIN_BW, min_bw); + if (status) + return status; + + status = ice_sched_set_save_agg_srl_node_bw(pi, agg_id, tc, cfg_node, + ICE_MAX_BW, max_bw); + if (status) + return status; + + status = ice_sched_set_save_agg_srl_node_bw(pi, agg_id, tc, cfg_node, + ICE_SHARED_BW, shared_bw); + return status; +} + +/** + * ice_sched_set_agg_bw_shared_lmt - set aggregator BW shared limit + * @pi: port information structure + * @agg_id: aggregator ID + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of all aggregator type + * nodes across all traffic classes for aggregator matching agg_id. When + * BW value of ICE_SCHED_DFLT_BW is passed, it removes SRL from the + * node(s). + */ +enum ice_status +ice_sched_set_agg_bw_shared_lmt(struct ice_port_info *pi, u32 agg_id, + u32 min_bw, u32 max_bw, u32 shared_bw) +{ + enum ice_status status; + u8 tc; + + if (!pi) + return ICE_ERR_PARAM; + + mutex_lock(&pi->sched_lock); + status = ice_sched_validate_agg_id(pi, agg_id); + if (status) + goto exit_agg_bw_shared_lmt; + + /* Return success if no nodes are present across TC */ + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node, *agg_node; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + + agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id); + if (!agg_node) + continue; + + status = ice_sched_set_agg_node_srl_per_tc(pi, agg_id, tc, + min_bw, max_bw, + shared_bw); + if (status) + break; + } + +exit_agg_bw_shared_lmt: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_set_agg_bw_shared_lmt_per_tc - set aggregator BW shared lmt per tc + * @pi: port information structure + * @agg_id: aggregator ID + * @tc: traffic class + * @min_bw: minimum bandwidth in Kbps + * @max_bw: maximum bandwidth in Kbps + * @shared_bw: shared bandwidth in Kbps + * + * This function configures the shared rate limiter(SRL) of aggregator type + * node for a given traffic class for aggregator matching agg_id. When BW + * value of ICE_SCHED_DFLT_BW is passed, it removes SRL from the node. 
+ */ +enum ice_status +ice_sched_set_agg_bw_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, + u8 tc, u32 min_bw, u32 max_bw, + u32 shared_bw) +{ + enum ice_status status; + + if (!pi) + return ICE_ERR_PARAM; + mutex_lock(&pi->sched_lock); + status = ice_sched_validate_agg_id(pi, agg_id); + if (status) + goto exit_agg_bw_shared_lmt_per_tc; + + status = ice_sched_set_agg_node_srl_per_tc(pi, agg_id, tc, min_bw, + max_bw, shared_bw); + +exit_agg_bw_shared_lmt_per_tc: + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_cfg_sibl_node_prio - configure node sibling priority + * @pi: port information structure + * @node: sched node to configure + * @priority: sibling priority + * + * This function configures node element's sibling priority only. This + * function needs to be called with scheduler lock held. + */ +enum ice_status +ice_sched_cfg_sibl_node_prio(struct ice_port_info *pi, + struct ice_sched_node *node, u8 priority) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + struct ice_hw *hw = pi->hw; + enum ice_status status; + + if (!hw) + return ICE_ERR_PARAM; + buf = node->info; + data = &buf.data; + data->valid_sections |= ICE_AQC_ELEM_VALID_GENERIC; + priority = (priority << ICE_AQC_ELEM_GENERIC_PRIO_S) & + ICE_AQC_ELEM_GENERIC_PRIO_M; + data->generic &= ~ICE_AQC_ELEM_GENERIC_PRIO_M; + data->generic |= priority; + + /* Configure element */ + status = ice_sched_update_elem(hw, node, &buf); + return status; +} + +/** + * ice_cfg_rl_burst_size - Set burst size value + * @hw: pointer to the HW struct + * @bytes: burst size in bytes + * + * This function configures/sets the burst size to the requested new value. + * The new burst size value is used for future rate limit calls. It doesn't + * change the existing or previously created RL profiles. + */ +enum ice_status ice_cfg_rl_burst_size(struct ice_hw *hw, u32 bytes) +{ + u16 burst_size_to_prog; + + if (bytes < ICE_MIN_BURST_SIZE_ALLOWED || + bytes > ICE_MAX_BURST_SIZE_ALLOWED) + return ICE_ERR_PARAM; + if (ice_round_to_num(bytes, 64) <= + ICE_MAX_BURST_SIZE_64_BYTE_GRANULARITY) { + /* 64 byte granularity case */ + /* Disable MSB granularity bit */ + burst_size_to_prog = ICE_64_BYTE_GRANULARITY; + /* round number to nearest 64 byte granularity */ + bytes = ice_round_to_num(bytes, 64); + /* The value is in 64 byte chunks */ + burst_size_to_prog |= (u16)(bytes / 64); + } else { + /* k bytes granularity case */ + /* Enable MSB granularity bit */ + burst_size_to_prog = ICE_KBYTE_GRANULARITY; + /* round number to nearest 1024 granularity */ + bytes = ice_round_to_num(bytes, 1024); + /* check rounding doesn't go beyond allowed */ + if (bytes > ICE_MAX_BURST_SIZE_KBYTE_GRANULARITY) + bytes = ICE_MAX_BURST_SIZE_KBYTE_GRANULARITY; + /* The value is in k bytes */ + burst_size_to_prog |= (u16)(bytes / 1024); + } + hw->max_burst_size = burst_size_to_prog; + return 0; +} + +/** + * ice_sched_replay_node_prio - re-configure node priority + * @hw: pointer to the HW struct + * @node: sched node to configure + * @priority: priority value + * + * This function configures node element's priority value. It + * needs to be called with scheduler lock held.
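Illustration (editorial, not part of the patch): the burst-size encoding above stores a chunk count plus a granularity flag. A rough model of the chunk arithmetic, assuming the requested sizes land in the respective granularity ranges; this helper mirrors, not replaces, ice_cfg_rl_burst_size:

static u16 example_burst_chunks(u32 bytes, bool kbyte_gran)
{
	u32 gran = kbyte_gran ? 1024 : 64;

	/* e.g. 4096 bytes -> 64 chunks of 64 B (MSB flag clear);
	 * 300000 bytes rounds to 300032 -> 293 chunks of 1 KB (MSB set)
	 */
	return (u16)(ice_round_to_num(bytes, gran) / gran);
}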
+ */ +static enum ice_status +ice_sched_replay_node_prio(struct ice_hw *hw, struct ice_sched_node *node, + u8 priority) +{ + struct ice_aqc_txsched_elem_data buf; + struct ice_aqc_txsched_elem *data; + enum ice_status status; + + buf = node->info; + data = &buf.data; + data->valid_sections |= ICE_AQC_ELEM_VALID_GENERIC; + data->generic = priority; + + /* Configure element */ + status = ice_sched_update_elem(hw, node, &buf); + return status; +} + +/** + * ice_sched_replay_node_bw - replay node(s) BW + * @hw: pointer to the HW struct + * @node: sched node to configure + * @bw_t_info: BW type information + * + * This function restores node's BW from bw_t_info. The caller needs + * to hold the scheduler lock. + */ +static enum ice_status +ice_sched_replay_node_bw(struct ice_hw *hw, struct ice_sched_node *node, + struct ice_bw_type_info *bw_t_info) +{ + struct ice_port_info *pi = hw->port_info; + enum ice_status status = ICE_ERR_PARAM; + u16 bw_alloc; + + if (!node) + return status; + if (bitmap_empty(bw_t_info->bw_t_bitmap, ICE_BW_TYPE_CNT)) + return 0; + if (test_bit(ICE_BW_TYPE_PRIO, bw_t_info->bw_t_bitmap)) { + status = ice_sched_replay_node_prio(hw, node, + bw_t_info->generic); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_CIR, bw_t_info->bw_t_bitmap)) { + status = ice_sched_set_node_bw_lmt(pi, node, ICE_MIN_BW, + bw_t_info->cir_bw.bw); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_CIR_WT, bw_t_info->bw_t_bitmap)) { + bw_alloc = bw_t_info->cir_bw.bw_alloc; + status = ice_sched_cfg_node_bw_alloc(hw, node, ICE_MIN_BW, + bw_alloc); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_EIR, bw_t_info->bw_t_bitmap)) { + status = ice_sched_set_node_bw_lmt(pi, node, ICE_MAX_BW, + bw_t_info->eir_bw.bw); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_EIR_WT, bw_t_info->bw_t_bitmap)) { + bw_alloc = bw_t_info->eir_bw.bw_alloc; + status = ice_sched_cfg_node_bw_alloc(hw, node, ICE_MAX_BW, + bw_alloc); + if (status) + return status; + } + if (test_bit(ICE_BW_TYPE_SHARED, bw_t_info->bw_t_bitmap)) + status = ice_sched_set_node_bw_lmt(pi, node, ICE_SHARED_BW, + bw_t_info->shared_bw); + return status; +} + +/** + * ice_sched_replay_agg_bw - replay aggregator node(s) BW + * @hw: pointer to the HW struct + * @agg_info: aggregator data structure + * + * This function replays aggregator node(s) BW. The caller needs to hold + * the scheduler lock. + */ +static enum ice_status +ice_sched_replay_agg_bw(struct ice_hw *hw, struct ice_sched_agg_info *agg_info) +{ + struct ice_sched_node *tc_node, *agg_node; + enum ice_status status = 0; + u8 tc; + + if (!agg_info) + return ICE_ERR_PARAM; + ice_for_each_traffic_class(tc) { + if (bitmap_empty(agg_info->bw_t_info[tc].bw_t_bitmap, ICE_BW_TYPE_CNT)) + continue; + tc_node = ice_sched_get_tc_node(hw->port_info, tc); + if (!tc_node) { + status = ICE_ERR_PARAM; + break; + } + agg_node = ice_sched_get_agg_node(hw->port_info, tc_node, + agg_info->agg_id); + if (!agg_node) { + status = ICE_ERR_PARAM; + break; + } + status = ice_sched_replay_node_bw(hw, agg_node, + &agg_info->bw_t_info[tc]); + if (status) + break; + } + return status; +} + +/** + * ice_sched_get_ena_tc_bitmap - get enabled TC bitmap + * @pi: port info struct + * @tc_bitmap: 8 bits TC bitmap to check + * @ena_tc_bitmap: 8 bits enabled TC bitmap to return + * + * This function returns the enabled TC bitmap in variable ena_tc_bitmap. Some + * TCs may be missing after a reset, so only the currently enabled TCs are + * returned. This function needs to be called with scheduler lock held.
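Illustration (editorial, not part of the patch): replay is driven entirely by the bits in bw_t_bitmap, so an all-zero ice_bw_type_info replays as a no-op. A sketch of recording a CIR value so a later replay restores it; this assumes, as the rest of this file does, that ice_set_clear_cir_bw sets ICE_BW_TYPE_CIR alongside storing the value:

static enum ice_status
example_replay_cir(struct ice_hw *hw, struct ice_sched_node *node)
{
	struct ice_bw_type_info bw_t_info = {};

	ice_set_clear_cir_bw(&bw_t_info, 100000);	/* 100 Mbps CIR */
	return ice_sched_replay_node_bw(hw, node, &bw_t_info);
}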
+ */ +static void +ice_sched_get_ena_tc_bitmap(struct ice_port_info *pi, + unsigned long *tc_bitmap, + unsigned long *ena_tc_bitmap) +{ + u8 tc; + + /* Some TC(s) may be missing after reset, adjust for replay */ + ice_for_each_traffic_class(tc) + if (ice_is_tc_ena(*tc_bitmap, tc) && + (ice_sched_get_tc_node(pi, tc))) + set_bit(tc, ena_tc_bitmap); +} + +/** + * ice_sched_replay_agg - recreate aggregator node(s) + * @hw: pointer to the HW struct + * + * This function recreates aggregator type nodes which were not replayed + * earlier. It also replays aggregator BW information. These aggregator nodes + * are not associated with a VSI type node yet. + */ +void ice_sched_replay_agg(struct ice_hw *hw) +{ + struct ice_port_info *pi = hw->port_info; + struct ice_sched_agg_info *agg_info; + + mutex_lock(&pi->sched_lock); + list_for_each_entry(agg_info, &hw->agg_list, list_entry) + /* replay aggregator (re-create aggregator node) */ + if (!bitmap_equal(agg_info->tc_bitmap, agg_info->replay_tc_bitmap, ICE_MAX_TRAFFIC_CLASS)) { + DECLARE_BITMAP(replay_bitmap, + ICE_MAX_TRAFFIC_CLASS); + enum ice_status status; + + bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS); + ice_sched_get_ena_tc_bitmap(pi, + agg_info->replay_tc_bitmap, + replay_bitmap); + status = ice_sched_cfg_agg(hw->port_info, + agg_info->agg_id, + ICE_AGG_TYPE_AGG, + replay_bitmap); + if (status) { + dev_info(ice_hw_to_dev(hw), + "Replay agg id[%d] failed\n", + agg_info->agg_id); + /* Move on to next one */ + continue; + } + /* Replay aggregator node BW (restore aggregator BW) */ + status = ice_sched_replay_agg_bw(hw, agg_info); + if (status) + dev_info(ice_hw_to_dev(hw), + "Replay agg bw [id=%d] failed\n", + agg_info->agg_id); + } + mutex_unlock(&pi->sched_lock); +} + +/** + * ice_sched_replay_agg_vsi_preinit - Agg/VSI replay pre initialization + * @hw: pointer to the HW struct + * + * This function initializes the aggregator(s) TC bitmap to zero. A required + * preinit step for replaying aggregators. + */ +void ice_sched_replay_agg_vsi_preinit(struct ice_hw *hw) +{ + struct ice_port_info *pi = hw->port_info; + struct ice_sched_agg_info *agg_info; + + mutex_lock(&pi->sched_lock); + list_for_each_entry(agg_info, &hw->agg_list, list_entry) { + struct ice_sched_agg_vsi_info *agg_vsi_info; + + agg_info->tc_bitmap[0] = 0; + list_for_each_entry(agg_vsi_info, &agg_info->agg_vsi_list, + list_entry) + agg_vsi_info->tc_bitmap[0] = 0; + } + mutex_unlock(&pi->sched_lock); +} + +/** + * ice_sched_replay_root_node_bw - replay root node BW + * @pi: port information structure + * + * Replay root node BW settings. + */ +enum ice_status ice_sched_replay_root_node_bw(struct ice_port_info *pi) +{ + enum ice_status status = 0; + + if (!pi->hw) + return ICE_ERR_PARAM; + mutex_lock(&pi->sched_lock); + + status = ice_sched_replay_node_bw(pi->hw, pi->root, + &pi->root_node_bw_t_info); + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_replay_tc_node_bw - replay TC node(s) BW + * @pi: port information structure + * + * This function replays TC node(s) BW.
+ */ +enum ice_status ice_sched_replay_tc_node_bw(struct ice_port_info *pi) +{ + enum ice_status status = 0; + u8 tc; + + if (!pi->hw) + return ICE_ERR_PARAM; + mutex_lock(&pi->sched_lock); + ice_for_each_traffic_class(tc) { + struct ice_sched_node *tc_node; + + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; /* TC not present */ + status = ice_sched_replay_node_bw(pi->hw, tc_node, + &pi->tc_node_bw_t_info[tc]); + if (status) + break; + } + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_replay_vsi_bw - replay VSI type node(s) BW + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * @tc_bitmap: 8 bits TC bitmap + * + * This function replays VSI type nodes bandwidth. This function needs to be + * called with scheduler lock held. + */ +static enum ice_status +ice_sched_replay_vsi_bw(struct ice_hw *hw, u16 vsi_handle, + unsigned long *tc_bitmap) +{ + struct ice_sched_node *vsi_node, *tc_node; + struct ice_port_info *pi = hw->port_info; + struct ice_bw_type_info *bw_t_info; + struct ice_vsi_ctx *vsi_ctx; + enum ice_status status = 0; + u8 tc; + + vsi_ctx = ice_get_vsi_ctx(pi->hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + ice_for_each_traffic_class(tc) { + if (!ice_is_tc_ena(*tc_bitmap, tc)) + continue; + tc_node = ice_sched_get_tc_node(pi, tc); + if (!tc_node) + continue; + vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle); + if (!vsi_node) + continue; + bw_t_info = &vsi_ctx->sched.bw_t_info[tc]; + status = ice_sched_replay_node_bw(hw, vsi_node, bw_t_info); + if (status) + break; + } + return status; +} + +/** + * ice_sched_replay_vsi_agg - replay aggregator & VSI to aggregator node(s) + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * + * This function replays aggregator node, VSI to aggregator type nodes, and + * their node bandwidth information. This function needs to be called with + * scheduler lock held. 
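+ * The replay is done in stages: the aggregator node is re-created for the
+ * enabled TCs, the aggregator BW is restored, the VSI is moved back under
+ * the aggregator, and finally the VSI BW is restored.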
+ */ +static enum ice_status +ice_sched_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle) +{ + DECLARE_BITMAP(replay_bitmap, ICE_MAX_TRAFFIC_CLASS); + struct ice_sched_agg_vsi_info *agg_vsi_info; + struct ice_port_info *pi = hw->port_info; + struct ice_sched_agg_info *agg_info; + enum ice_status status; + + bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS); + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + agg_info = ice_get_vsi_agg_info(hw, vsi_handle); + if (!agg_info) + return 0; /* Not present in list - default Agg case */ + agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle); + if (!agg_vsi_info) + return 0; /* Not present in list - default Agg case */ + ice_sched_get_ena_tc_bitmap(pi, agg_info->replay_tc_bitmap, + replay_bitmap); + /* Replay aggregator node associated to vsi_handle */ + status = ice_sched_cfg_agg(hw->port_info, agg_info->agg_id, + ICE_AGG_TYPE_AGG, replay_bitmap); + if (status) + return status; + /* Replay aggregator node BW (restore aggregator BW) */ + status = ice_sched_replay_agg_bw(hw, agg_info); + if (status) + return status; + + bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS); + ice_sched_get_ena_tc_bitmap(pi, agg_vsi_info->replay_tc_bitmap, + replay_bitmap); + /* Move this VSI (vsi_handle) to above aggregator */ + status = ice_sched_assoc_vsi_to_agg(pi, agg_info->agg_id, vsi_handle, + replay_bitmap); + if (status) + return status; + /* Replay VSI BW (restore VSI BW) */ + return ice_sched_replay_vsi_bw(hw, vsi_handle, + agg_vsi_info->tc_bitmap); +} + +/** + * ice_replay_vsi_agg - replay VSI to aggregator node + * @hw: pointer to the HW struct + * @vsi_handle: software VSI handle + * + * This function replays association of VSI to aggregator type nodes, and + * node bandwidth information. + */ +enum ice_status ice_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle) +{ + struct ice_port_info *pi = hw->port_info; + enum ice_status status; + + mutex_lock(&pi->sched_lock); + status = ice_sched_replay_vsi_agg(hw, vsi_handle); + mutex_unlock(&pi->sched_lock); + return status; +} + +/** + * ice_sched_replay_q_bw - replay queue type node BW + * @pi: port information structure + * @q_ctx: queue context structure + * + * This function replays queue type node bandwidth. This function needs to be + * called with scheduler lock held. + */ +enum ice_status +ice_sched_replay_q_bw(struct ice_port_info *pi, struct ice_q_ctx *q_ctx) +{ + struct ice_sched_node *q_node; + + /* Following also checks the presence of node in tree */ + q_node = ice_sched_find_node_by_teid(pi->root, q_ctx->q_teid); + if (!q_node) + return ICE_ERR_PARAM; + return ice_sched_replay_node_bw(pi->hw, q_node, &q_ctx->bw_t_info); +} diff --git a/drivers/net/ethernet/intel/ice/ice_sched.h b/drivers/net/ethernet/intel/ice/ice_sched.h index 3902a8ad3025aa2e9f8cbbc4600cf42f2b86e462..48642a54f8a9a6ed7f1fe9938874ceb53bd292e9 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.h +++ b/drivers/net/ethernet/intel/ice/ice_sched.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_SCHED_H_ #define _ICE_SCHED_H_ @@ -8,11 +8,56 @@ #define ICE_QGRP_LAYER_OFFSET 2 #define ICE_VSI_LAYER_OFFSET 4 +#define ICE_AGG_LAYER_OFFSET 6 +#define ICE_SCHED_INVAL_LAYER_NUM 0xFF +/* Burst size is a 12 bits register that is configured while creating the RL + * profile(s). 
MSB is a granularity bit and tells the granularity type + * 0 - LSB bits are in 64 bytes granularity + * 1 - LSB bits are in 1K bytes granularity + */ +#define ICE_64_BYTE_GRANULARITY 0 +#define ICE_KBYTE_GRANULARITY BIT(11) +#define ICE_MIN_BURST_SIZE_ALLOWED 64 /* In Bytes */ +#define ICE_MAX_BURST_SIZE_ALLOWED \ + ((BIT(11) - 1) * 1024) /* In Bytes */ +#define ICE_MAX_BURST_SIZE_64_BYTE_GRANULARITY \ + ((BIT(11) - 1) * 64) /* In Bytes */ +#define ICE_MAX_BURST_SIZE_KBYTE_GRANULARITY ICE_MAX_BURST_SIZE_ALLOWED + +#define ICE_RL_PROF_ACCURACY_BYTES 128 +#define ICE_RL_PROF_MULTIPLIER 10000 +#define ICE_RL_PROF_TS_MULTIPLIER 32 +#define ICE_RL_PROF_FRACTION 512 + +#define ICE_PSM_CLK_367MHZ_IN_HZ 367647059 +#define ICE_PSM_CLK_416MHZ_IN_HZ 416666667 +#define ICE_PSM_CLK_446MHZ_IN_HZ 446428571 +#define ICE_PSM_CLK_390MHZ_IN_HZ 390625000 + + +struct rl_profile_params { + u32 bw; /* in Kbps */ + u16 rl_multiplier; + u16 wake_up_calc; + u16 rl_encode; +}; + +/* BW rate limit profile parameters list entry along + * with bandwidth maintained per layer in port info + */ +struct ice_aqc_rl_profile_info { + struct ice_aqc_rl_profile_elem profile; + struct list_head list_entry; + u32 bw; /* requested */ + u16 prof_id_ref; /* profile ID to node association ref count */ +}; struct ice_sched_agg_vsi_info { struct list_head list_entry; DECLARE_BITMAP(tc_bitmap, ICE_MAX_TRAFFIC_CLASS); u16 vsi_handle; + /* save aggregator VSI TC bitmap */ + DECLARE_BITMAP(replay_tc_bitmap, ICE_MAX_TRAFFIC_CLASS); }; struct ice_sched_agg_info { @@ -21,21 +66,43 @@ struct ice_sched_agg_info { DECLARE_BITMAP(tc_bitmap, ICE_MAX_TRAFFIC_CLASS); u32 agg_id; enum ice_agg_type agg_type; + /* bw_t_info saves aggregator BW information */ + struct ice_bw_type_info bw_t_info[ICE_MAX_TRAFFIC_CLASS]; + /* save aggregator TC bitmap */ + DECLARE_BITMAP(replay_tc_bitmap, ICE_MAX_TRAFFIC_CLASS); }; /* FW AQ command calls */ enum ice_status +ice_aq_get_dflt_topo(struct ice_hw *hw, u8 lport, + struct ice_aqc_get_topo_elem *buf, u16 buf_size, + u8 *num_branches, struct ice_sq_cd *cd); +enum ice_status +ice_aq_query_rl_profile(struct ice_hw *hw, u16 num_profiles, + struct ice_aqc_rl_profile_elem *buf, u16 buf_size, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_cfg_l2_node_cgd(struct ice_hw *hw, u16 num_nodes, + struct ice_aqc_cfg_l2_node_cgd_elem *buf, u16 buf_size, + struct ice_sq_cd *cd); +enum ice_status ice_aq_query_sched_elems(struct ice_hw *hw, u16 elems_req, - struct ice_aqc_get_elem *buf, u16 buf_size, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd); enum ice_status ice_sched_init_port(struct ice_port_info *pi); enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw); +void ice_sched_get_psm_clk_freq(struct ice_hw *hw); + +/* Functions to cleanup scheduler SW DB */ void ice_sched_clear_port(struct ice_port_info *pi); void ice_sched_cleanup_all(struct ice_hw *hw); void ice_sched_clear_agg(struct ice_hw *hw); +/* Get a scheduling node from SW DB for given TEID */ +struct ice_sched_node *ice_sched_get_node(struct ice_port_info *pi, u32 teid); struct ice_sched_node * ice_sched_find_node_by_teid(struct ice_sched_node *start_node, u32 teid); +/* Add a scheduling node into SW DB for given info */ enum ice_status ice_sched_add_node(struct ice_port_info *pi, u8 layer, struct ice_aqc_txsched_elem_data *info); @@ -48,4 +115,110 @@ enum ice_status ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, u8 owner, bool enable); enum ice_status ice_rm_vsi_lan_cfg(struct 
ice_port_info *pi, u16 vsi_handle); +enum ice_status +ice_rm_vsi_rdma_cfg(struct ice_port_info *pi, u16 vsi_handle); +struct ice_sched_node * +ice_sched_get_vsi_node(struct ice_port_info *pi, struct ice_sched_node *tc_node, + u16 vsi_handle); +bool ice_sched_is_tree_balanced(struct ice_hw *hw, struct ice_sched_node *node); +enum ice_status +ice_aq_query_node_to_root(struct ice_hw *hw, u32 node_teid, + struct ice_aqc_txsched_elem_data *buf, u16 buf_size, + struct ice_sq_cd *cd); + +/* Tx scheduler rate limiter functions */ +enum ice_status +ice_cfg_agg(struct ice_port_info *pi, u32 agg_id, + enum ice_agg_type agg_type, u8 tc_bitmap); +enum ice_status +ice_move_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle, + u8 tc_bitmap); +enum ice_status ice_rm_agg_cfg(struct ice_port_info *pi, u32 agg_id); +enum ice_status +ice_cfg_q_bw_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_cfg_q_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 q_handle, enum ice_rl_type rl_type); +enum ice_status +ice_cfg_tc_node_bw_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_cfg_tc_node_bw_dflt_lmt(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type); +enum ice_status +ice_cfg_vsi_bw_lmt_per_tc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_cfg_vsi_bw_dflt_lmt_per_tc(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + enum ice_rl_type rl_type); +enum ice_status +ice_cfg_agg_bw_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_cfg_agg_bw_dflt_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + enum ice_rl_type rl_type); +enum ice_status +ice_cfg_vsi_bw_shared_lmt(struct ice_port_info *pi, u16 vsi_handle, u32 min_bw, + u32 max_bw, u32 shared_bw); +enum ice_status +ice_cfg_vsi_bw_no_shared_lmt(struct ice_port_info *pi, u16 vsi_handle); +enum ice_status +ice_cfg_agg_bw_shared_lmt(struct ice_port_info *pi, u32 agg_id, u32 min_bw, + u32 max_bw, u32 shared_bw); +enum ice_status +ice_cfg_agg_bw_no_shared_lmt(struct ice_port_info *pi, u32 agg_id); +enum ice_status +ice_cfg_agg_bw_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, u8 tc, + u32 min_bw, u32 max_bw, u32 shared_bw); +enum ice_status +ice_cfg_agg_bw_no_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, + u8 tc); +enum ice_status +ice_cfg_vsi_q_priority(struct ice_port_info *pi, u16 num_qs, u32 *q_ids, + u8 *q_prio); +enum ice_status +ice_cfg_vsi_bw_alloc(struct ice_port_info *pi, u16 vsi_handle, u8 ena_tcmap, + enum ice_rl_type rl_type, u8 *bw_alloc); +enum ice_status +ice_cfg_agg_vsi_priority_per_tc(struct ice_port_info *pi, u32 agg_id, + u16 num_vsis, u16 *vsi_handle_arr, + u8 *node_prio, u8 tc); +enum ice_status +ice_cfg_agg_bw_alloc(struct ice_port_info *pi, u32 agg_id, u8 ena_tcmap, + enum ice_rl_type rl_type, u8 *bw_alloc); +bool +ice_sched_find_node_in_subtree(struct ice_hw *hw, struct ice_sched_node *base, + struct ice_sched_node *node); +enum ice_status +ice_sched_set_agg_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle); +enum ice_status +ice_sched_set_node_bw_lmt_per_tc(struct ice_port_info *pi, u32 id, + enum ice_agg_type agg_type, u8 tc, + enum ice_rl_type rl_type, u32 bw); +enum ice_status +ice_sched_set_vsi_bw_shared_lmt(struct ice_port_info *pi, u16 vsi_handle, + u32 min_bw, u32 max_bw, u32 shared_bw); +enum ice_status 
+ice_sched_set_agg_bw_shared_lmt(struct ice_port_info *pi, u32 agg_id, u32 min_bw, + u32 max_bw, u32 shared_bw); +enum ice_status +ice_sched_set_agg_bw_shared_lmt_per_tc(struct ice_port_info *pi, u32 agg_id, + u8 tc, u32 min_bw, u32 max_bw, + u32 shared_bw); +enum ice_status +ice_sched_cfg_sibl_node_prio(struct ice_port_info *pi, + struct ice_sched_node *node, u8 priority); +enum ice_status +ice_cfg_tc_node_bw_alloc(struct ice_port_info *pi, u8 tc, + enum ice_rl_type rl_type, u8 bw_alloc); +enum ice_status ice_cfg_rl_burst_size(struct ice_hw *hw, u32 bytes); +void ice_sched_replay_agg_vsi_preinit(struct ice_hw *hw); +void ice_sched_replay_agg(struct ice_hw *hw); +enum ice_status ice_sched_replay_tc_node_bw(struct ice_port_info *pi); +enum ice_status ice_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle); +enum ice_status ice_sched_replay_root_node_bw(struct ice_port_info *pi); +enum ice_status +ice_sched_replay_q_bw(struct ice_port_info *pi, struct ice_q_ctx *q_ctx); + #endif /* _ICE_SCHED_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c index d2db0d04e1174258b9108e92b9e939d00bd03587..a66f896b5a28807ec3ee246e5c8d4c73d0c7f5b5 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.c +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_common.h" -#include "ice_adminq_cmd.h" #include "ice_sriov.h" /** @@ -40,6 +39,7 @@ ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd); } + /** * ice_conv_link_speed_to_virtchnl * @adv_link_support: determines the format of the returned link speed @@ -121,9 +121,7 @@ u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed) speed = (u32)VIRTCHNL_LINK_SPEED_25GB; break; case ICE_AQ_LINK_SPEED_40GB: - /* fall through */ case ICE_AQ_LINK_SPEED_50GB: - /* fall through */ case ICE_AQ_LINK_SPEED_100GB: speed = (u32)VIRTCHNL_LINK_SPEED_40GB; break; @@ -134,3 +132,404 @@ u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed) return speed; } + +/* The mailbox overflow detection algorithm helps to check if there + * is a possibility of a malicious VF transmitting too many MBX messages to the + * PF. + * 1. The mailbox snapshot structure, ice_mbx_snapshot, is initialized during + * driver initialization in ice_init_hw() using ice_mbx_init_snapshot(). + * The struct ice_mbx_snapshot helps to track and traverse a static window of + * messages within the mailbox queue while looking for a malicious VF. + * + * 2. When the caller starts processing its mailbox queue in response to an + * interrupt, the structure ice_mbx_snapshot is expected to be cleared before + * the algorithm can be run for the first time for that interrupt. This can be + * done via ice_mbx_reset_snapshot(). + * + * 3. For every message read by the caller from the MBX Queue, the caller must + * call the detection algorithm's entry function ice_mbx_vf_state_handler(). + * Before every call to ice_mbx_vf_state_handler() the struct ice_mbx_data is + * filled as it is required to be passed to the algorithm. + * + * 4. Every time a message is read from the MBX queue, a VFId is received which + * is passed to the state handler. 
The boolean output is_malvf of the state + * handler ice_mbx_vf_state_handler() serves as an indicator to the caller + * whether this VF is malicious or not. + * + * 5. When a VF is identified to be malicious, the caller can send a message + * to the system administrator. The caller can invoke ice_mbx_report_malvf() + * to help determine if a malicious VF is to be reported or not. This function + * requires the caller to maintain a global bitmap to track all malicious VFs + * and pass that to ice_mbx_report_malvf() along with the VFID which was identified + * to be malicious by ice_mbx_vf_state_handler(). + * + * 6. The global bitmap maintained by PF can be cleared completely if PF is in + * reset or the bit corresponding to a VF can be cleared if that VF is in reset. + * When a VF is shut down and brought back up, we assume that the new VF + * brought up is not malicious and hence report it if found malicious. + * + * 7. The function ice_mbx_reset_snapshot() is called to reset the information + * in ice_mbx_snapshot for every new mailbox interrupt handled. + * + * 8. The memory allocated for variables in ice_mbx_snapshot is de-allocated + * when driver is unloaded. + */ +#define ICE_RQ_DATA_MASK(rq_data) ((rq_data) & PF_MBX_ARQH_ARQH_M) +/* Using the highest value for an unsigned 16-bit value 0xFFFF to indicate that + * the max messages check must be ignored in the algorithm + */ +#define ICE_IGNORE_MAX_MSG_CNT 0xFFFF + +/** + * ice_mbx_traverse - Pass through mailbox snapshot + * @hw: pointer to the HW struct + * @new_state: new algorithm state + * + * Traversing the mailbox static snapshot without checking + * for malicious VFs. + */ +static void +ice_mbx_traverse(struct ice_hw *hw, + enum ice_mbx_snapshot_state *new_state) +{ + struct ice_mbx_snap_buffer_data *snap_buf; + u32 num_iterations; + + snap_buf = &hw->mbx_snapshot.mbx_buf; + + /* As mailbox buffer is circular, applying a mask + * on the incremented iteration count. + */ + num_iterations = ICE_RQ_DATA_MASK(++snap_buf->num_iterations); + + /* Checking either of the below conditions to exit snapshot traversal: + * Condition-1: If the number of iterations in the mailbox is equal to + * the mailbox head which would indicate that we have reached the end + * of the static snapshot. + * Condition-2: If the maximum messages serviced in the mailbox for a + * given interrupt is the highest possible value then there is no need + * to check if the number of messages processed is equal to it. If not + * check if the number of messages processed is greater than or equal + * to the maximum number of mailbox entries serviced in current work item. + */ + if (num_iterations == snap_buf->head || + (snap_buf->max_num_msgs_mbx < ICE_IGNORE_MAX_MSG_CNT && + ++snap_buf->num_msg_proc >= snap_buf->max_num_msgs_mbx)) + *new_state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; +} + +/** + * ice_mbx_detect_malvf - Detect malicious VF in snapshot + * @hw: pointer to the HW struct + * @vf_id: relative virtual function ID + * @new_state: new algorithm state + * @is_malvf: boolean output to indicate if VF is malicious + * + * This function tracks the number of asynchronous messages + * sent per VF and marks the VF as malicious if it exceeds + * the permissible number of messages to send. 
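+ * For example, with ICE_ASYNC_VF_MSG_THRESHOLD defined as 63, a VF whose
+ * vf_cntr entry reaches 63 messages within the current snapshot window is
+ * flagged as malicious.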
+ */ +static enum ice_status +ice_mbx_detect_malvf(struct ice_hw *hw, u16 vf_id, + enum ice_mbx_snapshot_state *new_state, + bool *is_malvf) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + if (vf_id >= snap->mbx_vf.vfcntr_len) + return ICE_ERR_OUT_OF_RANGE; + + /* increment the message count in the VF array */ + snap->mbx_vf.vf_cntr[vf_id]++; + + if (snap->mbx_vf.vf_cntr[vf_id] >= ICE_ASYNC_VF_MSG_THRESHOLD) + *is_malvf = true; + + /* continue to iterate through the mailbox snapshot */ + ice_mbx_traverse(hw, new_state); + + return 0; +} + +/** + * ice_mbx_reset_snapshot - Reset mailbox snapshot structure + * @snap: pointer to mailbox snapshot structure in the ice_hw struct + * + * Reset the mailbox snapshot structure and clear VF counter array. + */ +static void ice_mbx_reset_snapshot(struct ice_mbx_snapshot *snap) +{ + u32 vfcntr_len; + + if (!snap || !snap->mbx_vf.vf_cntr) + return; + + /* Clear VF counters. */ + vfcntr_len = snap->mbx_vf.vfcntr_len; + if (vfcntr_len) + memset(snap->mbx_vf.vf_cntr, 0, + (vfcntr_len * sizeof(*snap->mbx_vf.vf_cntr))); + + /* Reset mailbox snapshot for a new capture. */ + memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf)); + snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; +} + +/** + * ice_mbx_vf_state_handler - Handle states of the overflow algorithm + * @hw: pointer to the HW struct + * @mbx_data: pointer to structure containing mailbox data + * @vf_id: relative virtual function (VF) ID + * @is_malvf: boolean output to indicate if VF is malicious + * + * The function serves as an entry point for the malicious VF + * detection algorithm by handling the different states and state + * transitions of the algorithm: + * New snapshot: This state is entered when creating a new static + * snapshot. The data from any previous mailbox snapshot is + * cleared and a new capture of the mailbox head and tail is + * logged. This will be the new static snapshot to detect + * asynchronous messages sent by VFs. On capturing the snapshot + * and depending on whether the number of pending messages in that + * snapshot exceed the watermark value, the state machine enters + * traverse or detect states. + * Traverse: If pending message count is below watermark then iterate + * through the snapshot without any action on VF. + * Detect: If pending message count exceeds watermark traverse + * the static snapshot and look for a malicious VF. + */ +enum ice_status +ice_mbx_vf_state_handler(struct ice_hw *hw, + struct ice_mbx_data *mbx_data, u16 vf_id, + bool *is_malvf) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + struct ice_mbx_snap_buffer_data *snap_buf; + struct ice_ctl_q_info *cq = &hw->mailboxq; + enum ice_mbx_snapshot_state new_state; + enum ice_status status = 0; + + if (!is_malvf || !mbx_data) + return ICE_ERR_BAD_PTR; + + /* When entering the mailbox state machine assume that the VF + * is not malicious until detected. + */ + *is_malvf = false; + + /* Checking if max messages allowed to be processed while servicing current + * interrupt is not less than the defined AVF message threshold. + */ + if (mbx_data->max_num_msgs_mbx <= ICE_ASYNC_VF_MSG_THRESHOLD) + return ICE_ERR_INVAL_SIZE; + + /* The watermark value should not be lesser than the threshold limit + * set for the number of asynchronous messages a VF can send to mailbox + * nor should it be greater than the maximum number of messages in the + * mailbox serviced in current interrupt. 
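+	 * As an illustration, with ICE_ASYNC_VF_MSG_THRESHOLD of 63 and a
+	 * max_num_msgs_mbx of, say, 256, only watermark values from 63 to
+	 * 256 are accepted.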
+ */ + if (mbx_data->async_watermark_val < ICE_ASYNC_VF_MSG_THRESHOLD || + mbx_data->async_watermark_val > mbx_data->max_num_msgs_mbx) + return ICE_ERR_PARAM; + + new_state = ICE_MAL_VF_DETECT_STATE_INVALID; + snap_buf = &snap->mbx_buf; + + switch (snap_buf->state) { + case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT: + /* Clear any previously held data in mailbox snapshot structure. */ + ice_mbx_reset_snapshot(snap); + + /* Collect the pending ARQ count, number of messages processed and + * the maximum number of messages allowed to be processed from the + * Mailbox for current interrupt. + */ + snap_buf->num_pending_arq = mbx_data->num_pending_arq; + snap_buf->num_msg_proc = mbx_data->num_msg_proc; + snap_buf->max_num_msgs_mbx = mbx_data->max_num_msgs_mbx; + + /* Capture a new static snapshot of the mailbox by logging the + * head and tail of snapshot and set num_iterations to the tail + * value to mark the start of the iteration through the snapshot. + */ + snap_buf->head = ICE_RQ_DATA_MASK(cq->rq.next_to_clean + + mbx_data->num_pending_arq); + snap_buf->tail = ICE_RQ_DATA_MASK(cq->rq.next_to_clean - 1); + snap_buf->num_iterations = snap_buf->tail; + + /* Pending ARQ messages returned by ice_clean_rq_elem + * is the difference between the head and tail of the + * mailbox queue. Comparing this value against the watermark + * helps to check if we potentially have malicious VFs. + */ + if (snap_buf->num_pending_arq >= + mbx_data->async_watermark_val) { + new_state = ICE_MAL_VF_DETECT_STATE_DETECT; + status = ice_mbx_detect_malvf(hw, vf_id, &new_state, is_malvf); + } else { + new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE; + ice_mbx_traverse(hw, &new_state); + } + break; + + case ICE_MAL_VF_DETECT_STATE_TRAVERSE: + new_state = ICE_MAL_VF_DETECT_STATE_TRAVERSE; + ice_mbx_traverse(hw, &new_state); + break; + + case ICE_MAL_VF_DETECT_STATE_DETECT: + new_state = ICE_MAL_VF_DETECT_STATE_DETECT; + status = ice_mbx_detect_malvf(hw, vf_id, &new_state, is_malvf); + break; + + default: + new_state = ICE_MAL_VF_DETECT_STATE_INVALID; + status = ICE_ERR_CFG; + } + + snap_buf->state = new_state; + + return status; +} + +/** + * ice_mbx_report_malvf - Track and note malicious VF + * @hw: pointer to the HW struct + * @all_malvfs: all malicious VFs tracked by PF + * @bitmap_len: length of bitmap in bits + * @vf_id: relative virtual function ID of the malicious VF + * @report_malvf: boolean to indicate if malicious VF must be reported + * + * This function will update a bitmap that keeps track of the malicious + * VFs attached to the PF. A malicious VF must be reported only once if + * discovered between VF resets or loading so the function checks + * the input vf_id against the bitmap to verify if the VF has been + * detected in any previous mailbox iterations. 
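+ *
+ * Illustrative caller sketch (the "malvfs" bitmap and its "bitmap_len" are
+ * hypothetical names owned by the caller):
+ *
+ *	bool report = false;
+ *
+ *	if (!ice_mbx_report_malvf(hw, malvfs, bitmap_len, vf_id, &report) &&
+ *	    report)
+ *		dev_warn(ice_hw_to_dev(hw), "VF %u sent too many messages\n",
+ *			 vf_id);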
+ */
+enum ice_status
+ice_mbx_report_malvf(struct ice_hw *hw, unsigned long *all_malvfs,
+		     u16 bitmap_len, u16 vf_id, bool *report_malvf)
+{
+	if (!all_malvfs || !report_malvf)
+		return ICE_ERR_PARAM;
+
+	*report_malvf = false;
+
+	if (bitmap_len < hw->mbx_snapshot.mbx_vf.vfcntr_len)
+		return ICE_ERR_INVAL_SIZE;
+
+	if (vf_id >= bitmap_len)
+		return ICE_ERR_OUT_OF_RANGE;
+
+	/* Set the bit and report the VF only if not already marked malicious */
+	if (!test_bit(vf_id, all_malvfs)) {
+		set_bit(vf_id, all_malvfs);
+		*report_malvf = true;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_mbx_clear_malvf - Clear VF bitmap and counter for VF ID
+ * @snap: pointer to the mailbox snapshot structure
+ * @all_malvfs: all malicious VFs tracked by PF
+ * @bitmap_len: length of bitmap in bits
+ * @vf_id: relative virtual function ID of the malicious VF
+ *
+ * In case of a VF reset, this function can be called to clear
+ * the bit corresponding to the VF ID in the bitmap tracking all
+ * malicious VFs attached to the PF. The function also clears the
+ * VF counter array at the index of the VF ID. This is to ensure
+ * that the new VF loaded is not considered malicious before going
+ * through the overflow detection algorithm.
+ */
+enum ice_status
+ice_mbx_clear_malvf(struct ice_mbx_snapshot *snap, unsigned long *all_malvfs,
+		    u16 bitmap_len, u16 vf_id)
+{
+	if (!snap || !all_malvfs)
+		return ICE_ERR_PARAM;
+
+	if (bitmap_len < snap->mbx_vf.vfcntr_len)
+		return ICE_ERR_INVAL_SIZE;
+
+	/* Ensure VF ID value is not larger than bitmap or VF counter length */
+	if (vf_id >= bitmap_len || vf_id >= snap->mbx_vf.vfcntr_len)
+		return ICE_ERR_OUT_OF_RANGE;
+
+	/* Clear VF ID bit in the bitmap tracking malicious VFs attached to PF */
+	clear_bit(vf_id, all_malvfs);
+
+	/* Clear the VF counter in the mailbox snapshot structure for that VF ID.
+	 * This is to ensure that if a VF is unloaded and a new one brought back
+	 * up with the same VF ID for a snapshot currently in traversal or detect
+	 * state, the counter for that VF ID does not increment on top of existing
+	 * values in the mailbox overflow detection algorithm.
+	 */
+	snap->mbx_vf.vf_cntr[vf_id] = 0;
+
+	return 0;
+}
+
+/**
+ * ice_mbx_init_snapshot - Initialize mailbox snapshot structure
+ * @hw: pointer to the hardware structure
+ * @vf_count: number of VFs allocated on a PF
+ *
+ * Clear the mailbox snapshot structure and allocate memory
+ * for the VF counter array based on the number of VFs allocated
+ * on that PF.
+ *
+ * Assumption: This function will assume ice_get_caps() has already been
+ * called to ensure that the vf_count can be compared against the number
+ * of VFs supported as defined in the functional capabilities of the device.
+ */
+enum ice_status ice_mbx_init_snapshot(struct ice_hw *hw, u16 vf_count)
+{
+	struct ice_mbx_snapshot *snap = &hw->mbx_snapshot;
+
+	/* Ensure that the number of VFs allocated is non-zero and
+	 * is not greater than the number of supported VFs defined in
+	 * the functional capabilities of the PF.
+	 */
+	if (!vf_count || vf_count > hw->func_caps.num_allocd_vfs)
+		return ICE_ERR_INVAL_SIZE;
+
+	snap->mbx_vf.vf_cntr = devm_kcalloc(ice_hw_to_dev(hw), vf_count,
+					    sizeof(*snap->mbx_vf.vf_cntr),
+					    GFP_KERNEL);
+	if (!snap->mbx_vf.vf_cntr)
+		return ICE_ERR_NO_MEMORY;
+
+	/* Set the VF counter length to the number of allocated
+	 * VFs for the given PF's functional capabilities.
+	 */
+	snap->mbx_vf.vfcntr_len = vf_count;
+
+	/* Clear mbx_buf in the mailbox snapshot structure and set the
+	 * mailbox snapshot state to a new capture.
+ */ + memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf)); + snap->mbx_buf.state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT; + + return 0; +} + +/** + * ice_mbx_deinit_snapshot - Free mailbox snapshot structure + * @hw: pointer to the hardware structure + * + * Clear the mailbox snapshot structure and free the VF counter array. + */ +void ice_mbx_deinit_snapshot(struct ice_hw *hw) +{ + struct ice_mbx_snapshot *snap = &hw->mbx_snapshot; + + /* Free VF counter array and reset vf counter length */ + devm_kfree(ice_hw_to_dev(hw), snap->mbx_vf.vf_cntr); + snap->mbx_vf.vfcntr_len = 0; + + /* Clear mbx_buf in the mailbox snaphot structure */ + memset(&snap->mbx_buf, 0, sizeof(snap->mbx_buf)); +} diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.h b/drivers/net/ethernet/intel/ice/ice_sriov.h index 3d78a0795138c92c30558d47e588a89395a35ae2..15adb2e5aaf226120d01001ec1d6ba12f8f0ed95 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.h +++ b/drivers/net/ethernet/intel/ice/ice_sriov.h @@ -1,10 +1,17 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_SRIOV_H_ #define _ICE_SRIOV_H_ -#include "ice_common.h" +#include "ice_type.h" +#include "ice_controlq.h" + +/* Defining the mailbox message threshold as 63 asynchronous + * pending messages. Normal VF functionality does not require + * sending more than 63 asynchronous pending message. + */ +#define ICE_ASYNC_VF_MSG_THRESHOLD 63 #ifdef CONFIG_PCI_IOV enum ice_status @@ -12,6 +19,17 @@ ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen, struct ice_sq_cd *cd); u32 ice_conv_link_speed_to_virtchnl(bool adv_link_support, u16 link_speed); +enum ice_status +ice_mbx_vf_state_handler(struct ice_hw *hw, struct ice_mbx_data *mbx_data, + u16 vf_id, bool *is_mal_vf); +enum ice_status +ice_mbx_clear_malvf(struct ice_mbx_snapshot *snap, unsigned long *all_malvfs, + u16 bitmap_len, u16 vf_id); +enum ice_status ice_mbx_init_snapshot(struct ice_hw *hw, u16 vf_count); +void ice_mbx_deinit_snapshot(struct ice_hw *hw); +enum ice_status +ice_mbx_report_malvf(struct ice_hw *hw, unsigned long *all_malvfs, + u16 bitmap_len, u16 vf_id, bool *report_malvf); #else /* CONFIG_PCI_IOV */ static inline enum ice_status ice_aq_send_msg_to_vf(struct ice_hw __always_unused *hw, diff --git a/drivers/net/ethernet/intel/ice/ice_status.h b/drivers/net/ethernet/intel/ice/ice_status.h index c015978856291c1740faa43ee7cd593c6adcf8a0..aa279abfe32a9b0d04dcfb7c0abed401d7cca010 100644 --- a/drivers/net/ethernet/intel/ice/ice_status.h +++ b/drivers/net/ethernet/intel/ice/ice_status.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_STATUS_H_ #define _ICE_STATUS_H_ @@ -26,14 +26,22 @@ enum ice_status { ICE_ERR_IN_USE = -16, ICE_ERR_MAX_LIMIT = -17, ICE_ERR_RESET_ONGOING = -18, + ICE_ERR_HW_TABLE = -19, + ICE_ERR_FW_DDP_MISMATCH = -20, + + /* NVM specific error codes: Range -50..-59 */ + ICE_ERR_NVM = -50, ICE_ERR_NVM_CHECKSUM = -51, ICE_ERR_BUF_TOO_SHORT = -52, ICE_ERR_NVM_BLANK_MODE = -53, + + /* ARQ/ASQ specific error codes. 
Range -100..-109 */ ICE_ERR_AQ_ERROR = -100, ICE_ERR_AQ_TIMEOUT = -101, ICE_ERR_AQ_FULL = -102, ICE_ERR_AQ_NO_WORK = -103, ICE_ERR_AQ_EMPTY = -104, + ICE_ERR_AQ_FW_CRITICAL = -105, }; #endif /* _ICE_STATUS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 7ff2e07f6d38aa5dfa556fab6cb1615908326598..c0e622d13893dd4e1e3375eb5a07c7586a0fd4e4 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #include "ice_switch.h" +#include "ice_flex_type.h" +#include "ice_flow.h" + #define ICE_ETH_DA_OFFSET 0 #define ICE_ETH_ETHTYPE_OFFSET 12 @@ -23,76 +26,1286 @@ * In case of Ether type filter it is treated as header without VLAN tag * and byte 12 and 13 is used to program a given Ether type instead */ -#define DUMMY_ETH_HDR_LEN 16 static const u8 dummy_eth_header[DUMMY_ETH_HDR_LEN] = { 0x2, 0, 0, 0, 0, 0, 0x2, 0, 0, 0, 0, 0, 0x81, 0, 0, 0}; -#define ICE_SW_RULE_RX_TX_ETH_HDR_SIZE \ - (sizeof(struct ice_aqc_sw_rules_elem) - \ - sizeof(((struct ice_aqc_sw_rules_elem *)0)->pdata) + \ - sizeof(struct ice_sw_rule_lkup_rx_tx) + DUMMY_ETH_HDR_LEN - 1) -#define ICE_SW_RULE_RX_TX_NO_HDR_SIZE \ - (sizeof(struct ice_aqc_sw_rules_elem) - \ - sizeof(((struct ice_aqc_sw_rules_elem *)0)->pdata) + \ - sizeof(struct ice_sw_rule_lkup_rx_tx) - 1) -#define ICE_SW_RULE_LG_ACT_SIZE(n) \ - (sizeof(struct ice_aqc_sw_rules_elem) - \ - sizeof(((struct ice_aqc_sw_rules_elem *)0)->pdata) + \ - sizeof(struct ice_sw_rule_lg_act) - \ - sizeof(((struct ice_sw_rule_lg_act *)0)->act) + \ - ((n) * sizeof(((struct ice_sw_rule_lg_act *)0)->act))) -#define ICE_SW_RULE_VSI_LIST_SIZE(n) \ - (sizeof(struct ice_aqc_sw_rules_elem) - \ - sizeof(((struct ice_aqc_sw_rules_elem *)0)->pdata) + \ - sizeof(struct ice_sw_rule_vsi_list) - \ - sizeof(((struct ice_sw_rule_vsi_list *)0)->vsi) + \ - ((n) * sizeof(((struct ice_sw_rule_vsi_list *)0)->vsi))) - -/** - * ice_aq_alloc_free_res - command to allocate/free resources - * @hw: pointer to the HW struct - * @num_entries: number of resource entries in buffer - * @buf: Indirect buffer to hold data parameters and response - * @buf_size: size of buffer for indirect commands - * @opc: pass in the command opcode - * @cd: pointer to command details structure or NULL + +struct ice_dummy_pkt_offsets { + enum ice_protocol_type type; + u16 offset; /* ICE_PROTOCOL_LAST indicates end of list */ +}; + +static const struct ice_dummy_pkt_offsets dummy_gre_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_NVGRE, 34 }, + { ICE_MAC_IL, 42 }, + { ICE_IPV4_IL, 56 }, + { ICE_TCP_IL, 76 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_gre_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x3E, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x2F, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x80, 0x00, 0x65, 0x58, /* ICE_NVGRE 34 */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_IL 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x14, /* ICE_IPV4_IL 56 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 
0x00, /* ICE_TCP_IL 76 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x02, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00 +}; + +static const struct ice_dummy_pkt_offsets dummy_gre_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_NVGRE, 34 }, + { ICE_MAC_IL, 42 }, + { ICE_IPV4_IL, 56 }, + { ICE_UDP_ILOS, 76 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_gre_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x3E, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x2F, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x80, 0x00, 0x65, 0x58, /* ICE_NVGRE 34 */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_IL 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x14, /* ICE_IPV4_IL 56 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 76 */ + 0x00, 0x08, 0x00, 0x00, +}; + +static const struct ice_dummy_pkt_offsets dummy_udp_tun_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_VXLAN, 42 }, + { ICE_GENEVE, 42 }, + { ICE_VXLAN_GPE, 42 }, + { ICE_MAC_IL, 50 }, + { ICE_IPV4_IL, 64 }, + { ICE_TCP_IL, 84 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_udp_tun_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x5a, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x01, 0x00, 0x00, + 0x40, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x12, 0xb5, /* ICE_UDP_OF 34 */ + 0x00, 0x46, 0x00, 0x00, + + 0x00, 0x00, 0x65, 0x58, /* ICE_VXLAN 42 */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_IL 50 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x28, /* ICE_IPV4_IL 64 */ + 0x00, 0x01, 0x00, 0x00, + 0x40, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 84 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x02, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00 +}; + +static const struct ice_dummy_pkt_offsets dummy_udp_tun_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_VXLAN, 42 }, + { ICE_GENEVE, 42 }, + { ICE_VXLAN_GPE, 42 }, + { ICE_MAC_IL, 50 }, + { ICE_IPV4_IL, 64 }, + { ICE_UDP_ILOS, 84 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_udp_tun_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x4e, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x12, 0xb5, /* ICE_UDP_OF 34 */ + 0x00, 0x3a, 0x00, 0x00, + + 0x00, 0x00, 0x65, 0x58, /* ICE_VXLAN 42 */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_IL 50 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x1c, /* ICE_IPV4_IL 64 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* 
ICE_UDP_ILOS 84 */ + 0x00, 0x08, 0x00, 0x00, +}; + +/* offset info for MAC + IPv4 + UDP dummy packet */ +static const struct ice_dummy_pkt_offsets dummy_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_ILOS, 34 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* Dummy packet for MAC + IPv4 + UDP */ +static const u8 dummy_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x1c, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 34 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* offset info for MAC + VLAN + IPv4 + UDP dummy packet */ +static const struct ice_dummy_pkt_offsets dummy_vlan_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_VLAN_OFOS, 14 }, + { ICE_IPV4_OFOS, 18 }, + { ICE_UDP_ILOS, 38 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* C-tag (801.1Q), IPv4:UDP dummy packet */ +static const u8 dummy_vlan_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x81, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x00, 0x00, 0x08, 0x00, /* ICE_VLAN_OFOS 14 */ + + 0x45, 0x00, 0x00, 0x1c, /* ICE_IPV4_OFOS 18 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 38 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* offset info for MAC + IPv4 + TCP dummy packet */ +static const struct ice_dummy_pkt_offsets dummy_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_TCP_IL, 34 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* Dummy packet for MAC + IPv4 + TCP */ +static const u8 dummy_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x45, 0x00, 0x00, 0x28, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 34 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* offset info for MAC + VLAN (C-tag, 802.1Q) + IPv4 + TCP dummy packet */ +static const struct ice_dummy_pkt_offsets dummy_vlan_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_VLAN_OFOS, 14 }, + { ICE_IPV4_OFOS, 18 }, + { ICE_TCP_IL, 38 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* C-tag (801.1Q), IPv4:TCP dummy packet */ +static const u8 dummy_vlan_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x81, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x00, 0x00, 0x08, 0x00, /* ICE_VLAN_OFOS 14 */ + + 0x45, 0x00, 0x00, 0x28, /* ICE_IPV4_OFOS 18 */ + 0x00, 0x01, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 38 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_tcp_ipv6_packet_offsets[] = { + { 
ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_TCP_IL, 54 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_tcp_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x86, 0xDD, /* ICE_ETYPE_OL 12 */ + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 40 */ + 0x00, 0x14, 0x06, 0x00, /* Next header is TCP */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 54 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* C-tag (802.1Q): IPv6 + TCP */ +static const struct ice_dummy_pkt_offsets +dummy_vlan_tcp_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_VLAN_OFOS, 14 }, + { ICE_IPV6_OFOS, 18 }, + { ICE_TCP_IL, 58 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* C-tag (802.1Q), IPv6 + TCP dummy packet */ +static const u8 dummy_vlan_tcp_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x81, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x00, 0x00, 0x86, 0xDD, /* ICE_VLAN_OFOS 14 */ + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 18 */ + 0x00, 0x14, 0x06, 0x00, /* Next header is TCP */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_TCP_IL 58 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* IPv6 + UDP */ +static const struct ice_dummy_pkt_offsets dummy_udp_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_ILOS, 54 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* IPv6 + UDP dummy packet */ +static const u8 dummy_udp_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x86, 0xDD, /* ICE_ETYPE_OL 12 */ + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 40 */ + 0x00, 0x10, 0x11, 0x00, /* Next header UDP */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 54 */ + 0x00, 0x10, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* needed for ESP packets */ + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* C-tag (802.1Q): IPv6 + UDP */ +static const struct ice_dummy_pkt_offsets +dummy_vlan_udp_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_ETYPE_OL, 12 }, + { ICE_VLAN_OFOS, 14 }, + { ICE_IPV6_OFOS, 18 }, + { ICE_UDP_ILOS, 58 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +/* C-tag (802.1Q), IPv6 + UDP dummy packet */ +static const u8 dummy_vlan_udp_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x81, 0x00, /* ICE_ETYPE_OL 12 */ + + 0x00, 0x00, 0x86, 0xDD, /* ICE_VLAN_OFOS 14 */ + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 18 */ + 0x00, 0x08, 0x11, 0x00, /* Next header UDP */ + 0x00, 0x00, 0x00, 0x00, + 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* ICE_UDP_ILOS 58 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* Outer IPv4 + Outer UDP + GTP + Inner IPv4 + Inner TCP */ +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv4_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV4_IL, 62 }, + { ICE_TCP_IL, 82 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv4_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x58, /* IP 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 34 */ + 0x00, 0x44, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x34, /* GTP-U Header 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x28, /* IP 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* TCP 82 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* Outer IPv4 + Outer UDP + GTP + Inner IPv4 + Inner UDP */ +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv4_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV4_IL, 62 }, + { ICE_UDP_ILOS, 82 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv4_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x4c, /* IP 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 34 */ + 0x00, 0x38, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* GTP-U Header 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x1c, /* IP 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* UDP 82 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +/* Outer IPv6 + Outer UDP + GTP + Inner IPv4 + Inner TCP */ +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv6_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV6_IL, 62 }, + { ICE_TCP_IL, 102 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv6_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x6c, /* IP 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 34 */ + 0x00, 0x58, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x48, /* GTP-U Header 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* 
GTP_PDUSession_ExtensionHeader 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 62 */ + 0x00, 0x14, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* TCP 102 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv6_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV6_IL, 62 }, + { ICE_UDP_ILOS, 102 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv6_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x60, /* IP 14 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 34 */ + 0x00, 0x4c, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x3c, /* GTP-U Header 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 62 */ + 0x00, 0x08, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* UDP 102 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv4_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV4_IL, 82 }, + { ICE_TCP_IL, 102 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv4_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 14 */ + 0x00, 0x44, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 54 */ + 0x00, 0x44, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x34, /* GTP-U Header 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 74 */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x28, /* IP 82 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* TCP 102 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv4_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV4_IL, 82 }, + { ICE_UDP_ILOS, 102 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv4_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 14 */ + 
0x00, 0x38, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 54 */ + 0x00, 0x38, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* GTP-U Header 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 74 */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x1c, /* IP 82 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* UDP 102 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv6_tcp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV6_IL, 82 }, + { ICE_TCP_IL, 122 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv6_tcp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 14 */ + 0x00, 0x58, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 54 */ + 0x00, 0x58, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x48, /* GTP-U Header 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 74 */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 82 */ + 0x00, 0x14, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* TCP 122 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv6_udp_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV6_IL, 82 }, + { ICE_UDP_ILOS, 122 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv6_udp_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* Ethernet 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 14 */ + 0x00, 0x4c, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x08, 0x68, /* UDP 54 */ + 0x00, 0x4c, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x3c, /* GTP-U Header 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* GTP_PDUSession_ExtensionHeader 74 */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* IPv6 82 */ + 0x00, 0x08, 0x11, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, /* UDP 122 */ + 0x00, 0x08, 0x00, 0x00, + + 0x00, 0x00, /* 2 bytes for 4 byte 
alignment */ +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv4_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV4_IL, 62 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv4_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x44, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x00, 0x40, 0x00, + 0x40, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x68, 0x08, 0x68, /* ICE_UDP_OF 34 */ + 0x00, 0x00, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* ICE_GTP 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* PDU Session extension header */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x14, /* ICE_IPV4_IL 62 */ + 0x00, 0x00, 0x40, 0x00, + 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv4_gtpu_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV4_OFOS, 14 }, + { ICE_UDP_OF, 34 }, + { ICE_GTP, 42 }, + { ICE_IPV6_IL, 62 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv4_gtpu_ipv6_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, + + 0x45, 0x00, 0x00, 0x58, /* ICE_IPV4_OFOS 14 */ + 0x00, 0x00, 0x40, 0x00, + 0x40, 0x11, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x68, 0x08, 0x68, /* ICE_UDP_OF 34 */ + 0x00, 0x00, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* ICE_GTP 42 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* PDU Session extension header */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_IL 62 */ + 0x00, 0x00, 0x3b, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv4_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV4_IL, 82 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv4_packet[] = { + 0x00, 0x00, 0x00, 0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 14 */ + 0x00, 0x58, 0x11, 0x00, /* Next header UDP*/ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x68, 0x08, 0x68, /* ICE_UDP_OF 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* ICE_GTP 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* PDU Session extension header */ + 0x00, 0x00, 0x00, 0x00, + + 0x45, 0x00, 0x00, 0x14, /* ICE_IPV4_IL 82 */ + 0x00, 0x00, 0x40, 0x00, + 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, +}; + +static const struct ice_dummy_pkt_offsets dummy_ipv6_gtpu_ipv6_packet_offsets[] = { + { ICE_MAC_OFOS, 0 }, + { ICE_IPV6_OFOS, 14 }, + { ICE_UDP_OF, 54 }, + { ICE_GTP, 62 }, + { ICE_IPV6_IL, 82 }, + { ICE_PROTOCOL_LAST, 0 }, +}; + +static const u8 dummy_ipv6_gtpu_ipv6_packet[] = { + 0x00, 0x00, 0x00, 
0x00, /* ICE_MAC_OFOS 0 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x86, 0xdd, + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFOS 14 */ + 0x00, 0x6c, 0x11, 0x00, /* Next header UDP*/ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x08, 0x68, 0x08, 0x68, /* ICE_UDP_OF 54 */ + 0x00, 0x00, 0x00, 0x00, + + 0x34, 0xff, 0x00, 0x28, /* ICE_GTP 62 */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x85, + + 0x02, 0x00, 0x00, 0x00, /* PDU Session extension header */ + 0x00, 0x00, 0x00, 0x00, + + 0x60, 0x00, 0x00, 0x00, /* ICE_IPV6_OFIL 82 */ + 0x00, 0x00, 0x3b, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, +}; + + +/* this is a recipe to profile association bitmap */ +static DECLARE_BITMAP(recipe_to_profile[ICE_MAX_NUM_RECIPES], + ICE_MAX_NUM_PROFILES); + +/* this is a profile to recipe association bitmap */ +static DECLARE_BITMAP(profile_to_recipe[ICE_MAX_NUM_PROFILES], + ICE_MAX_NUM_RECIPES); + +static void ice_get_recp_to_prof_map(struct ice_hw *hw); + +/** + * ice_collect_result_idx - copy result index values + * @buf: buffer that contains the result index + * @recp: the recipe struct to copy data into + */ +static void ice_collect_result_idx(struct ice_aqc_recipe_data_elem *buf, + struct ice_sw_recipe *recp) +{ + if (buf->content.result_indx & ICE_AQ_RECIPE_RESULT_EN) + set_bit(buf->content.result_indx & ~ICE_AQ_RECIPE_RESULT_EN, + recp->res_idxs); +} + + +/** + * ice_get_recp_frm_fw - update SW bookkeeping from FW recipe entries + * @hw: pointer to hardware structure + * @recps: struct that we need to populate + * @rid: recipe ID that we are populating + * @refresh_required: true if we should get recipe to profile mapping from FW * - * Helper function to allocate/free resources using the admin queue commands + * This function is used to populate all the necessary entries into our + * bookkeeping so that we have a current list of all the recipes that are + * programmed in the firmware. */ static enum ice_status -ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, - struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, - enum ice_adminq_opc opc, struct ice_sq_cd *cd) +ice_get_recp_frm_fw(struct ice_hw *hw, struct ice_sw_recipe *recps, u8 rid, + bool *refresh_required) { - struct ice_aqc_alloc_free_res_cmd *cmd; - struct ice_aq_desc desc; + DECLARE_BITMAP(result_bm, ICE_MAX_FV_WORDS); + struct ice_aqc_recipe_data_elem *tmp; + u16 num_recps = ICE_MAX_NUM_RECIPES; + struct ice_prot_lkup_ext *lkup_exts; + enum ice_status status; + u8 fv_word_idx = 0; + u16 sub_recps; - cmd = &desc.params.sw_res_ctrl; + bitmap_zero(result_bm, ICE_MAX_FV_WORDS); - if (!buf) - return ICE_ERR_PARAM; + /* we need a buffer big enough to accommodate all the recipes */ + tmp = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_NUM_RECIPES, + sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return ICE_ERR_NO_MEMORY; - if (buf_size < (num_entries * sizeof(buf->elem[0]))) - return ICE_ERR_PARAM; + tmp[0].recipe_indx = rid; + status = ice_aq_get_recipe(hw, tmp, &num_recps, rid, NULL); + /* non-zero status meaning recipe doesn't exist */ + if (status) + goto err_unroll; + + /* Get recipe to profile map so that we can get the fv from lkups that + * we read for a recipe from FW. 
Since we want to minimize the number of + * times we make this FW call, just make one call and cache the copy + * until a new recipe is added. This operation is only required the + * first time to get the changes from FW. Then to search existing + * entries we don't need to update the cache again until another recipe + * gets added. + */ + if (*refresh_required) { + ice_get_recp_to_prof_map(hw); + *refresh_required = false; + } - ice_fill_dflt_direct_cmd_desc(&desc, opc); + /* Start populating all the entries for recps[rid] based on lkups from + * firmware. Note that we are only creating the root recipe in our + * database. + */ + lkup_exts = &recps[rid].lkup_exts; + + for (sub_recps = 0; sub_recps < num_recps; sub_recps++) { + struct ice_aqc_recipe_data_elem root_bufs = tmp[sub_recps]; + struct ice_recp_grp_entry *rg_entry; + u8 i, prof, idx, prot = 0; + bool is_root; + u16 off = 0; + + rg_entry = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rg_entry), + GFP_KERNEL); + if (!rg_entry) { + status = ICE_ERR_NO_MEMORY; + goto err_unroll; + } - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + idx = root_bufs.recipe_indx; + is_root = root_bufs.content.rid & ICE_AQ_RECIPE_ID_IS_ROOT; + + /* Mark all result indices in this chain */ + if (root_bufs.content.result_indx & ICE_AQ_RECIPE_RESULT_EN) + set_bit(root_bufs.content.result_indx & ~ICE_AQ_RECIPE_RESULT_EN, + result_bm); + + /* get the first profile that is associated with rid */ + prof = find_first_bit(recipe_to_profile[idx], + ICE_MAX_NUM_PROFILES); + for (i = 0; i < ICE_NUM_WORDS_RECIPE; i++) { + u8 lkup_indx = root_bufs.content.lkup_indx[i + 1]; + + rg_entry->fv_idx[i] = lkup_indx; + rg_entry->fv_mask[i] = + le16_to_cpu(root_bufs.content.mask[i + 1]); + + /* If the recipe is a chained recipe then all its + * child recipe's result will have a result index. + * To fill fv_words we should not use those result + * index, we only need the protocol ids and offsets. + * We will skip all the fv_idx which stores result + * index in them. We also need to skip any fv_idx which + * has ICE_AQ_RECIPE_LKUP_IGNORE or 0 since it isn't a + * valid offset value. 
+ */ + if (test_bit(rg_entry->fv_idx[i], hw->switch_info->prof_res_bm[prof]) || + rg_entry->fv_idx[i] & ICE_AQ_RECIPE_LKUP_IGNORE || + rg_entry->fv_idx[i] == 0) + continue; + + ice_find_prot_off(hw, ICE_BLK_SW, prof, + rg_entry->fv_idx[i], &prot, &off); + lkup_exts->fv_words[fv_word_idx].prot_id = prot; + lkup_exts->fv_words[fv_word_idx].off = off; + lkup_exts->field_mask[fv_word_idx] = + rg_entry->fv_mask[i]; + fv_word_idx++; + } + /* populate rg_list with the data from the child entry of this + * recipe + */ + list_add(&rg_entry->l_entry, &recps[rid].rg_list); + + /* Propagate some data to the recipe database */ + recps[idx].is_root = !!is_root; + recps[idx].priority = root_bufs.content.act_ctrl_fwd_priority; + bitmap_zero(recps[idx].res_idxs, ICE_MAX_FV_WORDS); + if (root_bufs.content.result_indx & ICE_AQ_RECIPE_RESULT_EN) { + recps[idx].chain_idx = root_bufs.content.result_indx & + ~ICE_AQ_RECIPE_RESULT_EN; + set_bit(recps[idx].chain_idx, recps[idx].res_idxs); + } else { + recps[idx].chain_idx = ICE_INVAL_CHAIN_IND; + } + + if (!is_root) + continue; + + /* Only do the following for root recipes entries */ + memcpy(recps[idx].r_bitmap, root_bufs.recipe_bitmap, + sizeof(recps[idx].r_bitmap)); + recps[idx].root_rid = root_bufs.content.rid & + ~ICE_AQ_RECIPE_ID_IS_ROOT; + recps[idx].priority = root_bufs.content.act_ctrl_fwd_priority; + } + + /* Complete initialization of the root recipe entry */ + lkup_exts->n_val_words = fv_word_idx; + recps[rid].big_recp = (num_recps > 1); + recps[rid].n_grp_count = (u8)num_recps; + recps[rid].root_buf = devm_kmemdup(ice_hw_to_dev(hw), tmp, + recps[rid].n_grp_count * sizeof(*recps[rid].root_buf), + GFP_KERNEL); + if (!recps[rid].root_buf) + goto err_unroll; + + /* Copy result indexes */ + bitmap_copy(recps[rid].res_idxs, result_bm, ICE_MAX_FV_WORDS); + recps[rid].recp_created = true; + +err_unroll: + devm_kfree(ice_hw_to_dev(hw), tmp); + return status; +} + +/** + * ice_get_recp_to_prof_map - updates recipe to profile mapping + * @hw: pointer to hardware structure + * + * This function is used to populate recipe_to_profile matrix where index to + * this array is the recipe ID and the element is the mapping of which profiles + * is this recipe mapped to. + */ +static void ice_get_recp_to_prof_map(struct ice_hw *hw) +{ + DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + u16 i; - cmd->num_entries = cpu_to_le16(num_entries); + for (i = 0; i < hw->switch_info->max_used_prof_index + 1; i++) { + u16 j; - return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + bitmap_zero(profile_to_recipe[i], ICE_MAX_NUM_RECIPES); + bitmap_zero(r_bitmap, ICE_MAX_NUM_RECIPES); + if (ice_aq_get_recipe_to_profile(hw, i, (u8 *)r_bitmap, NULL)) + continue; + bitmap_copy(profile_to_recipe[i], r_bitmap, + ICE_MAX_NUM_RECIPES); + for_each_set_bit(j, r_bitmap, ICE_MAX_NUM_RECIPES) + set_bit(i, recipe_to_profile[j]); + } } /** * ice_init_def_sw_recp - initialize the recipe book keeping tables * @hw: pointer to the HW struct + * @recp_list: pointer to sw recipe list * * Allocate memory for the entire recipe table and initialize the structures/ * entries corresponding to basic recipes. 
*/ -enum ice_status ice_init_def_sw_recp(struct ice_hw *hw) +enum ice_status +ice_init_def_sw_recp(struct ice_hw *hw, struct ice_sw_recipe **recp_list) { struct ice_sw_recipe *recps; u8 i; @@ -102,14 +1315,15 @@ enum ice_status ice_init_def_sw_recp(struct ice_hw *hw) if (!recps) return ICE_ERR_NO_MEMORY; - for (i = 0; i < ICE_SW_LKUP_LAST; i++) { + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { recps[i].root_rid = i; INIT_LIST_HEAD(&recps[i].filt_rules); INIT_LIST_HEAD(&recps[i].filt_replay_rules); + INIT_LIST_HEAD(&recps[i].rg_list); mutex_init(&recps[i].filt_rule_lock); } - hw->switch_info->recp_list = recps; + *recp_list = recps; return 0; } @@ -123,7 +1337,7 @@ enum ice_status ice_init_def_sw_recp(struct ice_hw *hw) * @num_elems: pointer to number of elements * @cd: pointer to command details structure or NULL * - * Get switch configuration (0x0200) to be placed in 'buff'. + * Get switch configuration (0x0200) to be placed in buf. * This admin command returns information such as initial VSI/port number * and switch ID it belongs to. * @@ -139,14 +1353,14 @@ enum ice_status ice_init_def_sw_recp(struct ice_hw *hw) * in response buffer. The caller of this function to use *num_elems while * parsing the response buffer. */ -static enum ice_status -ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp *buf, +enum ice_status +ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp_elem *buf, u16 buf_size, u16 *req_desc, u16 *num_elems, struct ice_sq_cd *cd) { struct ice_aqc_get_sw_cfg *cmd; - enum ice_status status; struct ice_aq_desc desc; + enum ice_status status; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_sw_cfg); cmd = &desc.params.get_sw_conf; @@ -161,6 +1375,299 @@ ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp *buf, return status; } +/** + * ice_dump_sw_cfg - get and print switch config as seen by firmware + * @hw: ice hardware struct + */ +enum ice_status ice_dump_sw_cfg(struct ice_hw *hw) +{ + struct ice_aqc_get_sw_cfg_resp_elem *rbuf; + enum ice_status ret; + u16 req_desc = 0; + u16 num_elems; + u16 i; + + rbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_SW_CFG_MAX_BUF_LEN, + GFP_KERNEL); + if (!rbuf) + return ICE_ERR_NO_MEMORY; + + /* Multiple calls to ice_aq_get_sw_cfg may be required + * to get all the switch configuration information. The need + * for additional calls is indicated by ice_aq_get_sw_cfg + * writing a non-zero value in req_desc. 
+ */ + do { + struct ice_aqc_get_sw_cfg_resp_elem *ele; + + ret = ice_aq_get_sw_cfg(hw, rbuf, ICE_SW_CFG_MAX_BUF_LEN, + &req_desc, &num_elems, NULL); + if (ret) + break; + + for (i = 0, ele = rbuf; i < num_elems; i++, ele++) { + u16 vsi_port_num, pf_vf_num, swid; + + vsi_port_num = le16_to_cpu(ele->vsi_port_num) & + ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M; + + pf_vf_num = le16_to_cpu(ele->pf_vf_num) & + ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_M; + + swid = le16_to_cpu(ele->swid); + + dev_info(ice_hw_to_dev(hw), "element[%d]\n", i); + + switch (le16_to_cpu(ele->vsi_port_num) >> + ICE_AQC_GET_SW_CONF_RESP_TYPE_S) { + case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT: + dev_info(ice_hw_to_dev(hw), + "\tphy_port = %d\n", vsi_port_num); + break; + case ICE_AQC_GET_SW_CONF_RESP_VIRT_PORT: + dev_info(ice_hw_to_dev(hw), + "\tvir_port = %d\n", vsi_port_num); + break; + case ICE_AQC_GET_SW_CONF_RESP_VSI: + dev_info(ice_hw_to_dev(hw), + "\tvsi_num = %d\n", vsi_port_num); + break; + + default: + dev_info(ice_hw_to_dev(hw), + "\tincorrect vsi/port type\n"); + ret = ICE_ERR_CFG; + break; + } + + dev_info(ice_hw_to_dev(hw), "\tswid = %d\n", swid); + + if (le16_to_cpu(ele->pf_vf_num) & + ICE_AQC_GET_SW_CONF_RESP_IS_VF) + dev_info(ice_hw_to_dev(hw), "\tvf_id = %d\n", + pf_vf_num); + else + dev_info(ice_hw_to_dev(hw), "\tpf_id = %d\n", + pf_vf_num); + } + } while (req_desc && !ret); + + devm_kfree(ice_hw_to_dev(hw), rbuf); + return ret; +} + +/** + * ice_alloc_rss_global_lut - allocate a RSS global LUT + * @hw: pointer to the HW struct + * @shared_res: true to allocate as a shared resource and false to allocate as a dedicated resource + * @global_lut_id: output parameter for the RSS global LUT's ID + */ +enum ice_status ice_alloc_rss_global_lut(struct ice_hw *hw, bool shared_res, u16 *global_lut_id) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + enum ice_status status; + u16 buf_len; + + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + sw_buf->num_elems = cpu_to_le16(1); + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_GLOBAL_RSS_HASH | + (shared_res ? ICE_AQC_RES_TYPE_FLAG_SHARED : + ICE_AQC_RES_TYPE_FLAG_DEDICATED)); + + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, ice_aqc_opc_alloc_res, NULL); + if (status) { + ice_debug(hw, ICE_DBG_RES, "Failed to allocate %s RSS global LUT, status %d\n", + shared_res ? 
"shared" : "dedicated", status); + goto ice_alloc_global_lut_exit; + } + + *global_lut_id = le16_to_cpu(sw_buf->elem[0].e.sw_resp); + +ice_alloc_global_lut_exit: + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return status; +} + +/** + * ice_free_rss_global_lut - free a RSS global LUT + * @hw: pointer to the HW struct + * @global_lut_id: ID of the RSS global LUT to free + */ +enum ice_status ice_free_rss_global_lut(struct ice_hw *hw, u16 global_lut_id) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + u16 buf_len, num_elems = 1; + enum ice_status status; + + buf_len = struct_size(sw_buf, elem, num_elems); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + sw_buf->num_elems = cpu_to_le16(num_elems); + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_GLOBAL_RSS_HASH); + sw_buf->elem[0].e.sw_resp = cpu_to_le16(global_lut_id); + + status = ice_aq_alloc_free_res(hw, num_elems, sw_buf, buf_len, ice_aqc_opc_free_res, NULL); + if (status) + ice_debug(hw, ICE_DBG_RES, "Failed to free RSS global LUT %d, status %d\n", + global_lut_id, status); + + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return status; +} + +/** + * ice_alloc_sw - allocate resources specific to switch + * @hw: pointer to the HW struct + * @ena_stats: true to turn on VEB stats + * @shared_res: true for shared resource, false for dedicated resource + * @sw_id: switch ID returned + * @counter_id: VEB counter ID returned + * + * allocates switch resources (SWID and VEB counter) (0x0208) + */ +enum ice_status +ice_alloc_sw(struct ice_hw *hw, bool ena_stats, bool shared_res, u16 *sw_id, + u16 *counter_id) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + struct ice_aqc_res_elem *sw_ele; + enum ice_status status; + u16 buf_len; + + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + /* Prepare buffer for switch ID. + * The number of resource entries in buffer is passed as 1 since only a + * single switch/VEB instance is allocated, and hence a single sw_id + * is requested. + */ + sw_buf->num_elems = cpu_to_le16(1); + sw_buf->res_type = + cpu_to_le16(ICE_AQC_RES_TYPE_SWID | + (shared_res ? ICE_AQC_RES_TYPE_FLAG_SHARED : + ICE_AQC_RES_TYPE_FLAG_DEDICATED)); + + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + + if (status) + goto ice_alloc_sw_exit; + + sw_ele = &sw_buf->elem[0]; + *sw_id = le16_to_cpu(sw_ele->e.sw_resp); + + if (ena_stats) { + /* Prepare buffer for VEB Counter */ + enum ice_adminq_opc opc = ice_aqc_opc_alloc_res; + struct ice_aqc_alloc_free_res_elem *counter_buf; + struct ice_aqc_res_elem *counter_ele; + + counter_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, + GFP_KERNEL); + if (!counter_buf) { + status = ICE_ERR_NO_MEMORY; + goto ice_alloc_sw_exit; + } + + /* The number of resource entries in buffer is passed as 1 since + * only a single switch/VEB instance is allocated, and hence a + * single VEB counter is requested. 
+ */ + counter_buf->num_elems = cpu_to_le16(1); + counter_buf->res_type = + cpu_to_le16(ICE_AQC_RES_TYPE_VEB_COUNTER | + ICE_AQC_RES_TYPE_FLAG_DEDICATED); + status = ice_aq_alloc_free_res(hw, 1, counter_buf, buf_len, + opc, NULL); + + if (status) { + devm_kfree(ice_hw_to_dev(hw), counter_buf); + goto ice_alloc_sw_exit; + } + counter_ele = &counter_buf->elem[0]; + *counter_id = le16_to_cpu(counter_ele->e.sw_resp); + devm_kfree(ice_hw_to_dev(hw), counter_buf); + } + +ice_alloc_sw_exit: + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return status; +} + +/** + * ice_free_sw - free resources specific to switch + * @hw: pointer to the HW struct + * @sw_id: switch ID returned + * @counter_id: VEB counter ID returned + * + * free switch resources (SWID and VEB counter) (0x0209) + * + * NOTE: This function frees multiple resources. It continues + * releasing other resources even after it encounters error. + * The error code returned is the last error it encountered. + */ +enum ice_status ice_free_sw(struct ice_hw *hw, u16 sw_id, u16 counter_id) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf, *counter_buf; + enum ice_status status, ret_status; + u16 buf_len; + + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + /* Prepare buffer to free for switch ID res. + * The number of resource entries in buffer is passed as 1 since only a + * single switch/VEB instance is freed, and hence a single sw_id + * is released. + */ + sw_buf->num_elems = cpu_to_le16(1); + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_SWID); + sw_buf->elem[0].e.sw_resp = cpu_to_le16(sw_id); + + ret_status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, + ice_aqc_opc_free_res, NULL); + + if (ret_status) + ice_debug(hw, ICE_DBG_SW, "CQ CMD Buffer:\n"); + + /* Prepare buffer to free for VEB Counter resource */ + counter_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!counter_buf) { + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return ICE_ERR_NO_MEMORY; + } + + /* The number of resource entries in buffer is passed as 1 since only a + * single switch/VEB instance is freed, and hence a single VEB counter + * is released + */ + counter_buf->num_elems = cpu_to_le16(1); + counter_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VEB_COUNTER); + counter_buf->elem[0].e.sw_resp = cpu_to_le16(counter_id); + + status = ice_aq_alloc_free_res(hw, 1, counter_buf, buf_len, + ice_aqc_opc_free_res, NULL); + if (status) { + ice_debug(hw, ICE_DBG_SW, "VEB counter resource could not be freed\n"); + ret_status = status; + } + + devm_kfree(ice_hw_to_dev(hw), counter_buf); + devm_kfree(ice_hw_to_dev(hw), sw_buf); + return ret_status; +} + /** * ice_aq_add_vsi * @hw: pointer to the HW struct @@ -346,6 +1853,10 @@ static void ice_clear_vsi_q_ctx(struct ice_hw *hw, u16 vsi_handle) devm_kfree(ice_hw_to_dev(hw), vsi->lan_q_ctx[i]); vsi->lan_q_ctx[i] = NULL; } + if (vsi->rdma_q_ctx[i]) { + devm_kfree(ice_hw_to_dev(hw), vsi->rdma_q_ctx[i]); + vsi->rdma_q_ctx[i] = NULL; + } } } @@ -413,11 +1924,11 @@ ice_add_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx, return ICE_ERR_NO_MEMORY; } *tmp_vsi_ctx = *vsi_ctx; + ice_save_vsi_ctx(hw, vsi_handle, tmp_vsi_ctx); } else { /* update with new HW VSI num */ - if (tmp_vsi_ctx->vsi_num != vsi_ctx->vsi_num) - tmp_vsi_ctx->vsi_num = vsi_ctx->vsi_num; + tmp_vsi_ctx->vsi_num = vsi_ctx->vsi_num; } return 0; @@ -468,2253 +1979,6114 @@ ice_update_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx 
*vsi_ctx, } /** - * ice_aq_alloc_free_vsi_list - * @hw: pointer to the HW struct - * @vsi_list_id: VSI list ID returned or used for lookup - * @lkup_type: switch rule filter lookup type - * @opc: switch rules population command type - pass in the command opcode - * - * allocates or free a VSI list resource + * ice_cfg_iwarp_fltr - enable/disable iWARP filtering on VSI + * @hw: pointer to HW struct + * @vsi_handle: VSI SW index + * @enable: boolean for enable/disable */ -static enum ice_status -ice_aq_alloc_free_vsi_list(struct ice_hw *hw, u16 *vsi_list_id, - enum ice_sw_lkup_type lkup_type, - enum ice_adminq_opc opc) +enum ice_status +ice_cfg_iwarp_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable) { - struct ice_aqc_alloc_free_res_elem *sw_buf; - struct ice_aqc_res_elem *vsi_ele; - enum ice_status status; - u16 buf_len; - - buf_len = sizeof(*sw_buf); - sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); - if (!sw_buf) - return ICE_ERR_NO_MEMORY; - sw_buf->num_elems = cpu_to_le16(1); - - if (lkup_type == ICE_SW_LKUP_MAC || - lkup_type == ICE_SW_LKUP_MAC_VLAN || - lkup_type == ICE_SW_LKUP_ETHERTYPE || - lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || - lkup_type == ICE_SW_LKUP_PROMISC || - lkup_type == ICE_SW_LKUP_PROMISC_VLAN) { - sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_REP); - } else if (lkup_type == ICE_SW_LKUP_VLAN) { - sw_buf->res_type = - cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_PRUNE); - } else { - status = ICE_ERR_PARAM; - goto ice_aq_alloc_free_vsi_list_exit; - } + struct ice_vsi_ctx *ctx; - if (opc == ice_aqc_opc_free_res) - sw_buf->elem[0].e.sw_resp = cpu_to_le16(*vsi_list_id); - - status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, opc, NULL); - if (status) - goto ice_aq_alloc_free_vsi_list_exit; + ctx = ice_get_vsi_ctx(hw, vsi_handle); + if (!ctx) + return ICE_ERR_DOES_NOT_EXIST; - if (opc == ice_aqc_opc_alloc_res) { - vsi_ele = &sw_buf->elem[0]; - *vsi_list_id = le16_to_cpu(vsi_ele->e.sw_resp); - } + if (enable) + ctx->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + else + ctx->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; -ice_aq_alloc_free_vsi_list_exit: - devm_kfree(ice_hw_to_dev(hw), sw_buf); - return status; + return ice_update_vsi(hw, vsi_handle, ctx, NULL); } /** - * ice_aq_sw_rules - add/update/remove switch rules + * ice_aq_get_vsi_params * @hw: pointer to the HW struct - * @rule_list: pointer to switch rule population list - * @rule_list_sz: total size of the rule list in bytes - * @num_rules: number of switch rules in the rule_list - * @opc: switch rules population command type - pass in the command opcode + * @vsi_ctx: pointer to a VSI context struct * @cd: pointer to command details structure or NULL * - * Add(0x02a0)/Update(0x02a1)/Remove(0x02a2) switch rules commands to firmware + * Get VSI context info from hardware (0x0212) */ -static enum ice_status -ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz, - u8 num_rules, enum ice_adminq_opc opc, struct ice_sq_cd *cd) +enum ice_status +ice_aq_get_vsi_params(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx, + struct ice_sq_cd *cd) { + struct ice_aqc_add_get_update_free_vsi *cmd; + struct ice_aqc_get_vsi_resp *resp; struct ice_aq_desc desc; + enum ice_status status; - if (opc != ice_aqc_opc_add_sw_rules && - opc != ice_aqc_opc_update_sw_rules && - opc != ice_aqc_opc_remove_sw_rules) - return ICE_ERR_PARAM; + cmd = &desc.params.vsi_cmd; + resp = &desc.params.get_vsi_resp; - ice_fill_dflt_direct_cmd_desc(&desc, opc); + ice_fill_dflt_direct_cmd_desc(&desc, 
ice_aqc_opc_get_vsi_params); - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - desc.params.sw_rules.num_rules_fltr_entry_index = - cpu_to_le16(num_rules); - return ice_aq_send_cmd(hw, &desc, rule_list, rule_list_sz, cd); + cmd->vsi_num = cpu_to_le16(vsi_ctx->vsi_num | ICE_AQ_VSI_IS_VALID); + + status = ice_aq_send_cmd(hw, &desc, &vsi_ctx->info, + sizeof(vsi_ctx->info), cd); + if (!status) { + vsi_ctx->vsi_num = le16_to_cpu(resp->vsi_num) & + ICE_AQ_VSI_NUM_M; + vsi_ctx->vf_num = resp->vf_id; + vsi_ctx->vsis_allocd = le16_to_cpu(resp->vsi_used); + vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free); + } + + return status; } -/* ice_init_port_info - Initialize port_info with switch configuration data - * @pi: pointer to port_info - * @vsi_port_num: VSI number or port number - * @type: Type of switch element (port or VSI) - * @swid: switch ID of the switch the element is attached to - * @pf_vf_num: PF or VF number - * @is_vf: true if the element is a VF, false otherwise +/** + * ice_aq_add_update_mir_rule - add/update a mirror rule + * @hw: pointer to the HW struct + * @rule_type: Rule Type + * @dest_vsi: VSI number to which packets will be mirrored + * @count: length of the list + * @mr_buf: buffer for list of mirrored VSI numbers + * @cd: pointer to command details structure or NULL + * @rule_id: Rule ID + * + * Add/Update Mirror Rule (0x260). */ -static void -ice_init_port_info(struct ice_port_info *pi, u16 vsi_port_num, u8 type, - u16 swid, u16 pf_vf_num, bool is_vf) +enum ice_status +ice_aq_add_update_mir_rule(struct ice_hw *hw, u16 rule_type, u16 dest_vsi, + u16 count, struct ice_mir_rule_buf *mr_buf, + struct ice_sq_cd *cd, u16 *rule_id) { - switch (type) { - case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT: - pi->lport = (u8)(vsi_port_num & ICE_LPORT_MASK); - pi->sw_id = swid; - pi->pf_vf_num = pf_vf_num; - pi->is_vf = is_vf; - pi->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL; - pi->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL; + struct ice_aqc_add_update_mir_rule *cmd; + struct ice_aq_desc desc; + enum ice_status status; + __le16 *mr_list = NULL; + u16 buf_size = 0; + + switch (rule_type) { + case ICE_AQC_RULE_TYPE_VPORT_INGRESS: + case ICE_AQC_RULE_TYPE_VPORT_EGRESS: + /* Make sure count and mr_buf are set for these rule_types */ + if (!(count && mr_buf)) + return ICE_ERR_PARAM; + + buf_size = count * sizeof(__le16); + mr_list = devm_kzalloc(ice_hw_to_dev(hw), buf_size, + GFP_KERNEL); + if (!mr_list) + return ICE_ERR_NO_MEMORY; break; - default: - ice_debug(pi->hw, ICE_DBG_SW, - "incorrect VSI/port type received\n"); + case ICE_AQC_RULE_TYPE_PPORT_INGRESS: + case ICE_AQC_RULE_TYPE_PPORT_EGRESS: + /* Make sure count and mr_buf are not set for these + * rule_types + */ + if (count || mr_buf) + return ICE_ERR_PARAM; break; + default: + ice_debug(hw, ICE_DBG_SW, "Error due to unsupported rule_type %u\n", rule_type); + return ICE_ERR_OUT_OF_RANGE; + } + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_update_mir_rule); + + /* Pre-process 'mr_buf' items for add/update of virtual port + * ingress/egress mirroring (but not physical port ingress/egress + * mirroring) + */ + if (mr_buf) { + int i; + + for (i = 0; i < count; i++) { + u16 id; + + id = mr_buf[i].vsi_idx & ICE_AQC_RULE_MIRRORED_VSI_M; + + /* Validate specified VSI number, make sure it is less + * than ICE_MAX_VSI, if not return with error. 
+ */ + if (id >= ICE_MAX_VSI) { + ice_debug(hw, ICE_DBG_SW, "Error VSI index (%u) out-of-range\n", + id); + devm_kfree(ice_hw_to_dev(hw), mr_list); + return ICE_ERR_OUT_OF_RANGE; + } + + /* add VSI to mirror rule */ + if (mr_buf[i].add) + mr_list[i] = + cpu_to_le16(id | ICE_AQC_RULE_ACT_M); + else /* remove VSI from mirror rule */ + mr_list[i] = cpu_to_le16(id); + } } + + cmd = &desc.params.add_update_rule; + if ((*rule_id) != ICE_INVAL_MIRROR_RULE_ID) + cmd->rule_id = cpu_to_le16(((*rule_id) & ICE_AQC_RULE_ID_M) | + ICE_AQC_RULE_ID_VALID_M); + cmd->rule_type = cpu_to_le16(rule_type & ICE_AQC_RULE_TYPE_M); + cmd->num_entries = cpu_to_le16(count); + cmd->dest = cpu_to_le16(dest_vsi); + + status = ice_aq_send_cmd(hw, &desc, mr_list, buf_size, cd); + if (!status) + *rule_id = le16_to_cpu(cmd->rule_id) & ICE_AQC_RULE_ID_M; + + devm_kfree(ice_hw_to_dev(hw), mr_list); + + return status; } -/* ice_get_initial_sw_cfg - Get initial port and default VSI data - * @hw: pointer to the hardware structure +/** + * ice_aq_delete_mir_rule - delete a mirror rule + * @hw: pointer to the HW struct + * @rule_id: Mirror rule ID (to be deleted) + * @keep_allocd: if set, the VSI stays part of the PF allocated res, + * otherwise it is returned to the shared pool + * @cd: pointer to command details structure or NULL + * + * Delete Mirror Rule (0x261). */ -enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw) +enum ice_status +ice_aq_delete_mir_rule(struct ice_hw *hw, u16 rule_id, bool keep_allocd, + struct ice_sq_cd *cd) { - struct ice_aqc_get_sw_cfg_resp *rbuf; - enum ice_status status; - u16 req_desc = 0; - u16 num_elems; - u16 i; - - rbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_SW_CFG_MAX_BUF_LEN, - GFP_KERNEL); + struct ice_aqc_delete_mir_rule *cmd; + struct ice_aq_desc desc; - if (!rbuf) - return ICE_ERR_NO_MEMORY; + /* rule_id should be in the range 0...63 */ + if (rule_id >= ICE_MAX_NUM_MIRROR_RULES) + return ICE_ERR_OUT_OF_RANGE; - /* Multiple calls to ice_aq_get_sw_cfg may be required - * to get all the switch configuration information. 
The need - * for additional calls is indicated by ice_aq_get_sw_cfg - * writing a non-zero value in req_desc - */ - do { - status = ice_aq_get_sw_cfg(hw, rbuf, ICE_SW_CFG_MAX_BUF_LEN, - &req_desc, &num_elems, NULL); + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_del_mir_rule); - if (status) - break; + cmd = &desc.params.del_rule; + rule_id |= ICE_AQC_RULE_ID_VALID_M; + cmd->rule_id = cpu_to_le16(rule_id); - for (i = 0; i < num_elems; i++) { - struct ice_aqc_get_sw_cfg_resp_elem *ele; - u16 pf_vf_num, swid, vsi_port_num; - bool is_vf = false; - u8 type; + if (keep_allocd) + cmd->flags = cpu_to_le16(ICE_AQC_FLAG_KEEP_ALLOCD_M); - ele = rbuf[i].elements; - vsi_port_num = le16_to_cpu(ele->vsi_port_num) & - ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M; + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} - pf_vf_num = le16_to_cpu(ele->pf_vf_num) & - ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_M; +/** + * ice_aq_alloc_free_vsi_list + * @hw: pointer to the HW struct + * @vsi_list_id: VSI list ID returned or used for lookup + * @lkup_type: switch rule filter lookup type + * @opc: switch rules population command type - pass in the command opcode + * + * allocates or free a VSI list resource + */ +static enum ice_status +ice_aq_alloc_free_vsi_list(struct ice_hw *hw, u16 *vsi_list_id, + enum ice_sw_lkup_type lkup_type, + enum ice_adminq_opc opc) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + struct ice_aqc_res_elem *vsi_ele; + enum ice_status status; + u16 buf_len; - swid = le16_to_cpu(ele->swid); + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + sw_buf->num_elems = cpu_to_le16(1); - if (le16_to_cpu(ele->pf_vf_num) & - ICE_AQC_GET_SW_CONF_RESP_IS_VF) - is_vf = true; + if (lkup_type == ICE_SW_LKUP_MAC || + lkup_type == ICE_SW_LKUP_MAC_VLAN || + lkup_type == ICE_SW_LKUP_ETHERTYPE || + lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || + lkup_type == ICE_SW_LKUP_PROMISC || + lkup_type == ICE_SW_LKUP_PROMISC_VLAN || + lkup_type == ICE_SW_LKUP_LAST) { + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_REP); + } else if (lkup_type == ICE_SW_LKUP_VLAN) { + sw_buf->res_type = + cpu_to_le16(ICE_AQC_RES_TYPE_VSI_LIST_PRUNE); + } else { + status = ICE_ERR_PARAM; + goto ice_aq_alloc_free_vsi_list_exit; + } - type = le16_to_cpu(ele->vsi_port_num) >> - ICE_AQC_GET_SW_CONF_RESP_TYPE_S; + if (opc == ice_aqc_opc_free_res) + sw_buf->elem[0].e.sw_resp = cpu_to_le16(*vsi_list_id); - if (type == ICE_AQC_GET_SW_CONF_RESP_VSI) { - /* FW VSI is not needed. Just continue. 
*/ - continue; - } + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, opc, NULL); + if (status) + goto ice_aq_alloc_free_vsi_list_exit; - ice_init_port_info(hw->port_info, vsi_port_num, - type, swid, pf_vf_num, is_vf); - } - } while (req_desc && !status); + if (opc == ice_aqc_opc_alloc_res) { + vsi_ele = &sw_buf->elem[0]; + *vsi_list_id = le16_to_cpu(vsi_ele->e.sw_resp); + } - devm_kfree(ice_hw_to_dev(hw), (void *)rbuf); +ice_aq_alloc_free_vsi_list_exit: + devm_kfree(ice_hw_to_dev(hw), sw_buf); return status; } /** - * ice_fill_sw_info - Helper function to populate lb_en and lan_en - * @hw: pointer to the hardware structure - * @fi: filter info structure to fill/update + * ice_aq_set_storm_ctrl - Sets storm control configuration + * @hw: pointer to the HW struct + * @bcast_thresh: represents the upper threshold for broadcast storm control + * @mcast_thresh: represents the upper threshold for multicast storm control + * @ctl_bitmask: storm control knobs * - * This helper function populates the lb_en and lan_en elements of the provided - * ice_fltr_info struct using the switch's type and characteristics of the - * switch rule being configured. + * Sets the storm control configuration (0x0280) */ -static void ice_fill_sw_info(struct ice_hw *hw, struct ice_fltr_info *fi) +enum ice_status +ice_aq_set_storm_ctrl(struct ice_hw *hw, u32 bcast_thresh, u32 mcast_thresh, + u32 ctl_bitmask) { - fi->lb_en = false; - fi->lan_en = false; - if ((fi->flag & ICE_FLTR_TX) && - (fi->fltr_act == ICE_FWD_TO_VSI || - fi->fltr_act == ICE_FWD_TO_VSI_LIST || - fi->fltr_act == ICE_FWD_TO_Q || - fi->fltr_act == ICE_FWD_TO_QGRP)) { - /* Setting LB for prune actions will result in replicated - * packets to the internal switch that will be dropped. - */ - if (fi->lkup_type != ICE_SW_LKUP_VLAN) - fi->lb_en = true; + struct ice_aqc_storm_cfg *cmd; + struct ice_aq_desc desc; - /* Set lan_en to TRUE if - * 1. The switch is a VEB AND - * 2 - * 2.1 The lookup is a directional lookup like ethertype, - * promiscuous, ethertype-MAC, promiscuous-VLAN - * and default-port OR - * 2.2 The lookup is VLAN, OR - * 2.3 The lookup is MAC with mcast or bcast addr for MAC, OR - * 2.4 The lookup is MAC_VLAN with mcast or bcast addr for MAC. - * - * OR - * - * The switch is a VEPA. - * - * In all other cases, the LAN enable has to be set to false. 
- */ - if (hw->evb_veb) { - if (fi->lkup_type == ICE_SW_LKUP_ETHERTYPE || - fi->lkup_type == ICE_SW_LKUP_PROMISC || - fi->lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || - fi->lkup_type == ICE_SW_LKUP_PROMISC_VLAN || - fi->lkup_type == ICE_SW_LKUP_DFLT || - fi->lkup_type == ICE_SW_LKUP_VLAN || - (fi->lkup_type == ICE_SW_LKUP_MAC && - !is_unicast_ether_addr(fi->l_data.mac.mac_addr)) || - (fi->lkup_type == ICE_SW_LKUP_MAC_VLAN && - !is_unicast_ether_addr(fi->l_data.mac.mac_addr))) - fi->lan_en = true; - } else { - fi->lan_en = true; - } - } + cmd = &desc.params.storm_conf; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_storm_cfg); + + cmd->bcast_thresh_size = cpu_to_le32(bcast_thresh & ICE_AQ_THRESHOLD_M); + cmd->mcast_thresh_size = cpu_to_le32(mcast_thresh & ICE_AQ_THRESHOLD_M); + cmd->storm_ctrl_ctrl = cpu_to_le32(ctl_bitmask); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); } /** - * ice_fill_sw_rule - Helper function to fill switch rule structure - * @hw: pointer to the hardware structure - * @f_info: entry containing packet forwarding information - * @s_rule: switch rule structure to be filled in based on mac_entry - * @opc: switch rules population command type - pass in the command opcode + * ice_aq_get_storm_ctrl - gets storm control configuration + * @hw: pointer to the HW struct + * @bcast_thresh: represents the upper threshold for broadcast storm control + * @mcast_thresh: represents the upper threshold for multicast storm control + * @ctl_bitmask: storm control knobs + * + * Gets the storm control configuration (0x0281) */ -static void -ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info, - struct ice_aqc_sw_rules_elem *s_rule, enum ice_adminq_opc opc) +enum ice_status +ice_aq_get_storm_ctrl(struct ice_hw *hw, u32 *bcast_thresh, u32 *mcast_thresh, + u32 *ctl_bitmask) { - u16 vlan_id = ICE_MAX_VLAN_ID + 1; - void *daddr = NULL; - u16 eth_hdr_sz; - u8 *eth_hdr; - u32 act = 0; - __be16 *off; - u8 q_rgn; + enum ice_status status; + struct ice_aq_desc desc; - if (opc == ice_aqc_opc_remove_sw_rules) { - s_rule->pdata.lkup_tx_rx.act = 0; - s_rule->pdata.lkup_tx_rx.index = - cpu_to_le16(f_info->fltr_rule_id); - s_rule->pdata.lkup_tx_rx.hdr_len = 0; - return; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_storm_cfg); + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); + if (!status) { + struct ice_aqc_storm_cfg *resp = &desc.params.storm_conf; + + if (bcast_thresh) + *bcast_thresh = le32_to_cpu(resp->bcast_thresh_size) & + ICE_AQ_THRESHOLD_M; + if (mcast_thresh) + *mcast_thresh = le32_to_cpu(resp->mcast_thresh_size) & + ICE_AQ_THRESHOLD_M; + if (ctl_bitmask) + *ctl_bitmask = le32_to_cpu(resp->storm_ctrl_ctrl); } - eth_hdr_sz = sizeof(dummy_eth_header); - eth_hdr = s_rule->pdata.lkup_tx_rx.hdr; + return status; +} - /* initialize the ether header with a dummy header */ - memcpy(eth_hdr, dummy_eth_header, eth_hdr_sz); - ice_fill_sw_info(hw, f_info); +/** + * ice_aq_sw_rules - add/update/remove switch rules + * @hw: pointer to the HW struct + * @rule_list: pointer to switch rule population list + * @rule_list_sz: total size of the rule list in bytes + * @num_rules: number of switch rules in the rule_list + * @opc: switch rules population command type - pass in the command opcode + * @cd: pointer to command details structure or NULL + * + * Add(0x02a0)/Update(0x02a1)/Remove(0x02a2) switch rules commands to firmware + */ +enum ice_status __maybe_unused +ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz, + u8 num_rules, enum ice_adminq_opc 
opc, struct ice_sq_cd *cd) +{ + struct ice_aq_desc desc; + enum ice_status status; - switch (f_info->fltr_act) { - case ICE_FWD_TO_VSI: - act |= (f_info->fwd_id.hw_vsi_id << ICE_SINGLE_ACT_VSI_ID_S) & - ICE_SINGLE_ACT_VSI_ID_M; - if (f_info->lkup_type != ICE_SW_LKUP_VLAN) - act |= ICE_SINGLE_ACT_VSI_FORWARDING | - ICE_SINGLE_ACT_VALID_BIT; - break; - case ICE_FWD_TO_VSI_LIST: - act |= ICE_SINGLE_ACT_VSI_LIST; - act |= (f_info->fwd_id.vsi_list_id << - ICE_SINGLE_ACT_VSI_LIST_ID_S) & - ICE_SINGLE_ACT_VSI_LIST_ID_M; - if (f_info->lkup_type != ICE_SW_LKUP_VLAN) - act |= ICE_SINGLE_ACT_VSI_FORWARDING | - ICE_SINGLE_ACT_VALID_BIT; - break; - case ICE_FWD_TO_Q: - act |= ICE_SINGLE_ACT_TO_Q; - act |= (f_info->fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & - ICE_SINGLE_ACT_Q_INDEX_M; - break; - case ICE_DROP_PACKET: - act |= ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_DROP | - ICE_SINGLE_ACT_VALID_BIT; - break; - case ICE_FWD_TO_QGRP: - q_rgn = f_info->qgrp_size > 0 ? - (u8)ilog2(f_info->qgrp_size) : 0; - act |= ICE_SINGLE_ACT_TO_Q; - act |= (f_info->fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & - ICE_SINGLE_ACT_Q_INDEX_M; - act |= (q_rgn << ICE_SINGLE_ACT_Q_REGION_S) & - ICE_SINGLE_ACT_Q_REGION_M; - break; - default: - return; + if (opc != ice_aqc_opc_add_sw_rules && + opc != ice_aqc_opc_update_sw_rules && + opc != ice_aqc_opc_remove_sw_rules) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, opc); + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + desc.params.sw_rules.num_rules_fltr_entry_index = + cpu_to_le16(num_rules); + status = ice_aq_send_cmd(hw, &desc, rule_list, rule_list_sz, cd); + if (opc != ice_aqc_opc_add_sw_rules && + hw->adminq.sq_last_status == ICE_AQ_RC_ENOENT) + status = ICE_ERR_DOES_NOT_EXIST; + + return status; +} + +/** + * ice_aq_add_recipe - add switch recipe + * @hw: pointer to the HW struct + * @s_recipe_list: pointer to switch rule population list + * @num_recipes: number of switch recipes in the list + * @cd: pointer to command details structure or NULL + * + * Add(0x0290) + */ +enum ice_status +ice_aq_add_recipe(struct ice_hw *hw, + struct ice_aqc_recipe_data_elem *s_recipe_list, + u16 num_recipes, struct ice_sq_cd *cd) +{ + struct ice_aqc_add_get_recipe *cmd; + struct ice_aq_desc desc; + u16 buf_size; + + cmd = &desc.params.add_get_recipe; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_recipe); + + cmd->num_sub_recipes = cpu_to_le16(num_recipes); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + buf_size = num_recipes * sizeof(*s_recipe_list); + + return ice_aq_send_cmd(hw, &desc, s_recipe_list, buf_size, cd); +} + +/** + * ice_aq_get_recipe - get switch recipe + * @hw: pointer to the HW struct + * @s_recipe_list: pointer to switch rule population list + * @num_recipes: pointer to the number of recipes (input and output) + * @recipe_root: root recipe number of recipe(s) to retrieve + * @cd: pointer to command details structure or NULL + * + * Get(0x0292) + * + * On input, *num_recipes should equal the number of entries in s_recipe_list. + * On output, *num_recipes will equal the number of entries returned in + * s_recipe_list. + * + * The caller must supply enough space in s_recipe_list to hold all possible + * recipes and *num_recipes must equal ICE_MAX_NUM_RECIPES. 
+ */ +enum ice_status +ice_aq_get_recipe(struct ice_hw *hw, + struct ice_aqc_recipe_data_elem *s_recipe_list, + u16 *num_recipes, u16 recipe_root, struct ice_sq_cd *cd) +{ + struct ice_aqc_add_get_recipe *cmd; + struct ice_aq_desc desc; + enum ice_status status; + u16 buf_size; + + if (*num_recipes != ICE_MAX_NUM_RECIPES) + return ICE_ERR_PARAM; + + cmd = &desc.params.add_get_recipe; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_recipe); + + cmd->return_index = cpu_to_le16(recipe_root); + cmd->num_sub_recipes = 0; + + buf_size = *num_recipes * sizeof(*s_recipe_list); + + status = ice_aq_send_cmd(hw, &desc, s_recipe_list, buf_size, cd); + *num_recipes = le16_to_cpu(cmd->num_sub_recipes); + + return status; +} + +/** + * ice_update_recipe_lkup_idx - update a default recipe based on the lkup_idx + * @hw: pointer to the HW struct + * @params: parameters used to update the default recipe + * + * This function only supports updating default recipes and it only supports + * updating a single recipe based on the lkup_idx at a time. + * + * This is done as a read-modify-write operation. First, get the current recipe + * contents based on the recipe's ID. Then modify the field vector index and + * mask if it's valid at the lkup_idx. Finally, use the add recipe AQ to update + * the pre-existing recipe with the modifications. + */ +enum ice_status +ice_update_recipe_lkup_idx(struct ice_hw *hw, + struct ice_update_recipe_lkup_idx_params *params) +{ + struct ice_aqc_recipe_data_elem *rcp_list; + u16 num_recps = ICE_MAX_NUM_RECIPES; + enum ice_status status; + + rcp_list = devm_kzalloc(ice_hw_to_dev(hw), + num_recps * sizeof(*rcp_list), GFP_KERNEL); + if (!rcp_list) + return ICE_ERR_NO_MEMORY; + + /* read current recipe list from firmware */ + rcp_list->recipe_indx = params->rid; + status = ice_aq_get_recipe(hw, rcp_list, &num_recps, params->rid, NULL); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to get recipe %d, status %d\n", + params->rid, status); + goto error_out; } - if (f_info->lb_en) - act |= ICE_SINGLE_ACT_LB_ENABLE; - if (f_info->lan_en) - act |= ICE_SINGLE_ACT_LAN_ENABLE; + /* only modify existing recipe's lkup_idx and mask if valid, while + * leaving all other fields the same, then update the recipe firmware + */ + rcp_list->content.lkup_indx[params->lkup_idx] = params->fv_idx; + if (params->mask_valid) + rcp_list->content.mask[params->lkup_idx] = + cpu_to_le16(params->mask); - switch (f_info->lkup_type) { - case ICE_SW_LKUP_MAC: - daddr = f_info->l_data.mac.mac_addr; - break; - case ICE_SW_LKUP_VLAN: - vlan_id = f_info->l_data.vlan.vlan_id; - if (f_info->fltr_act == ICE_FWD_TO_VSI || - f_info->fltr_act == ICE_FWD_TO_VSI_LIST) { - act |= ICE_SINGLE_ACT_PRUNE; - act |= ICE_SINGLE_ACT_EGRESS | ICE_SINGLE_ACT_INGRESS; - } - break; - case ICE_SW_LKUP_ETHERTYPE_MAC: - daddr = f_info->l_data.ethertype_mac.mac_addr; - /* fall-through */ - case ICE_SW_LKUP_ETHERTYPE: - off = (__force __be16 *)(eth_hdr + ICE_ETH_ETHTYPE_OFFSET); - *off = cpu_to_be16(f_info->l_data.ethertype_mac.ethertype); - break; - case ICE_SW_LKUP_MAC_VLAN: - daddr = f_info->l_data.mac_vlan.mac_addr; - vlan_id = f_info->l_data.mac_vlan.vlan_id; - break; - case ICE_SW_LKUP_PROMISC_VLAN: - vlan_id = f_info->l_data.mac_vlan.vlan_id; - /* fall-through */ - case ICE_SW_LKUP_PROMISC: - daddr = f_info->l_data.mac_vlan.mac_addr; + if (params->ignore_valid) + rcp_list->content.lkup_indx[params->lkup_idx] |= + ICE_AQ_RECIPE_LKUP_IGNORE; + + status = ice_aq_add_recipe(hw, &rcp_list[0], 1, NULL); + if (status) + ice_debug(hw, 
ICE_DBG_SW, "Failed to update recipe %d lkup_idx %d fv_idx %d mask %d mask_valid %s, status %d\n", + params->rid, params->lkup_idx, params->fv_idx, + params->mask, params->mask_valid ? "true" : "false", + status); + +error_out: + devm_kfree(ice_hw_to_dev(hw), rcp_list); + return status; +} + +/** + * ice_aq_map_recipe_to_profile - Map recipe to packet profile + * @hw: pointer to the HW struct + * @profile_id: package profile ID to associate the recipe with + * @r_bitmap: Recipe bitmap filled in and need to be returned as response + * @cd: pointer to command details structure or NULL + * Recipe to profile association (0x0291) + */ +enum ice_status +ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, + struct ice_sq_cd *cd) +{ + struct ice_aqc_recipe_to_profile *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.recipe_to_profile; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_recipe_to_profile); + cmd->profile_id = cpu_to_le16(profile_id); + /* Set the recipe ID bit in the bitmask to let the device know which + * profile we are associating the recipe to + */ + memcpy(cmd->recipe_assoc, r_bitmap, sizeof(cmd->recipe_assoc)); + + return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); +} + +/** + * ice_aq_get_recipe_to_profile - Map recipe to packet profile + * @hw: pointer to the HW struct + * @profile_id: package profile ID to associate the recipe with + * @r_bitmap: Recipe bitmap filled in and need to be returned as response + * @cd: pointer to command details structure or NULL + * Associate profile ID with given recipe (0x0293) + */ +enum ice_status +ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, + struct ice_sq_cd *cd) +{ + struct ice_aqc_recipe_to_profile *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.recipe_to_profile; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_recipe_to_profile); + cmd->profile_id = cpu_to_le16(profile_id); + + status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); + if (!status) + memcpy(r_bitmap, cmd->recipe_assoc, sizeof(cmd->recipe_assoc)); + + return status; +} + +/** + * ice_alloc_recipe - add recipe resource + * @hw: pointer to the hardware structure + * @rid: recipe ID returned as response to AQ call + */ +enum ice_status ice_alloc_recipe(struct ice_hw *hw, u16 *rid) +{ + struct ice_aqc_alloc_free_res_elem *sw_buf; + enum ice_status status; + u16 buf_len; + + buf_len = struct_size(sw_buf, elem, 1); + sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!sw_buf) + return ICE_ERR_NO_MEMORY; + + sw_buf->num_elems = cpu_to_le16(1); + sw_buf->res_type = cpu_to_le16((ICE_AQC_RES_TYPE_RECIPE << + ICE_AQC_RES_TYPE_S) | + ICE_AQC_RES_TYPE_FLAG_SHARED); + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (!status) + *rid = le16_to_cpu(sw_buf->elem[0].e.sw_resp); + devm_kfree(ice_hw_to_dev(hw), sw_buf); + + return status; +} + +/* ice_init_port_info - Initialize port_info with switch configuration data + * @pi: pointer to port_info + * @vsi_port_num: VSI number or port number + * @type: Type of switch element (port or VSI) + * @swid: switch ID of the switch the element is attached to + * @pf_vf_num: PF or VF number + * @is_vf: true if the element is a VF, false otherwise + */ +static void +ice_init_port_info(struct ice_port_info *pi, u16 vsi_port_num, u8 type, + u16 swid, u16 pf_vf_num, bool is_vf) +{ + switch (type) { + case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT: + pi->lport = (u8)(vsi_port_num & ICE_LPORT_MASK); + 
pi->sw_id = swid; + pi->pf_vf_num = pf_vf_num; + pi->is_vf = is_vf; + pi->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL; + pi->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL; break; default: + ice_debug(pi->hw, ICE_DBG_SW, "incorrect VSI/port type received\n"); break; } +} - s_rule->type = (f_info->flag & ICE_FLTR_RX) ? - cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX) : - cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); +/* ice_get_initial_sw_cfg - Get initial port and default VSI data + * @hw: pointer to the hardware structure + */ +enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw) +{ + struct ice_aqc_get_sw_cfg_resp_elem *rbuf; + enum ice_status status; + u8 num_total_ports; + u16 req_desc = 0; + u16 num_elems; + u8 j = 0; + u16 i; - /* Recipe set depending on lookup type */ - s_rule->pdata.lkup_tx_rx.recipe_id = cpu_to_le16(f_info->lkup_type); - s_rule->pdata.lkup_tx_rx.src = cpu_to_le16(f_info->src); - s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act); + num_total_ports = 1; - if (daddr) - ether_addr_copy(eth_hdr + ICE_ETH_DA_OFFSET, daddr); + rbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_SW_CFG_MAX_BUF_LEN, + GFP_KERNEL); - if (!(vlan_id > ICE_MAX_VLAN_ID)) { - off = (__force __be16 *)(eth_hdr + ICE_ETH_VLAN_TCI_OFFSET); - *off = cpu_to_be16(vlan_id); - } + if (!rbuf) + return ICE_ERR_NO_MEMORY; + + /* Multiple calls to ice_aq_get_sw_cfg may be required + * to get all the switch configuration information. The need + * for additional calls is indicated by ice_aq_get_sw_cfg + * writing a non-zero value in req_desc + */ + do { + struct ice_aqc_get_sw_cfg_resp_elem *ele; + + status = ice_aq_get_sw_cfg(hw, rbuf, ICE_SW_CFG_MAX_BUF_LEN, + &req_desc, &num_elems, NULL); + + if (status) + break; + + for (i = 0, ele = rbuf; i < num_elems; i++, ele++) { + u16 pf_vf_num, swid, vsi_port_num; + bool is_vf = false; + u8 res_type; + + vsi_port_num = le16_to_cpu(ele->vsi_port_num) & + ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M; + + pf_vf_num = le16_to_cpu(ele->pf_vf_num) & + ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_M; + + swid = le16_to_cpu(ele->swid); + + if (le16_to_cpu(ele->pf_vf_num) & + ICE_AQC_GET_SW_CONF_RESP_IS_VF) + is_vf = true; + + res_type = (u8)(le16_to_cpu(ele->vsi_port_num) >> + ICE_AQC_GET_SW_CONF_RESP_TYPE_S); + + switch (res_type) { + case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT: + case ICE_AQC_GET_SW_CONF_RESP_VIRT_PORT: + if (j == num_total_ports) { + ice_debug(hw, ICE_DBG_SW, "more ports than expected\n"); + status = ICE_ERR_CFG; + goto out; + } + ice_init_port_info(hw->port_info, + vsi_port_num, res_type, swid, + pf_vf_num, is_vf); + j++; + break; + default: + break; + } + } + } while (req_desc && !status); + + +out: + devm_kfree(ice_hw_to_dev(hw), rbuf); + return status; +} + +/** + * ice_dump_all_sw_rules + * @hw: pointer to the hardware structure + * @lkup: switch rule filter lookup type + * @recp_list: pointer to recipes + * + * Helper function to print filter information of all entries in the list for a + * given lookup type + */ +static void +ice_dump_all_sw_rules(struct ice_hw *hw, enum ice_sw_lkup_type lkup, + struct ice_sw_recipe *recp_list) +{ + struct ice_fltr_mgmt_list_entry *fm_entry; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + struct ice_fltr_info *fi; + + rule_lock = &recp_list->filt_rule_lock; + rule_head = &recp_list->filt_rules; + + switch (lkup) { + case ICE_SW_LKUP_MAC: + /* dump MAC hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump MAC hash list of lookup type %d\n", lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, 
list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tmac: %pM, vsi_count = %d, fw_act_flag = %d, lb_en = %d, lan_en = %d, filt_act = %d, filt_rule_id = %d\n", + fi->l_data.mac.mac_addr, fm_entry->vsi_count, + fi->flag, fi->lb_en, fi->lan_en, + fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_VLAN: + /* dump VLAN hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump VLAN hash list of lookup type %d\n", lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tvlan_id = %d, vlan_tpid = 0x%04x, vsi_count = %d, vsi_list_id = %d, fw_act_flag = %d, filt_act = %d, lb_en = %d, lan_en = %d, filt_rule_id = %d\n", + fi->l_data.vlan.vlan_id, + fi->l_data.vlan.tpid_valid ? fi->l_data.vlan.tpid : ETH_P_8021Q, + fm_entry->vsi_count, fi->fwd_id.vsi_list_id, + fi->flag, fi->lb_en, fi->lan_en, + fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_MAC_VLAN: + /* dump MAC VLAN hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump MAC VLAN hash list of lookup type %d\n", + lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tmac: %pM, vlan_id = %d, vsi_count = %d, fw_act_flag = %d, lb_en = %d, lan_en = %d, filt_act = %d, filt_rule_id = %d\n", + fi->l_data.mac_vlan.mac_addr, + fi->l_data.mac_vlan.vlan_id, + fm_entry->vsi_count, fi->flag, fi->lb_en, + fi->lan_en, fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_ETHERTYPE: + /* dump Ethertype/Ethertype MAC hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump Ethertype hash list of lookup type %d\n", + lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tethertype = %d, vsi_count = %d, fw_act_flag = %d, filt_act = %d, lb_en = %d, lan_en = %d, filt_rule_id = %d\n", + fi->l_data.ethertype_mac.ethertype, + fm_entry->vsi_count, fi->flag, fi->fltr_act, + fi->lb_en, fi->lan_en, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_ETHERTYPE_MAC: + /* dump Ethertype/Ethertype MAC hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump Ethertype MAC hash list of lookup type %d\n", + lkup); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tmac: %pM, ethertype = %d, vsi_count = %d, fw_act_flag = %d, filt_act = %d, lb_en = %d, lan_en = %d, filt_rule_id = %d\n", + fi->l_data.ethertype_mac.mac_addr, + fi->l_data.ethertype_mac.ethertype, + fm_entry->vsi_count, fi->flag, fi->lb_en, + fi->lan_en, fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_PROMISC: + /* dump Promisc mode hash list */ + dev_info(ice_hw_to_dev(hw), + "\tDump Promisc/Promisc VLAN mode hash list of lookup type %d\n", + lkup); + dev_info(ice_hw_to_dev(hw), + "\tNote: Ignore VLAN in case of Promisc only lookup type & ignore MAC in case of Promisc VLAN lookup type\n"); + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + fi = &fm_entry->fltr_info; + dev_info(ice_hw_to_dev(hw), + "\tmac: %pM, vlan_id = %d, vsi_count = %d, fw_act_flag = %d, lb_en = %d, lan_en = %d, filt_act = %d, filt_rule_id = %d\n", + fi->l_data.mac_vlan.mac_addr, + fi->l_data.mac_vlan.vlan_id, + fm_entry->vsi_count, 
fi->flag, fi->lb_en, + fi->lan_en, fi->fltr_act, fi->fltr_rule_id); + } + mutex_unlock(rule_lock); + break; + case ICE_SW_LKUP_DFLT: + /* dump default VSI filter rule */ + if (hw->port_info->dflt_tx_vsi_num != ICE_DFLT_VSI_INVAL) + dev_info(ice_hw_to_dev(hw), + "\tDefault VSI filter (lookup type %d): tx_vsi_id = %d, filt_act = %d, tx_filt_rule_id = %d\n", + lkup, hw->port_info->dflt_tx_vsi_num, + ICE_FWD_TO_VSI, + hw->port_info->dflt_tx_vsi_rule_id); + + if (hw->port_info->dflt_rx_vsi_num != ICE_DFLT_VSI_INVAL) + dev_info(ice_hw_to_dev(hw), + "\tDefault VSI filter (lookup type %d): rx_vsi_id = %d, filt_act = %d, rx_filt_rule_id = %d\n", + lkup, hw->port_info->dflt_rx_vsi_num, + ICE_FWD_TO_VSI, + hw->port_info->dflt_rx_vsi_rule_id); + break; + case ICE_SW_LKUP_PROMISC_VLAN: + case ICE_SW_LKUP_LAST: + dev_info(ice_hw_to_dev(hw), + "\tDump for this lookup type hasn't been implemented yet\n"); + break; + } +} + + +/** + * ice_fill_sw_info - Helper function to populate lb_en and lan_en + * @hw: pointer to the hardware structure + * @fi: filter info structure to fill/update + * + * This helper function populates the lb_en and lan_en elements of the provided + * ice_fltr_info struct using the switch's type and characteristics of the + * switch rule being configured. + */ +static void ice_fill_sw_info(struct ice_hw *hw, struct ice_fltr_info *fi) +{ + fi->lb_en = false; + fi->lan_en = false; + if ((fi->flag & ICE_FLTR_TX) && + (fi->fltr_act == ICE_FWD_TO_VSI || + fi->fltr_act == ICE_FWD_TO_VSI_LIST || + fi->fltr_act == ICE_FWD_TO_Q || + fi->fltr_act == ICE_FWD_TO_QGRP)) { + /* Setting LB for prune actions will result in replicated + * packets to the internal switch that will be dropped. + */ + if (fi->lkup_type != ICE_SW_LKUP_VLAN) + fi->lb_en = true; + + /* Set lan_en to TRUE if + * 1. The switch is a VEB AND + * 2 + * 2.1 The lookup is a directional lookup like ethertype, + * promiscuous, ethertype-MAC, promiscuous-VLAN + * and default-port OR + * 2.2 The lookup is VLAN, OR + * 2.3 The lookup is MAC with mcast or bcast addr for MAC, OR + * 2.4 The lookup is MAC_VLAN with mcast or bcast addr for MAC. + * + * OR + * + * The switch is a VEPA. + * + * In all other cases, the LAN enable has to be set to false. 
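+ *
+ * As an illustration (hypothetical cases derived from the checks
+ * below, assuming a VEB): a Tx MAC rule for a unicast address ends up
+ * with lb_en = true and lan_en = false, a Tx MAC rule for a broadcast
+ * or multicast address gets lb_en = true and lan_en = true, and a Tx
+ * VLAN rule gets lan_en = true while lb_en stays false, since LB on a
+ * prune action would only create replicated packets that get dropped.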
+ */ + if (hw->evb_veb) { + if (fi->lkup_type == ICE_SW_LKUP_ETHERTYPE || + fi->lkup_type == ICE_SW_LKUP_PROMISC || + fi->lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || + fi->lkup_type == ICE_SW_LKUP_PROMISC_VLAN || + fi->lkup_type == ICE_SW_LKUP_DFLT || + fi->lkup_type == ICE_SW_LKUP_VLAN || + (fi->lkup_type == ICE_SW_LKUP_MAC && + !is_unicast_ether_addr(fi->l_data.mac.mac_addr)) || + (fi->lkup_type == ICE_SW_LKUP_MAC_VLAN && + !is_unicast_ether_addr(fi->l_data.mac.mac_addr))) + fi->lan_en = true; + } else { + fi->lan_en = true; + } + } +} + +/** + * ice_fill_sw_rule - Helper function to fill switch rule structure + * @hw: pointer to the hardware structure + * @f_info: entry containing packet forwarding information + * @s_rule: switch rule structure to be filled in based on mac_entry + * @opc: switch rules population command type - pass in the command opcode + */ +static void +ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info, + struct ice_aqc_sw_rules_elem *s_rule, enum ice_adminq_opc opc) +{ + u16 vlan_id = ICE_MAX_VLAN_ID + 1; + u16 vlan_tpid = ETH_P_8021Q; + void *daddr = NULL; + u16 eth_hdr_sz; + u8 *eth_hdr; + u32 act = 0; + __be16 *off; + u8 q_rgn; + + if (opc == ice_aqc_opc_remove_sw_rules) { + s_rule->pdata.lkup_tx_rx.act = 0; + s_rule->pdata.lkup_tx_rx.index = + cpu_to_le16(f_info->fltr_rule_id); + s_rule->pdata.lkup_tx_rx.hdr_len = 0; + return; + } + + eth_hdr_sz = sizeof(dummy_eth_header); + eth_hdr = s_rule->pdata.lkup_tx_rx.hdr; + + /* initialize the ether header with a dummy header */ + memcpy(eth_hdr, dummy_eth_header, eth_hdr_sz); + ice_fill_sw_info(hw, f_info); + + switch (f_info->fltr_act) { + case ICE_FWD_TO_VSI: + act |= (f_info->fwd_id.hw_vsi_id << ICE_SINGLE_ACT_VSI_ID_S) & + ICE_SINGLE_ACT_VSI_ID_M; + if (f_info->lkup_type != ICE_SW_LKUP_VLAN) + act |= ICE_SINGLE_ACT_VSI_FORWARDING | + ICE_SINGLE_ACT_VALID_BIT; + break; + case ICE_FWD_TO_VSI_LIST: + act |= ICE_SINGLE_ACT_VSI_LIST; + act |= (f_info->fwd_id.vsi_list_id << + ICE_SINGLE_ACT_VSI_LIST_ID_S) & + ICE_SINGLE_ACT_VSI_LIST_ID_M; + if (f_info->lkup_type != ICE_SW_LKUP_VLAN) + act |= ICE_SINGLE_ACT_VSI_FORWARDING | + ICE_SINGLE_ACT_VALID_BIT; + break; + case ICE_FWD_TO_Q: + act |= ICE_SINGLE_ACT_TO_Q; + act |= (f_info->fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & + ICE_SINGLE_ACT_Q_INDEX_M; + break; + case ICE_DROP_PACKET: + act |= ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_DROP | + ICE_SINGLE_ACT_VALID_BIT; + break; + case ICE_FWD_TO_QGRP: + q_rgn = f_info->qgrp_size > 0 ? 
+ (u8)ilog2(f_info->qgrp_size) : 0; + act |= ICE_SINGLE_ACT_TO_Q; + act |= (f_info->fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & + ICE_SINGLE_ACT_Q_INDEX_M; + act |= (q_rgn << ICE_SINGLE_ACT_Q_REGION_S) & + ICE_SINGLE_ACT_Q_REGION_M; + break; + default: + return; + } + + if (f_info->lb_en) + act |= ICE_SINGLE_ACT_LB_ENABLE; + if (f_info->lan_en) + act |= ICE_SINGLE_ACT_LAN_ENABLE; + + switch (f_info->lkup_type) { + case ICE_SW_LKUP_MAC: + daddr = f_info->l_data.mac.mac_addr; + break; + case ICE_SW_LKUP_VLAN: + vlan_id = f_info->l_data.vlan.vlan_id; + if (f_info->l_data.vlan.tpid_valid) + vlan_tpid = f_info->l_data.vlan.tpid; + if (f_info->fltr_act == ICE_FWD_TO_VSI || + f_info->fltr_act == ICE_FWD_TO_VSI_LIST) { + act |= ICE_SINGLE_ACT_PRUNE; + act |= ICE_SINGLE_ACT_EGRESS | ICE_SINGLE_ACT_INGRESS; + } + break; + case ICE_SW_LKUP_ETHERTYPE_MAC: + daddr = f_info->l_data.ethertype_mac.mac_addr; + /* fall-through */ + case ICE_SW_LKUP_ETHERTYPE: + off = (__force __be16 *)(eth_hdr + ICE_ETH_ETHTYPE_OFFSET); + *off = cpu_to_be16(f_info->l_data.ethertype_mac.ethertype); + break; + case ICE_SW_LKUP_MAC_VLAN: + daddr = f_info->l_data.mac_vlan.mac_addr; + vlan_id = f_info->l_data.mac_vlan.vlan_id; + break; + case ICE_SW_LKUP_PROMISC_VLAN: + vlan_id = f_info->l_data.mac_vlan.vlan_id; + /* fall-through */ + case ICE_SW_LKUP_PROMISC: + daddr = f_info->l_data.mac_vlan.mac_addr; + break; + default: + break; + } + + s_rule->type = (f_info->flag & ICE_FLTR_RX) ? + cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX) : + cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); + + /* Recipe set depending on lookup type */ + s_rule->pdata.lkup_tx_rx.recipe_id = cpu_to_le16(f_info->lkup_type); + s_rule->pdata.lkup_tx_rx.src = cpu_to_le16(f_info->src); + s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act); + + if (daddr) + ether_addr_copy(eth_hdr + ICE_ETH_DA_OFFSET, daddr); + + if (!(vlan_id > ICE_MAX_VLAN_ID)) { + off = (__force __be16 *)(eth_hdr + ICE_ETH_VLAN_TCI_OFFSET); + *off = cpu_to_be16(vlan_id); + off = (__force __be16 *)(eth_hdr + ICE_ETH_ETHTYPE_OFFSET); + *off = cpu_to_be16(vlan_tpid); + } + + /* Create the switch rule with the final dummy Ethernet header */ + if (opc != ice_aqc_opc_update_sw_rules) + s_rule->pdata.lkup_tx_rx.hdr_len = cpu_to_le16(eth_hdr_sz); +} + + +/** + * ice_dump_sw_rules - Function to dump sw rules + * @hw: pointer to the hardware structure + * @lkup: rules type to be dumped + */ +void ice_dump_sw_rules(struct ice_hw *hw, enum ice_sw_lkup_type lkup) +{ + ice_dump_all_sw_rules(hw, lkup, &hw->switch_info->recp_list[lkup]); +} + +/** + * ice_add_marker_act + * @hw: pointer to the hardware structure + * @m_ent: the management entry for which sw marker needs to be added + * @sw_marker: sw marker to tag the Rx descriptor with + * @l_id: large action resource ID + * + * Create a large action to hold software marker and update the switch rule + * entry pointed by m_ent with newly created large action + */ +static enum ice_status +ice_add_marker_act(struct ice_hw *hw, struct ice_fltr_mgmt_list_entry *m_ent, + u16 sw_marker, u16 l_id) +{ + struct ice_aqc_sw_rules_elem *lg_act, *rx_tx; + /* For software marker we need 3 large actions + * 1. FWD action: FWD TO VSI or VSI LIST + * 2. GENERIC VALUE action to hold the profile ID + * 3. 
GENERIC VALUE action to hold the software marker ID + */ + const u16 num_lg_acts = 3; + enum ice_status status; + u16 lg_act_size; + u16 rules_size; + u32 act; + u16 id; + + if (m_ent->fltr_info.lkup_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; + + /* Create two back-to-back switch rules and submit them to the HW using + * one memory buffer: + * 1. Large Action + * 2. Look up Tx Rx + */ + lg_act_size = (u16)ICE_SW_RULE_LG_ACT_SIZE(num_lg_acts); + rules_size = lg_act_size + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; + lg_act = devm_kzalloc(ice_hw_to_dev(hw), rules_size, GFP_KERNEL); + if (!lg_act) + return ICE_ERR_NO_MEMORY; + + rx_tx = (struct ice_aqc_sw_rules_elem *)((u8 *)lg_act + lg_act_size); + + /* Fill in the first switch rule i.e. large action */ + lg_act->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LG_ACT); + lg_act->pdata.lg_act.index = cpu_to_le16(l_id); + lg_act->pdata.lg_act.size = cpu_to_le16(num_lg_acts); + + /* First action VSI forwarding or VSI list forwarding depending on how + * many VSIs + */ + id = (m_ent->vsi_count > 1) ? m_ent->fltr_info.fwd_id.vsi_list_id : + m_ent->fltr_info.fwd_id.hw_vsi_id; + + act = ICE_LG_ACT_VSI_FORWARDING | ICE_LG_ACT_VALID_BIT; + act |= (id << ICE_LG_ACT_VSI_LIST_ID_S) & ICE_LG_ACT_VSI_LIST_ID_M; + if (m_ent->vsi_count > 1) + act |= ICE_LG_ACT_VSI_LIST; + lg_act->pdata.lg_act.act[0] = cpu_to_le32(act); + + /* Second action descriptor type */ + act = ICE_LG_ACT_GENERIC; + + act |= (1 << ICE_LG_ACT_GENERIC_VALUE_S) & ICE_LG_ACT_GENERIC_VALUE_M; + lg_act->pdata.lg_act.act[1] = cpu_to_le32(act); + + act = (ICE_LG_ACT_GENERIC_OFF_RX_DESC_PROF_IDX << + ICE_LG_ACT_GENERIC_OFFSET_S) & ICE_LG_ACT_GENERIC_OFFSET_M; + + /* Third action Marker value */ + act |= ICE_LG_ACT_GENERIC; + act |= (sw_marker << ICE_LG_ACT_GENERIC_VALUE_S) & + ICE_LG_ACT_GENERIC_VALUE_M; + + lg_act->pdata.lg_act.act[2] = cpu_to_le32(act); + + /* call the fill switch rule to fill the lookup Tx Rx structure */ + ice_fill_sw_rule(hw, &m_ent->fltr_info, rx_tx, + ice_aqc_opc_update_sw_rules); + + /* Update the action to point to the large action ID */ + rx_tx->pdata.lkup_tx_rx.act = + cpu_to_le32(ICE_SINGLE_ACT_PTR | + ((l_id << ICE_SINGLE_ACT_PTR_VAL_S) & + ICE_SINGLE_ACT_PTR_VAL_M)); + + /* Use the filter rule ID of the previously created rule with single + * act. 
Once the update happens, hardware will treat this as large + * action + */ + rx_tx->pdata.lkup_tx_rx.index = + cpu_to_le16(m_ent->fltr_info.fltr_rule_id); + + status = ice_aq_sw_rules(hw, lg_act, rules_size, 2, + ice_aqc_opc_update_sw_rules, NULL); + if (!status) { + m_ent->lg_act_idx = l_id; + m_ent->sw_marker_id = sw_marker; + } + + devm_kfree(ice_hw_to_dev(hw), lg_act); + return status; +} + +/** + * ice_add_counter_act - add/update filter rule with counter action + * @hw: pointer to the hardware structure + * @m_ent: the management entry for which counter needs to be added + * @counter_id: VLAN counter ID returned as part of allocate resource + * @l_id: large action resource ID + */ +static enum ice_status +ice_add_counter_act(struct ice_hw *hw, struct ice_fltr_mgmt_list_entry *m_ent, + u16 counter_id, u16 l_id) +{ + struct ice_aqc_sw_rules_elem *lg_act; + struct ice_aqc_sw_rules_elem *rx_tx; + enum ice_status status; + /* 2 actions will be added while adding a large action counter */ + const int num_acts = 2; + u16 lg_act_size; + u16 rules_size; + u16 f_rule_id; + u32 act; + u16 id; + + if (m_ent->fltr_info.lkup_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; + + /* Create two back-to-back switch rules and submit them to the HW using + * one memory buffer: + * 1. Large Action + * 2. Look up Tx Rx + */ + lg_act_size = (u16)ICE_SW_RULE_LG_ACT_SIZE(num_acts); + rules_size = lg_act_size + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; + lg_act = devm_kzalloc(ice_hw_to_dev(hw), rules_size, GFP_KERNEL); + if (!lg_act) + return ICE_ERR_NO_MEMORY; + + rx_tx = (struct ice_aqc_sw_rules_elem *)((u8 *)lg_act + lg_act_size); + + /* Fill in the first switch rule i.e. large action */ + lg_act->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LG_ACT); + lg_act->pdata.lg_act.index = cpu_to_le16(l_id); + lg_act->pdata.lg_act.size = cpu_to_le16(num_acts); + + /* First action VSI forwarding or VSI list forwarding depending on how + * many VSIs + */ + id = (m_ent->vsi_count > 1) ? m_ent->fltr_info.fwd_id.vsi_list_id : + m_ent->fltr_info.fwd_id.hw_vsi_id; + + act = ICE_LG_ACT_VSI_FORWARDING | ICE_LG_ACT_VALID_BIT; + act |= (id << ICE_LG_ACT_VSI_LIST_ID_S) & + ICE_LG_ACT_VSI_LIST_ID_M; + if (m_ent->vsi_count > 1) + act |= ICE_LG_ACT_VSI_LIST; + lg_act->pdata.lg_act.act[0] = cpu_to_le32(act); + + /* Second action counter ID */ + act = ICE_LG_ACT_STAT_COUNT; + act |= (counter_id << ICE_LG_ACT_STAT_COUNT_S) & + ICE_LG_ACT_STAT_COUNT_M; + lg_act->pdata.lg_act.act[1] = cpu_to_le32(act); + + /* call the fill switch rule to fill the lookup Tx Rx structure */ + ice_fill_sw_rule(hw, &m_ent->fltr_info, rx_tx, + ice_aqc_opc_update_sw_rules); + + act = ICE_SINGLE_ACT_PTR; + act |= (l_id << ICE_SINGLE_ACT_PTR_VAL_S) & ICE_SINGLE_ACT_PTR_VAL_M; + rx_tx->pdata.lkup_tx_rx.act = cpu_to_le32(act); + + /* Use the filter rule ID of the previously created rule with single + * act. 
Once the update happens, hardware will treat this as large + * action + */ + f_rule_id = m_ent->fltr_info.fltr_rule_id; + rx_tx->pdata.lkup_tx_rx.index = cpu_to_le16(f_rule_id); + + status = ice_aq_sw_rules(hw, lg_act, rules_size, 2, + ice_aqc_opc_update_sw_rules, NULL); + if (!status) { + m_ent->lg_act_idx = l_id; + m_ent->counter_index = counter_id; + } + + devm_kfree(ice_hw_to_dev(hw), lg_act); + return status; +} + +/** + * ice_create_vsi_list_map + * @hw: pointer to the hardware structure + * @vsi_handle_arr: array of VSI handles to set in the VSI mapping + * @num_vsi: number of VSI handles in the array + * @vsi_list_id: VSI list ID generated as part of allocate resource + * + * Helper function to create a new entry of VSI list ID to VSI mapping + * using the given VSI list ID + */ +static struct ice_vsi_list_map_info * +ice_create_vsi_list_map(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, + u16 vsi_list_id) +{ + struct ice_switch_info *sw = hw->switch_info; + struct ice_vsi_list_map_info *v_map; + int i; + + v_map = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*v_map), GFP_KERNEL); + if (!v_map) + return NULL; + + v_map->vsi_list_id = vsi_list_id; + v_map->ref_cnt = 1; + for (i = 0; i < num_vsi; i++) + set_bit(vsi_handle_arr[i], v_map->vsi_map); + + list_add(&v_map->list_entry, &sw->vsi_list_map_head); + return v_map; +} + +/** + * ice_update_vsi_list_rule + * @hw: pointer to the hardware structure + * @vsi_handle_arr: array of VSI handles to form a VSI list + * @num_vsi: number of VSI handles in the array + * @vsi_list_id: VSI list ID generated as part of allocate resource + * @remove: Boolean value to indicate if this is a remove action + * @opc: switch rules population command type - pass in the command opcode + * @lkup_type: lookup type of the filter + * + * Call AQ command to add a new switch rule or update existing switch rule + * using the given VSI list ID + */ +static enum ice_status +ice_update_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, + u16 vsi_list_id, bool remove, enum ice_adminq_opc opc, + enum ice_sw_lkup_type lkup_type) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + u16 s_rule_size; + u16 rule_type; + int i; + + if (!num_vsi) + return ICE_ERR_PARAM; + + if (lkup_type == ICE_SW_LKUP_MAC || + lkup_type == ICE_SW_LKUP_MAC_VLAN || + lkup_type == ICE_SW_LKUP_ETHERTYPE || + lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || + lkup_type == ICE_SW_LKUP_PROMISC || + lkup_type == ICE_SW_LKUP_PROMISC_VLAN || + lkup_type == ICE_SW_LKUP_LAST) + rule_type = remove ? ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR : + ICE_AQC_SW_RULES_T_VSI_LIST_SET; + else if (lkup_type == ICE_SW_LKUP_VLAN) + rule_type = remove ? 
ICE_AQC_SW_RULES_T_PRUNE_LIST_CLEAR : + ICE_AQC_SW_RULES_T_PRUNE_LIST_SET; + else + return ICE_ERR_PARAM; + + s_rule_size = (u16)ICE_SW_RULE_VSI_LIST_SIZE(num_vsi); + s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + for (i = 0; i < num_vsi; i++) { + if (!ice_is_vsi_valid(hw, vsi_handle_arr[i])) { + status = ICE_ERR_PARAM; + goto exit; + } + /* AQ call requires hw_vsi_id(s) */ + s_rule->pdata.vsi_list.vsi[i] = + cpu_to_le16(ice_get_hw_vsi_num(hw, vsi_handle_arr[i])); + } + + s_rule->type = cpu_to_le16(rule_type); + s_rule->pdata.vsi_list.number_vsi = cpu_to_le16(num_vsi); + s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_id); + + status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opc, NULL); + +exit: + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + +/** + * ice_create_vsi_list_rule - Creates and populates a VSI list rule + * @hw: pointer to the HW struct + * @vsi_handle_arr: array of VSI handles to form a VSI list + * @num_vsi: number of VSI handles in the array + * @vsi_list_id: stores the ID of the VSI list to be created + * @lkup_type: switch rule filter's lookup type + */ +static enum ice_status +ice_create_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, + u16 *vsi_list_id, enum ice_sw_lkup_type lkup_type) +{ + enum ice_status status; + + status = ice_aq_alloc_free_vsi_list(hw, vsi_list_id, lkup_type, + ice_aqc_opc_alloc_res); + if (status) + return status; + + /* Update the newly created VSI list to include the specified VSIs */ + return ice_update_vsi_list_rule(hw, vsi_handle_arr, num_vsi, + *vsi_list_id, false, + ice_aqc_opc_add_sw_rules, lkup_type); +} + +/** + * ice_create_pkt_fwd_rule + * @hw: pointer to the hardware structure + * @recp_list: corresponding filter management list + * @f_entry: entry containing packet forwarding information + * + * Create switch rule with given filter information and add an entry + * to the corresponding filter management list to track this switch rule + * and VSI mapping + */ +static enum ice_status +ice_create_pkt_fwd_rule(struct ice_hw *hw, struct ice_sw_recipe *recp_list, + struct ice_fltr_list_entry *f_entry) +{ + struct ice_fltr_mgmt_list_entry *fm_entry; + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + + s_rule = devm_kzalloc(ice_hw_to_dev(hw), + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + fm_entry = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*fm_entry), + GFP_KERNEL); + if (!fm_entry) { + status = ICE_ERR_NO_MEMORY; + goto ice_create_pkt_fwd_rule_exit; + } + + fm_entry->fltr_info = f_entry->fltr_info; + + /* Initialize all the fields for the management entry */ + fm_entry->vsi_count = 1; + fm_entry->lg_act_idx = ICE_INVAL_LG_ACT_INDEX; + fm_entry->sw_marker_id = ICE_INVAL_SW_MARKER_ID; + fm_entry->counter_index = ICE_INVAL_COUNTER_ID; + + ice_fill_sw_rule(hw, &fm_entry->fltr_info, s_rule, + ice_aqc_opc_add_sw_rules); + + status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1, + ice_aqc_opc_add_sw_rules, NULL); + if (status) { + devm_kfree(ice_hw_to_dev(hw), fm_entry); + goto ice_create_pkt_fwd_rule_exit; + } + + f_entry->fltr_info.fltr_rule_id = + le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + fm_entry->fltr_info.fltr_rule_id = + le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + + /* The book keeping entries will get removed when base driver + * calls remove filter AQ command + */ + list_add(&fm_entry->list_entry, &recp_list->filt_rules); + 
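+	/* Illustrative note (hypothetical value): if firmware returned
+	 * index 0x12 above, both f_entry and fm_entry now carry
+	 * fltr_rule_id = 0x12, which is the handle a later
+	 * ice_remove_rule_internal() or ice_update_pkt_fwd_rule() call
+	 * uses to address this rule in hardware.
+	 */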
+ice_create_pkt_fwd_rule_exit: + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + +/** + * ice_update_pkt_fwd_rule + * @hw: pointer to the hardware structure + * @f_info: filter information for switch rule + * + * Call AQ command to update a previously created switch rule with a + * VSI list ID + */ +static enum ice_status +ice_update_pkt_fwd_rule(struct ice_hw *hw, struct ice_fltr_info *f_info) +{ + struct ice_aqc_sw_rules_elem *s_rule; + enum ice_status status; + + s_rule = devm_kzalloc(ice_hw_to_dev(hw), + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + + ice_fill_sw_rule(hw, f_info, s_rule, ice_aqc_opc_update_sw_rules); + + s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(f_info->fltr_rule_id); + + /* Update switch rule with new rule set to forward VSI list */ + status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1, + ice_aqc_opc_update_sw_rules, NULL); + + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + +/** + * ice_update_sw_rule_bridge_mode + * @hw: pointer to the HW struct + * + * Updates unicast switch filter rules based on VEB/VEPA mode + */ +enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw) +{ + struct ice_switch_info *sw = hw->switch_info; + struct ice_fltr_mgmt_list_entry *fm_entry; + enum ice_status status = 0; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + + rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; + rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules; + + mutex_lock(rule_lock); + list_for_each_entry(fm_entry, rule_head, list_entry) { + struct ice_fltr_info *fi = &fm_entry->fltr_info; + u8 *addr = fi->l_data.mac.mac_addr; + + /* Update unicast Tx rules to reflect the selected + * VEB/VEPA mode + */ + if ((fi->flag & ICE_FLTR_TX) && is_unicast_ether_addr(addr) && + (fi->fltr_act == ICE_FWD_TO_VSI || + fi->fltr_act == ICE_FWD_TO_VSI_LIST || + fi->fltr_act == ICE_FWD_TO_Q || + fi->fltr_act == ICE_FWD_TO_QGRP)) { + status = ice_update_pkt_fwd_rule(hw, fi); + if (status) + break; + } + } + + mutex_unlock(rule_lock); + + return status; +} + +/** + * ice_add_update_vsi_list + * @hw: pointer to the hardware structure + * @m_entry: pointer to current filter management list entry + * @cur_fltr: filter information from the book keeping entry + * @new_fltr: filter information with the new VSI to be added + * + * Call AQ command to add or update previously created VSI list with new VSI. + * + * Helper function to do book keeping associated with adding filter information + * The algorithm to do the book keeping is described below : + * When a VSI needs to subscribe to a given filter (MAC/VLAN/Ethtype etc.) 
+ * if only one VSI has been added till now + * Allocate a new VSI list and add two VSIs + * to this list using switch rule command + * Update the previously created switch rule with the + * newly created VSI list ID + * if a VSI list was previously created + * Add the new VSI to the previously created VSI list set + * using the update switch rule command + */ +static enum ice_status +ice_add_update_vsi_list(struct ice_hw *hw, + struct ice_fltr_mgmt_list_entry *m_entry, + struct ice_fltr_info *cur_fltr, + struct ice_fltr_info *new_fltr) +{ + enum ice_status status = 0; + u16 vsi_list_id = 0; + + if ((cur_fltr->fltr_act == ICE_FWD_TO_Q || + cur_fltr->fltr_act == ICE_FWD_TO_QGRP)) + return ICE_ERR_NOT_IMPL; + + if ((new_fltr->fltr_act == ICE_FWD_TO_Q || + new_fltr->fltr_act == ICE_FWD_TO_QGRP) && + (cur_fltr->fltr_act == ICE_FWD_TO_VSI || + cur_fltr->fltr_act == ICE_FWD_TO_VSI_LIST)) + return ICE_ERR_NOT_IMPL; + + if (m_entry->vsi_count < 2 && !m_entry->vsi_list_info) { + /* Only one entry existed in the mapping and it was not already + * a part of a VSI list. So, create a VSI list with the old and + * new VSIs. + */ + struct ice_fltr_info tmp_fltr; + u16 vsi_handle_arr[2]; + + /* A rule already exists with the new VSI being added */ + if (cur_fltr->fwd_id.hw_vsi_id == new_fltr->fwd_id.hw_vsi_id) + return ICE_ERR_ALREADY_EXISTS; + + vsi_handle_arr[0] = cur_fltr->vsi_handle; + vsi_handle_arr[1] = new_fltr->vsi_handle; + status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, + &vsi_list_id, + new_fltr->lkup_type); + if (status) + return status; + + tmp_fltr = *new_fltr; + tmp_fltr.fltr_rule_id = cur_fltr->fltr_rule_id; + tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; + tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; + /* Update the previous switch rule of "MAC forward to VSI" to + * "MAC fwd to VSI list" + */ + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); + if (status) + return status; + + cur_fltr->fwd_id.vsi_list_id = vsi_list_id; + cur_fltr->fltr_act = ICE_FWD_TO_VSI_LIST; + m_entry->vsi_list_info = + ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, + vsi_list_id); + + if (!m_entry->vsi_list_info) + return ICE_ERR_NO_MEMORY; + + /* If this entry was large action then the large action needs + * to be updated to point to FWD to VSI list + */ + if (m_entry->sw_marker_id != ICE_INVAL_SW_MARKER_ID) + status = + ice_add_marker_act(hw, m_entry, + m_entry->sw_marker_id, + m_entry->lg_act_idx); + } else { + u16 vsi_handle = new_fltr->vsi_handle; + enum ice_adminq_opc opcode; + + if (!m_entry->vsi_list_info) + return ICE_ERR_CFG; + + /* A rule already exists with the new VSI being added */ + if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) + return 0; + + /* Update the previously created VSI list set with + * the new VSI ID passed in + */ + vsi_list_id = cur_fltr->fwd_id.vsi_list_id; + opcode = ice_aqc_opc_update_sw_rules; + + status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, + vsi_list_id, false, opcode, + new_fltr->lkup_type); + /* update VSI list mapping info with new VSI ID */ + if (!status) + set_bit(vsi_handle, m_entry->vsi_list_info->vsi_map); + } + if (!status) + m_entry->vsi_count++; + return status; +} + +/** + * ice_find_rule_entry - Search a rule entry + * @list_head: head of rule list + * @f_info: rule information + * + * Helper function to search for a given rule entry + * Returns pointer to entry storing the rule if found + */ +static struct ice_fltr_mgmt_list_entry * +ice_find_rule_entry(struct list_head *list_head, + struct ice_fltr_info *f_info) +{ + struct 
ice_fltr_mgmt_list_entry *list_itr, *ret = NULL; + + list_for_each_entry(list_itr, list_head, list_entry) { + if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data, + sizeof(f_info->l_data)) && + f_info->flag == list_itr->fltr_info.flag) { + ret = list_itr; + break; + } + } + return ret; +} + +/** + * ice_find_vsi_list_entry - Search VSI list map with VSI count 1 + * @recp_list: VSI lists needs to be searched + * @vsi_handle: VSI handle to be found in VSI list + * @vsi_list_id: VSI list ID found containing vsi_handle + * + * Helper function to search a VSI list with single entry containing given VSI + * handle element. This can be extended further to search VSI list with more + * than 1 vsi_count. Returns pointer to VSI list entry if found. + */ +static struct ice_vsi_list_map_info * +ice_find_vsi_list_entry(struct ice_sw_recipe *recp_list, u16 vsi_handle, + u16 *vsi_list_id) +{ + struct ice_vsi_list_map_info *map_info = NULL; + struct list_head *list_head; + + list_head = &recp_list->filt_rules; + if (recp_list->adv_rule) { + struct ice_adv_fltr_mgmt_list_entry *list_itr; + + list_for_each_entry(list_itr, list_head, list_entry) { + if (list_itr->vsi_list_info) { + map_info = list_itr->vsi_list_info; + if (test_bit(vsi_handle, map_info->vsi_map)) { + *vsi_list_id = map_info->vsi_list_id; + return map_info; + } + } + } + } else { + struct ice_fltr_mgmt_list_entry *list_itr; + + list_for_each_entry(list_itr, list_head, list_entry) { + if (list_itr->vsi_count == 1 && + list_itr->vsi_list_info) { + map_info = list_itr->vsi_list_info; + if (test_bit(vsi_handle, map_info->vsi_map)) { + *vsi_list_id = map_info->vsi_list_id; + return map_info; + } + } + } + } + return NULL; +} + +/** + * ice_add_rule_internal - add rule for a given lookup type + * @hw: pointer to the hardware structure + * @recp_list: recipe list for which rule has to be added + * @lport: logic port number on which function add rule + * @f_entry: structure containing MAC forwarding information + * + * Adds or updates the rule lists for a given recipe + */ +static enum ice_status +ice_add_rule_internal(struct ice_hw *hw, struct ice_sw_recipe *recp_list, + u8 lport, struct ice_fltr_list_entry *f_entry) +{ + struct ice_fltr_info *new_fltr, *cur_fltr; + struct ice_fltr_mgmt_list_entry *m_entry; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + + if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) + return ICE_ERR_PARAM; + + /* Load the hw_vsi_id only if the fwd action is fwd to VSI */ + if (f_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI) + f_entry->fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + + rule_lock = &recp_list->filt_rule_lock; + + mutex_lock(rule_lock); + new_fltr = &f_entry->fltr_info; + if (new_fltr->flag & ICE_FLTR_RX) + new_fltr->src = lport; + else if (new_fltr->flag & ICE_FLTR_TX) + new_fltr->src = + ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + + m_entry = ice_find_rule_entry(&recp_list->filt_rules, new_fltr); + if (!m_entry) { + status = ice_create_pkt_fwd_rule(hw, recp_list, f_entry); + goto exit_add_rule_internal; + } + + cur_fltr = &m_entry->fltr_info; + status = ice_add_update_vsi_list(hw, m_entry, cur_fltr, new_fltr); + +exit_add_rule_internal: + mutex_unlock(rule_lock); + return status; +} + +/** + * ice_remove_vsi_list_rule + * @hw: pointer to the hardware structure + * @vsi_list_id: VSI list ID generated as part of allocate resource + * @lkup_type: switch rule filter lookup type + * + * The VSI list 
should be emptied before this function is called to remove the + * VSI list. + */ +static enum ice_status +ice_remove_vsi_list_rule(struct ice_hw *hw, u16 vsi_list_id, + enum ice_sw_lkup_type lkup_type) +{ + /* Free the vsi_list resource that we allocated. It is assumed that the + * list is empty at this point. + */ + return ice_aq_alloc_free_vsi_list(hw, &vsi_list_id, lkup_type, + ice_aqc_opc_free_res); +} + +/** + * ice_rem_update_vsi_list + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle of the VSI to remove + * @fm_list: filter management entry for which the VSI list management needs to + * be done + */ +static enum ice_status +ice_rem_update_vsi_list(struct ice_hw *hw, u16 vsi_handle, + struct ice_fltr_mgmt_list_entry *fm_list) +{ + enum ice_sw_lkup_type lkup_type; + enum ice_status status = 0; + u16 vsi_list_id; + + if (fm_list->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST || + fm_list->vsi_count == 0) + return ICE_ERR_PARAM; + + /* A rule with the VSI being removed does not exist */ + if (!test_bit(vsi_handle, fm_list->vsi_list_info->vsi_map)) + return ICE_ERR_DOES_NOT_EXIST; + + lkup_type = fm_list->fltr_info.lkup_type; + vsi_list_id = fm_list->fltr_info.fwd_id.vsi_list_id; + status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, vsi_list_id, true, + ice_aqc_opc_update_sw_rules, + lkup_type); + if (status) + return status; + + fm_list->vsi_count--; + clear_bit(vsi_handle, fm_list->vsi_list_info->vsi_map); + + if (fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) { + struct ice_fltr_info tmp_fltr_info = fm_list->fltr_info; + struct ice_vsi_list_map_info *vsi_list_info = + fm_list->vsi_list_info; + u16 rem_vsi_handle; + + rem_vsi_handle = find_first_bit(vsi_list_info->vsi_map, + ICE_MAX_VSI); + if (!ice_is_vsi_valid(hw, rem_vsi_handle)) + return ICE_ERR_OUT_OF_RANGE; + + /* Make sure VSI list is empty before removing it below */ + status = ice_update_vsi_list_rule(hw, &rem_vsi_handle, 1, + vsi_list_id, true, + ice_aqc_opc_update_sw_rules, + lkup_type); + if (status) + return status; + + tmp_fltr_info.fltr_act = ICE_FWD_TO_VSI; + tmp_fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, rem_vsi_handle); + tmp_fltr_info.vsi_handle = rem_vsi_handle; + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr_info); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to update pkt fwd rule to FWD_TO_VSI on HW VSI %d, error %d\n", + tmp_fltr_info.fwd_id.hw_vsi_id, status); + return status; + } + + fm_list->fltr_info = tmp_fltr_info; + } + + if ((fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) || + (fm_list->vsi_count == 0 && lkup_type == ICE_SW_LKUP_VLAN)) { + struct ice_vsi_list_map_info *vsi_list_info = + fm_list->vsi_list_info; + + /* Remove the VSI list since it is no longer used */ + status = ice_remove_vsi_list_rule(hw, vsi_list_id, lkup_type); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to remove VSI list %d, error %d\n", + vsi_list_id, status); + return status; + } + + list_del(&vsi_list_info->list_entry); + devm_kfree(ice_hw_to_dev(hw), vsi_list_info); + fm_list->vsi_list_info = NULL; + } + + return status; +} + +/** + * ice_remove_rule_internal - Remove a filter rule of a given type + * + * @hw: pointer to the hardware structure + * @recp_list: recipe list for which the rule needs to removed + * @f_entry: rule entry containing filter information + */ +static enum ice_status +ice_remove_rule_internal(struct ice_hw *hw, struct ice_sw_recipe *recp_list, + struct ice_fltr_list_entry *f_entry) +{ + struct ice_fltr_mgmt_list_entry *list_elem; + struct 
mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + bool remove_rule = false; + u16 vsi_handle; + + if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) + return ICE_ERR_PARAM; + f_entry->fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + + rule_lock = &recp_list->filt_rule_lock; + mutex_lock(rule_lock); + list_elem = ice_find_rule_entry(&recp_list->filt_rules, + &f_entry->fltr_info); + if (!list_elem) { + status = ICE_ERR_DOES_NOT_EXIST; + goto exit; + } + + if (list_elem->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST) { + remove_rule = true; + } else if (!list_elem->vsi_list_info) { + status = ICE_ERR_DOES_NOT_EXIST; + goto exit; + } else if (list_elem->vsi_list_info->ref_cnt > 1) { + /* a ref_cnt > 1 indicates that the vsi_list is being + * shared by multiple rules. Decrement the ref_cnt and + * remove this rule, but do not modify the list, as it + * is in-use by other rules. + */ + list_elem->vsi_list_info->ref_cnt--; + remove_rule = true; + } else { + /* a ref_cnt of 1 indicates the vsi_list is only used + * by one rule. However, the original removal request is only + * for a single VSI. Update the vsi_list first, and only + * remove the rule if there are no further VSIs in this list. + */ + vsi_handle = f_entry->fltr_info.vsi_handle; + status = ice_rem_update_vsi_list(hw, vsi_handle, list_elem); + if (status) + goto exit; + /* if VSI count goes to zero after updating the VSI list */ + if (list_elem->vsi_count == 0) + remove_rule = true; + } + + if (remove_rule) { + /* Remove the lookup rule */ + struct ice_aqc_sw_rules_elem *s_rule; + + s_rule = devm_kzalloc(ice_hw_to_dev(hw), + ICE_SW_RULE_RX_TX_NO_HDR_SIZE, + GFP_KERNEL); + if (!s_rule) { + status = ICE_ERR_NO_MEMORY; + goto exit; + } + + ice_fill_sw_rule(hw, &list_elem->fltr_info, s_rule, + ice_aqc_opc_remove_sw_rules); + + status = ice_aq_sw_rules(hw, s_rule, + ICE_SW_RULE_RX_TX_NO_HDR_SIZE, 1, + ice_aqc_opc_remove_sw_rules, NULL); + + /* Remove a book keeping from the list */ + devm_kfree(ice_hw_to_dev(hw), s_rule); + + if (status) + goto exit; + + list_del(&list_elem->list_entry); + devm_kfree(ice_hw_to_dev(hw), list_elem); + } +exit: + mutex_unlock(rule_lock); + return status; +} + +/** + * ice_aq_get_res_alloc - get allocated resources + * @hw: pointer to the HW struct + * @num_entries: pointer to u16 to store the number of resource entries returned + * @buf: pointer to buffer + * @buf_size: size of buf + * @cd: pointer to command details structure or NULL + * + * The caller-supplied buffer must be large enough to store the resource + * information for all resource types. Each resource type is an + * ice_aqc_get_res_resp_elem structure. 
+ */ +enum ice_status +ice_aq_get_res_alloc(struct ice_hw *hw, u16 *num_entries, + struct ice_aqc_get_res_resp_elem *buf, u16 buf_size, + struct ice_sq_cd *cd) +{ + struct ice_aqc_get_res_alloc *resp; + enum ice_status status; + struct ice_aq_desc desc; + + if (!buf) + return ICE_ERR_BAD_PTR; + + if (buf_size < ICE_AQ_GET_RES_ALLOC_BUF_LEN) + return ICE_ERR_INVAL_SIZE; + + resp = &desc.params.get_res; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_res_alloc); + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + + if (!status && num_entries) + *num_entries = le16_to_cpu(resp->resp_elem_num); + + return status; +} + +/** + * ice_aq_get_res_descs - get allocated resource descriptors + * @hw: pointer to the hardware structure + * @num_entries: number of resource entries in buffer + * @buf: structure to hold response data buffer + * @buf_size: size of buffer + * @res_type: resource type + * @res_shared: is resource shared + * @desc_id: input - first desc ID to start; output - next desc ID + * @cd: pointer to command details structure or NULL + */ +enum ice_status +ice_aq_get_res_descs(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_res_elem *buf, u16 buf_size, u16 res_type, + bool res_shared, u16 *desc_id, struct ice_sq_cd *cd) +{ + struct ice_aqc_get_allocd_res_desc *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + cmd = &desc.params.get_res_desc; + + if (!buf) + return ICE_ERR_PARAM; + + if (buf_size != (num_entries * sizeof(*buf))) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_allocd_res_desc); + + cmd->ops.cmd.res = cpu_to_le16(((res_type << ICE_AQC_RES_TYPE_S) & + ICE_AQC_RES_TYPE_M) | (res_shared ? + ICE_AQC_RES_TYPE_FLAG_SHARED : 0)); + cmd->ops.cmd.first_desc = cpu_to_le16(*desc_id); + + status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); + if (!status) + *desc_id = le16_to_cpu(cmd->ops.resp.next_desc); + + return status; +} + +/** + * ice_mac_fltr_exist - does this MAC filter exist for given VSI + * @hw: pointer to the hardware structure + * @mac: MAC address to be checked (for MAC filter) + * @vsi_handle: check MAC filter for this VSI + */ +bool ice_mac_fltr_exist(struct ice_hw *hw, u8 *mac, u16 vsi_handle) +{ + struct ice_fltr_mgmt_list_entry *entry; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + struct ice_switch_info *sw; + u16 hw_vsi_id; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return false; + + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + sw = hw->switch_info; + rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules; + if (!rule_head) + return false; + + rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; + mutex_lock(rule_lock); + list_for_each_entry(entry, rule_head, list_entry) { + struct ice_fltr_info *f_info = &entry->fltr_info; + u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; + + if (is_zero_ether_addr(mac_addr)) + continue; + + if (f_info->flag != ICE_FLTR_TX || + f_info->src_id != ICE_SRC_ID_VSI || + f_info->lkup_type != ICE_SW_LKUP_MAC || + f_info->fltr_act != ICE_FWD_TO_VSI || + hw_vsi_id != f_info->fwd_id.hw_vsi_id) + continue; + + if (ether_addr_equal(mac, mac_addr)) { + mutex_unlock(rule_lock); + return true; + } + } + mutex_unlock(rule_lock); + return false; +} + +/** + * ice_vlan_fltr_exist - does this VLAN filter exist for given VSI + * @hw: pointer to the hardware structure + * @vlan_id: VLAN ID + * @vsi_handle: check MAC filter for this VSI + */ +bool ice_vlan_fltr_exist(struct ice_hw *hw, u16 vlan_id, u16 vsi_handle) +{ 
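+	/* Scan the VLAN recipe's filter list for a rule that already
+	 * covers this VSI. Roughly (illustrative, hypothetical values):
+	 * for vlan_id = 100 and a VSI whose hw_vsi_id is 5, this returns
+	 * true only if a Tx, ICE_SRC_ID_VSI, ICE_SW_LKUP_VLAN entry for
+	 * VLAN 100 forwards to VSI 5 directly or through a VSI list
+	 * associated with this VSI handle.
+	 */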
+ struct ice_fltr_mgmt_list_entry *entry; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + struct ice_switch_info *sw; + u16 hw_vsi_id; + + if (vlan_id > ICE_MAX_VLAN_ID) + return false; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return false; + + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + sw = hw->switch_info; + rule_head = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rules; + if (!rule_head) + return false; + + rule_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock; + mutex_lock(rule_lock); + list_for_each_entry(entry, rule_head, list_entry) { + struct ice_fltr_info *f_info = &entry->fltr_info; + u16 entry_vlan_id = f_info->l_data.vlan.vlan_id; + struct ice_vsi_list_map_info *map_info; + + if (entry_vlan_id > ICE_MAX_VLAN_ID) + continue; + + if (f_info->flag != ICE_FLTR_TX || + f_info->src_id != ICE_SRC_ID_VSI || + f_info->lkup_type != ICE_SW_LKUP_VLAN) + continue; + + /* Only allowed filter action are FWD_TO_VSI/_VSI_LIST */ + if (f_info->fltr_act != ICE_FWD_TO_VSI && + f_info->fltr_act != ICE_FWD_TO_VSI_LIST) + continue; + + if (f_info->fltr_act == ICE_FWD_TO_VSI) { + if (hw_vsi_id != f_info->fwd_id.hw_vsi_id) + continue; + } else if (f_info->fltr_act == ICE_FWD_TO_VSI_LIST) { + /* If filter_action is FWD_TO_VSI_LIST, make sure + * that VSI being checked is part of VSI list + */ + if (entry->vsi_count == 1 && + entry->vsi_list_info) { + map_info = entry->vsi_list_info; + if (!test_bit(vsi_handle, map_info->vsi_map)) + continue; + } + } + + if (vlan_id == entry_vlan_id) { + mutex_unlock(rule_lock); + return true; + } + } + mutex_unlock(rule_lock); + + return false; +} + +/** + * ice_add_mac_rule - Add a MAC address based filter rule + * @hw: pointer to the hardware structure + * @m_list: list of MAC addresses and forwarding information + * @sw: pointer to switch info struct for which function add rule + * @lport: logic port number on which function add rule + * + * IMPORTANT: When the umac_shared flag is set to false and m_list has + * multiple unicast addresses, the function assumes that all the + * addresses are unique in a given add_mac call. It doesn't + * check for duplicates in this case, removing duplicates from a given + * list should be taken care of in the caller of this function. 
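+ *
+ * Illustrative behaviour (based on the code below): unicast addresses
+ * are gathered and programmed with one bulk ice_aq_sw_rules() call,
+ * and re-adding a unicast address that already has a rule fails with
+ * ICE_ERR_ALREADY_EXISTS; multicast addresses (and unicast addresses
+ * when umac_shared is true) instead go through ice_add_rule_internal()
+ * and may end up sharing a VSI list with other VSIs.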
+ */ +static enum ice_status +ice_add_mac_rule(struct ice_hw *hw, struct list_head *m_list, + struct ice_switch_info *sw, u8 lport) +{ + struct ice_sw_recipe *recp_list = &sw->recp_list[ICE_SW_LKUP_MAC]; + struct ice_aqc_sw_rules_elem *s_rule, *r_iter; + struct ice_fltr_list_entry *m_list_itr; + struct list_head *rule_head; + u16 total_elem_left, s_rule_size; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + u16 num_unicast = 0; + u8 elem_sent; + + s_rule = NULL; + rule_lock = &recp_list->filt_rule_lock; + rule_head = &recp_list->filt_rules; + + list_for_each_entry(m_list_itr, m_list, list_entry) { + u8 *add = &m_list_itr->fltr_info.l_data.mac.mac_addr[0]; + u16 vsi_handle; + u16 hw_vsi_id; + + m_list_itr->fltr_info.flag = ICE_FLTR_TX; + vsi_handle = m_list_itr->fltr_info.vsi_handle; + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + m_list_itr->fltr_info.fwd_id.hw_vsi_id = hw_vsi_id; + /* update the src in case it is VSI num */ + if (m_list_itr->fltr_info.src_id != ICE_SRC_ID_VSI) + return ICE_ERR_PARAM; + m_list_itr->fltr_info.src = hw_vsi_id; + if (m_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_MAC || + is_zero_ether_addr(add)) + return ICE_ERR_PARAM; + if (is_unicast_ether_addr(add) && !hw->umac_shared) { + /* Don't overwrite the unicast address */ + mutex_lock(rule_lock); + if (ice_find_rule_entry(rule_head, + &m_list_itr->fltr_info)) { + mutex_unlock(rule_lock); + return ICE_ERR_ALREADY_EXISTS; + } + mutex_unlock(rule_lock); + num_unicast++; + } else if (is_multicast_ether_addr(add) || + (is_unicast_ether_addr(add) && hw->umac_shared)) { + m_list_itr->status = + ice_add_rule_internal(hw, recp_list, lport, + m_list_itr); + if (m_list_itr->status) + return m_list_itr->status; + } + } + + mutex_lock(rule_lock); + /* Exit if no suitable entries were found for adding bulk switch rule */ + if (!num_unicast) { + status = 0; + goto ice_add_mac_exit; + } + + /* Allocate switch rule buffer for the bulk update for unicast */ + s_rule_size = ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; + s_rule = devm_kcalloc(ice_hw_to_dev(hw), num_unicast, s_rule_size, + GFP_KERNEL); + if (!s_rule) { + status = ICE_ERR_NO_MEMORY; + goto ice_add_mac_exit; + } + + r_iter = s_rule; + list_for_each_entry(m_list_itr, m_list, list_entry) { + struct ice_fltr_info *f_info = &m_list_itr->fltr_info; + u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; + + if (is_unicast_ether_addr(mac_addr)) { + ice_fill_sw_rule(hw, &m_list_itr->fltr_info, r_iter, + ice_aqc_opc_add_sw_rules); + r_iter = (struct ice_aqc_sw_rules_elem *) + ((u8 *)r_iter + s_rule_size); + } + } + + /* Call AQ bulk switch rule update for all unicast addresses */ + r_iter = s_rule; + /* Call AQ switch rule in AQ_MAX chunk */ + for (total_elem_left = num_unicast; total_elem_left > 0; + total_elem_left -= elem_sent) { + struct ice_aqc_sw_rules_elem *entry = r_iter; + + elem_sent = min_t(u8, total_elem_left, + (ICE_AQ_MAX_BUF_LEN / s_rule_size)); + status = ice_aq_sw_rules(hw, entry, elem_sent * s_rule_size, + elem_sent, ice_aqc_opc_add_sw_rules, + NULL); + if (status) + goto ice_add_mac_exit; + r_iter = (struct ice_aqc_sw_rules_elem *) + ((u8 *)r_iter + (elem_sent * s_rule_size)); + } + + /* Fill up rule ID based on the value returned from FW */ + r_iter = s_rule; + list_for_each_entry(m_list_itr, m_list, list_entry) { + struct ice_fltr_info *f_info = &m_list_itr->fltr_info; + u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; + struct ice_fltr_mgmt_list_entry 
*fm_entry; + + if (is_unicast_ether_addr(mac_addr)) { + f_info->fltr_rule_id = + le16_to_cpu(r_iter->pdata.lkup_tx_rx.index); + f_info->fltr_act = ICE_FWD_TO_VSI; + /* Create an entry to track this MAC address */ + fm_entry = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*fm_entry), GFP_KERNEL); + if (!fm_entry) { + status = ICE_ERR_NO_MEMORY; + goto ice_add_mac_exit; + } + fm_entry->fltr_info = *f_info; + fm_entry->vsi_count = 1; + /* The book keeping entries will get removed when + * base driver calls remove filter AQ command + */ + + list_add(&fm_entry->list_entry, rule_head); + r_iter = (struct ice_aqc_sw_rules_elem *) + ((u8 *)r_iter + s_rule_size); + } + } + +ice_add_mac_exit: + mutex_unlock(rule_lock); + if (s_rule) + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + + +/** + * ice_add_mac - Add a MAC address based filter rule + * @hw: pointer to the hardware structure + * @m_list: list of MAC addresses and forwarding information + * + * Function add MAC rule for logical port from HW struct + */ +enum ice_status ice_add_mac(struct ice_hw *hw, struct list_head *m_list) +{ + if (!m_list || !hw) + return ICE_ERR_PARAM; + + return ice_add_mac_rule(hw, m_list, hw->switch_info, + hw->port_info->lport); +} + +/** + * ice_add_vlan_internal - Add one VLAN based filter rule + * @hw: pointer to the hardware structure + * @recp_list: recipe list for which rule has to be added + * @f_entry: filter entry containing one VLAN information + */ +static enum ice_status +ice_add_vlan_internal(struct ice_hw *hw, struct ice_sw_recipe *recp_list, + struct ice_fltr_list_entry *f_entry) +{ + struct ice_fltr_mgmt_list_entry *v_list_itr; + struct ice_fltr_info *new_fltr, *cur_fltr; + enum ice_sw_lkup_type lkup_type; + u16 vsi_list_id = 0, vsi_handle; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + + if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) + return ICE_ERR_PARAM; + + f_entry->fltr_info.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + new_fltr = &f_entry->fltr_info; + + /* VLAN ID should only be 12 bits */ + if (new_fltr->l_data.vlan.vlan_id > ICE_MAX_VLAN_ID) + return ICE_ERR_PARAM; + + if (new_fltr->src_id != ICE_SRC_ID_VSI) + return ICE_ERR_PARAM; + + new_fltr->src = new_fltr->fwd_id.hw_vsi_id; + lkup_type = new_fltr->lkup_type; + vsi_handle = new_fltr->vsi_handle; + rule_lock = &recp_list->filt_rule_lock; + mutex_lock(rule_lock); + v_list_itr = ice_find_rule_entry(&recp_list->filt_rules, new_fltr); + if (!v_list_itr) { + struct ice_vsi_list_map_info *map_info = NULL; + + if (new_fltr->fltr_act == ICE_FWD_TO_VSI) { + /* All VLAN pruning rules use a VSI list. Check if + * there is already a VSI list containing VSI that we + * want to add. If found, use the same vsi_list_id for + * this new VLAN rule or else create a new list. + */ + map_info = ice_find_vsi_list_entry(recp_list, + vsi_handle, + &vsi_list_id); + if (!map_info) { + status = ice_create_vsi_list_rule(hw, + &vsi_handle, + 1, + &vsi_list_id, + lkup_type); + if (status) + goto exit; + } + /* Convert the action to forwarding to a VSI list. 
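+ * Even a rule requested as ICE_FWD_TO_VSI is programmed as
+ * ICE_FWD_TO_VSI_LIST here, because VLAN pruning rules always go
+ * through a VSI list. As an illustration: the first add of VLAN 10
+ * on a VSI allocates a one-entry list at this point, and a later add
+ * of VLAN 10 on another VSI reuses or extends that list rather than
+ * creating a second lookup rule.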
*/ + new_fltr->fltr_act = ICE_FWD_TO_VSI_LIST; + new_fltr->fwd_id.vsi_list_id = vsi_list_id; + } + + status = ice_create_pkt_fwd_rule(hw, recp_list, f_entry); + if (!status) { + v_list_itr = ice_find_rule_entry(&recp_list->filt_rules, + new_fltr); + if (!v_list_itr) { + status = ICE_ERR_DOES_NOT_EXIST; + goto exit; + } + /* reuse VSI list for new rule and increment ref_cnt */ + if (map_info) { + v_list_itr->vsi_list_info = map_info; + map_info->ref_cnt++; + } else { + v_list_itr->vsi_list_info = + ice_create_vsi_list_map(hw, &vsi_handle, + 1, vsi_list_id); + } + } + } else if (v_list_itr->vsi_list_info->ref_cnt == 1) { + /* Update existing VSI list to add new VSI ID only if it used + * by one VLAN rule. + */ + cur_fltr = &v_list_itr->fltr_info; + status = ice_add_update_vsi_list(hw, v_list_itr, cur_fltr, + new_fltr); + } else { + /* If VLAN rule exists and VSI list being used by this rule is + * referenced by more than 1 VLAN rule. Then create a new VSI + * list appending previous VSI with new VSI and update existing + * VLAN rule to point to new VSI list ID + */ + struct ice_fltr_info tmp_fltr; + u16 vsi_handle_arr[2]; + u16 cur_handle; + + /* Current implementation only supports reusing VSI list with + * one VSI count. We should never hit below condition + */ + if (v_list_itr->vsi_count > 1 && + v_list_itr->vsi_list_info->ref_cnt > 1) { + ice_debug(hw, ICE_DBG_SW, "Invalid configuration: Optimization to reuse VSI list with more than one VSI is not being done yet\n"); + status = ICE_ERR_CFG; + goto exit; + } + + cur_handle = + find_first_bit(v_list_itr->vsi_list_info->vsi_map, + ICE_MAX_VSI); + + /* A rule already exists with the new VSI being added */ + if (cur_handle == vsi_handle) { + status = ICE_ERR_ALREADY_EXISTS; + goto exit; + } + + vsi_handle_arr[0] = cur_handle; + vsi_handle_arr[1] = vsi_handle; + status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, + &vsi_list_id, lkup_type); + if (status) + goto exit; + + tmp_fltr = v_list_itr->fltr_info; + tmp_fltr.fltr_rule_id = v_list_itr->fltr_info.fltr_rule_id; + tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; + tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; + /* Update the previous switch rule to a new VSI list which + * includes current VSI that is requested + */ + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); + if (status) + goto exit; + + /* before overriding VSI list map info. 
decrement ref_cnt of + * previous VSI list + */ + v_list_itr->vsi_list_info->ref_cnt--; + + /* now update to newly created list */ + v_list_itr->fltr_info.fwd_id.vsi_list_id = vsi_list_id; + v_list_itr->vsi_list_info = + ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, + vsi_list_id); + v_list_itr->vsi_count++; + } + +exit: + mutex_unlock(rule_lock); + return status; +} + +/** + * ice_add_vlan_rule - Add VLAN based filter rule + * @hw: pointer to the hardware structure + * @v_list: list of VLAN entries and forwarding information + * @sw: pointer to switch info struct for which function add rule + */ +static enum ice_status +ice_add_vlan_rule(struct ice_hw *hw, struct list_head *v_list, + struct ice_switch_info *sw) +{ + struct ice_fltr_list_entry *v_list_itr; + struct ice_sw_recipe *recp_list; + + recp_list = &sw->recp_list[ICE_SW_LKUP_VLAN]; + list_for_each_entry(v_list_itr, v_list, list_entry) { + if (v_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_VLAN) + return ICE_ERR_PARAM; + v_list_itr->fltr_info.flag = ICE_FLTR_TX; + v_list_itr->status = ice_add_vlan_internal(hw, recp_list, + v_list_itr); + if (v_list_itr->status) + return v_list_itr->status; + } + return 0; +} + + +/** + * ice_add_vlan - Add a VLAN based filter rule + * @hw: pointer to the hardware structure + * @v_list: list of VLAN and forwarding information + * + * Function add VLAN rule for logical port from HW struct + */ +enum ice_status ice_add_vlan(struct ice_hw *hw, struct list_head *v_list) +{ + if (!v_list || !hw) + return ICE_ERR_PARAM; + + return ice_add_vlan_rule(hw, v_list, hw->switch_info); +} + +/** + * ice_add_mac_vlan_rule - Add MAC and VLAN pair based filter rule + * @hw: pointer to the hardware structure + * @mv_list: list of MAC and VLAN filters + * @sw: pointer to switch info struct for which function add rule + * @lport: logic port number on which function add rule + * + * If the VSI on which the MAC-VLAN pair has to be added has Rx and Tx VLAN + * pruning bits enabled, then it is the responsibility of the caller to make + * sure to add a VLAN only filter on the same VSI. Packets belonging to that + * VLAN won't be received on that VSI otherwise. 
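+ *
+ * Illustrative example (hypothetical addresses): to steer MAC
+ * 00:00:00:00:00:01 + VLAN 100 to a VSI that has VLAN pruning enabled,
+ * the caller is expected to also add a plain VLAN 100 filter on that
+ * VSI, since packets in VLAN 100 would otherwise not be received on
+ * that VSI.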
+ */ +static enum ice_status +ice_add_mac_vlan_rule(struct ice_hw *hw, struct list_head *mv_list, + struct ice_switch_info *sw, u8 lport) +{ + struct ice_fltr_list_entry *mv_list_itr; + struct ice_sw_recipe *recp_list; + + if (!mv_list || !hw) + return ICE_ERR_PARAM; + + recp_list = &sw->recp_list[ICE_SW_LKUP_MAC_VLAN]; + list_for_each_entry(mv_list_itr, mv_list, list_entry) { + enum ice_sw_lkup_type l_type = + mv_list_itr->fltr_info.lkup_type; + + if (l_type != ICE_SW_LKUP_MAC_VLAN) + return ICE_ERR_PARAM; + mv_list_itr->fltr_info.flag = ICE_FLTR_TX; + mv_list_itr->status = + ice_add_rule_internal(hw, recp_list, lport, + mv_list_itr); + if (mv_list_itr->status) + return mv_list_itr->status; + } + return 0; +} + + +/** + * ice_add_mac_vlan - Add a MAC VLAN address based filter rule + * @hw: pointer to the hardware structure + * @mv_list: list of MAC VLAN addresses and forwarding information + * + * Function add MAC VLAN rule for logical port from HW struct + */ +enum ice_status +ice_add_mac_vlan(struct ice_hw *hw, struct list_head *mv_list) +{ + if (!mv_list || !hw) + return ICE_ERR_PARAM; + + return ice_add_mac_vlan_rule(hw, mv_list, hw->switch_info, + hw->port_info->lport); +} + +/** + * ice_add_eth_mac_rule - Add ethertype and MAC based filter rule + * @hw: pointer to the hardware structure + * @em_list: list of ether type MAC filter, MAC is optional + * @sw: pointer to switch info struct for which function add rule + * @lport: logic port number on which function add rule + * + * This function requires the caller to populate the entries in + * the filter list with the necessary fields (including flags to + * indicate Tx or Rx rules). + */ +static enum ice_status +ice_add_eth_mac_rule(struct ice_hw *hw, struct list_head *em_list, + struct ice_switch_info *sw, u8 lport) +{ + struct ice_fltr_list_entry *em_list_itr; + + list_for_each_entry(em_list_itr, em_list, list_entry) { + struct ice_sw_recipe *recp_list; + enum ice_sw_lkup_type l_type; + + l_type = em_list_itr->fltr_info.lkup_type; + recp_list = &sw->recp_list[l_type]; + + if (l_type != ICE_SW_LKUP_ETHERTYPE_MAC && + l_type != ICE_SW_LKUP_ETHERTYPE) + return ICE_ERR_PARAM; + + em_list_itr->status = ice_add_rule_internal(hw, recp_list, + lport, + em_list_itr); + if (em_list_itr->status) + return em_list_itr->status; + } + return 0; +} + + +/** + * ice_add_eth_mac - Add a ethertype based filter rule + * @hw: pointer to the hardware structure + * @em_list: list of ethertype and forwarding information + * + * Function add ethertype rule for logical port from HW struct + */ +enum ice_status +ice_add_eth_mac(struct ice_hw *hw, struct list_head *em_list) +{ + if (!em_list || !hw) + return ICE_ERR_PARAM; + + return ice_add_eth_mac_rule(hw, em_list, hw->switch_info, + hw->port_info->lport); +} + +/** + * ice_remove_eth_mac_rule - Remove an ethertype (or MAC) based filter rule + * @hw: pointer to the hardware structure + * @em_list: list of ethertype or ethertype MAC entries + * @sw: pointer to switch info struct for which function add rule + */ +static enum ice_status +ice_remove_eth_mac_rule(struct ice_hw *hw, struct list_head *em_list, + struct ice_switch_info *sw) +{ + struct ice_fltr_list_entry *em_list_itr, *tmp; + + list_for_each_entry_safe(em_list_itr, tmp, em_list, list_entry) { + struct ice_sw_recipe *recp_list; + enum ice_sw_lkup_type l_type; + + l_type = em_list_itr->fltr_info.lkup_type; + + if (l_type != ICE_SW_LKUP_ETHERTYPE_MAC && + l_type != ICE_SW_LKUP_ETHERTYPE) + return ICE_ERR_PARAM; + + recp_list = &sw->recp_list[l_type]; + 
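/* Each entry records its own result in ->status and the loop
+		 * stops at the first failure; illustratively, if the third
+		 * of five entries fails, the first two have already been
+		 * removed and the remaining two are left untouched.
+		 */
+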
em_list_itr->status = ice_remove_rule_internal(hw, recp_list, + em_list_itr); + if (em_list_itr->status) + return em_list_itr->status; + } + return 0; +} + + +/** + * ice_remove_eth_mac - remove a ethertype based filter rule + * @hw: pointer to the hardware structure + * @em_list: list of ethertype and forwarding information + * + */ +enum ice_status +ice_remove_eth_mac(struct ice_hw *hw, struct list_head *em_list) +{ + if (!em_list || !hw) + return ICE_ERR_PARAM; + + return ice_remove_eth_mac_rule(hw, em_list, hw->switch_info); +} + + +/** + * ice_rem_sw_rule_info + * @hw: pointer to the hardware structure + * @rule_head: pointer to the switch list structure that we want to delete + */ +static void +ice_rem_sw_rule_info(struct ice_hw *hw, struct list_head *rule_head) +{ + if (!list_empty(rule_head)) { + struct ice_fltr_mgmt_list_entry *entry; + struct ice_fltr_mgmt_list_entry *tmp; + + list_for_each_entry_safe(entry, tmp, rule_head, list_entry) { + list_del(&entry->list_entry); + devm_kfree(ice_hw_to_dev(hw), entry); + } + } +} + +/** + * ice_rem_adv_rule_info + * @hw: pointer to the hardware structure + * @rule_head: pointer to the switch list structure that we want to delete + */ +static void +ice_rem_adv_rule_info(struct ice_hw *hw, struct list_head *rule_head) +{ + struct ice_adv_fltr_mgmt_list_entry *tmp_entry; + struct ice_adv_fltr_mgmt_list_entry *lst_itr; + + if (list_empty(rule_head)) + return; + + list_for_each_entry_safe(lst_itr, tmp_entry, rule_head, list_entry) { + list_del(&lst_itr->list_entry); + devm_kfree(ice_hw_to_dev(hw), lst_itr->lkups); + devm_kfree(ice_hw_to_dev(hw), lst_itr); + } +} + +/** + * ice_rem_all_sw_rules_info + * @hw: pointer to the hardware structure + */ +void ice_rem_all_sw_rules_info(struct ice_hw *hw) +{ + struct ice_switch_info *sw = hw->switch_info; + u8 i; + + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { + struct list_head *rule_head; + + rule_head = &sw->recp_list[i].filt_rules; + if (!sw->recp_list[i].adv_rule) + ice_rem_sw_rule_info(hw, rule_head); + else + ice_rem_adv_rule_info(hw, rule_head); + if (sw->recp_list[i].adv_rule && + list_empty(&sw->recp_list[i].filt_rules)) + sw->recp_list[i].adv_rule = false; + } +} + +/** + * ice_cfg_dflt_vsi - change state of VSI to set/clear default + * @pi: pointer to the port_info structure + * @vsi_handle: VSI handle to set as default + * @set: true to add the above mentioned switch rule, false to remove it + * @direction: ICE_FLTR_RX or ICE_FLTR_TX + * + * add filter rule to set/unset given VSI as default VSI for the switch + * (represented by swid) + */ +enum ice_status +ice_cfg_dflt_vsi(struct ice_port_info *pi, u16 vsi_handle, bool set, + u8 direction) +{ + struct ice_aqc_sw_rules_elem *s_rule; + struct ice_fltr_info f_info; + struct ice_hw *hw = pi->hw; + enum ice_adminq_opc opcode; + enum ice_status status; + u16 s_rule_size; + u16 hw_vsi_id; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + + s_rule_size = set ? 
ICE_SW_RULE_RX_TX_ETH_HDR_SIZE : + ICE_SW_RULE_RX_TX_NO_HDR_SIZE; + + s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + + memset(&f_info, 0, sizeof(f_info)); + + f_info.lkup_type = ICE_SW_LKUP_DFLT; + f_info.flag = direction; + f_info.fltr_act = ICE_FWD_TO_VSI; + f_info.fwd_id.hw_vsi_id = hw_vsi_id; + + if (f_info.flag & ICE_FLTR_RX) { + f_info.src = pi->lport; + f_info.src_id = ICE_SRC_ID_LPORT; + if (!set) + f_info.fltr_rule_id = + pi->dflt_rx_vsi_rule_id; + } else if (f_info.flag & ICE_FLTR_TX) { + f_info.src_id = ICE_SRC_ID_VSI; + f_info.src = hw_vsi_id; + if (!set) + f_info.fltr_rule_id = + pi->dflt_tx_vsi_rule_id; + } + + if (set) + opcode = ice_aqc_opc_add_sw_rules; + else + opcode = ice_aqc_opc_remove_sw_rules; + + ice_fill_sw_rule(hw, &f_info, s_rule, opcode); + + status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opcode, NULL); + if (status || !(f_info.flag & ICE_FLTR_TX_RX)) + goto out; + if (set) { + u16 index = le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + + if (f_info.flag & ICE_FLTR_TX) { + pi->dflt_tx_vsi_num = hw_vsi_id; + pi->dflt_tx_vsi_rule_id = index; + } else if (f_info.flag & ICE_FLTR_RX) { + pi->dflt_rx_vsi_num = hw_vsi_id; + pi->dflt_rx_vsi_rule_id = index; + } + } else { + if (f_info.flag & ICE_FLTR_TX) { + pi->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL; + pi->dflt_tx_vsi_rule_id = ICE_INVAL_ACT; + } else if (f_info.flag & ICE_FLTR_RX) { + pi->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL; + pi->dflt_rx_vsi_rule_id = ICE_INVAL_ACT; + } + } + +out: + devm_kfree(ice_hw_to_dev(hw), s_rule); + return status; +} + +/** + * ice_find_ucast_rule_entry - Search for a unicast MAC filter rule entry + * @list_head: head of rule list + * @f_info: rule information + * + * Helper function to search for a unicast rule entry - this is to be used + * to remove unicast MAC filter that is not shared with other VSIs on the + * PF switch. + * + * Returns pointer to entry storing the rule if found + */ +static struct ice_fltr_mgmt_list_entry * +ice_find_ucast_rule_entry(struct list_head *list_head, + struct ice_fltr_info *f_info) +{ + struct ice_fltr_mgmt_list_entry *list_itr; + + list_for_each_entry(list_itr, list_head, list_entry) { + if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data, + sizeof(f_info->l_data)) && + f_info->fwd_id.hw_vsi_id == + list_itr->fltr_info.fwd_id.hw_vsi_id && + f_info->flag == list_itr->fltr_info.flag) + return list_itr; + } + return NULL; +} + +/** + * ice_remove_mac_rule - remove a MAC based filter rule + * @hw: pointer to the hardware structure + * @m_list: list of MAC addresses and forwarding information + * @recp_list: list from which function remove MAC address + * + * This function removes either a MAC filter rule or a specific VSI from a + * VSI list for a multicast MAC address. + * + * Returns ICE_ERR_DOES_NOT_EXIST if a given entry was not added by + * ice_add_mac. Caller should be aware that this call will only work if all + * the entries passed into m_list were added previously. It will not attempt to + * do a partial remove of entries that were found. 
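+ *
+ * A minimal calling sketch for the exported wrapper (illustrative only,
+ * not taken from this patch; "mac" and "vsi_handle" are assumed to come
+ * from the caller):
+ *
+ *	struct ice_fltr_list_entry entry = {};
+ *	LIST_HEAD(m_list);
+ *	enum ice_status ret;
+ *
+ *	entry.fltr_info.lkup_type = ICE_SW_LKUP_MAC;
+ *	entry.fltr_info.fltr_act = ICE_FWD_TO_VSI;
+ *	entry.fltr_info.vsi_handle = vsi_handle;
+ *	ether_addr_copy(entry.fltr_info.l_data.mac.mac_addr, mac);
+ *	list_add(&entry.list_entry, &m_list);
+ *	ret = ice_remove_mac(hw, &m_list);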
+ */
+static enum ice_status
+ice_remove_mac_rule(struct ice_hw *hw, struct list_head *m_list,
+		    struct ice_sw_recipe *recp_list)
+{
+	struct ice_fltr_list_entry *list_itr, *tmp;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
+
+	if (!m_list)
+		return ICE_ERR_PARAM;
+
+	rule_lock = &recp_list->filt_rule_lock;
+	list_for_each_entry_safe(list_itr, tmp, m_list, list_entry) {
+		enum ice_sw_lkup_type l_type = list_itr->fltr_info.lkup_type;
+		u8 *add = &list_itr->fltr_info.l_data.mac.mac_addr[0];
+		u16 vsi_handle;
+
+		if (l_type != ICE_SW_LKUP_MAC)
+			return ICE_ERR_PARAM;
+
+		vsi_handle = list_itr->fltr_info.vsi_handle;
+		if (!ice_is_vsi_valid(hw, vsi_handle))
+			return ICE_ERR_PARAM;
+
+		list_itr->fltr_info.fwd_id.hw_vsi_id =
+			ice_get_hw_vsi_num(hw, vsi_handle);
+		if (is_unicast_ether_addr(add) && !hw->ucast_shared) {
+			/* Don't remove the unicast address that belongs to
+			 * another VSI on the switch, since it is not being
+			 * shared...
+			 */
+			mutex_lock(rule_lock);
+			if (!ice_find_ucast_rule_entry(&recp_list->filt_rules,
+						       &list_itr->fltr_info)) {
+				mutex_unlock(rule_lock);
+				return ICE_ERR_DOES_NOT_EXIST;
+			}
+			mutex_unlock(rule_lock);
+		}
+		list_itr->status = ice_remove_rule_internal(hw, recp_list,
+							    list_itr);
+		if (list_itr->status)
+			return list_itr->status;
+	}
+	return 0;
+}
+
+/**
+ * ice_remove_mac - remove a MAC address based filter rule
+ * @hw: pointer to the hardware structure
+ * @m_list: list of MAC addresses and forwarding information
+ */
+enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_list)
+{
+	struct ice_sw_recipe *recp_list;
+
+	recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC];
+	return ice_remove_mac_rule(hw, m_list, recp_list);
+}
+
+/**
+ * ice_remove_vlan_rule - Remove VLAN based filter rule
+ * @hw: pointer to the hardware structure
+ * @v_list: list of VLAN entries and forwarding information
+ * @recp_list: recipe list from which the function removes the VLAN filter
+ */
+static enum ice_status
+ice_remove_vlan_rule(struct ice_hw *hw, struct list_head *v_list,
+		     struct ice_sw_recipe *recp_list)
+{
+	struct ice_fltr_list_entry *v_list_itr, *tmp;
+
+	list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) {
+		enum ice_sw_lkup_type l_type = v_list_itr->fltr_info.lkup_type;
+
+		if (l_type != ICE_SW_LKUP_VLAN)
+			return ICE_ERR_PARAM;
+		v_list_itr->status = ice_remove_rule_internal(hw, recp_list,
+							      v_list_itr);
+		if (v_list_itr->status)
+			return v_list_itr->status;
+	}
+	return 0;
+}
+
+/**
+ * ice_remove_vlan - remove a VLAN based filter rule
+ * @hw: pointer to the hardware structure
+ * @v_list: list of VLAN entries and forwarding information
+ */
+enum ice_status
+ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list)
+{
+	struct ice_sw_recipe *recp_list;
+
+	if (!v_list || !hw)
+		return ICE_ERR_PARAM;
+
+	recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_VLAN];
+	return ice_remove_vlan_rule(hw, v_list, recp_list);
+}
+
+/**
+ * ice_remove_mac_vlan_rule - Remove MAC VLAN based filter rule
+ * @hw: pointer to the hardware structure
+ * @v_list: list of MAC VLAN entries and forwarding information
+ * @recp_list: recipe list from which the function removes the MAC VLAN filter
+ */
+static enum ice_status
+ice_remove_mac_vlan_rule(struct ice_hw *hw, struct list_head *v_list,
+			 struct ice_sw_recipe *recp_list)
+{
+	struct ice_fltr_list_entry *v_list_itr, *tmp;
+
+	list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) {
+		enum ice_sw_lkup_type l_type =
v_list_itr->fltr_info.lkup_type; + + if (l_type != ICE_SW_LKUP_MAC_VLAN) + return ICE_ERR_PARAM; + v_list_itr->status = + ice_remove_rule_internal(hw, recp_list, + v_list_itr); + if (v_list_itr->status) + return v_list_itr->status; + } + return 0; +} + + +/** + * ice_remove_mac_vlan - remove a MAC VLAN address based filter rule + * @hw: pointer to the hardware structure + * @mv_list: list of MAC VLAN and forwarding information + */ +enum ice_status +ice_remove_mac_vlan(struct ice_hw *hw, struct list_head *mv_list) +{ + struct ice_sw_recipe *recp_list; + + if (!mv_list || !hw) + return ICE_ERR_PARAM; + + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC_VLAN]; + return ice_remove_mac_vlan_rule(hw, mv_list, recp_list); +} + + +/** + * ice_vsi_uses_fltr - Determine if given VSI uses specified filter + * @fm_entry: filter entry to inspect + * @vsi_handle: VSI handle to compare with filter info + */ +static bool +ice_vsi_uses_fltr(struct ice_fltr_mgmt_list_entry *fm_entry, u16 vsi_handle) +{ + return ((fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI && + fm_entry->fltr_info.vsi_handle == vsi_handle) || + (fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI_LIST && + fm_entry->vsi_list_info && + (test_bit(vsi_handle, fm_entry->vsi_list_info->vsi_map)))); +} + +/** + * ice_add_entry_to_vsi_fltr_list - Add copy of fltr_list_entry to remove list + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to remove filters from + * @vsi_list_head: pointer to the list to add entry to + * @fi: pointer to fltr_info of filter entry to copy & add + * + * Helper function, used when creating a list of filters to remove from + * a specific VSI. The entry added to vsi_list_head is a COPY of the + * original filter entry, with the exception of fltr_info.fltr_act and + * fltr_info.fwd_id fields. These are set such that later logic can + * extract which VSI to remove the fltr from, and pass on that information. + */ +static enum ice_status +ice_add_entry_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_handle, + struct list_head *vsi_list_head, + struct ice_fltr_info *fi) +{ + struct ice_fltr_list_entry *tmp; + + /* this memory is freed up in the caller function + * once filters for this VSI are removed + */ + tmp = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return ICE_ERR_NO_MEMORY; + + tmp->fltr_info = *fi; + + /* Overwrite these fields to indicate which VSI to remove filter from, + * so find and remove logic can extract the information from the + * list entries. Note that original entries will still have proper + * values. + */ + tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI; + tmp->fltr_info.vsi_handle = vsi_handle; + tmp->fltr_info.fwd_id.hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + + list_add(&tmp->list_entry, vsi_list_head); + + return 0; +} + +/** + * ice_add_to_vsi_fltr_list - Add VSI filters to the list + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to remove filters from + * @lkup_list_head: pointer to the list that has certain lookup type filters + * @vsi_list_head: pointer to the list pertaining to VSI with vsi_handle + * + * Locates all filters in lkup_list_head that are used by the given VSI, + * and adds COPIES of those entries to vsi_list_head (intended to be used + * to remove the listed filters). + * Note that this means all entries in vsi_list_head must be explicitly + * deallocated by the caller when done with list. 
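+ *
+ * A typical cleanup of vsi_list_head mirrors the free_fltr_list pattern
+ * used later in this file (sketch, assuming the same entry type used by
+ * those callers):
+ *
+ *	list_for_each_entry_safe(fm_entry, tmp, vsi_list_head, list_entry) {
+ *		list_del(&fm_entry->list_entry);
+ *		devm_kfree(ice_hw_to_dev(hw), fm_entry);
+ *	}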
+ */
+static enum ice_status
+ice_add_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_handle,
+			 struct list_head *lkup_list_head,
+			 struct list_head *vsi_list_head)
+{
+	struct ice_fltr_mgmt_list_entry *fm_entry;
+	enum ice_status status = 0;
+
+	/* check to make sure VSI ID is valid and within boundary */
+	if (!ice_is_vsi_valid(hw, vsi_handle))
+		return ICE_ERR_PARAM;
+
+	list_for_each_entry(fm_entry, lkup_list_head, list_entry) {
+		if (!ice_vsi_uses_fltr(fm_entry, vsi_handle))
+			continue;
+
+		status = ice_add_entry_to_vsi_fltr_list(hw, vsi_handle,
+							vsi_list_head,
+							&fm_entry->fltr_info);
+		if (status)
+			return status;
+	}
+	return status;
+}
+
+/**
+ * ice_determine_promisc_mask - determine the promiscuous mask for a filter
+ * @fi: filter info to parse
+ *
+ * Helper function to determine which ICE_PROMISC_ mask corresponds
+ * to the given filter info.
+ */
+static u8 ice_determine_promisc_mask(struct ice_fltr_info *fi)
+{
+	u16 vid = fi->l_data.mac_vlan.vlan_id;
+	u8 *macaddr = fi->l_data.mac.mac_addr;
+	bool is_tx_fltr = false;
+	u8 promisc_mask = 0;
+
+	if (fi->flag == ICE_FLTR_TX)
+		is_tx_fltr = true;
+
+	if (is_broadcast_ether_addr(macaddr))
+		promisc_mask |= is_tx_fltr ?
+			ICE_PROMISC_BCAST_TX : ICE_PROMISC_BCAST_RX;
+	else if (is_multicast_ether_addr(macaddr))
+		promisc_mask |= is_tx_fltr ?
+			ICE_PROMISC_MCAST_TX : ICE_PROMISC_MCAST_RX;
+	else if (is_unicast_ether_addr(macaddr))
+		promisc_mask |= is_tx_fltr ?
+			ICE_PROMISC_UCAST_TX : ICE_PROMISC_UCAST_RX;
+	if (vid)
+		promisc_mask |= is_tx_fltr ?
+			ICE_PROMISC_VLAN_TX : ICE_PROMISC_VLAN_RX;
-	/* Create the switch rule with the final dummy Ethernet header */
-	if (opc != ice_aqc_opc_update_sw_rules)
-		s_rule->pdata.lkup_tx_rx.hdr_len = cpu_to_le16(eth_hdr_sz);
+	return promisc_mask;
 }
+
 /**
- * ice_add_marker_act
+ * _ice_get_vsi_promisc - get promiscuous mode of given VSI
  * @hw: pointer to the hardware structure
- * @m_ent: the management entry for which sw marker needs to be added
- * @sw_marker: sw marker to tag the Rx descriptor with
- * @l_id: large action resource ID
- *
- * Create a large action to hold software marker and update the switch rule
- * entry pointed by m_ent with newly created large action
+ * @vsi_handle: VSI handle to retrieve info from
+ * @promisc_mask: pointer to mask to be filled in
+ * @vid: VLAN ID of promisc VLAN VSI
+ * @sw: pointer to the switch info struct whose filter rules are searched
  */
 static enum ice_status
-ice_add_marker_act(struct ice_hw *hw, struct ice_fltr_mgmt_list_entry *m_ent,
-		   u16 sw_marker, u16 l_id)
+_ice_get_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask,
+		     u16 *vid, struct ice_switch_info *sw)
 {
-	struct ice_aqc_sw_rules_elem *lg_act, *rx_tx;
-	/* For software marker we need 3 large actions
-	 * 1. FWD action: FWD TO VSI or VSI LIST
-	 * 2. GENERIC VALUE action to hold the profile ID
-	 * 3. GENERIC VALUE action to hold the software marker ID
-	 */
-	const u16 num_lg_acts = 3;
-	enum ice_status status;
-	u16 lg_act_size;
-	u16 rules_size;
-	u32 act;
-	u16 id;
+	struct ice_fltr_mgmt_list_entry *itr;
+	struct list_head *rule_head;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
 
-	if (m_ent->fltr_info.lkup_type != ICE_SW_LKUP_MAC)
+	if (!ice_is_vsi_valid(hw, vsi_handle))
 		return ICE_ERR_PARAM;
 
-	/* Create two back-to-back switch rules and submit them to the HW using
-	 * one memory buffer:
-	 * 1. Large Action
-	 * 2.
Look up Tx Rx - */ - lg_act_size = (u16)ICE_SW_RULE_LG_ACT_SIZE(num_lg_acts); - rules_size = lg_act_size + ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; - lg_act = devm_kzalloc(ice_hw_to_dev(hw), rules_size, GFP_KERNEL); - if (!lg_act) - return ICE_ERR_NO_MEMORY; - - rx_tx = (struct ice_aqc_sw_rules_elem *)((u8 *)lg_act + lg_act_size); - - /* Fill in the first switch rule i.e. large action */ - lg_act->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LG_ACT); - lg_act->pdata.lg_act.index = cpu_to_le16(l_id); - lg_act->pdata.lg_act.size = cpu_to_le16(num_lg_acts); - - /* First action VSI forwarding or VSI list forwarding depending on how - * many VSIs - */ - id = (m_ent->vsi_count > 1) ? m_ent->fltr_info.fwd_id.vsi_list_id : - m_ent->fltr_info.fwd_id.hw_vsi_id; - - act = ICE_LG_ACT_VSI_FORWARDING | ICE_LG_ACT_VALID_BIT; - act |= (id << ICE_LG_ACT_VSI_LIST_ID_S) & - ICE_LG_ACT_VSI_LIST_ID_M; - if (m_ent->vsi_count > 1) - act |= ICE_LG_ACT_VSI_LIST; - lg_act->pdata.lg_act.act[0] = cpu_to_le32(act); + *vid = 0; + *promisc_mask = 0; + rule_head = &sw->recp_list[ICE_SW_LKUP_PROMISC].filt_rules; + rule_lock = &sw->recp_list[ICE_SW_LKUP_PROMISC].filt_rule_lock; - /* Second action descriptor type */ - act = ICE_LG_ACT_GENERIC; + mutex_lock(rule_lock); + list_for_each_entry(itr, rule_head, list_entry) { + /* Continue if this filter doesn't apply to this VSI or the + * VSI ID is not in the VSI map for this filter + */ + if (!ice_vsi_uses_fltr(itr, vsi_handle)) + continue; - act |= (1 << ICE_LG_ACT_GENERIC_VALUE_S) & ICE_LG_ACT_GENERIC_VALUE_M; - lg_act->pdata.lg_act.act[1] = cpu_to_le32(act); + *promisc_mask |= ice_determine_promisc_mask(&itr->fltr_info); + } + mutex_unlock(rule_lock); - act = (ICE_LG_ACT_GENERIC_OFF_RX_DESC_PROF_IDX << - ICE_LG_ACT_GENERIC_OFFSET_S) & ICE_LG_ACT_GENERIC_OFFSET_M; + return 0; +} - /* Third action Marker value */ - act |= ICE_LG_ACT_GENERIC; - act |= (sw_marker << ICE_LG_ACT_GENERIC_VALUE_S) & - ICE_LG_ACT_GENERIC_VALUE_M; +/** + * ice_get_vsi_promisc - get promiscuous mode of given VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to retrieve info from + * @promisc_mask: pointer to mask to be filled in + * @vid: VLAN ID of promisc VLAN VSI + */ +enum ice_status +ice_get_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid) +{ + return _ice_get_vsi_promisc(hw, vsi_handle, promisc_mask, + vid, hw->switch_info); +} - lg_act->pdata.lg_act.act[2] = cpu_to_le32(act); +/** + * _ice_get_vsi_vlan_promisc - get VLAN promiscuous mode of given VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to retrieve info from + * @promisc_mask: pointer to mask to be filled in + * @vid: VLAN ID of promisc VLAN VSI + * @sw: pointer to switch info struct for which function add rule + */ +static enum ice_status +_ice_get_vsi_vlan_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid, struct ice_switch_info *sw) +{ + struct ice_fltr_mgmt_list_entry *itr; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ - /* call the fill switch rule to fill the lookup Tx Rx structure */ - ice_fill_sw_rule(hw, &m_ent->fltr_info, rx_tx, - ice_aqc_opc_update_sw_rules); + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; - /* Update the action to point to the large action ID */ - rx_tx->pdata.lkup_tx_rx.act = - cpu_to_le32(ICE_SINGLE_ACT_PTR | - ((l_id << ICE_SINGLE_ACT_PTR_VAL_S) & - ICE_SINGLE_ACT_PTR_VAL_M)); + *vid = 0; + *promisc_mask = 0; + rule_head = 
&sw->recp_list[ICE_SW_LKUP_PROMISC_VLAN].filt_rules; + rule_lock = &sw->recp_list[ICE_SW_LKUP_PROMISC_VLAN].filt_rule_lock; - /* Use the filter rule ID of the previously created rule with single - * act. Once the update happens, hardware will treat this as large - * action - */ - rx_tx->pdata.lkup_tx_rx.index = - cpu_to_le16(m_ent->fltr_info.fltr_rule_id); + mutex_lock(rule_lock); + list_for_each_entry(itr, rule_head, list_entry) { + /* Continue if this filter doesn't apply to this VSI or the + * VSI ID is not in the VSI map for this filter + */ + if (!ice_vsi_uses_fltr(itr, vsi_handle)) + continue; - status = ice_aq_sw_rules(hw, lg_act, rules_size, 2, - ice_aqc_opc_update_sw_rules, NULL); - if (!status) { - m_ent->lg_act_idx = l_id; - m_ent->sw_marker_id = sw_marker; + *promisc_mask |= ice_determine_promisc_mask(&itr->fltr_info); } + mutex_unlock(rule_lock); - devm_kfree(ice_hw_to_dev(hw), lg_act); - return status; + return 0; } /** - * ice_create_vsi_list_map + * ice_get_vsi_vlan_promisc - get VLAN promiscuous mode of given VSI * @hw: pointer to the hardware structure - * @vsi_handle_arr: array of VSI handles to set in the VSI mapping - * @num_vsi: number of VSI handles in the array - * @vsi_list_id: VSI list ID generated as part of allocate resource - * - * Helper function to create a new entry of VSI list ID to VSI mapping - * using the given VSI list ID + * @vsi_handle: VSI handle to retrieve info from + * @promisc_mask: pointer to mask to be filled in + * @vid: VLAN ID of promisc VLAN VSI */ -static struct ice_vsi_list_map_info * -ice_create_vsi_list_map(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, - u16 vsi_list_id) +enum ice_status +ice_get_vsi_vlan_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_vsi_list_map_info *v_map; - int i; - - v_map = devm_kcalloc(ice_hw_to_dev(hw), 1, sizeof(*v_map), GFP_KERNEL); - if (!v_map) - return NULL; + return _ice_get_vsi_vlan_promisc(hw, vsi_handle, promisc_mask, + vid, hw->switch_info); +} - v_map->vsi_list_id = vsi_list_id; - v_map->ref_cnt = 1; - for (i = 0; i < num_vsi; i++) - set_bit(vsi_handle_arr[i], v_map->vsi_map); +/** + * ice_remove_promisc - Remove promisc based filter rules + * @hw: pointer to the hardware structure + * @recp_id: recipe ID for which the rule needs to removed + * @v_list: list of promisc entries + */ +static enum ice_status +ice_remove_promisc(struct ice_hw *hw, u8 recp_id, + struct list_head *v_list) +{ + struct ice_fltr_list_entry *v_list_itr, *tmp; + struct ice_sw_recipe *recp_list; - list_add(&v_map->list_entry, &sw->vsi_list_map_head); - return v_map; + recp_list = &hw->switch_info->recp_list[recp_id]; + list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) { + v_list_itr->status = + ice_remove_rule_internal(hw, recp_list, v_list_itr); + if (v_list_itr->status) + return v_list_itr->status; + } + return 0; } /** - * ice_update_vsi_list_rule + * _ice_clear_vsi_promisc - clear specified promiscuous mode(s) * @hw: pointer to the hardware structure - * @vsi_handle_arr: array of VSI handles to form a VSI list - * @num_vsi: number of VSI handles in the array - * @vsi_list_id: VSI list ID generated as part of allocate resource - * @remove: Boolean value to indicate if this is a remove action - * @opc: switch rules population command type - pass in the command opcode - * @lkup_type: lookup type of the filter - * - * Call AQ command to add a new switch rule or update existing switch rule - * using the given VSI list ID + 
* @vsi_handle: VSI handle to clear mode + * @promisc_mask: mask of promiscuous config bits to clear + * @vid: VLAN ID to clear VLAN promiscuous + * @sw: pointer to switch info struct for which function add rule */ static enum ice_status -ice_update_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, - u16 vsi_list_id, bool remove, enum ice_adminq_opc opc, - enum ice_sw_lkup_type lkup_type) +_ice_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, struct ice_switch_info *sw) { - struct ice_aqc_sw_rules_elem *s_rule; - enum ice_status status; - u16 s_rule_size; - u16 type; - int i; + struct ice_fltr_list_entry *fm_entry, *tmp; + struct list_head remove_list_head; + struct ice_fltr_mgmt_list_entry *itr; + struct list_head *rule_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + u8 recipe_id; - if (!num_vsi) + if (!ice_is_vsi_valid(hw, vsi_handle)) return ICE_ERR_PARAM; - if (lkup_type == ICE_SW_LKUP_MAC || - lkup_type == ICE_SW_LKUP_MAC_VLAN || - lkup_type == ICE_SW_LKUP_ETHERTYPE || - lkup_type == ICE_SW_LKUP_ETHERTYPE_MAC || - lkup_type == ICE_SW_LKUP_PROMISC || - lkup_type == ICE_SW_LKUP_PROMISC_VLAN) - type = remove ? ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR : - ICE_AQC_SW_RULES_T_VSI_LIST_SET; - else if (lkup_type == ICE_SW_LKUP_VLAN) - type = remove ? ICE_AQC_SW_RULES_T_PRUNE_LIST_CLEAR : - ICE_AQC_SW_RULES_T_PRUNE_LIST_SET; + if (promisc_mask & (ICE_PROMISC_VLAN_RX | ICE_PROMISC_VLAN_TX)) + recipe_id = ICE_SW_LKUP_PROMISC_VLAN; else - return ICE_ERR_PARAM; + recipe_id = ICE_SW_LKUP_PROMISC; - s_rule_size = (u16)ICE_SW_RULE_VSI_LIST_SIZE(num_vsi); - s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); - if (!s_rule) - return ICE_ERR_NO_MEMORY; - for (i = 0; i < num_vsi; i++) { - if (!ice_is_vsi_valid(hw, vsi_handle_arr[i])) { - status = ICE_ERR_PARAM; - goto exit; + rule_head = &sw->recp_list[recipe_id].filt_rules; + rule_lock = &sw->recp_list[recipe_id].filt_rule_lock; + + INIT_LIST_HEAD(&remove_list_head); + + mutex_lock(rule_lock); + list_for_each_entry(itr, rule_head, list_entry) { + struct ice_fltr_info *fltr_info; + u8 fltr_promisc_mask = 0; + + if (!ice_vsi_uses_fltr(itr, vsi_handle)) + continue; + fltr_info = &itr->fltr_info; + + if (recipe_id == ICE_SW_LKUP_PROMISC_VLAN && + vid != fltr_info->l_data.mac_vlan.vlan_id) + continue; + + fltr_promisc_mask |= ice_determine_promisc_mask(fltr_info); + + /* Skip if filter is not completely specified by given mask */ + if (fltr_promisc_mask & ~promisc_mask) + continue; + + status = ice_add_entry_to_vsi_fltr_list(hw, vsi_handle, + &remove_list_head, + fltr_info); + if (status) { + mutex_unlock(rule_lock); + goto free_fltr_list; } - /* AQ call requires hw_vsi_id(s) */ - s_rule->pdata.vsi_list.vsi[i] = - cpu_to_le16(ice_get_hw_vsi_num(hw, vsi_handle_arr[i])); } + mutex_unlock(rule_lock); - s_rule->type = cpu_to_le16(type); - s_rule->pdata.vsi_list.number_vsi = cpu_to_le16(num_vsi); - s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_id); + status = ice_remove_promisc(hw, recipe_id, &remove_list_head); - status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opc, NULL); +free_fltr_list: + list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) { + list_del(&fm_entry->list_entry); + devm_kfree(ice_hw_to_dev(hw), fm_entry); + } -exit: - devm_kfree(ice_hw_to_dev(hw), s_rule); return status; } /** - * ice_create_vsi_list_rule - Creates and populates a VSI list rule - * @hw: pointer to the HW struct - * @vsi_handle_arr: array of 
VSI handles to form a VSI list - * @num_vsi: number of VSI handles in the array - * @vsi_list_id: stores the ID of the VSI list to be created - * @lkup_type: switch rule filter's lookup type + * ice_clear_vsi_promisc - clear specified promiscuous mode(s) for given VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to clear mode + * @promisc_mask: mask of promiscuous config bits to clear + * @vid: VLAN ID to clear VLAN promiscuous */ -static enum ice_status -ice_create_vsi_list_rule(struct ice_hw *hw, u16 *vsi_handle_arr, u16 num_vsi, - u16 *vsi_list_id, enum ice_sw_lkup_type lkup_type) +enum ice_status +ice_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, + u8 promisc_mask, u16 vid) { - enum ice_status status; - - status = ice_aq_alloc_free_vsi_list(hw, vsi_list_id, lkup_type, - ice_aqc_opc_alloc_res); - if (status) - return status; - - /* Update the newly created VSI list to include the specified VSIs */ - return ice_update_vsi_list_rule(hw, vsi_handle_arr, num_vsi, - *vsi_list_id, false, - ice_aqc_opc_add_sw_rules, lkup_type); + return _ice_clear_vsi_promisc(hw, vsi_handle, promisc_mask, + vid, hw->switch_info); } /** - * ice_create_pkt_fwd_rule + * _ice_set_vsi_promisc - set given VSI to given promiscuous mode(s) * @hw: pointer to the hardware structure - * @f_entry: entry containing packet forwarding information - * - * Create switch rule with given filter information and add an entry - * to the corresponding filter management list to track this switch rule - * and VSI mapping + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @vid: VLAN ID to set VLAN promiscuous + * @lport: logical port number to configure promisc mode + * @sw: pointer to switch info struct for which function add rule */ static enum ice_status -ice_create_pkt_fwd_rule(struct ice_hw *hw, - struct ice_fltr_list_entry *f_entry) +_ice_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid, u8 lport, struct ice_switch_info *sw) { - struct ice_fltr_mgmt_list_entry *fm_entry; - struct ice_aqc_sw_rules_elem *s_rule; - enum ice_sw_lkup_type l_type; - struct ice_sw_recipe *recp; - enum ice_status status; + enum { UCAST_FLTR = 1, MCAST_FLTR, BCAST_FLTR }; + struct ice_fltr_list_entry f_list_entry; + struct ice_fltr_info new_fltr; + enum ice_status status = 0; + bool is_tx_fltr; + u16 hw_vsi_id; + int pkt_type; + u8 recipe_id; - s_rule = devm_kzalloc(ice_hw_to_dev(hw), - ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, GFP_KERNEL); - if (!s_rule) - return ICE_ERR_NO_MEMORY; - fm_entry = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*fm_entry), - GFP_KERNEL); - if (!fm_entry) { - status = ICE_ERR_NO_MEMORY; - goto ice_create_pkt_fwd_rule_exit; + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + + memset(&new_fltr, 0, sizeof(new_fltr)); + + if (promisc_mask & (ICE_PROMISC_VLAN_RX | ICE_PROMISC_VLAN_TX)) { + new_fltr.lkup_type = ICE_SW_LKUP_PROMISC_VLAN; + new_fltr.l_data.mac_vlan.vlan_id = vid; + recipe_id = ICE_SW_LKUP_PROMISC_VLAN; + } else { + new_fltr.lkup_type = ICE_SW_LKUP_PROMISC; + recipe_id = ICE_SW_LKUP_PROMISC; } - fm_entry->fltr_info = f_entry->fltr_info; + /* Separate filters must be set for each direction/packet type + * combination, so we will loop over the mask value, store the + * individual type, and clear it out in the input mask as it + * is found. 
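+	 *
+	 * For example, ICE_PROMISC_UCAST_RX | ICE_PROMISC_UCAST_TX takes
+	 * two passes through the loop and programs two separate rules:
+	 * an Rx filter whose source is the logical port and a Tx filter
+	 * whose source is the VSI itself.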
+ */ + while (promisc_mask) { + struct ice_sw_recipe *recp_list; + u8 *mac_addr; - /* Initialize all the fields for the management entry */ - fm_entry->vsi_count = 1; - fm_entry->lg_act_idx = ICE_INVAL_LG_ACT_INDEX; - fm_entry->sw_marker_id = ICE_INVAL_SW_MARKER_ID; - fm_entry->counter_index = ICE_INVAL_COUNTER_ID; + pkt_type = 0; + is_tx_fltr = false; - ice_fill_sw_rule(hw, &fm_entry->fltr_info, s_rule, - ice_aqc_opc_add_sw_rules); + if (promisc_mask & ICE_PROMISC_UCAST_RX) { + promisc_mask &= ~ICE_PROMISC_UCAST_RX; + pkt_type = UCAST_FLTR; + } else if (promisc_mask & ICE_PROMISC_UCAST_TX) { + promisc_mask &= ~ICE_PROMISC_UCAST_TX; + pkt_type = UCAST_FLTR; + is_tx_fltr = true; + } else if (promisc_mask & ICE_PROMISC_MCAST_RX) { + promisc_mask &= ~ICE_PROMISC_MCAST_RX; + pkt_type = MCAST_FLTR; + } else if (promisc_mask & ICE_PROMISC_MCAST_TX) { + promisc_mask &= ~ICE_PROMISC_MCAST_TX; + pkt_type = MCAST_FLTR; + is_tx_fltr = true; + } else if (promisc_mask & ICE_PROMISC_BCAST_RX) { + promisc_mask &= ~ICE_PROMISC_BCAST_RX; + pkt_type = BCAST_FLTR; + } else if (promisc_mask & ICE_PROMISC_BCAST_TX) { + promisc_mask &= ~ICE_PROMISC_BCAST_TX; + pkt_type = BCAST_FLTR; + is_tx_fltr = true; + } - status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1, - ice_aqc_opc_add_sw_rules, NULL); - if (status) { - devm_kfree(ice_hw_to_dev(hw), fm_entry); - goto ice_create_pkt_fwd_rule_exit; - } + /* Check for VLAN promiscuous flag */ + if (promisc_mask & ICE_PROMISC_VLAN_RX) { + promisc_mask &= ~ICE_PROMISC_VLAN_RX; + } else if (promisc_mask & ICE_PROMISC_VLAN_TX) { + promisc_mask &= ~ICE_PROMISC_VLAN_TX; + is_tx_fltr = true; + } - f_entry->fltr_info.fltr_rule_id = - le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); - fm_entry->fltr_info.fltr_rule_id = - le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + /* Set filter DA based on packet type */ + mac_addr = new_fltr.l_data.mac.mac_addr; + if (pkt_type == BCAST_FLTR) { + eth_broadcast_addr(mac_addr); + } else if (pkt_type == MCAST_FLTR || + pkt_type == UCAST_FLTR) { + /* Use the dummy ether header DA */ + ether_addr_copy(mac_addr, dummy_eth_header); + if (pkt_type == MCAST_FLTR) + mac_addr[0] |= 0x1; /* Set multicast bit */ + } - /* The book keeping entries will get removed when base driver - * calls remove filter AQ command - */ - l_type = fm_entry->fltr_info.lkup_type; - recp = &hw->switch_info->recp_list[l_type]; - list_add(&fm_entry->list_entry, &recp->filt_rules); + /* Need to reset this to zero for all iterations */ + new_fltr.flag = 0; + if (is_tx_fltr) { + new_fltr.flag |= ICE_FLTR_TX; + new_fltr.src = hw_vsi_id; + } else { + new_fltr.flag |= ICE_FLTR_RX; + new_fltr.src = lport; + } -ice_create_pkt_fwd_rule_exit: - devm_kfree(ice_hw_to_dev(hw), s_rule); + new_fltr.fltr_act = ICE_FWD_TO_VSI; + new_fltr.vsi_handle = vsi_handle; + new_fltr.fwd_id.hw_vsi_id = hw_vsi_id; + f_list_entry.fltr_info = new_fltr; + recp_list = &sw->recp_list[recipe_id]; + + status = ice_add_rule_internal(hw, recp_list, lport, + &f_list_entry); + if (status) + goto set_promisc_exit; + } + +set_promisc_exit: return status; } /** - * ice_update_pkt_fwd_rule + * ice_set_vsi_promisc - set given VSI to given promiscuous mode(s) * @hw: pointer to the hardware structure - * @f_info: filter information for switch rule - * - * Call AQ command to update a previously created switch rule with a - * VSI list ID + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @vid: VLAN ID to set VLAN promiscuous */ -static enum ice_status 
-ice_update_pkt_fwd_rule(struct ice_hw *hw, struct ice_fltr_info *f_info) +enum ice_status +ice_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + u16 vid) { - struct ice_aqc_sw_rules_elem *s_rule; - enum ice_status status; - - s_rule = devm_kzalloc(ice_hw_to_dev(hw), - ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, GFP_KERNEL); - if (!s_rule) - return ICE_ERR_NO_MEMORY; - - ice_fill_sw_rule(hw, f_info, s_rule, ice_aqc_opc_update_sw_rules); - - s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(f_info->fltr_rule_id); - - /* Update switch rule with new rule set to forward VSI list */ - status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1, - ice_aqc_opc_update_sw_rules, NULL); - - devm_kfree(ice_hw_to_dev(hw), s_rule); - return status; + return _ice_set_vsi_promisc(hw, vsi_handle, promisc_mask, vid, + hw->port_info->lport, + hw->switch_info); } /** - * ice_update_sw_rule_bridge_mode - * @hw: pointer to the HW struct + * _ice_set_vlan_vsi_promisc + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @rm_vlan_promisc: Clear VLANs VSI promisc mode + * @lport: logical port number to configure promisc mode + * @sw: pointer to switch info struct for which function add rule * - * Updates unicast switch filter rules based on VEB/VEPA mode + * Configure VSI with all associated VLANs to given promiscuous mode(s) */ -enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw) +static enum ice_status +_ice_set_vlan_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + bool rm_vlan_promisc, u8 lport, + struct ice_switch_info *sw) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *fm_entry; - enum ice_status status = 0; - struct list_head *rule_head; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - - rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; - rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules; + struct ice_fltr_list_entry *list_itr, *tmp; + struct list_head vsi_list_head; + struct list_head *vlan_head; + struct mutex *vlan_lock; /* Lock to protect filter rule list */ + enum ice_status status; + u16 vlan_id; - mutex_lock(rule_lock); - list_for_each_entry(fm_entry, rule_head, list_entry) { - struct ice_fltr_info *fi = &fm_entry->fltr_info; - u8 *addr = fi->l_data.mac.mac_addr; + INIT_LIST_HEAD(&vsi_list_head); + vlan_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock; + vlan_head = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rules; + mutex_lock(vlan_lock); + status = ice_add_to_vsi_fltr_list(hw, vsi_handle, vlan_head, + &vsi_list_head); + mutex_unlock(vlan_lock); + if (status) + goto free_fltr_list; - /* Update unicast Tx rules to reflect the selected - * VEB/VEPA mode - */ - if ((fi->flag & ICE_FLTR_TX) && is_unicast_ether_addr(addr) && - (fi->fltr_act == ICE_FWD_TO_VSI || - fi->fltr_act == ICE_FWD_TO_VSI_LIST || - fi->fltr_act == ICE_FWD_TO_Q || - fi->fltr_act == ICE_FWD_TO_QGRP)) { - status = ice_update_pkt_fwd_rule(hw, fi); - if (status) - break; - } + list_for_each_entry(list_itr, &vsi_list_head, list_entry) { + vlan_id = list_itr->fltr_info.l_data.vlan.vlan_id; + if (rm_vlan_promisc) + status = _ice_clear_vsi_promisc(hw, vsi_handle, + promisc_mask, + vlan_id, sw); + else + status = _ice_set_vsi_promisc(hw, vsi_handle, + promisc_mask, vlan_id, + lport, sw); + if (status) + break; } - mutex_unlock(rule_lock); - +free_fltr_list: + list_for_each_entry_safe(list_itr, tmp, &vsi_list_head, list_entry) { + 
list_del(&list_itr->list_entry); + devm_kfree(ice_hw_to_dev(hw), list_itr); + } return status; } /** - * ice_add_update_vsi_list + * ice_set_vlan_vsi_promisc * @hw: pointer to the hardware structure - * @m_entry: pointer to current filter management list entry - * @cur_fltr: filter information from the book keeping entry - * @new_fltr: filter information with the new VSI to be added - * - * Call AQ command to add or update previously created VSI list with new VSI. + * @vsi_handle: VSI handle to configure + * @promisc_mask: mask of promiscuous config bits + * @rm_vlan_promisc: Clear VLANs VSI promisc mode * - * Helper function to do book keeping associated with adding filter information - * The algorithm to do the book keeping is described below : - * When a VSI needs to subscribe to a given filter (MAC/VLAN/Ethtype etc.) - * if only one VSI has been added till now - * Allocate a new VSI list and add two VSIs - * to this list using switch rule command - * Update the previously created switch rule with the - * newly created VSI list ID - * if a VSI list was previously created - * Add the new VSI to the previously created VSI list set - * using the update switch rule command + * Configure VSI with all associated VLANs to given promiscuous mode(s) */ -static enum ice_status -ice_add_update_vsi_list(struct ice_hw *hw, - struct ice_fltr_mgmt_list_entry *m_entry, - struct ice_fltr_info *cur_fltr, - struct ice_fltr_info *new_fltr) +enum ice_status +ice_set_vlan_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, + bool rm_vlan_promisc) { - enum ice_status status = 0; - u16 vsi_list_id = 0; - - if ((cur_fltr->fltr_act == ICE_FWD_TO_Q || - cur_fltr->fltr_act == ICE_FWD_TO_QGRP)) - return ICE_ERR_NOT_IMPL; - - if ((new_fltr->fltr_act == ICE_FWD_TO_Q || - new_fltr->fltr_act == ICE_FWD_TO_QGRP) && - (cur_fltr->fltr_act == ICE_FWD_TO_VSI || - cur_fltr->fltr_act == ICE_FWD_TO_VSI_LIST)) - return ICE_ERR_NOT_IMPL; - - if (m_entry->vsi_count < 2 && !m_entry->vsi_list_info) { - /* Only one entry existed in the mapping and it was not already - * a part of a VSI list. So, create a VSI list with the old and - * new VSIs. 
- */ - struct ice_fltr_info tmp_fltr; - u16 vsi_handle_arr[2]; - - /* A rule already exists with the new VSI being added */ - if (cur_fltr->fwd_id.hw_vsi_id == new_fltr->fwd_id.hw_vsi_id) - return ICE_ERR_ALREADY_EXISTS; - - vsi_handle_arr[0] = cur_fltr->vsi_handle; - vsi_handle_arr[1] = new_fltr->vsi_handle; - status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, - &vsi_list_id, - new_fltr->lkup_type); - if (status) - return status; - - tmp_fltr = *new_fltr; - tmp_fltr.fltr_rule_id = cur_fltr->fltr_rule_id; - tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; - tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; - /* Update the previous switch rule of "MAC forward to VSI" to - * "MAC fwd to VSI list" - */ - status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); - if (status) - return status; - - cur_fltr->fwd_id.vsi_list_id = vsi_list_id; - cur_fltr->fltr_act = ICE_FWD_TO_VSI_LIST; - m_entry->vsi_list_info = - ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, - vsi_list_id); - - if (!m_entry->vsi_list_info) - return ICE_ERR_NO_MEMORY; - - /* If this entry was large action then the large action needs - * to be updated to point to FWD to VSI list - */ - if (m_entry->sw_marker_id != ICE_INVAL_SW_MARKER_ID) - status = - ice_add_marker_act(hw, m_entry, - m_entry->sw_marker_id, - m_entry->lg_act_idx); - } else { - u16 vsi_handle = new_fltr->vsi_handle; - enum ice_adminq_opc opcode; + return _ice_set_vlan_vsi_promisc(hw, vsi_handle, promisc_mask, + rm_vlan_promisc, hw->port_info->lport, + hw->switch_info); +} - if (!m_entry->vsi_list_info) - return ICE_ERR_CFG; +/** + * ice_remove_vsi_lkup_fltr - Remove lookup type filters for a VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to remove filters from + * @recp_list: recipe list from which function remove fltr + * @lkup: switch rule filter lookup type + */ +static void +ice_remove_vsi_lkup_fltr(struct ice_hw *hw, u16 vsi_handle, + struct ice_sw_recipe *recp_list, + enum ice_sw_lkup_type lkup) +{ + struct ice_fltr_list_entry *fm_entry; + struct list_head remove_list_head; + struct list_head *rule_head; + struct ice_fltr_list_entry *tmp; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status; - /* A rule already exists with the new VSI being added */ - if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) - return 0; + INIT_LIST_HEAD(&remove_list_head); + rule_lock = &recp_list[lkup].filt_rule_lock; + rule_head = &recp_list[lkup].filt_rules; + mutex_lock(rule_lock); + status = ice_add_to_vsi_fltr_list(hw, vsi_handle, rule_head, + &remove_list_head); + mutex_unlock(rule_lock); + if (status) + goto free_fltr_list; - /* Update the previously created VSI list set with - * the new VSI ID passed in - */ - vsi_list_id = cur_fltr->fwd_id.vsi_list_id; - opcode = ice_aqc_opc_update_sw_rules; + switch (lkup) { + case ICE_SW_LKUP_MAC: + ice_remove_mac_rule(hw, &remove_list_head, &recp_list[lkup]); + break; + case ICE_SW_LKUP_VLAN: + ice_remove_vlan_rule(hw, &remove_list_head, &recp_list[lkup]); + break; + case ICE_SW_LKUP_PROMISC: + case ICE_SW_LKUP_PROMISC_VLAN: + ice_remove_promisc(hw, lkup, &remove_list_head); + break; + case ICE_SW_LKUP_MAC_VLAN: + ice_remove_mac_vlan(hw, &remove_list_head); + break; + case ICE_SW_LKUP_ETHERTYPE: + case ICE_SW_LKUP_ETHERTYPE_MAC: + ice_remove_eth_mac(hw, &remove_list_head); + break; + case ICE_SW_LKUP_DFLT: + ice_debug(hw, ICE_DBG_SW, "Remove filters for this lookup type hasn't been implemented yet\n"); + break; + case ICE_SW_LKUP_LAST: + ice_debug(hw, ICE_DBG_SW, 
"Unsupported lookup type\n"); + break; + } - status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, - vsi_list_id, false, opcode, - new_fltr->lkup_type); - /* update VSI list mapping info with new VSI ID */ - if (!status) - set_bit(vsi_handle, m_entry->vsi_list_info->vsi_map); +free_fltr_list: + list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) { + list_del(&fm_entry->list_entry); + devm_kfree(ice_hw_to_dev(hw), fm_entry); } - if (!status) - m_entry->vsi_count++; - return status; } /** - * ice_find_rule_entry - Search a rule entry + * ice_remove_vsi_fltr_rule - Remove all filters for a VSI * @hw: pointer to the hardware structure - * @recp_id: lookup type for which the specified rule needs to be searched - * @f_info: rule information - * - * Helper function to search for a given rule entry - * Returns pointer to entry storing the rule if found + * @vsi_handle: VSI handle to remove filters from + * @sw: pointer to switch info struct */ -static struct ice_fltr_mgmt_list_entry * -ice_find_rule_entry(struct ice_hw *hw, u8 recp_id, struct ice_fltr_info *f_info) +static void +ice_remove_vsi_fltr_rule(struct ice_hw *hw, u16 vsi_handle, + struct ice_switch_info *sw) { - struct ice_fltr_mgmt_list_entry *list_itr, *ret = NULL; - struct ice_switch_info *sw = hw->switch_info; - struct list_head *list_head; + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_MAC); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_MAC_VLAN); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_PROMISC); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_VLAN); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_DFLT); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_ETHERTYPE); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_ETHERTYPE_MAC); + ice_remove_vsi_lkup_fltr(hw, vsi_handle, + sw->recp_list, ICE_SW_LKUP_PROMISC_VLAN); +} - list_head = &sw->recp_list[recp_id].filt_rules; - list_for_each_entry(list_itr, list_head, list_entry) { - if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data, - sizeof(f_info->l_data)) && - f_info->flag == list_itr->fltr_info.flag) { - ret = list_itr; - break; - } - } - return ret; + +/** + * ice_remove_vsi_fltr - Remove all filters for a VSI + * @hw: pointer to the hardware structure + * @vsi_handle: VSI handle to remove filters from + */ +void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle) +{ + ice_remove_vsi_fltr_rule(hw, vsi_handle, hw->switch_info); } /** - * ice_find_vsi_list_entry - Search VSI list map with VSI count 1 + * ice_alloc_res_cntr - allocating resource counter * @hw: pointer to the hardware structure - * @recp_id: lookup type for which VSI lists needs to be searched - * @vsi_handle: VSI handle to be found in VSI list - * @vsi_list_id: VSI list ID found containing vsi_handle - * - * Helper function to search a VSI list with single entry containing given VSI - * handle element. This can be extended further to search VSI list with more - * than 1 vsi_count. Returns pointer to VSI list entry if found. 
+ * @type: type of resource + * @alloc_shared: if set it is shared else dedicated + * @num_items: number of entries requested for FD resource type + * @counter_id: counter index returned by AQ call */ -static struct ice_vsi_list_map_info * -ice_find_vsi_list_entry(struct ice_hw *hw, u8 recp_id, u16 vsi_handle, - u16 *vsi_list_id) +enum ice_status +ice_alloc_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, + u16 *counter_id) { - struct ice_vsi_list_map_info *map_info = NULL; - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *list_itr; - struct list_head *list_head; + struct ice_aqc_alloc_free_res_elem *buf; + enum ice_status status; + u16 buf_len; - list_head = &sw->recp_list[recp_id].filt_rules; - list_for_each_entry(list_itr, list_head, list_entry) { - if (list_itr->vsi_count == 1 && list_itr->vsi_list_info) { - map_info = list_itr->vsi_list_info; - if (test_bit(vsi_handle, map_info->vsi_map)) { - *vsi_list_id = map_info->vsi_list_id; - return map_info; - } - } - } - return NULL; + /* Allocate resource */ + buf_len = struct_size(buf, elem, 1); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + + buf->num_elems = cpu_to_le16(num_items); + buf->res_type = cpu_to_le16(((type << ICE_AQC_RES_TYPE_S) & + ICE_AQC_RES_TYPE_M) | alloc_shared); + + status = ice_aq_alloc_free_res(hw, 1, buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (status) + goto exit; + + *counter_id = le16_to_cpu(buf->elem[0].e.sw_resp); + +exit: + devm_kfree(ice_hw_to_dev(hw), buf); + return status; } /** - * ice_add_rule_internal - add rule for a given lookup type + * ice_free_res_cntr - free resource counter * @hw: pointer to the hardware structure - * @recp_id: lookup type (recipe ID) for which rule has to be added - * @f_entry: structure containing MAC forwarding information - * - * Adds or updates the rule lists for a given recipe + * @type: type of resource + * @alloc_shared: if set it is shared else dedicated + * @num_items: number of entries to be freed for FD resource type + * @counter_id: counter ID resource which needs to be freed */ -static enum ice_status -ice_add_rule_internal(struct ice_hw *hw, u8 recp_id, - struct ice_fltr_list_entry *f_entry) +enum ice_status +ice_free_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, + u16 counter_id) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_info *new_fltr, *cur_fltr; - struct ice_fltr_mgmt_list_entry *m_entry; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; + struct ice_aqc_alloc_free_res_elem *buf; + enum ice_status status; + u16 buf_len; - if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) - return ICE_ERR_PARAM; - f_entry->fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); + /* Free resource */ + buf_len = struct_size(buf, elem, 1); + buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; - rule_lock = &sw->recp_list[recp_id].filt_rule_lock; + buf->num_elems = cpu_to_le16(num_items); + buf->res_type = cpu_to_le16(((type << ICE_AQC_RES_TYPE_S) & + ICE_AQC_RES_TYPE_M) | alloc_shared); + buf->elem[0].e.sw_resp = cpu_to_le16(counter_id); - mutex_lock(rule_lock); - new_fltr = &f_entry->fltr_info; - if (new_fltr->flag & ICE_FLTR_RX) - new_fltr->src = hw->port_info->lport; - else if (new_fltr->flag & ICE_FLTR_TX) - new_fltr->src = f_entry->fltr_info.fwd_id.hw_vsi_id; + status = 
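+		 /* hand the counter back via the free-resources AQ command */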
ice_aq_alloc_free_res(hw, 1, buf, buf_len,
+				       ice_aqc_opc_free_res, NULL);
+	if (status)
+		ice_debug(hw, ICE_DBG_SW, "counter resource could not be freed\n");
-	m_entry = ice_find_rule_entry(hw, recp_id, new_fltr);
-	if (!m_entry) {
-		mutex_unlock(rule_lock);
-		return ice_create_pkt_fwd_rule(hw, f_entry);
-	}
+	devm_kfree(ice_hw_to_dev(hw), buf);
+	return status;
+}
-	cur_fltr = &m_entry->fltr_info;
-	status = ice_add_update_vsi_list(hw, m_entry, cur_fltr, new_fltr);
-	mutex_unlock(rule_lock);
+/**
+ * ice_alloc_vlan_res_counter - obtain counter resource for VLAN type
+ * @hw: pointer to the hardware structure
+ * @counter_id: returns counter index
+ */
+enum ice_status ice_alloc_vlan_res_counter(struct ice_hw *hw, u16 *counter_id)
+{
+	return ice_alloc_res_cntr(hw, ICE_AQC_RES_TYPE_VLAN_COUNTER,
+				  ICE_AQC_RES_TYPE_FLAG_DEDICATED, 1,
+				  counter_id);
+}
-	return status;
+/**
+ * ice_free_vlan_res_counter - Free counter resource for VLAN type
+ * @hw: pointer to the hardware structure
+ * @counter_id: counter index to be freed
+ */
+enum ice_status ice_free_vlan_res_counter(struct ice_hw *hw, u16 counter_id)
+{
+	return ice_free_res_cntr(hw, ICE_AQC_RES_TYPE_VLAN_COUNTER,
+				 ICE_AQC_RES_TYPE_FLAG_DEDICATED, 1,
+				 counter_id);
 }
 
 /**
- * ice_remove_vsi_list_rule
+ * ice_alloc_res_lg_act - add large action resource
  * @hw: pointer to the hardware structure
- * @vsi_list_id: VSI list ID generated as part of allocate resource
- * @lkup_type: switch rule filter lookup type
- *
- * The VSI list should be emptied before this function is called to remove the
- * VSI list.
+ * @l_id: large action ID to fill it in
+ * @num_acts: number of actions to hold with a large action entry
  */
 static enum ice_status
-ice_remove_vsi_list_rule(struct ice_hw *hw, u16 vsi_list_id,
-			 enum ice_sw_lkup_type lkup_type)
+ice_alloc_res_lg_act(struct ice_hw *hw, u16 *l_id, u16 num_acts)
 {
-	struct ice_aqc_sw_rules_elem *s_rule;
+	struct ice_aqc_alloc_free_res_elem *sw_buf;
 	enum ice_status status;
-	u16 s_rule_size;
+	u16 buf_len;
 
-	s_rule_size = (u16)ICE_SW_RULE_VSI_LIST_SIZE(0);
-	s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL);
-	if (!s_rule)
+	if (num_acts > ICE_MAX_LG_ACT || num_acts == 0)
+		return ICE_ERR_PARAM;
+
+	/* Allocate resource for large action */
+	buf_len = struct_size(sw_buf, elem, 1);
+	sw_buf = devm_kzalloc(ice_hw_to_dev(hw), buf_len, GFP_KERNEL);
+	if (!sw_buf)
 		return ICE_ERR_NO_MEMORY;
 
-	s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR);
-	s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_id);
+	sw_buf->num_elems = cpu_to_le16(1);
-	/* Free the vsi_list resource that we allocated. It is assumed that the
-	 * list is empty at this point.
+	/* If num_acts is 1, use ICE_AQC_RES_TYPE_WIDE_TABLE_1.
+	 * If num_acts is 2, use ICE_AQC_RES_TYPE_WIDE_TABLE_2.
+	 * If num_acts is greater than 2, then use
+	 * ICE_AQC_RES_TYPE_WIDE_TABLE_4.
+	 * The num_acts cannot exceed 4. This was ensured at the
+	 * beginning of the function.
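+	 *
+	 * For example, the software-marker path in
+	 * ice_add_mac_with_sw_marker() below asks for three actions, so
+	 * its allocation is served from the WIDE_TABLE_4 pool.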
*/ - status = ice_aq_alloc_free_vsi_list(hw, &vsi_list_id, lkup_type, - ice_aqc_opc_free_res); + if (num_acts == 1) + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_WIDE_TABLE_1); + else if (num_acts == 2) + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_WIDE_TABLE_2); + else + sw_buf->res_type = cpu_to_le16(ICE_AQC_RES_TYPE_WIDE_TABLE_4); - devm_kfree(ice_hw_to_dev(hw), s_rule); + status = ice_aq_alloc_free_res(hw, 1, sw_buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (!status) + *l_id = le16_to_cpu(sw_buf->elem[0].e.sw_resp); + + devm_kfree(ice_hw_to_dev(hw), sw_buf); return status; } /** - * ice_rem_update_vsi_list + * ice_add_mac_with_sw_marker - add filter with sw marker * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle of the VSI to remove - * @fm_list: filter management entry for which the VSI list management needs to - * be done + * @f_info: filter info structure containing the MAC filter information + * @sw_marker: sw marker to tag the Rx descriptor with */ -static enum ice_status -ice_rem_update_vsi_list(struct ice_hw *hw, u16 vsi_handle, - struct ice_fltr_mgmt_list_entry *fm_list) +enum ice_status +ice_add_mac_with_sw_marker(struct ice_hw *hw, struct ice_fltr_info *f_info, + u16 sw_marker) { - enum ice_sw_lkup_type lkup_type; - enum ice_status status = 0; - u16 vsi_list_id; + struct ice_fltr_mgmt_list_entry *m_entry; + struct ice_fltr_list_entry fl_info; + struct ice_sw_recipe *recp_list; + struct list_head l_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status ret; + bool entry_exists; + u16 lg_act_id; - if (fm_list->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST || - fm_list->vsi_count == 0) + if (f_info->fltr_act != ICE_FWD_TO_VSI) return ICE_ERR_PARAM; - /* A rule with the VSI being removed does not exist */ - if (!test_bit(vsi_handle, fm_list->vsi_list_info->vsi_map)) - return ICE_ERR_DOES_NOT_EXIST; + if (f_info->lkup_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; - lkup_type = fm_list->fltr_info.lkup_type; - vsi_list_id = fm_list->fltr_info.fwd_id.vsi_list_id; - status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, vsi_list_id, true, - ice_aqc_opc_update_sw_rules, - lkup_type); - if (status) - return status; + if (sw_marker == ICE_INVAL_SW_MARKER_ID) + return ICE_ERR_PARAM; - fm_list->vsi_count--; - clear_bit(vsi_handle, fm_list->vsi_list_info->vsi_map); + if (!ice_is_vsi_valid(hw, f_info->vsi_handle)) + return ICE_ERR_PARAM; + f_info->fwd_id.hw_vsi_id = ice_get_hw_vsi_num(hw, f_info->vsi_handle); - if (fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) { - struct ice_fltr_info tmp_fltr_info = fm_list->fltr_info; - struct ice_vsi_list_map_info *vsi_list_info = - fm_list->vsi_list_info; - u16 rem_vsi_handle; + /* Add filter if it doesn't exist so then the adding of large + * action always results in update + */ - rem_vsi_handle = find_first_bit(vsi_list_info->vsi_map, - ICE_MAX_VSI); - if (!ice_is_vsi_valid(hw, rem_vsi_handle)) - return ICE_ERR_OUT_OF_RANGE; + INIT_LIST_HEAD(&l_head); + fl_info.fltr_info = *f_info; + list_add(&fl_info.list_entry, &l_head); - /* Make sure VSI list is empty before removing it below */ - status = ice_update_vsi_list_rule(hw, &rem_vsi_handle, 1, - vsi_list_id, true, - ice_aqc_opc_update_sw_rules, - lkup_type); - if (status) - return status; + entry_exists = false; + ret = ice_add_mac_rule(hw, &l_head, hw->switch_info, + hw->port_info->lport); + if (ret == ICE_ERR_ALREADY_EXISTS) + entry_exists = true; + else if (ret) + return ret; - tmp_fltr_info.fltr_act = ICE_FWD_TO_VSI; - 
tmp_fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, rem_vsi_handle); - tmp_fltr_info.vsi_handle = rem_vsi_handle; - status = ice_update_pkt_fwd_rule(hw, &tmp_fltr_info); - if (status) { - ice_debug(hw, ICE_DBG_SW, - "Failed to update pkt fwd rule to FWD_TO_VSI on HW VSI %d, error %d\n", - tmp_fltr_info.fwd_id.hw_vsi_id, status); - return status; - } + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC]; + rule_lock = &recp_list->filt_rule_lock; + mutex_lock(rule_lock); + /* Get the book keeping entry for the filter */ + m_entry = ice_find_rule_entry(&recp_list->filt_rules, f_info); + if (!m_entry) + goto exit_error; - fm_list->fltr_info = tmp_fltr_info; + /* If counter action was enabled for this rule then don't enable + * sw marker large action + */ + if (m_entry->counter_index != ICE_INVAL_COUNTER_ID) { + ret = ICE_ERR_PARAM; + goto exit_error; } - if ((fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) || - (fm_list->vsi_count == 0 && lkup_type == ICE_SW_LKUP_VLAN)) { - struct ice_vsi_list_map_info *vsi_list_info = - fm_list->vsi_list_info; + /* if same marker was added before */ + if (m_entry->sw_marker_id == sw_marker) { + ret = ICE_ERR_ALREADY_EXISTS; + goto exit_error; + } - /* Remove the VSI list since it is no longer used */ - status = ice_remove_vsi_list_rule(hw, vsi_list_id, lkup_type); - if (status) { - ice_debug(hw, ICE_DBG_SW, - "Failed to remove VSI list %d, error %d\n", - vsi_list_id, status); - return status; - } + /* Allocate a hardware table entry to hold large act. Three actions + * for marker based large action + */ + ret = ice_alloc_res_lg_act(hw, &lg_act_id, 3); + if (ret) + goto exit_error; - list_del(&vsi_list_info->list_entry); - devm_kfree(ice_hw_to_dev(hw), vsi_list_info); - fm_list->vsi_list_info = NULL; + if (lg_act_id == ICE_INVAL_LG_ACT_INDEX) + goto exit_error; + + /* Update the switch rule to add the marker action */ + ret = ice_add_marker_act(hw, m_entry, sw_marker, lg_act_id); + if (!ret) { + mutex_unlock(rule_lock); + return ret; } - return status; +exit_error: + mutex_unlock(rule_lock); + /* only remove entry if it did not exist previously */ + if (!entry_exists) + ret = ice_remove_mac(hw, &l_head); + + return ret; } /** - * ice_remove_rule_internal - Remove a filter rule of a given type + * ice_add_mac_with_counter - add filter with counter enabled * @hw: pointer to the hardware structure - * @recp_id: recipe ID for which the rule needs to removed - * @f_entry: rule entry containing filter information + * @f_info: pointer to filter info structure containing the MAC filter + * information */ -static enum ice_status -ice_remove_rule_internal(struct ice_hw *hw, u8 recp_id, - struct ice_fltr_list_entry *f_entry) +enum ice_status +ice_add_mac_with_counter(struct ice_hw *hw, struct ice_fltr_info *f_info) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *list_elem; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; - bool remove_rule = false; - u16 vsi_handle; + struct ice_fltr_mgmt_list_entry *m_entry; + struct ice_fltr_list_entry fl_info; + struct ice_sw_recipe *recp_list; + struct list_head l_head; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status ret; + bool entry_exist; + u16 counter_id; + u16 lg_act_id; - if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) + if (f_info->fltr_act != ICE_FWD_TO_VSI) return ICE_ERR_PARAM; - f_entry->fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); - 
rule_lock = &sw->recp_list[recp_id].filt_rule_lock; + if (f_info->lkup_type != ICE_SW_LKUP_MAC) + return ICE_ERR_PARAM; + + if (!ice_is_vsi_valid(hw, f_info->vsi_handle)) + return ICE_ERR_PARAM; + f_info->fwd_id.hw_vsi_id = ice_get_hw_vsi_num(hw, f_info->vsi_handle); + recp_list = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC]; + + entry_exist = false; + + rule_lock = &recp_list->filt_rule_lock; + + /* Add filter if it doesn't exist so then the adding of large + * action always results in update + */ + INIT_LIST_HEAD(&l_head); + + fl_info.fltr_info = *f_info; + list_add(&fl_info.list_entry, &l_head); + + ret = ice_add_mac_rule(hw, &l_head, hw->switch_info, + hw->port_info->lport); + if (ret == ICE_ERR_ALREADY_EXISTS) + entry_exist = true; + else if (ret) + return ret; + mutex_lock(rule_lock); - list_elem = ice_find_rule_entry(hw, recp_id, &f_entry->fltr_info); - if (!list_elem) { - status = ICE_ERR_DOES_NOT_EXIST; - goto exit; + m_entry = ice_find_rule_entry(&recp_list->filt_rules, f_info); + if (!m_entry) { + ret = ICE_ERR_BAD_PTR; + goto exit_error; } - if (list_elem->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST) { - remove_rule = true; - } else if (!list_elem->vsi_list_info) { - status = ICE_ERR_DOES_NOT_EXIST; - goto exit; - } else if (list_elem->vsi_list_info->ref_cnt > 1) { - /* a ref_cnt > 1 indicates that the vsi_list is being - * shared by multiple rules. Decrement the ref_cnt and - * remove this rule, but do not modify the list, as it - * is in-use by other rules. - */ - list_elem->vsi_list_info->ref_cnt--; - remove_rule = true; - } else { - /* a ref_cnt of 1 indicates the vsi_list is only used - * by one rule. However, the original removal request is only - * for a single VSI. Update the vsi_list first, and only - * remove the rule if there are no further VSIs in this list. - */ - vsi_handle = f_entry->fltr_info.vsi_handle; - status = ice_rem_update_vsi_list(hw, vsi_handle, list_elem); - if (status) - goto exit; - /* if VSI count goes to zero after updating the VSI list */ - if (list_elem->vsi_count == 0) - remove_rule = true; + /* Don't enable counter for a filter for which sw marker was enabled */ + if (m_entry->sw_marker_id != ICE_INVAL_SW_MARKER_ID) { + ret = ICE_ERR_PARAM; + goto exit_error; } - if (remove_rule) { - /* Remove the lookup rule */ - struct ice_aqc_sw_rules_elem *s_rule; - - s_rule = devm_kzalloc(ice_hw_to_dev(hw), - ICE_SW_RULE_RX_TX_NO_HDR_SIZE, - GFP_KERNEL); - if (!s_rule) { - status = ICE_ERR_NO_MEMORY; - goto exit; - } - - ice_fill_sw_rule(hw, &list_elem->fltr_info, s_rule, - ice_aqc_opc_remove_sw_rules); + /* If a counter was already enabled then don't need to add again */ + if (m_entry->counter_index != ICE_INVAL_COUNTER_ID) { + ret = ICE_ERR_ALREADY_EXISTS; + goto exit_error; + } - status = ice_aq_sw_rules(hw, s_rule, - ICE_SW_RULE_RX_TX_NO_HDR_SIZE, 1, - ice_aqc_opc_remove_sw_rules, NULL); + /* Allocate a hardware table entry to VLAN counter */ + ret = ice_alloc_vlan_res_counter(hw, &counter_id); + if (ret) + goto exit_error; - /* Remove a book keeping from the list */ - devm_kfree(ice_hw_to_dev(hw), s_rule); + /* Allocate a hardware table entry to hold large act. 
Two actions for + * counter based large action + */ + ret = ice_alloc_res_lg_act(hw, &lg_act_id, 2); + if (ret) + goto exit_error; - if (status) - goto exit; + if (lg_act_id == ICE_INVAL_LG_ACT_INDEX) + goto exit_error; - list_del(&list_elem->list_entry); - devm_kfree(ice_hw_to_dev(hw), list_elem); + /* Update the switch rule to add the counter action */ + ret = ice_add_counter_act(hw, m_entry, counter_id, lg_act_id); + if (!ret) { + mutex_unlock(rule_lock); + return ret; } -exit: + +exit_error: mutex_unlock(rule_lock); - return status; + /* only remove entry if it did not exist previously */ + if (!entry_exist) + ret = ice_remove_mac(hw, &l_head); + + return ret; } +/* This is mapping table entry that maps every word within a given protocol + * structure to the real byte offset as per the specification of that + * protocol header. + * for example dst address is 3 words in ethertype header and corresponding + * bytes are 0, 2, 3 in the actual packet header and src address is at 4, 6, 8 + * IMPORTANT: Every structure part of "ice_prot_hdr" union should have a + * matching entry describing its field. This needs to be updated if new + * structure is added to that union. + */ +static const struct ice_prot_ext_tbl_entry ice_prot_ext[ICE_PROTOCOL_LAST] = { + { ICE_MAC_OFOS, { 0, 2, 4, 6, 8, 10, 12 } }, + { ICE_MAC_IL, { 0, 2, 4, 6, 8, 10, 12 } }, + { ICE_ETYPE_OL, { 0 } }, + { ICE_VLAN_OFOS, { 0, 2 } }, + { ICE_IPV4_OFOS, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18 } }, + { ICE_IPV4_IL, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18 } }, + { ICE_IPV6_OFOS, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, + 26, 28, 30, 32, 34, 36, 38 } }, + { ICE_IPV6_IL, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, + 26, 28, 30, 32, 34, 36, 38 } }, + { ICE_TCP_IL, { 0, 2 } }, + { ICE_UDP_OF, { 0, 2 } }, + { ICE_UDP_ILOS, { 0, 2 } }, + { ICE_SCTP_IL, { 0, 2 } }, + { ICE_VXLAN, { 8, 10, 12, 14 } }, + { ICE_GENEVE, { 8, 10, 12, 14 } }, + { ICE_VXLAN_GPE, { 8, 10, 12, 14 } }, + { ICE_NVGRE, { 0, 2, 4, 6 } }, + { ICE_GTP, { 8, 10, 12, 14, 16, 18, 20, 22 } }, +}; + +/* The following table describes preferred grouping of recipes. + * If a recipe that needs to be programmed is a superset or matches one of the + * following combinations, then the recipe needs to be chained as per the + * following policy. + */ + +static struct ice_protocol_entry ice_prot_id_tbl[ICE_PROTOCOL_LAST] = { + { ICE_MAC_OFOS, ICE_MAC_OFOS_HW }, + { ICE_MAC_IL, ICE_MAC_IL_HW }, + { ICE_ETYPE_OL, ICE_ETYPE_OL_HW }, + { ICE_VLAN_OFOS, ICE_VLAN_OL_HW }, + { ICE_IPV4_OFOS, ICE_IPV4_OFOS_HW }, + { ICE_IPV4_IL, ICE_IPV4_IL_HW }, + { ICE_IPV6_OFOS, ICE_IPV6_OFOS_HW }, + { ICE_IPV6_IL, ICE_IPV6_IL_HW }, + { ICE_TCP_IL, ICE_TCP_IL_HW }, + { ICE_UDP_OF, ICE_UDP_OF_HW }, + { ICE_UDP_ILOS, ICE_UDP_ILOS_HW }, + { ICE_SCTP_IL, ICE_SCTP_IL_HW }, + { ICE_VXLAN, ICE_UDP_OF_HW }, + { ICE_GENEVE, ICE_UDP_OF_HW }, + { ICE_VXLAN_GPE, ICE_UDP_OF_HW }, + { ICE_NVGRE, ICE_GRE_OF_HW }, + { ICE_GTP, ICE_UDP_OF_HW }, +}; + /** - * ice_add_mac - Add a MAC address based filter rule + * ice_find_recp - find a recipe * @hw: pointer to the hardware structure - * @m_list: list of MAC addresses and forwarding information + * @lkup_exts: extension sequence to match * - * IMPORTANT: When the ucast_shared flag is set to false and m_list has - * multiple unicast addresses, the function assumes that all the - * addresses are unique in a given add_mac call. 
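/*
 * A small, self-contained illustration of how a word-to-byte-offset table
 * such as ice_prot_ext above is read: for each protocol, entry N gives the
 * byte offset of the Nth 16-bit match word inside that header.  The table
 * below is made-up demo data, not the driver's table.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_WORDS 8

struct demo_prot_ext {
        const char *name;
        uint16_t offs[DEMO_MAX_WORDS];  /* byte offset of each 16-bit word */
        uint8_t n_words;
};

static const struct demo_prot_ext demo_ext[] = {
        { "mac",  { 0, 2, 4, 6, 8, 10, 12 }, 7 },
        { "vlan", { 0, 2 },                  2 },
};

int main(void)
{
        /* word 1 of the "mac" entry covers bytes 2..3 of that header */
        printf("mac word 1 starts at byte %u\n", (unsigned)demo_ext[0].offs[1]);
        return 0;
}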
It doesn't - * check for duplicates in this case, removing duplicates from a given - * list should be taken care of in the caller of this function. + * Returns index of matching recipe, or ICE_MAX_NUM_RECIPES if not found. */ -enum ice_status -ice_add_mac(struct ice_hw *hw, struct list_head *m_list) +static u16 ice_find_recp(struct ice_hw *hw, struct ice_prot_lkup_ext *lkup_exts) { - struct ice_aqc_sw_rules_elem *s_rule, *r_iter; - struct ice_fltr_list_entry *m_list_itr; - struct list_head *rule_head; - u16 elem_sent, total_elem_left; - struct ice_switch_info *sw; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; - u16 num_unicast = 0; - u16 s_rule_size; + bool refresh_required = true; + struct ice_sw_recipe *recp; + u8 i; - if (!m_list || !hw) - return ICE_ERR_PARAM; + /* Walk through existing recipes to find a match */ + recp = hw->switch_info->recp_list; + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { + /* If recipe was not created for this ID, in SW bookkeeping, + * check if FW has an entry for this recipe. If the FW has an + * entry update it in our SW bookkeeping and continue with the + * matching. + */ + if (!recp[i].recp_created) + if (ice_get_recp_frm_fw(hw, + hw->switch_info->recp_list, i, + &refresh_required)) + continue; - s_rule = NULL; - sw = hw->switch_info; - rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; - list_for_each_entry(m_list_itr, m_list, list_entry) { - u8 *add = &m_list_itr->fltr_info.l_data.mac.mac_addr[0]; - u16 vsi_handle; - u16 hw_vsi_id; + /* Skip inverse action recipes */ + if (recp[i].root_buf && recp[i].root_buf->content.act_ctrl & + ICE_AQ_RECIPE_ACT_INV_ACT) + continue; - m_list_itr->fltr_info.flag = ICE_FLTR_TX; - vsi_handle = m_list_itr->fltr_info.vsi_handle; - if (!ice_is_vsi_valid(hw, vsi_handle)) - return ICE_ERR_PARAM; - hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); - m_list_itr->fltr_info.fwd_id.hw_vsi_id = hw_vsi_id; - /* update the src in case it is VSI num */ - if (m_list_itr->fltr_info.src_id != ICE_SRC_ID_VSI) - return ICE_ERR_PARAM; - m_list_itr->fltr_info.src = hw_vsi_id; - if (m_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_MAC || - is_zero_ether_addr(add)) - return ICE_ERR_PARAM; - if (is_unicast_ether_addr(add) && !hw->ucast_shared) { - /* Don't overwrite the unicast address */ - mutex_lock(rule_lock); - if (ice_find_rule_entry(hw, ICE_SW_LKUP_MAC, - &m_list_itr->fltr_info)) { - mutex_unlock(rule_lock); - return ICE_ERR_ALREADY_EXISTS; + /* if number of words we are looking for match */ + if (lkup_exts->n_val_words == recp[i].lkup_exts.n_val_words) { + struct ice_fv_word *ar = recp[i].lkup_exts.fv_words; + struct ice_fv_word *be = lkup_exts->fv_words; + u16 *cr = recp[i].lkup_exts.field_mask; + u16 *de = lkup_exts->field_mask; + bool found = true; + u8 pe, qr; + + /* ar, cr, and qr are related to the recipe words, while + * be, de, and pe are related to the lookup words + */ + for (pe = 0; pe < lkup_exts->n_val_words; pe++) { + for (qr = 0; qr < recp[i].lkup_exts.n_val_words; + qr++) { + if (ar[qr].off == be[pe].off && + ar[qr].prot_id == be[pe].prot_id && + cr[qr] == de[pe]) + /* Found the "pe"th word in the + * given recipe + */ + break; + } + /* After walking through all the words in the + * "i"th recipe if "p"th word was not found then + * this recipe is not what we are looking for. 
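/*
 * A self-contained sketch of the comparison ice_find_recp() performs above:
 * a candidate recipe matches only when every lookup word (protocol ID,
 * offset and mask) can be found among the recipe's words.  struct
 * match_word is a simplified stand-in for struct ice_fv_word plus its mask,
 * not the driver's layout.
 */
#include <stdbool.h>
#include <stdint.h>

struct match_word {
        uint8_t prot_id;
        uint16_t off;
        uint16_t mask;
};

static bool words_match(const struct match_word *recipe, int n_recipe,
                        const struct match_word *lookup, int n_lookup)
{
        int i, j;

        if (n_recipe != n_lookup)
                return false;

        for (i = 0; i < n_lookup; i++) {
                for (j = 0; j < n_recipe; j++)
                        if (recipe[j].prot_id == lookup[i].prot_id &&
                            recipe[j].off == lookup[i].off &&
                            recipe[j].mask == lookup[i].mask)
                                break;
                if (j == n_recipe)
                        return false;   /* lookup word i is not in this recipe */
        }
        return true;
}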
+ * So break out from this loop and try the next + * recipe + */ + if (qr >= recp[i].lkup_exts.n_val_words) { + found = false; + break; + } } - mutex_unlock(rule_lock); - num_unicast++; - } else if (is_multicast_ether_addr(add) || - (is_unicast_ether_addr(add) && hw->ucast_shared)) { - m_list_itr->status = - ice_add_rule_internal(hw, ICE_SW_LKUP_MAC, - m_list_itr); - if (m_list_itr->status) - return m_list_itr->status; + /* If for "i"th recipe the found was never set to false + * then it means we found our match + */ + if (found) + return i; /* Return the recipe ID */ } } + return ICE_MAX_NUM_RECIPES; +} - mutex_lock(rule_lock); - /* Exit if no suitable entries were found for adding bulk switch rule */ - if (!num_unicast) { - status = 0; - goto ice_add_mac_exit; - } +/** + * ice_change_proto_id_to_dvm - change proto id in prot_id_tbl + * + * As protocol id for outer vlan is different in dvm and svm, if dvm is + * supported protocol array record for outer vlan has to be modified to + * reflect the value proper for DVM. + */ +void ice_change_proto_id_to_dvm(void) +{ + u8 i; - rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules; + for (i = 0; i < ARRAY_SIZE(ice_prot_id_tbl); i++) + if (ice_prot_id_tbl[i].type == ICE_VLAN_OFOS && + ice_prot_id_tbl[i].protocol_id != ICE_VLAN_OF_HW) + ice_prot_id_tbl[i].protocol_id = ICE_VLAN_OF_HW; +} - /* Allocate switch rule buffer for the bulk update for unicast */ - s_rule_size = ICE_SW_RULE_RX_TX_ETH_HDR_SIZE; - s_rule = devm_kcalloc(ice_hw_to_dev(hw), num_unicast, s_rule_size, - GFP_KERNEL); - if (!s_rule) { - status = ICE_ERR_NO_MEMORY; - goto ice_add_mac_exit; - } +/** + * ice_prot_type_to_id - get protocol ID from protocol type + * @type: protocol type + * @id: pointer to variable that will receive the ID + * + * Returns true if found, false otherwise + */ +static bool ice_prot_type_to_id(enum ice_protocol_type type, u8 *id) +{ + u8 i; - r_iter = s_rule; - list_for_each_entry(m_list_itr, m_list, list_entry) { - struct ice_fltr_info *f_info = &m_list_itr->fltr_info; - u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; + for (i = 0; i < ARRAY_SIZE(ice_prot_id_tbl); i++) + if (ice_prot_id_tbl[i].type == type) { + *id = ice_prot_id_tbl[i].protocol_id; + return true; + } + return false; +} - if (is_unicast_ether_addr(mac_addr)) { - ice_fill_sw_rule(hw, &m_list_itr->fltr_info, r_iter, - ice_aqc_opc_add_sw_rules); - r_iter = (struct ice_aqc_sw_rules_elem *) - ((u8 *)r_iter + s_rule_size); +/** + * ice_fill_valid_words - count valid words + * @rule: advanced rule with lookup information + * @lkup_exts: byte offset extractions of the words that are valid + * + * calculate valid words in a lookup rule using mask value + */ +static u8 +ice_fill_valid_words(struct ice_adv_lkup_elem *rule, + struct ice_prot_lkup_ext *lkup_exts) +{ + u8 j, word, prot_id, ret_val; + + if (!ice_prot_type_to_id(rule->type, &prot_id)) + return 0; + + word = lkup_exts->n_val_words; + + for (j = 0; j < sizeof(rule->m_u) / sizeof(u16); j++) + if (((u16 *)&rule->m_u)[j] && + rule->type < ARRAY_SIZE(ice_prot_ext)) { + /* No more space to accommodate */ + if (word >= ICE_MAX_CHAIN_WORDS) + return 0; + lkup_exts->fv_words[word].off = + ice_prot_ext[rule->type].offs[j]; + lkup_exts->fv_words[word].prot_id = + ice_prot_id_tbl[rule->type].protocol_id; + lkup_exts->field_mask[word] = + be16_to_cpu(((__force __be16 *)&rule->m_u)[j]); + word++; } - } - /* Call AQ bulk switch rule update for all unicast addresses */ - r_iter = s_rule; - /* Call AQ switch rule in AQ_MAX chunk */ - for (total_elem_left 
= num_unicast; total_elem_left > 0; - total_elem_left -= elem_sent) { - struct ice_aqc_sw_rules_elem *entry = r_iter; + ret_val = word - lkup_exts->n_val_words; + lkup_exts->n_val_words = word; - elem_sent = min(total_elem_left, - (u16)(ICE_AQ_MAX_BUF_LEN / s_rule_size)); - status = ice_aq_sw_rules(hw, entry, elem_sent * s_rule_size, - elem_sent, ice_aqc_opc_add_sw_rules, - NULL); - if (status) - goto ice_add_mac_exit; - r_iter = (struct ice_aqc_sw_rules_elem *) - ((u8 *)r_iter + (elem_sent * s_rule_size)); - } + return ret_val; +} - /* Fill up rule ID based on the value returned from FW */ - r_iter = s_rule; - list_for_each_entry(m_list_itr, m_list, list_entry) { - struct ice_fltr_info *f_info = &m_list_itr->fltr_info; - u8 *mac_addr = &f_info->l_data.mac.mac_addr[0]; - struct ice_fltr_mgmt_list_entry *fm_entry; - if (is_unicast_ether_addr(mac_addr)) { - f_info->fltr_rule_id = - le16_to_cpu(r_iter->pdata.lkup_tx_rx.index); - f_info->fltr_act = ICE_FWD_TO_VSI; - /* Create an entry to track this MAC address */ - fm_entry = devm_kzalloc(ice_hw_to_dev(hw), - sizeof(*fm_entry), GFP_KERNEL); - if (!fm_entry) { - status = ICE_ERR_NO_MEMORY; - goto ice_add_mac_exit; + +/** + * ice_create_first_fit_recp_def - Create a recipe grouping + * @hw: pointer to the hardware structure + * @lkup_exts: an array of protocol header extractions + * @rg_list: pointer to a list that stores new recipe groups + * @recp_cnt: pointer to a variable that stores returned number of recipe groups + * + * Using first fit algorithm, take all the words that are still not done + * and start grouping them in 4-word groups. Each group makes up one + * recipe. + */ +static enum ice_status +ice_create_first_fit_recp_def(struct ice_hw *hw, + struct ice_prot_lkup_ext *lkup_exts, + struct list_head *rg_list, + u8 *recp_cnt) +{ + struct ice_pref_recipe_group *grp = NULL; + u8 j; + + + *recp_cnt = 0; + + /* Walk through every word in the rule to check if it is not done. If so + * then this word needs to be part of a new recipe. + */ + for (j = 0; j < lkup_exts->n_val_words; j++) + if (!test_bit(j, lkup_exts->done)) { + if (!grp || + grp->n_val_pairs == ICE_NUM_WORDS_RECIPE) { + struct ice_recp_grp_entry *entry; + + entry = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*entry), + GFP_KERNEL); + if (!entry) + return ICE_ERR_NO_MEMORY; + list_add(&entry->l_entry, rg_list); + grp = &entry->r_group; + (*recp_cnt)++; } - fm_entry->fltr_info = *f_info; - fm_entry->vsi_count = 1; - /* The book keeping entries will get removed when - * base driver calls remove filter AQ command - */ - list_add(&fm_entry->list_entry, rule_head); - r_iter = (struct ice_aqc_sw_rules_elem *) - ((u8 *)r_iter + s_rule_size); + grp->pairs[grp->n_val_pairs].prot_id = + lkup_exts->fv_words[j].prot_id; + grp->pairs[grp->n_val_pairs].off = + lkup_exts->fv_words[j].off; + grp->mask[grp->n_val_pairs] = lkup_exts->field_mask[j]; + grp->n_val_pairs++; } - } -ice_add_mac_exit: - mutex_unlock(rule_lock); - if (s_rule) - devm_kfree(ice_hw_to_dev(hw), s_rule); - return status; + return 0; } /** - * ice_add_vlan_internal - Add one VLAN based filter rule + * ice_fill_fv_word_index - fill in the field vector indices for a recipe group * @hw: pointer to the hardware structure - * @f_entry: filter entry containing one VLAN information + * @fv_list: field vector with the extraction sequence information + * @rg_list: recipe groupings with protocol-offset pairs + * + * Helper function to fill in the field vector indices for protocol-offset + * pairs. 
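/*
 * A self-contained sketch of the first-fit grouping performed by
 * ice_create_first_fit_recp_def() above: pending words are taken in order
 * and a new group is opened whenever the current one already holds four
 * words, one group per hardware recipe.  WORDS_PER_GROUP mirrors
 * ICE_NUM_WORDS_RECIPE; the rest is simplified.
 */
#include <stdint.h>

#define WORDS_PER_GROUP 4

/* Returns the number of groups; group_of[i] receives the group of word i. */
static int first_fit_group(int n_words, uint8_t *group_of)
{
        int groups = 0, filled = WORDS_PER_GROUP, i;

        for (i = 0; i < n_words; i++) {
                if (filled == WORDS_PER_GROUP) {
                        groups++;               /* open a new group */
                        filled = 0;
                }
                group_of[i] = (uint8_t)(groups - 1);
                filled++;
        }
        return groups;
}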
These indexes are then ultimately programmed into a recipe. */ static enum ice_status -ice_add_vlan_internal(struct ice_hw *hw, struct ice_fltr_list_entry *f_entry) +ice_fill_fv_word_index(struct ice_hw *hw, struct list_head *fv_list, + struct list_head *rg_list) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *v_list_itr; - struct ice_fltr_info *new_fltr, *cur_fltr; - enum ice_sw_lkup_type lkup_type; - u16 vsi_list_id = 0, vsi_handle; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; + struct ice_sw_fv_list_entry *fv; + struct ice_recp_grp_entry *rg; + struct ice_fv_word *fv_ext; + + if (list_empty(fv_list)) + return 0; + + fv = list_first_entry(fv_list, struct ice_sw_fv_list_entry, + list_entry); + fv_ext = fv->fv_ptr->ew; + + list_for_each_entry(rg, rg_list, l_entry) { + u8 i; + + for (i = 0; i < rg->r_group.n_val_pairs; i++) { + struct ice_fv_word *pr; + bool found = false; + u16 mask; + u8 j; + + pr = &rg->r_group.pairs[i]; + mask = rg->r_group.mask[i]; + + for (j = 0; j < hw->blk[ICE_BLK_SW].es.fvw; j++) + if (fv_ext[j].prot_id == pr->prot_id && + fv_ext[j].off == pr->off) { + found = true; + + /* Store index of field vector */ + rg->fv_idx[i] = j; + rg->fv_mask[i] = mask; + break; + } + + /* Protocol/offset could not be found, caller gave an + * invalid pair + */ + if (!found) + return ICE_ERR_PARAM; + } + } - if (!ice_is_vsi_valid(hw, f_entry->fltr_info.vsi_handle)) - return ICE_ERR_PARAM; + return 0; +} - f_entry->fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, f_entry->fltr_info.vsi_handle); - new_fltr = &f_entry->fltr_info; +/** + * ice_find_free_recp_res_idx - find free result indexes for recipe + * @hw: pointer to hardware structure + * @profiles: bitmap of profiles that will be associated with the new recipe + * @free_idx: pointer to variable to receive the free index bitmap + * + * The algorithm used here is: + * 1. When creating a new recipe, create a set P which contains all + * Profiles that will be associated with our new recipe + * + * 2. For each Profile p in set P: + * a. Add all recipes associated with Profile p into set R + * b. Optional : PossibleIndexes &= profile[p].possibleIndexes + * [initially PossibleIndexes should be 0xFFFFFFFFFFFFFFFF] + * i. Or just assume they all have the same possible indexes: + * 44, 45, 46, 47 + * i.e., PossibleIndexes = 0x0000F00000000000 + * + * 3. For each Recipe r in set R: + * a. UsedIndexes |= (bitwise or ) recipe[r].res_indexes + * b. FreeIndexes = UsedIndexes ^ PossibleIndexes + * + * FreeIndexes will contain the bits indicating the indexes free for use, + * then the code needs to update the recipe[r].used_result_idx_bits to + * indicate which indexes were selected for use by this recipe. + */ +static u16 +ice_find_free_recp_res_idx(struct ice_hw *hw, const unsigned long *profiles, + unsigned long *free_idx) +{ + DECLARE_BITMAP(possible_idx, ICE_MAX_FV_WORDS); + DECLARE_BITMAP(recipes, ICE_MAX_NUM_RECIPES); + DECLARE_BITMAP(used_idx, ICE_MAX_FV_WORDS); + u16 bit; + + bitmap_zero(possible_idx, ICE_MAX_FV_WORDS); + bitmap_zero(recipes, ICE_MAX_NUM_RECIPES); + bitmap_zero(used_idx, ICE_MAX_FV_WORDS); + bitmap_zero(free_idx, ICE_MAX_FV_WORDS); + + bitmap_set(possible_idx, 0, ICE_MAX_FV_WORDS); + + /* For each profile we are going to associate the recipe with, add the + * recipes that are associated with that profile. This will give us + * the set of recipes that our recipe may collide with. 
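/*
 * A self-contained sketch of the bitmap arithmetic described for
 * ice_find_free_recp_res_idx() above: intersect the "possible" result slots
 * of every profile the new recipe will be tied to, union the slots already
 * used by the recipes those profiles reference, and the remainder is free.
 * 64-bit masks stand in for the kernel bitmaps; the description above uses
 * XOR, which is equivalent to AND-NOT when every used slot is also possible.
 */
#include <stdint.h>

static uint64_t free_result_slots(const uint64_t *possible_per_profile,
                                  int n_profiles,
                                  const uint64_t *used_per_recipe,
                                  int n_recipes)
{
        uint64_t possible = ~0ULL, used = 0;
        int i;

        for (i = 0; i < n_profiles; i++)
                possible &= possible_per_profile[i];
        for (i = 0; i < n_recipes; i++)
                used |= used_per_recipe[i];

        return possible & ~used;        /* slots nobody has claimed yet */
}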
Also, determine + * what possible result indexes are usable given this set of profiles. + */ + for_each_set_bit(bit, profiles, ICE_MAX_NUM_PROFILES) { + bitmap_or(recipes, recipes, profile_to_recipe[bit], + ICE_MAX_NUM_RECIPES); + bitmap_and(possible_idx, possible_idx, + hw->switch_info->prof_res_bm[bit], + ICE_MAX_FV_WORDS); + } - /* VLAN ID should only be 12 bits */ - if (new_fltr->l_data.vlan.vlan_id > ICE_MAX_VLAN_ID) - return ICE_ERR_PARAM; + /* For each recipe that our new recipe may collide with, determine + * which indexes have been used. + */ + for_each_set_bit(bit, recipes, ICE_MAX_NUM_RECIPES) + bitmap_or(used_idx, used_idx, + hw->switch_info->recp_list[bit].res_idxs, + ICE_MAX_FV_WORDS); - if (new_fltr->src_id != ICE_SRC_ID_VSI) - return ICE_ERR_PARAM; + bitmap_xor(free_idx, used_idx, possible_idx, ICE_MAX_FV_WORDS); - new_fltr->src = new_fltr->fwd_id.hw_vsi_id; - lkup_type = new_fltr->lkup_type; - vsi_handle = new_fltr->vsi_handle; - rule_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock; - mutex_lock(rule_lock); - v_list_itr = ice_find_rule_entry(hw, ICE_SW_LKUP_VLAN, new_fltr); - if (!v_list_itr) { - struct ice_vsi_list_map_info *map_info = NULL; + /* return number of free indexes */ + return (u16) bitmap_weight(free_idx, ICE_MAX_FV_WORDS); +} - if (new_fltr->fltr_act == ICE_FWD_TO_VSI) { - /* All VLAN pruning rules use a VSI list. Check if - * there is already a VSI list containing VSI that we - * want to add. If found, use the same vsi_list_id for - * this new VLAN rule or else create a new list. +/** + * ice_add_sw_recipe - function to call AQ calls to create switch recipe + * @hw: pointer to hardware structure + * @rm: recipe management list entry + * @match_tun_mask: tunnel mask that needs to be programmed + * @profiles: bitmap of profiles that will be associated. + */ +static enum ice_status +ice_add_sw_recipe(struct ice_hw *hw, struct ice_sw_recipe *rm, + u16 match_tun_mask, unsigned long *profiles) +{ + DECLARE_BITMAP(result_idx_bm, ICE_MAX_FV_WORDS); + struct ice_aqc_recipe_data_elem *tmp; + struct ice_aqc_recipe_data_elem *buf; + struct ice_recp_grp_entry *entry; + enum ice_status status; + u16 free_res_idx; + u16 recipe_count; + u8 chain_idx; + u8 recps = 0; + + /* When more than one recipe are required, another recipe is needed to + * chain them together. Matching a tunnel metadata ID takes up one of + * the match fields in the chaining recipe reducing the number of + * chained recipes by one. 
+ */ + /* check number of free result indices */ + bitmap_zero(result_idx_bm, ICE_MAX_FV_WORDS); + free_res_idx = ice_find_free_recp_res_idx(hw, profiles, result_idx_bm); + + ice_debug(hw, ICE_DBG_SW, "Result idx slots: %d, need %d\n", + free_res_idx, rm->n_grp_count); + + if (rm->n_grp_count > 1) { + if (rm->n_grp_count > free_res_idx) + return ICE_ERR_MAX_LIMIT; + + rm->n_grp_count++; + } + + if (rm->n_grp_count > ICE_MAX_CHAIN_RECIPE) + return ICE_ERR_MAX_LIMIT; + + tmp = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_NUM_RECIPES, + sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return ICE_ERR_NO_MEMORY; + + buf = devm_kcalloc(ice_hw_to_dev(hw), rm->n_grp_count, sizeof(*buf), + GFP_KERNEL); + if (!buf) { + status = ICE_ERR_NO_MEMORY; + goto err_mem; + } + + bitmap_zero(rm->r_bitmap, ICE_MAX_NUM_RECIPES); + recipe_count = ICE_MAX_NUM_RECIPES; + status = ice_aq_get_recipe(hw, tmp, &recipe_count, ICE_SW_LKUP_MAC, + NULL); + if (status || recipe_count == 0) + goto err_unroll; + + /* Allocate the recipe resources, and configure them according to the + * match fields from protocol headers and extracted field vectors. + */ + chain_idx = find_first_bit(result_idx_bm, ICE_MAX_FV_WORDS); + list_for_each_entry(entry, &rm->rg_list, l_entry) { + u8 i; + + status = ice_alloc_recipe(hw, &entry->rid); + if (status) + goto err_unroll; + + /* Clear the result index of the located recipe, as this will be + * updated, if needed, later in the recipe creation process. + */ + tmp[0].content.result_indx = 0; + + buf[recps] = tmp[0]; + buf[recps].recipe_indx = (u8)entry->rid; + /* if the recipe is a non-root recipe RID should be programmed + * as 0 for the rules to be applied correctly. + */ + buf[recps].content.rid = 0; + memset(&buf[recps].content.lkup_indx, 0, + sizeof(buf[recps].content.lkup_indx)); + + /* All recipes use look-up index 0 to match switch ID. */ + buf[recps].content.lkup_indx[0] = ICE_AQ_SW_ID_LKUP_IDX; + buf[recps].content.mask[0] = + cpu_to_le16(ICE_AQ_SW_ID_LKUP_MASK); + /* Setup lkup_indx 1..4 to INVALID/ignore and set the mask + * to be 0 + */ + for (i = 1; i <= ICE_NUM_WORDS_RECIPE; i++) { + buf[recps].content.lkup_indx[i] = 0x80; + buf[recps].content.mask[i] = 0; + } + + for (i = 0; i < entry->r_group.n_val_pairs; i++) { + buf[recps].content.lkup_indx[i + 1] = entry->fv_idx[i]; + buf[recps].content.mask[i + 1] = + cpu_to_le16(entry->fv_mask[i]); + } + + if (rm->n_grp_count > 1) { + /* Checks to see if there really is a valid result index + * that can be used. */ - map_info = ice_find_vsi_list_entry(hw, ICE_SW_LKUP_VLAN, - vsi_handle, - &vsi_list_id); - if (!map_info) { - status = ice_create_vsi_list_rule(hw, - &vsi_handle, - 1, - &vsi_list_id, - lkup_type); - if (status) - goto exit; + if (chain_idx >= ICE_MAX_FV_WORDS) { + ice_debug(hw, ICE_DBG_SW, "No chain index available\n"); + status = ICE_ERR_MAX_LIMIT; + goto err_unroll; } - /* Convert the action to forwarding to a VSI list. 
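/*
 * A minimal sketch of the per-recipe lookup layout being programmed above:
 * slot 0 always matches the switch ID, the remaining slots start out as
 * "ignore", and the group's extracted field-vector indices and masks then
 * fill slots 1..n.  The structure and DEMO_IGNORE are simplified stand-ins
 * for the admin-queue layout, not the firmware definition.
 */
#include <stdint.h>
#include <string.h>

#define DEMO_SLOTS  5                   /* switch-ID slot + four match words */
#define DEMO_IGNORE 0x80

struct demo_recipe_content {
        uint8_t lkup_indx[DEMO_SLOTS];
        uint16_t mask[DEMO_SLOTS];
};

static void demo_fill_content(struct demo_recipe_content *c,
                              const uint8_t *fv_idx, const uint16_t *fv_mask,
                              int n_words, uint8_t sw_id_idx, uint16_t sw_id_mask)
{
        int i;

        memset(c, 0, sizeof(*c));
        c->lkup_indx[0] = sw_id_idx;    /* every recipe matches the switch ID */
        c->mask[0] = sw_id_mask;

        for (i = 1; i < DEMO_SLOTS; i++)
                c->lkup_indx[i] = DEMO_IGNORE;  /* unused slots are ignored */

        for (i = 0; i < n_words && i < DEMO_SLOTS - 1; i++) {
                c->lkup_indx[i + 1] = fv_idx[i];
                c->mask[i + 1] = fv_mask[i];
        }
}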
*/ - new_fltr->fltr_act = ICE_FWD_TO_VSI_LIST; - new_fltr->fwd_id.vsi_list_id = vsi_list_id; + + entry->chain_idx = chain_idx; + buf[recps].content.result_indx = + ICE_AQ_RECIPE_RESULT_EN | + ((chain_idx << ICE_AQ_RECIPE_RESULT_DATA_S) & + ICE_AQ_RECIPE_RESULT_DATA_M); + clear_bit(chain_idx, result_idx_bm); + chain_idx = find_first_bit(result_idx_bm, + ICE_MAX_FV_WORDS); } - status = ice_create_pkt_fwd_rule(hw, f_entry); - if (!status) { - v_list_itr = ice_find_rule_entry(hw, ICE_SW_LKUP_VLAN, - new_fltr); - if (!v_list_itr) { - status = ICE_ERR_DOES_NOT_EXIST; - goto exit; - } - /* reuse VSI list for new rule and increment ref_cnt */ - if (map_info) { - v_list_itr->vsi_list_info = map_info; - map_info->ref_cnt++; - } else { - v_list_itr->vsi_list_info = - ice_create_vsi_list_map(hw, &vsi_handle, - 1, vsi_list_id); - } + /* fill recipe dependencies */ + bitmap_zero((unsigned long *)buf[recps].recipe_bitmap, + ICE_MAX_NUM_RECIPES); + set_bit(buf[recps].recipe_indx, + (unsigned long *)buf[recps].recipe_bitmap); + buf[recps].content.act_ctrl_fwd_priority = rm->priority; + recps++; + } + + if (rm->n_grp_count == 1) { + rm->root_rid = buf[0].recipe_indx; + set_bit(buf[0].recipe_indx, rm->r_bitmap); + buf[0].content.rid = rm->root_rid | ICE_AQ_RECIPE_ID_IS_ROOT; + if (sizeof(buf[0].recipe_bitmap) >= sizeof(rm->r_bitmap)) { + memcpy(buf[0].recipe_bitmap, rm->r_bitmap, + sizeof(buf[0].recipe_bitmap)); + } else { + status = ICE_ERR_BAD_PTR; + goto err_unroll; } - } else if (v_list_itr->vsi_list_info->ref_cnt == 1) { - /* Update existing VSI list to add new VSI ID only if it used - * by one VLAN rule. + /* Applicable only for ROOT_RECIPE, set the fwd_priority for + * the recipe which is getting created if specified + * by user. Usually any advanced switch filter, which results + * into new extraction sequence, ended up creating a new recipe + * of type ROOT and usually recipes are associated with profiles + * Switch rule referreing newly created recipe, needs to have + * either/or 'fwd' or 'join' priority, otherwise switch rule + * evaluation will not happen correctly. In other words, if + * switch rule to be evaluated on priority basis, then recipe + * needs to have priority, otherwise it will be evaluated last. */ - cur_fltr = &v_list_itr->fltr_info; - status = ice_add_update_vsi_list(hw, v_list_itr, cur_fltr, - new_fltr); + buf[0].content.act_ctrl_fwd_priority = rm->priority; } else { - /* If VLAN rule exists and VSI list being used by this rule is - * referenced by more than 1 VLAN rule. Then create a new VSI - * list appending previous VSI with new VSI and update existing - * VLAN rule to point to new VSI list ID + struct ice_recp_grp_entry *last_chain_entry; + u16 rid, i; + + /* Allocate the last recipe that will chain the outcomes of the + * other recipes together */ - struct ice_fltr_info tmp_fltr; - u16 vsi_handle_arr[2]; - u16 cur_handle; + status = ice_alloc_recipe(hw, &rid); + if (status) + goto err_unroll; - /* Current implementation only supports reusing VSI list with - * one VSI count. 
We should never hit below condition + buf[recps].recipe_indx = (u8)rid; + buf[recps].content.rid = (u8)rid; + buf[recps].content.rid |= ICE_AQ_RECIPE_ID_IS_ROOT; + /* the new entry created should also be part of rg_list to + * make sure we have complete recipe */ - if (v_list_itr->vsi_count > 1 && - v_list_itr->vsi_list_info->ref_cnt > 1) { - ice_debug(hw, ICE_DBG_SW, - "Invalid configuration: Optimization to reuse VSI list with more than one VSI is not being done yet\n"); - status = ICE_ERR_CFG; - goto exit; + last_chain_entry = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(*last_chain_entry), + GFP_KERNEL); + if (!last_chain_entry) { + status = ICE_ERR_NO_MEMORY; + goto err_unroll; + } + last_chain_entry->rid = rid; + memset(&buf[recps].content.lkup_indx, 0, + sizeof(buf[recps].content.lkup_indx)); + /* All recipes use look-up index 0 to match switch ID. */ + buf[recps].content.lkup_indx[0] = ICE_AQ_SW_ID_LKUP_IDX; + buf[recps].content.mask[0] = + cpu_to_le16(ICE_AQ_SW_ID_LKUP_MASK); + for (i = 1; i <= ICE_NUM_WORDS_RECIPE; i++) { + buf[recps].content.lkup_indx[i] = + ICE_AQ_RECIPE_LKUP_IGNORE; + buf[recps].content.mask[i] = 0; + } + + i = 1; + /* update r_bitmap with the recp that is used for chaining */ + set_bit(rid, rm->r_bitmap); + /* this is the recipe that chains all the other recipes so it + * should not have a chaining ID to indicate the same + */ + last_chain_entry->chain_idx = ICE_INVAL_CHAIN_IND; + list_for_each_entry(entry, &rm->rg_list, l_entry) { + last_chain_entry->fv_idx[i] = entry->chain_idx; + buf[recps].content.lkup_indx[i] = entry->chain_idx; + buf[recps].content.mask[i++] = cpu_to_le16(0xFFFF); + set_bit(entry->rid, rm->r_bitmap); + } + list_add(&last_chain_entry->l_entry, &rm->rg_list); + if (sizeof(buf[recps].recipe_bitmap) >= + sizeof(rm->r_bitmap)) { + memcpy(buf[recps].recipe_bitmap, rm->r_bitmap, + sizeof(buf[recps].recipe_bitmap)); + } else { + status = ICE_ERR_BAD_PTR; + goto err_unroll; } + buf[recps].content.act_ctrl_fwd_priority = rm->priority; + + /* To differentiate among different UDP tunnels, a meta data ID + * flag is used. 
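/*
 * A self-contained sketch of how the chaining (root) recipe above is
 * assembled: each sub-recipe reports its outcome in a result slot
 * (chain_idx), and the root recipe matches all of those result slots with a
 * full 16-bit mask.  Types and sizes are simplified and do not reflect the
 * admin-queue buffer layout.
 */
#include <stdint.h>
#include <string.h>

#define DEMO_MAX_CHAIN 4

struct demo_chain_recipe {
        uint8_t lkup_indx[DEMO_MAX_CHAIN + 1];
        uint16_t mask[DEMO_MAX_CHAIN + 1];
};

static void demo_build_chain(struct demo_chain_recipe *root,
                             const uint8_t *chain_idx, int n_sub)
{
        int slot = 1;   /* slot 0 stays reserved for the switch-ID match */
        int i;

        memset(root, 0, sizeof(*root));
        for (i = 0; i < n_sub && slot <= DEMO_MAX_CHAIN; i++, slot++) {
                root->lkup_indx[slot] = chain_idx[i];   /* sub-recipe outcome */
                root->mask[slot] = 0xFFFF;              /* match it exactly */
        }
}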
+ */ + if (match_tun_mask) { + buf[recps].content.lkup_indx[i] = ICE_TUN_FLAG_FV_IND; + buf[recps].content.mask[i] = + cpu_to_le16(match_tun_mask); + } + + recps++; + rm->root_rid = (u8)rid; + } + status = ice_acquire_change_lock(hw, ICE_RES_WRITE); + if (status) + goto err_unroll; + + status = ice_aq_add_recipe(hw, buf, rm->n_grp_count, NULL); + ice_release_change_lock(hw); + if (status) + goto err_unroll; - cur_handle = - find_first_bit(v_list_itr->vsi_list_info->vsi_map, - ICE_MAX_VSI); + /* Every recipe that just got created add it to the recipe + * book keeping list + */ + list_for_each_entry(entry, &rm->rg_list, l_entry) { + struct ice_switch_info *sw = hw->switch_info; + bool is_root, idx_found = false; + struct ice_sw_recipe *recp; + u16 idx, buf_idx = 0; + + /* find buffer index for copying some data */ + for (idx = 0; idx < rm->n_grp_count; idx++) + if (buf[idx].recipe_indx == entry->rid) { + buf_idx = idx; + idx_found = true; + } - /* A rule already exists with the new VSI being added */ - if (cur_handle == vsi_handle) { - status = ICE_ERR_ALREADY_EXISTS; - goto exit; + if (!idx_found) { + status = ICE_ERR_OUT_OF_RANGE; + goto err_unroll; } - vsi_handle_arr[0] = cur_handle; - vsi_handle_arr[1] = vsi_handle; - status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, - &vsi_list_id, lkup_type); - if (status) - goto exit; + recp = &sw->recp_list[entry->rid]; + is_root = (rm->root_rid == entry->rid); + recp->is_root = is_root; - tmp_fltr = v_list_itr->fltr_info; - tmp_fltr.fltr_rule_id = v_list_itr->fltr_info.fltr_rule_id; - tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; - tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; - /* Update the previous switch rule to a new VSI list which - * includes current VSI that is requested - */ - status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); - if (status) - goto exit; + recp->root_rid = entry->rid; + recp->big_recp = (is_root && rm->n_grp_count > 1); - /* before overriding VSI list map info. decrement ref_cnt of - * previous VSI list + memcpy(&recp->ext_words, entry->r_group.pairs, + entry->r_group.n_val_pairs * sizeof(struct ice_fv_word)); + + memcpy(recp->r_bitmap, buf[buf_idx].recipe_bitmap, + sizeof(recp->r_bitmap)); + + /* Copy non-result fv index values and masks to recipe. This + * call will also update the result recipe bitmask. 
*/ - v_list_itr->vsi_list_info->ref_cnt--; + ice_collect_result_idx(&buf[buf_idx], recp); - /* now update to newly created list */ - v_list_itr->fltr_info.fwd_id.vsi_list_id = vsi_list_id; - v_list_itr->vsi_list_info = - ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, - vsi_list_id); - v_list_itr->vsi_count++; + /* for non-root recipes, also copy to the root, this allows + * easier matching of a complete chained recipe + */ + if (!is_root) + ice_collect_result_idx(&buf[buf_idx], + &sw->recp_list[rm->root_rid]); + + recp->n_ext_words = entry->r_group.n_val_pairs; + recp->chain_idx = entry->chain_idx; + recp->priority = buf[buf_idx].content.act_ctrl_fwd_priority; + recp->n_grp_count = rm->n_grp_count; + recp->tun_type = rm->tun_type; + recp->recp_created = true; } + rm->root_buf = buf; + devm_kfree(ice_hw_to_dev(hw), tmp); + return status; -exit: - mutex_unlock(rule_lock); +err_unroll: +err_mem: + devm_kfree(ice_hw_to_dev(hw), tmp); + devm_kfree(ice_hw_to_dev(hw), buf); return status; } /** - * ice_add_vlan - Add VLAN based filter rule - * @hw: pointer to the hardware structure - * @v_list: list of VLAN entries and forwarding information + * ice_create_recipe_group - creates recipe group + * @hw: pointer to hardware structure + * @rm: recipe management list entry + * @lkup_exts: lookup elements */ -enum ice_status -ice_add_vlan(struct ice_hw *hw, struct list_head *v_list) +static enum ice_status +ice_create_recipe_group(struct ice_hw *hw, struct ice_sw_recipe *rm, + struct ice_prot_lkup_ext *lkup_exts) { - struct ice_fltr_list_entry *v_list_itr; + enum ice_status status; + u8 recp_count = 0; - if (!v_list || !hw) - return ICE_ERR_PARAM; + rm->n_grp_count = 0; - list_for_each_entry(v_list_itr, v_list, list_entry) { - if (v_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_VLAN) - return ICE_ERR_PARAM; - v_list_itr->fltr_info.flag = ICE_FLTR_TX; - v_list_itr->status = ice_add_vlan_internal(hw, v_list_itr); - if (v_list_itr->status) - return v_list_itr->status; + /* Create recipes for words that are marked not done by packing them + * as best fit. + */ + status = ice_create_first_fit_recp_def(hw, lkup_exts, + &rm->rg_list, &recp_count); + if (!status) { + rm->n_grp_count += recp_count; + rm->n_ext_words = lkup_exts->n_val_words; + memcpy(&rm->ext_words, lkup_exts->fv_words, + sizeof(rm->ext_words)); + memcpy(rm->word_masks, lkup_exts->field_mask, + sizeof(rm->word_masks)); } - return 0; + + return status; } /** - * ice_add_eth_mac - Add ethertype and MAC based filter rule - * @hw: pointer to the hardware structure - * @em_list: list of ether type MAC filter, MAC is optional - * - * This function requires the caller to populate the entries in - * the filter list with the necessary fields (including flags to - * indicate Tx or Rx rules). + * ice_get_fv - get field vectors/extraction sequences for spec. 
lookup types + * @hw: pointer to hardware structure + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @bm: bitmap of field vectors to consider + * @fv_list: pointer to a list that holds the returned field vectors */ -enum ice_status -ice_add_eth_mac(struct ice_hw *hw, struct list_head *em_list) +static enum ice_status +ice_get_fv(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, u16 lkups_cnt, + unsigned long *bm, struct list_head *fv_list) { - struct ice_fltr_list_entry *em_list_itr; + enum ice_status status; + u8 *prot_ids; + u16 i; - if (!em_list || !hw) - return ICE_ERR_PARAM; + prot_ids = devm_kcalloc(ice_hw_to_dev(hw), lkups_cnt, + sizeof(*prot_ids), GFP_KERNEL); + if (!prot_ids) + return ICE_ERR_NO_MEMORY; - list_for_each_entry(em_list_itr, em_list, list_entry) { - enum ice_sw_lkup_type l_type = - em_list_itr->fltr_info.lkup_type; + for (i = 0; i < lkups_cnt; i++) + if (!ice_prot_type_to_id(lkups[i].type, &prot_ids[i])) { + status = ICE_ERR_CFG; + goto free_mem; + } - if (l_type != ICE_SW_LKUP_ETHERTYPE_MAC && - l_type != ICE_SW_LKUP_ETHERTYPE) - return ICE_ERR_PARAM; + /* Find field vectors that include all specified protocol types */ + status = ice_get_sw_fv_list(hw, prot_ids, lkups_cnt, bm, fv_list); - em_list_itr->status = ice_add_rule_internal(hw, l_type, - em_list_itr); - if (em_list_itr->status) - return em_list_itr->status; - } - return 0; +free_mem: + devm_kfree(ice_hw_to_dev(hw), prot_ids); + return status; } /** - * ice_remove_eth_mac - Remove an ethertype (or MAC) based filter rule - * @hw: pointer to the hardware structure - * @em_list: list of ethertype or ethertype MAC entries + * ice_tun_type_match_word - determine if tun type needs a match mask + * @tun_type: tunnel type + * @mask: mask to be used for the tunnel */ -enum ice_status -ice_remove_eth_mac(struct ice_hw *hw, struct list_head *em_list) +static bool ice_tun_type_match_word(enum ice_sw_tunnel_type tun_type, u16 *mask) { - struct ice_fltr_list_entry *em_list_itr, *tmp; + switch (tun_type) { + case ICE_SW_TUN_VXLAN_GPE: + case ICE_SW_TUN_GENEVE: + case ICE_SW_TUN_VXLAN: + case ICE_SW_TUN_NVGRE: + case ICE_SW_TUN_UDP: + case ICE_ALL_TUNNELS: + case ICE_SW_TUN_IPV4_GTPU_IPV4: + case ICE_SW_TUN_IPV4_GTPU_IPV6: + case ICE_SW_TUN_IPV6_GTPU_IPV4: + case ICE_SW_TUN_IPV6_GTPU_IPV6: + case ICE_SW_TUN_IPV4_GTP_IPV4_TCP: + case ICE_SW_TUN_IPV4_GTP_IPV4_UDP: + case ICE_SW_TUN_IPV4_GTP_IPV6_TCP: + case ICE_SW_TUN_IPV4_GTP_IPV6_UDP: + case ICE_SW_TUN_IPV6_GTP_IPV4_TCP: + case ICE_SW_TUN_IPV6_GTP_IPV4_UDP: + case ICE_SW_TUN_IPV6_GTP_IPV6_TCP: + case ICE_SW_TUN_IPV6_GTP_IPV6_UDP: + /* support for GTP, using only inner protocols, + * outer protocols can be anything + */ + case ICE_SW_TUN_GTP_IPV4: + case ICE_SW_TUN_GTP_IPV6: + case ICE_SW_TUN_GTP_IPV4_TCP: + case ICE_SW_TUN_GTP_IPV4_UDP: + case ICE_SW_TUN_GTP_IPV6_TCP: + case ICE_SW_TUN_GTP_IPV6_UDP: + *mask = ICE_TUN_FLAG_MASK; + return true; + + case ICE_SW_TUN_GENEVE_VLAN: + case ICE_SW_TUN_VXLAN_VLAN: + *mask = ICE_TUN_FLAG_MASK & ~ICE_TUN_FLAG_VLAN_MASK; + return true; - if (!em_list || !hw) - return ICE_ERR_PARAM; + default: + *mask = 0; + return false; + } +} - list_for_each_entry_safe(em_list_itr, tmp, em_list, list_entry) { - enum ice_sw_lkup_type l_type = - em_list_itr->fltr_info.lkup_type; +/** + * ice_add_special_words - Add words that are not protocols, such as metadata + * @rinfo: other information regarding the rule e.g. 
priority and action info + * @lkup_exts: lookup word structure + */ +static enum ice_status +ice_add_special_words(struct ice_adv_rule_info *rinfo, + struct ice_prot_lkup_ext *lkup_exts) +{ + u16 mask; - if (l_type != ICE_SW_LKUP_ETHERTYPE_MAC && - l_type != ICE_SW_LKUP_ETHERTYPE) - return ICE_ERR_PARAM; + /* If this is a tunneled packet, then add recipe index to match the + * tunnel bit in the packet metadata flags. + */ + if (ice_tun_type_match_word(rinfo->tun_type, &mask)) { + if (lkup_exts->n_val_words < ICE_MAX_CHAIN_WORDS) { + u8 word = lkup_exts->n_val_words++; - em_list_itr->status = ice_remove_rule_internal(hw, l_type, - em_list_itr); - if (em_list_itr->status) - return em_list_itr->status; + lkup_exts->fv_words[word].prot_id = ICE_META_DATA_ID_HW; + lkup_exts->fv_words[word].off = ICE_TUN_FLAG_MDID_OFF; + lkup_exts->field_mask[word] = mask; + } else { + return ICE_ERR_MAX_LIMIT; + } } + return 0; } -/** - * ice_rem_sw_rule_info - * @hw: pointer to the hardware structure - * @rule_head: pointer to the switch list structure that we want to delete +/* ice_get_compat_fv_bitmap - Get compatible field vector bitmap for rule + * @hw: pointer to hardware structure + * @rinfo: other information regarding the rule e.g. priority and action info + * @bm: pointer to memory for returning the bitmap of field vectors */ static void -ice_rem_sw_rule_info(struct ice_hw *hw, struct list_head *rule_head) +ice_get_compat_fv_bitmap(struct ice_hw *hw, struct ice_adv_rule_info *rinfo, + unsigned long *bm) { - if (!list_empty(rule_head)) { - struct ice_fltr_mgmt_list_entry *entry; - struct ice_fltr_mgmt_list_entry *tmp; + enum ice_prof_type prof_type; - list_for_each_entry_safe(entry, tmp, rule_head, list_entry) { - list_del(&entry->list_entry); - devm_kfree(ice_hw_to_dev(hw), entry); - } + bitmap_zero(bm, ICE_MAX_NUM_PROFILES); + + switch (rinfo->tun_type) { + case ICE_NON_TUN: + prof_type = ICE_PROF_NON_TUN; + break; + case ICE_ALL_TUNNELS: + prof_type = ICE_PROF_TUN_ALL; + break; + case ICE_SW_TUN_VXLAN_GPE: + case ICE_SW_TUN_GENEVE: + case ICE_SW_TUN_GENEVE_VLAN: + case ICE_SW_TUN_VXLAN: + case ICE_SW_TUN_VXLAN_VLAN: + case ICE_SW_TUN_UDP: + prof_type = ICE_PROF_TUN_UDP; + break; + + /* Support for GTP tunnel + L3 */ + case ICE_SW_TUN_IPV4_GTPU_IPV4: + case ICE_SW_TUN_IPV4_GTPU_IPV6: + case ICE_SW_TUN_IPV6_GTPU_IPV4: + case ICE_SW_TUN_IPV6_GTPU_IPV6: + /* Support for GTP tunnel + L3 + L4 */ + case ICE_SW_TUN_IPV4_GTP_IPV4_UDP: + case ICE_SW_TUN_IPV4_GTP_IPV6_TCP: + case ICE_SW_TUN_IPV4_GTP_IPV6_UDP: + case ICE_SW_TUN_IPV6_GTP_IPV4_TCP: + case ICE_SW_TUN_IPV6_GTP_IPV4_UDP: + case ICE_SW_TUN_IPV6_GTP_IPV6_TCP: + case ICE_SW_TUN_IPV6_GTP_IPV6_UDP: + case ICE_SW_TUN_GTP_IPV4: + case ICE_SW_TUN_GTP_IPV6: + case ICE_SW_TUN_GTP_IPV4_TCP: + case ICE_SW_TUN_GTP_IPV4_UDP: + case ICE_SW_TUN_GTP_IPV6_TCP: + case ICE_SW_TUN_GTP_IPV6_UDP: + prof_type = ICE_PROF_TUN_UDP; + break; + + case ICE_SW_TUN_NVGRE: + prof_type = ICE_PROF_TUN_GRE; + break; + case ICE_SW_TUN_AND_NON_TUN: + default: + prof_type = ICE_PROF_ALL; + break; } + + ice_get_sw_fv_bitmap(hw, prof_type, bm); } /** - * ice_cfg_dflt_vsi - change state of VSI to set/clear default - * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to set as default - * @set: true to add the above mentioned switch rule, false to remove it - * @direction: ICE_FLTR_RX or ICE_FLTR_TX - * - * add filter rule to set/unset given VSI as default VSI for the switch - * (represented by swid) + * ice_add_adv_recipe - Add an advanced recipe that is not part of the 
default + * @hw: pointer to hardware structure + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @rinfo: other information regarding the rule e.g. priority and action info + * @rid: return the recipe ID of the recipe created */ -enum ice_status -ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_handle, bool set, u8 direction) +static enum ice_status +ice_add_adv_recipe(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo, u16 *rid) { - struct ice_aqc_sw_rules_elem *s_rule; - struct ice_fltr_info f_info; - enum ice_adminq_opc opcode; - enum ice_status status; - u16 s_rule_size; - u16 hw_vsi_id; + DECLARE_BITMAP(fv_bitmap, ICE_MAX_NUM_PROFILES); + DECLARE_BITMAP(profiles, ICE_MAX_NUM_PROFILES); + struct ice_prot_lkup_ext *lkup_exts; + struct ice_recp_grp_entry *r_entry; + struct ice_sw_fv_list_entry *fvit; + struct ice_recp_grp_entry *r_tmp; + struct ice_sw_fv_list_entry *tmp; + enum ice_status status = 0; + struct ice_sw_recipe *rm; + u16 match_tun_mask = 0; + u16 mask; + u8 i; - if (!ice_is_vsi_valid(hw, vsi_handle)) + if (!lkups_cnt) return ICE_ERR_PARAM; - hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); - s_rule_size = set ? ICE_SW_RULE_RX_TX_ETH_HDR_SIZE : - ICE_SW_RULE_RX_TX_NO_HDR_SIZE; - s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL); - if (!s_rule) + lkup_exts = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*lkup_exts), + GFP_KERNEL); + if (!lkup_exts) return ICE_ERR_NO_MEMORY; - memset(&f_info, 0, sizeof(f_info)); + /* Determine the number of words to be matched and if it exceeds a + * recipe's restrictions + */ + for (i = 0; i < lkups_cnt; i++) { + u16 count; + + if (lkups[i].type >= ICE_PROTOCOL_LAST) { + status = ICE_ERR_CFG; + goto err_free_lkup_exts; + } + + count = ice_fill_valid_words(&lkups[i], lkup_exts); + if (!count) { + status = ICE_ERR_CFG; + goto err_free_lkup_exts; + } + } + + rm = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rm), GFP_KERNEL); + if (!rm) { + status = ICE_ERR_NO_MEMORY; + goto err_free_lkup_exts; + } + + /* Get field vectors that contain fields extracted from all the protocol + * headers being programmed. + */ + INIT_LIST_HEAD(&rm->fv_list); + INIT_LIST_HEAD(&rm->rg_list); + + /* Get bitmap of field vectors (profiles) that are compatible with the + * rule request; only these will be searched in the subsequent call to + * ice_get_fv. + */ + ice_get_compat_fv_bitmap(hw, rinfo, fv_bitmap); + + status = ice_get_fv(hw, lkups, lkups_cnt, fv_bitmap, &rm->fv_list); + if (status) + goto err_unroll; + + + /* Group match words into recipes using preferred recipe grouping + * criteria. + */ + status = ice_create_recipe_group(hw, rm, lkup_exts); + if (status) + goto err_unroll; + + /* For certain tunnel types it is necessary to use a metadata ID flag to + * differentiate different tunnel types. A separate recipe needs to be + * used for the metadata. + */ + if (ice_tun_type_match_word(rinfo->tun_type, &mask) && + rm->n_grp_count > 1) + match_tun_mask = mask; + + /* set the recipe priority if specified */ + rm->priority = (u8)rinfo->priority; + + /* Find offsets from the field vector. Pick the first one for all the + * recipes. 
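/*
 * A minimal sketch of the profile association step performed above: read
 * the profile's current recipe bitmap, OR in the newly created recipe bits,
 * write it back, and mirror the result in both software mapping tables.
 * read_profile_recipes() and write_profile_recipes() are hypothetical
 * stand-ins for the admin-queue accessors, and 64-bit masks replace the
 * kernel bitmaps.
 */
#include <stdint.h>

#define DEMO_MAX_PROFILES 8
#define DEMO_MAX_RECIPES  64

static uint64_t demo_profile_to_recipe[DEMO_MAX_PROFILES];
static uint64_t demo_recipe_to_profile[DEMO_MAX_RECIPES];

uint64_t read_profile_recipes(int profile);             /* hypothetical */
void write_profile_recipes(int profile, uint64_t bm);   /* hypothetical */

static void demo_associate(int profile, uint64_t new_recipes)
{
        uint64_t bm = read_profile_recipes(profile) | new_recipes;
        int r;

        write_profile_recipes(profile, bm);
        demo_profile_to_recipe[profile] = bm;

        for (r = 0; r < DEMO_MAX_RECIPES; r++)
                if (new_recipes & (1ULL << r))
                        demo_recipe_to_profile[r] |= 1ULL << (unsigned)profile;
}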
+ */ + status = ice_fill_fv_word_index(hw, &rm->fv_list, &rm->rg_list); + if (status) + goto err_unroll; + + + /* get bitmap of all profiles the recipe will be associated with */ + bitmap_zero(profiles, ICE_MAX_NUM_PROFILES); + list_for_each_entry(fvit, &rm->fv_list, list_entry) { + ice_debug(hw, ICE_DBG_SW, "profile: %d\n", fvit->profile_id); + set_bit((u16)fvit->profile_id, profiles); + } + + /* Create any special protocol/offset pairs, such as looking at tunnel + * bits by extracting metadata + */ + status = ice_add_special_words(rinfo, lkup_exts); + if (status) + goto err_free_lkup_exts; + + /* Look for a recipe which matches our requested fv / mask list */ + *rid = ice_find_recp(hw, lkup_exts); + if (*rid < ICE_MAX_NUM_RECIPES) + /* Success if found a recipe that match the existing criteria */ + goto err_unroll; + + /* Recipe we need does not exist, add a recipe */ + status = ice_add_sw_recipe(hw, rm, match_tun_mask, profiles); + if (status) + goto err_unroll; + + /* Associate all the recipes created with all the profiles in the + * common field vector. + */ + list_for_each_entry(fvit, &rm->fv_list, list_entry) { + DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + u16 j; + + status = ice_aq_get_recipe_to_profile(hw, fvit->profile_id, + (u8 *)r_bitmap, NULL); + if (status) + goto err_unroll; + + bitmap_or(r_bitmap, r_bitmap, rm->r_bitmap, + ICE_MAX_NUM_RECIPES); + status = ice_acquire_change_lock(hw, ICE_RES_WRITE); + if (status) + goto err_unroll; + + status = ice_aq_map_recipe_to_profile(hw, fvit->profile_id, + (u8 *)r_bitmap, + NULL); + ice_release_change_lock(hw); + + if (status) + goto err_unroll; + + /* Update profile to recipe bitmap array */ + bitmap_copy(profile_to_recipe[fvit->profile_id], r_bitmap, + ICE_MAX_NUM_RECIPES); + + /* Update recipe to profile bitmap array */ + for_each_set_bit(j, rm->r_bitmap, ICE_MAX_NUM_RECIPES) + set_bit((u16)fvit->profile_id, recipe_to_profile[j]); + } + + *rid = rm->root_rid; + memcpy(&hw->switch_info->recp_list[*rid].lkup_exts, lkup_exts, + sizeof(*lkup_exts)); +err_unroll: + list_for_each_entry_safe(r_entry, r_tmp, &rm->rg_list, l_entry) { + list_del(&r_entry->l_entry); + devm_kfree(ice_hw_to_dev(hw), r_entry); + } + + list_for_each_entry_safe(fvit, tmp, &rm->fv_list, list_entry) { + list_del(&fvit->list_entry); + devm_kfree(ice_hw_to_dev(hw), fvit); + } + + if (rm->root_buf) + devm_kfree(ice_hw_to_dev(hw), rm->root_buf); + + devm_kfree(ice_hw_to_dev(hw), rm); + +err_free_lkup_exts: + devm_kfree(ice_hw_to_dev(hw), lkup_exts); + + return status; +} + +/** + * ice_find_dummy_packet - find dummy packet by tunnel type + * + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @tun_type: tunnel type from the match criteria + * @pkt: dummy packet to fill according to filter match criteria + * @pkt_len: packet length of dummy packet + * @offsets: pointer to receive the pointer to the offsets for the packet + */ +static void +ice_find_dummy_packet(struct ice_adv_lkup_elem *lkups, u16 lkups_cnt, + enum ice_sw_tunnel_type tun_type, const u8 **pkt, + u16 *pkt_len, + const struct ice_dummy_pkt_offsets **offsets) +{ + bool tcp = false, udp = false, ipv6 = false, vlan = false; + u16 i; + + for (i = 0; i < lkups_cnt; i++) { + if (lkups[i].type == ICE_UDP_ILOS) + udp = true; + else if (lkups[i].type == ICE_TCP_IL) + tcp = true; + else if (lkups[i].type == ICE_IPV6_OFOS) + ipv6 = true; + else if (lkups[i].type == ICE_VLAN_OFOS) + vlan = true; + } + + /* figure out 
which dummy packet and dummy offset to use if user + * wants to add filter for GTP (UDP based tunnel, where tunnel port for + * GTP is fixed, 2152) tunnel where inner/outer L3 could be IPv4[6] and + * likewise inner L4 could be TCP/UDP + */ + if (tun_type == ICE_SW_TUN_IPV4_GTP_IPV4_TCP || + tun_type == ICE_SW_TUN_GTP_IPV4_TCP) { + *pkt = dummy_ipv4_gtpu_ipv4_tcp_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv4_tcp_packet); + *offsets = dummy_ipv4_gtpu_ipv4_tcp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV4_GTP_IPV4_UDP || + tun_type == ICE_SW_TUN_GTP_IPV4_UDP) { + *pkt = dummy_ipv4_gtpu_ipv4_udp_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv4_udp_packet); + *offsets = dummy_ipv4_gtpu_ipv4_udp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV4_GTP_IPV6_TCP || + tun_type == ICE_SW_TUN_GTP_IPV6_TCP) { + *pkt = dummy_ipv4_gtpu_ipv6_tcp_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv6_tcp_packet); + *offsets = dummy_ipv4_gtpu_ipv6_tcp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV4_GTP_IPV6_UDP || + tun_type == ICE_SW_TUN_GTP_IPV6_UDP) { + *pkt = dummy_ipv4_gtpu_ipv6_udp_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv6_udp_packet); + *offsets = dummy_ipv4_gtpu_ipv6_udp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV6_GTP_IPV4_TCP) { + *pkt = dummy_ipv6_gtpu_ipv4_tcp_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv4_tcp_packet); + *offsets = dummy_ipv6_gtpu_ipv4_tcp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV6_GTP_IPV4_UDP) { + *pkt = dummy_ipv6_gtpu_ipv4_udp_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv4_udp_packet); + *offsets = dummy_ipv6_gtpu_ipv4_udp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV6_GTP_IPV6_TCP) { + *pkt = dummy_ipv6_gtpu_ipv6_tcp_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv6_tcp_packet); + *offsets = dummy_ipv6_gtpu_ipv6_tcp_packet_offsets; + return; + } + + if (tun_type == ICE_SW_TUN_IPV6_GTP_IPV6_UDP) { + *pkt = dummy_ipv6_gtpu_ipv6_udp_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv6_udp_packet); + *offsets = dummy_ipv6_gtpu_ipv6_udp_packet_offsets; + return; + } + + /* Support GTP tunnel + L3 */ + if (tun_type == ICE_SW_TUN_IPV4_GTPU_IPV4 || + tun_type == ICE_SW_TUN_GTP_IPV4) { + *pkt = dummy_ipv4_gtpu_ipv4_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv4_packet); + *offsets = dummy_ipv4_gtpu_ipv4_packet_offsets; + return; + } + if (tun_type == ICE_SW_TUN_IPV4_GTPU_IPV6 || + tun_type == ICE_SW_TUN_GTP_IPV6) { + *pkt = dummy_ipv4_gtpu_ipv6_packet; + *pkt_len = sizeof(dummy_ipv4_gtpu_ipv6_packet); + *offsets = dummy_ipv4_gtpu_ipv6_packet_offsets; + return; + } + if (tun_type == ICE_SW_TUN_IPV6_GTPU_IPV4) { + *pkt = dummy_ipv6_gtpu_ipv4_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv4_packet); + *offsets = dummy_ipv6_gtpu_ipv4_packet_offsets; + return; + } + if (tun_type == ICE_SW_TUN_IPV6_GTPU_IPV6) { + *pkt = dummy_ipv6_gtpu_ipv6_packet; + *pkt_len = sizeof(dummy_ipv6_gtpu_ipv6_packet); + *offsets = dummy_ipv6_gtpu_ipv6_packet_offsets; + return; + } + + if (tun_type == ICE_ALL_TUNNELS) { + *pkt = dummy_gre_udp_packet; + *pkt_len = sizeof(dummy_gre_udp_packet); + *offsets = dummy_gre_udp_packet_offsets; + return; + } - f_info.lkup_type = ICE_SW_LKUP_DFLT; - f_info.flag = direction; - f_info.fltr_act = ICE_FWD_TO_VSI; - f_info.fwd_id.hw_vsi_id = hw_vsi_id; + if (tun_type == ICE_SW_TUN_NVGRE) { + if (tcp) { + *pkt = dummy_gre_tcp_packet; + *pkt_len = sizeof(dummy_gre_tcp_packet); + *offsets = dummy_gre_tcp_packet_offsets; + return; + } - if (f_info.flag & 
ICE_FLTR_RX) { - f_info.src = hw->port_info->lport; - f_info.src_id = ICE_SRC_ID_LPORT; - if (!set) - f_info.fltr_rule_id = - hw->port_info->dflt_rx_vsi_rule_id; - } else if (f_info.flag & ICE_FLTR_TX) { - f_info.src_id = ICE_SRC_ID_VSI; - f_info.src = hw_vsi_id; - if (!set) - f_info.fltr_rule_id = - hw->port_info->dflt_tx_vsi_rule_id; + *pkt = dummy_gre_udp_packet; + *pkt_len = sizeof(dummy_gre_udp_packet); + *offsets = dummy_gre_udp_packet_offsets; + return; } - if (set) - opcode = ice_aqc_opc_add_sw_rules; - else - opcode = ice_aqc_opc_remove_sw_rules; - - ice_fill_sw_rule(hw, &f_info, s_rule, opcode); + if (tun_type == ICE_SW_TUN_VXLAN || tun_type == ICE_SW_TUN_GENEVE || + tun_type == ICE_SW_TUN_VXLAN_GPE || tun_type == ICE_SW_TUN_UDP || + tun_type == ICE_SW_TUN_GENEVE_VLAN || + tun_type == ICE_SW_TUN_VXLAN_VLAN) { + if (tcp) { + *pkt = dummy_udp_tun_tcp_packet; + *pkt_len = sizeof(dummy_udp_tun_tcp_packet); + *offsets = dummy_udp_tun_tcp_packet_offsets; + return; + } - status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opcode, NULL); - if (status || !(f_info.flag & ICE_FLTR_TX_RX)) - goto out; - if (set) { - u16 index = le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + *pkt = dummy_udp_tun_udp_packet; + *pkt_len = sizeof(dummy_udp_tun_udp_packet); + *offsets = dummy_udp_tun_udp_packet_offsets; + return; + } - if (f_info.flag & ICE_FLTR_TX) { - hw->port_info->dflt_tx_vsi_num = hw_vsi_id; - hw->port_info->dflt_tx_vsi_rule_id = index; - } else if (f_info.flag & ICE_FLTR_RX) { - hw->port_info->dflt_rx_vsi_num = hw_vsi_id; - hw->port_info->dflt_rx_vsi_rule_id = index; + if (udp && !ipv6) { + if (vlan) { + *pkt = dummy_vlan_udp_packet; + *pkt_len = sizeof(dummy_vlan_udp_packet); + *offsets = dummy_vlan_udp_packet_offsets; + return; } - } else { - if (f_info.flag & ICE_FLTR_TX) { - hw->port_info->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL; - hw->port_info->dflt_tx_vsi_rule_id = ICE_INVAL_ACT; - } else if (f_info.flag & ICE_FLTR_RX) { - hw->port_info->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL; - hw->port_info->dflt_rx_vsi_rule_id = ICE_INVAL_ACT; + *pkt = dummy_udp_packet; + *pkt_len = sizeof(dummy_udp_packet); + *offsets = dummy_udp_packet_offsets; + return; + } else if (udp && ipv6) { + if (vlan) { + *pkt = dummy_vlan_udp_ipv6_packet; + *pkt_len = sizeof(dummy_vlan_udp_ipv6_packet); + *offsets = dummy_vlan_udp_ipv6_packet_offsets; + return; } + *pkt = dummy_udp_ipv6_packet; + *pkt_len = sizeof(dummy_udp_ipv6_packet); + *offsets = dummy_udp_ipv6_packet_offsets; + return; + } else if ((tcp && ipv6) || ipv6) { + if (vlan) { + *pkt = dummy_vlan_tcp_ipv6_packet; + *pkt_len = sizeof(dummy_vlan_tcp_ipv6_packet); + *offsets = dummy_vlan_tcp_ipv6_packet_offsets; + return; + } + *pkt = dummy_tcp_ipv6_packet; + *pkt_len = sizeof(dummy_tcp_ipv6_packet); + *offsets = dummy_tcp_ipv6_packet_offsets; + return; } -out: - devm_kfree(ice_hw_to_dev(hw), s_rule); - return status; -} - -/** - * ice_find_ucast_rule_entry - Search for a unicast MAC filter rule entry - * @hw: pointer to the hardware structure - * @recp_id: lookup type for which the specified rule needs to be searched - * @f_info: rule information - * - * Helper function to search for a unicast rule entry - this is to be used - * to remove unicast MAC filter that is not shared with other VSIs on the - * PF switch. 
- * - * Returns pointer to entry storing the rule if found - */ -static struct ice_fltr_mgmt_list_entry * -ice_find_ucast_rule_entry(struct ice_hw *hw, u8 recp_id, - struct ice_fltr_info *f_info) -{ - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_mgmt_list_entry *list_itr; - struct list_head *list_head; - - list_head = &sw->recp_list[recp_id].filt_rules; - list_for_each_entry(list_itr, list_head, list_entry) { - if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data, - sizeof(f_info->l_data)) && - f_info->fwd_id.hw_vsi_id == - list_itr->fltr_info.fwd_id.hw_vsi_id && - f_info->flag == list_itr->fltr_info.flag) - return list_itr; + if (vlan) { + *pkt = dummy_vlan_tcp_packet; + *pkt_len = sizeof(dummy_vlan_tcp_packet); + *offsets = dummy_vlan_tcp_packet_offsets; + } else { + *pkt = dummy_tcp_packet; + *pkt_len = sizeof(dummy_tcp_packet); + *offsets = dummy_tcp_packet_offsets; } - return NULL; } /** - * ice_remove_mac - remove a MAC address based filter rule - * @hw: pointer to the hardware structure - * @m_list: list of MAC addresses and forwarding information - * - * This function removes either a MAC filter rule or a specific VSI from a - * VSI list for a multicast MAC address. + * ice_fill_adv_dummy_packet - fill a dummy packet with given match criteria * - * Returns ICE_ERR_DOES_NOT_EXIST if a given entry was not added by - * ice_add_mac. Caller should be aware that this call will only work if all - * the entries passed into m_list were added previously. It will not attempt to - * do a partial remove of entries that were found. + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @s_rule: stores rule information from the match criteria + * @dummy_pkt: dummy packet to fill according to filter match criteria + * @pkt_len: packet length of dummy packet + * @offsets: offset info for the dummy packet */ -enum ice_status -ice_remove_mac(struct ice_hw *hw, struct list_head *m_list) +static enum ice_status +ice_fill_adv_dummy_packet(struct ice_adv_lkup_elem *lkups, u16 lkups_cnt, + struct ice_aqc_sw_rules_elem *s_rule, + const u8 *dummy_pkt, u16 pkt_len, + const struct ice_dummy_pkt_offsets *offsets) { - struct ice_fltr_list_entry *list_itr, *tmp; - struct mutex *rule_lock; /* Lock to protect filter rule list */ + u8 *pkt; + u16 i; - if (!m_list) - return ICE_ERR_PARAM; + /* Start with a packet with a pre-defined/dummy content. Then, fill + * in the header values to be looked up or matched. 
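Every dummy packet returned by ice_find_dummy_packet() travels with an offsets table telling ice_fill_adv_dummy_packet() where each protocol header begins, terminated by an ICE_PROTOCOL_LAST entry. The sketch below is a hypothetical table for a plain untagged MAC/IPv4/TCP dummy, assuming the conventional 14-byte Ethernet and 20-byte IPv4 header layout; the real per-packet tables live elsewhere in ice_switch.c and may differ.

/* Hypothetical offsets table for an untagged MAC/IPv4/TCP dummy packet:
 * destination/source MACs at 0, ethertype at 12, IPv4 at 14, TCP at 34.
 * The ICE_PROTOCOL_LAST terminator is what the fill loops key off.
 */
static const struct ice_dummy_pkt_offsets dummy_mac_ipv4_tcp_offsets[] = {
	{ ICE_MAC_OFOS,		0 },
	{ ICE_ETYPE_OL,		12 },
	{ ICE_IPV4_OFOS,	14 },
	{ ICE_TCP_IL,		34 },
	{ ICE_PROTOCOL_LAST,	0 },
};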
+ */ + pkt = s_rule->pdata.lkup_tx_rx.hdr; - rule_lock = &hw->switch_info->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock; - list_for_each_entry_safe(list_itr, tmp, m_list, list_entry) { - enum ice_sw_lkup_type l_type = list_itr->fltr_info.lkup_type; - u8 *add = &list_itr->fltr_info.l_data.mac.mac_addr[0]; - u16 vsi_handle; + memcpy(pkt, dummy_pkt, pkt_len); - if (l_type != ICE_SW_LKUP_MAC) - return ICE_ERR_PARAM; + for (i = 0; i < lkups_cnt; i++) { + enum ice_protocol_type type; + u16 offset = 0, len = 0, j; + bool found = false; - vsi_handle = list_itr->fltr_info.vsi_handle; - if (!ice_is_vsi_valid(hw, vsi_handle)) + /* find the start of this layer; it should be found since this + * was already checked when search for the dummy packet + */ + type = lkups[i].type; + for (j = 0; offsets[j].type != ICE_PROTOCOL_LAST; j++) { + if (type == offsets[j].type) { + offset = offsets[j].offset; + found = true; + break; + } + } + /* this should never happen in a correct calling sequence */ + if (!found) return ICE_ERR_PARAM; - list_itr->fltr_info.fwd_id.hw_vsi_id = - ice_get_hw_vsi_num(hw, vsi_handle); - if (is_unicast_ether_addr(add) && !hw->ucast_shared) { - /* Don't remove the unicast address that belongs to - * another VSI on the switch, since it is not being - * shared... - */ - mutex_lock(rule_lock); - if (!ice_find_ucast_rule_entry(hw, ICE_SW_LKUP_MAC, - &list_itr->fltr_info)) { - mutex_unlock(rule_lock); - return ICE_ERR_DOES_NOT_EXIST; - } - mutex_unlock(rule_lock); + switch (lkups[i].type) { + case ICE_MAC_OFOS: + case ICE_MAC_IL: + len = sizeof(struct ice_ether_hdr); + break; + case ICE_ETYPE_OL: + len = sizeof(struct ice_ethtype_hdr); + break; + case ICE_VLAN_OFOS: + len = sizeof(struct ice_vlan_hdr); + break; + case ICE_IPV4_OFOS: + case ICE_IPV4_IL: + len = sizeof(struct ice_ipv4_hdr); + break; + case ICE_IPV6_OFOS: + case ICE_IPV6_IL: + len = sizeof(struct ice_ipv6_hdr); + break; + case ICE_TCP_IL: + case ICE_UDP_OF: + case ICE_UDP_ILOS: + len = sizeof(struct ice_l4_hdr); + break; + case ICE_SCTP_IL: + len = sizeof(struct ice_sctp_hdr); + break; + case ICE_NVGRE: + len = sizeof(struct ice_nvgre); + break; + case ICE_VXLAN: + case ICE_GENEVE: + case ICE_VXLAN_GPE: + len = sizeof(struct ice_udp_tnl_hdr); + break; + + case ICE_GTP: + len = sizeof(struct ice_udp_gtp_hdr); + break; + default: + return ICE_ERR_PARAM; } - list_itr->status = ice_remove_rule_internal(hw, - ICE_SW_LKUP_MAC, - list_itr); - if (list_itr->status) - return list_itr->status; + + /* the length should be a word multiple */ + if (len % ICE_BYTES_PER_WORD) + return ICE_ERR_CFG; + + /* We have the offset to the header start, the length, the + * caller's header values and mask. Use this information to + * copy the data into the dummy packet appropriately based on + * the mask. Note that we need to only write the bits as + * indicated by the mask to make sure we don't improperly write + * over any significant packet data. 
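The copy loop that follows merges each 16-bit word of the caller's header value into the dummy packet only where the corresponding mask word has bits set, so untouched dummy bytes survive. The bit arithmetic is easy to verify in isolation; below is a self-contained userspace rendering of the same merge with purely illustrative values.

#include <stdint.h>
#include <stdio.h>

/* Merge one 16-bit header word into a packet word under a mask: bits
 * covered by the mask come from the caller's value, all other bits keep
 * whatever the dummy packet already contained.
 */
static uint16_t merge_word(uint16_t pkt, uint16_t val, uint16_t mask)
{
	return (uint16_t)((pkt & ~mask) | (val & mask));
}

int main(void)
{
	uint16_t pkt = 0x4500;	/* word already present in the dummy */
	uint16_t val = 0x0011;	/* caller-supplied header word */
	uint16_t mask = 0x00ff;	/* only the low byte is significant */

	printf("0x%04x\n", merge_word(pkt, val, mask));	/* prints 0x4511 */
	return 0;
}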
+ */ + for (j = 0; j < len / sizeof(u16); j++) + /* cppcheck-suppress objectIndex */ + if (((u16 *)&lkups[i].m_u)[j]) + ((u16 *)(pkt + offset))[j] = + (((u16 *)(pkt + offset))[j] & + /* cppcheck-suppress objectIndex */ + ~((u16 *)&lkups[i].m_u)[j]) | + /* cppcheck-suppress objectIndex */ + (((u16 *)&lkups[i].h_u)[j] & + /* cppcheck-suppress objectIndex */ + ((u16 *)&lkups[i].m_u)[j]); } + + s_rule->pdata.lkup_tx_rx.hdr_len = cpu_to_le16(pkt_len); + return 0; } /** - * ice_remove_vlan - Remove VLAN based filter rule + * ice_fill_adv_packet_tun - fill dummy packet with udp tunnel port * @hw: pointer to the hardware structure - * @v_list: list of VLAN entries and forwarding information + * @tun_type: tunnel type + * @pkt: dummy packet to fill in + * @offsets: offset info for the dummy packet */ -enum ice_status -ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list) +static enum ice_status +ice_fill_adv_packet_tun(struct ice_hw *hw, enum ice_sw_tunnel_type tun_type, + u8 *pkt, const struct ice_dummy_pkt_offsets *offsets) { - struct ice_fltr_list_entry *v_list_itr, *tmp; + u16 open_port, i; + + switch (tun_type) { + case ICE_SW_TUN_AND_NON_TUN: + case ICE_SW_TUN_VXLAN_GPE: + case ICE_SW_TUN_VXLAN: + case ICE_SW_TUN_VXLAN_VLAN: + case ICE_SW_TUN_UDP: + if (!ice_get_open_tunnel_port(hw, TNL_VXLAN, &open_port)) + return ICE_ERR_CFG; + break; - if (!v_list || !hw) - return ICE_ERR_PARAM; + case ICE_SW_TUN_GENEVE: + case ICE_SW_TUN_GENEVE_VLAN: + if (!ice_get_open_tunnel_port(hw, TNL_GENEVE, &open_port)) + return ICE_ERR_CFG; + break; - list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) { - enum ice_sw_lkup_type l_type = v_list_itr->fltr_info.lkup_type; + default: + /* Nothing needs to be done for this tunnel type */ + return 0; + } - if (l_type != ICE_SW_LKUP_VLAN) - return ICE_ERR_PARAM; - v_list_itr->status = ice_remove_rule_internal(hw, - ICE_SW_LKUP_VLAN, - v_list_itr); - if (v_list_itr->status) - return v_list_itr->status; + /* Find the outer UDP protocol header and insert the port number */ + for (i = 0; offsets[i].type != ICE_PROTOCOL_LAST; i++) { + if (offsets[i].type == ICE_UDP_OF) { + struct ice_l4_hdr *hdr; + u16 offset; + + offset = offsets[i].offset; + hdr = (struct ice_l4_hdr *)&pkt[offset]; + hdr->dst_port = cpu_to_be16(open_port); + + return 0; + } } - return 0; -} -/** - * ice_vsi_uses_fltr - Determine if given VSI uses specified filter - * @fm_entry: filter entry to inspect - * @vsi_handle: VSI handle to compare with filter info - */ -static bool -ice_vsi_uses_fltr(struct ice_fltr_mgmt_list_entry *fm_entry, u16 vsi_handle) -{ - return ((fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI && - fm_entry->fltr_info.vsi_handle == vsi_handle) || - (fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI_LIST && - fm_entry->vsi_list_info && - (test_bit(vsi_handle, fm_entry->vsi_list_info->vsi_map)))); + return ICE_ERR_CFG; } /** - * ice_add_entry_to_vsi_fltr_list - Add copy of fltr_list_entry to remove list + * ice_find_adv_rule_entry - Search a rule entry * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to remove filters from - * @vsi_list_head: pointer to the list to add entry to - * @fi: pointer to fltr_info of filter entry to copy & add + * @lkups: lookup elements or match criteria for the advanced recipe, one + * structure per protocol header + * @lkups_cnt: number of protocols + * @recp_id: recipe ID for which we are finding the rule + * @rinfo: other information regarding the rule e.g. 
priority and action info * - * Helper function, used when creating a list of filters to remove from - * a specific VSI. The entry added to vsi_list_head is a COPY of the - * original filter entry, with the exception of fltr_info.fltr_act and - * fltr_info.fwd_id fields. These are set such that later logic can - * extract which VSI to remove the fltr from, and pass on that information. + * Helper function to search for a given advance rule entry + * Returns pointer to entry storing the rule if found */ -static enum ice_status -ice_add_entry_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_handle, - struct list_head *vsi_list_head, - struct ice_fltr_info *fi) +static struct ice_adv_fltr_mgmt_list_entry * +ice_find_adv_rule_entry(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, u16 recp_id, + struct ice_adv_rule_info *rinfo) { - struct ice_fltr_list_entry *tmp; - - /* this memory is freed up in the caller function - * once filters for this VSI are removed - */ - tmp = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*tmp), GFP_KERNEL); - if (!tmp) - return ICE_ERR_NO_MEMORY; - - tmp->fltr_info = *fi; - - /* Overwrite these fields to indicate which VSI to remove filter from, - * so find and remove logic can extract the information from the - * list entries. Note that original entries will still have proper - * values. - */ - tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI; - tmp->fltr_info.vsi_handle = vsi_handle; - tmp->fltr_info.fwd_id.hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); + struct ice_adv_fltr_mgmt_list_entry *list_itr; + struct ice_switch_info *sw = hw->switch_info; + int i; - list_add(&tmp->list_entry, vsi_list_head); + list_for_each_entry(list_itr, &sw->recp_list[recp_id].filt_rules, + list_entry) { + bool lkups_matched = true; - return 0; + if (lkups_cnt != list_itr->lkups_cnt) + continue; + for (i = 0; i < list_itr->lkups_cnt; i++) + if (memcmp(&list_itr->lkups[i], &lkups[i], + sizeof(*lkups))) { + lkups_matched = false; + break; + } + if (rinfo->sw_act.flag == list_itr->rule_info.sw_act.flag && + rinfo->tun_type == list_itr->rule_info.tun_type && + lkups_matched) + return list_itr; + } + return NULL; } /** - * ice_add_to_vsi_fltr_list - Add VSI filters to the list + * ice_adv_add_update_vsi_list * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to remove filters from - * @lkup_list_head: pointer to the list that has certain lookup type filters - * @vsi_list_head: pointer to the list pertaining to VSI with vsi_handle + * @m_entry: pointer to current adv filter management list entry + * @cur_fltr: filter information from the book keeping entry + * @new_fltr: filter information with the new VSI to be added * - * Locates all filters in lkup_list_head that are used by the given VSI, - * and adds COPIES of those entries to vsi_list_head (intended to be used - * to remove the listed filters). - * Note that this means all entries in vsi_list_head must be explicitly - * deallocated by the caller when done with list. + * Call AQ command to add or update previously created VSI list with new VSI. 
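Note that ice_find_adv_rule_entry() above detects duplicates with a byte-wise memcmp() of each ice_adv_lkup_elem, so lookup arrays handed to the add and remove paths should be fully zeroed before the relevant header and mask fields are set; stale bytes in the unused parts of the h_u/m_u unions would make two logically identical lookups compare unequal. A minimal sketch of that zero-then-fill pattern, using the structure added to ice_switch.h by this patch:

	struct ice_adv_lkup_elem lkup;

	/* zero the whole element first: the memcmp()-based duplicate
	 * detection compares every byte, including union padding
	 */
	memset(&lkup, 0, sizeof(lkup));
	lkup.type = ICE_IPV4_OFOS;
	/* ...then set only the h_u header fields and m_u mask words of
	 * interest for this lookup
	 */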
+ * + * Helper function to do book keeping associated with adding filter information + * The algorithm to do the booking keeping is described below : + * When a VSI needs to subscribe to a given advanced filter + * if only one VSI has been added till now + * Allocate a new VSI list and add two VSIs + * to this list using switch rule command + * Update the previously created switch rule with the + * newly created VSI list ID + * if a VSI list was previously created + * Add the new VSI to the previously created VSI list set + * using the update switch rule command */ static enum ice_status -ice_add_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_handle, - struct list_head *lkup_list_head, - struct list_head *vsi_list_head) +ice_adv_add_update_vsi_list(struct ice_hw *hw, + struct ice_adv_fltr_mgmt_list_entry *m_entry, + struct ice_adv_rule_info *cur_fltr, + struct ice_adv_rule_info *new_fltr) { - struct ice_fltr_mgmt_list_entry *fm_entry; - enum ice_status status = 0; + enum ice_status status; + u16 vsi_list_id = 0; + + if (cur_fltr->sw_act.fltr_act == ICE_FWD_TO_Q || + cur_fltr->sw_act.fltr_act == ICE_FWD_TO_QGRP || + cur_fltr->sw_act.fltr_act == ICE_DROP_PACKET) + return ICE_ERR_NOT_IMPL; + + if ((new_fltr->sw_act.fltr_act == ICE_FWD_TO_Q || + new_fltr->sw_act.fltr_act == ICE_FWD_TO_QGRP) && + (cur_fltr->sw_act.fltr_act == ICE_FWD_TO_VSI || + cur_fltr->sw_act.fltr_act == ICE_FWD_TO_VSI_LIST)) + return ICE_ERR_NOT_IMPL; - /* check to make sure VSI ID is valid and within boundary */ - if (!ice_is_vsi_valid(hw, vsi_handle)) - return ICE_ERR_PARAM; + if (m_entry->vsi_count < 2 && !m_entry->vsi_list_info) { + /* Only one entry existed in the mapping and it was not already + * a part of a VSI list. So, create a VSI list with the old and + * new VSIs. + */ + struct ice_fltr_info tmp_fltr; + u16 vsi_handle_arr[2]; - list_for_each_entry(fm_entry, lkup_list_head, list_entry) { - if (!ice_vsi_uses_fltr(fm_entry, vsi_handle)) - continue; + /* A rule already exists with the new VSI being added */ + if (cur_fltr->sw_act.fwd_id.hw_vsi_id == + new_fltr->sw_act.fwd_id.hw_vsi_id) + return ICE_ERR_ALREADY_EXISTS; - status = ice_add_entry_to_vsi_fltr_list(hw, vsi_handle, - vsi_list_head, - &fm_entry->fltr_info); + vsi_handle_arr[0] = cur_fltr->sw_act.vsi_handle; + vsi_handle_arr[1] = new_fltr->sw_act.vsi_handle; + status = ice_create_vsi_list_rule(hw, &vsi_handle_arr[0], 2, + &vsi_list_id, + ICE_SW_LKUP_LAST); if (status) return status; - } - return status; -} -/** - * ice_determine_promisc_mask - * @fi: filter info to parse - * - * Helper function to determine which ICE_PROMISC_ mask corresponds - * to given filter into. - */ -static u8 ice_determine_promisc_mask(struct ice_fltr_info *fi) -{ - u16 vid = fi->l_data.mac_vlan.vlan_id; - u8 *macaddr = fi->l_data.mac.mac_addr; - bool is_tx_fltr = false; - u8 promisc_mask = 0; + memset(&tmp_fltr, 0, sizeof(tmp_fltr)); + tmp_fltr.flag = m_entry->rule_info.sw_act.flag; + tmp_fltr.fltr_rule_id = cur_fltr->fltr_rule_id; + tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST; + tmp_fltr.fwd_id.vsi_list_id = vsi_list_id; + tmp_fltr.lkup_type = ICE_SW_LKUP_LAST; - if (fi->flag == ICE_FLTR_TX) - is_tx_fltr = true; + /* Update the previous switch rule of "forward to VSI" to + * "fwd to VSI list" + */ + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); + if (status) + return status; - if (is_broadcast_ether_addr(macaddr)) - promisc_mask |= is_tx_fltr ? - ICE_PROMISC_BCAST_TX : ICE_PROMISC_BCAST_RX; - else if (is_multicast_ether_addr(macaddr)) - promisc_mask |= is_tx_fltr ? 
- ICE_PROMISC_MCAST_TX : ICE_PROMISC_MCAST_RX; - else if (is_unicast_ether_addr(macaddr)) - promisc_mask |= is_tx_fltr ? - ICE_PROMISC_UCAST_TX : ICE_PROMISC_UCAST_RX; - if (vid) - promisc_mask |= is_tx_fltr ? - ICE_PROMISC_VLAN_TX : ICE_PROMISC_VLAN_RX; + cur_fltr->sw_act.fwd_id.vsi_list_id = vsi_list_id; + cur_fltr->sw_act.fltr_act = ICE_FWD_TO_VSI_LIST; + m_entry->vsi_list_info = + ice_create_vsi_list_map(hw, &vsi_handle_arr[0], 2, + vsi_list_id); + } else { + u16 vsi_handle = new_fltr->sw_act.vsi_handle; - return promisc_mask; -} + if (!m_entry->vsi_list_info) + return ICE_ERR_CFG; -/** - * ice_remove_promisc - Remove promisc based filter rules - * @hw: pointer to the hardware structure - * @recp_id: recipe ID for which the rule needs to removed - * @v_list: list of promisc entries - */ -static enum ice_status -ice_remove_promisc(struct ice_hw *hw, u8 recp_id, - struct list_head *v_list) -{ - struct ice_fltr_list_entry *v_list_itr, *tmp; + /* A rule already exists with the new VSI being added */ + if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) + return 0; - list_for_each_entry_safe(v_list_itr, tmp, v_list, list_entry) { - v_list_itr->status = - ice_remove_rule_internal(hw, recp_id, v_list_itr); - if (v_list_itr->status) - return v_list_itr->status; + /* Update the previously created VSI list set with + * the new VSI ID passed in + */ + vsi_list_id = cur_fltr->sw_act.fwd_id.vsi_list_id; + + status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, + vsi_list_id, false, + ice_aqc_opc_update_sw_rules, + ICE_SW_LKUP_LAST); + /* update VSI list mapping info with new VSI ID */ + if (!status) + set_bit(vsi_handle, m_entry->vsi_list_info->vsi_map); } - return 0; + if (!status) + m_entry->vsi_count++; + return status; } /** - * ice_clear_vsi_promisc - clear specified promiscuous mode(s) for given VSI + * ice_add_adv_rule - helper function to create an advanced switch rule * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to clear mode - * @promisc_mask: mask of promiscuous config bits to clear - * @vid: VLAN ID to clear VLAN promiscuous + * @lkups: information on the words that needs to be looked up. All words + * together makes one recipe + * @lkups_cnt: num of entries in the lkups array + * @rinfo: other information related to the rule that needs to be programmed + * @added_entry: this will return recipe_id, rule_id and vsi_handle. should be + * ignored is case of error. + * + * This function can program only 1 rule at a time. The lkups is used to + * describe the all the words that forms the "lookup" portion of the recipe. + * These words can span multiple protocols. Callers to this function need to + * pass in a list of protocol headers with lookup information along and mask + * that determines which words are valid from the given protocol header. + * rinfo describes other information related to this rule such as forwarding + * IDs, priority of this rule, etc. 
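Before building a recipe, ice_add_adv_rule() (below) counts how many 16-bit words across all lookup masks are non-zero and rejects requests with zero valid words or more than ICE_MAX_CHAIN_WORDS. A stand-alone sketch of that counting rule, in plain C so it can be compiled and checked outside the driver; the function name is illustrative only.

#include <stdint.h>
#include <stddef.h>

/* Count the 16-bit mask words that are non-zero in one lookup element's
 * mask blob; the add path sums this across all elements and bounds the
 * total by ICE_MAX_CHAIN_WORDS.
 */
static unsigned int count_valid_words(const void *mask, size_t len)
{
	const uint16_t *w = mask;
	unsigned int n = 0;
	size_t i;

	for (i = 0; i < len / sizeof(uint16_t); i++)
		if (w[i])
			n++;
	return n;
}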
*/ enum ice_status -ice_clear_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, - u16 vid) +ice_add_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo, + struct ice_rule_query_data *added_entry) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_list_entry *fm_entry, *tmp; - struct list_head remove_list_head; - struct ice_fltr_mgmt_list_entry *itr; + struct ice_adv_fltr_mgmt_list_entry *m_entry, *adv_fltr = NULL; + u16 rid = 0, i, pkt_len, rule_buf_sz, vsi_handle; + const struct ice_dummy_pkt_offsets *pkt_offsets; + struct ice_aqc_sw_rules_elem *s_rule = NULL; struct list_head *rule_head; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status = 0; - u8 recipe_id; + struct ice_switch_info *sw; + enum ice_status status; + const u8 *pkt = NULL; + u16 word_cnt; + u32 act = 0; + u8 q_rgn; - if (!ice_is_vsi_valid(hw, vsi_handle)) + /* Initialize profile to result index bitmap */ + if (!hw->switch_info->prof_res_bm_init) { + hw->switch_info->prof_res_bm_init = 1; + ice_init_prof_result_bm(hw); + } + + if (!lkups_cnt) return ICE_ERR_PARAM; - if (vid) - recipe_id = ICE_SW_LKUP_PROMISC_VLAN; - else - recipe_id = ICE_SW_LKUP_PROMISC; + /* get # of words we need to match */ + word_cnt = 0; + for (i = 0; i < lkups_cnt; i++) { + u16 j, *ptr; - rule_head = &sw->recp_list[recipe_id].filt_rules; - rule_lock = &sw->recp_list[recipe_id].filt_rule_lock; + ptr = (u16 *)&lkups[i].m_u; + for (j = 0; j < sizeof(lkups->m_u) / sizeof(u16); j++) + /* cppcheck-suppress objectIndex */ + if (ptr[j] != 0) + word_cnt++; + } - INIT_LIST_HEAD(&remove_list_head); + if (!word_cnt || word_cnt > ICE_MAX_CHAIN_WORDS) + return ICE_ERR_PARAM; - mutex_lock(rule_lock); - list_for_each_entry(itr, rule_head, list_entry) { - u8 fltr_promisc_mask = 0; + /* make sure that we can locate a dummy packet */ + ice_find_dummy_packet(lkups, lkups_cnt, rinfo->tun_type, &pkt, &pkt_len, + &pkt_offsets); + if (!pkt) { + status = ICE_ERR_PARAM; + goto err_ice_add_adv_rule; + } - if (!ice_vsi_uses_fltr(itr, vsi_handle)) - continue; + if (!(rinfo->sw_act.fltr_act == ICE_FWD_TO_VSI || + rinfo->sw_act.fltr_act == ICE_FWD_TO_Q || + rinfo->sw_act.fltr_act == ICE_FWD_TO_QGRP || + rinfo->sw_act.fltr_act == ICE_DROP_PACKET)) + return ICE_ERR_CFG; - fltr_promisc_mask |= - ice_determine_promisc_mask(&itr->fltr_info); + vsi_handle = rinfo->sw_act.vsi_handle; + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; - /* Skip if filter is not completely specified by given mask */ - if (fltr_promisc_mask & ~promisc_mask) - continue; + if (rinfo->sw_act.fltr_act == ICE_FWD_TO_VSI) + rinfo->sw_act.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, vsi_handle); + if (rinfo->sw_act.flag & ICE_FLTR_TX) + rinfo->sw_act.src = ice_get_hw_vsi_num(hw, vsi_handle); - status = ice_add_entry_to_vsi_fltr_list(hw, vsi_handle, - &remove_list_head, - &itr->fltr_info); - if (status) { - mutex_unlock(rule_lock); - goto free_fltr_list; + status = ice_add_adv_recipe(hw, lkups, lkups_cnt, rinfo, &rid); + if (status) + return status; + m_entry = ice_find_adv_rule_entry(hw, lkups, lkups_cnt, rid, rinfo); + if (m_entry) { + /* we have to add VSI to VSI_LIST and increment vsi_count. 
+ * Also Update VSI list so that we can change forwarding rule + * if the rule already exists, we will check if it exists with + * same vsi_id, if not then add it to the VSI list if it already + * exists if not then create a VSI list and add the existing VSI + * ID and the new VSI ID to the list + * We will add that VSI to the list + */ + status = ice_adv_add_update_vsi_list(hw, m_entry, + &m_entry->rule_info, + rinfo); + if (added_entry) { + added_entry->rid = rid; + added_entry->rule_id = m_entry->rule_info.fltr_rule_id; + added_entry->vsi_handle = rinfo->sw_act.vsi_handle; } + return status; + } + rule_buf_sz = ICE_SW_RULE_RX_TX_NO_HDR_SIZE + pkt_len; + s_rule = devm_kzalloc(ice_hw_to_dev(hw), rule_buf_sz, GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + act |= ICE_SINGLE_ACT_LB_ENABLE | ICE_SINGLE_ACT_LAN_ENABLE; + switch (rinfo->sw_act.fltr_act) { + case ICE_FWD_TO_VSI: + act |= (rinfo->sw_act.fwd_id.hw_vsi_id << + ICE_SINGLE_ACT_VSI_ID_S) & ICE_SINGLE_ACT_VSI_ID_M; + act |= ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_VALID_BIT; + break; + case ICE_FWD_TO_Q: + act |= ICE_SINGLE_ACT_TO_Q; + act |= (rinfo->sw_act.fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & + ICE_SINGLE_ACT_Q_INDEX_M; + break; + case ICE_FWD_TO_QGRP: + q_rgn = rinfo->sw_act.qgrp_size > 0 ? + (u8)ilog2(rinfo->sw_act.qgrp_size) : 0; + act |= ICE_SINGLE_ACT_TO_Q; + act |= (rinfo->sw_act.fwd_id.q_id << ICE_SINGLE_ACT_Q_INDEX_S) & + ICE_SINGLE_ACT_Q_INDEX_M; + act |= (q_rgn << ICE_SINGLE_ACT_Q_REGION_S) & + ICE_SINGLE_ACT_Q_REGION_M; + break; + case ICE_DROP_PACKET: + act |= ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_DROP | + ICE_SINGLE_ACT_VALID_BIT; + break; + default: + status = ICE_ERR_CFG; + goto err_ice_add_adv_rule; } - mutex_unlock(rule_lock); - status = ice_remove_promisc(hw, recipe_id, &remove_list_head); + /* set the rule LOOKUP type based on caller specified 'RX' + * instead of hardcoding it to be either LOOKUP_TX/RX + * + * for 'RX' set the source to be the port number + * for 'TX' set the source to be the source HW VSI number (determined + * by caller) + */ + if (rinfo->rx) { + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); + s_rule->pdata.lkup_tx_rx.src = + cpu_to_le16(hw->port_info->lport); + } else { + s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); + s_rule->pdata.lkup_tx_rx.src = cpu_to_le16(rinfo->sw_act.src); + } -free_fltr_list: - list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) { - list_del(&fm_entry->list_entry); - devm_kfree(ice_hw_to_dev(hw), fm_entry); + s_rule->pdata.lkup_tx_rx.recipe_id = cpu_to_le16(rid); + s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act); + + status = ice_fill_adv_dummy_packet(lkups, lkups_cnt, s_rule, pkt, + pkt_len, pkt_offsets); + if (status) + goto err_ice_add_adv_rule; + + if (rinfo->tun_type != ICE_NON_TUN && + rinfo->tun_type != ICE_SW_TUN_AND_NON_TUN) { + status = ice_fill_adv_packet_tun(hw, rinfo->tun_type, + s_rule->pdata.lkup_tx_rx.hdr, + pkt_offsets); + if (status) + goto err_ice_add_adv_rule; + } + + status = ice_aq_sw_rules(hw, (struct ice_aqc_sw_rules *)s_rule, + rule_buf_sz, 1, ice_aqc_opc_add_sw_rules, + NULL); + if (status) + goto err_ice_add_adv_rule; + adv_fltr = devm_kzalloc(ice_hw_to_dev(hw), + sizeof(struct ice_adv_fltr_mgmt_list_entry), + GFP_KERNEL); + if (!adv_fltr) { + status = ICE_ERR_NO_MEMORY; + goto err_ice_add_adv_rule; + } + + adv_fltr->lkups = devm_kmemdup(ice_hw_to_dev(hw), lkups, + lkups_cnt * sizeof(*lkups), GFP_KERNEL); + if (!adv_fltr->lkups) { + status = ICE_ERR_NO_MEMORY; + goto 
err_ice_add_adv_rule; + } + + adv_fltr->lkups_cnt = lkups_cnt; + adv_fltr->rule_info = *rinfo; + adv_fltr->rule_info.fltr_rule_id = + le16_to_cpu(s_rule->pdata.lkup_tx_rx.index); + sw = hw->switch_info; + sw->recp_list[rid].adv_rule = true; + rule_head = &sw->recp_list[rid].filt_rules; + + if (rinfo->sw_act.fltr_act == ICE_FWD_TO_VSI) + adv_fltr->vsi_count = 1; + + /* Add rule entry to book keeping list */ + list_add(&adv_fltr->list_entry, rule_head); + if (added_entry) { + added_entry->rid = rid; + added_entry->rule_id = adv_fltr->rule_info.fltr_rule_id; + added_entry->vsi_handle = rinfo->sw_act.vsi_handle; } +err_ice_add_adv_rule: + if (status && adv_fltr) { + devm_kfree(ice_hw_to_dev(hw), adv_fltr->lkups); + devm_kfree(ice_hw_to_dev(hw), adv_fltr); + } + + devm_kfree(ice_hw_to_dev(hw), s_rule); return status; } /** - * ice_set_vsi_promisc - set given VSI to given promiscuous mode(s) + * ice_adv_rem_update_vsi_list * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to configure - * @promisc_mask: mask of promiscuous config bits - * @vid: VLAN ID to set VLAN promiscuous + * @vsi_handle: VSI handle of the VSI to remove + * @fm_list: filter management entry for which the VSI list management needs to + * be done */ -enum ice_status -ice_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, u16 vid) +static enum ice_status +ice_adv_rem_update_vsi_list(struct ice_hw *hw, u16 vsi_handle, + struct ice_adv_fltr_mgmt_list_entry *fm_list) { - enum { UCAST_FLTR = 1, MCAST_FLTR, BCAST_FLTR }; - struct ice_fltr_list_entry f_list_entry; - struct ice_fltr_info new_fltr; - enum ice_status status = 0; - bool is_tx_fltr; - u16 hw_vsi_id; - int pkt_type; - u8 recipe_id; + struct ice_vsi_list_map_info *vsi_list_info; + enum ice_sw_lkup_type lkup_type; + enum ice_status status; + u16 vsi_list_id; - if (!ice_is_vsi_valid(hw, vsi_handle)) + if (fm_list->rule_info.sw_act.fltr_act != ICE_FWD_TO_VSI_LIST || + fm_list->vsi_count == 0) return ICE_ERR_PARAM; - hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); - - memset(&new_fltr, 0, sizeof(new_fltr)); - - if (promisc_mask & (ICE_PROMISC_VLAN_RX | ICE_PROMISC_VLAN_TX)) { - new_fltr.lkup_type = ICE_SW_LKUP_PROMISC_VLAN; - new_fltr.l_data.mac_vlan.vlan_id = vid; - recipe_id = ICE_SW_LKUP_PROMISC_VLAN; - } else { - new_fltr.lkup_type = ICE_SW_LKUP_PROMISC; - recipe_id = ICE_SW_LKUP_PROMISC; - } - - /* Separate filters must be set for each direction/packet type - * combination, so we will loop over the mask value, store the - * individual type, and clear it out in the input mask as it - * is found. 
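Putting the pieces together, a caller builds an array of ice_adv_lkup_elem entries plus an ice_adv_rule_info and lets ice_add_adv_rule() pick the dummy packet, build or reuse a recipe, and program the rule. The fragment below is a minimal, hedged sketch of one outer destination-MAC match forwarded to a VSI: it assumes a valid hw pointer, a vsi_handle and a dst_mac array from the surrounding driver context, and the eth_hdr member names are assumed from ice_protocol_type.h rather than shown in this patch.

	struct ice_rule_query_data added = { 0 };
	struct ice_adv_rule_info rinfo = { 0 };
	struct ice_adv_lkup_elem lkups[1];
	enum ice_status status;

	memset(lkups, 0, sizeof(lkups));
	lkups[0].type = ICE_MAC_OFOS;
	ether_addr_copy(lkups[0].h_u.eth_hdr.dst_addr, dst_mac);
	eth_broadcast_addr(lkups[0].m_u.eth_hdr.dst_addr);	/* match all 48 bits */

	rinfo.tun_type = ICE_NON_TUN;
	rinfo.rx = true;			/* program a LOOKUP_RX rule */
	rinfo.sw_act.flag = ICE_FLTR_RX;
	rinfo.sw_act.fltr_act = ICE_FWD_TO_VSI;
	rinfo.sw_act.vsi_handle = vsi_handle;

	status = ice_add_adv_rule(hw, lkups, ARRAY_SIZE(lkups), &rinfo, &added);
	if (!status)
		/* added.rid/rule_id/vsi_handle identify the rule later on */
		dev_dbg(ice_hw_to_dev(hw), "rule %u added on recipe %u\n",
			added.rule_id, added.rid);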
- */ - while (promisc_mask) { - u8 *mac_addr; - pkt_type = 0; - is_tx_fltr = false; - - if (promisc_mask & ICE_PROMISC_UCAST_RX) { - promisc_mask &= ~ICE_PROMISC_UCAST_RX; - pkt_type = UCAST_FLTR; - } else if (promisc_mask & ICE_PROMISC_UCAST_TX) { - promisc_mask &= ~ICE_PROMISC_UCAST_TX; - pkt_type = UCAST_FLTR; - is_tx_fltr = true; - } else if (promisc_mask & ICE_PROMISC_MCAST_RX) { - promisc_mask &= ~ICE_PROMISC_MCAST_RX; - pkt_type = MCAST_FLTR; - } else if (promisc_mask & ICE_PROMISC_MCAST_TX) { - promisc_mask &= ~ICE_PROMISC_MCAST_TX; - pkt_type = MCAST_FLTR; - is_tx_fltr = true; - } else if (promisc_mask & ICE_PROMISC_BCAST_RX) { - promisc_mask &= ~ICE_PROMISC_BCAST_RX; - pkt_type = BCAST_FLTR; - } else if (promisc_mask & ICE_PROMISC_BCAST_TX) { - promisc_mask &= ~ICE_PROMISC_BCAST_TX; - pkt_type = BCAST_FLTR; - is_tx_fltr = true; - } + /* A rule with the VSI being removed does not exist */ + if (!test_bit(vsi_handle, fm_list->vsi_list_info->vsi_map)) + return ICE_ERR_DOES_NOT_EXIST; - /* Check for VLAN promiscuous flag */ - if (promisc_mask & ICE_PROMISC_VLAN_RX) { - promisc_mask &= ~ICE_PROMISC_VLAN_RX; - } else if (promisc_mask & ICE_PROMISC_VLAN_TX) { - promisc_mask &= ~ICE_PROMISC_VLAN_TX; - is_tx_fltr = true; - } + lkup_type = ICE_SW_LKUP_LAST; + vsi_list_id = fm_list->rule_info.sw_act.fwd_id.vsi_list_id; + status = ice_update_vsi_list_rule(hw, &vsi_handle, 1, vsi_list_id, true, + ice_aqc_opc_update_sw_rules, + lkup_type); + if (status) + return status; - /* Set filter DA based on packet type */ - mac_addr = new_fltr.l_data.mac.mac_addr; - if (pkt_type == BCAST_FLTR) { - eth_broadcast_addr(mac_addr); - } else if (pkt_type == MCAST_FLTR || - pkt_type == UCAST_FLTR) { - /* Use the dummy ether header DA */ - ether_addr_copy(mac_addr, dummy_eth_header); - if (pkt_type == MCAST_FLTR) - mac_addr[0] |= 0x1; /* Set multicast bit */ - } + fm_list->vsi_count--; + clear_bit(vsi_handle, fm_list->vsi_list_info->vsi_map); + vsi_list_info = fm_list->vsi_list_info; + if (fm_list->vsi_count == 1) { + struct ice_fltr_info tmp_fltr; + u16 rem_vsi_handle; - /* Need to reset this to zero for all iterations */ - new_fltr.flag = 0; - if (is_tx_fltr) { - new_fltr.flag |= ICE_FLTR_TX; - new_fltr.src = hw_vsi_id; - } else { - new_fltr.flag |= ICE_FLTR_RX; - new_fltr.src = hw->port_info->lport; + rem_vsi_handle = find_first_bit(vsi_list_info->vsi_map, + ICE_MAX_VSI); + if (!ice_is_vsi_valid(hw, rem_vsi_handle)) + return ICE_ERR_OUT_OF_RANGE; + + /* Make sure VSI list is empty before removing it below */ + status = ice_update_vsi_list_rule(hw, &rem_vsi_handle, 1, + vsi_list_id, true, + ice_aqc_opc_update_sw_rules, + lkup_type); + if (status) + return status; + + memset(&tmp_fltr, 0, sizeof(tmp_fltr)); + tmp_fltr.flag = fm_list->rule_info.sw_act.flag; + tmp_fltr.fltr_rule_id = fm_list->rule_info.fltr_rule_id; + fm_list->rule_info.sw_act.fltr_act = ICE_FWD_TO_VSI; + tmp_fltr.fltr_act = ICE_FWD_TO_VSI; + tmp_fltr.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, rem_vsi_handle); + fm_list->rule_info.sw_act.fwd_id.hw_vsi_id = + ice_get_hw_vsi_num(hw, rem_vsi_handle); + fm_list->rule_info.sw_act.vsi_handle = rem_vsi_handle; + + /* Update the previous switch rule of "MAC forward to VSI" to + * "MAC fwd to VSI list" + */ + status = ice_update_pkt_fwd_rule(hw, &tmp_fltr); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to update pkt fwd rule to FWD_TO_VSI on HW VSI %d, error %d\n", + tmp_fltr.fwd_id.hw_vsi_id, status); + return status; } + fm_list->vsi_list_info->ref_cnt--; - new_fltr.fltr_act = ICE_FWD_TO_VSI; - 
new_fltr.vsi_handle = vsi_handle; - new_fltr.fwd_id.hw_vsi_id = hw_vsi_id; - f_list_entry.fltr_info = new_fltr; + /* Remove the VSI list since it is no longer used */ + status = ice_remove_vsi_list_rule(hw, vsi_list_id, lkup_type); + if (status) { + ice_debug(hw, ICE_DBG_SW, "Failed to remove VSI list %d, error %d\n", + vsi_list_id, status); + return status; + } - status = ice_add_rule_internal(hw, recipe_id, &f_list_entry); - if (status) - goto set_promisc_exit; + list_del(&vsi_list_info->list_entry); + devm_kfree(ice_hw_to_dev(hw), vsi_list_info); + fm_list->vsi_list_info = NULL; } -set_promisc_exit: return status; } /** - * ice_set_vlan_vsi_promisc + * ice_rem_adv_rule - removes existing advanced switch rule * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to configure - * @promisc_mask: mask of promiscuous config bits - * @rm_vlan_promisc: Clear VLANs VSI promisc mode + * @lkups: information on the words that needs to be looked up. All words + * together makes one recipe + * @lkups_cnt: num of entries in the lkups array + * @rinfo: Its the pointer to the rule information for the rule * - * Configure VSI with all associated VLANs to given promiscuous mode(s) + * This function can be used to remove 1 rule at a time. The lkups is + * used to describe all the words that forms the "lookup" portion of the + * rule. These words can span multiple protocols. Callers to this function + * need to pass in a list of protocol headers with lookup information along + * and mask that determines which words are valid from the given protocol + * header. rinfo describes other information related to this rule such as + * forwarding IDs, priority of this rule, etc. */ enum ice_status -ice_set_vlan_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, - bool rm_vlan_promisc) +ice_rem_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_list_entry *list_itr, *tmp; - struct list_head vsi_list_head; - struct list_head *vlan_head; - struct mutex *vlan_lock; /* Lock to protect filter rule list */ - enum ice_status status; - u16 vlan_id; + struct ice_adv_fltr_mgmt_list_entry *list_elem; + struct ice_prot_lkup_ext lkup_exts; + struct mutex *rule_lock; /* Lock to protect filter rule list */ + enum ice_status status = 0; + bool remove_rule = false; + u16 i, rid, vsi_handle; - INIT_LIST_HEAD(&vsi_list_head); - vlan_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock; - vlan_head = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rules; - mutex_lock(vlan_lock); - status = ice_add_to_vsi_fltr_list(hw, vsi_handle, vlan_head, - &vsi_list_head); - mutex_unlock(vlan_lock); + memset(&lkup_exts, 0, sizeof(lkup_exts)); + for (i = 0; i < lkups_cnt; i++) { + u16 count; + + if (lkups[i].type >= ICE_PROTOCOL_LAST) + return ICE_ERR_CFG; + + count = ice_fill_valid_words(&lkups[i], &lkup_exts); + if (!count) + return ICE_ERR_CFG; + } + + /* Create any special protocol/offset pairs, such as looking at tunnel + * bits by extracting metadata + */ + status = ice_add_special_words(rinfo, &lkup_exts); if (status) - goto free_fltr_list; + return status; - list_for_each_entry(list_itr, &vsi_list_head, list_entry) { - vlan_id = list_itr->fltr_info.l_data.vlan.vlan_id; - if (rm_vlan_promisc) - status = ice_clear_vsi_promisc(hw, vsi_handle, - promisc_mask, vlan_id); - else - status = ice_set_vsi_promisc(hw, vsi_handle, - promisc_mask, vlan_id); - if (status) - break; + rid = ice_find_recp(hw, 
&lkup_exts); + /* If did not find a recipe that match the existing criteria */ + if (rid == ICE_MAX_NUM_RECIPES) + return ICE_ERR_PARAM; + + rule_lock = &hw->switch_info->recp_list[rid].filt_rule_lock; + list_elem = ice_find_adv_rule_entry(hw, lkups, lkups_cnt, rid, rinfo); + /* the rule is already removed */ + if (!list_elem) + return 0; + mutex_lock(rule_lock); + if (list_elem->rule_info.sw_act.fltr_act != ICE_FWD_TO_VSI_LIST) { + remove_rule = true; + } else if (list_elem->vsi_count > 1) { + remove_rule = false; + vsi_handle = rinfo->sw_act.vsi_handle; + status = ice_adv_rem_update_vsi_list(hw, vsi_handle, list_elem); + } else { + vsi_handle = rinfo->sw_act.vsi_handle; + status = ice_adv_rem_update_vsi_list(hw, vsi_handle, list_elem); + if (status) { + mutex_unlock(rule_lock); + return status; + } + if (list_elem->vsi_count == 0) + remove_rule = true; } + mutex_unlock(rule_lock); + if (remove_rule) { + struct ice_aqc_sw_rules_elem *s_rule; + u16 rule_buf_sz; -free_fltr_list: - list_for_each_entry_safe(list_itr, tmp, &vsi_list_head, list_entry) { - list_del(&list_itr->list_entry); - devm_kfree(ice_hw_to_dev(hw), list_itr); + rule_buf_sz = ICE_SW_RULE_RX_TX_NO_HDR_SIZE; + s_rule = devm_kzalloc(ice_hw_to_dev(hw), rule_buf_sz, + GFP_KERNEL); + if (!s_rule) + return ICE_ERR_NO_MEMORY; + s_rule->pdata.lkup_tx_rx.act = 0; + s_rule->pdata.lkup_tx_rx.index = + cpu_to_le16(list_elem->rule_info.fltr_rule_id); + s_rule->pdata.lkup_tx_rx.hdr_len = 0; + status = ice_aq_sw_rules(hw, (struct ice_aqc_sw_rules *)s_rule, + rule_buf_sz, 1, + ice_aqc_opc_remove_sw_rules, NULL); + if (!status || status == ICE_ERR_DOES_NOT_EXIST) { + struct ice_switch_info *sw = hw->switch_info; + + mutex_lock(rule_lock); + list_del(&list_elem->list_entry); + devm_kfree(ice_hw_to_dev(hw), list_elem->lkups); + devm_kfree(ice_hw_to_dev(hw), list_elem); + mutex_unlock(rule_lock); + if (list_empty(&sw->recp_list[rid].filt_rules)) + sw->recp_list[rid].adv_rule = false; + } + devm_kfree(ice_hw_to_dev(hw), s_rule); } return status; } /** - * ice_remove_vsi_lkup_fltr - Remove lookup type filters for a VSI + * ice_rem_adv_rule_by_id - removes existing advanced switch rule by ID * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to remove filters from - * @lkup: switch rule filter lookup type + * @remove_entry: data struct which holds rule_id, VSI handle and recipe ID + * + * This function is used to remove 1 rule at a time. The removal is based on + * the remove_entry parameter. 
This function will remove rule for a given + * vsi_handle with a given rule_id which is passed as parameter in remove_entry */ -static void -ice_remove_vsi_lkup_fltr(struct ice_hw *hw, u16 vsi_handle, - enum ice_sw_lkup_type lkup) +enum ice_status +ice_rem_adv_rule_by_id(struct ice_hw *hw, + struct ice_rule_query_data *remove_entry) { - struct ice_switch_info *sw = hw->switch_info; - struct ice_fltr_list_entry *fm_entry; - struct list_head remove_list_head; - struct list_head *rule_head; - struct ice_fltr_list_entry *tmp; - struct mutex *rule_lock; /* Lock to protect filter rule list */ - enum ice_status status; - - INIT_LIST_HEAD(&remove_list_head); - rule_lock = &sw->recp_list[lkup].filt_rule_lock; - rule_head = &sw->recp_list[lkup].filt_rules; - mutex_lock(rule_lock); - status = ice_add_to_vsi_fltr_list(hw, vsi_handle, rule_head, - &remove_list_head); - mutex_unlock(rule_lock); - if (status) - goto free_fltr_list; - - switch (lkup) { - case ICE_SW_LKUP_MAC: - ice_remove_mac(hw, &remove_list_head); - break; - case ICE_SW_LKUP_VLAN: - ice_remove_vlan(hw, &remove_list_head); - break; - case ICE_SW_LKUP_PROMISC: - case ICE_SW_LKUP_PROMISC_VLAN: - ice_remove_promisc(hw, lkup, &remove_list_head); - break; - case ICE_SW_LKUP_MAC_VLAN: - case ICE_SW_LKUP_ETHERTYPE: - case ICE_SW_LKUP_ETHERTYPE_MAC: - case ICE_SW_LKUP_DFLT: - case ICE_SW_LKUP_LAST: - default: - ice_debug(hw, ICE_DBG_SW, "Unsupported lookup type %d\n", lkup); - break; - } + struct ice_adv_fltr_mgmt_list_entry *list_itr; + struct list_head *list_head; + struct ice_adv_rule_info rinfo; + struct ice_switch_info *sw; -free_fltr_list: - list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) { - list_del(&fm_entry->list_entry); - devm_kfree(ice_hw_to_dev(hw), fm_entry); + sw = hw->switch_info; + if (!sw->recp_list[remove_entry->rid].recp_created) + return ICE_ERR_PARAM; + list_head = &sw->recp_list[remove_entry->rid].filt_rules; + list_for_each_entry(list_itr, list_head, list_entry) { + if (list_itr->rule_info.fltr_rule_id == + remove_entry->rule_id) { + rinfo = list_itr->rule_info; + rinfo.sw_act.vsi_handle = remove_entry->vsi_handle; + return ice_rem_adv_rule(hw, list_itr->lkups, + list_itr->lkups_cnt, &rinfo); + } } + /* either list is empty or unable to find rule */ + return ICE_ERR_DOES_NOT_EXIST; } /** - * ice_remove_vsi_fltr - Remove all filters for a VSI + * ice_rem_adv_rule_for_vsi - removes existing advanced switch rules for a + * given VSI handle * @hw: pointer to the hardware structure - * @vsi_handle: VSI handle to remove filters from + * @vsi_handle: VSI handle for which we are supposed to remove all the rules. 
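If the caller kept the ice_rule_query_data filled in by ice_add_adv_rule(), tearing the rule back down does not require rebuilding the lookup array: ice_rem_adv_rule_by_id() resolves the stored recipe ID and rule ID back to the bookkeeping entry and reuses its lookups. A hedged sketch, assuming "added" and "hw" come from the earlier add example:

	/* 'added' holds rid, rule_id and vsi_handle from the earlier add */
	status = ice_rem_adv_rule_by_id(hw, &added);
	if (status == ICE_ERR_DOES_NOT_EXIST)
		/* rule already gone; callers wanting idempotent teardown
		 * can treat this as success
		 */
		status = 0;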
+ * + * This function is used to remove all the rules for a given VSI and as soon + * as removing a rule fails, it will return immediately with the error code, + * else it will return ICE_SUCCESS */ -void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle) +enum ice_status ice_rem_adv_rule_for_vsi(struct ice_hw *hw, u16 vsi_handle) { - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_MAC); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_MAC_VLAN); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_PROMISC); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_VLAN); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_DFLT); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_ETHERTYPE); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_ETHERTYPE_MAC); - ice_remove_vsi_lkup_fltr(hw, vsi_handle, ICE_SW_LKUP_PROMISC_VLAN); + struct ice_adv_fltr_mgmt_list_entry *list_itr, *tmp_entry; + struct ice_vsi_list_map_info *map_info; + struct list_head *list_head; + struct ice_adv_rule_info rinfo; + struct ice_switch_info *sw; + enum ice_status status; + u8 rid; + + sw = hw->switch_info; + for (rid = 0; rid < ICE_MAX_NUM_RECIPES; rid++) { + if (!sw->recp_list[rid].recp_created) + continue; + if (!sw->recp_list[rid].adv_rule) + continue; + + list_head = &sw->recp_list[rid].filt_rules; + list_for_each_entry_safe(list_itr, tmp_entry, list_head, + list_entry) { + rinfo = list_itr->rule_info; + + if (rinfo.sw_act.fltr_act == ICE_FWD_TO_VSI_LIST) { + map_info = list_itr->vsi_list_info; + if (!map_info) + continue; + + if (!test_bit(vsi_handle, map_info->vsi_map)) + continue; + } else if (rinfo.sw_act.vsi_handle != vsi_handle) { + continue; + } + + rinfo.sw_act.vsi_handle = vsi_handle; + status = ice_rem_adv_rule(hw, list_itr->lkups, + list_itr->lkups_cnt, &rinfo); + + if (status) + return status; + } + } + return 0; } + /** * ice_replay_vsi_fltr - Replay filters for requested VSI * @hw: pointer to the hardware structure + * @pi: pointer to port information structure + * @sw: pointer to switch info struct for which function replays filters * @vsi_handle: driver VSI handle * @recp_id: Recipe ID for which rules need to be replayed * @list_head: list for which filters need to be replayed @@ -2723,15 +8095,18 @@ void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle) * It is required to pass valid VSI handle. 
*/ static enum ice_status -ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, +ice_replay_vsi_fltr(struct ice_hw *hw, struct ice_port_info *pi, + struct ice_switch_info *sw, u16 vsi_handle, u8 recp_id, struct list_head *list_head) { struct ice_fltr_mgmt_list_entry *itr; enum ice_status status = 0; + struct ice_sw_recipe *recp_list; u16 hw_vsi_id; if (list_empty(list_head)) return status; + recp_list = &sw->recp_list[recp_id]; hw_vsi_id = ice_get_hw_vsi_num(hw, vsi_handle); list_for_each_entry(itr, list_head, list_entry) { @@ -2743,7 +8118,9 @@ ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, /* update the src in case it is VSI num */ if (f_entry.fltr_info.src_id == ICE_SRC_ID_VSI) f_entry.fltr_info.src = hw_vsi_id; - status = ice_add_rule_internal(hw, recp_id, &f_entry); + status = ice_add_rule_internal(hw, recp_list, + pi->lport, + &f_entry); if (status) goto end; continue; @@ -2759,9 +8136,11 @@ ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, if (f_entry.fltr_info.src_id == ICE_SRC_ID_VSI) f_entry.fltr_info.src = hw_vsi_id; if (recp_id == ICE_SW_LKUP_VLAN) - status = ice_add_vlan_internal(hw, &f_entry); + status = ice_add_vlan_internal(hw, recp_list, &f_entry); else - status = ice_add_rule_internal(hw, recp_id, &f_entry); + status = ice_add_rule_internal(hw, recp_list, + pi->lport, + &f_entry); if (status) goto end; } @@ -2769,50 +8148,106 @@ ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, return status; } +/** + * ice_replay_vsi_adv_rule - Replay advanced rule for requested VSI + * @hw: pointer to the hardware structure + * @vsi_handle: driver VSI handle + * @list_head: list for which filters need to be replayed + * + * Replay the advanced rule for the given VSI. + */ +static enum ice_status +ice_replay_vsi_adv_rule(struct ice_hw *hw, u16 vsi_handle, + struct list_head *list_head) +{ + struct ice_rule_query_data added_entry = { 0 }; + struct ice_adv_fltr_mgmt_list_entry *adv_fltr; + enum ice_status status = 0; + + if (list_empty(list_head)) + return status; + list_for_each_entry(adv_fltr, list_head, list_entry) { + struct ice_adv_rule_info *rinfo = &adv_fltr->rule_info; + u16 lk_cnt = adv_fltr->lkups_cnt; + + if (vsi_handle != rinfo->sw_act.vsi_handle) + continue; + status = ice_add_adv_rule(hw, adv_fltr->lkups, lk_cnt, rinfo, + &added_entry); + if (status) + break; + } + return status; +} + /** * ice_replay_vsi_all_fltr - replay all filters stored in bookkeeping lists * @hw: pointer to the hardware structure + * @pi: pointer to port information structure * @vsi_handle: driver VSI handle * * Replays filters for requested VSI via vsi_handle. 
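With the reworked signature shown below, ice_replay_vsi_all_fltr() now takes the port_info pointer so legacy rules are re-added against the correct port, while recipes flagged as advanced are replayed through ice_replay_vsi_adv_rule(). A hedged sketch of the post-reset call, assuming the usual pf/hw context and a driver VSI handle; in practice this is driven from the core reset/rebuild path rather than open-coded like this.

	/* replay both legacy and advanced filters for this VSI after reset */
	status = ice_replay_vsi_all_fltr(&pf->hw, pf->hw.port_info, vsi_handle);
	if (status)
		dev_err(ice_pf_to_dev(pf),
			"filter replay failed for VSI %u, err %d\n",
			vsi_handle, status);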
*/ -enum ice_status ice_replay_vsi_all_fltr(struct ice_hw *hw, u16 vsi_handle) +enum ice_status +ice_replay_vsi_all_fltr(struct ice_hw *hw, struct ice_port_info *pi, + u16 vsi_handle) { struct ice_switch_info *sw = hw->switch_info; - enum ice_status status = 0; + enum ice_status status; u8 i; - for (i = 0; i < ICE_SW_LKUP_LAST; i++) { + /* Update the recipes that were created */ + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { struct list_head *head; head = &sw->recp_list[i].filt_replay_rules; - status = ice_replay_vsi_fltr(hw, vsi_handle, i, head); + if (!sw->recp_list[i].adv_rule) + status = ice_replay_vsi_fltr(hw, pi, sw, vsi_handle, i, + head); + else + status = ice_replay_vsi_adv_rule(hw, vsi_handle, head); if (status) return status; } - return status; + + return 0; } /** - * ice_rm_all_sw_replay_rule_info - deletes filter replay rules + * ice_rm_sw_replay_rule_info - helper function to delete filter replay rules * @hw: pointer to the HW struct + * @sw: pointer to switch info struct for which function removes filters * - * Deletes the filter replay rules. + * Deletes the filter replay rules for given switch */ -void ice_rm_all_sw_replay_rule_info(struct ice_hw *hw) +void ice_rm_sw_replay_rule_info(struct ice_hw *hw, struct ice_switch_info *sw) { - struct ice_switch_info *sw = hw->switch_info; u8 i; if (!sw) return; - for (i = 0; i < ICE_SW_LKUP_LAST; i++) { + for (i = 0; i < ICE_MAX_NUM_RECIPES; i++) { if (!list_empty(&sw->recp_list[i].filt_replay_rules)) { struct list_head *l_head; l_head = &sw->recp_list[i].filt_replay_rules; - ice_rem_sw_rule_info(hw, l_head); + if (!sw->recp_list[i].adv_rule) + ice_rem_sw_rule_info(hw, l_head); + else + ice_rem_adv_rule_info(hw, l_head); } } } + +/** + * ice_rm_all_sw_replay_rule_info - deletes filter replay rules + * @hw: pointer to the HW struct + * + * Deletes the filter replay rules. + */ +void ice_rm_all_sw_replay_rule_info(struct ice_hw *hw) +{ + ice_rm_sw_replay_rule_info(hw, hw->switch_info); +} + diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h index cb123fbe30bea501e2ee20df6d95e72dd11c3486..adc07b10c1dc6317abae38f734046ded402afaea 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.h +++ b/drivers/net/ethernet/intel/ice/ice_switch.h @@ -1,23 +1,42 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_SWITCH_H_ #define _ICE_SWITCH_H_ #include "ice_common.h" +#include "ice_protocol_type.h" #define ICE_SW_CFG_MAX_BUF_LEN 2048 +#define ICE_MAX_SW 256 #define ICE_DFLT_VSI_INVAL 0xff #define ICE_FLTR_RX BIT(0) #define ICE_FLTR_TX BIT(1) #define ICE_FLTR_TX_RX (ICE_FLTR_RX | ICE_FLTR_TX) -#define ICE_VSI_INVAL_ID 0xffff -#define ICE_INVAL_Q_HANDLE 0xFFFF -/* VSI queue context structure */ -struct ice_q_ctx { - u16 q_handle; -}; + +#define DUMMY_ETH_HDR_LEN 16 +#define ICE_SW_RULE_RX_TX_ETH_HDR_SIZE \ + (offsetof(struct ice_aqc_sw_rules_elem, pdata.lkup_tx_rx.hdr) + \ + (DUMMY_ETH_HDR_LEN * \ + sizeof(((struct ice_sw_rule_lkup_rx_tx *)0)->hdr[0]))) +#define ICE_SW_RULE_RX_TX_NO_HDR_SIZE \ + (offsetof(struct ice_aqc_sw_rules_elem, pdata.lkup_tx_rx.hdr)) +#define ICE_SW_RULE_LG_ACT_SIZE(n) \ + (offsetof(struct ice_aqc_sw_rules_elem, pdata.lg_act.act) + \ + ((n) * sizeof(((struct ice_sw_rule_lg_act *)0)->act[0]))) +#define ICE_SW_RULE_VSI_LIST_SIZE(n) \ + (offsetof(struct ice_aqc_sw_rules_elem, pdata.vsi_list.vsi) + \ + ((n) * sizeof(((struct ice_sw_rule_vsi_list *)0)->vsi[0]))) + + +/* Worst case buffer length for ice_aqc_opc_get_res_alloc */ +#define ICE_MAX_RES_TYPES 0x80 +#define ICE_AQ_GET_RES_ALLOC_BUF_LEN \ + (ICE_MAX_RES_TYPES * sizeof(struct ice_aqc_get_res_resp_elem)) + +#define ICE_VSI_INVAL_ID 0xFFFF +#define ICE_INVAL_Q_HANDLE 0xFFFF /* VSI context structure for add/get/update/free operations */ struct ice_vsi_ctx { @@ -31,15 +50,22 @@ struct ice_vsi_ctx { u8 vf_num; u16 num_lan_q_entries[ICE_MAX_TRAFFIC_CLASS]; struct ice_q_ctx *lan_q_ctx[ICE_MAX_TRAFFIC_CLASS]; + u16 num_rdma_q_entries[ICE_MAX_TRAFFIC_CLASS]; + struct ice_q_ctx *rdma_q_ctx[ICE_MAX_TRAFFIC_CLASS]; }; -enum ice_sw_fwd_act_type { - ICE_FWD_TO_VSI = 0, - ICE_FWD_TO_VSI_LIST, /* Do not use this when adding filter */ - ICE_FWD_TO_Q, - ICE_FWD_TO_QGRP, - ICE_DROP_PACKET, - ICE_INVAL_ACT +/* This is to be used by add/update mirror rule Admin Queue command */ +struct ice_mir_rule_buf { + u16 vsi_idx; /* VSI index */ + + /* For each VSI, user can specify whether corresponding VSI + * should be added/removed to/from mirror rule + * + * add mirror rule: this should always be TRUE. + * update mirror rule: add(true) or remove(false) VSI to/from + * mirror rule + */ + u8 add; }; /* Switch recipe ID enum values are specific to hardware */ @@ -86,6 +112,8 @@ struct ice_fltr_info { } mac_vlan; struct { u16 vlan_id; + u16 tpid; + u8 tpid_valid; } vlan; /* Set lkup_type as ICE_SW_LKUP_ETHERTYPE * if just using ethertype as filter. Set lkup_type as @@ -125,30 +153,123 @@ struct ice_fltr_info { u8 lan_en; /* Indicate if packet can be forwarded to the uplink */ }; +struct ice_update_recipe_lkup_idx_params { + u16 rid; + u16 fv_idx; + bool ignore_valid; + u16 mask; + bool mask_valid; + u8 lkup_idx; +}; + +struct ice_adv_lkup_elem { + enum ice_protocol_type type; + union ice_prot_hdr h_u; /* Header values */ + union ice_prot_hdr m_u; /* Mask of header values to match */ +}; + + +struct ice_sw_act_ctrl { + /* Source VSI for LOOKUP_TX or source port for LOOKUP_RX */ + u16 src; + u16 flag; + enum ice_sw_fwd_act_type fltr_act; + /* Depending on filter action */ + union { + /* This is a queue ID in case of ICE_FWD_TO_Q and starting + * queue ID in case of ICE_FWD_TO_QGRP. 
+ */ + u16 q_id:11; + u16 vsi_id:10; + u16 hw_vsi_id:10; + u16 vsi_list_id:10; + } fwd_id; + /* software VSI handle */ + u16 vsi_handle; + u8 qgrp_size; +}; + +struct ice_rule_query_data { + /* Recipe ID for which the requested rule was added */ + u16 rid; + /* Rule ID that was added or is supposed to be removed */ + u16 rule_id; + /* vsi_handle for which Rule was added or is supposed to be removed */ + u16 vsi_handle; +}; + +struct ice_adv_rule_info { + enum ice_sw_tunnel_type tun_type; + struct ice_sw_act_ctrl sw_act; + u32 priority; + u8 rx; /* true means LOOKUP_RX otherwise LOOKUP_TX */ + u16 fltr_rule_id; +}; + +/* A collection of one or more four word recipe */ struct ice_sw_recipe { - struct list_head l_entry; + /* For a chained recipe the root recipe is what should be used for + * programming rules + */ + u8 is_root; + u8 root_rid; + u8 recp_created; + + /* Number of extraction words */ + u8 n_ext_words; + /* Protocol ID and Offset pair (extraction word) to describe the + * recipe + */ + struct ice_fv_word ext_words[ICE_MAX_CHAIN_WORDS]; + u16 word_masks[ICE_MAX_CHAIN_WORDS]; + + /* if this recipe is a collection of other recipe */ + u8 big_recp; + + /* if this recipe is part of another bigger recipe then chain index + * corresponding to this recipe + */ + u8 chain_idx; - /* To protect modification of filt_rule list - * defined below + /* if this recipe is a collection of other recipe then count of other + * recipes and recipe IDs of those recipes */ - struct mutex filt_rule_lock; + u8 n_grp_count; + + /* Bit map specifying the IDs associated with this group of recipe */ + DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + + enum ice_sw_tunnel_type tun_type; - /* List of type ice_fltr_mgmt_list_entry */ + /* List of type ice_fltr_mgmt_list_entry or adv_rule */ + u8 adv_rule; struct list_head filt_rules; struct list_head filt_replay_rules; - /* linked list of type recipe_list_entry */ - struct list_head rg_list; - /* linked list of type ice_sw_fv_list_entry*/ + struct mutex filt_rule_lock; /* protect filter rule structure */ + + /* Profiles this recipe should be associated with */ struct list_head fv_list; - struct ice_aqc_recipe_data_elem *r_buf; - u8 recp_count; - u8 root_rid; - u8 num_profs; - u8 *prof_ids; - /* recipe bitmap: what all recipes makes this recipe */ - DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + /* Profiles this recipe is associated with */ + u8 num_profs, *prof_ids; + + /* Bit map for possible result indexes */ + DECLARE_BITMAP(res_idxs, ICE_MAX_FV_WORDS); + + /* This allows user to specify the recipe priority. + * For now, this becomes 'fwd_priority' when recipe + * is created, usually recipes can have 'fwd' and 'join' + * priority. + */ + u8 priority; + + struct list_head rg_list; + + /* AQ buffer associated with this recipe */ + struct ice_aqc_recipe_data_elem *root_buf; + /* This struct saves the fv_words for a given lookup */ + struct ice_prot_lkup_ext lkup_exts; }; /* Bookkeeping structure to hold bitmap of VSIs corresponding to VSI list ID */ @@ -166,6 +287,7 @@ struct ice_fltr_list_entry { struct ice_fltr_info fltr_info; }; + /* This defines an entry in the list that maintains MAC or VLAN membership * to HW list mapping, since multiple VSIs can subscribe to the same MAC or * VLAN. 
As an optimization the VSI list should be created only when a @@ -186,6 +308,16 @@ struct ice_fltr_mgmt_list_entry { u8 counter_index; }; +struct ice_adv_fltr_mgmt_list_entry { + struct list_head list_entry; + + struct ice_adv_lkup_elem *lkups; + struct ice_adv_rule_info rule_info; + u16 lkups_cnt; + struct ice_vsi_list_map_info *vsi_list_info; + u16 vsi_count; +}; + enum ice_promisc_flags { ICE_PROMISC_UCAST_RX = 0x1, ICE_PROMISC_UCAST_TX = 0x2, @@ -207,28 +339,91 @@ ice_free_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx, enum ice_status ice_update_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx, struct ice_sq_cd *cd); -bool ice_is_vsi_valid(struct ice_hw *hw, u16 vsi_handle); struct ice_vsi_ctx *ice_get_vsi_ctx(struct ice_hw *hw, u16 vsi_handle); void ice_clear_all_vsi_ctx(struct ice_hw *hw); +enum ice_status +ice_aq_get_vsi_params(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_add_update_mir_rule(struct ice_hw *hw, u16 rule_type, u16 dest_vsi, + u16 count, struct ice_mir_rule_buf *mr_buf, + struct ice_sq_cd *cd, u16 *rule_id); +enum ice_status +ice_aq_delete_mir_rule(struct ice_hw *hw, u16 rule_id, bool keep_allocd, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_get_storm_ctrl(struct ice_hw *hw, u32 *bcast_thresh, u32 *mcast_thresh, + u32 *ctl_bitmask); +enum ice_status +ice_aq_set_storm_ctrl(struct ice_hw *hw, u32 bcast_thresh, u32 mcast_thresh, + u32 ctl_bitmask); /* Switch config */ +enum ice_status +ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp_elem *buf, + u16 buf_size, u16 *req_desc, u16 *num_elems, + struct ice_sq_cd *cd); enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw); -/* Switch/bridge related commands */ +enum ice_status +ice_alloc_vlan_res_counter(struct ice_hw *hw, u16 *counter_id); +enum ice_status +ice_free_vlan_res_counter(struct ice_hw *hw, u16 counter_id); +enum ice_status +ice_alloc_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, + u16 *counter_id); +enum ice_status +ice_free_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, + u16 counter_id); + enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw); +enum ice_status ice_alloc_rss_global_lut(struct ice_hw *hw, bool shared_res, u16 *global_lut_id); +enum ice_status ice_free_rss_global_lut(struct ice_hw *hw, u16 global_lut_id); +enum ice_status +ice_alloc_sw(struct ice_hw *hw, bool ena_stats, bool shared_res, u16 *sw_id, + u16 *counter_id); +enum ice_status +ice_free_sw(struct ice_hw *hw, u16 sw_id, u16 counter_id); +enum ice_status +ice_aq_get_res_alloc(struct ice_hw *hw, u16 *num_entries, + struct ice_aqc_get_res_resp_elem *buf, u16 buf_size, + struct ice_sq_cd *cd); +enum ice_status +ice_aq_get_res_descs(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_res_elem *buf, u16 buf_size, u16 res_type, + bool res_shared, u16 *desc_id, struct ice_sq_cd *cd); +enum ice_status +ice_add_vlan(struct ice_hw *hw, struct list_head *m_list); +enum ice_status +ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list); +void ice_rem_all_sw_rules_info(struct ice_hw *hw); enum ice_status ice_add_mac(struct ice_hw *hw, struct list_head *m_lst); enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_lst); +bool ice_mac_fltr_exist(struct ice_hw *hw, u8 *mac, u16 vsi_handle); +bool ice_vlan_fltr_exist(struct ice_hw *hw, u16 vlan_id, u16 vsi_handle); enum ice_status ice_add_eth_mac(struct ice_hw *hw, struct list_head *em_list); enum ice_status 
ice_remove_eth_mac(struct ice_hw *hw, struct list_head *em_list); -void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle); +void ice_dump_sw_rules(struct ice_hw *hw, enum ice_sw_lkup_type lookup); enum ice_status -ice_add_vlan(struct ice_hw *hw, struct list_head *m_list); -enum ice_status ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list); +ice_cfg_iwarp_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable); +enum ice_status +ice_add_mac_vlan(struct ice_hw *hw, struct list_head *m_list); +enum ice_status +ice_remove_mac_vlan(struct ice_hw *hw, struct list_head *v_list); + +enum ice_status +ice_add_mac_with_sw_marker(struct ice_hw *hw, struct ice_fltr_info *f_info, + u16 sw_marker); +enum ice_status +ice_add_mac_with_counter(struct ice_hw *hw, struct ice_fltr_info *f_info); +void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_handle); + /* Promisc/defport setup for VSIs */ enum ice_status -ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_handle, bool set, u8 direction); +ice_cfg_dflt_vsi(struct ice_port_info *pi, u16 vsi_handle, bool set, + u8 direction); enum ice_status ice_set_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, u16 vid); @@ -239,11 +434,62 @@ enum ice_status ice_set_vlan_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 promisc_mask, bool rm_vlan_promisc); -enum ice_status ice_init_def_sw_recp(struct ice_hw *hw); +/* Get VSIs Promisc/defport settings */ +enum ice_status +ice_get_vsi_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid); +enum ice_status +ice_get_vsi_vlan_promisc(struct ice_hw *hw, u16 vsi_handle, u8 *promisc_mask, + u16 *vid); + +enum ice_status +ice_aq_add_recipe(struct ice_hw *hw, + struct ice_aqc_recipe_data_elem *s_recipe_list, + u16 num_recipes, struct ice_sq_cd *cd); + +enum ice_status +ice_aq_get_recipe(struct ice_hw *hw, + struct ice_aqc_recipe_data_elem *s_recipe_list, + u16 *num_recipes, u16 recipe_root, struct ice_sq_cd *cd); +enum ice_status +ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, + struct ice_sq_cd *cd); + +enum ice_status +ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, + struct ice_sq_cd *cd); + +enum ice_status ice_alloc_recipe(struct ice_hw *hw, u16 *recipe_id); +enum ice_status +ice_add_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo, + struct ice_rule_query_data *added_entry); +enum ice_status +ice_rem_adv_rule_for_vsi(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_rem_adv_rule_by_id(struct ice_hw *hw, + struct ice_rule_query_data *remove_entry); +enum ice_status +ice_rem_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, + u16 lkups_cnt, struct ice_adv_rule_info *rinfo); + +enum ice_status ice_dump_sw_cfg(struct ice_hw *hw); + +enum ice_status +ice_init_def_sw_recp(struct ice_hw *hw, struct ice_sw_recipe **recp_list); u16 ice_get_hw_vsi_num(struct ice_hw *hw, u16 vsi_handle); bool ice_is_vsi_valid(struct ice_hw *hw, u16 vsi_handle); -enum ice_status ice_replay_vsi_all_fltr(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_replay_vsi_all_fltr(struct ice_hw *hw, struct ice_port_info *pi, + u16 vsi_handle); +void ice_rm_sw_replay_rule_info(struct ice_hw *hw, struct ice_switch_info *sw); void ice_rm_all_sw_replay_rule_info(struct ice_hw *hw); - +enum ice_status +ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz, + u8 num_rules, enum ice_adminq_opc opc, struct ice_sq_cd *cd); +enum ice_status 
+ice_update_recipe_lkup_idx(struct ice_hw *hw, + struct ice_update_recipe_lkup_idx_params *params); +void ice_change_proto_id_to_dvm(void); #endif /* _ICE_SWITCH_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c new file mode 100644 index 0000000000000000000000000000000000000000..62e6546ceedafe1997d6339948d34949d16245d5 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -0,0 +1,2279 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice.h" +#include "ice_tc_lib.h" +#include "ice_lib.h" +#include "ice_fltr.h" + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_detect_filter_conflict - detect filter conflict across TC + * @pf: Pointer to PF structure + * @tc_fltr: Pointer to TC flower filter structure + * + * This function detects filter mismatch type but using same port_number + * across TC and allow/deny desired filter combination. Example is, + * filter 1, dest_ip + dest_port (80) -> action is forward to TC 1 + * filter 2: dest_ip + src_port (80) -> action is forward to TC 2 + * + * We do not want to support such config, to avoid situation where + * packets are getting duplicated across both the TCs if incoming Rx + * packet has same dest_ip + src_port (80) + dst_port (80). + * Due to both filter being same high prio filter in HW, both rule + * can match (whereas that is not expectation) and cause unexpected + * packet mirroring. + */ +static int +ice_detect_filter_conflict(struct ice_pf *pf, + struct ice_tc_flower_fltr *tc_fltr) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers; + struct device *dev = ice_pf_to_dev(pf); + struct ice_tc_flower_fltr *fltr; + struct ice_tc_l4_hdr *l4_key; + u16 sport = 0, dport = 0; + + /* header = outer header for non-tunnel filter, + * otherwise inner_headers + */ + headers = &tc_fltr->outer_headers; + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) + headers = &tc_fltr->inner_headers; + + l4_key = &headers->l4_key; + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) + sport = be16_to_cpu(l4_key->src_port); + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) + dport = be16_to_cpu(l4_key->dst_port); + + hlist_for_each_entry(fltr, &pf->tc_flower_fltr_list, tc_flower_node) { + struct ice_tc_flower_lyr_2_4_hdrs *fltr_headers; + struct ice_tc_l4_hdr *fltr_l4_key; + u16 dst_port = 0, src_port = 0; + + /* if tc_class is same, skip, no check needed */ + if (fltr->action.tc_class == tc_fltr->action.tc_class) + continue; + + /* if only either of them are set, skip it */ + if ((fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) ^ + (tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID)) + continue; + + /* if this is tunnel filter, make sure tunnel ID is not same */ + if ((fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID)) { + if (fltr->tenant_id && tc_fltr->tenant_id && + fltr->tenant_id == tc_fltr->tenant_id) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same tunnel key for other TC(see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same tunnel key (%u)\n", + fltr->action.tc_class, + be32_to_cpu(fltr->tenant_id)); + return -EOPNOTSUPP; + } + } + + fltr_headers = &fltr->outer_headers; + if (fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) + fltr_headers = &fltr->inner_headers; + + /* access L4 params */ + fltr_l4_key = &fltr_headers->l4_key; + if (fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) + dst_port = 
be16_to_cpu(fltr_l4_key->dst_port); + if (fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) + src_port = be16_to_cpu(fltr_l4_key->src_port); + + /* proceed only if tc_class is different and filter types + * are different but actual value(s) of say port number are + * same, flag warning to user. + * e.g if filter one is like dest port = 80 -> tc_class(1) + * and second filter is like, src_port = 80 -> tc_class(2) + * Invariably packet can match both the filter and user + * will get expected packet mirroring to both the destination + * (means tc_class(1) and tc_class(2)). To avoid such + * behavior, block user from adding such conficting filter + */ + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) { + if (dport && dst_port && dport == dst_port) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same destination port for other TC, as destination port based filter(see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same port number (%u) as destination port based filter. This is to avoid unexpected packet mirroring.\n", + fltr->action.tc_class, dst_port); + return -EOPNOTSUPP; + } + if (dport && src_port && dport == src_port) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same destination port for other TC, as source port based filter(see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same port number (%u) as source port based filter. This is to avoid unexpected packet mirroring.\n", + fltr->action.tc_class, src_port); + return -EOPNOTSUPP; + } + } + + if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) { + if (sport && dst_port && sport == dst_port) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same source port for other TC, as destination port based filter (see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same port number (%u) as destination port based filter. This is to avoid unexpected packet mirroring.\n", + fltr->action.tc_class, dst_port); + return -EOPNOTSUPP; + } + if (sport && src_port && sport == src_port) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported filter combination across TC, filter exist with same source port for other TC, as source port based filter (see dmesg log)"); + dev_err(dev, "Unsupported filter combination across TC, TC %d has filter using same port number (%u) as source port based filter. This is to avoid unexpected packet mirroring.\n", + fltr->action.tc_class, src_port); + return -EOPNOTSUPP; + } + } + } + + return 0; +} + +/** + * ice_chnl_fltr_type_chk - filter type check + * @pf: Pointer to PF + * @tc_fltr: Pointer to TC flower filter structure + * @final_fltr_type: Ptr to filter type (dest/src/dest+src port) + * + * This function is used to determine if given filter (based on input params) + * should be allowed or not. For a given channel (aka ADQ VSI), supported + * filter types are src port, dest port , src+dest port. SO this function + * checks if any filter exist for specified channel (if so, channel specific + * filter_type will be set), and see if it matches with the filter being added. 
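The conflict rule enforced above reduces to: once a port value steers traffic to one TC, the same value may not reappear in the opposite port role in a filter steering to a different TC. A standalone sketch of that check under simplified assumptions (two filters compared directly instead of walking the PF filter list):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified filter: which L4 ports it matches and the TC it steers to */
struct flt {
    int tc_class;
    uint16_t dst_port; /* 0 means "not matched on" */
    uint16_t src_port; /* 0 means "not matched on" */
};

/* Reject two filters that steer to different TCs but reuse the same
 * port number in any dst/src combination, as described above.
 */
static bool filters_conflict(const struct flt *a, const struct flt *b)
{
    if (a->tc_class == b->tc_class)
        return false; /* same destination TC: nothing to check */

    if (a->dst_port &&
        (a->dst_port == b->dst_port || a->dst_port == b->src_port))
        return true;
    if (a->src_port &&
        (a->src_port == b->dst_port || a->src_port == b->src_port))
        return true;
    return false;
}

int main(void)
{
    struct flt f1 = { .tc_class = 1, .dst_port = 80 };
    struct flt f2 = { .tc_class = 2, .src_port = 80 };

    printf("conflict: %s\n", filters_conflict(&f2, &f1) ? "yes" : "no");
    return 0;
}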
+ * It returns 0 (upon success) or POSIX error code + */ +static int +ice_chnl_fltr_type_chk(struct ice_pf *pf, struct ice_tc_flower_fltr *tc_fltr, + enum ice_channel_fltr_type *final_fltr_type) +{ + enum ice_channel_fltr_type fltr_type = *final_fltr_type; + struct device *dev = ice_pf_to_dev(pf); + + if (fltr_type == ICE_CHNL_FLTR_TYPE_INVALID) { + /* L4 based filter, more granular, hence should be checked + * beore L3 + */ + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT)) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_PORT; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_PORT; + /* L3 (IPv4) based filter check */ + else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4)) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV4; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV4; + /* L3 (IPv6) based filter check */ + else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6)) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV6; + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6) + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV6; + /* Tunnel filter check, inner criteria is open: + * any combination of inner L3 and/or L4 + */ + else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) + fltr_type = ICE_CHNL_FLTR_TYPE_TENANT_ID; + else + return -EOPNOTSUPP; + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_PORT) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC_PORT to SRC + DEST_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC_PORT to DEST_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_PORT; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_DEST_PORT) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST_PORT to SRC + DEST_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST_PORT to SRC_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_PORT; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT) { + /* must to have src/dest/src+dest port as part of filter + * criteria + */ + if ((!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT)) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT))) + return -EOPNOTSUPP; + + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST_PORT to DEST_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_PORT; + } 
else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) && + (!(tc_fltr->flags & + ICE_TC_FLWR_FIELD_DEST_L4_PORT))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST_PORT to SRC_PORT\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_PORT; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_TENANT_ID) { + /* Now only allow filters which has VNI */ + if (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID)) + return -EOPNOTSUPP; + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4) { + /* must to have src/dest/src+dest IPv4 addr as part of filter + * criteria + */ + if ((!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4)) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4))) + return -EOPNOTSUPP; + + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST IPv4 addr to DEST IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV4; + } else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST IPv4 to SRC IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV4; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_DEST_IPV4) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST IPv4 addr to SRC + DEST IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST IPv4 addr to SRC IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV4; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_IPV4) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV4)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC IPv4 addr to SRC + DEST IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV4; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV4) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC IPv4 addr to DEST IPv4 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV4; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6) { + /* must to have src/dest/src+dest IPv6 addr as part of filter + * criteria + */ + if ((!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6)) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6))) + return -EOPNOTSUPP; + + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST IPv6 addr to DEST IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV6; + } else if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6) && + (!(tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6))) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC+DEST IPv6 to SRC IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV6; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_DEST_IPV6) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) && + (tc_fltr->flags & 
ICE_TC_FLWR_FIELD_SRC_IPV6)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST IPv6 addr to SRC + DEST IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from DEST IPv6 addr to SRC IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_IPV6; + } + } else if (fltr_type == ICE_CHNL_FLTR_TYPE_SRC_IPV6) { + if ((tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) && + (tc_fltr->flags & ICE_TC_FLWR_FIELD_SRC_IPV6)) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC IPv6 addr to SRC + DEST IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_SRC_DEST_IPV6; + } else if (tc_fltr->flags & ICE_TC_FLWR_FIELD_DEST_IPV6) { + dev_dbg(dev, + "Changing filter type for action (tc_class %d) from SRC IPv6 addr to DEST IPv6 addr\n", + tc_fltr->action.tc_class); + fltr_type = ICE_CHNL_FLTR_TYPE_DEST_IPV6; + } + } else { + return -EINVAL; /* unsupported filter type */ + } + + /* return the selected fltr_type */ + *final_fltr_type = fltr_type; + + return 0; +} + +/** + * ice_determine_gtp_tun_type - determine TUN type based on user params + * @pf: Pointer to PF + * @l4_proto : vale of L4 protocol type + * @flags: TC filter flags + * @rule_info: Pointer to rule_info structure + * + * Determine TUN type based on user input. For VxLAN and Geneve, it is + * straight forward. But to detect, correct TUN type for GTP is + * challenging because there is no native support for GTP in kernel + * and user may want to filter on + * Outer UDP + GTP (optional) + Inner L3 + Inner L4 + * Actual API to add advanced switch filter expects caller to detect + * and specify correct TUN type and based on TUN type, appropriate + * type of rule is added in HW. 
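The selection below is effectively a table keyed by outer IP version, inner IP version and inner L4 protocol. A compact userspace rendering of the same table, where the returned strings stand in for the corresponding ICE_SW_TUN_* values:

#include <netinet/in.h> /* IPPROTO_TCP, IPPROTO_UDP */
#include <stdio.h>

/* outer: 0 = no outer IP matched, 4 = IPv4, 6 = IPv6; inner: 4 or 6 */
static const char *gtp_tun_type(int outer, int inner, int l4_proto)
{
    static const char *name[3][2][3] = {
        /* [outer][inner][l4]: l4 index is none/TCP/UDP */
        [0][0] = { "GTP_IPV4", "GTP_IPV4_TCP", "GTP_IPV4_UDP" },
        [0][1] = { "GTP_IPV6", "GTP_IPV6_TCP", "GTP_IPV6_UDP" },
        [1][0] = { "IPV4_GTPU_IPV4", "IPV4_GTP_IPV4_TCP", "IPV4_GTP_IPV4_UDP" },
        [1][1] = { "IPV4_GTPU_IPV6", "IPV4_GTP_IPV6_TCP", "IPV4_GTP_IPV6_UDP" },
        [2][0] = { "IPV6_GTPU_IPV4", "IPV6_GTP_IPV4_TCP", "IPV6_GTP_IPV4_UDP" },
        [2][1] = { "IPV6_GTPU_IPV6", "IPV6_GTP_IPV6_TCP", "IPV6_GTP_IPV6_UDP" },
    };
    int o = outer == 4 ? 1 : outer == 6 ? 2 : 0;
    int i = inner == 6;
    int l = l4_proto == IPPROTO_TCP ? 1 : l4_proto == IPPROTO_UDP ? 2 : 0;

    return name[o][i][l];
}

int main(void)
{
    /* outer IPv6 + inner IPv4 + inner UDP -> IPV6_GTP_IPV4_UDP */
    printf("%s\n", gtp_tun_type(6, 4, IPPROTO_UDP));
    return 0;
}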
+ */ +static bool +ice_determine_gtp_tun_type(struct ice_pf *pf, u16 l4_proto, u32 flags, + struct ice_adv_rule_info *rule_info) +{ + u8 outer_ipv6 = 0, inner_ipv6 = 0; + u8 outer_ipv4 = 0, inner_ipv4 = 0; + + /* if user specified enc IPv6 src/dest/src+dest IP */ + if (flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV6)) + outer_ipv6 = 1; + else if (flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV4 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV4)) + outer_ipv4 = 1; + + if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV6 | + ICE_TC_FLWR_FIELD_SRC_IPV6)) + inner_ipv6 = 1; + else if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV4 | + ICE_TC_FLWR_FIELD_SRC_IPV4)) + inner_ipv4 = 1; + else + /* for GTP encap, specifying inner L3 is must at this point, + * inner L4 is optional + */ + return false; + + /* following block support various protocol combinations for GTP + * (at this pint we know that detected tunnel type is GTP based + * on outer UDP port (2152: GTP_U): + * Outer IPv4 + Inner IPv4[6] + Inner TCP/UDP + * Outer IPv4 + Inner IPv4[6] + * Outer IPv6 + Inner IPv4[6] + Inner TCP/UDP + * Outer IPv6 + Inner IPv4[6] + */ + if (!outer_ipv6 && !outer_ipv4) { + if (inner_ipv4 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV4_TCP; + else if (inner_ipv4 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV4_UDP; + else if (inner_ipv6 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV6_TCP; + else if (inner_ipv6 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV6_UDP; + else if (inner_ipv4) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV4; + else if (inner_ipv6) + rule_info->tun_type = ICE_SW_TUN_GTP_IPV6; + else + /* no reason to proceed, error condition (must to + * specify inner L3 and/or inner L3 + inner L4) + */ + return false; + } else if (outer_ipv4) { + if (inner_ipv4 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTP_IPV4_TCP; + else if (inner_ipv4 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTP_IPV4_UDP; + else if (inner_ipv6 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTP_IPV6_TCP; + else if (inner_ipv6 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTP_IPV6_UDP; + else if (inner_ipv4) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTPU_IPV4; + else if (inner_ipv6) + rule_info->tun_type = ICE_SW_TUN_IPV4_GTPU_IPV6; + else + /* no reason to proceed, error condition (must to + * specify inner L3 and/or inner L3 + inner L4) + */ + return false; + } else if (outer_ipv6) { + if (inner_ipv4 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTP_IPV4_TCP; + else if (inner_ipv4 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTP_IPV4_UDP; + else if (inner_ipv6 && l4_proto == IPPROTO_TCP) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTP_IPV6_TCP; + else if (inner_ipv6 && l4_proto == IPPROTO_UDP) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTP_IPV6_UDP; + else if (inner_ipv4) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTPU_IPV4; + else if (inner_ipv6) + rule_info->tun_type = ICE_SW_TUN_IPV6_GTPU_IPV6; + else + /* no reason to proceed, error condition (must to + * specify inner L3 and/or inner L3 + inner L4) + */ + return false; + } + + return true; +} + +/** + * ice_tc_count_lkups - determine lookup count for switch filter + * @flags: tc-flower flags + * @headers: Pointer to TC flower filter header structure + * @fltr: Pointer to outer TC filter structure + * + * Determine lookup count based on TC flower input for switch filter. 
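The count returned here must line up one-for-one with the lookup elements that ice_tc_fill_rules writes, otherwise the caller rejects the rule. A simplified standalone version of the same accounting, with illustrative flag bits and the GTP special case for the outer UDP port reduced to a boolean:

#include <stdio.h>

/* Illustrative stand-ins for the ICE_TC_FLWR_FIELD_* bits */
#define FLD_ETH_TYPE  0x01
#define FLD_TENANT_ID 0x02
#define FLD_ENC_DMAC  0x04
#define FLD_ENC_DPORT 0x08
#define FLD_MAC       0x10
#define FLD_VLAN      0x20
#define FLD_L3        0x40
#define FLD_L4        0x80

static int count_lkups(unsigned int flags, int tunnel_is_gtp)
{
    int cnt = 0;

    if (flags & FLD_ETH_TYPE)
        cnt++;
    if (flags & FLD_TENANT_ID) {
        if (flags & FLD_ENC_DMAC)
            cnt++;                      /* outer DMAC */
        if (!tunnel_is_gtp && (flags & FLD_ENC_DPORT))
            cnt++;                      /* outer UDP port */
        cnt++;                          /* the tunnel header itself */
    }
    if (flags & FLD_MAC)
        cnt++;
    if (flags & FLD_VLAN)
        cnt++;
    if (flags & FLD_L3)
        cnt++;
    if (flags & FLD_L4)
        cnt++;
    return cnt;
}

int main(void)
{
    /* VXLAN VNI + outer DMAC + inner IP + inner L4 -> 4 lookups */
    printf("%d\n", count_lkups(FLD_TENANT_ID | FLD_ENC_DMAC |
                               FLD_L3 | FLD_L4, 0));
    return 0;
}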
+ */ +static int +ice_tc_count_lkups(u32 flags, struct ice_tc_flower_lyr_2_4_hdrs *headers, + struct ice_tc_flower_fltr *fltr) +{ + int lkups_cnt = 0; + + if (flags & ICE_TC_FLWR_FIELD_ETH_TYPE_ID) + lkups_cnt++; + + /* is Tunnel ID specified */ + if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) { + /* For ADQ filter, outer DMAC gets added implictly */ + if (flags & ICE_TC_FLWR_FIELD_ENC_DST_MAC) + lkups_cnt++; + /* Copy outer L4 port for non-GTP tunnel */ + if (fltr->tunnel_type != TNL_GTP) { + if (flags & ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT) + if (headers->l3_key.ip_proto == IPPROTO_UDP) + lkups_cnt++; + } + /* due to tunnel */ + lkups_cnt++; + } + + /* is MAC fields specified? */ + if (flags & (ICE_TC_FLWR_FIELD_DST_MAC | ICE_TC_FLWR_FIELD_SRC_MAC)) + lkups_cnt++; + + /* is VLAN specified? */ + if (flags & ICE_TC_FLWR_FIELD_VLAN) + lkups_cnt++; + + /* is IPv[4|6] fields specified? */ + if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV4 | ICE_TC_FLWR_FIELD_SRC_IPV4)) + lkups_cnt++; + else if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV6 | + ICE_TC_FLWR_FIELD_SRC_IPV6)) + lkups_cnt++; + + /* is L4 (TCP/UDP/any other L4 protocol fields specified? */ + if (flags & (ICE_TC_FLWR_FIELD_DEST_L4_PORT | + ICE_TC_FLWR_FIELD_SRC_L4_PORT)) + lkups_cnt++; + + return lkups_cnt; +} + +/** + * ice_tc_fill_rules - fill filter rules based on tc fltr + * @hw: pointer to hw structure + * @flags: tc flower field flags + * @tc_fltr: pointer to tc flower filter + * @list: list of advance rule elements + * @rule_info: pointer to information about rule + * @l4_proto: pointer to information such as L4 proto type + * + * Fill ice_adv_lkup_elem list based on tc flower flags and + * tc flower headers. This list should be used to add + * advance filter in hardware. + */ +static int +ice_tc_fill_rules(struct ice_hw *hw, u32 flags, + struct ice_tc_flower_fltr *tc_fltr, + struct ice_adv_lkup_elem *list, + struct ice_adv_rule_info *rule_info, + u16 *l4_proto) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &tc_fltr->outer_headers; + int i = 0; + + if (flags & ICE_TC_FLWR_FIELD_ETH_TYPE_ID) { + list[i].type = ICE_ETYPE_OL; + list[i].h_u.ethertype.ethtype_id = headers->l2_key.n_proto; + list[i].m_u.ethertype.ethtype_id = headers->l2_mask.n_proto; + i++; + } + + /* copy L2 (MAC) fields, Outer UDP (in case of tunnel) port info */ + if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) { + u32 tenant_id; + + /* copy L2 (MAC) fields if specified, For tunnel outer DMAC + * is needed and supported and is part of outer_headers.dst_mac + * For VxLAN tunnel, supported ADQ filter config is: + * - Outer dest MAC + VNI + Inner IPv4 + Inner L4 ports + */ + if (flags & ICE_TC_FLWR_FIELD_ENC_DST_MAC) { + list[i].type = ICE_MAC_OFOS; + ether_addr_copy(list[i].h_u.eth_hdr.dst_addr, + headers->l2_key.dst_mac); + ether_addr_copy(list[i].m_u.eth_hdr.dst_addr, + headers->l2_mask.dst_mac); + i++; + } + /* copy outer UDP (enc_dst_port) only for non-GTP tunnel */ + if (tc_fltr->tunnel_type != TNL_GTP) { + if ((flags & ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT) && + headers->l3_key.ip_proto == IPPROTO_UDP) { + list[i].type = ICE_UDP_OF; + list[i].h_u.l4_hdr.dst_port = + headers->l4_key.dst_port; + list[i].m_u.l4_hdr.dst_port = + headers->l4_mask.dst_port; + i++; + } + } + + /* setup encap info in list elements such as VNI/encap key-id, + * mask, type of tunnel + */ + if (tc_fltr->tunnel_type == TNL_VXLAN) + list[i].type = ICE_VXLAN; + else if (tc_fltr->tunnel_type == TNL_GENEVE) + list[i].type = ICE_GENEVE; + else if (tc_fltr->tunnel_type == TNL_GTP) + list[i].type = ICE_GTP; + + if 
(tc_fltr->tunnel_type == TNL_VXLAN || + tc_fltr->tunnel_type == TNL_GENEVE) { + tenant_id = be32_to_cpu(tc_fltr->tenant_id) << 8; + list[i].h_u.tnl_hdr.vni = cpu_to_be32(tenant_id); + if (tenant_id) + /* 24 bit tunnel key: mask "\xff\xff\xff\x00" */ + memcpy(&list[i].m_u.tnl_hdr.vni, + "\xff\xff\xff\x00", 4); + else + memcpy(&list[i].m_u.tnl_hdr.vni, + "\x00\x00\x00\x00", 4); + } else if (tc_fltr->tunnel_type == TNL_GTP) { + tenant_id = be32_to_cpu(tc_fltr->tenant_id); + list[i].h_u.gtp_hdr.teid = cpu_to_be32(tenant_id); + if (tenant_id) + /* 32 bit tunnel key: mask "\xff\xff\xff\xff" */ + memcpy(&list[i].m_u.gtp_hdr.teid, + "\xff\xff\xff\xff", 4); + else + memcpy(&list[i].m_u.gtp_hdr.teid, + "\x00\x00\x00\x00", 4); + } + /* advance list index */ + i++; + + /* now access values from inner_headers such as inner MAC (if + * supported), inner IPv4[6], Inner L4 ports, hence update + * "headers" to point to inner_headers + */ + headers = &tc_fltr->inner_headers; + } else { + rule_info->tun_type = ICE_NON_TUN; + /* copy L2 (MAC) fields, for non-tunnel case */ + if (flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_SRC_MAC)) { + struct ice_tc_l2_hdr *l2_key, *l2_mask; + + l2_key = &headers->l2_key; + l2_mask = &headers->l2_mask; + + list[i].type = ICE_MAC_OFOS; + if (flags & ICE_TC_FLWR_FIELD_DST_MAC) { + ether_addr_copy(list[i].h_u.eth_hdr.dst_addr, + l2_key->dst_mac); + ether_addr_copy(list[i].m_u.eth_hdr.dst_addr, + l2_mask->dst_mac); + } + if (flags & ICE_TC_FLWR_FIELD_SRC_MAC) { + ether_addr_copy(list[i].h_u.eth_hdr.src_addr, + l2_key->src_mac); + ether_addr_copy(list[i].m_u.eth_hdr.src_addr, + l2_mask->src_mac); + } + i++; + } + } + + /* copy VLAN info */ + if (flags & ICE_TC_FLWR_FIELD_VLAN) { + list[i].type = ICE_VLAN_OFOS; + list[i].h_u.vlan_hdr.vlan = headers->vlan_hdr.vlan_id; + list[i].m_u.vlan_hdr.vlan = cpu_to_be16(0xFFFF); + i++; + } + + + /* copy L3 (IPv[4|6]: src, dest) address */ + if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV4 | + ICE_TC_FLWR_FIELD_SRC_IPV4)) { + struct ice_tc_l3_hdr *l3_key, *l3_mask; + + /* For encap, Outer L3 and L4 based are not supported, + * hence if user specified L3, L4 fields, they are treated + * as inner L3 and L4 respectively + */ + if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) + list[i].type = ICE_IPV4_IL; + else + list[i].type = ICE_IPV4_OFOS; + + l3_key = &headers->l3_key; + l3_mask = &headers->l3_mask; + if (flags & ICE_TC_FLWR_FIELD_DEST_IPV4) { + list[i].h_u.ipv4_hdr.dst_addr = l3_key->dst_ipv4; + list[i].m_u.ipv4_hdr.dst_addr = l3_mask->dst_ipv4; + } + if (flags & ICE_TC_FLWR_FIELD_SRC_IPV4) { + list[i].h_u.ipv4_hdr.src_addr = l3_key->src_ipv4; + list[i].m_u.ipv4_hdr.src_addr = l3_mask->src_ipv4; + } + i++; + } else if (flags & (ICE_TC_FLWR_FIELD_DEST_IPV6 | + ICE_TC_FLWR_FIELD_SRC_IPV6)) { + struct ice_ipv6_hdr *ipv6_hdr, *ipv6_mask; + struct ice_tc_l3_hdr *l3_key, *l3_mask; + + if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) + list[i].type = ICE_IPV6_IL; + else + list[i].type = ICE_IPV6_OFOS; + ipv6_hdr = &list[i].h_u.ipv6_hdr; + ipv6_mask = &list[i].m_u.ipv6_hdr; + l3_key = &headers->l3_key; + l3_mask = &headers->l3_mask; + + if (flags & ICE_TC_FLWR_FIELD_DEST_IPV6) { + memcpy(&ipv6_hdr->dst_addr, &l3_key->dst_ipv6_addr, + sizeof(l3_key->dst_ipv6_addr)); + memcpy(&ipv6_mask->dst_addr, &l3_mask->dst_ipv6_addr, + sizeof(l3_mask->dst_ipv6_addr)); + } + if (flags & ICE_TC_FLWR_FIELD_SRC_IPV6) { + memcpy(&ipv6_hdr->src_addr, &l3_key->src_ipv6_addr, + sizeof(l3_key->src_ipv6_addr)); + memcpy(&ipv6_mask->src_addr, &l3_mask->src_ipv6_addr, +
sizeof(l3_mask->src_ipv6_addr)); + } + i++; + } + + /* copy L4 (src, dest) port */ + if (flags & (ICE_TC_FLWR_FIELD_DEST_L4_PORT | + ICE_TC_FLWR_FIELD_SRC_L4_PORT)) { + struct ice_tc_l4_hdr *l4_key, *l4_mask; + u16 dst_port; + + l4_key = &headers->l4_key; + l4_mask = &headers->l4_mask; + dst_port = be16_to_cpu(l4_key->dst_port); + if (headers->l3_key.ip_proto == IPPROTO_TCP) { + list[i].type = ICE_TCP_IL; + /* detected L4 proto is TCP */ + if (l4_proto) + *l4_proto = IPPROTO_TCP; + } else if (headers->l3_key.ip_proto == IPPROTO_UDP) { + /* Check if UDP dst port is known as a tunnel port */ + if (ice_tunnel_port_in_use(hw, dst_port, NULL)) { + list[i].type = ICE_UDP_OF; + rule_info->tun_type = ICE_SW_TUN_VXLAN; + } else { + list[i].type = ICE_UDP_ILOS; + } + /* detected L4 proto is UDP */ + if (l4_proto) + *l4_proto = IPPROTO_UDP; + } + if (flags & ICE_TC_FLWR_FIELD_DEST_L4_PORT) { + list[i].h_u.l4_hdr.dst_port = l4_key->dst_port; + list[i].m_u.l4_hdr.dst_port = l4_mask->dst_port; + } + if (flags & ICE_TC_FLWR_FIELD_SRC_L4_PORT) { + list[i].h_u.l4_hdr.src_port = l4_key->src_port; + list[i].m_u.l4_hdr.src_port = l4_mask->src_port; + } + i++; + } + + return i; +} + +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE +static int ice_eswitch_tc_parse_action(struct ice_tc_flower_fltr *fltr, + struct flow_action_entry *act) +{ + struct ice_repr *repr; + + switch (act->id) { + case FLOW_ACTION_DROP: + fltr->action.fltr_act = ICE_DROP_PACKET; + break; + + case FLOW_ACTION_REDIRECT: + fltr->action.fltr_act = ICE_FWD_TO_VSI; + + if (ice_is_port_repr_netdev(act->dev)) { + repr = ice_netdev_to_repr(act->dev); + + fltr->dest_vsi = repr->src_vsi; + fltr->direction = ICE_ESWITCH_FLTR_INGRESS; + } else if (netif_is_ice(act->dev)) { + struct ice_netdev_priv *np = netdev_priv(act->dev); + + fltr->dest_vsi = np->vsi; + fltr->direction = ICE_ESWITCH_FLTR_EGRESS; + } else { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported netdevice in switchdev mode"); + return -EINVAL; + } + + break; + + default: + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported action in switchdev mode"); + return -EINVAL; + } + + return 0; +} +#endif /* HAVE_TC_FLOW_RULE_INFRASTRUCTURE */ + +static int +ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &fltr->outer_headers; + struct ice_adv_rule_info rule_info = { 0 }; + struct ice_rule_query_data rule_added; + struct ice_adv_lkup_elem *list; + struct ice_hw *hw = &vsi->back->hw; + u32 flags = fltr->flags; + enum ice_status status; + int lkups_cnt; + int ret = 0; + int i; + + if (!flags || (flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV4 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV4 | + ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT))) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported encap field(s)"); + return -EOPNOTSUPP; + } + + lkups_cnt = ice_tc_count_lkups(flags, headers, fltr); + list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + i = ice_tc_fill_rules(hw, flags, fltr, list, &rule_info, NULL); + if (i != lkups_cnt) { + ret = -EINVAL; + goto exit; + } + + rule_info.sw_act.fltr_act = fltr->action.fltr_act; + rule_info.sw_act.vsi_handle = fltr->dest_vsi->idx; + rule_info.priority = 7; + + if (fltr->direction == ICE_ESWITCH_FLTR_INGRESS) { + rule_info.sw_act.flag |= ICE_FLTR_RX; + rule_info.sw_act.src = hw->pf_id; + rule_info.rx = true; + } else { + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; + rule_info.rx = false; 
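Ingress and egress rules differ only in the direction flag and the lookup source: RX rules originate from the wire and use the PF ID, TX rules originate from the VSI and use its index. A small sketch of that mapping with hypothetical IDs (the flag values are stand-ins for ICE_FLTR_RX/ICE_FLTR_TX):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLTR_RX 0x1
#define FLTR_TX 0x2

struct sw_act {
    uint8_t flag;
    uint16_t src; /* lookup source: PF ID for RX, VSI index for TX */
    bool rx;
};

/* Mirrors the direction branch above with made-up PF/VSI numbers */
static void set_direction(struct sw_act *act, bool ingress,
                          uint16_t pf_id, uint16_t vsi_idx)
{
    if (ingress) {
        act->flag |= FLTR_RX;
        act->src = pf_id;
        act->rx = true;
    } else {
        act->flag |= FLTR_TX;
        act->src = vsi_idx;
        act->rx = false;
    }
}

int main(void)
{
    struct sw_act act = { 0 };

    set_direction(&act, true, 0, 5);
    printf("flag 0x%x src %u rx %d\n", act.flag, act.src, act.rx);
    return 0;
}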
+ } + + /* specify the cookie as filter_rule_id */ + rule_info.fltr_rule_id = fltr->cookie; + + status = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, &rule_added); + if (status == ICE_ERR_ALREADY_EXISTS) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because it already exist"); + ret = -EINVAL; + goto exit; + } else if (status) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter due to error"); + ret = -EIO; + goto exit; + } + + /* store the output params, which are needed later for removing + * advanced switch filter + */ + fltr->rid = rule_added.rid; + fltr->rule_id = rule_added.rule_id; + + if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { + if (ice_fltr_update_flags(vsi, fltr->rule_id, fltr->rid, + ICE_SINGLE_ACT_LAN_ENABLE)) + ice_rem_adv_rule_by_id(hw, &rule_added); + } + +exit: + kfree(list); + return ret; +} + +/** + * ice_add_tc_flower_adv_fltr - add appropriate filter rules + * @vsi: Pointer to VSI + * @tc_fltr: Pointer to TC flower filter structure + * + * based on filter parameters using Advance recipes supported + * by OS package. + */ +int +ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, + struct ice_tc_flower_fltr *tc_fltr) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &tc_fltr->outer_headers; + enum ice_channel_fltr_type fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; + struct ice_adv_rule_info rule_info = {0}; + struct ice_rule_query_data rule_added; + struct ice_channel_vf *vf_ch = NULL; + struct ice_adv_lkup_elem *list; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + u32 flags = tc_fltr->flags; + enum ice_status status; + struct ice_vsi *ch_vsi; + struct device *dev; + struct ice_vf *vf; + u16 lkups_cnt = 0; + u16 l4_proto = 0; + int ret = 0; + u16 i = 0; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because driver is in safe mode"); + return -EOPNOTSUPP; + } + + if (!flags || (flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV4 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV4 | + ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_IPV6 | + ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT))) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unsupported encap field(s)"); + return -EOPNOTSUPP; + } + + /* get the channel (aka ADQ VSI) */ + if (tc_fltr->dest_vsi) + ch_vsi = tc_fltr->dest_vsi; + else + ch_vsi = vsi->tc_map_vsi[tc_fltr->action.tc_class]; + + lkups_cnt = ice_tc_count_lkups(flags, headers, tc_fltr); + list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + i = ice_tc_fill_rules(hw, flags, tc_fltr, list, &rule_info, &l4_proto); + if (i != lkups_cnt) { + ret = -EINVAL; + goto exit; + } + + /* Now determine correct TUN type of based on encap params */ + if ((flags & ICE_TC_FLWR_FIELD_TENANT_ID) && + tc_fltr->tunnel_type == TNL_GTP) { + if (!ice_determine_gtp_tun_type(pf, l4_proto, tc_fltr->flags, + &rule_info)) { + if (vsi->type == ICE_VSI_VF) + dev_err(dev, "Unable to add filter because could not determine tun type, VSI %u, vf_id:%u\n", + vsi->vsi_num, vsi->vf_id); + else + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because could not determine TUN type. 
"); + ret = -EINVAL; + goto exit; + } + } + + rule_info.sw_act.fltr_act = tc_fltr->action.fltr_act; + if (tc_fltr->action.tc_class >= ICE_CHNL_START_TC) { + if (!ch_vsi) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because specified destination doesn't exist"); + ret = -EINVAL; + goto exit; + } + + /* dest_vsi is preset, means it is from virtchnl message */ + if (tc_fltr->dest_vsi) { + if (vsi->type != ICE_VSI_VF || + tc_fltr->dest_vsi->type != ICE_VSI_VF) { + dev_err(dev, "Unexpected VSI(vf_id:%u) type: %u\n", + vsi->vf_id, vsi->type); + ret = -EINVAL; + goto exit; + } + vf = &pf->vf[vsi->vf_id]; + if (!vf) { + dev_err(dev, "VF is NULL for VSI->type: ICE_VF_VSI and vf_id %d\n", + vsi->vf_id); + ret = -EINVAL; + goto exit; + } + vf_ch = &vf->ch[tc_fltr->action.tc_class]; + + fltr_type = (enum ice_channel_fltr_type) + vf_ch->fltr_type; + } else if (ch_vsi->ch) { + fltr_type = ch_vsi->ch->fltr_type; + } else { + dev_err(dev, "Can't add switch rule, neither dest_vsi is valid now VSI channel but tc_class sepcified is %u\n", + tc_fltr->action.tc_class); + ret = -EINVAL; + goto exit; + } + + /* perform fltr_type check for channel (aka ADQ) VSI */ + ret = ice_chnl_fltr_type_chk(pf, tc_fltr, &fltr_type); + if (ret) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because filter type check failed"); + dev_err(dev, "Unable to add filter because filter type check failed"); + ret = -EINVAL; + goto exit; + } + + /* Code is applicable only for PF ADQ, for VF ADQ - such + * checks to be handled by VF driver + */ + if (ch_vsi && (ch_vsi->type == ICE_VSI_PF || + ch_vsi->type == ICE_VSI_CHNL)) { + ret = ice_detect_filter_conflict(pf, tc_fltr); + if (ret) + goto exit; + } + + if (tc_fltr->dest_vsi) { + if (vf_ch && !fltr_type) + vf_ch->fltr_type = fltr_type; + } else if (ch_vsi->ch) { + ch_vsi->ch->fltr_type = fltr_type; + } + + rule_info.sw_act.fltr_act = ICE_FWD_TO_VSI; +#ifdef __CHECKER__ + /* cppcheck-suppress nullPointerRedundantCheck */ +#endif /* _CHECKER__ */ + rule_info.sw_act.vsi_handle = ch_vsi->idx; + rule_info.priority = 7; + + rule_info.sw_act.src = hw->pf_id; + rule_info.rx = true; + + dev_dbg(dev, "add switch rule for TC:%u vsi_idx:%u, lkups_cnt:%u\n", + tc_fltr->action.tc_class, + rule_info.sw_act.vsi_handle, lkups_cnt); + } else { + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; + rule_info.rx = false; + } + + /* specify the cookie as filter_rule_id */ + rule_info.fltr_rule_id = tc_fltr->cookie; + + status = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, &rule_added); + if (status == ICE_ERR_ALREADY_EXISTS) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter because it already exist"); + ret = -EINVAL; + goto exit; + } else if (status) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter due to error"); + ret = -EIO; + goto exit; + } + + /* store the output params, which are needed later for removing + * advanced switch filter + */ + tc_fltr->rid = rule_added.rid; + tc_fltr->rule_id = rule_added.rule_id; + if (tc_fltr->action.tc_class > 0 && ch_vsi) { + /* For PF ADQ, VSI type is set as ICE_VSI_CHNL, and + * for PF ADQ filter, it is not yet set in tc_fltr, + * hence store the dest_vsi ptr in tc_fltr + */ + if (ch_vsi->type == ICE_VSI_CHNL) + tc_fltr->dest_vsi = ch_vsi; + /* keep track of advanced switch filter for + * destination VSI (channel VSI) + */ + ch_vsi->num_chnl_fltr++; + /* in this case, dest_id is VSI handle (sw handle) */ + tc_fltr->dest_id = rule_added.vsi_handle; + + /* keeps track of channel 
filters for PF VSI */ + if (vsi->type == ICE_VSI_PF && + (flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_ENC_DST_MAC))) + pf->num_dmac_chnl_fltrs++; + } + dev_dbg(dev, "added switch rule (lkups_cnt %u, flags 0x%x) for TC %u, rid %u, rule_id %u, vsi_idx %u\n", + lkups_cnt, flags, + tc_fltr->action.tc_class, rule_added.rid, + rule_added.rule_id, rule_added.vsi_handle); +exit: + kfree(list); + return ret; +} + +/** + * ice_tc_set_ipv4 - Parse IPv4 addresses from TC flower filter + * @match: Pointer to flow match structure + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + * @is_encap: set true for tunnel IPv4 address + */ +static int +ice_tc_set_ipv4(struct flow_match_ipv4_addrs *match, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers, bool is_encap) +{ + if (match->key->dst) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DEST_IPV4; + else + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_IPV4; + headers->l3_key.dst_ipv4 = match->key->dst; + headers->l3_mask.dst_ipv4 = match->mask->dst; + } + if (match->key->src) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_SRC_IPV4; + else + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_IPV4; + headers->l3_key.src_ipv4 = match->key->src; + headers->l3_mask.src_ipv4 = match->mask->src; + } + return 0; +} + +/** + * ice_tc_set_ipv6 - Parse IPv6 addresses from TC flower filter + * @match: Pointer to flow match structure + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + * @is_encap: set true for tunnel IPv6 address + */ +static int +ice_tc_set_ipv6(struct flow_match_ipv6_addrs *match, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers, bool is_encap) +{ + struct ice_tc_l3_hdr *l3_key, *l3_mask; + + /* src and dest IPV6 address should not be LOOPBACK + * (0:0:0:0:0:0:0:1), which can be represented as ::1 + */ + if (ipv6_addr_loopback(&match->key->dst) || + ipv6_addr_loopback(&match->key->src)) { + NL_SET_ERR_MSG_MOD(fltr->extack, "Bad ipv6, addr is LOOPBACK"); + return -EINVAL; + } + /* if src/dest IPv6 address is *,* error */ + if (ipv6_addr_any(&match->mask->dst) && + ipv6_addr_any(&match->mask->src)) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Bad src/dest IPV6, addr is any"); + return -EINVAL; + } + if (!ipv6_addr_any(&match->mask->dst)) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DEST_IPV6; + else + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_IPV6; + } + if (!ipv6_addr_any(&match->mask->src)) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_SRC_IPV6; + else + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_IPV6; + } + + l3_key = &headers->l3_key; + l3_mask = &headers->l3_mask; + + if (fltr->flags & (ICE_TC_FLWR_FIELD_ENC_SRC_IPV6 | + ICE_TC_FLWR_FIELD_SRC_IPV6)) { + memcpy(&l3_key->src_ipv6_addr, &match->key->src.s6_addr, + sizeof(match->key->src.s6_addr)); + memcpy(&l3_mask->src_ipv6_addr, &match->mask->src.s6_addr, + sizeof(match->mask->src.s6_addr)); + } + if (fltr->flags & (ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 | + ICE_TC_FLWR_FIELD_DEST_IPV6)) { + memcpy(&l3_key->dst_ipv6_addr, &match->key->dst.s6_addr, + sizeof(match->key->dst.s6_addr)); + memcpy(&l3_mask->dst_ipv6_addr, &match->mask->dst.s6_addr, + sizeof(match->mask->dst.s6_addr)); + } + + return 0; +} + +/** + * ice_tc_set_port - Parse ports from TC flower filter + * @match: Flow match structure + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + * @is_encap: set true for tunnel port + */ +static int +ice_tc_set_port(struct 
flow_match_ports match, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers, bool is_encap) +{ + if (match.key->dst) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT; + else + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_L4_PORT; + headers->l4_key.dst_port = match.key->dst; + headers->l4_mask.dst_port = match.mask->dst; + } + if (match.key->src) { + if (is_encap) + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT; + else + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_L4_PORT; + headers->l4_key.src_port = match.key->src; + headers->l4_mask.src_port = match.mask->src; + } + return 0; +} + +#if defined(HAVE_TC_FLOWER_ENC) && defined(HAVE_TC_INDIR_BLOCK) +/** + * ice_is_tnl_gtp - detect if tunnel type is GTP or not + * @tunnel_dev: ptr to tunnel device + * @rule: ptr to flow_rule + * + * If curr_tnl_type is TNL_LAST and "flow_rule" is non-NULL, then + * check if enc_dst_port is well known GTP port (2152) + * if so - return true (indicating that tunnel type is GTP), otherwise false. + */ +static bool +ice_is_tnl_gtp(struct net_device *tunnel_dev, + struct flow_rule *rule) +{ + /* if flow_rule is non-NULL, proceed with detecting possibility + * of GTP tunnel. Unlike VXLAN and GENEVE, there is no such API + * like netif_is_gtp since GTP is not natively supported in kernel + */ + if (rule && (!is_vlan_dev(tunnel_dev))) { + struct flow_match_ports match; + u16 enc_dst_port; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + netdev_err(tunnel_dev, + "Tunnel HW offload is not supported, ENC_PORTs are not specified\n"); + return false; + } + + /* get ENC_PORTS info */ + flow_rule_match_enc_ports(rule, &match); + enc_dst_port = be16_to_cpu(match.key->dst); + + /* Outer UDP port is GTP well known port, + * if 'enc_dst_port' matched with GTP wellknown port, + * return true from this function. + */ + if (enc_dst_port != ICE_GTP_TNL_WELLKNOWN_PORT) { + netdev_err(tunnel_dev, + "Tunnel HW offload is not supported for non-GTP tunnel, ENC_DST_PORT is %u\n", + enc_dst_port); + return false; + } + + /* all checks passed including outer UDP port to be qualified + * for GTP tunnel + */ + return true; + } + return false; +} + +/** + * ice_tc_tun_get_type - get the tunnel type + * @tunnel_dev: ptr to tunnel device + * @rule: ptr to flow_rule + * + * This function detects appropriate tunnel_type if specified device is + * tunnel device such as vxlan/geneve othertwise it tries to detect + * tunnel type based on outer GTP port (2152) + */ +int +ice_tc_tun_get_type(struct net_device *tunnel_dev, + struct flow_rule *rule) +{ +#ifdef HAVE_VXLAN_TYPE +#if IS_ENABLED(CONFIG_VXLAN) + if (netif_is_vxlan(tunnel_dev)) + return TNL_VXLAN; +#endif /* HAVE_VXLAN_TYPE */ +#elif defined(HAVE_GENEVE_TYPE) +#if IS_ENABLED(CONFIG_GENEVE) + if (netif_is_geneve(tunnel_dev)) + return TNL_GENEVE; +#endif +#endif /* HAVE_GENEVE_TYPE */ + /* detect possibility of GTP tunnel type based on input */ + if (ice_is_tnl_gtp(tunnel_dev, rule)) + return TNL_GTP; + + return TNL_LAST; +} + +/** + * ice_tc_tun_info - Parse and store tunnel info + * @pf: ptr to PF device + * @f: Pointer to struct flow_cls_offload + * @fltr: Pointer to filter structure + * @tunnel: type of tunnel (e.g. VxLAN, Geneve, GTP) + * + * Parse tunnel attributes such as tunnel_id and store them. 
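The key-id parsing below insists on a fully specified 32-bit mask and, for VXLAN/GENEVE, a key that fits the 24-bit VNI field. A standalone sketch of the same validation; the 0xFFFFFFFF and 0x00FFFFFF limits are assumed values for ICE_TC_FLOWER_MASK_32 and ICE_TC_FLOWER_VNI_MAX:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MASK_32 0xFFFFFFFFu /* assumed ICE_TC_FLOWER_MASK_32 */
#define VNI_MAX 0x00FFFFFFu /* assumed ICE_TC_FLOWER_VNI_MAX */

/* Validate a tunnel key id the way the parsing below does: the mask must
 * cover all 32 bits, and VXLAN/GENEVE keys must fit in 24 bits.
 */
static bool tunnel_key_ok(uint32_t key_id, uint32_t mask, bool is_vxlan_geneve)
{
    if (mask != MASK_32)
        return false;               /* partial or zero masks unsupported */
    if (is_vxlan_geneve && key_id > VNI_MAX)
        return false;               /* VNI is only 3 bytes */
    return true;
}

int main(void)
{
    printf("%d\n", tunnel_key_ok(0x123456, MASK_32, true));  /* 1 */
    printf("%d\n", tunnel_key_ok(0x1234567, MASK_32, true)); /* 0 */
    return 0;
}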
+ */ +static int +ice_tc_tun_info(struct ice_pf *pf, struct flow_cls_offload *f, + struct ice_tc_flower_fltr *fltr, + enum ice_tunnel_type tunnel) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + + /* match on VNI */ + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct device *dev = ice_pf_to_dev(pf); + struct flow_match_enc_keyid enc_keyid; + u32 key_id; + + flow_rule_match_enc_keyid(rule, &enc_keyid); + if (!enc_keyid.mask->keyid) { + dev_err(dev, "Bad mask for encap key_id 0x%04x, it must be non-zero\n", + be32_to_cpu(enc_keyid.mask->keyid)); + return -EINVAL; + } + + if (enc_keyid.mask->keyid != + cpu_to_be32(ICE_TC_FLOWER_MASK_32)) { + dev_err(dev, "Bad mask value for encap key_id 0x%04x\n", + be32_to_cpu(enc_keyid.mask->keyid)); + return -EINVAL; + } + + key_id = be32_to_cpu(enc_keyid.key->keyid); + if (tunnel == TNL_VXLAN || tunnel == TNL_GENEVE) { + /* VNI is only 3 bytes, applicable for VXLAN/GENEVE */ + if (key_id > ICE_TC_FLOWER_VNI_MAX) { + dev_err(dev, "VNI out of range : 0x%x\n", + key_id); + return -EINVAL; + } + } + fltr->flags |= ICE_TC_FLWR_FIELD_TENANT_ID; + fltr->tenant_id = enc_keyid.key->keyid; + } else if (tunnel == TNL_GTP) { + /* User didn't specify tunnel_key but indicated + * intention about GTP tunnel. + * For GTP tunnel, support for wild-card tunnel-ID + */ + fltr->flags |= ICE_TC_FLWR_FIELD_TENANT_ID; + fltr->tenant_id = 0; + } + + return 0; +} + +/** + * ice_tc_tun_parse - Parse tunnel attributes from TC flower filter + * @filter_dev: Pointer to device on which filter is being added + * @vsi: Pointer to VSI structure + * @f: Pointer to struct flow_cls_offload + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + */ +static int +ice_tc_tun_parse(struct net_device *filter_dev, struct ice_vsi *vsi, + struct flow_cls_offload *f, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + enum ice_tunnel_type tunnel_type; + struct ice_pf *pf = vsi->back; + struct device *dev; + int err = 0; + + dev = ice_pf_to_dev(pf); + tunnel_type = ice_tc_tun_get_type(filter_dev, rule); + + /* VXLAN and GTP tunnel are supported now */ + if (tunnel_type == TNL_VXLAN || tunnel_type == TNL_GTP) { + err = ice_tc_tun_info(pf, f, fltr, tunnel_type); + if (err) { + dev_err(dev, "Failed to parse tunnel (tunnel_type %u) attributes\n", + tunnel_type); + return err; + } + } else { + dev_err(dev, "Tunnel HW offload is not supported for the tunnel type: %d\n", + tunnel_type); + return -EOPNOTSUPP; + } + fltr->tunnel_type = tunnel_type; + headers->l3_key.ip_proto = IPPROTO_UDP; + return err; +} + +/** + * ice_parse_tunnel_attr - Parse tunnel attributes from TC flower filter + * @filter_dev: Pointer to device on which filter is being added + * @vsi: Pointer to VSI structure + * @f: Pointer to struct flow_cls_offload + * @fltr: Pointer to filter structure + * @headers: inner or outer header fields + */ +static int +ice_parse_tunnel_attr(struct net_device *filter_dev, struct ice_vsi *vsi, + struct flow_cls_offload *f, + struct ice_tc_flower_fltr *fltr, + struct ice_tc_flower_lyr_2_4_hdrs *headers) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_match_control enc_control; + int err; + + err = ice_tc_tun_parse(filter_dev, vsi, f, fltr, headers); + if (err) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "failed to parse tunnel attributes"); + return err; + } + + flow_rule_match_enc_control(rule, &enc_control); + + if 
(enc_control.key->addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; + + flow_rule_match_enc_ipv4_addrs(rule, &match); + if (ice_tc_set_ipv4(&match, fltr, headers, true)) + return -EINVAL; + } else if (enc_control.key->addr_type == + FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; + + flow_rule_match_enc_ipv6_addrs(rule, &match); + if (ice_tc_set_ipv6(&match, fltr, headers, true)) + return -EINVAL; + } + +#ifdef HAVE_TC_FLOWER_ENC_IP + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_match_ip match; + + flow_rule_match_enc_ip(rule, &match); + headers->l3_key.tos = match.key->tos; + headers->l3_key.ttl = match.key->ttl; + headers->l3_mask.tos = match.mask->tos; + headers->l3_mask.ttl = match.mask->ttl; + } +#endif /* HAVE_TC_FLOWER_ENC_IP */ + + if (fltr->tunnel_type == TNL_GTP && + flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_match_ports match; + + flow_rule_match_enc_ports(rule, &match); + /* store away outer L4 port info and mark it for tunnel */ + if (ice_tc_set_port(match, fltr, headers, true)) + return -EINVAL; + } + return 0; +} +#endif /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ + +/** + * ice_parse_cls_flower - Parse TC flower filters provided by kernel + * @vsi: Pointer to the VSI + * @filter_dev: Pointer to device on which filter is being added + * @f: Pointer to struct flow_cls_offload + * @fltr: Pointer to filter structure + */ +#ifdef HAVE_TC_INDIR_BLOCK +static int +ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, + struct flow_cls_offload *f, + struct ice_tc_flower_fltr *fltr) +#else +static int +ice_parse_cls_flower(struct net_device __always_unused *filter_dev, + struct ice_vsi __always_unused *vsi, + struct tc_cls_flower_offload *f, + struct ice_tc_flower_fltr *fltr) +#endif /* HAVE_TC_INDIR_BLOCK */ +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &fltr->outer_headers; + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_dissector *dissector = rule->match.dissector; + u16 n_proto_mask = 0, n_proto_key = 0, addr_type = 0; + + if (dissector->used_keys & + ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | +#ifdef HAVE_TC_FLOWER_VLAN_IN_TAGS + BIT(FLOW_DISSECTOR_KEY_VLANID) | +#endif +#ifndef HAVE_TC_FLOWER_VLAN_IN_TAGS + BIT(FLOW_DISSECTOR_KEY_VLAN) | +#endif + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | +#ifdef HAVE_TC_FLOWER_ENC + BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | +#ifdef HAVE_TC_FLOWER_ENC_IP + BIT(FLOW_DISSECTOR_KEY_ENC_IP) | +#endif /* HAVE_TC_FLOWER_ENC_IP */ +#endif /* HAVE_TC_FLOWER_ENC */ + BIT(FLOW_DISSECTOR_KEY_PORTS))) { + NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported key used"); + return -EOPNOTSUPP; + } + +#if defined(HAVE_TC_FLOWER_ENC) && defined(HAVE_TC_INDIR_BLOCK) + if ((flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) || + flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) || + flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID) || + flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS))) { + int err; + + err = ice_parse_tunnel_attr(filter_dev, vsi, f, fltr, headers); + if (err) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Failed to parse TC flower tunnel attributes"); + return err; + } + + /* header pointers should point to 
the inner headers, outer + * header were already set by ice_parse_tunnel_attr + */ + headers = &fltr->inner_headers; + } else { + fltr->tunnel_type = TNL_LAST; + } +#else /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ + fltr->tunnel_type = TNL_LAST; +#endif /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_match_basic match; + + flow_rule_match_basic(rule, &match); + + n_proto_key = ntohs(match.key->n_proto); + n_proto_mask = ntohs(match.mask->n_proto); + + if (n_proto_key == ETH_P_ALL || n_proto_key == 0) { + n_proto_key = 0; + n_proto_mask = 0; + } else { + if (!ice_is_adq_active(vsi->back)) + fltr->flags |= ICE_TC_FLWR_FIELD_ETH_TYPE_ID; + } + + headers->l2_key.n_proto = cpu_to_be16(n_proto_key); + headers->l2_mask.n_proto = cpu_to_be16(n_proto_mask); + headers->l3_key.ip_proto = match.key->ip_proto; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct flow_match_eth_addrs match; + + flow_rule_match_eth_addrs(rule, &match); + + if (!is_zero_ether_addr(match.key->dst)) { + ether_addr_copy(headers->l2_key.dst_mac, + match.key->dst); + ether_addr_copy(headers->l2_mask.dst_mac, + match.mask->dst); + fltr->flags |= ICE_TC_FLWR_FIELD_DST_MAC; + } + + if (!is_zero_ether_addr(match.key->src)) { + ether_addr_copy(headers->l2_key.src_mac, + match.key->src); + ether_addr_copy(headers->l2_mask.src_mac, + match.mask->src); + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_MAC; + } + } + +#ifdef HAVE_TC_FLOWER_VLAN_IN_TAGS + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_VLANID)) { + struct flow_dissector_key_tags *key = + (struct flow_dissector_key_tags *) + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLANID, + f->key); + struct flow_dissector_key_tags *mask = + (struct flow_dissector_key_tags *) + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLANID, + f->mask); + + if (mask->vlan_id) { + if (mask->vlan_id == VLAN_VID_MASK) { + fltr->flags |= ICE_TC_FLWR_FIELD_VLAN; + fltr->flags &= ~ICE_TC_FLWR_FIELD_ETH_TYPE_ID; + } else { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Bad VLAN mask"); + return -EINVAL; + } + } + headers->vlan_hdr.vlan_id = + cpu_to_be16(key->vlan_id & VLAN_VID_MASK); +#ifdef HAVE_FLOW_DISSECTOR_VLAN_PRIO + if (mask->vlan_priority) + headers->vlan_hdr.vlan_prio = key->vlan_priority; +#endif + } +#else /* !HAVE_TC_FLOWER_VLAN_IN_TAGS */ + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN) || + is_vlan_dev(filter_dev)) { + struct flow_dissector_key_vlan mask; + struct flow_dissector_key_vlan key; + struct flow_match_vlan match; + + if (is_vlan_dev(filter_dev)) { + match.key = &key; + match.key->vlan_id = vlan_dev_vlan_id(filter_dev); + match.key->vlan_priority = 0; + match.mask = &mask; + memset(match.mask, 0xff, sizeof(*match.mask)); + match.mask->vlan_priority = 0; + } else { + flow_rule_match_vlan(rule, &match); + } + + if (match.mask->vlan_id) { + if (match.mask->vlan_id == VLAN_VID_MASK) { + fltr->flags |= ICE_TC_FLWR_FIELD_VLAN; + fltr->flags &= ~ICE_TC_FLWR_FIELD_ETH_TYPE_ID; + } else { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Bad VLAN mask"); + return -EINVAL; + } + } + + headers->vlan_hdr.vlan_id = + cpu_to_be16(match.key->vlan_id & VLAN_VID_MASK); +#ifdef HAVE_FLOW_DISSECTOR_VLAN_PRIO + if (match.mask->vlan_priority) + headers->vlan_hdr.vlan_prio = match.key->vlan_priority; +#endif + } +#endif /* HAVE_TC_FLOWER_VLAN_IN_TAGS */ + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_match_control match; + + 
flow_rule_match_control(rule, &match); + + addr_type = match.key->addr_type; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; + + flow_rule_match_ipv4_addrs(rule, &match); + if (ice_tc_set_ipv4(&match, fltr, headers, false)) + return -EINVAL; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; + + flow_rule_match_ipv6_addrs(rule, &match); + if (ice_tc_set_ipv6(&match, fltr, headers, false)) + return -EINVAL; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_match_ports match; + + flow_rule_match_ports(rule, &match); + if (ice_tc_set_port(match, fltr, headers, false)) + return -EINVAL; + switch (headers->l3_key.ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + NL_SET_ERR_MSG_MOD(fltr->extack, + "Only UDP and TCP transport are supported"); + return -EINVAL; + } + } + return 0; +} + +/** + * ice_add_remove_tc_flower_dflt_fltr - add or remove default filter + * @vsi: Pointer to VSI + * @tc_fltr: Pointer to TC flower filter structure + * @add: true if filter is being added. + * + * Add or remove default filter using default recipes to add MAC + * or VLAN or MAC-VLAN filters. + */ +static int +ice_add_remove_tc_flower_dflt_fltr(struct ice_vsi *vsi, + struct ice_tc_flower_fltr *tc_fltr, bool add) +{ + struct ice_tc_flower_lyr_2_4_hdrs *headers = &tc_fltr->outer_headers; + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + enum ice_sw_fwd_act_type act = tc_fltr->action.fltr_act; + u16 vlan_id = be16_to_cpu(headers->vlan_hdr.vlan_id); + const u8 *dst_mac = headers->l2_key.dst_mac; + int err; + + switch (tc_fltr->flags) { + case ICE_TC_FLWR_FLTR_FLAGS_DST_MAC: + if (add) { + err = ice_fltr_add_mac(vsi, dst_mac, act); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not add MAC filters"); + } else { + err = ice_fltr_remove_mac(vsi, dst_mac, act); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not remove MAC filters"); + } + break; + case ICE_TC_FLWR_FLTR_FLAGS_VLAN: + if (add) { + struct ice_vlan vlan = + ICE_VLAN(ETH_P_8021Q, vlan_id, 0, act); + err = vlan_ops->add_vlan(vsi, &vlan); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not add VLAN filters"); + } else { + struct ice_vlan vlan = + ICE_VLAN(ETH_P_8021Q, vlan_id, 0, act); + err = vlan_ops->del_vlan(vsi, &vlan); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not delete VLAN filters"); + } + break; + case ICE_TC_FLWR_FLTR_FLAGS_DST_MAC_VLAN: + if (add) { + err = ice_fltr_add_mac_vlan(vsi, dst_mac, vlan_id, act); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not add MAC VLAN filters"); + } else { + err = ice_fltr_remove_mac_vlan(vsi, dst_mac, vlan_id, + act); + if (err) + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Could not remove MAC VLAN filters"); + } + break; + default: + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Not a default filter type"); + err = -EOPNOTSUPP; + break; + } + return err; +} + +/** + * ice_add_switch_fltr - Add TC flower filters + * @vsi: Pointer to VSI + * @fltr: Pointer to struct ice_tc_flower_fltr + * + * Add filter in HW switch block + */ +static int +ice_add_switch_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) +{ + if (ice_is_eswitch_mode_switchdev(vsi->back)) + return ice_eswitch_add_tc_fltr(vsi, fltr); + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (fltr->action.fltr_act == ICE_FWD_TO_QGRP) + return -EOPNOTSUPP; +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + if (fltr->flags == 
ICE_TC_FLWR_FLTR_FLAGS_DST_MAC || + fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_VLAN || + fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_DST_MAC_VLAN) + return ice_add_remove_tc_flower_dflt_fltr(vsi, fltr, true); +#ifdef HAVE_TC_SETUP_CLSFLOWER + return ice_add_tc_flower_adv_fltr(vsi, fltr); +#else + return -EOPNOTSUPP; +#endif /* HAVE_TC_SETUP_CLSFLOWER */ +} + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +/** + * ice_handle_tclass_action - Support directing to a traffic class + * @vsi: Pointer to VSI + * @cls_flower: Pointer to TC flower offload structure + * @fltr: Pointer to TC flower filter structure + * + * Support directing traffic to a traffic class + */ +static int +ice_handle_tclass_action(struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, + struct ice_tc_flower_fltr *fltr) +{ + int tc = tc_classid_to_hwtc(vsi->netdev, cls_flower->classid); + struct ice_vsi *main_vsi; + + if (tc < 0) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because specified destination is invalid"); + return -EINVAL; + } + if (!tc) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because of invalid destination"); + return -EINVAL; + } + + if (!(vsi->all_enatc & BIT(tc))) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because of non-existent destination"); + return -EINVAL; + } + + /* Redirect to a TC class or Queue Group */ + main_vsi = ice_get_main_vsi(vsi->back); + if (!main_vsi || !main_vsi->netdev) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because of invalid netdevice"); + return -EINVAL; + } + + if ((fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) && + (fltr->flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_SRC_MAC))) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because filter using tunnel key and inner MAC is unsupported combination"); + return -EOPNOTSUPP; + } + + /* For ADQ, the filter must include a dest MAC address, otherwise unwanted + * packets with unrelated MAC addresses get delivered to ADQ VSIs as long + * as the remaining filter criteria (such as dest IP address and dest/src + * L4 port) are satisfied. The following code handles: + * 1. For non-tunnel, if the user specified MAC addresses, use them (which + * means this code won't do anything) + * 2. For non-tunnel, if the user didn't specify a MAC address, add an + * implicit dest MAC equal to the lower netdev's active unicast MAC address + * 3. For tunnel, as of now the tc-flower classifier has no provision for + * the user to specify the outer DMAC, hence the driver implicitly adds an + * outer dest MAC equal to the lower netdev's active unicast MAC address. + */ + if (fltr->flags & ICE_TC_FLWR_FIELD_TENANT_ID) { + if (!(fltr->flags & ICE_TC_FLWR_FIELD_ENC_DST_MAC)) { + ether_addr_copy(fltr->outer_headers.l2_key.dst_mac, + main_vsi->netdev->dev_addr); + eth_broadcast_addr(fltr->outer_headers.l2_mask.dst_mac); + fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DST_MAC; + } + } else if (!(fltr->flags & ICE_TC_FLWR_FIELD_DST_MAC)) { + ether_addr_copy(fltr->outer_headers.l2_key.dst_mac, + main_vsi->netdev->dev_addr); + eth_broadcast_addr(fltr->outer_headers.l2_mask.dst_mac); + fltr->flags |= ICE_TC_FLWR_FIELD_DST_MAC; + } + + /* validate the specified dest MAC address; make sure it belongs either to + * the lower netdev or to one of the non-offloaded MACVLANs. Non-offloaded + * MACVLAN MAC addresses are added as unicast MAC filters destined to the + * main VSI.
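+ * ice_mac_fltr_exist() below performs that lookup on the main VSI and the
+ * ADQ filter is rejected when no matching legacy MAC filter is found.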
+ */ + if (!ice_mac_fltr_exist(&main_vsi->back->hw, + fltr->outer_headers.l2_key.dst_mac, + main_vsi->idx)) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because legacy MAC filter for specified destination doesn't exist"); + return -EINVAL; + } + + /* Make sure VLAN is already added to main VSI, before allowing ADQ to + * add a VLAN based filter such as MAC + VLAN + L4 port. + */ + if (fltr->flags & ICE_TC_FLWR_FIELD_VLAN) { + u16 vlan_id = be16_to_cpu(fltr->outer_headers.vlan_hdr.vlan_id); + + if (!ice_vlan_fltr_exist(&main_vsi->back->hw, vlan_id, + main_vsi->idx)) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unable to add filter because legacy VLAN filter for specified destination doesn't exist"); + return -EINVAL; + } + } + fltr->action.fltr_act = ICE_FWD_TO_VSI; + fltr->action.tc_class = tc; + + return 0; +} +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +/** + * ice_parse_tc_flower_actions - Parse the actions for a TC filter + * @vsi: Pointer to VSI + * @cls_flower: Pointer to TC flower offload structure + * @fltr: Pointer to TC flower filter structure + * + * Parse the actions for a TC filter + */ +static int +ice_parse_tc_flower_actions(struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, + struct ice_tc_flower_fltr *fltr) +{ +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + struct flow_rule *rule = flow_cls_offload_flow_rule(cls_flower); + struct flow_action *flow_action = &rule->action; + struct flow_action_entry *act; + int i; +#else + struct tcf_exts *exts = cls_flower->exts; + struct tc_action *tc_act; +#if defined(HAVE_TCF_EXTS_FOR_EACH_ACTION) + int i; +#else + struct tc_action *temp; + LIST_HEAD(tc_actions); +#endif +#endif /* HAVE_TC_FLOW_RULE_INFRASTRUCTURE */ + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO + if (cls_flower->classid) + return ice_handle_tclass_action(vsi, cls_flower, fltr); +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ + +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + if (!flow_action_has_entries(flow_action)) +#elif defined(HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV) + if (!tcf_exts_has_actions(exts)) +#else + if (tc_no_actions(exts)) +#endif + return -EINVAL; + +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + flow_action_for_each(i, act, flow_action) { +#elif defined(HAVE_TCF_EXTS_FOR_EACH_ACTION) + tcf_exts_for_each_action(i, tc_act, exts) { +#elif defined(HAVE_TCF_EXTS_TO_LIST) + tcf_exts_to_list(exts, &tc_actions); + + list_for_each_entry_safe(tc_act, temp, &tc_actions, list) { +#else + list_for_each_entry_safe(tc_act, temp, &(exts)->actions, list) { +#endif /* HAVE_TCF_EXTS_TO_LIST */ +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + if (ice_is_eswitch_mode_switchdev(vsi->back)) { + int err = ice_eswitch_tc_parse_action(fltr, act); + + if (err) + return err; + } +#else + if (ice_is_eswitch_mode_switchdev(vsi->back)) + return -EINVAL; +#endif /* HAVE_TC_FLOW_RULE_INFRASTRUCTURE */ + /* Allow only one rule per filter */ + + /* Drop action */ +#ifdef HAVE_TC_FLOW_RULE_INFRASTRUCTURE + if (act->id == FLOW_ACTION_DROP) { +#else + if (is_tcf_gact_shot(tc_act)) { +#endif + NL_SET_ERR_MSG_MOD(fltr->extack, + "Unsupported action DROP"); + return -EINVAL; + } + fltr->action.fltr_act = ICE_FWD_TO_VSI; + } + return 0; +} + +/** + * ice_del_tc_fltr - deletes a filter from HW table + * @vsi: Pointer to VSI + * @fltr: Pointer to struct ice_tc_flower_fltr + * + * This function deletes a filter from HW table and manages book-keeping + */ +static int ice_del_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) +{ + struct ice_pf *pf = vsi->back; + int err; + + if 
(fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_DST_MAC || + fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_VLAN || + fltr->flags == ICE_TC_FLWR_FLTR_FLAGS_DST_MAC_VLAN) { + err = ice_add_remove_tc_flower_dflt_fltr(vsi, fltr, false); + } else { + struct ice_rule_query_data rule_rem; + + rule_rem.rid = fltr->rid; + rule_rem.rule_id = fltr->rule_id; + rule_rem.vsi_handle = fltr->dest_id; + err = ice_rem_adv_rule_by_id(&pf->hw, &rule_rem); + } + + if (err) { + if (err == ICE_ERR_DOES_NOT_EXIST) { + NL_SET_ERR_MSG_MOD(fltr->extack, + "filter does not exist\n"); + return -ENOENT; + } + NL_SET_ERR_MSG_MOD(fltr->extack, + "Failed to delete TC flower filter"); + return -EIO; + } + + /* update advanced switch filter count for destination + * VSI if filter destination was VSI + */ + if (fltr->dest_vsi) { + if (fltr->dest_vsi->type == ICE_VSI_CHNL) { + struct ice_channel *ch = fltr->dest_vsi->ch; + + fltr->dest_vsi->num_chnl_fltr--; + + /* reset filter type for channel if channel filter + * count reaches zero + */ + if (!fltr->dest_vsi->num_chnl_fltr && ch) + ch->fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; + + /* keeps track of channel filters for PF VSI */ + if (vsi->type == ICE_VSI_PF && + (fltr->flags & (ICE_TC_FLWR_FIELD_DST_MAC | + ICE_TC_FLWR_FIELD_ENC_DST_MAC))) + pf->num_dmac_chnl_fltrs--; + } + } + return 0; +} + +/** + * ice_add_tc_fltr - adds a TC flower filter + * @netdev: Pointer to netdev + * @vsi: Pointer to VSI + * @f: Pointer to flower offload structure + * @__fltr: Pointer to struct ice_tc_flower_fltr + * + * This function parses tc-flower input fields, parses action, + * and adds a filter. + */ +#ifdef HAVE_TC_INDIR_BLOCK +static int +ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *f, + struct ice_tc_flower_fltr **__fltr) +#else +static int +ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, + struct tc_cls_flower_offload *f, + struct ice_tc_flower_fltr **__fltr) +#endif /* HAVE_TC_INDIR_BLOCK */ +{ + struct ice_tc_flower_fltr *fltr; + int err; + + /* by default, set output to be INVALID */ + *__fltr = NULL; + + fltr = kzalloc(sizeof(*fltr), GFP_KERNEL); + if (!fltr) + return -ENOMEM; + + fltr->cookie = f->cookie; +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + fltr->extack = f->common.extack; +#endif + fltr->src_vsi = vsi; + INIT_HLIST_NODE(&fltr->tc_flower_node); + + err = ice_parse_cls_flower(netdev, vsi, f, fltr); + if (err < 0) + goto err; + + err = ice_parse_tc_flower_actions(vsi, f, fltr); + if (err < 0) + goto err; + + err = ice_add_switch_fltr(vsi, fltr); + if (err < 0) + goto err; + + /* return the newly created filter */ + *__fltr = fltr; + + return 0; +err: + kfree(fltr); + return err; +} + +/** + * ice_find_tc_flower_fltr - Find the TC flower filter in the list + * @pf: Pointer to PF + * @cookie: filter specific cookie + */ +static struct ice_tc_flower_fltr * +ice_find_tc_flower_fltr(struct ice_pf *pf, unsigned long cookie) +{ + struct ice_tc_flower_fltr *fltr; + + hlist_for_each_entry(fltr, &pf->tc_flower_fltr_list, tc_flower_node) + if (cookie == fltr->cookie) + return fltr; + + return NULL; +} + +/** + * ice_add_cls_flower - add TC flower filters + * @netdev: Pointer to filter device + * @vsi: Pointer to VSI + * @cls_flower: Pointer to flower offload structure + */ +int +#ifdef HAVE_TC_INDIR_BLOCK +ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower) +#else +ice_add_cls_flower(struct net_device __always_unused *netdev, + struct ice_vsi *vsi, + struct tc_cls_flower_offload 
*cls_flower) +#endif /* HAVE_TC_INDIR_BLOCK */ +{ +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + struct netlink_ext_ack *extack = cls_flower->common.extack; +#endif /* HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK */ + struct net_device *vsi_netdev = vsi->netdev; + struct ice_tc_flower_fltr *fltr; + struct ice_pf *pf = vsi->back; + int err = 0; + + if (ice_is_reset_in_progress(pf->state)) + return -EBUSY; + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) + return -EINVAL; + +#ifdef HAVE_TC_FLOW_INDIR_DEV + if ((ice_tc_tun_get_type(netdev, NULL) == TNL_LAST) && + ice_is_port_repr_netdev(netdev)) + vsi_netdev = netdev; +#else + if (ice_is_port_repr_netdev(netdev)) + vsi_netdev = netdev; +#endif /* HAVE_TC_FLOW_INDIR_DEV */ + + if (!(vsi_netdev->features & NETIF_F_HW_TC) && + !test_bit(ICE_FLAG_CLS_FLOWER, pf->flags)) { +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK +#ifdef HAVE_TC_INDIR_BLOCK + /* Based on TC indirect notifications from kernel, all ice + * devices get an instance of rule from higher level device. + * Avoid triggering explicit error in this case. + */ + if (netdev == vsi_netdev) + NL_SET_ERR_MSG_MOD(extack, + "can't apply TC flower filters, turn ON hw-tc-offload and try again"); +#else + NL_SET_ERR_MSG_MOD(extack, + "can't apply TC flower filters, turn ON hw-tc-offload and try again"); +#endif /* HAVE_TC_INDIR_BLOCK */ +#else /* !HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK */ + netdev_err(vsi_netdev, + "can't apply TC flower filters, turn ON hw-tc-offload and try again\n"); +#endif /* HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK */ + return -EINVAL; + } + + /* avoid duplicate entries, if exists - return error */ + fltr = ice_find_tc_flower_fltr(pf, cls_flower->cookie); + if (fltr) { +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "filter cookie already exists, ignoring"); +#else + netdev_warn(vsi_netdev, + "filter cookie %lx already exists, ignoring\n", + fltr->cookie); +#endif /* HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK */ + return -EEXIST; + } + + /* prep and add tc-flower filter in HW */ + err = ice_add_tc_fltr(netdev, vsi, cls_flower, &fltr); + if (err) + return err; + + /* add filter into an ordered list */ + hlist_add_head(&fltr->tc_flower_node, &pf->tc_flower_fltr_list); + return 0; +} + +/** + * ice_del_cls_flower - delete TC flower filters + * @vsi: Pointer to VSI + * @cls_flower: Pointer to struct flow_cls_offload + */ +int +ice_del_cls_flower(struct ice_vsi *vsi, struct flow_cls_offload *cls_flower) +{ + struct ice_tc_flower_fltr *fltr; + struct ice_pf *pf = vsi->back; + int err; + + /* find filter */ + fltr = ice_find_tc_flower_fltr(pf, cls_flower->cookie); + if (!fltr) { + /* when egress qdisc is deleted, driver deletes all channel + * filters so that there are no stale filters left in + * HW (as per design) because deleting egress qdisc means, + * deleting all channel VSIs, hence no reason to keep filters + * destined to those channel VSIs. But software (OS) still + * sees those filters being offloaded in HW. In this situation + * user can try to delete those filters or OS will try to + * delete them one by one when ingress qdisc is deleted from + * given interace (ethX) and driver won't find those filters in + * its list of filters, hence don't return error. 
Return an + * error only when there are still active channel(s) and the requested + * filter can't be found and/or deleting the filter failed; + * otherwise return success + */ + /* means no channels are configured or channels are deleted and + * channel filter list is empty + */ + if (!test_bit(ICE_FLAG_TC_MQPRIO, pf->flags) && + hlist_empty(&pf->tc_flower_fltr_list)) + return 0; + +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + NL_SET_ERR_MSG_MOD(cls_flower->common.extack, + "failed to delete TC flower filter because unable to find it"); +#else + dev_err(ice_pf_to_dev(pf), + "failed to delete TC flower filter because unable to find it\n"); +#endif + return -EINVAL; + } + +#ifdef HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK + fltr->extack = cls_flower->common.extack; +#endif + /* delete filter from HW */ + err = ice_del_tc_fltr(vsi, fltr); + if (err) + return err; + + /* delete filter from an ordered list */ + hlist_del(&fltr->tc_flower_node); + + /* free the filter node */ + kfree(fltr); + + return 0; +} + +/** + * ice_replay_tc_fltrs - replay tc filters + * @pf: pointer to PF struct + */ +void ice_replay_tc_fltrs(struct ice_pf *pf) +{ + struct ice_tc_flower_fltr *fltr; + struct hlist_node *node; + + hlist_for_each_entry_safe(fltr, node, + &pf->tc_flower_fltr_list, + tc_flower_node) { + fltr->extack = NULL; + ice_add_switch_fltr(fltr->src_vsi, fltr); + } +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.h b/drivers/net/ethernet/intel/ice/ice_tc_lib.h new file mode 100644 index 0000000000000000000000000000000000000000..6a6f4566e1fbacc0d0caca4d33ea297fca45ce99 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_TC_LIB_H_ +#define _ICE_TC_LIB_H_ + +#ifdef HAVE_TC_SETUP_CLSFLOWER +#define ICE_TC_FLWR_FIELD_DST_MAC BIT(0) +#define ICE_TC_FLWR_FIELD_SRC_MAC BIT(1) +#define ICE_TC_FLWR_FIELD_VLAN BIT(2) +#define ICE_TC_FLWR_FIELD_DEST_IPV4 BIT(3) +#define ICE_TC_FLWR_FIELD_SRC_IPV4 BIT(4) +#define ICE_TC_FLWR_FIELD_DEST_IPV6 BIT(5) +#define ICE_TC_FLWR_FIELD_SRC_IPV6 BIT(6) +#define ICE_TC_FLWR_FIELD_DEST_L4_PORT BIT(7) +#define ICE_TC_FLWR_FIELD_SRC_L4_PORT BIT(8) +#define ICE_TC_FLWR_FIELD_TENANT_ID BIT(9) +#define ICE_TC_FLWR_FIELD_ENC_DEST_IPV4 BIT(10) +#define ICE_TC_FLWR_FIELD_ENC_SRC_IPV4 BIT(11) +#define ICE_TC_FLWR_FIELD_ENC_DEST_IPV6 BIT(12) +#define ICE_TC_FLWR_FIELD_ENC_SRC_IPV6 BIT(13) +#define ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT BIT(14) +#define ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT BIT(15) +#define ICE_TC_FLWR_FIELD_ENC_DST_MAC BIT(16) +#define ICE_TC_FLWR_FIELD_ETH_TYPE_ID BIT(17) + +/* TC flower supported filter match */ +#define ICE_TC_FLWR_FLTR_FLAGS_DST_MAC ICE_TC_FLWR_FIELD_DST_MAC +#define ICE_TC_FLWR_FLTR_FLAGS_VLAN ICE_TC_FLWR_FIELD_VLAN +#define ICE_TC_FLWR_FLTR_FLAGS_DST_MAC_VLAN (ICE_TC_FLWR_FIELD_DST_MAC | \ + ICE_TC_FLWR_FIELD_VLAN) +#define ICE_TC_FLWR_FLTR_FLAGS_IPV4_DST_PORT (ICE_TC_FLWR_FIELD_DEST_IPV4 | \ + ICE_TC_FLWR_FIELD_DEST_L4_PORT) +#define ICE_TC_FLWR_FLTR_FLAGS_IPV4_SRC_PORT (ICE_TC_FLWR_FIELD_DEST_IPV4 | \ + ICE_TC_FLWR_FIELD_SRC_L4_PORT) +#define ICE_TC_FLWR_FLTR_FLAGS_IPV6_DST_PORT (ICE_TC_FLWR_FIELD_DEST_IPV6 | \ + ICE_TC_FLWR_FIELD_DEST_L4_PORT) +#define ICE_TC_FLWR_FLTR_FLAGS_IPV6_SRC_PORT (ICE_TC_FLWR_FIELD_DEST_IPV6 | \ + ICE_TC_FLWR_FIELD_SRC_L4_PORT) + +#define ICE_TC_FLOWER_MASK_32 0xFFFFFFFF +#define ICE_TC_FLOWER_MASK_16 0xFFFF +#define ICE_TC_FLOWER_VNI_MAX 0xFFFFFFU + +#ifdef HAVE_TC_INDIR_BLOCK +struct ice_indr_block_priv { + struct net_device *netdev; + struct ice_netdev_priv *np; + struct list_head list; +}; +#endif /* HAVE_TC_INDIR_BLOCK */ + +struct ice_tc_flower_action { + u32 tc_class; + enum ice_sw_fwd_act_type fltr_act; +}; + +struct ice_tc_vlan_hdr { + __be16 vlan_id; /* Only last 12 bits valid */ +#ifdef HAVE_FLOW_DISSECTOR_VLAN_PRIO + u16 vlan_prio; /* Only last 3 bits valid (valid values: 0..7) */ +#endif +}; + +struct ice_tc_l2_hdr { + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + __be16 n_proto; /* Ethernet Protocol */ +}; + +struct ice_tc_l3_hdr { + u8 ip_proto; /* IPPROTO value */ + union { + struct { + struct in_addr dst_ip; + struct in_addr src_ip; + } v4; + struct { + struct in6_addr dst_ip6; + struct in6_addr src_ip6; + } v6; + } ip; +#define dst_ipv6 ip.v6.dst_ip6.s6_addr32 +#define dst_ipv6_addr ip.v6.dst_ip6.s6_addr +#define src_ipv6 ip.v6.src_ip6.s6_addr32 +#define src_ipv6_addr ip.v6.src_ip6.s6_addr +#define dst_ipv4 ip.v4.dst_ip.s_addr +#define src_ipv4 ip.v4.src_ip.s_addr + + u8 tos; + u8 ttl; +}; + +struct ice_tc_l4_hdr { + __be16 dst_port; + __be16 src_port; +}; + +struct ice_tc_flower_lyr_2_4_hdrs { + /* L2 layer fields with their mask */ + struct ice_tc_l2_hdr l2_key; + struct ice_tc_l2_hdr l2_mask; + struct ice_tc_vlan_hdr vlan_hdr; + /* L3 (IPv4[6]) layer fields with their mask */ + struct ice_tc_l3_hdr l3_key; + struct ice_tc_l3_hdr l3_mask; + + /* L4 layer fields with their mask */ + struct ice_tc_l4_hdr l4_key; + struct ice_tc_l4_hdr l4_mask; +}; + +enum ice_eswitch_fltr_direction { + ICE_ESWITCH_FLTR_INGRESS, + ICE_ESWITCH_FLTR_EGRESS, +}; + +struct ice_tc_flower_fltr { + struct hlist_node tc_flower_node; + + /* cookie becomes filter_rule_id if rule is added successfully */ + unsigned long 
cookie; + + /* add_adv_rule returns information like recipe ID, rule_id. Store + * those values since they are needed to remove advanced rule + */ + u16 rid; + u16 rule_id; + /* this could be queue/vsi_idx (sw handle)/queue_group, depending upon + * destination type + */ + u16 dest_id; + /* if dest_id is vsi_idx, then need to store destination VSI ptr */ + struct ice_vsi *dest_vsi; + /* direction of fltr for eswitch use case */ + enum ice_eswitch_fltr_direction direction; + + /* Parsed TC flower configuration params */ + struct ice_tc_flower_lyr_2_4_hdrs outer_headers; + struct ice_tc_flower_lyr_2_4_hdrs inner_headers; + struct ice_vsi *src_vsi; + __be32 tenant_id; + u32 flags; +#define ICE_TC_FLWR_TNL_TYPE_NONE 0xff + u8 tunnel_type; + struct ice_tc_flower_action action; + + /* cache ptr which is used wherever needed to communicate netlink + * messages + */ + struct netlink_ext_ack *extack; +}; + +/** + * ice_is_chnl_fltr - is this a valid channel filter + * @f: Pointer to tc-flower filter + * + * Criteria to determine of given filter is valid channel filter + * or not is based on its "destination". If destination is hw_tc (aka tc_class) + * and it is non-zero, then it is valid channel (aka ADQ) filter + */ +static inline bool ice_is_chnl_fltr(struct ice_tc_flower_fltr *f) +{ + return !!f->action.tc_class; +} + +/** + * ice_chnl_dmac_fltr_cnt - DMAC based CHNL filter count + * @pf: Pointer to PF + */ +static inline int ice_chnl_dmac_fltr_cnt(struct ice_pf *pf) +{ + return pf->num_dmac_chnl_fltrs; +} +int +ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, + struct ice_tc_flower_fltr *tc_fltr); +#if defined(HAVE_TC_FLOWER_ENC) && defined(HAVE_TC_INDIR_BLOCK) +int +ice_tc_tun_get_type(struct net_device *tunnel_dev, + struct flow_rule *rule); +#endif /* HAVE_TC_FLOWER_ENC && HAVE_TC_INDIR_BLOCK */ +int +#ifdef HAVE_TC_INDIR_BLOCK +ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower); +#else +ice_add_cls_flower(struct net_device __always_unused *netdev, + struct ice_vsi *vsi, + struct tc_cls_flower_offload *cls_flower); +#endif /* HAVE_TC_INDIR_BLOCK */ +int +ice_del_cls_flower(struct ice_vsi *vsi, struct flow_cls_offload *cls_flower); +void ice_replay_tc_fltrs(struct ice_pf *pf); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +#endif /* _ICE_TC_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_trace.h b/drivers/net/ethernet/intel/ice/ice_trace.h new file mode 100644 index 0000000000000000000000000000000000000000..3e50808fe490119b314d2a94566a1d1704c23c9b --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_trace.h @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#if !IS_ENABLED(CONFIG_TRACEPOINTS) || defined(__CHECKER__) +#if !defined(_ICE_TRACE_H_) +#define _ICE_TRACE_H_ +/* If the Linux kernel tracepoints are not available then the ice_trace* + * macros become nops. + */ + +#define ice_trace(trace_name, args...) +#define ice_trace_enabled(trace_name) (0) +#endif /* !defined(_ICE_TRACE_H_) */ +#else /* CONFIG_TRACEPOINTS */ +/* + * Modeled on trace-events-sample.h + */ + +/* + * The trace subsystem name for ice will be "ice". + * + * This file is named ice_trace.h. + * + * Since this include file's name is different from the trace + * subsystem name, we'll have to define TRACE_INCLUDE_FILE at the end + * of this file. 
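+ * (TRACE_INCLUDE_PATH and TRACE_INCLUDE_FILE are defined just before the
+ * final trace include at the bottom of this file.)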
+ */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ice + +/* + * See trace-events-sample.h for a detailed description of why this + * guard clause is different from most normal include files. + */ +#if !defined(_ICE_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _ICE_TRACE_H_ + +#include "ice_txrx.h" +#include + +/** + * ice_trace() enables trace points + * like: + * + * trace_ice_example(args...) + * + * ... as: + * + * ice_trace(example, args...) + * + * ... to resolve to the PF version of the tracepoint without + * ifdefs, and to allow tracepoints to be disabled entirely at build + * time. + * + * Trace point should always be referred to in the driver via this + * macro. + * + * Similarly, ice_trace_enabled(trace_name) wraps references to + * trace_ice__enabled() functions. + * @trace_name: name of tracepoint + */ +#define _ICE_TRACE_NAME(trace_name) (trace_##ice##_##trace_name) +#define ICE_TRACE_NAME(trace_name) _ICE_TRACE_NAME(trace_name) + +#define ice_trace(trace_name, args...) ICE_TRACE_NAME(trace_name)(args) + +#define ice_trace_enabled(trace_name) ICE_TRACE_NAME(trace_name##_enabled)() + +/* + * This is for events common to PF. Corresponding versions will be named + * trace_ice_*. The ice_trace() macro above will select the right trace point + * name for the driver. + */ + +/* Begin tracepoints */ + +/* Global tracepoints */ +DECLARE_EVENT_CLASS(ice_print_msg, + TP_PROTO(char *msg), + + TP_ARGS(msg), + + TP_STRUCT__entry(__string(msg, msg)), + + TP_fast_assign(__assign_str(msg, msg);), + + TP_printk("%s", __get_str(msg)) +); + +#define DEFINE_PRINT_MSG_EVENT(name) \ +DEFINE_EVENT(ice_print_msg, name, \ + TP_PROTO(char *msg), \ + TP_ARGS(msg)) + +DEFINE_PRINT_MSG_EVENT(ice_print_err); +DEFINE_PRINT_MSG_EVENT(ice_print_warn); +DEFINE_PRINT_MSG_EVENT(ice_print_adminq_msg); +DEFINE_PRINT_MSG_EVENT(ice_print_adminq_desc); +DEFINE_PRINT_MSG_EVENT(ice_print_netdev_err); +DEFINE_PRINT_MSG_EVENT(ice_print_netdev_warn); +DEFINE_PRINT_MSG_EVENT(ice_print_netdev_info); +DEFINE_PRINT_MSG_EVENT(ice_print_peer_err); + +/* Events related to DIM, q_vectors and ring containers */ +DECLARE_EVENT_CLASS(ice_rx_dim_template, + TP_PROTO(struct ice_q_vector *q_vector, struct dim *dim), + TP_ARGS(q_vector, dim), + TP_STRUCT__entry(__field(struct ice_q_vector *, q_vector) + __field(struct dim *, dim) + __string(devname, q_vector->rx.ring->netdev->name)), + + TP_fast_assign(__entry->q_vector = q_vector; + __entry->dim = dim; + __assign_str(devname, q_vector->rx.ring->netdev->name);), + + TP_printk("netdev: %s Rx-Q: %d dim-state: %d dim-profile: %d dim-tune: %d dim-st-right: %d dim-st-left: %d dim-tired: %d", + __get_str(devname), + __entry->q_vector->rx.ring->q_index, + __entry->dim->state, + __entry->dim->profile_ix, + __entry->dim->tune_state, + __entry->dim->steps_right, + __entry->dim->steps_left, + __entry->dim->tired) +); + +DEFINE_EVENT(ice_rx_dim_template, ice_rx_dim_work, + TP_PROTO(struct ice_q_vector *q_vector, struct dim *dim), + TP_ARGS(q_vector, dim) +); + +DECLARE_EVENT_CLASS(ice_tx_dim_template, + TP_PROTO(struct ice_q_vector *q_vector, struct dim *dim), + TP_ARGS(q_vector, dim), + TP_STRUCT__entry(__field(struct ice_q_vector *, q_vector) + __field(struct dim *, dim) + __string(devname, q_vector->tx.ring->netdev->name)), + + TP_fast_assign(__entry->q_vector = q_vector; + __entry->dim = dim; + __assign_str(devname, q_vector->tx.ring->netdev->name);), + + TP_printk("netdev: %s Tx-Q: %d dim-state: %d dim-profile: %d dim-tune: %d dim-st-right: %d dim-st-left: %d dim-tired: %d", + 
__get_str(devname), + __entry->q_vector->rx.ring->q_index, + __entry->dim->state, + __entry->dim->profile_ix, + __entry->dim->tune_state, + __entry->dim->steps_right, + __entry->dim->steps_left, + __entry->dim->tired) +); + +DEFINE_EVENT(ice_tx_dim_template, ice_tx_dim_work, + TP_PROTO(struct ice_q_vector *q_vector, struct dim *dim), + TP_ARGS(q_vector, dim) +); + +/* Events related to a vsi & ring */ +DECLARE_EVENT_CLASS(ice_tx_template, + TP_PROTO(struct ice_ring *ring, struct ice_tx_desc *desc, + struct ice_tx_buf *buf), + + TP_ARGS(ring, desc, buf), + TP_STRUCT__entry(__field(void *, ring) + __field(void *, desc) + __field(void *, buf) + __string(devname, ring->netdev->name)), + + TP_fast_assign(__entry->ring = ring; + __entry->desc = desc; + __entry->buf = buf; + __assign_str(devname, ring->netdev->name);), + + TP_printk("netdev: %s ring: %pK desc: %pK buf %pK", __get_str(devname), + __entry->ring, __entry->desc, __entry->buf) +); + +#define DEFINE_TX_TEMPLATE_OP_EVENT(name) \ +DEFINE_EVENT(ice_tx_template, name, \ + TP_PROTO(struct ice_ring *ring, \ + struct ice_tx_desc *desc, \ + struct ice_tx_buf *buf), \ + TP_ARGS(ring, desc, buf)) + +DEFINE_TX_TEMPLATE_OP_EVENT(ice_clean_tx_irq); +DEFINE_TX_TEMPLATE_OP_EVENT(ice_clean_tx_irq_unmap); +DEFINE_TX_TEMPLATE_OP_EVENT(ice_clean_tx_irq_unmap_eop); + +DECLARE_EVENT_CLASS(ice_rx_template, + TP_PROTO(struct ice_ring *ring, union ice_32b_rx_flex_desc *desc), + + TP_ARGS(ring, desc), + + TP_STRUCT__entry(__field(void *, ring) + __field(void *, desc) + __string(devname, ring->netdev->name)), + + TP_fast_assign(__entry->ring = ring; + __entry->desc = desc; + __assign_str(devname, ring->netdev->name);), + + TP_printk("netdev: %s ring: %pK desc: %pK", __get_str(devname), + __entry->ring, __entry->desc) +); +DEFINE_EVENT(ice_rx_template, ice_clean_rx_irq, + TP_PROTO(struct ice_ring *ring, union ice_32b_rx_flex_desc *desc), + TP_ARGS(ring, desc) +); + +DECLARE_EVENT_CLASS(ice_rx_indicate_template, + TP_PROTO(struct ice_ring *ring, union ice_32b_rx_flex_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb), + + TP_STRUCT__entry(__field(void *, ring) + __field(void *, desc) + __field(void *, skb) + __string(devname, ring->netdev->name)), + + TP_fast_assign(__entry->ring = ring; + __entry->desc = desc; + __entry->skb = skb; + __assign_str(devname, ring->netdev->name);), + + TP_printk("netdev: %s ring: %pK desc: %pK skb %pK", __get_str(devname), + __entry->ring, __entry->desc, __entry->skb) +); + +DEFINE_EVENT(ice_rx_indicate_template, ice_clean_rx_irq_indicate, + TP_PROTO(struct ice_ring *ring, union ice_32b_rx_flex_desc *desc, + struct sk_buff *skb), + TP_ARGS(ring, desc, skb) +); + +DECLARE_EVENT_CLASS(ice_xmit_template, + TP_PROTO(struct ice_ring *ring, struct sk_buff *skb), + + TP_ARGS(ring, skb), + + TP_STRUCT__entry(__field(void *, ring) + __field(void *, skb) + __string(devname, ring->netdev->name)), + + TP_fast_assign(__entry->ring = ring; + __entry->skb = skb; + __assign_str(devname, ring->netdev->name);), + + TP_printk("netdev: %s skb: %pK ring: %pK", __get_str(devname), + __entry->skb, __entry->ring) +); + +#define DEFINE_XMIT_TEMPLATE_OP_EVENT(name) \ +DEFINE_EVENT(ice_xmit_template, name, \ + TP_PROTO(struct ice_ring *ring, struct sk_buff *skb), \ + TP_ARGS(ring, skb)) + +DEFINE_XMIT_TEMPLATE_OP_EVENT(ice_xmit_frame_ring); +DEFINE_XMIT_TEMPLATE_OP_EVENT(ice_xmit_frame_ring_drop); + +/* End tracepoints */ + +#endif /* _ICE_TRACE_H_ */ +/* This must be outside ifdef _ICE_TRACE_H */ + +/* This trace include file is not located in 
the .../include/trace + * with the kernel tracepoint definitions, because we're a loadable + * module. + */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE ice_trace +#include +#endif /* CONFIG_TRACEPOINTS */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 33dd103035dcd4d8c5d18a9f5c5148755c7027f8..c3c2746fe977773e72ef62ae2296030e9169d5ef 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1,15 +1,116 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ /* The driver transmit and receive code */ #include #include +#include "ice_txrx_lib.h" +#include "ice_lib.h" #include "ice.h" #include "ice_dcb_lib.h" +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#include "ice_xsk.h" +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#include +#ifdef HAVE_XDP_BUFF_IN_XDP_H +#include +#else +#include +#endif /* HAVE_XDP_BUFF_IN_XDP_H */ +#endif /* HAVE_XDP_SUPPORT */ +#include "ice_eswitch.h" +#include #define ICE_RX_HDR_SIZE 256 + +#define FDIR_DESC_RXDID 0x40 +#define ICE_FDIR_CLEAN_DELAY 10 + +/** + * ice_prgm_fdir_fltr - Program a Flow Director filter + * @vsi: VSI to send dummy packet + * @fdir_desc: flow director descriptor + * @raw_packet: allocated buffer for flow director + */ +int +ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, + u8 *raw_packet) +{ + struct ice_tx_buf *tx_buf, *first; + struct ice_fltr_desc *f_desc; + struct ice_tx_desc *tx_desc; + struct ice_ring *tx_ring; + struct device *dev; + dma_addr_t dma; + u32 td_cmd; + u16 i; + + /* VSI and Tx ring */ + if (!vsi) + return -ENOENT; + tx_ring = vsi->tx_rings[0]; + if (!tx_ring || !tx_ring->desc) + return -ENOENT; + dev = tx_ring->dev; + + /* we are using two descriptors to add/del a filter and we can wait */ + for (i = ICE_FDIR_CLEAN_DELAY; ICE_DESC_UNUSED(tx_ring) < 2; i--) { + if (!i) + return -EAGAIN; + msleep_interruptible(1); + } + + dma = dma_map_single(dev, raw_packet, ICE_FDIR_MAX_RAW_PKT_SIZE, + DMA_TO_DEVICE); + + if (dma_mapping_error(dev, dma)) + return -EINVAL; + + /* grab the next descriptor */ + i = tx_ring->next_to_use; + first = &tx_ring->tx_buf[i]; + f_desc = ICE_TX_FDIRDESC(tx_ring, i); + memcpy(f_desc, fdir_desc, sizeof(*f_desc)); + + i++; + i = (i < tx_ring->count) ? i : 0; + + tx_desc = ICE_TX_DESC(tx_ring, i); + tx_buf = &tx_ring->tx_buf[i]; + + i++; + tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; + + memset(tx_buf, 0, sizeof(*tx_buf)); + dma_unmap_len_set(tx_buf, len, ICE_FDIR_MAX_RAW_PKT_SIZE); + dma_unmap_addr_set(tx_buf, dma, dma); + + tx_desc->buf_addr = cpu_to_le64(dma); + td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY | + ICE_TX_DESC_CMD_RE; + + tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT; + tx_buf->raw_buf = (void *)raw_packet; + + tx_desc->cmd_type_offset_bsz = + ice_build_ctob(td_cmd, 0, ICE_FDIR_MAX_RAW_PKT_SIZE, 0); + + /* Force memory write to complete before letting h/w know + * there are new descriptors to fetch. 
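+ * (Only applicable for weak-ordered memory model archs; the wmb() below
+ * orders these descriptor writes before the tail update.)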
+ */ + wmb(); + + /* mark the data descriptor to be watched */ + first->next_to_watch = tx_desc; + + writel(tx_ring->next_to_use, tx_ring->tail); + + return 0; +} + /** * ice_unmap_and_free_tx_buf - Release a Tx buffer * @ring: the ring that owns the buffer @@ -18,13 +119,27 @@ static void ice_unmap_and_free_tx_buf(struct ice_ring *ring, struct ice_tx_buf *tx_buf) { + struct ice_vsi *vsi = ring->vsi; + if (tx_buf->skb) { - dev_kfree_skb_any(tx_buf->skb); + if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) + devm_kfree(ring->dev, tx_buf->raw_buf); +#ifdef HAVE_XDP_SUPPORT + else if (ice_ring_is_xdp(ring)) + page_frag_free(tx_buf->raw_buf); +#endif /* HAVE_XDP_SUPPORT */ + else + dev_kfree_skb_any(tx_buf->skb); if (dma_unmap_len(tx_buf, len)) dma_unmap_single(ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + if (unlikely(tx_buf->tx_flags & ICE_TX_FLAGS_TSYN)) { + dev_kfree_skb_any(vsi->ptp_tx_skb[tx_buf->ptp_ts_idx]); + vsi->ptp_tx_skb[tx_buf->ptp_ts_idx] = NULL; + tx_buf->ptp_ts_idx = -1; + } } else if (dma_unmap_len(tx_buf, len)) { dma_unmap_page(ring->dev, dma_unmap_addr(tx_buf, dma), @@ -51,6 +166,13 @@ void ice_clean_tx_ring(struct ice_ring *tx_ring) { u16 i; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + if (ice_ring_is_xdp(tx_ring) && tx_ring->xsk_pool) { + ice_xsk_clean_xdp_ring(tx_ring); + goto tx_skip_free; + } + +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ /* ring already cleared, nothing to do */ if (!tx_ring->tx_buf) return; @@ -59,6 +181,9 @@ void ice_clean_tx_ring(struct ice_ring *tx_ring) for (i = 0; i < tx_ring->count; i++) ice_unmap_and_free_tx_buf(tx_ring, &tx_ring->tx_buf[i]); +#ifdef HAVE_AF_XDP_ZC_SUPPORT +tx_skip_free: +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ memset(tx_ring->tx_buf, 0, sizeof(*tx_ring->tx_buf) * tx_ring->count); /* Zero out the descriptor ring */ @@ -124,6 +249,7 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) smp_rmb(); /* prevent any other reads prior to eop_desc */ + ice_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); /* if the descriptor isn't done, no work yet to do */ if (!(eop_desc->cmd_type_offset_bsz & cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) @@ -136,8 +262,16 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) total_bytes += tx_buf->bytecount; total_pkts += tx_buf->gso_segs; +#ifdef HAVE_XDP_SUPPORT + if (ice_ring_is_xdp(tx_ring)) + page_frag_free(tx_buf->raw_buf); + else + /* free the skb */ + napi_consume_skb(tx_buf->skb, napi_budget); +#else /* free the skb */ napi_consume_skb(tx_buf->skb, napi_budget); +#endif /* HAVE_XDP_SUPPORT */ /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -151,6 +285,7 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) /* unmap remaining buffers */ while (tx_desc != eop_desc) { + ice_trace(clean_tx_irq_unmap, tx_ring, tx_desc, tx_buf); tx_buf++; tx_desc++; i++; @@ -169,6 +304,7 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) dma_unmap_len_set(tx_buf, len, 0); } } + ice_trace(clean_tx_irq_unmap_eop, tx_ring, tx_desc, tx_buf); /* move us one more past the eop_desc for start of next pkt */ tx_buf++; @@ -188,12 +324,13 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) i += tx_ring->count; tx_ring->next_to_clean = i; - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->stats.bytes += total_bytes; - tx_ring->stats.pkts += total_pkts; - u64_stats_update_end(&tx_ring->syncp); - tx_ring->q_vector->tx.total_bytes += total_bytes; - tx_ring->q_vector->tx.total_pkts += total_pkts; 
+ + ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes); + +#ifdef HAVE_XDP_SUPPORT + if (ice_ring_is_xdp(tx_ring)) + return !!budget; +#endif /* HAVE_XDP_SUPPORT */ netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts, total_bytes); @@ -207,7 +344,7 @@ static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) smp_mb(); if (__netif_subqueue_stopped(tx_ring->netdev, tx_ring->q_index) && - !test_bit(__ICE_DOWN, vsi->state)) { + !test_bit(ICE_VSI_DOWN, vsi->state)) { netif_wake_subqueue(tx_ring->netdev, tx_ring->q_index); ++tx_ring->tx_stats.restart_q; @@ -273,6 +410,13 @@ void ice_clean_rx_ring(struct ice_ring *rx_ring) if (!rx_ring->rx_buf) return; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + if (rx_ring->xsk_pool) { + ice_xsk_clean_rx_ring(rx_ring); + goto rx_skip_free; + } + +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; @@ -289,17 +433,26 @@ void ice_clean_rx_ring(struct ice_ring *rx_ring) */ dma_sync_single_range_for_cpu(dev, rx_buf->dma, rx_buf->page_offset, - ICE_RXBUF_2048, DMA_FROM_DEVICE); + rx_ring->rx_buf_len, + DMA_FROM_DEVICE); +#ifndef HAVE_STRUCT_DMA_ATTRS /* free resources associated with mapping */ - dma_unmap_page_attrs(dev, rx_buf->dma, PAGE_SIZE, + dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); +#else + dma_unmap_page(dev, rx_buf->dma, ice_rx_pg_size(rx_ring), + DMA_FROM_DEVICE); +#endif __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); rx_buf->page = NULL; rx_buf->page_offset = 0; } +#ifdef HAVE_AF_XDP_ZC_SUPPORT +rx_skip_free: +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ memset(rx_ring->rx_buf, 0, sizeof(*rx_ring->rx_buf) * rx_ring->count); /* Zero out the descriptor ring */ @@ -319,6 +472,14 @@ void ice_clean_rx_ring(struct ice_ring *rx_ring) void ice_free_rx_ring(struct ice_ring *rx_ring) { ice_clean_rx_ring(rx_ring); +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_XDP_BUFF_RXQ + if (rx_ring->vsi->type == ICE_VSI_PF) + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); +#endif /* HAVE_XDP_BUFF_RXQ */ + rx_ring->xdp_prog = NULL; +#endif /* HAVE_XDP_SUPPORT */ devm_kfree(rx_ring->dev, rx_ring->rx_buf); rx_ring->rx_buf = NULL; @@ -363,6 +524,19 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring) rx_ring->next_to_use = 0; rx_ring->next_to_clean = 0; + +#ifdef HAVE_XDP_SUPPORT + if (ice_is_xdp_ena_vsi(rx_ring->vsi)) + WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog); + +#ifdef HAVE_XDP_BUFF_RXQ + if (rx_ring->vsi->type == ICE_VSI_PF && + !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, + rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) + goto err; +#endif /* HAVE_XDP_BUFF_RXQ */ +#endif /* HAVE_XDP_SUPPORT */ return 0; err: @@ -372,35 +546,206 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring) } /** - * ice_release_rx_desc - Store the new tail and head values - * @rx_ring: ring to bump - * @val: new head index + * ice_rx_offset - Return expected offset into page to access data + * @rx_ring: Ring we are requesting offset of + * + * Returns the offset value for ring into the data buffer. 
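+ * (ICE_SKB_PAD when the ring uses build_skb, XDP_PACKET_HEADROOM when an
+ * XDP program is attached, otherwise 0)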
*/ -static void ice_release_rx_desc(struct ice_ring *rx_ring, u32 val) +static unsigned int ice_rx_offset(struct ice_ring *rx_ring) { - u16 prev_ntu = rx_ring->next_to_use; + if (ice_ring_uses_build_skb(rx_ring)) + return ICE_SKB_PAD; +#ifdef HAVE_XDP_SUPPORT + else if (ice_is_xdp_ena_vsi(rx_ring->vsi)) + return XDP_PACKET_HEADROOM; +#endif /* HAVE_XDP_SUPPORT */ - rx_ring->next_to_use = val; + return 0; +} - /* update next to alloc since we have filled the ring */ - rx_ring->next_to_alloc = val; +#ifdef HAVE_XDP_BUFF_FRAME_SZ +/** + * ice_rx_frame_truesize - Returns an actual size of Rx frame in memory + * @rx_ring: Rx ring we are requesting the frame size of + * @size: Packet length from rx_desc + * + * Returns an actual size of Rx frame in memory, considering page size + * and SKB data alignment. + */ +static unsigned int +ice_rx_frame_truesize(struct ice_ring *rx_ring, unsigned int __maybe_unused size) +{ + unsigned int truesize; - /* QRX_TAIL will be updated with any tail value, but hardware ignores - * the lower 3 bits. This makes it so we only bump tail on meaningful - * boundaries. Also, this allows us to bump tail on intervals of 8 up to - * the budget depending on the current traffic load. - */ - val &= ~0x7; - if (prev_ntu != val) { - /* Force memory writes to complete before letting h/w - * know there are new descriptors to fetch. (Only - * applicable for weak-ordered memory model archs, - * such as IA-64). - */ - wmb(); - writel(val, rx_ring->tail); +#if (PAGE_SIZE < 8192) + truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ice_rx_offset(rx_ring) ? + SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} +#endif /* HAVE_XDP_BUFF_FRAME_SZ */ + +#ifdef HAVE_XDP_SUPPORT +/** + * ice_run_xdp - Executes an XDP program on initialized xdp_buff + * @rx_ring: Rx ring + * @xdp: xdp_buff used as input to the XDP program + * @xdp_prog: XDP program to run + * + * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} + */ +static int +ice_run_xdp(struct ice_ring *rx_ring, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + int err, result = ICE_XDP_PASS; + struct ice_ring *xdp_ring; + u32 act; +#ifdef ICE_ADD_PROBES + u64 rx_bytes = (u64)(xdp->data_end - xdp->data); + + rx_ring->xdp_stats.xdp_rx_pkts++; + rx_ring->xdp_stats.xdp_rx_bytes += rx_bytes; +#endif + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: +#ifdef ICE_ADD_PROBES + rx_ring->xdp_stats.xdp_pass++; +#endif + break; + case XDP_TX: + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->q_index]; + result = ice_xmit_xdp_buff(xdp, xdp_ring); +#ifdef ICE_ADD_PROBES + if (result == ICE_XDP_TX) + rx_ring->xdp_stats.xdp_tx++; + else + rx_ring->xdp_stats.xdp_tx_fail++; +#endif + break; + case XDP_REDIRECT: + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + result = !err ? 
ICE_XDP_REDIR : ICE_XDP_CONSUMED; +#ifdef ICE_ADD_PROBES + if (!err) + rx_ring->xdp_stats.xdp_redirect++; + else + rx_ring->xdp_stats.xdp_redirect_fail++; +#endif + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough -- not supported action */ + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, act); +#ifdef ICE_ADD_PROBES + rx_ring->xdp_stats.xdp_unknown++; +#endif + /* fallthrough -- handle aborts by dropping frame */ + case XDP_DROP: + result = ICE_XDP_CONSUMED; +#ifdef ICE_ADD_PROBES + rx_ring->xdp_stats.xdp_drop++; +#endif + break; + } + + return result; +} + +#ifdef HAVE_XDP_FRAME_STRUCT +/** + * ice_xdp_xmit - submit packets to XDP ring for transmission + * @dev: netdev + * @n: number of XDP frames to be transmitted + * @frames: XDP frames to be transmitted + * @flags: transmit flags + * + * Returns number of frames successfully sent. Frames that fail are + * free'ed via XDP return API. + * For error cases, a negative errno code is returned and no-frames + * are transmitted (caller must handle freeing frames). + */ +int +ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags) +#else +int ice_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +#endif /* HAVE_XDP_FRAME_STRUCT */ +{ + struct ice_netdev_priv *np = netdev_priv(dev); + unsigned int queue_index = smp_processor_id(); + struct ice_vsi *vsi = np->vsi; + struct ice_ring *xdp_ring; +#ifdef HAVE_XDP_FRAME_STRUCT + int drops = 0, i; +#else + int err; +#endif /* HAVE_XDP_FRAME_STRUCT */ + + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return -ENETDOWN; + + if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq) + return -ENXIO; + +#ifdef HAVE_XDP_FRAME_STRUCT + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; +#endif + + xdp_ring = vsi->xdp_rings[queue_index]; +#ifdef HAVE_XDP_FRAME_STRUCT + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + int err; + + err = ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring); + if (err != ICE_XDP_TX) { + xdp_return_frame_rx_napi(xdpf); + drops++; + } } + + if (unlikely(flags & XDP_XMIT_FLUSH)) + ice_xdp_ring_update_tail(xdp_ring); + + return n - drops; +#else + err = ice_xmit_xdp_ring(xdp->data, + (u8 *)xdp->data_end - (u8 *)xdp->data, + xdp_ring); + return err == ICE_XDP_TX ? 
0 : -EFAULT; +#endif /* HAVE_XDP_FRAME_STRUCT */ +} + +#ifndef NO_NDO_XDP_FLUSH +/** + * ice_xdp_flush - flush XDP ring and transmit all submitted packets + * @dev: netdev + */ +void ice_xdp_flush(struct net_device *dev) +{ + struct ice_netdev_priv *np = netdev_priv(dev); + unsigned int queue_index = smp_processor_id(); + struct ice_vsi *vsi = np->vsi; + + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return; + + if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq) + return; + + ice_xdp_ring_update_tail(vsi->xdp_rings[queue_index]); } +#endif /* !NO_NDO_XDP_FLUSH */ +#endif /* HAVE_XDP_SUPPORT */ /** * ice_alloc_mapped_page - recycle or make a new page @@ -418,35 +763,46 @@ ice_alloc_mapped_page(struct ice_ring *rx_ring, struct ice_rx_buf *bi) /* since we are recycling buffers we should seldom need to alloc */ if (likely(page)) { - rx_ring->rx_stats.page_reuse_count++; +#ifdef ICE_ADD_PROBES + rx_ring->rx_stats.page_reuse++; +#endif /* ICE_ADD_PROBES */ return true; } /* alloc new page for storage */ - page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); + page = dev_alloc_pages(ice_rx_pg_order(rx_ring)); if (unlikely(!page)) { rx_ring->rx_stats.alloc_page_failed++; return false; } /* map page for use */ - dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, +#ifndef HAVE_STRUCT_DMA_ATTRS + dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); +#else + dma = dma_map_page(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring), + DMA_FROM_DEVICE); +#endif /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use */ if (dma_mapping_error(rx_ring->dev, dma)) { - __free_pages(page, 0); + __free_pages(page, ice_rx_pg_order(rx_ring)); rx_ring->rx_stats.alloc_page_failed++; return false; } bi->dma = dma; bi->page = page; - bi->page_offset = 0; + bi->page_offset = ice_rx_offset(rx_ring); +#ifdef HAVE_PAGE_COUNT_BULK_UPDATE page_ref_add(page, USHRT_MAX - 1); bi->pagecnt_bias = USHRT_MAX; +#else + bi->pagecnt_bias = 1; +#endif return true; } @@ -471,7 +827,8 @@ bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count) struct ice_rx_buf *bi; /* do nothing if no valid netdev defined */ - if (!rx_ring->netdev || !cleaned_count) + if ((!rx_ring->netdev && rx_ring->vsi->type != ICE_VSI_CTRL) || + !cleaned_count) return false; /* get the Rx descriptor and buffer based on next_to_use */ @@ -486,14 +843,13 @@ bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count) /* sync the buffer for use by the device */ dma_sync_single_range_for_device(rx_ring->dev, bi->dma, bi->page_offset, - ICE_RXBUF_2048, + rx_ring->rx_buf_len, DMA_FROM_DEVICE); /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); - rx_desc++; bi++; ntu++; @@ -532,7 +888,7 @@ static bool ice_page_is_reserved(struct page *page) * Update the offset within page so that Rx buf will be ready to be reused. 
* For systems with PAGE_SIZE < 8192 this function will flip the page offset * so the second half of page assigned to Rx buffer will be used, otherwise - * the offset is moved by the @size bytes + * the offset is moved by "size" bytes */ static void ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) @@ -557,9 +913,6 @@ ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) */ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) { -#if (PAGE_SIZE >= 8192) - unsigned int last_offset = PAGE_SIZE - ICE_RXBUF_2048; -#endif unsigned int pagecnt_bias = rx_buf->pagecnt_bias; struct page *page = rx_buf->page; @@ -572,7 +925,9 @@ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) if (unlikely((page_count(page) - pagecnt_bias) > 1)) return false; #else - if (rx_buf->page_offset > last_offset) +#define ICE_LAST_OFFSET \ + (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048) + if (rx_buf->page_offset > ICE_LAST_OFFSET) return false; #endif /* PAGE_SIZE < 8192) */ @@ -580,16 +935,24 @@ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) * the pagecnt_bias and page count so that we fully restock the * number of references the driver holds. */ +#ifdef HAVE_PAGE_COUNT_BULK_UPDATE if (unlikely(pagecnt_bias == 1)) { page_ref_add(page, USHRT_MAX - 1); rx_buf->pagecnt_bias = USHRT_MAX; } +#else + if (likely(!pagecnt_bias)) { + get_page(page); + rx_buf->pagecnt_bias = 1; + } +#endif return true; } /** * ice_add_rx_frag - Add contents of Rx buffer to sk_buff as a frag + * @rx_ring: Rx descriptor ring to transact packets on * @rx_buf: buffer containing page to add * @skb: sk_buff to place the data into * @size: packet length from rx_desc @@ -599,13 +962,13 @@ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) * The function will then update the page offset. */ static void -ice_add_rx_frag(struct ice_rx_buf *rx_buf, struct sk_buff *skb, - unsigned int size) +ice_add_rx_frag(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, + struct sk_buff *skb, unsigned int size) { #if (PAGE_SIZE >= 8192) - unsigned int truesize = SKB_DATA_ALIGN(size); + unsigned int truesize = SKB_DATA_ALIGN(size + ice_rx_offset(rx_ring)); #else - unsigned int truesize = ICE_RXBUF_2048; + unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; #endif if (!size) @@ -678,11 +1041,70 @@ ice_get_rx_buf(struct ice_ring *rx_ring, struct sk_buff **skb, return rx_buf; } +/** + * ice_build_skb - Build skb around an existing buffer + * @rx_ring: Rx descriptor ring to transact packets on + * @rx_buf: Rx buffer to pull data from + * @xdp: xdp_buff pointing to the data + * + * This function builds an skb around an existing Rx buffer, taking care + * to set up the skb correctly and avoid any memcpy overhead. + */ +static struct sk_buff * +ice_build_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, + struct xdp_buff *xdp) +{ +#ifdef HAVE_XDP_BUFF_DATA_META + u8 metasize = xdp->data - xdp->data_meta; +#endif /* HAVE_XDP_BUFF_DATA_META */ +#if (PAGE_SIZE < 8192) + unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; +#else + unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + + SKB_DATA_ALIGN(xdp->data_end - + xdp->data_hard_start); +#endif + struct sk_buff *skb; + +#ifdef HAVE_XDP_BUFF_DATA_META + /* Prefetch first cache line of first page. If xdp->data_meta + * is unused, this points exactly as xdp->data, otherwise we + * likely have a consumer accessing first few bytes of meta + * data, and then actual data. 
+ */ + net_prefetch(xdp->data_meta); +#else + net_prefetch(xdp->data); +#endif /* HAVE_XDP_BUFF_DATA_META */ + /* build an skb around the page buffer */ + skb = build_skb(xdp->data_hard_start, truesize); + if (unlikely(!skb)) + return NULL; + + /* must to record Rx queue, otherwise OS features such as + * symmetric queue won't work + */ + skb_record_rx_queue(skb, rx_ring->q_index); + + /* update pointers within the skb to store the data */ + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); +#ifdef HAVE_XDP_BUFF_DATA_META + if (metasize) + skb_metadata_set(skb, metasize); +#endif /* HAVE_XDP_BUFF_DATA_META */ + + /* buffer is used by skb, update page_offset */ + ice_rx_buf_adjust_pg_offset(rx_buf, truesize); + + return skb; +} + /** * ice_construct_skb - Allocate skb and populate it * @rx_ring: Rx descriptor ring to transact packets on * @rx_buf: Rx buffer to pull data from - * @size: the length of the packet + * @xdp: xdp_buff pointing to the data * * This function allocates an skb. It then populates it with the page * data from the current receive descriptor, taking care to set up the @@ -690,17 +1112,14 @@ ice_get_rx_buf(struct ice_ring *rx_ring, struct sk_buff **skb, */ static struct sk_buff * ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, - unsigned int size) + struct xdp_buff *xdp) { - void *va = page_address(rx_buf->page) + rx_buf->page_offset; + unsigned int size = xdp->data_end - xdp->data; unsigned int headlen; struct sk_buff *skb; /* prefetch first cache line of first page */ - prefetch(va); -#if L1_CACHE_BYTES < 128 - prefetch((u8 *)va + L1_CACHE_BYTES); -#endif /* L1_CACHE_BYTES */ + net_prefetch(xdp->data); /* allocate a skb to store the frags */ skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE, @@ -712,10 +1131,11 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, /* Determine available headroom for copy */ headlen = size; if (headlen > ICE_RX_HDR_SIZE) - headlen = eth_get_headlen(skb->dev, va, ICE_RX_HDR_SIZE); + headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE); /* align pull length to size of long to optimize memcpy performance */ - memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); + memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, + sizeof(long))); /* if we exhaust the linear part then add what is left as a frag */ size -= headlen; @@ -723,7 +1143,7 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, #if (PAGE_SIZE >= 8192) unsigned int truesize = SKB_DATA_ALIGN(size); #else - unsigned int truesize = ICE_RXBUF_2048; + unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; #endif skb_add_rx_frag(skb, 0, rx_buf->page, rx_buf->page_offset + headlen, size, truesize); @@ -745,22 +1165,37 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, * @rx_ring: Rx descriptor ring to transact packets on * @rx_buf: Rx buffer to pull data from * - * This function will clean up the contents of the rx_buf. It will - * either recycle the buffer or unmap it and free the associated resources. + * This function will update next_to_clean and then clean up the contents + * of the rx_buf. It will either recycle the buffer or unmap it and free + * the associated resources. */ static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) { + u16 ntc = rx_ring->next_to_clean + 1; + + /* fetch, update, and store next to clean */ + ntc = (ntc < rx_ring->count) ? 
ntc : 0; + rx_ring->next_to_clean = ntc; + if (!rx_buf) return; if (ice_can_reuse_rx_page(rx_buf)) { /* hand second half of page back to the ring */ ice_reuse_rx_page(rx_ring, rx_buf); - rx_ring->rx_stats.page_reuse_count++; +#ifdef ICE_ADD_PROBES + rx_ring->rx_stats.page_reuse++; +#endif /* ICE_ADD_PROBES */ } else { /* we are not reusing the buffer so unmap it */ - dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma, PAGE_SIZE, - DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); +#ifndef HAVE_STRUCT_DMA_ATTRS + dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma, + ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, + ICE_RX_DMA_ATTR); +#else + dma_unmap_page(rx_ring->dev, rx_buf->dma, + ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE); +#endif __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); } @@ -769,245 +1204,292 @@ static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) rx_buf->skb = NULL; } -/** - * ice_cleanup_headers - Correct empty headers - * @skb: pointer to current skb being fixed - * - * Also address the case where we are pulling data in on pages only - * and as such no data is present in the skb header. - * - * In addition if skb is not at least 60 bytes we need to pad it so that - * it is large enough to qualify as a valid Ethernet frame. - * - * Returns true if an error was encountered and skb was freed. - */ -static bool ice_cleanup_headers(struct sk_buff *skb) -{ - /* if eth_skb_pad returns an error the skb was freed */ - if (eth_skb_pad(skb)) - return true; - - return false; -} - -/** - * ice_test_staterr - tests bits in Rx descriptor status and error fields - * @rx_desc: pointer to receive descriptor (in le64 format) - * @stat_err_bits: value to mask - * - * This function does some fast chicanery in order to return the - * value of the mask which is really only used for boolean tests. - * The status_error_len doesn't need to be shifted because it begins - * at offset zero. - */ -static bool -ice_test_staterr(union ice_32b_rx_flex_desc *rx_desc, const u16 stat_err_bits) -{ - return !!(rx_desc->wb.status_error0 & - cpu_to_le16(stat_err_bits)); -} - /** * ice_is_non_eop - process handling of non-EOP buffers * @rx_ring: Rx ring being processed * @rx_desc: Rx descriptor for current buffer * @skb: Current socket buffer containing buffer in progress * - * This function updates next to clean. If the buffer is an EOP buffer - * this function exits returning false, otherwise it will place the - * sk_buff in the next buffer to be chained and return true indicating - * that this is in fact a non-EOP buffer. + * If the buffer is an EOP buffer, this function exits returning false, + * otherwise return true indicating that this is in fact a non-EOP buffer. */ static bool ice_is_non_eop(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, struct sk_buff *skb) { - u32 ntc = rx_ring->next_to_clean + 1; - - /* fetch, update, and store next to clean */ - ntc = (ntc < rx_ring->count) ? 
ntc : 0;
-	rx_ring->next_to_clean = ntc;
-
-	prefetch(ICE_RX_DESC(rx_ring, ntc));
-
 	/* if we are the last buffer then there is nothing else to do */
 #define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S)
-	if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF)))
+	if (likely(ice_test_staterr(rx_desc->wb.status_error0, ICE_RXD_EOF)))
 		return false;
 
 	/* place skb in next buffer to be received */
-	rx_ring->rx_buf[ntc].skb = skb;
+	rx_ring->rx_buf[rx_ring->next_to_clean].skb = skb;
 	rx_ring->rx_stats.non_eop_descs++;
 
 	return true;
 }
 
 /**
- * ice_ptype_to_htype - get a hash type
- * @ptype: the ptype value from the descriptor
+ * ice_detect_dis_inline_fd_usage - detect and disable usage of inline-fd
+ * @ch_vsi : ptr to channel VSI
  *
- * Returns a hash type to be used by skb_set_hash
+ * This function detects the FD table full condition and, if so,
+ * returns true; otherwise false
  */
-static enum pkt_hash_types ice_ptype_to_htype(u8 __always_unused ptype)
+static bool
+ice_detect_dis_inline_fd_usage(struct ice_vsi *ch_vsi)
 {
-	return PKT_HASH_TYPE_NONE;
+	int total_fd_allowed = ch_vsi->num_gfltr + ch_vsi->num_bfltr;
+	int inline_fd_active;
+
+	/* detect if transitioned to RSS mode, if so return true */
+	if (test_bit(ICE_SWITCH_TO_RSS, ch_vsi->adv_state))
+		return true;
+
+	/* for some reason if channel VSI doesn't have any FD resources
+	 * reserved (from guaranteed or best effort pool), stay in RSS
+	 */
+	if (!total_fd_allowed) {
+		set_bit(ICE_SWITCH_TO_RSS, ch_vsi->adv_state);
+		return true;
+	}
+
+	/* inline_fd_active_cnt is decremented from ice_chnl_inline_fd
+	 * function when evicting FD entry upon FIN/RST transmit
+	 */
+	inline_fd_active = atomic_inc_return(&ch_vsi->inline_fd_active_cnt) - 1;
+	if (inline_fd_active >= total_fd_allowed) {
+		set_bit(ICE_SWITCH_TO_RSS, ch_vsi->adv_state);
+		return true;
+	}
+
+	return false;
 }
 
+/* Rx desc:flexi_flags are bits 15:10 applicable when RXDID=2 as defined
+ * by package
+ */
+#define ICE_RX_FLEXI_FLAGS_ACK	BIT(2)
+#define ICE_RX_FLEXI_FLAGS_FIN	BIT(3)
+#define ICE_RX_FLEXI_FLAGS_SYN	BIT(4)
+#define ICE_RX_FLEXI_FLAGS_RST	BIT(5)
+
+/* Rx desc:flexi_flags2, applicable when RXDID=2 */
+#define ICE_RX_FLEXI_FLAGS2_TNL_0 BIT(5)
+#define ICE_RX_FLEXI_FLAGS2_TNL_1 BIT(6)
+#define ICE_RX_SUPPORTED_TNL_FLEXI_FLAGS (ICE_RX_FLEXI_FLAGS2_TNL_0 | \
+					  ICE_RX_FLEXI_FLAGS2_TNL_1)
+
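For orientation: the classifier introduced below ultimately reduces to a single mask test against the descriptor's flexi_flags0 field. A minimal sketch under that reading, reusing the ICE_RX_FLEXI_FLAGS_* macros defined above (the helper name itself is hypothetical, not part of this patch):

/* hypothetical condensation of the SYN/FIN/RST test performed by
 * ice_is_ctrl_pkt(): any of these flags marks a TCP control packet
 */
static inline bool ice_flexi_flags_are_ctrl(u16 flexi_flags)
{
	return !!(flexi_flags & (ICE_RX_FLEXI_FLAGS_SYN |
				 ICE_RX_FLEXI_FLAGS_FIN |
				 ICE_RX_FLEXI_FLAGS_RST));
}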
 /**
- * ice_rx_hash - set the hash value in the skb
- * @rx_ring: descriptor ring
- * @rx_desc: specific descriptor
- * @skb: pointer to current skb
- * @rx_ptype: the ptype value from the descriptor
+ * ice_is_ctrl_pkt - determine if given packet is control/data packet
+ * @skb: receive buffer
+ * @rx_ring: ptr to Rx ring
+ * @rx_desc: ptr to Rx desc
+ * @ptype: packet type
+ * @flags: value of flexi_flags0 from Rx desc
+ *
+ * Determine if the given packet is a control or data packet. The
+ * definition of a control packet is one that consists of SYN/FIN/RST
+ * flags, otherwise it is a data packet. This check is applicable only for
+ * TCP/IPv4[6]. This function is expected to work correctly even for
+ * tunnels, as long as the inner protocol is TCP/IPv4[6] and the device
+ * parser understands it as a known packet.
+ *
+ * Returns true if this is a control packet, otherwise false if the given
+ * packet is classified as a data packet. For all error conditions, it
+ * returns true so that the packet gets treated as a control packet. This
+ * control versus data packet logic feeds into deferring interrupt
+ * enablement from napi_poll when busy_poll:stop is called.
 */
-static void
-ice_rx_hash(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc,
-	    struct sk_buff *skb, u8 rx_ptype)
+static bool
+ice_is_ctrl_pkt(struct sk_buff *skb, struct ice_ring *rx_ring,
+		union ice_32b_rx_flex_desc *rx_desc, u16 ptype, u16 *flags)
 {
 	struct ice_32b_rx_flex_desc_nic *nic_mdid;
-	u32 hash;
+	struct ice_rx_ptype_decoded decoded;
+	u16 flexi_flags;
 
-	if (!(rx_ring->netdev->features & NETIF_F_RXHASH))
-		return;
+	*flags = 0;
 
+	/* RXDID must be set to FLEX, otherwise no guarantee that "flags"
+	 * will be available in Rx desc.flexi_flags0
+	 */
 	if (rx_desc->wb.rxdid != ICE_RXDID_FLEX_NIC)
-		return;
+		return true;
+
+	/* process PTYPE from Rx desc */
+	decoded = ice_decode_rx_desc_ptype(ptype);
+	if (!decoded.known)
+		return true;
+
+	/* Make sure packet is L4 and L4 proto (inner most) is TCP */
+	if (!(decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4 &&
+	      decoded.inner_prot == ICE_RX_PTYPE_INNER_PROT_TCP))
+		return true;
 
 	nic_mdid = (struct ice_32b_rx_flex_desc_nic *)rx_desc;
-	hash = le32_to_cpu(nic_mdid->rss_hash);
-	skb_set_hash(skb, hash, ice_ptype_to_htype(rx_ptype));
+	if (decoded.tunnel_type != ICE_RX_PTYPE_TUNNEL_NONE) {
+		/* Determine which tunnels to support and allow only
+		 * well-known tunnels as determined through the Rx desc,
+		 * either via PTYPE (this needs additional support for
+		 * tunnels like GTP, PTYPE 325 onwards) or via
+		 * flexi_flags2:TNL_2..0
+		 * TNL_0 : VxLAN
+		 * TNL_1 and TNL_0 : Geneve
+		 */
+		/* This only takes care of VxLAN and Geneve */
+		if (!(nic_mdid->flexi_flags2 &
+		      ICE_RX_SUPPORTED_TNL_FLEXI_FLAGS))
+			return true;
+	}
+
+	flexi_flags = le16_to_cpu(nic_mdid->ptype_flexi_flags0) >>
+		      ICE_RX_FLEX_DESC_FLEXI_FLAGS0_S;
+
+#ifdef ADQ_PERF_COUNTERS
+	if (flexi_flags & ICE_RX_FLEXI_FLAGS_FIN)
+		rx_ring->ch_q_stats.rx.num_tcp_flags_fin++;
+	else if (flexi_flags & ICE_RX_FLEXI_FLAGS_RST)
+		rx_ring->ch_q_stats.rx.num_tcp_flags_rst++;
+	else if (flexi_flags & ICE_RX_FLEXI_FLAGS_SYN)
+		rx_ring->ch_q_stats.rx.num_tcp_flags_syn++;
+#endif /* ADQ_PERF_COUNTERS */
+
+	/* return the flexi_flags to caller */
+	*flags = flexi_flags;
+
+	/* Packet is ctrl_pkt : if SYN|FIN|RST|SYN+ACK|FIN+ACK set */
+	if (flexi_flags & (ICE_RX_FLEXI_FLAGS_FIN | ICE_RX_FLEXI_FLAGS_RST |
+			   ICE_RX_FLEXI_FLAGS_SYN))
+		return true;
+
+	/* if we reached here, the packet is a DATA packet */
+	return false;
 }
 
 /**
- * ice_rx_csum - Indicate in skb if checksum is good
- * @ring: the ring we care about
- * @skb: skb currently being received and modified
- * @rx_desc: the receive descriptor
- * @ptype: the packet type decoded by hardware
+ * ice_rx_queue_override - override Rx queue if needed and update skb
+ * @skb: receive buffer
+ * @rx_ring: ptr to Rx ring
+ * @flags: value of flexi_flags (such as TCP flags)
  *
- * skb->protocol must be set before this function is called
+ * Override the Rx queue if the packet being processed is SYN only, and
+ * record the new Rx queue in the skb. This is applicable only for
+ * TCP/IPv4[6].
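The queue override described here amounts to a striped round-robin pick inside the channel's queue region. A sketch of just that arithmetic, under the struct ice_channel field names this patch itself uses (fd_queue as an atomic counter, num_rxq/base_q describing the region; the helper name is hypothetical):

/* sketch: spread new flows across [base_q, base_q + num_rxq) */
static inline int ice_chnl_pick_rxq(struct ice_channel *ch)
{
	return (atomic_inc_return(&ch->fd_queue) - 1) % ch->num_rxq +
	       ch->base_q;
}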
*/ static void -ice_rx_csum(struct ice_ring *ring, struct sk_buff *skb, - union ice_32b_rx_flex_desc *rx_desc, u8 ptype) +ice_rx_queue_override(struct sk_buff *skb, struct ice_ring *rx_ring, + u16 flags) { - struct ice_rx_ptype_decoded decoded; - u32 rx_error, rx_status; - bool ipv4, ipv6; + struct ice_channel *ch = rx_ring->ch; + struct ice_vsi *vsi = rx_ring->vsi; + struct ice_ring *ring; /* selected ring for override */ + int queue_to_use; - rx_status = le16_to_cpu(rx_desc->wb.status_error0); - rx_error = rx_status; + /* make sure ring is channel enabled before proceeding with Rx queue + * override logic + */ + if (!ice_ring_ch_enabled(rx_ring)) + return; + /* SYN must be set to proceed */ + if (!(flags & ICE_RX_FLEXI_FLAGS_SYN)) + return; + /* ACK must not be set to proceed */ + if (flags & ICE_RX_FLEXI_FLAGS_ACK) + return; - decoded = ice_decode_rx_desc_ptype(ptype); - /* Start with CHECKSUM_NONE and by default csum_level = 0 */ - skb->ip_summed = CHECKSUM_NONE; - skb_checksum_none_assert(skb); + /* proceed only when filter type for channel is of type dest + * port or src+dest port or tunnel + */ + if (!(ch->fltr_type == ICE_CHNL_FLTR_TYPE_DEST_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_TENANT_ID)) + return; - /* check if Rx checksum is enabled */ - if (!(ring->netdev->features & NETIF_F_RXCSUM)) + /* make sure channel VSI is FD capable and enabled for + * inline flow-director usage + */ + if (!ice_vsi_fd_ena(ch->ch_vsi) || + !ice_vsi_inline_fd_ena(ch->ch_vsi)) return; - /* check if HW has decoded the packet and checksum */ - if (!(rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) + /* Detection logic to check if HW table is about to get full, + * if so, switch to RSS mode, means don't change Rx queue + */ + if (ice_detect_dis_inline_fd_usage(ch->ch_vsi)) { +#ifdef ADQ_PERF_COUNTERS + rx_ring->ch_q_stats.rx.num_rx_queue_bailouts++; +#endif /* ADQ_PERF_COUNTERS */ return; + } + + /* Pick the Rx queue based on round-robin policy for the + * connection, limited to queue region of specific channel + */ + queue_to_use = (atomic_inc_return(&ch->fd_queue) - 1) % + ch->num_rxq; - if (!(decoded.known && decoded.outer_ip)) + /* adjust the queue based on channel's base_queue, so that + * correct Rx queue number is recorded in skb + */ + queue_to_use += ch->base_q; + + /* Get the selected ring ptr */ + ring = vsi->rx_rings[queue_to_use]; + if (!ring || !ring->q_vector) return; - ipv4 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4); - ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6); + /* re-record selected queue as Rx queue in SKB */ + skb_record_rx_queue(skb, queue_to_use); - if (ipv4 && (rx_error & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | - BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) - goto checksum_fail; - else if (ipv6 && (rx_status & - (BIT(ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S)))) - goto checksum_fail; +#ifdef ADQ_PERF_COUNTERS + ring->ch_q_stats.rx.num_rx_queue_set++; +#endif /* ADQ_PERF_COUNTERS */ - /* check for L4 errors and handle packets that were not able to be - * checksummed due to arrival speed + /* mark selected queue:vector for inline filter usage by + * incrementing atomic variable, it can't be flag + * because during ATR eviction, this needs to be + * decremented */ - if (rx_error & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) - goto checksum_fail; - - /* Only report checksum unnecessary for TCP, UDP, or SCTP 
*/ - switch (decoded.inner_prot) { - case ICE_RX_PTYPE_INNER_PROT_TCP: - case ICE_RX_PTYPE_INNER_PROT_UDP: - case ICE_RX_PTYPE_INNER_PROT_SCTP: - skb->ip_summed = CHECKSUM_UNNECESSARY; - default: - break; - } - return; + atomic_inc(&ring->q_vector->inline_fd_cnt); -checksum_fail: - ring->vsi->back->hw_csum_rx_error++; + return; } /** - * ice_process_skb_fields - Populate skb header fields from Rx descriptor - * @rx_ring: Rx descriptor ring packet is being transacted on - * @rx_desc: pointer to the EOP Rx descriptor - * @skb: pointer to current skb being populated - * @ptype: the packet type decoded by hardware + * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf + * @rx_ring: Rx descriptor ring to transact packets on + * @budget: Total limit on number of packets to process + * + * This function provides a "bounce buffer" approach to Rx interrupt + * processing. The advantage to this is that on systems that have + * expensive overhead for IOMMU access this provides a means of avoiding + * it by maintaining the mapping of the page to the system. * - * This function checks the ring, descriptor, and packet information in - * order to populate the hash, checksum, VLAN, protocol, and - * other fields within the skb. + * Returns amount of work completed */ -static void -ice_process_skb_fields(struct ice_ring *rx_ring, - union ice_32b_rx_flex_desc *rx_desc, - struct sk_buff *skb, u8 ptype) -{ - ice_rx_hash(rx_ring, rx_desc, skb, ptype); - - /* modifies the skb - consumes the enet header */ - skb->protocol = eth_type_trans(skb, rx_ring->netdev); - - ice_rx_csum(rx_ring, skb, rx_desc, ptype); -} - -/** - * ice_receive_skb - Send a completed packet up the stack - * @rx_ring: Rx ring in play - * @skb: packet to send up - * @vlan_tag: VLAN tag for packet - * - * This function sends the completed packet (via. skb) up the stack using - * gro receive functions (with/without VLAN tag) - */ -static void -ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag) -{ - if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) && - (vlan_tag & VLAN_VID_MASK)) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); - napi_gro_receive(&rx_ring->q_vector->napi, skb); -} - -/** - * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf - * @rx_ring: Rx descriptor ring to transact packets on - * @budget: Total limit on number of packets to process - * - * This function provides a "bounce buffer" approach to Rx interrupt - * processing. The advantage to this is that on systems that have - * expensive overhead for IOMMU access this provides a means of avoiding - * it by maintaining the mapping of the page to the system. 
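The "bounce buffer" scheme referenced in this comment relies on splitting each Rx page in half and flipping between the halves, so the DMA mapping stays alive across refills. A reduced sketch of the flip for PAGE_SIZE < 8192, mirroring ice_rx_buf_adjust_pg_offset() from earlier in this patch (helper name hypothetical):

static inline void ex_flip_rx_half_page(struct ice_rx_buf *rx_buf,
					unsigned int truesize)
{
	/* XOR toggles the offset between 0 and the upper half */
	rx_buf->page_offset ^= truesize;
}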
- * - * Returns amount of work completed - */ -static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) +int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_pkts = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); +#ifdef HAVE_XDP_SUPPORT + unsigned int xdp_res, xdp_xmit = 0; + struct bpf_prog *xdp_prog = NULL; +#endif /* HAVE_XDP_SUPPORT */ + struct xdp_buff xdp; bool failure; +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_XDP_BUFF_RXQ + xdp.rxq = &rx_ring->xdp_rxq; +#endif /* HAVE_XDP_BUFF_RXQ */ +#endif /* HAVE_XDP_SUPPORT */ +#ifdef HAVE_XDP_BUFF_FRAME_SZ + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, 0); +#endif +#endif /* HAVE_XDP_BUFF_FRAME_SZ */ + /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; @@ -1016,7 +1498,7 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) unsigned int size; u16 stat_err_bits; u16 vlan_tag = 0; - u8 rx_ptype; + u16 rx_ptype; /* get the Rx desc from Rx ring based on 'next_to_clean' */ rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean); @@ -1027,7 +1509,7 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) * hardware wrote DD then it will be non-zero */ stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S); - if (!ice_test_staterr(rx_desc, stat_err_bits)) + if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits)) break; /* This memory barrier is needed to keep us from reading @@ -1036,17 +1518,95 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) */ dma_rmb(); + ice_trace(clean_rx_irq, rx_ring, rx_desc); + if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) { + struct ice_vsi *ctrl_vsi = rx_ring->vsi; + + if (rx_desc->wb.rxdid == FDIR_DESC_RXDID && + ctrl_vsi->vf_id != ICE_INVAL_VFID) + ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc); + ice_put_rx_buf(rx_ring, NULL); + cleaned_count++; + continue; + } + size = le16_to_cpu(rx_desc->wb.pkt_len) & ICE_RX_FLX_DESC_PKT_LEN_M; /* retrieve a buffer from the ring */ rx_buf = ice_get_rx_buf(rx_ring, &skb, size); - if (skb) - ice_add_rx_frag(rx_buf, skb, size); - else - skb = ice_construct_skb(rx_ring, rx_buf, size); + if (!size) { + xdp.data = NULL; + xdp.data_end = NULL; + xdp.data_hard_start = NULL; +#ifdef HAVE_XDP_BUFF_DATA_META + xdp.data_meta = NULL; +#endif /* HAVE_XDP_BUFF_DATA_META */ + goto construct_skb; + } + + xdp.data = page_address(rx_buf->page) + rx_buf->page_offset; + xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring); +#ifdef HAVE_XDP_BUFF_DATA_META + xdp.data_meta = xdp.data; +#endif /* HAVE_XDP_BUFF_DATA_META */ + xdp.data_end = xdp.data + size; +#ifdef HAVE_XDP_BUFF_FRAME_SZ +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); +#endif +#endif /* HAVE_XDP_BUFF_FRAME_SZ */ + +#ifdef HAVE_XDP_SUPPORT + rcu_read_lock(); + xdp_prog = READ_ONCE(rx_ring->xdp_prog); + if (!xdp_prog) { + rcu_read_unlock(); + goto construct_skb; + } + xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog); + rcu_read_unlock(); + if (!xdp_res) + goto construct_skb; + if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { +#ifndef HAVE_XDP_BUFF_FRAME_SZ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ice_rx_pg_size(rx_ring) / 2; +#else + truesize = SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + + size); +#endif +#endif /* HAVE_XDP_BUFF_FRAME_SZ 
*/ + xdp_xmit |= xdp_res; +#ifdef HAVE_XDP_BUFF_FRAME_SZ + ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz); +#else + ice_rx_buf_adjust_pg_offset(rx_buf, truesize); +#endif /* HAVE_XDP_BUFF_FRAME_SZ */ + } else { + rx_buf->pagecnt_bias++; + } + total_rx_bytes += size; + total_rx_pkts++; + + cleaned_count++; + ice_put_rx_buf(rx_ring, rx_buf); + continue; +#endif /* HAVE_XDP_SUPPORT */ +construct_skb: + if (skb) { + ice_add_rx_frag(rx_ring, rx_buf, skb, size); + } else if (likely(xdp.data)) { + if (ice_ring_uses_build_skb(rx_ring)) + skb = ice_build_skb(rx_ring, rx_buf, &xdp); + else + skb = ice_construct_skb(rx_ring, rx_buf, &xdp); + } /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->rx_stats.alloc_buf_failed++; @@ -1063,19 +1623,16 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) continue; stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); - if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) { + if (unlikely(ice_test_staterr(rx_desc->wb.status_error0, + stat_err_bits))) { dev_kfree_skb_any(skb); continue; } - stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S); - if (ice_test_staterr(rx_desc, stat_err_bits)) - vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1); + vlan_tag = ice_get_vlan_tag_from_rx_desc(rx_desc); - /* correct empty headers and pad skb if needed (to make valid - * ethernet frame - */ - if (ice_cleanup_headers(skb)) { + /* pad the skb if needed, to make a valid ethernet frame */ + if (eth_skb_pad(skb)) { skb = NULL; continue; } @@ -1089,6 +1646,20 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); + if (ice_ring_ch_enabled(rx_ring)) { + bool ctrl_pkt; + u16 flags; + + ctrl_pkt = ice_is_ctrl_pkt(skb, rx_ring, rx_desc, + rx_ptype, &flags); + if (!ctrl_pkt) + rx_ring->q_vector->state_flags |= + ICE_CHNL_PREV_DATA_PKT_RECV; + else + ice_rx_queue_override(skb, rx_ring, flags); + } + + ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb); /* send completed skb up the stack */ ice_receive_skb(rx_ring, skb, vlan_tag); @@ -1099,230 +1670,68 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) /* return up to cleaned_count buffers to hardware */ failure = ice_alloc_rx_bufs(rx_ring, cleaned_count); - /* update queue and vector specific stats */ - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->stats.pkts += total_rx_pkts; - rx_ring->stats.bytes += total_rx_bytes; - u64_stats_update_end(&rx_ring->syncp); - rx_ring->q_vector->rx.total_pkts += total_rx_pkts; - rx_ring->q_vector->rx.total_bytes += total_rx_bytes; +#ifdef HAVE_XDP_SUPPORT + if (xdp_prog) + ice_finalize_xdp_rx(rx_ring, xdp_xmit); +#endif /* HAVE_XDP_SUPPORT */ + + ice_update_rx_ring_stats(rx_ring, total_rx_pkts, total_rx_bytes); /* guarantee a trip back through this routine if there was a failure */ return failure ? budget : (int)total_rx_pkts; } -/** - * ice_adjust_itr_by_size_and_speed - Adjust ITR based on current traffic - * @port_info: port_info structure containing the current link speed - * @avg_pkt_size: average size of Tx or Rx packets based on clean routine - * @itr: ITR value to update - * - * Calculate how big of an increment should be applied to the ITR value passed - * in based on wmem_default, SKB overhead, Ethernet overhead, and the current - * link speed. 
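The XDP verdict handling above collapses to: buffers claimed by XDP_TX/XDP_REDIRECT advance the page offset and are flushed in bulk at the end of the poll, while consumed (dropped) buffers just get their page-count bias restored; a zero verdict (XDP_PASS) falls through to skb construction instead. A sketch of the non-zero-verdict path, assuming the ICE_XDP_* verdict bits exactly as this patch tests them (helper name hypothetical):

static inline void ex_apply_xdp_verdict(struct ice_rx_buf *rx_buf,
					unsigned int xdp_res,
					unsigned int *xdp_xmit,
					unsigned int truesize)
{
	if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
		*xdp_xmit |= xdp_res;	/* finalized once per poll */
		ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
	} else {
		/* ICE_XDP_CONSUMED (drop): give the reference back */
		rx_buf->pagecnt_bias++;
	}
}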
- *
- * The following is a calculation derived from:
- *   wmem_default / (size + overhead) = desired_pkts_per_int
- *   rate / bits_per_byte / (size + Ethernet overhead) = pkt_rate
- *   (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
- *
- * Assuming wmem_default is 212992 and overhead is 640 bytes per
- * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
- * formula down to:
- *
- *	 wmem_default * bits_per_byte * usecs_per_sec   pkt_size + 24
- * ITR = -------------------------------------------- * --------------
- *			     rate			pkt_size + 640
- */
-static unsigned int
-ice_adjust_itr_by_size_and_speed(struct ice_port_info *port_info,
-				 unsigned int avg_pkt_size,
-				 unsigned int itr)
+static void __ice_update_sample(struct ice_q_vector *q_vector,
+				struct ice_ring_container *rc,
+				struct dim_sample *sample)
 {
-	switch (port_info->phy.link_info.link_speed) {
-	case ICE_AQ_LINK_SPEED_100GB:
-		itr += DIV_ROUND_UP(17 * (avg_pkt_size + 24),
-				    avg_pkt_size + 640);
-		break;
-	case ICE_AQ_LINK_SPEED_50GB:
-		itr += DIV_ROUND_UP(34 * (avg_pkt_size + 24),
-				    avg_pkt_size + 640);
-		break;
-	case ICE_AQ_LINK_SPEED_40GB:
-		itr += DIV_ROUND_UP(43 * (avg_pkt_size + 24),
-				    avg_pkt_size + 640);
-		break;
-	case ICE_AQ_LINK_SPEED_25GB:
-		itr += DIV_ROUND_UP(68 * (avg_pkt_size + 24),
-				    avg_pkt_size + 640);
-		break;
-	case ICE_AQ_LINK_SPEED_20GB:
-		itr += DIV_ROUND_UP(85 * (avg_pkt_size + 24),
-				    avg_pkt_size + 640);
-		break;
-	case ICE_AQ_LINK_SPEED_10GB:
-		/* fall through */
-	default:
-		itr += DIV_ROUND_UP(170 * (avg_pkt_size + 24),
-				    avg_pkt_size + 640);
-		break;
-	}
+	u64 packets = 0, bytes = 0;
+	struct ice_ring *ring;
 
-	if ((itr & ICE_ITR_MASK) > ICE_ITR_ADAPTIVE_MAX_USECS) {
-		itr &= ICE_ITR_ADAPTIVE_LATENCY;
-		itr += ICE_ITR_ADAPTIVE_MAX_USECS;
+	ice_for_each_ring(ring, *rc) {
+		packets += ring->stats.pkts;
+		bytes += ring->stats.bytes;
 	}
 
-	return itr;
+	dim_update_sample(q_vector->total_events, packets, bytes, sample);
+	sample->comp_ctr = 0;
+
+	/* if dim settings get stale, like when not updated for 1
+	 * second or longer, force it to start again. This addresses the
+	 * frequent case of an idle queue being switched to by the
+	 * scheduler. The 1,000 here means 1,000 milliseconds.
+	 */
+	if (ktime_ms_delta(sample->time, rc->dim.start_sample.time) >= 1000)
+		rc->dim.state = DIM_START_MEASURE;
 }
 
 /**
- * ice_update_itr - update the adaptive ITR value based on statistics
- * @q_vector: structure containing interrupt and ring information
- * @rc: structure containing ring performance data
+ * ice_net_dim - Update net DIM algorithm
+ * @q_vector: the vector associated with the interrupt
  *
- * Stores a new ITR value based on packets and byte
- * counts during the last interrupt. The advantage of per interrupt
- * computation is faster updates and more accurate ITR for the current
- * traffic pattern. Constants in this function were computed
- * based on theoretical maximum wire speed and thresholds were set based
- * on testing data as well as attempting to minimize response time
- * while increasing bulk throughput.
+ * Create a DIM sample and notify net_dim() so that it can possibly decide
+ * a new ITR value based on incoming packets, bytes, and interrupts.
+ *
+ * This function is a no-op if the ring is not configured to dynamic ITR.
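Usage-wise, the dim_update_sample()/net_dim() pair above is the whole contract with the kernel's DIM library: the driver only supplies cumulative counters, and a profile change arrives later via the dim work callback. A minimal, self-contained sketch assuming the <linux/dim.h> API:

#include <linux/dim.h>

static void ex_feed_net_dim(struct dim *dim, u16 events,
			    u64 packets, u64 bytes)
{
	struct dim_sample sample = {};

	dim_update_sample(events, packets, bytes, &sample);
	net_dim(dim, sample);	/* may queue dim->work with a new profile */
}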
*/ -static void -ice_update_itr(struct ice_q_vector *q_vector, struct ice_ring_container *rc) +static void ice_net_dim(struct ice_q_vector *q_vector) { - unsigned long next_update = jiffies; - unsigned int packets, bytes, itr; - bool container_is_rx; + struct ice_ring_container *tx = &q_vector->tx; + struct ice_ring_container *rx = &q_vector->rx; - if (!rc->ring || !ITR_IS_DYNAMIC(rc->itr_setting)) - return; + if (ITR_IS_DYNAMIC(tx)) { + struct dim_sample dim_sample; - /* If itr_countdown is set it means we programmed an ITR within - * the last 4 interrupt cycles. This has a side effect of us - * potentially firing an early interrupt. In order to work around - * this we need to throw out any data received for a few - * interrupts following the update. - */ - if (q_vector->itr_countdown) { - itr = rc->target_itr; - goto clear_counts; + __ice_update_sample(q_vector, tx, &dim_sample); + net_dim(&tx->dim, dim_sample); } - container_is_rx = (&q_vector->rx == rc); - /* For Rx we want to push the delay up and default to low latency. - * for Tx we want to pull the delay down and default to high latency. - */ - itr = container_is_rx ? - ICE_ITR_ADAPTIVE_MIN_USECS | ICE_ITR_ADAPTIVE_LATENCY : - ICE_ITR_ADAPTIVE_MAX_USECS | ICE_ITR_ADAPTIVE_LATENCY; - - /* If we didn't update within up to 1 - 2 jiffies we can assume - * that either packets are coming in so slow there hasn't been - * any work, or that there is so much work that NAPI is dealing - * with interrupt moderation and we don't need to do anything. - */ - if (time_after(next_update, rc->next_update)) - goto clear_counts; - - prefetch(q_vector->vsi->port_info); - - packets = rc->total_pkts; - bytes = rc->total_bytes; + if (ITR_IS_DYNAMIC(rx)) { + struct dim_sample dim_sample; - if (container_is_rx) { - /* If Rx there are 1 to 4 packets and bytes are less than - * 9000 assume insufficient data to use bulk rate limiting - * approach unless Tx is already in bulk rate limiting. We - * are likely latency driven. - */ - if (packets && packets < 4 && bytes < 9000 && - (q_vector->tx.target_itr & ICE_ITR_ADAPTIVE_LATENCY)) { - itr = ICE_ITR_ADAPTIVE_LATENCY; - goto adjust_by_size_and_speed; - } - } else if (packets < 4) { - /* If we have Tx and Rx ITR maxed and Tx ITR is running in - * bulk mode and we are receiving 4 or fewer packets just - * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so - * that the Rx can relax. - */ - if (rc->target_itr == ICE_ITR_ADAPTIVE_MAX_USECS && - (q_vector->rx.target_itr & ICE_ITR_MASK) == - ICE_ITR_ADAPTIVE_MAX_USECS) - goto clear_counts; - } else if (packets > 32) { - /* If we have processed over 32 packets in a single interrupt - * for Tx assume we need to switch over to "bulk" mode. - */ - rc->target_itr &= ~ICE_ITR_ADAPTIVE_LATENCY; + __ice_update_sample(q_vector, rx, &dim_sample); + net_dim(&rx->dim, dim_sample); } - - /* We have no packets to actually measure against. This means - * either one of the other queues on this vector is active or - * we are a Tx queue doing TSO with too high of an interrupt rate. - * - * Between 4 and 56 we can assume that our current interrupt delay - * is only slightly too low. As such we should increase it by a small - * fixed amount. 
- */ - if (packets < 56) { - itr = rc->target_itr + ICE_ITR_ADAPTIVE_MIN_INC; - if ((itr & ICE_ITR_MASK) > ICE_ITR_ADAPTIVE_MAX_USECS) { - itr &= ICE_ITR_ADAPTIVE_LATENCY; - itr += ICE_ITR_ADAPTIVE_MAX_USECS; - } - goto clear_counts; - } - - if (packets <= 256) { - itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr); - itr &= ICE_ITR_MASK; - - /* Between 56 and 112 is our "goldilocks" zone where we are - * working out "just right". Just report that our current - * ITR is good for us. - */ - if (packets <= 112) - goto clear_counts; - - /* If packet count is 128 or greater we are likely looking - * at a slight overrun of the delay we want. Try halving - * our delay to see if that will cut the number of packets - * in half per interrupt. - */ - itr >>= 1; - itr &= ICE_ITR_MASK; - if (itr < ICE_ITR_ADAPTIVE_MIN_USECS) - itr = ICE_ITR_ADAPTIVE_MIN_USECS; - - goto clear_counts; - } - - /* The paths below assume we are dealing with a bulk ITR since - * number of packets is greater than 256. We are just going to have - * to compute a value and try to bring the count under control, - * though for smaller packet sizes there isn't much we can do as - * NAPI polling will likely be kicking in sooner rather than later. - */ - itr = ICE_ITR_ADAPTIVE_BULK; - -adjust_by_size_and_speed: - - /* based on checks above packets cannot be 0 so division is safe */ - itr = ice_adjust_itr_by_size_and_speed(q_vector->vsi->port_info, - bytes / packets, itr); - -clear_counts: - /* write back value */ - rc->target_itr = itr; - - /* next update should occur within next jiffy */ - rc->next_update = next_update + 1; - - rc->total_bytes = 0; - rc->total_pkts = 0; } /** @@ -1346,85 +1755,311 @@ static u32 ice_buildreg_itr(u16 itr_idx, u16 itr) (itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S)); } -/* The act of updating the ITR will cause it to immediately trigger. In order - * to prevent this from throwing off adaptive update statistics we defer the - * update so that it can only happen so often. So after either Tx or Rx are - * updated we make the adaptive scheme wait until either the ITR completely - * expires via the next_update expiration or we have been through at least - * 3 interrupts. - */ -#define ITR_COUNTDOWN_START 3 - /** - * ice_update_ena_itr - Update ITR and re-enable MSIX interrupt - * @q_vector: q_vector for which ITR is being updated and interrupt enabled + * ice_enable_interrupt - re-enable MSI-X interrupt + * @q_vector: the vector associated with the interrupt to enable + * + * If the VSI is down, the interrupt will not be re-enabled. Also, + * when enabling the interrupt always reset the wb_on_itr to false + * and trigger a software interrupt to clean out internal state. 
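For reference, the full composition performed by ice_buildreg_itr() (only its tail is visible in the context above) is a bitwise OR of the interrupt-enable bits with the ITR index and interval. A sketch under the GLINT_DYN_CTL field macros from the ice hardware headers (treat the exact mask set as an assumption):

static u32 ex_buildreg_itr(u16 itr_idx, u16 itr)
{
	/* the interval is written in hardware granularity units, hence
	 * the down-shift by ICE_ITR_GRAN_S visible in the patch above
	 */
	return GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M |
	       (itr_idx << GLINT_DYN_CTL_ITR_INDX_S) |
	       ((u32)itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S));
}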
*/ -static void ice_update_ena_itr(struct ice_q_vector *q_vector) +static void ice_enable_interrupt(struct ice_q_vector *q_vector) { - struct ice_ring_container *tx = &q_vector->tx; - struct ice_ring_container *rx = &q_vector->rx; struct ice_vsi *vsi = q_vector->vsi; + bool wb_en = q_vector->wb_on_itr; u32 itr_val; - /* when exiting WB_ON_ITR lets set a low ITR value and trigger - * interrupts to expire right away in case we have more work ready to go - * already - */ - if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE) { - itr_val = ice_buildreg_itr(rx->itr_idx, ICE_WB_ON_ITR_USECS); - wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val); - /* set target back to last user set value */ - rx->target_itr = rx->itr_setting; - /* set current to what we just wrote and dynamic if needed */ - rx->current_itr = ICE_WB_ON_ITR_USECS | - (rx->itr_setting & ICE_ITR_DYNAMIC); - /* allow normal interrupt flow to start */ - q_vector->itr_countdown = 0; + if (test_bit(ICE_DOWN, vsi->state)) return; + + /* When exiting WB_ON_ITR, let ITR resume its normal + * interrupts-enabled path. + */ + if (wb_en) + q_vector->wb_on_itr = false; + + itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0); + /* trigger an immediate software interrupt when exiting + * busy poll, to make sure to catch any pending cleanups + * that might have been missed due to interrupt state + * transition. + */ + if (wb_en) { + itr_val |= GLINT_DYN_CTL_SWINT_TRIG_M | + GLINT_DYN_CTL_SW_ITR_INDX_M | + GLINT_DYN_CTL_SW_ITR_INDX_ENA_M; } + wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val); +} - /* This will do nothing if dynamic updates are not enabled */ - ice_update_itr(q_vector, tx); - ice_update_itr(q_vector, rx); +/** + * ice_refresh_bp_state - refresh state machine + * @napi: ptr to NAPI struct + * @budget: NAPI budget + * + * Update ADQ state machine, and depending on whether this was called from + * busy poll, enable interrupts and update ITR + */ +static void ice_refresh_bp_state(struct napi_struct *napi, int budget) +{ + struct ice_q_vector *q_vector = + container_of(napi, struct ice_q_vector, napi); + + if (ice_vsi_pkt_process_bp_stop_ena(q_vector->ch->ch_vsi) && + (q_vector->state_flags & ICE_CHNL_WD_EQUALS_BP)) { + /* Manage the internal state in such a way that, napi_poll + * can decide when to perform Rx cleanup. When internal + * state indicates that vector is transition from busy_poll to + * interrupt, napi_poll avoid cleaning Rx rings and that + * eventually translates to whether driver will return + * "budget" or not. 
+	 *
+	 * Keep the internal state as it is if the "budget" specified
+	 * is not equal to "napi->weight" (hence skip the state update).
+	 * When napi weight is equal to budget, and we have reached the
+	 * value of the tunable (max_limit_process_rx_queues), follow
+	 * the NAPI state as seen by the OS, otherwise skip the internal
+	 * state update (which allows the vector's internal state to be
+	 * kept as whatever it was, in this case)
+	 */
+	if (napi->weight == budget &&
+	    q_vector->process_rx_queues ==
+	    q_vector->max_limit_process_rx_queues) {
+		/* reached the point, keep internal state to be in
+		 * sync with NAPI state as seen by OS
+		 */
+		goto state_update;
+	} else {
+#ifdef ADQ_PERF_COUNTERS
+		if (napi->weight == budget)
+			q_vector->ch_stats.keep_state_bp_budget64++;
+		else
+			q_vector->ch_stats.keep_state_bp_budget8++;
+#endif /* ADQ_PERF_COUNTERS */
+		/* keep internal state of vector as it is, do not
+		 * perform state update
+		 */
+		goto skip_state_update;
+	}
+	}
 
-	/* This block of logic allows us to get away with only updating
-	 * one ITR value with each interrupt. The idea is to perform a
-	 * pseudo-lazy update with the following criteria.
-	 *
-	 * 1. Rx is given higher priority than Tx if both are in same state
-	 * 2. If we must reduce an ITR that is given highest priority.
-	 * 3. We then give priority to increasing ITR based on amount.
+	 */
+state_update:
+	/* cache previous state of vector */
+	if (q_vector->state_flags & ICE_CHNL_IN_BP)
+		q_vector->state_flags |= ICE_CHNL_PREV_IN_BP;
+	else
+		q_vector->state_flags &= ~ICE_CHNL_PREV_IN_BP;
+
+#ifdef HAVE_NAPI_STATE_IN_BUSY_POLL
+	/* update current state of vector */
+	if (test_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state))
+		q_vector->state_flags |= ICE_CHNL_IN_BP;
+	else
+		q_vector->state_flags &= ~ICE_CHNL_IN_BP;
+
+#endif /* HAVE_NAPI_STATE_IN_BUSY_POLL */
+
+skip_state_update:
+	if (q_vector->state_flags & ICE_CHNL_IN_BP) {
+		q_vector->jiffy = jiffies;
+		/* trigger force_wb by setting WB_ON_ITR only when
+		 * - vector is transitioning from INTR->BUSY_POLL
+		 * - once_in_bp is false, this is to prevent from doing it
+		 *   every time whenever vector state is changing from
+		 *   INTR->BUSY_POLL because that could be due to a legit
+		 *   busy_poll stop
+		 */
+		if (!(q_vector->state_flags & ICE_CHNL_ONCE_IN_BP) &&
+		    ice_vector_intr_busypoll(q_vector))
+			ice_force_wb(&q_vector->vsi->back->hw, q_vector);
+
+		q_vector->state_flags |= ICE_CHNL_ONCE_IN_BP;
+#ifdef ADQ_PERF_COUNTERS
+		q_vector->ch_stats.in_bp++;
+		/* state transition : INTERRUPT --> BUSY_POLL */
+		if (!(q_vector->state_flags & ICE_CHNL_PREV_IN_BP))
+			q_vector->ch_stats.real_int_to_bp++;
+		else
+			q_vector->ch_stats.real_bp_to_bp++;
+	} else {
+		q_vector->ch_stats.in_int++;
+		/* state transition : BUSY_POLL --> INTERRUPT */
+		if (q_vector->state_flags & ICE_CHNL_PREV_IN_BP)
+			q_vector->ch_stats.real_bp_to_int++;
+		else
+			q_vector->ch_stats.real_int_to_int++;
+#endif /* ADQ_PERF_COUNTERS */
+	}
+}
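The transition predicates used throughout this file (ice_vector_intr_busypoll(), ice_vector_busypoll_intr()) are defined elsewhere in the driver; under the state bits cached by ice_refresh_bp_state() above they plausibly reduce to two flag comparisons. A hedged sketch (the "ex_" helpers are illustrative, not the driver's actual definitions):

/* INTERRUPT -> BUSY_POLL: previously not in BP, currently in BP */
static inline bool ex_vector_intr_busypoll(struct ice_q_vector *qv)
{
	return !(qv->state_flags & ICE_CHNL_PREV_IN_BP) &&
	       (qv->state_flags & ICE_CHNL_IN_BP);
}

/* BUSY_POLL -> INTERRUPT: previously in BP, currently not in BP */
static inline bool ex_vector_busypoll_intr(struct ice_q_vector *qv)
{
	return (qv->state_flags & ICE_CHNL_PREV_IN_BP) &&
	       !(qv->state_flags & ICE_CHNL_IN_BP);
}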
+
+/**
+ * ice_handle_chnl_vector - handle channel enabled vector
+ * @q_vector: ptr to q_vector
+ * @unlikely_cb_bp: will come back to busy_poll or not
+ *
+ * This function either triggers a software interrupt (when unlikely_cb_bp
+ * is true) or enables the interrupt normally. unlikely_cb_bp gets
+ * determined based on the state machine and packet parsing logic.
+ */
+static void
+ice_handle_chnl_vector(struct ice_q_vector *q_vector, bool unlikely_cb_bp)
+{
+#ifdef ADQ_PERF_COUNTERS
+	struct ice_q_vector_ch_stats *stats = &q_vector->ch_stats;
+#endif /* ADQ_PERF_COUNTERS */
+	struct ice_vsi *ch_vsi = q_vector->ch->ch_vsi;
+	struct ice_vsi *vsi = q_vector->vsi;
+
+	/* The caller of this function determines the next occurrence/
+	 * execution context of napi_poll (i.e. whether napi_poll will next
+	 * be invoked from busy_poll or from SOFTIRQ context). Please refer
+	 * to the caller of this function to see the logic for
+	 * "unlikely_cb_bp" (aka, re-occurrence to busy_poll or not).
+	 * If the logic determines that the next occurrence of napi_poll
+	 * will not be from busy_poll context, trigger a software initiated
+	 * interrupt on the channel enabled vector to revive queue(s)
+	 * processing, otherwise if in true interrupt state - just enable
+	 * the interrupt.
 	 */
-	if (rx->target_itr < rx->current_itr) {
-		/* Rx ITR needs to be reduced, this is highest priority */
-		itr_val = ice_buildreg_itr(rx->itr_idx, rx->target_itr);
-		rx->current_itr = rx->target_itr;
-		q_vector->itr_countdown = ITR_COUNTDOWN_START;
-	} else if ((tx->target_itr < tx->current_itr) ||
-		   ((rx->target_itr - rx->current_itr) <
-		    (tx->target_itr - tx->current_itr))) {
-		/* Tx ITR needs to be reduced, this is second priority
-		 * Tx ITR needs to be increased more than Rx, fourth priority
+	if (unlikely_cb_bp) {
+#ifdef ADQ_PERF_COUNTERS
+		stats->unlikely_cb_to_bp++;
+		if (q_vector->state_flags & ICE_CHNL_ONCE_IN_BP)
+			stats->ucb_o_bp++;
+#endif /* ADQ_PERF_COUNTERS */
+
+		/* if once_in_bp is set and pkt inspection based optimization
+		 * is off, do not trigger SW interrupt (simply bail out).
+		 * No change in logic from the service_task based software
+		 * triggered interrupt - to revive the queue based on jiffy
+		 * logic
+		 */
+		if (ch_vsi && (q_vector->state_flags & ICE_CHNL_ONCE_IN_BP)) {
+			if (!ice_vsi_pkt_inspect_opt_ena(ch_vsi)) {
+#ifdef ADQ_PERF_COUNTERS
+				stats->num_no_sw_intr_opt_off++;
+#endif /* ADQ_PERF_COUNTERS */
+				return;
+			}
+		}
+
+		/* Since this is a real BP -> INT transition,
+		 * reset the jiffies snapshot
 		 */
-		itr_val = ice_buildreg_itr(tx->itr_idx, tx->target_itr);
-		tx->current_itr = tx->target_itr;
-		q_vector->itr_countdown = ITR_COUNTDOWN_START;
-	} else if (rx->current_itr != rx->target_itr) {
-		/* Rx ITR needs to be increased, third priority */
-		itr_val = ice_buildreg_itr(rx->itr_idx, rx->target_itr);
-		rx->current_itr = rx->target_itr;
-		q_vector->itr_countdown = ITR_COUNTDOWN_START;
+		q_vector->jiffy = 0;
+
+		/* Likewise for a real BP -> INT transition, trigger a SW
+		 * interrupt so that the vector is put back in a sane state
+		 * and the queue is revived
+		 */
+#ifdef ADQ_PERF_COUNTERS
+		ice_sw_intr_cntr(q_vector, true);
+#endif /* ADQ_PERF_COUNTERS */
+		ice_adq_trigger_sw_intr(&vsi->back->hw, q_vector);
+	} else if (!(q_vector->state_flags & ICE_CHNL_ONCE_IN_BP)) {
+#ifdef ADQ_PERF_COUNTERS
+		stats->once_bp_false++;
+#endif /* ADQ_PERF_COUNTERS */
+		ice_enable_interrupt(q_vector);
+	}
+}
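ice_adq_trigger_sw_intr(), called above to revive a parked vector, is defined elsewhere in the driver; based on the software-interrupt bits that ice_enable_interrupt() sets earlier in this patch, the register write plausibly looks like the following sketch (helper name hypothetical):

static void ex_trigger_sw_intr(struct ice_hw *hw,
			       struct ice_q_vector *q_vector)
{
	/* fire one interrupt on the vector even though no descriptor
	 * work arrived; the SW_ITR_INDX bits select the software ITR
	 * index so no traffic ITR interval is disturbed
	 */
	wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx),
	     GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_SWINT_TRIG_M |
	     GLINT_DYN_CTL_SW_ITR_INDX_M | GLINT_DYN_CTL_SW_ITR_INDX_ENA_M);
}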
+
+#ifdef HAVE_NAPI_STATE_IN_BUSY_POLL
+/**
+ * ice_chnl_vector_bypass_clean_complete
+ * @napi: ptr to napi
+ * @budget: value of budget (it could be napi:weight or BUSY_POLL_BUDGET)
+ * @work_done: amount of work_done (number of packets cleaned)
+ *
+ * This function returns true upon the following conditions:
+ * - state of NAPI is IN_BUSY_POLL (this is subject to change)
+ * - priv-flag "channel-pkt-clean-bp-stop" is disabled - means the user
+ *   turned off such optimization (this is a high level knob for the user)
+ * - vector state is set (workdone == budget) and napi:weight == budget
+ *   (means invoked from the napi_schedule core path) and the limit of
+ *   the optimization has been reached
+ *
+ * When this function returns true, the caller of this function (napi_poll)
+ * does not allow napi_poll to return "budget". This is to prevent the OS
+ * from calling us for up to 2 msec or 10 times (softirq.c:__do_softirq)
+ */
+static bool
+ice_chnl_vector_bypass_clean_complete(struct napi_struct *napi, int budget,
+				      int work_done)
+{
+	struct ice_q_vector *qv = container_of(napi, struct ice_q_vector, napi);
+
+	if (!ice_vector_ever_in_busypoll(qv))
+		return false;
+
+	if (!ice_vsi_pkt_process_bp_stop_ena(qv->ch->ch_vsi))
+		return true; /* like what it was before */
+
+#ifdef ADQ_PERF_COUNTERS
+	if (napi->weight == budget) /* napi_schedule */
+		qv->ch_stats.pkt_bp_stop_napi_budget += work_done;
+	else /* busy_poll_stop */
+		qv->ch_stats.pkt_bp_stop_bp_budget += work_done;
+#endif /* ADQ_PERF_COUNTERS */
+
+	if ((qv->state_flags & ICE_CHNL_WD_EQUALS_BP) &&
+	    napi->weight == budget) {
+		qv->process_rx_queues++;
+		if (qv->process_rx_queues == qv->max_limit_process_rx_queues)
+			return true;
 	} else {
-		/* Still have to re-enable the interrupts */
-		itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0);
-		if (q_vector->itr_countdown)
-			q_vector->itr_countdown--;
+		qv->process_rx_queues = 0;
 	}
 
-	if (!test_bit(__ICE_DOWN, q_vector->vsi->state))
-		wr32(&q_vector->vsi->back->hw,
-		     GLINT_DYN_CTL(q_vector->reg_idx),
-		     itr_val);
+	return false;
+}
+#endif /* HAVE_NAPI_STATE_IN_BUSY_POLL */
+
+/**
+ * ice_chnl_vector_wd_eq_budget - detect workdone equals budget and set bit
+ * @napi: ptr to napi
+ * @budget: value of budget (it could be napi:weight or BUSY_POLL_BUDGET)
+ * @clean_complete: value of clean_complete as computed by napi_poll
+ * @cleaned_any_data_pkt: set to true if any data packet was cleaned
+ *
+ * Based on the value of "clean_complete", set/reset the per vector state
+ * bit indicating that the workdone == budget condition has been reached,
+ * and increment specific stats based on the value of "budget"
+ */
+static void
+ice_chnl_vector_wd_eq_budget(struct napi_struct *napi, int budget,
+			     bool clean_complete, bool *cleaned_any_data_pkt)
+{
+	struct ice_q_vector *qv = container_of(napi, struct ice_q_vector, napi);
+
+	if ((qv->state_flags & ICE_CHNL_IN_BP) &&
+	    ice_vsi_pkt_process_bp_stop_ena(qv->ch->ch_vsi)) {
+		if (qv->state_flags & ICE_CHNL_PREV_DATA_PKT_RECV) {
+			qv->state_flags &= ~ICE_CHNL_PREV_DATA_PKT_RECV;
+			*cleaned_any_data_pkt = true;
+#ifdef ADQ_PERF_COUNTERS
+			qv->ch_stats.cleaned_any_data_pkt++;
+#endif /* ADQ_PERF_COUNTERS */
+		}
+		/* Take a snapshot if work_done == budget, which is used
+		 * when busy_poll_stop is called to decide whether the
+		 * internal state machine should be kept in BUSY_POLL or
+		 * not. See the ice_refresh_bp_state function for details.
+		 */
+		if (!clean_complete)
+			qv->state_flags |= ICE_CHNL_WD_EQUALS_BP;
+		else
+			qv->state_flags &= ~ICE_CHNL_WD_EQUALS_BP;
+#ifdef ADQ_PERF_COUNTERS
+		if (qv->state_flags & ICE_CHNL_WD_EQUALS_BP) {
+			if (napi->weight == budget)
+				qv->ch_stats.bp_wd_equals_budget64++;
+			else
+				qv->ch_stats.bp_wd_equals_budget8++;
+		}
+#endif /* ADQ_PERF_COUNTERS */
+	} else {
+		qv->state_flags &= ~ICE_CHNL_WD_EQUALS_BP;
+	}
 }
 
 /**
@@ -1434,32 +2069,31 @@ static void ice_update_ena_itr(struct ice_q_vector *q_vector)
  * We need to tell hardware to write-back completed descriptors even when
  * interrupts are disabled.
Descriptors will be written back on cache line * boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR - * descriptors may not be written back if they don't fill a cache line until the - * next interrupt. + * descriptors may not be written back if they don't fill a cache line until + * the next interrupt. * - * This sets the write-back frequency to 2 microseconds as that is the minimum - * value that's not 0 due to ITR granularity. Also, set the INTENA_MSK bit to - * make sure hardware knows we aren't meddling with the INTENA_M bit. + * This sets the write-back frequency to whatever was set previously for the + * ITR indices. Also, set the INTENA_MSK bit to make sure hardware knows we + * aren't meddling with the INTENA_M bit. */ static void ice_set_wb_on_itr(struct ice_q_vector *q_vector) { struct ice_vsi *vsi = q_vector->vsi; - /* already in WB_ON_ITR mode no need to change it */ - if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE) + /* already in wb_on_itr mode no need to change it */ + if (q_vector->wb_on_itr) return; - if (q_vector->num_ring_rx) - wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), - ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS, - ICE_RX_ITR)); - - if (q_vector->num_ring_tx) - wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), - ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS, - ICE_TX_ITR)); + /* use previously set ITR values for all of the ITR indices by + * specifying ICE_ITR_NONE, which will vary in adaptive (AIM) mode and + * be static in non-adaptive mode (user configured) + */ + wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), + ((ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) & + GLINT_DYN_CTL_ITR_INDX_M) | GLINT_DYN_CTL_INTENA_MSK_M | + GLINT_DYN_CTL_WB_ON_ITR_M); - q_vector->itr_countdown = ICE_IN_WB_ON_ITR_MODE; + q_vector->wb_on_itr = true; } /** @@ -1475,29 +2109,103 @@ int ice_napi_poll(struct napi_struct *napi, int budget) { struct ice_q_vector *q_vector = container_of(napi, struct ice_q_vector, napi); + bool cleaned_any_data_pkt = false; + bool unlikely_cb_bp = false; bool clean_complete = true; struct ice_ring *ring; int budget_per_ring; int work_done = 0; + bool ch_enabled; + + /* determine once if vector needs to be processed differently */ + ch_enabled = ice_vector_ch_enabled(q_vector); + if (ch_enabled) { + /* Refresh state machine */ + ice_refresh_bp_state(napi, budget); + + /* check during previous run of napi_poll whether at least one + * data packets is processed or not. If processed at least one + * data packet, set the local flag 'cleaned_any_data_pkt' + * which is used later in this function to determine if + * interrupt should be enabled or deferred (this is applicable + * only in case when busy_poll stop is invoked, means previous + * state of vector is in busy_poll and current state is not + * (aka BUSY_POLL -> INTR)) + */ + if (q_vector->state_flags & ICE_CHNL_PREV_DATA_PKT_RECV) { + q_vector->state_flags &= ~ICE_CHNL_PREV_DATA_PKT_RECV; + /* It is important to check and cache correct + * information (cleaned any data packets or not) in + * local variable before napi_complete_done is finished. + * Once napi_complete_done is returned, napi_poll + * can get invoked again (means re-entrant) which can + * potentially results to incorrect decision making + * w.r.t. 
whether interrupt should be enabled or + * deferred) + */ + if (ice_vector_busypoll_intr(q_vector)) { + cleaned_any_data_pkt = true; +#ifdef ADQ_PERF_COUNTERS + q_vector->ch_stats.cleaned_any_data_pkt++; +#endif /* ADQ_PERF_COUNTERS */ + } + } + } /* Since the actual Tx work is minimal, we can give the Tx a larger * budget and be more aggressive about cleaning up the Tx descriptors. */ - ice_for_each_ring(ring, q_vector->tx) - if (!ice_clean_tx_irq(ring, budget)) + ice_for_each_ring(ring, q_vector->tx) { +#ifdef HAVE_AF_XDP_ZC_SUPPORT + bool wd = ring->xsk_pool ? + ice_clean_tx_irq_zc(ring) : + ice_clean_tx_irq(ring, budget); +#else + bool wd = ice_clean_tx_irq(ring, budget); +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + +#ifdef ICE_ADD_PROBES + if (!wd) { + struct ice_q_stats *stats = &ring->stats; + + /* if we are reporting that we are not done, then we + * know napi is going to continue so increment the + * count + */ + stats->napi_poll_cnt++; + clean_complete = false; + } +#else /* ICE_ADD_PROBES */ + if (!wd) clean_complete = false; +#endif /* !ICE_ADD_PROBES */ + } /* Handle case where we are called by netpoll with a budget of 0 */ if (unlikely(budget <= 0)) return budget; + /* state transitioning from BUSY_POLL --> INTERRUPT. This can happen + * due to several reason when stack calls busy_poll_stop + * 1. during last execution of napi_poll returned non-zero packets + * 2. busy_loop ended + * 3. need re-sched set + * driver keeps track of packets were cleaned during last run and if + * that is zero, means most likely napi_poll won't be invoked from + * busy_poll context; in that situation bypass processing of Rx queues + * and enable interrupt and let subsequent run of napi_poll from + * interrupt path handle cleanup of Rx queues + */ + if (ch_enabled && ice_vector_busypoll_intr(q_vector)) + goto bypass; + /* normally we have 1 Rx ring per q_vector */ if (unlikely(q_vector->num_ring_rx > 1)) /* We attempt to distribute budget to each Rx queue fairly, but * don't allow the budget to go below 1 because that would exit * polling early. */ - budget_per_ring = max(budget / q_vector->num_ring_rx, 1); + budget_per_ring = max_t(int, budget / q_vector->num_ring_rx, 1); else /* Max of 1 Rx ring in this q_vector so give it the budget */ budget_per_ring = budget; @@ -1505,37 +2213,146 @@ int ice_napi_poll(struct napi_struct *napi, int budget) ice_for_each_ring(ring, q_vector->rx) { int cleaned; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + /* A dedicated path for zero-copy allows making a single + * comparison in the irq context instead of many inside the + * ice_clean_rx_irq function and makes the codebase cleaner. + */ + cleaned = ring->xsk_pool ? 
+			  ice_clean_rx_irq_zc(ring, budget_per_ring) :
+			  ice_clean_rx_irq(ring, budget_per_ring);
+#else
 		cleaned = ice_clean_rx_irq(ring, budget_per_ring);
+#endif /* HAVE_AF_XDP_ZC_SUPPORT */
 		work_done += cleaned;
 
 		/* if we clean as many as budgeted, we must not be done */
+#ifdef ICE_ADD_PROBES
+		if (cleaned >= budget_per_ring) {
+			struct ice_q_stats *stats = &ring->stats;
+
+			/* if we are reporting that we are not done, then we
+			 * know napi is going to continue so increment the
+			 * count
+			 */
+			stats->napi_poll_cnt++;
+			clean_complete = false;
+		}
+#else /* ICE_ADD_PROBES */
 		if (cleaned >= budget_per_ring)
 			clean_complete = false;
+#endif /* !ICE_ADD_PROBES */
+
+		if (ch_enabled)
+			ice_chnl_vector_wd_eq_budget(napi, budget,
+						     clean_complete,
+						     &cleaned_any_data_pkt);
+	} /* end for ice_for_each_ring */
+
+#ifdef HAVE_NAPI_STATE_IN_BUSY_POLL
+	if (ch_enabled &&
+	    (!test_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state))) {
+		if (ice_chnl_vector_bypass_clean_complete(napi, budget,
+							  work_done))
+			goto bypass;
+	}
+#endif /* HAVE_NAPI_STATE_IN_BUSY_POLL */
 
 	/* If work not completed, return budget and polling will return */
-	if (!clean_complete)
+	if (!clean_complete) {
+		/* Set the writeback on ITR so partial completions of
+		 * cache-lines will still continue even if we're polling.
+		 */
+		if (!ch_enabled)
+			ice_set_wb_on_itr(q_vector);
 		return budget;
+	}
 
-	/* Exit the polling mode, but don't re-enable interrupts if stack might
-	 * poll us due to busy-polling
+bypass:
+	/* reset the counter if the code flow reached here because this
+	 * function determined that it is not going to return budget and
+	 * will end up calling napi_complete_done followed by a return
+	 * value < budget
 	 */
-	if (likely(napi_complete_done(napi, work_done)))
-		ice_update_ena_itr(q_vector);
-	else
-		ice_set_wb_on_itr(q_vector);
+	q_vector->process_rx_queues = 0;
+
+#ifdef ADQ_PERF_COUNTERS
+	/* Following block is only for stats */
+	if (ch_enabled && ice_vector_busypoll_intr(q_vector)) {
+		struct ice_q_vector_ch_stats *stats;
+
+		stats = &q_vector->ch_stats;
+		if (unlikely(need_resched())) {
+			stats->num_need_resched_bp_stop++;
+			if (!cleaned_any_data_pkt)
+				stats->num_l_c_data_pkt++;
+		} else {
+			/* if we got here, it is actually because of one of
+			 * 2 reasons:
+			 * - the busy_poll timeout expired
+			 * - last time, data packets were cleaned, hence the
+			 *   stack asked to stop busy_poll so that the packet
+			 *   can be processed by the consumer
+			 */
+			stats->num_timeout_bp_stop++;
+			if (!cleaned_any_data_pkt)
+				stats->num_l_c_data_pkt1++;
+		}
+	}
+#endif /* ADQ_PERF_COUNTERS */
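All of the channel-specific branching here sits on top of the standard NAPI contract, which is worth restating before reading on: returning the full budget keeps the vector in softirq polling, while completing NAPI normally re-arms the interrupt. A generic skeleton of that contract, with hypothetical helper names standing in for the driver's cleanup and re-arm paths:

static int ex_plain_napi_poll(struct napi_struct *napi, int budget)
{
	int work_done = ex_clean_rings(napi, budget);	/* hypothetical */

	if (work_done >= budget)
		return budget;		/* stay in polling */

	if (likely(napi_complete_done(napi, work_done)))
		ex_enable_irq(napi);	/* hypothetical re-arm */

	return min(work_done, budget - 1);
}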
-
+	/* if state transition from busy_poll to interrupt and during the
+	 * last run we did not clean up TCP data packets - then the
+	 * application is unlikely to come back to busy_poll
+	 */
+	if (ch_enabled && ice_vector_busypoll_intr(q_vector) &&
+	    !cleaned_any_data_pkt) {
+		/* for now, if need_resched is true (it can be either
+		 * due to voluntary/in-voluntary context switches),
+		 * do not trigger a SW interrupt.
+		 * if need_resched is not set, we can safely assume it is
+		 * due to a possible timeout and it is unlikely that the
+		 * application/context will return to busy_poll; hence set
+		 * 'unlikely_cb_bp' to true, which will cause a software
+		 * triggered interrupt to revive the queue/vector
+		 */
+		if (unlikely(need_resched()))
+			unlikely_cb_bp = false;
+		else
+			unlikely_cb_bp = true;
+	}
 
-	return min_t(int, work_done, budget - 1);
-}
-
-/* helper function for building cmd/type/offset */
-static __le64
-build_ctob(u64 td_cmd, u64 td_offset, unsigned int size, u64 td_tag)
-{
-	return cpu_to_le64(ICE_TX_DESC_DTYPE_DATA |
-			   (td_cmd << ICE_TXD_QW1_CMD_S) |
-			   (td_offset << ICE_TXD_QW1_OFFSET_S) |
-			   ((u64)size << ICE_TXD_QW1_TX_BUF_SZ_S) |
-			   (td_tag << ICE_TXD_QW1_L2TAG1_S));
+	/* Work is done so exit the polling mode and re-enable the interrupt */
+	if (likely(napi_complete_done(napi, work_done))) {
+		/* napi_ret : false (means the vector is still in POLLING
+		 *            mode); true (means out of POLLING)
+		 * NOTE: Generally if napi_ret is true, enable the device
+		 * interrupt, but there are conditions/optimizations where
+		 * it can be optimized. Basically, if napi_complete_done
+		 * returns true but this was the last time Rx packets were
+		 * cleaned, then most likely the consumer thread will come
+		 * back to do busy_polling, where cleaning of the Tx/Rx
+		 * queues will happen normally. Hence there is no reason to
+		 * arm the interrupt.
+		 *
+		 * If for some reason the consumer thread/context doesn't
+		 * come back to busy_poll:napi_poll, there is a bail-out
+		 * mechanism to kick start the state machine through a SW
+		 * triggered interrupt from the service task.
+		 */
+		if (ch_enabled) {
+			/* current state of NAPI is INTERRUPT */
+			ice_handle_chnl_vector(q_vector, unlikely_cb_bp);
+		} else {
+			/* vector is not channel enabled and NAPI is not in
+			 * BUSY_POLL, always enable interrupt
+			 */
+			ice_net_dim(q_vector);
+			ice_enable_interrupt(q_vector);
+		}
+
+	} else {
+		if (!ch_enabled)
+			ice_set_wb_on_itr(q_vector);
+	}
+
+	return min_t(int, work_done, budget - 1);
 }
 
 /**
@@ -1592,11 +2409,11 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first,
 {
 	u64 td_offset, td_tag, td_cmd;
 	u16 i = tx_ring->next_to_use;
-	skb_frag_t *frag;
 	unsigned int data_len, size;
 	struct ice_tx_desc *tx_desc;
 	struct ice_tx_buf *tx_buf;
 	struct sk_buff *skb;
+	skb_frag_t *frag;
 	dma_addr_t dma;
 
 	td_tag = off->td_l2tag1;
@@ -1607,6 +2424,7 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first,
 	data_len = skb->data_len;
 	size = skb_headlen(skb);
 
+
 	tx_desc = ICE_TX_DESC(tx_ring, i);
 
 	if (first->tx_flags & ICE_TX_FLAGS_HW_VLAN) {
@@ -1638,7 +2456,8 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first,
 		 */
 		while (unlikely(size > ICE_MAX_DATA_PER_TXD)) {
 			tx_desc->cmd_type_offset_bsz =
-				build_ctob(td_cmd, td_offset, max_data, td_tag);
+				ice_build_ctob(td_cmd, td_offset, max_data,
+					       td_tag);
 
 			tx_desc++;
 			i++;
@@ -1658,8 +2477,8 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first,
 		if (likely(!data_len))
 			break;
 
-		tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
-							  size, td_tag);
+		tx_desc->cmd_type_offset_bsz = ice_build_ctob(td_cmd, td_offset,
+							      size, td_tag);
 
 		tx_desc++;
 		i++;
@@ -1684,15 +2503,14 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first,
 	/* record SW timestamp if HW timestamp is not available */
 	skb_tx_timestamp(first->skb);
 
+	/* write last descriptor with RS and EOP bits */
+	td_cmd |= (u64)ICE_TXD_LAST_DESC_CMD;
+	tx_desc->cmd_type_offset_bsz =
+		ice_build_ctob(td_cmd, td_offset, size, td_tag);
 	i++;
 	if (i == tx_ring->count)
 		i = 0;
 
-	/* write last descriptor with RS and EOP bits */
-	td_cmd |=
(u64)(ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS); - tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, size, td_tag); - /* Force memory writes to complete before letting h/w know there * are new descriptors to fetch. * @@ -1709,9 +2527,20 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, ice_maybe_stop_tx(tx_ring, DESC_NEEDED); /* notify HW of packet */ +#ifdef HAVE_SKB_XMIT_MORE if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) { - writel(i, tx_ring->tail); +#endif /* HAVE_SKB_XMIT_MORE */ + writel_relaxed(i, tx_ring->tail); +#ifndef SPIN_UNLOCK_IMPLIES_MMIOWB + + /* we need this if more than one processor can write to our tail + * at a time, it synchronizes IO on IA64/Altix systems + */ + mmiowb(); +#endif /* SPIN_UNLOCK_IMPLIES_MMIOWB */ +#ifdef HAVE_SKB_XMIT_MORE } +#endif /* HAVE_SKB_XMIT_MORE */ return; @@ -1730,6 +2559,7 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, tx_ring->next_to_use = i; } + /** * ice_tx_csum - Enable Tx checksum offloads * @first: pointer to the first descriptor @@ -1740,6 +2570,9 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, static int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) { +#ifdef ICE_ADD_PROBES + struct ice_ring *tx_ring = off->tx_ring; +#endif u32 l4_len = 0, l3_len = 0, l2_len = 0; struct sk_buff *skb = first->skb; union { @@ -1766,22 +2599,117 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) l2_len = ip.hdr - skb->data; offset = (l2_len / 2) << ICE_TX_DESC_LEN_MACLEN_S; - if (skb->encapsulation) - return -1; + protocol = vlan_get_protocol(skb); + + if (protocol == htons(ETH_P_IP)) + first->tx_flags |= ICE_TX_FLAGS_IPV4; + else if (protocol == htons(ETH_P_IPV6)) + first->tx_flags |= ICE_TX_FLAGS_IPV6; + + if (skb->encapsulation) { + bool gso_ena = false; + u32 tunnel = 0; + + /* define outer network header type */ + if (first->tx_flags & ICE_TX_FLAGS_IPV4) { + tunnel |= (first->tx_flags & ICE_TX_FLAGS_TSO) ? 
+ ICE_TX_CTX_EIPT_IPV4 : + ICE_TX_CTX_EIPT_IPV4_NO_CSUM; + l4_proto = ip.v4->protocol; + } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) { + int ret; + + tunnel |= ICE_TX_CTX_EIPT_IPV6; + exthdr = ip.hdr + sizeof(*ip.v6); + l4_proto = ip.v6->nexthdr; + ret = ipv6_skip_exthdr(skb, exthdr - skb->data, + &l4_proto, &frag_off); + if (ret < 0) + return -1; + } + + /* define outer transport */ + switch (l4_proto) { + case IPPROTO_UDP: + tunnel |= ICE_TXD_CTX_UDP_TUNNELING; + first->tx_flags |= ICE_TX_FLAGS_TUNNEL; + break; + case IPPROTO_GRE: + tunnel |= ICE_TXD_CTX_GRE_TUNNELING; + first->tx_flags |= ICE_TX_FLAGS_TUNNEL; + break; + case IPPROTO_IPIP: + case IPPROTO_IPV6: + first->tx_flags |= ICE_TX_FLAGS_TUNNEL; + l4.hdr = skb_inner_network_header(skb); + break; + default: + if (first->tx_flags & ICE_TX_FLAGS_TSO) + return -1; + + skb_checksum_help(skb); + return 0; + } + +#ifdef ICE_ADD_PROBES + if (protocol == htons(ETH_P_IP)) + tx_ring->vsi->back->tx_ip4_cso++; +#endif + /* compute outer L3 header size */ + tunnel |= ((l4.hdr - ip.hdr) / 4) << + ICE_TXD_CTX_QW0_EIPLEN_S; + + /* switch IP header pointer from outer to inner header */ + ip.hdr = skb_inner_network_header(skb); + + /* compute tunnel header size */ + tunnel |= ((ip.hdr - l4.hdr) / 2) << + ICE_TXD_CTX_QW0_NATLEN_S; + +#ifdef NETIF_F_GSO_PARTIAL + gso_ena = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL; +#endif + /* indicate if we need to offload outer UDP header */ + if ((first->tx_flags & ICE_TX_FLAGS_TSO) && !gso_ena && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) + tunnel |= ICE_TXD_CTX_QW0_L4T_CS_M; + + /* record tunnel offload values */ + off->cd_tunnel_params |= tunnel; + + /* set DTYP=1 to indicate that it's a Tx context descriptor + * in IPsec tunnel mode with Tx offloads in Quad word 1 + */ + off->cd_qw1 |= (u64)ICE_TX_DESC_DTYPE_CTX; + + /* switch L4 header pointer from outer to inner */ + l4.hdr = skb_inner_transport_header(skb); + l4_proto = 0; + + /* reset type as we transition from outer to inner headers */ + first->tx_flags &= ~(ICE_TX_FLAGS_IPV4 | ICE_TX_FLAGS_IPV6); + if (ip.v4->version == 4) + first->tx_flags |= ICE_TX_FLAGS_IPV4; + if (ip.v6->version == 6) + first->tx_flags |= ICE_TX_FLAGS_IPV6; + } /* Enable IP checksum offloads */ - protocol = vlan_get_protocol(skb); - if (protocol == htons(ETH_P_IP)) { + if (first->tx_flags & ICE_TX_FLAGS_IPV4) { l4_proto = ip.v4->protocol; /* the stack computes the IP header already, the only time we * need the hardware to recompute it is in the case of TSO.
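+ * (with TSO the hardware rewrites tot_len and the IP checksum for
+ * every segment it generates, hence IIPT_IPV4_CSUM below; for a
+ * plain checksummed send, IIPT_IPV4 only identifies the header
+ * type to the hardware).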
*/ + +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_ip4_cso++; +#endif if (first->tx_flags & ICE_TX_FLAGS_TSO) cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM; else cmd |= ICE_TX_DESC_CMD_IIPT_IPV4; - } else if (protocol == htons(ETH_P_IPV6)) { + } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) { cmd |= ICE_TX_DESC_CMD_IIPT_IPV6; exthdr = ip.hdr + sizeof(*ip.v6); l4_proto = ip.v6->nexthdr; @@ -1789,6 +2717,9 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_proto, &frag_off); } else { +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_l3_cso_err++; +#endif return -1; } @@ -1803,21 +2734,33 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP; l4_len = l4.tcp->doff; offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_tcp_cso++; +#endif break; case IPPROTO_UDP: /* enable UDP checksum offload */ cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP; l4_len = (sizeof(struct udphdr) >> 2); offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_udp_cso++; +#endif break; case IPPROTO_SCTP: /* enable SCTP checksum offload */ cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP; l4_len = sizeof(struct sctphdr) >> 2; offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_sctp_cso++; +#endif break; default: +#ifdef ICE_ADD_PROBES + tx_ring->vsi->back->tx_l4_cso_err++; +#endif if (first->tx_flags & ICE_TX_FLAGS_TSO) return -1; skb_checksum_help(skb); @@ -1836,50 +2779,59 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) * * Checks the skb and set up correspondingly several generic transmit flags * related to VLAN tagging for the HW, such as VLAN, DCB, etc. - * - * Returns error code indicate the frame should be dropped upon error and the - * otherwise returns 0 to indicate the flags has been set properly. */ -static int +static void ice_tx_prepare_vlan_flags(struct ice_ring *tx_ring, struct ice_tx_buf *first) { struct sk_buff *skb = first->skb; - __be16 protocol = skb->protocol; - - if (protocol == htons(ETH_P_8021Q) && - !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { - /* when HW VLAN acceleration is turned off by the user the - * stack sets the protocol to 8021q so that the driver - * can take any steps required to support the SW only - * VLAN handling. In our case the driver doesn't need - * to take any further steps so just set the protocol - * to the encapsulated ethertype. 
- */ - skb->protocol = vlan_get_protocol(skb); - return 0; - } - /* if we have a HW VLAN tag being added, default to the HW one */ + /* nothing left to do, software offloaded VLAN */ + if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) + return; + + /* the VLAN ethertype/tpid is determined by VSI configuration and netdev + * feature flags; the driver only allows either 802.1Q or 802.1ad + * VLAN offloads exclusively, so we only care about the VLAN ID here + */ if (skb_vlan_tag_present(skb)) { first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S; - first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; - } else if (protocol == htons(ETH_P_8021Q)) { - struct vlan_hdr *vhdr, _vhdr; - - /* for SW VLAN, check the next protocol and store the tag */ - vhdr = (struct vlan_hdr *)skb_header_pointer(skb, ETH_HLEN, - sizeof(_vhdr), - &_vhdr); - if (!vhdr) - return -EINVAL; - - first->tx_flags |= ntohs(vhdr->h_vlan_TCI) << - ICE_TX_FLAGS_VLAN_S; - first->tx_flags |= ICE_TX_FLAGS_SW_VLAN; + if (tx_ring->flags & ICE_TX_FLAGS_VLAN_TAG_LOC_L2TAG2) + first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN; + else + first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; + +#ifdef ICE_ADD_PROBES + if (tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX) + tx_ring->vsi->back->tx_q_vlano++; + else + tx_ring->vsi->back->tx_ad_vlano++; +#endif } - return ice_tx_prepare_vlan_flags_dcb(tx_ring, first); + ice_tx_prepare_vlan_flags_dcb(tx_ring, first); +} + +#ifdef ICE_ADD_PROBES +/** + * ice_update_gso_cntr - update TSO/USO counter + * @tx_buf: Tx buffer with necessary data to update counter + */ +static void ice_update_gso_cntr(struct ice_tx_buf *tx_buf) +{ + struct sk_buff *skb = tx_buf->skb; + struct ice_netdev_priv *np = + netdev_priv(skb->dev); + +#ifdef NETIF_F_GSO_UDP_L4 + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + np->vsi->back->udp_segs += tx_buf->gso_segs; + else + np->vsi->back->tcp_segs += tx_buf->gso_segs; +#else + np->vsi->back->tcp_segs += tx_buf->gso_segs; +#endif /* NETIF_F_GSO_UDP_L4 */ } +#endif /* ICE_ADD_PROBES */ /** * ice_tso - computes mss and TSO length to prepare for TSO @@ -1899,10 +2851,12 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) } ip; union { struct tcphdr *tcp; + struct udphdr *udp; unsigned char *hdr; } l4; u64 cd_mss, cd_tso_len; - u32 paylen, l4_start; + u32 paylen; + u8 l4_start; int err; if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -1915,7 +2869,6 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) if (err < 0) return err; - /* cppcheck-suppress unreadVariable */ ip.hdr = skb_network_header(skb); l4.hdr = skb_transport_header(skb); @@ -1927,15 +2880,74 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) ip.v6->payload_len = 0; } + if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | +#ifdef NETIF_F_GSO_PARTIAL + SKB_GSO_GRE_CSUM | +#endif +#ifdef NETIF_F_GSO_IPXIP4 + SKB_GSO_IPXIP4 | + SKB_GSO_IPXIP6 | +#else +#ifdef NETIF_F_GSO_IPIP + SKB_GSO_IPIP | + SKB_GSO_SIT | +#endif +#endif /* NETIF_F_GSO_IPXIP4 */ + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM)) { +#ifndef NETIF_F_GSO_PARTIAL + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) { +#else + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) { +#endif + l4.udp->len = 0; + + /* determine offset of outer transport header */ + l4_start = (u8)(l4.hdr - skb->data); + + /* remove payload length from outer checksum */ + paylen = skb->len - l4_start; + csum_replace_by_diff(&l4.udp->check,
+ (__force __wsum)htonl(paylen)); + } + + /* reset pointers to inner headers */ + + ip.hdr = skb_inner_network_header(skb); + l4.hdr = skb_inner_transport_header(skb); + + /* initialize inner IP header fields */ + if (ip.v4->version == 4) { + ip.v4->tot_len = 0; + ip.v4->check = 0; + } else { + ip.v6->payload_len = 0; + } + } + /* determine offset of transport header */ - l4_start = l4.hdr - skb->data; + l4_start = (u8)(l4.hdr - skb->data); /* remove payload length from checksum */ paylen = skb->len - l4_start; - csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen)); - /* compute length of segmentation header */ - off->header_len = (l4.tcp->doff * 4) + l4_start; +#ifdef NETIF_F_GSO_UDP_L4 + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { + csum_replace_by_diff(&l4.udp->check, + (__force __wsum)htonl(paylen)); + /* compute length of UDP segmentation header */ + off->header_len = (u8)sizeof(struct udphdr) + l4_start; + } else { + csum_replace_by_diff(&l4.tcp->check, + (__force __wsum)htonl(paylen)); + /* compute length of TCP segmentation header */ + off->header_len = (u8)((l4.tcp->doff * 4) + l4_start); + } +#else + csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen)); + off->header_len = (u8)((l4.tcp->doff * 4) + l4_start); +#endif /* NETIF_F_GSO_UDP_L4 */ /* update gso_segs and bytecount */ first->gso_segs = skb_shinfo(skb)->gso_segs; @@ -1950,6 +2962,9 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) (cd_tso_len << ICE_TXD_CTX_QW1_TSO_LEN_S) | (cd_mss << ICE_TXD_CTX_QW1_MSS_S)); first->tx_flags |= ICE_TX_FLAGS_TSO; +#ifdef ICE_ADD_PROBES + ice_update_gso_cntr(first); +#endif return 1; } @@ -2040,7 +3055,7 @@ static bool __ice_chk_linearize(struct sk_buff *skb) frag = &skb_shinfo(skb)->frags[0]; /* Initialize size to the negative value of gso_size minus 1. We - * use this as the worst case scenerio in which the frag ahead + * use this as the worst case scenario in which the frag ahead * of us only provides one byte which is why we are limited to 6 * descriptors for a single transmit as the header and previous * fragment are already consuming 2 descriptors. @@ -2057,10 +3072,30 @@ static bool __ice_chk_linearize(struct sk_buff *skb) /* Walk through fragments adding latest fragment, testing it, and * then removing stale fragments from the sum. */ - stale = &skb_shinfo(skb)->frags[0]; - for (;;) { + for (stale = &skb_shinfo(skb)->frags[0];; stale++) { + int stale_size = skb_frag_size(stale); + sum += skb_frag_size(frag++); + /* The stale fragment may present us with a smaller + * descriptor than the actual fragment size. To account + * for that we need to remove all the data on the front and + * figure out what the remainder would be in the last + * descriptor associated with the fragment.
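+ * For example (hypothetical sizes): a fragment larger than
+ * ICE_MAX_DATA_PER_TXD is emitted as one or more aligned
+ * ICE_MAX_DATA_PER_TXD_ALIGNED chunks plus a remainder, so only
+ * that remainder may be credited back to the running sum when
+ * the fragment goes stale.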
+ */ + if (stale_size > ICE_MAX_DATA_PER_TXD) { + int align_pad = -(skb_frag_off(stale)) & + (ICE_MAX_READ_REQ_SIZE - 1); + + sum -= align_pad; + stale_size -= align_pad; + + do { + sum -= ICE_MAX_DATA_PER_TXD_ALIGNED; + stale_size -= ICE_MAX_DATA_PER_TXD_ALIGNED; + } while (stale_size > ICE_MAX_DATA_PER_TXD); + } + + /* if sum is negative we failed to make sufficient progress */ if (sum < 0) return true; @@ -2068,7 +3103,7 @@ static bool __ice_chk_linearize(struct sk_buff *skb) if (!nr_frags--) break; - sum -= skb_frag_size(stale++); + sum -= stale_size; } return false; @@ -2096,6 +3131,317 @@ static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count) return count != ICE_MAX_BUF_TXD; } +/** + * ice_get_queue_based_on_mark - determine the Tx queue based on mark value + * @vsi: pointer to VSI + * @mark: mark value (skb->mark) + * @queue: return the Tx queue number + * + * Based on the mark value (which comes from skb->mark as a result of the + * SO_MARK socket option), determine the Tx queue, which is used to align the + * flow to a HW queue. + */ +static bool ice_get_queue_based_on_mark(struct ice_vsi *vsi, u32 mark, + u16 *queue) +{ + int v_idx; + + ice_for_each_q_vector(vsi, v_idx) { + struct ice_q_vector *q_vector = vsi->q_vectors[v_idx]; + struct ice_ring *tx_ring; + + if (!q_vector) + continue; + if (q_vector->napi.napi_id != mark) + continue; + + /* We have now located the matching q_vector:napi_struct + * based on the mark (used as the napi_id) + */ + + /* for now use first tx_ring:q_index */ + ice_for_each_ring(tx_ring, q_vector->tx) { + *queue = tx_ring->q_index; + return true; + } + } + return false; +} + +/** + * ice_chnl_inline_fd - Add a Flow director ATR filter + * @tx_ring: ring to add programming descriptor + * @skb: send buffer + * @tx_flags: Tx flags + */ +static void ice_chnl_inline_fd(struct ice_ring *tx_ring, struct sk_buff *skb, + u32 tx_flags) +{ + struct ice_q_vector *qv = tx_ring->q_vector; + struct ice_fd_fltr_desc_ctx fd_ctx = { 0 }; + struct ice_channel *ch = tx_ring->ch; + struct ice_fltr_desc *fdir_desc; + union { + unsigned char *network; + struct iphdr *ipv4; + } hdr; + struct tcphdr *th; + unsigned int hlen; + u16 q_index = 0; + u16 i, vsi_num; + u8 l4_proto; + + /* Currently only IPv4/IPv6 with TCP is supported */ + if (!(tx_flags & (ICE_TX_FLAGS_IPV4 | ICE_TX_FLAGS_IPV6))) + return; + + /* make sure channel VSI is valid and vector is channel enabled */ + if (!ch->ch_vsi || !qv->ch) + return; + + /* do not support inline-FD usage for queues which are + * not in range of channel's queue region. + */ + if (tx_ring->q_index < ch->base_q) + return; + + /* make sure channel VSI is FD capable and enabled for + * inline flow-director usage + */ + if (!ice_vsi_fd_ena(ch->ch_vsi) || + !ice_vsi_inline_fd_ena(ch->ch_vsi)) + return; + + /* snag network header to get L4 type and address */ + hdr.network = (tx_flags & ICE_TX_FLAGS_TUNNEL) ?
+ skb_inner_network_header(skb) : skb_network_header(skb); + + if (tx_flags & ICE_TX_FLAGS_IPV4) { + /* access ihl as u8 to avoid unaligned access on ia64 */ + hlen = (hdr.network[0] & 0x0F) << 2; + hdr.ipv4 = ip_hdr(skb); + l4_proto = hdr.ipv4->protocol; + } else if (tx_flags & ICE_TX_FLAGS_IPV6) { + /* find the start of the innermost ipv6 header */ + unsigned int inner_hlen = hdr.network - skb->data; + unsigned int h_offset = inner_hlen; + + /* this function updates h_offset to the end of the header */ + l4_proto = ipv6_find_hdr(skb, &h_offset, IPPROTO_TCP, NULL, + NULL); + hlen = h_offset - inner_hlen; + } else { + return; /* Unsupported protocol */ + } + + /* Currently ATR is supported only for TCP */ + if (l4_proto != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(hdr.network + hlen); + + if (ice_vsi_inline_fd_mark_ena(ch->ch_vsi)) { + /* proceed only for MARK, SYN, SYN+ACK, RST, FIN packets */ + if (!skb->mark && !th->syn && !th->rst && !th->fin) + return; + } else { + /* proceed only for SYN, SYN+ACK, RST, FIN packets */ + if (!th->syn && !th->rst && !th->fin) + return; + } + + /* update queue as needed using channel's base_q; this queue number + * gets programmed in the filter descriptor while adding the + * inline-FD entry + */ + if (skb->mark && ice_vsi_inline_fd_mark_ena(ch->ch_vsi)) { +#ifdef HAVE_MIN_NAPI_ID + if (skb->mark < MIN_NAPI_ID) + return; +#endif /* HAVE_MIN_NAPI_ID */ + + if (!skb->sk) + return; + + /* skb->mark is part of union {mark, reserved_tailroom}. + * Hence an explicit check (to avoid a false positive) to make + * sure skb->mark is the same as sk->sk_mark. + */ + if (skb->mark != skb->sk->sk_mark) + return; + + /* if the current vector/queue is already aligned, as indicated + * by skb->mark (napi_id), no action is needed. + */ + if (skb->mark == qv->napi.napi_id) + return; + + /* Unsupported config for now */ + if (qv->num_ring_tx > 1) + return; + /* now locate the ring/queue based on skb->mark as napi_id */ + if (!ice_get_queue_based_on_mark(qv->vsi, skb->mark, &q_index)) + return; + + /* all checks are passed, proceed with inline-FD programming */ + q_index -= ch->base_q; + } else if (th->ack || th->fin || th->rst) { + /* server side connection setup || connection_termination */ + q_index = tx_ring->q_index - ch->base_q; + } else if (th->syn) { + /* just SYN, client side connection establishment. + * since the channel's num_txq and num_rxq have to be the same, + * using either num_rxq or num_txq is OK, but from a readability + * perspective, 'num_txq' is used since this is the transmit flow + */ + q_index = (atomic_inc_return(&ch->fd_queue) - 1) % ch->num_txq; + } else { + /* don't proceed */ + return; + } + + /* use channel specific HW VSI number */ + vsi_num = ch->ch_vsi->vsi_num; + + if (th->syn && th->ack) { + /* server side connection establishment, hence SYN+ACK. + * proceed only when filter type for channel is of type dest + * port or src+dest port.
This is to handle the server (target) + * side use case, where the server side filter is either + * based on dest port or src+dest port + */ + if (!(ch->fltr_type == ICE_CHNL_FLTR_TYPE_DEST_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_TENANT_ID)) + return; + + if (atomic_dec_if_positive(&qv->inline_fd_cnt) < 0) { + /* bailout */ +#ifdef ADQ_PERF_COUNTERS + tx_ring->ch_q_stats.tx.num_atr_bailouts++; +#endif /* ADQ_PERF_COUNTERS */ + return; + } +#ifdef ADQ_PERF_COUNTERS + tx_ring->ch_q_stats.tx.num_atr_setup++; +#endif /* ADQ_PERF_COUNTERS */ + } else if (th->syn) { +#ifdef ADQ_PERF_COUNTERS + struct ice_ring *ch_tx_ring; +#endif /* ADQ_PERF_COUNTERS */ + /* client side doing active connect, hence SYN. + * proceed only when filter type for channel is of type src + * port or src+dest port. This is to handle the client + * (initiator) side, where the filter type would be either + * based on src port or src+dest port. + */ + if (!(ch->fltr_type == ICE_CHNL_FLTR_TYPE_SRC_PORT || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_TENANT_ID || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_SRC_DEST_PORT)) + return; + +#ifdef ADQ_PERF_COUNTERS + ch_tx_ring = qv->vsi->tx_rings[q_index + ch->base_q]; + if (ch_tx_ring) + ch_tx_ring->ch_q_stats.tx.num_atr_setup++; +#endif /* ADQ_PERF_COUNTERS */ + } else if (th->fin || th->rst) { +#ifdef ADQ_PERF_COUNTERS + tx_ring->ch_q_stats.tx.num_atr_evict++; +#endif /* ADQ_PERF_COUNTERS */ + } else { + /* This case is due to skb->mark; it was already validated + * earlier, no need to check it again + */ + + /* filter type must be valid, SO_MARK based FD programming + * is agnostic to client/server type connection, hence + * not checking specific type of filter + */ + if (ch->fltr_type == ICE_CHNL_FLTR_TYPE_INVALID || + ch->fltr_type == ICE_CHNL_FLTR_TYPE_LAST) + return; +#ifdef ADQ_PERF_COUNTERS + struct ice_ring *ch_tx_ring; + + ch_tx_ring = qv->vsi->tx_rings[q_index + ch->base_q]; + if (ch_tx_ring) + ch_tx_ring->ch_q_stats.tx.num_mark_atr_setup++; +#endif /* ADQ_PERF_COUNTERS */ + } + + /* grab the next descriptor */ + i = tx_ring->next_to_use; + fdir_desc = ICE_TX_FDIRDESC(tx_ring, i); + + i++; + tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; + + ice_set_dflt_val_fd_desc(&fd_ctx); + + + /* set report completion to NONE, meaning the flow-director + * programming status won't be reported to SW. + */ + fd_ctx.comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_NONE; + + /* Do not want auto-eviction of filter due to FIN/RST, eviction + * is managed by SW, to avoid possible problems with TCP half-close + * OR TCP simultaneous close from both sides. + */ + fd_ctx.evict_ena = ICE_FXD_FLTR_QW0_EVICT_ENA_FALSE; + fd_ctx.qindex = q_index; + fd_ctx.cnt_index = tx_ring->ch_inline_fd_cnt_index; + fd_ctx.cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; + fd_ctx.pcmd = (th->fin || th->rst) ?
+ ICE_FXD_FLTR_QW1_PCMD_REMOVE : + ICE_FXD_FLTR_QW1_PCMD_ADD; + fd_ctx.fd_vsi = vsi_num; + ice_set_fd_desc_val(&fd_ctx, fdir_desc); +} + +/** + * ice_tsyn - set up the tsyn context descriptor + * @tx_ring: ptr to the ring to send on + * @skb: ptr to the skb we're sending + * @first: Tx buffer + * @off: Tx offload parameters (cd_qw1 is updated here) + * @ptp_idx: ptp index to be filled in + * + * Returns NETDEV_TX_BUSY if no timestamp index is available, else NETDEV_TX_OK + */ +static netdev_tx_t +ice_tsyn(struct ice_ring *tx_ring, struct sk_buff *skb, + struct ice_tx_buf *first, + struct ice_tx_offload_params *off, int *ptp_idx) +{ + struct ice_vsi *vsi = tx_ring->vsi; + int idx; + + if (!vsi->ptp_tx) + return NETDEV_TX_BUSY; + + /* Tx timestamps cannot be sampled when doing TSO */ + if (first->tx_flags & ICE_TX_FLAGS_TSO) + return NETDEV_TX_BUSY; + + idx = ice_ptp_get_ts_idx(vsi); + if (idx >= 0) { + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + vsi->ptp_tx_skb[idx] = skb_get(skb); + *ptp_idx = idx; + } else { + vsi->tx_hwtstamp_skipped++; + return NETDEV_TX_BUSY; + } + + off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | + (ICE_TX_CTX_DESC_TSYN << ICE_TXD_CTX_QW1_CMD_S) | + ((u64)idx << ICE_TXD_CTX_QW1_TSO_LEN_S)); + first->tx_flags |= ICE_TX_FLAGS_TSYN; + + return NETDEV_TX_OK; +} + /** * ice_xmit_frame_ring - Sends buffer on Tx ring * @skb: send buffer @@ -2109,8 +3455,13 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) struct ice_tx_offload_params offload = { 0 }; struct ice_vsi *vsi = tx_ring->vsi; struct ice_tx_buf *first; + struct ethhdr *eth; unsigned int count; + bool tsyn = true; int tso, csum; + int idx = -1; + + ice_trace(xmit_frame_ring, tx_ring, skb); count = ice_xmit_desc_count(skb); if (ice_chk_linearize(skb, count)) { @@ -2142,8 +3493,14 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) first->tx_flags = 0; /* prepare the VLAN tagging flags for Tx */ - if (ice_tx_prepare_vlan_flags(tx_ring, first)) - goto out_drop; + ice_tx_prepare_vlan_flags(tx_ring, first); + if (first->tx_flags & ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN) { + offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | + (ICE_TX_CTX_DESC_IL2TAG2 << + ICE_TXD_CTX_QW1_CMD_S)); + offload.cd_l2tag2 = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >> + ICE_TX_FLAGS_VLAN_S; + } /* set up TSO offload */ tso = ice_tso(first, &offload); @@ -2156,16 +3513,34 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) goto out_drop; /* allow CONTROL frames egress from main VSI if FW LLDP disabled */ - if (unlikely(skb->priority == TC_PRIO_CONTROL && + eth = (struct ethhdr *)skb_mac_header(skb); + if (unlikely((skb->priority == TC_PRIO_CONTROL || + eth->h_proto == htons(ETH_P_LLDP)) && + (!(tx_ring->ch && tx_ring->ch->ch_vsi)) && vsi->type == ICE_VSI_PF && - vsi->port_info->is_sw_lldp)) + vsi->port_info->qos_cfg.is_sw_lldp)) offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | ICE_TX_CTX_DESC_SWTCH_UPLINK << ICE_TXD_CTX_QW1_CMD_S); + /* only timestamp the outbound packet if the user has requested it */ +#ifdef SKB_SHARED_TX_IS_UNION + if (likely(!(skb_tx(skb)->hardware))) +#else + if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) +#endif /* SKB_SHARED_TX_IS_UNION */ + tsyn = false; + + if (tsyn && + ice_tsyn(tx_ring, skb, first, &offload, &idx) == NETDEV_TX_BUSY) + goto out_ptp_drop; +#if IS_ENABLED(CONFIG_NET_DEVLINK) + if (ice_is_switchdev_running(vsi->back)) + ice_eswitch_set_target_vsi(skb, &offload); +#endif /* CONFIG_NET_DEVLINK */ if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) { struct ice_tx_ctx_desc *cdesc; - int i = tx_ring->next_to_use; + u16
i = tx_ring->next_to_use; /* grab the next descriptor */ cdesc = ICE_TX_CTX_DESC(tx_ring, i); @@ -2179,12 +3554,21 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) cdesc->qw1 = cpu_to_le64(offload.cd_qw1); } + + if (ice_ring_ch_enabled(tx_ring)) + ice_chnl_inline_fd(tx_ring, skb, first->tx_flags); + + first->ptp_ts_idx = idx; + ice_tx_map(tx_ring, first, &offload); return NETDEV_TX_OK; out_drop: + ice_trace(xmit_frame_ring_drop, tx_ring, skb); dev_kfree_skb_any(skb); return NETDEV_TX_OK; +out_ptp_drop: + return NETDEV_TX_BUSY; } /** @@ -2210,3 +3594,86 @@ netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev) return ice_xmit_frame_ring(skb, tx_ring); } + +/** + * ice_clean_ctrl_tx_irq - interrupt handler for flow director Tx queue + * @tx_ring: tx_ring to clean + */ +void ice_clean_ctrl_tx_irq(struct ice_ring *tx_ring) +{ + struct ice_vsi *vsi = tx_ring->vsi; + s16 i = tx_ring->next_to_clean; + int budget = ICE_DFLT_IRQ_WORK; + struct ice_tx_desc *tx_desc; + struct ice_tx_buf *tx_buf; + + tx_buf = &tx_ring->tx_buf[i]; + tx_desc = ICE_TX_DESC(tx_ring, i); + i -= tx_ring->count; + + do { + struct ice_tx_desc *eop_desc = tx_buf->next_to_watch; + + /* if next_to_watch is not set then there is no pending work */ + if (!eop_desc) + break; + + /* prevent any other reads prior to eop_desc */ + smp_rmb(); + + /* if the descriptor isn't done, no work to do */ + if (!(eop_desc->cmd_type_offset_bsz & + cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) + break; + + /* clear next_to_watch to prevent false hangs */ + tx_buf->next_to_watch = NULL; + tx_desc->buf_addr = 0; + tx_desc->cmd_type_offset_bsz = 0; + + /* move past filter desc */ + tx_buf++; + tx_desc++; + i++; + if (unlikely(!i)) { + i -= tx_ring->count; + tx_buf = tx_ring->tx_buf; + tx_desc = ICE_TX_DESC(tx_ring, 0); + } + + /* unmap the data header */ + if (dma_unmap_len(tx_buf, len)) + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), + DMA_TO_DEVICE); + if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) + devm_kfree(tx_ring->dev, tx_buf->raw_buf); + + /* clear next_to_watch to prevent false hangs */ + tx_buf->raw_buf = NULL; + tx_buf->tx_flags = 0; + tx_buf->next_to_watch = NULL; + dma_unmap_len_set(tx_buf, len, 0); + tx_desc->buf_addr = 0; + tx_desc->cmd_type_offset_bsz = 0; + + /* move past eop_desc for start of next FD desc */ + tx_buf++; + tx_desc++; + i++; + if (unlikely(!i)) { + i -= tx_ring->count; + tx_buf = tx_ring->tx_buf; + tx_desc = ICE_TX_DESC(tx_ring, 0); + } + + budget--; + } while (likely(budget)); + + i += tx_ring->count; + tx_ring->next_to_clean = i; + + /* re-enable interrupt if needed */ + ice_irq_dynamic_ena(&vsi->back->hw, vsi, vsi->q_vectors[0]); +} diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index 94a9280193e29608a4e96402a99530ac97f38322..054982dbf1c32455eb1bd7ad36c2e96f475acd34 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -1,11 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #ifndef _ICE_TXRX_H_ #define _ICE_TXRX_H_ +#include "ice_type.h" + #define ICE_DFLT_IRQ_WORK 256 +#define ICE_RXBUF_3072 3072 #define ICE_RXBUF_2048 2048 +#define ICE_RXBUF_1536 1536 #define ICE_MAX_CHAINED_RX_BUFS 5 #define ICE_MAX_BUF_TXD 8 #define ICE_MIN_TX_LEN 17 @@ -22,6 +26,72 @@ #define ICE_RX_BUF_WRITE 16 /* Must be power of 2 */ #define ICE_MAX_TXQ_PER_TXQG 128 +/* Attempt to maximize the headroom available for incoming frames. We use a 2K + * buffer for MTUs <= 1500 and need 1536/1534 to store the data for the frame. + * This leaves us with 512 bytes of room. From that we need to deduct the + * space needed for the shared info and the padding needed to IP align the + * frame. + * + * Note: For cache line sizes 256 or larger this value is going to end + * up negative. In these cases we should fall back to the legacy + * receive path. + */ +#if (PAGE_SIZE < 8192) +#define ICE_2K_TOO_SMALL_WITH_PADDING \ + ((unsigned int)(NET_SKB_PAD + ICE_RXBUF_1536) > \ + SKB_WITH_OVERHEAD(ICE_RXBUF_2048)) + +/** + * ice_compute_pad - compute the padding + * @rx_buf_len: buffer length + * + * Figure out the size of half page based on given buffer length and + * then subtract the skb_shared_info followed by subtraction of the + * actual buffer length; this in turn results in the actual space that + * is left for padding usage + */ +static inline int ice_compute_pad(int rx_buf_len) +{ + int half_page_size; + + half_page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2); + return SKB_WITH_OVERHEAD(half_page_size) - rx_buf_len; +} + +/** + * ice_skb_pad - determine the padding that we can supply + * + * Figure out the right Rx buffer size and based on that calculate the + * padding + */ +static inline int ice_skb_pad(void) +{ + int rx_buf_len; + + /* If a 2K buffer cannot handle a standard Ethernet frame then + * optimize padding for a 3K buffer instead of a 1.5K buffer. + * + * For a 3K buffer we need to add enough padding to allow for + * tailroom due to NET_IP_ALIGN possibly shifting us out of + * cache-line alignment. + */ + if (ICE_2K_TOO_SMALL_WITH_PADDING) + rx_buf_len = ICE_RXBUF_3072 + SKB_DATA_ALIGN(NET_IP_ALIGN); + else + rx_buf_len = ICE_RXBUF_1536; + + /* if needed make room for NET_IP_ALIGN */ + rx_buf_len -= NET_IP_ALIGN; + + return ice_compute_pad(rx_buf_len); +} + +#define ICE_SKB_PAD ice_skb_pad() +#else +#define ICE_2K_TOO_SMALL_WITH_PADDING false +#define ICE_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) +#endif + /* We are assuming that the cache line is always 64 Bytes here for ice. * In order to make sure that is a correct assumption there is a check in probe * to print a warning if the read from GLPCI_CNF2 tells us that the cache line @@ -38,28 +108,53 @@ #define DESC_NEEDED (MAX_SKB_FRAGS + ICE_DESCS_FOR_CTX_DESC + \ ICE_DESCS_PER_CACHE_LINE + ICE_DESCS_FOR_SKB_DATA_PTR) #define ICE_DESC_UNUSED(R) \ - ((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ - (R)->next_to_clean - (R)->next_to_use - 1) + (u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ + (R)->next_to_clean - (R)->next_to_use - 1) #define ICE_TX_FLAGS_TSO BIT(0) #define ICE_TX_FLAGS_HW_VLAN BIT(1) #define ICE_TX_FLAGS_SW_VLAN BIT(2) +/* ICE_TX_FLAGS_DUMMY_PKT is used to mark dummy packets that should be + * freed instead of returned like skb packets. 
+ */ +#define ICE_TX_FLAGS_DUMMY_PKT BIT(3) + +#define ICE_TX_FLAGS_TSYN BIT(4) +#define ICE_TX_FLAGS_IPV4 BIT(5) +#define ICE_TX_FLAGS_IPV6 BIT(6) +#define ICE_TX_FLAGS_TUNNEL BIT(7) +#define ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN BIT(8) #define ICE_TX_FLAGS_VLAN_M 0xffff0000 #define ICE_TX_FLAGS_VLAN_PR_M 0xe0000000 #define ICE_TX_FLAGS_VLAN_PR_S 29 #define ICE_TX_FLAGS_VLAN_S 16 +#ifdef HAVE_XDP_SUPPORT + +#define ICE_XDP_PASS 0 +#define ICE_XDP_CONSUMED BIT(0) +#define ICE_XDP_TX BIT(1) +#define ICE_XDP_REDIR BIT(2) +#endif /* HAVE_XDP_SUPPORT */ #define ICE_RX_DMA_ATTR \ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +#define ICE_ETH_PKT_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) + +#define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS) + struct ice_tx_buf { struct ice_tx_desc *next_to_watch; - struct sk_buff *skb; + union { + struct sk_buff *skb; + void *raw_buf; /* used for XDP */ + }; unsigned int bytecount; unsigned short gso_segs; u32 tx_flags; DEFINE_DMA_UNMAP_LEN(len); DEFINE_DMA_UNMAP_ADDR(dma); + int ptp_ts_idx; }; struct ice_tx_offload_params { @@ -76,14 +171,66 @@ struct ice_tx_offload_params { struct ice_rx_buf { struct sk_buff *skb; dma_addr_t dma; - struct page *page; - unsigned int page_offset; - u16 pagecnt_bias; + union { + struct { + struct page *page; + unsigned int page_offset; + u16 pagecnt_bias; + }; +#ifdef HAVE_AF_XDP_ZC_SUPPORT + struct { + union { + struct xdp_buff *xdp; + void *addr; + }; +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + u64 handle; +#endif + }; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + }; +}; + +#ifdef ADQ_PERF_COUNTERS +struct ice_ch_tx_q_stats { + u64 num_atr_setup; /* How many times, ATR is setup */ + u64 num_mark_atr_setup; /* How many times, ATR is setup via skb->mark */ + u64 num_atr_evict; /* How many times, ATR is teardown */ + u64 num_atr_bailouts; /* How many times, bailout from ATR setup */ +}; + +struct ice_ch_rx_q_stats { + u64 num_rx_queue_set; /* times, Rx queue set for ATR */ + u64 num_rx_queue_bailouts; /* times bailed out from ATR setup */ + u64 num_tcp_ctrl_pkts; + u64 num_only_ctrl_pkts; + u64 num_no_data_pkt_bp; + u64 num_tcp_flags_fin; + u64 num_tcp_flags_rst; + u64 num_tcp_flags_syn; +}; + +struct ice_ch_q_poll_stats { + /* general packet counters for busy_poll versus napi_poll */ + u64 bp_packets; + u64 np_packets; }; +struct ice_ch_q_stats { + struct ice_ch_q_poll_stats poll; + union { + struct ice_ch_tx_q_stats tx; + struct ice_ch_rx_q_stats rx; + }; +}; +#endif /* ADQ_PERF_COUNTERS */ + struct ice_q_stats { u64 pkts; u64 bytes; +#ifdef ICE_ADD_PROBES + u64 napi_poll_cnt; +#endif /* ICE_ADD_PROBES */ }; struct ice_txq_stats { @@ -97,7 +244,14 @@ struct ice_rxq_stats { u64 non_eop_descs; u64 alloc_page_failed; u64 alloc_buf_failed; - u64 page_reuse_count; +#ifdef ICE_ADD_PROBES + u64 page_reuse; +#endif /* ICE_ADD_PROBES */ +}; + +enum ice_ring_state_t { + ICE_TX_XPS_INIT_DONE, + ICE_TX_NBITS, }; /* this enum matches hardware bits and is meant to be used by DYN_CTLN @@ -124,27 +278,23 @@ enum ice_rx_dtype { #define ICE_TX_ITR ICE_IDX_ITR1 #define ICE_ITR_8K 124 #define ICE_ITR_20K 50 -#define ICE_ITR_MAX 8160 -#define ICE_DFLT_TX_ITR (ICE_ITR_20K | ICE_ITR_DYNAMIC) -#define ICE_DFLT_RX_ITR (ICE_ITR_20K | ICE_ITR_DYNAMIC) -#define ICE_ITR_DYNAMIC 0x8000 /* used as flag for itr_setting */ -#define ITR_IS_DYNAMIC(setting) (!!((setting) & ICE_ITR_DYNAMIC)) -#define ITR_TO_REG(setting) ((setting) & ~ICE_ITR_DYNAMIC) +#define ICE_ITR_MAX 8160 /* 0x1FE0 */ +#define ICE_DFLT_TX_ITR ICE_ITR_20K +#define 
ICE_DFLT_RX_ITR ICE_ITR_20K +enum ice_dynamic_itr { + ITR_STATIC = 0, + ITR_DYNAMIC = 1 +}; + +#define ITR_IS_DYNAMIC(rc) ((rc)->itr_mode == ITR_DYNAMIC) #define ICE_ITR_GRAN_S 1 /* ITR granularity is always 2us */ #define ICE_ITR_GRAN_US BIT(ICE_ITR_GRAN_S) #define ICE_ITR_MASK 0x1FFE /* ITR register value alignment mask */ -#define ITR_REG_ALIGN(setting) __ALIGN_MASK(setting, ~ICE_ITR_MASK) - -#define ICE_ITR_ADAPTIVE_MIN_INC 0x0002 -#define ICE_ITR_ADAPTIVE_MIN_USECS 0x0002 -#define ICE_ITR_ADAPTIVE_MAX_USECS 0x00FA -#define ICE_ITR_ADAPTIVE_LATENCY 0x8000 -#define ICE_ITR_ADAPTIVE_BULK 0x0000 +#define ITR_REG_ALIGN(setting) ((setting) & ICE_ITR_MASK) #define ICE_DFLT_INTRL 0 #define ICE_MAX_INTRL 236 -#define ICE_WB_ON_ITR_USECS 2 #define ICE_IN_WB_ON_ITR_MODE 255 /* Sets WB_ON_ITR and assumes INTENA bit is already cleared, which allows * setting the MSK_M bit to tell hardware to ignore the INTENA_M bit. Also, @@ -161,6 +311,22 @@ enum ice_rx_dtype { #define ICE_TX_ADVANCED 0 #define ICE_TX_LEGACY 1 +#ifdef HAVE_XDP_SUPPORT +#ifdef ICE_ADD_PROBES +struct ice_xdp_stats { + u64 xdp_rx_pkts; + u64 xdp_rx_bytes; + u64 xdp_pass; + u64 xdp_drop; + u64 xdp_tx; + u64 xdp_tx_fail; + u64 xdp_unknown; + u64 xdp_redirect; + u64 xdp_redirect_fail; +}; +#endif +#endif + /* descriptor ring, associated with a VSI */ struct ice_ring { /* CL1 - 1st cacheline starts here */ @@ -179,15 +345,21 @@ struct ice_ring { u16 q_index; /* Queue number of ring */ u16 q_handle; /* Queue handle per TC */ - u8 ring_active:1; /* is ring online or not */ - u16 count; /* Number of descriptors */ u16 reg_idx; /* HW register index of the ring */ /* used in interrupt processing */ u16 next_to_use; u16 next_to_clean; - u16 next_to_alloc; + union { + u16 next_to_alloc; + u16 next_rs_idx; + }; +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_AF_XDP_ZC_SUPPORT + u16 xdp_tx_active; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#endif /* HAVE_XDP_SUPPORT */ /* stats structs */ struct ice_q_stats stats; @@ -198,38 +370,119 @@ struct ice_ring { }; struct rcu_head rcu; /* to avoid race on free */ + DECLARE_BITMAP(xps_state, ICE_TX_NBITS); /* XPS Config State */ + struct ice_channel *ch; +#ifdef HAVE_XDP_SUPPORT + struct bpf_prog *xdp_prog; +#ifdef ICE_ADD_PROBES + struct ice_xdp_stats xdp_stats; +#endif +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifdef HAVE_NETDEV_BPF_XSK_POOL + struct xsk_buff_pool *xsk_pool; +#else + struct xdp_umem *xsk_pool; +#endif +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + struct zero_copy_allocator zca; +#endif +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ + /* CL3 - 3rd cacheline starts here */ +#ifdef HAVE_XDP_BUFF_RXQ + struct xdp_rxq_info xdp_rxq; +#endif /* HAVE_XDP_BUFF_RXQ */ +#endif /* HAVE_XDP_SUPPORT */ + /* CLX - the below items are only accessed infrequently and should be * in their own cache line if possible */ +#ifdef HAVE_XDP_SUPPORT +#define ICE_TX_FLAGS_RING_XDP BIT(0) +#endif /* HAVE_XDP_SUPPORT */ +#define ICE_RX_FLAGS_RING_BUILD_SKB BIT(1) +#define ICE_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(2) +#define ICE_TX_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(3) + u8 flags; dma_addr_t dma; /* physical address of ring */ unsigned int size; /* length of descriptor ring in bytes */ u32 txq_teid; /* Added Tx queue TEID */ u16 rx_buf_len; -#ifdef CONFIG_DCB + u8 rx_crc_strip_dis; u8 dcb_tc; /* Traffic class of ring */ -#endif /* CONFIG_DCB */ + u64 cached_systime; + u8 ptp_rx:1; + u32 ch_inline_fd_cnt_index; +#ifdef ADQ_PERF_COUNTERS + struct ice_ch_q_stats ch_q_stats; +#endif /* ADQ_PERF_COUNTERS */ } ____cacheline_internodealigned_in_smp; +static inline bool 
ice_ring_uses_build_skb(struct ice_ring *ring) +{ + return !!(ring->flags & ICE_RX_FLAGS_RING_BUILD_SKB); +} + +static inline void ice_set_ring_build_skb_ena(struct ice_ring *ring) +{ + ring->flags |= ICE_RX_FLAGS_RING_BUILD_SKB; +} + +static inline void ice_clear_ring_build_skb_ena(struct ice_ring *ring) +{ + ring->flags &= ~ICE_RX_FLAGS_RING_BUILD_SKB; +} + +static inline bool ice_ring_ch_enabled(struct ice_ring *ring) +{ + return !!(ring->ch); +} + +#ifdef HAVE_XDP_SUPPORT +static inline bool ice_ring_is_xdp(struct ice_ring *ring) +{ + return !!(ring->flags & ICE_TX_FLAGS_RING_XDP); +} +#endif /* HAVE_XDP_SUPPORT */ + struct ice_ring_container { /* head of linked-list of rings */ struct ice_ring *ring; - unsigned long next_update; /* jiffies value of next queue update */ - unsigned int total_bytes; /* total bytes processed this int */ - unsigned int total_pkts; /* total packets processed this int */ + struct dim dim; /* data for net_dim algorithm */ u16 itr_idx; /* index in the interrupt vector */ - u16 target_itr; /* value in usecs divided by the hw->itr_gran */ - u16 current_itr; /* value in usecs divided by the hw->itr_gran */ - /* high bit set means dynamic ITR, rest is used to store user - * readable ITR value in usecs and must be converted before programming - * to a register. + /* this matches the maximum number of ITR bits, but in usec + * values, so it is shifted left one bit (bit zero is ignored) */ - u16 itr_setting; + u16 itr_setting:13; + u16 itr_reserved:2; + u16 itr_mode:1; +}; + +struct ice_coalesce_stored { + u16 itr_tx; + u16 itr_rx; + u8 intrl; + u8 tx_valid; + u8 rx_valid; }; /* iterator for handling rings in ring container */ #define ice_for_each_ring(pos, head) \ for (pos = (head).ring; pos; pos = pos->next) +static inline unsigned int ice_rx_pg_order(struct ice_ring *ring) +{ +#if (PAGE_SIZE < 8192) + if (ring->rx_buf_len > (PAGE_SIZE / 2)) + return 1; +#endif + return 0; +} + +#define ice_rx_pg_size(_ring) (PAGE_SIZE << ice_rx_pg_order(_ring)) + + +union ice_32b_rx_flex_desc; + bool ice_alloc_rx_bufs(struct ice_ring *rxr, u16 cleaned_count); netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev); void ice_clean_tx_ring(struct ice_ring *tx_ring); @@ -239,5 +492,9 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring); void ice_free_tx_ring(struct ice_ring *tx_ring); void ice_free_rx_ring(struct ice_ring *rx_ring); int ice_napi_poll(struct napi_struct *napi, int budget); - +int +ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, + u8 *raw_packet); +int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget); +void ice_clean_ctrl_tx_irq(struct ice_ring *tx_ring); #endif /* _ICE_TXRX_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c new file mode 100644 index 0000000000000000000000000000000000000000..261a8635e8b641201e90bf3fd201ee6213c84c45 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_txrx_lib.h" +#include "ice_eswitch.h" + +/** + * ice_release_rx_desc - Store the new tail and head values + * @rx_ring: ring to bump + * @val: new head index + */ +void ice_release_rx_desc(struct ice_ring *rx_ring, u16 val) +{ + u16 prev_ntu = rx_ring->next_to_use & ~0x7; + + rx_ring->next_to_use = val; + + /* update next to alloc since we have filled the ring */ + rx_ring->next_to_alloc = val; + + /* QRX_TAIL will be updated with any tail value, but hardware ignores + * the lower 3 bits. This makes it so we only bump tail on meaningful + * boundaries. Also, this allows us to bump tail on intervals of 8 up to + * the budget depending on the current traffic load. + */ + val &= ~0x7; + if (prev_ntu != val) { + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). + */ + wmb(); + writel_relaxed(val, rx_ring->tail); + } +} + +/** + * ice_ptype_to_htype - get a hash type + * @ptype: the ptype value from the descriptor + * + * Returns appropriate hash type (such as PKT_HASH_TYPE_L2/L3/L4) to be used by + * skb_set_hash based on PTYPE as parsed by HW Rx pipeline and is part of + * Rx desc. + */ +static enum pkt_hash_types ice_ptype_to_htype(u16 ptype) +{ + struct ice_rx_ptype_decoded decoded = ice_decode_rx_desc_ptype(ptype); + + if (!decoded.known) + return PKT_HASH_TYPE_NONE; + if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4) + return PKT_HASH_TYPE_L4; + if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY3) + return PKT_HASH_TYPE_L3; + if (decoded.outer_ip == ICE_RX_PTYPE_OUTER_L2) + return PKT_HASH_TYPE_L2; + + return PKT_HASH_TYPE_NONE; +} + +/** + * ice_rx_hash - set the hash value in the skb + * @rx_ring: descriptor ring + * @rx_desc: specific descriptor + * @skb: pointer to current skb + * @rx_ptype: the ptype value from the descriptor + */ +static void +ice_rx_hash(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb, u16 rx_ptype) +{ + struct ice_32b_rx_flex_desc_nic *nic_mdid; + u32 hash; + + if (!(rx_ring->netdev->features & NETIF_F_RXHASH)) + return; + + if (rx_desc->wb.rxdid != ICE_RXDID_FLEX_NIC) + return; + + nic_mdid = (struct ice_32b_rx_flex_desc_nic *)rx_desc; + hash = le32_to_cpu(nic_mdid->rss_hash); + skb_set_hash(skb, hash, ice_ptype_to_htype(rx_ptype)); +} + +#ifdef ICE_ADD_PROBES +/** + * ice_rx_extra_counters - Update Rx csum counters + * @vsi: the VSI we care about + * @rx_status: status bits extracted from desc qword + * @inner_prot: inner protocol of decoded ptype + * @is_ipv4: if ptype is ipv4 or not + */ +static void +ice_rx_extra_counters(struct ice_vsi *vsi, u16 rx_status, u32 inner_prot, + bool is_ipv4) +{ + if (is_ipv4) { + vsi->back->rx_ip4_cso++; + if (rx_status & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | + BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S))) + vsi->back->rx_ip4_cso_err++; + } + + switch (inner_prot) { + case ICE_RX_PTYPE_INNER_PROT_TCP: + vsi->back->rx_tcp_cso++; + if (rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + vsi->back->rx_tcp_cso_err++; + break; + case ICE_RX_PTYPE_INNER_PROT_UDP: + vsi->back->rx_udp_cso++; + if (rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + vsi->back->rx_udp_cso_err++; + break; + case ICE_RX_PTYPE_INNER_PROT_SCTP: + vsi->back->rx_sctp_cso++; + if (rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + vsi->back->rx_sctp_cso_err++; + break; + default: + break; + } +} +#endif /* ICE_ADD_PROBES */ + +/** + * 
ice_rx_csum - Indicate in skb if checksum is good + * @ring: the ring we care about + * @skb: skb currently being received and modified + * @rx_desc: the receive descriptor + * @ptype: the packet type decoded by hardware + * + * skb->protocol must be set before this function is called + */ +static void +ice_rx_csum(struct ice_ring *ring, struct sk_buff *skb, + union ice_32b_rx_flex_desc *rx_desc, u16 ptype) +{ + struct ice_rx_ptype_decoded decoded; + u16 rx_status0, rx_status1; + bool ipv4, ipv6; + + rx_status0 = le16_to_cpu(rx_desc->wb.status_error0); + rx_status1 = le16_to_cpu(rx_desc->wb.status_error1); + + decoded = ice_decode_rx_desc_ptype(ptype); + + + /* Start with CHECKSUM_NONE and by default csum_level = 0 */ + skb->ip_summed = CHECKSUM_NONE; + skb_checksum_none_assert(skb); + + /* check if Rx checksum is enabled */ + if (!(ring->netdev->features & NETIF_F_RXCSUM)) + return; + + + /* check if HW has decoded the packet and checksum */ + if (!(rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) + return; + + if (!(decoded.known && decoded.outer_ip)) + return; + + ipv4 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && + (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4); + ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && + (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6); + +#ifdef ICE_ADD_PROBES + ice_rx_extra_counters(ring->vsi, rx_status0, decoded.inner_prot, ipv4); +#endif + + if (ipv4 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | + BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) + goto checksum_fail; + + if (ipv6 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S)))) + goto checksum_fail; + + /* check for L4 errors and handle packets that were not able to be + * checksummed due to arrival speed + */ + if (rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + goto checksum_fail; + + /* check for outer UDP checksum error in tunneled packets */ + if ((rx_status1 & BIT(ICE_RX_FLEX_DESC_STATUS1_NAT_S)) && + (rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S))) + goto checksum_fail; + + /* If there is an outer header present that might contain a checksum + * we need to bump the checksum level by 1 to reflect the fact that + * we are indicating we validated the inner checksum. + */ + if (decoded.tunnel_type >= ICE_RX_PTYPE_TUNNEL_IP_GRENAT) +#ifdef HAVE_SKBUFF_CSUM_LEVEL + skb->csum_level = 1; +#else + skb->encapsulation = 1; +#endif + + /* Only report checksum unnecessary for TCP, UDP, or SCTP */ + switch (decoded.inner_prot) { + case ICE_RX_PTYPE_INNER_PROT_TCP: + case ICE_RX_PTYPE_INNER_PROT_UDP: + case ICE_RX_PTYPE_INNER_PROT_SCTP: + skb->ip_summed = CHECKSUM_UNNECESSARY; + default: + break; + } + return; + +checksum_fail: + ring->vsi->back->hw_csum_rx_error++; +} + +/** + * ice_process_skb_fields - Populate skb header fields from Rx descriptor + * @rx_ring: Rx descriptor ring packet is being transacted on + * @rx_desc: pointer to the EOP Rx descriptor + * @skb: pointer to current skb being populated + * @ptype: the packet type decoded by hardware + * + * This function checks the ring, descriptor, and packet information in + * order to populate the hash, checksum, VLAN, protocol, and + * other fields within the skb. 
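+ *
+ * The hash and checksum come from the flex descriptor, the protocol from
+ * eth_type_trans(), and the Rx timestamp is taken only when the ring has
+ * ptp_rx set.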
+ */ +void +ice_process_skb_fields(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb, u16 ptype) +{ + ice_rx_hash(rx_ring, rx_desc, skb, ptype); + + /* modifies the skb - consumes the enet header */ +#if IS_ENABLED(CONFIG_NET_DEVLINK) + skb->protocol = eth_type_trans(skb, ice_eswitch_get_target_netdev + (rx_ring, rx_desc)); +#else + skb->protocol = eth_type_trans(skb, rx_ring->netdev); +#endif /* CONFIG_NET_DEVLINK */ + + ice_rx_csum(rx_ring, skb, rx_desc, ptype); + if (rx_ring->ptp_rx) + ice_ptp_rx_hwtstamp(rx_ring, rx_desc, skb); + +#ifdef HAVE_NETDEV_SB_DEV + if (!netif_is_ice(rx_ring->netdev)) + macvlan_count_rx((const struct macvlan_dev *)netdev_priv(rx_ring->netdev), + skb->len + ETH_HLEN, true, false); +#endif /* HAVE_NETDEV_SB_DEV */ +} + +/** + * ice_receive_skb - Send a completed packet up the stack + * @rx_ring: Rx ring in play + * @skb: packet to send up + * @vlan_tag: VLAN tag for packet + * + * This function sends the completed packet (via. skb) up the stack using + * gro receive functions (with/without VLAN tag) + */ +void +ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag) +{ + netdev_features_t features = rx_ring->netdev->features; + bool non_zero_vlan = vlan_tag & VLAN_VID_MASK; + +#ifdef ICE_ADD_PROBES + if ((features & NETIF_F_HW_VLAN_CTAG_RX) && non_zero_vlan) { + rx_ring->vsi->back->rx_q_vlano++; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + } else if ((features & NETIF_F_HW_VLAN_STAG_RX) && non_zero_vlan) { + rx_ring->vsi->back->rx_ad_vlano++; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag); + } +#else + if ((features & NETIF_F_HW_VLAN_CTAG_RX) && non_zero_vlan) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + else if ((features & NETIF_F_HW_VLAN_STAG_RX) && non_zero_vlan) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag); +#endif /* ICE_ADD_PROBES */ + + napi_gro_receive(&rx_ring->q_vector->napi, skb); +} + +#ifdef HAVE_XDP_SUPPORT +/** + * ice_xmit_xdp_ring - submit single packet to XDP ring for transmission + * @data: packet data pointer + * @size: packet data size + * @xdp_ring: XDP ring for transmission + */ +int ice_xmit_xdp_ring(void *data, u16 size, struct ice_ring *xdp_ring) +{ + u16 i = xdp_ring->next_to_use; + struct ice_tx_desc *tx_desc; + struct ice_tx_buf *tx_buf; + dma_addr_t dma; + + if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) { + xdp_ring->tx_stats.tx_busy++; + return ICE_XDP_CONSUMED; + } + + dma = dma_map_single(xdp_ring->dev, data, size, DMA_TO_DEVICE); + if (dma_mapping_error(xdp_ring->dev, dma)) + return ICE_XDP_CONSUMED; + + tx_buf = &xdp_ring->tx_buf[i]; + tx_buf->bytecount = size; + tx_buf->gso_segs = 1; + tx_buf->raw_buf = data; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buf, len, size); + dma_unmap_addr_set(tx_buf, dma, dma); + + tx_desc = ICE_TX_DESC(xdp_ring, i); + tx_desc->buf_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TXD_LAST_DESC_CMD, 0, + size, 0); + + /* Make certain all of the status bits have been updated + * before next_to_watch is written. 
+ */ + smp_wmb(); + +#ifdef HAVE_XDP_SUPPORT +#ifdef HAVE_AF_XDP_ZC_SUPPORT + xdp_ring->xdp_tx_active++; +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#endif /* HAVE_XDP_SUPPORT */ + i++; + if (i == xdp_ring->count) + i = 0; + + tx_buf->next_to_watch = tx_desc; + xdp_ring->next_to_use = i; + + return ICE_XDP_TX; +} + +/** + * ice_xmit_xdp_buff - convert an XDP buffer to an XDP frame and send it + * @xdp: XDP buffer + * @xdp_ring: XDP Tx ring + * + * Returns negative on failure, 0 on success. + */ +int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_ring *xdp_ring) +{ +#ifdef HAVE_XDP_FRAME_STRUCT + struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); + + if (unlikely(!xdpf)) + return ICE_XDP_CONSUMED; + + return ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring); +#else + return ice_xmit_xdp_ring(xdp->data, + xdp->data_end - xdp->data, + xdp_ring); +#endif /* HAVE_XDP_FRAME_STRUCT */ +} + +/** + * ice_finalize_xdp_rx - Bump XDP Tx tail and/or flush redirect map + * @rx_ring: Rx ring + * @xdp_res: Result of the receive batch + * + * This function bumps XDP Tx tail and/or flush redirect map, and + * should be called when a batch of packets has been processed in the + * napi loop. + */ +void ice_finalize_xdp_rx(struct ice_ring *rx_ring, unsigned int xdp_res) +{ + if (xdp_res & ICE_XDP_REDIR) + xdp_do_flush_map(); + + if (xdp_res & ICE_XDP_TX) { + struct ice_ring *xdp_ring = + rx_ring->vsi->xdp_rings[rx_ring->q_index]; + + ice_xdp_ring_update_tail(xdp_ring); + } +} +#endif /* HAVE_XDP_SUPPORT */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h new file mode 100644 index 0000000000000000000000000000000000000000..09a2e80525cdf880c5671f7ec1faa84c92e023cb --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_TXRX_LIB_H_ +#define _ICE_TXRX_LIB_H_ +#include "ice.h" + +/** + * ice_test_staterr - tests bits in Rx descriptor status and error fields + * @status_err_n: Rx descriptor status_error0 or status_error1 bits + * @stat_err_bits: value to mask + * + * This function does some fast chicanery in order to return the + * value of the mask which is really only used for boolean tests. + * The status_error_len doesn't need to be shifted because it begins + * at offset zero. + */ +static inline bool +ice_test_staterr(__le16 status_err_n, const u16 stat_err_bits) +{ + return !!(status_err_n & cpu_to_le16(stat_err_bits)); +} + +static inline __le64 +ice_build_ctob(u64 td_cmd, u64 td_offset, unsigned int size, u64 td_tag) +{ + return cpu_to_le64(ICE_TX_DESC_DTYPE_DATA | + (td_cmd << ICE_TXD_QW1_CMD_S) | + (td_offset << ICE_TXD_QW1_OFFSET_S) | + ((u64)size << ICE_TXD_QW1_TX_BUF_SZ_S) | + (td_tag << ICE_TXD_QW1_L2TAG1_S)); +} + +/** + * ice_get_vlan_from_rx_desc - get VLAN from Rx flex descriptor + * @rx_desc: Rx 32b flex descriptor with RXDID=2 + * + * The OS and current PF implementation only support stripping a single VLAN tag + * at a time, so there should only ever be 0 or 1 tags in the l2tag* fields. If + * one is found return the tag, else return 0 to mean no VLAN tag was found. 
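+ * (the L2TAG1P status bit marks a tag stripped into l2tag1, while
+ * L2TAG2P marks one stripped into the second l2tag2 field; at most
+ * one of them should be set for a given descriptor).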
+ */ +static inline u16 +ice_get_vlan_tag_from_rx_desc(union ice_32b_rx_flex_desc *rx_desc) +{ + u16 stat_err_bits; + + stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S); + if (ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits)) + return le16_to_cpu(rx_desc->wb.l2tag1); + + stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS1_L2TAG2P_S); + if (ice_test_staterr(rx_desc->wb.status_error1, stat_err_bits)) + return le16_to_cpu(rx_desc->wb.l2tag2_2nd); + + return 0; +} + +#ifdef HAVE_XDP_SUPPORT +/** + * ice_xdp_ring_update_tail - Updates the XDP Tx ring tail register + * @xdp_ring: XDP Tx ring + * + * This function updates the XDP Tx ring tail register. + */ +static inline void ice_xdp_ring_update_tail(struct ice_ring *xdp_ring) +{ + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. + */ + wmb(); + writel_relaxed(xdp_ring->next_to_use, xdp_ring->tail); +} + +void ice_finalize_xdp_rx(struct ice_ring *rx_ring, unsigned int xdp_res); +int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_ring *xdp_ring); +int ice_xmit_xdp_ring(void *data, u16 size, struct ice_ring *xdp_ring); +#endif /* HAVE_XDP_SUPPORT */ +void ice_release_rx_desc(struct ice_ring *rx_ring, u16 val); +void +ice_process_skb_fields(struct ice_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc, + struct sk_buff *skb, u16 ptype); +void +ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag); +#endif /* !_ICE_TXRX_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 6667d17a4206176940f4a87bcde646f4b56c1bf4..eeb2de1a152eb797b432f63c0f16dc56d089a54b 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -1,42 +1,105 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_TYPE_H_ #define _ICE_TYPE_H_ + + + + + #define ICE_BYTES_PER_WORD 2 #define ICE_BYTES_PER_DWORD 4 +#define ICE_MAX_TRAFFIC_CLASS 8 +#define ICE_CHNL_MAX_TC 16 + + + + #include "ice_status.h" #include "ice_hw_autogen.h" +#include "ice_devids.h" #include "ice_osdep.h" #include "ice_controlq.h" #include "ice_lan_tx_rx.h" #include "ice_flex_type.h" +#include "ice_protocol_type.h" +#include "ice_sbq_cmd.h" +#include "ice_vlan_mode.h" +#include "ice_fwlog.h" + + static inline bool ice_is_tc_ena(unsigned long bitmap, u8 tc) { return test_bit(tc, &bitmap); } + +static inline u64 round_up_64bit(u64 a, u32 b) +{ + return div64_long(((a) + (b) / 2), (b)); +} + +static inline u32 ice_round_to_num(u32 N, u32 R) +{ + return ((((N) % (R)) < ((R) / 2)) ? (((N) / (R)) * (R)) : + ((((N) + (R) - 1) / (R)) * (R))); +} + /* Driver always calls main vsi_handle first */ #define ICE_MAIN_VSI_HANDLE 0 +/* Switch from ms to the 1usec global time (this is the GTIME resolution) */ +#define ICE_MS_TO_GTIME(time) ((time) * 1000) + +/* Data type manipulation macros. 
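+ * e.g. ICE_HI_WORD(0x12345678) is 0x1234 and ICE_LO_WORD(0x12345678)
+ * is 0x5678.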
*/ +#define ICE_HI_WORD(x) ((u16)(((x) >> 16) & 0xFFFF)) +#define ICE_LO_WORD(x) ((u16)((x) & 0xFFFF)) + /* debug masks - set these bits in hw->debug_mask to control output */ #define ICE_DBG_INIT BIT_ULL(1) +#define ICE_DBG_RELEASE BIT_ULL(2) #define ICE_DBG_FW_LOG BIT_ULL(3) #define ICE_DBG_LINK BIT_ULL(4) #define ICE_DBG_PHY BIT_ULL(5) #define ICE_DBG_QCTX BIT_ULL(6) #define ICE_DBG_NVM BIT_ULL(7) #define ICE_DBG_LAN BIT_ULL(8) +#define ICE_DBG_FLOW BIT_ULL(9) +#define ICE_DBG_DCB BIT_ULL(10) +#define ICE_DBG_DIAG BIT_ULL(11) +#define ICE_DBG_FD BIT_ULL(12) #define ICE_DBG_SW BIT_ULL(13) #define ICE_DBG_SCHED BIT_ULL(14) + +#define ICE_DBG_RDMA BIT_ULL(15) #define ICE_DBG_PKG BIT_ULL(16) #define ICE_DBG_RES BIT_ULL(17) +#define ICE_DBG_ACL BIT_ULL(18) +#define ICE_DBG_PTP BIT_ULL(19) #define ICE_DBG_AQ_MSG BIT_ULL(24) +#define ICE_DBG_AQ_DESC BIT_ULL(25) +#define ICE_DBG_AQ_DESC_BUF BIT_ULL(26) #define ICE_DBG_AQ_CMD BIT_ULL(27) +#define ICE_DBG_AQ (ICE_DBG_AQ_MSG | \ + ICE_DBG_AQ_DESC | \ + ICE_DBG_AQ_DESC_BUF | \ + ICE_DBG_AQ_CMD) + #define ICE_DBG_USER BIT_ULL(31) +#define ICE_DBG_ALL 0xFFFFFFFFFFFFFFFFULL + +#ifndef __always_unused +#define __always_unused +#endif + + + + + enum ice_aq_res_ids { ICE_NVM_RES_ID = 1, @@ -72,6 +135,12 @@ enum ice_fc_mode { ICE_FC_DFLT }; +enum ice_phy_cache_mode { + ICE_FC_MODE = 0, + ICE_SPEED_MODE, + ICE_FEC_MODE +}; + enum ice_fec_mode { ICE_FEC_NONE = 0, ICE_FEC_RS, @@ -79,6 +148,14 @@ enum ice_fec_mode { ICE_FEC_AUTO }; +struct ice_phy_cache_mode_data { + union { + enum ice_fec_mode curr_user_fec_req; + enum ice_fc_mode curr_user_fc_req; + u16 curr_user_speed_req; + } data; +}; + enum ice_set_fc_aq_failures { ICE_SET_FC_AQ_FAIL_NONE = 0, ICE_SET_FC_AQ_FAIL_GET, @@ -86,12 +163,16 @@ enum ice_set_fc_aq_failures { ICE_SET_FC_AQ_FAIL_UPDATE }; -/* Various MAC types */ +/* These are structs for managing the hardware information and the operations */ +/* MAC types */ enum ice_mac_type { ICE_MAC_UNKNOWN = 0, + ICE_MAC_VF, + ICE_MAC_E810, ICE_MAC_GENERIC, }; + /* Media Types */ enum ice_media_type { ICE_MEDIA_UNKNOWN = 0, @@ -99,12 +180,19 @@ enum ice_media_type { ICE_MEDIA_BASET, ICE_MEDIA_BACKPLANE, ICE_MEDIA_DA, + ICE_MEDIA_AUI, }; +/* Software VSI types. */ enum ice_vsi_type { ICE_VSI_PF = 0, - ICE_VSI_VF, + ICE_VSI_VF = 1, + ICE_VSI_VMDQ2 = 2, + ICE_VSI_CTRL = 3, /* equates to ICE_VSI_PF with 1 queue pair */ + ICE_VSI_CHNL = 4, + ICE_VSI_OFFLOAD_MACVLAN = 5, ICE_VSI_LB = 6, + ICE_VSI_SWITCHDEV_CTRL = 7, }; struct ice_link_status { @@ -115,6 +203,7 @@ struct ice_link_status { u16 max_frame_size; u16 link_speed; u16 req_speeds; + u8 link_cfg_err; u8 lse_ena; /* Link Status Event notification */ u8 link_info; u8 an_info; @@ -127,6 +216,15 @@ struct ice_link_status { u8 module_type[ICE_MODULE_TYPE_TOTAL_BYTE]; }; +/* Different data queue types: These are mainly for SW consumption. 
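A quick usage sketch for the debug mask bits (illustrative; ice_debug() is the driver's existing trace macro and is assumed here, not introduced by this hunk). A message is emitted only when its category bit is set in hw->debug_mask, and ICE_DBG_AQ conveniently groups all four AdminQ categories:

	/* Illustrative only: trace AdminQ traffic plus NVM accesses */
	static void example_enable_tracing(struct ice_hw *hw)
	{
		u32 reg = 0x12345678;

		hw->debug_mask = ICE_DBG_AQ | ICE_DBG_NVM;

		/* assumed logging macro; ICE_HI_WORD/ICE_LO_WORD split a dword */
		ice_debug(hw, ICE_DBG_NVM, "hi 0x%04x lo 0x%04x\n",
			  ICE_HI_WORD(reg), ICE_LO_WORD(reg));
	}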
*/ +enum ice_q { + ICE_DATA_Q_DOORBELL, + ICE_DATA_Q_CMPL, + ICE_DATA_Q_QUANTA, + ICE_DATA_Q_RX, + ICE_DATA_Q_TX, +}; + /* Different reset sources for which a disable queue AQ call has to be made in * order to clean the Tx scheduler as a part of the reset */ @@ -144,36 +242,260 @@ struct ice_phy_info { u64 phy_type_high; enum ice_media_type media_type; u8 get_link_info; + /* Please refer to struct ice_aqc_get_link_status_data to get + * detail of enable bit in curr_user_speed_req + */ + u16 curr_user_speed_req; + enum ice_fec_mode curr_user_fec_req; + enum ice_fc_mode curr_user_fc_req; + struct ice_aqc_set_phy_cfg_data curr_user_phy_cfg; +}; + +#define ICE_MAX_NUM_MIRROR_RULES 64 + +/* protocol enumeration for filters */ +enum ice_fltr_ptype { + /* NONE - used for undef/error */ + ICE_FLTR_PTYPE_NONF_NONE = 0, + ICE_FLTR_PTYPE_NONF_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_SCTP, + ICE_FLTR_PTYPE_NONF_IPV4_OTHER, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV6_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_UDP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP_IPV4_TCP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_ICMP, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_OTHER, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_IPV6_OTHER, + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_OTHER, + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_IPV6_OTHER, + ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3, + ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3, + ICE_FLTR_PTYPE_NONF_IPV4_ESP, + ICE_FLTR_PTYPE_NONF_IPV6_ESP, + ICE_FLTR_PTYPE_NONF_IPV4_AH, + ICE_FLTR_PTYPE_NONF_IPV6_AH, + ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP, + ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP, + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE, + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION, + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE, + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION, + ICE_FLTR_PTYPE_NON_IP_L2, + ICE_FLTR_PTYPE_NONF_ECPRI_TP0, + ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0, + ICE_FLTR_PTYPE_FRAG_IPV4, + ICE_FLTR_PTYPE_FRAG_IPV6, + ICE_FLTR_PTYPE_NONF_IPV6_UDP, + ICE_FLTR_PTYPE_NONF_IPV6_TCP, + ICE_FLTR_PTYPE_NONF_IPV6_SCTP, + ICE_FLTR_PTYPE_NONF_IPV6_OTHER, + ICE_FLTR_PTYPE_NONF_IPV4_UDP_VXLAN, + ICE_FLTR_PTYPE_MAX, +}; + +enum ice_fd_hw_seg { + ICE_FD_HW_SEG_NON_TUN = 0, + ICE_FD_HW_SEG_TUN, + ICE_FD_HW_SEG_MAX, +}; + +/* 1 ICE_VSI_PF + 1 ICE_VSI_CTRL + ICE_CHNL_MAX_TC */ +#define ICE_MAX_FDIR_VSI_PER_FILTER (2 + ICE_CHNL_MAX_TC) + +struct ice_fd_hw_prof { + struct ice_flow_seg_info *fdir_seg[ICE_FD_HW_SEG_MAX]; + int cnt; + u64 entry_h[ICE_MAX_FDIR_VSI_PER_FILTER][ICE_FD_HW_SEG_MAX]; + u16 vsi_h[ICE_MAX_FDIR_VSI_PER_FILTER]; }; /* Common HW capabilities for SW use */ struct ice_hw_common_caps { + /* Write CSR protection */ + u64 wr_csr_prot; + u32 switching_mode; + /* switching mode supported - EVB switching (including cloud) */ +#define 
ICE_NVM_IMAGE_TYPE_EVB 0x0 + + /* Manageablity mode & supported protocols over MCTP */ + u32 mgmt_mode; +#define ICE_MGMT_MODE_PASS_THRU_MODE_M 0xF +#define ICE_MGMT_MODE_CTL_INTERFACE_M 0xF0 +#define ICE_MGMT_MODE_REDIR_SB_INTERFACE_M 0xF00 + + u32 mgmt_protocols_mctp; +#define ICE_MGMT_MODE_PROTO_RSVD BIT(0) +#define ICE_MGMT_MODE_PROTO_PLDM BIT(1) +#define ICE_MGMT_MODE_PROTO_OEM BIT(2) +#define ICE_MGMT_MODE_PROTO_NC_SI BIT(3) + + u32 os2bmc; u32 valid_functions; /* DCB capabilities */ u32 active_tc_bitmap; u32 maxtc; + /* RSS related capabilities */ + u32 rss_table_size; /* 512 for PFs and 64 for VFs */ + u32 rss_table_entry_width; /* RSS Entry width in bits */ + /* Tx/Rx queues */ - u16 num_rxq; /* Number/Total Rx queues */ - u16 rxq_first_id; /* First queue ID for Rx queues */ - u16 num_txq; /* Number/Total Tx queues */ - u16 txq_first_id; /* First queue ID for Tx queues */ + u32 num_rxq; /* Number/Total Rx queues */ + u32 rxq_first_id; /* First queue ID for Rx queues */ + u32 num_txq; /* Number/Total Tx queues */ + u32 txq_first_id; /* First queue ID for Tx queues */ /* MSI-X vectors */ - u16 num_msix_vectors; - u16 msix_vector_first_id; + u32 num_msix_vectors; + u32 msix_vector_first_id; /* Max MTU for function or device */ - u16 max_mtu; + u32 max_mtu; - /* Virtualization support */ + /* WOL related */ + u32 num_wol_proxy_fltr; + u32 wol_proxy_vsi_seid; + + /* LED/SDP pin count */ + u32 led_pin_num; + u32 sdp_pin_num; + + /* LED/SDP - Supports up to 12 LED pins and 8 SDP signals */ +#define ICE_MAX_SUPPORTED_GPIO_LED 12 +#define ICE_MAX_SUPPORTED_GPIO_SDP 8 + u8 led[ICE_MAX_SUPPORTED_GPIO_LED]; + u8 sdp[ICE_MAX_SUPPORTED_GPIO_SDP]; + + /* SR-IOV virtualization */ u8 sr_iov_1_1; /* SR-IOV enabled */ - /* RSS related capabilities */ - u16 rss_table_size; /* 512 for PFs and 64 for VFs */ - u8 rss_table_entry_width; /* RSS Entry width in bits */ + /* VMDQ */ + u8 vmdq; /* VMDQ supported */ + + /* EVB capabilities */ + u8 evb_802_1_qbg; /* Edge Virtual Bridging */ + u8 evb_802_1_qbh; /* Bridge Port Extension */ u8 dcb; + u8 iscsi; + u8 ieee_1588; + u8 mgmt_cem; + u8 iwarp; + + /* WoL and APM support */ +#define ICE_WOL_SUPPORT_M BIT(0) +#define ICE_ACPI_PROG_MTHD_M BIT(1) +#define ICE_PROXY_SUPPORT_M BIT(2) + u8 apm_wol_support; + u8 acpi_prog_mthd; + u8 proxy_support; + bool nvm_update_pending_nvm; + bool nvm_update_pending_orom; + bool nvm_update_pending_netlist; +#define ICE_NVM_PENDING_NVM_IMAGE BIT(0) +#define ICE_NVM_PENDING_OROM BIT(1) +#define ICE_NVM_PENDING_NETLIST BIT(2) + bool sec_rev_disabled; + bool update_disabled; + bool nvm_unified_update; +#define ICE_NVM_MGMT_SEC_REV_DISABLED BIT(0) +#define ICE_NVM_MGMT_UPDATE_DISABLED BIT(1) +#define ICE_NVM_MGMT_UNIFIED_UPD_SUPPORT BIT(3) + + /* External topology device images within the NVM */ +#define ICE_EXT_TOPO_DEV_IMG_COUNT 4 + u32 ext_topo_dev_img_ver_high[ICE_EXT_TOPO_DEV_IMG_COUNT]; + u32 ext_topo_dev_img_ver_low[ICE_EXT_TOPO_DEV_IMG_COUNT]; + u8 ext_topo_dev_img_part_num[ICE_EXT_TOPO_DEV_IMG_COUNT]; +#define ICE_EXT_TOPO_DEV_IMG_PART_NUM_S 8 +#define ICE_EXT_TOPO_DEV_IMG_PART_NUM_M \ + ICE_M(0xFF, ICE_EXT_TOPO_DEV_IMG_PART_NUM_S) + bool ext_topo_dev_img_load_en[ICE_EXT_TOPO_DEV_IMG_COUNT]; +#define ICE_EXT_TOPO_DEV_IMG_LOAD_EN BIT(0) + bool ext_topo_dev_img_prog_en[ICE_EXT_TOPO_DEV_IMG_COUNT]; +#define ICE_EXT_TOPO_DEV_IMG_PROG_EN BIT(1) +}; + +/* IEEE 1588 TIME_SYNC specific info */ +/* Function specific definitions */ +#define ICE_TS_FUNC_ENA_M BIT(0) +#define ICE_TS_SRC_TMR_OWND_M BIT(1) +#define ICE_TS_TMR_ENA_M 
BIT(2) +#define ICE_TS_TMR_IDX_OWND_S 4 +#define ICE_TS_TMR_IDX_OWND_M BIT(4) +#define ICE_TS_CLK_FREQ_S 16 +#define ICE_TS_CLK_FREQ_M ICE_M(0x7, ICE_TS_CLK_FREQ_S) +#define ICE_TS_CLK_SRC_S 20 +#define ICE_TS_CLK_SRC_M BIT(20) +#define ICE_TS_TMR_IDX_ASSOC_S 24 +#define ICE_TS_TMR_IDX_ASSOC_M BIT(24) + +/* TIME_REF clock rate specification */ +enum ice_time_ref_freq { + ICE_TIME_REF_FREQ_25_000 = 0, + ICE_TIME_REF_FREQ_122_880 = 1, + ICE_TIME_REF_FREQ_125_000 = 2, + ICE_TIME_REF_FREQ_153_600 = 3, + ICE_TIME_REF_FREQ_156_250 = 4, + ICE_TIME_REF_FREQ_245_760 = 5, + + NUM_ICE_TIME_REF_FREQ +}; + +struct ice_ts_func_info { + /* Function specific info */ + enum ice_time_ref_freq time_ref; + u8 clk_freq; + u8 clk_src; + u8 tmr_index_assoc; + u8 ena; + u8 tmr_index_owned; + u8 src_tmr_owned; + u8 tmr_ena; +}; + +/* Device specific definitions */ +#define ICE_TS_TMR0_OWNR_M 0x7 +#define ICE_TS_TMR0_OWND_M BIT(3) +#define ICE_TS_TMR1_OWNR_S 4 +#define ICE_TS_TMR1_OWNR_M ICE_M(0x7, ICE_TS_TMR1_OWNR_S) +#define ICE_TS_TMR1_OWND_M BIT(7) +#define ICE_TS_DEV_ENA_M BIT(24) +#define ICE_TS_TMR0_ENA_M BIT(25) +#define ICE_TS_TMR1_ENA_M BIT(26) + +struct ice_ts_dev_info { + /* Device specific info */ + u32 ena_ports; + u32 tmr_own_map; + u32 tmr0_owner; + u32 tmr1_owner; + u8 tmr0_owned; + u8 tmr1_owned; + u8 ena; + u8 tmr0_ena; + u8 tmr1_ena; }; /* Function specific capabilities */ @@ -182,6 +504,9 @@ struct ice_hw_func_caps { u32 num_allocd_vfs; /* Number of allocated VFs */ u32 vf_base_id; /* Logical ID of the first VF */ u32 guar_num_vsi; + u32 fd_fltr_guar; /* Number of filters guaranteed */ + u32 fd_fltr_best_effort; /* Number of best effort filters */ + struct ice_ts_func_info ts_func_info; }; /* Device wide capabilities */ @@ -189,12 +514,48 @@ struct ice_hw_dev_caps { struct ice_hw_common_caps common_cap; u32 num_vfs_exposed; /* Total number of VFs exposed */ u32 num_vsi_allocd_to_host; /* Excluding EMP VSI */ + u32 num_flow_director_fltr; /* Number of FD filters available */ + struct ice_ts_dev_info ts_dev_info; + u32 num_funcs; }; -/* MAC info */ + +/* Information about MAC such as address, etc... */ struct ice_mac_info { u8 lan_addr[ETH_ALEN]; u8 perm_addr[ETH_ALEN]; + u8 port_addr[ETH_ALEN]; + u8 wol_addr[ETH_ALEN]; +}; + +/* PCI bus types */ +enum ice_bus_type { + ice_bus_unknown = 0, + ice_bus_pci_express, + ice_bus_embedded, /* Is device Embedded versus card */ + ice_bus_reserved +}; + +/* PCI bus speeds */ +enum ice_pcie_bus_speed { + ice_pcie_speed_unknown = 0xff, + ice_pcie_speed_2_5GT = 0x14, + ice_pcie_speed_5_0GT = 0x15, + ice_pcie_speed_8_0GT = 0x16, + ice_pcie_speed_16_0GT = 0x17 +}; + +/* PCI bus widths */ +enum ice_pcie_link_width { + ice_pcie_lnk_width_resrv = 0x00, + ice_pcie_lnk_x1 = 0x01, + ice_pcie_lnk_x2 = 0x02, + ice_pcie_lnk_x4 = 0x04, + ice_pcie_lnk_x8 = 0x08, + ice_pcie_lnk_x12 = 0x0C, + ice_pcie_lnk_x16 = 0x10, + ice_pcie_lnk_x32 = 0x20, + ice_pcie_lnk_width_unknown = 0xff, }; /* Reset types used to determine which kind of reset was requested. 
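As an illustration of how the new PCIe bus enums might be consumed for a probe-time log line (a sketch, not part of this patch; only the speeds enumerated above are decoded):

	/* Illustrative only: stringify the negotiated PCIe speed */
	static const char *example_bus_speed_str(enum ice_pcie_bus_speed speed)
	{
		switch (speed) {
		case ice_pcie_speed_2_5GT:
			return "2.5GT/s";
		case ice_pcie_speed_5_0GT:
			return "5.0GT/s";
		case ice_pcie_speed_8_0GT:
			return "8.0GT/s";
		case ice_pcie_speed_16_0GT:
			return "16.0GT/s";
		default:
			return "unknown";
		}
	}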
These @@ -213,8 +574,13 @@ enum ice_reset_req { /* Bus parameters */ struct ice_bus_info { + enum ice_pcie_bus_speed speed; + enum ice_pcie_link_width width; + enum ice_bus_type type; + u16 domain_num; u16 device; u8 func; + u8 bus_num; }; /* Flow control (FC) parameters */ @@ -223,25 +589,117 @@ struct ice_fc_info { enum ice_fc_mode req_mode; /* FC mode requested by caller */ }; -/* NVM Information */ +/* Option ROM version information */ +struct ice_orom_info { + u8 major; /* Major version of OROM */ + u8 patch; /* Patch version of OROM */ + u16 build; /* Build version of OROM */ + u32 srev; /* Security revision */ +}; + +/* NVM version information */ struct ice_nvm_info { - u32 eetrack; /* NVM data version */ - u32 oem_ver; /* OEM version info */ - u16 sr_words; /* Shadow RAM size in words */ - u16 ver; /* NVM package version */ - u8 blank_nvm_mode; /* is NVM empty (no FW present) */ + u32 eetrack; + u32 srev; + u8 major; + u8 minor; +}; + +/* Minimum Security Revision information */ +struct ice_minsrev_info { + u32 nvm; + u32 orom; + u8 nvm_valid : 1; + u8 orom_valid : 1; +}; + +/* netlist version information */ +struct ice_netlist_info { + u32 major; /* major high/low */ + u32 minor; /* minor high/low */ + u32 type; /* type high/low */ + u32 rev; /* revision high/low */ + u32 hash; /* SHA-1 hash word */ + u16 cust_ver; /* customer version */ +}; + +/* Enumeration of possible flash banks for the NVM, OROM, and Netlist modules + * of the flash image. + */ +enum ice_flash_bank { + ICE_INVALID_FLASH_BANK, + ICE_1ST_FLASH_BANK, + ICE_2ND_FLASH_BANK, +}; + +/* Enumeration of which flash bank is desired to read from, either the active + * bank or the inactive bank. Used to abstract 1st and 2nd bank notion from + * code which just wants to read the active or inactive flash bank. 
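To make the active/inactive bank abstraction concrete: the first bank lives at the module pointer and the second immediately after it, so resolving ICE_INACTIVE_FLASH_BANK means picking whichever of the two is not currently active. A hedged sketch of that offset selection, using the ice_bank_select enum and ice_bank_info struct declared just below (it mirrors, but is not copied from, the driver's flash access code, and assumes nvm_ptr/nvm_size are byte units):

	/* Illustrative only: byte offset of the requested NVM bank */
	static u32 example_nvm_bank_offset(const struct ice_bank_info *banks,
					   enum ice_bank_select bank)
	{
		bool first_is_active = banks->nvm_bank == ICE_1ST_FLASH_BANK;
		bool use_first;

		use_first = (bank == ICE_ACTIVE_FLASH_BANK) ?
			    first_is_active : !first_is_active;

		return use_first ? banks->nvm_ptr :
				   banks->nvm_ptr + banks->nvm_size;
	}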
+ */ +enum ice_bank_select { + ICE_ACTIVE_FLASH_BANK, + ICE_INACTIVE_FLASH_BANK, +}; + +/* information for accessing NVM, OROM, and Netlist flash banks */ +struct ice_bank_info { + u32 nvm_ptr; /* Pointer to 1st NVM bank */ + u32 nvm_size; /* Size of NVM bank */ + u32 orom_ptr; /* Pointer to 1st OROM bank */ + u32 orom_size; /* Size of OROM bank */ + u32 netlist_ptr; /* Pointer to 1st Netlist bank */ + u32 netlist_size; /* Size of Netlist bank */ + enum ice_flash_bank nvm_bank; /* Active NVM bank */ + enum ice_flash_bank orom_bank; /* Active OROM bank */ + enum ice_flash_bank netlist_bank; /* Active Netlist bank */ +}; + +/* Flash Chip Information */ +struct ice_flash_info { + struct ice_orom_info orom; /* Option ROM version info */ + struct ice_nvm_info nvm; /* NVM version information */ + struct ice_netlist_info netlist;/* Netlist version info */ + struct ice_bank_info banks; /* Flash Bank information */ + u16 sr_words; /* Shadow RAM size in words */ + u32 flash_size; /* Size of available flash in bytes */ + u8 blank_nvm_mode; /* is NVM empty (no FW present) */ +}; + +struct ice_link_default_override_tlv { + u8 options; +#define ICE_LINK_OVERRIDE_OPT_M 0x3F +#define ICE_LINK_OVERRIDE_STRICT_MODE BIT(0) +#define ICE_LINK_OVERRIDE_EPCT_DIS BIT(1) +#define ICE_LINK_OVERRIDE_PORT_DIS BIT(2) +#define ICE_LINK_OVERRIDE_EN BIT(3) +#define ICE_LINK_OVERRIDE_AUTO_LINK_DIS BIT(4) +#define ICE_LINK_OVERRIDE_EEE_EN BIT(5) + u8 phy_config; +#define ICE_LINK_OVERRIDE_PHY_CFG_S 8 +#define ICE_LINK_OVERRIDE_PHY_CFG_M (0xC3 << ICE_LINK_OVERRIDE_PHY_CFG_S) +#define ICE_LINK_OVERRIDE_PAUSE_M 0x3 +#define ICE_LINK_OVERRIDE_LESM_EN BIT(6) +#define ICE_LINK_OVERRIDE_AUTO_FEC_EN BIT(7) + u8 fec_options; +#define ICE_LINK_OVERRIDE_FEC_OPT_M 0xFF + u8 rsvd1; + u64 phy_type_low; + u64 phy_type_high; }; #define ICE_NVM_VER_LEN 32 /* Max number of port to queue branches w.r.t topology */ -#define ICE_MAX_TRAFFIC_CLASS 8 #define ICE_TXSCHED_MAX_BRANCHES ICE_MAX_TRAFFIC_CLASS #define ice_for_each_traffic_class(_i) \ for ((_i) = 0; (_i) < ICE_MAX_TRAFFIC_CLASS; (_i)++) +/* ICE_DFLT_AGG_ID means that all new VM(s)/VSI node connects + * to driver defined policy for default aggregator + */ #define ICE_INVAL_TEID 0xFFFFFFFF +#define ICE_DFLT_AGG_ID 0 struct ice_sched_node { struct ice_sched_node *parent; @@ -256,38 +714,136 @@ struct ice_sched_node { u8 tc_num; u8 owner; #define ICE_SCHED_NODE_OWNER_LAN 0 +#define ICE_SCHED_NODE_OWNER_AE 1 +#define ICE_SCHED_NODE_OWNER_RDMA 2 }; /* Access Macros for Tx Sched Elements data */ #define ICE_TXSCHED_GET_NODE_TEID(x) le32_to_cpu((x)->info.node_teid) +#define ICE_TXSCHED_GET_PARENT_TEID(x) le32_to_cpu((x)->info.parent_teid) +#define ICE_TXSCHED_GET_CIR_RL_ID(x) \ + le16_to_cpu((x)->info.cir_bw.bw_profile_idx) +#define ICE_TXSCHED_GET_EIR_RL_ID(x) \ + le16_to_cpu((x)->info.eir_bw.bw_profile_idx) +#define ICE_TXSCHED_GET_SRL_ID(x) le16_to_cpu((x)->info.srl_id) +#define ICE_TXSCHED_GET_CIR_BWALLOC(x) \ + le16_to_cpu((x)->info.cir_bw.bw_alloc) +#define ICE_TXSCHED_GET_EIR_BWALLOC(x) \ + le16_to_cpu((x)->info.eir_bw.bw_alloc) + +struct ice_sched_rl_profile { + u32 rate; /* In Kbps */ + struct ice_aqc_rl_profile_elem info; +}; /* The aggregator type determines if identifier is for a VSI group, * aggregator group, aggregator of queues, or queue group. 
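A trivial consumer sketch for the per-TC iterator and TEID accessor defined above (illustrative only; the node array stands in for the per-TC scheduler nodes tracked elsewhere in this file):

	/* Illustrative only: walk every TC and report each node's TEID */
	static void example_dump_tc_teids(struct ice_sched_node **tc_nodes)
	{
		u8 tc;

		ice_for_each_traffic_class(tc) {
			if (!tc_nodes[tc])
				continue;
			/* node_teid is stored little endian in the AQ element */
			pr_info("TC %u: TEID %u\n", tc,
				ICE_TXSCHED_GET_NODE_TEID(tc_nodes[tc]));
		}
	}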
*/ enum ice_agg_type { ICE_AGG_TYPE_UNKNOWN = 0, - ICE_AGG_TYPE_VSI, + ICE_AGG_TYPE_TC, ICE_AGG_TYPE_AGG, /* aggregator */ - ICE_AGG_TYPE_Q, - ICE_AGG_TYPE_QG + ICE_AGG_TYPE_VSI, + ICE_AGG_TYPE_QG, + ICE_AGG_TYPE_Q +}; + +/* Rate limit types */ +enum ice_rl_type { + ICE_UNKNOWN_BW = 0, + ICE_MIN_BW, /* for CIR profile */ + ICE_MAX_BW, /* for EIR profile */ + ICE_SHARED_BW /* for shared profile */ }; +#define ICE_SCHED_MIN_BW 500 /* in Kbps */ +#define ICE_SCHED_MAX_BW 100000000 /* in Kbps */ +#define ICE_SCHED_DFLT_BW 0xFFFFFFFF /* unlimited */ +#define ICE_SCHED_NO_PRIORITY 0 +#define ICE_SCHED_NO_BW_WT 0 #define ICE_SCHED_DFLT_RL_PROF_ID 0 -#define ICE_SCHED_DFLT_BW_WT 1 +#define ICE_SCHED_NO_SHARED_RL_PROF_ID 0xFFFF +#define ICE_SCHED_DFLT_BW_WT 4 +#define ICE_SCHED_INVAL_PROF_ID 0xFFFF +#define ICE_SCHED_DFLT_BURST_SIZE (15 * 1024) /* in bytes (15k) */ + +/* Access Macros for Tx Sched RL Profile data */ +#define ICE_TXSCHED_GET_RL_PROF_ID(p) le16_to_cpu((p)->info.profile_id) +#define ICE_TXSCHED_GET_RL_MBS(p) le16_to_cpu((p)->info.max_burst_size) +#define ICE_TXSCHED_GET_RL_MULTIPLIER(p) le16_to_cpu((p)->info.rl_multiply) +#define ICE_TXSCHED_GET_RL_WAKEUP_MV(p) le16_to_cpu((p)->info.wake_up_calc) +#define ICE_TXSCHED_GET_RL_ENCODE(p) le16_to_cpu((p)->info.rl_encode) + +#define ICE_MAX_PORT_PER_PCI_DEV 8 + +/* The following tree example shows the naming conventions followed under + * ice_port_info struct for default scheduler tree topology. + * + * A tree on a port + * * ---> root node + * (TC0)/ / / / \ \ \ \(TC7) ---> num_branches (range:1- 8) + * * * * * * * * * | + * / | + * * | + * / |-> num_elements (range:1 - 9) + * * | implies num_of_layers + * / | + * (a)* | + * + * (a) is the last_node_teid(not of type Leaf). A leaf node is created under + * (a) as child node where queues get added, add Tx/Rx queue admin commands; + * need TEID of (a) to add queues. + * + * This tree + * -> has 8 branches (one for each TC) + * -> First branch (TC0) has 4 elements + * -> has 4 layers + * -> (a) is the topmost layer node created by firmware on branch 0 + * + * Note: Above asterisk tree covers only basic terminology and scenario. + * Refer to the documentation for more info. 
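The rate-limit bounds above imply a validation step before any RL profile is built; a hedged sketch (the real checks live in the scheduler code and are more involved):

	/* Illustrative only: sanity-check a requested rate in Kbps.
	 * ICE_SCHED_DFLT_BW is a sentinel meaning 'unlimited', so it
	 * bypasses the range check.
	 */
	static int example_validate_bw(u32 bw_kbps)
	{
		if (bw_kbps == ICE_SCHED_DFLT_BW)
			return 0;

		if (bw_kbps < ICE_SCHED_MIN_BW || bw_kbps > ICE_SCHED_MAX_BW)
			return -EINVAL;

		return 0;
	}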
+ */ -/* VSI type list entry to locate corresponding VSI/ag nodes */ + /* Data structure for saving BW information */ +enum ice_bw_type { + ICE_BW_TYPE_PRIO, + ICE_BW_TYPE_CIR, + ICE_BW_TYPE_CIR_WT, + ICE_BW_TYPE_EIR, + ICE_BW_TYPE_EIR_WT, + ICE_BW_TYPE_SHARED, + ICE_BW_TYPE_CNT /* This must be last */ +}; + +struct ice_bw { + u32 bw; + u16 bw_alloc; +}; + +struct ice_bw_type_info { + DECLARE_BITMAP(bw_t_bitmap, ICE_BW_TYPE_CNT); + u8 generic; + struct ice_bw cir_bw; + struct ice_bw eir_bw; + u32 shared_bw; +}; + +/* VSI queue context structure for given TC */ +struct ice_q_ctx { + u16 q_handle; + u32 q_teid; + /* bw_t_info saves queue BW information */ + struct ice_bw_type_info bw_t_info; +}; + +/* VSI type list entry to locate corresponding VSI/aggregator nodes */ struct ice_sched_vsi_info { struct ice_sched_node *vsi_node[ICE_MAX_TRAFFIC_CLASS]; struct ice_sched_node *ag_node[ICE_MAX_TRAFFIC_CLASS]; - struct list_head list_entry; u16 max_lanq[ICE_MAX_TRAFFIC_CLASS]; -}; - -/* driver defines the policy */ -struct ice_sched_tx_policy { - u16 max_num_vsis; - u8 max_num_lan_qs_per_tc[ICE_MAX_TRAFFIC_CLASS]; - u8 rdma_ena; + u16 max_rdmaq[ICE_MAX_TRAFFIC_CLASS]; + /* bw_t_info saves VSI BW information */ + struct ice_bw_type_info bw_t_info[ICE_MAX_TRAFFIC_CLASS]; }; /* CEE or IEEE 802.1Qaz ETS Configuration data */ @@ -315,19 +871,18 @@ struct ice_dcb_app_priority_table { u8 selector; }; -#define ICE_MAX_USER_PRIORITY 8 -#define ICE_DCBX_MAX_APPS 32 -#define ICE_LLDPDU_SIZE 1500 -#define ICE_TLV_STATUS_OPER 0x1 -#define ICE_TLV_STATUS_SYNC 0x2 -#define ICE_TLV_STATUS_ERR 0x4 -#define ICE_APP_PROT_ID_FCOE 0x8906 -#define ICE_APP_PROT_ID_ISCSI 0x0cbc -#define ICE_APP_PROT_ID_FIP 0x8914 -#define ICE_APP_SEL_ETHTYPE 0x1 -#define ICE_APP_SEL_TCPIP 0x2 -#define ICE_CEE_APP_SEL_ETHTYPE 0x0 -#define ICE_CEE_APP_SEL_TCPIP 0x1 +#define ICE_MAX_USER_PRIORITY 8 +#define ICE_DCBX_MAX_APPS 64 +#define ICE_DSCP_NUM_VAL 64 +#define ICE_LLDPDU_SIZE 1500 +#define ICE_TLV_STATUS_OPER 0x1 +#define ICE_TLV_STATUS_SYNC 0x2 +#define ICE_TLV_STATUS_ERR 0x4 +#define ICE_APP_PROT_ID_ISCSI_860 0x035c +#define ICE_APP_SEL_ETHTYPE 0x1 +#define ICE_APP_SEL_TCPIP 0x2 +#define ICE_CEE_APP_SEL_ETHTYPE 0x0 +#define ICE_CEE_APP_SEL_TCPIP 0x1 struct ice_dcbx_cfg { u32 numapps; @@ -335,7 +890,14 @@ struct ice_dcbx_cfg { struct ice_dcb_ets_cfg etscfg; struct ice_dcb_ets_cfg etsrec; struct ice_dcb_pfc_cfg pfc; +#define ICE_QOS_MODE_VLAN 0x0 +#define ICE_QOS_MODE_DSCP 0x1 + u8 pfc_mode; struct ice_dcb_app_priority_table app[ICE_DCBX_MAX_APPS]; + /* when DSCP mapping defined by user set its bit to 1 */ + DECLARE_BITMAP(dscp_mapped, ICE_DSCP_NUM_VAL); + /* array holding DSCP -> UP/TC values for DSCP L3 QoS mode */ + u8 dscp_map[ICE_DSCP_NUM_VAL]; u8 dcbx_mode; #define ICE_DCBX_MODE_CEE 0x1 #define ICE_DCBX_MODE_IEEE 0x2 @@ -343,6 +905,14 @@ struct ice_dcbx_cfg { #define ICE_DCBX_APPS_NON_WILLING 0x1 }; +struct ice_qos_cfg { + struct ice_dcbx_cfg local_dcbx_cfg; /* Oper/Local Cfg */ + struct ice_dcbx_cfg desired_dcbx_cfg; /* CEE Desired Cfg */ + struct ice_dcbx_cfg remote_dcbx_cfg; /* Peer Cfg */ + u8 dcbx_status : 3; /* see ICE_DCBX_STATUS_DIS */ + u8 is_sw_lldp : 1; +}; + struct ice_port_info { struct ice_sched_node *root; /* Root Node per Port */ struct ice_hw *hw; /* back pointer to HW instance */ @@ -364,37 +934,94 @@ struct ice_port_info { struct mutex sched_lock; /* protect access to TXSched tree */ struct ice_sched_node * sib_head[ICE_MAX_TRAFFIC_CLASS][ICE_AQC_TOPO_MAX_LEVEL_NUM]; - struct ice_dcbx_cfg local_dcbx_cfg; /* 
Oper/Local Cfg */ - /* DCBX info */ - struct ice_dcbx_cfg remote_dcbx_cfg; /* Peer Cfg */ - struct ice_dcbx_cfg desired_dcbx_cfg; /* CEE Desired Cfg */ - /* LLDP/DCBX Status */ - u8 dcbx_status:3; /* see ICE_DCBX_STATUS_DIS */ - u8 is_sw_lldp:1; + struct ice_bw_type_info root_node_bw_t_info; + struct ice_bw_type_info tc_node_bw_t_info[ICE_MAX_TRAFFIC_CLASS]; + struct ice_qos_cfg qos_cfg; u8 is_vf:1; }; struct ice_switch_info { struct list_head vsi_list_map_head; struct ice_sw_recipe *recp_list; + u16 prof_res_bm_init; + u16 max_used_prof_index; + + DECLARE_BITMAP(prof_res_bm[ICE_MAX_NUM_PROFILES], ICE_MAX_FV_WORDS); +}; + + +/* Enum defining the different states of the mailbox snapshot in the + * PF-VF mailbox overflow detection algorithm. The snapshot can be in + * states: + * 1. ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT - generate a new static snapshot + * within the mailbox buffer. + * 2. ICE_MAL_VF_DETECT_STATE_TRAVERSE - iterate through the mailbox snaphot + * 3. ICE_MAL_VF_DETECT_STATE_DETECT - track the messages sent per VF via the + * mailbox and mark any VFs sending more messages than the threshold limit set. + * 4. ICE_MAL_VF_DETECT_STATE_INVALID - Invalid mailbox state set to 0xFFFFFFFF. + */ +enum ice_mbx_snapshot_state { + ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT = 0, + ICE_MAL_VF_DETECT_STATE_TRAVERSE, + ICE_MAL_VF_DETECT_STATE_DETECT, + ICE_MAL_VF_DETECT_STATE_INVALID = 0xFFFFFFFF, }; -/* FW logging configuration */ -struct ice_fw_log_evnt { - u8 cfg : 4; /* New event enables to configure */ - u8 cur : 4; /* Current/active event enables */ +/* Structure to hold information of the static snapshot and the mailbox + * buffer data used to generate and track the snapshot. + * 1. state: the state of the mailbox snapshot in the malicious VF + * detection state handler ice_mbx_vf_state_handler() + * 2. head : head of the mailbox snapshot in a circular mailbox buffer + * 3. tail : tail of the mailbox snapshot in a circular mailbox buffer + * 4. num_iterations: number of messages traversed in circular mailbox buffer + * 5. num_msg_proc: number of messages processed in mailbox + * 6. num_pending_arq: number of pending asynchronous messages + * 7. max_num_msgs_mbx: maximum messages in mailbox for currently + * serviced work item or interrupt. + */ +struct ice_mbx_snap_buffer_data { + enum ice_mbx_snapshot_state state; + u32 head; + u32 tail; + u32 num_iterations; + u16 num_msg_proc; + u16 num_pending_arq; + u16 max_num_msgs_mbx; +}; + +/* Structure to track messages sent by VFs on mailbox: + * 1. vf_cntr : a counter array of VFs to track the number of + * asynchronous messages sent by each VF + * 2. vfcntr_len : number of entries in VF counter array + */ +struct ice_mbx_vf_counter { + u32 *vf_cntr; + u32 vfcntr_len; }; -struct ice_fw_log_cfg { - u8 cq_en : 1; /* FW logging is enabled via the control queue */ - u8 uart_en : 1; /* FW logging is enabled via UART for all PFs */ - u8 actv_evnts; /* Cumulation of currently enabled log events */ +/* Structure to hold data relevant to the captured static snapshot + * of the PF-VF mailbox. 
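The state enum above drives ice_mbx_vf_state_handler() (named in the comment). A heavily abridged, hypothetical skeleton of that flow, operating on the snapshot buffer declared nearby (the real traverse/detect logic lives elsewhere):

	/* Hypothetical skeleton only; not the driver's actual handler */
	static void example_mbx_state_step(struct ice_mbx_snap_buffer_data *snap)
	{
		switch (snap->state) {
		case ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT:
			/* latch head/tail of the mailbox ring, then walk it */
			snap->num_iterations = 0;
			snap->state = ICE_MAL_VF_DETECT_STATE_TRAVERSE;
			break;
		case ICE_MAL_VF_DETECT_STATE_TRAVERSE:
			snap->num_iterations++;
			if (snap->num_iterations >= snap->max_num_msgs_mbx)
				snap->state = ICE_MAL_VF_DETECT_STATE_DETECT;
			break;
		case ICE_MAL_VF_DETECT_STATE_DETECT:
			/* compare per-VF counters against the watermark here */
			snap->state = ICE_MAL_VF_DETECT_STATE_NEW_SNAPSHOT;
			break;
		default:
			snap->state = ICE_MAL_VF_DETECT_STATE_INVALID;
			break;
		}
	}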
+ */ +struct ice_mbx_snapshot { + struct ice_mbx_snap_buffer_data mbx_buf; + struct ice_mbx_vf_counter mbx_vf; +}; -#define ICE_FW_LOG_EVNT_INFO (ICE_AQC_FW_LOG_INFO_EN >> ICE_AQC_FW_LOG_EN_S) -#define ICE_FW_LOG_EVNT_INIT (ICE_AQC_FW_LOG_INIT_EN >> ICE_AQC_FW_LOG_EN_S) -#define ICE_FW_LOG_EVNT_FLOW (ICE_AQC_FW_LOG_FLOW_EN >> ICE_AQC_FW_LOG_EN_S) -#define ICE_FW_LOG_EVNT_ERR (ICE_AQC_FW_LOG_ERR_EN >> ICE_AQC_FW_LOG_EN_S) - struct ice_fw_log_evnt evnts[ICE_AQC_FW_LOG_ID_MAX]; +/* Structure to hold data to be used for capturing or updating a + * static snapshot. + * 1. num_msg_proc: number of messages processed in mailbox + * 2. num_pending_arq: number of pending asynchronous messages + * 3. max_num_msgs_mbx: maximum messages in mailbox for currently + * serviced work item or interrupt. + * 4. async_watermark_val: An upper threshold set by caller to determine + * if the pending arq count is large enough to assume that there is + * the possibility of a mailicious VF. + */ +struct ice_mbx_data { + u16 num_msg_proc; + u16 num_pending_arq; + u16 max_num_msgs_mbx; + u16 async_watermark_val; }; /* Port hardware description */ @@ -403,9 +1030,16 @@ struct ice_hw { void *back; struct ice_aqc_layer_props *layer_info; struct ice_port_info *port_info; - u64 debug_mask; /* bitmap for debug mask */ + /* 2D Array for each Tx Sched RL Profile type */ + struct ice_sched_rl_profile **cir_profiles; + struct ice_sched_rl_profile **eir_profiles; + struct ice_sched_rl_profile **srl_profiles; + /* PSM clock frequency for calculating RL profile params */ + u32 psm_clk_freq; + u64 debug_mask; /* BITMAP for debug mask */ enum ice_mac_type mac_type; + u16 fd_ctr_base; /* FD counter base index */ /* pci info */ u16 device_id; u16 vendor_id; @@ -415,20 +1049,23 @@ struct ice_hw { u8 pf_id; /* device profile info */ + u16 max_burst_size; /* driver sets this value */ + /* Tx Scheduler values */ - u16 num_tx_sched_layers; - u16 num_tx_sched_phys_layers; + u8 num_tx_sched_layers; + u8 num_tx_sched_phys_layers; u8 flattened_layers; u8 max_cgds; u8 sw_entry_point_layer; u16 max_children[ICE_AQC_TOPO_MAX_LEVEL_NUM]; struct list_head agg_list; /* lists all aggregator */ - + /* List contain profile ID(s) and other params per layer */ + struct list_head rl_prof_list[ICE_AQC_TOPO_MAX_LEVEL_NUM]; struct ice_vsi_ctx *vsi_ctx[ICE_MAX_VSI]; u8 evb_veb; /* true for VEB, false for VEPA */ u8 reset_ongoing; /* true if HW is in reset, false otherwise */ struct ice_bus_info bus; - struct ice_nvm_info nvm; + struct ice_flash_info flash; struct ice_hw_dev_caps dev_caps; /* device capabilities */ struct ice_hw_func_caps func_caps; /* function capabilities */ @@ -436,8 +1073,11 @@ struct ice_hw { /* Control Queue info */ struct ice_ctl_q_info adminq; + struct ice_ctl_q_info sbq; struct ice_ctl_q_info mailboxq; - +#define DCF_ACL_CAP 0x01 /* DCF ACL capability */ +#define DCF_UDP_TUNNEL_CAP 0x02 /* DCF UDP Tunnel capability */ + u8 dcf_caps; u8 api_branch; /* API branch version */ u8 api_maj_ver; /* API major version */ u8 api_min_ver; /* API minor version */ @@ -448,10 +1088,11 @@ struct ice_hw { u8 fw_patch; /* firmware patch version */ u32 fw_build; /* firmware build number */ - struct ice_fw_log_cfg fw_log; + struct ice_fwlog_cfg fwlog_cfg; + bool fwlog_support_ena; /* does hardware support FW logging? */ /* Device max aggregate bandwidths corresponding to the GL_PWR_MODE_CTL - * register. Used for determining the ITR/intrl granularity during + * register. Used for determining the ITR/INTRL granularity during * initialization. 
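And a sketch of how a caller might package per-interrupt statistics into ice_mbx_data before invoking the detection logic (the half-full watermark heuristic shown is an arbitrary example, not the driver's policy):

	/* Illustrative only: fill the per-service-event mailbox statistics */
	static void example_fill_mbx_data(struct ice_mbx_data *mbx,
					  u16 processed, u16 pending,
					  u16 ring_len)
	{
		mbx->num_msg_proc = processed;
		mbx->num_pending_arq = pending;
		mbx->max_num_msgs_mbx = ring_len;
		/* treat a half-full async backlog as suspicious (example) */
		mbx->async_watermark_val = ring_len / 2;
	}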
*/ #define ICE_MAX_AGG_BW_200G 0x0 @@ -469,22 +1110,33 @@ struct ice_hw { /* INTRL granularity in 1 us */ u8 intrl_gran; - u8 ucast_shared; /* true if VSIs can share unicast addr */ + /* true if VSIs can share unicast MAC addr */ + u8 umac_shared; + +#define ICE_PHY_PER_NAC 1 +#define ICE_MAX_QUAD 2 +#define ICE_NUM_QUAD_TYPE 2 +#define ICE_PORTS_PER_QUAD 4 +#define ICE_PHY_0_LAST_QUAD 1 +#define ICE_PORTS_PER_PHY 8 +#define ICE_NUM_EXTERNAL_PORTS ICE_PORTS_PER_PHY + /* Active package version (currently active) */ struct ice_pkg_ver active_pkg_ver; + u32 active_track_id; u8 active_pkg_name[ICE_PKG_NAME_SIZE]; u8 active_pkg_in_nvm; enum ice_aq_err pkg_dwnld_status; - /* Driver's package ver - (from the Metadata seg) */ + /* Driver's package ver - (from the Ice Metadata section) */ struct ice_pkg_ver pkg_ver; u8 pkg_name[ICE_PKG_NAME_SIZE]; - /* Driver's Ice package version (from the Ice seg) */ - struct ice_pkg_ver ice_pkg_ver; - u8 ice_pkg_name[ICE_PKG_NAME_SIZE]; + /* Driver's Ice segment format version and id (from the Ice seg) */ + struct ice_pkg_ver ice_seg_fmt_ver; + u8 ice_seg_id[ICE_SEG_ID_SIZE]; /* Pointer to the ice segment */ struct ice_seg *seg; @@ -493,8 +1145,40 @@ struct ice_hw { u8 *pkg_copy; u32 pkg_size; + /* tunneling info */ + struct mutex tnl_lock; + struct ice_tunnel_table tnl; + + /* dvm boost update information */ + struct ice_dvm_table dvm_upd; + + struct ice_acl_tbl *acl_tbl; + struct ice_fd_hw_prof **acl_prof; + u16 acl_fltr_cnt[ICE_FLTR_PTYPE_MAX]; /* HW block tables */ struct ice_blk_info blk[ICE_BLK_COUNT]; + struct mutex fl_profs_locks[ICE_BLK_COUNT]; /* lock fltr profiles */ + struct list_head fl_profs[ICE_BLK_COUNT]; + /* Flow Director filter info */ + int fdir_active_fltr; + + struct mutex fdir_fltr_lock; /* protect Flow Director */ + struct list_head fdir_list_head; + + /* Book-keeping of side-band filter count per flow-type. + * This is used to detect and handle input set changes for + * respective flow-type. 
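The book-keeping comment here refers to the fdir_fltr_cnt array declared just below; the expected usage is simple reference counting per flow type, roughly (hedged sketch):

	/* Illustrative only: a nonzero count means an input set is already
	 * programmed for this flow type, so a request with a different
	 * input set requires tearing the existing profile down first.
	 */
	static bool example_fdir_flow_in_use(const struct ice_hw *hw,
					     enum ice_fltr_ptype flow)
	{
		return hw->fdir_fltr_cnt[flow] != 0;
	}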
+ */ + u16 fdir_fltr_cnt[ICE_FLTR_PTYPE_MAX]; + + struct ice_fd_hw_prof **fdir_prof; + DECLARE_BITMAP(fdir_perfect_fltr, ICE_FLTR_PTYPE_MAX); + struct mutex rss_locks; /* protect RSS configuration */ + struct list_head rss_list_head; + struct ice_mbx_snapshot mbx_snapshot; + DECLARE_BITMAP(hw_ptype, ICE_FLOW_PTYPE_MAX); + u8 dvm_ena; + __le16 io_expander_handle; }; /* Statistics collected by each port, VSI, VEB, and S-channel */ @@ -513,6 +1197,16 @@ struct ice_eth_stats { u64 tx_errors; /* tepc */ }; +#define ICE_MAX_UP 8 + +/* Statistics collected per VEB per User Priority (UP) for up to 8 UPs */ +struct ice_veb_up_stats { + u64 up_rx_pkts[ICE_MAX_UP]; + u64 up_rx_bytes[ICE_MAX_UP]; + u64 up_tx_pkts[ICE_MAX_UP]; + u64 up_tx_bytes[ICE_MAX_UP]; +}; + /* Statistics collected by the MAC */ struct ice_hw_port_stats { /* eth stats collected by the port */ @@ -552,26 +1246,207 @@ struct ice_hw_port_stats { u64 tx_size_1023; /* ptc1023 */ u64 tx_size_1522; /* ptc1522 */ u64 tx_size_big; /* ptc9522 */ + u64 mac_short_pkt_dropped; /* mspdc */ + /* EEE LPI */ + u32 tx_lpi_status; + u32 rx_lpi_status; + u64 tx_lpi_count; /* etlpic */ + u64 rx_lpi_count; /* erlpic */ + /* flow director stats */ + u32 fd_sb_status; + u64 fd_sb_match; + u64 ch_atr_match; +#ifdef ICE_ADD_PROBES + u64 arfs_tcpv4_match; + u64 arfs_tcpv6_match; + u64 arfs_udpv4_match; + u64 arfs_udpv6_match; +#endif /* ICE_ADD_PROBES */ +}; + +enum ice_sw_fwd_act_type { + ICE_FWD_TO_VSI = 0, + ICE_FWD_TO_VSI_LIST, /* Do not use this when adding filter */ + ICE_FWD_TO_Q, + ICE_FWD_TO_QGRP, + ICE_DROP_PACKET, + ICE_INVAL_ACT +}; + +struct ice_aq_get_set_rss_lut_params { + u16 vsi_handle; /* software VSI handle */ + u16 lut_size; /* size of the LUT buffer */ + u8 lut_type; /* type of the LUT (i.e. VSI, PF, Global) */ + u8 *lut; /* input RSS LUT for set and output RSS LUT for get */ + u8 global_lut_id; /* only valid when lut_type is global */ }; /* Checksum and Shadow RAM pointers */ -#define ICE_SR_NVM_DEV_STARTER_VER 0x18 -#define ICE_SR_NVM_EETRACK_LO 0x2D -#define ICE_SR_NVM_EETRACK_HI 0x2E -#define ICE_NVM_VER_LO_SHIFT 0 -#define ICE_NVM_VER_LO_MASK (0xff << ICE_NVM_VER_LO_SHIFT) -#define ICE_NVM_VER_HI_SHIFT 12 -#define ICE_NVM_VER_HI_MASK (0xf << ICE_NVM_VER_HI_SHIFT) -#define ICE_OEM_VER_PATCH_SHIFT 0 -#define ICE_OEM_VER_PATCH_MASK (0xff << ICE_OEM_VER_PATCH_SHIFT) -#define ICE_OEM_VER_BUILD_SHIFT 8 -#define ICE_OEM_VER_BUILD_MASK (0xffff << ICE_OEM_VER_BUILD_SHIFT) -#define ICE_OEM_VER_SHIFT 24 -#define ICE_OEM_VER_MASK (0xff << ICE_OEM_VER_SHIFT) +#define ICE_SR_NVM_CTRL_WORD 0x00 +#define ICE_SR_PHY_ANALOG_PTR 0x04 +#define ICE_SR_OPTION_ROM_PTR 0x05 +#define ICE_SR_RO_PCIR_REGS_AUTO_LOAD_PTR 0x06 +#define ICE_SR_AUTO_GENERATED_POINTERS_PTR 0x07 +#define ICE_SR_PCIR_REGS_AUTO_LOAD_PTR 0x08 +#define ICE_SR_EMP_GLOBAL_MODULE_PTR 0x09 +#define ICE_SR_EMP_IMAGE_PTR 0x0B +#define ICE_SR_PE_IMAGE_PTR 0x0C +#define ICE_SR_CSR_PROTECTED_LIST_PTR 0x0D +#define ICE_SR_MNG_CFG_PTR 0x0E +#define ICE_SR_EMP_MODULE_PTR 0x0F +#define ICE_SR_PBA_BLOCK_PTR 0x16 +#define ICE_SR_BOOT_CFG_PTR 0x132 +#define ICE_SR_NVM_WOL_CFG 0x19 +#define ICE_NVM_OROM_VER_OFF 0x02 +#define ICE_SR_NVM_DEV_STARTER_VER 0x18 +#define ICE_SR_ALTERNATE_SAN_MAC_ADDR_PTR 0x27 +#define ICE_SR_PERMANENT_SAN_MAC_ADDR_PTR 0x28 +#define ICE_SR_NVM_MAP_VER 0x29 +#define ICE_SR_NVM_IMAGE_VER 0x2A +#define ICE_SR_NVM_STRUCTURE_VER 0x2B +#define ICE_SR_NVM_EETRACK_LO 0x2D +#define ICE_SR_NVM_EETRACK_HI 0x2E +#define ICE_NVM_VER_LO_SHIFT 0 +#define ICE_NVM_VER_LO_MASK (0xff << 
ICE_NVM_VER_LO_SHIFT) +#define ICE_NVM_VER_HI_SHIFT 12 +#define ICE_NVM_VER_HI_MASK (0xf << ICE_NVM_VER_HI_SHIFT) +#define ICE_OEM_EETRACK_ID 0xffffffff +#define ICE_OROM_VER_PATCH_SHIFT 0 +#define ICE_OROM_VER_PATCH_MASK (0xff << ICE_OROM_VER_PATCH_SHIFT) +#define ICE_OROM_VER_BUILD_SHIFT 8 +#define ICE_OROM_VER_BUILD_MASK (0xffff << ICE_OROM_VER_BUILD_SHIFT) +#define ICE_OROM_VER_SHIFT 24 +#define ICE_OROM_VER_MASK (0xff << ICE_OROM_VER_SHIFT) +#define ICE_SR_VPD_PTR 0x2F +#define ICE_SR_PXE_SETUP_PTR 0x30 +#define ICE_SR_PXE_CFG_CUST_OPTIONS_PTR 0x31 +#define ICE_SR_NVM_ORIGINAL_EETRACK_LO 0x34 +#define ICE_SR_NVM_ORIGINAL_EETRACK_HI 0x35 +#define ICE_SR_VLAN_CFG_PTR 0x37 +#define ICE_SR_POR_REGS_AUTO_LOAD_PTR 0x38 +#define ICE_SR_EMPR_REGS_AUTO_LOAD_PTR 0x3A +#define ICE_SR_GLOBR_REGS_AUTO_LOAD_PTR 0x3B +#define ICE_SR_CORER_REGS_AUTO_LOAD_PTR 0x3C +#define ICE_SR_PHY_CFG_SCRIPT_PTR 0x3D +#define ICE_SR_PCIE_ALT_AUTO_LOAD_PTR 0x3E +#define ICE_SR_SW_CHECKSUM_WORD 0x3F +#define ICE_SR_PFA_PTR 0x40 +#define ICE_SR_1ST_SCRATCH_PAD_PTR 0x41 +#define ICE_SR_1ST_NVM_BANK_PTR 0x42 +#define ICE_SR_NVM_BANK_SIZE 0x43 +#define ICE_SR_1ST_OROM_BANK_PTR 0x44 +#define ICE_SR_OROM_BANK_SIZE 0x45 +#define ICE_SR_NETLIST_BANK_PTR 0x46 +#define ICE_SR_NETLIST_BANK_SIZE 0x47 +#define ICE_SR_EMP_SR_SETTINGS_PTR 0x48 +#define ICE_SR_CONFIGURATION_METADATA_PTR 0x4D +#define ICE_SR_IMMEDIATE_VALUES_PTR 0x4E +#define ICE_SR_LINK_DEFAULT_OVERRIDE_PTR 0x134 +#define ICE_SR_POR_REGISTERS_AUTOLOAD_PTR 0x118 + +/* CSS Header words */ +#define ICE_NVM_CSS_SREV_L 0x14 +#define ICE_NVM_CSS_SREV_H 0x15 + +/* Length of CSS header section in words */ +#define ICE_CSS_HEADER_LENGTH 330 + +/* Offset of Shadow RAM copy in the NVM bank area. */ +#define ICE_NVM_SR_COPY_WORD_OFFSET roundup(ICE_CSS_HEADER_LENGTH, 32) + +/* Size in bytes of Option ROM trailer */ +#define ICE_NVM_OROM_TRAILER_LENGTH (2 * ICE_CSS_HEADER_LENGTH) + +/* The Link Topology Netlist section is stored as a series of words. It is + * stored in the NVM as a TLV, with the first two words containing the type + * and length. + */ +#define ICE_NETLIST_LINK_TOPO_MOD_ID 0x011B +#define ICE_NETLIST_TYPE_OFFSET 0x0000 +#define ICE_NETLIST_LEN_OFFSET 0x0001 + +/* The Link Topology section follows the TLV header. When reading the netlist + * using ice_read_netlist_module, we need to account for the 2-word TLV + * header. + */ +#define ICE_NETLIST_LINK_TOPO_OFFSET(n) ((n) + 2) + +#define ICE_LINK_TOPO_MODULE_LEN ICE_NETLIST_LINK_TOPO_OFFSET(0x0000) +#define ICE_LINK_TOPO_NODE_COUNT ICE_NETLIST_LINK_TOPO_OFFSET(0x0001) + +#define ICE_LINK_TOPO_NODE_COUNT_M ICE_M(0x3FF, 0) + +/* The Netlist ID Block is located after all of the Link Topology nodes. 
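Worked example of the netlist offset arithmetic, using the ID-block macros defined just below: with a 2-word TLV header and 2 words per topology node, the ID block of a module holding node_count nodes starts at word 2 + (4 + 2 * node_count), and each field adds its own word offset on top of that.

	/* Illustrative only: absolute word offset of the major-version low
	 * word within the netlist module, given the node count parsed from
	 * the ICE_LINK_TOPO_NODE_COUNT word.
	 */
	static u32 example_netlist_major_ver_low(u16 node_count)
	{
		return ICE_NETLIST_ID_BLK_OFFSET(node_count) +
		       ICE_NETLIST_ID_BLK_MAJOR_VER_LOW;
	}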
*/ +#define ICE_NETLIST_ID_BLK_SIZE 0x30 +#define ICE_NETLIST_ID_BLK_OFFSET(n) ICE_NETLIST_LINK_TOPO_OFFSET(0x0004 + 2 * (n)) + +/* netlist ID block field offsets (word offsets) */ +#define ICE_NETLIST_ID_BLK_MAJOR_VER_LOW 0x02 +#define ICE_NETLIST_ID_BLK_MAJOR_VER_HIGH 0x03 +#define ICE_NETLIST_ID_BLK_MINOR_VER_LOW 0x04 +#define ICE_NETLIST_ID_BLK_MINOR_VER_HIGH 0x05 +#define ICE_NETLIST_ID_BLK_TYPE_LOW 0x06 +#define ICE_NETLIST_ID_BLK_TYPE_HIGH 0x07 +#define ICE_NETLIST_ID_BLK_REV_LOW 0x08 +#define ICE_NETLIST_ID_BLK_REV_HIGH 0x09 +#define ICE_NETLIST_ID_BLK_SHA_HASH_WORD(n) (0x0A + (n)) +#define ICE_NETLIST_ID_BLK_CUST_VER 0x2F + +/* Auxiliary field, mask and shift definition for Shadow RAM and NVM Flash */ +#define ICE_SR_VPD_SIZE_WORDS 512 +#define ICE_SR_PCIE_ALT_SIZE_WORDS 512 +#define ICE_SR_CTRL_WORD_1_S 0x06 +#define ICE_SR_CTRL_WORD_1_M (0x03 << ICE_SR_CTRL_WORD_1_S) +#define ICE_SR_CTRL_WORD_VALID 0x1 +#define ICE_SR_CTRL_WORD_OROM_BANK BIT(3) +#define ICE_SR_CTRL_WORD_NETLIST_BANK BIT(4) +#define ICE_SR_CTRL_WORD_NVM_BANK BIT(5) + +#define ICE_SR_NVM_PTR_4KB_UNITS BIT(15) + +/* Shadow RAM related */ #define ICE_SR_SECTOR_SIZE_IN_WORDS 0x800 +#define ICE_SR_BUF_ALIGNMENT 4096 #define ICE_SR_WORDS_IN_1KB 512 - +/* Checksum should be calculated such that after adding all the words, + * including the checksum word itself, the sum should be 0xBABA. + */ +#define ICE_SR_SW_CHECKSUM_BASE 0xBABA + +/* Link override related */ +#define ICE_SR_PFA_LINK_OVERRIDE_WORDS 10 +#define ICE_SR_PFA_LINK_OVERRIDE_PHY_WORDS 4 +#define ICE_SR_PFA_LINK_OVERRIDE_OFFSET 2 +#define ICE_SR_PFA_LINK_OVERRIDE_FEC_OFFSET 1 +#define ICE_SR_PFA_LINK_OVERRIDE_PHY_OFFSET 2 +#define ICE_FW_API_LINK_OVERRIDE_MAJ 1 +#define ICE_FW_API_LINK_OVERRIDE_MIN 5 +#define ICE_FW_API_LINK_OVERRIDE_PATCH 2 + +#define ICE_PBA_FLAG_DFLT 0xFAFA /* Hash redirection LUT for VSI - maximum array size */ #define ICE_VSIQF_HLUT_ARRAY_SIZE ((VSIQF_HLUT_MAX_INDEX + 1) * 4) +/* + * Defines for values in the VF_PE_DB_SIZE bits in the GLPCI_LBARCTRL register. + * This is needed to determine the BAR0 space for the VFs + */ +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_0KB 0x0 +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_8KB 0x1 +#define GLPCI_LBARCTRL_VF_PE_DB_SIZE_64KB 0x2 + +/* AQ API version for LLDP_FILTER_CONTROL */ +#define ICE_FW_API_LLDP_FLTR_MAJ 1 +#define ICE_FW_API_LLDP_FLTR_MIN 7 +#define ICE_FW_API_LLDP_FLTR_PATCH 1 + +/* AQ API version for report default configuration */ +#define ICE_FW_API_REPORT_DFLT_CFG_MAJ 1 +#define ICE_FW_API_REPORT_DFLT_CFG_MIN 7 +#define ICE_FW_API_REPORT_DFLT_CFG_PATCH 3 + +/* AQ API version for FW health reports */ +#define ICE_FW_API_HEALTH_REPORT_MAJ 1 +#define ICE_FW_API_HEALTH_REPORT_MIN 7 +#define ICE_FW_API_HEALTH_REPORT_PATCH 6 #endif /* _ICE_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c new file mode 100644 index 0000000000000000000000000000000000000000..bd1254d01b80105e1c47f4c1495f4c38fc9c8b81 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_vsi_vlan_ops.h" +#include "ice_vsi_vlan_lib.h" +#include "ice_vlan_mode.h" +#include "ice.h" +#include "ice_vf_vsi_vlan_ops.h" +#include "ice_virtchnl_pf.h" +#include "ice_lib.h" + +static int +noop_vlan_arg(struct ice_vsi __always_unused *vsi, + struct ice_vlan * __always_unused vlan) +{ + return 0; +} + +static int +noop_vlan(struct ice_vsi __always_unused *vsi) +{ + return 0; +} + +/** + * ice_vf_vsi_init_vlan_ops - Initialize default VSI VLAN ops for VF VSI + * @vsi: VF's VSI being configured + * + * If Double VLAN Mode (DVM) is enabled, assume that the VF supports the new + * VIRTCHNL_VF_VLAN_OFFLOAD_V2 capability and set up the VLAN ops accordingly. + * If SVM is enabled maintain the same level of VLAN support previous to + * VIRTCHNL_VF_VLAN_OFFLOAD_V2. + */ +void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops; + struct ice_pf *pf = vsi->back; + struct ice_vf *vf; + + vf = &pf->vf[vsi->vf_id]; + + if (ice_is_dvm_ena(&pf->hw)) { + vlan_ops = &vsi->outer_vlan_ops; + + /* outer VLAN ops regardless of port VLAN config */ + vlan_ops->add_vlan = ice_vsi_add_vlan; + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + vlan_ops->ena_tx_filtering = ice_vsi_ena_tx_vlan_filtering; + vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering; + + if (ice_vf_is_port_vlan_ena(vf)) { + /* setup outer VLAN ops */ + vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; + vlan_ops->ena_rx_filtering = + ice_vsi_ena_rx_vlan_filtering; + + /* setup inner VLAN ops */ + vlan_ops = &vsi->inner_vlan_ops; + vlan_ops->add_vlan = noop_vlan_arg; + vlan_ops->del_vlan = noop_vlan_arg; + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; + } else { + if (test_bit(ICE_FLAG_VF_VLAN_PRUNE_DIS, pf->flags)) + vlan_ops->ena_rx_filtering = noop_vlan; + else + vlan_ops->ena_rx_filtering = + ice_vsi_ena_rx_vlan_filtering; + + vlan_ops->del_vlan = ice_vsi_del_vlan; + vlan_ops->ena_stripping = ice_vsi_ena_outer_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_outer_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion; + + /* setup inner VLAN ops */ + vlan_ops = &vsi->inner_vlan_ops; + + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; + } + } else { + vlan_ops = &vsi->inner_vlan_ops; + + /* inner VLAN ops regardless of port VLAN config */ + vlan_ops->add_vlan = ice_vsi_add_vlan; + vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering; + vlan_ops->ena_tx_filtering = ice_vsi_ena_tx_vlan_filtering; + vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering; + + if (ice_vf_is_port_vlan_ena(vf)) { + vlan_ops->set_port_vlan = ice_vsi_set_inner_port_vlan; + vlan_ops->ena_rx_filtering = + ice_vsi_ena_rx_vlan_filtering; + } else { + if (test_bit(ICE_FLAG_VF_VLAN_PRUNE_DIS, pf->flags)) + vlan_ops->ena_rx_filtering = noop_vlan; + else + vlan_ops->ena_rx_filtering = + ice_vsi_ena_rx_vlan_filtering; + + vlan_ops->del_vlan = ice_vsi_del_vlan; + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = 
ice_vsi_dis_inner_insertion;
+		}
+	}
+}
+
+/**
+ * ice_vf_vsi_cfg_dvm_legacy_vlan_mode - Config VLAN mode for old VFs in DVM
+ * @vsi: VF's VSI being configured
+ *
+ * This should only be called when Double VLAN Mode (DVM) is enabled, there
+ * is not a port VLAN enabled on this VF, and the VF negotiates
+ * VIRTCHNL_VF_OFFLOAD_VLAN.
+ *
+ * This function sets up the VF VSI's inner and outer ice_vsi_vlan_ops and also
+ * initializes software-only VLAN mode (i.e. allow all VLANs). It also installs
+ * no-op implementations for any functions that may be called during the
+ * lifetime of the VF, so those methods do nothing and succeed.
+ */
+void ice_vf_vsi_cfg_dvm_legacy_vlan_mode(struct ice_vsi *vsi)
+{
+	struct ice_vf *vf = &vsi->back->vf[vsi->vf_id];
+	struct ice_vsi_vlan_ops *vlan_ops;
+	struct device *dev = ice_pf_to_dev(vf->pf);
+
+	if (!ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf))
+		return;
+
+	vlan_ops = &vsi->outer_vlan_ops;
+
+	/* Rx VLAN filtering always disabled to allow software offloaded VLANs
+	 * for VFs that only support VIRTCHNL_VF_OFFLOAD_VLAN and don't have a
+	 * port VLAN configured
+	 */
+	vlan_ops->dis_rx_filtering = ice_vsi_dis_rx_vlan_filtering;
+	/* Don't fail when attempting to enable Rx VLAN filtering */
+	vlan_ops->ena_rx_filtering = noop_vlan;
+
+	/* Tx VLAN filtering always disabled to allow software offloaded VLANs
+	 * for VFs that only support VIRTCHNL_VF_OFFLOAD_VLAN and don't have a
+	 * port VLAN configured
+	 */
+	vlan_ops->dis_tx_filtering = ice_vsi_dis_tx_vlan_filtering;
+	/* Don't fail when attempting to enable Tx VLAN filtering */
+	vlan_ops->ena_tx_filtering = noop_vlan;
+
+	if (vlan_ops->dis_rx_filtering(vsi))
+		dev_dbg(dev, "Failed to disable Rx VLAN filtering for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+	if (vlan_ops->dis_tx_filtering(vsi))
+		dev_dbg(dev, "Failed to disable Tx VLAN filtering for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+
+	/* All outer VLAN offloads must be disabled */
+	vlan_ops->dis_stripping = ice_vsi_dis_outer_stripping;
+	vlan_ops->dis_insertion = ice_vsi_dis_outer_insertion;
+
+	if (vlan_ops->dis_stripping(vsi))
+		dev_dbg(dev, "Failed to disable outer VLAN stripping for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+
+	if (vlan_ops->dis_insertion(vsi))
+		dev_dbg(dev, "Failed to disable outer VLAN insertion for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+
+	/* All inner VLAN offloads must be disabled */
+	vlan_ops = &vsi->inner_vlan_ops;
+
+	vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping;
+	vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion;
+
+	if (vlan_ops->dis_stripping(vsi))
+		dev_dbg(dev, "Failed to disable inner VLAN stripping for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+
+	if (vlan_ops->dis_insertion(vsi))
+		dev_dbg(dev, "Failed to disable inner VLAN insertion for old VF without VIRTCHNL_VF_OFFLOAD_VLAN_V2 support\n");
+}
+
+/**
+ * ice_vf_vsi_cfg_svm_legacy_vlan_mode - Config VLAN mode for old VFs in SVM
+ * @vsi: VF's VSI being configured
+ *
+ * This should only be called when Single VLAN Mode (SVM) is enabled, there is
+ * not a port VLAN enabled on this VF, and the VF negotiates
+ * VIRTCHNL_VF_OFFLOAD_VLAN.
+ *
+ * All of the normal SVM VLAN ops are identical for this case. However, Rx
+ * VLAN filtering should be turned off by default in this case.
+ */ +void ice_vf_vsi_cfg_svm_legacy_vlan_mode(struct ice_vsi *vsi) +{ + struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; + + if (ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf)) + return; + + if (vsi->inner_vlan_ops.dis_rx_filtering(vsi)) + dev_dbg(ice_pf_to_dev(vf->pf), "Failed to disable Rx VLAN filtering for old VF with VIRTCHNL_VF_OFFLOAD_VLAN support\n"); +} + +/** + * ice_vf_vsi_dcf_set_outer_port_vlan - Config outer port VLAN for VF in DVM + * @vsi: VF's VSI being configured + * @vlan: ice_vlan structure used to set the port VLAN + */ +int ice_vf_vsi_dcf_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; + int err; + + if (!ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf)) + return -EOPNOTSUPP; + + err = ice_vsi_set_outer_port_vlan(vsi, vlan); + if (err) + return err; + + err = ice_vsi_add_vlan(vsi, vlan); + if (err) + return err; + + vsi->outer_vlan_ops.add_vlan = noop_vlan_arg; + + return 0; +} + +/** + * ice_vf_vsi_dcf_ena_outer_vlan_stripping - Enable outer VLAN stripping for VF in DVM + * @vsi: VF's VSI being configured + * @tpid: TPID to enable outer VLAN stripping for + */ +int ice_vf_vsi_dcf_ena_outer_vlan_stripping(struct ice_vsi *vsi, u16 tpid) +{ + struct ice_vf *vf = &vsi->back->vf[vsi->vf_id]; + int err; + + if (!ice_is_dvm_ena(&vsi->back->hw) || ice_vf_is_port_vlan_ena(vf)) + return -EOPNOTSUPP; + + err = ice_vsi_add_vlan_zero(vsi); + if (err) + return err; + + err = ice_vsi_ena_outer_stripping(vsi, tpid); + if (err) + return err; + + vsi->outer_vlan_ops.add_vlan = noop_vlan_arg; + + return 0; +} diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c02936e6aa31d91086424e2d52bc9f5736205cf8 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VF_VSI_VLAN_OPS_H_ +#define _ICE_VF_VSI_VLAN_OPS_H_ + +#include "ice_vsi_vlan_ops.h" + +struct ice_vsi; + +void ice_vf_vsi_cfg_dvm_legacy_vlan_mode(struct ice_vsi *vsi); +void ice_vf_vsi_cfg_svm_legacy_vlan_mode(struct ice_vsi *vsi); +int ice_vf_vsi_dcf_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +int ice_vf_vsi_dcf_ena_outer_vlan_stripping(struct ice_vsi *vsi, u16 tpid); +#ifdef CONFIG_PCI_IOV +void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi); +#else +static inline void ice_vf_vsi_init_vlan_ops(struct ice_vsi *vsi) { } +#endif /* CONFIG_PCI_IOV */ + +#endif /* _ICE_PF_VSI_VLAN_OPS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c new file mode 100644 index 0000000000000000000000000000000000000000..ffbbe0bfc3d05bf3086ff3eadf74a5b85ea643ba --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_virtchnl_allowlist.h" + +/* Purpose of this file is to share functionality to allowlist or denylist + * opcodes used in PF <-> VF communication. 
Group of opcodes: + * - default -> should be always allowed after creating VF, + * default_allowlist_opcodes + * - opcodes needed by VF to work correctly, but not associated with caps -> + * should be allowed after successful VF resources allocation, + * working_allowlist_opcodes + * - opcodes needed by VF when caps are activated + * + * Caps that don't use new opcodes (no opcodes should be allowed): + * - VIRTCHNL_VF_OFFLOAD_RSS_AQ + * - VIRTCHNL_VF_OFFLOAD_RSS_REG + * - VIRTCHNL_VF_OFFLOAD_WB_ON_ITR + * - VIRTCHNL_VF_OFFLOAD_CRC + * - VIRTCHNL_VF_OFFLOAD_RX_POLLING + * - VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 + * - VIRTCHNL_VF_OFFLOAD_ENCAP + * - VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM + * - VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM + * - VIRTCHNL_VF_OFFLOAD_USO + */ + +/* default opcodes to communicate with VF */ +static const u32 default_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_VF_RESOURCES, VIRTCHNL_OP_VERSION, VIRTCHNL_OP_RESET_VF, +}; + +/* opcodes supported after successful VIRTCHNL_OP_GET_VF_RESOURCES */ +static const u32 working_allowlist_opcodes[] = { + VIRTCHNL_OP_CONFIG_TX_QUEUE, VIRTCHNL_OP_CONFIG_RX_QUEUE, + VIRTCHNL_OP_CONFIG_VSI_QUEUES, VIRTCHNL_OP_CONFIG_IRQ_MAP, + VIRTCHNL_OP_ENABLE_QUEUES, VIRTCHNL_OP_DISABLE_QUEUES, + VIRTCHNL_OP_GET_STATS, VIRTCHNL_OP_EVENT, +}; + +/* VIRTCHNL_VF_OFFLOAD_L2 */ +static const u32 l2_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_ETH_ADDR, VIRTCHNL_OP_DEL_ETH_ADDR, + VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE, +}; + +/* VIRTCHNL_VF_CAP_RDMA */ +static const u32 rdma_allowlist_opcodes[] = { + VIRTCHNL_OP_RDMA, VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP, + VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP, +}; + +/* VIRTCHNL_VF_OFFLOAD_REQ_QUEUES */ +static const u32 req_queues_allowlist_opcodes[] = { + VIRTCHNL_OP_REQUEST_QUEUES, +}; + +/* VIRTCHNL_VF_OFFLOAD_VLAN */ +static const u32 vlan_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_VLAN, VIRTCHNL_OP_DEL_VLAN, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, +}; + +/* VIRTCHNL_VF_OFFLOAD_VLAN_V2 */ +static const u32 vlan_v2_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS, VIRTCHNL_OP_ADD_VLAN_V2, + VIRTCHNL_OP_DEL_VLAN_V2, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2, + VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2, + VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2, +}; + +/* VIRTCHNL_VF_OFFLOAD_RSS_PF */ +static const u32 rss_pf_allowlist_opcodes[] = { + VIRTCHNL_OP_CONFIG_RSS_KEY, VIRTCHNL_OP_CONFIG_RSS_LUT, + VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA, +}; + +/* VIRTCHNL_VF_OFFLOAD_ADQ */ +static const u32 adq_allowlist_opcodes[] = { + VIRTCHNL_OP_ENABLE_CHANNELS, VIRTCHNL_OP_DISABLE_CHANNELS, + VIRTCHNL_OP_ADD_CLOUD_FILTER, VIRTCHNL_OP_DEL_CLOUD_FILTER, +}; + +/* VIRTCHNL_VF_OFFLOAD_ADQ_V2 */ +static const u32 adq_v2_allowlist_opcodes[] = { + VIRTCHNL_OP_ENABLE_CHANNELS, VIRTCHNL_OP_DISABLE_CHANNELS, + VIRTCHNL_OP_ADD_CLOUD_FILTER, VIRTCHNL_OP_DEL_CLOUD_FILTER, +}; + +/* VIRTCHNL_VF_CAP_DCF */ +static const u32 cap_dcf_allowlist_opcodes[] = { + VIRTCHNL_OP_DCF_VLAN_OFFLOAD, + VIRTCHNL_OP_DCF_CMD_DESC, VIRTCHNL_OP_DCF_CMD_BUFF, + VIRTCHNL_OP_DCF_DISABLE, VIRTCHNL_OP_DCF_GET_VSI_MAP, + VIRTCHNL_OP_DCF_GET_PKG_INFO, + VIRTCHNL_OP_DCF_RULE_FLUSH, +}; + +/* VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC */ +static const u32 rx_flex_desc_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS, +}; + +/* VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF */ +static const u32 adv_rss_pf_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_RSS_CFG, VIRTCHNL_OP_DEL_RSS_CFG, +}; + +/* VIRTCHNL_VF_OFFLOAD_FDIR_PF */ +static const u32 
fdir_pf_allowlist_opcodes[] = {
+	VIRTCHNL_OP_ADD_FDIR_FILTER, VIRTCHNL_OP_DEL_FDIR_FILTER,
+};
+
+/* VIRTCHNL_VF_LARGE_NUM_QPAIRS */
+static const u32 large_num_qpairs_allowlist_opcodes[] = {
+	VIRTCHNL_OP_GET_MAX_RSS_QREGION,
+	VIRTCHNL_OP_ENABLE_QUEUES_V2,
+	VIRTCHNL_OP_DISABLE_QUEUES_V2,
+	VIRTCHNL_OP_MAP_QUEUE_VECTOR,
+};
+
+struct allowlist_opcode_info {
+	const u32 *opcodes;
+	size_t size;
+};
+
+#define BIT_INDEX(caps) (HWEIGHT((caps) - 1))
+#define ALLOW_ITEM(caps, list) \
+	[BIT_INDEX(caps)] = { \
+		.opcodes = list, \
+		.size = ARRAY_SIZE(list) \
+	}
+static const struct allowlist_opcode_info allowlist_opcodes[] = {
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_L2, l2_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_CAP_RDMA, rdma_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_REQ_QUEUES, req_queues_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN, vlan_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RSS_PF, rss_pf_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADQ, adq_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADQ_V2, adq_v2_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_CAP_DCF, cap_dcf_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC, rx_flex_desc_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF, adv_rss_pf_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_FDIR_PF, fdir_pf_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_LARGE_NUM_QPAIRS, large_num_qpairs_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN_V2, vlan_v2_allowlist_opcodes),
+};
+
+/**
+ * ice_vc_is_opcode_allowed - check if this opcode is allowed on this VF
+ * @vf: pointer to VF structure
+ * @opcode: virtchnl opcode
+ *
+ * Return true if message is allowed on this VF
+ */
+bool ice_vc_is_opcode_allowed(struct ice_vf *vf, u32 opcode)
+{
+	if (opcode >= VIRTCHNL_OP_MAX)
+		return false;
+
+	return test_bit(opcode, vf->opcodes_allowlist);
+}
+
+/**
+ * ice_vc_allowlist_opcodes - allowlist selected opcodes
+ * @vf: pointer to VF structure
+ * @opcodes: array of opcodes to allowlist
+ * @size: size of the opcodes array
+ *
+ * Function should be called to allowlist opcodes on a VF.
+ */
+static void
+ice_vc_allowlist_opcodes(struct ice_vf *vf, const u32 *opcodes, size_t size)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; i++)
+		set_bit(opcodes[i], vf->opcodes_allowlist);
+}
+
+/**
+ * ice_vc_clear_allowlist - clear all allowlist opcodes
+ * @vf: pointer to VF structure
+ */
+static void ice_vc_clear_allowlist(struct ice_vf *vf)
+{
+	bitmap_zero(vf->opcodes_allowlist, VIRTCHNL_OP_MAX);
+}
+
+/**
+ * ice_vc_set_default_allowlist - allowlist default opcodes for VF
+ * @vf: pointer to VF structure
+ */
+void ice_vc_set_default_allowlist(struct ice_vf *vf)
+{
+	ice_vc_clear_allowlist(vf);
+	ice_vc_allowlist_opcodes(vf, default_allowlist_opcodes,
+				 ARRAY_SIZE(default_allowlist_opcodes));
+}
+
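For context, the intended consumption pattern on the message dispatch path (a sketch; the surrounding reply plumbing is assumed and not part of this file): every inbound opcode is gated on the per-VF bitmap before any handler runs.

	/* Illustrative only: reject any opcode not currently allowlisted */
	static int example_dispatch(struct ice_vf *vf, u32 v_opcode)
	{
		if (!ice_vc_is_opcode_allowed(vf, v_opcode))
			return -EOPNOTSUPP; /* real path replies NOT_SUPPORTED */

		/* ... invoke the per-opcode handler ... */
		return 0;
	}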
+ */
+void ice_vc_set_working_allowlist(struct ice_vf *vf)
+{
+	ice_vc_allowlist_opcodes(vf, working_allowlist_opcodes,
+				 ARRAY_SIZE(working_allowlist_opcodes));
+}
+
+/**
+ * ice_vc_set_caps_allowlist - allowlist VF opcodes according to caps
+ * @vf: pointer to VF structure
+ */
+void ice_vc_set_caps_allowlist(struct ice_vf *vf)
+{
+	unsigned long caps = vf->driver_caps;
+	unsigned int i;
+
+	for_each_set_bit(i, &caps, ARRAY_SIZE(allowlist_opcodes))
+		ice_vc_allowlist_opcodes(vf, allowlist_opcodes[i].opcodes,
+					 allowlist_opcodes[i].size);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h
new file mode 100644
index 0000000000000000000000000000000000000000..c33bc6ac3f541a7aac3307498b22178e8e4968c4
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_VIRTCHNL_ALLOWLIST_H_
+#define _ICE_VIRTCHNL_ALLOWLIST_H_
+#include "ice.h"
+
+bool ice_vc_is_opcode_allowed(struct ice_vf *vf, u32 opcode);
+
+void ice_vc_set_default_allowlist(struct ice_vf *vf);
+void ice_vc_set_working_allowlist(struct ice_vf *vf);
+void ice_vc_set_caps_allowlist(struct ice_vf *vf);
+#endif
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c
new file mode 100644
index 0000000000000000000000000000000000000000..a02ebef5772a87a002510dbc10bf92e216e27aab
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c
@@ -0,0 +1,2267 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#include "ice.h"
+#include "ice_base.h"
+#include "ice_lib.h"
+
+#define to_fltr_conf_from_desc(p) \
+	container_of(p, struct virtchnl_fdir_fltr_conf, input)
+
+#define ICE_FLOW_PROF_TYPE_S 0
+#define ICE_FLOW_PROF_TYPE_M (0xFFFFFFFFULL << ICE_FLOW_PROF_TYPE_S)
+#define ICE_FLOW_PROF_VSI_S 32
+#define ICE_FLOW_PROF_VSI_M (0xFFFFFFFFULL << ICE_FLOW_PROF_VSI_S)
+
+/* Flow profile ID format:
+ * [0:31] - flow type, flow + tun_offs
+ * [32:63] - VSI index
+ */
+#define ICE_FLOW_PROF_FD(vsi, flow, tun_offs) \
+	(u64)(((((flow) + (tun_offs)) & ICE_FLOW_PROF_TYPE_M)) | \
+	      (((u64)(vsi) << ICE_FLOW_PROF_VSI_S) & ICE_FLOW_PROF_VSI_M))
+
+#define GTPU_TEID_OFFSET 4
+#define GTPU_EH_QFI_OFFSET 1
+#define GTPU_EH_QFI_MASK 0x3F
+#define PFCP_S_OFFSET 0
+#define PFCP_S_MASK 0x1
+#define PFCP_PORT_NR 8805
+
+#define FDIR_INSET_FLAG_ESP_S 0
+#define FDIR_INSET_FLAG_ESP_M BIT_ULL(FDIR_INSET_FLAG_ESP_S)
+#define FDIR_INSET_FLAG_ESP_UDP BIT_ULL(FDIR_INSET_FLAG_ESP_S)
+#define FDIR_INSET_FLAG_ESP_IPSEC (0ULL << FDIR_INSET_FLAG_ESP_S)
+
+#define FDIR_INSET_FLAG_ECPRI_S 1
+#define FDIR_INSET_FLAG_ECPRI_M BIT_ULL(FDIR_INSET_FLAG_ECPRI_S)
+#define FDIR_INSET_FLAG_ECPRI_UDP BIT_ULL(FDIR_INSET_FLAG_ECPRI_S)
+#define FDIR_INSET_FLAG_ECPRI_MAC (0ULL << FDIR_INSET_FLAG_ECPRI_S)
+
+enum ice_fdir_tunnel_type {
+	ICE_FDIR_TUNNEL_TYPE_NONE = 0,
+	ICE_FDIR_TUNNEL_TYPE_GTPU,
+	ICE_FDIR_TUNNEL_TYPE_GTPU_EH,
+	ICE_FDIR_TUNNEL_TYPE_ECPRI,
+};
+
+struct virtchnl_fdir_fltr_conf {
+	struct ice_fdir_fltr input;
+	enum ice_fdir_tunnel_type ttype;
+	u64 inset_flag;
+	u32 flow_id;
+};
+
+struct virtchnl_fdir_inset_map {
+	enum virtchnl_proto_hdr_field field;
+	enum ice_flow_field fld;
+	u64 flag;
+	u64 mask;
+};
+
+static const struct virtchnl_fdir_inset_map fdir_inset_map[] = {
+	{VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE, ICE_FLOW_FIELD_IDX_ETH_TYPE,
+	 0, 0},
+
{VIRTCHNL_PROTO_HDR_IPV4_SRC, ICE_FLOW_FIELD_IDX_IPV4_SA, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_DST, ICE_FLOW_FIELD_IDX_IPV4_DA, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_DSCP, ICE_FLOW_FIELD_IDX_IPV4_DSCP, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_TTL, ICE_FLOW_FIELD_IDX_IPV4_TTL, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV4_PROT, ICE_FLOW_FIELD_IDX_IPV4_PROT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_SRC, ICE_FLOW_FIELD_IDX_IPV6_SA, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_DST, ICE_FLOW_FIELD_IDX_IPV6_DA, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_TC, ICE_FLOW_FIELD_IDX_IPV6_DSCP, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, ICE_FLOW_FIELD_IDX_IPV6_TTL, + 0, 0}, + {VIRTCHNL_PROTO_HDR_IPV6_PROT, ICE_FLOW_FIELD_IDX_IPV6_PROT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_UDP_SRC_PORT, ICE_FLOW_FIELD_IDX_UDP_SRC_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_UDP_DST_PORT, ICE_FLOW_FIELD_IDX_UDP_DST_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_TCP_SRC_PORT, ICE_FLOW_FIELD_IDX_TCP_SRC_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_TCP_DST_PORT, ICE_FLOW_FIELD_IDX_TCP_DST_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT, ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, ICE_FLOW_FIELD_IDX_SCTP_DST_PORT, + 0, 0}, + {VIRTCHNL_PROTO_HDR_GTPU_IP_TEID, ICE_FLOW_FIELD_IDX_GTPU_IP_TEID, + 0, 0}, + {VIRTCHNL_PROTO_HDR_GTPU_EH_QFI, ICE_FLOW_FIELD_IDX_GTPU_EH_QFI, + 0, 0}, + {VIRTCHNL_PROTO_HDR_ESP_SPI, ICE_FLOW_FIELD_IDX_ESP_SPI, + FDIR_INSET_FLAG_ESP_IPSEC, FDIR_INSET_FLAG_ESP_M}, + {VIRTCHNL_PROTO_HDR_ESP_SPI, ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI, + FDIR_INSET_FLAG_ESP_UDP, FDIR_INSET_FLAG_ESP_M}, + {VIRTCHNL_PROTO_HDR_AH_SPI, ICE_FLOW_FIELD_IDX_AH_SPI, + 0, 0}, + {VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID, ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID, + 0, 0}, + {VIRTCHNL_PROTO_HDR_PFCP_S_FIELD, ICE_FLOW_FIELD_IDX_UDP_DST_PORT, + 0, 0}, + { + VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID, + ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID, + FDIR_INSET_FLAG_ECPRI_MAC, + FDIR_INSET_FLAG_ECPRI_M + }, + { + VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID, + ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID, + FDIR_INSET_FLAG_ECPRI_UDP, + FDIR_INSET_FLAG_ECPRI_M + }, +}; + +/** + * ice_vc_fdir_param_check + * @vf: pointer to the VF structure + * @vsi_id: VF relative VSI ID + * + * Check for the valid VSI ID, PF's state and VF's state + * + * Return: 0 on success, and -EINVAL on error. + */ +static int +ice_vc_fdir_param_check(struct ice_vf *vf, u16 vsi_id) +{ + struct ice_pf *pf = vf->pf; + + if (!test_bit(ICE_FLAG_FD_ENA, pf->flags)) + return -EINVAL; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + return -EINVAL; + + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF)) + return -EINVAL; + + if (vsi_id != vf->lan_vsi_num) + return -EINVAL; + + if (!ice_vc_isvalid_vsi_id(vf, vsi_id)) + return -EINVAL; + + if (!ice_get_vf_vsi(vf)) + return -EINVAL; + + return 0; +} + +/** + * ice_vf_start_ctrl_vsi + * @vf: pointer to the VF structure + * + * Allocate ctrl_vsi for the first time and open the ctrl_vsi port for VF + * + * Return: 0 on success, and other on error. 
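+ *
+ * Callers are expected to treat -EEXIST as "control VSI already started";
+ * the add-filter path below does exactly that (sketch):
+ *
+ *	ret = ice_vf_start_ctrl_vsi(vf);
+ *	if (ret && ret != -EEXIST)
+ *		return ret;	// genuine setup failure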
+ */ +static int ice_vf_start_ctrl_vsi(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + struct ice_vsi *ctrl_vsi; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + if (vf->ctrl_vsi_idx != ICE_NO_VSI) + return -EEXIST; + + ctrl_vsi = ice_vf_ctrl_vsi_setup(vf); + if (!ctrl_vsi) { + dev_dbg(dev, "Could not setup control VSI for VF %d\n", + vf->vf_id); + return -ENOMEM; + } + + err = ice_vsi_open_ctrl(ctrl_vsi); + if (err) { + dev_dbg(dev, "Could not open control VSI for VF %d\n", + vf->vf_id); + goto err_vsi_open; + } + + return 0; + +err_vsi_open: + ice_vsi_release(ctrl_vsi); + if (vf->ctrl_vsi_idx != ICE_NO_VSI) { + pf->vsi[vf->ctrl_vsi_idx] = NULL; + vf->ctrl_vsi_idx = ICE_NO_VSI; + } + return err; +} + +/** + * ice_vc_fdir_alloc_prof - allocate profile for this filter flow type + * @vf: pointer to the VF structure + * @flow: filter flow type + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_alloc_prof(struct ice_vf *vf, enum ice_fltr_ptype flow) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + + if (!fdir->fdir_prof) { + fdir->fdir_prof = kcalloc(ICE_FLTR_PTYPE_MAX, + sizeof(*fdir->fdir_prof), + GFP_KERNEL); + if (!fdir->fdir_prof) + return -ENOMEM; + } + + if (!fdir->fdir_prof[flow]) { + fdir->fdir_prof[flow] = kzalloc(sizeof(**fdir->fdir_prof), + GFP_KERNEL); + if (!fdir->fdir_prof[flow]) + return -ENOMEM; + } + + return 0; +} + +/** + * ice_vc_fdir_free_prof - free profile for this filter flow type + * @vf: pointer to the VF structure + * @flow: filter flow type + */ +static void +ice_vc_fdir_free_prof(struct ice_vf *vf, enum ice_fltr_ptype flow) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + + if (!fdir->fdir_prof) + return; + + if (!fdir->fdir_prof[flow]) + return; + + kfree(fdir->fdir_prof[flow]); + fdir->fdir_prof[flow] = NULL; +} + +/** + * ice_vc_fdir_free_prof_all - free all the profile for this VF + * @vf: pointer to the VF structure + */ +void ice_vc_fdir_free_prof_all(struct ice_vf *vf) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + enum ice_fltr_ptype flow; + + if (!fdir->fdir_prof) + return; + + for (flow = ICE_FLTR_PTYPE_NONF_NONE; flow < ICE_FLTR_PTYPE_MAX; flow++) + ice_vc_fdir_free_prof(vf, flow); + + kfree(fdir->fdir_prof); + fdir->fdir_prof = NULL; +} + +/** + * ice_vc_fdir_parse_flow_fld + * @vf: pointer to the VF structure + * @proto_hdr: virtual channel protocol filter header + * @conf: FDIR configuration for each filter + * @fld: field type array + * @fld_cnt: field counter + * + * Parse the virtual channel filter header and store them into field type array + * + * Return: 0 on success, and other on error. 
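+ *
+ * As an illustration (derived from fdir_inset_map above), a header whose
+ * selector has the IPv4 SRC and DST bits set resolves to:
+ *
+ *	fld[0] = ICE_FLOW_FIELD_IDX_IPV4_SA;
+ *	fld[1] = ICE_FLOW_FIELD_IDX_IPV4_DA;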
+ */ +static int +ice_vc_fdir_parse_flow_fld(struct ice_vf *vf, + struct virtchnl_proto_hdr *proto_hdr, + struct virtchnl_fdir_fltr_conf *conf, + enum ice_flow_field *fld, + int *fld_cnt) +{ + struct virtchnl_proto_hdr hdr; + u32 i; + + memcpy(&hdr, proto_hdr, sizeof(hdr)); + + for (i = 0; (i < ARRAY_SIZE(fdir_inset_map)) && + VIRTCHNL_GET_PROTO_HDR_FIELD(&hdr); i++) { + if (VIRTCHNL_TEST_PROTO_HDR(&hdr, fdir_inset_map[i].field)) { + if (fdir_inset_map[i].mask && + ((fdir_inset_map[i].mask & conf->inset_flag) + != fdir_inset_map[i].flag)) + continue; + + fld[*fld_cnt] = fdir_inset_map[i].fld; + *fld_cnt += 1; + if (*fld_cnt >= ICE_FLOW_FIELD_IDX_MAX) + return -EINVAL; + VIRTCHNL_DEL_PROTO_HDR_FIELD(&hdr, + fdir_inset_map[i].field); + } + } + + return 0; +} + +/** + * ice_vc_fdir_set_flow_fld + * @vf: pointer to the VF structure + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * @seg: array of one or more packet segments that describe the flow + * + * Parse the virtual channel add msg buffer's field vector and store them into + * flow's packet segment field + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_set_flow_fld(struct ice_vf *vf, + struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf, + struct ice_flow_seg_info *seg) +{ + struct virtchnl_fdir_rule *rule = &fltr->rule_cfg; + enum ice_flow_field fld[ICE_FLOW_FIELD_IDX_MAX]; + struct device *dev = ice_pf_to_dev(vf->pf); + struct virtchnl_proto_hdrs *proto; + int fld_cnt = 0; + int i; + + proto = &rule->proto_hdrs; + for (i = 0; i < proto->count; i++) { + struct virtchnl_proto_hdr *hdr = &proto->proto_hdr[i]; + int ret; + + ret = ice_vc_fdir_parse_flow_fld(vf, hdr, conf, fld, &fld_cnt); + if (ret) + return ret; + } + + if (fld_cnt == 0) { + dev_dbg(dev, "Empty input set for VF %d\n", vf->vf_id); + return -EINVAL; + } + + for (i = 0; i < fld_cnt; i++) { + ice_flow_set_fld(seg, fld[i], + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + } + + return 0; +} + +/** + * ice_vc_fdir_set_flow_hdr - config the flow's packet segment header + * @vf: pointer to the VF structure + * @conf: FDIR configuration for each filter + * @seg: array of one or more packet segments that describe the flow + * + * Return: 0 on success, and other on error. 
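+ *
+ * For instance, a plain IPv4/TCP filter takes the branch below that does:
+ *
+ *	ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP |
+ *			       ICE_FLOW_SEG_HDR_IPV4 |
+ *			       ICE_FLOW_SEG_HDR_IPV_OTHER);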
+ */ +static int +ice_vc_fdir_set_flow_hdr(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf, + struct ice_flow_seg_info *seg) +{ + enum ice_fltr_ptype flow = conf->input.flow_type; + enum ice_fdir_tunnel_type ttype = conf->ttype; + struct device *dev = ice_pf_to_dev(vf->pf); + + switch (flow) { + case ICE_FLTR_PTYPE_NON_IP_L2: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_ETH_NON_IP); + break; + case ICE_FLTR_PTYPE_NONF_ECPRI_TP0: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_ECPRI_TP0); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0 | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_L2TPV3 | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_ESP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_ESP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_AH: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_AH | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_NAT_T_ESP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_PFCP_NODE | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_PFCP_SESSION | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_OTHER: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_ICMP: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_IPV4_OTHER: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH: + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_IPV4_OTHER: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_EH | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_DWN | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_UP | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, 
"Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_IPV6_OTHER: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH: + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_IPV6_OTHER: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_EH | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_DWN | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP: + if (ttype == ICE_FDIR_TUNNEL_TYPE_GTPU_EH) { + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_GTPU_UP | + ICE_FLOW_SEG_HDR_GTPU_IP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + } else { + dev_dbg(dev, "Invalid tunnel type 0x%x for VF %d\n", + flow, vf->vf_id); + return -EINVAL; + } + break; + case ICE_FLTR_PTYPE_NONF_IPV4_SCTP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_SCTP | + ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_L2TPV3 | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_ESP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_ESP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_AH: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_AH | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_NAT_T_ESP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_PFCP_NODE | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_PFCP_SESSION | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_OTHER: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + case ICE_FLTR_PTYPE_NONF_IPV6_SCTP: + ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_SCTP | + ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER); + break; + default: + dev_dbg(dev, "Invalid flow type 0x%x for VF %d failed\n", + flow, vf->vf_id); + return -EINVAL; + } + + return 0; +} + +/** + * ice_vc_fdir_rem_prof - remove profile for this filter flow type + * @vf: pointer to the VF structure + * @flow: filter flow type + * @tun: 0 implies non-tunnel type filter, 1 implies tunnel 
type filter + */ +static void +ice_vc_fdir_rem_prof(struct ice_vf *vf, enum ice_fltr_ptype flow, int tun) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + struct ice_fd_hw_prof *vf_prof; + struct ice_pf *pf = vf->pf; + struct ice_vsi *vf_vsi; + struct device *dev; + struct ice_hw *hw; + u64 prof_id; + int i; + + dev = ice_pf_to_dev(pf); + hw = &pf->hw; + if (!fdir->fdir_prof || !fdir->fdir_prof[flow]) + return; + + vf_prof = fdir->fdir_prof[flow]; + + vf_vsi = ice_get_vf_vsi(vf); + if (!vf_vsi) { + dev_dbg(dev, "NULL vf %d vsi pointer\n", vf->vf_id); + return; + } + + if (!fdir->prof_entry_cnt[flow][tun]) + return; + + prof_id = ICE_FLOW_PROF_FD(vf_vsi->vsi_num, + flow, tun ? ICE_FLTR_PTYPE_MAX : 0); + + for (i = 0; i < fdir->prof_entry_cnt[flow][tun]; i++) { + if (vf_prof->entry_h[i][tun]) { + u16 vsi_num = ice_get_hw_vsi_num(hw, vf_prof->vsi_h[i]); + + ice_rem_prof_id_flow(hw, ICE_BLK_FD, vsi_num, prof_id); + ice_flow_rem_entry(hw, ICE_BLK_FD, + vf_prof->entry_h[i][tun]); + vf_prof->entry_h[i][tun] = 0; + } + } + + ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id); + kfree(vf_prof->fdir_seg[tun]); + vf_prof->fdir_seg[tun] = NULL; + + for (i = 0; i < vf_prof->cnt; i++) + vf_prof->vsi_h[i] = 0; + + fdir->prof_entry_cnt[flow][tun] = 0; +} + +/** + * ice_vc_fdir_rem_prof_all - remove profile for this VF + * @vf: pointer to the VF structure + */ +void ice_vc_fdir_rem_prof_all(struct ice_vf *vf) +{ + enum ice_fltr_ptype flow; + + for (flow = ICE_FLTR_PTYPE_NONF_NONE; + flow < ICE_FLTR_PTYPE_MAX; flow++) { + ice_vc_fdir_rem_prof(vf, flow, 0); + ice_vc_fdir_rem_prof(vf, flow, 1); + } +} + +/** + * ice_vc_fdir_write_flow_prof + * @vf: pointer to the VF structure + * @flow: filter flow type + * @seg: array of one or more packet segments that describe the flow + * @tun: 0 implies non-tunnel type filter, 1 implies tunnel type filter + * + * Write the flow's profile config and packet segment into the hardware + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_write_flow_prof(struct ice_vf *vf, + enum ice_fltr_ptype flow, + struct ice_flow_seg_info *seg, + int tun) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + struct ice_vsi *vf_vsi, *ctrl_vsi; + struct ice_flow_seg_info *old_seg; + struct ice_flow_prof *prof = NULL; + struct ice_fd_hw_prof *vf_prof; + enum ice_status status; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + u64 entry1_h = 0; + u64 entry2_h = 0; + u64 prof_id; + int ret; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + hw = &pf->hw; + vf_vsi = ice_get_vf_vsi(vf); + if (!vf_vsi) + return -EINVAL; + + ctrl_vsi = pf->vsi[vf->ctrl_vsi_idx]; + if (!ctrl_vsi) + return -EINVAL; + + vf_prof = fdir->fdir_prof[flow]; + old_seg = vf_prof->fdir_seg[tun]; + if (old_seg) { + if (!memcmp(old_seg, seg, sizeof(*seg) * ICE_FD_HW_SEG_MAX)) { + dev_dbg(dev, "Duplicated profile for VF %d!\n", + vf->vf_id); + return -EEXIST; + } + + if (fdir->fdir_fltr_cnt[flow][tun]) { + ret = -EINVAL; + dev_dbg(dev, "Input set conflicts for VF %d\n", + vf->vf_id); + goto err_exit; + } + + /* remove previously allocated profile */ + ice_vc_fdir_rem_prof(vf, flow, tun); + } + + prof_id = ICE_FLOW_PROF_FD(vf_vsi->vsi_num, + flow, tun ? 
ICE_FLTR_PTYPE_MAX : 0);
+
+	status = ice_flow_add_prof(hw, ICE_BLK_FD, ICE_FLOW_RX, prof_id, seg,
+				   tun + 1, NULL, 0, &prof);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev, "Could not add VSI flow 0x%x for VF %d\n",
+			flow, vf->vf_id);
+		goto err_exit;
+	}
+
+	status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, vf_vsi->idx,
+				    vf_vsi->idx, ICE_FLOW_PRIO_NORMAL,
+				    seg, NULL, 0, &entry1_h);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev, "Could not add flow 0x%x VSI entry for VF %d\n",
+			flow, vf->vf_id);
+		goto err_prof;
+	}
+
+	status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, vf_vsi->idx,
+				    ctrl_vsi->idx, ICE_FLOW_PRIO_NORMAL,
+				    seg, NULL, 0, &entry2_h);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev,
+			"Could not add flow 0x%x Ctrl VSI entry for VF %d\n",
+			flow, vf->vf_id);
+		goto err_entry_1;
+	}
+
+	vf_prof->fdir_seg[tun] = seg;
+	vf_prof->cnt = 0;
+	fdir->prof_entry_cnt[flow][tun] = 0;
+
+	vf_prof->entry_h[vf_prof->cnt][tun] = entry1_h;
+	vf_prof->vsi_h[vf_prof->cnt] = vf_vsi->idx;
+	vf_prof->cnt++;
+	fdir->prof_entry_cnt[flow][tun]++;
+
+	vf_prof->entry_h[vf_prof->cnt][tun] = entry2_h;
+	vf_prof->vsi_h[vf_prof->cnt] = ctrl_vsi->idx;
+	vf_prof->cnt++;
+	fdir->prof_entry_cnt[flow][tun]++;
+
+	return 0;
+
+err_entry_1:
+	ice_rem_prof_id_flow(hw, ICE_BLK_FD,
+			     ice_get_hw_vsi_num(hw, vf_vsi->idx), prof_id);
+	ice_flow_rem_entry(hw, ICE_BLK_FD, entry1_h);
+err_prof:
+	ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id);
+err_exit:
+	return ret;
+}
+
+/**
+ * ice_vc_fdir_has_prof_conflict
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ *
+ * Check if @conf has a conflicting profile with the existing profiles
+ *
+ * Return: true if a conflict is found, false otherwise.
+ */
+static bool
+ice_vc_fdir_has_prof_conflict(struct ice_vf *vf,
+			      struct virtchnl_fdir_fltr_conf *conf)
+{
+	struct ice_fdir_fltr *desc;
+
+	list_for_each_entry(desc, &vf->fdir.fdir_rule_list, fltr_node) {
+		struct virtchnl_fdir_fltr_conf *existing_conf =
+			to_fltr_conf_from_desc(desc);
+		struct ice_fdir_fltr *a = &existing_conf->input;
+		struct ice_fdir_fltr *b = &conf->input;
+
+		enum ice_fltr_ptype flow_type_a = a->flow_type;
+		enum ice_fltr_ptype flow_type_b = b->flow_type;
+
+		/* No need to compare two rules with different tunnel type */
+		if (existing_conf->ttype != conf->ttype)
+			continue;
+
+		/* No need to compare two rules with same protocol */
+		if (flow_type_a == flow_type_b)
+			continue;
+
+		switch (flow_type_a) {
+		case ICE_FLTR_PTYPE_NONF_IPV4_UDP:
+		case ICE_FLTR_PTYPE_NONF_IPV4_TCP:
+		case ICE_FLTR_PTYPE_NONF_IPV4_SCTP:
+			if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_OTHER)
+				return true;
+			break;
+		case ICE_FLTR_PTYPE_NONF_IPV4_OTHER:
+			if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_UDP ||
+			    flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_TCP ||
+			    flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_SCTP)
+				return true;
+			break;
+		case ICE_FLTR_PTYPE_NONF_IPV6_UDP:
+		case ICE_FLTR_PTYPE_NONF_IPV6_TCP:
+		case ICE_FLTR_PTYPE_NONF_IPV6_SCTP:
+			if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_OTHER)
+				return true;
+			break;
+		case ICE_FLTR_PTYPE_NONF_IPV6_OTHER:
+			if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_UDP ||
+			    flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_TCP ||
+			    flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_SCTP)
+				return true;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return false;
+}
+
+/**
+ * ice_vc_fdir_config_input_set
+ * @vf: pointer to the VF structure
+ * @fltr: virtual channel add cmd buffer
+ * @conf: FDIR configuration for each filter
+ * @tun: 0 implies non-tunnel type
filter, 1 implies tunnel type filter + * + * Config the input set type and value for virtual channel add msg buffer + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_config_input_set(struct ice_vf *vf, + struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf, + int tun) +{ + struct ice_fdir_fltr *input = &conf->input; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_flow_seg_info *seg; + enum ice_fltr_ptype flow; + int ret; + + ret = ice_vc_fdir_has_prof_conflict(vf, conf); + if (ret) { + dev_dbg(dev, "Found flow prof conflict for VF %d\n", vf->vf_id); + return ret; + } + + flow = input->flow_type; + ret = ice_vc_fdir_alloc_prof(vf, flow); + if (ret) { + dev_dbg(dev, "Alloc flow prof for VF %d failed\n", vf->vf_id); + return ret; + } + + seg = kcalloc(ICE_FD_HW_SEG_MAX, sizeof(*seg), GFP_KERNEL); + if (!seg) + return -ENOMEM; + + ret = ice_vc_fdir_set_flow_fld(vf, fltr, conf, &seg[tun]); + if (ret) { + dev_dbg(dev, "Set flow field for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + ret = ice_vc_fdir_set_flow_hdr(vf, conf, &seg[tun]); + if (ret) { + dev_dbg(dev, "Set flow hdr for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + ret = ice_vc_fdir_write_flow_prof(vf, flow, seg, tun); + if (ret == -EEXIST) { + kfree(seg); + } else if (ret) { + dev_dbg(dev, "Write flow profile for VF %d failed\n", + vf->vf_id); + goto err_exit; + } + + return 0; + +err_exit: + kfree(seg); + return ret; +} + +/** + * ice_vc_fdir_parse_pattern + * @vf: pointer to the VF info + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * + * Parse the virtual channel filter's pattern and store them into @conf + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_parse_pattern(struct ice_vf *vf, + struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf) +{ + struct virtchnl_proto_hdrs *proto = &fltr->rule_cfg.proto_hdrs; + enum virtchnl_proto_hdr_type l3 = VIRTCHNL_PROTO_HDR_NONE; + enum virtchnl_proto_hdr_type l4 = VIRTCHNL_PROTO_HDR_NONE; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_fdir_fltr *input = &conf->input; + int i; + + if (proto->count > VIRTCHNL_MAX_NUM_PROTO_HDRS) { + dev_dbg(dev, "Invalid protocol count:0x%x for VF %d\n", + proto->count, vf->vf_id); + return -EINVAL; + } + + for (i = 0; i < proto->count; i++) { + struct virtchnl_proto_hdr *hdr = &proto->proto_hdr[i]; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct sctphdr *sctph; + struct ipv6hdr *ip6h; + struct udphdr *udph; + struct tcphdr *tcph; + struct ethhdr *eth; + struct iphdr *iph; + u8 msg_type; + u8 s_field; + u8 *rawh; + + switch (hdr->type) { + case VIRTCHNL_PROTO_HDR_ETH: + eth = (struct ethhdr *)hdr->buffer; + input->flow_type = ICE_FLTR_PTYPE_NON_IP_L2; + + if (hdr->field_selector) + input->ext_data.ether_type = eth->h_proto; + break; + case VIRTCHNL_PROTO_HDR_IPV4: + iph = (struct iphdr *)hdr->buffer; + l3 = VIRTCHNL_PROTO_HDR_IPV4; + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_OTHER; + + if (hdr->field_selector) { + input->ip.v4.src_ip = iph->saddr; + input->ip.v4.dst_ip = iph->daddr; + input->ip.v4.ttl = iph->ttl; + input->ip.v4.tos = iph->tos; + input->ip.v4.proto = iph->protocol; + } + break; + case VIRTCHNL_PROTO_HDR_IPV6: + ip6h = (struct ipv6hdr *)hdr->buffer; + l3 = VIRTCHNL_PROTO_HDR_IPV6; + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_OTHER; + + if (hdr->field_selector) { + memcpy(input->ip.v6.src_ip, + ip6h->saddr.in6_u.u6_addr8, + sizeof(ip6h->saddr)); + 
memcpy(input->ip.v6.dst_ip, + ip6h->daddr.in6_u.u6_addr8, + sizeof(ip6h->daddr)); + input->ip.v6.hlim = ip6h->hop_limit; + input->ip.v6.tc = ((u8)(ip6h->priority) << 4) | + (ip6h->flow_lbl[0] >> 4); + input->ip.v6.proto = ip6h->nexthdr; + } + break; + case VIRTCHNL_PROTO_HDR_TCP: + tcph = (struct tcphdr *)hdr->buffer; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_TCP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_TCP; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) { + input->ip.v4.src_port = tcph->source; + input->ip.v4.dst_port = tcph->dest; + } else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) { + input->ip.v6.src_port = tcph->source; + input->ip.v6.dst_port = tcph->dest; + } + } + break; + case VIRTCHNL_PROTO_HDR_UDP: + udph = (struct udphdr *)hdr->buffer; + l4 = VIRTCHNL_PROTO_HDR_UDP; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_UDP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_UDP; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) { + input->ip.v4.src_port = udph->source; + input->ip.v4.dst_port = udph->dest; + } else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) { + input->ip.v6.src_port = udph->source; + input->ip.v6.dst_port = udph->dest; + } + } + break; + case VIRTCHNL_PROTO_HDR_SCTP: + sctph = (struct sctphdr *)hdr->buffer; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_SCTP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_SCTP; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) { + input->ip.v4.src_port = sctph->source; + input->ip.v4.dst_port = sctph->dest; + } else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) { + input->ip.v6.src_port = sctph->source; + input->ip.v6.dst_port = sctph->dest; + } + } + break; + case VIRTCHNL_PROTO_HDR_L2TPV3: + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_L2TPV3; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_L2TPV3; + + if (hdr->field_selector) + input->l2tpv3_data.session_id = + *((__force __be32 *)hdr->buffer); + break; + case VIRTCHNL_PROTO_HDR_ESP: + esph = (struct ip_esp_hdr *)hdr->buffer; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4 && + l4 == VIRTCHNL_PROTO_HDR_UDP) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_NAT_T_ESP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6 && + l4 == VIRTCHNL_PROTO_HDR_UDP) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_NAT_T_ESP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV4 && + l4 == VIRTCHNL_PROTO_HDR_NONE) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_ESP; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6 && + l4 == VIRTCHNL_PROTO_HDR_NONE) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_ESP; + + if (l4 == VIRTCHNL_PROTO_HDR_UDP) + conf->inset_flag |= FDIR_INSET_FLAG_ESP_UDP; + else + conf->inset_flag |= FDIR_INSET_FLAG_ESP_IPSEC; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->ip.v4.sec_parm_idx = esph->spi; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->ip.v6.sec_parm_idx = esph->spi; + } + break; + case VIRTCHNL_PROTO_HDR_AH: + ah = (struct ip_auth_hdr *)hdr->buffer; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV4_AH; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->flow_type = ICE_FLTR_PTYPE_NONF_IPV6_AH; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->ip.v4.sec_parm_idx = ah->spi; + else 
if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->ip.v6.sec_parm_idx = ah->spi; + } + break; + case VIRTCHNL_PROTO_HDR_PFCP: + rawh = (u8 *)hdr->buffer; + s_field = (rawh[0] >> PFCP_S_OFFSET) & PFCP_S_MASK; + if (l3 == VIRTCHNL_PROTO_HDR_IPV4 && s_field == 0) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_NODE; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV4 && s_field == 1) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_PFCP_SESSION; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6 && s_field == 0) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_NODE; + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6 && s_field == 1) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_PFCP_SESSION; + + if (hdr->field_selector) { + if (l3 == VIRTCHNL_PROTO_HDR_IPV4) + input->ip.v4.dst_port = + cpu_to_be16(PFCP_PORT_NR); + else if (l3 == VIRTCHNL_PROTO_HDR_IPV6) + input->ip.v6.dst_port = + cpu_to_be16(PFCP_PORT_NR); + } + break; + case VIRTCHNL_PROTO_HDR_GTPU_IP: + rawh = (u8 *)hdr->buffer; + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_GTPU; + else + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_GTPU; + if (hdr->field_selector) + input->gtpu_data.teid = + *(__force __be32 *)(&rawh[GTPU_TEID_OFFSET]); + conf->ttype = ICE_FDIR_TUNNEL_TYPE_GTPU; + break; + case VIRTCHNL_PROTO_HDR_GTPU_EH: + rawh = (u8 *)hdr->buffer; + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_GTPU) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH; + else + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH; + + if (hdr->field_selector) + input->gtpu_data.qfi = + rawh[GTPU_EH_QFI_OFFSET] & + GTPU_EH_QFI_MASK; + conf->ttype = ICE_FDIR_TUNNEL_TYPE_GTPU_EH; + break; + case VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN: + rawh = (u8 *)hdr->buffer; + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_GTPU) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_DW; + else + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_DW; + + if (hdr->field_selector) + input->gtpu_data.qfi = + rawh[GTPU_EH_QFI_OFFSET] & + GTPU_EH_QFI_MASK; + conf->ttype = ICE_FDIR_TUNNEL_TYPE_GTPU_EH; + break; + case VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP: + rawh = (u8 *)hdr->buffer; + if (input->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_GTPU) + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_GTPU_EH_UP; + else + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV6_GTPU_EH_UP; + + if (hdr->field_selector) + input->gtpu_data.qfi = + rawh[GTPU_EH_QFI_OFFSET] & + GTPU_EH_QFI_MASK; + conf->ttype = ICE_FDIR_TUNNEL_TYPE_GTPU_EH; + break; + case VIRTCHNL_PROTO_HDR_ECPRI: + rawh = (u8 *)hdr->buffer; + msg_type = rawh[1]; + if (l3 == VIRTCHNL_PROTO_HDR_NONE && + l4 == VIRTCHNL_PROTO_HDR_NONE && + msg_type == 0) { + input->flow_type = + ICE_FLTR_PTYPE_NONF_ECPRI_TP0; + conf->inset_flag |= FDIR_INSET_FLAG_ECPRI_MAC; + } else if ((l3 == VIRTCHNL_PROTO_HDR_IPV4) && + (l4 == VIRTCHNL_PROTO_HDR_UDP) && + (msg_type == 0)) { + input->flow_type = + ICE_FLTR_PTYPE_NONF_IPV4_UDP_ECPRI_TP0; + conf->inset_flag |= FDIR_INSET_FLAG_ECPRI_UDP; + conf->ttype = ICE_FDIR_TUNNEL_TYPE_ECPRI; + } else { + return -EINVAL; + } + + if (hdr->field_selector) + input->ecpri_data.pc_id = + *(__force __be16 *)(&rawh[4]); + break; + default: + dev_dbg(dev, "Invalid header type 0x:%x for VF %d\n", + hdr->type, vf->vf_id); + return -EINVAL; + } + } + + return 0; +} + +/** + * ice_vc_fdir_parse_action + * @vf: pointer to the VF info + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * + * Parse the virtual channel filter's action and store them into @conf + 
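+ * For example, a rule that steers to queue 3 and marks with FDID 0x1234
+ * would arrive as two actions (values purely illustrative):
+ *
+ *	actions[0].type = VIRTCHNL_ACTION_QUEUE, queue.index = 3
+ *	actions[1].type = VIRTCHNL_ACTION_MARK, mark_id = 0x1234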
*
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_parse_action(struct ice_vf *vf,
+			 struct virtchnl_fdir_add *fltr,
+			 struct virtchnl_fdir_fltr_conf *conf)
+{
+	struct virtchnl_filter_action_set *as = &fltr->rule_cfg.action_set;
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	struct ice_fdir_fltr *input = &conf->input;
+	u32 dest_num = 0;
+	u32 mark_num = 0;
+	int i;
+
+	if (as->count > VIRTCHNL_MAX_NUM_ACTIONS) {
+		dev_dbg(dev, "Invalid action numbers:0x%x for VF %d\n",
+			as->count, vf->vf_id);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < as->count; i++) {
+		struct virtchnl_filter_action *action = &as->actions[i];
+
+		switch (action->type) {
+		case VIRTCHNL_ACTION_PASSTHRU:
+			dest_num++;
+			input->dest_ctl =
+				ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_OTHER;
+			break;
+		case VIRTCHNL_ACTION_DROP:
+			dest_num++;
+			input->dest_ctl =
+				ICE_FLTR_PRGM_DESC_DEST_DROP_PKT;
+			break;
+		case VIRTCHNL_ACTION_QUEUE:
+			dest_num++;
+			input->dest_ctl =
+				ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QINDEX;
+			input->q_index = action->act_conf.queue.index;
+			break;
+		case VIRTCHNL_ACTION_Q_REGION:
+			dest_num++;
+			input->dest_ctl =
+				ICE_FLTR_PRGM_DESC_DEST_DIRECT_PKT_QGROUP;
+			input->q_index = action->act_conf.queue.index;
+			input->q_region = action->act_conf.queue.region;
+			break;
+		case VIRTCHNL_ACTION_MARK:
+			mark_num++;
+			input->fltr_id = action->act_conf.mark_id;
+			input->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_THREE;
+			break;
+		default:
+			dev_dbg(dev, "Invalid action type:0x%x for VF %d\n",
+				action->type, vf->vf_id);
+			return -EINVAL;
+		}
+	}
+
+	if (dest_num == 0 || dest_num >= 2) {
+		dev_dbg(dev, "Invalid destination action for VF %d\n",
+			vf->vf_id);
+		return -EINVAL;
+	}
+
+	if (mark_num >= 2) {
+		dev_dbg(dev, "Too many mark actions for VF %d\n", vf->vf_id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_vc_validate_fdir_fltr - validate the virtual channel filter
+ * @vf: pointer to the VF info
+ * @fltr: virtual channel add cmd buffer
+ * @conf: FDIR configuration for each filter
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_validate_fdir_fltr(struct ice_vf *vf,
+			  struct virtchnl_fdir_add *fltr,
+			  struct virtchnl_fdir_fltr_conf *conf)
+{
+	struct virtchnl_proto_hdrs *proto = &fltr->rule_cfg.proto_hdrs;
+	int ret;
+
+	if (!ice_vc_validate_pattern(vf, proto))
+		return -EINVAL;
+
+	ret = ice_vc_fdir_parse_pattern(vf, fltr, conf);
+	if (ret)
+		return ret;
+
+	ret = ice_vc_fdir_parse_action(vf, fltr, conf);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_comp_rules - compare if two filter rules have the same value
+ * @conf_a: FDIR configuration for filter a
+ * @conf_b: FDIR configuration for filter b
+ *
+ * Return: true if the two rules have the same value, false otherwise.
+ */
+static bool
+ice_vc_fdir_comp_rules(struct virtchnl_fdir_fltr_conf *conf_a,
+		       struct virtchnl_fdir_fltr_conf *conf_b)
+{
+	struct ice_fdir_fltr *a = &conf_a->input;
+	struct ice_fdir_fltr *b = &conf_b->input;
+
+	if (conf_a->ttype != conf_b->ttype)
+		return false;
+	if (a->flow_type != b->flow_type)
+		return false;
+	if (memcmp(&a->ip, &b->ip, sizeof(a->ip)))
+		return false;
+	if (memcmp(&a->mask, &b->mask, sizeof(a->mask)))
+		return false;
+	if (memcmp(&a->gtpu_data, &b->gtpu_data, sizeof(a->gtpu_data)))
+		return false;
+	if (memcmp(&a->gtpu_mask, &b->gtpu_mask, sizeof(a->gtpu_mask)))
+		return false;
+	if (memcmp(&a->l2tpv3_data, &b->l2tpv3_data, sizeof(a->l2tpv3_data)))
+		return false;
+	if (memcmp(&a->l2tpv3_mask, &b->l2tpv3_mask, sizeof(a->l2tpv3_mask)))
+		return false;
+	if (memcmp(&a->ext_data, &b->ext_data, sizeof(a->ext_data)))
+		return false;
+	if (memcmp(&a->ext_mask, &b->ext_mask, sizeof(a->ext_mask)))
+		return false;
+	if (memcmp(&a->ecpri_data, &b->ecpri_data, sizeof(a->ecpri_data)))
+		return false;
+	if (memcmp(&a->ecpri_mask, &b->ecpri_mask, sizeof(a->ecpri_mask)))
+		return false;
+
+	return true;
+}
+
+/**
+ * ice_vc_fdir_is_dup_fltr
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ *
+ * Check if there is a duplicated rule with the same @conf value
+ *
+ * Return: true if a duplicate is found, false otherwise.
+ */
+static bool
+ice_vc_fdir_is_dup_fltr(struct ice_vf *vf,
+			struct virtchnl_fdir_fltr_conf *conf)
+{
+	struct ice_fdir_fltr *desc;
+
+	list_for_each_entry(desc, &vf->fdir.fdir_rule_list, fltr_node) {
+		struct virtchnl_fdir_fltr_conf *node =
+			to_fltr_conf_from_desc(desc);
+
+		if (ice_vc_fdir_comp_rules(node, conf))
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * ice_vc_fdir_insert_entry
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ * @id: pointer to ID value allocated by driver
+ *
+ * Insert FDIR conf entry into list and allocate ID for this filter
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_insert_entry(struct ice_vf *vf,
+			 struct virtchnl_fdir_fltr_conf *conf,
+			 u32 *id)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+	int i;
+
+	/* alloc ID corresponding with conf */
+	i = idr_alloc(&vf->fdir.fdir_rule_idr, conf, 0,
+		      ICE_FDIR_MAX_FLTRS, GFP_KERNEL);
+	if (i < 0)
+		return -EINVAL;
+	*id = i;
+
+	list_add(&input->fltr_node, &vf->fdir.fdir_rule_list);
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_remove_entry - remove FDIR conf entry by ID value
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ * @id: filter rule's ID
+ */
+static void
+ice_vc_fdir_remove_entry(struct ice_vf *vf,
+			 struct virtchnl_fdir_fltr_conf *conf,
+			 u32 id)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+
+	idr_remove(&vf->fdir.fdir_rule_idr, id);
+	list_del(&input->fltr_node);
+}
+
+/**
+ * ice_vc_fdir_lookup_entry - lookup FDIR conf entry by ID value
+ * @vf: pointer to the VF info
+ * @id: filter rule's ID
+ *
+ * Return: the filter configuration on success, NULL otherwise.
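+ *
+ * A sketch of the delete path's usage, where flow_id is the value the add
+ * path handed back to the VF:
+ *
+ *	conf = ice_vc_fdir_lookup_entry(vf, fltr->flow_id);
+ *	if (!conf)
+ *		// reply with VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST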
+ */ +static struct virtchnl_fdir_fltr_conf * +ice_vc_fdir_lookup_entry(struct ice_vf *vf, u32 id) +{ + return idr_find(&vf->fdir.fdir_rule_idr, id); +} + +/** + * ice_vc_fdir_flush_entry - remove all FDIR conf entry + * @vf: pointer to the VF info + */ +static void ice_vc_fdir_flush_entry(struct ice_vf *vf) +{ + struct ice_fdir_fltr *desc, *temp; + + list_for_each_entry_safe(desc, temp, + &vf->fdir.fdir_rule_list, fltr_node) { + struct virtchnl_fdir_fltr_conf *conf = + to_fltr_conf_from_desc(desc); + + list_del(&desc->fltr_node); + kfree(conf); + } +} + +/** + * ice_vc_fdir_write_fltr - write filter rule into hardware + * @vf: pointer to the VF info + * @conf: FDIR configuration for each filter + * @add: true implies add rule, false implies del rules + * @is_tun: false implies non-tunnel type filter, true implies tunnel filter + * + * Return: 0 on success, and other on error. + */ +static int ice_vc_fdir_write_fltr(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf, + bool add, + bool is_tun) +{ + struct ice_fdir_fltr *input = &conf->input; + struct ice_vsi *vsi, *ctrl_vsi; + struct ice_fltr_desc desc; + enum ice_status status; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + int ret; + u8 *pkt; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + hw = &pf->hw; + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + dev_dbg(dev, "Invalid vsi for VF %d\n", vf->vf_id); + return -EINVAL; + } + + input->dest_vsi = vsi->idx; + input->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW; + + ctrl_vsi = pf->vsi[vf->ctrl_vsi_idx]; + if (!ctrl_vsi) { + dev_dbg(dev, "Invalid ctrl_vsi for VF %d\n", vf->vf_id); + return -EINVAL; + } + + pkt = devm_kzalloc(dev, ICE_FDIR_MAX_RAW_PKT_SIZE, GFP_KERNEL); + if (!pkt) + return -ENOMEM; + + ice_fdir_get_prgm_desc(hw, input, &desc, add); + status = ice_fdir_get_gen_prgm_pkt(hw, input, pkt, false, is_tun); + ret = ice_status_to_errno(status); + if (ret) { + dev_dbg(dev, "Gen training pkt for VF %d ptype %d failed\n", + vf->vf_id, input->flow_type); + goto err_free_pkt; + } + + ret = ice_prgm_fdir_fltr(ctrl_vsi, &desc, pkt); + if (ret) + goto err_free_pkt; + + return 0; + +err_free_pkt: + devm_kfree(dev, pkt); + return ret; +} + +/** + * ice_vf_fdir_timer - FDIR program waiting timer interrupt handler + * @t: pointer to timer_list + */ +static void ice_vf_fdir_timer(struct timer_list *t) +{ + struct ice_vf_fdir_ctx *ctx_irq = from_timer(ctx_irq, t, rx_tmr); + struct ice_vf_fdir_ctx *ctx_done; + struct ice_vf_fdir *fdir; + unsigned long flags; + struct ice_vf *vf; + struct ice_pf *pf; + + fdir = container_of(ctx_irq, struct ice_vf_fdir, ctx_irq); + vf = container_of(fdir, struct ice_vf, fdir); + ctx_done = &fdir->ctx_done; + pf = vf->pf; + spin_lock_irqsave(&fdir->ctx_lock, flags); + if (!(ctx_irq->flags & ICE_VF_FDIR_CTX_VALID)) { + spin_unlock_irqrestore(&fdir->ctx_lock, flags); + WARN_ON_ONCE(1); + return; + } + + ctx_irq->flags &= ~ICE_VF_FDIR_CTX_VALID; + + ctx_done->flags |= ICE_VF_FDIR_CTX_VALID; + ctx_done->conf = ctx_irq->conf; + ctx_done->stat = ICE_FDIR_CTX_TIMEOUT; + ctx_done->v_opcode = ctx_irq->v_opcode; + spin_unlock_irqrestore(&fdir->ctx_lock, flags); + + set_bit(ICE_FD_VF_FLUSH_CTX, pf->state); + ice_service_task_schedule(pf); +} + +/** + * ice_vc_fdir_irq_handler - ctrl_vsi Rx queue interrupt handler + * @ctrl_vsi: pointer to a VF's CTRL VSI + * @rx_desc: pointer to FDIR Rx queue descriptor + */ +void +ice_vc_fdir_irq_handler(struct ice_vsi *ctrl_vsi, + union ice_32b_rx_flex_desc *rx_desc) +{ + struct ice_pf *pf = ctrl_vsi->back; + struct 
ice_vf_fdir_ctx *ctx_done; + struct ice_vf_fdir_ctx *ctx_irq; + struct ice_vf_fdir *fdir; + unsigned long flags; + struct device *dev; + struct ice_vf *vf; + int ret; + + vf = &pf->vf[ctrl_vsi->vf_id]; + + fdir = &vf->fdir; + ctx_done = &fdir->ctx_done; + ctx_irq = &fdir->ctx_irq; + dev = ice_pf_to_dev(pf); + spin_lock_irqsave(&fdir->ctx_lock, flags); + if (!(ctx_irq->flags & ICE_VF_FDIR_CTX_VALID)) { + spin_unlock_irqrestore(&fdir->ctx_lock, flags); + WARN_ON_ONCE(1); + return; + } + + ctx_irq->flags &= ~ICE_VF_FDIR_CTX_VALID; + + ctx_done->flags |= ICE_VF_FDIR_CTX_VALID; + ctx_done->conf = ctx_irq->conf; + ctx_done->stat = ICE_FDIR_CTX_IRQ; + ctx_done->v_opcode = ctx_irq->v_opcode; + memcpy(&ctx_done->rx_desc, rx_desc, sizeof(*rx_desc)); + spin_unlock_irqrestore(&fdir->ctx_lock, flags); + + ret = del_timer(&ctx_irq->rx_tmr); + if (!ret) + dev_err(dev, "VF %d: Unexpected inactive timer!\n", vf->vf_id); + + set_bit(ICE_FD_VF_FLUSH_CTX, pf->state); + ice_service_task_schedule(pf); +} + +/** + * ice_vf_fdir_dump_info - dump FDIR information for diagnosis + * @vf: pointer to the VF info + */ +static void ice_vf_fdir_dump_info(struct ice_vf *vf) +{ + struct ice_vsi *vf_vsi; + u32 fd_size, fd_cnt; + struct device *dev; + struct ice_pf *pf; + struct ice_hw *hw; + u16 vsi_num; + + pf = vf->pf; + hw = &pf->hw; + dev = ice_pf_to_dev(pf); + vf_vsi = ice_get_vf_vsi(vf); + vsi_num = ice_get_hw_vsi_num(hw, vf_vsi->idx); + + fd_size = rd32(hw, VSIQF_FD_SIZE(vsi_num)); + fd_cnt = rd32(hw, VSIQF_FD_CNT(vsi_num)); + dev_dbg(dev, "VF %d: space allocated: guar:0x%x, be:0x%x, space consumed: guar:0x%x, be:0x%x", + vf->vf_id, + (fd_size & VSIQF_FD_CNT_FD_GCNT_M) >> VSIQF_FD_CNT_FD_GCNT_S, + (fd_size & VSIQF_FD_CNT_FD_BCNT_M) >> VSIQF_FD_CNT_FD_BCNT_S, + (fd_cnt & VSIQF_FD_CNT_FD_GCNT_M) >> VSIQF_FD_CNT_FD_GCNT_S, + (fd_cnt & VSIQF_FD_CNT_FD_BCNT_M) >> VSIQF_FD_CNT_FD_BCNT_S); +} + +/** + * ice_vf_verify_rx_desc - verify received FDIR programming status descriptor + * @vf: pointer to the VF info + * @ctx: FDIR context info for post processing + * @status: virtchnl FDIR program status + * + * Return: 0 on success, and other on error. 
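+ *
+ * The checks below decode the programming status writeback; e.g. the
+ * descriptor-done bit is extracted as:
+ *
+ *	stat_err = le16_to_cpu(ctx->rx_desc.wb.status_error0);
+ *	dd = (stat_err & ICE_FXD_FLTR_WB_QW1_DD_M) >> ICE_FXD_FLTR_WB_QW1_DD_S;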
+ */ +static int +ice_vf_verify_rx_desc(struct ice_vf *vf, + struct ice_vf_fdir_ctx *ctx, + enum virtchnl_fdir_prgm_status *status) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + u32 stat_err, error, prog_id; + int ret; + + stat_err = le16_to_cpu(ctx->rx_desc.wb.status_error0); + if (((stat_err & ICE_FXD_FLTR_WB_QW1_DD_M) >> + ICE_FXD_FLTR_WB_QW1_DD_S) != ICE_FXD_FLTR_WB_QW1_DD_YES) { + *status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_err(dev, "VF %d: Desc Done not set\n", vf->vf_id); + ret = -EINVAL; + goto err_exit; + } + + prog_id = (stat_err & ICE_FXD_FLTR_WB_QW1_PROG_ID_M) >> + ICE_FXD_FLTR_WB_QW1_PROG_ID_S; + if (prog_id == ICE_FXD_FLTR_WB_QW1_PROG_ADD && + ctx->v_opcode != VIRTCHNL_OP_ADD_FDIR_FILTER) { + dev_err(dev, "VF %d: Desc show add, but ctx not", + vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_INVALID; + ret = -EINVAL; + goto err_exit; + } + + if (prog_id == ICE_FXD_FLTR_WB_QW1_PROG_DEL && + ctx->v_opcode != VIRTCHNL_OP_DEL_FDIR_FILTER) { + dev_err(dev, "VF %d: Desc show del, but ctx not", + vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_INVALID; + ret = -EINVAL; + goto err_exit; + } + + error = (stat_err & ICE_FXD_FLTR_WB_QW1_FAIL_M) >> + ICE_FXD_FLTR_WB_QW1_FAIL_S; + if (error == ICE_FXD_FLTR_WB_QW1_FAIL_YES) { + if (prog_id == ICE_FXD_FLTR_WB_QW1_PROG_ADD) { + dev_err(dev, "VF %d, Failed to add FDIR rule due to no space in the table", + vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + } else { + dev_err(dev, "VF %d, Failed to remove FDIR rule, attempt to remove non-existent entry", + vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST; + } + ret = -EINVAL; + goto err_exit; + } + + error = (stat_err & ICE_FXD_FLTR_WB_QW1_FAIL_PROF_M) >> + ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S; + if (error == ICE_FXD_FLTR_WB_QW1_FAIL_PROF_YES) { + dev_err(dev, "VF %d: Profile matching error", vf->vf_id); + *status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + ret = -EINVAL; + goto err_exit; + } + + *status = VIRTCHNL_FDIR_SUCCESS; + + return 0; + +err_exit: + ice_vf_fdir_dump_info(vf); + return ret; +} + +static int ice_fdir_is_tunnel(enum ice_fdir_tunnel_type ttype) +{ + return ttype == ICE_FDIR_TUNNEL_TYPE_ECPRI; +} + +/** + * ice_vc_add_fdir_fltr_post + * @vf: pointer to the VF structure + * @ctx: FDIR context info for post processing + * @status: virtchnl FDIR program status + * @success: true implies success, false implies failure + * + * Post process for flow director add command. If success, then do post process + * and send back success msg by virtchnl. Otherwise, do context reversion and + * send back failure msg by virtchnl. + * + * Return: 0 on success, and other on error. 
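+ *
+ * On success this also bumps the per-flow bookkeeping that the delete path
+ * later decrements:
+ *
+ *	vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]++;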
+ */ +static int +ice_vc_add_fdir_fltr_post(struct ice_vf *vf, + struct ice_vf_fdir_ctx *ctx, + enum virtchnl_fdir_prgm_status status, + bool success) +{ + struct virtchnl_fdir_fltr_conf *conf = ctx->conf; + struct device *dev = ice_pf_to_dev(vf->pf); + enum virtchnl_status_code v_ret; + struct virtchnl_fdir_add *resp; + int ret, len, is_tun; + + v_ret = VIRTCHNL_STATUS_SUCCESS; + len = sizeof(*resp); + resp = kzalloc(sizeof(*resp), GFP_KERNEL); + if (!resp) { + len = 0; + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "VF %d: Alloc resp buf fail", vf->vf_id); + goto err_exit; + } + + if (!success) + goto err_exit; + + is_tun = ice_fdir_is_tunnel(conf->ttype); + resp->status = status; + resp->flow_id = conf->flow_id; + vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]++; + + ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, + (u8 *)resp, len); + kfree(resp); + + dev_dbg(dev, "VF %d: flow_id:0x%X, FDIR %s success!\n", + vf->vf_id, conf->flow_id, + (ctx->v_opcode == VIRTCHNL_OP_ADD_FDIR_FILTER) ? + "add" : "del"); + return ret; + +err_exit: + if (resp) + resp->status = status; + ice_vc_fdir_remove_entry(vf, conf, conf->flow_id); + kfree(conf); + + ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, + (u8 *)resp, len); + kfree(resp); + return ret; +} + +/** + * ice_vc_del_fdir_fltr_post + * @vf: pointer to the VF structure + * @ctx: FDIR context info for post processing + * @status: virtchnl FDIR program status + * @success: true implies success, false implies failure + * + * Post process for flow director del command. If success, then do post process + * and send back success msg by virtchnl. Otherwise, do context reversion and + * send back failure msg by virtchnl. + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_del_fdir_fltr_post(struct ice_vf *vf, + struct ice_vf_fdir_ctx *ctx, + enum virtchnl_fdir_prgm_status status, + bool success) +{ + struct virtchnl_fdir_fltr_conf *conf = ctx->conf; + struct device *dev = ice_pf_to_dev(vf->pf); + enum virtchnl_status_code v_ret; + struct virtchnl_fdir_del *resp; + int ret, len, is_tun; + + v_ret = VIRTCHNL_STATUS_SUCCESS; + len = sizeof(*resp); + resp = kzalloc(sizeof(*resp), GFP_KERNEL); + if (!resp) { + len = 0; + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "VF %d: Alloc resp buf fail", vf->vf_id); + goto err_exit; + } + + if (!success) + goto err_exit; + + is_tun = ice_fdir_is_tunnel(conf->ttype); + resp->status = status; + ice_vc_fdir_remove_entry(vf, conf, conf->flow_id); + vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]--; + + ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, + (u8 *)resp, len); + kfree(resp); + + dev_dbg(dev, "VF %d: flow_id:0x%X, FDIR %s success!\n", + vf->vf_id, conf->flow_id, + (ctx->v_opcode == VIRTCHNL_OP_ADD_FDIR_FILTER) ? + "add" : "del"); + kfree(conf); + return ret; + +err_exit: + if (resp) + resp->status = status; + if (success) + kfree(conf); + + ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, + (u8 *)resp, len); + kfree(resp); + return ret; +} + +/** + * ice_flush_fdir_ctx + * @pf: pointer to the PF structure + * + * Flush all the pending event on ctx_done list and process them. 
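+ *
+ * Both the programming timer and the ctrl_vsi IRQ handler arm this path
+ * the same way (that the service task then invokes this function is an
+ * assumption based on the handshake below):
+ *
+ *	set_bit(ICE_FD_VF_FLUSH_CTX, pf->state);
+ *	ice_service_task_schedule(pf);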
+ */ +void ice_flush_fdir_ctx(struct ice_pf *pf) +{ + int i; + + if (!test_and_clear_bit(ICE_FD_VF_FLUSH_CTX, pf->state)) + return; + + ice_for_each_vf(pf, i) { + struct device *dev = ice_pf_to_dev(pf); + enum virtchnl_fdir_prgm_status status; + struct ice_vf *vf = &pf->vf[i]; + struct ice_vf_fdir_ctx *ctx; + unsigned long flags; + int ret; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + continue; + + if (vf->ctrl_vsi_idx == ICE_NO_VSI) + continue; + + ctx = &vf->fdir.ctx_done; + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + if (!(ctx->flags & ICE_VF_FDIR_CTX_VALID)) { + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + continue; + } + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + + WARN_ON(ctx->stat == ICE_FDIR_CTX_READY); + if (ctx->stat == ICE_FDIR_CTX_TIMEOUT) { + status = VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT; + dev_err(dev, "VF %d: ctrl_vsi irq timeout\n", + vf->vf_id); + goto err_exit; + } + + ret = ice_vf_verify_rx_desc(vf, ctx, &status); + if (ret) + goto err_exit; + + if (ctx->v_opcode == VIRTCHNL_OP_ADD_FDIR_FILTER) + ice_vc_add_fdir_fltr_post(vf, ctx, status, true); + else if (ctx->v_opcode == VIRTCHNL_OP_DEL_FDIR_FILTER) + ice_vc_del_fdir_fltr_post(vf, ctx, status, true); + else + dev_err(dev, "VF %d: Unsupported opcode\n", vf->vf_id); + + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + ctx->flags &= ~ICE_VF_FDIR_CTX_VALID; + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + continue; +err_exit: + if (ctx->v_opcode == VIRTCHNL_OP_ADD_FDIR_FILTER) + ice_vc_add_fdir_fltr_post(vf, ctx, status, false); + else if (ctx->v_opcode == VIRTCHNL_OP_DEL_FDIR_FILTER) + ice_vc_del_fdir_fltr_post(vf, ctx, status, false); + else + dev_err(dev, "VF %d: Unsupported opcode\n", vf->vf_id); + + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + ctx->flags &= ~ICE_VF_FDIR_CTX_VALID; + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + } +} + +/** + * ice_vc_fdir_set_irq_ctx - set FDIR context info for later irq handler + * @vf: pointer to the VF structure + * @conf: FDIR configuration for each filter + * @v_opcode: virtual channel operation code + * + * Return: 0 on success, and other on error. + */ +static int +ice_vc_fdir_set_irq_ctx(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf, + enum virtchnl_ops v_opcode) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vf_fdir_ctx *ctx; + unsigned long flags; + + ctx = &vf->fdir.ctx_irq; + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + if ((vf->fdir.ctx_irq.flags & ICE_VF_FDIR_CTX_VALID) || + (vf->fdir.ctx_done.flags & ICE_VF_FDIR_CTX_VALID)) { + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + dev_dbg(dev, "VF %d: Last request is still in progress\n", + vf->vf_id); + return -EBUSY; + } + ctx->flags |= ICE_VF_FDIR_CTX_VALID; + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); + + ctx->conf = conf; + ctx->v_opcode = v_opcode; + ctx->stat = ICE_FDIR_CTX_READY; + timer_setup(&ctx->rx_tmr, ice_vf_fdir_timer, 0); + + mod_timer(&ctx->rx_tmr, + round_jiffies(msecs_to_jiffies(10) + jiffies)); + + return 0; +} + +/** + * ice_vc_fdir_clear_irq_ctx - clear FDIR context info for irq handler + * @vf: pointer to the VF structure + * + * Return: 0 on success, and other on error. 
+ */ +static void ice_vc_fdir_clear_irq_ctx(struct ice_vf *vf) +{ + struct ice_vf_fdir_ctx *ctx = &vf->fdir.ctx_irq; + unsigned long flags; + + del_timer(&ctx->rx_tmr); + spin_lock_irqsave(&vf->fdir.ctx_lock, flags); + ctx->flags &= ~ICE_VF_FDIR_CTX_VALID; + spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags); +} + +/** + * ice_vc_add_fdir_fltr - add a FDIR filter for VF by the msg buffer + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * Return: 0 on success, and other on error. + */ +int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_fdir_add *fltr = (struct virtchnl_fdir_add *)msg; + struct virtchnl_fdir_add *stat = NULL; + struct virtchnl_fdir_fltr_conf *conf; + enum virtchnl_status_code v_ret; + struct device *dev; + struct ice_pf *pf; + int is_tun = 0; + int len = 0; + int ret; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + ret = ice_vc_fdir_param_check(vf, fltr->vsi_id); + if (ret) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_dbg(dev, "Parameter check for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + ret = ice_vf_start_ctrl_vsi(vf); + if (ret && (ret != -EEXIST)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "Init FDIR for VF %d failed, ret:%d\n", + vf->vf_id, ret); + goto err_exit; + } + + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "Alloc stat for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + conf = kzalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "Alloc conf for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + len = sizeof(*stat); + ret = ice_vc_validate_fdir_fltr(vf, fltr, conf); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_INVALID; + dev_dbg(dev, "Invalid FDIR filter from VF %d\n", vf->vf_id); + goto err_free_conf; + } + + if (fltr->validate_only) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_SUCCESS; + kfree(conf); + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER, + v_ret, (u8 *)stat, len); + goto exit; + } + + is_tun = ice_fdir_is_tunnel(conf->ttype); + ret = ice_vc_fdir_config_input_set(vf, fltr, conf, is_tun); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT; + dev_err(dev, "VF %d: FDIR input set configure failed, ret:%d\n", + vf->vf_id, ret); + goto err_free_conf; + } + + ret = ice_vc_fdir_is_dup_fltr(vf, conf); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_EXIST; + dev_dbg(dev, "VF %d: duplicated FDIR rule detected\n", + vf->vf_id); + goto err_free_conf; + } + + ret = ice_vc_fdir_insert_entry(vf, conf, &conf->flow_id); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_dbg(dev, "VF %d: insert FDIR list failed\n", vf->vf_id); + goto err_free_conf; + } + + ret = ice_vc_fdir_set_irq_ctx(vf, conf, VIRTCHNL_OP_ADD_FDIR_FILTER); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_dbg(dev, "VF %d: set FDIR context failed\n", vf->vf_id); + goto err_rem_entry; + } + + ret = ice_vc_fdir_write_fltr(vf, conf, true, is_tun); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_err(dev, "VF %d: writing FDIR rule failed, ret:%d\n", + vf->vf_id, ret); + goto err_clr_irq; + } + +exit: + kfree(stat); + return ret; + +err_clr_irq: + 
ice_vc_fdir_clear_irq_ctx(vf); +err_rem_entry: + ice_vc_fdir_remove_entry(vf, conf, conf->flow_id); +err_free_conf: + kfree(conf); +err_exit: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER, v_ret, + (u8 *)stat, len); + kfree(stat); + return ret; +} + +/** + * ice_vc_del_fdir_fltr - delete a FDIR filter for VF by the msg buffer + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * Return: 0 on success, and other on error. + */ +int ice_vc_del_fdir_fltr(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_fdir_del *fltr = (struct virtchnl_fdir_del *)msg; + struct virtchnl_fdir_del *stat = NULL; + struct virtchnl_fdir_fltr_conf *conf; + struct ice_vf_fdir *fdir = &vf->fdir; + enum virtchnl_status_code v_ret; + struct ice_fdir_fltr *input; + enum ice_fltr_ptype flow; + struct device *dev; + struct ice_pf *pf; + int is_tun = 0; + int len = 0; + int ret; + + pf = vf->pf; + dev = ice_pf_to_dev(pf); + ret = ice_vc_fdir_param_check(vf, fltr->vsi_id); + if (ret) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_dbg(dev, "Parameter check for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + dev_dbg(dev, "Alloc stat for VF %d failed\n", vf->vf_id); + goto err_exit; + } + + len = sizeof(*stat); + + conf = ice_vc_fdir_lookup_entry(vf, fltr->flow_id); + if (!conf) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST; + dev_dbg(dev, "VF %d: FDIR invalid flow_id:0x%X\n", + vf->vf_id, fltr->flow_id); + goto err_exit; + } + + /* Just return failure when ctrl_vsi idx is invalid */ + if (vf->ctrl_vsi_idx == ICE_NO_VSI) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_err(dev, "Invalid FDIR ctrl_vsi for VF %d\n", vf->vf_id); + goto err_exit; + } + + ret = ice_vc_fdir_set_irq_ctx(vf, conf, VIRTCHNL_OP_DEL_FDIR_FILTER); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_dbg(dev, "VF %d: set FDIR context failed\n", vf->vf_id); + goto err_exit; + } + + is_tun = ice_fdir_is_tunnel(conf->ttype); + ret = ice_vc_fdir_write_fltr(vf, conf, false, is_tun); + if (ret) { + v_ret = VIRTCHNL_STATUS_SUCCESS; + stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE; + dev_err(dev, "VF %d: writing FDIR rule failed, ret:%d\n", + vf->vf_id, ret); + goto err_del_tmr; + } + + /* Remove unused profiles to avoid unexpected behaviors */ + input = &conf->input; + flow = input->flow_type; + if (fdir->fdir_fltr_cnt[flow][is_tun] == 1) + ice_vc_fdir_rem_prof(vf, flow, is_tun); + + kfree(stat); + + return ret; + +err_del_tmr: + ice_vc_fdir_clear_irq_ctx(vf); +err_exit: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_FDIR_FILTER, v_ret, + (u8 *)stat, len); + kfree(stat); + return ret; +} + +/** + * ice_vf_fdir_init - init FDIR resource for VF + * @vf: pointer to the VF info + */ +void ice_vf_fdir_init(struct ice_vf *vf) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + + idr_init(&fdir->fdir_rule_idr); + INIT_LIST_HEAD(&fdir->fdir_rule_list); + + spin_lock_init(&fdir->ctx_lock); + fdir->ctx_irq.flags = 0; + fdir->ctx_done.flags = 0; +} + +/** + * ice_vf_fdir_exit - destroy FDIR resource for VF + * @vf: pointer to the VF info + */ +void ice_vf_fdir_exit(struct ice_vf *vf) +{ + ice_vc_fdir_flush_entry(vf); + idr_destroy(&vf->fdir.fdir_rule_idr); + ice_vc_fdir_rem_prof_all(vf); + ice_vc_fdir_free_prof_all(vf); +} diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h 
b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h new file mode 100644 index 0000000000000000000000000000000000000000..eaf5d8359c70cc265988fe4bcd62c2351e0dc7bd --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VIRTCHNL_FDIR_H_ +#define _ICE_VIRTCHNL_FDIR_H_ + +struct ice_vf; +struct ice_pf; + +enum ice_fdir_ctx_stat { + ICE_FDIR_CTX_READY, + ICE_FDIR_CTX_IRQ, + ICE_FDIR_CTX_TIMEOUT, +}; + +struct ice_vf_fdir_ctx { + struct timer_list rx_tmr; + enum virtchnl_ops v_opcode; + enum ice_fdir_ctx_stat stat; + union ice_32b_rx_flex_desc rx_desc; +#define ICE_VF_FDIR_CTX_VALID BIT(0) + u32 flags; + + void *conf; +}; + +/* VF FDIR information structure */ +struct ice_vf_fdir { + u16 fdir_fltr_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX]; + int prof_entry_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX]; + struct ice_fd_hw_prof **fdir_prof; + + struct idr fdir_rule_idr; + struct list_head fdir_rule_list; + + spinlock_t ctx_lock; /* protects FDIR context info */ + struct ice_vf_fdir_ctx ctx_irq; + struct ice_vf_fdir_ctx ctx_done; +}; + +#ifdef CONFIG_PCI_IOV +int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg); +int ice_vc_del_fdir_fltr(struct ice_vf *vf, u8 *msg); +void ice_vc_fdir_free_prof_all(struct ice_vf *vf); +void ice_vc_fdir_rem_prof_all(struct ice_vf *vf); +void ice_vf_fdir_init(struct ice_vf *vf); +void ice_vf_fdir_exit(struct ice_vf *vf); +void +ice_vc_fdir_irq_handler(struct ice_vsi *ctrl_vsi, + union ice_32b_rx_flex_desc *rx_desc); +void ice_flush_fdir_ctx(struct ice_pf *pf); +#else +static inline +void ice_vc_fdir_irq_handler(struct ice_vsi *ctrl_vsi, union ice_32b_rx_flex_desc *rx_desc) { } +static inline void ice_flush_fdir_ctx(struct ice_pf *pf) { } +#endif /* CONFIG_PCI_IOV */ +#endif /* _ICE_VIRTCHNL_FDIR_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index e92a00a617556fcc5cc743a3558d1c9f44b567a4..52fd9c37fa38678ad0d1a6498f7b8f3aab560e32 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1,11 +1,301 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ #include "ice.h" +#include "ice_base.h" #include "ice_lib.h" +#include "ice_fltr.h" +#include "ice_dcb_lib.h" +#include "ice_eswitch.h" +#include "ice_virtchnl_allowlist.h" +#include "ice_vf_vsi_vlan_ops.h" +#include "ice_vlan.h" +#include "ice_flex_pipe.h" +#include "ice_tc_lib.h" + +#define FIELD_SELECTOR(proto_hdr_field) \ + BIT((proto_hdr_field) & PROTO_HDR_FIELD_MASK) + +struct ice_vc_hdr_match_type { + s32 vc_hdr; /* virtchnl headers (VIRTCHNL_PROTO_HDR_XXX) */ + u32 ice_hdr; /* ice headers (ICE_FLOW_SEG_HDR_XXX) */ +}; + +static const struct ice_vc_hdr_match_type ice_vc_hdr_list[] = { + {VIRTCHNL_PROTO_HDR_NONE, ICE_FLOW_SEG_HDR_NONE}, + {VIRTCHNL_PROTO_HDR_ETH, ICE_FLOW_SEG_HDR_ETH}, + {VIRTCHNL_PROTO_HDR_S_VLAN, ICE_FLOW_SEG_HDR_VLAN}, + {VIRTCHNL_PROTO_HDR_C_VLAN, ICE_FLOW_SEG_HDR_VLAN}, + {VIRTCHNL_PROTO_HDR_IPV4, ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER}, + {VIRTCHNL_PROTO_HDR_IPV6, ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER}, + {VIRTCHNL_PROTO_HDR_TCP, ICE_FLOW_SEG_HDR_TCP}, + {VIRTCHNL_PROTO_HDR_UDP, ICE_FLOW_SEG_HDR_UDP}, + {VIRTCHNL_PROTO_HDR_SCTP, ICE_FLOW_SEG_HDR_SCTP}, + {VIRTCHNL_PROTO_HDR_PPPOE, ICE_FLOW_SEG_HDR_PPPOE}, + {VIRTCHNL_PROTO_HDR_GTPU_IP, ICE_FLOW_SEG_HDR_GTPU_IP}, + {VIRTCHNL_PROTO_HDR_GTPU_EH, ICE_FLOW_SEG_HDR_GTPU_EH}, + {VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN, + ICE_FLOW_SEG_HDR_GTPU_DWN}, + {VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP, + ICE_FLOW_SEG_HDR_GTPU_UP}, + {VIRTCHNL_PROTO_HDR_L2TPV3, ICE_FLOW_SEG_HDR_L2TPV3}, + {VIRTCHNL_PROTO_HDR_ESP, ICE_FLOW_SEG_HDR_ESP}, + {VIRTCHNL_PROTO_HDR_AH, ICE_FLOW_SEG_HDR_AH}, + {VIRTCHNL_PROTO_HDR_PFCP, ICE_FLOW_SEG_HDR_PFCP_SESSION}, + {VIRTCHNL_PROTO_HDR_GTPC, ICE_FLOW_SEG_HDR_GTPC}, + {VIRTCHNL_PROTO_HDR_ECPRI, ICE_FLOW_SEG_HDR_ECPRI_TP0 | + ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0}, + {VIRTCHNL_PROTO_HDR_L2TPV2, ICE_FLOW_SEG_HDR_L2TPV2}, + {VIRTCHNL_PROTO_HDR_PPP, ICE_FLOW_SEG_HDR_PPP}, +}; + +struct ice_vc_hash_field_match_type { + s32 vc_hdr; /* virtchnl headers + * (VIRTCHNL_PROTO_HDR_XXX) + */ + u32 vc_hash_field; /* virtchnl hash fields selector + * FIELD_SELECTOR((VIRTCHNL_PROTO_HDR_ETH_XXX)) + */ + u64 ice_hash_field; /* ice hash fields + * (BIT_ULL(ICE_FLOW_FIELD_IDX_XXX)) + */ +}; + +static const struct +ice_vc_hash_field_match_type ice_vc_hash_field_list[] = { + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_SA)}, + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_DA)}, + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_DST), + ICE_FLOW_HASH_ETH}, + {VIRTCHNL_PROTO_HDR_ETH, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_TYPE)}, + {VIRTCHNL_PROTO_HDR_S_VLAN, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_S_VLAN_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_S_VLAN)}, + {VIRTCHNL_PROTO_HDR_C_VLAN, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_C_VLAN_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_C_VLAN)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + ICE_FLOW_HASH_IPV4}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + 
BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), + ICE_FLOW_HASH_IPV6}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + ICE_FLOW_HASH_IPV6 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST), + ICE_FLOW_HASH_IPV6_PRE64}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + ICE_FLOW_HASH_IPV6_PRE64 | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), + ICE_FLOW_HASH_TCP_PORT}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_UDP, + 
FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), + ICE_FLOW_HASH_UDP_PORT}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), + ICE_FLOW_HASH_SCTP_PORT}, + {VIRTCHNL_PROTO_HDR_PPPOE, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID)}, + {VIRTCHNL_PROTO_HDR_GTPU_IP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_GTPU_IP_TEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_IP_TEID)}, + {VIRTCHNL_PROTO_HDR_L2TPV3, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID)}, + {VIRTCHNL_PROTO_HDR_ESP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ESP_SPI), + BIT_ULL(ICE_FLOW_FIELD_IDX_ESP_SPI)}, + {VIRTCHNL_PROTO_HDR_AH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_AH_SPI), + BIT_ULL(ICE_FLOW_FIELD_IDX_AH_SPI)}, + {VIRTCHNL_PROTO_HDR_PFCP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PFCP_SEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_PFCP_SEID)}, + {VIRTCHNL_PROTO_HDR_GTPC, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_GTPC_TEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_GTPC_TEID)}, + {VIRTCHNL_PROTO_HDR_ECPRI, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID) | + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID)}, +}; + +/** + * ice_get_vf_vsi - get VF's VSI based on the stored index + * @vf: VF used to get VSI + */ +struct ice_vsi *ice_get_vf_vsi(struct ice_vf *vf) +{ + return vf->pf->vsi[vf->lan_vsi_idx]; +} + +static struct ice_vsi *ice_get_vf_adq_vsi(struct ice_vf *vf, u8 tc) +{ + return vf->pf->vsi[vf->ch[tc].vsi_idx]; +} + +/** + * ice_is_vf_adq_ena - is VF ADQ enabled + * @vf: pointer to the VF info + * + * This function returns true if VF ADQ is enabled. 
It is must to check + * VF's num_tc as well, it must be more than ICE_VF_CHNL_START_TC for + * valid ADQ configuration + */ +static bool ice_is_vf_adq_ena(struct ice_vf *vf) +{ + return vf->adq_enabled && (vf->num_tc > ICE_VF_CHNL_START_TC); +} + +/** + * ice_vf_adq_vsi_stop_rings - stops the VF ADQ VSI rings + * @vf: pointer to the VF info + * @tc: VF ADQ TC number + * + * This function stops Tx and Rx ring specific to VF ADQ VSI + */ +static void ice_vf_adq_vsi_stop_rings(struct ice_vf *vf, int tc) +{ + struct ice_vsi *vsi = ice_get_vf_adq_vsi(vf, tc); + + if (!vsi) + return; + ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, vf->vf_id); + ice_vsi_stop_all_rx_rings(vsi); +} + +/** + * ice_vf_adq_vsi_disable_txqs - disable Tx queues for VF ADQ + * @vf: pointer to the VF info + * @tc: VF ADQ TC number + * + * This function disabled Tx queues specific to VF ADQ VSI + */ +static void ice_vf_adq_vsi_disable_txqs(struct ice_vf *vf, int tc) +{ + struct ice_vsi *vsi = ice_get_vf_adq_vsi(vf, tc); + + if (!vsi) + return; + ice_dis_vsi_txq(vsi->port_info, vf->ch[tc].vsi_idx, 0, 0, NULL, NULL, + NULL, ICE_VF_RESET, vf->vf_id, NULL); +} + +/** + * ice_validate_vf_id - helper to check if VF ID is valid + * @pf: pointer to the PF structure + * @vf_id: the ID of the VF to check + */ +static int ice_validate_vf_id(struct ice_pf *pf, u16 vf_id) +{ + /* vf_id range is only valid for 0-255, and should always be unsigned */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(ice_pf_to_dev(pf), "Invalid VF ID: %u\n", vf_id); + return -EINVAL; + } + return 0; +} /** - * ice_err_to_virt err - translate errors for VF return code + * ice_check_vf_init - helper to check if VF init complete + * @pf: pointer to the PF structure + * @vf: the pointer to the VF to check + */ +static int ice_check_vf_init(struct ice_pf *pf, struct ice_vf *vf) +{ + if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { + dev_err(ice_pf_to_dev(pf), "VF ID: %u in reset. 
Try again.\n", + vf->vf_id); + return -EBUSY; + } + return 0; +} + +/** + * ice_err_to_virt_err - translate errors for VF return code * @ice_err: error return code */ static enum virtchnl_status_code ice_err_to_virt_err(enum ice_status ice_err) @@ -48,10 +338,11 @@ ice_vc_vf_broadcast(struct ice_pf *pf, enum virtchnl_ops v_opcode, enum virtchnl_status_code v_retval, u8 *msg, u16 msglen) { struct ice_hw *hw = &pf->hw; - struct ice_vf *vf = pf->vf; - int i; + unsigned int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; - for (i = 0; i < pf->num_alloc_vfs; i++, vf++) { /* Not all vfs are enabled so skip the ones that are not */ if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states) && !test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) @@ -91,23 +382,49 @@ ice_set_pfe_link(struct ice_vf *vf, struct virtchnl_pf_event *pfe, } /** - * ice_set_pfe_link_forced - Force the virtchnl_pf_event link speed/status - * @vf: pointer to the VF structure - * @pfe: pointer to the virtchnl_pf_event to set link speed/status for - * @link_up: whether or not to set the link up/down + * ice_vf_has_no_qs_ena - check if the VF has any Rx or Tx queues enabled + * @vf: the VF to check + * + * Returns true if the VF has no Rx and no Tx queues enabled and returns false + * otherwise */ -static void -ice_set_pfe_link_forced(struct ice_vf *vf, struct virtchnl_pf_event *pfe, - bool link_up) +static bool ice_vf_has_no_qs_ena(struct ice_vf *vf) { - u16 link_speed; + return (!bitmap_weight(vf->rxq_ena, ICE_MAX_QS_PER_VF) && + !bitmap_weight(vf->txq_ena, ICE_MAX_QS_PER_VF)); +} - if (link_up) - link_speed = ICE_AQ_LINK_SPEED_100GB; - else - link_speed = ICE_AQ_LINK_SPEED_UNKNOWN; +/** + * ice_vf_get_port_info - Get the VF's port info structure + * @vf: VF used to get the port info structure for + */ +static struct ice_port_info *ice_vf_get_port_info(struct ice_vf *vf) +{ + return vf->pf->hw.port_info; +} + +/** + * ice_is_vf_link_up - check if the VF's link is up + * @vf: VF to check if link is up + */ +static bool ice_is_vf_link_up(struct ice_vf *vf) +{ + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + + if (ice_check_vf_init(pf, vf)) + return false; + + if (test_bit(ICE_BAD_EEPROM, pf->state)) + return false; - ice_set_pfe_link(vf, pfe, link_speed, link_up); + if (ice_vf_has_no_qs_ena(vf)) + return false; + else if (vf->link_forced) + return vf->link_up; + else + return pi->phy.link_info.link_info & + ICE_AQ_LINK_UP; } /** @@ -116,33 +433,237 @@ ice_set_pfe_link_forced(struct ice_vf *vf, struct virtchnl_pf_event *pfe, * * send a link status message to a single VF */ -static void ice_vc_notify_vf_link_state(struct ice_vf *vf) +void ice_vc_notify_vf_link_state(struct ice_vf *vf) { struct virtchnl_pf_event pfe = { 0 }; - struct ice_link_status *ls; - struct ice_pf *pf = vf->pf; - struct ice_hw *hw; + struct ice_hw *hw = &vf->pf->hw; + struct ice_port_info *pi; + + pi = ice_vf_get_port_info(vf); - hw = &pf->hw; - ls = &hw->port_info->phy.link_info; pfe.event = VIRTCHNL_EVENT_LINK_CHANGE; pfe.severity = PF_EVENT_SEVERITY_INFO; - /* Always report link is down if the VF queues aren't enabled */ - if (!vf->num_qs_ena) - ice_set_pfe_link(vf, &pfe, ICE_AQ_LINK_SPEED_UNKNOWN, false); - else if (vf->link_forced) - ice_set_pfe_link_forced(vf, &pfe, vf->link_up); + if (ice_is_vf_link_up(vf)) + ice_set_pfe_link(vf, &pfe, pi->phy.link_info.link_speed, true); else - ice_set_pfe_link(vf, &pfe, ls->link_speed, ls->link_info & - ICE_AQ_LINK_UP); + ice_set_pfe_link(vf, &pfe, ICE_AQ_LINK_SPEED_UNKNOWN, 
false); ice_aq_send_msg_to_vf(hw, vf->vf_id, VIRTCHNL_OP_EVENT, VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, sizeof(pfe), NULL); } +/** + * ice_vf_invalidate_vsi - invalidate vsi_idx/vsi_num to remove VSI access + * @vf: VF to remove access to VSI for + */ +static void ice_vf_invalidate_vsi(struct ice_vf *vf) +{ + vf->lan_vsi_idx = ICE_NO_VSI; + vf->lan_vsi_num = ICE_NO_VSI; +} + +/** + * ice_vf_vsi_release - invalidate the VF's VSI after freeing it + * @vf: invalidate this VF's VSI after freeing it + */ +static void ice_vf_vsi_release(struct ice_vf *vf) +{ + ice_vsi_release(ice_get_vf_vsi(vf)); + ice_vf_invalidate_vsi(vf); +} + +/** + * ice_vf_adq_invalidate_vsi - invalidate vsi_idx/vsi_num to remove VSI access + * @vf: VF that ADQ VSI is being invalidated on + * @tc: TC used to access channel specific vsi_idx/vsi_num + */ +static void ice_vf_adq_invalidate_vsi(struct ice_vf *vf, u8 tc) +{ + vf->ch[tc].vsi_idx = ICE_NO_VSI; + vf->ch[tc].vsi_num = ICE_NO_VSI; +} + +/** + * ice_vf_adq_vsi_valid - is ADQ VSI valid? + * @vf: VF that ADQ VSI is being validated + * @tc: TC used to access channel specific vsi_idx/vsi_num + * + * vsi_idx must be non-zero, and vsi_idx and vsi_num must not be ICE_NO_VSI + */ +static bool ice_vf_adq_vsi_valid(struct ice_vf *vf, u8 tc) +{ + return (vf->ch[tc].vsi_idx && vf->ch[tc].vsi_idx != ICE_NO_VSI && + vf->ch[tc].vsi_num != ICE_NO_VSI); +} + +/** + * ice_vf_adq_vsi_release - release VF ADQ VSI resources + * @vf: VF that ADQ VSI is being released on + * @tc: TC used to access channel specific VSI + * + * This function stops Tx and Rx queues if specified, disables Tx queues if + * specified, releases VSI resources, and invalidates it + * + */ +static void ice_vf_adq_vsi_release(struct ice_vf *vf, u8 tc) +{ + ice_vsi_release(ice_get_vf_adq_vsi(vf, tc)); + ice_vf_adq_invalidate_vsi(vf, tc); +} + +/** + * ice_vf_adq_cfg_cleanup - invalidate the VF's channel software info + * @vf: VF that ADQ VSI is being released on + * @tc: TC used to access channel specific VSI + * + * This function invalidates software data structures specific to channel + * such as num_qps, tx_rate, etc... This is called from places like: + * when ADQ VSI is released either from rebuild path "ice_vf_adq_release" + * or during rebuild ADQ config if failed to create/setup VF ADQ VSIs + */ +static void ice_vf_adq_cfg_cleanup(struct ice_vf *vf, u8 tc) +{ + vf->ch[tc].num_qps = 0; + vf->ch[tc].offset = 0; + vf->ch[tc].max_tx_rate = 0; + /* since this function is called from places where + * VF ADQ VSI are cleanup from HW, it's OK to clear + * VF ADQ filter_type to be INVALID. + * Remember VF ADQ filter are replayed by VF driver + * as needed + */ + vf->ch[tc].fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; +} + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_del_all_adv_switch_fltr + * @vf: pointer to the VF info + * + * This function deletes all advanced switch filters specific to the VF and + * releases filter memory and updates all book-keeping. 
This function to be + * used when delete channel message is received before deleting channel VSIs + */ +static void ice_del_all_adv_switch_fltr(struct ice_vf *vf) +{ + struct ice_rule_query_data rule; + struct ice_tc_flower_fltr *f; + struct ice_pf *pf = vf->pf; + struct hlist_node *node; + struct device *dev; + int err; + int i; + + dev = ice_pf_to_dev(pf); + hlist_for_each_entry_safe(f, node, &vf->tc_flower_fltr_list, + tc_flower_node) { + if (!f->dest_vsi) + continue; + + /* Deleting TC filter */ + rule.rid = f->rid; + rule.rule_id = f->rule_id; + rule.vsi_handle = f->dest_id; + err = ice_rem_adv_rule_by_id(&pf->hw, &rule); + if (err) { + if (err == ICE_ERR_DOES_NOT_EXIST) + dev_dbg(dev, "VF %d: filter (rule_id %u) for dest VSI %u DOES NOT EXIST in hw table\n", + vf->vf_id, f->rule_id, f->dest_id); + else + dev_err(dev, "VF %d: Failed to delete switch filter for VSI handle %u, err %d\n", + vf->vf_id, f->dest_id, err); + } + + /* book-keeping and update filter type if filter count + * reached zero + */ + f->dest_vsi->num_chnl_fltr--; + hlist_del(&f->tc_flower_node); + devm_kfree(dev, f); + vf->num_dmac_chnl_fltrs--; + } + + /* Reset VF channel filter type to be INVALID */ + for (i = 1; i < vf->num_tc; i++) + vf->ch[i].fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +/** + * ice_vf_adq_release - perform VF ADQ resource cleanup only + * @vf: pointer to the VF structure + * + * Delete all VF ADQ filters, release VF ADQ VSIs, cleanup internal data + * structues which keeps track of per TC infor including TC0. This function + * is invoked only when VFLR based VF Reset. + */ +static void ice_vf_adq_release(struct ice_vf *vf) +{ + u8 tc; + + /* no ADQ configured, nothing to do */ + if (!ice_is_vf_adq_ena(vf)) + return; + +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* release VF ADQ specific filters and eventually VF driver + * will trigger replay of VF ADQ filters as needed, just like + * other MAC, VLAN filters + */ + ice_del_all_adv_switch_fltr(vf); +#endif + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + /* Tx queues are disabled before VF reset is scheduled as part + * of VFLR flow. Disabling TX queues again causes error + * such as EINVAL from admin command because underlying + * scheduler configs are cleared as part of disabling once + */ + if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) + ice_vf_adq_vsi_stop_rings(vf, tc); + ice_vf_adq_vsi_release(vf, tc); + /* clear per TC info to avoid stale information such as + * num_qps, tx_rate, etc... 
+ */ + ice_vf_adq_cfg_cleanup(vf, tc); + } + + /* to avoid rebuilding of VF ADQ VSIs by mistake */ + vf->adq_enabled = false; + vf->num_tc = 0; + + /* main VF VSI should be built with default, hence clear related + * data structures otherwise vf->ch[0].num_qps and tx_rate will + * still have stale information as stored from "add channel" + * virtchnl message + */ + ice_vf_adq_cfg_cleanup(vf, 0); +} + +/** + * ice_vf_ctrl_invalidate_vsi - invalidate ctrl_vsi_idx to remove VSI access + * @vf: VF that control VSI is being invalidated on + */ +static void ice_vf_ctrl_invalidate_vsi(struct ice_vf *vf) +{ + vf->ctrl_vsi_idx = ICE_NO_VSI; +} + +/** + * ice_vf_ctrl_vsi_release - invalidate the VF's control VSI after freeing it + * @vf: VF that control VSI is being released on + */ +static void ice_vf_ctrl_vsi_release(struct ice_vf *vf) +{ + ice_vsi_release(vf->pf->vsi[vf->ctrl_vsi_idx]); + ice_vf_ctrl_invalidate_vsi(vf); +} + /** * ice_free_vf_res - Free a VF's resources * @vf: pointer to the VF info @@ -156,16 +677,25 @@ static void ice_free_vf_res(struct ice_vf *vf) * accessing the VF's VSI after it's freed or invalidated. */ clear_bit(ICE_VF_STATE_INIT, vf->vf_states); + ice_vf_fdir_exit(vf); + /* free VF control VSI */ + if (vf->ctrl_vsi_idx != ICE_NO_VSI) + ice_vf_ctrl_vsi_release(vf); /* free VSI and disconnect it from the parent uplink */ - if (vf->lan_vsi_idx) { - ice_vsi_release(pf->vsi[vf->lan_vsi_idx]); - vf->lan_vsi_idx = 0; - vf->lan_vsi_num = 0; + if (vf->lan_vsi_idx != ICE_NO_VSI) { + ice_vf_vsi_release(vf); vf->num_mac = 0; } - last_vector_idx = vf->first_vector_idx + pf->num_vf_msix - 1; + last_vector_idx = vf->first_vector_idx + pf->num_msix_per_vf - 1; + + /* clear VF MDD event information */ + memset(&vf->mdd_tx_events, 0, sizeof(vf->mdd_tx_events)); + memset(&vf->mdd_rx_events, 0, sizeof(vf->mdd_rx_events)); + + ice_vf_adq_release(vf); + /* Disable interrupts so that VF starts in a known state */ for (i = vf->first_vector_idx; i <= last_vector_idx; i++) { wr32(&pf->hw, GLINT_DYN_CTL(i), GLINT_DYN_CTL_CLEARPBA_M); @@ -184,17 +714,19 @@ static void ice_dis_vf_mappings(struct ice_vf *vf) { struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; + struct device *dev; int first, last, v; struct ice_hw *hw; hw = &pf->hw; - vsi = pf->vsi[vf->lan_vsi_idx]; + vsi = ice_get_vf_vsi(vf); + dev = ice_pf_to_dev(pf); wr32(hw, VPINT_ALLOC(vf->vf_id), 0); wr32(hw, VPINT_ALLOC_PCI(vf->vf_id), 0); first = vf->first_vector_idx; - last = first + pf->num_vf_msix - 1; + last = first + pf->num_msix_per_vf - 1; for (v = first; v <= last; v++) { u32 reg; @@ -208,25 +740,19 @@ static void ice_dis_vf_mappings(struct ice_vf *vf) if (vsi->tx_mapping_mode == ICE_VSI_MAP_CONTIG) wr32(hw, VPLAN_TX_QBASE(vf->vf_id), 0); else - dev_err(&pf->pdev->dev, - "Scattered mode for VF Tx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Tx queues is not yet implemented\n"); if (vsi->rx_mapping_mode == ICE_VSI_MAP_CONTIG) wr32(hw, VPLAN_RX_QBASE(vf->vf_id), 0); else - dev_err(&pf->pdev->dev, - "Scattered mode for VF Rx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Rx queues is not yet implemented\n"); } /** * ice_sriov_free_msix_res - Reset/free any used MSIX resources * @pf: pointer to the PF structure * - * If MSIX entries from the pf->irq_tracker were needed then we need to - * reset the irq_tracker->end and give back the entries we needed to - * num_avail_sw_msix. 
- * - * If no MSIX entries were taken from the pf->irq_tracker then just clear + * Since no MSIX entries are taken from the pf->irq_tracker then just clear * the pf->sriov_base_vector. * * Returns 0 on success, and -EINVAL on error. @@ -243,11 +769,7 @@ static int ice_sriov_free_msix_res(struct ice_pf *pf) return -EINVAL; /* give back irq_tracker resources used */ - if (pf->sriov_base_vector < res->num_entries) { - res->end = res->num_entries; - pf->num_avail_sw_msix += - res->num_entries - pf->sriov_base_vector; - } + WARN_ON(pf->sriov_base_vector < res->num_entries); pf->sriov_base_vector = 0; @@ -261,9 +783,8 @@ static int ice_sriov_free_msix_res(struct ice_pf *pf) void ice_set_vf_state_qs_dis(struct ice_vf *vf) { /* Clear Rx/Tx enabled queues flag */ - bitmap_zero(vf->txq_ena, ICE_MAX_BASE_QS_PER_VF); - bitmap_zero(vf->rxq_ena, ICE_MAX_BASE_QS_PER_VF); - vf->num_qs_ena = 0; + bitmap_zero(vf->txq_ena, ICE_MAX_QS_PER_VF); + bitmap_zero(vf->rxq_ena, ICE_MAX_QS_PER_VF); clear_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); } @@ -273,13 +794,22 @@ void ice_set_vf_state_qs_dis(struct ice_vf *vf) */ static void ice_dis_vf_qs(struct ice_vf *vf) { - struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - - vsi = pf->vsi[vf->lan_vsi_idx]; + struct ice_vsi *vsi = ice_get_vf_vsi(vf); ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, vf->vf_id); - ice_vsi_stop_rx_rings(vsi); + ice_vsi_stop_all_rx_rings(vsi); + /* Likewise if VF ADQ is enabled, stop Tx and Rx rings of VF ADQ VSI */ + if (ice_is_vf_adq_ena(vf)) { + int tc; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + vsi = ice_get_vf_adq_vsi(vf, tc); + ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, vf->vf_id); + ice_vsi_stop_all_rx_rings(vsi); + } + } ice_set_vf_state_qs_dis(vf); } @@ -289,19 +819,18 @@ static void ice_dis_vf_qs(struct ice_vf *vf) */ void ice_free_vfs(struct ice_pf *pf) { + struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; - int tmp, i; + unsigned int tmp, i; if (!pf->vf) return; - while (test_and_set_bit(__ICE_VF_DIS, pf->state)) + ice_eswitch_release(pf); + + while (test_and_set_bit(ICE_VF_DIS, pf->state)) usleep_range(1000, 2000); - /* Avoid wait time by stopping all VFs at the same time */ - for (i = 0; i < pf->num_alloc_vfs; i++) - if (test_bit(ICE_VF_STATE_QS_ENA, pf->vf[i].vf_states)) - ice_dis_vf_qs(&pf->vf[i]); /* Disable IOV before freeing resources. 
This lets any VF drivers * running in the host get themselves cleaned up before we yank @@ -310,10 +839,21 @@ void ice_free_vfs(struct ice_pf *pf) if (!pci_vfs_assigned(pf->pdev)) pci_disable_sriov(pf->pdev); else - dev_warn(&pf->pdev->dev, "VFs are assigned - not disabling SR-IOV\n"); + dev_warn(dev, "VFs are assigned - not disabling SR-IOV\n"); + + if (ice_dcf_get_state(pf) != ICE_DCF_STATE_OFF) { + ice_rm_all_dcf_sw_rules(pf); + ice_dcf_set_state(pf, ICE_DCF_STATE_OFF); + pf->dcf.vf = NULL; + } + + /* Avoid wait time by stopping all VFs at the same time */ + ice_for_each_vf(pf, i) + if (test_bit(ICE_VF_STATE_QS_ENA, pf->vf[i].vf_states)) + ice_dis_vf_qs(&pf->vf[i]); tmp = pf->num_alloc_vfs; - pf->num_vf_qps = 0; + pf->num_qps_per_vf = 0; pf->num_alloc_vfs = 0; for (i = 0; i < tmp; i++) { if (test_bit(ICE_VF_STATE_INIT, pf->vf[i].vf_states)) { @@ -325,10 +865,9 @@ void ice_free_vfs(struct ice_pf *pf) } if (ice_sriov_free_msix_res(pf)) - dev_err(&pf->pdev->dev, - "Failed to free MSIX resources used by SR-IOV\n"); + dev_err(dev, "Failed to free MSIX resources used by SR-IOV\n"); - devm_kfree(&pf->pdev->dev, pf->vf); + devm_kfree(dev, pf->vf); pf->vf = NULL; /* This check is for when the driver is unloaded while VFs are @@ -336,7 +875,7 @@ void ice_free_vfs(struct ice_pf *pf) * before this function ever gets called. */ if (!pci_vfs_assigned(pf->pdev)) { - int vf_id; + unsigned int vf_id; /* Acknowledge VFLR for all VFs. Without this, VFs will fail to * work correctly when SR-IOV gets re-enabled. @@ -349,7 +888,13 @@ void ice_free_vfs(struct ice_pf *pf) wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx)); } } - clear_bit(__ICE_VF_DIS, pf->state); + + /* clear malicious info if the VFs are getting released */ + for (i = 0; i < tmp; i++) + if (ice_mbx_clear_malvf(&hw->mbx_snapshot, pf->malvfs, ICE_MAX_VF_COUNT, i)) + dev_dbg(dev, "failed to clear malicious VF state for VF %u\n", i); + + clear_bit(ICE_VF_DIS, pf->state); clear_bit(ICE_FLAG_SRIOV_ENA, pf->flags); } @@ -367,9 +912,11 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr) { struct ice_pf *pf = vf->pf; u32 reg, reg_idx, bit_idx; + unsigned int vf_abs_id, i; + struct device *dev; struct ice_hw *hw; - int vf_abs_id, i; + dev = ice_pf_to_dev(pf); hw = &pf->hw; vf_abs_id = vf->vf_id + hw->func_caps.vf_base_id; @@ -377,20 +924,19 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr) clear_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); /* Disable VF's configuration API during reset. The flag is re-enabled - * in ice_alloc_vf_res(), when it's safe again to access VF's VSI. - * It's normally disabled in ice_free_vf_res(), but it's safer - * to do it earlier to give some time to finish to any VF config - * functions that may still be running at this point. + * when it's safe again to access VF's VSI. */ clear_bit(ICE_VF_STATE_INIT, vf->vf_states); - /* VF_MBX_ARQLEN is cleared by PFR, so the driver needs to clear it - * in the case of VFR. If this is done for PFR, it can mess up VF - * resets because the VF driver may already have started cleanup - * by the time we get here. + /* VF_MBX_ARQLEN and VF_MBX_ATQLEN are cleared by PFR, so the driver + * needs to clear them in the case of VFR/VFLR. If this is done for + * PFR, it can mess up VF resets because the VF driver may already + * have started cleanup by the time we get here. 
*/ - if (!is_pfr) - wr32(hw, VF_MBX_ARQLEN(vf_abs_id), 0); + if (!is_pfr) { + wr32(hw, VF_MBX_ARQLEN(vf->vf_id), 0); + wr32(hw, VF_MBX_ATQLEN(vf->vf_id), 0); + } /* In the case of a VFLR, the HW has already reset the VF and we * just need to clean up, so don't hit the VFRTRIG register. @@ -415,92 +961,84 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr) if ((reg & VF_TRANS_PENDING_M) == 0) break; - dev_err(&pf->pdev->dev, - "VF %d PCI transactions stuck\n", vf->vf_id); + dev_err(dev, "VF %u PCI transactions stuck\n", vf->vf_id); udelay(ICE_PCI_CIAD_WAIT_DELAY_US); } } /** - * ice_vsi_set_pvid_fill_ctxt - Set VSI ctxt for add PVID - * @ctxt: the VSI ctxt to fill - * @vid: the VLAN ID to set as a PVID + * ice_vf_vsi_setup - Set up a VF VSI + * @vf: VF to setup VSI for + * + * Returns pointer to the successfully allocated VSI struct on success, + * otherwise returns NULL on failure. */ -static void ice_vsi_set_pvid_fill_ctxt(struct ice_vsi_ctx *ctxt, u16 vid) +static struct ice_vsi *ice_vf_vsi_setup(struct ice_vf *vf) { - ctxt->info.vlan_flags = (ICE_AQ_VSI_VLAN_MODE_UNTAGGED | - ICE_AQ_VSI_PVLAN_INSERT_PVID | - ICE_AQ_VSI_VLAN_EMOD_STR); - ctxt->info.pvid = cpu_to_le16(vid); - ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID | - ICE_AQ_VSI_PROP_SW_VALID); -} + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; -/** - * ice_vsi_kill_pvid_fill_ctxt - Set VSI ctx for remove PVID - * @ctxt: the VSI ctxt to fill - */ -static void ice_vsi_kill_pvid_fill_ctxt(struct ice_vsi_ctx *ctxt) -{ - ctxt->info.vlan_flags = ICE_AQ_VSI_VLAN_EMOD_NOTHING; - ctxt->info.vlan_flags |= ICE_AQ_VSI_VLAN_MODE_ALL; - ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID | - ICE_AQ_VSI_PROP_SW_VALID); + vsi = ice_vsi_setup(pf, pi, ICE_VSI_VF, vf->vf_id, NULL, 0); + + if (!vsi) { + dev_err(ice_pf_to_dev(pf), "Failed to create VF VSI\n"); + ice_vf_invalidate_vsi(vf); + return NULL; + } + + vf->lan_vsi_idx = vsi->idx; + vf->lan_vsi_num = vsi->vsi_num; + + return vsi; } /** - * ice_vsi_manage_pvid - Enable or disable port VLAN for VSI - * @vsi: the VSI to update - * @vid: the VLAN ID to set as a PVID - * @enable: true for enable PVID false for disable + * ice_vf_adq_vsi_setup - Set up a VF channel VSI + * @vf: VF to setup VSI for + * @tc: TC to setup the channel VSI for */ -static int ice_vsi_manage_pvid(struct ice_vsi *vsi, u16 vid, bool enable) +static struct ice_vsi *ice_vf_adq_vsi_setup(struct ice_vf *vf, u8 tc) { - struct device *dev = &vsi->back->pdev->dev; - struct ice_hw *hw = &vsi->back->hw; - struct ice_vsi_ctx *ctxt; - enum ice_status status; - int ret = 0; - - ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL); - if (!ctxt) - return -ENOMEM; - - ctxt->info = vsi->info; - if (enable) - ice_vsi_set_pvid_fill_ctxt(ctxt, vid); - else - ice_vsi_kill_pvid_fill_ctxt(ctxt); + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; - status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); - if (status) { - dev_info(dev, "update VSI for port VLAN failed, err %d aq_err %d\n", - status, hw->adminq.sq_last_status); - ret = -EIO; - goto out; + vsi = ice_vsi_setup(pf, pi, ICE_VSI_VF, vf->vf_id, NULL, tc); + if (!vsi) { + dev_err(ice_pf_to_dev(pf), "Failed to create VF ADQ VSI for TC %d\n", + tc); + ice_vf_adq_invalidate_vsi(vf, tc); + 
return NULL; } - vsi->info = ctxt->info; -out: - devm_kfree(dev, ctxt); - return ret; + vf->ch[tc].vsi_idx = vsi->idx; + vf->ch[tc].vsi_num = vsi->vsi_num; + + return vsi; } /** - * ice_vf_vsi_setup - Set up a VF VSI - * @pf: board private structure - * @pi: pointer to the port_info instance - * @vf_id: defines VF ID to which this VSI connects. + * ice_vf_ctrl_vsi_setup - Set up a VF control VSI + * @vf: VF to setup control VSI for * * Returns pointer to the successfully allocated VSI struct on success, * otherwise returns NULL on failure. */ -static struct ice_vsi * -ice_vf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, u16 vf_id) +struct ice_vsi *ice_vf_ctrl_vsi_setup(struct ice_vf *vf) { - return ice_vsi_setup(pf, pi, ICE_VSI_VF, vf_id); + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + + vsi = ice_vsi_setup(pf, pi, ICE_VSI_CTRL, vf->vf_id, NULL, 0); + + if (!vsi) { + dev_err(ice_pf_to_dev(pf), "Failed to create VF control VSI\n"); + ice_vf_ctrl_invalidate_vsi(vf); + } + + return vsi; } /** @@ -517,160 +1055,418 @@ ice_vf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, u16 vf_id) */ static int ice_calc_vf_first_vector_idx(struct ice_pf *pf, struct ice_vf *vf) { - return pf->sriov_base_vector + vf->vf_id * pf->num_vf_msix; + return pf->sriov_base_vector + vf->vf_id * pf->num_msix_per_vf; } /** - * ice_alloc_vsi_res - Setup VF VSI and its resources - * @vf: pointer to the VF structure + * ice_vf_rebuild_host_tx_rate_cfg - re-apply the Tx rate limiting configuration + * @vf: VF to re-apply the configuration for * - * Returns 0 on success, negative value on failure + * Called after a VF VSI has been re-added/rebuild during reset. The PF driver + * needs to re-apply the host configured Tx rate limiting configuration. 
*/ -static int ice_alloc_vsi_res(struct ice_vf *vf) +static int ice_vf_rebuild_host_tx_rate_cfg(struct ice_vf *vf) { - struct ice_pf *pf = vf->pf; - LIST_HEAD(tmp_add_list); - u8 broadcast[ETH_ALEN]; - struct ice_vsi *vsi; - int status = 0; - - /* first vector index is the VFs OICR index */ - vf->first_vector_idx = ice_calc_vf_first_vector_idx(pf, vf); + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + int err; - vsi = ice_vf_vsi_setup(pf, pf->hw.port_info, vf->vf_id); - if (!vsi) { - dev_err(&pf->pdev->dev, "Failed to create VF VSI\n"); - return -ENOMEM; + if (vf->min_tx_rate) { + err = ice_set_min_bw_limit(vsi, (u64)vf->min_tx_rate * 1000); + if (err) { + dev_err(dev, "failed to set min Tx rate to %d Mbps for VF %u, error %d\n", + vf->min_tx_rate, vf->vf_id, err); + return err; + } } - vf->lan_vsi_idx = vsi->idx; - vf->lan_vsi_num = vsi->vsi_num; - - /* Check if port VLAN exist before, and restore it accordingly */ - if (vf->port_vlan_id) { - ice_vsi_manage_pvid(vsi, vf->port_vlan_id, true); - ice_vsi_add_vlan(vsi, vf->port_vlan_id & ICE_VLAN_M); + if (vf->max_tx_rate) { + err = ice_set_max_bw_limit(vsi, (u64)vf->max_tx_rate * 1000); + if (err) { + dev_err(dev, "failed to set max Tx rate to %d Mbps for VF %u, error %d\n", + vf->max_tx_rate, vf->vf_id, err); + return err; + } } - eth_broadcast_addr(broadcast); + return 0; +} - status = ice_add_mac_to_list(vsi, &tmp_add_list, broadcast); - if (status) - goto ice_alloc_vsi_res_exit; +static u16 ice_vf_get_port_vlan_id(struct ice_vf *vf) +{ + return vf->port_vlan_info.vid; +} - if (is_valid_ether_addr(vf->dflt_lan_addr.addr)) { - status = ice_add_mac_to_list(vsi, &tmp_add_list, - vf->dflt_lan_addr.addr); - if (status) - goto ice_alloc_vsi_res_exit; +static u8 ice_vf_get_port_vlan_prio(struct ice_vf *vf) +{ + return vf->port_vlan_info.prio; +} + +bool ice_vf_is_port_vlan_ena(struct ice_vf *vf) +{ + return (ice_vf_get_port_vlan_id(vf) || ice_vf_get_port_vlan_prio(vf)); +} + +static u16 ice_vf_get_port_vlan_tpid(struct ice_vf *vf) +{ + return vf->port_vlan_info.tpid; +} + +/** + * ice_vf_rebuild_host_vlan_cfg - add VLAN 0 filter or rebuild the Port VLAN + * @vf: VF to add MAC filters for + * @vsi: Pointer to VSI + * + * Called after a VF VSI has been re-added/rebuilt during reset. The PF driver + * always re-adds either a VLAN 0 or port VLAN based filter after reset. + */ +static int ice_vf_rebuild_host_vlan_cfg(struct ice_vf *vf, struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + struct device *dev = ice_pf_to_dev(vf->pf); + int err; + + if (ice_vf_is_port_vlan_ena(vf)) { + err = vlan_ops->set_port_vlan(vsi, &vf->port_vlan_info); + if (err) { + dev_err(dev, "failed to configure port VLAN via VSI parameters for VF %u, error %d\n", + vf->vf_id, err); + return err; + } + + err = vlan_ops->add_vlan(vsi, &vf->port_vlan_info); + } else { + err = ice_vsi_add_vlan_zero(vsi); } - status = ice_add_mac(&pf->hw, &tmp_add_list); - if (status) - dev_err(&pf->pdev->dev, - "could not add mac filters error %d\n", status); + if (err) { + dev_err(dev, "failed to add VLAN %u filter for VF %u during VF rebuild, error %d\n", + ice_vf_is_port_vlan_ena(vf) ? 
+ ice_vf_get_port_vlan_id(vf) : 0, vf->vf_id, err); + return err; + } + + err = vlan_ops->ena_rx_filtering(vsi); + if (err) { + dev_warn(dev, "failed to enable Rx VLAN filtering for VF %d VSI %d during VF rebuild, error %d\n", + vf->vf_id, vsi->idx, err); + } + + return 0; +} + +/** + * ice_vf_rebuild_dcf_vlan_cfg - Config DCF outer VLAN for VF + * @vf: VF to add outer VLAN for + * @vsi: Pointer to VSI + */ +static int ice_vf_rebuild_dcf_vlan_cfg(struct ice_vf *vf, struct ice_vsi *vsi) +{ + struct ice_dcf_vlan_info *dcf_vlan = &vf->dcf_vlan_info; + struct device *dev = ice_pf_to_dev(vf->pf); + int err; + + if (!ice_is_dcf_enabled(vf->pf) || !dcf_vlan->applying) { + memset(dcf_vlan, 0, sizeof(*dcf_vlan)); + return 0; + } + + dcf_vlan->applying = 0; + + if (dcf_vlan->outer_port_vlan.vid) { + err = ice_vf_vsi_dcf_set_outer_port_vlan(vsi, &dcf_vlan->outer_port_vlan); + if (err) { + dev_err(dev, "failed to configure outer port VLAN via DCF for VF %u, error %d\n", + vf->vf_id, err); + return err; + } + } + + if (dcf_vlan->outer_stripping_ena) { + err = ice_vf_vsi_dcf_ena_outer_vlan_stripping(vsi, dcf_vlan->outer_stripping_tpid); + if (err) { + dev_err(dev, "failed to enable outer VLAN stripping via DCF for VF %u, error %d\n", + vf->vf_id, err); + return err; + } + } + + return 0; +} + +static int ice_cfg_mac_antispoof(struct ice_vsi *vsi, bool enable) +{ + struct ice_vsi_ctx *ctx; + enum ice_status status; + int err = 0; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->info.sec_flags = vsi->info.sec_flags; + ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + + if (enable) + ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; else - vf->num_mac = 1; + ctx->info.sec_flags &= ~ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; - /* Clear this bit after VF initialization since we shouldn't reclaim - * and reassign interrupts for synchronous or asynchronous VFR events. - * We don't want to reconfigure interrupts since AVF driver doesn't - * expect vector assignment to be changed unless there is a request for - * more vectors. - */ -ice_alloc_vsi_res_exit: - ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list); - return status; + status = ice_update_vsi(&vsi->back->hw, vsi->idx, ctx, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to configure Tx MAC anti-spoof %s for VSI %d, error %s\n", + enable ? 
"ON" : "OFF", vsi->vsi_num, + ice_stat_str(status)); + err = ice_status_to_errno(status); + } else { + vsi->info.sec_flags = ctx->info.sec_flags; + } + + kfree(ctx); + + return err; } /** - * ice_alloc_vf_res - Allocate VF resources - * @vf: pointer to the VF structure + * ice_vsi_ena_spoofchk - enable Tx spoof checking for this VSI + * @vsi: VSI to enable Tx spoof checking for */ -static int ice_alloc_vf_res(struct ice_vf *vf) +static int ice_vsi_ena_spoofchk(struct ice_vsi *vsi) { - struct ice_pf *pf = vf->pf; - int tx_rx_queue_left; - int status; + struct ice_vsi_vlan_ops *vlan_ops; + int err; - /* Update number of VF queues, in case VF had requested for queue - * changes - */ - tx_rx_queue_left = min_t(int, ice_get_avail_txq_count(pf), - ice_get_avail_rxq_count(pf)); - tx_rx_queue_left += ICE_DFLT_QS_PER_VF; - if (vf->num_req_qs && vf->num_req_qs <= tx_rx_queue_left && - vf->num_req_qs != vf->num_vf_qs) - vf->num_vf_qs = vf->num_req_qs; + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); - /* setup VF VSI and necessary resources */ - status = ice_alloc_vsi_res(vf); - if (status) - goto ice_alloc_vf_res_exit; + err = vlan_ops->ena_tx_filtering(vsi); + if (err) + return err; - if (vf->trusted) - set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + err = ice_cfg_mac_antispoof(vsi, true); + if (err) + return err; + + return 0; +} + +/** + * ice_vsi_dis_spoofchk - disable Tx spoof checking for this VSI + * @vsi: VSI to disable Tx spoof checking for + */ +static int ice_vsi_dis_spoofchk(struct ice_vsi *vsi) +{ + struct ice_vsi_vlan_ops *vlan_ops; + int err; + + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + + err = vlan_ops->dis_tx_filtering(vsi); + if (err) + return err; + + err = ice_cfg_mac_antispoof(vsi, false); + if (err) + return err; + + return 0; +} + +/** + * ice_vf_set_spoofchk_cfg - apply Tx spoof checking setting + * @vf: VF set spoofchk for + * @vsi: VSI associated to the VF + */ +static int +ice_vf_set_spoofchk_cfg(struct ice_vf *vf, struct ice_vsi *vsi) +{ + int err; + + if (vf->spoofchk) + err = ice_vsi_ena_spoofchk(vsi); else - clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + err = ice_vsi_dis_spoofchk(vsi); - /* VF is now completely initialized */ - set_bit(ICE_VF_STATE_INIT, vf->vf_states); + return err; +} - return status; +/** + * ice_vf_rebuild_adq_port_vlan_cfg - set the port VLAN for VF ADQ VSIs + * @vf: VF to add MAC filters for + * + * Called after a VF ADQ VSI has been re-added/rebuilt during reset. + */ +static int ice_vf_rebuild_adq_port_vlan_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + int err, tc; -ice_alloc_vf_res_exit: - ice_free_vf_res(vf); - return status; + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + struct ice_vsi *vsi; + + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + + vsi = ice_get_vf_adq_vsi(vf, tc); + err = ice_vf_rebuild_host_vlan_cfg(vf, vsi); + if (err) { + dev_err(dev, "failed to configure port VLAN via VSI parameters for VF %u, ADQ VSI(num %u), error %d\n", + vf->vf_id, vsi->vsi_num, err); + return err; + } + } + return 0; } /** - * ice_ena_vf_mappings - * @vf: pointer to the VF structure + * ice_vf_rebuild_adq_spoofchk_cfg - set the spoofchk config for VF ADQ VSIs + * @vf: VF to set spoofchk for * - * Enable VF vectors and queues allocation by writing the details into - * respective registers. + * Called after a VF ADQ VSI has been re-added/rebuilt during reset. 
*/ -static void ice_ena_vf_mappings(struct ice_vf *vf) +static int ice_vf_rebuild_adq_spoofchk_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + int err, tc; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + struct ice_vsi *vsi; + + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + + vsi = ice_get_vf_adq_vsi(vf, tc); + err = ice_vf_set_spoofchk_cfg(vf, vsi); + if (err) { + dev_err(dev, "failed to configure spoofchk via VSI parameters for VF %u, ADQ VSI(num %u), error %d\n", + vf->vf_id, vsi->vsi_num, err); + return err; + } + } + return 0; +} + +/** + * ice_vf_rebuild_host_mac_cfg - add broadcast and the VF's perm_addr/LAA + * @vf: VF to add MAC filters for + * + * Called after a VF VSI has been re-added/rebuilt during reset. The PF driver + * always re-adds a broadcast filter and the VF's perm_addr/LAA after reset. + */ +static int ice_vf_rebuild_host_mac_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + enum ice_status status; + u8 broadcast[ETH_ALEN]; + + if (ice_is_eswitch_mode_switchdev(vf->pf)) + return 0; + + eth_broadcast_addr(broadcast); + status = ice_fltr_add_mac(vsi, broadcast, ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "failed to add broadcast MAC filter for VF %u, error %s\n", + vf->vf_id, ice_stat_str(status)); + return ice_status_to_errno(status); + } + + vf->num_mac++; + + if (is_valid_ether_addr(vf->hw_lan_addr.addr)) { + status = ice_fltr_add_mac(vsi, vf->hw_lan_addr.addr, + ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "failed to add default unicast MAC filter %pM for VF %u, error %s\n", + &vf->hw_lan_addr.addr[0], vf->vf_id, + ice_stat_str(status)); + return ice_status_to_errno(status); + } + vf->num_mac++; + + ether_addr_copy(vf->dev_lan_addr.addr, vf->hw_lan_addr.addr); + } + + return 0; +} + +/** + * ice_vf_set_host_trust_cfg - set trust setting based on pre-reset value + * @vf: VF to configure trust setting for + */ +static void ice_vf_set_host_trust_cfg(struct ice_vf *vf) +{ + if (vf->trusted) + set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + else + clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); +} + +/** + * ice_ena_vf_msix_mappings - enable VF MSIX mappings in hardware + * @vf: VF to enable MSIX mappings for + * + * Some of the registers need to be indexed/configured using hardware global + * device values and other registers need 0-based values, which represent PF + * based values. 
+ */ +static void ice_ena_vf_msix_mappings(struct ice_vf *vf) { - int abs_vf_id, abs_first, abs_last; + int device_based_first_msix, device_based_last_msix; + int pf_based_first_msix, pf_based_last_msix, v; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - int first, last, v; + int device_based_vf_id; struct ice_hw *hw; u32 reg; hw = &pf->hw; - vsi = pf->vsi[vf->lan_vsi_idx]; - first = vf->first_vector_idx; - last = (first + pf->num_vf_msix) - 1; - abs_first = first + pf->hw.func_caps.common_cap.msix_vector_first_id; - abs_last = (abs_first + pf->num_vf_msix) - 1; - abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; - - /* VF Vector allocation */ - reg = (((abs_first << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) | - ((abs_last << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) | - VPINT_ALLOC_VALID_M); + + pf_based_first_msix = vf->first_vector_idx; + pf_based_last_msix = (pf_based_first_msix + pf->num_msix_per_vf) - 1; + + device_based_first_msix = pf_based_first_msix + + pf->hw.func_caps.common_cap.msix_vector_first_id; + device_based_last_msix = + (device_based_first_msix + pf->num_msix_per_vf) - 1; + device_based_vf_id = vf->vf_id + hw->func_caps.vf_base_id; + + reg = (((device_based_first_msix << VPINT_ALLOC_FIRST_S) & + VPINT_ALLOC_FIRST_M) | + ((device_based_last_msix << VPINT_ALLOC_LAST_S) & + VPINT_ALLOC_LAST_M) | VPINT_ALLOC_VALID_M); wr32(hw, VPINT_ALLOC(vf->vf_id), reg); - reg = (((abs_first << VPINT_ALLOC_PCI_FIRST_S) + reg = (((device_based_first_msix << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) | - ((abs_last << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) | - VPINT_ALLOC_PCI_VALID_M); + ((device_based_last_msix << VPINT_ALLOC_PCI_LAST_S) & + VPINT_ALLOC_PCI_LAST_M) | VPINT_ALLOC_PCI_VALID_M); wr32(hw, VPINT_ALLOC_PCI(vf->vf_id), reg); + /* map the interrupts to its functions */ - for (v = first; v <= last; v++) { - reg = (((abs_vf_id << GLINT_VECT2FUNC_VF_NUM_S) & + for (v = pf_based_first_msix; v <= pf_based_last_msix; v++) { + reg = (((device_based_vf_id << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) | ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M)); wr32(hw, GLINT_VECT2FUNC(v), reg); } - /* Map mailbox interrupt. We put an explicit 0 here to remind us that - * VF admin queue interrupts will go to VF MSI-X vector 0. 
- */ - wr32(hw, VPINT_MBX_CTL(abs_vf_id), VPINT_MBX_CTL_CAUSE_ENA_M | 0); + /* Map mailbox interrupt to VF VSI VF MSI-X vector 0 */ + wr32(hw, VPINT_MBX_CTL(device_based_vf_id), VPINT_MBX_CTL_CAUSE_ENA_M); +} + +/** + * ice_ena_vf_q_mappings - enable Rx/Tx queue mappings for a VF + * @vf: VF to enable the mappings for + * @max_txq: max Tx queues allowed on the VF's VSI + * @max_rxq: max Rx queues allowed on the VF's VSI + */ +static void ice_ena_vf_q_mappings(struct ice_vf *vf, u16 max_txq, u16 max_rxq) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + struct ice_hw *hw = &vf->pf->hw; + u32 reg; + /* set regardless of mapping mode */ wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_id), VPLAN_TXQ_MAPENA_TX_ENA_M); @@ -682,12 +1478,11 @@ static void ice_ena_vf_mappings(struct ice_vf *vf) */ reg = (((vsi->txq_map[0] << VPLAN_TX_QBASE_VFFIRSTQ_S) & VPLAN_TX_QBASE_VFFIRSTQ_M) | - (((vsi->alloc_txq - 1) << VPLAN_TX_QBASE_VFNUMQ_S) & + (((max_txq - 1) << VPLAN_TX_QBASE_VFNUMQ_S) & VPLAN_TX_QBASE_VFNUMQ_M)); wr32(hw, VPLAN_TX_QBASE(vf->vf_id), reg); } else { - dev_err(&pf->pdev->dev, - "Scattered mode for VF Tx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Tx queues is not yet implemented\n"); } /* set regardless of mapping mode */ @@ -701,13 +1496,38 @@ static void ice_ena_vf_mappings(struct ice_vf *vf) */ reg = (((vsi->rxq_map[0] << VPLAN_RX_QBASE_VFFIRSTQ_S) & VPLAN_RX_QBASE_VFFIRSTQ_M) | - (((vsi->alloc_txq - 1) << VPLAN_RX_QBASE_VFNUMQ_S) & + (((max_rxq - 1) << VPLAN_RX_QBASE_VFNUMQ_S) & VPLAN_RX_QBASE_VFNUMQ_M)); wr32(hw, VPLAN_RX_QBASE(vf->vf_id), reg); } else { - dev_err(&pf->pdev->dev, - "Scattered mode for VF Rx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Rx queues is not yet implemented\n"); + } +} + +/** + * ice_ena_vf_mappings - enable VF MSIX and queue mapping + * @vf: pointer to the VF structure + */ +static void ice_ena_vf_mappings(struct ice_vf *vf) +{ + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + u16 max_txq, max_rxq; + + ice_ena_vf_msix_mappings(vf); + + if (ice_is_vf_adq_ena(vf)) { + u16 offset, num_qps; + + offset = vf->ch[vf->num_tc - 1].offset; + num_qps = vf->ch[vf->num_tc - 1].num_qps; + max_txq = offset + num_qps; + max_rxq = offset + num_qps; + } else { + max_txq = vsi->alloc_txq; + max_rxq = vsi->alloc_rxq; } + + ice_ena_vf_q_mappings(vf, max_txq, max_rxq); } /** @@ -753,19 +1573,26 @@ ice_determine_res(struct ice_pf *pf, u16 avail_res, u16 max_res, u16 min_res) * ice_calc_vf_reg_idx - Calculate the VF's register index in the PF space * @vf: VF to calculate the register index for * @q_vector: a q_vector associated to the VF + * @tc: Traffic class number for VF ADQ */ -int ice_calc_vf_reg_idx(struct ice_vf *vf, struct ice_q_vector *q_vector) +int ice_calc_vf_reg_idx(struct ice_vf *vf, struct ice_q_vector *q_vector, + u8 __maybe_unused tc) { struct ice_pf *pf; + u32 reg_idx; if (!vf || !q_vector) return -EINVAL; pf = vf->pf; - /* always add one to account for the OICR being the first MSIX */ - return pf->sriov_base_vector + pf->num_vf_msix * vf->vf_id + - q_vector->v_idx + 1; + reg_idx = pf->sriov_base_vector + pf->num_msix_per_vf * vf->vf_id + + q_vector->v_idx + 1; + + if (tc && ice_is_vf_adq_ena(vf)) + return reg_idx + vf->ch[tc].offset; + else + return reg_idx; } /** @@ -797,260 +1624,551 @@ static int ice_get_max_valid_res_idx(struct ice_res_tracker *res) * @num_msix_needed: number of MSIX vectors needed for all SR-IOV VFs * * This function allows SR-IOV resources to be taken from 
the end of the PF's - * allowed HW MSIX vectors so in many cases the irq_tracker will not - * be needed. In these cases we just set the pf->sriov_base_vector and return - * success. + * allowed HW MSIX vectors so that the irq_tracker will not be affected. We + * just set the pf->sriov_base_vector and return success. * - * If SR-IOV needs to use any pf->irq_tracker entries it updates the - * irq_tracker->end based on the first entry needed for SR-IOV. This makes it - * so any calls to ice_get_res() using the irq_tracker will not try to use - * resources at or beyond the newly set value. + * If there are not enough resources available, return an error. This should + * always be caught by ice_set_per_vf_res(). * - * Return 0 on success, and -EINVAL when there are not enough MSIX vectors in + * Return 0 on success, and -EINVAL when there are not enough MSIX vectors * in the PF's space available for SR-IOV. */ static int ice_sriov_set_msix_res(struct ice_pf *pf, u16 num_msix_needed) { - int max_valid_res_idx = ice_get_max_valid_res_idx(pf->irq_tracker); - u16 pf_total_msix_vectors = - pf->hw.func_caps.common_cap.num_msix_vectors; - struct ice_res_tracker *res = pf->irq_tracker; + u16 total_vectors = pf->hw.func_caps.common_cap.num_msix_vectors; + int vectors_used = pf->irq_tracker->num_entries; int sriov_base_vector; - if (max_valid_res_idx < 0) - return max_valid_res_idx; - - sriov_base_vector = pf_total_msix_vectors - num_msix_needed; + sriov_base_vector = total_vectors - num_msix_needed; /* make sure we only grab irq_tracker entries from the list end and * that we have enough available MSIX vectors */ - if (sriov_base_vector <= max_valid_res_idx) + if (sriov_base_vector < vectors_used) return -EINVAL; pf->sriov_base_vector = sriov_base_vector; - /* dip into irq_tracker entries and update used resources */ - if (num_msix_needed > (pf_total_msix_vectors - res->num_entries)) { - pf->num_avail_sw_msix -= - res->num_entries - pf->sriov_base_vector; - res->end = pf->sriov_base_vector; - } - return 0; } /** - * ice_check_avail_res - check if vectors and queues are available + * ice_set_per_vf_res - check if vectors and queues are available * @pf: pointer to the PF structure * - * This function is where we calculate actual number of resources for VF VSIs, - * we don't reserve ahead of time during probe. Returns success if vectors and - * queues resources are available, otherwise returns error code + * First, determine HW interrupts from common pool. If we allocate fewer VFs, we + * get more vectors and can enable more queues per VF. Note that this does not + * grab any vectors from the SW pool already allocated. Also note, that all + * vector counts include one for each VF's miscellaneous interrupt vector + * (i.e. OICR). + * + * Minimum VFs - 2 vectors, 1 queue pair + * Small VFs - 5 vectors, 4 queue pairs + * Medium VFs - 17 vectors, 16 queue pairs + * + * While more vectors can be assigned to a VF, the RSS LUT + * is only 4 bits wide, so we can only do 16 queues of RSS + * per VF. + * + * ADQ sizes: + * Small ADQ VFs - 5 vectors, 4 TCs, 16 queue pairs (4 queue pairs/int) + * Medium ADQ VFs - 17 vectors, 4 TCs, 16 queue pairs (1 queue pairs/int) + * + * Second, determine number of queue pairs per VF by starting with a pre-defined + * maximum each VF supports. If this is not possible, then we adjust based on + * queue pairs available on the device. + * + * Lastly, set queue and MSI-X VF variables tracked by the PF so it can be used + * by each VF during VF initialization and reset. 
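+ *
+ * Reduced to a standalone sketch, the vector sizing below just picks the
+ * largest predefined tier that fits the per-VF share (tier values here are
+ * illustrative, not the ICE_NUM_VF_MSIX_* macros):
+ *
+ *	static u16 pick_vf_msix(u16 msix_avail_per_vf)
+ *	{
+ *		static const u16 tiers[] = { 65, 33, 17, 5, 3, 2 };
+ *		unsigned int i;
+ *
+ *		for (i = 0; i < ARRAY_SIZE(tiers); i++)
+ *			if (msix_avail_per_vf >= tiers[i])
+ *				return tiers[i];
+ *
+ *		return 0;
+ *	}
+ *
+ * where returning 0 corresponds to the -EIO path below.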
*/ -static int ice_check_avail_res(struct ice_pf *pf) +static int ice_set_per_vf_res(struct ice_pf *pf) { int max_valid_res_idx = ice_get_max_valid_res_idx(pf->irq_tracker); - u16 num_msix, num_txq, num_rxq, num_avail_msix; + int msix_avail_per_vf, msix_avail_for_sriov; + struct device *dev = ice_pf_to_dev(pf); + u16 num_msix_per_vf, num_txq, num_rxq; if (!pf->num_alloc_vfs || max_valid_res_idx < 0) return -EINVAL; - /* add 1 to max_valid_res_idx to account for it being 0-based */ - num_avail_msix = pf->hw.func_caps.common_cap.num_msix_vectors - - (max_valid_res_idx + 1); - - /* Grab from HW interrupts common pool - * Note: By the time the user decides it needs more vectors in a VF - * its already too late since one must decide this prior to creating the - * VF interface. So the best we can do is take a guess as to what the - * user might want. - * - * We have two policies for vector allocation: - * 1. if num_alloc_vfs is from 1 to 16, then we consider this as small - * number of NFV VFs used for NFV appliances, since this is a special - * case, we try to assign maximum vectors per VF (65) as much as - * possible, based on determine_resources algorithm. - * 2. if num_alloc_vfs is from 17 to 256, then its large number of - * regular VFs which are not used for any special purpose. Hence try to - * grab default interrupt vectors (5 as supported by AVF driver). - */ - if (pf->num_alloc_vfs <= 16) { - num_msix = ice_determine_res(pf, num_avail_msix, - ICE_MAX_INTR_PER_VF, - ICE_MIN_INTR_PER_VF); - } else if (pf->num_alloc_vfs <= ICE_MAX_VF_COUNT) { - num_msix = ice_determine_res(pf, num_avail_msix, - ICE_DFLT_INTR_PER_VF, - ICE_MIN_INTR_PER_VF); + /* determine MSI-X resources per VF */ + msix_avail_for_sriov = pf->hw.func_caps.common_cap.num_msix_vectors - + pf->irq_tracker->num_entries; + msix_avail_per_vf = msix_avail_for_sriov / pf->num_alloc_vfs; + if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_MAX) { + num_msix_per_vf = ICE_NUM_VF_MSIX_MAX; + } else if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_LARGE) { + num_msix_per_vf = ICE_NUM_VF_MSIX_LARGE; + } else if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_MED) { + num_msix_per_vf = ICE_NUM_VF_MSIX_MED; + } else if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_SMALL) { + num_msix_per_vf = ICE_NUM_VF_MSIX_SMALL; + } else if (msix_avail_per_vf >= ICE_NUM_VF_MSIX_MULTIQ_MIN) { + num_msix_per_vf = ICE_NUM_VF_MSIX_MULTIQ_MIN; + } else if (msix_avail_per_vf >= ICE_MIN_INTR_PER_VF) { + num_msix_per_vf = ICE_MIN_INTR_PER_VF; } else { - dev_err(&pf->pdev->dev, - "Number of VFs %d exceeds max VF count %d\n", - pf->num_alloc_vfs, ICE_MAX_VF_COUNT); + dev_err(dev, "Only %d MSI-X interrupts available for SR-IOV. Not enough to support minimum of %d MSI-X interrupts per VF for %d VFs\n", + msix_avail_for_sriov, ICE_MIN_INTR_PER_VF, + pf->num_alloc_vfs); return -EIO; } - if (!num_msix) - return -EIO; - - /* Grab from the common pool - * start by requesting Default queues (4 as supported by AVF driver), - * Note that, the main difference between queues and vectors is, latter - * can only be reserved at init time but queues can be requested by VF - * at runtime through Virtchnl, that is the reason we start by reserving - * few queues. 
- */
+	/* determine queue resources per VF */
 	num_txq = ice_determine_res(pf, ice_get_avail_txq_count(pf),
-				    ICE_DFLT_QS_PER_VF, ICE_MIN_QS_PER_VF);
+				    min_t(u16,
+					  num_msix_per_vf - ICE_NONQ_VECS_VF,
+					  ICE_MAX_DFLT_QS_PER_VF),
+				    ICE_MIN_QS_PER_VF);
 	num_rxq = ice_determine_res(pf, ice_get_avail_rxq_count(pf),
-				    ICE_DFLT_QS_PER_VF, ICE_MIN_QS_PER_VF);
-
-	if (!num_txq || !num_rxq)
+				    min_t(u16,
+					  num_msix_per_vf - ICE_NONQ_VECS_VF,
+					  ICE_MAX_DFLT_QS_PER_VF),
+				    ICE_MIN_QS_PER_VF);
+
+	if (!num_txq || !num_rxq) {
+		dev_err(dev, "Not enough queues to support minimum of %d queue pairs per VF for %d VFs\n",
+			ICE_MIN_QS_PER_VF, pf->num_alloc_vfs);
 		return -EIO;
+	}
 
-	if (ice_sriov_set_msix_res(pf, num_msix * pf->num_alloc_vfs))
+	if (ice_sriov_set_msix_res(pf, num_msix_per_vf * pf->num_alloc_vfs)) {
+		dev_err(dev, "Unable to set MSI-X resources for %d VFs\n",
+			pf->num_alloc_vfs);
 		return -EINVAL;
+	}
 
-	/* since AVF driver works with only queue pairs which means, it expects
-	 * to have equal number of Rx and Tx queues, so take the minimum of
-	 * available Tx or Rx queues
-	 */
-	pf->num_vf_qps = min_t(int, num_txq, num_rxq);
-	pf->num_vf_msix = num_msix;
+	/* only allow equal Tx/Rx queue count (i.e. queue pairs) */
+	pf->num_qps_per_vf = min_t(int, num_txq, num_rxq);
+	pf->num_msix_per_vf = num_msix_per_vf;
+	dev_info(dev, "Enabling %d VFs with %d vectors and %d queues per VF\n",
+		 pf->num_alloc_vfs, pf->num_msix_per_vf, pf->num_qps_per_vf);
 
 	return 0;
 }
 
 /**
- * ice_cleanup_and_realloc_vf - Clean up VF and reallocate resources after reset
- * @vf: pointer to the VF structure
- *
- * Cleanup a VF after the hardware reset is finished. Expects the caller to
- * have verified whether the reset is finished properly, and ensure the
- * minimum amount of wait time has passed. Reallocate VF resources back to make
- * VF state active
+ * ice_clear_vf_reset_trigger - enable VF to access hardware
+ * @vf: VF to enable hardware access for
  */
-static void ice_cleanup_and_realloc_vf(struct ice_vf *vf)
+static void ice_clear_vf_reset_trigger(struct ice_vf *vf)
 {
-	struct ice_pf *pf = vf->pf;
-	struct ice_hw *hw;
+	struct ice_hw *hw = &vf->pf->hw;
 	u32 reg;
 
-	hw = &pf->hw;
-
-	/* PF software completes the flow by notifying VF that reset flow is
-	 * completed. This is done by enabling hardware by clearing the reset
-	 * bit in the VPGEN_VFRTRIG reg and setting VFR_STATE in the VFGEN_RSTAT
-	 * register to VFR completed (done at the end of this function)
-	 * By doing this we allow HW to access VF memory at any point. If we
-	 * did it any sooner, HW could access memory while it was being freed
-	 * in ice_free_vf_res(), causing an IOMMU fault.
-	 *
-	 * On the other hand, this needs to be done ASAP, because the VF driver
-	 * is waiting for this to happen and may report a timeout. It's
-	 * harmless, but it gets logged into Guest OS kernel log, so best avoid
-	 * it.
- */ reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_id)); reg &= ~VPGEN_VFRTRIG_VFSWR_M; wr32(hw, VPGEN_VFRTRIG(vf->vf_id), reg); + ice_flush(hw); +} - /* reallocate VF resources to finish resetting the VSI state */ - if (!ice_alloc_vf_res(vf)) { - ice_ena_vf_mappings(vf); - set_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); - clear_bit(ICE_VF_STATE_DIS, vf->vf_states); - vf->num_vlan = 0; +static int ice_vf_set_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m) +{ + struct ice_hw *hw = &vsi->back->hw; + u8 lport = vsi->port_info->lport; + enum ice_status status; + + if (ice_vf_is_port_vlan_ena(vf)) + status = ice_fltr_set_vsi_promisc(hw, vsi->idx, promisc_m, + ice_vf_get_port_vlan_id(vf), + lport); + else if (ice_vsi_has_non_zero_vlans(vsi)) + status = ice_fltr_set_vlan_vsi_promisc(hw, vsi, promisc_m); + else + status = ice_fltr_set_vsi_promisc(hw, vsi->idx, promisc_m, 0, lport); + + if (status && status != ICE_ERR_ALREADY_EXISTS) { + dev_err(ice_pf_to_dev(vsi->back), "enable Tx/Rx filter promiscuous mode on VF-%u failed, error: %s\n", + vf->vf_id, ice_stat_str(status)); + return ice_status_to_errno(status); } - /* Tell the VF driver the reset is done. This needs to be done only - * after VF has been fully initialized, because the VF driver may - * request resources immediately after setting this flag. - */ - wr32(hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); + return 0; } -/** - * ice_vf_set_vsi_promisc - set given VF VSI to given promiscuous mode(s) - * @vf: pointer to the VF info - * @vsi: the VSI being configured - * @promisc_m: mask of promiscuous config bits - * @rm_promisc: promisc flag request from the VF to remove or add filter - * - * This function configures VF VSI promiscuous mode, based on the VF requests, - * for Unicast, Multicast and VLAN - */ -static enum ice_status -ice_vf_set_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m, - bool rm_promisc) +static int ice_vf_clear_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m) { - struct ice_pf *pf = vf->pf; - enum ice_status status = 0; - struct ice_hw *hw; + struct ice_hw *hw = &vsi->back->hw; + u8 lport = vsi->port_info->lport; + enum ice_status status; - hw = &pf->hw; - if (vf->num_vlan) { - status = ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_m, - rm_promisc); - } else if (vf->port_vlan_id) { - if (rm_promisc) - status = ice_clear_vsi_promisc(hw, vsi->idx, promisc_m, - vf->port_vlan_id); - else - status = ice_set_vsi_promisc(hw, vsi->idx, promisc_m, - vf->port_vlan_id); - } else { - if (rm_promisc) - status = ice_clear_vsi_promisc(hw, vsi->idx, promisc_m, - 0); - else - status = ice_set_vsi_promisc(hw, vsi->idx, promisc_m, - 0); + if (ice_vf_is_port_vlan_ena(vf)) + status = ice_fltr_clear_vsi_promisc(hw, vsi->idx, promisc_m, + ice_vf_get_port_vlan_id(vf), + lport); + else if (ice_vsi_has_non_zero_vlans(vsi)) + status = ice_fltr_clear_vlan_vsi_promisc(hw, vsi, promisc_m); + else + status = ice_fltr_clear_vsi_promisc(hw, vsi->idx, promisc_m, 0, lport); + + if (status && status != ICE_ERR_DOES_NOT_EXIST) { + dev_err(ice_pf_to_dev(vsi->back), "disable Tx/Rx filter promiscuous mode on VF-%u failed, error: %s\n", + vf->vf_id, ice_stat_str(status)); + return ice_status_to_errno(status); } - return status; + return 0; } -/** - * ice_config_res_vfs - Finalize allocation of VFs resources in one go - * @pf: pointer to the PF structure +static void ice_vf_clear_counters(struct ice_vf *vf) +{ + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + + vf->num_mac = 0; + vsi->num_vlan = 0; + 
memset(&vf->mdd_tx_events, 0, sizeof(vf->mdd_tx_events));
+	memset(&vf->mdd_rx_events, 0, sizeof(vf->mdd_rx_events));
+}
+
+/**
+ * ice_vf_pre_vsi_rebuild - tasks to be done prior to VSI rebuild
+ * @vf: VF to perform pre VSI rebuild tasks
  *
- * This function is being called as last part of resetting all VFs, or when
- * configuring VFs for the first time, where there is no resource to be freed
- * Returns true if resources were properly allocated for all VFs, and false
- * otherwise.
+ * These tasks are items that don't need to be amortized since they are most
+ * likely called in a for loop with all VF(s) in the reset_all_vfs() case.
  */
-static bool ice_config_res_vfs(struct ice_pf *pf)
+static void ice_vf_pre_vsi_rebuild(struct ice_vf *vf)
 {
-	struct ice_hw *hw = &pf->hw;
-	int v;
+	/* Remove switch rules associated with the reset VF */
+	ice_rm_dcf_sw_vsi_rule(vf->pf, vf->lan_vsi_num);
 
-	if (ice_check_avail_res(pf)) {
-		dev_err(&pf->pdev->dev,
-			"Cannot allocate VF resources, try with fewer number of VFs\n");
-		return false;
+	if (ice_is_vf_dcf(vf)) {
+		if (vf->pf->hw.dcf_caps & DCF_ACL_CAP)
+			ice_acl_destroy_tbl(&vf->pf->hw);
+		ice_clear_dcf_udp_tunnel_cfg(vf->pf);
 	}
 
-	/* rearm global interrupts */
-	if (test_and_clear_bit(__ICE_OICR_INTR_DIS, pf->state))
-		ice_irq_dynamic_ena(hw, NULL, NULL);
+	ice_vf_clear_counters(vf);
+	ice_clear_vf_reset_trigger(vf);
+}
+
+/**
+ * ice_vf_rebuild_aggregator_node_cfg - rebuild aggregator node config
+ * @vsi: Pointer to VSI
+ *
+ * This function moves VSI into corresponding scheduler aggregator node
+ * based on cached value of "aggregator node info" per VSI
+ */
+static void ice_vf_rebuild_aggregator_node_cfg(struct ice_vsi *vsi)
+{
+	struct ice_pf *pf = vsi->back;
+	enum ice_status status;
+	struct device *dev;
 
-	/* Finish resetting each VF and allocate resources */
-	for (v = 0; v < pf->num_alloc_vfs; v++) {
-		struct ice_vf *vf = &pf->vf[v];
+	if (!vsi->agg_node)
+		return;
 
-		vf->num_vf_qs = pf->num_vf_qps;
-		dev_dbg(&pf->pdev->dev,
-			"VF-id %d has %d queues configured\n",
-			vf->vf_id, vf->num_vf_qs);
-		ice_cleanup_and_realloc_vf(vf);
+	dev = ice_pf_to_dev(pf);
+	if (vsi->agg_node->num_vsis == ICE_MAX_VSIS_IN_AGG_NODE) {
+		dev_dbg(dev,
+			"agg_id %u already has reached max_num_vsis %u\n",
+			vsi->agg_node->agg_id, vsi->agg_node->num_vsis);
+		return;
 	}
 
-	ice_flush(hw);
-	clear_bit(__ICE_VF_DIS, pf->state);
+	status = ice_move_vsi_to_agg(pf->hw.port_info, vsi->agg_node->agg_id,
+				     vsi->idx, (u8)vsi->tc_cfg.ena_tc);
+	if (status)
+		dev_dbg(dev, "unable to move VSI idx %u into aggregator %u node",
+			vsi->idx, vsi->agg_node->agg_id);
+	else
+		vsi->agg_node->num_vsis++;
+}
 
-	return true;
+/**
+ * ice_vf_rebuild_host_cfg - host admin configuration is persistent across reset
+ * @vf: VF to rebuild host configuration on
+ */
+static void ice_vf_rebuild_host_cfg(struct ice_vf *vf)
+{
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	struct ice_vsi *vsi = ice_get_vf_vsi(vf);
+
+	ice_vf_set_host_trust_cfg(vf);
+
+	if (ice_vf_rebuild_host_mac_cfg(vf))
+		dev_err(dev, "failed to rebuild default MAC configuration for VF %d\n",
+			vf->vf_id);
+
+	if (ice_vf_rebuild_dcf_vlan_cfg(vf, vsi))
+		dev_err(dev, "failed to rebuild DCF VLAN configuration for VF %u\n",
+			vf->vf_id);
+
+	if (ice_vf_rebuild_host_vlan_cfg(vf, vsi))
+		dev_err(dev, "failed to rebuild VLAN configuration for VF %u\n",
+			vf->vf_id);
+
+	if (ice_vf_rebuild_host_tx_rate_cfg(vf))
+		dev_err(dev, "failed to rebuild Tx rate limiting configuration for VF %u\n",
+			vf->vf_id);
+
+	if (ice_vf_set_spoofchk_cfg(vf,
vsi))
+		dev_err(dev, "failed to rebuild spoofchk configuration for VF %d\n",
+			vf->vf_id);
+
+	/* rebuild aggregator node config for main VF VSI */
+	ice_vf_rebuild_aggregator_node_cfg(vsi);
+}
+
+/**
+ * ice_vf_rebuild_adq_aggregator_node - move ADQ VSIs into aggregator node
+ * @vf: VF to rebuild ADQ VSI(s) Tx rate configuration on
+ *
+ * If VF ADQ is enabled, replay scheduler aggregator node config
+ */
+static void ice_vf_rebuild_adq_aggregator_node(struct ice_vf *vf)
+{
+	int tc;
+
+	if (!ice_is_vf_adq_ena(vf))
+		return;
+
+	for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) {
+		struct ice_vsi *vsi;
+
+		if (!ice_vf_adq_vsi_valid(vf, tc))
+			continue;
+		vsi = ice_get_vf_adq_vsi(vf, tc);
+		ice_vf_rebuild_aggregator_node_cfg(vsi);
+	}
+}
+
+/**
+ * ice_vf_rebuild_adq_tx_rate_cfg - rebuild ADQ VSI(s) Tx rate configuration
+ * @vf: VF to rebuild ADQ VSI(s) Tx rate configuration on
+ */
+static void ice_vf_rebuild_adq_tx_rate_cfg(struct ice_vf *vf)
+{
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	struct ice_vsi *vsi;
+	u64 max_tx_rate;
+	u8 tc;
+
+	if (!ice_is_vf_adq_ena(vf))
+		return;
+
+	/* Host may have set Tx rate for VF, but use the TC0's specified
+	 * max Tx rate for main VF VSI.
+	 * Iterate through all VSIs (hence the for loop starts with zero)
+	 * shared by given VF and set the BW limit if specified as part of
+	 * VF ADQ TC config
+	 */
+	for (tc = 0; tc < vf->num_tc; tc++) {
+		if (!ice_vf_adq_vsi_valid(vf, tc))
+			continue;
+
+		max_tx_rate = vf->ch[tc].max_tx_rate;
+		if (!max_tx_rate)
+			continue;
+
+		if (!tc && vf->max_tx_rate)
+			dev_dbg(dev, "Host managed VF rate limit %u for VF %d is being changed to %llu\n",
+				vf->max_tx_rate, vf->vf_id, max_tx_rate);
+
+		vsi = ice_get_vf_adq_vsi(vf, tc);
+		if (ice_set_max_bw_limit(vsi, max_tx_rate * 1000))
+			dev_err(dev, "Unable to set Tx rate %llu in Mbps for VF %u TC %d\n",
+				max_tx_rate, vf->vf_id, tc);
+	}
+}
+
+/**
+ * ice_vf_rebuild_adq_host_cfg - host admin config is persistent across reset
+ * @vf: VF to rebuild ADQ host configuration on
+ */
+static void ice_vf_rebuild_adq_host_cfg(struct ice_vf *vf)
+{
+	struct device *dev = ice_pf_to_dev(vf->pf);
+
+	ice_vf_rebuild_adq_aggregator_node(vf);
+	ice_vf_rebuild_adq_tx_rate_cfg(vf);
+	if (ice_vf_rebuild_adq_port_vlan_cfg(vf))
+		dev_err(dev, "failed to rebuild port VLAN configuration for ADQ enabled VF %u\n",
+			vf->vf_id);
+	if (ice_vf_rebuild_adq_spoofchk_cfg(vf))
+		dev_err(dev, "failed to rebuild spoofchk configuration for ADQ enabled VF %u\n",
+			vf->vf_id);
+}
+
+/**
+ * ice_vf_rebuild_adq_vsi_with_release - release and setup each ADQ VSI
+ * @vf: VF to re-apply ADQ configuration for
+ *
+ * This is only called when a single VF is being reset (i.e. VFR, VFLR, host VF
+ * configuration change, etc.).
+ *
+ * This cannot be called for the reset all VFs case as ice_vf_adq_vsi_release()
+ * will fail because there are no VF VSI(s) in firmware at this point.
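+ *
+ * Stripped of the driver specifics, the body below is the usual
+ * release-then-recreate loop with rollback; a sketch with hypothetical
+ * vsi_valid()/release_vsi()/setup_vsi() helpers:
+ *
+ *	static int recreate_tc_vsis(u8 start, u8 num_tc)
+ *	{
+ *		u8 tc;
+ *
+ *		for (tc = start; tc < num_tc; tc++) {
+ *			if (vsi_valid(tc))
+ *				release_vsi(tc);
+ *			if (!setup_vsi(tc))
+ *				goto unwind;
+ *		}
+ *		return 0;
+ *	unwind:
+ *		while (tc-- > start)
+ *			release_vsi(tc);
+ *		return -ENOMEM;
+ *	}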
+ */
+static int ice_vf_rebuild_adq_vsi_with_release(struct ice_vf *vf)
+{
+	u8 tc;
+
+	if (!ice_is_vf_adq_ena(vf))
+		return 0;
+
+	for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) {
+		if (ice_vf_adq_vsi_valid(vf, tc)) {
+			ice_vf_adq_vsi_stop_rings(vf, tc);
+			ice_vf_adq_vsi_disable_txqs(vf, tc);
+			ice_vf_adq_vsi_release(vf, tc);
+		}
+
+		if (!ice_vf_adq_vsi_setup(vf, tc)) {
+			dev_err(ice_pf_to_dev(vf->pf), "failed to setup ADQ VSI for VF %u, TC %d, disabling VF ADQ VSI\n",
+				vf->vf_id, tc);
+			goto adq_cfg_failed;
+		}
+	}
+
+	/* must store away TC0's info because it is used later */
+	vf->ch[0].vsi_idx = vf->lan_vsi_idx;
+	vf->ch[0].vsi_num = vf->lan_vsi_num;
+
+	return 0;
+
+adq_cfg_failed:
+	/* perform VSI release for ADQ VSI if some of them were
+	 * created successfully.
+	 */
+	for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) {
+		if (ice_vf_adq_vsi_valid(vf, tc)) {
+			ice_vf_adq_vsi_disable_txqs(vf, tc);
+			ice_vf_adq_vsi_release(vf, tc);
+		}
+		ice_vf_adq_cfg_cleanup(vf, tc);
+	}
+	vf->adq_enabled = false;
+	vf->num_tc = 0;
+	/* Upon failure also clean up tc=0 specific info from
+	 * software data structs, to avoid having stale info
+	 */
+	ice_vf_adq_invalidate_vsi(vf, 0);
+	ice_vf_adq_cfg_cleanup(vf, 0);
+	return -ENOMEM;
+}
+
+/**
+ * ice_vf_rebuild_adq_vsi - rebuild ADQ VSI(s) on the VF
+ * @vf: VF to rebuild ADQ VSI(s) on
+ */
+static int ice_vf_rebuild_adq_vsi(struct ice_vf *vf)
+{
+	struct ice_pf *pf = vf->pf;
+	int tc;
+
+	/* no ADQ configured, nothing to do */
+	if (!ice_is_vf_adq_ena(vf))
+		return 0;
+
+	for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) {
+		struct ice_vsi *vsi;
+		int ret;
+
+		if (!ice_vf_adq_vsi_valid(vf, tc))
+			continue;
+
+		vsi = ice_get_vf_adq_vsi(vf, tc);
+		ret = ice_vsi_rebuild(vsi, true);
+		if (ret) {
+			dev_err(ice_pf_to_dev(pf), "failed to rebuild ADQ VSI for VF %u, disabling VF ADQ VSI\n",
+				vf->vf_id);
+			vf->adq_enabled = false;
+			ice_vf_adq_invalidate_vsi(vf, tc);
+			return ret;
+		}
+
+		vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx);
+		vf->ch[tc].vsi_num = vsi->vsi_num;
+		vf->ch[tc].vsi_idx = vsi->idx;
+	}
+
+	/* must store away TC0's info because it is used later */
+	vf->ch[0].vsi_idx = vf->lan_vsi_idx;
+	vf->ch[0].vsi_num = vf->lan_vsi_num;
+
+	return 0;
+}
+
+/**
+ * ice_vf_rebuild_vsi_with_release - release and setup the VF's VSI
+ * @vf: VF to release and setup the VSI for
+ *
+ * This is only called when a single VF is being reset (i.e. VFR, VFLR, host VF
+ * configuration change, etc.).
+ */
+static int ice_vf_rebuild_vsi_with_release(struct ice_vf *vf)
+{
+	ice_vf_vsi_release(vf);
+	if (!ice_vf_vsi_setup(vf))
+		return -ENOMEM;
+
+	ice_vf_rebuild_adq_vsi_with_release(vf);
+
+	return 0;
+}
+
+/**
+ * ice_vf_rebuild_vsi - rebuild the VF's VSI
+ * @vf: VF to rebuild the VSI for
+ *
+ * This is only called when all VF(s) are being reset (i.e. PCIe Reset on the
+ * host, PFR, CORER, etc.).
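+ *
+ * In contrast to the release path above, this rebuild keeps the software
+ * slot stable: vsi->idx (and hence vf->lan_vsi_idx) is unchanged, and only
+ * the hardware VSI number is refreshed, i.e.
+ *
+ *	vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx);
+ *	vf->lan_vsi_num = vsi->vsi_num;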
+ */
+static int ice_vf_rebuild_vsi(struct ice_vf *vf)
+{
+	struct ice_vsi *vsi = ice_get_vf_vsi(vf);
+	struct ice_pf *pf = vf->pf;
+
+	if (ice_vsi_rebuild(vsi, true)) {
+		dev_err(ice_pf_to_dev(pf), "failed to rebuild VF %d VSI\n",
+			vf->vf_id);
+		return -EIO;
+	}
+	/* vsi->idx will remain the same in this case so don't update
+	 * vf->lan_vsi_idx
+	 */
+	vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx);
+	vf->lan_vsi_num = vsi->vsi_num;
+
+	if (ice_vf_rebuild_adq_vsi(vf)) {
+		dev_err(ice_pf_to_dev(pf), "failed to rebuild ADQ configuration for VF %d\n",
+			vf->vf_id);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_vf_set_initialized - VF is ready for VIRTCHNL communication
+ * @vf: VF to set in initialized state
+ *
+ * After this function the VF will be ready to receive/handle the
+ * VIRTCHNL_OP_GET_VF_RESOURCES message
+ */
+static void ice_vf_set_initialized(struct ice_vf *vf)
+{
+	ice_set_vf_state_qs_dis(vf);
+	clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states);
+	clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states);
+	clear_bit(ICE_VF_STATE_DIS, vf->vf_states);
+	set_bit(ICE_VF_STATE_INIT, vf->vf_states);
+	memset(&vf->vlan_v2_caps, 0, sizeof(vf->vlan_v2_caps));
+}
+
+/**
+ * ice_vf_post_vsi_rebuild - tasks to do after the VF's VSI has been rebuilt
+ * @vf: VF to perform tasks on
+ */
+static void ice_vf_post_vsi_rebuild(struct ice_vf *vf)
+{
+	ice_vf_rebuild_host_cfg(vf);
+	ice_vf_rebuild_adq_host_cfg(vf);
+	ice_vf_set_initialized(vf);
+	ice_ena_vf_mappings(vf);
+	wr32(&vf->pf->hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE);
+}
 
 /**
@@ -1067,6 +2185,7 @@ static bool ice_config_res_vfs(struct ice_pf *pf)
  */
 bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 {
+	struct device *dev = ice_pf_to_dev(pf);
 	struct ice_hw *hw = &pf->hw;
 	struct ice_vf *vf;
 	int v, i;
@@ -1075,25 +2194,23 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 	if (!pf->num_alloc_vfs)
 		return false;
 
+	/* clear all malicious info if the VFs are getting reset */
+	ice_for_each_vf(pf, i)
+		if (ice_mbx_clear_malvf(&hw->mbx_snapshot, pf->malvfs, ICE_MAX_VF_COUNT, i))
+			dev_dbg(dev, "failed to clear malicious VF state for VF %u\n", i);
+
 	/* If VFs have been disabled, there is no need to reset */
-	if (test_and_set_bit(__ICE_VF_DIS, pf->state))
+	if (test_and_set_bit(ICE_VF_DIS, pf->state))
 		return false;
 
+	ice_clear_dcf_acl_cfg(pf);
+	ice_clear_dcf_udp_tunnel_cfg(pf);
+	pf->hw.dcf_caps &= ~(DCF_ACL_CAP | DCF_UDP_TUNNEL_CAP);
+
 	/* Begin reset on all VFs at once */
-	for (v = 0; v < pf->num_alloc_vfs; v++)
+	ice_for_each_vf(pf, v)
 		ice_trigger_vf_reset(&pf->vf[v], is_vflr, true);
 
-	for (v = 0; v < pf->num_alloc_vfs; v++) {
-		struct ice_vsi *vsi;
-
-		vf = &pf->vf[v];
-		vsi = pf->vsi[vf->lan_vsi_idx];
-		if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states))
-			ice_dis_vf_qs(vf);
-		ice_dis_vsi_txq(vsi->port_info, vsi->idx, 0, 0, NULL, NULL,
-				NULL, ICE_VF_RESET, vf->vf_id, NULL);
-	}
-
 	/* HW requires some time to make sure it can flush the FIFO for a VF
 	 * when it resets it. Poll the VPGEN_VFRSTAT register for each VF in
 	 * sequence to make sure that it has completed. We'll keep track of
@@ -1101,7 +2218,6 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 	 * finished resetting.
 	 */
 	for (i = 0, v = 0; i < 10 && v < pf->num_alloc_vfs; i++) {
-		/* Check each VF in sequence */
 		while (v < pf->num_alloc_vfs) {
 			u32 reg;
 
@@ -1121,69 +2237,168 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 		}
 	}
+
 	/* Display a warning if at least one VF didn't manage to reset in
 	 * time, but continue on with the operation.
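+	 *
+	 * In isolation the two-level loop above is a bounded poll across all
+	 * VFs; a standalone sketch with the register read stubbed out by a
+	 * hypothetical poll_done():
+	 *
+	 *	static int first_unfinished_vf(int num_vfs)
+	 *	{
+	 *		int i, v = 0;
+	 *
+	 *		for (i = 0; i < 10 && v < num_vfs; i++)
+	 *			while (v < num_vfs && poll_done(v))
+	 *				v++;
+	 *
+	 *		return v;
+	 *	}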
 	 */
 	if (v < pf->num_alloc_vfs)
-		dev_warn(&pf->pdev->dev, "VF reset check timeout\n");
+		dev_warn(dev, "VF reset check timeout\n");
+
 	/* free VF resources to begin resetting the VSI state */
-	for (v = 0; v < pf->num_alloc_vfs; v++) {
+	ice_for_each_vf(pf, v) {
 		vf = &pf->vf[v];
 
-		ice_free_vf_res(vf);
+		vf->driver_caps = 0;
+		ice_vc_set_default_allowlist(vf);
+
+#ifdef HAVE_TC_SETUP_CLSFLOWER
+		/* always release VF ADQ filters since those filters will be
+		 * replayed by VF driver. This is needed to avoid stale
+		 * filters in software internal data structures
+		 */
+		ice_del_all_adv_switch_fltr(vf);
+#endif
 
-		/* Free VF queues as well, and reallocate later.
-		 * If a given VF has different number of queues
-		 * configured, the request for update will come
-		 * via mailbox communication.
+		ice_vf_fdir_exit(vf);
+		ice_vf_fdir_init(vf);
+		/* clean VF control VSI when resetting VFs since it should be
+		 * setup only when iAVF creates its first FDIR rule.
 		 */
-		vf->num_vf_qs = 0;
+		if (vf->ctrl_vsi_idx != ICE_NO_VSI)
+			ice_vf_ctrl_invalidate_vsi(vf);
+
+		ice_vf_pre_vsi_rebuild(vf);
+		ice_vf_rebuild_vsi(vf);
+		ice_vf_post_vsi_rebuild(vf);
 	}
 
-	if (ice_sriov_free_msix_res(pf))
-		dev_err(&pf->pdev->dev,
-			"Failed to free MSIX resources used by SR-IOV\n");
+	if (ice_is_eswitch_mode_switchdev(pf))
+		if (ice_eswitch_rebuild(pf))
+			dev_warn(dev, "eswitch rebuild failed\n");
 
-	if (!ice_config_res_vfs(pf))
-		return false;
+	ice_flush(hw);
+	clear_bit(ICE_VF_DIS, pf->state);
 
 	return true;
 }
 
+/**
+ * ice_is_vf_disabled - check if the VF or its PF is disabled
+ * @vf: pointer to the VF info
+ *
+ * Returns true if the PF or VF is disabled, false otherwise.
+ */
+static bool ice_is_vf_disabled(struct ice_vf *vf)
+{
+	struct ice_pf *pf = vf->pf;
+
+	/* If the PF has been disabled, there is no need to reset the VF until
+	 * the PF is active again. Similarly, if the VF has been disabled, this
+	 * means something else is resetting the VF, so we shouldn't continue.
+	 * Otherwise, set disable VF state bit for actual reset, and continue.
+	 */
+	return (test_bit(ICE_VF_DIS, pf->state) ||
+		test_bit(ICE_VF_STATE_DIS, vf->vf_states));
+}
+
+/**
+ * ice_vf_get_glint_ceqctl_idx - get the GLINT_CEQCTL index relative to the PF
+ * @vf: VF used to get the index
+ * @ceq_idx: 0-based index from the VF
+ *
+ * Use the VF relative (0-based) CEQ index plus the first PF MSI-X index
+ * assigned to this VF (relative to the PF's MSIX space) to determine the index
+ * of the GLINT_CEQCTL register
+ */
+static u16 ice_vf_get_glint_ceqctl_idx(struct ice_vf *vf, u16 ceq_idx)
+{
+	return vf->first_vector_idx + ceq_idx;
+}
+
+/**
+ * ice_vf_clear_ceq_irq_map - clear the CEQ IRQ mapping
+ * @vf: VF used to clear the mapping
+ * @ceq_idx: VF relative (0-based) CEQ index
+ */
+static void ice_vf_clear_ceq_irq_map(struct ice_vf *vf, u16 ceq_idx)
+{
+	u16 glint_ceqctl_idx = ice_vf_get_glint_ceqctl_idx(vf, ceq_idx);
+
+	wr32(&vf->pf->hw, GLINT_CEQCTL(glint_ceqctl_idx), 0);
+}
+
+/**
+ * ice_vf_clear_aeq_irq_map - clear the AEQ IRQ mapping
+ * @vf: VF used to clear the mapping
+ */
+static void ice_vf_clear_aeq_irq_map(struct ice_vf *vf)
+{
+	wr32(&vf->pf->hw, VPINT_AEQCTL(vf->vf_id), 0);
+}
+
+/**
+ * ice_vf_clear_rdma_irq_map - clear the RDMA IRQ mapping
+ * @vf: VF used to clear the mapping
+ *
+ * Clear any RDMA IRQ mapping that a VF might have requested. Since the number
+ * of CEQ indices is never greater than num_msix_per_vf, just clear all CEQ
+ * indices that are possibly associated with this VF. Also clear the AEQ for this
Doing it this way prevents the need to cache the configuration received + * on VIRTCHNL_OP_CONFIG_RMDA_IRQ_MAP since VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP is + * designed to clear the entire RDMA IRQ mapping configuration. + */ +static void ice_vf_clear_rdma_irq_map(struct ice_vf *vf) +{ + u16 i; + + for (i = 0; i < vf->pf->num_msix_per_vf; i++) + ice_vf_clear_ceq_irq_map(vf, i); + + ice_vf_clear_aeq_irq_map(vf); +} + /** * ice_reset_vf - Reset a particular VF * @vf: pointer to the VF structure * @is_vflr: true if VFLR was issued, false if not * - * Returns true if the VF is reset, false otherwise. + * Returns true if the VF is currently in reset, resets successfully, or resets + * are disabled and false otherwise. */ -static bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) +bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) { struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; + struct device *dev; struct ice_hw *hw; bool rsd = false; u8 promisc_m; u32 reg; int i; - /* If the PF has been disabled, there is no need resetting VF until - * PF is active again. - */ - if (test_bit(__ICE_VF_DIS, pf->state)) - return false; + dev = ice_pf_to_dev(pf); - /* If the VF has been disabled, this means something else is - * resetting the VF, so we shouldn't continue. Otherwise, set - * disable VF state bit for actual reset, and continue. - */ - if (test_and_set_bit(ICE_VF_STATE_DIS, vf->vf_states)) - return false; + if (test_bit(ICE_VF_RESETS_DISABLED, pf->state)) { + dev_dbg(dev, "Trying to reset VF %d, but all VF resets are disabled\n", + vf->vf_id); + return true; + } + + if (ice_is_vf_disabled(vf)) { + dev_dbg(dev, "VF is already disabled, there is no need for resetting it, telling VM, all is fine %d\n", + vf->vf_id); + return true; + } + /* Set VF disable bit state here, before triggering reset */ + set_bit(ICE_VF_STATE_DIS, vf->vf_states); ice_trigger_vf_reset(vf, is_vflr, false); - vsi = pf->vsi[vf->lan_vsi_idx]; + if (ice_dcf_get_state(pf) == ICE_DCF_STATE_ON) + ice_dcf_set_state(pf, ICE_DCF_STATE_BUSY); + + vsi = ice_get_vf_vsi(vf); if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) ice_dis_vf_qs(vf); @@ -1193,6 +2408,21 @@ static bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) */ ice_dis_vsi_txq(vsi->port_info, vsi->idx, 0, 0, NULL, NULL, NULL, ICE_VF_RESET, vf->vf_id, NULL); + /* Likewise Disable LAN Tx queues for VF ADQ VSIs */ + if (ice_is_vf_adq_ena(vf)) { + int tc; + + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + if (!ice_vf_adq_vsi_valid(vf, tc)) + continue; + ice_dis_vsi_txq(vsi->port_info, vf->ch[tc].vsi_idx, 0, + 0, NULL, NULL, NULL, ICE_VF_RESET, + vf->vf_id, NULL); + } + } + + if (vf->driver_caps & VIRTCHNL_VF_CAP_RDMA) + ice_vf_clear_rdma_irq_map(vf); hw = &pf->hw; /* poll VPGEN_VFRSTAT reg to make sure @@ -1213,34 +2443,67 @@ static bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) usleep_range(10, 20); } + vf->driver_caps = 0; + ice_vc_set_default_allowlist(vf); + /* Display a warning if VF didn't manage to reset in time, but need to * continue on with the operation. 
 	 */
 	if (!rsd)
-		dev_warn(&pf->pdev->dev, "VF reset check timeout on VF %d\n",
-			 vf->vf_id);
+		dev_warn(dev, "VF reset check timeout on VF %d\n", vf->vf_id);
 
 	/* disable promiscuous modes in case they were enabled
 	 * ignore any error if disabling process failed
 	 */
 	if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) ||
 	    test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) {
-		if (vf->port_vlan_id || vf->num_vlan)
+		if (ice_vf_is_port_vlan_ena(vf) || vsi->num_vlan)
 			promisc_m = ICE_UCAST_VLAN_PROMISC_BITS;
 		else
 			promisc_m = ICE_UCAST_PROMISC_BITS;
 
-		vsi = pf->vsi[vf->lan_vsi_idx];
-		if (ice_vf_set_vsi_promisc(vf, vsi, promisc_m, true))
-			dev_err(&pf->pdev->dev, "disabling promiscuous mode failed\n");
+		if (ice_vf_clear_vsi_promisc(vf, vsi, promisc_m))
+			dev_err(dev, "disabling promiscuous mode failed\n");
 	}
 
-	/* free VF resources to begin resetting the VSI state */
-	ice_free_vf_res(vf);
+#ifdef HAVE_TC_SETUP_CLSFLOWER
+	/* always release VF ADQ filters since those filters will be
+	 * replayed by VF driver. This is needed to avoid stale filters in
+	 * software internal data structures
+	 */
+	ice_del_all_adv_switch_fltr(vf);
+#endif
+	/* VF driver gets reloaded on VFLR, so clear ADQ configuration */
+	if (is_vflr)
+		ice_vf_adq_release(vf);
 
-	ice_cleanup_and_realloc_vf(vf);
-	ice_flush(hw);
+	ice_vf_fdir_exit(vf);
+	ice_vf_fdir_init(vf);
+	/* clean VF control VSI when resetting VF since it should be setup
+	 * only when iAVF creates its first FDIR rule.
+	 */
+	if (vf->ctrl_vsi_idx != ICE_NO_VSI)
+		ice_vf_ctrl_vsi_release(vf);
+
+	ice_vf_pre_vsi_rebuild(vf);
+
+	if (ice_vf_rebuild_vsi_with_release(vf)) {
+		dev_err(dev, "Failed to release and setup the VF%u's VSI\n", vf->vf_id);
+		return false;
+	}
+
+	ice_vf_post_vsi_rebuild(vf);
+	vsi = ice_get_vf_vsi(vf);
+	ice_eswitch_update_repr(vsi);
+
+	if (ice_dcf_get_state(pf) == ICE_DCF_STATE_BUSY)
+		ice_dcf_set_state(pf, ICE_DCF_STATE_ON);
+
+	/* if the VF has been reset allow it to come up again */
+	if (ice_mbx_clear_malvf(&hw->mbx_snapshot, pf->malvfs, ICE_MAX_VF_COUNT, vf->vf_id))
+		dev_dbg(dev, "failed to clear malicious VF state for VF %u\n", vf->vf_id);
 
 	return true;
 }
@@ -1253,7 +2516,7 @@ void ice_vc_notify_link_state(struct ice_pf *pf)
 {
 	int i;
 
-	for (i = 0; i < pf->num_alloc_vfs; i++)
+	ice_for_each_vf(pf, i)
 		ice_vc_notify_vf_link_state(&pf->vf[i]);
 }
 
@@ -1276,6 +2539,28 @@ void ice_vc_notify_reset(struct ice_pf *pf)
 			(u8 *)&pfe, sizeof(struct virtchnl_pf_event));
 }
 
+/**
+ * ice_vc_notify_dcf_vf_info - Send DCF VF information to the VF
+ * @old_dcf_vf: pointer to the previous DCF VF structure
+ * @cur_dcf_vf: pointer to the current DCF VF structure
+ */
+static void ice_vc_notify_dcf_vf_info(struct ice_vf *old_dcf_vf, struct ice_vf *cur_dcf_vf)
+{
+	struct ice_pf *pf = cur_dcf_vf->pf;
+	struct virtchnl_pf_event pfe = { 0 };
+
+	if (!old_dcf_vf || !cur_dcf_vf)
+		return;
+
+	pfe.event = VIRTCHNL_EVENT_DCF_VSI_INFO;
+	pfe.event_data.vf_vsi_map.vf_id = cur_dcf_vf->vf_id;
+	pfe.event_data.vf_vsi_map.vsi_id = cur_dcf_vf->lan_vsi_num;
+
+	ice_aq_send_msg_to_vf(&pf->hw, old_dcf_vf->vf_id, VIRTCHNL_OP_EVENT,
+			      VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, sizeof(pfe), NULL);
+}
+
 /**
  * ice_vc_notify_vf_reset - Notify VF of a reset event
  * @vf: pointer to the VF structure
@@ -1283,13 +2568,17 @@ void ice_vc_notify_reset(struct ice_pf *pf)
 static void ice_vc_notify_vf_reset(struct ice_vf *vf)
 {
 	struct virtchnl_pf_event pfe;
+	struct ice_pf *pf;
 
-	/* validate the request */
-	if (!vf || vf->vf_id >= vf->pf->num_alloc_vfs)
+	if (!vf)
 		return;
 
-	/* Bail out if VF is
in disabled state, neither initialized, nor active - * state - otherwise proceed with notifications + pf = vf->pf; + if (ice_validate_vf_id(pf, vf->vf_id)) + return; + + /* Bail out if VF is in disabled state, neither initialized, nor active + * state - otherwise proceed with notifications */ if ((!test_bit(ICE_VF_STATE_INIT, vf->vf_states) && !test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) || @@ -1298,173 +2587,351 @@ static void ice_vc_notify_vf_reset(struct ice_vf *vf) pfe.event = VIRTCHNL_EVENT_RESET_IMPENDING; pfe.severity = PF_EVENT_SEVERITY_CERTAIN_DOOM; - ice_aq_send_msg_to_vf(&vf->pf->hw, vf->vf_id, VIRTCHNL_OP_EVENT, + ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, VIRTCHNL_OP_EVENT, VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, sizeof(pfe), NULL); } + /** - * ice_alloc_vfs - Allocate and set up VFs resources - * @pf: pointer to the PF structure - * @num_alloc_vfs: number of VFs to allocate + * ice_init_vf_vsi_res - initialize/setup VF VSI resources + * @vf: VF to initialize/setup the VSI for + * + * This function creates a VSI for the VF, adds a VLAN 0 filter, and sets up the + * VF VSI's broadcast filter and is only used during initial VF creation. + */ +static int ice_init_vf_vsi_res(struct ice_vf *vf) +{ + struct ice_vsi_vlan_ops *vlan_ops; + struct ice_pf *pf = vf->pf; + u8 broadcast[ETH_ALEN]; + enum ice_status status; + struct ice_vsi *vsi; + struct device *dev; + int err; + + vf->first_vector_idx = ice_calc_vf_first_vector_idx(pf, vf); + + dev = ice_pf_to_dev(pf); + vsi = ice_vf_vsi_setup(vf); + if (!vsi) + return -ENOMEM; + + err = ice_vsi_add_vlan_zero(vsi); + if (err) { + dev_warn(dev, "Failed to add VLAN 0 filter for VF %d\n", + vf->vf_id); + goto release_vsi; + } + + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + err = vlan_ops->ena_rx_filtering(vsi); + if (err) { + dev_warn(dev, "Failed to enable Rx VLAN filtering for VF %d\n", + vf->vf_id); + goto release_vsi; + } + + eth_broadcast_addr(broadcast); + status = ice_fltr_add_mac(vsi, broadcast, ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "Failed to add broadcast MAC filter for VF %d, status %s\n", + vf->vf_id, ice_stat_str(status)); + err = ice_status_to_errno(status); + goto release_vsi; + } + + err = ice_vf_set_spoofchk_cfg(vf, vsi); + if (err) { + dev_warn(dev, "Failed to initialize spoofchk setting for VF %d\n", + vf->vf_id); + goto release_vsi; + } + + + vf->num_mac = 1; + + return 0; + +release_vsi: + ice_vf_vsi_release(vf); + return err; +} + +/** + * ice_start_vfs - start VFs so they are ready to be used by SR-IOV + * @pf: PF the VFs are associated with */ -static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) +static int ice_start_vfs(struct ice_pf *pf) { struct ice_hw *hw = &pf->hw; + int retval, i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + ice_clear_vf_reset_trigger(vf); + + retval = ice_init_vf_vsi_res(vf); + if (retval) { + dev_err(ice_pf_to_dev(pf), "Failed to initialize VSI resources for VF %d, error %d\n", + vf->vf_id, retval); + goto teardown; + } + + set_bit(ICE_VF_STATE_INIT, vf->vf_states); + ice_ena_vf_mappings(vf); + wr32(hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); + } + + ice_flush(hw); + return 0; + +teardown: + for (i = i - 1; i >= 0; i--) { + struct ice_vf *vf = &pf->vf[i]; + + ice_dis_vf_mappings(vf); + ice_vf_vsi_release(vf); + } + + return retval; +} + +static void +ice_vf_hash_ctx_init(struct ice_vf *vf) +{ + memset(&vf->hash_ctx, 0, sizeof(vf->hash_ctx)); +} + +/** + * ice_set_dflt_settings_vfs - set VF defaults during initialization/creation + * @pf: PF 
holding reference to all VFs for default configuration + */ +static void ice_set_dflt_settings_vfs(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + vf->vf_sw_id = pf->first_sw; + vf->pf = pf; + vf->vf_id = i; + /* assign default capabilities */ + set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vf->vf_caps); + vf->spoofchk = true; + vf->num_vf_qs = pf->num_qps_per_vf; + ice_vc_set_default_allowlist(vf); + + /* ctrl_vsi_idx will be set to a valid value only when iAVF + * creates its first fdir rule. + */ + ice_vf_ctrl_invalidate_vsi(vf); + ice_vf_fdir_init(vf); + + ice_vf_hash_ctx_init(vf); + + ice_vc_set_dflt_vf_ops(&vf->vc_ops); + } +} + +/** + * ice_alloc_vfs - allocate num_vfs in the PF structure + * @pf: PF to store the allocated VFs in + * @num_vfs: number of VFs to allocate + */ +static int ice_alloc_vfs(struct ice_pf *pf, int num_vfs) +{ struct ice_vf *vfs; - int i, ret; + + vfs = devm_kcalloc(ice_pf_to_dev(pf), num_vfs, sizeof(*vfs), + GFP_KERNEL); + if (!vfs) + return -ENOMEM; + + pf->vf = vfs; + pf->num_alloc_vfs = num_vfs; + + return 0; +} + +/** + * ice_ena_vfs - enable VFs so they are ready to be used + * @pf: pointer to the PF structure + * @num_vfs: number of VFs to enable + */ +static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + int ret; /* Disable global interrupt 0 so we don't try to handle the VFLR. */ wr32(hw, GLINT_DYN_CTL(pf->oicr_idx), ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S); - set_bit(__ICE_OICR_INTR_DIS, pf->state); + set_bit(ICE_OICR_INTR_DIS, pf->state); ice_flush(hw); - ret = pci_enable_sriov(pf->pdev, num_alloc_vfs); + ret = pci_enable_sriov(pf->pdev, num_vfs); if (ret) { pf->num_alloc_vfs = 0; goto err_unroll_intr; } - /* allocate memory */ - vfs = devm_kcalloc(&pf->pdev->dev, num_alloc_vfs, sizeof(*vfs), - GFP_KERNEL); - if (!vfs) { - ret = -ENOMEM; + + ret = ice_alloc_vfs(pf, num_vfs); + if (ret) goto err_pci_disable_sriov; - } - pf->vf = vfs; - /* apply default profile */ - for (i = 0; i < num_alloc_vfs; i++) { - vfs[i].pf = pf; - vfs[i].vf_sw_id = pf->first_sw; - vfs[i].vf_id = i; + ice_dcf_init_sw_rule_mgmt(pf); - /* assign default capabilities */ - set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vfs[i].vf_caps); - vfs[i].spoofchk = true; + if (ice_set_per_vf_res(pf)) { + dev_err(dev, "Not enough resources for %d VFs, try with fewer number of VFs\n", + num_vfs); + ret = -ENOSPC; + goto err_unroll_sriov; } - pf->num_alloc_vfs = num_alloc_vfs; - /* VF resources get allocated with initialization */ - if (!ice_config_res_vfs(pf)) { - ret = -EIO; + ice_set_dflt_settings_vfs(pf); + + if (ice_start_vfs(pf)) { + dev_err(dev, "Failed to start VF(s)\n"); + ret = -EAGAIN; goto err_unroll_sriov; } - return ret; + clear_bit(ICE_VF_DIS, pf->state); + + if (ice_eswitch_configure(pf)) + goto err_unroll_sriov; + + /* rearm global interrupts */ + if (test_and_clear_bit(ICE_OICR_INTR_DIS, pf->state)) + ice_irq_dynamic_ena(hw, NULL, NULL); + + return 0; err_unroll_sriov: + devm_kfree(dev, pf->vf); pf->vf = NULL; - devm_kfree(&pf->pdev->dev, vfs); - vfs = NULL; pf->num_alloc_vfs = 0; err_pci_disable_sriov: pci_disable_sriov(pf->pdev); err_unroll_intr: /* rearm interrupts here */ ice_irq_dynamic_ena(hw, NULL, NULL); - clear_bit(__ICE_OICR_INTR_DIS, pf->state); + clear_bit(ICE_OICR_INTR_DIS, pf->state); return ret; } -/** - * ice_pf_state_is_nominal - checks the PF for nominal state - * @pf: pointer to PF to check - * - * Check the PF's state for a collection of bits that would 
indicate
- * the PF is in a state that would inhibit normal operation for
- * driver functionality.
- *
- * Returns true if PF is in a nominal state.
- * Returns false otherwise
- */
-static bool ice_pf_state_is_nominal(struct ice_pf *pf)
-{
-	DECLARE_BITMAP(check_bits, __ICE_STATE_NBITS) = { 0 };
-
-	if (!pf)
-		return false;
-
-	bitmap_set(check_bits, 0, __ICE_STATE_NOMINAL_CHECK_BITS);
-	if (bitmap_intersects(pf->state, check_bits, __ICE_STATE_NBITS))
-		return false;
-
-	return true;
-}
-
 /**
  * ice_pci_sriov_ena - Enable or change number of VFs
  * @pf: pointer to the PF structure
  * @num_vfs: number of VFs to allocate
+ *
+ * Returns 0 on success and negative on failure
  */
 static int ice_pci_sriov_ena(struct ice_pf *pf, int num_vfs)
 {
 	int pre_existing_vfs = pci_num_vf(pf->pdev);
-	struct device *dev = &pf->pdev->dev;
+	struct device *dev = ice_pf_to_dev(pf);
 	int err;
 
-	if (!ice_pf_state_is_nominal(pf)) {
-		dev_err(dev, "Cannot enable SR-IOV, device not ready\n");
-		return -EBUSY;
-	}
-
-	if (!test_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags)) {
-		dev_err(dev, "This device is not capable of SR-IOV\n");
-		return -ENODEV;
-	}
-
 	if (pre_existing_vfs && pre_existing_vfs != num_vfs)
 		ice_free_vfs(pf);
 	else if (pre_existing_vfs && pre_existing_vfs == num_vfs)
-		return num_vfs;
+		return 0;
 
 	if (num_vfs > pf->num_vfs_supported) {
 		dev_err(dev, "Can't enable %d VFs, max VFs supported is %d\n",
 			num_vfs, pf->num_vfs_supported);
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 	}
 
-	dev_info(dev, "Allocating %d VFs\n", num_vfs);
-	err = ice_alloc_vfs(pf, num_vfs);
+	dev_info(dev, "Enabling %d VFs\n", num_vfs);
+	err = ice_ena_vfs(pf, num_vfs);
 	if (err) {
 		dev_err(dev, "Failed to enable SR-IOV: %d\n", err);
 		return err;
 	}
 
 	set_bit(ICE_FLAG_SRIOV_ENA, pf->flags);
-	return num_vfs;
+	return 0;
+}
+
+/**
+ * ice_check_sriov_allowed - check if SR-IOV is allowed based on various checks
+ * @pf: PF to enable SR-IOV on
+ */
+static int ice_check_sriov_allowed(struct ice_pf *pf)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+
+	if (!test_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags)) {
+		dev_err(dev, "This device is not capable of SR-IOV\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (test_bit(ICE_RECOVERY_MODE, pf->state)) {
+		dev_err(dev, "SR-IOV cannot be configured - Device is in Recovery Mode\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (ice_is_safe_mode(pf)) {
+		dev_err(dev, "SR-IOV cannot be configured - Device is in Safe Mode\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (!ice_pf_state_is_nominal(pf)) {
+		dev_err(dev, "Cannot enable SR-IOV, device not ready\n");
+		return -EBUSY;
+	}
+
+	return 0;
 }
 
 /**
  * ice_sriov_configure - Enable or change number of VFs via sysfs
  * @pdev: pointer to a pci_dev structure
- * @num_vfs: number of VFs to allocate
+ * @num_vfs: number of VFs to allocate or 0 to free VFs
  *
- * This function is called when the user updates the number of VFs in sysfs.
+ * This function is called when the user updates the number of VFs in sysfs. On
+ * success return whatever num_vfs was set to by the caller. Return negative on
+ * failure.
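+ *
+ * A hypothetical sketch of that contract (free_all_vfs() and enable_vfs()
+ * are stand-ins, not driver functions):
+ *
+ *	static int sriov_configure_contract(struct pci_dev *pdev, int num_vfs)
+ *	{
+ *		if (!num_vfs) {
+ *			free_all_vfs(pdev);
+ *			return 0;
+ *		}
+ *		if (enable_vfs(pdev, num_vfs))
+ *			return -EIO;
+ *		return num_vfs;
+ *	}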
*/ int ice_sriov_configure(struct pci_dev *pdev, int num_vfs) { struct ice_pf *pf = pci_get_drvdata(pdev); + struct device *dev = ice_pf_to_dev(pf); + enum ice_status status; + int err; - if (ice_is_safe_mode(pf)) { - dev_err(&pf->pdev->dev, - "SR-IOV cannot be configured - Device is in Safe Mode\n"); - return -EOPNOTSUPP; - } + err = ice_check_sriov_allowed(pf); + if (err) + return err; - if (num_vfs) - return ice_pci_sriov_ena(pf, num_vfs); + if (!num_vfs) { + if (!pci_vfs_assigned(pdev)) { + ice_mbx_deinit_snapshot(&pf->hw); + ice_free_vfs(pf); + return 0; + } - if (!pci_vfs_assigned(pdev)) { - ice_free_vfs(pf); - } else { - dev_err(&pf->pdev->dev, - "can't free VFs because some are assigned to VMs.\n"); + dev_err(dev, "can't free VFs because some are assigned to VMs.\n"); return -EBUSY; } - return 0; + status = ice_mbx_init_snapshot(&pf->hw, num_vfs); + if (status) + return ice_status_to_errno(status); + + err = ice_pci_sriov_ena(pf, num_vfs); + if (err) { + ice_mbx_deinit_snapshot(&pf->hw); + return err; + } + + return num_vfs; } /** @@ -1477,14 +2944,14 @@ int ice_sriov_configure(struct pci_dev *pdev, int num_vfs) void ice_process_vflr_event(struct ice_pf *pf) { struct ice_hw *hw = &pf->hw; - int vf_id; + unsigned int vf_id; u32 reg; - if (!test_and_clear_bit(__ICE_VFLR_EVENT_PENDING, pf->state) || + if (!test_and_clear_bit(ICE_VFLR_EVENT_PENDING, pf->state) || !pf->num_alloc_vfs) return; - for (vf_id = 0; vf_id < pf->num_alloc_vfs; vf_id++) { + ice_for_each_vf(pf, vf_id) { struct ice_vf *vf = &pf->vf[vf_id]; u32 reg_idx, bit_idx; @@ -1499,17 +2966,81 @@ void ice_process_vflr_event(struct ice_pf *pf) } /** - * ice_vc_dis_vf - Disable a given VF via SW reset + * ice_vc_reset_vf - Perform software reset on the VF after informing the AVF * @vf: pointer to the VF info - * - * Disable the VF through a SW reset */ -static void ice_vc_dis_vf(struct ice_vf *vf) +static void ice_vc_reset_vf(struct ice_vf *vf) { ice_vc_notify_vf_reset(vf); ice_reset_vf(vf, false); } +/** + * ice_get_vf_from_pfq - get the VF who owns the PF space queue passed in + * @pf: PF used to index all VFs + * @pfq: queue index relative to the PF's function space + * + * If no VF is found who owns the pfq then return NULL, otherwise return a + * pointer to the VF who owns the pfq + */ +static struct ice_vf *ice_get_vf_from_pfq(struct ice_pf *pf, u16 pfq) +{ + unsigned int vf_id; + + ice_for_each_vf(pf, vf_id) { + struct ice_vf *vf = &pf->vf[vf_id]; + struct ice_vsi *vsi; + u16 rxq_idx; + + vsi = ice_get_vf_vsi(vf); + + ice_for_each_rxq(vsi, rxq_idx) + if (vsi->rxq_map[rxq_idx] == pfq) + return vf; + } + + return NULL; +} + +/** + * ice_globalq_to_pfq - convert from global queue index to PF space queue index + * @pf: PF used for conversion + * @globalq: global queue index used to convert to PF space queue index + */ +static u32 ice_globalq_to_pfq(struct ice_pf *pf, u32 globalq) +{ + return globalq - pf->hw.func_caps.common_cap.rxq_first_id; +} + +/** + * ice_vf_lan_overflow_event - handle LAN overflow event for a VF + * @pf: PF that the LAN overflow event happened on + * @event: structure holding the event information for the LAN overflow event + * + * Determine if the LAN overflow event was caused by a VF queue. If it was not + * caused by a VF, do nothing. If a VF caused this LAN overflow event trigger a + * reset on the offending VF. 
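+ *
+ * Once the global queue number has been rebased into the PF's queue space,
+ * finding the owner is a linear scan of each VF's Rx queue map; a minimal
+ * sketch over plain arrays standing in for the VSI state:
+ *
+ *	static int owner_of_pfq(const u16 rxq_map[][16], int num_vfs,
+ *				int num_rxq, u16 pfq)
+ *	{
+ *		int vf, q;
+ *
+ *		for (vf = 0; vf < num_vfs; vf++)
+ *			for (q = 0; q < num_rxq; q++)
+ *				if (rxq_map[vf][q] == pfq)
+ *					return vf;
+ *
+ *		return -1;
+ *	}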
+ */ +void +ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event) +{ + u32 gldcb_rtctq, queue; + struct ice_vf *vf; + + gldcb_rtctq = le32_to_cpu(event->desc.params.lan_overflow.prtdcb_ruptq); + dev_dbg(ice_pf_to_dev(pf), "GLDCB_RTCTQ: 0x%08x\n", gldcb_rtctq); + + /* event returns device global Rx queue number */ + queue = (gldcb_rtctq & GLDCB_RTCTQ_RXQNUM_M) >> + GLDCB_RTCTQ_RXQNUM_S; + + vf = ice_get_vf_from_pfq(pf, ice_globalq_to_pfq(pf, queue)); + if (!vf) + return; + + ice_vc_reset_vf(vf); +} + /** * ice_vc_send_msg_to_vf - Send message to VF * @vf: pointer to the VF info @@ -1520,29 +3051,32 @@ static void ice_vc_dis_vf(struct ice_vf *vf) * * send msg to VF */ -static int +int ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, enum virtchnl_status_code v_retval, u8 *msg, u16 msglen) { enum ice_status aq_ret; + struct device *dev; struct ice_pf *pf; - /* validate the request */ - if (!vf || vf->vf_id >= vf->pf->num_alloc_vfs) + if (!vf) return -EINVAL; pf = vf->pf; + if (ice_validate_vf_id(pf, vf->vf_id)) + return -EINVAL; + + dev = ice_pf_to_dev(pf); /* single place to detect unsuccessful return values */ if (v_retval) { vf->num_inval_msgs++; - dev_info(&pf->pdev->dev, "VF %d failed opcode %d, retval: %d\n", - vf->vf_id, v_opcode, v_retval); + dev_info(dev, "VF %d failed opcode %d, retval: %d\n", vf->vf_id, + v_opcode, v_retval); if (vf->num_inval_msgs > ICE_DFLT_NUM_INVAL_MSGS_ALLOWED) { - dev_err(&pf->pdev->dev, - "Number of invalid messages exceeded for VF %d\n", + dev_err(dev, "Number of invalid messages exceeded for VF %d\n", vf->vf_id); - dev_err(&pf->pdev->dev, "Use PF Control I/F to enable the VF\n"); + dev_err(dev, "Use PF Control I/F to enable the VF\n"); set_bit(ICE_VF_STATE_DIS, vf->vf_states); return -EIO; } @@ -1555,9 +3089,9 @@ ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, aq_ret = ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, v_opcode, v_retval, msg, msglen, NULL); if (aq_ret && pf->hw.mailboxq.sq_last_status != ICE_AQ_RC_ENOSYS) { - dev_info(&pf->pdev->dev, - "Unable to send the message to VF %d ret %d aq_err %d\n", - vf->vf_id, aq_ret, pf->hw.mailboxq.sq_last_status); + dev_info(dev, "Unable to send the message to VF %d ret %s aq_err %s\n", + vf->vf_id, ice_stat_str(aq_ret), + ice_aq_str(pf->hw.mailboxq.sq_last_status)); return -EIO; } @@ -1587,6 +3121,28 @@ static int ice_vc_get_ver_msg(struct ice_vf *vf, u8 *msg) sizeof(struct virtchnl_version_info)); } +/** + * ice_vc_get_max_frame_size - get max frame size allowed for VF + * @vf: VF used to determine max frame size + * + * Max frame size is determined based on the current port's max frame size and + * whether a port VLAN is configured on this VF. The VF is not aware whether + * it's in a port VLAN so the PF needs to account for this in max frame size + * checks and sending the max frame size to the VF. 
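+ *
+ * Worked example with illustrative numbers: if the port reports a
+ * max_frame_size of 9728 bytes and the VF sits in a port VLAN, the VF is
+ * told 9728 - VLAN_HLEN = 9724 so the PF-inserted tag still fits on the
+ * wire:
+ *
+ *	static u16 vf_max_frame(u16 port_max, bool in_port_vlan)
+ *	{
+ *		return in_port_vlan ? port_max - VLAN_HLEN : port_max;
+ *	}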
+ */ +static u16 ice_vc_get_max_frame_size(struct ice_vf *vf) +{ + struct ice_port_info *pi = ice_vf_get_port_info(vf); + u16 max_frame_size; + + max_frame_size = pi->phy.link_info.max_frame_size; + + if (ice_vf_is_port_vlan_ena(vf)) + max_frame_size -= VLAN_HLEN; + + return max_frame_size; +} + /** * ice_vc_get_vf_res_msg * @vf: pointer to the VF info @@ -1603,14 +3159,14 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) int len = 0; int ret; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { + if (ice_check_vf_init(pf, vf)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto err; } len = sizeof(struct virtchnl_vf_resource); - vfres = devm_kzalloc(&pf->pdev->dev, len, GFP_KERNEL); + vfres = kzalloc(len, GFP_KERNEL); if (!vfres) { v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; len = 0; @@ -1624,14 +3180,39 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) VIRTCHNL_VF_OFFLOAD_VLAN; vfres->vf_cap_flags = VIRTCHNL_VF_OFFLOAD_L2; - vsi = pf->vsi[vf->lan_vsi_idx]; + vsi = ice_get_vf_vsi(vf); if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto err; } - if (!vsi->info.pvid) - vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_VLAN_V2) { + /* VLAN offloads based on current device configuration */ + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN_V2; + } else if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_VLAN) { + /* allow VF to negotiate VIRTCHNL_VF_OFFLOAD explicitly for + * these two conditions, which amounts to guest VLAN filtering + * and offloads being based on the inner VLAN or the + * inner/single VLAN respectively and don't allow VF to + * negotiate VIRTCHNL_VF_OFFLOAD in any other cases + */ + if (ice_is_dvm_ena(&pf->hw) && ice_vf_is_port_vlan_ena(vf)) { + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; + } else if (!ice_is_dvm_ena(&pf->hw) && + !ice_vf_is_port_vlan_ena(vf)) { + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; + /* configure backward compatible support for VFs that + * only support VIRTCHNL_VF_OFFLOAD_VLAN, the PF is + * configured in SVM, and no port VLAN is configured + */ + ice_vf_vsi_cfg_svm_legacy_vlan_mode(vsi); + } else if (ice_is_dvm_ena(&pf->hw)) { + /* configure software offloaded VLAN support when DVM + * is enabled, but no port VLAN is enabled + */ + ice_vf_vsi_cfg_dvm_legacy_vlan_mode(vsi); + } + } if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PF) { vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PF; @@ -1642,6 +3223,12 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_REG; } + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC; + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_FDIR_PF; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2; @@ -1660,21 +3247,84 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_REQ_QUEUES) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_REQ_QUEUES; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_CRC) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_CRC; + if (vf->driver_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) vfres->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF; +#ifdef __TC_MQPRIO_MODE_MAX + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ && !ice_is_eswitch_mode_switchdev(pf)) + 
vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADQ; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2 && !ice_is_eswitch_mode_switchdev(pf)) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADQ_V2; +#endif /* __TC_MQPRIO_MODE_MAX */ + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_USO) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_USO; + + if (vf->driver_caps & VIRTCHNL_VF_LARGE_NUM_QPAIRS) + vfres->vf_cap_flags |= VIRTCHNL_VF_LARGE_NUM_QPAIRS; + + /* Negotiate DCF capability. */ + if (vf->driver_caps & VIRTCHNL_VF_CAP_DCF) { + if (!ice_is_vf_dcf(vf)) { + if (!ice_check_dcf_allowed(vf)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + if (!ice_is_vf_dcf(vf)) + ice_vc_notify_dcf_vf_info(pf->dcf.vf, vf); + pf->dcf.vf = vf; + dev_info(ice_pf_to_dev(pf), "Grant request for DCF functionality to VF%d\n", + vf->vf_id); + if (!ice_is_tunnel_empty(&pf->hw)) { + dev_info(ice_pf_to_dev(pf), "Failed to grant UDP tunnel capability to VF%d as UDP tunnel rules already exist\n", + vf->vf_id); + pf->hw.dcf_caps &= ~DCF_UDP_TUNNEL_CAP; + } + } + + vfres->vf_cap_flags |= VIRTCHNL_VF_CAP_DCF; + ice_dcf_set_state(pf, ICE_DCF_STATE_ON); + } else if (ice_is_vf_dcf(vf) && + ice_dcf_get_state(pf) != ICE_DCF_STATE_OFF) { + /* If a designated DCF requests AVF functionality from the + * same VF without the DCF gracefully relinquishing the DCF + * functionality first, remove ALL switch filters that were + * added by the DCF. + */ + dev_info(ice_pf_to_dev(pf), "DCF is not in the OFF state, removing all filters that were added by the DCF\n"); + ice_rm_all_dcf_sw_rules(pf); + ice_clear_dcf_acl_cfg(pf); + ice_clear_dcf_udp_tunnel_cfg(pf); + pf->hw.dcf_caps &= ~(DCF_ACL_CAP | DCF_UDP_TUNNEL_CAP); + ice_dcf_set_state(pf, ICE_DCF_STATE_OFF); + pf->dcf.vf = NULL; + ice_reset_vf(vf, false); + } + vfres->num_vsis = 1; /* Tx and Rx queue are equal for VF */ vfres->num_queue_pairs = vsi->num_txq; - vfres->max_vectors = pf->num_vf_msix; + vfres->max_vectors = pf->num_msix_per_vf; vfres->rss_key_size = ICE_VSIQF_HKEY_ARRAY_SIZE; - vfres->rss_lut_size = ICE_VSIQF_HLUT_ARRAY_SIZE; + vfres->rss_lut_size = vsi->rss_table_size; + vfres->max_mtu = ice_vc_get_max_frame_size(vf); vfres->vsi_res[0].vsi_id = vf->lan_vsi_num; vfres->vsi_res[0].vsi_type = VIRTCHNL_VSI_SRIOV; vfres->vsi_res[0].num_queue_pairs = vsi->num_txq; ether_addr_copy(vfres->vsi_res[0].default_mac_addr, - vf->dflt_lan_addr.addr); + vf->hw_lan_addr.addr); + + /* match guest capabilities */ + vf->driver_caps = vfres->vf_cap_flags; + + ice_vc_set_caps_allowlist(vf); + ice_vc_set_working_allowlist(vf); set_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); @@ -1683,7 +3333,7 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_VF_RESOURCES, v_ret, (u8 *)vfres, len); - devm_kfree(&pf->pdev->dev, vfres); + kfree(vfres); return ret; } @@ -1697,7 +3347,7 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) */ static void ice_vc_reset_vf_msg(struct ice_vf *vf) { - if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) ice_reset_vf(vf, false); } @@ -1726,7 +3376,7 @@ static struct ice_vsi *ice_find_vsi_from_id(struct ice_pf *pf, u16 id) * * check for the valid VSI ID */ -static bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) +bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) { struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; @@ -1766,474 +3416,5309 @@ static bool ice_vc_isvalid_ring_len(u16 ring_len) !(ring_len % ICE_REQ_DESC_MULTIPLE)); } -/** - * 
ice_vc_config_rss_key
- * @vf: pointer to the VF info
- * @msg: pointer to the msg buffer
- *
+static enum virtchnl_status_code ice_vc_rss_hash_update(struct ice_hw *hw,
+							 struct ice_vsi *vsi,
+							 u8 hash_type)
+{
+	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi_ctx *ctx;
+	enum ice_status status;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return VIRTCHNL_STATUS_ERR_NO_MEMORY;
+
+	/* clear previous hash_type */
+	ctx->info.q_opt_rss = vsi->info.q_opt_rss &
+			      ~(ICE_AQ_VSI_Q_OPT_RSS_HASH_M);
+	/* hash_type is passed in as ICE_AQ_VSI_Q_OPT_RSS_<XOR|TPLZ|SYM_TPLZ> */
+	ctx->info.q_opt_rss |= hash_type;
+
+	/* Preserve existing queueing option setting */
+	ctx->info.q_opt_tc = vsi->info.q_opt_tc;
+	ctx->info.q_opt_flags = vsi->info.q_opt_flags;
+
+	ctx->info.valid_sections =
+		cpu_to_le16(ICE_AQ_VSI_PROP_Q_OPT_VALID);
+
+	status = ice_update_vsi(hw, vsi->idx, ctx, NULL);
+	if (status) {
+		dev_err(ice_hw_to_dev(hw),
+			"update VSI for RSS failed, err %s aq_err %s\n",
+			ice_stat_str(status),
+			ice_aq_str(hw->adminq.sq_last_status));
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+	} else {
+		vsi->info.q_opt_rss = ctx->info.q_opt_rss;
+	}
+
+	kfree(ctx);
+
+	return v_ret;
+}
+
+/**
+ * ice_pkg_name_to_type - translate DDP package name string to type
+ * @hw: pointer to the hardware
+ *
+ * This is a helper function to translate the DDP package name string
+ * to ice_pkg_type, in order to select the correct hash list.
+ */
+enum ice_pkg_type ice_pkg_name_to_type(struct ice_hw *hw)
+{
+	u16 i;
+	static const struct {
+		char name[ICE_PKG_NAME_SIZE];
+		enum ice_pkg_type pkg_type;
+	} ice_pkg_type_list[] = {
+		{"ICE OS Default Package", ICE_PKG_TYPE_OS_DEFAULT},
+		{"ICE COMMS Package", ICE_PKG_TYPE_COMMS},
+		{"ICE Wireless Edge Package", ICE_PKG_TYPE_WIRELESS_EDGE},
+		{"ICE GTP over GRE Package", ICE_PKG_TYPE_GTP_OVER_GRE},
+		{"ICE Tencent GRE Package", ICE_PKG_TYPE_OS_DEFAULT},
+	};
+
+	for (i = 0; i < ARRAY_SIZE(ice_pkg_type_list); i++) {
+		if (!strcmp(ice_pkg_type_list[i].name,
+			    (const char *)hw->active_pkg_name))
+			return ice_pkg_type_list[i].pkg_type;
+	}
+
+	return ICE_PKG_TYPE_UNKNOWN;
+}
+
+/**
+ * ice_vc_validate_pattern
+ * @vf: pointer to the VF info
+ * @proto: virtchnl protocol headers
+ *
+ * Validate that the pattern is supported.
+ *
+ * Return: true on success, false on error.
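+ *
+ * Illustrative walk-through (editorial note, hypothetical input): a pattern
+ * of VIRTCHNL_PROTO_HDR_ETH, VIRTCHNL_PROTO_HDR_IPV4 and
+ * VIRTCHNL_PROTO_HDR_UDP resolves ptype step by step to ICE_PTYPE_MAC_PAY,
+ * then ICE_PTYPE_IPV4_PAY, then ICE_PTYPE_IPV4_UDP_PAY; the final value is
+ * what ice_hw_ptype_ena() checks against the active DDP package.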
+ */ +bool +ice_vc_validate_pattern(struct ice_vf *vf, struct virtchnl_proto_hdrs *proto) +{ + bool is_l2tpv2 = false; + bool is_ipv4 = false; + bool is_ipv6 = false; + bool is_udp = false; + u16 ptype = -1; + int i = 0; + + while (i < proto->count && + proto->proto_hdr[i].type != VIRTCHNL_PROTO_HDR_NONE) { + switch (proto->proto_hdr[i].type) { + case VIRTCHNL_PROTO_HDR_ETH: + ptype = ICE_PTYPE_MAC_PAY; + break; + case VIRTCHNL_PROTO_HDR_IPV4: + ptype = ICE_PTYPE_IPV4_PAY; + is_ipv4 = true; + break; + case VIRTCHNL_PROTO_HDR_IPV6: + ptype = ICE_PTYPE_IPV6_PAY; + is_ipv6 = true; + break; + case VIRTCHNL_PROTO_HDR_UDP: + if (is_ipv4) + ptype = ICE_PTYPE_IPV4_UDP_PAY; + else if (is_ipv6) + ptype = ICE_PTYPE_IPV6_UDP_PAY; + is_udp = true; + break; + case VIRTCHNL_PROTO_HDR_TCP: + if (is_ipv4) + ptype = ICE_PTYPE_IPV4_TCP_PAY; + else if (is_ipv6) + ptype = ICE_PTYPE_IPV6_TCP_PAY; + break; + case VIRTCHNL_PROTO_HDR_SCTP: + if (is_ipv4) + ptype = ICE_PTYPE_IPV4_SCTP_PAY; + else if (is_ipv6) + ptype = ICE_PTYPE_IPV6_SCTP_PAY; + break; + case VIRTCHNL_PROTO_HDR_L2TPV2: + if (is_ipv4) + ptype = ICE_MAC_IPV4_L2TPV2; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_L2TPV2; + is_l2tpv2 = true; + break; + case VIRTCHNL_PROTO_HDR_GTPU_IP: + case VIRTCHNL_PROTO_HDR_GTPU_EH: + case VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN: + case VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP: + if (is_ipv4) + ptype = ICE_MAC_IPV4_GTPU; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_GTPU; + goto out; + case VIRTCHNL_PROTO_HDR_L2TPV3: + if (is_ipv4) + ptype = ICE_MAC_IPV4_L2TPV3; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_L2TPV3; + goto out; + case VIRTCHNL_PROTO_HDR_ESP: + if (is_ipv4) + ptype = is_udp ? ICE_MAC_IPV4_NAT_T_ESP : + ICE_MAC_IPV4_ESP; + else if (is_ipv6) + ptype = is_udp ? ICE_MAC_IPV6_NAT_T_ESP : + ICE_MAC_IPV6_ESP; + goto out; + case VIRTCHNL_PROTO_HDR_AH: + if (is_ipv4) + ptype = ICE_MAC_IPV4_AH; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_AH; + goto out; + case VIRTCHNL_PROTO_HDR_PFCP: + if (is_ipv4) + ptype = ICE_MAC_IPV4_PFCP_SESSION; + else if (is_ipv6) + ptype = ICE_MAC_IPV6_PFCP_SESSION; + goto out; + case VIRTCHNL_PROTO_HDR_ECPRI: + if (is_ipv4) + ptype = ICE_PTYPE_IPV4_UDP_PAY; + else if (is_ipv6) + ptype = ICE_PTYPE_IPV6_UDP_PAY; + goto out; + case VIRTCHNL_PROTO_HDR_PPP: + if (is_ipv4 && is_l2tpv2) + ptype = ICE_MAC_IPV4_PPPOL2TPV2; + else if (is_ipv6 && is_l2tpv2) + ptype = ICE_MAC_IPV6_PPPOL2TPV2; + goto out; + default: + break; + } + i++; + } + +out: + return ice_hw_ptype_ena(&vf->pf->hw, ptype); +} + +/** + * ice_vc_parse_rss_cfg - parses hash fields and headers from + * a specific virtchnl RSS cfg + * @hw: pointer to the hardware + * @rss_cfg: pointer to the virtchnl rss cfg + * @hash_cfg: pointer to the HW hash configuration + * + * Return true if all the protocol header and hash fields in the rss cfg could + * be parsed, else return false + * + * This function parses the virtchnl rss cfg to be the intended + * hash fields and the intended header for RSS configuration + */ +static bool ice_vc_parse_rss_cfg(struct ice_hw *hw, + struct virtchnl_rss_cfg *rss_cfg, + struct ice_rss_hash_cfg *hash_cfg) +{ + const struct ice_vc_hash_field_match_type *hf_list; + const struct ice_vc_hdr_match_type *hdr_list; + int i, hf_list_len, hdr_list_len; + bool outer_ipv4 = false; + bool outer_ipv6 = false; + bool inner_hdr = false; + + u32 *addl_hdrs = &hash_cfg->addl_hdrs; + u64 *hash_flds = &hash_cfg->hash_flds; + /* set outer layer RSS as default */ + hash_cfg->hdr_type = ICE_RSS_OUTER_HEADERS; + + hf_list = ice_vc_hash_field_list; + 
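/* Editorial note: ice_vc_hash_field_list and ice_vc_hdr_list are assumed
+	 * to be static translation tables (defined elsewhere in this file)
+	 * mapping virtchnl header/field IDs to the driver's
+	 * ICE_FLOW_SEG_HDR_* and ICE_FLOW_FIELD_IDX_* values; the loops below
+	 * do a linear search of each table for every requested header.
+	 */
+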
hf_list_len = ARRAY_SIZE(ice_vc_hash_field_list);
+	hdr_list = ice_vc_hdr_list;
+	hdr_list_len = ARRAY_SIZE(ice_vc_hdr_list);
+
+	for (i = 0; i < rss_cfg->proto_hdrs.count; i++) {
+		struct virtchnl_proto_hdr *proto_hdr =
+			&rss_cfg->proto_hdrs.proto_hdr[i];
+		u32 hdr_found = 0;
+		int j;
+
+		/* find matched ice headers according to virtchnl headers.
+		 * Also figure out the outer type of GTPU headers.
+		 */
+		for (j = 0; j < hdr_list_len; j++) {
+			struct ice_vc_hdr_match_type hdr_map =
+				hdr_list[j];
+
+			if (proto_hdr->type == hdr_map.vc_hdr)
+				hdr_found = hdr_map.ice_hdr;
+		}
+
+		if (!hdr_found)
+			return false;
+
+		if (proto_hdr->type == VIRTCHNL_PROTO_HDR_IPV4 && !inner_hdr)
+			outer_ipv4 = true;
+		else if (proto_hdr->type == VIRTCHNL_PROTO_HDR_IPV6 &&
+			 !inner_hdr)
+			outer_ipv6 = true;
+		/* for GTPU and L2TPv2, take the inner header as the input set
+		 * if no field has been selected from the outer headers.
+		 */
+		else if ((proto_hdr->type == VIRTCHNL_PROTO_HDR_L2TPV2 ||
+			  proto_hdr->type == VIRTCHNL_PROTO_HDR_GTPU_IP ||
+			  proto_hdr->type == VIRTCHNL_PROTO_HDR_GTPU_EH ||
+			  proto_hdr->type == VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN ||
+			  proto_hdr->type == VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP) &&
+			 *hash_flds == 0) {
+			/* set inner_hdr flag, and clean up outer header */
+			inner_hdr = true;
+
+			/* clear outer headers */
+			*addl_hdrs = 0;
+
+			if (outer_ipv4 && outer_ipv6)
+				return false;
+
+			if (outer_ipv4)
+				hash_cfg->hdr_type = ICE_RSS_INNER_HEADERS_W_OUTER_IPV4;
+			else if (outer_ipv6)
+				hash_cfg->hdr_type = ICE_RSS_INNER_HEADERS_W_OUTER_IPV6;
+			else
+				hash_cfg->hdr_type = ICE_RSS_INNER_HEADERS;
+		}
+
+		*addl_hdrs |= hdr_found;
+
+		/* Find matched ice hash fields according to
+		 * virtchnl hash fields.
+		 */
+		for (j = 0; j < hf_list_len; j++) {
+			struct ice_vc_hash_field_match_type hf_map =
+				hf_list[j];
+
+			if (proto_hdr->type == hf_map.vc_hdr &&
+			    proto_hdr->field_selector ==
+			    hf_map.vc_hash_field) {
+				*hash_flds |= hf_map.ice_hash_field;
+				break;
+			}
+		}
+	}
+
+	/* refine the GTPU header if we take the outer header as the input set
+	 * for a GTPU flow with no inner IP.
+	 */
+	if (hash_cfg->hdr_type == ICE_RSS_OUTER_HEADERS &&
+	    *addl_hdrs & ICE_FLOW_SEG_HDR_GTPU_IP) {
+		*addl_hdrs &= ~(ICE_FLOW_SEG_HDR_GTPU_IP);
+		*addl_hdrs |= ICE_FLOW_SEG_HDR_GTPU_NON_IP;
+	}
+
+	/* refine hash field for esp and nat-t-esp. */
+	if ((*addl_hdrs & ICE_FLOW_SEG_HDR_UDP) &&
+	    (*addl_hdrs & ICE_FLOW_SEG_HDR_ESP)) {
+		*addl_hdrs &= ~(ICE_FLOW_SEG_HDR_ESP | ICE_FLOW_SEG_HDR_UDP);
+		*addl_hdrs |= ICE_FLOW_SEG_HDR_NAT_T_ESP;
+		*hash_flds &= ~(BIT_ULL(ICE_FLOW_FIELD_IDX_ESP_SPI));
+		*hash_flds |= BIT_ULL(ICE_FLOW_FIELD_IDX_NAT_T_ESP_SPI);
+	}
+
+	/* refine hash hdrs for L4 udp/tcp/sctp.
*/ + if (*addl_hdrs & (ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | + ICE_FLOW_SEG_HDR_SCTP) && + *addl_hdrs & ICE_FLOW_SEG_HDR_IPV_OTHER) + *addl_hdrs &= ~ICE_FLOW_SEG_HDR_IPV_OTHER; + + /* refine hash field for ecpri over mac or udp */ + if ((*addl_hdrs & ICE_FLOW_SEG_HDR_ECPRI_TP0) && + (*addl_hdrs & ICE_FLOW_SEG_HDR_UDP)) { + *addl_hdrs &= ~ICE_FLOW_SEG_HDR_ECPRI_TP0; + *hash_flds &= ~(BIT_ULL(ICE_FLOW_FIELD_IDX_ECPRI_TP0_PC_ID)); + } else if (*addl_hdrs & ICE_FLOW_SEG_HDR_ECPRI_TP0) { + *addl_hdrs &= ~ICE_FLOW_SEG_HDR_UDP_ECPRI_TP0; + *hash_flds &= + ~(BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_ECPRI_TP0_PC_ID)); + } + + return true; +} + +/** + * ice_vf_adv_rss_offload_ena - determine if capabilities support advanced + * rss offloads + * @caps: VF driver negotiated capabilities + * + * Return true if VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF capability is set, + * else return false + */ +static bool ice_vf_adv_rss_offload_ena(u32 caps) +{ + return !!(caps & VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF); +} + +/** + * is_hash_cfg_valid - check if the hash context is valid + * @cfg: pointer to the RSS hash configuration + * + * This function will return true if the hash context is valid, otherwise + * return false. + */ +static bool is_hash_cfg_valid(struct ice_rss_hash_cfg *cfg) +{ + return (cfg->hash_flds != 0 && cfg->addl_hdrs != 0) ? + true : false; +} + +/** + * hash_cfg_reset - reset the hash context + * @cfg: pointer to the RSS hash configuration + * + * This function will reset the hash context which stores the valid rule info. + */ +static void hash_cfg_reset(struct ice_rss_hash_cfg *cfg) +{ + cfg->hash_flds = 0; + cfg->addl_hdrs = 0; + cfg->hdr_type = ICE_RSS_OUTER_HEADERS; + cfg->symm = 0; +} + +/** + * hash_cfg_record - record the hash context + * @ctx: pointer to the global RSS hash configuration + * @cfg: pointer to the RSS hash configuration to be recorded + * + * This function will record the hash context which stores the valid rule info. + */ +static void hash_cfg_record(struct ice_rss_hash_cfg *ctx, + struct ice_rss_hash_cfg *cfg) +{ + ctx->hash_flds = cfg->hash_flds; + ctx->addl_hdrs = cfg->addl_hdrs; + ctx->hdr_type = cfg->hdr_type; + ctx->symm = cfg->symm; +} + +/** + * ice_hash_moveout - delete a RSS configuration + * @vf: pointer to the VF info + * @cfg: pointer to the RSS hash configuration + * + * This function will delete an existing RSS hash configuration but not delete + * the hash context which stores the rule info. + */ +static int +ice_hash_moveout(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + enum ice_status status = 0; + struct ice_hw *hw = &vf->pf->hw; + + if (!is_hash_cfg_valid(cfg)) + return -ENOENT; + + status = ice_rem_rss_cfg(hw, vf->lan_vsi_idx, cfg); + if (status && status != ICE_ERR_DOES_NOT_EXIST) { + dev_err(dev, "ice_rem_rss_cfg failed for VSI:%d, error:%s\n", + vf->lan_vsi_num, ice_stat_str(status)); + return -EBUSY; + } + + return 0; +} + +/** + * ice_hash_moveback - add an RSS configuration + * @vf: pointer to the VF info + * @cfg: pointer to the RSS hash configuration + * + * The function will add a RSS hash configuration if the hash context is valid. 
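+ *
+ * Together with ice_hash_moveout() this forms the rollback mechanism:
+ * moveout removes a rule from hardware while keeping its cached context, so
+ * a later moveback can re-add the identical rule (illustrative sequence:
+ * moveout(EH ctx), add UP/DWN rule, moveback(EH ctx)).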
+ */
+static int
+ice_hash_moveback(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg)
+{
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	enum ice_status status = 0;
+	struct ice_hw *hw = &vf->pf->hw;
+
+	if (!is_hash_cfg_valid(cfg))
+		return -ENOENT;
+
+	status = ice_add_rss_cfg(hw, vf->lan_vsi_idx, cfg);
+	if (status) {
+		dev_err(dev, "ice_add_rss_cfg failed for VSI:%d, error:%s\n",
+			vf->lan_vsi_num, ice_stat_str(status));
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_hash_remove - remove an RSS configuration
+ * @vf: pointer to the VF info
+ * @cfg: pointer to the RSS hash configuration
+ *
+ * This function will delete an RSS hash configuration and also delete the
+ * hash context which stores the rule info.
+ */
+static int
+ice_hash_remove(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg)
+{
+	int ret;
+
+	ret = ice_hash_moveout(vf, cfg);
+	if (ret && (ret != -ENOENT))
+		return ret;
+
+	hash_cfg_reset(cfg);
+
+	return 0;
+}
+
+/**
+ * ice_add_rss_cfg_pre_gtpu - pre-process the GTPU RSS configuration
+ * @vf: pointer to the VF info
+ * @ctx: pointer to the context of the GTPU hash
+ * @ctx_idx: The index of the hash context
+ *
+ * This function pre-processes the GTPU hash configuration before adding a
+ * hash config; it removes or rotates prior hash configs that would otherwise
+ * conflict. For example, if a GTPU_UP/DWN rule is configured after a GTPU_EH
+ * rule, the GTPU_EH hash is hit first because the TCAM is both written and
+ * matched from top to bottom, so the GTPU_EH rule must be rolled back behind
+ * the GTPU_UP/DWN rule. On the other hand, when a GTPU_EH rule is configured
+ * after a GTPU_UP/DWN rule, the GTPU_DWN/UP rules only need to be removed.
+ */
+static int
+ice_add_rss_cfg_pre_gtpu(struct ice_vf *vf, struct ice_vf_hash_gtpu_ctx *ctx,
+			 u32 ctx_idx)
+{
+	int ret;
+
+	switch (ctx_idx) {
+	case ICE_HASH_GTPU_CTX_EH_IP:
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		break;
+	case ICE_HASH_GTPU_CTX_EH_IP_UDP:
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_remove(vf,
+				      &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveout(vf,
+				       &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveout(vf,
+				       &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveout(vf,
+				       &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveout(vf,
&ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_EH_IP_TCP: + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_UP_IP: + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_UP_IP_UDP: + case ICE_HASH_GTPU_CTX_UP_IP_TCP: + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_DW_IP: + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_remove(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + case ICE_HASH_GTPU_CTX_DW_IP_UDP: + case ICE_HASH_GTPU_CTX_DW_IP_TCP: + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]); + if (ret && (ret != -ENOENT)) + return ret; + + ret = ice_hash_moveout(vf, + &ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]); + if (ret && (ret != -ENOENT)) + return ret; + + break; + default: + break; + } + + return 0; +} + +/** + * ice_add_rss_cfg_pre_ip - pre-process the IP RSS configuration + * @vf: pointer to the VF info + * @ctx: pointer to the context of the IP L4 hash + * + * This function will remove all covered and recorded IP RSS configurations, + * including IP with ESP/UDP_ESP/AH/L2TPV3/PFCP and UDP/TCP/SCTP. 
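+ *
+ * Note (editorial): the loop below starts at index 1, which assumes that
+ * index 0 is the plain-IP context (ICE_HASH_IP_CTX_IP) being (re)added by
+ * the caller and therefore must not be removed here.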
+ */ +static int +ice_add_rss_cfg_pre_ip(struct ice_vf *vf, struct ice_vf_hash_ip_ctx *ctx) +{ + int i, ret; + + for (i = 1; i < ICE_HASH_IP_CTX_MAX; i++) + if (is_hash_cfg_valid(&ctx->ctx[i])) { + ret = ice_hash_remove(vf, &ctx->ctx[i]); + + if (ret) + return ret; + } + + return 0; +} + +/** + * calc_gtpu_ctx_idx - calculate the index of the GTPU hash context + * @hdrs: the protocol headers prefix with ICE_FLOW_SEG_HDR_XXX. + * + * The GTPU hash context use the index to classify for IPV4/IPV6 and + * GTPU_EH/GTPU_UP/GTPU_DWN, this function used to calculate the index + * by the protocol headers. + */ +static u32 calc_gtpu_ctx_idx(u32 hdrs) +{ + u32 eh_idx, ip_idx; + + if (hdrs & ICE_FLOW_SEG_HDR_GTPU_EH) + eh_idx = 0; + else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_UP) + eh_idx = 1; + else if (hdrs & ICE_FLOW_SEG_HDR_GTPU_DWN) + eh_idx = 2; + else + return ICE_HASH_GTPU_CTX_MAX; + + ip_idx = 0; + if (hdrs & ICE_FLOW_SEG_HDR_UDP) + ip_idx = 1; + else if (hdrs & ICE_FLOW_SEG_HDR_TCP) + ip_idx = 2; + + if (hdrs & (ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV6)) + return eh_idx * 3 + ip_idx; + else + return ICE_HASH_GTPU_CTX_MAX; +} + +/** + * ice_map_ip_ctx_idx - map the index of the IP L4 hash context + * @hdrs: protocol headers prefix with ICE_FLOW_SEG_HDR_XXX. + * + * The IP L4 hash context use the index to classify for IPv4/IPv6 with + * ESP/UDP_ESP/AH/L2TPV3/PFCP and non-tunnel UDP/TCP/SCTP + * this function map the index based on the protocol headers. + */ +static u8 ice_map_ip_ctx_idx(u32 hdrs) +{ + u8 i; + + static struct { + u32 hdrs; + u8 ctx_idx; + } ip_ctx_idx_map[] = { + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_ESP, + ICE_HASH_IP_CTX_IP_ESP }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_NAT_T_ESP, + ICE_HASH_IP_CTX_IP_UDP_ESP }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_AH, + ICE_HASH_IP_CTX_IP_AH }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_L2TPV3, + ICE_HASH_IP_CTX_IP_L2TPV3 }, + { ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_PFCP_SESSION, + ICE_HASH_IP_CTX_IP_PFCP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_UDP, + ICE_HASH_IP_CTX_IP_UDP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_TCP, + ICE_HASH_IP_CTX_IP_TCP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_SCTP, + ICE_HASH_IP_CTX_IP_SCTP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER, + ICE_HASH_IP_CTX_IP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_ESP, + ICE_HASH_IP_CTX_IP_ESP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_NAT_T_ESP, + ICE_HASH_IP_CTX_IP_UDP_ESP }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_AH, + ICE_HASH_IP_CTX_IP_AH }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_L2TPV3, + ICE_HASH_IP_CTX_IP_L2TPV3 }, + { ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER | + ICE_FLOW_SEG_HDR_PFCP_SESSION, + ICE_HASH_IP_CTX_IP_PFCP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_UDP, + ICE_HASH_IP_CTX_IP_UDP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | + ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_TCP, + ICE_HASH_IP_CTX_IP_TCP }, + { ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN | 
+		  ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_SCTP,
+		  ICE_HASH_IP_CTX_IP_SCTP },
+		{ ICE_FLOW_SEG_HDR_ETH | ICE_FLOW_SEG_HDR_VLAN |
+		  ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER,
+		  ICE_HASH_IP_CTX_IP },
+		/* the remaining mappings are used for default RSS */
+		{ ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_UDP,
+		  ICE_HASH_IP_CTX_IP_UDP },
+		{ ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_TCP,
+		  ICE_HASH_IP_CTX_IP_TCP },
+		{ ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_SCTP,
+		  ICE_HASH_IP_CTX_IP_SCTP },
+		{ ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV_OTHER,
+		  ICE_HASH_IP_CTX_IP },
+		{ ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_UDP,
+		  ICE_HASH_IP_CTX_IP_UDP },
+		{ ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_TCP,
+		  ICE_HASH_IP_CTX_IP_TCP },
+		{ ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_SCTP,
+		  ICE_HASH_IP_CTX_IP_SCTP },
+		{ ICE_FLOW_SEG_HDR_IPV6 | ICE_FLOW_SEG_HDR_IPV_OTHER,
+		  ICE_HASH_IP_CTX_IP },
+	};
+
+	for (i = 0; i < ARRAY_SIZE(ip_ctx_idx_map); i++) {
+		if (hdrs == ip_ctx_idx_map[i].hdrs)
+			return ip_ctx_idx_map[i].ctx_idx;
+	}
+
+	return ICE_HASH_IP_CTX_MAX;
+}
+
+static int
+ice_add_rss_cfg_pre(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg)
+{
+	u32 ice_gtpu_ctx_idx = calc_gtpu_ctx_idx(cfg->addl_hdrs);
+
+	u8 ip_ctx_idx = ice_map_ip_ctx_idx(cfg->addl_hdrs);
+
+	if (ip_ctx_idx == ICE_HASH_IP_CTX_IP) {
+		int ret = 0;
+
+		if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4)
+			ret = ice_add_rss_cfg_pre_ip(vf, &vf->hash_ctx.v4);
+		else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6)
+			ret = ice_add_rss_cfg_pre_ip(vf, &vf->hash_ctx.v6);
+
+		if (ret)
+			return ret;
+	}
+
+	if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4) {
+		return ice_add_rss_cfg_pre_gtpu(vf, &vf->hash_ctx.ipv4,
+						ice_gtpu_ctx_idx);
+	} else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6) {
+		return ice_add_rss_cfg_pre_gtpu(vf, &vf->hash_ctx.ipv6,
+						ice_gtpu_ctx_idx);
+	}
+
+	return 0;
+}
+
+/**
+ * ice_add_rss_cfg_post_gtpu - post-process the GTPU RSS configuration
+ * @vf: pointer to the VF info
+ * @ctx: pointer to the context of the GTPU hash
+ * @cfg: pointer to the rss hash configuration
+ * @ctx_idx: The index of the hash context
+ *
+ * This function post-processes the hash configuration after a hash config
+ * has been successfully added; it re-configures prior hash configs that were
+ * moved out and now need to be moved back.
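+ *
+ * Illustrative example (editorial): after an EH_IP_UDP config is added, the
+ * UP_IP, UP_IP_TCP, DW_IP and DW_IP_TCP contexts that
+ * ice_add_rss_cfg_pre_gtpu() moved out are moved back, so their rules end up
+ * below the new EH rule in TCAM order.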
+ */
+static int
+ice_add_rss_cfg_post_gtpu(struct ice_vf *vf, struct ice_vf_hash_gtpu_ctx *ctx,
+			  struct ice_rss_hash_cfg *cfg, u32 ctx_idx)
+{
+	int ret;
+
+	if (ctx_idx < ICE_HASH_GTPU_CTX_MAX) {
+		ctx->ctx[ctx_idx].addl_hdrs = cfg->addl_hdrs;
+		ctx->ctx[ctx_idx].hash_flds = cfg->hash_flds;
+		ctx->ctx[ctx_idx].hdr_type = cfg->hdr_type;
+		ctx->ctx[ctx_idx].symm = cfg->symm;
+	}
+
+	switch (ctx_idx) {
+	case ICE_HASH_GTPU_CTX_EH_IP:
+		break;
+	case ICE_HASH_GTPU_CTX_EH_IP_UDP:
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_TCP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_TCP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		break;
+	case ICE_HASH_GTPU_CTX_EH_IP_TCP:
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_UP_IP_UDP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_DW_IP_UDP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		break;
+	case ICE_HASH_GTPU_CTX_UP_IP:
+	case ICE_HASH_GTPU_CTX_UP_IP_UDP:
+	case ICE_HASH_GTPU_CTX_UP_IP_TCP:
+	case ICE_HASH_GTPU_CTX_DW_IP:
+	case ICE_HASH_GTPU_CTX_DW_IP_UDP:
+	case ICE_HASH_GTPU_CTX_DW_IP_TCP:
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		ret = ice_hash_moveback(vf,
+					&ctx->ctx[ICE_HASH_GTPU_CTX_EH_IP_TCP]);
+		if (ret && (ret != -ENOENT))
+			return ret;
+
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int
+ice_add_rss_cfg_post(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg)
+{
+	u32 ice_gtpu_ctx_idx = calc_gtpu_ctx_idx(cfg->addl_hdrs);
+
+	u8 ip_ctx_idx = ice_map_ip_ctx_idx(cfg->addl_hdrs);
+
+	if (ip_ctx_idx && ip_ctx_idx < ICE_HASH_IP_CTX_MAX) {
+		if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4)
+			hash_cfg_record(&vf->hash_ctx.v4.ctx[ip_ctx_idx], cfg);
+		else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6)
+			hash_cfg_record(&vf->hash_ctx.v6.ctx[ip_ctx_idx], cfg);
+	}
+
+	if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4) {
+		return ice_add_rss_cfg_post_gtpu(vf, &vf->hash_ctx.ipv4,
+						 cfg, ice_gtpu_ctx_idx);
+	} else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6) {
+		return ice_add_rss_cfg_post_gtpu(vf, &vf->hash_ctx.ipv6,
+						 cfg, ice_gtpu_ctx_idx);
+	}
+
+	return 0;
+}
+
+/**
+ * ice_rem_rss_cfg_post - post-process the RSS configuration
+ * @vf: pointer to the VF info
+ * @cfg: pointer to the RSS hash configuration
+ *
+ * This function post-processes the RSS hash configuration after deleting a
+ * hash config; for example, it resets the hash context for the GTPU hash.
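+ *
+ * For example (editorial note): removing an IPv4 GTPU EH/UDP config clears
+ * vf->hash_ctx.ipv4.ctx[ICE_HASH_GTPU_CTX_EH_IP_UDP], so a stale context
+ * cannot be moved back by a later add.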
+ */
+static void
+ice_rem_rss_cfg_post(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg)
+{
+	u32 ice_gtpu_ctx_idx = calc_gtpu_ctx_idx(cfg->addl_hdrs);
+
+	u8 ip_ctx_idx = ice_map_ip_ctx_idx(cfg->addl_hdrs);
+
+	if (ip_ctx_idx && ip_ctx_idx < ICE_HASH_IP_CTX_MAX) {
+		if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4)
+			hash_cfg_reset(&vf->hash_ctx.v4.ctx[ip_ctx_idx]);
+		else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6)
+			hash_cfg_reset(&vf->hash_ctx.v6.ctx[ip_ctx_idx]);
+	}
+
+	if (ice_gtpu_ctx_idx >= ICE_HASH_GTPU_CTX_MAX)
+		return;
+
+	if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV4)
+		hash_cfg_reset(&vf->hash_ctx.ipv4.ctx[ice_gtpu_ctx_idx]);
+	else if (cfg->addl_hdrs & ICE_FLOW_SEG_HDR_IPV6)
+		hash_cfg_reset(&vf->hash_ctx.ipv6.ctx[ice_gtpu_ctx_idx]);
+}
+
+/**
+ * ice_rem_rss_cfg_wrap - a wrapper for deleting an RSS configuration
+ * @vf: pointer to the VF info
+ * @cfg: pointer to the RSS hash configuration
+ *
+ * Wrapper function to delete a flow profile based on an RSS configuration,
+ * and also post-process the hash context based on the rollback mechanism
+ * that handles rule conflicts in ice_add_rss_cfg_wrap.
+ */
+static enum ice_status
+ice_rem_rss_cfg_wrap(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg)
+{
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	enum ice_status status = 0;
+	struct ice_hw *hw = &vf->pf->hw;
+
+	status = ice_rem_rss_cfg(hw, vf->lan_vsi_idx, cfg);
+	/* We just ignore ICE_ERR_DOES_NOT_EXIST, because
+	 * if two configurations share the same profile remove
+	 * one of them actually removes both, since the
+	 * profile is deleted.
+	 */
+	if (status && status != ICE_ERR_DOES_NOT_EXIST) {
+		dev_err(dev, "ice_rem_rss_cfg failed for VSI:%d, error:%s\n",
+			vf->lan_vsi_num, ice_stat_str(status));
+		goto error;
+	}
+
+	ice_rem_rss_cfg_post(vf, cfg);
+
+error:
+	return status;
+}
+
+/**
+ * ice_add_rss_cfg_wrap - a wrapper for adding an RSS configuration
+ * @vf: pointer to the VF info
+ * @cfg: pointer to the RSS hash configuration
+ *
+ * Wrapper function to add a flow profile based on an RSS configuration; it
+ * also uses a rollback mechanism to handle rule conflicts caused by the
+ * top-to-bottom TCAM write sequence.
+ */
+static enum ice_status
+ice_add_rss_cfg_wrap(struct ice_vf *vf, struct ice_rss_hash_cfg *cfg)
+{
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	enum ice_status status = 0;
+	struct ice_hw *hw = &vf->pf->hw;
+
+	if (ice_add_rss_cfg_pre(vf, cfg))
+		return ICE_ERR_PARAM;
+
+	status = ice_add_rss_cfg(hw, vf->lan_vsi_idx, cfg);
+	if (status) {
+		dev_err(dev, "ice_add_rss_cfg failed for VSI:%d, error:%s\n",
+			vf->lan_vsi_num, ice_stat_str(status));
+		goto error;
+	}
+
+	if (ice_add_rss_cfg_post(vf, cfg))
+		status = ICE_ERR_PARAM;
+
+error:
+	return status;
+}
+
+/**
+ * ice_vc_handle_rss_cfg
+ * @vf: pointer to the VF info
+ * @msg: pointer to the message buffer
+ * @add: add an RSS config if true, otherwise delete an RSS config
+ *
+ * This function adds/deletes an RSS config
+ */
+static int ice_vc_handle_rss_cfg(struct ice_vf *vf, u8 *msg, bool add)
+{
+	struct virtchnl_rss_cfg *rss_cfg = (struct virtchnl_rss_cfg *)msg;
+	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+	u32 v_opcode = add ?
VIRTCHNL_OP_ADD_RSS_CFG : + VIRTCHNL_OP_DEL_RSS_CFG; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_hw *hw = &vf->pf->hw; + struct ice_vsi *vsi; + + if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + dev_dbg(dev, "VF %d attempting to configure RSS, but RSS is not supported by the PF\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto error_param; + } + + if (!ice_vf_adv_rss_offload_ena(vf->driver_caps)) { + dev_dbg(dev, "VF %d attempting to configure RSS, but Advanced rss offload is not supported\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (rss_cfg->proto_hdrs.count > VIRTCHNL_MAX_NUM_PROTO_HDRS || + rss_cfg->rss_algorithm < VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC || + rss_cfg->rss_algorithm > VIRTCHNL_RSS_ALG_XOR_SYMMETRIC) { + dev_dbg(dev, "VF %d attempting to configure RSS, but RSS configuration is not valid\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_validate_pattern(vf, &rss_cfg->proto_hdrs)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (rss_cfg->rss_algorithm == VIRTCHNL_RSS_ALG_R_ASYMMETRIC) { + u8 hash_type = add ? ICE_AQ_VSI_Q_OPT_RSS_XOR : ICE_AQ_VSI_Q_OPT_RSS_TPLZ; + + v_ret = ice_vc_rss_hash_update(hw, vsi, hash_type); + } else { + struct ice_rss_hash_cfg cfg; + u8 hash_type; + + cfg.addl_hdrs = ICE_FLOW_SEG_HDR_NONE; + cfg.hash_flds = ICE_HASH_INVALID; + cfg.hdr_type = ICE_RSS_ANY_HEADERS; + + hash_type = add ? ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ : + ICE_AQ_VSI_Q_OPT_RSS_TPLZ; + + v_ret = ice_vc_rss_hash_update(hw, vsi, hash_type); + if (v_ret) + goto error_param; + + if (!ice_vc_parse_rss_cfg(hw, rss_cfg, &cfg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (add) { + if (rss_cfg->rss_algorithm == + VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC) + cfg.symm = true; + else + cfg.symm = false; + + if (ice_add_rss_cfg_wrap(vf, &cfg)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + } else { + if (ice_rem_rss_cfg_wrap(vf, &cfg)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + } + } + +error_param: + return ice_vc_send_msg_to_vf(vf, v_opcode, v_ret, NULL, 0); +} + +/** + * ice_vc_config_rss_key + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * * Configure the VF's RSS key */ -static int ice_vc_config_rss_key(struct ice_vf *vf, u8 *msg) +static int ice_vc_config_rss_key(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_rss_key *vrk = + (struct virtchnl_rss_key *)msg; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vrk->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (vrk->key_len != ICE_VSIQF_HKEY_ARRAY_SIZE) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (ice_set_rss_key(vsi, vrk->key)) + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_KEY, v_ret, + NULL, 0); +} + +/** + * 
ice_vc_config_rss_lut
+ * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
+ *
+ * Configure the VF's RSS LUT
+ */
+static int ice_vc_config_rss_lut(struct ice_vf *vf, u8 *msg)
+{
+	struct virtchnl_rss_lut *vrl = (struct virtchnl_rss_lut *)msg;
+	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+	struct ice_vsi *vsi;
+
+	if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (!ice_vc_isvalid_vsi_id(vf, vrl->vsi_id)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	vsi = ice_get_vf_vsi(vf);
+	if (!vsi) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (vrl->lut_entries != vsi->rss_table_size) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (ice_set_rss_lut(vsi, vrl->lut, vrl->lut_entries))
+		v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+error_param:
+	return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_LUT, v_ret,
+				     NULL, 0);
+}
+
+/**
+ * ice_wait_on_vf_reset - poll to make sure a given VF is ready after reset
+ * @vf: the VF being reset
+ *
+ * The max poll time is about 800ms, which is about the maximum time it takes
+ * for a VF to be reset and/or a VF driver to be removed.
+ */
+static void ice_wait_on_vf_reset(struct ice_vf *vf)
+{
+	int i;
+
+	for (i = 0; i < ICE_MAX_VF_RESET_TRIES; i++) {
+		if (test_bit(ICE_VF_STATE_INIT, vf->vf_states))
+			break;
+		msleep(ICE_MAX_VF_RESET_SLEEP_MS);
+	}
+}
+
+/**
+ * ice_check_vf_ready_for_cfg - check if VF is ready to be configured/queried
+ * @vf: VF to check if it's ready to be configured/queried
+ *
+ * The purpose of this function is to make sure the VF is not in reset, not
+ * disabled, and initialized so it can be configured and/or queried by a host
+ * administrator.
+ */
+int ice_check_vf_ready_for_cfg(struct ice_vf *vf)
+{
+	struct ice_pf *pf;
+
+	ice_wait_on_vf_reset(vf);
+
+	if (ice_is_vf_disabled(vf))
+		return -EINVAL;
+
+	pf = vf->pf;
+	if (ice_check_vf_init(pf, vf))
+		return -EBUSY;
+
+	return 0;
+}
+
+/**
+ * ice_set_vf_spoofchk
+ * @netdev: network interface device structure
+ * @vf_id: VF identifier
+ * @ena: flag to enable or disable feature
+ *
+ * Enable or disable VF spoof checking
+ */
+int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena)
+{
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ice_pf *pf = np->vsi->back;
+	struct ice_vsi *vf_vsi;
+	struct device *dev;
+	struct ice_vf *vf;
+	int ret;
+
+	dev = ice_pf_to_dev(pf);
+	if (ice_validate_vf_id(pf, vf_id))
+		return -EINVAL;
+
+	vf = &pf->vf[vf_id];
+	ret = ice_check_vf_ready_for_cfg(vf);
+	if (ret)
+		return ret;
+
+	vf_vsi = ice_get_vf_vsi(vf);
+	if (!vf_vsi) {
+		netdev_err(netdev, "VSI %d for VF %d is null\n",
+			   vf->lan_vsi_idx, vf->vf_id);
+		return -EINVAL;
+	}
+
+	if (vf_vsi->type != ICE_VSI_VF) {
+		netdev_err(netdev, "Type %d of VSI %d for VF %d is not ICE_VSI_VF\n",
+			   vf_vsi->type, vf_vsi->vsi_num, vf->vf_id);
+		return -ENODEV;
+	}
+
+	if (ena == vf->spoofchk) {
+		dev_dbg(dev, "VF spoofchk already %s\n", ena ? "ON" : "OFF");
+		return 0;
+	}
+
+	if (ena)
+		ret = ice_vsi_ena_spoofchk(vf_vsi);
+	else
+		ret = ice_vsi_dis_spoofchk(vf_vsi);
+	if (ret)
+		dev_err(dev, "Failed to set spoofchk %s for VF %d VSI %d, error %d\n",
+			ena ?
"ON" : "OFF", vf->vf_id, vf_vsi->vsi_num, ret); + else + vf->spoofchk = ena; + + return ret; +} + +/** + * ice_is_any_vf_in_promisc - check if any VF(s) are in promiscuous mode + * @pf: PF structure for accessing VF(s) + * + * Return false if no VF(s) are in unicast and/or multicast promiscuous mode, + * else return true + */ +bool ice_is_any_vf_in_promisc(struct ice_pf *pf) +{ + int vf_idx; + + ice_for_each_vf(pf, vf_idx) { + struct ice_vf *vf = &pf->vf[vf_idx]; + + /* found a VF that has promiscuous mode configured */ + if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) || + test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + return true; + } + + return false; +} + +/** + * ice_vc_cfg_promiscuous_mode_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to configure VF VSIs promiscuous mode + */ +static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + bool rm_promisc, alluni = false, allmulti = false; + struct virtchnl_promisc_info *info = + (struct virtchnl_promisc_info *)msg; + struct ice_vsi_vlan_ops *vlan_ops; + int mcast_err = 0, ucast_err = 0; + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + struct device *dev; + int ret = 0; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, info->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { + dev_err(dev, "Unprivileged VF %d is attempting to configure promiscuous mode\n", + vf->vf_id); + /* Leave v_ret alone, lie to the VF on purpose. */ + goto error_param; + } + + if (info->flags & FLAG_VF_UNICAST_PROMISC) + alluni = true; + + if (info->flags & FLAG_VF_MULTICAST_PROMISC) + allmulti = true; + + rm_promisc = !allmulti && !alluni; + + vlan_ops = ice_get_compat_vsi_vlan_ops(vsi); + if (rm_promisc) + ret = vlan_ops->ena_rx_filtering(vsi); + else + ret = vlan_ops->dis_rx_filtering(vsi); + if (ret) { + dev_err(dev, "Failed to configure VLAN pruning in promiscuous mode\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, pf->flags)) { + bool set_dflt_vsi = alluni || allmulti; + + if (set_dflt_vsi && !ice_is_dflt_vsi_in_use(vsi->vsw)) + /* only attempt to set the default forwarding VSI if + * it's not currently set + */ + ret = ice_set_dflt_vsi(vsi->vsw, vsi); + else if (!set_dflt_vsi && + ice_is_vsi_dflt_vsi(vsi->vsw, vsi)) + /* only attempt to free the default forwarding VSI if we + * are the owner + */ + ret = ice_clear_dflt_vsi(vsi->vsw); + + if (ret) { + dev_err(dev, "%sable VF %d as the default VSI failed, error %d\n", + set_dflt_vsi ? 
"en" : "dis", vf->vf_id, ret); + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + goto error_param; + } + } else { + u8 mcast_m, ucast_m; + + if (ice_vf_is_port_vlan_ena(vf) || + ice_vsi_has_non_zero_vlans(vsi)) { + mcast_m = ICE_MCAST_VLAN_PROMISC_BITS; + ucast_m = ICE_UCAST_VLAN_PROMISC_BITS; + } else { + mcast_m = ICE_MCAST_PROMISC_BITS; + ucast_m = ICE_UCAST_PROMISC_BITS; + } + + if (alluni) + ucast_err = ice_vf_set_vsi_promisc(vf, vsi, ucast_m); + else + ucast_err = ice_vf_clear_vsi_promisc(vf, vsi, ucast_m); + + if (allmulti) + mcast_err = ice_vf_set_vsi_promisc(vf, vsi, mcast_m); + else + mcast_err = ice_vf_clear_vsi_promisc(vf, vsi, mcast_m); + } + + if (!mcast_err) { + if (allmulti && + !test_and_set_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully set multicast promiscuous mode\n", + vf->vf_id); + else if (!allmulti && test_and_clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully unset multicast promiscuous mode\n", + vf->vf_id); + } + + if (!ucast_err) { + if (alluni && !test_and_set_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully set unicast promiscuous mode\n", + vf->vf_id); + else if (!alluni && test_and_clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) + dev_info(dev, "VF %u successfully unset unicast promiscuous mode\n", + vf->vf_id); + } + +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE, + v_ret, NULL, 0); +} + +/** + * ice_vc_get_stats_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to get VSI stats + */ +static int ice_vc_get_stats_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_queue_select *vqs = + (struct virtchnl_queue_select *)msg; + struct ice_eth_stats stats = { 0 }; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + ice_update_eth_stats(vsi); + + stats = vsi->eth_stats; + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_STATS, v_ret, + (u8 *)&stats, sizeof(stats)); +} + +/** + * ice_vf_get_tc_based_qid - get the updated QID based on offset + * @qid: queue ID + * @offset : TC specific queue offset + * + * This function returns updated queueID based on offset. This is + * meant to be used only with VF ADQ. Queue ID will always be + * 0-based from the specified offset + */ +static u16 ice_vf_get_tc_based_qid(u16 qid, u16 offset) +{ + return (qid >= offset) ? (qid - offset) : qid; +} + +/** + * ice_vf_q_id_get_vsi_q_id + * @vf: pointer to the VF info + * @vf_q_id: VF relative queue ID + * @t_tc: traffic class for indexing the VSIs + * @vqs: the VFs virtual queue selection + * @vsi_p: pointer to VSI pointer, which changes based on TC for ADQ + * @vsi_id: VSI ID specific to desired queue ID + * @q_id: queue ID of the VSI + * + * provides ADQ queue enablement support by mapping the VF queue ID and TC to + * VSI ID and queue ID. call while iterating through VF queue IDs, VF VSIs and + * TCs. 
+ */
+static void ice_vf_q_id_get_vsi_q_id(struct ice_vf *vf, u16 vf_q_id, u16 *t_tc,
+				     struct virtchnl_queue_select *vqs,
+				     struct ice_vsi **vsi_p, u16 *vsi_id,
+				     u16 *q_id)
+{
+	struct ice_vsi *vsi = *vsi_p;
+	u32 max_chnl_tc;
+	u16 tc = *t_tc;
+
+	max_chnl_tc = ice_vc_get_max_chnl_tc_allowed(vf);
+
+	/* Update the VSI and TC based on per TC queue region and offset */
+	if (tc + 1U < max_chnl_tc && vf_q_id == vf->ch[tc + 1].offset &&
+	    tc < vf->num_tc && ice_is_vf_adq_ena(vf)) {
+		vsi = vf->pf->vsi[vf->ch[tc + 1].vsi_idx];
+		tc++;
+	}
+
+	/* Update vsi_id and queue_id based on TC if TC is VF ADQ TC, then
+	 * use VF ADQ VSI otherwise main VF VSI
+	 */
+	if (tc >= ICE_VF_CHNL_START_TC && ice_is_vf_adq_ena(vf)) {
+		*vsi_id = vsi->vsi_num;
+		*q_id = ice_vf_get_tc_based_qid(vf_q_id, vf->ch[tc].offset);
+	} else {
+		*vsi_id = vqs->vsi_id;
+		*q_id = vf_q_id;
+	}
+
+	*vsi_p = vsi;
+	*t_tc = tc;
+}
+
+/**
+ * ice_vc_validate_vqs_bitmaps - validate Rx/Tx queue bitmaps from VIRTCHNL
+ * @vqs: virtchnl_queue_select structure containing bitmaps to validate
+ *
+ * Return true on successful validation, else false
+ */
+static bool ice_vc_validate_vqs_bitmaps(struct virtchnl_queue_select *vqs)
+{
+	if ((!vqs->rx_queues && !vqs->tx_queues) ||
+	    vqs->rx_queues >= BIT(ICE_MAX_DFLT_QS_PER_VF) ||
+	    vqs->tx_queues >= BIT(ICE_MAX_DFLT_QS_PER_VF))
+		return false;
+
+	return true;
+}
+
+/**
+ * ice_vf_ena_txq_interrupt - enable Tx queue interrupt via QINT_TQCTL
+ * @vsi: VSI of the VF to configure
+ * @q_idx: VF queue index used to determine the queue in the PF's space
+ */
+static void ice_vf_ena_txq_interrupt(struct ice_vsi *vsi, u32 q_idx)
+{
+	struct ice_hw *hw = &vsi->back->hw;
+	u32 pfq = vsi->txq_map[q_idx];
+	u32 reg;
+
+	reg = rd32(hw, QINT_TQCTL(pfq));
+
+	/* MSI-X index 0 in the VF's space is always for the OICR, which means
+	 * this is most likely a poll mode VF driver, so don't enable an
+	 * interrupt that was never configured via VIRTCHNL_OP_CONFIG_IRQ_MAP
+	 */
+	if (!(reg & QINT_TQCTL_MSIX_INDX_M))
+		return;
+
+	wr32(hw, QINT_TQCTL(pfq), reg | QINT_TQCTL_CAUSE_ENA_M);
+}
+
+/**
+ * ice_vf_ena_rxq_interrupt - enable Rx queue interrupt via QINT_RQCTL
+ * @vsi: VSI of the VF to configure
+ * @q_idx: VF queue index used to determine the queue in the PF's space
+ */
+static void ice_vf_ena_rxq_interrupt(struct ice_vsi *vsi, u32 q_idx)
+{
+	struct ice_hw *hw = &vsi->back->hw;
+	u32 pfq = vsi->rxq_map[q_idx];
+	u32 reg;
+
+	reg = rd32(hw, QINT_RQCTL(pfq));
+
+	/* MSI-X index 0 in the VF's space is always for the OICR, which means
+	 * this is most likely a poll mode VF driver, so don't enable an
+	 * interrupt that was never configured via VIRTCHNL_OP_CONFIG_IRQ_MAP
+	 */
+	if (!(reg & QINT_RQCTL_MSIX_INDX_M))
+		return;
+
+	wr32(hw, QINT_RQCTL(pfq), reg | QINT_RQCTL_CAUSE_ENA_M);
+}
+
+/**
+ * ice_vf_vsi_ena_single_rxq - enable single Rx queue based on relative q_id
+ * @vf: VF to enable queue for
+ * @vsi: VSI for the VF
+ * @q_id: VSI relative (0-based) queue ID
+ * @vf_q_id: VF relative (0-based) queue ID
+ *
+ * Attempt to enable the Rx queue passed in. If the Rx queue was successfully
+ * enabled then set q_id bit in the enabled queues bitmap and return success.
+ * Otherwise return error.
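+ *
+ * The enable is idempotent: if @vf_q_id is already set in vf->rxq_ena, the
+ * function returns 0 without touching the hardware again.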
+ */
+static int ice_vf_vsi_ena_single_rxq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id, u16 vf_q_id)
+{
+	int err;
+
+	if (test_bit(vf_q_id, vf->rxq_ena))
+		return 0;
+
+	err = ice_vsi_ctrl_one_rx_ring(vsi, true, q_id, true);
+	if (err) {
+		dev_err(ice_pf_to_dev(vsi->back), "Failed to enable Rx ring %d on VSI %d\n",
+			q_id, vsi->vsi_num);
+		return err;
+	}
+
+	ice_vf_ena_rxq_interrupt(vsi, q_id);
+	set_bit(vf_q_id, vf->rxq_ena);
+
+	return 0;
+}
+
+/**
+ * ice_vf_vsi_ena_single_txq - enable single Tx queue based on relative q_id
+ * @vf: VF to enable queue for
+ * @vsi: VSI for the VF
+ * @q_id: VSI relative (0-based) queue ID
+ * @vf_q_id: VF relative (0-based) queue ID
+ *
+ * Enable the Tx queue's interrupt then set the q_id bit in the enabled
+ * queues bitmap. Note that the Tx queue(s) should have already been
+ * configured/enabled in VIRTCHNL_OP_CONFIG_QUEUES so this function only
+ * enables the interrupt associated with the q_id.
+ */
+static void ice_vf_vsi_ena_single_txq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id, u16 vf_q_id)
+{
+	if (test_bit(vf_q_id, vf->txq_ena))
+		return;
+
+	ice_vf_ena_txq_interrupt(vsi, q_id);
+	set_bit(vf_q_id, vf->txq_ena);
+}
+
+/**
+ * ice_vc_ena_qs_msg
+ * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
+ *
+ * called from the VF to enable all or specific queue(s)
+ */
+static int ice_vc_ena_qs_msg(struct ice_vf *vf, u8 *msg)
+{
+	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+	struct virtchnl_queue_select *vqs =
+		(struct virtchnl_queue_select *)msg;
+	struct ice_vsi *vsi;
+	unsigned long q_map;
+	u16 vf_q_id = 0;
+	u16 tc = 0;
+
+	if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (!ice_vc_validate_vqs_bitmaps(vqs)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	vsi = ice_get_vf_vsi(vf);
+	if (!vsi) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	/* Enable only Rx rings, Tx rings were enabled by the FW when the
+	 * Tx queue group list was configured and the context bits were
+	 * programmed using ice_vsi_cfg_txqs
+	 */
+	q_map = vqs->rx_queues;
+	for_each_set_bit(vf_q_id, &q_map, ICE_MAX_DFLT_QS_PER_VF) {
+		u16 vsi_id, q_id;
+
+		ice_vf_q_id_get_vsi_q_id(vf, vf_q_id, &tc, vqs, &vsi,
+					 &vsi_id, &q_id);
+		if (ice_is_vf_adq_ena(vf) && tc >= ICE_VF_CHNL_START_TC) {
+			if (!ice_vf_adq_vsi_valid(vf, tc)) {
+				v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+				goto error_param;
+			}
+		}
+
+		if (!ice_vc_isvalid_q_id(vf, vsi_id, q_id)) {
+			v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+			goto error_param;
+		}
+
+		if (ice_vf_vsi_ena_single_rxq(vf, vsi, q_id, vf_q_id)) {
+			v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+			goto error_param;
+		}
+	}
+
+	tc = 0;
+	vsi = ice_get_vf_vsi(vf);
+	q_map = vqs->tx_queues;
+	for_each_set_bit(vf_q_id, &q_map, ICE_MAX_DFLT_QS_PER_VF) {
+		u16 vsi_id, q_id;
+
+		ice_vf_q_id_get_vsi_q_id(vf, vf_q_id, &tc, vqs, &vsi,
+					 &vsi_id, &q_id);
+		if (ice_is_vf_adq_ena(vf) && tc >= ICE_VF_CHNL_START_TC) {
+			if (!ice_vf_adq_vsi_valid(vf, tc)) {
+				v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+				goto error_param;
+			}
+		}
+
+		if (!ice_vc_isvalid_q_id(vf, vsi_id, q_id)) {
+			v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+			goto error_param;
+		}
+
+		ice_vf_vsi_ena_single_txq(vf, vsi, q_id, vf_q_id);
+	}
+
+	/* Set flag to indicate that queues are enabled */
+	if (v_ret == VIRTCHNL_STATUS_SUCCESS) {
+		set_bit(ICE_VF_STATE_QS_ENA, vf->vf_states);
+		if
(vf->repr) + netif_carrier_on(vf->repr->netdev); + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_QUEUES, v_ret, + NULL, 0); +} + +/** + * ice_vf_vsi_dis_single_txq - disable a single Tx queue for the VF based on relative queue ID + * @vf: VF to disable queue for + * @vsi: VSI for the VF + * @q_id: VSI relative (0-based) queue ID + * @vf_q_id: VF relative (0-based) queue ID + * + * Attempt to disable the Tx queue passed in. If the Tx queue was successfully disabled then clear + * q_id bit in the enabled queues bitmap and return success. Otherwise return error. + */ +static int ice_vf_vsi_dis_single_txq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id, u16 vf_q_id) +{ + struct ice_txq_meta txq_meta = { 0 }; + struct ice_ring *ring; + int err; + + /* Skip queue if not enabled */ + if (!test_bit(vf_q_id, vf->txq_ena)) + return 0; + + ring = vsi->tx_rings[q_id]; + if (!ring) + return -EINVAL; + + ice_fill_txq_meta(vsi, ring, &txq_meta); + + err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, vf->vf_id, ring, &txq_meta); + if (err) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Tx ring %d on VSI %d\n", + q_id, vsi->vsi_num); + return err; + } + + /* Clear enabled queues flag */ + clear_bit(vf_q_id, vf->txq_ena); + + return 0; +} + +/** + * ice_vf_vsi_dis_single_rxq - disable a Rx queue for VF on relative queue ID + * @vf: VF to disable queue for + * @vsi: VSI for the VF + * @q_id: VSI relative (0-based) queue ID + * @vf_q_id: VF relative (0-based) queue ID + * + * Attempt to disable the Rx queue passed in. If the Rx queue was successfully + * disabled then clear q_id bit in the enabled queues bitmap and return success. + * Otherwise return error. + */ + +static int ice_vf_vsi_dis_single_rxq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id, u16 vf_q_id) +{ + int err; + + if (!test_bit(vf_q_id, vf->rxq_ena)) + return 0; + + err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_id, true); + if (err) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Rx ring %d on VSI %d\n", + q_id, vsi->vsi_num); + return err; + } + + /* Clear enabled queues flag */ + clear_bit(vf_q_id, vf->rxq_ena); + + return 0; +} + +/** + * ice_vc_dis_qs_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to disable all or specific + * queue(s) + */ +static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_queue_select *vqs = + (struct virtchnl_queue_select *)msg; + struct ice_vsi *vsi; + unsigned long q_map; + u16 vf_q_id = 0; + u16 tc = 0; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) && + !test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_validate_vqs_bitmaps(vqs)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (vqs->tx_queues) { + q_map = vqs->tx_queues; + + for_each_set_bit(vf_q_id, &q_map, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_id, q_id; + + ice_vf_q_id_get_vsi_q_id(vf, vf_q_id, &tc, vqs, &vsi, + &vsi_id, &q_id); + if (ice_is_vf_adq_ena(vf) && + tc >= ICE_VF_CHNL_START_TC) { + if (!ice_vf_adq_vsi_valid(vf, tc)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + + if (!ice_vc_isvalid_q_id(vf, vsi_id, q_id)) { 
+ v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (ice_vf_vsi_dis_single_txq(vf, vsi, q_id, vf_q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + } + + q_map = vqs->rx_queues; + tc = 0; + /* Reset VSI pointer as it was assigned to ADQ VSIs */ + vsi = ice_get_vf_vsi(vf); + /* speed up Rx queue disable by batching them if possible */ + if (q_map && + bitmap_equal(&q_map, vf->rxq_ena, ICE_MAX_DFLT_QS_PER_VF)) { + if (ice_vsi_stop_all_rx_rings(vsi)) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop all Rx rings on VSI %d\n", + vsi->vsi_num); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + if (ice_is_vf_adq_ena(vf)) { + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + vsi = ice_get_vf_adq_vsi(vf, tc); + if (ice_vsi_stop_all_rx_rings(vsi)) { + dev_err(ice_pf_to_dev(vsi->back), + "Failed to stop all Rx rings on VF ADQ VSI %d\n", + vsi->vsi_num); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + } + bitmap_zero(vf->rxq_ena, ICE_MAX_DFLT_QS_PER_VF); + } else if (q_map) { + for_each_set_bit(vf_q_id, &q_map, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_id, q_id; + + ice_vf_q_id_get_vsi_q_id(vf, vf_q_id, &tc, vqs, &vsi, + &vsi_id, &q_id); + if (ice_is_vf_adq_ena(vf) && + tc >= ICE_VF_CHNL_START_TC) { + if (!ice_vf_adq_vsi_valid(vf, tc)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + if (!ice_vc_isvalid_q_id(vf, vsi_id, q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (ice_vf_vsi_dis_single_rxq(vf, vsi, q_id, vf_q_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + } + + /* Clear enabled queues flag */ + if (v_ret == VIRTCHNL_STATUS_SUCCESS && ice_vf_has_no_qs_ena(vf)) { + clear_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); + if (vf->repr) + netif_carrier_off(vf->repr->netdev); + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_QUEUES, v_ret, + NULL, 0); +} + +/** + * ice_cfg_interrupt + * @vf: pointer to the VF info + * @vsi: the VSI being configured + * @vector_id: vector ID + * @tc: traffic class number for ADQ + * @map: vector map for mapping vectors to queues + * @q_vector: structure for interrupt vector + * configure the IRQ to queue map + */ +static int +ice_cfg_interrupt(struct ice_vf *vf, struct ice_vsi *vsi, u16 vector_id, + u8 __maybe_unused tc, struct virtchnl_vector_map *map, + struct ice_q_vector *q_vector) +{ + unsigned long qmap; + u16 vsi_q_id_idx; + + q_vector->num_ring_rx = 0; + q_vector->num_ring_tx = 0; + + qmap = map->rxq_map; + for_each_set_bit(vsi_q_id_idx, &qmap, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_q_id = vsi_q_id_idx; + + if (tc && ice_is_vf_adq_ena(vf)) + vsi_q_id = ice_vf_get_tc_based_qid(vsi_q_id_idx, + vf->ch[tc].offset); + + if (!ice_vc_isvalid_q_id(vf, vsi->vsi_num, vsi_q_id)) + return VIRTCHNL_STATUS_ERR_PARAM; + + q_vector->num_ring_rx++; + q_vector->rx.itr_idx = map->rxitr_idx; + vsi->rx_rings[vsi_q_id]->q_vector = q_vector; + ice_cfg_rxq_interrupt(vsi, vsi_q_id, vector_id, + q_vector->rx.itr_idx); + } + + qmap = map->txq_map; + for_each_set_bit(vsi_q_id_idx, &qmap, ICE_MAX_DFLT_QS_PER_VF) { + u16 vsi_q_id = vsi_q_id_idx; + + if (tc && ice_is_vf_adq_ena(vf)) + vsi_q_id = ice_vf_get_tc_based_qid(vsi_q_id_idx, + vf->ch[tc].offset); + + if (!ice_vc_isvalid_q_id(vf, vsi->vsi_num, vsi_q_id)) + return VIRTCHNL_STATUS_ERR_PARAM; + + q_vector->num_ring_tx++; + q_vector->tx.itr_idx = map->txitr_idx; + vsi->tx_rings[vsi_q_id]->q_vector = q_vector; + 
ice_cfg_txq_interrupt(vsi, vsi_q_id, vector_id, + q_vector->tx.itr_idx); + } + + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_vc_cfg_irq_map_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to configure the IRQ to queue map + */ +static int ice_vc_cfg_irq_map_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_irq_map_info *irqmap_info; + struct virtchnl_vector_map *map; + struct ice_pf *pf = vf->pf; + u16 num_q_vectors_mapped; + struct ice_vsi *vsi; + u16 vector_id_ch; + u16 tc = 0; + int i; + + irqmap_info = (struct virtchnl_irq_map_info *)msg; + num_q_vectors_mapped = irqmap_info->num_vectors; + + /* Check to make sure number of VF vectors mapped is not greater than + * number of VF vectors originally allocated, and check that + * there is actually at least a single VF queue vector mapped + */ + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || + pf->num_msix_per_vf < num_q_vectors_mapped || + !num_q_vectors_mapped) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < num_q_vectors_mapped; i++) { + struct ice_q_vector *q_vector; + u16 vsi_id, vector_id; + + map = &irqmap_info->vecmap[i]; + + vector_id = map->vector_id; + vsi_id = map->vsi_id; + if (ice_is_vf_adq_ena(vf) && tc >= ICE_VF_CHNL_START_TC) { + if (!ice_vf_adq_vsi_valid(vf, tc)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + vsi_id = vsi->vsi_num; + } + /* vector_id is always 0-based for each VF, and can never be + * larger than or equal to the max allowed interrupts per VF + */ + if (!(vector_id < pf->num_msix_per_vf) || + !ice_vc_isvalid_vsi_id(vf, vsi_id) || + (!vector_id && (map->rxq_map || map->txq_map))) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* No need to map VF miscellaneous or rogue vector */ + if (!vector_id) + continue; + + /* Subtract non queue vector from vector_id passed by VF + * to get actual number of VSI queue vector array index + */ + if (tc && ice_is_vf_adq_ena(vf)) + vector_id_ch = vector_id - vf->ch[tc].offset; + else + vector_id_ch = vector_id; + + /* if ADQ enablement failed, the main VF VSI could have been + * reconfigured (based on TC0 information - means main + * VF VSI queues and vectors are equal to TC0.num_qps and + * not equal to "num_q_vectors" which is part of + * irq_cfg virtchnl message) so prevent using invalid vector ID + */ + if ((vector_id_ch - ICE_NONQ_VECS_VF) >= vsi->num_q_vectors) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + q_vector = vsi->q_vectors[vector_id_ch - ICE_NONQ_VECS_VF]; + if (!q_vector) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* lookout for the invalid queue index */ + + v_ret = (enum virtchnl_status_code) + ice_cfg_interrupt(vf, vsi, vector_id, tc, map, + q_vector); + if (v_ret) + goto error_param; + + /* Update VSI and TC only when ADQ is configured */ + if (ice_is_vf_adq_ena(vf) && + vector_id == vf->ch[tc + 1].offset) { + vsi = pf->vsi[vf->ch[tc + 1].vsi_idx]; + tc++; + } + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_IRQ_MAP, v_ret, + NULL, 0); +} + +/** + * ice_vc_get_max_allowed_qpairs - get max allowed queue pairs based on negotiated capabilities + * @vf: VF used to get max queue pairs allowed + * + * The maximum allowed queues is determined based on whether 
VIRTCHNL_VF_LARGE_NUM_QPAIRS was + * negotiated. + */ +static int ice_vc_get_max_allowed_qpairs(struct ice_vf *vf) +{ + if (vf->driver_caps & VIRTCHNL_VF_LARGE_NUM_QPAIRS) + return ICE_MAX_QS_PER_VF; + + return ICE_MAX_DFLT_QS_PER_VF; +} + +/** + * ice_vc_cfg_qs_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * called from the VF to configure the Rx/Tx queues + */ +static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vsi_queue_config_info *qci = + (struct virtchnl_vsi_queue_config_info *)msg; + struct virtchnl_queue_pair_info *qpi; + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + u16 queue_id_tmp, tc; + int i, q_idx; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, qci->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* check for number of queues is done in ice_alloc_vf_res() function + * for ADQ + */ + if (ice_is_vf_adq_ena(vf)) + goto skip_num_queues_check; + + if (qci->num_queue_pairs > ice_vc_get_max_allowed_qpairs(vf) || + qci->num_queue_pairs > min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)) { + dev_err(ice_pf_to_dev(pf), "VF-%d trying to configure more than allocated number of queues: %d\n", + vf->vf_id, min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + +skip_num_queues_check: + queue_id_tmp = 0; + tc = 0; + for (i = 0; i < qci->num_queue_pairs; i++) { + if (!qci->qpair[i].rxq.crc_disable) + continue; + + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_CRC) || + vf->dcf_vlan_info.outer_stripping_ena || + vf->vlan_strip_ena) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + for (i = 0; i < qci->num_queue_pairs; i++) { + qpi = &qci->qpair[i]; + if (ice_is_vf_adq_ena(vf)) + goto skip_non_adq_checks; + + if (qpi->txq.vsi_id != qci->vsi_id || + qpi->rxq.vsi_id != qci->vsi_id || + qpi->rxq.queue_id != qpi->txq.queue_id || + qpi->txq.headwb_enabled || + !ice_vc_isvalid_ring_len(qpi->txq.ring_len) || + !ice_vc_isvalid_ring_len(qpi->rxq.ring_len) || + !ice_vc_isvalid_q_id(vf, qci->vsi_id, + qpi->txq.queue_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + +skip_non_adq_checks: + if (ice_is_vf_adq_ena(vf)) { + q_idx = queue_id_tmp; + vsi = ice_find_vsi_from_id(vf->pf, vf->ch[tc].vsi_num); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } else { + q_idx = qpi->rxq.queue_id; + } + + /* make sure selected "q_idx" is in valid range of queues + * for selected "vsi" (which could be main VF VSI or + * VF ADQ VSI + */ + if (q_idx >= vsi->alloc_txq || q_idx >= vsi->alloc_rxq) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* copy Tx queue info from VF into VSI */ + if (qpi->txq.ring_len > 0) { + vsi->tx_rings[q_idx]->dma = qpi->txq.dma_ring_addr; + vsi->tx_rings[q_idx]->count = qpi->txq.ring_len; + if (ice_vsi_cfg_single_txq(vsi, vsi->tx_rings, q_idx)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } + + /* copy Rx queue info from VF into VSI */ + if (qpi->rxq.ring_len > 0) { + u16 max_frame_size = ice_vc_get_max_frame_size(vf); + u32 rxdid; + + vsi->rx_rings[q_idx]->dma = qpi->rxq.dma_ring_addr; + vsi->rx_rings[q_idx]->count = qpi->rxq.ring_len; + + vsi->rx_rings[q_idx]->rx_crc_strip_dis = 
qpi->rxq.crc_disable; + + if (qpi->rxq.databuffer_size != 0 && + (qpi->rxq.databuffer_size > ((16 * 1024) - 128) || + qpi->rxq.databuffer_size < 1024)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + vsi->rx_buf_len = qpi->rxq.databuffer_size; + vsi->rx_rings[q_idx]->rx_buf_len = vsi->rx_buf_len; + if (qpi->rxq.max_pkt_size > max_frame_size || + qpi->rxq.max_pkt_size < 64) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi->max_frame = qpi->rxq.max_pkt_size; + /* add space for the port VLAN since the VF driver is not + * expected to account for it in the MTU calculation + */ + if (ice_vf_is_port_vlan_ena(vf)) + vsi->max_frame += VLAN_HLEN; + + if (ice_vsi_cfg_single_rxq(vsi, q_idx)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* If Rx flex desc is supported, select RXDID for Rx queues. + * Otherwise, use legacy 32byte descriptor format. + * Legacy 16byte descriptor is not supported. If this RXDID + * is selected, return error. + */ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) { + rxdid = qpi->rxq.rxdid; + if (!(BIT(rxdid) & pf->supported_rxdids)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + } else { + rxdid = ICE_RXDID_LEGACY_1; + } + + ice_write_qrxflxp_cntxt(&vsi->back->hw, vsi->rxq_map[q_idx], rxdid, 0x03, + false); + } + + /* For ADQ there can be up to 4 VSIs with max 4 queues each. + * VF does not know about these additional VSIs and all + * it cares is about its own queues. PF configures these queues + * to its appropriate VSIs based on TC mapping + */ + if (ice_is_vf_adq_ena(vf)) { + if (queue_id_tmp == (vf->ch[tc].num_qps - 1)) { + tc++; + /* reset the queue num */ + queue_id_tmp = 0; + } else { + queue_id_tmp++; + } + } + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_VSI_QUEUES, v_ret, + NULL, 0); +} + +/** + * ice_is_vf_trusted + * @vf: pointer to the VF info + */ +static bool ice_is_vf_trusted(struct ice_vf *vf) +{ + return test_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); +} + +/** + * ice_can_vf_change_mac + * @vf: pointer to the VF info + * + * Return true if the VF is allowed to change its MAC filters, false otherwise + */ +static bool ice_can_vf_change_mac(struct ice_vf *vf) +{ + /* If the VF MAC address has been set administratively (via the + * ndo_set_vf_mac command), then deny permission to the VF to + * add/delete unicast MAC addresses, unless the VF is trusted + */ + if (vf->pf_set_mac && !ice_is_vf_trusted(vf)) + return false; + + return true; +} + +/** + * ice_vc_ether_addr_type - get type of virtchnl_ether_addr + * @vc_ether_addr: used to extract the type + */ +static u8 +ice_vc_ether_addr_type(struct virtchnl_ether_addr *vc_ether_addr) +{ + return (vc_ether_addr->type & VIRTCHNL_ETHER_ADDR_TYPE_MASK); +} + +/** + * ice_is_vc_addr_legacy - check if the MAC address is from an older VF + * @vc_ether_addr: VIRTCHNL structure that contains MAC and type + */ +static bool +ice_is_vc_addr_legacy(struct virtchnl_ether_addr __maybe_unused *vc_ether_addr) +{ + u8 type = ice_vc_ether_addr_type(vc_ether_addr); + + return (type == VIRTCHNL_ETHER_ADDR_LEGACY); +} + +/** + * ice_is_vc_addr_primary - check if the MAC address is the VF's primary MAC + * @vc_ether_addr: VIRTCHNL structure that contains MAC and type + * + * This function should only be called when the MAC address in + * virtchnl_ether_addr is a valid unicast MAC + */ +static bool +ice_is_vc_addr_primary(struct virtchnl_ether_addr __maybe_unused *vc_ether_addr) 
+{ + u8 type = ice_vc_ether_addr_type(vc_ether_addr); + + return (type == VIRTCHNL_ETHER_ADDR_PRIMARY); +} + +/** + * ice_vfhw_mac_add - update the VF's cached hardware MAC if allowed + * @vf: VF to update + * @vc_ether_addr: structure from VIRTCHNL with MAC to add + */ +static void +ice_vfhw_mac_add(struct ice_vf *vf, struct virtchnl_ether_addr *vc_ether_addr) +{ + u8 *mac_addr = vc_ether_addr->addr; + + if (!is_valid_ether_addr(mac_addr)) + return; + + /* only allow legacy VF drivers to set the device and hardware MAC if it + * is zero and allow new VF drivers to set the hardware MAC if the type + * was correctly specified over VIRTCHNL + */ + if ((ice_is_vc_addr_legacy(vc_ether_addr) && + is_zero_ether_addr(vf->hw_lan_addr.addr)) || + ice_is_vc_addr_primary(vc_ether_addr)) { + ether_addr_copy(vf->dev_lan_addr.addr, mac_addr); + ether_addr_copy(vf->hw_lan_addr.addr, mac_addr); + } + + /* hardware and device MACs are already set, but it's possible that the + * VF driver sent the VIRTCHNL_OP_ADD_ETH_ADDR message before the + * VIRTCHNL_OP_DEL_ETH_ADDR when trying to update its MAC, so save it + * away for the legacy VF driver case as it will be updated in the + * delete flow for this case + */ + if (ice_is_vc_addr_legacy(vc_ether_addr)) { + ether_addr_copy(vf->legacy_last_added_umac.addr, + mac_addr); + vf->legacy_last_added_umac.time_modified = jiffies; + } +} + +/** + * ice_vc_add_mac_addr - attempt to add the MAC address passed in + * @vf: pointer to the VF info + * @vsi: pointer to the VF's VSI + * @vc_ether_addr: VIRTCHNL MAC address structure used to add MAC + */ +static int +ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_ether_addr *vc_ether_addr) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + u8 *mac_addr = vc_ether_addr->addr; + enum ice_status status; + int ret = 0; + + /* device MAC already added */ + if (!ether_addr_equal(mac_addr, vf->dev_lan_addr.addr)) { + if (is_unicast_ether_addr(mac_addr) && + !ice_can_vf_change_mac(vf)) { + dev_err(dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); + return -EPERM; + } + + status = ice_fltr_add_mac(vsi, mac_addr, ICE_FWD_TO_VSI); + if (status == ICE_ERR_ALREADY_EXISTS) { + dev_dbg(dev, "MAC %pM already exists for VF %d\n", mac_addr, + vf->vf_id); + /* don't return since we might need to update + * the primary MAC in ice_vfhw_mac_add() below + */ + ret = -EEXIST; + } else if (status) { + dev_err(dev, "Failed to add MAC %pM for VF %d, error %s\n", mac_addr, + vf->vf_id, ice_stat_str(status)); + return -EIO; + } else { + vf->num_mac++; + } + + ice_vfhw_mac_add(vf, vc_ether_addr); + } + + return ret; +} + +/** + * ice_is_legacy_umac_expired - check if last added legacy unicast MAC expired + * @last_added_umac: structure used to check expiration + */ +static bool ice_is_legacy_umac_expired(struct ice_time_mac *last_added_umac) +{ +#define ICE_LEGACY_VF_MAC_CHANGE_EXPIRE_TIME msecs_to_jiffies(3000) + return time_is_before_jiffies(last_added_umac->time_modified + + ICE_LEGACY_VF_MAC_CHANGE_EXPIRE_TIME); +} + +/** + * ice_vfhw_mac_del - update the VF's cached hardware MAC if allowed + * @vf: VF to update + * @vc_ether_addr: structure from VIRTCHNL with MAC to delete + */ +static void +ice_vfhw_mac_del(struct ice_vf *vf, struct virtchnl_ether_addr *vc_ether_addr) +{ + u8 *mac_addr = vc_ether_addr->addr; + + if (!is_valid_ether_addr(mac_addr) || + !ether_addr_equal(vf->dev_lan_addr.addr, mac_addr)) + return; + + /* allow the device 
MAC to be repopulated in the add flow and don't + * clear the hardware MAC (i.e. hw_lan_addr.addr) here as that is meant + * to be persistent on VM reboot and across driver unload/load, which + * won't work if we clear the hardware MAC here + */ + eth_zero_addr(vf->dev_lan_addr.addr); + + /* only update cached hardware MAC for legacy VF drivers on delete + * because we cannot guarantee order/type of MAC from the VF driver + */ + if (ice_is_vc_addr_legacy(vc_ether_addr) && + !ice_is_legacy_umac_expired(&vf->legacy_last_added_umac)) { + ether_addr_copy(vf->dev_lan_addr.addr, + vf->legacy_last_added_umac.addr); + ether_addr_copy(vf->hw_lan_addr.addr, + vf->legacy_last_added_umac.addr); + } +} + +/** + * ice_vc_del_mac_addr - attempt to delete the MAC address passed in + * @vf: pointer to the VF info + * @vsi: pointer to the VF's VSI + * @vc_ether_addr: VIRTCHNL MAC address structure used to delete MAC + */ +static int +ice_vc_del_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_ether_addr *vc_ether_addr) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + u8 *mac_addr = vc_ether_addr->addr; + enum ice_status status; + + if (!ice_can_vf_change_mac(vf) && + ether_addr_equal(vf->dev_lan_addr.addr, mac_addr)) + return 0; + + status = ice_fltr_remove_mac(vsi, mac_addr, ICE_FWD_TO_VSI); + if (status == ICE_ERR_DOES_NOT_EXIST) { + dev_err(dev, "MAC %pM does not exist for VF %d\n", mac_addr, + vf->vf_id); + return -ENOENT; + } else if (status) { + dev_err(dev, "Failed to delete MAC %pM for VF %d, error %s\n", + mac_addr, vf->vf_id, ice_stat_str(status)); + return -EIO; + } + + + ice_vfhw_mac_del(vf, vc_ether_addr); + + vf->num_mac--; + + return 0; +} + +/** + * ice_vc_handle_mac_addr_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @set: true if MAC filters are being set, false otherwise + * + * add guest MAC address filter + */ +static int +ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set) +{ + int (*ice_vc_cfg_mac) + (struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_ether_addr *virtchnl_ether_addr); + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_ether_addr_list *al = + (struct virtchnl_ether_addr_list *)msg; + struct ice_pf *pf = vf->pf; + enum virtchnl_ops vc_op; + struct ice_vsi *vsi; + int i; + + if (set) { + vc_op = VIRTCHNL_OP_ADD_ETH_ADDR; + ice_vc_cfg_mac = ice_vc_add_mac_addr; + } else { + vc_op = VIRTCHNL_OP_DEL_ETH_ADDR; + ice_vc_cfg_mac = ice_vc_del_mac_addr; + } + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || + !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto handle_mac_exit; + } + + /* If this VF is not privileged, then we can't add more than a + * limited number of addresses. Check to make sure that the + * additions do not push us over the limit. 
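+ * (For an untrusted VF the limit is ICE_MAX_MACADDR_PER_VF, enforced just below.)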
+ */ + if (set && !ice_is_vf_trusted(vf) && + (vf->num_mac + al->num_elements) > ICE_MAX_MACADDR_PER_VF) { + dev_err(ice_pf_to_dev(pf), "Can't add more MAC addresses, because VF-%d is not trusted, switch the VF to trusted mode in order to add more functionalities\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto handle_mac_exit; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto handle_mac_exit; + } + + for (i = 0; i < al->num_elements; i++) { + u8 *mac_addr = al->list[i].addr; + int result; + + if (is_broadcast_ether_addr(mac_addr) || + is_zero_ether_addr(mac_addr)) + continue; + + result = ice_vc_cfg_mac(vf, vsi, &al->list[i]); + if (result == -EEXIST || result == -ENOENT) { + continue; + } else if (result) { + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + goto handle_mac_exit; + } + } + +handle_mac_exit: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, vc_op, v_ret, NULL, 0); +} + +/** + * ice_vc_add_mac_addr_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * add guest MAC address filter + */ +static int ice_vc_add_mac_addr_msg(struct ice_vf *vf, u8 *msg) +{ + return ice_vc_handle_mac_addr_msg(vf, msg, true); +} + +/** + * ice_vc_del_mac_addr_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * remove guest MAC address filter + */ +static int ice_vc_del_mac_addr_msg(struct ice_vf *vf, u8 *msg) +{ + return ice_vc_handle_mac_addr_msg(vf, msg, false); +} + +/** + * ice_vc_request_qs_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * VFs get a default number of queues but can use this message to request a + * different number. If the request is successful, PF will reset the VF and + * return 0. If unsuccessful, PF will send message informing VF of number of + * available queue pairs via virtchnl message response to VF. + */ +static int ice_vc_request_qs_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vf_res_request *vfres = + (struct virtchnl_vf_res_request *)msg; + u16 max_avail_vf_qps, max_allowed_vf_qps; + u16 req_queues = vfres->num_queue_pairs; + struct ice_pf *pf = vf->pf; + u16 tx_rx_queue_left; + struct device *dev; + u16 cur_queues; + + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + max_allowed_vf_qps = ice_vc_get_max_allowed_qpairs(vf); + + cur_queues = vf->num_vf_qs; + tx_rx_queue_left = min_t(u16, ice_get_avail_txq_count(pf), + ice_get_avail_rxq_count(pf)); + max_avail_vf_qps = tx_rx_queue_left + cur_queues; + if (!req_queues) { + dev_err(dev, "VF %d tried to request 0 queues. 
Ignoring.\n", + vf->vf_id); + } else if (req_queues > max_allowed_vf_qps) { + dev_err(dev, "VF %d tried to request more than %d queues.\n", + vf->vf_id, max_allowed_vf_qps); + vfres->num_queue_pairs = max_allowed_vf_qps; + } else if (req_queues > cur_queues && + req_queues - cur_queues > tx_rx_queue_left) { + dev_warn(dev, "VF %d requested %u more queues, but only %u left.\n", + vf->vf_id, req_queues - cur_queues, tx_rx_queue_left); + vfres->num_queue_pairs = min_t(u16, max_avail_vf_qps, max_allowed_vf_qps); + } else { + /* request is successful, then reset VF */ + vf->num_req_qs = req_queues; + ice_vc_reset_vf(vf); + dev_info(dev, "VF %d granted request of %u queues.\n", + vf->vf_id, req_queues); + return 0; + } + +error_param: + /* send the response to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, + v_ret, (u8 *)vfres, sizeof(*vfres)); +} + +/** + * ice_is_supported_port_vlan_proto - make sure the vlan_proto is supported + * @hw: hardware structure used to check the VLAN mode + * @vlan_proto: VLAN TPID being checked + * + * If the device is configured in Double VLAN Mode (DVM), then both ETH_P_8021Q + * and ETH_P_8021AD are supported. If the device is configured in Single VLAN + * Mode (SVM), then only ETH_P_8021Q is supported. + */ +static bool +ice_is_supported_port_vlan_proto(struct ice_hw *hw, u16 vlan_proto) +{ + bool is_supported = false; + + switch (vlan_proto) { + case ETH_P_8021Q: + is_supported = true; + break; + case ETH_P_8021AD: + if (ice_is_dvm_ena(hw)) + is_supported = true; + break; + } + + return is_supported; +} + +#ifdef IFLA_VF_VLAN_INFO_MAX +/** + * ice_set_vf_port_vlan + * @netdev: network interface device structure + * @vf_id: VF identifier + * @vlan_id: VLAN ID being set + * @qos: priority setting + * @vlan_proto: VLAN protocol + * + * program VF Port VLAN ID and/or QoS + */ +int +ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, + __be16 vlan_proto) +#else +int +ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos) +#endif /* IFLA_VF_VLAN_INFO_MAX */ +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); +#ifdef IFLA_VF_VLAN_INFO_MAX + u16 local_vlan_proto = ntohs(vlan_proto); +#else + u16 local_vlan_proto = ETH_P_8021Q; +#endif + struct device *dev; + struct ice_vf *vf; + int ret; + + dev = ice_pf_to_dev(pf); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + if (vlan_id >= VLAN_N_VID || qos > 7) { + dev_err(dev, "Invalid Port VLAN parameters for VF %d, ID %d, QoS %d\n", + vf_id, vlan_id, qos); + return -EINVAL; + } + + if (!ice_is_supported_port_vlan_proto(&pf->hw, local_vlan_proto)) { + dev_err(dev, "VF VLAN protocol 0x%04x is not supported\n", + local_vlan_proto); + return -EPROTONOSUPPORT; + } + + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + if (ice_vf_get_port_vlan_prio(vf) == qos && + ice_vf_get_port_vlan_tpid(vf) == local_vlan_proto && + ice_vf_get_port_vlan_id(vf) == vlan_id) { + /* duplicate request, so just return success */ + dev_dbg(dev, "Duplicate port VLAN %u, QoS %u, TPID 0x%04x request\n", + vlan_id, qos, local_vlan_proto); + return 0; + } + + vf->port_vlan_info = + ICE_VLAN(local_vlan_proto, vlan_id, qos, ICE_FWD_TO_VSI); + if (ice_vf_is_port_vlan_ena(vf)) + dev_info(dev, "Setting VLAN %u, QoS %u, TPID 0x%04x on VF %d\n", + vlan_id, qos, local_vlan_proto, vf_id); + else + dev_info(dev, "Clearing port VLAN on VF %d\n", vf_id); + + ice_vc_reset_vf(vf); + + return 0; +} + +/** + * ice_vf_vlan_offload_ena - determine if 
capabilities support VLAN offloads + * @caps: VF driver negotiated capabilities + * + * Return true if VIRTCHNL_VF_OFFLOAD_VLAN capability is set, else return false + */ +static bool ice_vf_vlan_offload_ena(u32 caps) +{ + return !!(caps & VIRTCHNL_VF_OFFLOAD_VLAN); +} + +/** + * ice_is_vlan_promisc_allowed - check if VLAN promiscuous config is allowed + * @vf: VF used to determine if VLAN promiscuous config is allowed + */ +static bool ice_is_vlan_promisc_allowed(struct ice_vf *vf) +{ + if ((test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) || + test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) && + test_bit(ICE_FLAG_VF_TRUE_PROMISC_ENA, vf->pf->flags)) + return true; + + return false; +} + +/** + * ice_vf_ena_vlan_promisc - Enable Tx/Rx VLAN promiscuous for the VLAN + * @vsi: VF's VSI used to enable VLAN promiscuous mode + * @vlan: VLAN used to enable VLAN promiscuous + * + * This function should only be called if VLAN promiscuous mode is allowed, + * which can be determined via ice_is_vlan_promisc_allowed(). + */ +static int ice_vf_ena_vlan_promisc(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + u8 promisc_m = ICE_PROMISC_VLAN_TX | ICE_PROMISC_VLAN_RX; + enum ice_status status; + + status = ice_fltr_set_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, + vlan->vid, vsi->port_info->lport); + if (status && status != ICE_ERR_ALREADY_EXISTS) + return ice_status_to_errno(status); + + return 0; +} + +/** + * ice_vf_dis_vlan_promisc - Disable Tx/Rx VLAN promiscuous for the VLAN + * @vsi: VF's VSI used to disable VLAN promiscuous mode for + * @vlan: VLAN used to disable VLAN promiscuous + * + * This function should only be called if VLAN promiscuous mode is allowed, + * which can be determined via ice_is_vlan_promisc_allowed(). + */ +static int ice_vf_dis_vlan_promisc(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + u8 promisc_m = ICE_PROMISC_VLAN_TX | ICE_PROMISC_VLAN_RX; + enum ice_status status; + + status = ice_fltr_clear_vsi_promisc(&vsi->back->hw, vsi->idx, promisc_m, + vlan->vid, vsi->port_info->lport); + if (status && status != ICE_ERR_DOES_NOT_EXIST) + return ice_status_to_errno(status); + + return 0; +} + +/** + * ice_vf_has_max_vlans - check if VF already has the max allowed VLAN filters + * @vf: VF to check against + * @vsi: VF's VSI + * + * If the VF is trusted then the VF is allowed to add as many VLANs as it + * wants to, so return false. + * + * When the VF is untrusted compare the number of non-zero VLANs + 1 to the max + * allowed VLANs for an untrusted VF. Return the result of this comparison. 
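+ * (VLAN 0 is always present for each VF, hence the ICE_VF_ADDED_VLAN_ZERO_FLTRS adjustment in the comparison below.)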
+ */ +static bool ice_vf_has_max_vlans(struct ice_vf *vf, struct ice_vsi *vsi) +{ + if (ice_is_vf_trusted(vf)) + return false; + +#define ICE_VF_ADDED_VLAN_ZERO_FLTRS 1 + return ((ice_vsi_num_non_zero_vlans(vsi) + + ICE_VF_ADDED_VLAN_ZERO_FLTRS) >= ICE_MAX_VLAN_PER_VF); +} + +/** + * ice_vc_process_vlan_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @add_v: Add VLAN if true, otherwise delete VLAN + * + * Process virtchnl op to add or remove programmed guest VLAN ID + */ +static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vlan_filter_list *vfl = + (struct virtchnl_vlan_filter_list *)msg; + struct ice_pf *pf = vf->pf; + bool vlan_promisc = false; + struct ice_vsi *vsi; + struct device *dev; + int status = 0; + int i; + + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vf_vlan_offload_ena(vf->driver_caps)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vc_isvalid_vsi_id(vf, vfl->vsi_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < vfl->num_elements; i++) { + if (vfl->vlan_id[i] >= VLAN_N_VID) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "invalid VF VLAN id %d\n", + vfl->vlan_id[i]); + goto error_param; + } + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (add_v && ice_vf_has_max_vlans(vf, vsi)) { + dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", + vf->vf_id); + /* There is no need to let VF know about being not trusted, + * so we can just return success message here + */ + goto error_param; + } + + /* in DVM a VF can add/delete inner VLAN filters when + * VIRTCHNL_VF_OFFLOAD_VLAN is negotiated, so only reject in SVM + */ + if (ice_vf_is_port_vlan_ena(vf) && !ice_is_dvm_ena(&pf->hw)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* in DVM VLAN promiscuous is based on the outer VLAN, which would be + * the port VLAN if VIRTCHNL_VF_OFFLOAD_VLAN was negotiated, so only + * allow vlan_promisc = true in SVM and if no port VLAN is configured + */ + vlan_promisc = ice_is_vlan_promisc_allowed(vf) && + !ice_is_dvm_ena(&pf->hw) && + !ice_vf_is_port_vlan_ena(vf); + + if (add_v) { + for (i = 0; i < vfl->num_elements; i++) { + u16 vid = vfl->vlan_id[i]; + struct ice_vlan vlan; + + if (ice_vf_has_max_vlans(vf, vsi)) { + dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", + vf->vf_id); + /* There is no need to let VF know about being + * not trusted, so we can just return success + * message here as well. 
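+ * (This mirrors the identical untrusted-VF check done before this loop.)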
+ */ + goto error_param; + } + + /* we add VLAN 0 by default for each VF so we can enable + * Tx VLAN anti-spoof without triggering MDD events so + * we don't need to add it again here + */ + if (!vid) + continue; + + vlan = ICE_VLAN(ETH_P_8021Q, vid, 0, ICE_FWD_TO_VSI); + status = vsi->inner_vlan_ops.add_vlan(vsi, &vlan); + if (status) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* Enable VLAN filtering on first non-zero VLAN */ + if (!vlan_promisc && vid && !ice_is_dvm_ena(&pf->hw)) { + if (vsi->inner_vlan_ops.ena_rx_filtering(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "Enable VLAN pruning on VLAN ID: %d failed error-%d\n", + vid, status); + goto error_param; + } + } else if (vlan_promisc) { + status = ice_vf_ena_vlan_promisc(vsi, &vlan); + if (status) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "Enable Unicast/multicast promiscuous mode on VLAN ID:%d failed error-%d\n", + vid, status); + } + } + } + } else { + /* In case of non_trusted VF, number of VLAN elements passed + * to PF for removal might be greater than number of VLANs + * filter programmed for that VF - So, use actual number of + * VLANS added earlier with add VLAN opcode. In order to avoid + * removing VLAN that doesn't exist, which result to sending + * erroneous failed message back to the VF + */ + int num_vf_vlan; + + num_vf_vlan = vsi->num_vlan; + for (i = 0; i < vfl->num_elements && i < num_vf_vlan; i++) { + u16 vid = vfl->vlan_id[i]; + struct ice_vlan vlan; + + /* we add VLAN 0 by default for each VF so we can enable + * Tx VLAN anti-spoof without triggering MDD events so + * we don't want a VIRTCHNL request to remove it + */ + if (!vid) + continue; + + vlan = ICE_VLAN(ETH_P_8021Q, vid, 0, ICE_FWD_TO_VSI); + status = vsi->inner_vlan_ops.del_vlan(vsi, &vlan); + if (status) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* Disable VLAN filtering when only VLAN 0 is left */ + if (!ice_vsi_has_non_zero_vlans(vsi)) + vsi->inner_vlan_ops.dis_rx_filtering(vsi); + + if (vlan_promisc) + ice_vf_dis_vlan_promisc(vsi, &vlan); + } + } + +error_param: + /* send the response to the VF */ + if (add_v) + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_VLAN, v_ret, + NULL, 0); + else + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_VLAN, v_ret, + NULL, 0); +} + +/** + * ice_vc_add_vlan_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * Add and program guest VLAN ID + */ +static int ice_vc_add_vlan_msg(struct ice_vf *vf, u8 *msg) +{ + return ice_vc_process_vlan_msg(vf, msg, true); +} + +/** + * ice_vc_remove_vlan_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * remove programmed guest VLAN ID + */ +static int ice_vc_remove_vlan_msg(struct ice_vf *vf, u8 *msg) +{ + return ice_vc_process_vlan_msg(vf, msg, false); +} + +/** + * ice_vsi_is_rxq_crc_strip_dis - check if Rx queue CRC strip is disabled or not + * @vsi: pointer to the VF VSI info + */ +static bool ice_vsi_is_rxq_crc_strip_dis(struct ice_vsi *vsi) +{ + u16 i; + + for (i = 0; i < vsi->alloc_rxq; i++) + if (vsi->rx_rings[i]->rx_crc_strip_dis) + return true; + + return false; +} + +/** + * ice_vc_ena_vlan_stripping + * @vf: pointer to the VF info + * + * Enable VLAN header stripping for a given VF + */ +static int ice_vc_ena_vlan_stripping(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto 
error_param; + } + + if (!ice_vf_vlan_offload_ena(vf->driver_caps)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (ice_vsi_is_rxq_crc_strip_dis(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto error_param; + } + if (vsi->inner_vlan_ops.ena_stripping(vsi, ETH_P_8021Q)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + else + vf->vlan_strip_ena |= ICE_INNER_VLAN_STRIP_ENA; + +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, + v_ret, NULL, 0); +} + +/** + * ice_vc_dis_vlan_stripping + * @vf: pointer to the VF info + * + * Disable VLAN header stripping for a given VF + */ +static int ice_vc_dis_vlan_stripping(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!ice_vf_vlan_offload_ena(vf->driver_caps)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (vsi->inner_vlan_ops.dis_stripping(vsi)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + else + vf->vlan_strip_ena &= ~ICE_INNER_VLAN_STRIP_ENA; + +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, + v_ret, NULL, 0); +} + +/** + * ice_vc_get_rss_hena - return the RSS HENA bits allowed by the hardware + * @vf: pointer to the VF info + */ +static int ice_vc_get_rss_hena(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_rss_hena *vrh = NULL; + int len = 0, ret; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + dev_err(ice_pf_to_dev(vf->pf), "RSS not supported by PF\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + len = sizeof(struct virtchnl_rss_hena); + vrh = kzalloc(len, GFP_KERNEL); + if (!vrh) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; + } + + vrh->hena = ICE_DEFAULT_RSS_HENA; +err: + /* send the response back to the VF */ + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_RSS_HENA_CAPS, v_ret, + (u8 *)vrh, len); + kfree(vrh); + return ret; +} + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_validate_cloud_filter + * @vf: pointer to the VF info + * @tc_filter: pointer to virtchnl_filter + * + * This function validates cloud filter programmed as TC filter for ADQ + */ +static int +ice_validate_cloud_filter(struct ice_vf *vf, struct virtchnl_filter *tc_filter) +{ + struct virtchnl_l4_spec mask = tc_filter->mask.tcp_spec; + struct virtchnl_l4_spec data = tc_filter->data.tcp_spec; + struct ice_pf *pf = vf->pf; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (!tc_filter->action) { + dev_err(dev, "VF %d: Currently ADQ doesn't support Drop Action\n", + vf->vf_id); + return -EOPNOTSUPP; + } + + /* Check filter if it's programmed for advanced mode or basic mode. + * There are two ADQ modes (for VF only), + * 1. Basic mode: intended to allow as many filter options as possible + * to be added to a VF in Non-trusted mode. Main goal is + * to add filters to its own MAC and VLAN ID. + * 2. Advanced mode: is for allowing filters to be applied other than + * its own MAC or VLAN. This mode requires the VF to be + * Trusted. 
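+ * The checks below enforce this split: an untrusted VF may only match on its own MAC and VLAN.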
+ */ + if (mask.dst_mac[0] && !mask.dst_ip[0]) { + /* As of now supporting, MAC filter if MAC address is the + * default LAN addr for this VF + */ + if (!ice_mac_fltr_exist(&pf->hw, data.dst_mac, + vf->lan_vsi_idx)) { + dev_err(dev, "Destination MAC %pM doesn't belong to VF %d\n", + data.dst_mac, vf->vf_id); + return -EINVAL; + } + } else if (!test_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { + /* Check if VF is trusted */ + dev_err(dev, "VF %d not trusted, make VF trusted to add ADQ filters\n", + vf->vf_id); + return -EOPNOTSUPP; + } + + if (mask.dst_mac[0] & data.dst_mac[0]) { + if (is_broadcast_ether_addr(data.dst_mac) || + is_zero_ether_addr(data.dst_mac)) { + dev_err(dev, "VF %d: Invalid Dest MAC addr %pM\n", + vf->vf_id, data.dst_mac); + return -EINVAL; + } + } + + if (mask.src_mac[0] & data.src_mac[0]) { + if (is_broadcast_ether_addr(data.src_mac) || + is_zero_ether_addr(data.src_mac)) { + dev_err(dev, "VF %d: Invalid Source MAC addr %pM\n", + vf->vf_id, data.src_mac); + return -EINVAL; + } + } + + if (mask.dst_port & data.dst_port) { + if (!data.dst_port) { + dev_err(dev, "VF %d: Invalid Dest port\n", vf->vf_id); + return -EINVAL; + } + } + + if (mask.src_port & data.src_port) { + if (!data.src_port) { + dev_err(dev, "VF %d: Invalid Source port\n", vf->vf_id); + return -EINVAL; + } + } + + if (mask.vlan_id & data.vlan_id) { + if (ntohs(data.vlan_id) >= VLAN_N_VID) { + dev_err(dev, "VF %d: invalid VLAN ID\n", vf->vf_id); + return -EINVAL; + } + /* Validate VLAN for the VF the same way we do for the PF */ + if (!ice_vlan_fltr_exist(&pf->hw, ntohs(data.vlan_id), + vf->lan_vsi_idx)) { + dev_err(dev, "specified VLAN %u doesn't belong to this VF %d\n", + ntohs(data.vlan_id), vf->vf_id); + return -EINVAL; + } + } + + return 0; +} + +/** + * ice_get_tc_flower_fltr - locate the TC flower filter + * @vf: pointer to the VF info + * @fltr: pointer to the tc_flower filter + * @mask: ptr to filter mask (representing filter data specification) + * + * This function is used to locate specific filter in filter list. 
It returns + * NULL if unable to locate such filter otherwise returns found filter + */ +static struct ice_tc_flower_fltr * +ice_get_tc_flower_fltr(struct ice_vf *vf, struct ice_tc_flower_fltr *fltr, + struct virtchnl_l4_spec *mask) +{ + struct ice_tc_flower_lyr_2_4_hdrs *hdrs; + struct ice_tc_l2_hdr *l2_key; + struct ice_tc_l3_hdr *l3_key; + struct ice_tc_l4_hdr *l4_key; + struct ice_tc_flower_fltr *f; + struct hlist_node *node; + + hdrs = &fltr->outer_headers; + if (!hdrs) + return NULL; + + l2_key = &hdrs->l2_key; + l3_key = &hdrs->l3_key; + l4_key = &hdrs->l4_key; + + hlist_for_each_entry_safe(f, node, + &vf->tc_flower_fltr_list, tc_flower_node) { + struct ice_tc_flower_lyr_2_4_hdrs *f_hdrs; + + if (!f->dest_vsi || fltr->dest_vsi != f->dest_vsi || + fltr->dest_vsi->idx != f->dest_vsi->idx) + continue; + + f_hdrs = &f->outer_headers; + + /* handle L2 fields if specified and do not match */ + if ((mask->src_mac[0] && + !ether_addr_equal(l2_key->src_mac, + f_hdrs->l2_key.src_mac)) || + (mask->dst_mac[0] && + !ether_addr_equal(l2_key->dst_mac, + f_hdrs->l2_key.dst_mac))) + continue; + + /* handle VLAN if specified and do not match */ + if (mask->vlan_id && hdrs->vlan_hdr.vlan_id != + f_hdrs->vlan_hdr.vlan_id) + continue; + + /* handle L3 IPv4 if specified and do not match + * for ipv4 data to be valid, check only first dword of mask + */ + if (l2_key->n_proto == ETH_P_IP) + if ((mask->dst_ip[0] && + l3_key->dst_ipv4 != f_hdrs->l3_key.dst_ipv4) || + (mask->src_ip[0] && + l3_key->src_ipv4 != f_hdrs->l3_key.src_ipv4)) + continue; + + /* handle L3 IPv6 if specified and do not match + * for ipv6 to be valid, last dword from mask must be valid + * hence check only last dword of mask + */ + if (l2_key->n_proto == ETH_P_IPV6 && mask->dst_ip[3]) + if (memcmp(&l3_key->ip.v6.dst_ip6, + &f_hdrs->l3_key.ip.v6.dst_ip6, + sizeof(l3_key->ip.v6.dst_ip6))) + continue; + if (l2_key->n_proto == ETH_P_IPV6 && mask->src_ip[3]) + if (memcmp(&l3_key->ip.v6.src_ip6, + &f_hdrs->l3_key.ip.v6.src_ip6, + sizeof(l3_key->ip.v6.src_ip6))) + continue; + + /* make sure "ip_proto" is same */ + if (l3_key->ip_proto != f_hdrs->l3_key.ip_proto) + continue; + + /* handle L4 fields if specified and do not match */ + if ((mask->dst_port && + l4_key->dst_port != f_hdrs->l4_key.dst_port) || + (mask->src_port && + l4_key->src_port != f_hdrs->l4_key.src_port)) + continue; + + /* if reached here, means found matching filter entry */ + return f; + } + + return NULL; +} + +/** + * ice_vc_chnl_fltr_state_verify - verify general state of VF + * @vf: pointer to the VF info + * @vcf: pointer to virtchannel_filter + * + * This function performs general validation including validation of filter + * message and content + */ +static enum virtchnl_status_code +ice_vc_chnl_fltr_state_verify(struct ice_vf *vf, struct virtchnl_filter *vcf) +{ + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + u32 max_tc_allowed; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + return VIRTCHNL_STATUS_ERR_PARAM; + + if (!ice_is_vf_adq_ena(vf)) { + dev_err(dev, "VF %d: ADQ is not enabled, can't apply switch filter\n", + vf->vf_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + dev_err(dev, "VF %d: No corresponding VF VSI\n", vf->vf_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + max_tc_allowed = ice_vc_get_max_chnl_tc_allowed(vf); + if (vcf->action == VIRTCHNL_ACTION_TC_REDIRECT && + vcf->action_meta >= max_tc_allowed) { + dev_err(dev, "VF %d: Err: 
action(%u)_meta(TC): %u >= max_tc_allowed (%u)\n", + vf->vf_id, vcf->action, vcf->action_meta, + max_tc_allowed); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + /* enforce supported flow_type based on negotiated capability */ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) { + if (!(vcf->flow_type == VIRTCHNL_TCP_V4_FLOW || + vcf->flow_type == VIRTCHNL_TCP_V6_FLOW || + vcf->flow_type == VIRTCHNL_UDP_V4_FLOW || + vcf->flow_type == VIRTCHNL_UDP_V6_FLOW)) { + dev_err(ice_pf_to_dev(pf), "VF %d: Invalid input/s, unsupported flow_type %u\n", + vf->vf_id, vcf->flow_type); + return VIRTCHNL_STATUS_ERR_PARAM; + } + } else { + if (!(vcf->flow_type == VIRTCHNL_TCP_V4_FLOW || + vcf->flow_type == VIRTCHNL_TCP_V6_FLOW)){ + dev_err(ice_pf_to_dev(pf), "VF %d: Invalid input/s, unsupported flow_type %u\n", + vf->vf_id, vcf->flow_type); + return VIRTCHNL_STATUS_ERR_PARAM; + } + } + + if (ice_validate_cloud_filter(vf, vcf)) { + dev_err(dev, "VF %d: Invalid input/s, can't apply switch filter\n", + vf->vf_id); + return VIRTCHNL_STATUS_ERR_PARAM; + } + + /* filter state fully verified, return SUCCESS */ + return VIRTCHNL_STATUS_SUCCESS; +} + +/** + * ice_setup_fltr - populate fields in TC flower filter structure + * @vf: ptr to VF + * @vcf: ptr to virt channel message + * @fltr: pointer to the TC filter structure + * @dest_vsi: pointer to destination VSI for filter + * @tc_class: TC number when action type to FWD_TO_VSI, counter index when + * action is count, queue number when action is FWD_TO_QUEUE, + * queue group ID when action is FWD_TO_QGRP + */ +static void +ice_setup_fltr(struct ice_vf *vf, struct ice_tc_flower_fltr *fltr, + struct virtchnl_filter *vcf, struct ice_vsi *dest_vsi, + int tc_class) +{ + struct virtchnl_l4_spec *mask = &vcf->mask.tcp_spec; + struct virtchnl_l4_spec *tcf = &vcf->data.tcp_spec; + struct ice_tc_flower_lyr_2_4_hdrs *hdrs; + + memset(fltr, 0, sizeof(*fltr)); + + hdrs = &fltr->outer_headers; + if (!hdrs) + return; + + /* copy L2 MAC address and MAC mask */ + ether_addr_copy(hdrs->l2_key.dst_mac, tcf->dst_mac); + ether_addr_copy(hdrs->l2_mask.dst_mac, mask->dst_mac); + if (!is_zero_ether_addr(hdrs->l2_key.dst_mac)) + fltr->flags |= ICE_TC_FLWR_FIELD_DST_MAC; + + /* copy L2 source address and MAC mask */ + ether_addr_copy(hdrs->l2_key.src_mac, tcf->src_mac); + ether_addr_copy(hdrs->l2_mask.src_mac, mask->src_mac); + if (!is_zero_ether_addr(hdrs->l2_key.src_mac)) + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_MAC; + + /* copy VLAN info */ + hdrs->vlan_hdr.vlan_id = mask->vlan_id & tcf->vlan_id; + if (hdrs->vlan_hdr.vlan_id) + fltr->flags |= ICE_TC_FLWR_FIELD_VLAN; + + /* copy L4 fields */ + hdrs->l4_key.dst_port = mask->dst_port & tcf->dst_port; + hdrs->l4_mask.dst_port = mask->dst_port; + if (hdrs->l4_key.dst_port) + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_L4_PORT; + + hdrs->l4_key.src_port = mask->src_port & tcf->src_port; + hdrs->l4_mask.src_port = mask->src_port; + if (hdrs->l4_key.src_port) + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_L4_PORT; + + /* copy L3 fields, IPv4[6] */ + if (vcf->flow_type == VIRTCHNL_TCP_V4_FLOW || + vcf->flow_type == VIRTCHNL_UDP_V4_FLOW) { + struct ice_tc_l3_hdr *key, *msk; + + key = &hdrs->l3_key; + msk = &hdrs->l3_mask; + + /* set n_proto based on flow_type */ + hdrs->l2_key.n_proto = ETH_P_IP; + if (mask->dst_ip[0] & tcf->dst_ip[0]) { + key->dst_ipv4 = tcf->dst_ip[0]; + msk->dst_ipv4 = mask->dst_ip[0]; + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_IPV4; + } + if (mask->src_ip[0] & tcf->src_ip[0]) { + key->src_ipv4 = tcf->src_ip[0]; + msk->src_ipv4 = mask->src_ip[0]; + 
fltr->flags |= ICE_TC_FLWR_FIELD_SRC_IPV4; + } + } else if (vcf->flow_type == VIRTCHNL_TCP_V6_FLOW || + vcf->flow_type == VIRTCHNL_UDP_V6_FLOW) { + struct ice_tc_l3_hdr *key, *msk; + + key = &hdrs->l3_key; + msk = &hdrs->l3_mask; + + /* set n_proto based on flow_type */ + hdrs->l2_key.n_proto = ETH_P_IPV6; + if (mask->dst_ip[3] & tcf->dst_ip[3]) { + memcpy(&key->ip.v6.dst_ip6, tcf->dst_ip, + sizeof(key->ip.v6.dst_ip6)); + memcpy(&msk->ip.v6.dst_ip6, mask->dst_ip, + sizeof(msk->ip.v6.dst_ip6)); + fltr->flags |= ICE_TC_FLWR_FIELD_DEST_IPV6; + } + if (mask->src_ip[3] & tcf->src_ip[3]) { + memcpy(&key->ip.v6.src_ip6, tcf->src_ip, + sizeof(key->ip.v6.src_ip6)); + memcpy(&msk->ip.v6.src_ip6, mask->src_ip, + sizeof(msk->ip.v6.src_ip6)); + fltr->flags |= ICE_TC_FLWR_FIELD_SRC_IPV6; + } + } + + /* get the VSI to which the TC belongs to */ + fltr->dest_vsi = dest_vsi; + if (vcf->action == VIRTCHNL_ACTION_TC_REDIRECT) + fltr->action.fltr_act = ICE_FWD_TO_VSI; + else + fltr->action.fltr_act = ICE_DROP_PACKET; + + /* make sure to include VF's MAC address when adding ADQ filter */ + if ((!(fltr->flags & ICE_TC_FLWR_FIELD_DST_MAC)) && + fltr->action.fltr_act == ICE_FWD_TO_VSI) { + fltr->flags |= ICE_TC_FLWR_FIELD_DST_MAC; + ether_addr_copy(hdrs->l2_key.dst_mac, vf->dev_lan_addr.addr); + eth_broadcast_addr(hdrs->l2_mask.dst_mac); + } + + /* 'tc_class' could be TC/QUEUE/QUEUE_GRP number */ + fltr->action.tc_class = tc_class; + + /* must to set the tunnel_type to be INVALID, otherwise if left as zero, + * it gets treated as VxLAN tunnel since definition of VxLAN tunnel + * type is zero + */ + fltr->tunnel_type = TNL_LAST; + + /* set ip_proto in headers based on flow_type which is part of VIRTCHNL + * message, "add filter" + */ + if (vcf->flow_type == VIRTCHNL_TCP_V4_FLOW || + vcf->flow_type == VIRTCHNL_TCP_V6_FLOW) + hdrs->l3_key.ip_proto = IPPROTO_TCP; + else + hdrs->l3_key.ip_proto = IPPROTO_UDP; +} + +/** + * ice_vc_del_switch_filter + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * This function deletes a cloud filter programmed as TC filter for ADQ + */ +static int ice_vc_del_switch_filter(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_filter *vcf = (struct virtchnl_filter *)msg; + struct virtchnl_l4_spec *mask = &vcf->mask.tcp_spec; + struct ice_rule_query_data rule; + enum virtchnl_status_code v_ret; + struct ice_tc_flower_fltr fltr; + struct ice_tc_flower_fltr *f; + struct ice_pf *pf = vf->pf; + struct ice_vsi *dest_vsi; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + /* Advanced switch filters and DCF are mutually exclusive. */ + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. 
Advanced switch filters cannot be deleted.\n"); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + v_ret = ice_vc_chnl_fltr_state_verify(vf, vcf); + if (v_ret) { + dev_err(dev, "VF %d: failed to verify ADQ state during filter message processing\n", + vf->vf_id); + goto err; + } + + dest_vsi = pf->vsi[vf->ch[vcf->action_meta].vsi_idx]; + + /* prepare the TC flower filter based on input */ + ice_setup_fltr(vf, &fltr, vcf, dest_vsi, vcf->action_meta); + + /* locate the filter in VF tc_flower filter list */ + f = ice_get_tc_flower_fltr(vf, &fltr, mask); + if (!f) { + dev_err(dev, "VF %d: Invalid input/s, unable to locate filter due to mismatch\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* Deleting TC filter */ + rule.rid = f->rid; + rule.rule_id = f->rule_id; + rule.vsi_handle = f->dest_id; + err = ice_rem_adv_rule_by_id(&pf->hw, &rule); + if (err) { + dev_err(dev, "VF %d: Failed to delete switch filter for tc %u, err %d\n", + vf->vf_id, vcf->action_meta, err); + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + goto err; + } + + /* book-keeping and update filter type if filter count reached zero */ + dest_vsi->num_chnl_fltr--; + + /* reset filter type for channel if channel filter + * count reaches zero + */ + if (!dest_vsi->num_chnl_fltr) + vf->ch[vcf->action_meta].fltr_type = ICE_CHNL_FLTR_TYPE_INVALID; + + /* adjust the dmac filter count before the filter entry is freed */ + if (f->flags & ICE_TC_FLWR_FIELD_DST_MAC) + vf->num_dmac_chnl_fltrs--; + hlist_del(&f->tc_flower_node); + devm_kfree(dev, f); + v_ret = VIRTCHNL_STATUS_SUCCESS; +err: + /* send the response back to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_CLOUD_FILTER, v_ret, + NULL, 0); +} + +/** + * ice_vc_add_switch_filter + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * This function adds a switch filter programmed as TC filter for ADQ + * + * General info about filtering mode: + * VF ADQ has two different modes when it comes to applying the switch + * filters + * 1. basic mode: only dst MAC and dst VLAN filters supported + * 2. advanced mode: all combinations of filters including dst MAC and + * dst VLAN, e.g.: + * a. dst IP + dst PORT + * b. dst MAC + src PORT + * c. dst MAC + dst PORT + * basic mode is for 'untrusted VFs' and advanced mode is only for + * 'trusted VFs'. When a VF is toggled from being 'trusted' to + * 'untrusted' we remove all filters irrespective of whether they are basic or + * advanced. + * When ADQ is enabled we need to do ice_down irrespective of whether the VF is + * 'trusted' or not and delete switch filters only if a 'trusted' VF + * is made 'untrusted'. + */ +static int ice_vc_add_switch_filter(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_filter *vcf = (struct virtchnl_filter *)msg; + struct ice_tc_flower_fltr *fltr = NULL; + enum virtchnl_status_code v_ret; + struct ice_pf *pf = vf->pf; + struct ice_vsi *dest_vsi; + struct device *dev; + int ret; + + dev = ice_pf_to_dev(pf); + /* Advanced switch filters and DCF are mutually exclusive. */ + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. 
Advanced switch filters cannot be added\n"); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + v_ret = ice_vc_chnl_fltr_state_verify(vf, vcf); + if (v_ret) { + dev_err(dev, "VF %d: failed to verify ADQ state during filter message processing\n", + vf->vf_id); + goto err; + } + + dest_vsi = pf->vsi[vf->ch[vcf->action_meta].vsi_idx]; + + fltr = devm_kzalloc(dev, sizeof(*fltr), GFP_KERNEL); + if (!fltr) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto err; + } + + /* prepare the TC flower filter based on input */ + ice_setup_fltr(vf, fltr, vcf, dest_vsi, vcf->action_meta); + + /* call function which adds advanced switch filter */ + ret = ice_add_tc_flower_adv_fltr(ice_get_vf_vsi(vf), fltr); + if (ret) { + dev_err(dev, "Failed to add TC Flower filter using advance filter recipe\n"); + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + devm_kfree(dev, fltr); + goto err; + } + + INIT_HLIST_NODE(&fltr->tc_flower_node); + hlist_add_head(&fltr->tc_flower_node, &vf->tc_flower_fltr_list); + if (fltr->flags & ICE_TC_FLWR_FIELD_DST_MAC) + vf->num_dmac_chnl_fltrs++; + + v_ret = VIRTCHNL_STATUS_SUCCESS; + vf->adq_fltr_ena = true; + +err: + /* send the response back to the VF */ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_CLOUD_FILTER, v_ret, + NULL, 0); +} + +/** + * ice_conv_virtchnl_speed_to_mbps + * @virt_speed: virt speed that needs to be converted from + * + * convert virt channel speeds to mbps, return link speed on success, + * '0' otherwise + */ +static u32 ice_conv_virtchnl_speed_to_mbps(u16 virt_speed) +{ + u32 speed, link_speed; + + speed = ice_conv_link_speed_to_virtchnl(false, virt_speed); + + /* get link speed in MB to validate rate limit */ + switch (speed) { + case VIRTCHNL_LINK_SPEED_100MB: + link_speed = SPEED_100; + break; + case VIRTCHNL_LINK_SPEED_1GB: + link_speed = SPEED_1000; + break; + case VIRTCHNL_LINK_SPEED_10GB: + link_speed = SPEED_10000; + break; + case VIRTCHNL_LINK_SPEED_20GB: + link_speed = SPEED_20000; + break; + case VIRTCHNL_LINK_SPEED_25GB: + link_speed = SPEED_25000; + break; + case VIRTCHNL_LINK_SPEED_40GB: + link_speed = SPEED_40000; + break; + default: + /* on failure to detect link speed the expectation of the caller + * to this function is '0'. + */ + link_speed = 0; + break; + } + + return link_speed; +} + +/** + * ice_vc_add_qch_msg: Add queue channel and enable ADQ + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + */ +static int ice_vc_add_qch_msg(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_tc_info *tci = + (struct virtchnl_tc_info *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + int adq_request_qps = 0; + struct ice_link_status *ls; + u16 available_vsis = 0; + u64 total_max_rate = 0; + u32 max_tc_allowed; + struct device *dev; + u16 total_qs = 0; + u32 link_speed; + unsigned int i; + + dev = ice_pf_to_dev(pf); + ls = &pf->hw.port_info->phy.link_info; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* check if VF has negotiated this capability before anything else */ + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ)) { + dev_dbg(dev, "VF %d attempting to enable ADQ, but hasn't properly negotiated that capability\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* Currently ADQ and DCB are mutually exclusive and keeping in sync + * with PF, don't allow VF ADQ configuration when DCB Firmware LLDP + * agent is already running/enabled. 
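+ * (Enforced by the ICE_FLAG_FW_LLDP_AGENT check just below.)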
+ */ + if (test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) { + dev_err(dev, "FW LLDP is enabled, cannot enable ADQ on VF %d\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* VF ADQ and DCF are mutually exclusive. */ + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. VF ADQ cannot be enabled\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* ADQ cannot be applied if spoof check is ON */ + if (vf->spoofchk) { + dev_err(dev, "Spoof check is ON, turn it OFF to enable ADQ\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + ice_for_each_vsi(pf, i) { + if (!pf->vsi[i]) + ++available_vsis; + } + + if (available_vsis < tci->num_tc - 1) { + dev_err(dev, "Not enough VSIs left to enable ADQ on VF %d\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + max_tc_allowed = ice_vc_get_max_chnl_tc_allowed(vf); + /* max number of traffic classes for VF currently capped at 4 for legacy + * ADQ and 16 for ADQ V2. + */ + if (!tci->num_tc || tci->num_tc > max_tc_allowed) { + dev_dbg(dev, "VF %d trying to set %u TCs, valid range 1-%u TCs per VF\n", + vf->vf_id, tci->num_tc, max_tc_allowed); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* validate queues for each TC */ + for (i = 0; i < tci->num_tc; i++) { + if (!tci->list[i].count) { + dev_err(dev, "VF %d: TC %d trying to set %u queues, should be > 0 per TC\n", + vf->vf_id, i, tci->list[i].count); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + total_qs += tci->list[i].count; + } + + if (total_qs > ICE_MAX_DFLT_QS_PER_VF) { + dev_err(dev, "VF %d: Total number of queues of all TCs cannot exceed %u\n", + vf->vf_id, ICE_MAX_DFLT_QS_PER_VF); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* Speed in Mbps */ + if (vf->driver_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) + link_speed = ice_conv_link_speed_to_virtchnl(true, + ls->link_speed); + else + link_speed = ice_conv_virtchnl_speed_to_mbps(ls->link_speed); + + if (!link_speed) { + dev_err(dev, "Cannot detect link speed on VF %d\n", vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + for (i = 0; i < tci->num_tc; i++) + if (tci->list[i].max_tx_rate) + total_max_rate += tci->list[i].max_tx_rate; + + if (total_max_rate > link_speed) { + dev_err(dev, "Invalid tx rate specified for ADQ on VF %d\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (vf->max_tx_rate && total_max_rate > vf->max_tx_rate) { + dev_err(dev, "Invalid tx rate specified for ADQ on VF %d, total_max_rate %llu Mpbs > host set max_tx_rate %u Mbps\n", + vf->vf_id, total_max_rate, vf->max_tx_rate); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* need Max VF queues but already have default number of queues */ + adq_request_qps = ICE_MAX_DFLT_QS_PER_VF - pf->num_qps_per_vf; + + if (ice_get_avail_txq_count(pf) < adq_request_qps) { + dev_err(dev, "No queues left to allocate to VF %d\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto err; + } else { + /* we need to allocate max VF queues to enable ADQ so as to + * make sure ADQ enabled VF always gets back queues when it + * goes through a reset. 
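+ * (Hence num_vf_qs is raised to ICE_MAX_DFLT_QS_PER_VF below.)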
+ vf->num_vf_qs = ICE_MAX_DFLT_QS_PER_VF; + } + + /* parse data from the queue channel info */ + vf->num_tc = tci->num_tc; + + for (i = 0; i < vf->num_tc; i++) { + if (tci->list[i].max_tx_rate) + vf->ch[i].max_tx_rate = tci->list[i].max_tx_rate; + + vf->ch[i].num_qps = tci->list[i].count; + vf->ch[i].offset = tci->list[i].offset; + } + + /* set this flag only after making sure all inputs are sane */ + vf->adq_enabled = true; + /* initialize filter enable flag, set it only if filters are applied */ + vf->adq_fltr_ena = false; + + /* reset the VF in order to allocate resources. Don't reset if the + * ADQ_V2 capability is negotiated, since in that case the AVF driver + * will request a reset. + */ + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2)) { + ice_vc_notify_vf_reset(vf); + ice_reset_vf(vf, false); + } + /* send the response to the VF */ +err: + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_CHANNELS, + v_ret, (u8 *)tci, sizeof(*tci)); + else + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_CHANNELS, + v_ret, NULL, 0); +} + +/** + * ice_vc_del_qch_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * delete the additional VSIs which are created as part of ADQ + */ +static int ice_vc_del_qch_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + struct device *dev; + u8 tc; + + dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* VF ADQ and DCF are mutually exclusive. */ + if (ice_is_dcf_enabled(pf)) { + dev_err(dev, "Device Control Functionality is currently enabled. VF ADQ cannot be enabled\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (ice_is_vf_adq_ena(vf)) { + /* if ADQ_V2 is set, perform inline cleanup of ADQ resources and + * return success; eventually the VF driver will initiate a + * reset as per the design + */ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) { + dev_info(ice_pf_to_dev(pf), + "Deleting Queue Channels for ADQ on VF %d and ADQ_V2 is set\n", + vf->vf_id); + + /* release VF ADQ filters and VSIs inline */ + ice_vf_adq_release(vf); + v_ret = VIRTCHNL_STATUS_SUCCESS; + goto err; + } + +#ifdef HAVE_TC_SETUP_CLSFLOWER + /* delete all ADQ filters for given VF */ + ice_del_all_adv_switch_fltr(vf); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + + /* stop all Tx/Rx rings and clean them before deleting the ADQ + * resources; otherwise setting the LAN Tx queue context will + * fail. This is needed irrespective of ADQ_V2. Channel + * related TCs start at 1. Don't down the VSI and related + * resources for TC 0 because it is the primary VF VSI and + * downing that VSI is handled somewhere else. + */ + for (tc = ICE_VF_CHNL_START_TC; tc < vf->num_tc; tc++) { + vsi = ice_get_vf_adq_vsi(vf, tc); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + if (vf->ch[tc].vsi_num) + ice_down(vsi); + } + + /* this ordering of code is very important: if num_tc is not + * cleared, the VF rebuilds as ADQ-enabled again, clearly + * contrary to what we're trying to do. Also, clearing num_tc + * before deleting ADQ filters leads to the condition where the + * code will try to delete filters when none are configured. + */
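The "parse data from the queue channel info" step above is plain bookkeeping: each accepted TC's queue count, offset, and optional rate cap are copied into a per-VF channel array. A minimal sketch, assuming hypothetical struct names (chnl_req/chnl_cfg stand in for virtchnl_channel_info and the driver's per-VF channel state):

    #include <stdint.h>

    #define MAX_TC 16 /* stand-in for the per-VF channel array size */

    struct chnl_req { uint16_t count, offset; uint64_t max_tx_rate; };
    struct chnl_cfg { uint16_t num_qps, offset; uint64_t max_tx_rate; };

    /* copy the accepted request into per-VF bookkeeping; only non-zero
     * rates are recorded, mirroring the "if (max_tx_rate)" guard above */
    static void store_channels(struct chnl_cfg *ch, const struct chnl_req *req,
                               uint32_t num_tc)
    {
            for (uint32_t i = 0; i < num_tc && i < MAX_TC; i++) {
                    if (req[i].max_tx_rate)
                            ch[i].max_tx_rate = req[i].max_tx_rate;
                    ch[i].num_qps = req[i].count;
                    ch[i].offset = req[i].offset;
            }
    }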
+ vf->num_tc = 0; + dev_info(ice_pf_to_dev(pf), "Deleting Queue Channels for ADQ on VF %d\n", + vf->vf_id); + + /* the reset needs to happen first, before we clear the adq_enabled + * flag, since freeing up of ADQ resources happens based off of + * this flag in the reset path. Doing a reset after clearing the + * flag will leave the ADQ resources in a zombie state, which in + * turn creates undesired problems such as system lockups, stack + * traces, etc. + * Also, we shouldn't be doing a reset if the ADQ flag is cleared + * in some other place, hence sending the failure response back to + * the VF. + */ + ice_vc_notify_vf_reset(vf); + ice_reset_vf(vf, false); + if (ice_is_vf_link_up(vf)) { + /* bring the VSI 0 back up again */ + vsi = ice_get_vf_adq_vsi(vf, 0); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + ice_up(vsi); + } + + vf->adq_enabled = false; + } else { + dev_info(dev, "VF %d trying to delete queue channels but ADQ isn't enabled\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + } + + /* send the response to the VF */ +err: + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_CHANNELS, + v_ret, msg, + sizeof(struct virtchnl_tc_info)); + else + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_CHANNELS, + v_ret, NULL, 0); +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +/** + * ice_vc_set_rss_hena - set RSS HENA bits for the VF + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + */ +static int ice_vc_set_rss_hena(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_rss_hena *vrh = (struct virtchnl_rss_hena *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + enum ice_status status; + struct ice_vsi *vsi; + struct device *dev; + + dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { + dev_err(dev, "RSS not supported by PF\n"); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + /* clear all previously programmed RSS configuration to allow VF drivers + * the ability to customize the RSS configuration and/or completely + * disable RSS + */ + status = ice_rem_vsi_rss_cfg(&pf->hw, vsi->idx); + if (status && !vrh->hena) { + /* only report failure to clear the current RSS configuration if + * that was clearly the VF's intention (i.e. vrh->hena = 0) + */ + v_ret = ice_err_to_virt_err(status); + goto err; + } else if (status) { + /* allow the VF to update the RSS configuration even on failure + * to clear the current RSS configuration in an attempt to keep + * RSS in a working state + */ + dev_warn(dev, "Failed to clear the RSS configuration for VF %u\n", + vf->vf_id); + } + + if (vrh->hena) { + status = ice_add_avf_rss_cfg(&pf->hw, vsi->idx, vrh->hena); + v_ret = ice_err_to_virt_err(status); + } + + /* send the response to the VF */ +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_SET_RSS_HENA, v_ret, + NULL, 0); +} + +/** + * ice_vc_rdma_msg - send msg to RDMA PF from VF + * @vf: pointer to VF info + * @msg: pointer to msg buffer + * @len: length of the message + * + * This function is called indirectly from the AQ clean function. + */
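The HENA handler above follows a clear-then-apply pattern with an asymmetric error policy: a failed clear is fatal only when the VF explicitly asked to disable RSS (hena == 0). A minimal sketch of that policy, with clear_rss/apply_rss as hypothetical stand-ins for the ice_rem_vsi_rss_cfg/ice_add_avf_rss_cfg calls:

    #include <stdint.h>

    /* model: each returns 0 on success, non-zero on failure */
    static int clear_rss(void) { return 0; }
    static int apply_rss(uint64_t hena) { (void)hena; return 0; }

    /* clear the old hash-enable set first, then program the new one;
     * a failed clear is only fatal when hena == 0, i.e. the request
     * was specifically "disable RSS" */
    static int set_hena(uint64_t hena)
    {
            int err = clear_rss();

            if (err && !hena)
                    return err;
            return hena ? apply_rss(hena) : 0;
    }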
+static int ice_vc_rdma_msg(struct ice_vf *vf, u8 *msg, u16 len) +{ + struct ice_peer_obj *rdma_peer; + int ret; + + rdma_peer = vf->pf->rdma_peer; + if (!rdma_peer) { + pr_err("Invalid RDMA peer attempted to send message to peer\n"); + return -EIO; + } + + if (!rdma_peer->peer_ops || !rdma_peer->peer_ops->vc_receive) { + pr_err("Incomplete RDMA peer attempting to send msg\n"); + return -EINVAL; + } + + ret = rdma_peer->peer_ops->vc_receive(rdma_peer, vf->vf_id, msg, len); + if (ret) + pr_err("Failed to send message to RDMA peer, error %d\n", ret); + + return ret; +} + +/** + * ice_vf_cfg_rdma_ceq_irq_map - configure the CEQ IRQ mapping + * @vf: VF structure associated with the VF that requested the mapping + * @qv_info: RDMA queue vector mapping information + * + * Configure the CEQ index for the passed in VF. This will result in the CEQ + * being able to generate interrupts. + */ +static void +ice_vf_cfg_rdma_ceq_irq_map(struct ice_vf *vf, + struct virtchnl_rdma_qv_info *qv_info) +{ + u16 glint_ceqctl_idx = ice_vf_get_glint_ceqctl_idx(vf, + qv_info->ceq_idx); + + u32 regval = (qv_info->v_idx & GLINT_CEQCTL_MSIX_INDX_M) | + ((qv_info->itr_idx << GLINT_CEQCTL_ITR_INDX_S) & + GLINT_CEQCTL_ITR_INDX_M) | GLINT_CEQCTL_CAUSE_ENA_M; + + wr32(&vf->pf->hw, GLINT_CEQCTL(glint_ceqctl_idx), regval); +} + +/** + * ice_vf_cfg_rdma_aeq_irq_map - configure the AEQ IRQ mapping + * @vf: VF structure associated with the VF that requested the mapping + * @qv_info: RDMA queue vector mapping information + * + * Configure the AEQ for the passed in VF. This will result in the AEQ being + * able to generate interrupts. + */ +static void +ice_vf_cfg_rdma_aeq_irq_map(struct ice_vf *vf, + struct virtchnl_rdma_qv_info *qv_info) +{ + u32 regval = (qv_info->v_idx & PFINT_AEQCTL_MSIX_INDX_M) | + ((qv_info->itr_idx << VPINT_AEQCTL_ITR_INDX_S) & + VPINT_AEQCTL_ITR_INDX_M) | VPINT_AEQCTL_CAUSE_ENA_M; + + wr32(&vf->pf->hw, VPINT_AEQCTL(vf->vf_id), regval); +} + +/** + * ice_vc_cfg_rdma_irq_map_msg - MSIX mapping of RDMA control queue interrupts + * @vf: VF structure associated with the VF that requested the mapping + * @msg: Message from the VF used to configure the RDMA mapping + * + * Handler for the VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP opcode in virtchnl. This + * causes the specified control queues to be mapped to the specified MSIX + * indices and ITR indices. Also, the control queue's interrupt will be + * enabled.
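Both CEQ/AEQ helpers above build their control register value the same way: MSIX index masked into the low bits, ITR index shifted into its field, plus a cause-enable bit. A hedged standalone model of that packing follows; the field widths and bit positions here are invented for the example, not the hardware's:

    #include <stdint.h>

    #define MSIX_INDX_M 0x7ff           /* example field masks/shifts only */
    #define ITR_INDX_S  11
    #define ITR_INDX_M  (0x3 << ITR_INDX_S)
    #define CAUSE_ENA_M (1u << 30)

    static uint32_t pack_ceqctl(uint16_t v_idx, uint8_t itr_idx)
    {
            return (v_idx & MSIX_INDX_M) |
                   (((uint32_t)itr_idx << ITR_INDX_S) & ITR_INDX_M) |
                   CAUSE_ENA_M; /* enable interrupt generation for the cause */
    }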
+ */ +static int ice_vc_cfg_rdma_irq_map_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_rss_key *vrk = - (struct virtchnl_rss_key *)msg; + struct virtchnl_rdma_qvlist_info *qvlist = + (struct virtchnl_rdma_qvlist_info *)msg; + u16 num_msix_per_vf; + u32 i; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + num_msix_per_vf = vf->pf->num_msix_per_vf; + if (qvlist->num_vectors > num_msix_per_vf) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + for (i = 0; i < qvlist->num_vectors; i++) { + struct virtchnl_rdma_qv_info *qv_info = &qvlist->qv_info[i]; + + if (qv_info->v_idx >= num_msix_per_vf) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (qv_info->ceq_idx == VIRTCHNL_RDMA_INVALID_QUEUE_IDX && + qv_info->aeq_idx == VIRTCHNL_RDMA_INVALID_QUEUE_IDX) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (qv_info->ceq_idx != VIRTCHNL_RDMA_INVALID_QUEUE_IDX && + qv_info->ceq_idx >= num_msix_per_vf) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (qv_info->aeq_idx != VIRTCHNL_RDMA_INVALID_QUEUE_IDX && + qv_info->aeq_idx >= num_msix_per_vf) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + } + + for (i = 0; i < qvlist->num_vectors; i++) { + struct virtchnl_rdma_qv_info *qv_info = &qvlist->qv_info[i]; + + if (qv_info->ceq_idx != VIRTCHNL_RDMA_INVALID_QUEUE_IDX) + ice_vf_cfg_rdma_ceq_irq_map(vf, qv_info); + + if (qv_info->aeq_idx != VIRTCHNL_RDMA_INVALID_QUEUE_IDX) + ice_vf_cfg_rdma_aeq_irq_map(vf, qv_info); + } + +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP, + v_ret, NULL, 0); +} + +/** + * ice_vc_clear_rdma_irq_map - clear mapped RDMA control queue interrupts + * @vf: VF structure associated to the VF that requested to release the mapping + * + * Handler for the VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP opcode in virtchnl. This + * causes all of the MSIX mapping of all the RDMA control queues to be cleared + * and disabled. + */ +static int ice_vc_clear_rdma_irq_map(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + ice_vf_clear_rdma_irq_map(vf); + +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP, + v_ret, NULL, 0); +} + +/** + * ice_vc_query_rxdid - query RXDID supported by DDP package + * @vf: pointer to VF info + * + * Called from VF to query a bitmap of supported flexible + * descriptor RXDIDs of a DDP package. + */ +static int ice_vc_query_rxdid(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_supported_rxdids *rxdid = NULL; + struct ice_hw *hw = &vf->pf->hw; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi = NULL; + int len = 0; + int ret, i; + u32 regval; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (!ice_vc_isvalid_vsi_id(vf, vrk->vsi_id)) { + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (vrk->key_len != ICE_VSIQF_HKEY_ARRAY_SIZE) { + len = sizeof(struct virtchnl_supported_rxdids); + rxdid = kzalloc(len, GFP_KERNEL); + if (!rxdid) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; + } + + /* RXDIDs supported by DDP package can be read from the register + * to get the supported RXDID bitmap. 
But the legacy 32byte RXDID + * is not listed in DDP package, add it in the bitmap manually. + * Legacy 16byte descriptor is not supported. + */ + rxdid->supported_rxdids |= BIT(ICE_RXDID_LEGACY_1); + + for (i = ICE_RXDID_FLEX_NIC; i < ICE_FLEX_DESC_RXDID_MAX_NUM; i++) { + regval = rd32(hw, GLFLXP_RXDID_FLAGS(i, 0)); + if ((regval >> GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S) + & GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M) + rxdid->supported_rxdids |= BIT(i); + } + + pf->supported_rxdids = rxdid->supported_rxdids; + +err: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_SUPPORTED_RXDIDS, + v_ret, (u8 *)rxdid, len); + kfree(rxdid); + return ret; +} + +/** + * ice_vf_init_vlan_stripping - enable/disable VLAN stripping on initialization + * @vf: VF to enable/disable VLAN stripping for on initialization + * + * Set the default for VLAN stripping based on whether a port VLAN is configured + * and the current VLAN mode of the device. + */ +static int ice_vf_init_vlan_stripping(struct ice_vf *vf) +{ + struct ice_vsi *vsi = ice_get_vf_vsi(vf); + + vf->vlan_strip_ena = 0; + + if (!vsi) + return -EINVAL; + + /* don't modify stripping if port VLAN is configured in SVM since the + * port VLAN is based on the inner/single VLAN in SVM + */ + if (ice_vf_is_port_vlan_ena(vf) && !ice_is_dvm_ena(&vsi->back->hw)) + return 0; + + if (ice_vf_vlan_offload_ena(vf->driver_caps)) { + int err = vsi->inner_vlan_ops.ena_stripping(vsi, ETH_P_8021Q); + + if (!err) + vf->vlan_strip_ena |= ICE_INNER_VLAN_STRIP_ENA; + + return err; + } + + return vsi->inner_vlan_ops.dis_stripping(vsi); +} + +/** + * ice_validate_tpid - validate the VLAN TPID + * @tpid: VLAN TPID + */ +static int ice_validate_tpid(u16 tpid) +{ + if (tpid == ETH_P_8021Q || + tpid == ETH_P_8021AD || + tpid == ETH_P_QINQ1) + return 0; + + return -EINVAL; +} + +/** + * ice_vc_dcf_vlan_offload_msg - send msg to handle VLAN offload from DCF + * @vf: pointer to VF info + * @msg: pointer to msg buffer + */ +static int ice_vc_dcf_vlan_offload_msg(struct ice_vf *vf, u8 *msg) +{ + struct virtchnl_dcf_vlan_offload *offload = (struct virtchnl_dcf_vlan_offload *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_dcf_vlan_info *dcf_vlan; + struct ice_pf *pf = vf->pf; + struct ice_vsi *target_vsi; + struct ice_vf *target_vf; + u16 insert_mode; + u16 strip_mode; + u16 vlan_flags; + u16 vlan_type; + + if (!ice_is_dvm_ena(&pf->hw) || !(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_VLAN_V2) || + !ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + vlan_flags = offload->vlan_flags; + insert_mode = (vlan_flags & VIRTCHNL_DCF_VLAN_INSERT_MODE_M) >> + VIRTCHNL_DCF_VLAN_INSERT_MODE_S; + strip_mode = (vlan_flags & VIRTCHNL_DCF_VLAN_STRIP_MODE_M) >> + VIRTCHNL_DCF_VLAN_STRIP_MODE_S; + vlan_type = (vlan_flags & VIRTCHNL_DCF_VLAN_TYPE_M) >> + VIRTCHNL_DCF_VLAN_TYPE_S; + + if (ice_validate_vf_id(pf, offload->vf_id) || ice_validate_tpid(offload->tpid) || + (!insert_mode && !strip_mode) || vlan_type != VIRTCHNL_DCF_VLAN_TYPE_OUTER || + offload->vlan_id >= VLAN_N_VID) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { + target_vf = &pf->vf[offload->vf_id]; + if (ice_check_vf_ready_for_cfg(target_vf)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + target_vsi = ice_get_vf_vsi(target_vf); + if (!target_vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + 
goto err; } - if (ice_set_rss(vsi, vrk->key, NULL, 0)) - v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; -error_param: - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_KEY, v_ret, - NULL, 0); + if (ice_vf_is_port_vlan_ena(target_vf) || ice_vsi_has_non_zero_vlans(target_vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + dcf_vlan = &target_vf->dcf_vlan_info; + + if (insert_mode == VIRTCHNL_DCF_VLAN_INSERT_DISABLE) { + if (dcf_vlan->outer_port_vlan.vid) { + dcf_vlan->outer_port_vlan.vid = 0; + dcf_vlan->applying = 1; + } + } else if (insert_mode == VIRTCHNL_DCF_VLAN_INSERT_PORT_BASED) { + if (strip_mode) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (dcf_vlan->outer_port_vlan.tpid != offload->tpid || + dcf_vlan->outer_port_vlan.vid != offload->vlan_id) { + dcf_vlan->outer_port_vlan.tpid = offload->tpid; + dcf_vlan->outer_port_vlan.vid = offload->vlan_id; + dcf_vlan->outer_port_vlan.prio = 0; + dcf_vlan->outer_port_vlan.fwd_act = ICE_FWD_TO_VSI; + dcf_vlan->applying = 1; + } + } else if (insert_mode) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + if (strip_mode == VIRTCHNL_DCF_VLAN_STRIP_DISABLE) { + if (dcf_vlan->outer_stripping_ena) { + dcf_vlan->outer_stripping_ena = 0; + dcf_vlan->applying = 1; + } + } else if (strip_mode == VIRTCHNL_DCF_VLAN_STRIP_INTO_RX_DESC) { + if (dcf_vlan->outer_stripping_tpid != offload->tpid || + !dcf_vlan->outer_stripping_ena) { + if (ice_vsi_is_rxq_crc_strip_dis(target_vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + dcf_vlan->outer_stripping_tpid = offload->tpid; + dcf_vlan->outer_stripping_ena = 1; + dcf_vlan->applying = 1; + } + } else if (strip_mode) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto err; + } + + if (dcf_vlan->applying) + ice_vc_reset_vf(target_vf); + +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_VLAN_OFFLOAD, + v_ret, NULL, 0); } /** - * ice_vc_config_rss_lut + * ice_dcf_handle_aq_cmd - handle the AdminQ command from DCF to FW * @vf: pointer to the VF info - * @msg: pointer to the msg buffer + * @aq_desc: the AdminQ command descriptor + * @aq_buf: the AdminQ command buffer if aq_buf_size is non-zero + * @aq_buf_size: the AdminQ command buffer size * - * Configure the VF's RSS LUT + * The VF splits the AdminQ command into two parts: one is the descriptor of + * AdminQ command, the other is the buffer of AdminQ command (the descriptor + * has BUF flag set). When both of them are received by PF, this function will + * forward them to firmware once to get the AdminQ's response. And also, the + * filled descriptor and buffer of the response will be sent back to VF one by + * one through the virtchnl. 
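To make the two-part flow described above concrete, here is a hedged, self-contained model of the descriptor/buffer reassembly contract; the struct layout and AQ_FLAG_BUF value are invented for the example and do not match the real ice_aq_desc:

    #include <stdbool.h>
    #include <stdint.h>

    struct aq_desc { uint16_t flags, datalen; };
    #define AQ_FLAG_BUF 0x1000 /* hypothetical flag value */

    static struct aq_desc pending;
    static bool pending_valid;

    /* phase 1: a descriptor with the BUF flag set is parked until its
     * buffer arrives; one without it is complete and handled now */
    static bool on_desc(const struct aq_desc *d)
    {
            if (d->flags & AQ_FLAG_BUF) {
                    pending = *d;
                    pending_valid = true;
                    return false; /* wait for the buffer message */
            }
            return true;          /* forward to FW immediately */
    }

    /* phase 2: the buffer message completes the parked descriptor */
    static bool on_buf(void)
    {
            if (!pending_valid)
                    return false; /* buffer without a descriptor is rejected */
            pending_valid = false;
            return true;
    }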
*/ -static int ice_vc_config_rss_lut(struct ice_vf *vf, u8 *msg) +static int +ice_dcf_handle_aq_cmd(struct ice_vf *vf, struct ice_aq_desc *aq_desc, + u8 *aq_buf, u16 aq_buf_size) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + enum virtchnl_ops v_op; + enum ice_status aq_ret; + u16 v_msg_len = 0; + u8 *v_msg = NULL; + int ret; + + pf->dcf.aq_desc_received = false; + + if ((aq_buf && !aq_buf_size) || (!aq_buf && aq_buf_size)) + return -EINVAL; + + if (ice_dcf_is_acl_aq_cmd(aq_desc) && !ice_dcf_is_acl_capable(&pf->hw)) + return 0; + + if (ice_dcf_is_udp_tunnel_aq_cmd(aq_desc, aq_buf) && + !(pf->hw.dcf_caps & DCF_UDP_TUNNEL_CAP) && + !ice_is_tunnel_empty(&pf->hw)) + return 0; + + if (ice_dcf_pre_aq_send_cmd(vf, aq_desc, aq_buf, aq_buf_size)) { + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_CMD_DESC, + VIRTCHNL_STATUS_SUCCESS, + (u8 *)aq_desc, sizeof(*aq_desc)); + if (ret || !aq_buf_size) + return ret; + + v_op = VIRTCHNL_OP_DCF_CMD_BUFF; + v_ret = VIRTCHNL_STATUS_SUCCESS; + goto err; + } + + aq_ret = ice_aq_send_cmd(&pf->hw, aq_desc, aq_buf, aq_buf_size, NULL); + /* The AQ response message still needs to be sent back if + * ICE_ERR_AQ_ERROR is returned, since some AdminQ handlers will use + * the error code filled by the FW to do exception handling. + */ + if (aq_ret && aq_ret != ICE_ERR_AQ_ERROR) { + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; + v_op = VIRTCHNL_OP_DCF_CMD_DESC; + goto err; + } + + if (aq_ret != ICE_ERR_AQ_ERROR) { + v_ret = ice_dcf_post_aq_send_cmd(pf, aq_desc, aq_buf); + if (v_ret != VIRTCHNL_STATUS_SUCCESS) { + v_op = VIRTCHNL_OP_DCF_CMD_DESC; + goto err; + } + + v_ret = ice_dcf_update_acl_rule_info(pf, aq_desc, aq_buf); + if (v_ret != VIRTCHNL_STATUS_SUCCESS) { + v_op = VIRTCHNL_OP_DCF_CMD_DESC; + goto err; + } + } + + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_CMD_DESC, v_ret, + (u8 *)aq_desc, sizeof(*aq_desc)); + /* Bail out so we don't send the VIRTCHNL_OP_DCF_CMD_BUFF message + * below if a failure happens or there is no AdminQ command buffer. + */ + if (ret || !aq_buf_size) + return ret; + + v_op = VIRTCHNL_OP_DCF_CMD_BUFF; + v_msg_len = le16_to_cpu(aq_desc->datalen); + + /* buffer is not updated if data length exceeds buffer size */ + if (v_msg_len > aq_buf_size) + v_msg_len = 0; + else if (v_msg_len) + v_msg = aq_buf; + + /* send the response back to the VF */ +err: + return ice_vc_send_msg_to_vf(vf, v_op, v_ret, v_msg, v_msg_len); +} + +/** + * ice_vc_dcf_cmd_desc_msg - handle the DCF AdminQ command descriptor + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer which holds the command descriptor + * @len: length of the message + */ +static int ice_vc_dcf_cmd_desc_msg(struct ice_vf *vf, u8 *msg, u16 len) +{ + struct ice_aq_desc *aq_desc = (struct ice_aq_desc *)msg; + struct ice_pf *pf = vf->pf; + + if (!ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON) + goto err; + + if (len != sizeof(*aq_desc) || !ice_dcf_aq_cmd_permitted(aq_desc)) { + /* Clear this to avoid a VIRTCHNL_OP_DCF_CMD_DESC message with + * the ICE_AQ_FLAG_BUF set being followed by another bad + * VIRTCHNL_OP_DCF_CMD_DESC message. + */ + pf->dcf.aq_desc_received = false; + goto err; + } + + /* The AdminQ descriptor needs to be stored for use when the following + * VIRTCHNL_OP_DCF_CMD_BUFF is received. + */
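Note that the parked descriptor is also time-bounded: the buffer message must arrive before an expiry deadline (set below from jiffies plus a timeout). A minimal sketch of that expiry check, with "now" standing in for jiffies and the struct name hypothetical:

    #include <stdbool.h>
    #include <stdint.h>

    struct pending_desc { bool valid; uint64_t expires; };

    static void park(struct pending_desc *p, uint64_t now, uint64_t timeout)
    {
            p->valid = true;
            p->expires = now + timeout; /* mirrors jiffies + ICE_DCF_AQ_DESC_TIMEOUT */
    }

    /* a parked descriptor is only usable until its deadline passes */
    static bool pending_usable(const struct pending_desc *p, uint64_t now)
    {
            return p->valid && now < p->expires;
    }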
+ if (aq_desc->flags & cpu_to_le16(ICE_AQ_FLAG_BUF)) { + pf->dcf.aq_desc = *aq_desc; + pf->dcf.aq_desc_received = true; + pf->dcf.aq_desc_expires = jiffies + ICE_DCF_AQ_DESC_TIMEOUT; + return 0; + } + + return ice_dcf_handle_aq_cmd(vf, aq_desc, NULL, 0); + + /* send the response back to the VF */ +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_CMD_DESC, + VIRTCHNL_STATUS_ERR_PARAM, NULL, 0); +} + +/** + * ice_vc_dcf_cmd_buff_msg - handle the DCF AdminQ command buffer + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer which holds the command buffer + * @len: length of the message + */ +static int ice_vc_dcf_cmd_buff_msg(struct ice_vf *vf, u8 *msg, u16 len) +{ + struct ice_pf *pf = vf->pf; + + if (!ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON || + !len || !pf->dcf.aq_desc_received || + time_is_before_jiffies(pf->dcf.aq_desc_expires)) + goto err; + + return ice_dcf_handle_aq_cmd(vf, &pf->dcf.aq_desc, msg, len); + + /* send the response back to the VF */ +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_CMD_BUFF, + VIRTCHNL_STATUS_ERR_PARAM, NULL, 0); +} + +static int ice_vc_flush_dcf_rule(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct ice_pf *pf = vf->pf; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!ice_is_vf_dcf(vf)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + ice_rm_all_dcf_sw_rules(pf); + +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_RULE_FLUSH, + v_ret, NULL, 0); +} + +/** + * ice_vc_dis_dcf_cap - disable DCF capability for the VF + * @vf: pointer to the VF + */ +static int ice_vc_dis_dcf_cap(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!ice_is_vf_dcf(vf)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (vf->driver_caps & VIRTCHNL_VF_CAP_DCF) { +#ifndef HAVE_NDO_SET_VF_TRUST + ice_set_vf_trust(ice_get_main_vsi(vf->pf)->netdev, vf->vf_id, false); +#endif /* !HAVE_NDO_SET_VF_TRUST */ + vf->driver_caps &= ~VIRTCHNL_VF_CAP_DCF; + ice_rm_all_dcf_sw_rules(vf->pf); + ice_clear_dcf_acl_cfg(vf->pf); + ice_clear_dcf_udp_tunnel_cfg(vf->pf); + vf->pf->hw.dcf_caps &= ~(DCF_ACL_CAP | DCF_UDP_TUNNEL_CAP); + ice_dcf_set_state(vf->pf, ICE_DCF_STATE_OFF); + vf->pf->dcf.vf = NULL; + } +err: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_DISABLE, + v_ret, NULL, 0); +} + +/** + * ice_vc_dcf_get_vsi_map - get VSI mapping table + * @vf: pointer to the VF info + */ +static int ice_vc_dcf_get_vsi_map(struct ice_vf *vf) { - struct virtchnl_rss_lut *vrl = (struct virtchnl_rss_lut *)msg; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_dcf_vsi_map *vsi_map = NULL; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi = NULL; + struct ice_vsi *pf_vsi; + u16 len = 0; + int vf_id; + int ret; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (!ice_vc_isvalid_vsi_id(vf, vrl->vsi_id)) { + if (!ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (vrl->lut_entries != ICE_VSIQF_HLUT_ARRAY_SIZE) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + len = struct_size(vsi_map, vf_vsi, pf->num_alloc_vfs - 1); + vsi_map = kzalloc(len, GFP_KERNEL); + if (!vsi_map) { +
v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; } - if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + pf_vsi = ice_get_main_vsi(pf); + if (!pf_vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + len = 0; + goto err; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + vsi_map->pf_vsi = pf_vsi->vsi_num; + vsi_map->num_vfs = pf->num_alloc_vfs; + + ice_for_each_vf(pf, vf_id) { + struct ice_vf *tmp_vf = &pf->vf[vf_id]; + + if (!ice_is_vf_disabled(tmp_vf) && + test_bit(ICE_VF_STATE_INIT, tmp_vf->vf_states)) + vsi_map->vf_vsi[vf_id] = tmp_vf->lan_vsi_num | + VIRTCHNL_DCF_VF_VSI_VALID; } - if (ice_set_rss(vsi, NULL, vrl->lut, ICE_VSIQF_HLUT_ARRAY_SIZE)) - v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; -error_param: - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_LUT, v_ret, - NULL, 0); +err: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_GET_VSI_MAP, v_ret, + (u8 *)vsi_map, len); + kfree(vsi_map); + return ret; } /** - * ice_vc_get_stats_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer + * ice_vc_dcf_query_pkg_info - query DDP package info from PF + * @vf: pointer to VF info * - * called from the VF to get VSI stats + * Called from VF to query DDP package information loaded in PF, + * including track ID, package name, version and device serial + * number. */ -static int ice_vc_get_stats_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_dcf_query_pkg_info(struct ice_vf *vf) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_queue_select *vqs = - (struct virtchnl_queue_select *)msg; - struct ice_eth_stats stats = { 0 }; + struct virtchnl_pkg_info *pkg_info = NULL; + struct ice_hw *hw = &vf->pf->hw; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; + int len = 0; + int ret; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + if (!ice_is_vf_dcf(vf) || ice_dcf_get_state(pf) != ICE_DCF_STATE_ON) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto err; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + len = sizeof(struct virtchnl_pkg_info); + pkg_info = kzalloc(len, GFP_KERNEL); + if (!pkg_info) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; } - ice_update_eth_stats(vsi); - - stats = vsi->eth_stats; + pkg_info->track_id = hw->active_track_id; + memcpy(&pkg_info->pkg_ver, &hw->active_pkg_ver, + sizeof(pkg_info->pkg_ver)); + memcpy(pkg_info->pkg_name, hw->active_pkg_name, + sizeof(pkg_info->pkg_name)); + memcpy(pkg_info->dsn, pf->dcf.dsn, sizeof(pkg_info->dsn)); -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_STATS, v_ret, - (u8 *)&stats, sizeof(stats)); +err: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DCF_GET_PKG_INFO, + v_ret, (u8 *)pkg_info, len); + kfree(pkg_info); + return ret; } /** - * ice_vc_ena_qs_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * called from the VF to enable all or specific queue(s) + * ice_vc_get_max_rss_qregion - message handling for VIRTCHNL_OP_GET_MAX_RSS_QREGION + * @vf: source of the request */ -static int ice_vc_ena_qs_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_get_max_rss_qregion(struct ice_vf *vf) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_queue_select *vqs = - (struct 
virtchnl_queue_select *)msg; - struct ice_pf *pf = vf->pf; + struct virtchnl_max_rss_qregion *max_rss_qregion = NULL; struct ice_vsi *vsi; - unsigned long q_map; - u16 vf_q_id; + int err, len = 0; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + vsi = vf->pf->vsi[vf->lan_vsi_idx]; + if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!vqs->rx_queues && !vqs->tx_queues) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + max_rss_qregion = kzalloc(sizeof(*max_rss_qregion), GFP_KERNEL); + if (!max_rss_qregion) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; goto error_param; } - if (vqs->rx_queues > ICE_MAX_BASE_QS_PER_VF || - vqs->tx_queues > ICE_MAX_BASE_QS_PER_VF) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + len = sizeof(*max_rss_qregion); - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + max_rss_qregion->vport_id = vf->lan_vsi_num; + max_rss_qregion->qregion_width = ilog2(ICE_MAX_RSS_QS_PER_VF); + if (vsi->global_lut_id) + max_rss_qregion->qregion_width = ilog2(ICE_MAX_RSS_QS_PER_LARGE_VF); - /* Enable only Rx rings, Tx rings were enabled by the FW when the - * Tx queue group list was configured and the context bits were - * programmed using ice_vsi_cfg_txqs - */ - q_map = vqs->rx_queues; - for_each_set_bit(vf_q_id, &q_map, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +error_param: + err = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_MAX_RSS_QREGION, v_ret, + (u8 *)max_rss_qregion, len); + kfree(max_rss_qregion); + return err; +} - /* Skip queue if enabled */ - if (test_bit(vf_q_id, vf->rxq_ena)) - continue; +static bool ice_vc_supported_queue_type(s32 queue_type) +{ + return (queue_type == VIRTCHNL_QUEUE_TYPE_RX || queue_type == VIRTCHNL_QUEUE_TYPE_TX); +} - if (ice_vsi_ctrl_rx_ring(vsi, true, vf_q_id)) { - dev_err(&vsi->back->pdev->dev, - "Failed to enable Rx ring %d on VSI %d\n", - vf_q_id, vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +/** + * ice_vc_validate_qs_v2_msg - validate all parameters sent in the qs_msg structure + * @vf: VF the message was received from + * @qs_msg: contents of the message from the VF + * + * Used to validate both the VIRTCHNL_OP_ENABLE_QUEUES_V2 and VIRTCHNL_OP_DISABLE_QUEUES_V2 + * messages. This should always be called before attempting to enable and/or disable queues on + * behalf of a VF in response to the previously mentioned opcodes. If all checks succeed, then + * return true, indicating to the caller that the qs_msg is valid. Otherwise return false, + * indicating to the caller that the qs_msg is invalid.
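The checks this validator performs can be modeled compactly: every chunk must use a supported queue type, be non-empty, and fit inside the VF's allocated queue range. A hedged standalone sketch with invented type and field names:

    #include <stdbool.h>
    #include <stdint.h>

    enum q_type { Q_RX, Q_TX, Q_OTHER };
    struct q_chunk { enum q_type type; uint32_t start, num; };

    /* reject empty lists, unsupported types, empty chunks, and chunks
     * that reach past the VF's queue range [0, num_vf_qs) */
    static bool chunks_valid(const struct q_chunk *c, uint16_t n,
                             uint32_t num_vf_qs)
    {
            if (!n)
                    return false;
            for (uint16_t i = 0; i < n; i++) {
                    if (c[i].type != Q_RX && c[i].type != Q_TX)
                            return false;
                    if (!c[i].num || c[i].start + c[i].num > num_vf_qs)
                            return false;
            }
            return true;
    }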
+ */ +static bool ice_vc_validate_qs_v2_msg(struct ice_vf *vf, struct virtchnl_del_ena_dis_queues *qs_msg) +{ + struct virtchnl_queue_chunks *chunks = &qs_msg->chunks; + int i; + + if (qs_msg->vport_id != vf->lan_vsi_num) + return false; + + if (!chunks->num_chunks) + return false; - set_bit(vf_q_id, vf->rxq_ena); - vf->num_qs_ena++; + for (i = 0; i < chunks->num_chunks; i++) { + if (!ice_vc_supported_queue_type(chunks->chunks[i].type)) + return false; + + if (!chunks->chunks[i].num_queues) + return false; + + if (chunks->chunks[i].start_queue_id + chunks->chunks[i].num_queues > vf->num_vf_qs) + return false; } - vsi = pf->vsi[vf->lan_vsi_idx]; - q_map = vqs->tx_queues; - for_each_set_bit(vf_q_id, &q_map, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + return true; +} - /* Skip queue if enabled */ - if (test_bit(vf_q_id, vf->txq_ena)) - continue; +#define ice_for_each_q_in_chunk(chunk, q_id) \ + for ((q_id) = (chunk)->start_queue_id; \ + (q_id) < (chunk)->start_queue_id + (chunk)->num_queues; \ + (q_id)++) + +static int ice_vc_ena_rxq_chunk(struct ice_vf *vf, struct virtchnl_queue_chunk *chunk) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + int q_id; + + if (!vsi) + return -EINVAL; + + ice_for_each_q_in_chunk(chunk, q_id) { + int err = ice_vf_vsi_ena_single_rxq(vf, vsi, q_id, q_id); - set_bit(vf_q_id, vf->txq_ena); - vf->num_qs_ena++; + if (err) + return err; } - /* Set flag to indicate that queues are enabled */ - if (v_ret == VIRTCHNL_STATUS_SUCCESS) - set_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); + return 0; +} -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_QUEUES, v_ret, - NULL, 0); +static int ice_vc_ena_txq_chunk(struct ice_vf *vf, struct virtchnl_queue_chunk *chunk) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + int q_id; + + if (!vsi) + return -EINVAL; + + ice_for_each_q_in_chunk(chunk, q_id) + ice_vf_vsi_ena_single_txq(vf, vsi, q_id, q_id); + + return 0; } /** - * ice_vc_dis_qs_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * called from the VF to disable all or specific - * queue(s) + * ice_vc_ena_qs_v2_msg - message handling for VIRTCHNL_OP_ENABLE_QUEUES_V2 + * @vf: source of the request + * @msg: message to handle */ -static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_ena_qs_v2_msg(struct ice_vf *vf, u8 *msg) { + struct virtchnl_del_ena_dis_queues *ena_qs_msg = (struct virtchnl_del_ena_dis_queues *)msg; + struct virtchnl_queue_chunks *chunks = &ena_qs_msg->chunks; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_queue_select *vqs = - (struct virtchnl_queue_select *)msg; - struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - unsigned long q_map; - u16 vf_q_id; + int i; - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) && - !test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) { + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!ice_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + if (!ice_vc_isvalid_vsi_id(vf, ena_qs_msg->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!vqs->rx_queues && !vqs->tx_queues) { + if (!ice_vc_validate_qs_v2_msg(vf, ena_qs_msg)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (vqs->rx_queues > ICE_MAX_BASE_QS_PER_VF || - vqs->tx_queues > ICE_MAX_BASE_QS_PER_VF) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto 
error_param; - } + for (i = 0; i < chunks->num_chunks; i++) { + struct virtchnl_queue_chunk *chunk = &chunks->chunks[i]; - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + if (chunk->type == VIRTCHNL_QUEUE_TYPE_RX && ice_vc_ena_rxq_chunk(vf, chunk)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + else if (chunk->type == VIRTCHNL_QUEUE_TYPE_TX && ice_vc_ena_txq_chunk(vf, chunk)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; - if (vqs->tx_queues) { - q_map = vqs->tx_queues; + if (v_ret != VIRTCHNL_STATUS_SUCCESS) + goto error_param; + } - for_each_set_bit(vf_q_id, &q_map, ICE_MAX_BASE_QS_PER_VF) { - struct ice_ring *ring = vsi->tx_rings[vf_q_id]; - struct ice_txq_meta txq_meta = { 0 }; + set_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_QUEUES_V2, v_ret, NULL, 0); +} - /* Skip queue if not enabled */ - if (!test_bit(vf_q_id, vf->txq_ena)) - continue; +static int ice_vc_dis_rxq_chunk(struct ice_vf *vf, struct virtchnl_queue_chunk *chunk) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + u16 q_id; - ice_fill_txq_meta(vsi, ring, &txq_meta); + if (!vsi) + return -EINVAL; - if (ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, vf->vf_id, - ring, &txq_meta)) { - dev_err(&vsi->back->pdev->dev, - "Failed to stop Tx ring %d on VSI %d\n", - vf_q_id, vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + ice_for_each_q_in_chunk(chunk, q_id) { + int err; - /* Clear enabled queues flag */ - clear_bit(vf_q_id, vf->txq_ena); - vf->num_qs_ena--; - } + err = ice_vf_vsi_dis_single_rxq(vf, vsi, q_id, q_id); + if (err) + return err; } - if (vqs->rx_queues) { - q_map = vqs->rx_queues; + return 0; +} - for_each_set_bit(vf_q_id, &q_map, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +static int ice_vc_dis_txq_chunk(struct ice_vf *vf, struct virtchnl_queue_chunk *chunk) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + u16 q_id; - /* Skip queue if not enabled */ - if (!test_bit(vf_q_id, vf->rxq_ena)) - continue; + if (!vsi) + return -EINVAL; - if (ice_vsi_ctrl_rx_ring(vsi, false, vf_q_id)) { - dev_err(&vsi->back->pdev->dev, - "Failed to stop Rx ring %d on VSI %d\n", - vf_q_id, vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + ice_for_each_q_in_chunk(chunk, q_id) { + int err; - /* Clear enabled queues flag */ - clear_bit(vf_q_id, vf->rxq_ena); - vf->num_qs_ena--; - } + err = ice_vf_vsi_dis_single_txq(vf, vsi, q_id, q_id); + if (err) + return err; } - /* Clear enabled queues flag */ - if (v_ret == VIRTCHNL_STATUS_SUCCESS && !vf->num_qs_ena) - clear_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); - -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_QUEUES, v_ret, - NULL, 0); + return 0; } /** - * ice_vc_cfg_irq_map_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * called from the VF to configure the IRQ to queue map + * ice_vc_dis_qs_v2_msg - message handling for VIRTCHNL_OP_DISABLE_QUEUES_V2 + * @vf: source of the request + * @msg: message to handle */ -static int ice_vc_cfg_irq_map_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_dis_qs_v2_msg(struct ice_vf *vf, u8 *msg) { + struct virtchnl_del_ena_dis_queues *dis_qs_msg = (struct virtchnl_del_ena_dis_queues *)msg; + 
struct virtchnl_queue_chunks *chunks = &dis_qs_msg->chunks; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_irq_map_info *irqmap_info; - u16 vsi_id, vsi_q_id, vector_id; - struct virtchnl_vector_map *map; - struct ice_pf *pf = vf->pf; - u16 num_q_vectors_mapped; - struct ice_vsi *vsi; - unsigned long qmap; int i; - irqmap_info = (struct virtchnl_irq_map_info *)msg; - num_q_vectors_mapped = irqmap_info->num_vectors; - - /* Check to make sure number of VF vectors mapped is not greater than - * number of VF vectors originally allocated, and check that - * there is actually at least a single VF queue vector mapped - */ - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || - pf->num_vf_msix < num_q_vectors_mapped || - !irqmap_info->num_vectors) { + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { + if (!ice_vc_isvalid_vsi_id(vf, dis_qs_msg->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - for (i = 0; i < num_q_vectors_mapped; i++) { - struct ice_q_vector *q_vector; + if (!ice_vc_validate_qs_v2_msg(vf, dis_qs_msg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } - map = &irqmap_info->vecmap[i]; + for (i = 0; i < chunks->num_chunks; i++) { + struct virtchnl_queue_chunk *chunk = &chunks->chunks[i]; - vector_id = map->vector_id; - vsi_id = map->vsi_id; - /* validate msg params */ - if (!(vector_id < pf->hw.func_caps.common_cap - .num_msix_vectors) || !ice_vc_isvalid_vsi_id(vf, vsi_id) || - (!vector_id && (map->rxq_map || map->txq_map))) { + if (chunk->type == VIRTCHNL_QUEUE_TYPE_RX && ice_vc_dis_rxq_chunk(vf, chunk)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + else if (chunk->type == VIRTCHNL_QUEUE_TYPE_TX && ice_vc_dis_txq_chunk(vf, chunk)) v_ret = VIRTCHNL_STATUS_ERR_PARAM; + + if (v_ret != VIRTCHNL_STATUS_SUCCESS) goto error_param; - } + } - /* No need to map VF miscellaneous or rogue vector */ - if (!vector_id) - continue; + if (ice_vf_has_no_qs_ena(vf)) + clear_bit(ICE_VF_STATE_QS_ENA, vf->vf_states); - /* Subtract non queue vector from vector_id passed by VF - * to get actual number of VSI queue vector array index - */ - q_vector = vsi->q_vectors[vector_id - ICE_NONQ_VECS_VF]; - if (!q_vector) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_QUEUES_V2, v_ret, NULL, 0); +} - /* lookout for the invalid queue index */ - qmap = map->rxq_map; - q_vector->num_ring_rx = 0; - for_each_set_bit(vsi_q_id, &qmap, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vsi_id, vsi_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - q_vector->num_ring_rx++; - q_vector->rx.itr_idx = map->rxitr_idx; - vsi->rx_rings[vsi_q_id]->q_vector = q_vector; - ice_cfg_rxq_interrupt(vsi, vsi_q_id, vector_id, - q_vector->rx.itr_idx); - } +/** + * ice_vc_validate_qv_maps - validate parameters sent in the qs_msg structure + * @vf: VF the message was received from + * @qv_maps: contents of the message from the VF + * + * Used to validate VIRTCHNL_OP_MAP_QUEUE_VECTOR messages. This should always be called before + * attempting to map interrupts to queues. If all checks succeed, then return true, indicating + * to the caller that the qv_maps are valid. Otherwise return false, indicating to the caller + * that the qv_maps are invalid.
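This validator's bounds checks differ from the queue-chunk case in one detail: the vector ID may range over the VSI's queue vectors plus one reserved non-queue (mailbox) vector. A minimal sketch, with NONQ_VECS standing in for ICE_NONQ_VECS_VF and the struct invented for the example:

    #include <stdbool.h>
    #include <stdint.h>

    #define NONQ_VECS 1 /* stand-in for ICE_NONQ_VECS_VF */

    struct qv_map { uint32_t queue_id, vector_id; };

    /* a map is valid only if the queue is within the VF's range and the
     * vector is within the VSI's vectors plus the reserved non-queue one */
    static bool qv_maps_valid(const struct qv_map *m, uint16_t n,
                              uint32_t num_vf_qs, uint32_t num_q_vectors)
    {
            if (!n)
                    return false;
            for (uint16_t i = 0; i < n; i++)
                    if (m[i].queue_id >= num_vf_qs ||
                        m[i].vector_id >= num_q_vectors + NONQ_VECS)
                            return false;
            return true;
    }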
+ */ +static bool ice_vc_validate_qv_maps(struct ice_vf *vf, struct virtchnl_queue_vector_maps *qv_maps) +{ + struct ice_vsi *vsi; + int i; + + vsi = vf->pf->vsi[vf->lan_vsi_idx]; + if (!vsi) + return false; + + if (!qv_maps->num_qv_maps) + return false; - qmap = map->txq_map; - q_vector->num_ring_tx = 0; - for_each_set_bit(vsi_q_id, &qmap, ICE_MAX_BASE_QS_PER_VF) { - if (!ice_vc_isvalid_q_id(vf, vsi_id, vsi_q_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - q_vector->num_ring_tx++; - q_vector->tx.itr_idx = map->txitr_idx; - vsi->tx_rings[vsi_q_id]->q_vector = q_vector; - ice_cfg_txq_interrupt(vsi, vsi_q_id, vector_id, - q_vector->tx.itr_idx); - } + for (i = 0; i < qv_maps->num_qv_maps; i++) { + if (!ice_vc_supported_queue_type(qv_maps->qv_maps[i].queue_type)) + return false; + + if (qv_maps->qv_maps[i].queue_id >= vf->num_vf_qs) + return false; + + if (qv_maps->qv_maps[i].vector_id >= (vsi->num_q_vectors + ICE_NONQ_VECS_VF)) + return false; } -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_IRQ_MAP, v_ret, - NULL, 0); + return true; } /** - * ice_vc_cfg_qs_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * called from the VF to configure the Rx/Tx queues + * ice_vc_map_q_vector_msg - message handling for VIRTCHNL_OP_MAP_QUEUE_VECTOR + * @vf: source of the request + * @msg: message to handle */ -static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_map_q_vector_msg(struct ice_vf *vf, u8 *msg) { + struct virtchnl_queue_vector_maps *qv_maps = (struct virtchnl_queue_vector_maps *)msg; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_vsi_queue_config_info *qci = - (struct virtchnl_vsi_queue_config_info *)msg; - struct virtchnl_queue_pair_info *qpi; - u16 num_rxq = 0, num_txq = 0; - struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; int i; @@ -2242,655 +8727,1156 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) goto error_param; } - if (!ice_vc_isvalid_vsi_id(vf, qci->vsi_id)) { + if (!ice_vc_isvalid_vsi_id(vf, qv_maps->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!vsi) { + if (!ice_vc_validate_qv_maps(vf, qv_maps)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (qci->num_queue_pairs > ICE_MAX_BASE_QS_PER_VF || - qci->num_queue_pairs > min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)) { - dev_err(&pf->pdev->dev, - "VF-%d requesting more than supported number of queues: %d\n", - vf->vf_id, min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)); + vsi = vf->pf->vsi[vf->lan_vsi_idx]; + if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - for (i = 0; i < qci->num_queue_pairs; i++) { - qpi = &qci->qpair[i]; - if (qpi->txq.vsi_id != qci->vsi_id || - qpi->rxq.vsi_id != qci->vsi_id || - qpi->rxq.queue_id != qpi->txq.queue_id || - qpi->txq.headwb_enabled || - !ice_vc_isvalid_ring_len(qpi->txq.ring_len) || - !ice_vc_isvalid_ring_len(qpi->rxq.ring_len) || - !ice_vc_isvalid_q_id(vf, qci->vsi_id, qpi->txq.queue_id)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - /* copy Tx queue info from VF into VSI */ - if (qpi->txq.ring_len > 0) { - num_txq++; - vsi->tx_rings[i]->dma = qpi->txq.dma_ring_addr; - vsi->tx_rings[i]->count = qpi->txq.ring_len; - } + for (i = 0; i < qv_maps->num_qv_maps; i++) { + struct virtchnl_queue_vector *qv_map = &qv_maps->qv_maps[i]; - /* copy Rx queue info from VF into VSI */ - if (qpi->rxq.ring_len > 0) { - num_rxq++; - 
vsi->rx_rings[i]->dma = qpi->rxq.dma_ring_addr; - vsi->rx_rings[i]->count = qpi->rxq.ring_len; + if (qv_map->queue_type == VIRTCHNL_QUEUE_TYPE_RX) + ice_cfg_rxq_interrupt(vsi, qv_map->queue_id, qv_map->vector_id, + qv_map->itr_idx); + else if (qv_map->queue_type == VIRTCHNL_QUEUE_TYPE_TX) + ice_cfg_txq_interrupt(vsi, qv_map->queue_id, qv_map->vector_id, + qv_map->itr_idx); + } - if (qpi->rxq.databuffer_size != 0 && - (qpi->rxq.databuffer_size > ((16 * 1024) - 128) || - qpi->rxq.databuffer_size < 1024)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - vsi->rx_buf_len = qpi->rxq.databuffer_size; - vsi->rx_rings[i]->rx_buf_len = vsi->rx_buf_len; - if (qpi->rxq.max_pkt_size >= (16 * 1024) || - qpi->rxq.max_pkt_size < 64) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - } +error_param: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_MAP_QUEUE_VECTOR, v_ret, NULL, 0); +} + +static u16 ice_vc_get_max_vlan_fltrs(struct ice_vf *vf) +{ + if (vf->trusted) + return VLAN_N_VID; + else + return ICE_MAX_VLAN_PER_VF; +} + +/** + * ice_vf_outer_vlan_not_allowed - check if the outer VLAN can be used when the device is in DVM + * @vf: VF that is being checked + */ +static bool ice_vf_outer_vlan_not_allowed(struct ice_vf *vf) +{ + if (ice_vf_is_port_vlan_ena(vf)) + return true; + + if (vf->dcf_vlan_info.outer_port_vlan.vid || + vf->dcf_vlan_info.outer_stripping_ena) + return true; + + return false; +} + +/** + * ice_vc_set_dvm_caps - set VLAN capabilities when the device is in DVM + * @vf: VF that capabilities are being set for + * @caps: VLAN capabilities to populate + * + * Determine VLAN capabilities support based on whether a port VLAN is + * configured. If a port VLAN is configured then the VF should use the inner + * filtering/offload capabilities since the port VLAN is using the outer VLAN + * capabilities. + */
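The DVM/SVM capability split reduces to one decision: if the outer VLAN is already claimed (port VLAN or DCF), only inner, software-offloaded VLAN capabilities are advertised; otherwise the outer capabilities open up too. A hedged sketch of that selection, with example capability bits that do not match the real virtchnl flag values:

    #include <stdint.h>

    #define CAP_ETHERTYPE_8100 0x1 /* hypothetical capability bits */
    #define CAP_TOGGLE         0x2

    struct vlan_caps { uint32_t inner, outer; };

    /* when an outer VLAN is already in use (port VLAN / DCF), only inner
     * capabilities are advertised; otherwise outer caps open up as well */
    static struct vlan_caps select_caps(int outer_vlan_in_use)
    {
            struct vlan_caps caps = { 0, 0 };

            caps.inner = CAP_ETHERTYPE_8100 | CAP_TOGGLE;
            if (!outer_vlan_in_use)
                    caps.outer = CAP_ETHERTYPE_8100 | CAP_TOGGLE;
            return caps;
    }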
+static void +ice_vc_set_dvm_caps(struct ice_vf *vf, struct virtchnl_vlan_caps *caps) +{ + struct virtchnl_vlan_supported_caps *supported_caps; - vsi->max_frame = qpi->rxq.max_pkt_size; + if (ice_vf_outer_vlan_not_allowed(vf)) { + /* until support for inner VLAN filtering is added when a port + * VLAN is configured, only support software offloaded inner + * VLANs when a port VLAN is configured in DVM + */ + supported_caps = &caps->filtering.filtering_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.stripping_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.insertion_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100; + caps->offloads.ethertype_match = + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; + } else { + supported_caps = &caps->filtering.filtering_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + supported_caps->outer = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_ETHERTYPE_88A8 | + VIRTCHNL_VLAN_ETHERTYPE_9100 | + VIRTCHNL_VLAN_ETHERTYPE_AND; + caps->filtering.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_ETHERTYPE_88A8 | + VIRTCHNL_VLAN_ETHERTYPE_9100; + + supported_caps = &caps->offloads.stripping_support; + supported_caps->inner = VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_ETHERTYPE_88A8 | + VIRTCHNL_VLAN_ETHERTYPE_9100 | + VIRTCHNL_VLAN_ETHERTYPE_XOR | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2; + + supported_caps = &caps->offloads.insertion_support; + supported_caps->inner = VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_ETHERTYPE_88A8 | + VIRTCHNL_VLAN_ETHERTYPE_9100 | + VIRTCHNL_VLAN_ETHERTYPE_XOR | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2; + + caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100; + + caps->offloads.ethertype_match = + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; } - /* VF can request to configure less than allocated queues - * or default allocated queues. So update the VSI with new number - */ - vsi->num_txq = num_txq; - vsi->num_rxq = num_rxq; - /* All queues of VF VSI are in TC 0 */ - vsi->tc_cfg.tc_info[0].qcount_tx = num_txq; - vsi->tc_cfg.tc_info[0].qcount_rx = num_rxq; + caps->filtering.max_filters = ice_vc_get_max_vlan_fltrs(vf); +} - if (ice_vsi_cfg_lan_txqs(vsi) || ice_vsi_cfg_rxqs(vsi)) - v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; +/** + * ice_vc_set_svm_caps - set VLAN capabilities when the device is in SVM + * @vf: VF that capabilities are being set for + * @caps: VLAN capabilities to populate + * + * Determine VLAN capabilities support based on whether a port VLAN is + * configured. If a port VLAN is configured then the VF does not have any VLAN + * filtering or offload capabilities since the port VLAN is using the inner VLAN + * capabilities in single VLAN mode (SVM). Otherwise allow the VF to use inner + * VLAN filtering and offload capabilities.
+ */ +static void +ice_vc_set_svm_caps(struct ice_vf *vf, struct virtchnl_vlan_caps *caps) +{ + struct virtchnl_vlan_supported_caps *supported_caps; -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_VSI_QUEUES, v_ret, - NULL, 0); + if (ice_vf_is_port_vlan_ena(vf)) { + supported_caps = &caps->filtering.filtering_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.stripping_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.insertion_support; + supported_caps->inner = VIRTCHNL_VLAN_UNSUPPORTED; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + caps->offloads.ethertype_init = VIRTCHNL_VLAN_UNSUPPORTED; + caps->offloads.ethertype_match = VIRTCHNL_VLAN_UNSUPPORTED; + caps->filtering.max_filters = 0; + } else { + supported_caps = &caps->filtering.filtering_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + caps->filtering.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100; + + supported_caps = &caps->offloads.stripping_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + supported_caps = &caps->offloads.insertion_support; + supported_caps->inner = VIRTCHNL_VLAN_ETHERTYPE_8100 | + VIRTCHNL_VLAN_TOGGLE | + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1; + supported_caps->outer = VIRTCHNL_VLAN_UNSUPPORTED; + + caps->offloads.ethertype_init = VIRTCHNL_VLAN_ETHERTYPE_8100; + caps->offloads.ethertype_match = + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; + caps->filtering.max_filters = ice_vc_get_max_vlan_fltrs(vf); + } } /** - * ice_is_vf_trusted - * @vf: pointer to the VF info + * ice_vc_get_offload_vlan_v2_caps - determine VF's VLAN capabilities + * @vf: VF to determine VLAN capabilities for + * + * This will only be called if the VF and PF successfully negotiated + * VIRTCHNL_VF_OFFLOAD_VLAN_V2. + * + * Set VLAN capabilities based on the current VLAN mode and whether a port VLAN + * is configured or not. 
*/ -static bool ice_is_vf_trusted(struct ice_vf *vf) +static int ice_vc_get_offload_vlan_v2_caps(struct ice_vf *vf) { - return test_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vlan_caps *caps = NULL; + int err, len = 0; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + caps = kzalloc(sizeof(*caps), GFP_KERNEL); + if (!caps) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto out; + } + len = sizeof(*caps); + + if (ice_is_dvm_ena(&vf->pf->hw)) + ice_vc_set_dvm_caps(vf, caps); + else + ice_vc_set_svm_caps(vf, caps); + + /* store negotiated caps to prevent invalid VF messages */ + memcpy(&vf->vlan_v2_caps, caps, sizeof(*caps)); + +out: + err = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS, + v_ret, (u8 *)caps, len); + kfree(caps); + return err; } /** - * ice_can_vf_change_mac - * @vf: pointer to the VF info + * ice_vc_validate_vlan_tpid - validate VLAN TPID + * @filtering_caps: negotiated/supported VLAN filtering capabilities + * @tpid: VLAN TPID used for validation * - * Return true if the VF is allowed to change its MAC filters, false otherwise + * Convert the VLAN TPID to a VIRTCHNL_VLAN_ETHERTYPE_* and then compare against + * the negotiated/supported filtering caps to see if the VLAN TPID is valid. */ -static bool ice_can_vf_change_mac(struct ice_vf *vf) +static bool ice_vc_validate_vlan_tpid(u16 filtering_caps, u16 tpid) { - /* If the VF MAC address has been set administratively (via the - * ndo_set_vf_mac command), then deny permission to the VF to - * add/delete unicast MAC addresses, unless the VF is trusted - */ - if (vf->pf_set_mac && !ice_is_vf_trusted(vf)) + enum virtchnl_vlan_support vlan_ethertype = VIRTCHNL_VLAN_UNSUPPORTED; + + switch (tpid) { + case ETH_P_8021Q: + vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_8100; + break; + case ETH_P_8021AD: + vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_88A8; + break; + case ETH_P_QINQ1: + vlan_ethertype = VIRTCHNL_VLAN_ETHERTYPE_9100; + break; + } + + if (!(filtering_caps & vlan_ethertype)) return false; return true; } /** - * ice_vc_handle_mac_addr_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * @set: true if MAC filters are being set, false otherwise + * ice_vc_is_valid_vlan - validate the virtchnl_vlan + * @vc_vlan: virtchnl_vlan to validate * - * add guest MAC address filter + * If the VLAN TCI and VLAN TPID are 0, then this filter is invalid, so return + * false. Otherwise return true. + */ +static bool ice_vc_is_valid_vlan(struct virtchnl_vlan *vc_vlan) +{ + if (!vc_vlan->tci || !vc_vlan->tpid) + return false; + + return true; +} + +/** + * ice_vc_validate_vlan_filter_list - validate the filter list from the VF + * @vfc: negotiated/supported VLAN filtering capabilities + * @vfl: VLAN filter list from VF to validate + * + * Validate all of the filters in the VLAN filter list from the VF. If any of + * the checks fail then return false. Otherwise return true. 
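The TPID check used by this list validation maps each TPID to a capability bit and then tests it against the negotiated filtering caps; an unrecognized TPID maps to no bit and fails. A standalone sketch with hypothetical capability-bit values (the 0x8100/0x88A8/0x9100 TPIDs themselves are the standard 802.1Q/802.1ad/QinQ ethertypes):

    #include <stdbool.h>
    #include <stdint.h>

    #define ETHERTYPE_8100_CAP 0x1 /* example bit positions only */
    #define ETHERTYPE_88A8_CAP 0x2
    #define ETHERTYPE_9100_CAP 0x4

    /* map a TPID to its capability bit, then test the negotiated caps;
     * unknown TPIDs map to no bit and are rejected */
    static bool tpid_allowed(uint32_t caps, uint16_t tpid)
    {
            uint32_t bit = 0;

            switch (tpid) {
            case 0x8100: bit = ETHERTYPE_8100_CAP; break;
            case 0x88A8: bit = ETHERTYPE_88A8_CAP; break;
            case 0x9100: bit = ETHERTYPE_9100_CAP; break;
            }
            return (caps & bit) != 0;
    }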
+ */ +static bool +ice_vc_validate_vlan_filter_list(struct virtchnl_vlan_filtering_caps *vfc, + struct virtchnl_vlan_filter_list_v2 *vfl) +{ + u16 i; + + if (!vfl->num_elements) + return false; + + for (i = 0; i < vfl->num_elements; i++) { + struct virtchnl_vlan_supported_caps *filtering_support = + &vfc->filtering_support; + struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i]; + struct virtchnl_vlan *outer = &vlan_fltr->outer; + struct virtchnl_vlan *inner = &vlan_fltr->inner; + + if ((ice_vc_is_valid_vlan(outer) && + filtering_support->outer == VIRTCHNL_VLAN_UNSUPPORTED) || + (ice_vc_is_valid_vlan(inner) && + filtering_support->inner == VIRTCHNL_VLAN_UNSUPPORTED)) + return false; + + if ((outer->tci_mask && + !(filtering_support->outer & VIRTCHNL_VLAN_FILTER_MASK)) || + (inner->tci_mask && + !(filtering_support->inner & VIRTCHNL_VLAN_FILTER_MASK))) + return false; + + if (((outer->tci & VLAN_PRIO_MASK) && + !(filtering_support->outer & VIRTCHNL_VLAN_PRIO)) || + ((inner->tci & VLAN_PRIO_MASK) && + !(filtering_support->inner & VIRTCHNL_VLAN_PRIO))) + return false; + + if ((ice_vc_is_valid_vlan(outer) && + !ice_vc_validate_vlan_tpid(filtering_support->outer, outer->tpid)) || + (ice_vc_is_valid_vlan(inner) && + !ice_vc_validate_vlan_tpid(filtering_support->inner, inner->tpid))) + return false; + } + + return true; +} + +/** + * ice_vc_to_vlan - transform from struct virtchnl_vlan to struct ice_vlan + * @vc_vlan: struct virtchnl_vlan to transform + */ +static struct ice_vlan ice_vc_to_vlan(struct virtchnl_vlan *vc_vlan) +{ + struct ice_vlan vlan = { 0 }; + + vlan.prio = (vc_vlan->tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + vlan.vid = vc_vlan->tci & VLAN_VID_MASK; + vlan.tpid = vc_vlan->tpid; + + return vlan; +} + +/** + * ice_vc_vlan_action - action to perform on the virthcnl_vlan + * @vsi: VF's VSI used to perform the action + * @vlan_action: function to perform the action with (i.e. 
add/del) + * @vlan: VLAN filter to perform the action with */ static int -ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set) +ice_vc_vlan_action(struct ice_vsi *vsi, + int (*vlan_action)(struct ice_vsi *, struct ice_vlan *), + struct ice_vlan *vlan) +{ + int err; + + err = vlan_action(vsi, vlan); + if (err) + return err; + + return 0; +} + +/** + * ice_vc_del_vlans - delete VLAN(s) from the virtchnl filter list + * @vf: VF used to delete the VLAN(s) + * @vsi: VF's VSI used to delete the VLAN(s) + * @vfl: virthchnl filter list used to delete the filters + */ +static int +ice_vc_del_vlans(struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_vlan_filter_list_v2 *vfl) +{ + bool vlan_promisc = ice_is_vlan_promisc_allowed(vf); + int err; + u16 i; + + for (i = 0; i < vfl->num_elements; i++) { + struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i]; + struct virtchnl_vlan *vc_vlan; + + vc_vlan = &vlan_fltr->outer; + if (ice_vc_is_valid_vlan(vc_vlan)) { + struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan); + + err = ice_vc_vlan_action(vsi, + vsi->outer_vlan_ops.del_vlan, + &vlan); + if (err) + return err; + + if (vlan_promisc) + ice_vf_dis_vlan_promisc(vsi, &vlan); + } + + vc_vlan = &vlan_fltr->inner; + if (ice_vc_is_valid_vlan(vc_vlan)) { + struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan); + + err = ice_vc_vlan_action(vsi, + vsi->inner_vlan_ops.del_vlan, + &vlan); + if (err) + return err; + + /* no support for VLAN promiscuous on inner VLAN unless + * we are in Single VLAN Mode (SVM) + */ + if (!ice_is_dvm_ena(&vsi->back->hw) && vlan_promisc) + ice_vf_dis_vlan_promisc(vsi, &vlan); + } + } + + return 0; +} + +/** + * ice_vc_remove_vlan_v2_msg - virtchnl handler for VIRTCHNL_OP_DEL_VLAN_V2 + * @vf: VF the message was received from + * @msg: message received from the VF + */ +static int ice_vc_remove_vlan_v2_msg(struct ice_vf *vf, u8 *msg) { + struct virtchnl_vlan_filter_list_v2 *vfl = + (struct virtchnl_vlan_filter_list_v2 *)msg; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_ether_addr_list *al = - (struct virtchnl_ether_addr_list *)msg; - struct ice_pf *pf = vf->pf; - enum virtchnl_ops vc_op; - enum ice_status status; struct ice_vsi *vsi; - int mac_count = 0; - int i; - - if (set) - vc_op = VIRTCHNL_OP_ADD_ETH_ADDR; - else - vc_op = VIRTCHNL_OP_DEL_ETH_ADDR; - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || - !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) { + if (!ice_vc_validate_vlan_filter_list(&vf->vlan_v2_caps.filtering, + vfl)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; + goto out; } - if (set && !ice_is_vf_trusted(vf) && - (vf->num_mac + al->num_elements) > ICE_MAX_MACADDR_PER_VF) { - dev_err(&pf->pdev->dev, - "Can't add more MAC addresses, because VF-%d is not trusted, switch the VF to trusted mode in order to add more functionalities\n", - vf->vf_id); - /* There is no need to let VF know about not being trusted - * to add more MAC addr, so we can just return success message. 
- */ + if (!ice_vc_isvalid_vsi_id(vf, vfl->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; + goto out; } - vsi = pf->vsi[vf->lan_vsi_idx]; + vsi = ice_get_vf_vsi(vf); if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; + goto out; } - for (i = 0; i < al->num_elements; i++) { - u8 *maddr = al->list[i].addr; + if (ice_vc_del_vlans(vf, vsi, vfl)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; - if (ether_addr_equal(maddr, vf->dflt_lan_addr.addr) || - is_broadcast_ether_addr(maddr)) { - if (set) { - /* VF is trying to add filters that the PF - * already added. Just continue. - */ - dev_info(&pf->pdev->dev, - "MAC %pM already set for VF %d\n", - maddr, vf->vf_id); - continue; - } else { - /* VF can't remove dflt_lan_addr/bcast MAC */ - dev_err(&pf->pdev->dev, - "VF can't remove default MAC address or MAC %pM programmed by PF for VF %d\n", - maddr, vf->vf_id); - continue; - } - } +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_VLAN_V2, v_ret, NULL, + 0); +} - /* check for the invalid cases and bail if necessary */ - if (is_zero_ether_addr(maddr)) { - dev_err(&pf->pdev->dev, - "invalid MAC %pM provided for VF %d\n", - maddr, vf->vf_id); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; - } +/** + * ice_vc_add_vlans - add VLAN(s) from the virtchnl filter list + * @vf: VF used to add the VLAN(s) + * @vsi: VF's VSI used to add the VLAN(s) + * @vfl: virthchnl filter list used to add the filters + */ +static int +ice_vc_add_vlans(struct ice_vf *vf, struct ice_vsi *vsi, + struct virtchnl_vlan_filter_list_v2 *vfl) +{ + bool vlan_promisc = ice_is_vlan_promisc_allowed(vf); + int err; + u16 i; - if (is_unicast_ether_addr(maddr) && - !ice_can_vf_change_mac(vf)) { - dev_err(&pf->pdev->dev, - "can't change unicast MAC for untrusted VF %d\n", - vf->vf_id); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; - } + for (i = 0; i < vfl->num_elements; i++) { + struct virtchnl_vlan_filter *vlan_fltr = &vfl->filters[i]; + struct virtchnl_vlan *vc_vlan; - /* program the updated filter list */ - status = ice_vsi_cfg_mac_fltr(vsi, maddr, set); - if (status == ICE_ERR_DOES_NOT_EXIST || - status == ICE_ERR_ALREADY_EXISTS) { - dev_info(&pf->pdev->dev, - "can't %s MAC filters %pM for VF %d, error %d\n", - set ? "add" : "remove", maddr, vf->vf_id, - status); - } else if (status) { - dev_err(&pf->pdev->dev, - "can't %s MAC filters for VF %d, error %d\n", - set ? 
"add" : "remove", vf->vf_id, status); - v_ret = ice_err_to_virt_err(status); - goto handle_mac_exit; + vc_vlan = &vlan_fltr->outer; + if (ice_vc_is_valid_vlan(vc_vlan)) { + struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan); + + err = ice_vc_vlan_action(vsi, + vsi->outer_vlan_ops.add_vlan, + &vlan); + if (err) + return err; + + if (vlan_promisc) { + err = ice_vf_ena_vlan_promisc(vsi, &vlan); + if (err) + return err; + } } - mac_count++; - } + vc_vlan = &vlan_fltr->inner; + if (ice_vc_is_valid_vlan(vc_vlan)) { + struct ice_vlan vlan = ice_vc_to_vlan(vc_vlan); - /* Track number of MAC filters programmed for the VF VSI */ - if (set) - vf->num_mac += mac_count; - else - vf->num_mac -= mac_count; + err = ice_vc_vlan_action(vsi, + vsi->inner_vlan_ops.add_vlan, + &vlan); + if (err) + return err; -handle_mac_exit: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, vc_op, v_ret, NULL, 0); -} + /* no support for VLAN promiscuous on inner VLAN unless + * we are in Single VLAN Mode (SVM) + */ + if (!ice_is_dvm_ena(&vsi->back->hw) && vlan_promisc) { + err = ice_vf_ena_vlan_promisc(vsi, &vlan); + if (err) + return err; + } + } + } -/** - * ice_vc_add_mac_addr_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * add guest MAC address filter - */ -static int ice_vc_add_mac_addr_msg(struct ice_vf *vf, u8 *msg) -{ - return ice_vc_handle_mac_addr_msg(vf, msg, true); + return 0; } /** - * ice_vc_del_mac_addr_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer + * ice_vc_validate_add_vlan_filter_list - validate add filter list from the VF + * @vsi: VF VSI used to get number of existing VLAN filters + * @vfc: negotiated/supported VLAN filtering capabilities + * @vfl: VLAN filter list from VF to validate * - * remove guest MAC address filter + * Validate all of the filters in the VLAN filter list from the VF during the + * VIRTCHNL_OP_ADD_VLAN_V2 opcode. If any of the checks fail then return false. + * Otherwise return true. */ -static int ice_vc_del_mac_addr_msg(struct ice_vf *vf, u8 *msg) +static bool +ice_vc_validate_add_vlan_filter_list(struct ice_vsi *vsi, + struct virtchnl_vlan_filtering_caps *vfc, + struct virtchnl_vlan_filter_list_v2 *vfl) { - return ice_vc_handle_mac_addr_msg(vf, msg, false); + u16 num_requested_filters = vsi->num_vlan + vfl->num_elements; + + if (num_requested_filters > vfc->max_filters) + return false; + + return ice_vc_validate_vlan_filter_list(vfc, vfl); } /** - * ice_vc_request_qs_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * VFs get a default number of queues but can use this message to request a - * different number. If the request is successful, PF will reset the VF and - * return 0. If unsuccessful, PF will send message informing VF of number of - * available queue pairs via virtchnl message response to VF. 
+ * ice_vc_add_vlan_v2_msg - virtchnl handler for VIRTCHNL_OP_ADD_VLAN_V2 + * @vf: VF the message was received from + * @msg: message received from the VF */ -static int ice_vc_request_qs_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_add_vlan_v2_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_vf_res_request *vfres = - (struct virtchnl_vf_res_request *)msg; - u16 req_queues = vfres->num_queue_pairs; - struct ice_pf *pf = vf->pf; - u16 max_allowed_vf_queues; - u16 tx_rx_queue_left; - u16 cur_queues; + struct virtchnl_vlan_filter_list_v2 *vfl = + (struct virtchnl_vlan_filter_list_v2 *)msg; + struct ice_vsi *vsi; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; } - cur_queues = vf->num_vf_qs; - tx_rx_queue_left = min_t(u16, ice_get_avail_txq_count(pf), - ice_get_avail_rxq_count(pf)); - max_allowed_vf_queues = tx_rx_queue_left + cur_queues; - if (!req_queues) { - dev_err(&pf->pdev->dev, - "VF %d tried to request 0 queues. Ignoring.\n", - vf->vf_id); - } else if (req_queues > ICE_MAX_BASE_QS_PER_VF) { - dev_err(&pf->pdev->dev, - "VF %d tried to request more than %d queues.\n", - vf->vf_id, ICE_MAX_BASE_QS_PER_VF); - vfres->num_queue_pairs = ICE_MAX_BASE_QS_PER_VF; - } else if (req_queues > cur_queues && - req_queues - cur_queues > tx_rx_queue_left) { - dev_warn(&pf->pdev->dev, - "VF %d requested %u more queues, but only %u left.\n", - vf->vf_id, req_queues - cur_queues, tx_rx_queue_left); - vfres->num_queue_pairs = min_t(u16, max_allowed_vf_queues, - ICE_MAX_BASE_QS_PER_VF); - } else { - /* request is successful, then reset VF */ - vf->num_req_qs = req_queues; - ice_vc_dis_vf(vf); - dev_info(&pf->pdev->dev, - "VF %d granted request of %u queues.\n", - vf->vf_id, req_queues); - return 0; + if (!ice_vc_isvalid_vsi_id(vf, vfl->vport_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; } -error_param: - /* send the response to the VF */ - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, - v_ret, (u8 *)vfres, sizeof(*vfres)); + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + if (!ice_vc_validate_add_vlan_filter_list(vsi, + &vf->vlan_v2_caps.filtering, + vfl)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + if (ice_vc_add_vlans(vf, vsi, vfl)) + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_VLAN_V2, v_ret, NULL, + 0); } /** - * ice_set_vf_port_vlan - * @netdev: network interface device structure - * @vf_id: VF identifier - * @vlan_id: VLAN ID being set - * @qos: priority setting - * @vlan_proto: VLAN protocol + * ice_vc_valid_vlan_setting - validate VLAN setting + * @negotiated_settings: negotiated VLAN settings during VF init + * @ethertype_setting: ethertype(s) requested for the VLAN setting + */ +static bool +ice_vc_valid_vlan_setting(u32 negotiated_settings, u32 ethertype_setting) +{ + if (ethertype_setting && !(negotiated_settings & ethertype_setting)) + return false; + + /* only allow a single VIRTCHNL_VLAN_ETHERTYPE if + * VIRTHCNL_VLAN_ETHERTYPE_AND is not negotiated/supported + */ + if (!(negotiated_settings & VIRTCHNL_VLAN_ETHERTYPE_AND) && + hweight32(ethertype_setting) > 1) + return false; + + /* ability to modify the VLAN setting was not negotiated */ + if (!(negotiated_settings & VIRTCHNL_VLAN_TOGGLE)) + return false; + + return true; +} + +/** + * ice_vc_valid_vlan_setting_msg - validate the VLAN setting message + * @caps: 
negotiated VLAN settings during VF init + * @msg: message to validate * - * program VF Port VLAN ID and/or QoS + * Used to validate any VLAN virtchnl message sent as a + * virtchnl_vlan_setting structure. Validates the message against the + * negotiated/supported caps during VF driver init. */ -int -ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, - __be16 vlan_proto) +static bool +ice_vc_valid_vlan_setting_msg(struct virtchnl_vlan_supported_caps *caps, + struct virtchnl_vlan_setting *msg) { - u16 vlanprio = vlan_id | (qos << ICE_VLAN_PRIORITY_S); - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_pf *pf = np->vsi->back; - struct ice_vsi *vsi; - struct ice_vf *vf; - int ret = 0; + if ((!msg->outer_ethertype_setting && + !msg->inner_ethertype_setting) || + (!caps->outer && !caps->inner)) + return false; - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - dev_err(&pf->pdev->dev, "invalid VF id: %d\n", vf_id); - return -EINVAL; - } + if (msg->outer_ethertype_setting && + !ice_vc_valid_vlan_setting(caps->outer, + msg->outer_ethertype_setting)) + return false; - if (vlan_id > ICE_MAX_VLANID || qos > 7) { - dev_err(&pf->pdev->dev, "Invalid VF Parameters\n"); + if (msg->inner_ethertype_setting && + !ice_vc_valid_vlan_setting(caps->inner, + msg->inner_ethertype_setting)) + return false; + + return true; +} + +/** + * ice_vc_get_tpid - transform from VIRTCHNL_VLAN_ETHERTYPE_* to VLAN TPID + * @ethertype_setting: VIRTCHNL_VLAN_ETHERTYPE_* used to get VLAN TPID + * @tpid: VLAN TPID to populate + */ +static int ice_vc_get_tpid(u32 ethertype_setting, u16 *tpid) +{ + switch (ethertype_setting) { + case VIRTCHNL_VLAN_ETHERTYPE_8100: + *tpid = ETH_P_8021Q; + break; + case VIRTCHNL_VLAN_ETHERTYPE_88A8: + *tpid = ETH_P_8021AD; + break; + case VIRTCHNL_VLAN_ETHERTYPE_9100: + *tpid = ETH_P_QINQ1; + break; + default: + *tpid = 0; return -EINVAL; } - if (vlan_proto != htons(ETH_P_8021Q)) { - dev_err(&pf->pdev->dev, "VF VLAN protocol is not supported\n"); - return -EPROTONOSUPPORT; - } + return 0; +} - vf = &pf->vf[vf_id]; - vsi = pf->vsi[vf->lan_vsi_idx]; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - dev_err(&pf->pdev->dev, "VF %d in reset. 
Try again.\n", vf_id); - return -EBUSY; - } +/** + * ice_vc_ena_vlan_offload - enable VLAN offload based on the ethertype_setting + * @vsi: VF's VSI used to enable the VLAN offload + * @ena_offload: function used to enable the VLAN offload + * @ethertype_setting: VIRTCHNL_VLAN_ETHERTYPE_* to enable offloads for + */ +static int +ice_vc_ena_vlan_offload(struct ice_vsi *vsi, + int (*ena_offload)(struct ice_vsi *vsi, u16 tpid), + u32 ethertype_setting) +{ + u16 tpid; + int err; - if (le16_to_cpu(vsi->info.pvid) == vlanprio) { - /* duplicate request, so just return success */ - dev_info(&pf->pdev->dev, - "Duplicate pvid %d request\n", vlanprio); - return ret; - } + err = ice_vc_get_tpid(ethertype_setting, &tpid); + if (err) + return err; - /* If PVID, then remove all filters on the old VLAN */ - if (vsi->info.pvid) - ice_vsi_kill_vlan(vsi, (le16_to_cpu(vsi->info.pvid) & - VLAN_VID_MASK)); + err = ena_offload(vsi, tpid); + if (err) + return err; - if (vlan_id || qos) { - ret = ice_vsi_manage_pvid(vsi, vlanprio, true); - if (ret) - goto error_set_pvid; - } else { - ice_vsi_manage_pvid(vsi, 0, false); - vsi->info.pvid = 0; - } + return 0; +} - if (vlan_id) { - dev_info(&pf->pdev->dev, "Setting VLAN %d, QOS 0x%x on VF %d\n", - vlan_id, qos, vf_id); +#define ICE_L2TSEL_QRX_CONTEXT_REG_IDX 3 +#define ICE_L2TSEL_BIT_OFFSET 23 +enum ice_l2tsel { + ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND, + ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1, +}; - /* add new VLAN filter for each MAC */ - ret = ice_vsi_add_vlan(vsi, vlan_id); - if (ret) - goto error_set_pvid; - } +/** + * ice_vsi_update_l2tsel - update l2tsel field for all Rx rings on this VSI + * @vsi: VSI used to update l2tsel on + * @l2tsel: l2tsel setting requested + * + * Use the l2tsel setting to update all of the Rx queue context bits for l2tsel. + * This will modify which descriptor field the first offloaded VLAN will be + * stripped into. + */ +static void ice_vsi_update_l2tsel(struct ice_vsi *vsi, enum ice_l2tsel l2tsel) +{ + struct ice_hw *hw = &vsi->back->hw; + u32 l2tsel_bit; + int i; - /* The Port VLAN needs to be saved across resets the same as the - * default LAN MAC address. 
- */ - vf->port_vlan_id = le16_to_cpu(vsi->info.pvid); + if (l2tsel == ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND) + l2tsel_bit = 0; + else + l2tsel_bit = BIT(ICE_L2TSEL_BIT_OFFSET); -error_set_pvid: - return ret; + for (i = 0; i < vsi->alloc_rxq; i++) { + u16 pfq = vsi->rxq_map[i]; + u32 qrx_context_offset; + u32 regval; + + qrx_context_offset = + QRX_CONTEXT(ICE_L2TSEL_QRX_CONTEXT_REG_IDX, pfq); + + regval = rd32(hw, qrx_context_offset); + regval &= ~BIT(ICE_L2TSEL_BIT_OFFSET); + regval |= l2tsel_bit; + wr32(hw, qrx_context_offset, regval); + } } /** - * ice_vc_process_vlan_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * @add_v: Add VLAN if true, otherwise delete VLAN + * ice_vc_ena_vlan_stripping_v2_msg + * @vf: VF the message was received from + * @msg: message received from the VF * - * Process virtchnl op to add or remove programmed guest VLAN ID + * virthcnl handler for VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 */ -static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) +static int ice_vc_ena_vlan_stripping_v2_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct virtchnl_vlan_filter_list *vfl = - (struct virtchnl_vlan_filter_list *)msg; - struct ice_pf *pf = vf->pf; - bool vlan_promisc = false; + struct virtchnl_vlan_supported_caps *stripping_support; + struct virtchnl_vlan_setting *strip_msg = + (struct virtchnl_vlan_setting *)msg; + u32 ethertype_setting; struct ice_vsi *vsi; - struct ice_hw *hw; - int status = 0; - u8 promisc_m; - int i; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; } - if (!ice_vc_isvalid_vsi_id(vf, vfl->vsi_id)) { + if (!ice_vc_isvalid_vsi_id(vf, strip_msg->vport_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } - - if (add_v && !ice_is_vf_trusted(vf) && - vf->num_vlan >= ICE_MAX_VLAN_PER_VF) { - dev_info(&pf->pdev->dev, - "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", - vf->vf_id); - /* There is no need to let VF know about being not trusted, - * so we can just return success message here - */ - goto error_param; - } - - for (i = 0; i < vfl->num_elements; i++) { - if (vfl->vlan_id[i] > ICE_MAX_VLANID) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(&pf->pdev->dev, - "invalid VF VLAN id %d\n", vfl->vlan_id[i]); - goto error_param; - } + goto out; } - hw = &pf->hw; - vsi = pf->vsi[vf->lan_vsi_idx]; + vsi = ice_get_vf_vsi(vf); if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; } - if (vsi->info.pvid) { + stripping_support = &vf->vlan_v2_caps.offloads.stripping_support; + if (!ice_vc_valid_vlan_setting_msg(stripping_support, strip_msg)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; } - if (ice_vsi_manage_vlan_stripping(vsi, add_v)) { - dev_err(&pf->pdev->dev, - "%sable VLAN stripping failed for VSI %i\n", - add_v ? 
"en" : "dis", vsi->vsi_num); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + if (ice_vsi_is_rxq_crc_strip_dis(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto out; } - if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) || - test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) - vlan_promisc = true; + ethertype_setting = strip_msg->outer_ethertype_setting; + if (ethertype_setting) { + if (ice_vc_ena_vlan_offload(vsi, + vsi->outer_vlan_ops.ena_stripping, + ethertype_setting)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } else { + enum ice_l2tsel l2tsel = + ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND; + + /* PF tells the VF that the outer VLAN tag is always + * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and + * inner is always extracted to + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to + * support outer stripping so the first tag always ends + * up in L2TAG2_2ND and the second/inner tag, if + * enabled, is extracted in L2TAG1. + */ + ice_vsi_update_l2tsel(vsi, l2tsel); - if (add_v) { - for (i = 0; i < vfl->num_elements; i++) { - u16 vid = vfl->vlan_id[i]; + vf->vlan_strip_ena |= ICE_OUTER_VLAN_STRIP_ENA; + } + } - if (!ice_is_vf_trusted(vf) && - vf->num_vlan >= ICE_MAX_VLAN_PER_VF) { - dev_info(&pf->pdev->dev, - "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", - vf->vf_id); - /* There is no need to let VF know about being - * not trusted, so we can just return success - * message here as well. - */ - goto error_param; - } + ethertype_setting = strip_msg->inner_ethertype_setting; + if (ethertype_setting && + ice_vc_ena_vlan_offload(vsi, vsi->inner_vlan_ops.ena_stripping, + ethertype_setting)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - if (ice_vsi_add_vlan(vsi, vid)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + if (ethertype_setting) + vf->vlan_strip_ena |= ICE_INNER_VLAN_STRIP_ENA; - vf->num_vlan++; - /* Enable VLAN pruning when VLAN is added */ - if (!vlan_promisc) { - status = ice_cfg_vlan_pruning(vsi, true, false); - if (status) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(&pf->pdev->dev, - "Enable VLAN pruning on VLAN ID: %d failed error-%d\n", - vid, status); - goto error_param; - } - } else { - /* Enable Ucast/Mcast VLAN promiscuous mode */ - promisc_m = ICE_PROMISC_VLAN_TX | - ICE_PROMISC_VLAN_RX; +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2, v_ret, NULL, 0); +} - status = ice_set_vsi_promisc(hw, vsi->idx, - promisc_m, vid); - if (status) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(&pf->pdev->dev, - "Enable Unicast/multicast promiscuous mode on VLAN ID:%d failed error-%d\n", - vid, status); - } - } - } - } else { - /* In case of non_trusted VF, number of VLAN elements passed - * to PF for removal might be greater than number of VLANs - * filter programmed for that VF - So, use actual number of - * VLANS added earlier with add VLAN opcode. 
In order to avoid - * removing VLAN that doesn't exist, which result to sending - * erroneous failed message back to the VF - */ - int num_vf_vlan; +/** + * ice_vc_dis_vlan_stripping_v2_msg + * @vf: VF the message was received from + * @msg: message received from the VF + * + * virthcnl handler for VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 + */ +static int ice_vc_dis_vlan_stripping_v2_msg(struct ice_vf *vf, u8 *msg) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vlan_supported_caps *stripping_support; + struct virtchnl_vlan_setting *strip_msg = + (struct virtchnl_vlan_setting *)msg; + u32 ethertype_setting; + struct ice_vsi *vsi; - num_vf_vlan = vf->num_vlan; - for (i = 0; i < vfl->num_elements && i < num_vf_vlan; i++) { - u16 vid = vfl->vlan_id[i]; + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - /* Make sure ice_vsi_kill_vlan is successful before - * updating VLAN information - */ - if (ice_vsi_kill_vlan(vsi, vid)) { - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; - } + if (!ice_vc_isvalid_vsi_id(vf, strip_msg->vport_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - vf->num_vlan--; - /* Disable VLAN pruning when the last VLAN is removed */ - if (!vf->num_vlan) - ice_cfg_vlan_pruning(vsi, false, false); + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - /* Disable Unicast/Multicast VLAN promiscuous mode */ - if (vlan_promisc) { - promisc_m = ICE_PROMISC_VLAN_TX | - ICE_PROMISC_VLAN_RX; + stripping_support = &vf->vlan_v2_caps.offloads.stripping_support; + if (!ice_vc_valid_vlan_setting_msg(stripping_support, strip_msg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } - ice_clear_vsi_promisc(hw, vsi->idx, - promisc_m, vid); - } + ethertype_setting = strip_msg->outer_ethertype_setting; + if (ethertype_setting) { + if (vsi->outer_vlan_ops.dis_stripping(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } else { + enum ice_l2tsel l2tsel = + ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG1; + + /* PF tells the VF that the outer VLAN tag is always + * extracted to VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 and + * inner is always extracted to + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1. This is needed to + * support inner stripping while outer stripping is + * disabled so that the first and only tag is extracted + * in L2TAG1. 
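+ * The enable path in ice_vc_ena_vlan_stripping_v2_msg() applies the opposite + * setting, ICE_L2TSEL_EXTRACT_FIRST_TAG_L2TAG2_2ND, when outer stripping is + * turned back on.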
+ */ + ice_vsi_update_l2tsel(vsi, l2tsel); + + vf->vlan_strip_ena &= ~ICE_OUTER_VLAN_STRIP_ENA; } } -error_param: - /* send the response to the VF */ - if (add_v) - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_VLAN, v_ret, - NULL, 0); - else - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_VLAN, v_ret, - NULL, 0); -} + ethertype_setting = strip_msg->inner_ethertype_setting; + if (ethertype_setting && vsi->inner_vlan_ops.dis_stripping(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } -/** - * ice_vc_add_vlan_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer - * - * Add and program guest VLAN ID - */ -static int ice_vc_add_vlan_msg(struct ice_vf *vf, u8 *msg) -{ - return ice_vc_process_vlan_msg(vf, msg, true); + if (ethertype_setting) + vf->vlan_strip_ena &= ~ICE_INNER_VLAN_STRIP_ENA; + +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2, v_ret, NULL, 0); } /** - * ice_vc_remove_vlan_msg - * @vf: pointer to the VF info - * @msg: pointer to the msg buffer + * ice_vc_ena_vlan_insertion_v2_msg + * @vf: VF the message was received from + * @msg: message received from the VF * - * remove programmed guest VLAN ID + * virthcnl handler for VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 */ -static int ice_vc_remove_vlan_msg(struct ice_vf *vf, u8 *msg) +static int ice_vc_ena_vlan_insertion_v2_msg(struct ice_vf *vf, u8 *msg) { - return ice_vc_process_vlan_msg(vf, msg, false); + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_vlan_supported_caps *insertion_support; + struct virtchnl_vlan_setting *insertion_msg = + (struct virtchnl_vlan_setting *)msg; + u32 ethertype_setting; + struct ice_vsi *vsi; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + if (!ice_vc_isvalid_vsi_id(vf, insertion_msg->vport_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + insertion_support = &vf->vlan_v2_caps.offloads.insertion_support; + if (!ice_vc_valid_vlan_setting_msg(insertion_support, insertion_msg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + ethertype_setting = insertion_msg->outer_ethertype_setting; + if (ethertype_setting && + ice_vc_ena_vlan_offload(vsi, vsi->outer_vlan_ops.ena_insertion, + ethertype_setting)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + ethertype_setting = insertion_msg->inner_ethertype_setting; + if (ethertype_setting && + ice_vc_ena_vlan_offload(vsi, vsi->inner_vlan_ops.ena_insertion, + ethertype_setting)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2, v_ret, NULL, 0); } /** - * ice_vc_ena_vlan_stripping - * @vf: pointer to the VF info + * ice_vc_dis_vlan_insertion_v2_msg + * @vf: VF the message was received from + * @msg: message received from the VF * - * Enable VLAN header stripping for a given VF + * virthcnl handler for VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 */ -static int ice_vc_ena_vlan_stripping(struct ice_vf *vf) +static int ice_vc_dis_vlan_insertion_v2_msg(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct ice_pf *pf = vf->pf; + struct virtchnl_vlan_supported_caps *insertion_support; + struct virtchnl_vlan_setting *insertion_msg = + (struct virtchnl_vlan_setting *)msg; + u32 ethertype_setting; struct ice_vsi *vsi; if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret 
= VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto out; + } + + if (!ice_vc_isvalid_vsi_id(vf, insertion_msg->vport_id)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; } - vsi = pf->vsi[vf->lan_vsi_idx]; - if (ice_vsi_manage_vlan_stripping(vsi, true)) + vsi = ice_get_vf_vsi(vf); + if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } -error_param: - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, - v_ret, NULL, 0); + insertion_support = &vf->vlan_v2_caps.offloads.insertion_support; + if (!ice_vc_valid_vlan_setting_msg(insertion_support, insertion_msg)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + ethertype_setting = insertion_msg->outer_ethertype_setting; + if (ethertype_setting && vsi->outer_vlan_ops.dis_insertion(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + + ethertype_setting = insertion_msg->inner_ethertype_setting; + if (ethertype_setting && vsi->inner_vlan_ops.dis_insertion(vsi)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto out; + } + +out: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2, v_ret, NULL, 0); +} + +static struct ice_vc_vf_ops ice_vc_vf_dflt_ops = { + .get_ver_msg = ice_vc_get_ver_msg, + .get_vf_res_msg = ice_vc_get_vf_res_msg, + .reset_vf = ice_vc_reset_vf_msg, + .add_mac_addr_msg = ice_vc_add_mac_addr_msg, + .del_mac_addr_msg = ice_vc_del_mac_addr_msg, + .cfg_qs_msg = ice_vc_cfg_qs_msg, + .ena_qs_msg = ice_vc_ena_qs_msg, + .dis_qs_msg = ice_vc_dis_qs_msg, + .request_qs_msg = ice_vc_request_qs_msg, + .cfg_irq_map_msg = ice_vc_cfg_irq_map_msg, + .config_rss_key = ice_vc_config_rss_key, + .config_rss_lut = ice_vc_config_rss_lut, + .get_stats_msg = ice_vc_get_stats_msg, + .cfg_promiscuous_mode_msg = ice_vc_cfg_promiscuous_mode_msg, + .add_vlan_msg = ice_vc_add_vlan_msg, + .remove_vlan_msg = ice_vc_remove_vlan_msg, + .query_rxdid = ice_vc_query_rxdid, + .get_rss_hena = ice_vc_get_rss_hena, + .set_rss_hena_msg = ice_vc_set_rss_hena, + .ena_vlan_stripping = ice_vc_ena_vlan_stripping, + .dis_vlan_stripping = ice_vc_dis_vlan_stripping, +#ifdef HAVE_TC_SETUP_CLSFLOWER + .add_qch_msg = ice_vc_add_qch_msg, + .add_switch_filter_msg = ice_vc_add_switch_filter, + .del_switch_filter_msg = ice_vc_del_switch_filter, + .del_qch_msg = ice_vc_del_qch_msg, +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + .rdma_msg = ice_vc_rdma_msg, + .cfg_rdma_irq_map_msg = ice_vc_cfg_rdma_irq_map_msg, + .clear_rdma_irq_map = ice_vc_clear_rdma_irq_map, + .dcf_vlan_offload_msg = ice_vc_dcf_vlan_offload_msg, + .dcf_cmd_desc_msg = ice_vc_dcf_cmd_desc_msg, + .dcf_cmd_buff_msg = ice_vc_dcf_cmd_buff_msg, + .dis_dcf_cap = ice_vc_dis_dcf_cap, + .dcf_get_vsi_map = ice_vc_dcf_get_vsi_map, + .dcf_query_pkg_info = ice_vc_dcf_query_pkg_info, + .handle_rss_cfg_msg = ice_vc_handle_rss_cfg, + .add_fdir_fltr_msg = ice_vc_add_fdir_fltr, + .del_fdir_fltr_msg = ice_vc_del_fdir_fltr, + .get_max_rss_qregion = ice_vc_get_max_rss_qregion, + .ena_qs_v2_msg = ice_vc_ena_qs_v2_msg, + .dis_qs_v2_msg = ice_vc_dis_qs_v2_msg, + .map_q_vector_msg = ice_vc_map_q_vector_msg, + .get_offload_vlan_v2_caps = ice_vc_get_offload_vlan_v2_caps, + .add_vlan_v2_msg = ice_vc_add_vlan_v2_msg, + .remove_vlan_v2_msg = ice_vc_remove_vlan_v2_msg, + .ena_vlan_stripping_v2_msg = ice_vc_ena_vlan_stripping_v2_msg, + .dis_vlan_stripping_v2_msg = ice_vc_dis_vlan_stripping_v2_msg, + .ena_vlan_insertion_v2_msg = ice_vc_ena_vlan_insertion_v2_msg, + .dis_vlan_insertion_v2_msg = ice_vc_dis_vlan_insertion_v2_msg, +}; + +void ice_vc_set_dflt_vf_ops(struct ice_vc_vf_ops *ops) +{ + 
*ops = ice_vc_vf_dflt_ops; +} + +static int ice_vc_repr_no_action_msg(struct ice_vf __always_unused *vf, + u8 __always_unused *msg) +{ + return 0; } /** - * ice_vc_dis_vlan_stripping - * @vf: pointer to the VF info + * ice_vc_repr_add_mac + * @vf: pointer to VF + * @msg: virtchannel message * - * Disable VLAN header stripping for a given VF + * When port representors are created, we do not add MAC rule + * to firmware, we store it so that PF could report same + * MAC as VF. */ -static int ice_vc_dis_vlan_stripping(struct ice_vf *vf) +static int ice_vc_repr_add_mac(struct ice_vf *vf, u8 *msg) { enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - struct ice_pf *pf = vf->pf; + struct virtchnl_ether_addr_list *al = + (struct virtchnl_ether_addr_list *)msg; struct ice_vsi *vsi; + struct ice_pf *pf; + int i; - if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || + !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto handle_mac_exit; } - vsi = pf->vsi[vf->lan_vsi_idx]; + pf = vf->pf; + + vsi = ice_get_vf_vsi(vf); if (!vsi) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto error_param; + goto handle_mac_exit; } - if (ice_vsi_manage_vlan_stripping(vsi, false)) - v_ret = VIRTCHNL_STATUS_ERR_PARAM; + for (i = 0; i < al->num_elements; i++) { + u8 *mac_addr = al->list[i].addr; -error_param: - return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, + if (!is_unicast_ether_addr(mac_addr) || + ether_addr_equal(mac_addr, vf->hw_lan_addr.addr)) + continue; + + if (vf->pf_set_mac) { + dev_err(ice_pf_to_dev(pf), + "VF attempting to override administratively set MAC address\n"); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto handle_mac_exit; + } + + + ice_vfhw_mac_add(vf, &al->list[i]); + vf->num_mac++; + break; + } + +handle_mac_exit: + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_ETH_ADDR, v_ret, NULL, 0); } +/** + * ice_vc_repr_del_mac - response with success for deleting MAC + * @vf: pointer to VF + * @msg: virtchannel message + * + * Respond with success to not break normal VF flow. + */ +static int ice_vc_repr_del_mac(struct ice_vf __always_unused *vf, + u8 __always_unused *msg) +{ + return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_ETH_ADDR, + VIRTCHNL_STATUS_SUCCESS, NULL, 0); +} + +static int ice_vc_repr_no_action(struct ice_vf __always_unused *vf) +{ + return 0; +} + +void ice_vc_change_ops_to_repr(struct ice_vc_vf_ops *ops) +{ + ops->add_mac_addr_msg = ice_vc_repr_add_mac; + ops->del_mac_addr_msg = ice_vc_repr_del_mac; + ops->add_vlan_msg = ice_vc_repr_no_action_msg; + ops->remove_vlan_msg = ice_vc_repr_no_action_msg; + ops->ena_vlan_stripping = ice_vc_repr_no_action; + ops->dis_vlan_stripping = ice_vc_repr_no_action; + ops->cfg_promiscuous_mode_msg = ice_vc_repr_no_action_msg; +} + /** * ice_vc_process_vf_msg - Process request from VF * @pf: pointer to the PF structure @@ -2904,23 +9890,27 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) u32 v_opcode = le32_to_cpu(event->desc.cookie_high); s16 vf_id = le16_to_cpu(event->desc.retval); u16 msglen = event->msg_len; + struct ice_vc_vf_ops *ops; u8 *msg = event->msg_buf; struct ice_vf *vf = NULL; + struct device *dev; int err = 0; - if (vf_id >= pf->num_alloc_vfs) { + dev = ice_pf_to_dev(pf); + if (ice_validate_vf_id(pf, vf_id)) { err = -EINVAL; goto error_handler; } vf = &pf->vf[vf_id]; - /* Check if VF is disabled. 
*/ if (test_bit(ICE_VF_STATE_DIS, vf->vf_states)) { err = -EPERM; goto error_handler; } + ops = &vf->vc_ops; + /* Perform basic checks on the msg */ err = virtchnl_vc_validate_vf_msg(&vf->vf_ver, v_opcode, msg, msglen); if (err) { @@ -2930,73 +9920,185 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) err = -EINVAL; } + if (!ice_vc_is_opcode_allowed(vf, v_opcode)) { + ice_vc_send_msg_to_vf(vf, v_opcode, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, + 0); + return; + } + error_handler: if (err) { ice_vc_send_msg_to_vf(vf, v_opcode, VIRTCHNL_STATUS_ERR_PARAM, NULL, 0); - dev_err(&pf->pdev->dev, "Invalid message from VF %d, opcode %d, len %d, error %d\n", + dev_err(dev, "Invalid message from VF %d, opcode %d, len %d, error %d\n", vf_id, v_opcode, msglen, err); return; } switch (v_opcode) { case VIRTCHNL_OP_VERSION: - err = ice_vc_get_ver_msg(vf, msg); + err = ops->get_ver_msg(vf, msg); break; case VIRTCHNL_OP_GET_VF_RESOURCES: - err = ice_vc_get_vf_res_msg(vf, msg); + err = ops->get_vf_res_msg(vf, msg); + if (ice_vf_init_vlan_stripping(vf)) + dev_dbg(dev, "Failed to initialize VLAN stripping for VF %d\n", + vf->vf_id); ice_vc_notify_vf_link_state(vf); break; case VIRTCHNL_OP_RESET_VF: - ice_vc_reset_vf_msg(vf); + ops->reset_vf(vf); break; case VIRTCHNL_OP_ADD_ETH_ADDR: - err = ice_vc_add_mac_addr_msg(vf, msg); + err = ops->add_mac_addr_msg(vf, msg); break; case VIRTCHNL_OP_DEL_ETH_ADDR: - err = ice_vc_del_mac_addr_msg(vf, msg); + err = ops->del_mac_addr_msg(vf, msg); break; case VIRTCHNL_OP_CONFIG_VSI_QUEUES: - err = ice_vc_cfg_qs_msg(vf, msg); + err = ops->cfg_qs_msg(vf, msg); break; case VIRTCHNL_OP_ENABLE_QUEUES: - err = ice_vc_ena_qs_msg(vf, msg); + err = ops->ena_qs_msg(vf, msg); ice_vc_notify_vf_link_state(vf); break; case VIRTCHNL_OP_DISABLE_QUEUES: - err = ice_vc_dis_qs_msg(vf, msg); + err = ops->dis_qs_msg(vf, msg); break; case VIRTCHNL_OP_REQUEST_QUEUES: - err = ice_vc_request_qs_msg(vf, msg); + err = ops->request_qs_msg(vf, msg); break; case VIRTCHNL_OP_CONFIG_IRQ_MAP: - err = ice_vc_cfg_irq_map_msg(vf, msg); + err = ops->cfg_irq_map_msg(vf, msg); break; case VIRTCHNL_OP_CONFIG_RSS_KEY: - err = ice_vc_config_rss_key(vf, msg); + err = ops->config_rss_key(vf, msg); break; case VIRTCHNL_OP_CONFIG_RSS_LUT: - err = ice_vc_config_rss_lut(vf, msg); + err = ops->config_rss_lut(vf, msg); break; case VIRTCHNL_OP_GET_STATS: - err = ice_vc_get_stats_msg(vf, msg); + err = ops->get_stats_msg(vf, msg); + break; + case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + err = ops->cfg_promiscuous_mode_msg(vf, msg); break; case VIRTCHNL_OP_ADD_VLAN: - err = ice_vc_add_vlan_msg(vf, msg); + err = ops->add_vlan_msg(vf, msg); break; case VIRTCHNL_OP_DEL_VLAN: - err = ice_vc_remove_vlan_msg(vf, msg); + err = ops->remove_vlan_msg(vf, msg); + break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + err = ops->query_rxdid(vf); + break; + case VIRTCHNL_OP_GET_RSS_HENA_CAPS: + err = ops->get_rss_hena(vf); + break; + case VIRTCHNL_OP_SET_RSS_HENA: + err = ops->set_rss_hena_msg(vf, msg); break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: - err = ice_vc_ena_vlan_stripping(vf); + err = ops->ena_vlan_stripping(vf); break; case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: - err = ice_vc_dis_vlan_stripping(vf); + err = ops->dis_vlan_stripping(vf); + break; +#ifdef HAVE_TC_SETUP_CLSFLOWER + case VIRTCHNL_OP_ENABLE_CHANNELS: + err = ops->add_qch_msg(vf, msg); + break; + case VIRTCHNL_OP_ADD_CLOUD_FILTER: + err = ops->add_switch_filter_msg(vf, msg); + break; + case VIRTCHNL_OP_DEL_CLOUD_FILTER: + err = 
ops->del_switch_filter_msg(vf, msg); + break; + case VIRTCHNL_OP_DISABLE_CHANNELS: + err = ops->del_qch_msg(vf, msg); + break; +#endif /* HAVE_TC_SETUP_FLOWER */ + case VIRTCHNL_OP_RDMA: + err = ops->rdma_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP: + err = ops->cfg_rdma_irq_map_msg(vf, msg); + break; + case VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP: + err = ops->clear_rdma_irq_map(vf); + break; + case VIRTCHNL_OP_DCF_VLAN_OFFLOAD: + err = ops->dcf_vlan_offload_msg(vf, msg); + break; + case VIRTCHNL_OP_DCF_CMD_DESC: + err = ops->dcf_cmd_desc_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_DCF_CMD_BUFF: + err = ops->dcf_cmd_buff_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_DCF_RULE_FLUSH: + err = ice_vc_flush_dcf_rule(vf); + break; + case VIRTCHNL_OP_DCF_DISABLE: + err = ops->dis_dcf_cap(vf); + break; + case VIRTCHNL_OP_DCF_GET_VSI_MAP: + err = ops->dcf_get_vsi_map(vf); + break; + case VIRTCHNL_OP_DCF_GET_PKG_INFO: + err = ops->dcf_query_pkg_info(vf); + break; + case VIRTCHNL_OP_ADD_RSS_CFG: + err = ops->handle_rss_cfg_msg(vf, msg, true); + break; + case VIRTCHNL_OP_DEL_RSS_CFG: + err = ops->handle_rss_cfg_msg(vf, msg, false); + break; + case VIRTCHNL_OP_ADD_FDIR_FILTER: + err = ops->add_fdir_fltr_msg(vf, msg); + break; + case VIRTCHNL_OP_DEL_FDIR_FILTER: + err = ops->del_fdir_fltr_msg(vf, msg); + break; + case VIRTCHNL_OP_GET_MAX_RSS_QREGION: + err = ops->get_max_rss_qregion(vf); + break; + case VIRTCHNL_OP_ENABLE_QUEUES_V2: + err = ops->ena_qs_v2_msg(vf, msg); + ice_vc_notify_vf_link_state(vf); + break; + case VIRTCHNL_OP_DISABLE_QUEUES_V2: + err = ops->dis_qs_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_MAP_QUEUE_VECTOR: + err = ops->map_q_vector_msg(vf, msg); + break; + case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS: + err = ops->get_offload_vlan_v2_caps(vf); + break; + case VIRTCHNL_OP_ADD_VLAN_V2: + err = ops->add_vlan_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_DEL_VLAN_V2: + err = ops->remove_vlan_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2: + err = ops->ena_vlan_stripping_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2: + err = ops->dis_vlan_stripping_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2: + err = ops->ena_vlan_insertion_v2_msg(vf, msg); + break; + case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: + err = ops->dis_vlan_insertion_v2_msg(vf, msg); break; case VIRTCHNL_OP_UNKNOWN: default: - dev_err(&pf->pdev->dev, "Unsupported opcode %d from VF %d\n", - v_opcode, vf_id); + dev_err(dev, "Unsupported opcode %d from VF %d\n", v_opcode, + vf_id); err = ice_vc_send_msg_to_vf(vf, v_opcode, VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, 0); @@ -3006,8 +10108,7 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) /* Helper function cares less about error return values here * as it is busy with pending work. 
*/ - dev_info(&pf->pdev->dev, - "PF failed to honor VF %d, opcode %d, error %d\n", + dev_info(dev, "PF failed to honor VF %d, opcode %d, error %d\n", vf_id, v_opcode, err); } } @@ -3023,107 +10124,48 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) int ice_get_vf_cfg(struct net_device *netdev, int vf_id, struct ifla_vf_info *ivi) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - struct ice_vf *vf; - - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - netdev_err(netdev, "invalid VF id: %d\n", vf_id); - return -EINVAL; - } - - vf = &pf->vf[vf_id]; - vsi = pf->vsi[vf->lan_vsi_idx]; - - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - netdev_err(netdev, "VF %d in reset. Try again.\n", vf_id); - return -EBUSY; - } - - ivi->vf = vf_id; - ether_addr_copy(ivi->mac, vf->dflt_lan_addr.addr); - - /* VF configuration for VLAN and applicable QoS */ - ivi->vlan = le16_to_cpu(vsi->info.pvid) & ICE_VLAN_M; - ivi->qos = (le16_to_cpu(vsi->info.pvid) & ICE_PRIORITY_M) >> - ICE_VLAN_PRIORITY_S; - - ivi->trusted = vf->trusted; - ivi->spoofchk = vf->spoofchk; - if (!vf->link_forced) - ivi->linkstate = IFLA_VF_LINK_STATE_AUTO; - else if (vf->link_up) - ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; - else - ivi->linkstate = IFLA_VF_LINK_STATE_DISABLE; - ivi->max_tx_rate = vf->tx_rate; - ivi->min_tx_rate = 0; - return 0; -} - -/** - * ice_set_vf_spoofchk - * @netdev: network interface device structure - * @vf_id: VF identifier - * @ena: flag to enable or disable feature - * - * Enable or disable VF spoof checking - */ -int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) -{ - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - struct ice_vsi_ctx *ctx; - enum ice_status status; + struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; - int ret = 0; - - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - netdev_err(netdev, "invalid VF id: %d\n", vf_id); - return -EINVAL; - } - - vf = &pf->vf[vf_id]; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - netdev_err(netdev, "VF %d in reset. Try again.\n", vf_id); - return -EBUSY; - } - - if (ena == vf->spoofchk) { - dev_dbg(&pf->pdev->dev, "VF spoofchk already %s\n", - ena ? 
"ON" : "OFF"); - return 0; - } - - ctx = devm_kzalloc(&pf->pdev->dev, sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + int ret; - ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; - if (ena) { - ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; - ctx->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M; - } + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; - status = ice_update_vsi(&pf->hw, vsi->idx, ctx, NULL); - if (status) { - dev_dbg(&pf->pdev->dev, - "Error %d, failed to update VSI* parameters\n", status); - ret = -EIO; - goto out; - } + ivi->vf = vf_id; + ether_addr_copy(ivi->mac, vf->hw_lan_addr.addr); - vf->spoofchk = ena; - vsi->info.sec_flags = ctx->info.sec_flags; - vsi->info.sw_flags2 = ctx->info.sw_flags2; -out: - devm_kfree(&pf->pdev->dev, ctx); - return ret; + /* VF configuration for VLAN and applicable QoS */ + ivi->vlan = ice_vf_get_port_vlan_id(vf); + ivi->qos = ice_vf_get_port_vlan_prio(vf); +#ifdef IFLA_VF_VLAN_INFO_MAX + if (ice_vf_is_port_vlan_ena(vf)) + ivi->vlan_proto = cpu_to_be16(ice_vf_get_port_vlan_tpid(vf)); +#endif /* IFLA_VF_VLAN_INFO_MAX */ + +#ifdef HAVE_NDO_SET_VF_TRUST + ivi->trusted = vf->trusted; +#endif /* HAVE_NDO_SET_VF_TRUST */ + ivi->spoofchk = vf->spoofchk; +#ifdef HAVE_NDO_SET_VF_LINK_STATE + if (!vf->link_forced) + ivi->linkstate = IFLA_VF_LINK_STATE_AUTO; + else if (vf->link_up) + ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; + else + ivi->linkstate = IFLA_VF_LINK_STATE_DISABLE; +#endif +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE + ivi->max_tx_rate = vf->max_tx_rate; + ivi->min_tx_rate = vf->min_tx_rate; +#else + ivi->tx_rate = vf->max_tx_rate; +#endif + return 0; } /** @@ -3136,42 +10178,54 @@ int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) */ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; + struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; - int ret = 0; + int ret; - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - netdev_err(netdev, "invalid VF id: %d\n", vf_id); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + if (is_multicast_ether_addr(mac)) { + netdev_err(netdev, "%pM not a valid unicast address\n", mac); return -EINVAL; } vf = &pf->vf[vf_id]; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - netdev_err(netdev, "VF %d in reset. Try again.\n", vf_id); - return -EBUSY; - } + /* nothing left to do, unicast MAC already set */ + if (ether_addr_equal(vf->dev_lan_addr.addr, mac) && + ether_addr_equal(vf->hw_lan_addr.addr, mac)) + return 0; - if (is_zero_ether_addr(mac) || is_multicast_ether_addr(mac)) { - netdev_err(netdev, "%pM not a valid unicast address\n", mac); - return -EINVAL; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + if (ice_vf_chnl_dmac_fltr_cnt(vf)) { + netdev_err(netdev, + "can't set mac %pM. VF %d has tc-flower filters, delete them and try again\n", + mac, vf_id); + return -EAGAIN; } - /* copy MAC into dflt_lan_addr and trigger a VF reset. The reset - * flow will use the updated dflt_lan_addr and add a MAC filter - * using ice_add_mac. Also set pf_set_mac to indicate that the PF has - * set the MAC address for this VF. 
+ /* VF is notified of its new MAC via the PF's response to the + * VIRTCHNL_OP_GET_VF_RESOURCES message after the VF has been reset */ - ether_addr_copy(vf->dflt_lan_addr.addr, mac); - vf->pf_set_mac = true; - netdev_info(netdev, - "MAC on VF %d set to %pM. VF driver will be reinitialized\n", - vf_id, mac); + ether_addr_copy(vf->dev_lan_addr.addr, mac); + ether_addr_copy(vf->hw_lan_addr.addr, mac); + if (is_zero_ether_addr(mac)) { + /* VF will send VIRTCHNL_OP_ADD_ETH_ADDR message with its MAC */ + vf->pf_set_mac = false; + netdev_info(netdev, "Removing MAC on VF %d. VF driver will be reinitialized\n", + vf->vf_id); + } else { + /* PF will add MAC rule for the VF */ + vf->pf_set_mac = true; + netdev_info(netdev, "Setting MAC %pM on VF %d. VF driver will be reinitialized\n", + mac, vf_id); + } - ice_vc_dis_vf(vf); - return ret; + ice_vc_reset_vf(vf); + return 0; } /** @@ -3184,35 +10238,55 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) */ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; + struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; + int ret; - /* validate the request */ - if (vf_id >= pf->num_alloc_vfs) { - dev_err(&pf->pdev->dev, "invalid VF id: %d\n", vf_id); - return -EINVAL; + if (ice_is_eswitch_mode_switchdev(pf)) { + dev_info(ice_pf_to_dev(pf), + "Trusted VF is forbidden in switchdev mode\n"); + return -EOPNOTSUPP; } + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + vf = &pf->vf[vf_id]; - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - dev_err(&pf->pdev->dev, "VF %d in reset. Try again.\n", vf_id); - return -EBUSY; - } + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; /* Check if already trusted */ if (trusted == vf->trusted) return 0; +#ifdef HAVE_NDO_SET_VF_TRUST + /* If the trust mode of a given DCF is taken away without the DCF + * gracefully relinquishing the DCF functionality, remove ALL switch + * filters that were added by the DCF and treat this VF as any other + * untrusted AVF. + */ + if (ice_is_vf_dcf(vf) && !trusted && + ice_dcf_get_state(pf) != ICE_DCF_STATE_OFF) { + ice_rm_all_dcf_sw_rules(pf); + ice_clear_dcf_acl_cfg(pf); + ice_clear_dcf_udp_tunnel_cfg(pf); + pf->hw.dcf_caps &= ~(DCF_ACL_CAP | DCF_UDP_TUNNEL_CAP); + ice_dcf_set_state(pf, ICE_DCF_STATE_OFF); + pf->dcf.vf = NULL; + vf->driver_caps &= ~VIRTCHNL_VF_CAP_DCF; + } + + ice_vc_reset_vf(vf); +#endif /* HAVE_NDO_SET_VF_TRUST */ vf->trusted = trusted; - ice_vc_dis_vf(vf); - dev_info(&pf->pdev->dev, "VF %u is now %strusted\n", - vf_id, trusted ? "" : "un"); + dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n", vf_id, + trusted ? 
"" : "un"); return 0; } +#ifdef HAVE_NDO_SET_VF_LINK_STATE /** * ice_set_vf_link_state * @netdev: network interface device structure @@ -3223,34 +10297,25 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) */ int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state) { - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_pf *pf = np->vsi->back; - struct virtchnl_pf_event pfe = { 0 }; - struct ice_link_status *ls; + struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; - struct ice_hw *hw; + int ret; - if (vf_id >= pf->num_alloc_vfs) { - dev_err(&pf->pdev->dev, "Invalid VF Identifier %d\n", vf_id); + if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; - } vf = &pf->vf[vf_id]; - hw = &pf->hw; - ls = &pf->hw.port_info->phy.link_info; - - if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { - dev_err(&pf->pdev->dev, "vf %d in reset. Try again.\n", vf_id); - return -EBUSY; - } + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; - pfe.event = VIRTCHNL_EVENT_LINK_CHANGE; - pfe.severity = PF_EVENT_SEVERITY_INFO; + /* disallow link state change if eeprom is corrupted */ + if (test_bit(ICE_BAD_EEPROM, pf->state)) + return -EOPNOTSUPP; switch (link_state) { case IFLA_VF_LINK_STATE_AUTO: vf->link_forced = false; - vf->link_up = ls->link_info & ICE_AQ_LINK_UP; break; case IFLA_VF_LINK_STATE_ENABLE: vf->link_forced = true; @@ -3264,15 +10329,466 @@ int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state) return -EINVAL; } - if (vf->link_forced) - ice_set_pfe_link_forced(vf, &pfe, vf->link_up); - else - ice_set_pfe_link(vf, &pfe, ls->link_speed, vf->link_up); + if (vf->repr) { + struct net_device *pr_netdev = vf->repr->netdev; + unsigned int flags = pr_netdev->flags; - /* Notify the VF of its new link state */ - ice_aq_send_msg_to_vf(hw, vf->vf_id, VIRTCHNL_OP_EVENT, - VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, - sizeof(pfe), NULL); + flags = vf->link_up ? 
flags | IFF_UP : flags & ~IFF_UP; + dev_change_flags(pr_netdev, flags, NULL); + } + + ice_vc_notify_vf_link_state(vf); + + return 0; +} +#endif /* HAVE_NDO_SET_VF_LINK_STATE */ + +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +/** + * ice_calc_all_vfs_min_tx_rate - calculate cumulative min Tx rate on all VFs + * @pf: PF associated with VFs + */ +static int ice_calc_all_vfs_min_tx_rate(struct ice_pf *pf) +{ + int rate = 0, i; + + ice_for_each_vf(pf, i) + rate += pf->vf[i].min_tx_rate; + + return rate; +} + +/** + * ice_min_tx_rate_oversubscribed - check if min Tx rate causes oversubscription + * @vf: VF trying to configure min_tx_rate + * @min_tx_rate: min Tx rate in Mbps + * + * Check if the min_tx_rate being passed in will cause oversubscription of total + * min_tx_rate based on the current link speed and all other VFs' configured + * min_tx_rate. + * + * Return true if the passed min_tx_rate would cause oversubscription, else + * return false + */ +static bool +ice_min_tx_rate_oversubscribed(struct ice_vf *vf, int min_tx_rate) +{ + int link_speed_mbps = ice_get_link_speed_mbps(ice_get_vf_vsi(vf)); + int all_vfs_min_tx_rate = ice_calc_all_vfs_min_tx_rate(vf->pf); + + /* this VF's previous rate is being overwritten */ + all_vfs_min_tx_rate -= vf->min_tx_rate; + + if (all_vfs_min_tx_rate + min_tx_rate > link_speed_mbps) { + dev_err(ice_pf_to_dev(vf->pf), "min_tx_rate of %d Mbps on VF %u would cause oversubscription of %d Mbps based on the current link speed %d Mbps\n", + min_tx_rate, vf->vf_id, + all_vfs_min_tx_rate + min_tx_rate - link_speed_mbps, + link_speed_mbps); + return true; + } + + return false; +} +#endif /* HAVE_NDO_SET_VF_MIN_MAX_TX_RATE */ + +#ifdef HAVE_TC_SETUP_CLSFLOWER +/** + * ice_vf_adq_total_max_tx_rate - cumulative max_tx_rate when VF ADQ is enabled + * @vf: Pointer to VF + * + * This function returns the cumulative max Tx rate of all TCs if VF ADQ is enabled + */ +static u64 ice_vf_adq_total_max_tx_rate(struct ice_vf *vf) +{ + u64 cumulative_max_tx_rate = 0; + int i; + + if (!ice_is_vf_adq_ena(vf)) + return 0; + + for (i = 0; i < vf->num_tc; i++) + cumulative_max_tx_rate += vf->ch[i].max_tx_rate; + + return cumulative_max_tx_rate; +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +/** + * ice_set_vf_bw - set min/max VF bandwidth + * @netdev: network interface device structure + * @vf_id: VF identifier + * @min_tx_rate: Minimum Tx rate in Mbps + * @max_tx_rate: Maximum Tx rate in Mbps + */ +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +int +ice_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, + int max_tx_rate) +#else +int ice_set_vf_bw(struct net_device *netdev, int vf_id, int max_tx_rate) +#endif +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_vsi *vsi; + struct device *dev; + struct ice_vf *vf; + int ret; + + dev = ice_pf_to_dev(pf); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + vsi = ice_get_vf_vsi(vf); + +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE + /* when max_tx_rate is zero that means no max Tx rate limiting, so only + * check if max_tx_rate is non-zero + */ + if (max_tx_rate && min_tx_rate > max_tx_rate) { + dev_err(dev, "Cannot set min Tx rate %d Mbps greater than max Tx rate %d Mbps\n", + min_tx_rate, max_tx_rate); + return -EINVAL; + } + +#ifdef NETIF_F_HW_TC + if (min_tx_rate && ice_is_adq_active(pf)) { + dev_err(dev, "ADQ on PF is currently enabled. 
VF min Tx rate limiting not allowed on this PF.\n"); + return -EOPNOTSUPP; + } +#endif /* NETIF_F_HW_TC */ + + if (min_tx_rate && ice_is_dcb_active(pf)) { + dev_err(dev, "DCB on PF is currently enabled. VF min Tx rate limiting not allowed on this PF.\n"); + return -EOPNOTSUPP; + } + + if (ice_min_tx_rate_oversubscribed(vf, min_tx_rate)) + return -EINVAL; + + if (vf->min_tx_rate != (unsigned int)min_tx_rate) { + ret = ice_set_min_bw_limit(vsi, (u64)min_tx_rate * 1000); + if (ret) { + dev_err(dev, "Unable to set min-tx-rate for VF %d\n", + vf->vf_id); + return ret; + } + + vf->min_tx_rate = min_tx_rate; + } + +#endif /* HAVE_NDO_SET_VF_MIN_MAX_TX_RATE */ + if (vf->max_tx_rate != (unsigned int)max_tx_rate) { +#ifdef HAVE_TC_SETUP_CLSFLOWER + u64 adq_max_tx_rate; +#endif + ret = ice_set_max_bw_limit(vsi, (u64)max_tx_rate * 1000); + if (ret) { + dev_err(dev, "Unable to set max-tx-rate for VF %d\n", + vf->vf_id); + return ret; + } + + vf->max_tx_rate = max_tx_rate; +#ifdef HAVE_TC_SETUP_CLSFLOWER + adq_max_tx_rate = ice_vf_adq_total_max_tx_rate(vf); + if (vf->max_tx_rate < adq_max_tx_rate) + dev_warn(dev, "Host managed max_tx_rate %u Mpbs for VF %d is less VF ADQ cummulative max_tx_rate %llu Mpbs\n", + vf->vf_id, vf->max_tx_rate, adq_max_tx_rate); +#endif + } + + return 0; +} + +#ifdef HAVE_VF_STATS +/** + * ice_get_vf_stats - populate some stats for the VF + * @netdev: the netdev of the PF + * @vf_id: the host OS identifier (0-255) + * @vf_stats: pointer to the OS memory to be initialized + */ +int ice_get_vf_stats(struct net_device *netdev, int vf_id, + struct ifla_vf_stats *vf_stats) +{ + struct ice_pf *pf = ice_netdev_to_pf(netdev); + struct ice_eth_stats *stats; + struct ice_vsi *vsi; + struct ice_vf *vf; + int ret; + + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + + vsi = ice_get_vf_vsi(vf); + if (!vsi) + return -EINVAL; + + ice_update_eth_stats(vsi); + stats = &vsi->eth_stats; + + memset(vf_stats, 0, sizeof(*vf_stats)); + + vf_stats->rx_packets = stats->rx_unicast + stats->rx_broadcast + + stats->rx_multicast; + vf_stats->tx_packets = stats->tx_unicast + stats->tx_broadcast + + stats->tx_multicast; + vf_stats->rx_bytes = stats->rx_bytes; + vf_stats->tx_bytes = stats->tx_bytes; + vf_stats->broadcast = stats->rx_broadcast; + vf_stats->multicast = stats->rx_multicast; +#ifdef HAVE_VF_STATS_DROPPED + vf_stats->rx_dropped = stats->rx_discards; + vf_stats->tx_dropped = stats->tx_discards; +#endif return 0; } +#endif /* HAVE_VF_STATS */ + +/** + * ice_print_vf_rx_mdd_event - print VF Rx malicious driver detect event + * @vf: pointer to the VF structure + */ +void ice_print_vf_rx_mdd_event(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + struct device *dev; + + dev = ice_pf_to_dev(pf); + + dev_info(dev, "%d Rx Malicious Driver Detection events detected on PF %d VF %d MAC %pM. mdd-auto-reset-vfs=%s\n", + vf->mdd_rx_events.count, pf->hw.pf_id, vf->vf_id, + vf->dev_lan_addr.addr, + test_bit(ICE_FLAG_MDD_AUTO_RESET_VF, pf->flags) + ? "on" : "off"); +} + +/** + * ice_print_vfs_mdd_events - print VFs malicious driver detect event + * @pf: pointer to the PF structure + * + * Called from ice_handle_mdd_event to rate limit and print VFs MDD events. 
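The rate limiting described here is two checks layered together: a global once-per-second gate keyed off the time of the last print, and a per-VF comparison of the running event count against the count that was last printed. A minimal standalone sketch of the once-per-second gate, using a plain tick counter in place of jiffies/HZ and ignoring counter wraparound:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TICKS_PER_SEC 1000 /* stand-in for the kernel's HZ */

/* Return true if at least one second has passed since the last print. */
static bool mdd_print_allowed(uint64_t now, uint64_t *last_printed)
{
        if (now < *last_printed + TICKS_PER_SEC)
                return false;   /* printed less than one second ago */

        *last_printed = now;    /* record this print's timestamp */
        return true;
}

int main(void)
{
        uint64_t last = 0;
        uint64_t t;

        /* events arrive every 400 ticks; only about one per second prints */
        for (t = TICKS_PER_SEC; t <= 3 * TICKS_PER_SEC; t += 400)
                printf("t=%4llu -> %s\n", (unsigned long long)t,
                       mdd_print_allowed(t, &last) ? "print" : "suppressed");
        return 0;
}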
+ */ +void ice_print_vfs_mdd_events(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct ice_hw *hw = &pf->hw; + int i; + + /* check that there are pending MDD events to print */ + if (!test_and_clear_bit(ICE_MDD_VF_PRINT_PENDING, pf->state)) + return; + + /* VF MDD event logs are rate limited to one second intervals */ + if (time_is_after_jiffies(pf->last_printed_mdd_jiffies + HZ * 1)) + return; + + pf->last_printed_mdd_jiffies = jiffies; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + /* only print Rx MDD event message if there are new events */ + if (vf->mdd_rx_events.count != vf->mdd_rx_events.last_printed) { + vf->mdd_rx_events.last_printed = + vf->mdd_rx_events.count; + ice_print_vf_rx_mdd_event(vf); + } + + /* only print Tx MDD event message if there are new events */ + if (vf->mdd_tx_events.count != vf->mdd_tx_events.last_printed) { + vf->mdd_tx_events.last_printed = + vf->mdd_tx_events.count; + + dev_info(dev, "%d Tx Malicious Driver Detection events detected on PF %d VF %d MAC %pM.\n", + vf->mdd_tx_events.count, hw->pf_id, i, + vf->dev_lan_addr.addr); + } + } +} + +/** + * ice_restore_all_vfs_msi_state - restore VF MSI state after PF FLR + * @pdev: pointer to a pci_dev structure + * + * Called when recovering from a PF FLR to restore interrupt capability to + * the VFs. + */ +void ice_restore_all_vfs_msi_state(struct pci_dev *pdev) +{ + u16 vf_id; + int pos; + + if (!pci_num_vf(pdev)) + return; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); + if (pos) { + struct pci_dev *vfdev; + + pci_read_config_word(pdev, pos + PCI_SRIOV_VF_DID, + &vf_id); + vfdev = pci_get_device(pdev->vendor, vf_id, NULL); + while (vfdev) { + if (vfdev->is_virtfn && vfdev->physfn == pdev) + pci_restore_msi_state(vfdev); + vfdev = pci_get_device(pdev->vendor, vf_id, + vfdev); + } + } +} + +/** + * ice_is_malicious_vf - helper function to detect a malicious VF + * @pf: ptr to struct ice_pf + * @event: pointer to the AQ event + * @num_msg_proc: the number of messages processed so far + * @num_msg_pending: the number of messages peinding in admin queue + */ +bool +ice_is_malicious_vf(struct ice_pf *pf, struct ice_rq_event_info *event, + u16 num_msg_proc, u16 num_msg_pending) +{ + s16 vf_id = le16_to_cpu(event->desc.retval); + struct device *dev = ice_pf_to_dev(pf); + struct ice_mbx_data mbxdata; + enum ice_status status; + bool malvf = false; + struct ice_vf *vf; + + if (ice_validate_vf_id(pf, vf_id)) + return false; + + vf = &pf->vf[vf_id]; + /* Check if VF is disabled. */ + if (test_bit(ICE_VF_STATE_DIS, vf->vf_states)) + return false; + + mbxdata.num_msg_proc = num_msg_proc; + mbxdata.num_pending_arq = num_msg_pending; + mbxdata.max_num_msgs_mbx = pf->hw.mailboxq.num_rq_entries; +#define ICE_MBX_OVERFLOW_WATERMARK 64 + mbxdata.async_watermark_val = ICE_MBX_OVERFLOW_WATERMARK; + + /* check to see if we have a malicious VF */ + status = ice_mbx_vf_state_handler(&pf->hw, &mbxdata, vf_id, &malvf); + if (status) + return false; + + if (malvf) { + bool report_vf = false; + + /* if the VF is malicious and we haven't let the user + * know about it, then let them know now + */ + status = ice_mbx_report_malvf(&pf->hw, pf->malvfs, + ICE_MAX_VF_COUNT, vf_id, + &report_vf); + if (status) + dev_dbg(dev, "Error reporting malicious VF\n"); + + if (report_vf) { + struct ice_vsi *pf_vsi = ice_get_main_vsi(pf); + + if (pf_vsi) + dev_warn(dev, "VF MAC %pM on PF MAC %pM is generating asynchronous messages and may be overflowing the PF message queue. 
Please see the Adapter User Guide for more information\n", + &vf->dev_lan_addr.addr[0], + pf_vsi->netdev->dev_addr); + } + + return true; + } + + /* if there was an error in detection or the VF is not malicious then + * return false + */ + return false; +} + +static void ice_dump_vf(struct ice_vf *vf) +{ + struct ice_vsi *vsi; + struct device *dev; + struct ice_pf *pf; + + if (!vf) + return; + + pf = vf->pf; + vsi = ice_get_vf_vsi(vf); + if (!vsi) + return; + + dev = ice_pf_to_dev(pf); + dev_info(dev, "VF[%d]:\n", vf->vf_id); + dev_info(dev, "\tvf_ver.major = %d vf_ver.minor = %d\n", + vf->vf_ver.major, vf->vf_ver.minor); + dev_info(dev, "\tdriver_caps = 0x%08x\n", vf->driver_caps); + dev_info(dev, "\tvf_caps = 0x%08lx\n", vf->vf_caps); + dev_info(dev, "\tvf_states:\n"); + if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_INIT\n"); + if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_ACTIVE\n"); + if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_QS_ENA\n"); + if (test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_MC_PROMISC\n"); + if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states)) + dev_info(dev, "\t\tICE_VF_STATE_UC_PROMISC\n"); + dev_info(dev, "\tvsi = %pK, vsi->idx = %d, vsi->vsi_num = %d\n", + vsi, vsi->idx, vsi->vsi_num); + dev_info(dev, "\tlan_vsi_idx = %d\n", vf->lan_vsi_idx); + dev_info(dev, "\tlan_vsi_num = %d\n", vf->lan_vsi_num); + dev_info(dev, "\tnum_mac = %d\n", vf->num_mac); + dev_info(dev, "\tdev_lan_addr = %pM\n", &vf->dev_lan_addr.addr[0]); + dev_info(dev, "\thw_lan_addr = %pM\n", &vf->hw_lan_addr.addr[0]); + dev_info(dev, "\tnum_req_qs = %d\n", vf->num_req_qs); + dev_info(dev, "\trxq_ena = 0x%lx\n", *vf->rxq_ena); + dev_info(dev, "\ttxq_ena = 0x%lx\n", *vf->txq_ena); + dev_info(dev, "\tPort VLAN status: %s\n", + ice_vf_is_port_vlan_ena(vf) ? "enabled" : "disabled"); + dev_info(dev, "\t\tPort VLAN ID = %d\n", ice_vf_get_port_vlan_id(vf)); + dev_info(dev, "\t\tQoS = %d\n", ice_vf_get_port_vlan_prio(vf)); + dev_info(dev, "\t\tTPID = 0x%x", ice_vf_get_port_vlan_tpid(vf)); + dev_info(dev, "\tpf_set_mac = %s\n", vf->pf_set_mac ? "true" : "false"); + dev_info(dev, "\ttrusted = %s\n", vf->trusted ? "true" : "false"); + dev_info(dev, "\tspoofchk = %s\n", vf->spoofchk ? "true" : "false"); +#ifdef HAVE_NDO_SET_VF_LINK_STATE + dev_info(dev, "\tlink_forced = %s, link_up (only valid when link_forced is true) = %s\n", + vf->link_forced ? "true" : "false", + vf->link_up ? "true" : "false"); +#endif + dev_info(dev, "\tmax_tx_rate = %d\n", vf->max_tx_rate); + dev_info(dev, "\tmin_tx_rate = %d\n", vf->min_tx_rate); + dev_info(dev, "\tnum_inval_msgs = %lld\n", vf->num_inval_msgs); + dev_info(dev, "\tnum_valid_msgs = %lld\n", vf->num_valid_msgs); + dev_info(dev, "\tmdd_rx_events = %u\n", vf->mdd_rx_events.count); + dev_info(dev, "\tmdd_tx_events = %u\n", vf->mdd_tx_events.count); + dev_info(dev, "\tfirst_vector_idx = %d\n", vf->first_vector_idx); + dev_info(dev, "\tvf_sw_id = %pK\n", vf->vf_sw_id); + dev_info(dev, "\tadq_enabled = %s\n", + vf->adq_enabled ? "true" : "false"); + dev_info(dev, "\tadq_fltr_ena = %s\n", + vf->adq_fltr_ena ? 
"true" : "false"); + dev_info(dev, "\tnum_tc = %u\n", vf->num_tc); + dev_info(dev, "\tnum_dmac_chnl_fltrs = %u\n", vf->num_dmac_chnl_fltrs); +} + +void ice_dump_all_vfs(struct ice_pf *pf) +{ + u16 v; + + ice_for_each_vf(pf, v) + ice_dump_vf(&pf->vf[v]); +} diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index 0d9880c8bba315df05716d0d6348df85f33e99ee..ba727a4c986e2f3154d9fd207ad228c963b09039 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -1,18 +1,21 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018, Intel Corporation. */ +/* Copyright (C) 2018-2021, Intel Corporation. */ #ifndef _ICE_VIRTCHNL_PF_H_ #define _ICE_VIRTCHNL_PF_H_ #include "ice.h" +#include "ice_virtchnl_fdir.h" +#include "ice_dcf.h" +#include "ice_vsi_vlan_ops.h" -#define ICE_MAX_VLANID 4095 -#define ICE_VLAN_PRIORITY_S 12 -#define ICE_VLAN_M 0xFFF -#define ICE_PRIORITY_M 0x7000 +#define ICE_VIRTCHNL_SUPPORTED_QTYPES 2 /* Restrict number of MAC Addr and VLAN that non-trusted VF can programmed */ #define ICE_MAX_VLAN_PER_VF 8 -#define ICE_MAX_MACADDR_PER_VF 12 +/* MAC filters: 1 is reserved for the VF's default/perm_addr/LAA MAC, 1 for + * broadcast, and 16 for additional unicast/multicast filters + */ +#define ICE_MAX_MACADDR_PER_VF 18 /* Malicious Driver Detection */ #define ICE_DFLT_NUM_INVAL_MSGS_ALLOWED 10 @@ -26,18 +29,31 @@ #define ICE_PCI_CIAD_WAIT_COUNT 100 #define ICE_PCI_CIAD_WAIT_DELAY_US 1 -/* VF resources default values and limitation */ +/* VF resource constraints */ #define ICE_MAX_VF_COUNT 256 -#define ICE_MAX_QS_PER_VF 256 +#define ICE_MAX_QS_PER_VF 256 +/* Maximum number of queue pairs to configure by default for a VF */ +#define ICE_MAX_DFLT_QS_PER_VF 16 #define ICE_MIN_QS_PER_VF 1 -#define ICE_DFLT_QS_PER_VF 4 #define ICE_NONQ_VECS_VF 1 #define ICE_MAX_SCATTER_QS_PER_VF 16 -#define ICE_MAX_BASE_QS_PER_VF 16 -#define ICE_MAX_INTR_PER_VF 65 -#define ICE_MAX_POLICY_INTR_PER_VF 33 +#define ICE_MAX_RSS_QS_PER_LARGE_VF 64 +#define ICE_MAX_RSS_QS_PER_VF 16 +#define ICE_NUM_VF_MSIX_MAX 65 +#define ICE_NUM_VF_MSIX_LARGE 33 +#define ICE_NUM_VF_MSIX_MED 17 +#define ICE_NUM_VF_MSIX_SMALL 5 +#define ICE_NUM_VF_MSIX_MULTIQ_MIN 3 #define ICE_MIN_INTR_PER_VF (ICE_MIN_QS_PER_VF + 1) -#define ICE_DFLT_INTR_PER_VF (ICE_DFLT_QS_PER_VF + 1) +#define ICE_MAX_VF_RESET_TRIES 40 +#define ICE_MAX_VF_RESET_SLEEP_MS 20 +#define ICE_MAX_IPSEC_CAPABLE_VF_ID 127 + +#define ice_for_each_vf(pf, i) \ + for ((i) = 0; (i) < (pf)->num_alloc_vfs; (i)++) + +/* Max number of flexible descriptor rxdid */ +#define ICE_FLEX_DESC_RXDID_MAX_NUM 64 /* Specific VF states */ enum ice_vf_states { @@ -56,46 +72,240 @@ enum ice_virtchnl_cap { ICE_VIRTCHNL_VF_CAP_PRIVILEGE, }; +/* DDP package type */ +enum ice_pkg_type { + ICE_PKG_TYPE_UNKNOWN = 0, + ICE_PKG_TYPE_OS_DEFAULT, + ICE_PKG_TYPE_COMMS, + ICE_PKG_TYPE_WIRELESS_EDGE, + ICE_PKG_TYPE_GTP_OVER_GRE, + ICE_PKG_TYPE_END, +}; + +/* In ADQ, max 4 VSI's can be allocated per VF including primary VF VSI. + * These variables are used to store indices, ID's and number of queues + * for each VSI including that of primary VF VSI. Each Traffic class is + * termed as channel and each channel can in-turn have 4 queues which + * means max 16 queues overall per VF. 
+ */ +struct ice_channel_vf { + u16 vsi_idx; /* index in PF struct for all channel VSIs */ + u16 vsi_num; /* HW (absolute) index of this VSI */ + u16 num_qps; /* number of queue pairs requested by user */ + u16 offset; + u64 max_tx_rate; /* Tx rate limiting for channels */ + + /* type of filter: dest/src/dest+src port */ + u32 fltr_type; +}; + +struct ice_time_mac { + unsigned long time_modified; + u8 addr[ETH_ALEN]; +}; + +/* VF MDD events print structure */ +struct ice_mdd_vf_events { + u16 count; /* total count of Rx|Tx events */ + /* count number of the last printed event */ + u16 last_printed; +}; + +/* The VF VLAN information controlled by DCF */ +struct ice_dcf_vlan_info { + struct ice_vlan outer_port_vlan; + u16 outer_stripping_tpid; + u8 outer_stripping_ena:1; + u8 applying:1; +}; + +#define ICE_HASH_IP_CTX_IP 0 +#define ICE_HASH_IP_CTX_IP_ESP 1 +#define ICE_HASH_IP_CTX_IP_UDP_ESP 2 +#define ICE_HASH_IP_CTX_IP_AH 3 +#define ICE_HASH_IP_CTX_IP_L2TPV3 4 +#define ICE_HASH_IP_CTX_IP_PFCP 5 +#define ICE_HASH_IP_CTX_IP_UDP 6 +#define ICE_HASH_IP_CTX_IP_TCP 7 +#define ICE_HASH_IP_CTX_IP_SCTP 8 +#define ICE_HASH_IP_CTX_MAX 9 + +struct ice_vf_hash_ip_ctx { + struct ice_rss_hash_cfg ctx[ICE_HASH_IP_CTX_MAX]; +}; + +#define ICE_HASH_GTPU_CTX_EH_IP 0 +#define ICE_HASH_GTPU_CTX_EH_IP_UDP 1 +#define ICE_HASH_GTPU_CTX_EH_IP_TCP 2 +#define ICE_HASH_GTPU_CTX_UP_IP 3 +#define ICE_HASH_GTPU_CTX_UP_IP_UDP 4 +#define ICE_HASH_GTPU_CTX_UP_IP_TCP 5 +#define ICE_HASH_GTPU_CTX_DW_IP 6 +#define ICE_HASH_GTPU_CTX_DW_IP_UDP 7 +#define ICE_HASH_GTPU_CTX_DW_IP_TCP 8 +#define ICE_HASH_GTPU_CTX_MAX 9 + +struct ice_vf_hash_gtpu_ctx { + struct ice_rss_hash_cfg ctx[ICE_HASH_GTPU_CTX_MAX]; +}; + +struct ice_vf_hash_ctx { + struct ice_vf_hash_ip_ctx v4; + struct ice_vf_hash_ip_ctx v6; + struct ice_vf_hash_gtpu_ctx ipv4; + struct ice_vf_hash_gtpu_ctx ipv6; +}; + +struct ice_vf; + +struct ice_vc_vf_ops { + int (*get_ver_msg)(struct ice_vf *vf, u8 *msg); + int (*get_vf_res_msg)(struct ice_vf *vf, u8 *msg); + void (*reset_vf)(struct ice_vf *vf); + int (*add_mac_addr_msg)(struct ice_vf *vf, u8 *msg); + int (*del_mac_addr_msg)(struct ice_vf *vf, u8 *msg); + int (*cfg_qs_msg)(struct ice_vf *vf, u8 *msg); + int (*ena_qs_msg)(struct ice_vf *vf, u8 *msg); + int (*dis_qs_msg)(struct ice_vf *vf, u8 *msg); + int (*request_qs_msg)(struct ice_vf *vf, u8 *msg); + int (*cfg_irq_map_msg)(struct ice_vf *vf, u8 *msg); + int (*config_rss_key)(struct ice_vf *vf, u8 *msg); + int (*config_rss_lut)(struct ice_vf *vf, u8 *msg); + int (*get_stats_msg)(struct ice_vf *vf, u8 *msg); + int (*cfg_promiscuous_mode_msg)(struct ice_vf *vf, u8 *msg); + int (*add_vlan_msg)(struct ice_vf *vf, u8 *msg); + int (*remove_vlan_msg)(struct ice_vf *vf, u8 *msg); + int (*query_rxdid)(struct ice_vf *vf); + int (*get_rss_hena)(struct ice_vf *vf); + int (*set_rss_hena_msg)(struct ice_vf *vf, u8 *msg); + int (*ena_vlan_stripping)(struct ice_vf *vf); + int (*dis_vlan_stripping)(struct ice_vf *vf); +#ifdef HAVE_TC_SETUP_CLSFLOWER + int (*add_qch_msg)(struct ice_vf *vf, u8 *msg); + int (*add_switch_filter_msg)(struct ice_vf *vf, u8 *msg); + int (*del_switch_filter_msg)(struct ice_vf *vf, u8 *msg); + int (*del_qch_msg)(struct ice_vf *vf, u8 *msg); +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + int (*rdma_msg)(struct ice_vf *vf, u8 *msg, u16 msglen); + int (*cfg_rdma_irq_map_msg)(struct ice_vf *vf, u8 *msg); + int (*clear_rdma_irq_map)(struct ice_vf *vf); + int (*dcf_vlan_offload_msg)(struct ice_vf *vf, u8 *msg); + int (*dcf_cmd_desc_msg)(struct ice_vf *vf, u8 *msg, u16 
msglen); + int (*dcf_cmd_buff_msg)(struct ice_vf *vf, u8 *msg, u16 msglen); + int (*dis_dcf_cap)(struct ice_vf *vf); + int (*dcf_get_vsi_map)(struct ice_vf *vf); + int (*dcf_query_pkg_info)(struct ice_vf *vf); + int (*handle_rss_cfg_msg)(struct ice_vf *vf, u8 *msg, bool add); + int (*add_fdir_fltr_msg)(struct ice_vf *vf, u8 *msg); + int (*del_fdir_fltr_msg)(struct ice_vf *vf, u8 *msg); + int (*get_max_rss_qregion)(struct ice_vf *vf); + int (*ena_qs_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*dis_qs_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*map_q_vector_msg)(struct ice_vf *vf, u8 *msg); + int (*get_offload_vlan_v2_caps)(struct ice_vf *vf); + int (*add_vlan_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*remove_vlan_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*ena_vlan_stripping_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*dis_vlan_stripping_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*ena_vlan_insertion_v2_msg)(struct ice_vf *vf, u8 *msg); + int (*dis_vlan_insertion_v2_msg)(struct ice_vf *vf, u8 *msg); +}; + /* VF information structure */ struct ice_vf { struct ice_pf *pf; - s16 vf_id; /* VF ID in the PF space */ + + u16 vf_id; /* VF ID in the PF space */ u16 lan_vsi_idx; /* index into PF struct */ + u16 ctrl_vsi_idx; + struct ice_vf_fdir fdir; + struct ice_vf_hash_ctx hash_ctx; /* first vector index of this VF in the PF space */ int first_vector_idx; struct ice_sw *vf_sw_id; /* switch ID the VF VSIs connect to */ struct virtchnl_version_info vf_ver; u32 driver_caps; /* reported by VF driver */ - struct virtchnl_ether_addr dflt_lan_addr; - DECLARE_BITMAP(txq_ena, ICE_MAX_BASE_QS_PER_VF); - DECLARE_BITMAP(rxq_ena, ICE_MAX_BASE_QS_PER_VF); - u16 port_vlan_id; + u16 stag; /* VF Port Extender (PE) stag if used */ + struct virtchnl_ether_addr dev_lan_addr; + struct virtchnl_ether_addr hw_lan_addr; + struct ice_time_mac legacy_last_added_umac; + DECLARE_BITMAP(txq_ena, ICE_MAX_QS_PER_VF); + DECLARE_BITMAP(rxq_ena, ICE_MAX_QS_PER_VF); + struct ice_vlan port_vlan_info; /* Port VLAN ID, QoS, and TPID */ + struct virtchnl_vlan_caps vlan_v2_caps; + struct ice_dcf_vlan_info dcf_vlan_info; u8 pf_set_mac:1; /* VF MAC address set by VMM admin */ u8 trusted:1; u8 spoofchk:1; +#ifdef HAVE_NDO_SET_VF_LINK_STATE u8 link_forced:1; u8 link_up:1; /* only valid if VF link is forced */ +#endif /* VSI indices - actual VSI pointers are maintained in the PF structure * When assigned, these will be non-zero, because VSI 0 is always * the main LAN VSI for the PF. */ u16 lan_vsi_num; /* ID as used by firmware */ - unsigned int tx_rate; /* Tx bandwidth limit in Mbps */ + unsigned int min_tx_rate; /* Minimum Tx bandwidth limit in Mbps */ + unsigned int max_tx_rate; /* Maximum Tx bandwidth limit in Mbps */ DECLARE_BITMAP(vf_states, ICE_VF_STATES_NBITS); /* VF runtime states */ - u64 num_mdd_events; /* number of MDD events detected */ u64 num_inval_msgs; /* number of continuous invalid msgs */ u64 num_valid_msgs; /* number of valid msgs detected */ unsigned long vf_caps; /* VF's adv. 
capabilities */ - u8 num_req_qs; /* num of queue pairs requested by VF */ + u16 num_req_qs; /* num of queue pairs requested by VF */ u16 num_mac; - u16 num_vlan; u16 num_vf_qs; /* num of queue configured per VF */ - u16 num_qs_ena; /* total num of Tx/Rx queue enabled */ + u8 vlan_strip_ena; /* Outer and Inner VLAN strip enable */ +#define ICE_INNER_VLAN_STRIP_ENA BIT(0) +#define ICE_OUTER_VLAN_STRIP_ENA BIT(1) + /* ADQ related variables */ + u8 adq_enabled; /* flag to enable ADQ */ + u8 adq_fltr_ena; /* flag to denote that ADQ filters are applied */ + u8 num_tc; + u16 num_dmac_chnl_fltrs; + struct ice_channel_vf ch[VIRTCHNL_MAX_ADQ_V2_CHANNELS]; + struct hlist_head tc_flower_fltr_list; + struct ice_mdd_vf_events mdd_rx_events; + struct ice_mdd_vf_events mdd_tx_events; + struct ice_repr *repr; + DECLARE_BITMAP(opcodes_allowlist, VIRTCHNL_OP_MAX); + struct ice_vc_vf_ops vc_ops; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) + /* devlink port data */ + struct devlink_port devlink_port; +#endif /* CONFIG_NET_DEVLINK */ }; +/** + * ice_vc_get_max_chnl_tc_allowed + * @vf: pointer to the VF info + * + * This function returns max channel TC allowed depends upon "driver_caps" + */ +static inline u32 ice_vc_get_max_chnl_tc_allowed(struct ice_vf *vf) +{ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ_V2) + return VIRTCHNL_MAX_ADQ_V2_CHANNELS; + else + return VIRTCHNL_MAX_ADQ_CHANNELS; +} + +/** + * ice_vf_chnl_dmac_fltr_cnt - number of dmac based channel filters + * @vf: pointer to the VF info + */ +static inline u16 ice_vf_chnl_dmac_fltr_cnt(struct ice_vf *vf) +{ + return vf->num_dmac_chnl_fltrs; +} + + #ifdef CONFIG_PCI_IOV +void ice_dump_all_vfs(struct ice_pf *pf); +struct ice_vsi *ice_get_vf_vsi(struct ice_vf *vf); void ice_process_vflr_event(struct ice_pf *pf); int ice_sriov_configure(struct pci_dev *pdev, int num_vfs); int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac); @@ -104,30 +314,106 @@ ice_get_vf_cfg(struct net_device *netdev, int vf_id, struct ifla_vf_info *ivi); void ice_free_vfs(struct ice_pf *pf); void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event); + +/* VF configuration related iplink handlers */ void ice_vc_notify_link_state(struct ice_pf *pf); void ice_vc_notify_reset(struct ice_pf *pf); +void ice_vc_notify_vf_link_state(struct ice_vf *vf); +void ice_vc_change_ops_to_repr(struct ice_vc_vf_ops *ops); +void ice_vc_set_dflt_vf_ops(struct ice_vc_vf_ops *ops); bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr); +bool ice_reset_vf(struct ice_vf *vf, bool is_vflr); +void ice_restore_all_vfs_msi_state(struct pci_dev *pdev); +bool +ice_is_malicious_vf(struct ice_pf *pf, struct ice_rq_event_info *event, + u16 num_msg_proc, u16 num_msg_pending); + +#ifdef IFLA_VF_VLAN_INFO_MAX int ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, __be16 vlan_proto); +#else +int +ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos); +#endif + +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +int +ice_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, + int max_tx_rate); +#else +int ice_set_vf_bw(struct net_device *netdev, int vf_id, int tx_rate); +#endif int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted); +#ifdef HAVE_NDO_SET_VF_LINK_STATE int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state); +#endif + +int ice_check_vf_ready_for_cfg(struct ice_vf *vf); int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena); -int ice_calc_vf_reg_idx(struct 
ice_vf *vf, struct ice_q_vector *q_vector); +int ice_calc_vf_reg_idx(struct ice_vf *vf, struct ice_q_vector *q_vector, + u8 tc); void ice_set_vf_state_qs_dis(struct ice_vf *vf); +#ifdef HAVE_VF_STATS +int +ice_get_vf_stats(struct net_device *netdev, int vf_id, + struct ifla_vf_stats *vf_stats); +#endif /* HAVE_VF_STATS */ +bool ice_is_any_vf_in_promisc(struct ice_pf *pf); +void +ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event); +void ice_print_vfs_mdd_events(struct ice_pf *pf); +void ice_print_vf_rx_mdd_event(struct ice_vf *vf); +enum ice_pkg_type ice_pkg_name_to_type(struct ice_hw *hw); +bool ice_vc_validate_pattern(struct ice_vf *vf, + struct virtchnl_proto_hdrs *proto); +struct ice_vsi *ice_vf_ctrl_vsi_setup(struct ice_vf *vf); +int +ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, + enum virtchnl_status_code v_retval, u8 *msg, u16 msglen); +bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id); +bool ice_vf_is_port_vlan_ena(struct ice_vf *vf); #else /* CONFIG_PCI_IOV */ -#define ice_process_vflr_event(pf) do {} while (0) -#define ice_free_vfs(pf) do {} while (0) -#define ice_vc_process_vf_msg(pf, event) do {} while (0) -#define ice_vc_notify_link_state(pf) do {} while (0) -#define ice_vc_notify_reset(pf) do {} while (0) -#define ice_set_vf_state_qs_dis(vf) do {} while (0) +#if IS_ENABLED(CONFIG_NET_DEVLINK) +static inline struct ice_vsi *ice_get_vf_vsi(struct ice_vf *vf) +{ + return NULL; +} +#endif /* CONFIG_NET_DEVLINK */ +static inline void ice_dump_all_vfs(struct ice_pf *pf) { } +static inline void ice_process_vflr_event(struct ice_pf *pf) { } +static inline void ice_free_vfs(struct ice_pf *pf) { } +static inline +void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) { } +static inline void ice_vc_notify_link_state(struct ice_pf *pf) { } +static inline void ice_vc_notify_reset(struct ice_pf *pf) { } +static inline void ice_vc_notify_vf_link_state(struct ice_vf *vf) { } +static inline void ice_vc_change_ops_to_repr(struct ice_vc_vf_ops *ops) { } +static inline int ice_check_vf_ready_for_cfg(struct ice_vf *vf) +{ + return -EOPNOTSUPP; +} +static inline void ice_vc_set_dflt_vf_ops(struct ice_vc_vf_ops *ops) { } +static inline void ice_set_vf_state_qs_dis(struct ice_vf *vf) { } +static inline +void ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event) { } +static inline void ice_print_vfs_mdd_events(struct ice_pf *pf) { } +static inline void ice_print_vf_rx_mdd_event(struct ice_vf *vf) { } +static inline void ice_restore_all_vfs_msi_state(struct pci_dev *pdev) { } +static inline bool +ice_is_malicious_vf(struct ice_pf __always_unused *pf, + struct ice_rq_event_info __always_unused *event, + u16 __always_unused num_msg_proc, + u16 __always_unused num_msg_pending) +{ + return false; +} static inline bool ice_reset_all_vfs(struct ice_pf __always_unused *pf, @@ -136,6 +422,12 @@ ice_reset_all_vfs(struct ice_pf __always_unused *pf, return true; } +static inline bool +ice_reset_vf(struct ice_vf __always_unused *vf, bool __always_unused is_vflr) +{ + return true; +} + static inline int ice_sriov_configure(struct pci_dev __always_unused *pdev, int __always_unused num_vfs) @@ -158,13 +450,16 @@ ice_get_vf_cfg(struct net_device __always_unused *netdev, return -EOPNOTSUPP; } +#ifdef HAVE_NDO_SET_VF_TRUST static inline int ice_set_vf_trust(struct net_device __always_unused *netdev, int __always_unused vf_id, bool __always_unused trusted) { return -EOPNOTSUPP; } +#endif /* HAVE_NDO_SET_VF_TRUST */ +#ifdef 
IFLA_VF_VLAN_INFO_MAX static inline int ice_set_vf_port_vlan(struct net_device __always_unused *netdev, int __always_unused vf_id, u16 __always_unused vid, @@ -172,6 +467,15 @@ ice_set_vf_port_vlan(struct net_device __always_unused *netdev, { return -EOPNOTSUPP; } +#else +static inline int +ice_set_vf_port_vlan(struct net_device __always_unused *netdev, + int __always_unused vf_id, u16 __always_unused vid, + u8 __always_unused qos) +{ + return -EOPNOTSUPP; +} +#endif /* IFLA_VF_VLAN_INFO_MAX */ static inline int ice_set_vf_spoofchk(struct net_device __always_unused *netdev, @@ -180,18 +484,77 @@ ice_set_vf_spoofchk(struct net_device __always_unused *netdev, return -EOPNOTSUPP; } +#ifdef HAVE_NDO_SET_VF_LINK_STATE static inline int ice_set_vf_link_state(struct net_device __always_unused *netdev, int __always_unused vf_id, int __always_unused link_state) { return -EOPNOTSUPP; } +#endif /* HAVE_NDO_SET_VF_LINK_STATE */ + +#ifdef HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +static inline int +ice_set_vf_bw(struct net_device __always_unused *netdev, + int __always_unused vf_id, int __always_unused min_tx_rate, + int __always_unused max_tx_rate) +#else +static inline int +ice_set_vf_bw(struct net_device __always_unused *netdev, + int __always_unused vf_id, int __always_unused max_tx_rate) +#endif +{ + return -EOPNOTSUPP; +} static inline int ice_calc_vf_reg_idx(struct ice_vf __always_unused *vf, - struct ice_q_vector __always_unused *q_vector) + struct ice_q_vector __always_unused *q_vector, + u8 __always_unused tc) { return 0; } + +#ifdef HAVE_VF_STATS +static inline int +ice_get_vf_stats(struct net_device __always_unused *netdev, + int __always_unused vf_id, + struct ifla_vf_stats __always_unused *vf_stats) +{ + return -EOPNOTSUPP; +} +#endif /* HAVE_VF_STATS */ + +static inline bool ice_is_any_vf_in_promisc(struct ice_pf __always_unused *pf) +{ + return false; +} + +static inline enum ice_pkg_type ice_pkg_name_to_type(struct ice_hw *hw) +{ + return ICE_PKG_TYPE_UNKNOWN; +} + +static inline struct ice_vsi * +ice_vf_ctrl_vsi_setup(struct ice_vf __always_unused *vf) +{ + return NULL; +} + +static inline int +ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, + enum virtchnl_status_code v_retval, u8 *msg, u16 msglen) +{ + return 0; +} + +static inline bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) +{ + return 0; +} +static inline bool ice_vf_is_port_vlan_ena(struct ice_vf __always_unused *vf) +{ + return false; +} #endif /* CONFIG_PCI_IOV */ #endif /* _ICE_VIRTCHNL_PF_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_vlan.h b/drivers/net/ethernet/intel/ice/ice_vlan.h new file mode 100644 index 0000000000000000000000000000000000000000..69b78750c1f5b6c3db67746f95a8d9216a0e61db --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vlan.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VLAN_H_ +#define _ICE_VLAN_H_ + +#include +#include "ice_type.h" + +struct ice_vlan { + u16 tpid; + u16 vid; + u8 prio; + enum ice_sw_fwd_act_type fwd_act; +}; + +#endif /* _ICE_VLAN_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_vlan_mode.c b/drivers/net/ethernet/intel/ice/ice_vlan_mode.c new file mode 100644 index 0000000000000000000000000000000000000000..5cddf9ec042f693058b7e9e77246f03c116575a8 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vlan_mode.c @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include "ice_common.h" + +/** + * ice_pkg_get_supported_vlan_mode - chk if DDP supports Double VLAN mode (DVM) + * @hw: pointer to the HW struct + * @dvm: output variable to determine if DDP supports DVM(true) or SVM(false) + */ +static enum ice_status +ice_pkg_get_supported_vlan_mode(struct ice_hw *hw, bool *dvm) +{ + u16 meta_init_size = sizeof(struct ice_meta_init_section); + struct ice_meta_init_section *sect; + struct ice_buf_build *bld; + enum ice_status status; + + /* if anything fails, we assume there is no DVM support */ + *dvm = false; + + bld = ice_pkg_buf_alloc_single_section(hw, + ICE_SID_RXPARSER_METADATA_INIT, + meta_init_size, (void **)§); + if (!bld) + return ICE_ERR_NO_MEMORY; + + /* only need to read a single section */ + sect->count = cpu_to_le16(1); + sect->offset = cpu_to_le16(ICE_META_VLAN_MODE_ENTRY); + + status = ice_aq_upload_section(hw, + (struct ice_buf_hdr *)ice_pkg_buf(bld), + ICE_PKG_BUF_SIZE, NULL); + if (!status) { + DECLARE_BITMAP(entry, ICE_META_INIT_BITS); + u32 arr[ICE_META_INIT_DW_CNT]; + u16 i; + + /* convert to host bitmap format */ + for (i = 0; i < ICE_META_INIT_DW_CNT; i++) + arr[i] = le32_to_cpu(sect->entry[0].bm[i]); + + bitmap_from_arr32(entry, arr, (u16)ICE_META_INIT_BITS); + + /* check if DVM is supported */ + *dvm = test_bit(ICE_META_VLAN_MODE_BIT, entry); + } + + ice_pkg_buf_free(hw, bld); + + return status; +} + +/** + * ice_aq_get_vlan_mode - get the VLAN mode of the device + * @hw: pointer to the HW structure + * @get_params: structure FW fills in based on the current VLAN mode config + * + * Get VLAN Mode Parameters (0x020D) + */ +static enum ice_status +ice_aq_get_vlan_mode(struct ice_hw *hw, + struct ice_aqc_get_vlan_mode *get_params) +{ + struct ice_aq_desc desc; + + if (!get_params) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_get_vlan_mode_parameters); + + return ice_aq_send_cmd(hw, &desc, get_params, sizeof(*get_params), + NULL); +} + +/** + * ice_aq_is_dvm_ena - query FW to check if double VLAN mode is enabled + * @hw: pointer to the HW structure + * + * Returns true if the hardware/firmware is configured in double VLAN mode, + * else return false signaling that the hardware/firmware is configured in + * single VLAN mode. + * + * Also, return false if this call fails for any reason (i.e. firmware doesn't + * support this AQ call). + */ +static bool ice_aq_is_dvm_ena(struct ice_hw *hw) +{ + struct ice_aqc_get_vlan_mode get_params = { 0 }; + enum ice_status status; + + status = ice_aq_get_vlan_mode(hw, &get_params); + if (status) { + ice_debug(hw, ICE_DBG_AQ, "Failed to get VLAN mode, status %d\n", + status); + return false; + } + + return (get_params.vlan_mode & ICE_AQ_VLAN_MODE_DVM_ENA); +} + +/** + * ice_is_dvm_ena - check if double VLAN mode is enabled + * @hw: pointer to the HW structure + * + * The device is configured in single or double VLAN mode on initialization and + * this cannot be dynamically changed during runtime. Based on this there is no + * need to make an AQ call every time the driver needs to know the VLAN mode. + * Instead, use the cached VLAN mode. + */ +bool ice_is_dvm_ena(struct ice_hw *hw) +{ + return hw->dvm_ena; +} + +/** + * ice_cache_vlan_mode - cache VLAN mode after DDP is downloaded + * @hw: pointer to the HW structure + * + * This is only called after downloading the DDP and after the global + * configuration lock has been released because all ports on a device need to + * cache the VLAN mode. 
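Since the VLAN mode is fixed once the DDP is in place, the driver asks firmware once and every later check reads a cached flag instead of issuing another admin queue command. A trimmed-down sketch of that query-once/read-many pattern (query_fw_vlan_mode() is a hypothetical stand-in for the Get VLAN Mode Parameters query):

#include <stdbool.h>
#include <stdio.h>

struct hw_ctx {
        bool dvm_ena; /* cached: true = double VLAN mode, false = single */
};

/* Hypothetical stand-in for the firmware "get VLAN mode" query. */
static bool query_fw_vlan_mode(void)
{
        return true; /* pretend firmware reports DVM enabled */
}

/* Called once, after package download and global config lock release. */
static void cache_vlan_mode(struct hw_ctx *hw)
{
        hw->dvm_ena = query_fw_vlan_mode();
}

/* Later checks cost nothing: no firmware round trip, just the cache. */
static bool is_dvm_ena(const struct hw_ctx *hw)
{
        return hw->dvm_ena;
}

int main(void)
{
        struct hw_ctx hw = { 0 };

        cache_vlan_mode(&hw);
        printf("double VLAN mode: %s\n", is_dvm_ena(&hw) ? "yes" : "no");
        return 0;
}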
+ */ +static void ice_cache_vlan_mode(struct ice_hw *hw) +{ + hw->dvm_ena = ice_aq_is_dvm_ena(hw) ? true : false; +} + +/** + * ice_pkg_supports_dvm - find out if DDP supports DVM + * @hw: pointer to the HW structure + */ +static bool ice_pkg_supports_dvm(struct ice_hw *hw) +{ + enum ice_status status; + bool pkg_supports_dvm; + + status = ice_pkg_get_supported_vlan_mode(hw, &pkg_supports_dvm); + if (status) { + ice_debug(hw, ICE_DBG_PKG, "Failed to get supported VLAN mode, status %d\n", + status); + return false; + } + + return pkg_supports_dvm; +} + +/** + * ice_fw_supports_dvm - find out if FW supports DVM + * @hw: pointer to the HW structure + */ +static bool ice_fw_supports_dvm(struct ice_hw *hw) +{ + struct ice_aqc_get_vlan_mode get_vlan_mode = { 0 }; + enum ice_status status; + + /* If firmware returns success, then it supports DVM, else it only + * supports SVM + */ + status = ice_aq_get_vlan_mode(hw, &get_vlan_mode); + if (status) { + ice_debug(hw, ICE_DBG_NVM, "Failed to get VLAN mode, status %d\n", + status); + return false; + } + + return true; +} + +/** + * ice_is_dvm_supported - check if Double VLAN Mode is supported + * @hw: pointer to the hardware structure + * + * Returns true if Double VLAN Mode (DVM) is supported and false if only Single + * VLAN Mode (SVM) is supported. In order for DVM to be supported the DDP and + * firmware must support it, otherwise only SVM is supported. This function + * should only be called while the global config lock is held and after the + * package has been successfully downloaded. + */ +static bool ice_is_dvm_supported(struct ice_hw *hw) +{ + if (!ice_pkg_supports_dvm(hw)) { + ice_debug(hw, ICE_DBG_PKG, "DDP doesn't support DVM\n"); + return false; + } + + if (!ice_fw_supports_dvm(hw)) { + ice_debug(hw, ICE_DBG_PKG, "FW doesn't support DVM\n"); + return false; + } + + return true; +} + +#define ICE_EXTERNAL_VLAN_ID_FV_IDX 11 +#define ICE_SW_LKUP_VLAN_LOC_LKUP_IDX 1 +#define ICE_SW_LKUP_VLAN_PKT_FLAGS_LKUP_IDX 2 +#define ICE_SW_LKUP_PROMISC_VLAN_LOC_LKUP_IDX 2 +#define ICE_PKT_FLAGS_0_TO_15_FV_IDX 1 +#define ICE_PKT_FLAGS_0_TO_15_VLAN_FLAGS_MASK 0xD000 +static struct ice_update_recipe_lkup_idx_params ice_dvm_dflt_recipes[] = { + { + /* Update recipe ICE_SW_LKUP_VLAN to filter based on the + * outer/single VLAN in DVM + */ + .rid = ICE_SW_LKUP_VLAN, + .fv_idx = ICE_EXTERNAL_VLAN_ID_FV_IDX, + .ignore_valid = true, + .mask = 0, + .mask_valid = false, /* use pre-existing mask */ + .lkup_idx = ICE_SW_LKUP_VLAN_LOC_LKUP_IDX, + }, + { + /* Update recipe ICE_SW_LKUP_VLAN to filter based on the VLAN + * packet flags to support VLAN filtering on multiple VLAN + * ethertypes (i.e. 
0x8100 and 0x88a8) in DVM + */ + .rid = ICE_SW_LKUP_VLAN, + .fv_idx = ICE_PKT_FLAGS_0_TO_15_FV_IDX, + .ignore_valid = false, + .mask = ICE_PKT_FLAGS_0_TO_15_VLAN_FLAGS_MASK, + .mask_valid = true, + .lkup_idx = ICE_SW_LKUP_VLAN_PKT_FLAGS_LKUP_IDX, + }, + { + /* Update recipe ICE_SW_LKUP_PROMISC_VLAN to filter based on the + * outer/single VLAN in DVM + */ + .rid = ICE_SW_LKUP_PROMISC_VLAN, + .fv_idx = ICE_EXTERNAL_VLAN_ID_FV_IDX, + .ignore_valid = true, + .mask = 0, + .mask_valid = false, /* use pre-existing mask */ + .lkup_idx = ICE_SW_LKUP_PROMISC_VLAN_LOC_LKUP_IDX, + }, +}; + +/** + * ice_dvm_update_dflt_recipes - update default switch recipes in DVM + * @hw: hardware structure used to update the recipes + */ +static enum ice_status ice_dvm_update_dflt_recipes(struct ice_hw *hw) +{ + unsigned long i; + + for (i = 0; i < ARRAY_SIZE(ice_dvm_dflt_recipes); i++) { + struct ice_update_recipe_lkup_idx_params *params; + enum ice_status status; + + params = &ice_dvm_dflt_recipes[i]; + + status = ice_update_recipe_lkup_idx(hw, params); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to update RID %d lkup_idx %d fv_idx %d mask_valid %s mask 0x%04x\n", + params->rid, params->lkup_idx, params->fv_idx, + params->mask_valid ? "true" : "false", + params->mask); + return status; + } + } + + return 0; +} + +/** + * ice_aq_set_vlan_mode - set the VLAN mode of the device + * @hw: pointer to the HW structure + * @set_params: requested VLAN mode configuration + * + * Set VLAN Mode Parameters (0x020C) + */ +static enum ice_status +ice_aq_set_vlan_mode(struct ice_hw *hw, + struct ice_aqc_set_vlan_mode *set_params) +{ + u8 rdma_packet, mng_vlan_prot_id; + struct ice_aq_desc desc; + + if (!set_params) + return ICE_ERR_PARAM; + + if (set_params->l2tag_prio_tagging > ICE_AQ_VLAN_PRIO_TAG_MAX) + return ICE_ERR_PARAM; + + rdma_packet = set_params->rdma_packet; + if (rdma_packet != ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING && + rdma_packet != ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING) + return ICE_ERR_PARAM; + + mng_vlan_prot_id = set_params->mng_vlan_prot_id; + if (mng_vlan_prot_id != ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER && + mng_vlan_prot_id != ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, + ice_aqc_opc_set_vlan_mode_parameters); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + return ice_aq_send_cmd(hw, &desc, set_params, sizeof(*set_params), + NULL); +} + +/** + * ice_set_dvm - sets up software and hardware for double VLAN mode + * @hw: pointer to the hardware structure + */ +static enum ice_status ice_set_dvm(struct ice_hw *hw) +{ + struct ice_aqc_set_vlan_mode params = { 0 }; + enum ice_status status; + + params.l2tag_prio_tagging = ICE_AQ_VLAN_PRIO_TAG_OUTER_CTAG; + params.rdma_packet = ICE_AQ_DVM_VLAN_RDMA_PKT_FLAG_SETTING; + params.mng_vlan_prot_id = ICE_AQ_VLAN_MNG_PROTOCOL_ID_OUTER; + + status = ice_aq_set_vlan_mode(hw, ¶ms); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to set double VLAN mode parameters, status %d\n", + status); + return status; + } + + status = ice_dvm_update_dflt_recipes(hw); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to update default recipes for double VLAN mode, status %d\n", + status); + return status; + } + + status = ice_aq_set_port_params(hw->port_info, 0, false, false, true, + NULL); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to set port in double VLAN mode, status %d\n", + status); + return status; + } + + status = ice_set_dvm_boost_entries(hw); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed 
to set boost TCAM entries for double VLAN mode, status %d\n", + status); + return status; + } + + return 0; +} + +/** + * ice_set_svm - set single VLAN mode + * @hw: pointer to the HW structure + */ +static enum ice_status ice_set_svm(struct ice_hw *hw) +{ + struct ice_aqc_set_vlan_mode *set_params; + enum ice_status status; + + status = ice_aq_set_port_params(hw->port_info, 0, false, false, false, NULL); + if (status) { + ice_debug(hw, ICE_DBG_INIT, "Failed to set port parameters for single VLAN mode\n"); + return status; + } + + set_params = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*set_params), + GFP_KERNEL); + if (!set_params) + return ICE_ERR_NO_MEMORY; + + /* default configuration for SVM configurations */ + set_params->l2tag_prio_tagging = ICE_AQ_VLAN_PRIO_TAG_INNER_CTAG; + set_params->rdma_packet = ICE_AQ_SVM_VLAN_RDMA_PKT_FLAG_SETTING; + set_params->mng_vlan_prot_id = ICE_AQ_VLAN_MNG_PROTOCOL_ID_INNER; + + status = ice_aq_set_vlan_mode(hw, set_params); + if (status) + ice_debug(hw, ICE_DBG_INIT, "Failed to configure port in single VLAN mode\n"); + + devm_kfree(ice_hw_to_dev(hw), set_params); + return status; +} + +/** + * ice_set_vlan_mode + * @hw: pointer to the HW structure + */ +enum ice_status ice_set_vlan_mode(struct ice_hw *hw) +{ + if (!ice_is_dvm_supported(hw)) + return 0; + + if (!ice_set_dvm(hw)) + return 0; + + return ice_set_svm(hw); +} + +/** + * ice_print_dvm_not_supported - print if DDP and/or FW doesn't support DVM + * @hw: pointer to the HW structure + * + * The purpose of this function is to print that QinQ is not supported due to + * incompatibilty from the DDP and/or FW. This will give a hint to the user to + * update one and/or both components if they expect QinQ functionality. + */ +static void ice_print_dvm_not_supported(struct ice_hw *hw) +{ + bool pkg_supports_dvm = ice_pkg_supports_dvm(hw); + bool fw_supports_dvm = ice_fw_supports_dvm(hw); + + if (!fw_supports_dvm && !pkg_supports_dvm) + dev_info(ice_hw_to_dev(hw), + "QinQ functionality cannot be enabled on this device. Update your DDP package and NVM to versions that support QinQ.\n"); + else if (!pkg_supports_dvm) + dev_info(ice_hw_to_dev(hw), + "QinQ functionality cannot be enabled on this device. Update your DDP package to a version that supports QinQ.\n"); + else if (!fw_supports_dvm) + dev_info(ice_hw_to_dev(hw), + "QinQ functionality cannot be enabled on this device. Update your NVM to a version that supports QinQ.\n"); +} + +/** + * ice_post_pkg_dwnld_vlan_mode_cfg - configure VLAN mode after DDP download + * @hw: pointer to the HW structure + * + * This function is meant to configure any VLAN mode specific functionality + * after the global configuration lock has been released and the DDP has been + * downloaded. + * + * Since only one PF downloads the DDP and configures the VLAN mode there needs + * to be a way to configure the other PFs after the DDP has been downloaded and + * the global configuration lock has been released. All such code should go in + * this function. 
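The mode selection in ice_set_vlan_mode() above is a short fallback chain: do nothing when DVM is unsupported, otherwise try to program double VLAN mode and drop back to single VLAN mode only if that programming fails. A compact sketch of the same control flow with stub configuration steps (the stub names are placeholders, not the driver's API):

#include <stdbool.h>
#include <stdio.h>

static bool dvm_supported(void) { return true; }
static int  configure_dvm(void) { return 0; }  /* 0 on success */
static int  configure_svm(void) { return 0; }

/* Pick the VLAN mode: prefer DVM, fall back to SVM only on failure. */
static int set_vlan_mode(void)
{
        if (!dvm_supported())
                return 0;       /* nothing extra to program for SVM-only */

        if (!configure_dvm())
                return 0;       /* double VLAN mode programmed successfully */

        /* DVM setup failed part-way; put the port back into single VLAN mode */
        return configure_svm();
}

int main(void)
{
        printf("set_vlan_mode() -> %d\n", set_vlan_mode());
        return 0;
}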
+ */ +void ice_post_pkg_dwnld_vlan_mode_cfg(struct ice_hw *hw) +{ + ice_cache_vlan_mode(hw); + + if (ice_is_dvm_ena(hw)) + ice_change_proto_id_to_dvm(); + else + ice_print_dvm_not_supported(hw); +} diff --git a/drivers/net/ethernet/intel/ice/ice_vlan_mode.h b/drivers/net/ethernet/intel/ice/ice_vlan_mode.h new file mode 100644 index 0000000000000000000000000000000000000000..5072529c1e038e0c02480b40c080da18baa29f91 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vlan_mode.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VLAN_MODE_H_ +#define _ICE_VLAN_MODE_H_ + +#include "ice_osdep.h" + +struct ice_hw; + +bool ice_is_dvm_ena(struct ice_hw *hw); +enum ice_status ice_set_vlan_mode(struct ice_hw *hw); +void ice_post_pkg_dwnld_vlan_mode_cfg(struct ice_hw *hw); + +#endif /* _ICE_VLAN_MODE_H */ diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c new file mode 100644 index 0000000000000000000000000000000000000000..705e496b7f747989eba7992f9689bdd8d872e350 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.c @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_vsi_vlan_lib.h" +#include "ice_lib.h" +#include "ice_fltr.h" +#include "ice.h" + +static void print_invalid_tpid(struct ice_vsi *vsi, u16 tpid) +{ + dev_err(ice_pf_to_dev(vsi->back), "%s %d specified invalid VLAN tpid 0x%04x\n", + ice_vsi_type_str(vsi->type), vsi->idx, tpid); +} + +/** + * validate_vlan - check if the ice_vlan passed in is valid + * @vsi: VSI used for printing error message + * @vlan: ice_vlan structure to validate + * + * Return true if the VLAN TPID is valid or if the VLAN TPID is 0 and the VLAN + * VID is 0, which allows for non-zero VLAN filters with the specified VLAN TPID + * and untagged VLAN 0 filtersto be added to the prune list respectively. 
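The rule just described boils down to a small predicate: the TPID must be one of the three recognised ethertypes, or both TPID and VID must be zero for the untagged/VLAN 0 case. A standalone restatement of that check (ethertype values written out numerically so the sketch builds outside the kernel):

#include <stdbool.h>
#include <stdio.h>

#define TPID_8021Q  0x8100
#define TPID_8021AD 0x88A8
#define TPID_9100   0x9100

/* Valid: a known TPID, or the all-zero "untagged / VLAN 0" filter. */
static bool vlan_tpid_valid(unsigned short tpid, unsigned short vid)
{
        if (tpid == TPID_8021Q || tpid == TPID_8021AD || tpid == TPID_9100)
                return true;

        return tpid == 0 && vid == 0;
}

int main(void)
{
        printf("%d %d %d\n",
               vlan_tpid_valid(TPID_8021Q, 100), /* 1: tagged, known TPID */
               vlan_tpid_valid(0, 0),            /* 1: untagged / VLAN 0 */
               vlan_tpid_valid(0x1234, 5));      /* 0: unknown TPID */
        return 0;
}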
+ */ +static bool validate_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + if (vlan->tpid != ETH_P_8021Q && vlan->tpid != ETH_P_8021AD && + vlan->tpid != ETH_P_QINQ1 && (vlan->tpid || vlan->vid)) { + print_invalid_tpid(vsi, vlan->tpid); + return false; + } + + return true; +} + +/** + * ice_vsi_add_vlan - default add VLAN implementation for all VSI types + * @vsi: VSI being configured + * @vlan: VLAN filter to add + */ +int ice_vsi_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + enum ice_status status; + int err = 0; + + if (!validate_vlan(vsi, vlan)) + return -EINVAL; + + status = ice_fltr_add_vlan(vsi, vlan); + if (status && status != ICE_ERR_ALREADY_EXISTS) { + err = -ENODEV; + dev_err(ice_pf_to_dev(vsi->back), "Failure Adding VLAN %d on VSI %i, status %s\n", + vlan->vid, vsi->vsi_num, ice_stat_str(status)); + } else { + vsi->num_vlan++; + } + + return err; +} + +/** + * ice_vsi_del_vlan - default del VLAN implementation for all VSI types + * @vsi: VSI being configured + * @vlan: VLAN filter to delete + */ +int ice_vsi_del_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + int err = 0; + + dev = ice_pf_to_dev(pf); + + if (!validate_vlan(vsi, vlan)) + return -EINVAL; + + status = ice_fltr_remove_vlan(vsi, vlan); + if (!status) { + vsi->num_vlan--; + } else if (status != ICE_ERR_DOES_NOT_EXIST && + status != ICE_ERR_RESET_ONGOING) { + dev_err(dev, "Error removing VLAN %d on VSI %i error: %s\n", + vlan->vid, vsi->vsi_num, ice_stat_str(status)); + err = ice_status_to_errno(status); + } + + return err; +} + +/** + * ice_vsi_manage_vlan_insertion - Manage VLAN insertion for the VSI for Tx + * @vsi: the VSI being changed + */ +static int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int err = 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + /* Here we are configuring the VSI to let the driver add VLAN tags by + * setting inner_vlan_flags to ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL. The actual VLAN tag + * insertion happens in the Tx hot path, in ice_tx_map. 
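Every VLAN change in this file follows the same read-modify-write shape: allocate a scratch context, derive the new flags from the cached copy while touching only the bits being changed, mark which section is valid, send the update, and only on success write the new flags back into the cached VSI info. A distilled, generic sketch of that pattern (simplified types and a stub update call; the driver uses struct ice_vsi_ctx and ice_update_vsi()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct vsi_ctx { uint8_t vlan_flags; uint16_t valid_sections; };
struct vsi     { struct vsi_ctx info; };

#define VLAN_SECTION_VALID 0x0001
#define STRIP_MODE_MASK    0x03
#define STRIP_BOTH         0x01

/* Stub firmware update that always succeeds. */
static int send_update(const struct vsi_ctx *ctx) { (void)ctx; return 0; }

/* Change one field, validate one section, commit the cache only on success. */
static int set_strip_mode(struct vsi *vsi, uint8_t mode)
{
        struct vsi_ctx *ctx = calloc(1, sizeof(*ctx));
        int err;

        if (!ctx)
                return -1;

        /* start from the cached flags, replace only the strip-mode bits */
        ctx->vlan_flags = (vsi->info.vlan_flags & ~STRIP_MODE_MASK) | mode;
        ctx->valid_sections = VLAN_SECTION_VALID;

        err = send_update(ctx);
        if (!err)
                vsi->info.vlan_flags = ctx->vlan_flags;

        free(ctx);
        return err;
}

int main(void)
{
        struct vsi vsi = { .info = { .vlan_flags = 0x10 } };

        set_strip_mode(&vsi, STRIP_BOTH);
        printf("cached flags now 0x%02x\n", vsi.info.vlan_flags);
        return 0;
}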
+ */ + ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL; + + /* Preserve existing VLAN strip setting */ + ctxt->info.inner_vlan_flags |= (vsi->info.inner_vlan_flags & + ICE_AQ_VSI_INNER_VLAN_EMODE_M); + + ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN insert failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + goto out; + } + + vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags; +out: + kfree(ctxt); + return err; +} + +/** + * ice_vsi_manage_vlan_stripping - Manage VLAN stripping for the VSI for Rx + * @vsi: the VSI being changed + * @ena: boolean value indicating if this is a enable or disable request + */ +static int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int err = 0; + + /* do not allow modifying VLAN stripping when a port VLAN is configured + * on this VSI + */ + if (vsi->info.port_based_inner_vlan) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + /* Here we are configuring what the VSI should do with the VLAN tag in + * the Rx packet. We can either leave the tag in the packet or put it in + * the Rx descriptor. + */ + if (ena) + /* Strip VLAN tag from Rx packet and put it in the desc */ + ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_EMODE_STR_BOTH; + else + /* Disable stripping. Leave tag in packet */ + ctxt->info.inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_EMODE_NOTHING; + + /* Allow all packets untagged/tagged */ + ctxt->info.inner_vlan_flags |= ICE_AQ_VSI_INNER_VLAN_TX_MODE_ALL; + + ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN strip failed, ena = %d err %s aq_err %s\n", + ena, ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + goto out; + } + + vsi->info.inner_vlan_flags = ctxt->info.inner_vlan_flags; +out: + kfree(ctxt); + return err; +} + +int ice_vsi_ena_inner_stripping(struct ice_vsi *vsi, const u16 tpid) +{ + if (tpid != ETH_P_8021Q) { + print_invalid_tpid(vsi, tpid); + return -EINVAL; + } + + return ice_vsi_manage_vlan_stripping(vsi, true); +} + +int ice_vsi_dis_inner_stripping(struct ice_vsi *vsi) +{ + return ice_vsi_manage_vlan_stripping(vsi, false); +} + +int ice_vsi_ena_inner_insertion(struct ice_vsi *vsi, const u16 tpid) +{ + if (tpid != ETH_P_8021Q) { + print_invalid_tpid(vsi, tpid); + return -EINVAL; + } + + return ice_vsi_manage_vlan_insertion(vsi); +} + +int ice_vsi_dis_inner_insertion(struct ice_vsi *vsi) +{ + return ice_vsi_manage_vlan_insertion(vsi); +} + +/** + * __ice_vsi_set_inner_port_vlan - set port VLAN VSI context settings to enable a port VLAN + * @vsi: the VSI to update + * @pvid_info: VLAN ID and QoS used to set the PVID VSI context field + */ +static int __ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, u16 pvid_info) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_aqc_vsi_props *info; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int ret = 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info = vsi->info; + info = &ctxt->info; + info->inner_vlan_flags = ICE_AQ_VSI_INNER_VLAN_TX_MODE_ACCEPTUNTAGGED | + 
ICE_AQ_VSI_INNER_VLAN_INSERT_PVID | + ICE_AQ_VSI_INNER_VLAN_EMODE_STR; + info->sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + + info->port_based_inner_vlan = cpu_to_le16(pvid_info); + info->valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_VLAN_VALID | + ICE_AQ_VSI_PROP_SW_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_info(ice_hw_to_dev(hw), "update VSI for port VLAN failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + ret = -EIO; + goto out; + } + + vsi->info.inner_vlan_flags = info->inner_vlan_flags; + vsi->info.sw_flags2 = info->sw_flags2; + vsi->info.port_based_inner_vlan = info->port_based_inner_vlan; +out: + kfree(ctxt); + return ret; +} + +int ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + u16 port_vlan_info; + + if (vlan->tpid != ETH_P_8021Q) + return -EINVAL; + + if (vlan->prio > 7) + return -EINVAL; + + port_vlan_info = vlan->vid | (vlan->prio << VLAN_PRIO_SHIFT); + + return __ice_vsi_set_inner_port_vlan(vsi, port_vlan_info); +} + +/** + * ice_cfg_vlan_pruning - enable or disable VLAN pruning on the VSI + * @vsi: VSI to enable or disable VLAN pruning on + * @ena: set to true to enable VLAN pruning and false to disable it + * + * returns 0 if VSI is updated, negative otherwise + */ +static int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena) +{ + struct ice_vsi_ctx *ctxt; + enum ice_status status; + struct ice_pf *pf; + + if (!vsi) + return -EINVAL; + + /* Don't enable VLAN pruning if the netdev is currently in promiscuous + * mode. VLAN pruning will be enabled when the interface exits + * promiscuous mode if any VLAN filters are active. + */ + if (vsi->netdev && vsi->netdev->flags & IFF_PROMISC && ena) + return 0; + + pf = vsi->back; + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info = vsi->info; + + if (ena) + ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + else + ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + + ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID); + + status = ice_update_vsi(&pf->hw, vsi->idx, ctxt, NULL); + if (status) { + netdev_err(vsi->netdev, "%sabling VLAN pruning on VSI handle: %d, VSI HW ID: %d failed, err = %s, aq_err = %s\n", + ena ? "En" : "Dis", vsi->idx, vsi->vsi_num, + ice_stat_str(status), + ice_aq_str(pf->hw.adminq.sq_last_status)); + goto err_out; + } + + vsi->info.sw_flags2 = ctxt->info.sw_flags2; + + kfree(ctxt); + return 0; + +err_out: + kfree(ctxt); + return -EIO; +} + +int ice_vsi_ena_rx_vlan_filtering(struct ice_vsi *vsi) +{ + return ice_cfg_vlan_pruning(vsi, true); +} + +int ice_vsi_dis_rx_vlan_filtering(struct ice_vsi *vsi) +{ + return ice_cfg_vlan_pruning(vsi, false); +} + +static int ice_cfg_vlan_antispoof(struct ice_vsi *vsi, bool enable) +{ + struct ice_vsi_ctx *ctx; + enum ice_status status; + int err = 0; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->info.sec_flags = vsi->info.sec_flags; + ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + + if (enable) + ctx->info.sec_flags |= ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S; + else + ctx->info.sec_flags &= ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); + + status = ice_update_vsi(&vsi->back->hw, vsi->idx, ctx, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to configure Tx VLAN anti-spoof %s for VSI %d, error %s\n", + enable ? 
"ON" : "OFF", vsi->vsi_num, + ice_stat_str(status)); + err = ice_status_to_errno(status); + } else { + vsi->info.sec_flags = ctx->info.sec_flags; + } + + kfree(ctx); + + return err; +} + +int ice_vsi_ena_tx_vlan_filtering(struct ice_vsi *vsi) +{ + return ice_cfg_vlan_antispoof(vsi, true); +} + +int ice_vsi_dis_tx_vlan_filtering(struct ice_vsi *vsi) +{ + return ice_cfg_vlan_antispoof(vsi, false); +} + +/** + * tpid_to_vsi_outer_vlan_type - convert from TPID to VSI context based tag_type + * @tpid: tpid used to translate into VSI context based tag_type + * @tag_type: output variable to hold the VSI context based tag type + */ +static int tpid_to_vsi_outer_vlan_type(u16 tpid, u8 *tag_type) +{ + switch (tpid) { + case ETH_P_8021Q: + *tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_8100; + break; + case ETH_P_8021AD: + *tag_type = ICE_AQ_VSI_OUTER_TAG_STAG; + break; + case ETH_P_QINQ1: + *tag_type = ICE_AQ_VSI_OUTER_TAG_VLAN_9100; + break; + default: + *tag_type = 0; + return -EINVAL; + } + + return 0; +} + +/** + * ice_vsi_ena_outer_stripping - enable outer VLAN stripping + * @vsi: VSI to configure + * @tpid: TPID to enable outer VLAN stripping for + * + * Enable outer VLAN stripping via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * Since the VSI context only supports a single TPID for insertion and + * stripping, setting the TPID for stripping will affect the TPID for insertion. + * Callers need to be aware of this limitation. + * + * Only modify outer VLAN stripping settings and the VLAN TPID. Outer VLAN + * insertion settings are unmodified. + * + * This enables hardware to strip a VLAN tag with the specified TPID to be + * stripped from the packet and placed in the receive descriptor. + */ +int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + u8 tag_type; + int err = 0; + + /* do not allow modifying VLAN stripping when a port VLAN is configured + * on this VSI + */ + if (vsi->info.port_based_outer_vlan) + return 0; + + if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type)) + return -EINVAL; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID); + /* clear current outer VLAN strip settings */ + ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags & + ~(ICE_AQ_VSI_OUTER_VLAN_EMODE_M | ICE_AQ_VSI_OUTER_TAG_TYPE_M); + ctxt->info.outer_vlan_flags |= + ((ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW_BOTH << + ICE_AQ_VSI_OUTER_VLAN_EMODE_S) | + ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) & + ICE_AQ_VSI_OUTER_TAG_TYPE_M)); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for enabling outer VLAN stripping failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + } + + kfree(ctxt); + return err; +} + +/** + * ice_vsi_dis_outer_stripping - disable outer VLAN stripping + * @vsi: VSI to configure + * + * Disable outer VLAN stripping via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * Only modify the outer VLAN stripping settings. 
The VLAN TPID and outer VLAN + * insertion settings are unmodified. + * + * This tells the hardware to not strip any VLAN tagged packets, thus leaving + * them in the packet. This enables software offloaded VLAN stripping and + * disables hardware offloaded VLAN stripping. + */ +int ice_vsi_dis_outer_stripping(struct ice_vsi *vsi) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int err = 0; + + if (vsi->info.port_based_outer_vlan) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID); + /* clear current outer VLAN strip settings */ + ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags & + ~ICE_AQ_VSI_OUTER_VLAN_EMODE_M; + ctxt->info.outer_vlan_flags |= ICE_AQ_VSI_OUTER_VLAN_EMODE_NOTHING << + ICE_AQ_VSI_OUTER_VLAN_EMODE_S; + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for disabling outer VLAN stripping failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + } + + kfree(ctxt); + return err; +} + +/** + * ice_vsi_ena_outer_insertion - enable outer VLAN insertion + * @vsi: VSI to configure + * @tpid: TPID to enable outer VLAN insertion for + * + * Enable outer VLAN insertion via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * Since the VSI context only supports a single TPID for insertion and + * stripping, setting the TPID for insertion will affect the TPID for stripping. + * Callers need to be aware of this limitation. + * + * Only modify outer VLAN insertion settings and the VLAN TPID. Outer VLAN + * stripping settings are unmodified. + * + * This allows a VLAN tag with the specified TPID to be inserted in the transmit + * descriptor. 
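As an illustrative aside (an editor's sketch, not code from this patch): because insertion and stripping share a single VSI-context TPID, a hypothetical caller that wants both offloads for 802.1ad outer tags would simply pass the same TPID to both helpers declared in ice_vsi_vlan_lib.h:

	/* Hypothetical helper; only ice_vsi_ena_outer_stripping() and
	 * ice_vsi_ena_outer_insertion() from this series are assumed.
	 */
	static int example_ena_outer_8021ad(struct ice_vsi *vsi)
	{
		int err;

		/* programs Rx stripping and sets the VSI-context tag type */
		err = ice_vsi_ena_outer_stripping(vsi, ETH_P_8021AD);
		if (err)
			return err;

		/* reuses the same VSI-context tag type for Tx insertion */
		return ice_vsi_ena_outer_insertion(vsi, ETH_P_8021AD);
	}
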
+ */ +int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, u16 tpid) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + u8 tag_type; + int err = 0; + + if (vsi->info.port_based_outer_vlan) + return 0; + + if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type)) + return -EINVAL; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID); + /* clear current outer VLAN insertion settings */ + ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags & + ~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT | + ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC | + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M | + ICE_AQ_VSI_OUTER_TAG_TYPE_M); + ctxt->info.outer_vlan_flags |= + ((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL << + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) & + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M) | + ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) & + ICE_AQ_VSI_OUTER_TAG_TYPE_M); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for enabling outer VLAN insertion failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + } + + kfree(ctxt); + return err; +} + +/** + * ice_vsi_dis_outer_insertion - disable outer VLAN insertion + * @vsi: VSI to configure + * + * Disable outer VLAN insertion via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * Only modify the outer VLAN insertion settings. The VLAN TPID and outer VLAN + * settings are unmodified. + * + * This tells the hardware to not allow any VLAN tagged packets in the transmit + * descriptor. This enables software offloaded VLAN insertion and disables + * hardware offloaded VLAN insertion. + */ +int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + int err = 0; + + if (vsi->info.port_based_outer_vlan) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID); + /* clear current outer VLAN insertion settings */ + ctxt->info.outer_vlan_flags = vsi->info.outer_vlan_flags & + ~(ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT | + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M); + ctxt->info.outer_vlan_flags |= + ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC | + ((ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ALL << + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) & + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_M); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for disabling outer VLAN insertion failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + } + + kfree(ctxt); + return err; +} + +/** + * __ice_vsi_set_outer_port_vlan - set the outer port VLAN and related settings + * @vsi: VSI to configure + * @vlan_info: packed u16 that contains the VLAN prio and ID + * @tpid: TPID of the port VLAN + * + * Set the port VLAN prio, ID, and TPID. + * + * Enable VLAN pruning so the VSI doesn't receive any traffic that doesn't match + * a VLAN prune rule. 
The caller should take care to add a VLAN prune rule that + * matches the port VLAN ID and TPID. + * + * Tell hardware to strip outer VLAN tagged packets on receive and don't put + * them in the receive descriptor. VSI(s) in port VLANs should not be aware of + * the port VLAN ID or TPID they are assigned to. + * + * Tell hardware to prevent outer VLAN tag insertion on transmit and only allow + * untagged outer packets from the transmit descriptor. + * + * Also, tell the hardware to insert the port VLAN on transmit. + */ +static int +__ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, u16 vlan_info, u16 tpid) +{ + struct ice_hw *hw = &vsi->back->hw; + struct ice_vsi_ctx *ctxt; + enum ice_status status; + u8 tag_type; + int err = 0; + + if (tpid_to_vsi_outer_vlan_type(tpid, &tag_type)) + return -EINVAL; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return -ENOMEM; + + ctxt->info = vsi->info; + + ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; + + ctxt->info.port_based_outer_vlan = cpu_to_le16(vlan_info); + ctxt->info.outer_vlan_flags = + (ICE_AQ_VSI_OUTER_VLAN_EMODE_SHOW << + ICE_AQ_VSI_OUTER_VLAN_EMODE_S) | + ((tag_type << ICE_AQ_VSI_OUTER_TAG_TYPE_S) & + ICE_AQ_VSI_OUTER_TAG_TYPE_M) | + ICE_AQ_VSI_OUTER_VLAN_BLOCK_TX_DESC | + (ICE_AQ_VSI_OUTER_VLAN_TX_MODE_ACCEPTUNTAGGED << + ICE_AQ_VSI_OUTER_VLAN_TX_MODE_S) | + ICE_AQ_VSI_OUTER_VLAN_PORT_BASED_INSERT; + + ctxt->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_OUTER_TAG_VALID | + ICE_AQ_VSI_PROP_SW_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); + if (status) { + dev_err(ice_pf_to_dev(vsi->back), "update VSI for setting outer port based VLAN failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + err = -EIO; + } else { + vsi->info.port_based_outer_vlan = ctxt->info.port_based_outer_vlan; + vsi->info.outer_vlan_flags = ctxt->info.outer_vlan_flags; + vsi->info.sw_flags2 = ctxt->info.sw_flags2; + } + + kfree(ctxt); + return err; +} + +/** + * ice_vsi_set_outer_port_vlan - public version of __ice_vsi_set_outer_port_vlan + * @vsi: VSI to configure + * @vlan: ice_vlan structure used to set the port VLAN + * + * Set the outer port VLAN via VSI context. This function should only be + * used if DVM is supported. Also, this function should never be called directly + * as it should be part of ice_vsi_vlan_ops if it's needed. + * + * This function does not support clearing the port VLAN as there is currently + * no use case for this. + * + * Use the ice_vlan structure passed in to set this VSI in a port VLAN. + */ +int ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan) +{ + u16 port_vlan_info; + + if (vlan->prio > (VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT)) + return -EINVAL; + + port_vlan_info = vlan->vid | (vlan->prio << VLAN_PRIO_SHIFT); + + return __ice_vsi_set_outer_port_vlan(vsi, port_vlan_info, vlan->tpid); +} diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h new file mode 100644 index 0000000000000000000000000000000000000000..60925e920facc8a3e9d78bfef1aaf3ea63c4b307 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_lib.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_VSI_VLAN_LIB_H_ +#define _ICE_VSI_VLAN_LIB_H_ + +#include +#include "ice_vlan.h" + +struct ice_vsi; + +#define ICE_VLAN(tpid, vid, prio, fwd_action) \ + ((struct ice_vlan){ tpid, vid, prio, fwd_action }) + +int ice_vsi_add_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +int ice_vsi_del_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); + +int ice_vsi_ena_inner_stripping(struct ice_vsi *vsi, u16 tpid); +int ice_vsi_dis_inner_stripping(struct ice_vsi *vsi); +int ice_vsi_ena_inner_insertion(struct ice_vsi *vsi, u16 tpid); +int ice_vsi_dis_inner_insertion(struct ice_vsi *vsi); +int ice_vsi_set_inner_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); + +int ice_vsi_ena_rx_vlan_filtering(struct ice_vsi *vsi); +int ice_vsi_dis_rx_vlan_filtering(struct ice_vsi *vsi); +int ice_vsi_ena_tx_vlan_filtering(struct ice_vsi *vsi); +int ice_vsi_dis_tx_vlan_filtering(struct ice_vsi *vsi); + +int ice_vsi_ena_outer_stripping(struct ice_vsi *vsi, u16 tpid); +int ice_vsi_dis_outer_stripping(struct ice_vsi *vsi); +int ice_vsi_ena_outer_insertion(struct ice_vsi *vsi, u16 tpid); +int ice_vsi_dis_outer_insertion(struct ice_vsi *vsi); +int ice_vsi_set_outer_port_vlan(struct ice_vsi *vsi, struct ice_vlan *vlan); +#endif /* _ICE_VSI_VLAN_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c new file mode 100644 index 0000000000000000000000000000000000000000..466c691d6848ffe9bba131e1fe7946ee430b6419 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#include "ice_pf_vsi_vlan_ops.h" +#include "ice_vf_vsi_vlan_ops.h" +#include "ice_lib.h" +#include "ice.h" + +static int +op_unsupported_vlan_arg(struct ice_vsi * __always_unused vsi, + struct ice_vlan * __always_unused vlan) +{ + return -EOPNOTSUPP; +} + +static int +op_unsupported_tpid_arg(struct ice_vsi *__always_unused vsi, + u16 __always_unused tpid) +{ + return -EOPNOTSUPP; +} + +static int op_unsupported(struct ice_vsi *__always_unused vsi) +{ + return -EOPNOTSUPP; +} + +/* If any new ops are added to the VSI VLAN ops interface then an unsupported + * implementation should be set here. + */ +static struct ice_vsi_vlan_ops ops_unsupported = { + .add_vlan = op_unsupported_vlan_arg, + .del_vlan = op_unsupported_vlan_arg, + .ena_stripping = op_unsupported_tpid_arg, + .dis_stripping = op_unsupported, + .ena_insertion = op_unsupported_tpid_arg, + .dis_insertion = op_unsupported, + .ena_rx_filtering = op_unsupported, + .dis_rx_filtering = op_unsupported, + .ena_tx_filtering = op_unsupported, + .dis_tx_filtering = op_unsupported, + .set_port_vlan = op_unsupported_vlan_arg, +}; + +/** + * ice_vsi_init_unsupported_vlan_ops - init all VSI VLAN ops to unsupported + * @vsi: VSI to initialize VSI VLAN ops to unsupported for + * + * By default all inner and outer VSI VLAN ops return -EOPNOTSUPP. This was done, + * as opposed to leaving the ops NULL, to prevent unexpected crashes. Instead, if + * an unsupported VSI VLAN op is called, it will just return -EOPNOTSUPP.
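A minimal sketch of what this buys callers (editor's illustration; the wrapper below is hypothetical and not part of this patch): code that dispatches through the ops table never needs NULL checks and can simply propagate the error:

	/* Hypothetical caller; ice_get_compat_vsi_vlan_ops() and struct
	 * ice_vsi_vlan_ops are the ones introduced in this series.
	 */
	static int example_set_rx_vlan_filtering(struct ice_vsi *vsi, bool ena)
	{
		struct ice_vsi_vlan_ops *vlan_ops =
			ice_get_compat_vsi_vlan_ops(vsi);

		/* never NULL: unsupported VSI types fall back to ops_unsupported */
		return ena ? vlan_ops->ena_rx_filtering(vsi) :
			     vlan_ops->dis_rx_filtering(vsi);
	}
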
+ * + */ +static void ice_vsi_init_unsupported_vlan_ops(struct ice_vsi *vsi) +{ + vsi->outer_vlan_ops = ops_unsupported; + vsi->inner_vlan_ops = ops_unsupported; +} + +/** + * ice_vsi_init_vlan_ops - initialize type specific VSI VLAN ops + * @vsi: VSI to initialize ops for + * + * If any VSI types are added and/or require different ops than the PF or VF VSI + * then they will have to add a case here to handle that. Also, VSI type + * specific files should be added in the same manner that was done for PF VSI. + */ +void ice_vsi_init_vlan_ops(struct ice_vsi *vsi) +{ + /* Initialize all VSI types to have unsupported VSI VLAN ops */ + ice_vsi_init_unsupported_vlan_ops(vsi); + + switch (vsi->type) { + case ICE_VSI_PF: + case ICE_VSI_CHNL: + case ICE_VSI_SWITCHDEV_CTRL: + ice_pf_vsi_init_vlan_ops(vsi); + break; + case ICE_VSI_VF: + ice_vf_vsi_init_vlan_ops(vsi); + break; + default: + dev_dbg(ice_pf_to_dev(vsi->back), "%s does not support VLAN operations\n", + ice_vsi_type_str(vsi->type)); + break; + } +} + +/** + * ice_get_compat_vsi_vlan_ops - Get VSI VLAN ops based on VLAN mode + * @vsi: VSI used to get the VSI VLAN ops + * + * This function is meant to be used when the caller doesn't know which VLAN ops + * to use (i.e. inner or outer). This allows backward compatibility for VLANs + * since most of the Outer VSI VLAN functins are not supported when + * the device is configured in Single VLAN Mode (SVM). + */ +struct ice_vsi_vlan_ops *ice_get_compat_vsi_vlan_ops(struct ice_vsi *vsi) +{ + if (ice_is_dvm_ena(&vsi->back->hw)) + return &vsi->outer_vlan_ops; + else + return &vsi->inner_vlan_ops; +} diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..70ff0e289d0ce6930ae5c64bb1032096e00b2a67 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _ICE_VSI_VLAN_OPS_H_ +#define _ICE_VSI_VLAN_OPS_H_ + +#include "ice_type.h" +#include "ice_vsi_vlan_lib.h" + +struct ice_vsi; + +struct ice_vsi_vlan_ops { + int (*add_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan); + int (*del_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan); + int (*ena_stripping)(struct ice_vsi *vsi, const u16 tpid); + int (*dis_stripping)(struct ice_vsi *vsi); + int (*ena_insertion)(struct ice_vsi *vsi, const u16 tpid); + int (*dis_insertion)(struct ice_vsi *vsi); + int (*ena_rx_filtering)(struct ice_vsi *vsi); + int (*dis_rx_filtering)(struct ice_vsi *vsi); + int (*ena_tx_filtering)(struct ice_vsi *vsi); + int (*dis_tx_filtering)(struct ice_vsi *vsi); + int (*set_port_vlan)(struct ice_vsi *vsi, struct ice_vlan *vlan); +}; + +void ice_vsi_init_vlan_ops(struct ice_vsi *vsi); +struct ice_vsi_vlan_ops *ice_get_compat_vsi_vlan_ops(struct ice_vsi *vsi); + +#endif /* _ICE_VSI_VLAN_OPS_H_ */ + diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c new file mode 100644 index 0000000000000000000000000000000000000000..43012bb9b1155fdb53fffc55fa673f3766f5649c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -0,0 +1,1393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#include +#include +#include +#include +#include "ice.h" +#include "ice_lib.h" +#include "ice_base.h" +#include "ice_type.h" +#include "ice_xsk.h" +#include "ice_txrx.h" +#include "ice_txrx_lib.h" +#ifdef HAVE_AF_XDP_ZC_SUPPORT + +/** + * ice_qp_reset_stats - Resets all stats for rings of given index + * @vsi: VSI that contains rings of interest + * @q_idx: ring index in array + */ +static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx) +{ + memset(&vsi->rx_rings[q_idx]->rx_stats, 0, + sizeof(vsi->rx_rings[q_idx]->rx_stats)); + memset(&vsi->tx_rings[q_idx]->stats, 0, + sizeof(vsi->tx_rings[q_idx]->stats)); + if (ice_is_xdp_ena_vsi(vsi)) + memset(&vsi->xdp_rings[q_idx]->stats, 0, + sizeof(vsi->xdp_rings[q_idx]->stats)); +} + +/** + * ice_qp_clean_rings - Cleans all the rings of a given index + * @vsi: VSI that contains rings of interest + * @q_idx: ring index in array + */ +static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx) +{ + ice_clean_tx_ring(vsi->tx_rings[q_idx]); + if (ice_is_xdp_ena_vsi(vsi)) + ice_clean_tx_ring(vsi->xdp_rings[q_idx]); + ice_clean_rx_ring(vsi->rx_rings[q_idx]); +} + +/** + * ice_qvec_toggle_napi - Enables/disables NAPI for a given q_vector + * @vsi: VSI that has netdev + * @q_vector: q_vector that has NAPI context + * @enable: true for enable, false for disable + */ +static void +ice_qvec_toggle_napi(struct ice_vsi *vsi, struct ice_q_vector *q_vector, + bool enable) +{ + if (!vsi->netdev || !q_vector) + return; + + if (enable) + napi_enable(&q_vector->napi); + else + napi_disable(&q_vector->napi); +} + +/** + * ice_qvec_dis_irq - Mask off queue interrupt generation on given ring + * @vsi: the VSI that contains queue vector being un-configured + * @rx_ring: Rx ring that will have its IRQ disabled + * @q_vector: queue vector + */ +static void +ice_qvec_dis_irq(struct ice_vsi *vsi, struct ice_ring *rx_ring, + struct ice_q_vector *q_vector) +{ + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + int base = vsi->base_vector; + u16 reg; + u32 val; + + /* QINT_TQCTL is being cleared in ice_vsi_stop_tx_ring, so handle + * here only QINT_RQCTL + */ + reg = rx_ring->reg_idx; + val = rd32(hw, QINT_RQCTL(reg)); + val &= ~QINT_RQCTL_CAUSE_ENA_M; + wr32(hw, QINT_RQCTL(reg), val); + + if (q_vector) { + u16 v_idx = q_vector->v_idx; + + wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx), 0); + + ice_flush(hw); + synchronize_irq(pf->msix_entries[v_idx + base].vector); + } +} + +/** + * ice_qvec_cfg_msix - Enable IRQ for given queue vector + * @vsi: the VSI that contains queue vector + * @q_vector: queue vector + */ +static void +ice_qvec_cfg_msix(struct ice_vsi *vsi, struct ice_q_vector *q_vector) +{ + u16 reg_idx = q_vector->reg_idx; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + struct ice_ring *ring; + + ice_cfg_itr(hw, q_vector); + + ice_for_each_ring(ring, q_vector->tx) + ice_cfg_txq_interrupt(vsi, ring->reg_idx, reg_idx, + q_vector->tx.itr_idx); + + ice_for_each_ring(ring, q_vector->rx) + ice_cfg_rxq_interrupt(vsi, ring->reg_idx, reg_idx, + q_vector->rx.itr_idx); + + ice_flush(hw); +} + +/** + * ice_qvec_ena_irq - Enable IRQ for given queue vector + * @vsi: the VSI that contains queue vector + * @q_vector: queue vector + */ +static void ice_qvec_ena_irq(struct ice_vsi *vsi, struct ice_q_vector *q_vector) +{ + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + + ice_irq_dynamic_ena(hw, vsi, q_vector); + + ice_flush(hw); +} + +/** + * ice_qp_dis - Disables a queue pair + * @vsi: VSI of interest + * @q_idx: ring index in 
array + * + * Returns 0 on success, negative on failure. + */ +static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) +{ + struct ice_txq_meta txq_meta = { }; + struct ice_ring *tx_ring, *rx_ring; + struct ice_q_vector *q_vector; + int timeout = 50; + int err; + + if (q_idx >= vsi->num_rxq || q_idx >= vsi->num_txq) + return -EINVAL; + + tx_ring = vsi->tx_rings[q_idx]; + rx_ring = vsi->rx_rings[q_idx]; + q_vector = rx_ring->q_vector; + + while (test_and_set_bit(ICE_CFG_BUSY, vsi->state)) { + timeout--; + if (!timeout) + return -EBUSY; + usleep_range(1000, 2000); + } + netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + + ice_qvec_dis_irq(vsi, rx_ring, q_vector); + + ice_fill_txq_meta(vsi, tx_ring, &txq_meta); + err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); + if (err) + return err; + if (ice_is_xdp_ena_vsi(vsi)) { + struct ice_ring *xdp_ring = vsi->xdp_rings[q_idx]; + + memset(&txq_meta, 0, sizeof(txq_meta)); + ice_fill_txq_meta(vsi, xdp_ring, &txq_meta); + err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, xdp_ring, + &txq_meta); + if (err) + return err; + } + err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); + if (err) + return err; + + ice_qvec_toggle_napi(vsi, q_vector, false); + ice_qp_clean_rings(vsi, q_idx); + ice_qp_reset_stats(vsi, q_idx); + + return 0; +} + +/** + * ice_qp_ena - Enables a queue pair + * @vsi: VSI of interest + * @q_idx: ring index in array + * + * Returns 0 on success, negative on failure. + */ +static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) +{ + struct ice_aqc_add_tx_qgrp *qg_buf; + struct ice_ring *tx_ring, *rx_ring; + struct ice_q_vector *q_vector; + u16 size; + int err; + + if (q_idx >= vsi->num_rxq || q_idx >= vsi->num_txq) + return -EINVAL; + + size = struct_size(qg_buf, txqs, 1); + qg_buf = kzalloc(size, GFP_KERNEL); + if (!qg_buf) + return -ENOMEM; + + qg_buf->num_txqs = 1; + + tx_ring = vsi->tx_rings[q_idx]; + rx_ring = vsi->rx_rings[q_idx]; + q_vector = rx_ring->q_vector; + + err = ice_vsi_cfg_txq(vsi, tx_ring, qg_buf); + if (err) + goto free_buf; + + if (ice_is_xdp_ena_vsi(vsi)) { + struct ice_ring *xdp_ring = vsi->xdp_rings[q_idx]; + + memset(qg_buf, 0, size); + qg_buf->num_txqs = 1; + err = ice_vsi_cfg_txq(vsi, xdp_ring, qg_buf); + if (err) + goto free_buf; + ice_set_ring_xdp(xdp_ring); + xdp_ring->xsk_pool = ice_xsk_umem(xdp_ring); + } + + err = ice_vsi_cfg_rxq(rx_ring); + if (err) + goto free_buf; + + ice_qvec_cfg_msix(vsi, q_vector); + + err = ice_vsi_ctrl_one_rx_ring(vsi, true, q_idx, true); + if (err) + goto free_buf; + + clear_bit(ICE_CFG_BUSY, vsi->state); + ice_qvec_toggle_napi(vsi, q_vector, true); + ice_qvec_ena_irq(vsi, q_vector); + + netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); +free_buf: + kfree(qg_buf); + return err; +} + +#ifndef HAVE_AF_XDP_NETDEV_UMEM +/** + * ice_xsk_alloc_umems - allocate a UMEM region for an XDP socket + * @vsi: VSI to allocate the UMEM on + * + * Returns 0 on success, negative on error + */ +static int ice_xsk_alloc_umems(struct ice_vsi *vsi) +{ + if (vsi->xsk_umems) + return 0; +#ifdef HAVE_NETDEV_BPF_XSK_POOL + vsi->xsk_umems = kcalloc(vsi->num_xsk_umems, sizeof(*vsi->xsk_umems), + GFP_KERNEL); +#else + vsi->xsk_umems = kcalloc(vsi->num_xsk_umems, sizeof(*vsi->xsk_umems), + GFP_KERNEL); +#endif /* HAVE_NETDEV_BPF_XSK_POOL */ + + if (!vsi->xsk_umems) { + vsi->num_xsk_umems = 0; + return -ENOMEM; + } + + return 0; +} + +/** + * ice_xsk_remove_umem - Remove a UMEM for a certain ring/qid + * @vsi: VSI from which the UMEM will be removed + * @qid:
Ring/qid associated with the UMEM + */ +static void ice_xsk_remove_umem(struct ice_vsi *vsi, u16 qid) +{ + vsi->xsk_umems[qid] = NULL; + vsi->num_xsk_umems_used--; + + if (vsi->num_xsk_umems_used == 0) { + kfree(vsi->xsk_umems); + vsi->xsk_umems = NULL; + vsi->num_xsk_umems = 0; + } +} +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +/** + * ice_xsk_umem_dma_map - DMA map UMEM region for XDP sockets + * @vsi: VSI to map the UMEM region + * @umem: UMEM to map + * + * Returns 0 on success, negative on error + */ +static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + unsigned int i; + + dev = ice_pf_to_dev(pf); + for (i = 0; i < umem->npgs; i++) { + dma_addr_t dma = dma_map_page_attrs(dev, umem->pgs[i], 0, + PAGE_SIZE, + DMA_BIDIRECTIONAL, + ICE_RX_DMA_ATTR); + if (dma_mapping_error(dev, dma)) { + dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d\n", + i); + goto out_unmap; + } + + umem->pages[i].dma = dma; + } + + return 0; + +out_unmap: + for (; i > 0; i--) { + dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); + umem->pages[i].dma = 0; + } + + return -EFAULT; +} + +/** + * ice_xsk_umem_dma_unmap - DMA unmap UMEM region for XDP sockets + * @vsi: VSI from which the UMEM will be unmapped + * @umem: UMEM to unmap + */ +static void ice_xsk_umem_dma_unmap(struct ice_vsi *vsi, struct xdp_umem *umem) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + unsigned int i; + + dev = ice_pf_to_dev(pf); + for (i = 0; i < umem->npgs; i++) { + dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); + + umem->pages[i].dma = 0; + } +} +#endif + +/** + * ice_xsk_umem_disable - disable a UMEM region + * @vsi: Current VSI + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid) +{ +#ifdef HAVE_AF_XDP_NETDEV_UMEM +#ifdef HAVE_NETDEV_BPF_XSK_POOL + struct xsk_buff_pool *umem = xsk_get_pool_from_qid(vsi->netdev, qid); +#else + struct xdp_umem *umem = xsk_get_pool_from_qid(vsi->netdev, qid); +#endif +#else + struct xdp_umem *umem; + + if (!vsi->xsk_umems || qid >= vsi->num_xsk_umems) + return -EINVAL; + + umem = vsi->xsk_umems[qid]; +#endif + + if (!umem) + return -EINVAL; + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + ice_xsk_umem_dma_unmap(vsi, umem); +#else + xsk_pool_dma_unmap(umem, ICE_RX_DMA_ATTR); +#endif + +#ifndef HAVE_AF_XDP_NETDEV_UMEM + ice_xsk_remove_umem(vsi, qid); +#endif + + return 0; +} + +/** + * ice_xsk_umem_enable - enable a UMEM region + * @vsi: Current VSI + * @umem: pointer to a requested UMEM region + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +static int +#ifdef HAVE_NETDEV_BPF_XSK_POOL +ice_xsk_umem_enable(struct ice_vsi *vsi, struct xsk_buff_pool *umem, u16 qid) +#else +ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) +#endif /* HAVE_NETDEV_BPF_XSK_POOL */ +{ +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + struct xdp_umem_fq_reuse *reuseq; +#endif +#ifndef HAVE_AF_XDP_NETDEV_UMEM + int err; +#endif + + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (!vsi->num_xsk_umems) + vsi->num_xsk_umems = min_t(u16, vsi->num_rxq, vsi->num_txq); + if (qid >= vsi->num_xsk_umems) + return -EINVAL; + + err = ice_xsk_alloc_umems(vsi); + if (err) + return err; + + if (vsi->xsk_umems && vsi->xsk_umems[qid]) + return -EBUSY; + + vsi->xsk_umems[qid]
= umem; + vsi->num_xsk_umems_used++; +#else + if (qid >= vsi->netdev->real_num_rx_queues || + qid >= vsi->netdev->real_num_tx_queues) + return -EINVAL; +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count); + if (!reuseq) + return -ENOMEM; + + xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); + + return ice_xsk_umem_dma_map(vsi, umem); +#else + return xsk_pool_dma_map(umem, ice_pf_to_dev(vsi->back), + ICE_RX_DMA_ATTR); +#endif +} + +/** + * ice_xsk_umem_setup - enable/disable a UMEM region depending on its state + * @vsi: Current VSI + * @umem: UMEM to enable/associate to a ring, NULL to disable + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +#ifdef HAVE_NETDEV_BPF_XSK_POOL +int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xsk_buff_pool *umem, u16 qid) +#else +int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) +#endif +{ + bool if_running, umem_present = !!umem; + int ret = 0, umem_failure = 0; + + if_running = netif_running(vsi->netdev) && ice_is_xdp_ena_vsi(vsi); + + if (if_running) { + ret = ice_qp_dis(vsi, qid); + if (ret) { + netdev_err(vsi->netdev, "ice_qp_dis error = %d\n", ret); + goto xsk_umem_if_up; + } + } + + umem_failure = umem_present ? ice_xsk_umem_enable(vsi, umem, qid) : + ice_xsk_umem_disable(vsi, qid); + +xsk_umem_if_up: + if (if_running) { + ret = ice_qp_ena(vsi, qid); + if (!ret && umem_present) + napi_schedule(&vsi->xdp_rings[qid]->q_vector->napi); + else if (ret) + netdev_err(vsi->netdev, "ice_qp_ena error = %d\n", ret); + } + + if (umem_failure) { + netdev_err(vsi->netdev, "Could not %sable UMEM, error = %d\n", + umem_present ? "en" : "dis", umem_failure); + return umem_failure; + } + + return ret; +} + +#ifndef NO_XDP_QUERY_XSK_UMEM +/** + * ice_xsk_umem_query - queries a certain ring/qid for its UMEM + * @vsi: Current VSI + * @umem: UMEM associated to the ring, if any + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +int ice_xsk_umem_query(struct ice_vsi *vsi, struct xdp_umem **umem, u16 qid) +{ +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + + if (qid >= min_t(u16, vsi->num_rxq, vsi->num_txq)) + return -EINVAL; + + if (vsi->xsk_umems) { + if (qid >= vsi->num_xsk_umems) + return -EINVAL; + *umem = vsi->xsk_umems[qid]; + return 0; + } + + *umem = NULL; +#else + struct net_device *netdev = vsi->netdev; + struct xdp_umem *queried_umem; + + if (vsi->type != ICE_VSI_PF) + return -EINVAL; + + queried_umem = xsk_get_pool_from_qid(netdev, qid); + if (!queried_umem) + return -EINVAL; + + *umem = queried_umem; +#endif /* !HAVE_AF_XDP_NETDEV_UMEM */ + + return 0; +} +#endif /* NO_XDP_QUERY_XSK_UMEM */ + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +/** + * ice_zca_free - Callback for MEM_TYPE_ZERO_COPY allocations + * @zca: zero-cpoy allocator + * @handle: Buffer handle + */ +void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle) +{ + struct ice_rx_buf *rx_buf; + struct ice_ring *rx_ring; + struct xdp_umem *umem; + u64 hr, mask; + u16 nta; + + rx_ring = container_of(zca, struct ice_ring, zca); + umem = rx_ring->xsk_pool; + hr = umem->headroom + XDP_PACKET_HEADROOM; + +#ifndef HAVE_XDP_UMEM_PROPS + mask = umem->chunk_mask; +#else + mask = umem->props.chunk_mask; +#endif + + nta = rx_ring->next_to_alloc; + rx_buf = &rx_ring->rx_buf[nta]; + + nta++; + rx_ring->next_to_alloc = (nta < rx_ring->count) ? 
nta : 0; + + handle &= mask; + + rx_buf->dma = xdp_umem_get_dma(umem, handle); + rx_buf->dma += hr; + + rx_buf->addr = xdp_umem_get_data(umem, handle); + rx_buf->addr += hr; + + rx_buf->handle = (u64)handle + umem->headroom; +} + +/** + * ice_alloc_buf_fast_zc - Retrieve buffer address from XDP umem + * @rx_ring: ring with an xdp_umem bound to it + * @rx_buf: buffer to which xsk page address will be assigned + * + * This function allocates an Rx buffer in the hot path. + * The buffer can come from fill queue or recycle queue. + * + * Returns true if an assignment was successful, false if not. + */ +static __always_inline bool +ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) +{ + struct xdp_umem *umem = rx_ring->xsk_pool; + void *addr = rx_buf->addr; + u64 handle, hr; + + if (addr) { +#ifdef ICE_ADD_PROBES + rx_ring->rx_stats.page_reuse++; +#endif /* ICE_ADD_PROBES */ + return true; + } + + if (!xsk_umem_peek_addr(umem, &handle)) { + rx_ring->rx_stats.alloc_page_failed++; + return false; + } + + hr = umem->headroom + XDP_PACKET_HEADROOM; + + rx_buf->dma = xdp_umem_get_dma(umem, handle); + rx_buf->dma += hr; + + rx_buf->addr = xdp_umem_get_data(umem, handle); + rx_buf->addr += hr; + + rx_buf->handle = handle + umem->headroom; + + xsk_umem_release_addr(umem); + return true; +} + +/** + * ice_alloc_buf_slow_zc - Retrieve buffer address from XDP umem + * @rx_ring: ring with an xdp_umem bound to it + * @rx_buf: buffer to which xsk page address will be assigned + * + * This function allocates an Rx buffer in the slow path. + * The buffer can come from fill queue or recycle queue. + * + * Returns true if an assignment was successful, false if not. + */ +static __always_inline bool +ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) +{ + struct xdp_umem *umem = rx_ring->xsk_pool; + u64 handle, headroom; + + if (!xsk_umem_peek_addr_rq(umem, &handle)) { + rx_ring->rx_stats.alloc_page_failed++; + return false; + } + + handle &= umem->chunk_mask; + headroom = umem->headroom + XDP_PACKET_HEADROOM; + + rx_buf->dma = xdp_umem_get_dma(umem, handle); + rx_buf->dma += headroom; + + rx_buf->addr = xdp_umem_get_data(umem, handle); + rx_buf->addr += headroom; + + rx_buf->handle = handle + umem->headroom; + + xsk_umem_release_addr_rq(umem); + return true; +} +#endif /* !HAVE_MEM_TYPE_XSK_BUFF_POOL */ + +/* + * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers + * @rx_ring: Rx ring + * @count: The number of buffers to allocate + * @alloc: the function pointer to call for allocation + * + * This function allocates a number of Rx buffers from the fill ring + * or the internal recycle mechanism and places them on the Rx ring. + * + * Returns false if all allocations were successful, true if any fail. + * NOTE: this function header description doesn't do kdoc style + * because of the function pointer creating problems. 
+ */ +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +static bool +ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count, + bool (*alloc)(struct ice_ring *, struct ice_rx_buf *)) +#else +bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count) +#endif +{ + union ice_32b_rx_flex_desc *rx_desc; + u16 ntu = rx_ring->next_to_use; + struct ice_rx_buf *rx_buf; + bool ret = false; +#ifdef HAVE_MEM_TYPE_XSK_BUFF_POOL + dma_addr_t dma; +#endif + + if (!count) + return false; + + rx_desc = ICE_RX_DESC(rx_ring, ntu); + rx_buf = &rx_ring->rx_buf[ntu]; + + do { +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + if (!alloc(rx_ring, rx_buf)) { + ret = true; + break; + } + + dma_sync_single_range_for_device(rx_ring->dev, rx_buf->dma, 0, + rx_ring->rx_buf_len, + DMA_BIDIRECTIONAL); + + rx_desc->read.pkt_addr = cpu_to_le64(rx_buf->dma); +#else + rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_pool); + if (!rx_buf->xdp) { + ret = true; + break; + } + + dma = xsk_buff_xdp_get_dma(rx_buf->xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); +#endif + rx_desc->wb.status_error0 = 0; + + rx_desc++; + rx_buf++; + ntu++; + + if (unlikely(ntu == rx_ring->count)) { + rx_desc = ICE_RX_DESC(rx_ring, 0); + rx_buf = rx_ring->rx_buf; + ntu = 0; + } + } while (--count); + + if (rx_ring->next_to_use != ntu) + ice_release_rx_desc(rx_ring, ntu); + + return ret; +} + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +/** + * ice_alloc_rx_bufs_fast_zc - allocate zero copy bufs in the hot path + * @rx_ring: Rx ring + * @count: number of bufs to allocate + * + * Returns false on success, true on failure. + */ +static bool ice_alloc_rx_bufs_fast_zc(struct ice_ring *rx_ring, u16 count) +{ + return ice_alloc_rx_bufs_zc(rx_ring, count, + ice_alloc_buf_fast_zc); +} + +/** + * ice_alloc_rx_bufs_slow_zc - allocate zero copy bufs in the slow path + * @rx_ring: Rx ring + * @count: number of bufs to allocate + * + * Returns false on success, true on failure. + */ +bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count) +{ + return ice_alloc_rx_bufs_zc(rx_ring, count, + ice_alloc_buf_slow_zc); +} +#endif + +/** + * ice_bump_ntc - Bump the next_to_clean counter of an Rx ring + * @rx_ring: Rx ring + */ +static void ice_bump_ntc(struct ice_ring *rx_ring) +{ + int ntc = rx_ring->next_to_clean + 1; + + ntc = (ntc < rx_ring->count) ? ntc : 0; + rx_ring->next_to_clean = ntc; + prefetch(ICE_RX_DESC(rx_ring, ntc)); +} + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +/** + * ice_get_rx_buf_zc - Fetch the current Rx buffer + * @rx_ring: Rx ring + * @size: size of a buffer + * + * This function returns the current, received Rx buffer and does + * DMA synchronization. + * + * Returns a pointer to the received Rx buffer. + */ +static struct ice_rx_buf *ice_get_rx_buf_zc(struct ice_ring *rx_ring, int size) +{ + struct ice_rx_buf *rx_buf; + + rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; + + dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 0, + size, DMA_BIDIRECTIONAL); + + return rx_buf; +} + +/** + * ice_reuse_rx_buf_zc - reuse an Rx buffer + * @rx_ring: Rx ring + * @old_buf: The buffer to recycle + * + * This function recycles a finished Rx buffer, and places it on the recycle + * queue (next_to_alloc). 
+ */ +static void +ice_reuse_rx_buf_zc(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf) +{ +#ifndef HAVE_XDP_UMEM_PROPS + unsigned long mask = (unsigned long)rx_ring->xsk_pool->chunk_mask; +#else + unsigned long mask = (unsigned long)rx_ring->xsk_pool->props.chunk_mask; +#endif /* NO_XDP_UMEM_PROPS */ + u64 hr = rx_ring->xsk_pool->headroom + XDP_PACKET_HEADROOM; + u16 nta = rx_ring->next_to_alloc; + struct ice_rx_buf *new_buf; + + new_buf = &rx_ring->rx_buf[nta++]; + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; + + new_buf->dma = old_buf->dma & mask; + new_buf->dma += hr; + + new_buf->addr = (void *)((unsigned long)old_buf->addr & mask); + new_buf->addr += hr; + + new_buf->handle = old_buf->handle & mask; + new_buf->handle += rx_ring->xsk_pool->headroom; + + old_buf->addr = NULL; +} +#endif + +/** + * ice_construct_skb_zc - Create an sk_buff from zero-copy buffer + * @rx_ring: Rx ring + * @rx_buf: zero-copy Rx buffer + * @xdp: xdp buffer + * + * This function allocates a new skb from a zero-copy Rx buffer. + * + * Returns the skb on success, NULL on failure. + */ +static struct sk_buff * +ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, + struct xdp_buff *xdp) +{ + unsigned int metasize = xdp->data - xdp->data_meta; + unsigned int datasize = xdp->data_end - xdp->data; + unsigned int datasize_hard = xdp->data_end - + xdp->data_hard_start; + struct sk_buff *skb; + + skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + memcpy(__skb_put(skb, datasize), xdp->data, datasize); + + if (metasize) + skb_metadata_set(skb, metasize); + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + ice_reuse_rx_buf_zc(rx_ring, rx_buf); +#else + xsk_buff_free(rx_buf->xdp); + rx_buf->xdp = NULL; +#endif + + return skb; +} + +/** + * ice_run_xdp_zc - Executes an XDP program in zero-copy path + * @rx_ring: Rx ring + * @xdp: xdp_buff used as input to the XDP program + * + * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} + */ +static int +ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) +{ + int err, result = ICE_XDP_PASS; + struct bpf_prog *xdp_prog; + struct ice_ring *xdp_ring; + u32 act; + + rcu_read_lock(); + xdp_prog = READ_ONCE(rx_ring->xdp_prog); + if (!xdp_prog) { + rcu_read_unlock(); + return ICE_XDP_PASS; + } + + act = bpf_prog_run_xdp(xdp_prog, xdp); +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + xdp->handle += xdp->data - xdp->data_hard_start; +#endif + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->q_index]; + result = ice_xmit_xdp_buff(xdp, xdp_ring); + break; + case XDP_REDIRECT: + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED; + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough -- not supported action */ + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, act); + /* fallthrough -- handle aborts by dropping frame */ + case XDP_DROP: + result = ICE_XDP_CONSUMED; + break; + } + + rcu_read_unlock(); + return result; +} + +/** + * ice_clean_rx_irq_zc - consumes packets from the hardware ring + * @rx_ring: AF_XDP Rx ring + * @budget: NAPI budget + * + * Returns number of processed packets on success, remaining budget on failure. 
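A hedged sketch of how this return value is typically consumed by a NAPI-style caller (the wrapper below is hypothetical; the real dispatch happens in the driver's poll routine, which is outside this excerpt):

	/* Hypothetical NAPI-style wrapper around the zero-copy Rx clean. */
	static int example_napi_rx_zc(struct ice_ring *rx_ring, int budget)
	{
		int work_done = ice_clean_rx_irq_zc(rx_ring, budget);

		/* work_done == budget means there may be more work, so NAPI
		 * stays scheduled; work_done < budget lets the real poll
		 * routine call napi_complete_done() and re-arm the IRQ.
		 */
		return work_done;
	}
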
+ */ +int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) +{ + unsigned int total_rx_bytes = 0, total_rx_packets = 0; + u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); + unsigned int xdp_xmit = 0; + bool failure = false; +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + struct xdp_buff xdp; + + xdp.rxq = &rx_ring->xdp_rxq; + +#endif + while (likely(total_rx_packets < (unsigned int)budget)) { + union ice_32b_rx_flex_desc *rx_desc; + unsigned int size, xdp_res = 0; + struct ice_rx_buf *rx_buf; + struct sk_buff *skb; + u16 stat_err_bits; + u16 vlan_tag = 0; + u16 rx_ptype; + + if (cleaned_count >= ICE_RX_BUF_WRITE) { +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + failure |= ice_alloc_rx_bufs_fast_zc(rx_ring, + cleaned_count); +#else + failure |= ice_alloc_rx_bufs_zc(rx_ring, cleaned_count); +#endif + cleaned_count = 0; + } + + rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean); + + stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S); + if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits)) + break; + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we have + * verified the descriptor has been written back. + */ + dma_rmb(); + + size = le16_to_cpu(rx_desc->wb.pkt_len) & + ICE_RX_FLX_DESC_PKT_LEN_M; + if (!size) + break; + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + rx_buf = ice_get_rx_buf_zc(rx_ring, size); + if (!rx_buf->addr) + break; + + xdp.data = rx_buf->addr; + xdp.data_meta = xdp.data; + xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; + xdp.data_end = xdp.data + size; + xdp.handle = rx_buf->handle; + + xdp_res = ice_run_xdp_zc(rx_ring, &xdp); +#else + rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; + if (!rx_buf->xdp) + break; + + rx_buf->xdp->data_end = rx_buf->xdp->data + size; + xsk_buff_dma_sync_for_cpu(rx_buf->xdp, rx_ring->xsk_pool); + + xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp); +#endif + if (xdp_res) { +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { + xdp_xmit |= xdp_res; + rx_buf->addr = NULL; + } else { + ice_reuse_rx_buf_zc(rx_ring, rx_buf); + } +#else + if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) + xdp_xmit |= xdp_res; + else + xsk_buff_free(rx_buf->xdp); + rx_buf->xdp = NULL; +#endif + total_rx_bytes += size; + total_rx_packets++; + cleaned_count++; + + ice_bump_ntc(rx_ring); + continue; + } + + /* XDP_PASS path */ +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + skb = ice_construct_skb_zc(rx_ring, rx_buf, &xdp); +#else + skb = ice_construct_skb_zc(rx_ring, rx_buf, rx_buf->xdp); +#endif + if (!skb) { + rx_ring->rx_stats.alloc_buf_failed++; + break; + } + + cleaned_count++; + ice_bump_ntc(rx_ring); + + if (eth_skb_pad(skb)) { + skb = NULL; + continue; + } + + total_rx_bytes += skb->len; + total_rx_packets++; + + vlan_tag = ice_get_vlan_tag_from_rx_desc(rx_desc); + + rx_ptype = le16_to_cpu(rx_desc->wb.ptype_flex_flags0) & + ICE_RX_FLEX_DESC_PTYPE_M; + + ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); + ice_receive_skb(rx_ring, skb, vlan_tag); + } + + ice_finalize_xdp_rx(rx_ring, xdp_xmit); + ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes); + +#ifdef HAVE_NDO_XSK_WAKEUP + if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { + if (failure || rx_ring->next_to_clean == rx_ring->next_to_use || + (ice_ring_ch_enabled(rx_ring) && + !ice_vsi_pkt_inspect_opt_ena(rx_ring->vsi))) + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + else + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + + return (int)total_rx_packets; + } + +#endif /* HAVE_NDO_XSK_WAKEUP */ + return failure ? 
budget : (int)total_rx_packets; +} + +/** + * ice_xmit_zc - Completes AF_XDP entries, and cleans XDP entries + * @xdp_ring: XDP Tx ring + * @budget: max number of frames to xmit + * + * Returns true if cleanup/transmission is done. + */ +static bool ice_xmit_zc(struct ice_ring *xdp_ring, int budget) +{ + unsigned int sent_frames = 0, total_bytes = 0; + struct ice_tx_desc *tx_desc = NULL; + u16 ntu = xdp_ring->next_to_use; +#ifdef XSK_UMEM_RETURNS_XDP_DESC + struct xdp_desc desc; +#endif /* XSK_UMEM_RETURNS_XDP_DESC */ + dma_addr_t dma; +#ifndef XSK_UMEM_RETURNS_XDP_DESC + u32 len; +#endif /* !XSK_UMEM_RETURNS_XDP_DESC */ + + while (likely(budget-- > 0)) { + struct ice_tx_buf *tx_buf; + + tx_buf = &xdp_ring->tx_buf[ntu]; + +#ifdef XSK_UMEM_RETURNS_XDP_DESC + if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc)) + break; + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + dma = xdp_umem_get_dma(xdp_ring->xsk_pool, desc.addr); + + dma_sync_single_for_device(xdp_ring->dev, dma, desc.len, + DMA_BIDIRECTIONAL); +#else + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, + desc.len); +#endif + tx_buf->bytecount = desc.len; +#else + if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &dma, &len)) + break; + + dma_sync_single_for_device(xdp_ring->dev, dma, len, + DMA_BIDIRECTIONAL); + + tx_buf->bytecount = len; +#endif /* XSK_UMEM_RETURNS_XDP_DESC */ + + tx_desc = ICE_TX_DESC(xdp_ring, ntu); + tx_desc->buf_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = +#ifdef XSK_UMEM_RETURNS_XDP_DESC + ice_build_ctob(ICE_TX_DESC_CMD_EOP, 0, desc.len, 0); +#else + ice_build_ctob(ICE_TX_DESC_CMD_EOP, 0, len, 0); +#endif /* XSK_UMEM_RETURNS_XDP_DESC */ + + xdp_ring->next_rs_idx = ntu; + ntu++; + if (ntu == xdp_ring->count) + ntu = 0; + sent_frames++; + total_bytes += tx_buf->bytecount; + } + + if (tx_desc) { + xdp_ring->next_to_use = ntu; + /* Set RS bit for the last frame and bump tail ptr */ + tx_desc->cmd_type_offset_bsz |= + cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S); + ice_xdp_ring_update_tail(xdp_ring); + xsk_tx_release(xdp_ring->xsk_pool); + ice_update_tx_ring_stats(xdp_ring, sent_frames, total_bytes); + } + + return budget > 0; +} + +/** + * ice_clean_xdp_tx_buf - Free and unmap XDP Tx buffer + * @xdp_ring: XDP Tx ring + * @tx_buf: Tx buffer to clean + */ +static void +ice_clean_xdp_tx_buf(struct ice_ring *xdp_ring, struct ice_tx_buf *tx_buf) +{ + xdp_return_frame((struct xdp_frame *)tx_buf->raw_buf); + xdp_ring->xdp_tx_active--; + dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + dma_unmap_len_set(tx_buf, len, 0); +} + +/** + * ice_clean_tx_irq_zc - Completes AF_XDP entries, and cleans XDP entries + * @xdp_ring: XDP Tx ring + * + * Returns true if cleanup/tranmission is done. 
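The wrap-around arithmetic used below to size frames_ready can be read in isolation as follows (editor's sketch; the parameter names mirror the locals in the function that follows):

	/* Descriptors between next_to_clean and the last RS descriptor,
	 * accounting for wrap-around at the ring size.
	 */
	static u16 example_frames_ready(u16 next_rs_idx, u16 ntc, u16 count)
	{
		return next_rs_idx >= ntc ? next_rs_idx - ntc
					  : next_rs_idx + count - ntc;
	}
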
+ */ +bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring) +{ + u16 next_rs_idx = xdp_ring->next_rs_idx; + u16 ntc = xdp_ring->next_to_clean; + u16 frames_ready = 0, send_budget; + struct ice_tx_desc *next_rs_desc; + struct ice_tx_buf *tx_buf; + u32 xsk_frames = 0; + u16 i; + + next_rs_desc = ICE_TX_DESC(xdp_ring, next_rs_idx); + if (next_rs_desc->cmd_type_offset_bsz & + cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)) { + if (next_rs_idx >= ntc) + frames_ready = next_rs_idx - ntc; + else + frames_ready = next_rs_idx + xdp_ring->count - ntc; + } + + if (!frames_ready) + goto out_xmit; + + if (likely(!xdp_ring->xdp_tx_active)) { + xsk_frames = frames_ready; + goto skip; + } + + for (i = 0; i < frames_ready; i++) { + tx_buf = &xdp_ring->tx_buf[ntc]; + + if (tx_buf->raw_buf) { + ice_clean_xdp_tx_buf(xdp_ring, tx_buf); + tx_buf->raw_buf = NULL; + } else { + xsk_frames++; + } + + ++ntc; + if (ntc >= xdp_ring->count) + ntc = 0; + } + +skip: + xdp_ring->next_to_clean += frames_ready; + if (unlikely(xdp_ring->next_to_clean >= xdp_ring->count)) + xdp_ring->next_to_clean -= xdp_ring->count; + + if (xsk_frames) + xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); + +out_xmit: +#ifdef HAVE_NDO_XSK_WAKEUP + if (xsk_uses_need_wakeup(xdp_ring->xsk_pool)) + xsk_set_tx_need_wakeup(xdp_ring->xsk_pool); +#endif /* HAVE_NDO_XSK_WAKEUP */ + send_budget = ICE_DESC_UNUSED(xdp_ring); + send_budget = min_t(u16, send_budget, xdp_ring->count >> 2); + return ice_xmit_zc(xdp_ring, send_budget); +} + +#ifdef HAVE_NDO_XSK_WAKEUP +/** + * ice_xsk_wakeup - Implements ndo_xsk_wakeup + * @netdev: net_device + * @queue_id: queue to wake up + * @flags: ignored in our case, since we have Rx and Tx in the same NAPI + * + * Returns negative on error, zero otherwise. + */ +int +ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, + u32 __always_unused flags) +#else +int ice_xsk_async_xmit(struct net_device *netdev, u32 queue_id) +#endif /* HAVE_NDO_XSK_WAKEUP */ +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_q_vector *q_vector; + struct ice_vsi *vsi = np->vsi; + struct ice_ring *ring; + + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return -ENETDOWN; + + if (!ice_is_xdp_ena_vsi(vsi)) + return -ENXIO; + + if (queue_id >= vsi->num_txq) + return -ENXIO; + + if (!vsi->xdp_rings[queue_id]->xsk_pool) + return -ENXIO; + + ring = vsi->xdp_rings[queue_id]; + + /* The idea here is that if NAPI is running, mark a miss, so + * it will run again. If not, trigger an interrupt and + * schedule the NAPI from interrupt context. If NAPI would be + * scheduled here, the interrupt affinity would not be + * honored. 
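For context, the user-space half of this handshake (an editor's sketch using libbpf's AF_XDP helpers, not code from this patch; the header location varies between libbpf and libxdp) kicks the driver into this ndo roughly like so:

	#include <sys/socket.h>
	#include <bpf/xsk.h>	/* assumption: libbpf xsk helpers; libxdp uses <xdp/xsk.h> */

	static void example_kick_tx(struct xsk_socket *xsk,
				    struct xsk_ring_prod *tx)
	{
		/* Only poke the kernel when the driver asked to be woken. */
		if (xsk_ring_prod__needs_wakeup(tx))
			sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT,
			       NULL, 0);
	}
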
+ */ + q_vector = ring->q_vector; + if (!napi_if_scheduled_mark_missed(&q_vector->napi)) { + if (ice_ring_ch_enabled(vsi->rx_rings[queue_id]) && + !ice_vsi_pkt_inspect_opt_ena(vsi)) +#define ICE_BUSY_POLL_BUDGET 8 + napi_busy_loop(q_vector->napi.napi_id, NULL, NULL, + false, ICE_BUSY_POLL_BUDGET); + else + ice_trigger_sw_intr(&vsi->back->hw, q_vector); + } + + return 0; +} + +/** + * ice_xsk_any_rx_ring_ena - Checks if Rx rings have AF_XDP UMEM attached + * @vsi: VSI to be checked + * + * Returns true if any of the Rx rings has an AF_XDP UMEM attached + */ +bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi) +{ + int i; + +#ifndef HAVE_AF_XDP_NETDEV_UMEM + if (!vsi->xsk_umems) + return false; + + for (i = 0; i < vsi->num_xsk_umems; i++) { + if (vsi->xsk_umems[i]) + return true; + } +#else + ice_for_each_rxq(vsi, i) { + if (xsk_get_pool_from_qid(vsi->netdev, i)) + return true; + } +#endif /* HAVE_AF_XDP_NETDEV_UMEM */ + + return false; +} + +/** + * ice_xsk_clean_rx_ring - clean UMEM queues connected to a given Rx ring + * @rx_ring: ring to be cleaned + */ +void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring) +{ + u16 i; + + for (i = 0; i < rx_ring->count; i++) { + struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; + + if (!rx_buf->addr) + continue; + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL + xsk_umem_fq_reuse(rx_ring->xsk_pool, rx_buf->handle); +#endif + rx_buf->addr = NULL; + } +} + +/** + * ice_xsk_clean_xdp_ring - Clean the XDP Tx ring and its UMEM queues + * @xdp_ring: XDP_Tx ring + */ +void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring) +{ + u16 ntc = xdp_ring->next_to_clean, ntu = xdp_ring->next_to_use; + u32 xsk_frames = 0; + + while (ntc != ntu) { + struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; + + if (tx_buf->raw_buf) + ice_clean_xdp_tx_buf(xdp_ring, tx_buf); + else + xsk_frames++; + + tx_buf->raw_buf = NULL; + + ntc++; + if (ntc >= xdp_ring->count) + ntc = 0; + } + + if (xsk_frames) + xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); +} +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h new file mode 100644 index 0000000000000000000000000000000000000000..63599776bac121ad98289abaf25f1d8baec527e2 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_xsk.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + +#ifndef _ICE_XSK_H_ +#define _ICE_XSK_H_ +#include "ice_txrx.h" +#include "ice.h" +#ifdef HAVE_MEM_TYPE_XSK_BUFF_POOL +#include +#endif + +struct ice_vsi; + +#ifdef HAVE_AF_XDP_ZC_SUPPORT +#ifdef CONFIG_XDP_SOCKETS +#ifdef HAVE_NETDEV_BPF_XSK_POOL +int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xsk_buff_pool *umem, + u16 qid); +#else +int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, + u16 qid); +#endif +int ice_xsk_umem_query(struct ice_vsi *vsi, struct xdp_umem **umem, u16 qid); +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle); +#endif +int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget); +bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring); +#ifdef HAVE_NDO_XSK_WAKEUP +int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); +#else +int ice_xsk_async_xmit(struct net_device *netdev, u32 queue_id); +#endif /* HAVE_NDO_XSK_WAKEUP */ +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count); +#else +bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count); +#endif +bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi); +void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring); +void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring); +#else +static inline int +ice_xsk_umem_setup(struct ice_vsi __always_unused *vsi, +#ifdef HAVE_NETDEV_BPF_XSK_POOL + struct xsk_buff_pool __always_unused *pool, +#else + struct xdp_umem __always_unused *umem, +#endif + u16 __always_unused qid) +{ + return -EOPNOTSUPP; +} + +static inline int +ice_xsk_umem_query(struct ice_vsi __always_unused *vsi, + struct xdp_umem __always_unused **umem, + u16 __always_unused qid) +{ + return -EOPNOTSUPP; +} + +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +static inline void +ice_zca_free(struct zero_copy_allocator __always_unused *zca, + unsigned long __always_unused handle) +{ +} +#endif + +static inline int +ice_clean_rx_irq_zc(struct ice_ring __always_unused *rx_ring, + int __always_unused budget) +{ + return 0; +} + +static inline bool +ice_clean_tx_irq_zc(struct ice_ring __always_unused *xdp_ring) +{ + return false; +} + +static inline bool +#ifndef HAVE_MEM_TYPE_XSK_BUFF_POOL +ice_alloc_rx_bufs_slow_zc(struct ice_ring __always_unused *rx_ring, + u16 __always_unused count) +#else +ice_alloc_rx_bufs_zc(struct ice_ring __always_unused *rx_ring, + u16 __always_unused count) +#endif +{ + return false; +} + +static inline bool ice_xsk_any_rx_ring_ena(struct ice_vsi __always_unused *vsi) +{ + return false; +} + +#ifdef HAVE_NDO_XSK_WAKEUP +static inline int +ice_xsk_wakeup(struct net_device __always_unused *netdev, + u32 __always_unused queue_id, u32 __always_unused flags) +{ + return -EOPNOTSUPP; +} +#else +static inline int +ice_xsk_async_xmit(struct net_device __always_unused *netdev, + u32 __always_unused queue_id) +{ + return -EOPNOTSUPP; +} +#endif /* HAVE_NDO_XSK_WAKEUP */ + +static inline void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring) { } +static inline void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring) { } +#endif /* CONFIG_XDP_SOCKETS */ +#endif /* HAVE_AF_XDP_ZC_SUPPORT */ +#endif /* !_ICE_XSK_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat.c b/drivers/net/ethernet/intel/ice/kcompat.c new file mode 100644 index 0000000000000000000000000000000000000000..e5f8068c875c482def4018de4cf4c1f97645ca12 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat.c @@ -0,0 +1,1360 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel 
Corporation. */ + +#include "kcompat.h" + + + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) +#ifdef HAVE_FDB_OPS +#ifdef USE_CONST_DEV_UC_CHAR +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr, + u16 flags) +#else +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct net_device *dev, + unsigned char *addr, u16 flags) +#endif +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to + * implement its own handler for this. + */ + if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_add_excl(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_add_excl(dev, addr); + + /* Only return duplicate errors if NLM_F_EXCL is set */ + if (err == -EEXIST && !(flags & NLM_F_EXCL)) + err = 0; + + return err; +} + +#ifdef USE_CONST_DEV_UC_CHAR +#ifdef HAVE_FDB_DEL_NLATTR +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr) +#else +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, + const unsigned char *addr) +#endif +#else +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, + unsigned char *addr) +#endif +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to + * implement its own handler for this. + */ + if (!(ndm->ndm_state & NUD_PERMANENT)) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_del(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_del(dev, addr); + + return err; +} + +#endif /* HAVE_FDB_OPS */ +#ifdef CONFIG_PCI_IOV +int __kc_pci_vfs_assigned(struct pci_dev __maybe_unused *dev) +{ + unsigned int vfs_assigned = 0; +#ifdef HAVE_PCI_DEV_FLAGS_ASSIGNED + int pos; + struct pci_dev *vfdev; + unsigned short dev_id; + + /* only search if we are a PF */ + if (!dev->is_physfn) + return 0; + + /* find SR-IOV capability */ + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV); + if (!pos) + return 0; + + /* + * determine the device ID for the VFs, the vendor ID will be the + * same as the PF so there is no need to check for that one + */ + pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &dev_id); + + /* loop through all the VFs to see if we own any that are assigned */ + vfdev = pci_get_device(dev->vendor, dev_id, NULL); + while (vfdev) { + /* + * It is considered assigned if it is a virtual function with + * our dev as the physical function and the assigned bit is set + */ + if (vfdev->is_virtfn && (vfdev->physfn == dev) && + (vfdev->dev_flags & PCI_DEV_FLAGS_ASSIGNED)) + vfs_assigned++; + + vfdev = pci_get_device(dev->vendor, dev_id, vfdev); + } + +#endif /* HAVE_PCI_DEV_FLAGS_ASSIGNED */ + return vfs_assigned; +} + +#endif /* CONFIG_PCI_IOV */ +#endif /* 3.10.0 */ + +static const unsigned char __maybe_unused pcie_link_speed[] = { + PCI_SPEED_UNKNOWN, /* 0 */ + PCIE_SPEED_2_5GT, /* 1 */ + PCIE_SPEED_5_0GT, /* 2 */ + PCIE_SPEED_8_0GT, /* 3 */ + PCIE_SPEED_16_0GT, /* 4 */ + PCI_SPEED_UNKNOWN, /* 5 */ + PCI_SPEED_UNKNOWN, /* 6 */ + PCI_SPEED_UNKNOWN, /* 7 */ + PCI_SPEED_UNKNOWN, /* 8 */ + PCI_SPEED_UNKNOWN, /* 9 */ + PCI_SPEED_UNKNOWN, /* A */ + 
PCI_SPEED_UNKNOWN, /* B */ + PCI_SPEED_UNKNOWN, /* C */ + PCI_SPEED_UNKNOWN, /* D */ + PCI_SPEED_UNKNOWN, /* E */ + PCI_SPEED_UNKNOWN /* F */ +}; + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0) ) +int __kc_pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + while (dev) { + u16 lnksta; + enum pci_bus_speed next_speed; + enum pcie_link_width next_width; + int ret = pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); + + if (ret) + return ret; + + next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS]; + next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + if (next_speed < *speed) + *speed = next_speed; + + if (next_width < *width) + *width = next_width; + + dev = dev->bus->self; + } + + return 0; +} + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6,7)) +int _kc_pci_wait_for_pending_transaction(struct pci_dev *dev) +{ + int i; + u16 status; + + /* Wait for Transaction Pending bit clean */ + for (i = 0; i < 4; i++) { + if (i) + msleep((1 << (i - 1)) * 100); + + pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &status); + if (!(status & PCI_EXP_DEVSTA_TRPND)) + return 1; + } + + return 0; +} +#endif /* crs_timeout) { + printk(KERN_WARNING "pci %04x:%02x:%02x.%d: not " + "responding\n", pci_domain_nr(bus), + bus->number, PCI_SLOT(devfn), + PCI_FUNC(devfn)); + return false; + } + } + + return true; +} + +bool _kc_pci_device_is_present(struct pci_dev *pdev) +{ + u32 v; + + return _kc_pci_bus_read_dev_vendor_id(pdev->bus, pdev->devfn, &v, 0); +} +#endif /* nexthdr; + bool found; + +#define __KC_IP6_FH_F_FRAG BIT(0) +#define __KC_IP6_FH_F_AUTH BIT(1) +#define __KC_IP6_FH_F_SKIP_RH BIT(2) + + if (fragoff) + *fragoff = 0; + + if (*offset) { + struct ipv6hdr _ip6, *ip6; + + ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); + if (!ip6 || (ip6->version != 6)) { + printk(KERN_ERR "IPv6 header not found\n"); + return -EBADMSG; + } + start = *offset + sizeof(struct ipv6hdr); + nexthdr = ip6->nexthdr; + } + + do { + struct ipv6_opt_hdr _hdr, *hp; + unsigned int hdrlen; + found = (nexthdr == target); + + if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { + if (target < 0 || found) + break; + return -ENOENT; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (!hp) + return -EBADMSG; + + if (nexthdr == NEXTHDR_ROUTING) { + struct ipv6_rt_hdr _rh, *rh; + + rh = skb_header_pointer(skb, start, sizeof(_rh), + &_rh); + if (!rh) + return -EBADMSG; + + if (flags && (*flags & __KC_IP6_FH_F_SKIP_RH) && + rh->segments_left == 0) + found = false; + } + + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short _frag_off; + __be16 *fp; + + if (flags) /* Indicate that this is a fragment */ + *flags |= __KC_IP6_FH_F_FRAG; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (!fp) + return -EBADMSG; + + _frag_off = ntohs(*fp) & ~0x7; + if (_frag_off) { + if (target < 0 && + ((!ipv6_ext_hdr(hp->nexthdr)) || + hp->nexthdr == NEXTHDR_NONE)) { + if (fragoff) + *fragoff = _frag_off; + return hp->nexthdr; + } + return -ENOENT; + } + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { + if (flags && (*flags & __KC_IP6_FH_F_AUTH) && (target < 0)) + break; + hdrlen = (hp->hdrlen + 2) << 2; + } else + hdrlen = ipv6_optlen(hp); + + if (!found) { + nexthdr = hp->nexthdr; + start += hdrlen; + } + } while 
(!found); + + *offset = start; + return nexthdr; +} + +int __kc_pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, + int minvec, int maxvec) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + do { + rc = pci_enable_msix(dev, entries, nvec); + if (rc < 0) { + return rc; + } else if (rc > 0) { + if (rc < minvec) + return -ENOSPC; + nvec = rc; + } + } while (rc); + + return nvec; +} +#endif /* 3.14.0 */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0)) +char *_kc_devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) +{ + size_t size; + char *buf; + + if (!s) + return NULL; + + size = strlen(s) + 1; + buf = devm_kzalloc(dev, size, gfp); + if (buf) + memcpy(buf, s, size); + return buf; +} + +void __kc_netdev_rss_key_fill(void *buffer, size_t len) +{ + /* Set of random keys generated using kernel random number generator */ + static const u8 seed[NETDEV_RSS_KEY_LEN] = {0xE6, 0xFA, 0x35, 0x62, + 0x95, 0x12, 0x3E, 0xA3, 0xFB, 0x46, 0xC1, 0x5F, + 0xB1, 0x43, 0x82, 0x5B, 0x6A, 0x49, 0x50, 0x95, + 0xCD, 0xAB, 0xD8, 0x11, 0x8F, 0xC5, 0xBD, 0xBC, + 0x6A, 0x4A, 0xB2, 0xD4, 0x1F, 0xFE, 0xBC, 0x41, + 0xBF, 0xAC, 0xB2, 0x9A, 0x8F, 0x70, 0xE9, 0x2A, + 0xD7, 0xB2, 0x80, 0xB6, 0x5B, 0xAA, 0x9D, 0x20}; + + BUG_ON(len > NETDEV_RSS_KEY_LEN); + memcpy(buffer, seed, len); +} +#endif /* 3.15.0 */ + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) ) +#ifdef HAVE_SET_RX_MODE +#ifdef NETDEV_HW_ADDR_T_UNICAST +int __kc_hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + int err; + + /* first go through and flush out any stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + if (!ha->synced || ha->refcount != 1) +#else + if (!ha->sync_cnt || ha->refcount != 1) +#endif + continue; + + if (unsync && unsync(dev, ha->addr)) + continue; + + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + } + + /* go through and sync new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + if (ha->synced) +#else + if (ha->sync_cnt) +#endif + continue; + + err = sync(dev, ha->addr); + if (err) + return err; +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + ha->synced = true; +#else + ha->sync_cnt++; +#endif + ha->refcount++; + } + + return 0; +} + +void __kc_hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + if (!ha->synced) +#else + if (!ha->sync_cnt) +#endif + continue; + + if (unsync && unsync(dev, ha->addr)) + continue; + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) + ha->synced = false; +#else + ha->sync_cnt--; +#endif + if (--ha->refcount) + continue; + + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + } +} + +#endif /* NETDEV_HW_ADDR_T_UNICAST */ +#ifndef NETDEV_HW_ADDR_T_MULTICAST +int __kc_dev_addr_sync_dev(struct dev_addr_list **list, int *count, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, const unsigned char *)) +{ + struct dev_addr_list *da, **next = list; + int err; + 
+ /* first go through and flush out any stale entries */ + while ((da = *next) != NULL) { + if (da->da_synced && da->da_users == 1) { + if (!unsync || !unsync(dev, da->da_addr)) { + *next = da->next; + kfree(da); + (*count)--; + continue; + } + } + next = &da->next; + } + + /* go through and sync new entries to the list */ + for (da = *list; da != NULL; da = da->next) { + if (da->da_synced) + continue; + + err = sync(dev, da->da_addr); + if (err) + return err; + + da->da_synced++; + da->da_users++; + } + + return 0; +} + +void __kc_dev_addr_unsync_dev(struct dev_addr_list **list, int *count, + struct net_device *dev, + int (*unsync)(struct net_device *, const unsigned char *)) +{ + struct dev_addr_list *da; + + while ((da = *list) != NULL) { + if (da->da_synced) { + if (!unsync || !unsync(dev, da->da_addr)) { + da->da_synced--; + if (--da->da_users == 0) { + *list = da->next; + kfree(da); + (*count)--; + continue; + } + } + } + list = &da->next; + } +} +#endif /* NETDEV_HW_ADDR_T_MULTICAST */ +#endif /* HAVE_SET_RX_MODE */ +void *__kc_devm_kmemdup(struct device *dev, const void *src, size_t len, + gfp_t gfp) +{ + void *p; + + p = devm_kzalloc(dev, len, gfp); + if (p) + memcpy(p, src, len); + + return p; +} +#endif /* 3.16.0 */ + +/******************************************************************************/ +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5))) +int _kc_param_set_ullong(const char *val, const struct kernel_param *kp) +{ + return kstrtoull(val, 0, (unsigned long long *)kp->arg); +} +int _kc_param_get_ullong(char *buffer, const struct kernel_param *kp) +{ + return scnprintf(buffer, PAGE_SIZE, "%llu", + *((unsigned long long *)kp->arg)); +} +const struct kernel_param_ops _kc_param_ops_ullong = { + .set = _kc_param_set_ullong, + .get = _kc_param_get_ullong, +}; +#endif /* <3.17.0 && RHEL_RELEASE_CODE < RHEL7.5 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) +static void __kc_sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} + +struct sk_buff *__kc_skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; + } + + clone->sk = sk; + clone->destructor = __kc_sock_efree; + + return clone; +} + +void __kc_skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock_exterr_skb *serr; + struct sock *sk = skb->sk; + int err; + + sock_hold(sk); + + *skb_hwtstamps(skb) = *hwtstamps; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); + + sock_put(sk); +} + +/* include headers needed for get_headlen function */ +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) +#include +#endif +#ifdef HAVE_SCTP +#include +#endif + +u32 __kc_eth_get_headlen(const struct net_device __always_unused *dev, + unsigned char *data, unsigned int max_len) +{ + union { + unsigned char *network; + /* l2 headers */ + struct ethhdr *eth; + struct vlan_hdr *vlan; + /* l3 headers */ + struct iphdr *ipv4; + struct ipv6hdr *ipv6; + } hdr; + __be16 proto; + u8 nexthdr = 0; /* default to not TCP */ + u8 hlen; + + /* this should never happen, but better safe than sorry */ + if 
(max_len < ETH_HLEN) + return max_len; + + /* initialize network frame pointer */ + hdr.network = data; + + /* set first protocol and move network header forward */ + proto = hdr.eth->h_proto; + hdr.network += ETH_HLEN; + +again: + switch (proto) { + /* handle any vlan tag if present */ + case __constant_htons(ETH_P_8021AD): + case __constant_htons(ETH_P_8021Q): + if ((hdr.network - data) > (max_len - VLAN_HLEN)) + return max_len; + + proto = hdr.vlan->h_vlan_encapsulated_proto; + hdr.network += VLAN_HLEN; + goto again; + /* handle L3 protocols */ + case __constant_htons(ETH_P_IP): + if ((hdr.network - data) > (max_len - sizeof(struct iphdr))) + return max_len; + + /* access ihl as a u8 to avoid unaligned access on ia64 */ + hlen = (hdr.network[0] & 0x0F) << 2; + + /* verify hlen meets minimum size requirements */ + if (hlen < sizeof(struct iphdr)) + return hdr.network - data; + + /* record next protocol if header is present */ + if (!(hdr.ipv4->frag_off & htons(IP_OFFSET))) + nexthdr = hdr.ipv4->protocol; + + hdr.network += hlen; + break; +#ifdef NETIF_F_TSO6 + case __constant_htons(ETH_P_IPV6): + if ((hdr.network - data) > (max_len - sizeof(struct ipv6hdr))) + return max_len; + + /* record next protocol */ + nexthdr = hdr.ipv6->nexthdr; + hdr.network += sizeof(struct ipv6hdr); + break; +#endif /* NETIF_F_TSO6 */ +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) + case __constant_htons(ETH_P_FCOE): + hdr.network += FCOE_HEADER_LEN; + break; +#endif + default: + return hdr.network - data; + } + + /* finally sort out L4 */ + switch (nexthdr) { + case IPPROTO_TCP: + if ((hdr.network - data) > (max_len - sizeof(struct tcphdr))) + return max_len; + + /* access doff as a u8 to avoid unaligned access on ia64 */ + hdr.network += max_t(u8, sizeof(struct tcphdr), + (hdr.network[12] & 0xF0) >> 2); + + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + hdr.network += sizeof(struct udphdr); + break; +#ifdef HAVE_SCTP + case IPPROTO_SCTP: + hdr.network += sizeof(struct sctphdr); + break; +#endif + } + + /* + * If everything has gone correctly hdr.network should be the + * data section of the packet and will be the end of the header. + * If not then it probably represents the end of the last recognized + * header. + */ + return min_t(unsigned int, hdr.network - data, max_len); +} + +#endif /* < 3.18.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) +#ifdef HAVE_NET_GET_RANDOM_ONCE +static u8 __kc_netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +void __kc_netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(__kc_netdev_rss_key)); + net_get_random_once(__kc_netdev_rss_key, sizeof(__kc_netdev_rss_key)); + memcpy(buffer, __kc_netdev_rss_key, len); +} +#endif + +int _kc_bitmap_print_to_pagebuf(bool list, char *buf, + const unsigned long *maskp, + int nmaskbits) +{ + ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf - 2; + int n = 0; + + if (len > 1) { + n = list ? 
bitmap_scnlistprintf(buf, len, maskp, nmaskbits) : + bitmap_scnprintf(buf, len, maskp, nmaskbits); + buf[n++] = '\n'; + buf[n] = '\0'; + } + return n; +} +#endif + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) ) +#if !((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,8) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) && \ + (SLE_VERSION_CODE > SLE_VERSION(12,1,0))) +unsigned int _kc_cpumask_local_spread(unsigned int i, int node) +{ + int cpu; + + /* Wrap: we always want a cpu. */ + i %= num_online_cpus(); + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) ) + /* Kernels prior to 2.6.28 do not have for_each_cpu or + * cpumask_of_node, so just use for_each_online_cpu() + */ + for_each_online_cpu(cpu) + if (i-- == 0) + return cpu; + + return 0; +#else + if (node == -1) { + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; + } else { + /* NUMA first. */ + for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) + if (i-- == 0) + return cpu; + + for_each_cpu(cpu, cpu_online_mask) { + /* Skip NUMA nodes, done above. */ + if (cpumask_test_cpu(cpu, cpumask_of_node(node))) + continue; + + if (i-- == 0) + return cpu; + } + } +#endif /* KERNEL_VERSION >= 2.6.28 */ + BUG(); +} +#endif +#endif + +/******************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,2,0))) +/** + * _kc_skb_flow_dissect_flow_keys - parse SKB to fill _kc_flow_keys + * @skb: SKB used to fill _kc_flow_keys + * @flow: _kc_flow_keys to set with SKB fields + * @flags: currently unused flags + * + * The purpose of using kcompat for this function is so the caller doesn't have + * to care about which kernel version they are on, which prevents a larger than + * normal #ifdef mess created by using a HAVE_* flag for this case. This is also + * done for 4.2 kernels to simplify calling skb_flow_dissect_flow_keys() + * because in 4.2 kernels skb_flow_dissect_flow_keys() exists, but only has 2 + * arguments. Recent kernels have skb_flow_dissect_flow_keys() that has 3 + * arguments. + * + * The caller needs to understand that this function was only implemented as a + * bare-minimum replacement for recent versions of skb_flow_dissect_flow_keys() + * and this function is in no way similar to skb_flow_dissect_flow_keys(). An + * example use can be found in the ice driver, specifically ice_arfs.c. + * + * This function is treated as an allowlist of supported fields the SKB can + * parse. If new functionality is added, make sure to keep this format (i.e. only + * check for fields that are explicitly wanted). + * + * Current allowlist: + * + * TCPv4, TCPv6, UDPv4, UDPv6 + * + * If any unexpected protocol or other field is found this function memsets the + * flow passed in back to 0 and returns false. Otherwise the flow is populated + * and returns true.
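+ *
+ * Purely illustrative caller sketch, not part of this patch: assuming a
+ * hypothetical helper my_update_filter() that only needs the IP protocol
+ * and the L4 ports, a caller could do
+ *
+ *	struct _kc_flow_keys keys;
+ *
+ *	if (!_kc_skb_flow_dissect_flow_keys(skb, &keys, 0))
+ *		return;	/* unsupported protocol, nothing to steer */
+ *
+ *	my_update_filter(keys.basic.ip_proto, keys.ports.src, keys.ports.dst);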
+ */ +bool +_kc_skb_flow_dissect_flow_keys(const struct sk_buff *skb, + struct _kc_flow_keys *flow, + unsigned int __always_unused flags) +{ + memset(flow, 0, sizeof(*flow)); + + flow->basic.n_proto = skb->protocol; + switch (flow->basic.n_proto) { + case htons(ETH_P_IP): + flow->basic.ip_proto = ip_hdr(skb)->protocol; + flow->addrs.v4addrs.src = ip_hdr(skb)->saddr; + flow->addrs.v4addrs.dst = ip_hdr(skb)->daddr; + break; + case htons(ETH_P_IPV6): + flow->basic.ip_proto = ipv6_hdr(skb)->nexthdr; + memcpy(&flow->addrs.v6addrs.src, &ipv6_hdr(skb)->saddr, + sizeof(struct in6_addr)); + memcpy(&flow->addrs.v6addrs.dst, &ipv6_hdr(skb)->daddr, + sizeof(struct in6_addr)); + break; + default: + netdev_dbg(skb->dev, "%s: Unsupported/unimplemented layer 3 protocol %04x\n", __func__, htons(flow->basic.n_proto)); + goto unsupported; + } + + switch (flow->basic.ip_proto) { + case IPPROTO_TCP: + { + struct tcphdr *tcph; + + tcph = tcp_hdr(skb); + flow->ports.src = tcph->source; + flow->ports.dst = tcph->dest; + break; + } + case IPPROTO_UDP: + { + struct udphdr *udph; + + udph = udp_hdr(skb); + flow->ports.src = udph->source; + flow->ports.dst = udph->dest; + break; + } + default: + netdev_dbg(skb->dev, "%s: Unsupported/unimplemented layer 4 protocol %02x\n", __func__, flow->basic.ip_proto); + return false; + } + + return true; + +unsupported: + memset(flow, 0, sizeof(*flow)); + return false; +} +#endif /* ! >= RHEL7.4 && ! >= SLES12.2 */ +#endif /* 4.3.0 */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) ) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) +#ifdef CONFIG_SPARC +#include +#include +#endif +int _kc_eth_platform_get_mac_address(struct device *dev __maybe_unused, + u8 *mac_addr __maybe_unused) +{ +#if (((LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0)) && defined(CONFIG_OF) && \ + !defined(HAVE_STRUCT_DEVICE_OF_NODE) || !defined(CONFIG_OF)) && \ + !defined(CONFIG_SPARC)) + return -ENODEV; +#else + const unsigned char *addr; + struct device_node *dp; + + if (dev_is_pci(dev)) + dp = pci_device_to_OF_node(to_pci_dev(dev)); + else +#if defined(HAVE_STRUCT_DEVICE_OF_NODE) && defined(CONFIG_OF) + dp = dev->of_node; +#else + dp = NULL; +#endif + + addr = NULL; + if (dp) + addr = of_get_mac_address(dp); +#ifdef CONFIG_SPARC + /* Kernel hasn't implemented arch_get_platform_mac_address, but we + * should handle the SPARC case here since it was supported + * originally. This is replaced by arch_get_platform_mac_address() + * upstream. 
+ */ + if (!addr) + addr = idprom->id_ethaddr; +#endif + if (!addr) + return -ENODEV; + + ether_addr_copy(mac_addr, addr); + return 0; +#endif +} +#endif /* !(RHEL_RELEASE >= 7.3) */ +#endif /* < 4.5.0 */ + +/*****************************************************************************/ +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0)) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE <= SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,5)))) +const char *_kc_phy_speed_to_str(int speed) +{ + switch (speed) { + case SPEED_10: + return "10Mbps"; + case SPEED_100: + return "100Mbps"; + case SPEED_1000: + return "1Gbps"; + case SPEED_2500: + return "2.5Gbps"; + case SPEED_5000: + return "5Gbps"; + case SPEED_10000: + return "10Gbps"; + case SPEED_14000: + return "14Gbps"; + case SPEED_20000: + return "20Gbps"; + case SPEED_25000: + return "25Gbps"; + case SPEED_40000: + return "40Gbps"; + case SPEED_50000: + return "50Gbps"; + case SPEED_56000: + return "56Gbps"; +#ifdef SPEED_100000 + case SPEED_100000: + return "100Gbps"; +#endif + case SPEED_UNKNOWN: + return "Unknown"; + default: + return "Unsupported (update phy-core.c)"; + } +} +#endif /* (LINUX < 4.14.0) || (SLES <= 12.3.0) || (RHEL <= 7.5) */ + +/******************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) ) +void _kc_ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src) +{ + unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int idx = 0; + + for (; idx < size; idx++) { + dst->link_modes.supported[idx] &= + src->link_modes.supported[idx]; + dst->link_modes.advertising[idx] &= + src->link_modes.advertising[idx]; + } +} +#endif /* 4.15.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0)) +#if !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,5,0) && \ + SLE_VERSION_CODE < SLE_VERSION(15,0,0) || \ + SLE_VERSION_CODE >= SLE_VERSION(15,1,0)) +#if BITS_PER_LONG == 64 +/** + * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap + * @bitmap: array of unsigned longs, the destination bitmap + * @buf: array of u32 (in host byte order), the source bitmap + * @nbits: number of bits in @bitmap + */ +void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits) +{ + unsigned int i, halfwords; + + halfwords = DIV_ROUND_UP(nbits, 32); + for (i = 0; i < halfwords; i++) { + bitmap[i/2] = (unsigned long) buf[i]; + if (++i < halfwords) + bitmap[i/2] |= ((unsigned long) buf[i]) << 32; + } + + /* Clear tail bits in last word beyond nbits. */ + if (nbits % BITS_PER_LONG) + bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits); +} +#endif /* BITS_PER_LONG == 64 */ +#endif /* !(RHEL >= 8.0) && !(SLES >= 12.5 && SLES < 15.0 || SLES >= 15.1) */ +#endif /* 4.16.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,17,0)) +/* PCIe link information */ +#define PCIE_SPEED2STR(speed) \ + ((speed) == PCIE_SPEED_16_0GT ? "16 GT/s" : \ + (speed) == PCIE_SPEED_8_0GT ? "8 GT/s" : \ + (speed) == PCIE_SPEED_5_0GT ? "5 GT/s" : \ + (speed) == PCIE_SPEED_2_5GT ? "2.5 GT/s" : \ + "Unknown speed") + +/* PCIe speed to Mb/s reduced by encoding overhead */ +#define PCIE_SPEED2MBS_ENC(speed) \ + ((speed) == PCIE_SPEED_16_0GT ? 
16000*128/130 : \ + (speed) == PCIE_SPEED_8_0GT ? 8000*128/130 : \ + (speed) == PCIE_SPEED_5_0GT ? 5000*8/10 : \ + (speed) == PCIE_SPEED_2_5GT ? 2500*8/10 : \ + 0) + +static u32 +_kc_pcie_bandwidth_available(struct pci_dev *dev, + struct pci_dev **limiting_dev, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + u16 lnksta; + enum pci_bus_speed next_speed; + enum pcie_link_width next_width; + u32 bw, next_bw; + + if (speed) + *speed = PCI_SPEED_UNKNOWN; + if (width) + *width = PCIE_LNK_WIDTH_UNKNOWN; + + bw = 0; + + while (dev) { + pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); + + next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS]; + next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed); + + /* Check if current device limits the total bandwidth */ + if (!bw || next_bw <= bw) { + bw = next_bw; + + if (limiting_dev) + *limiting_dev = dev; + if (speed) + *speed = next_speed; + if (width) + *width = next_width; + } + + dev = pci_upstream_bridge(dev); + } + + return bw; +} + +static enum pci_bus_speed _kc_pcie_get_speed_cap(struct pci_dev *dev) +{ + u32 lnkcap2, lnkcap; + + /* + * PCIe r4.0 sec 7.5.3.18 recommends using the Supported Link + * Speeds Vector in Link Capabilities 2 when supported, falling + * back to Max Link Speed in Link Capabilities otherwise. + */ + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); + if (lnkcap2) { /* PCIe r3.0-compliant */ + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB) + return PCIE_SPEED_16_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) + return PCIE_SPEED_8_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB) + return PCIE_SPEED_5_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB) + return PCIE_SPEED_2_5GT; + return PCI_SPEED_UNKNOWN; + } + + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + if (lnkcap) { + if (lnkcap & PCI_EXP_LNKCAP_SLS_16_0GB) + return PCIE_SPEED_16_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_8_0GB) + return PCIE_SPEED_8_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_5_0GB) + return PCIE_SPEED_5_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_2_5GB) + return PCIE_SPEED_2_5GT; + } + + return PCI_SPEED_UNKNOWN; +} + +static enum pcie_link_width _kc_pcie_get_width_cap(struct pci_dev *dev) +{ + u32 lnkcap; + + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + if (lnkcap) + return (lnkcap & PCI_EXP_LNKCAP_MLW) >> 4; + + return PCIE_LNK_WIDTH_UNKNOWN; +} + +static u32 +_kc_pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + *speed = _kc_pcie_get_speed_cap(dev); + *width = _kc_pcie_get_width_cap(dev); + + if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) + return 0; + + return *width * PCIE_SPEED2MBS_ENC(*speed); +} + +void _kc_pcie_print_link_status(struct pci_dev *dev) { + enum pcie_link_width width, width_cap; + enum pci_bus_speed speed, speed_cap; + struct pci_dev *limiting_dev = NULL; + u32 bw_avail, bw_cap; + + bw_cap = _kc_pcie_bandwidth_capable(dev, &speed_cap, &width_cap); + bw_avail = _kc_pcie_bandwidth_available(dev, &limiting_dev, &speed, + &width); + + if (bw_avail >= bw_cap) + pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth (%s x%d link)\n", + bw_cap / 1000, bw_cap % 1000, + PCIE_SPEED2STR(speed_cap), width_cap); + else + pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth, limited by %s x%d link at %s (capable of %u.%03u Gb/s with %s x%d link)\n", + bw_avail / 1000, bw_avail % 1000, + PCIE_SPEED2STR(speed), width, + 
limiting_dev ? pci_name(limiting_dev) : "", + bw_cap / 1000, bw_cap % 1000, + PCIE_SPEED2STR(speed_cap), width_cap); +} +#endif /* 4.17.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,1,0)) || (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,1))) +#ifdef HAVE_TC_SETUP_CLSFLOWER +#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ + const struct flow_match *__m = &(__rule)->match; \ + struct flow_dissector *__d = (__m)->dissector; \ + \ + (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ + (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ + +void flow_rule_match_basic(const struct flow_rule *rule, + struct flow_match_basic *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out); +} + +void flow_rule_match_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out); +} + +void flow_rule_match_eth_addrs(const struct flow_rule *rule, + struct flow_match_eth_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out); +} + +#ifdef HAVE_TC_FLOWER_ENC +void flow_rule_match_enc_keyid(const struct flow_rule *rule, + struct flow_match_enc_keyid *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out); +} + +void flow_rule_match_enc_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out); +} + +void flow_rule_match_enc_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out); +} + +void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out); +} + +void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out); +} +#endif + +#ifndef HAVE_TC_FLOWER_VLAN_IN_TAGS +void flow_rule_match_vlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out); +} +#endif + +void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out); +} + +void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out); +} + +void flow_rule_match_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out); +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ +#endif /* 5.1.0 || (RHEL && RHEL < 8.1) */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)) +#if (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)))) +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +int _kc_flow_block_cb_setup_simple(struct flow_block_offload *f, + struct list_head __always_unused *driver_list, + tc_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + bool ingress_only) +{ + if (ingress_only && + f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + /* Note: Upstream has driver_block_list, but older kernels do not */ + switch (f->command) { + case TC_BLOCK_BIND: 
+#ifdef HAVE_TCF_BLOCK_CB_REGISTER_EXTACK + return tcf_block_cb_register(f->block, cb, cb_ident, cb_priv, + f->extack); +#else + return tcf_block_cb_register(f->block, cb, cb_ident, cb_priv); +#endif + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, cb, cb_ident); + return 0; + default: + return -EOPNOTSUPP; + } +} +#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#endif /* !RHEL >= 8.2 */ +#endif /* 5.3.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,7,0)) +u64 _kc_pci_get_dsn(struct pci_dev *dev) +{ + u32 dword; + u64 dsn; + int pos; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DSN); + if (!pos) + return 0; + + /* + * The Device Serial Number is two dwords offset 4 bytes from the + * capability position. The specification says that the first dword is + * the lower half, and the second dword is the upper half. + */ + pos += 4; + pci_read_config_dword(dev, pos, &dword); + dsn = (u64)dword; + pci_read_config_dword(dev, pos + 4, &dword); + dsn |= ((u64)dword) << 32; + + return dsn; +} +#endif /* 5.7.0 */ diff --git a/drivers/net/ethernet/intel/ice/kcompat.h b/drivers/net/ethernet/intel/ice/kcompat.h new file mode 100644 index 0000000000000000000000000000000000000000..6549382150a6a4918e42ed2c930d3ecc5eec7115 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat.h @@ -0,0 +1,3558 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#ifndef LINUX_VERSION_CODE +#include +#else +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef GCC_VERSION +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#endif /* GCC_VERSION */ + +#ifndef IEEE_8021QAZ_APP_SEL_DSCP +#define IEEE_8021QAZ_APP_SEL_DSCP 5 +#endif + +/* Backport macros for controlling GCC diagnostics */ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0) ) + +/* Compilers before gcc-4.6 do not understand "#pragma GCC diagnostic push" */ +#if GCC_VERSION >= 40600 +#define __diag_str1(s) #s +#define __diag_str(s) __diag_str1(s) +#define __diag(s) _Pragma(__diag_str(GCC diagnostic s)) +#else +#define __diag(s) +#endif /* GCC_VERSION >= 4.6 */ +#define __diag_push() __diag(push) +#define __diag_pop() __diag(pop) +#endif /* LINUX_VERSION < 4.18.0 */ + +#ifndef NSEC_PER_MSEC +#define NSEC_PER_MSEC 1000000L +#endif +#include +/* UTS_RELEASE is in a different header starting in kernel 2.6.18 */ +#ifndef UTS_RELEASE +/* utsrelease.h changed locations in 2.6.33 */ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ) +#include +#else +#include +#endif +#endif + + +#define adapter_struct ice_pf +#define adapter_q_vector ice_q_vector + + +/* Dynamic LTR and deeper C-State support disable/enable */ + +/* packet split disable/enable */ +#ifdef DISABLE_PACKET_SPLIT +#endif /* DISABLE_PACKET_SPLIT */ + +/* MSI compatibility code for all kernels and drivers */ +#ifdef DISABLE_PCI_MSI +#undef CONFIG_PCI_MSI +#endif +#ifndef CONFIG_PCI_MSI +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) ) +struct msix_entry { + u16 vector; /* kernel uses to write allocated vector */ + u16 entry; /* driver uses to specify entry, OS writes */ +}; +#endif +#undef pci_enable_msi 
+#define pci_enable_msi(a) -ENOTSUPP +#undef pci_disable_msi +#define pci_disable_msi(a) do {} while (0) +#undef pci_enable_msix +#define pci_enable_msix(a, b, c) -ENOTSUPP +#undef pci_disable_msix +#define pci_disable_msix(a) do {} while (0) +#define msi_remove_pci_irq_vectors(a) do {} while (0) +#endif /* CONFIG_PCI_MSI */ +#ifdef DISABLE_PM +#undef CONFIG_PM +#endif + +#ifdef DISABLE_NET_POLL_CONTROLLER +#undef CONFIG_NET_POLL_CONTROLLER +#endif + +#ifndef PMSG_SUSPEND +#define PMSG_SUSPEND 3 +#endif + +/* generic boolean compatibility */ +#undef TRUE +#undef FALSE +#define TRUE true +#define FALSE false +#ifdef GCC_VERSION +#if ( GCC_VERSION < 3000 ) +#define _Bool char +#endif +#else +#define _Bool char +#endif + + +#undef __always_unused +#define __always_unused __attribute__((__unused__)) + +#undef __maybe_unused +#define __maybe_unused __attribute__((__unused__)) + +/* kernels less than 2.4.14 don't have this */ +#ifndef ETH_P_8021Q +#define ETH_P_8021Q 0x8100 +#endif + +#ifndef module_param +#define module_param(v,t,p) MODULE_PARM(v, "i"); +#endif + +#ifndef DMA_64BIT_MASK +#define DMA_64BIT_MASK 0xffffffffffffffffULL +#endif + +#ifndef DMA_32BIT_MASK +#define DMA_32BIT_MASK 0x00000000ffffffffULL +#endif + +#ifndef PCI_CAP_ID_EXP +#define PCI_CAP_ID_EXP 0x10 +#endif + +#ifndef uninitialized_var +#define uninitialized_var(x) x = x +#endif + +#ifndef PCIE_LINK_STATE_L0S +#define PCIE_LINK_STATE_L0S 1 +#endif +#ifndef PCIE_LINK_STATE_L1 +#define PCIE_LINK_STATE_L1 2 +#endif + +#ifndef SET_NETDEV_DEV +#define SET_NETDEV_DEV(net, pdev) +#endif + +#if !defined(HAVE_FREE_NETDEV) && ( LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0) ) +#define free_netdev(x) kfree(x) +#endif + +#ifdef HAVE_POLL_CONTROLLER +#define CONFIG_NET_POLL_CONTROLLER +#endif + +#ifndef SKB_DATAREF_SHIFT +/* if we do not have the infrastructure to detect if skb_header is cloned + just return false in all cases */ +#define skb_header_cloned(x) 0 +#endif + +#ifndef NETIF_F_GSO +#define gso_size tso_size +#define gso_segs tso_segs +#endif + +#ifndef NETIF_F_GRO +#define vlan_gro_receive(_napi, _vlgrp, _vlan, _skb) \ + vlan_hwaccel_receive_skb(_skb, _vlgrp, _vlan) +#define napi_gro_receive(_napi, _skb) netif_receive_skb(_skb) +#endif + +#ifndef NETIF_F_SCTP_CSUM +#define NETIF_F_SCTP_CSUM 0 +#endif + +#ifndef NETIF_F_LRO +#define NETIF_F_LRO BIT(15) +#endif + +#ifndef NETIF_F_NTUPLE +#define NETIF_F_NTUPLE BIT(27) +#endif + +#ifndef NETIF_F_ALL_FCOE +#define NETIF_F_ALL_FCOE (NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \ + NETIF_F_FSO) +#endif + +#ifndef IPPROTO_SCTP +#define IPPROTO_SCTP 132 +#endif + +#ifndef IPPROTO_UDPLITE +#define IPPROTO_UDPLITE 136 +#endif + +#ifndef CHECKSUM_PARTIAL +#define CHECKSUM_PARTIAL CHECKSUM_HW +#define CHECKSUM_COMPLETE CHECKSUM_HW +#endif + +#ifndef __read_mostly +#define __read_mostly +#endif + +#ifndef MII_RESV1 +#define MII_RESV1 0x17 /* Reserved... 
*/ +#endif + +#ifndef unlikely +#define unlikely(_x) _x +#define likely(_x) _x +#endif + +#ifndef WARN_ON +#define WARN_ON(x) ({0;}) +#endif + +#ifndef PCI_DEVICE +#define PCI_DEVICE(vend,dev) \ + .vendor = (vend), .device = (dev), \ + .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID +#endif + +#ifndef node_online +#define node_online(node) ((node) == 0) +#endif + +#ifndef _LINUX_RANDOM_H +#include +#endif + +#ifndef BITS_PER_TYPE +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#endif + +#ifndef BITS_TO_LONGS +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) +#endif + +#ifndef DECLARE_BITMAP +#define DECLARE_BITMAP(name,bits) long name[BITS_TO_LONGS(bits)] +#endif + +#ifndef VLAN_HLEN +#define VLAN_HLEN 4 +#endif + +#ifndef VLAN_ETH_HLEN +#define VLAN_ETH_HLEN 18 +#endif + +#ifndef VLAN_ETH_FRAME_LEN +#define VLAN_ETH_FRAME_LEN 1518 +#endif + +#ifndef DCA_GET_TAG_TWO_ARGS +#define dca3_get_tag(a,b) dca_get_tag(b) +#endif + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +#if defined(__i386__) || defined(__x86_64__) +#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +#endif +#endif + +/* taken from 2.6.24 definition in linux/kernel.h */ +#ifndef IS_ALIGNED +#define IS_ALIGNED(x,a) (((x) % ((typeof(x))(a))) == 0) +#endif + +#ifdef IS_ENABLED +#undef IS_ENABLED +#undef __ARG_PLACEHOLDER_1 +#undef config_enabled +#undef _config_enabled +#undef __config_enabled +#undef ___config_enabled +#endif + +#define __ARG_PLACEHOLDER_1 0, +#define config_enabled(cfg) _config_enabled(cfg) +#ifdef __CHECKER__ +/* cppcheck-suppress preprocessorErrorDirective */ +#endif /* __CHECKER__ */ +#define _config_enabled(value) __config_enabled(__ARG_PLACEHOLDER_##value) +#define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0) +#define ___config_enabled(__ignored, val, ...) 
val + +#define IS_ENABLED(option) \ + (config_enabled(option) || config_enabled(option##_MODULE)) + +#if !defined(NETIF_F_HW_VLAN_TX) && !defined(NETIF_F_HW_VLAN_CTAG_TX) +struct _kc_vlan_ethhdr { + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; +#define vlan_ethhdr _kc_vlan_ethhdr +struct _kc_vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; +#define vlan_hdr _kc_vlan_hdr +#define vlan_tx_tag_present(_skb) 0 +#define vlan_tx_tag_get(_skb) 0 +#endif /* NETIF_F_HW_VLAN_TX && NETIF_F_HW_VLAN_CTAG_TX */ + +#ifndef VLAN_PRIO_SHIFT +#define VLAN_PRIO_SHIFT 13 +#endif + +#ifndef PCI_EXP_LNKSTA_CLS_2_5GB +#define PCI_EXP_LNKSTA_CLS_2_5GB 0x0001 +#endif + +#ifndef PCI_EXP_LNKSTA_CLS_5_0GB +#define PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 +#endif + +#ifndef PCI_EXP_LNKSTA_CLS_8_0GB +#define PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 +#endif + +#ifndef PCI_EXP_LNKSTA_NLW_X1 +#define PCI_EXP_LNKSTA_NLW_X1 0x0010 +#endif + +#ifndef PCI_EXP_LNKSTA_NLW_X2 +#define PCI_EXP_LNKSTA_NLW_X2 0x0020 +#endif + +#ifndef PCI_EXP_LNKSTA_NLW_X4 +#define PCI_EXP_LNKSTA_NLW_X4 0x0040 +#endif + +#ifndef PCI_EXP_LNKSTA_NLW_X8 +#define PCI_EXP_LNKSTA_NLW_X8 0x0080 +#endif + + +#ifndef __GFP_COLD +#define __GFP_COLD 0 +#endif + +#ifndef __GFP_COMP +#define __GFP_COMP 0 +#endif + +#ifndef IP_OFFSET +#define IP_OFFSET 0x1FFF /* "Fragment Offset" part */ +#endif + +/*****************************************************************************/ +/* Installations with ethtool version without eeprom, adapter id, or statistics + * support */ + +#ifndef ETH_GSTRING_LEN +#define ETH_GSTRING_LEN 32 +#endif + +#ifndef ETHTOOL_GSTATS +#define ETHTOOL_GSTATS 0x1d +#undef ethtool_drvinfo +#define ethtool_drvinfo k_ethtool_drvinfo +struct k_ethtool_drvinfo { + u32 cmd; + char driver[32]; + char version[32]; + char fw_version[32]; + char bus_info[32]; + char reserved1[32]; + char reserved2[16]; + u32 n_stats; + u32 testinfo_len; + u32 eedump_len; + u32 regdump_len; +}; + +struct ethtool_stats { + u32 cmd; + u32 n_stats; + u64 data[0]; +}; +#endif /* ETHTOOL_GSTATS */ + +#ifndef ETHTOOL_PHYS_ID +#define ETHTOOL_PHYS_ID 0x1c +#endif /* ETHTOOL_PHYS_ID */ + +#ifndef ETHTOOL_GSTRINGS +#define ETHTOOL_GSTRINGS 0x1b +enum ethtool_stringset { + ETH_SS_TEST = 0, + ETH_SS_STATS, +}; +struct ethtool_gstrings { + u32 cmd; /* ETHTOOL_GSTRINGS */ + u32 string_set; /* string set id e.c. 
ETH_SS_TEST, etc*/ + u32 len; /* number of strings in the string set */ + u8 data[0]; +}; +#endif /* ETHTOOL_GSTRINGS */ + +#ifndef ETHTOOL_TEST +#define ETHTOOL_TEST 0x1a +enum ethtool_test_flags { + ETH_TEST_FL_OFFLINE = BIT(0), + ETH_TEST_FL_FAILED = BIT(1), +}; +struct ethtool_test { + u32 cmd; + u32 flags; + u32 reserved; + u32 len; + u64 data[0]; +}; +#endif /* ETHTOOL_TEST */ + +#ifndef ETHTOOL_GEEPROM +#define ETHTOOL_GEEPROM 0xb +#undef ETHTOOL_GREGS +struct ethtool_eeprom { + u32 cmd; + u32 magic; + u32 offset; + u32 len; + u8 data[0]; +}; + +struct ethtool_value { + u32 cmd; + u32 data; +}; +#endif /* ETHTOOL_GEEPROM */ + +#ifndef ETHTOOL_GLINK +#define ETHTOOL_GLINK 0xa +#endif /* ETHTOOL_GLINK */ + +#ifndef ETHTOOL_GWOL +#define ETHTOOL_GWOL 0x5 +#define ETHTOOL_SWOL 0x6 +#define SOPASS_MAX 6 +struct ethtool_wolinfo { + u32 cmd; + u32 supported; + u32 wolopts; + u8 sopass[SOPASS_MAX]; /* SecureOn(tm) password */ +}; +#endif /* ETHTOOL_GWOL */ + +#ifndef ETHTOOL_GREGS +#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers */ +#define ethtool_regs _kc_ethtool_regs +/* for passing big chunks of data */ +struct _kc_ethtool_regs { + u32 cmd; + u32 version; /* driver-specific, indicates different chips/revs */ + u32 len; /* bytes */ + u8 data[0]; +}; +#endif /* ETHTOOL_GREGS */ + +#ifndef ETHTOOL_GMSGLVL +#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ +#endif +#ifndef ETHTOOL_SMSGLVL +#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level, priv. */ +#endif +#ifndef ETHTOOL_NWAY_RST +#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation, priv */ +#endif +#ifndef ETHTOOL_GLINK +#define ETHTOOL_GLINK 0x0000000a /* Get link status */ +#endif +#ifndef ETHTOOL_GEEPROM +#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ +#endif +#ifndef ETHTOOL_SEEPROM +#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data */ +#endif +#ifndef ETHTOOL_GCOALESCE +#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ +/* for configuring coalescing parameters of chip */ +#define ethtool_coalesce _kc_ethtool_coalesce +struct _kc_ethtool_coalesce { + u32 cmd; /* ETHTOOL_{G,S}COALESCE */ + + /* How many usecs to delay an RX interrupt after + * a packet arrives. If 0, only rx_max_coalesced_frames + * is used. + */ + u32 rx_coalesce_usecs; + + /* How many packets to delay an RX interrupt after + * a packet arrives. If 0, only rx_coalesce_usecs is + * used. It is illegal to set both usecs and max frames + * to zero as this would cause RX interrupts to never be + * generated. + */ + u32 rx_max_coalesced_frames; + + /* Same as above two parameters, except that these values + * apply while an IRQ is being serviced by the host. Not + * all cards support this feature and the values are ignored + * in that case. + */ + u32 rx_coalesce_usecs_irq; + u32 rx_max_coalesced_frames_irq; + + /* How many usecs to delay a TX interrupt after + * a packet is sent. If 0, only tx_max_coalesced_frames + * is used. + */ + u32 tx_coalesce_usecs; + + /* How many packets to delay a TX interrupt after + * a packet is sent. If 0, only tx_coalesce_usecs is + * used. It is illegal to set both usecs and max frames + * to zero as this would cause TX interrupts to never be + * generated. + */ + u32 tx_max_coalesced_frames; + + /* Same as above two parameters, except that these values + * apply while an IRQ is being serviced by the host. Not + * all cards support this feature and the values are ignored + * in that case. 
+ */ + u32 tx_coalesce_usecs_irq; + u32 tx_max_coalesced_frames_irq; + + /* How many usecs to delay in-memory statistics + * block updates. Some drivers do not have an in-memory + * statistic block, and in such cases this value is ignored. + * This value must not be zero. + */ + u32 stats_block_coalesce_usecs; + + /* Adaptive RX/TX coalescing is an algorithm implemented by + * some drivers to improve latency under low packet rates and + * improve throughput under high packet rates. Some drivers + * only implement one of RX or TX adaptive coalescing. Anything + * not implemented by the driver causes these values to be + * silently ignored. + */ + u32 use_adaptive_rx_coalesce; + u32 use_adaptive_tx_coalesce; + + /* When the packet rate (measured in packets per second) + * is below pkt_rate_low, the {rx,tx}_*_low parameters are + * used. + */ + u32 pkt_rate_low; + u32 rx_coalesce_usecs_low; + u32 rx_max_coalesced_frames_low; + u32 tx_coalesce_usecs_low; + u32 tx_max_coalesced_frames_low; + + /* When the packet rate is below pkt_rate_high but above + * pkt_rate_low (both measured in packets per second) the + * normal {rx,tx}_* coalescing parameters are used. + */ + + /* When the packet rate is (measured in packets per second) + * is above pkt_rate_high, the {rx,tx}_*_high parameters are + * used. + */ + u32 pkt_rate_high; + u32 rx_coalesce_usecs_high; + u32 rx_max_coalesced_frames_high; + u32 tx_coalesce_usecs_high; + u32 tx_max_coalesced_frames_high; + + /* How often to do adaptive coalescing packet rate sampling, + * measured in seconds. Must not be zero. + */ + u32 rate_sample_interval; +}; +#endif /* ETHTOOL_GCOALESCE */ + +#ifndef ETHTOOL_SCOALESCE +#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ +#endif +#ifndef ETHTOOL_GRINGPARAM +#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ +/* for configuring RX/TX ring parameters */ +#define ethtool_ringparam _kc_ethtool_ringparam +struct _kc_ethtool_ringparam { + u32 cmd; /* ETHTOOL_{G,S}RINGPARAM */ + + /* Read only attributes. These indicate the maximum number + * of pending RX/TX ring entries the driver will allow the + * user to set. + */ + u32 rx_max_pending; + u32 rx_mini_max_pending; + u32 rx_jumbo_max_pending; + u32 tx_max_pending; + + /* Values changeable by the user. The valid values are + * in the range 1 to the "*_max_pending" counterpart above. + */ + u32 rx_pending; + u32 rx_mini_pending; + u32 rx_jumbo_pending; + u32 tx_pending; +}; +#endif /* ETHTOOL_GRINGPARAM */ + +#ifndef ETHTOOL_SRINGPARAM +#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters, priv. */ +#endif +#ifndef ETHTOOL_GPAUSEPARAM +#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ +/* for configuring link flow control parameters */ +#define ethtool_pauseparam _kc_ethtool_pauseparam +struct _kc_ethtool_pauseparam { + u32 cmd; /* ETHTOOL_{G,S}PAUSEPARAM */ + + /* If the link is being auto-negotiated (via ethtool_cmd.autoneg + * being true) the user may set 'autoneg' here non-zero to have the + * pause parameters be auto-negotiated too. In such a case, the + * {rx,tx}_pause values below determine what capabilities are + * advertised. + * + * If 'autoneg' is zero or the link is not being auto-negotiated, + * then {rx,tx}_pause force the driver to use/not-use pause + * flow control. + */ + u32 autoneg; + u32 rx_pause; + u32 tx_pause; +}; +#endif /* ETHTOOL_GPAUSEPARAM */ + +#ifndef ETHTOOL_SPAUSEPARAM +#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. 
*/ +#endif +#ifndef ETHTOOL_GRXCSUM +#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_SRXCSUM +#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_GTXCSUM +#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_STXCSUM +#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_GSG +#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable + * (ethtool_value) */ +#endif +#ifndef ETHTOOL_SSG +#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable + * (ethtool_value). */ +#endif +#ifndef ETHTOOL_TEST +#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test, priv. */ +#endif +#ifndef ETHTOOL_GSTRINGS +#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ +#endif +#ifndef ETHTOOL_PHYS_ID +#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ +#endif +#ifndef ETHTOOL_GSTATS +#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ +#endif +#ifndef ETHTOOL_GTSO +#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ +#endif +#ifndef ETHTOOL_STSO +#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#endif + +#ifndef ETHTOOL_BUSINFO_LEN +#define ETHTOOL_BUSINFO_LEN 32 +#endif + +#ifndef WAKE_FILTER +#define WAKE_FILTER BIT(7) +#endif + +#ifndef SPEED_2500 +#define SPEED_2500 2500 +#endif +#ifndef SPEED_5000 +#define SPEED_5000 5000 +#endif +#ifndef SPEED_14000 +#define SPEED_14000 14000 +#endif +#ifndef SPEED_25000 +#define SPEED_25000 25000 +#endif +#ifndef SPEED_50000 +#define SPEED_50000 50000 +#endif +#ifndef SPEED_56000 +#define SPEED_56000 56000 +#endif +#ifndef SPEED_100000 +#define SPEED_100000 100000 +#endif + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif +#ifndef AX_RELEASE_VERSION +#define AX_RELEASE_VERSION(a,b) (((a) << 8) + (b)) +#endif + +#ifndef AX_RELEASE_CODE +#define AX_RELEASE_CODE 0 +#endif + +#if (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,0)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,0) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,1)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,1) +#elif (AX_RELEASE_CODE && AX_RELEASE_CODE == AX_RELEASE_VERSION(3,2)) +#define RHEL_RELEASE_CODE RHEL_RELEASE_VERSION(5,3) +#endif + +#ifndef RHEL_RELEASE_CODE +/* NOTE: RHEL_RELEASE_* introduced in RHEL4.5 */ +#define RHEL_RELEASE_CODE 0 +#endif + +/* RHEL 7 didn't backport the parameter change in + * create_singlethread_workqueue. + * If/when RH corrects this we will want to tighten up the version check. + */ +#if (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,0)) +#undef create_singlethread_workqueue +#define create_singlethread_workqueue(name) \ + alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name) +#endif + +/* Ubuntu Release ABI is the 4th digit of their kernel version. You can find + * it in /usr/src/linux/$(uname -r)/include/generated/utsrelease.h for new + * enough versions of Ubuntu. Otherwise you can simply see it in the output of + * uname as the 4th digit of the kernel. The UTS_UBUNTU_RELEASE_ABI is not in + * the linux-source package, but in the linux-headers package. It begins to + * appear in later releases of 14.04 and 14.10. 
+ * + * Ex: + * + * $uname -r + * 3.13.0-45-generic + * ABI is 45 + * + * + * $uname -r + * 3.16.0-23-generic + * ABI is 23 + */ +#ifndef UTS_UBUNTU_RELEASE_ABI +#define UTS_UBUNTU_RELEASE_ABI 0 +#define UBUNTU_VERSION_CODE 0 +#else +/* Ubuntu does not provide actual release version macro, so we use the kernel + * version plus the ABI to generate a unique version code specific to Ubuntu. + * In addition, we mask the lower 8 bits of LINUX_VERSION_CODE in order to + * ignore differences in sublevel which are not important since we have the + * ABI value. Otherwise, it becomes impossible to correlate ABI to version for + * ordering checks. + * + * This also lets us store an ABI value up to 65535, since it can take the + * space that would use the lower byte of the Linux version code. + */ +#define UBUNTU_VERSION_CODE (((~0xFF & LINUX_VERSION_CODE) << 8) + \ + UTS_UBUNTU_RELEASE_ABI) + +#if UTS_UBUNTU_RELEASE_ABI > 65535 +#error UTS_UBUNTU_RELEASE_ABI is larger than 65535... +#endif /* UTS_UBUNTU_RELEASE_ABI > 65535 */ + +#if ( LINUX_VERSION_CODE <= KERNEL_VERSION(3,0,0) ) +/* Our version code scheme does not make sense for non 3.x or newer kernels, + * and we have no support in kcompat for this scenario. Thus, treat this as a + * non-Ubuntu kernel. Possibly might be better to error here. + */ +#define UTS_UBUNTU_RELEASE_ABI 0 +#define UBUNTU_VERSION_CODE 0 +#endif /* <= 3.0.0 */ +#endif /* !UTS_UBUNTU_RELEASE_ABI */ + +/* We ignore the 3rd digit since we want to give precedence to the additional + * ABI value provided by Ubuntu. + */ +#define UBUNTU_VERSION(a,b,c,d) (((a) << 24) + ((b) << 16) + (d)) + +/* SLE_VERSION is used to generate a 3-digit encoding that can order SLE + * kernels based on their major release, service pack, and a possible + * maintenance release. + */ +#define SLE_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) + +/* The SLE_LOCALVERSION_CODE comes from a 3-digit code added as part of the + * Linux kernel version. It is extracted by the driver Makefile. This macro is + * used to generate codes for making comparisons below. + */ +#define SLE_LOCALVERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) + +#ifdef CONFIG_SUSE_KERNEL +/* Starting since at least SLE 12sp4 and SLE 15, the SUSE kernels have + * provided CONFIG_SUSE_VERSION, CONFIG_SUSE_PATCHLEVEL and + * CONFIG_SUSE_AUXRELEASE. Use these to generate SLE_VERSION if available. + * Only fall back to the manual table otherwise. We expect all future versions + * of SLE kernels to include these values, so the table will remain only for + * the older releases. + */ +#ifdef CONFIG_SUSE_VERSION +#ifndef CONFIG_SUSE_PATCHLEVEL +#error "CONFIG_SUSE_VERSION exists but CONFIG_SUSE_PATCHLEVEL is missing" +#endif +#ifndef CONFIG_SUSE_AUXRELEASE +#error "CONFIG_SUSE_VERSION exists but CONFIG_SUSE_AUXRELEASE is missing" +#endif +#define SLE_VERSION_CODE SLE_VERSION(CONFIG_SUSE_VERSION, CONFIG_SUSE_PATCHLEVEL, CONFIG_SUSE_AUXRELEASE) +#else +/* If we do not have the CONFIG_SUSE_VERSION configuration values, fall back + * to the following table for older releases. 
+ */ +#if ( LINUX_VERSION_CODE == KERNEL_VERSION(2,6,27) ) +/* SLES11 GA is 2.6.27 based */ +#define SLE_VERSION_CODE SLE_VERSION(11,0,0) +#elif ( LINUX_VERSION_CODE == KERNEL_VERSION(2,6,32) ) +/* SLES11 SP1 is 2.6.32 based */ +#define SLE_VERSION_CODE SLE_VERSION(11,1,0) +#elif ( LINUX_VERSION_CODE == KERNEL_VERSION(3,0,13) ) +/* SLES11 SP2 GA is 3.0.13-0.27 */ +#define SLE_VERSION_CODE SLE_VERSION(11,2,0) +#elif ((LINUX_VERSION_CODE == KERNEL_VERSION(3,0,76))) +/* SLES11 SP3 GA is 3.0.76-0.11 */ +#define SLE_VERSION_CODE SLE_VERSION(11,3,0) +#elif (LINUX_VERSION_CODE == KERNEL_VERSION(3,0,101)) + #if (SLE_LOCALVERSION_CODE < SLE_LOCALVERSION(0,8,0)) + /* some SLES11sp2 update kernels up to 3.0.101-0.7.x */ + #define SLE_VERSION_CODE SLE_VERSION(11,2,0) + #elif (SLE_LOCALVERSION_CODE < SLE_LOCALVERSION(63,0,0)) + /* most SLES11sp3 update kernels */ + #define SLE_VERSION_CODE SLE_VERSION(11,3,0) + #else + /* SLES11 SP4 GA (3.0.101-63) and update kernels 3.0.101-63+ */ + #define SLE_VERSION_CODE SLE_VERSION(11,4,0) + #endif +#elif (LINUX_VERSION_CODE == KERNEL_VERSION(3,12,28)) +/* SLES12 GA is 3.12.28-4 + * kernel updates 3.12.xx-<33 through 52>[.yy] */ +#define SLE_VERSION_CODE SLE_VERSION(12,0,0) +#elif (LINUX_VERSION_CODE == KERNEL_VERSION(3,12,49)) +/* SLES12 SP1 GA is 3.12.49-11 + * updates 3.12.xx-60.yy where xx={51..} */ +#define SLE_VERSION_CODE SLE_VERSION(12,1,0) +#elif ((LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,21) && \ + (LINUX_VERSION_CODE <= KERNEL_VERSION(4,4,59))) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,74) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ + SLE_LOCALVERSION_CODE >= KERNEL_VERSION(92,0,0) && \ + SLE_LOCALVERSION_CODE < KERNEL_VERSION(93,0,0))) +/* SLES12 SP2 GA is 4.4.21-69. + * SLES12 SP2 updates before SLES12 SP3 are: 4.4.{21,38,49,59} + * SLES12 SP2 updates after SLES12 SP3 are: 4.4.{74,90,103,114,120} + * but they all use a SLE_LOCALVERSION_CODE matching 92.nn.y */ +#define SLE_VERSION_CODE SLE_VERSION(12,2,0) +#elif ((LINUX_VERSION_CODE == KERNEL_VERSION(4,4,73) || \ + LINUX_VERSION_CODE == KERNEL_VERSION(4,4,82) || \ + LINUX_VERSION_CODE == KERNEL_VERSION(4,4,92)) || \ + (LINUX_VERSION_CODE == KERNEL_VERSION(4,4,103) && \ + (SLE_LOCALVERSION_CODE == KERNEL_VERSION(6,33,0) || \ + SLE_LOCALVERSION_CODE == KERNEL_VERSION(6,38,0))) || \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,114) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) && \ + SLE_LOCALVERSION_CODE >= KERNEL_VERSION(94,0,0) && \ + SLE_LOCALVERSION_CODE < KERNEL_VERSION(95,0,0)) ) +/* SLES12 SP3 GM is 4.4.73-5 and update kernels are 4.4.82-6.3. + * SLES12 SP3 updates not conflicting with SP2 are: 4.4.{82,92} + * SLES12 SP3 updates conflicting with SP2 are: + * - 4.4.103-6.33.1, 4.4.103-6.38.1 + * - 4.4.{114,120}-94.nn.y */ +#define SLE_VERSION_CODE SLE_VERSION(12,3,0) +#else +#error "This looks like a SUSE kernel, but it has an unrecognized local version code." +#endif /* LINUX_VERSION_CODE == KERNEL_VERSION(x,y,z) */ +#endif /* !CONFIG_SUSE_VERSION */ +#endif /* CONFIG_SUSE_KERNEL */ +#ifndef SLE_VERSION_CODE +#define SLE_VERSION_CODE 0 +#endif /* SLE_VERSION_CODE */ +#ifndef SLE_LOCALVERSION_CODE +#define SLE_LOCALVERSION_CODE 0 +#endif /* SLE_LOCALVERSION_CODE */ + +/* + * Include the definitions file for HAVE/NEED flags for the standard upstream + * kernels. + * + * Then, based on the distribution we detect, load the distribution specific + * definitions file that customizes the definitions for the target + * distribution. 
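+ *
+ * As a purely illustrative (hypothetical) example of how these files work
+ * together, a standard definitions entry might derive a flag from the
+ * mainline version,
+ *
+ *	#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,1,0))
+ *	#define NEED_EXAMPLE_HELPER
+ *	#endif
+ *
+ * and a distribution-specific file could then #undef that flag again for a
+ * distro kernel known to carry the corresponding backport.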
+ */ +#include "kcompat_std_defs.h" + +#ifdef CONFIG_SUSE_KERNEL +#include "kcompat_sles_defs.h" +#elif UBUNTU_VERSION_CODE +#include "kcompat_ubuntu_defs.h" +#elif RHEL_RELEASE_CODE +#include "kcompat_rhel_defs.h" +#endif + +/* + * ADQ depends on __TC_MQPRIO_MODE_MAX and related kernel code + * added around 4.15. Some distributions (e.g. Oracle Linux 7.7) + * have done a partial back-port of that to their kernels based + * on older mainline kernels that did not include all the necessary + * kernel enablement to support ADQ. + * Undefine __TC_MQPRIO_MODE_MAX for all OSV distributions with + * kernels based on mainline kernels older than 4.15 except for + * RHEL, SLES and Ubuntu which are known to have good back-ports. + */ +#if (!RHEL_RELEASE_CODE && !SLE_VERSION_CODE && !UBUNTU_VERSION_CODE) + #if (LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0)) + #undef __TC_MQPRIO_MODE_MAX + #endif /* LINUX_VERSION_CODE == KERNEL_VERSION(4,15,0) */ +#endif /* if (NOT RHEL && NOT SLES && NOT UBUNTU) */ + + +#ifdef __KLOCWORK__ + */ +#ifdef ARRAY_SIZE +#undef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +#define memcpy(dest, src, len) memcpy_s(dest, len, src, len) +#define memset(dest, ch, len) memset_s(dest, len, ch, len) + +static inline int _kc_test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old; + unsigned long flags = 0; + + _atomic_spin_lock_irqsave(p, flags); + old = *p; + *p = old & ~mask; + _atomic_spin_unlock_irqrestore(p, flags); + + return (old & mask) != 0; +} +#define test_and_clear_bit(nr, addr) _kc_test_and_clear_bit(nr, addr) + +static inline int _kc_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old; + unsigned long flags = 0; + + _atomic_spin_lock_irqsave(p, flags); + old = *p; + *p = old | mask; + _atomic_spin_unlock_irqrestore(p, flags); + + return (old & mask) != 0; +} +#define test_and_set_bit(nr, addr) _kc_test_and_set_bit(nr, addr) + +#ifdef CONFIG_DYNAMIC_DEBUG +#undef dev_dbg +#define dev_dbg(dev, format, arg...) dev_printk(KERN_DEBUG, dev, format, ##arg) +#undef pr_debug +#define pr_debug(format, arg...) printk(KERN_DEBUG format, ##arg) +#endif /* CONFIG_DYNAMIC_DEBUG */ + + +#undef hlist_for_each_entry_safe +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (n = NULL, pos = hlist_entry_safe((head)->first, typeof(*(pos)), \ + member); \ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#ifdef uninitialized_var +#undef uninitialized_var +#define uninitialized_var(x) x = *(&(x)) +#endif + +#ifdef WRITE_ONCE +#undef WRITE_ONCE +#define WRITE_ONCE(x, val) ((x) = (val)) +#endif /* WRITE_ONCE */ + +#ifdef wait_event_interruptible_timeout +#undef wait_event_interruptible_timeout +#define wait_event_interruptible_timeout(wq_head, condition, timeout) ({ \ + long ret; \ + if ((condition)) \ + ret = timeout; \ + else \ + ret = 0; \ + ret; \ +}) +#endif /* wait_event_interruptible_timeout */ + +#ifdef max_t +#undef max_t +#define max_t(type, x, y) ({ \ +type __x = (x); \ +type __y = (y); \ +__x > __y ? __x : __y; \ +}) +#endif /* max_t */ + +#ifdef min_t +#undef min_t +#define min_t(type, x, y) ({ \ +type __x = (x); \ +type __y = (y); \ +__x < __y ? 
__x : __y; \ +}) +#endif /* min_t */ +#endif /* __KLOCWORK__ */ + + +/* Older versions of GCC will trigger -Wformat-nonliteral warnings for const + * char * strings. Unfortunately, the implementation of do_trace_printk does + * this, in order to add a storage attribute to the memory. This was fixed in + * GCC 5.1, but we still use older distributions built with GCC 4.x. + * + * The string pointer is only passed as a const char * to the __trace_bprintk + * function. Since that function has the __printf attribute, it will trigger + * the warnings. We can't remove the attribute, so instead we'll use the + * __diag macro to disable -Wformat-nonliteral around the call to + * __trace_bprintk. + */ +#if GCC_VERSION < 50100 +#define __trace_bprintk(ip, fmt, args...) ({ \ + int err; \ + __diag_push(); \ + __diag(ignored "-Wformat-nonliteral"); \ + err = __trace_bprintk(ip, fmt, ##args); \ + __diag_pop(); \ + err; \ +}) +#endif /* GCC_VERSION < 5.1.0 */ + +/* Newer kernels removed */ +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0)) && \ + (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,3)) && \ + !(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,3,0))))) +#define HAVE_PCI_ASPM_H +#endif + +#include +#include +#include +#include +#define HAVE_SET_RX_MODE +#define HAVE_STRUCT_DEVICE_OF_NODE +#define HAVE_BRIDGE_FILTER + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ) +#ifndef NAPI_POLL_WEIGHT +#define NAPI_POLL_WEIGHT 64 +#endif +#ifdef CONFIG_PCI_IOV +int __kc_pci_vfs_assigned(struct pci_dev *dev); +#else +static inline int __kc_pci_vfs_assigned(struct pci_dev __always_unused *dev) +{ + return 0; +} +#endif +#define pci_vfs_assigned(dev) __kc_pci_vfs_assigned(dev) + +#ifndef list_first_entry_or_null +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +#endif + +#ifndef VLAN_TX_COOKIE_MAGIC +static inline struct sk_buff *__kc__vlan_hwaccel_put_tag(struct sk_buff *skb, + u16 vlan_tci) +{ +#ifdef VLAN_TAG_PRESENT + vlan_tci |= VLAN_TAG_PRESENT; +#endif + skb->vlan_tci = vlan_tci; + return skb; +} +#define __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci) \ + __kc__vlan_hwaccel_put_tag(skb, vlan_tci) +#endif + +#ifdef HAVE_FDB_OPS +#if defined(HAVE_NDO_FDB_ADD_NLATTR) +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 flags); +#elif defined(USE_CONST_DEV_UC_CHAR) +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct net_device *dev, + const unsigned char *addr, u16 flags); +#else +int __kc_ndo_dflt_fdb_add(struct ndmsg *ndm, struct net_device *dev, + unsigned char *addr, u16 flags); +#endif /* HAVE_NDO_FDB_ADD_NLATTR */ +#if defined(HAVE_FDB_DEL_NLATTR) +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr); +#elif defined(USE_CONST_DEV_UC_CHAR) +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, + const unsigned char *addr); +#else +int __kc_ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, + unsigned char *addr); +#endif /* HAVE_FDB_DEL_NLATTR */ +#define ndo_dflt_fdb_add __kc_ndo_dflt_fdb_add +#define ndo_dflt_fdb_del __kc_ndo_dflt_fdb_del +#endif /* HAVE_FDB_OPS */ + +#ifndef PCI_DEVID +#define PCI_DEVID(bus, devfn) ((((u16)(bus)) << 8) | (devfn)) +#endif + +/* The definitions for these functions when CONFIG_OF_NET is defined are + * pulled in from . 
For kernels older than 3.5 we already have + * backports for when CONFIG_OF_NET is true. These are separated and + * duplicated in order to cover all cases so that all kernels get either the + * real definitions (when CONFIG_OF_NET is defined) or the stub definitions + * (when CONFIG_OF_NET is not defined, or the kernel is too old to have real + * definitions). + */ +#ifndef CONFIG_OF_NET +static inline int of_get_phy_mode(struct device_node __always_unused *np) +{ + return -ENODEV; +} + +static inline const void * +of_get_mac_address(struct device_node __always_unused *np) +{ + return NULL; +} +#endif + +#else /* >= 3.10.0 */ +#define HAVE_ENCAP_TSO_OFFLOAD +#define USE_DEFAULT_FDB_DEL_DUMP +#define HAVE_SKB_INNER_NETWORK_HEADER + +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,0))) +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)) +#define HAVE_RHEL7_PCI_DRIVER_RH +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) +#define HAVE_RHEL7_PCI_RESET_NOTIFY +#endif /* RHEL >= 7.2 */ +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5)) +#define HAVE_GENEVE_RX_OFFLOAD +#endif /* RHEL < 7.5 */ +#define HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC +#define HAVE_RHEL7_NET_DEVICE_OPS_EXT +#if !defined(HAVE_UDP_ENC_TUNNEL) && IS_ENABLED(CONFIG_GENEVE) +#define HAVE_UDP_ENC_TUNNEL +#endif /* !HAVE_UDP_ENC_TUNNEL && CONFIG_GENEVE */ +#endif /* RHEL >= 7.3 */ + +/* new hooks added to net_device_ops_extended in RHEL7.4 */ +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SET_VF_VLAN +#define HAVE_RHEL7_NETDEV_OPS_EXT_NDO_UDP_TUNNEL +#define HAVE_UDP_ENC_RX_OFFLOAD +#endif /* RHEL >= 7.4 */ +#else /* RHEL >= 8.0 */ +#define HAVE_TCF_BLOCK_CB_REGISTER_EXTACK +#define NO_NETDEV_BPF_PROG_ATTACHED +#define HAVE_NDO_SELECT_QUEUE_SB_DEV +#endif /* RHEL >= 8.0 */ +#endif /* RHEL >= 7.0 */ +#endif /* >= 3.10.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) ) +#define netdev_notifier_info_to_dev(ptr) ptr +#ifndef time_in_range64 +#define time_in_range64(a, b, c) \ + (time_after_eq64(a, b) && \ + time_before_eq64(a, c)) +#endif /* time_in_range64 */ +#if ((RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,6)) ||\ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(11,4,0))) +#define HAVE_NDO_SET_VF_LINK_STATE +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#endif +#else /* >= 3.11.0 */ +#define HAVE_NDO_SET_VF_LINK_STATE +#define HAVE_SKB_INNER_PROTOCOL +#define HAVE_MPLS_FEATURES +#endif /* >= 3.11.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0) ) +int __kc_pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width); +#ifndef pcie_get_minimum_link +#define pcie_get_minimum_link(_p, _s, _w) __kc_pcie_get_minimum_link(_p, _s, _w) +#endif + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(6,7)) +int _kc_pci_wait_for_pending_transaction(struct pci_dev *dev); +#define pci_wait_for_pending_transaction _kc_pci_wait_for_pending_transaction +#endif /* = 3.12.0 */ +#if ( SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#endif +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) ) +#define HAVE_VXLAN_RX_OFFLOAD +#if 
!defined(HAVE_UDP_ENC_TUNNEL) && IS_ENABLED(CONFIG_VXLAN) +#define HAVE_UDP_ENC_TUNNEL +#endif +#endif /* < 4.8.0 */ +#define HAVE_NDO_GET_PHYS_PORT_ID +#define HAVE_NETIF_SET_XPS_QUEUE_CONST_MASK +#endif /* >= 3.12.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) ) +#define dma_set_mask_and_coherent(_p, _m) __kc_dma_set_mask_and_coherent(_p, _m) +int __kc_dma_set_mask_and_coherent(struct device *dev, u64 mask); +#ifndef u64_stats_init +#define u64_stats_init(a) do { } while(0) +#endif +#undef BIT_ULL +#define BIT_ULL(n) (1ULL << (n)) + +#if (!(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0)) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,0))) +static inline struct pci_dev *pci_upstream_bridge(struct pci_dev *dev) +{ + dev = pci_physfn(dev); + if (pci_is_root_bus(dev->bus)) + return NULL; + + return dev->bus->self; +} +#endif + +#if (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,1,0)) +#undef HAVE_STRUCT_PAGE_PFMEMALLOC +#define HAVE_DCBNL_OPS_SETAPP_RETURN_INT +#endif +#ifndef list_next_entry +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) +#endif +#ifndef list_prev_entry +#define list_prev_entry(pos, member) \ + list_entry((pos)->member.prev, typeof(*(pos)), member) +#endif + +#if ( LINUX_VERSION_CODE > KERNEL_VERSION(2,6,20) ) +#define devm_kcalloc(dev, cnt, size, flags) \ + devm_kzalloc(dev, (cnt) * (size), flags) +#endif /* > 2.6.20 */ + +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +#define list_last_entry(ptr, type, member) list_entry((ptr)->prev, type, member) +#endif + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) +bool _kc_pci_device_is_present(struct pci_dev *pdev); +#define pci_device_is_present _kc_pci_device_is_present +#endif /* = 3.13.0 */ +#define HAVE_VXLAN_CHECKS +#if (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,24)) +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#else +#define HAVE_NDO_SELECT_QUEUE_ACCEL +#endif +#define HAVE_HWMON_DEVICE_REGISTER_WITH_GROUPS +#endif + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) ) + +#ifndef U16_MAX +#define U16_MAX ((u16)~0U) +#endif + +#ifndef U32_MAX +#define U32_MAX ((u32)~0U) +#endif + +#ifndef U64_MAX +#define U64_MAX ((u64)~0ULL) +#endif + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +#define dev_consume_skb_any(x) dev_kfree_skb_any(x) +#define dev_consume_skb_irq(x) dev_kfree_skb_irq(x) +#endif + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,0)) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) + +/* it isn't expected that this would be a #define unless we made it so */ +#ifndef skb_set_hash + +#define PKT_HASH_TYPE_NONE 0 +#define PKT_HASH_TYPE_L2 1 +#define PKT_HASH_TYPE_L3 2 +#define PKT_HASH_TYPE_L4 3 + +enum _kc_pkt_hash_types { + _KC_PKT_HASH_TYPE_NONE = PKT_HASH_TYPE_NONE, + _KC_PKT_HASH_TYPE_L2 = PKT_HASH_TYPE_L2, + _KC_PKT_HASH_TYPE_L3 = PKT_HASH_TYPE_L3, + _KC_PKT_HASH_TYPE_L4 = PKT_HASH_TYPE_L4, +}; +#define pkt_hash_types _kc_pkt_hash_types + +#define skb_set_hash __kc_skb_set_hash +static inline void __kc_skb_set_hash(struct sk_buff __maybe_unused *skb, + u32 __maybe_unused hash, + int __maybe_unused type) +{ +#ifdef HAVE_SKB_L4_RXHASH + skb->l4_rxhash = (type == PKT_HASH_TYPE_L4); +#endif +#ifdef NETIF_F_RXHASH + skb->rxhash 
= hash; +#endif +} +#endif /* !skb_set_hash */ + +#else /* RHEL_RELEASE_CODE >= 7.0 || SLE_VERSION_CODE >= 12.0 */ + +#if ((RHEL_RELEASE_CODE && RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,0)) ||\ + (SLE_VERSION_CODE && SLE_VERSION_CODE <= SLE_VERSION(12,1,0))) +/* GPLv2 code taken from 5.10-rc2 kernel source include/linux/pci.h, Copyright + * original authors. + */ +static inline int pci_enable_msix_exact(struct pci_dev *dev, + struct msix_entry *entries, int nvec) +{ + int rc = pci_enable_msix_range(dev, entries, nvec, nvec); + if (rc < 0) + return rc; + return 0; +} +#endif /* <=EL7.0 || <=SLES 12.1 */ +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#ifndef HAVE_VXLAN_RX_OFFLOAD +#define HAVE_VXLAN_RX_OFFLOAD +#endif /* HAVE_VXLAN_RX_OFFLOAD */ +#endif + +#if !defined(HAVE_UDP_ENC_TUNNEL) && IS_ENABLED(CONFIG_VXLAN) +#define HAVE_UDP_ENC_TUNNEL +#endif + +#ifndef HAVE_VXLAN_CHECKS +#define HAVE_VXLAN_CHECKS +#endif /* HAVE_VXLAN_CHECKS */ +#endif /* !(RHEL_RELEASE_CODE >= 7.0 && SLE_VERSION_CODE >= 12.0) */ + +#if ((RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) ||\ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12,0,0))) +#define HAVE_NDO_DFWD_OPS +#endif + +#ifndef pci_enable_msix_range +int __kc_pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, + int minvec, int maxvec); +#define pci_enable_msix_range __kc_pci_enable_msix_range +#endif + +#ifndef ether_addr_copy +#define ether_addr_copy __kc_ether_addr_copy +static inline void __kc_ether_addr_copy(u8 *dst, const u8 *src) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + *(u32 *)dst = *(const u32 *)src; + *(u16 *)(dst + 4) = *(const u16 *)(src + 4); +#else + u16 *a = (u16 *)dst; + const u16 *b = (const u16 *)src; + + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; +#endif +} +#endif /* ether_addr_copy */ +int __kc_ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, + int target, unsigned short *fragoff, int *flags); +#define ipv6_find_hdr(a, b, c, d, e) __kc_ipv6_find_hdr((a), (b), (c), (d), (e)) + +#ifndef OPTIMIZE_HIDE_VAR +#ifdef __GNUC__ +#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var)) +#else +#include +#define OPTIMIZE_HIDE_VAR(var) barrier() +#endif +#endif + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0)) && \ + !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(10,4,0))) +static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) +{ +#ifdef NETIF_F_RXHASH + return skb->rxhash; +#else + return 0; +#endif /* NETIF_F_RXHASH */ +} +#endif /* !RHEL > 5.9 && !SLES >= 10.4 */ + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5)) +#define request_firmware_direct request_firmware +#endif /* !RHEL || RHEL < 7.5 */ + +#else /* >= 3.14.0 */ + +/* for ndo_dfwd_ ops add_station, del_station and _start_xmit */ +#ifndef HAVE_NDO_DFWD_OPS +#define HAVE_NDO_DFWD_OPS +#endif +#define HAVE_NDO_SELECT_QUEUE_ACCEL_FALLBACK +#endif /* 3.14.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0) ) +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) ) +#define HAVE_SKBUFF_RXHASH +#endif /* >= 2.6.35 */ +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) && \ + !(UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE >= UBUNTU_VERSION(3,13,0,30))) +#define u64_stats_fetch_begin_irq u64_stats_fetch_begin_bh +#define u64_stats_fetch_retry_irq u64_stats_fetch_retry_bh +#endif + +char *_kc_devm_kstrdup(struct 
device *dev, const char *s, gfp_t gfp); +#define devm_kstrdup(dev, s, gfp) _kc_devm_kstrdup(dev, s, gfp) + +#else /* >= 3.15.0 */ +#define HAVE_NET_GET_RANDOM_ONCE +#define HAVE_PTP_1588_CLOCK_PINS +#define HAVE_NETDEV_PORT +#endif /* 3.15.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) ) +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic() smp_mb() +#define smp_mb__after_atomic() smp_mb() +#endif +#ifndef __dev_uc_sync +#ifdef HAVE_SET_RX_MODE +#ifdef NETDEV_HW_ADDR_T_UNICAST +int __kc_hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, const unsigned char *)); +void __kc_hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, const unsigned char *)); +#endif +#ifndef NETDEV_HW_ADDR_T_MULTICAST +int __kc_dev_addr_sync_dev(struct dev_addr_list **list, int *count, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, const unsigned char *)); +void __kc_dev_addr_unsync_dev(struct dev_addr_list **list, int *count, + struct net_device *dev, + int (*unsync)(struct net_device *, const unsigned char *)); +#endif +#endif /* HAVE_SET_RX_MODE */ + +static inline int __kc_dev_uc_sync(struct net_device __maybe_unused *dev, + int __maybe_unused (*sync)(struct net_device *, const unsigned char *), + int __maybe_unused (*unsync)(struct net_device *, const unsigned char *)) +{ +#ifdef NETDEV_HW_ADDR_T_UNICAST + return __kc_hw_addr_sync_dev(&dev->uc, dev, sync, unsync); +#elif defined(HAVE_SET_RX_MODE) + return __kc_dev_addr_sync_dev(&dev->uc_list, &dev->uc_count, + dev, sync, unsync); +#else + return 0; +#endif +} +#define __dev_uc_sync __kc_dev_uc_sync + +static inline void __kc_dev_uc_unsync(struct net_device __maybe_unused *dev, + int __maybe_unused (*unsync)(struct net_device *, const unsigned char *)) +{ +#ifdef HAVE_SET_RX_MODE +#ifdef NETDEV_HW_ADDR_T_UNICAST + __kc_hw_addr_unsync_dev(&dev->uc, dev, unsync); +#else /* NETDEV_HW_ADDR_T_MULTICAST */ + __kc_dev_addr_unsync_dev(&dev->uc_list, &dev->uc_count, dev, unsync); +#endif /* NETDEV_HW_ADDR_T_UNICAST */ +#endif /* HAVE_SET_RX_MODE */ +} +#define __dev_uc_unsync __kc_dev_uc_unsync + +static inline int __kc_dev_mc_sync(struct net_device __maybe_unused *dev, + int __maybe_unused (*sync)(struct net_device *, const unsigned char *), + int __maybe_unused (*unsync)(struct net_device *, const unsigned char *)) +{ +#ifdef NETDEV_HW_ADDR_T_MULTICAST + return __kc_hw_addr_sync_dev(&dev->mc, dev, sync, unsync); +#elif defined(HAVE_SET_RX_MODE) + return __kc_dev_addr_sync_dev(&dev->mc_list, &dev->mc_count, + dev, sync, unsync); +#else + return 0; +#endif + +} +#define __dev_mc_sync __kc_dev_mc_sync + +static inline void __kc_dev_mc_unsync(struct net_device __maybe_unused *dev, + int __maybe_unused (*unsync)(struct net_device *, const unsigned char *)) +{ +#ifdef HAVE_SET_RX_MODE +#ifdef NETDEV_HW_ADDR_T_MULTICAST + __kc_hw_addr_unsync_dev(&dev->mc, dev, unsync); +#else /* NETDEV_HW_ADDR_T_MULTICAST */ + __kc_dev_addr_unsync_dev(&dev->mc_list, &dev->mc_count, dev, unsync); +#endif /* NETDEV_HW_ADDR_T_MULTICAST */ +#endif /* HAVE_SET_RX_MODE */ +} +#define __dev_mc_unsync __kc_dev_mc_unsync +#endif /* __dev_uc_sync */ + +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +#define 
HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +#endif + +#ifndef NETIF_F_GSO_UDP_TUNNEL_CSUM +/* if someone backports this, hopefully they backport as a #define. + * declare it as zero on older kernels so that if it get's or'd in + * it won't effect anything, therefore preventing core driver changes + */ +#define NETIF_F_GSO_UDP_TUNNEL_CSUM 0 +#define SKB_GSO_UDP_TUNNEL_CSUM 0 +#endif +void *__kc_devm_kmemdup(struct device *dev, const void *src, size_t len, + gfp_t gfp); +#define devm_kmemdup __kc_devm_kmemdup + +#else +#if ( ( LINUX_VERSION_CODE < KERNEL_VERSION(4,13,0) ) && \ + ! ( SLE_VERSION_CODE && ( SLE_VERSION_CODE >= SLE_VERSION(12,4,0)) ) ) +#define HAVE_PCI_ERROR_HANDLER_RESET_NOTIFY +#endif /* >= 3.16.0 && < 4.13.0 && !(SLES >= 12sp4) */ +#define HAVE_NDO_SET_VF_MIN_MAX_TX_RATE +#endif /* 3.16.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) ) +#if !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) && \ + !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) +#ifndef timespec64 +#define timespec64 timespec +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + return ts; +} +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + return ts64; +} +#define timespec64_equal timespec_equal +#define timespec64_compare timespec_compare +#define set_normalized_timespec64 set_normalized_timespec +#define timespec64_add_safe timespec_add_safe +#define timespec64_add timespec_add +#define timespec64_sub timespec_sub +#define timespec64_valid timespec_valid +#define timespec64_valid_strict timespec_valid_strict +#define timespec64_to_ns timespec_to_ns +#define ns_to_timespec64 ns_to_timespec +#define ktime_to_timespec64 ktime_to_timespec +#define ktime_get_ts64 ktime_get_ts +#define ktime_get_real_ts64 ktime_get_real_ts +#define timespec64_add_ns timespec_add_ns +#endif /* timespec64 */ +#endif /* !(RHEL6.8= RHEL_RELEASE_VERSION(6,8) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) +static inline void ktime_get_real_ts64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get_real()); +} + +static inline void ktime_get_ts64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get()); +} +#endif + +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define hlist_add_behind(_a, _b) hlist_add_after(_b, _a) +#endif + +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5)) +#define param_ops_ullong _kc_param_ops_ullong +extern const struct kernel_param_ops _kc_param_ops_ullong; +#define param_set_ullong _kc_param_set_ullong +int _kc_param_set_ullong(const char *val, const struct kernel_param *kp); +#define param_get_ullong _kc_param_get_ullong +int _kc_param_get_ullong(char *buffer, const struct kernel_param *kp); +#define param_check_ullong(name, p) __param_check(name, p, unsigned long long) +#endif /* RHEL_RELEASE_CODE < RHEL7.5 */ + +#if RHEL_RELEASE_CODE && \ + RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,3) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,3) +static inline u64 ktime_get_ns(void) +{ + return ktime_to_ns(ktime_get()); +} + +static inline u64 ktime_get_real_ns(void) +{ + return ktime_to_ns(ktime_get_real()); +} + +static inline u64 ktime_get_boot_ns(void) +{ + return ktime_to_ns(ktime_get_boottime()); +} +#endif /* RHEL < 7.3 */ + +#else +#define HAVE_DCBNL_OPS_SETAPP_RETURN_INT +#include +#define HAVE_RHASHTABLE +#endif /* 3.17.0 */ + 
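Editorial illustration, not part of the patch: the timespec64 and ktime shims in the block above exist so timestamp code can be written once against the 64-bit-safe interfaces and still build on pre-3.17 kernels. A minimal sketch under that assumption, using a hypothetical helper name:

/* Hypothetical usage sketch only. On kernels older than 3.17 the calls
 * below resolve to the compat wrappers defined above; on newer kernels
 * they are the native implementations.
 */
static inline s64 example_wallclock_ns(void)
{
	struct timespec64 now;

	ktime_get_real_ts64(&now);	/* 64-bit-safe wall-clock time */
	return timespec64_to_ns(&now);	/* nanoseconds since the epoch */
}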
+/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) ) +#include +struct sk_buff *__kc_skb_clone_sk(struct sk_buff *skb); +void __kc_skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps); +#define skb_clone_sk __kc_skb_clone_sk +#define skb_complete_tx_timestamp __kc_skb_complete_tx_timestamp +#if (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)))) +u32 __kc_eth_get_headlen(const struct net_device *dev, unsigned char *data, + unsigned int max_len); +#else +unsigned int __kc_eth_get_headlen(unsigned char *data, unsigned int max_len); +#endif /* !RHEL >= 8.2 */ + +#define eth_get_headlen __kc_eth_get_headlen +#ifndef ETH_P_XDSA +#define ETH_P_XDSA 0x00F8 +#endif +/* RHEL 7.1 backported csum_level, but SLES 12 and 12-SP1 did not */ +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,1)) +#define HAVE_SKBUFF_CSUM_LEVEL +#endif /* >= RH 7.1 */ + +/* RHEL 7.3 backported xmit_more */ +#if (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) +#define HAVE_SKB_XMIT_MORE +#endif /* >= RH 7.3 */ + +#undef GENMASK +#define GENMASK(h, l) \ + (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) +#undef GENMASK_ULL +#define GENMASK_ULL(h, l) \ + (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + +#else /* 3.18.0 */ +#define HAVE_SKBUFF_CSUM_LEVEL +#define HAVE_SKB_XMIT_MORE +#define HAVE_SKB_INNER_PROTOCOL_TYPE +#endif /* 3.18.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,4) ) +#else +#define HAVE_NDO_FEATURES_CHECK +#endif /* 3.18.4 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,18,13) ) +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) ({ ACCESS_ONCE(x) = (val); }) +#endif +#endif /* 3.18.13 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) +/* netdev_phys_port_id renamed to netdev_phys_item_id */ +#define netdev_phys_item_id netdev_phys_port_id + +static inline void _kc_napi_complete_done(struct napi_struct *napi, + int __always_unused work_done) { + napi_complete(napi); +} +/* don't use our backport if the distro kernels already have it */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE < SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5))) +#define napi_complete_done _kc_napi_complete_done +#endif + +int _kc_bitmap_print_to_pagebuf(bool list, char *buf, + const unsigned long *maskp, int nmaskbits); +#define bitmap_print_to_pagebuf _kc_bitmap_print_to_pagebuf + +#ifndef NETDEV_RSS_KEY_LEN +#define NETDEV_RSS_KEY_LEN (13 * 4) +#endif +#if (!(RHEL_RELEASE_CODE && \ + ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,7) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))))) +#define netdev_rss_key_fill(buffer, len) __kc_netdev_rss_key_fill(buffer, len) +#endif /* RHEL_RELEASE_CODE */ +void __kc_netdev_rss_key_fill(void *buffer, size_t len); +#define SPEED_20000 20000 +#define SPEED_40000 40000 +#ifndef dma_rmb +#define dma_rmb() rmb() +#endif +#ifndef dev_alloc_pages +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE -1 +#endif +#define dev_alloc_pages(_order) alloc_pages_node(NUMA_NO_NODE, (GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC), (_order)) +#endif +#ifndef 
dev_alloc_page +#define dev_alloc_page() dev_alloc_pages(0) +#endif +#if !defined(eth_skb_pad) && !defined(skb_put_padto) +/** + * __kc_skb_put_padto - increase size and pad an skbuff up to a minimal size + * @skb: buffer to pad + * @len: minimal length + * + * Pads up a buffer to ensure the trailing bytes exist and are + * blanked. If the buffer already contains sufficient data it + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. + */ +static inline int __kc_skb_put_padto(struct sk_buff *skb, unsigned int len) +{ + unsigned int size = skb->len; + + if (unlikely(size < len)) { + len -= size; + if (skb_pad(skb, len)) + return -ENOMEM; + __skb_put(skb, len); + } + return 0; +} +#define skb_put_padto(skb, len) __kc_skb_put_padto(skb, len) + +static inline int __kc_eth_skb_pad(struct sk_buff *skb) +{ + return __kc_skb_put_padto(skb, ETH_ZLEN); +} +#define eth_skb_pad(skb) __kc_eth_skb_pad(skb) +#endif /* eth_skb_pad && skb_put_padto */ + +#ifndef SKB_ALLOC_NAPI +/* RHEL 7.2 backported napi_alloc_skb and friends */ +static inline struct sk_buff *__kc_napi_alloc_skb(struct napi_struct *napi, unsigned int length) +{ + return netdev_alloc_skb_ip_align(napi->dev, length); +} +#define napi_alloc_skb(napi,len) __kc_napi_alloc_skb(napi,len) +#define __napi_alloc_skb(napi,len,mask) __kc_napi_alloc_skb(napi,len) +#endif /* SKB_ALLOC_NAPI */ +#define HAVE_CONFIG_PM_RUNTIME +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,7)) && \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0))) +#define HAVE_RXFH_HASHFUNC +#endif /* 6.7 < RHEL < 7.0 */ +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +#define HAVE_RXFH_HASHFUNC +#define NDO_DFLT_BRIDGE_GETLINK_HAS_BRFLAGS +#endif /* RHEL > 7.1 */ +#ifndef napi_schedule_irqoff +#define napi_schedule_irqoff napi_schedule +#endif +#ifndef READ_ONCE +#define READ_ONCE(_x) ACCESS_ONCE(_x) +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_FDB_ADD_VID +#endif +#ifndef ETH_MODULE_SFF_8636 +#define ETH_MODULE_SFF_8636 0x3 +#endif +#ifndef ETH_MODULE_SFF_8636_LEN +#define ETH_MODULE_SFF_8636_LEN 256 +#endif +#ifndef ETH_MODULE_SFF_8436 +#define ETH_MODULE_SFF_8436 0x4 +#endif +#ifndef ETH_MODULE_SFF_8436_LEN +#define ETH_MODULE_SFF_8436_LEN 256 +#endif +#ifndef writel_relaxed +#define writel_relaxed writel +#endif +#else /* 3.19.0 */ +#define HAVE_NDO_FDB_ADD_VID +#define HAVE_RXFH_HASHFUNC +#define NDO_DFLT_BRIDGE_GETLINK_HAS_BRFLAGS +#endif /* 3.19.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,20,0) ) +/* vlan_tx_xx functions got renamed to skb_vlan */ +#ifndef skb_vlan_tag_get +#define skb_vlan_tag_get vlan_tx_tag_get +#endif +#ifndef skb_vlan_tag_present +#define skb_vlan_tag_present vlan_tx_tag_present +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,1)) +#define HAVE_INCLUDE_LINUX_TIMECOUNTER_H +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_BRIDGE_SET_DEL_LINK_FLAGS +#endif +#else +#define HAVE_INCLUDE_LINUX_TIMECOUNTER_H +#define HAVE_NDO_BRIDGE_SET_DEL_LINK_FLAGS +#endif /* 3.20.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) ) +/* Definition for CONFIG_OF was introduced earlier */ +#if !defined(CONFIG_OF) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > 
RHEL_RELEASE_VERSION(7,2)) +static inline struct device_node * +pci_device_to_OF_node(const struct pci_dev __always_unused *pdev) { return NULL; } +#else /* !CONFIG_OF && RHEL < 7.3 */ +#define HAVE_DDP_PROFILE_UPLOAD_SUPPORT +#endif /* !CONFIG_OF && RHEL < 7.3 */ +#else /* < 4.0 */ +#define HAVE_DDP_PROFILE_UPLOAD_SUPPORT +#endif /* < 4.0 */ + +/*****************************************************************************/ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) ) +#ifdef HAVE_INCLUDE_LINUX_TIMECOUNTER_H +#include +#else +#include +#endif +static inline void __kc_timecounter_adjtime(struct timecounter *tc, s64 delta) +{ + tc->nsec += delta; +} + +static inline struct net_device * +of_find_net_device_by_node(struct device_node __always_unused *np) +{ + return NULL; +} + +#define timecounter_adjtime __kc_timecounter_adjtime +#if ((RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,2,0)))) +#define HAVE_NDO_SET_VF_RSS_QUERY_EN +#endif +#if RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) +#define HAVE_NDO_BRIDGE_GETLINK_NLFLAGS +#define HAVE_RHEL7_EXTENDED_NDO_SET_TX_MAXRATE +#define HAVE_NDO_SET_TX_MAXRATE +#endif +#if !((RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(6,8) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) && \ + (SLE_VERSION_CODE > SLE_VERSION(12,1,0))) +unsigned int _kc_cpumask_local_spread(unsigned int i, int node); +#define cpumask_local_spread _kc_cpumask_local_spread +#endif +#ifdef HAVE_RHASHTABLE +#define rhashtable_loopup_fast(ht, key, params) \ + do { \ + (void)params; \ + rhashtable_lookup((ht), (key)); \ + } while (0) + +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ) +#define rhashtable_insert_fast(ht, obj, params) \ + do { \ + (void)params; \ + rhashtable_insert((ht), (obj), GFP_KERNEL); \ + } while (0) + +#define rhashtable_remove_fast(ht, obj, params) \ + do { \ + (void)params; \ + rhashtable_remove((ht), (obj), GFP_KERNEL); \ + } while (0) + +#else /* >= 3,19,0 */ +#define rhashtable_insert_fast(ht, obj, params) \ + do { \ + (void)params; \ + rhashtable_insert((ht), (obj)); \ + } while (0) + +#define rhashtable_remove_fast(ht, obj, params) \ + do { \ + (void)params; \ + rhashtable_remove((ht), (obj)); \ + } while (0) + +#endif /* 3,19,0 */ +#endif /* HAVE_RHASHTABLE */ +#else /* >= 4,1,0 */ +#define HAVE_NDO_GET_PHYS_PORT_NAME +#define HAVE_PTP_CLOCK_INFO_GETTIME64 +#define HAVE_NDO_BRIDGE_GETLINK_NLFLAGS +#define HAVE_PASSTHRU_FEATURES_CHECK +#define HAVE_NDO_SET_VF_RSS_QUERY_EN +#define HAVE_NDO_SET_TX_MAXRATE +#endif /* 4,1,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,1,9)) +#if (!(RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2)) && \ + !((SLE_VERSION_CODE == SLE_VERSION(11,3,0)) && \ + (SLE_LOCALVERSION_CODE >= SLE_LOCALVERSION(0,47,71))) && \ + !((SLE_VERSION_CODE == SLE_VERSION(11,4,0)) && \ + (SLE_LOCALVERSION_CODE >= SLE_LOCALVERSION(65,0,0))) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,1,0))) +static inline bool page_is_pfmemalloc(struct page __maybe_unused *page) +{ +#ifdef HAVE_STRUCT_PAGE_PFMEMALLOC + return page->pfmemalloc; +#else + return false; +#endif +} +#endif /* !RHEL7.2+ && !SLES11sp3(3.0.101-0.47.71+ update) && !SLES11sp4(3.0.101-65+ update) & !SLES12sp1+ */ +#else +#undef HAVE_STRUCT_PAGE_PFMEMALLOC +#endif /* 4.1.9 */ + 
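Editorial illustration, not part of the patch: the page_is_pfmemalloc() backport above lets Rx page-recycling code make its reuse decision the same way on every supported kernel. A hypothetical sketch:

/* Hypothetical example only: pages that came from the emergency
 * (pfmemalloc) reserves must not be held in a driver Rx page cache,
 * so recycling logic can simply gate on page_is_pfmemalloc().
 */
static inline bool example_rx_page_is_reusable(struct page *page)
{
	return likely(!page_is_pfmemalloc(page));
}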
+/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,1,0))) +#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFULL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000ULL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 +static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) +{ + return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; +}; + +static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) +{ + return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; +}; +#endif /* ! RHEL >= 7.2 && ! SLES >= 12.1 */ +#if (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define HAVE_NDO_DFLT_BRIDGE_GETLINK_VLAN_SUPPORT +#endif + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,27)) +#if (!((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,8) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)) || \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2))) +static inline bool pci_ari_enabled(struct pci_bus *bus) +{ + return bus->self && bus->self->ari_enabled; +} +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2)) +#define HAVE_VF_STATS +#endif /* (RHEL7.2+) */ +#endif /* !(RHEL6.8+ || RHEL7.2+) */ +#else +static inline bool pci_ari_enabled(struct pci_bus *bus) +{ + return false; +} +#endif /* 2.6.27 */ +#else +#define HAVE_NDO_DFLT_BRIDGE_GETLINK_VLAN_SUPPORT +#define HAVE_VF_STATS +#endif /* 4.2.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,2,0))) +/** + * _kc_flow_dissector_key_ipv4_addrs: + * @src: source ip address + * @dst: destination ip address + */ +struct _kc_flow_dissector_key_ipv4_addrs { + __be32 src; + __be32 dst; +}; + +/** + * _kc_flow_dissector_key_ipv6_addrs: + * @src: source ip address + * @dst: destination ip address + */ +struct _kc_flow_dissector_key_ipv6_addrs { + struct in6_addr src; + struct in6_addr dst; +}; + +/** + * _kc_flow_dissector_key_addrs: + * @v4addrs: IPv4 addresses + * @v6addrs: IPv6 addresses + */ +struct _kc_flow_dissector_key_addrs { + union { + struct _kc_flow_dissector_key_ipv4_addrs v4addrs; + struct _kc_flow_dissector_key_ipv6_addrs v6addrs; + }; +}; + +/** + * _kc_flow_dissector_key_tp_ports: + * @ports: port numbers of Transport header + * src: source port number + * dst: destination port number + */ +struct _kc_flow_dissector_key_ports { + union { + __be32 ports; + struct { + __be16 src; + __be16 dst; + }; + }; +}; + +/** + * _kc_flow_dissector_key_basic: + * @n_proto: Network header protocol (eg. IPv4/IPv6) + * @ip_proto: Transport header protocol (eg. TCP/UDP) + * @padding: padding for alignment + */ +struct _kc_flow_dissector_key_basic { + __be16 n_proto; + u8 ip_proto; + u8 padding; +}; + +struct _kc_flow_keys { + struct _kc_flow_dissector_key_basic basic; + struct _kc_flow_dissector_key_ports ports; + struct _kc_flow_dissector_key_addrs addrs; +}; + +/* These are all the include files for kernels inside this #ifdef block that + * have any reference to the in kernel definition of struct flow_keys. The + * reason for putting them here is to make 100% sure that these files do not get + * included after re-defining flow_keys to _kc_flow_keys. 
This is done to + * prevent any possible ABI issues that this structure re-definition could case. + */ +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)) || \ + RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,7) || \ + SLE_VERSION_CODE >= SLE_VERSION(11,4,0)) +#include +#endif /* (>= 3.3.0 && < 4.2.0) || >= RHEL 6.7 || >= SLE 11.4 */ +#if (LINUX_VERSION_CODE == KERNEL_VERSION(4,2,0)) +#include +#endif /* 4.2.0 */ +#include +#include +#include +#include + +#define flow_keys _kc_flow_keys +bool +_kc_skb_flow_dissect_flow_keys(const struct sk_buff *skb, + struct flow_keys *flow, + unsigned int __always_unused flags); +#define skb_flow_dissect_flow_keys _kc_skb_flow_dissect_flow_keys +#endif /* ! >= RHEL 7.4 && ! >= SLES 12.2 */ + +#if ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) || \ + (SLE_VERSION_CODE >= SLE_VERSION(12,2,0))) +#include +#endif /* >= RHEL7.3 || >= SLE12sp2 */ +#else /* >= 4.3.0 */ +#include +#endif /* 4.3.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,4,0)) +#if (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) +#define HAVE_NDO_SET_VF_TRUST +#endif /* (RHEL_RELEASE >= 7.3) */ +#ifndef CONFIG_64BIT +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)) +#include /* 32-bit readq/writeq */ +#else /* 3.3.0 => 4.3.x */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)) +#include +#endif /* 2.6.26 => 3.3.0 */ +#ifndef readq +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + + return low + ((u64)high << 32); +} +#define readq readq +#endif + +#ifndef writeq +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, (u8 *)addr + 4); +} +#define writeq writeq +#endif +#endif /* < 3.3.0 */ +#endif /* !CONFIG_64BIT */ +#else /* < 4.4.0 */ +#define HAVE_NDO_SET_VF_TRUST + +#ifndef CONFIG_64BIT +#include /* 32-bit readq/writeq */ +#endif /* !CONFIG_64BIT */ +#endif /* 4.4.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0)) +/* protect against a likely backport */ +#ifndef NETIF_F_CSUM_MASK +#define NETIF_F_CSUM_MASK NETIF_F_ALL_CSUM +#endif /* NETIF_F_CSUM_MASK */ +#ifndef NETIF_F_SCTP_CRC +#define NETIF_F_SCTP_CRC NETIF_F_SCTP_CSUM +#endif /* NETIF_F_SCTP_CRC */ +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3))) +#define eth_platform_get_mac_address _kc_eth_platform_get_mac_address +int _kc_eth_platform_get_mac_address(struct device *dev __maybe_unused, + u8 *mac_addr __maybe_unused); +#endif /* !(RHEL_RELEASE >= 7.3) */ +#else /* 4.5.0 */ +#if ( LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) ) +#define HAVE_GENEVE_RX_OFFLOAD +#if !defined(HAVE_UDP_ENC_TUNNEL) && IS_ENABLED(CONFIG_GENEVE) +#define HAVE_UDP_ENC_TUNNEL +#endif +#endif /* < 4.8.0 */ +#define HAVE_NETIF_NAPI_ADD_CALLS_NAPI_HASH_ADD +#define HAVE_NETDEV_UPPER_INFO +#endif /* 4.5.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)) +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,3)) +static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)) + return skb->head + skb->csum_start; +#else /* < 2.6.22 */ + return skb_transport_header(skb); +#endif +} 
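Editorial illustration, not part of the patch: placed next to the skb_checksum_start() backport it relies on, a hypothetical helper that computes the checksum start offset the same way with either the backport or the native 4.6+ implementation:

/* Hypothetical example only: byte offset from skb->data at which
 * checksumming begins, derived from skb_checksum_start() above.
 */
static inline unsigned int example_csum_start_offset(const struct sk_buff *skb)
{
	return skb_checksum_start(skb) - skb->data;
}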
+#endif + +#if !(UBUNTU_VERSION_CODE && \ + UBUNTU_VERSION_CODE >= UBUNTU_VERSION(4,4,0,21)) && \ + !(RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2))) && \ + !(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) +static inline void napi_consume_skb(struct sk_buff *skb, + int __always_unused budget) +{ + dev_consume_skb_any(skb); +} + +#endif /* UBUNTU 4,4,0,21, RHEL 7.2, SLES12 SP3 */ +#if !(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) && \ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +{ + * sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); +} +#endif +#if !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,2))) && \ + !(SLE_VERSION_CODE && (SLE_VERSION_CODE > SLE_VERSION(12,3,0))) +static inline void page_ref_inc(struct page *page) +{ + get_page(page); +} +#else +#define HAVE_PAGE_COUNT_BULK_UPDATE +#endif +#ifndef IPV4_USER_FLOW +#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#endif + +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define HAVE_TC_SETUP_CLSFLOWER +#define HAVE_TC_FLOWER_ENC +#endif + +#if ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,7)) || \ + (SLE_VERSION_CODE >= SLE_VERSION(12,2,0))) +#define HAVE_TC_SETUP_CLSU32 +#endif + +#if (SLE_VERSION_CODE >= SLE_VERSION(12,2,0)) +#define HAVE_TC_SETUP_CLSFLOWER +#endif + +#else /* >= 4.6.0 */ +#define HAVE_PAGE_COUNT_BULK_UPDATE +#define HAVE_ETHTOOL_FLOW_UNION_IP6_SPEC +#define HAVE_PTP_CROSSTIMESTAMP +#define HAVE_TC_SETUP_CLSFLOWER +#define HAVE_TC_SETUP_CLSU32 +#endif /* 4.6.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0)) +#if ((SLE_VERSION_CODE >= SLE_VERSION(12,3,0)) ||\ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4))) +#define HAVE_NETIF_TRANS_UPDATE +#endif /* SLES12sp3+ || RHEL7.4+ */ +#if ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,3)) ||\ + (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) +#define HAVE_ETHTOOL_25G_BITS +#define HAVE_ETHTOOL_50G_BITS +#define HAVE_ETHTOOL_100G_BITS +#endif /* RHEL7.3+ || SLES12sp3+ */ +#else /* 4.7.0 */ +#define HAVE_NETIF_TRANS_UPDATE +#define HAVE_ETHTOOL_CONVERT_U32_AND_LINK_MODE +#define HAVE_ETHTOOL_25G_BITS +#define HAVE_ETHTOOL_50G_BITS +#define HAVE_ETHTOOL_100G_BITS +#define HAVE_TCF_MIRRED_REDIRECT +#endif /* 4.7.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0)) +#if !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +enum udp_parsable_tunnel_type { + UDP_TUNNEL_TYPE_VXLAN, + UDP_TUNNEL_TYPE_GENEVE, +}; +struct udp_tunnel_info { + unsigned short type; + sa_family_t sa_family; + __be16 port; +}; +#endif + +#if (UBUNTU_VERSION_CODE && UBUNTU_VERSION_CODE < UBUNTU_VERSION(4,8,0,0)) +#define tc_no_actions(_exts) true +#define tc_for_each_action(_a, _exts) while (0) +#endif +#if !(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) &&\ + !(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +static inline int +#ifdef HAVE_NON_CONST_PCI_DRIVER_NAME +pci_request_io_regions(struct pci_dev *pdev, char *name) +#else +pci_request_io_regions(struct pci_dev *pdev, const char *name) +#endif +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO), name); +} + +static inline void +pci_release_io_regions(struct pci_dev *pdev) +{ + 
return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO)); +} + +static inline int +#ifdef HAVE_NON_CONST_PCI_DRIVER_NAME +pci_request_mem_regions(struct pci_dev *pdev, char *name) +#else +pci_request_mem_regions(struct pci_dev *pdev, const char *name) +#endif +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM), name); +} + +static inline void +pci_release_mem_regions(struct pci_dev *pdev) +{ + return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM)); +} +#endif /* !SLE_VERSION(12,3,0) */ +#if ((RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) ||\ + (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) +#define HAVE_ETHTOOL_NEW_50G_BITS +#endif /* RHEL7.4+ || SLES12sp3+ */ +#else +#define HAVE_UDP_ENC_RX_OFFLOAD +#define HAVE_ETHTOOL_NEW_50G_BITS +#endif /* 4.8.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,9,0)) +#ifdef HAVE_TC_SETUP_CLSFLOWER +#if (!(RHEL_RELEASE_CODE) && !(SLE_VERSION_CODE) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE < SLE_VERSION(12,3,0)))) +#define HAVE_TC_FLOWER_VLAN_IN_TAGS +#endif /* !RHEL_RELEASE_CODE && !SLE_VERSION_CODE || = RHEL_RELEASE_VERSION(7,4)) +#define HAVE_ETHTOOL_NEW_1G_BITS +#define HAVE_ETHTOOL_NEW_10G_BITS +#endif /* RHEL7.4+ */ +#if (!(SLE_VERSION_CODE) && !(RHEL_RELEASE_CODE)) || \ + SLE_VERSION_CODE && (SLE_VERSION_CODE <= SLE_VERSION(12,3,0)) || \ + RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,5)) +#define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a) +#endif /* !SLE_VERSION_CODE && !RHEL_RELEASE_CODE || (SLES <= 12.3.0) || (RHEL <= 7.5) */ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,4)) +static inline void bitmap_from_u64(unsigned long *dst, u64 mask) +{ + dst[0] = mask & ULONG_MAX; + + if (sizeof(mask) > sizeof(unsigned long)) + dst[1] = mask >> 32; +} +#endif /* = RHEL_RELEASE_VERSION(7,4)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,3,0)) && \ + !(UBUNTU_VERSION_CODE >= UBUNTU_VERSION(4,13,0,16))) +static inline bool eth_type_vlan(__be16 ethertype) +{ + switch (ethertype) { + case htons(ETH_P_8021Q): +#ifdef ETH_P_8021AD + case htons(ETH_P_8021AD): +#endif + return true; + default: + return false; + } +} +#endif /* Linux < 4.9 || RHEL < 7.4 || SLES < 12.3 || Ubuntu < 4.3.0-16 */ +#else /* >=4.9 */ +#define HAVE_FLOW_DISSECTOR_KEY_VLAN_PRIO +#define HAVE_ETHTOOL_NEW_1G_BITS +#define HAVE_ETHTOOL_NEW_10G_BITS +#endif /* KERNEL_VERSION(4.9.0) */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0)) +/* SLES 12.3 and RHEL 7.5 backported this interface */ +#if (!SLE_VERSION_CODE && !RHEL_RELEASE_CODE) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE < SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5))) +static inline bool _kc_napi_complete_done2(struct napi_struct *napi, + int __always_unused work_done) +{ + /* it was really hard to get napi_complete_done to be safe to call + * recursively without running into our own kcompat, so just use + * napi_complete + */ + napi_complete(napi); + + /* true means that the stack is telling the driver to go-ahead and + * re-enable interrupts + */ + return true; +} + +#ifdef napi_complete_done +#undef napi_complete_done +#endif +#define napi_complete_done _kc_napi_complete_done2 +#endif /* sles and rhel exclusion for < 4.10 */ +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,4)) +#define 
HAVE_DEV_WALK_API +#define HAVE_ETHTOOL_NEW_2500MB_BITS +#define HAVE_ETHTOOL_5G_BITS +#endif /* RHEL7.4+ */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE == SLE_VERSION(12,3,0))) +#define HAVE_STRUCT_DMA_ATTRS +#endif /* (SLES == 12.3.0) */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) +#define HAVE_NETDEVICE_MIN_MAX_MTU +#endif /* (SLES >= 12.3.0) */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define HAVE_STRUCT_DMA_ATTRS +#define HAVE_RHEL7_EXTENDED_MIN_MAX_MTU +#define HAVE_NETDEVICE_MIN_MAX_MTU +#endif +#if (!(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)))) +#ifndef dma_map_page_attrs +#define dma_map_page_attrs __kc_dma_map_page_attrs +static inline dma_addr_t __kc_dma_map_page_attrs(struct device *dev, + struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir, + unsigned long __always_unused attrs) +{ + return dma_map_page(dev, page, offset, size, dir); +} +#endif + +#ifndef dma_unmap_page_attrs +#define dma_unmap_page_attrs __kc_dma_unmap_page_attrs +static inline void __kc_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long __always_unused attrs) +{ + dma_unmap_page(dev, addr, size, dir); +} +#endif + +static inline void __page_frag_cache_drain(struct page *page, + unsigned int count) +{ +#ifdef HAVE_PAGE_COUNT_BULK_UPDATE + if (!page_ref_sub_and_test(page, count)) + return; + + init_page_count(page); +#else + BUG_ON(count > 1); + if (!count) + return; +#endif + __free_pages(page, compound_order(page)); +} +#endif /* !SLE_VERSION(12,3,0) && !RHEL_VERSION(7,5) */ +#if ((SLE_VERSION_CODE && (SLE_VERSION_CODE > SLE_VERSION(12,3,0))) ||\ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define HAVE_SWIOTLB_SKIP_CPU_SYNC +#endif + +#if ((SLE_VERSION_CODE && (SLE_VERSION_CODE < SLE_VERSION(15,0,0))) ||\ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,4)))) +#define page_frag_free __free_page_frag +#endif +#ifndef ETH_MIN_MTU +#define ETH_MIN_MTU 68 +#endif /* ETH_MIN_MTU */ + +/* If kernel is older than 4.10 but distro is RHEL >= 7.5 || SLES > 12SP4, + * it does have support for NAPI_STATE + */ +#if ((RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) ||\ + (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,4,0)))) +#define HAVE_NAPI_STATE_IN_BUSY_POLL +#endif /* RHEL >= 7.5 || SLES >=12.4 */ +#else /* >= 4.10 */ +#define HAVE_TC_FLOWER_ENC +#define HAVE_NETDEVICE_MIN_MAX_MTU +#define HAVE_SWIOTLB_SKIP_CPU_SYNC +#define HAVE_NETDEV_TC_RESETS_XPS +#define HAVE_XPS_QOS_SUPPORT +#define HAVE_DEV_WALK_API +#define HAVE_ETHTOOL_NEW_2500MB_BITS +#define HAVE_ETHTOOL_5G_BITS +/* kernel 4.10 onwards, as part of busy_poll rewrite, new state were added + * which is part of NAPI:state. 
If NAPI:state=NAPI_STATE_IN_BUSY_POLL, + * it means napi_poll is invoked in busy_poll context + */ +#define HAVE_NAPI_STATE_IN_BUSY_POLL +#define HAVE_TCF_MIRRED_EGRESS_REDIRECT +#define HAVE_PTP_CLOCK_INFO_ADJFINE +#endif /* 4.10.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)) +#ifdef CONFIG_NET_RX_BUSY_POLL +#define HAVE_NDO_BUSY_POLL +#endif /* CONFIG_NET_RX_BUSY_POLL */ +#if ((SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)))) +#define HAVE_VOID_NDO_GET_STATS64 +#endif /* (SLES >= 12.3.0) && (RHEL >= 7.5) */ + +static inline void _kc_dev_kfree_skb_irq(struct sk_buff *skb) +{ + if (!skb) + return; + dev_kfree_skb_irq(skb); +} + +#undef dev_kfree_skb_irq +#define dev_kfree_skb_irq _kc_dev_kfree_skb_irq + +static inline void _kc_dev_consume_skb_irq(struct sk_buff *skb) +{ + if (!skb) + return; + dev_consume_skb_irq(skb); +} + +#undef dev_consume_skb_irq +#define dev_consume_skb_irq _kc_dev_consume_skb_irq + +static inline void _kc_dev_kfree_skb_any(struct sk_buff *skb) +{ + if (!skb) + return; + dev_kfree_skb_any(skb); +} + +#undef dev_kfree_skb_any +#define dev_kfree_skb_any _kc_dev_kfree_skb_any + +static inline void _kc_dev_consume_skb_any(struct sk_buff *skb) +{ + if (!skb) + return; + dev_consume_skb_any(skb); +} + +#undef dev_consume_skb_any +#define dev_consume_skb_any _kc_dev_consume_skb_any + +#else /* > 4.11 */ +#define HAVE_VOID_NDO_GET_STATS64 +#define HAVE_VM_OPS_FAULT_NO_VMA +#endif /* 4.11.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,12,0)) +#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,7) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)) +/* The RHEL 7.7+ NL_SET_ERR_MSG_MOD triggers unused parameter warnings */ +#undef NL_SET_ERR_MSG_MOD +#endif +/* If kernel is older than 4.12 but distro is RHEL >= 7.5 || SLES > 12SP4, + * it does have support for MIN_NAPI_ID + */ +#if ((RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) || \ + (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,4,0)))) +#define HAVE_MIN_NAPI_ID +#endif /* RHEL >= 7.5 || SLES >= 12.4 */ +#ifndef NL_SET_ERR_MSG_MOD +#define NL_SET_ERR_MSG_MOD(extack, msg) \ + do { \ + uninitialized_var(extack); \ + pr_err(KBUILD_MODNAME ": " msg); \ + } while (0) +#endif /* !NL_SET_ERR_MSG_MOD */ +#else /* >= 4.12 */ +#define HAVE_NAPI_BUSY_LOOP +#define HAVE_MIN_NAPI_ID +#endif /* 4.12 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,13,0)) +#if ((SLE_VERSION_CODE && (SLE_VERSION_CODE > SLE_VERSION(12,3,0))) || \ + (RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define HAVE_TCF_EXTS_HAS_ACTION +#endif +#define PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,4,0))) +#define HAVE_PCI_ERROR_HANDLER_RESET_PREPARE +#endif /* SLES >= 12sp4 */ +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5)) && \ + !(SLE_VERSION_CODE >= SLE_VERSION(12,4,0))) +#define UUID_SIZE 16 +typedef struct { + __u8 b[UUID_SIZE]; +} uuid_t; +#define UUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((uuid_t) \ +{{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \ + ((b) >> 8) & 0xff, (b) & 0xff, \ + ((c) >> 8) & 0xff, (c) & 0xff, \ + (d0), 
(d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +static inline bool uuid_equal(const uuid_t *u1, const uuid_t *u2) +{ + return memcmp(u1, u2, sizeof(uuid_t)) == 0; +} +#else +#define HAVE_METADATA_PORT_INFO +#endif /* !(RHEL >= 7.5) && !(SLES >= 12.4) */ +#else /* > 4.13 */ +#define HAVE_METADATA_PORT_INFO +#define HAVE_HWTSTAMP_FILTER_NTP_ALL +#define HAVE_NDO_SETUP_TC_CHAIN_INDEX +#define HAVE_PCI_ERROR_HANDLER_RESET_PREPARE +#define HAVE_PTP_CLOCK_DO_AUX_WORK +#endif /* 4.13.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0)) +#ifdef ETHTOOL_GLINKSETTINGS +#ifndef ethtool_link_ksettings_del_link_mode +#define ethtool_link_ksettings_del_link_mode(ptr, name, mode) \ + __clear_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) +#endif +#endif /* ETHTOOL_GLINKSETTINGS */ +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(12,4,0))) +#define HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +#endif + +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,5))) +#define HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +#define HAVE_RHEL7_NETDEV_OPS_EXT_NDO_SETUP_TC +#endif + +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + +#define timer_setup(timer, callback, flags) \ + __setup_timer((timer), (TIMER_FUNC_TYPE)(callback), \ + (TIMER_DATA_TYPE)(timer), (flags)) + +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + +#ifndef xdp_do_flush_map +#define xdp_do_flush_map() do {} while (0) +#endif +struct _kc_xdp_buff { + void *data; + void *data_end; + void *data_hard_start; +}; +#define xdp_buff _kc_xdp_buff +struct _kc_bpf_prog { +}; +#define bpf_prog _kc_bpf_prog +#ifndef DIV_ROUND_DOWN_ULL +#define DIV_ROUND_DOWN_ULL(ll, d) \ + ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) +#endif /* DIV_ROUND_DOWN_ULL */ +#else /* > 4.14 */ +#define HAVE_XDP_SUPPORT +#define HAVE_NDO_SETUP_TC_REMOVE_TC_TO_NETDEV +#define HAVE_TCF_EXTS_HAS_ACTION +#endif /* 4.14.0 */ + +/*****************************************************************************/ +#ifndef ETHTOOL_GLINKSETTINGS + +#define __ETHTOOL_LINK_MODE_MASK_NBITS 32 +#define ETHTOOL_LINK_MASK_SIZE BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS) + +/** + * struct ethtool_link_ksettings + * @link_modes: supported and advertising, single item arrays + * @link_modes.supported: bitmask of supported link speeds + * @link_modes.advertising: bitmask of currently advertised speeds + * @base: base link details + * @base.speed: current link speed + * @base.port: current port type + * @base.duplex: current duplex mode + * @base.autoneg: current autonegotiation settings + * + * This struct and the following macros provide a way to support the old + * ethtool get/set_settings API on older kernels, but in the style of the new + * GLINKSETTINGS API. In this way, the same code can be used to support both + * APIs as seemlessly as possible. + * + * It should be noted the old API only has support up to the first 32 bits. 
+ */
+struct ethtool_link_ksettings {
+	struct {
+		u32	speed;
+		u8	port;
+		u8	duplex;
+		u8	autoneg;
+	} base;
+	struct {
+		unsigned long supported[ETHTOOL_LINK_MASK_SIZE];
+		unsigned long advertising[ETHTOOL_LINK_MASK_SIZE];
+	} link_modes;
+};
+
+#define ETHTOOL_LINK_NAME_advertising(mode) ADVERTISED_ ## mode
+#define ETHTOOL_LINK_NAME_supported(mode) SUPPORTED_ ## mode
+#define ETHTOOL_LINK_NAME(name) ETHTOOL_LINK_NAME_ ## name
+#define ETHTOOL_LINK_CONVERT(name, mode) ETHTOOL_LINK_NAME(name)(mode)
+
+/**
+ * ethtool_link_ksettings_zero_link_mode
+ * @ptr: ptr to ksettings struct
+ * @name: supported or advertising
+ */
+#define ethtool_link_ksettings_zero_link_mode(ptr, name)\
+	(*((ptr)->link_modes.name) = 0x0)
+
+/**
+ * ethtool_link_ksettings_add_link_mode
+ * @ptr: ptr to ksettings struct
+ * @name: supported or advertising
+ * @mode: link mode to add
+ */
+#define ethtool_link_ksettings_add_link_mode(ptr, name, mode)\
+	(*((ptr)->link_modes.name) |= (typeof(*((ptr)->link_modes.name)))ETHTOOL_LINK_CONVERT(name, mode))
+
+/**
+ * ethtool_link_ksettings_del_link_mode
+ * @ptr: ptr to ksettings struct
+ * @name: supported or advertising
+ * @mode: link mode to delete
+ */
+#define ethtool_link_ksettings_del_link_mode(ptr, name, mode)\
+	(*((ptr)->link_modes.name) &= ~(typeof(*((ptr)->link_modes.name)))ETHTOOL_LINK_CONVERT(name, mode))
+
+/**
+ * ethtool_link_ksettings_test_link_mode
+ * @ptr: ptr to ksettings struct
+ * @name: supported or advertising
+ * @mode: link mode to test
+ */
+#define ethtool_link_ksettings_test_link_mode(ptr, name, mode)\
+	(!!(*((ptr)->link_modes.name) & ETHTOOL_LINK_CONVERT(name, mode)))
+
+/**
+ * _kc_ethtool_ksettings_to_cmd - Convert ethtool_link_ksettings to ethtool_cmd
+ * @ks: ethtool_link_ksettings struct
+ * @cmd: ethtool_cmd struct
+ *
+ * Convert an ethtool_link_ksettings structure into the older ethtool_cmd
+ * structure. We provide this in kcompat.h so that drivers can easily
+ * implement the older .{get|set}_settings as wrappers around the new api.
+ * Hence, we keep it prefixed with _kc_ to make it clear this isn't actually
+ * a real function in the kernel.
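+ *
+ * A sketch of such a wrapper (hypothetical; foo_get_link_ksettings() is an
+ * assumed driver callback, not part of this patch):
+ *
+ *	static int foo_get_settings(struct net_device *netdev,
+ *				    struct ethtool_cmd *cmd)
+ *	{
+ *		struct ethtool_link_ksettings ks;
+ *
+ *		foo_get_link_ksettings(netdev, &ks);
+ *		_kc_ethtool_ksettings_to_cmd(&ks, cmd);
+ *		return 0;
+ *	}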
+ */
+static inline void
+_kc_ethtool_ksettings_to_cmd(struct ethtool_link_ksettings *ks,
+			     struct ethtool_cmd *cmd)
+{
+	cmd->supported = (u32)ks->link_modes.supported[0];
+	cmd->advertising = (u32)ks->link_modes.advertising[0];
+	ethtool_cmd_speed_set(cmd, ks->base.speed);
+	cmd->duplex = ks->base.duplex;
+	cmd->autoneg = ks->base.autoneg;
+	cmd->port = ks->base.port;
+}
+
+#endif /* !ETHTOOL_GLINKSETTINGS */
+
+/*****************************************************************************/
+#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0)) || \
+     (SLE_VERSION_CODE && (SLE_VERSION_CODE <= SLE_VERSION(12,3,0))) || \
+     (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(7,5))))
+#define phy_speed_to_str _kc_phy_speed_to_str
+const char *_kc_phy_speed_to_str(int speed);
+#else /* (LINUX >= 4.14.0) || (SLES > 12.3.0) || (RHEL > 7.5) */
+#include <linux/phy.h>
+#endif /* (LINUX < 4.14.0) || (SLES <= 12.3.0) || (RHEL <= 7.5) */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0))
+#if ((RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,6))) || \
+     (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,1,0))))
+#define HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO
+#define HAVE_TCF_BLOCK
+#else /* !(RHEL >= 7.6) && !(SLES >= 15.1) */
+#endif /* (RHEL >= 7.6) || (SLES >= 15.1) */
+void _kc_ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst,
+				      struct ethtool_link_ksettings *src);
+#define ethtool_intersect_link_masks _kc_ethtool_intersect_link_masks
+#else /* >= 4.15 */
+#define HAVE_NDO_BPF
+#define HAVE_XDP_BUFF_DATA_META
+#define HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO
+#define HAVE_TCF_BLOCK
+#endif /* 4.15.0 */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0))
+#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,7)) && \
+     !(SLE_VERSION_CODE >= SLE_VERSION(12,4,0) && \
+       SLE_VERSION_CODE < SLE_VERSION(15,0,0)) && \
+     !(SLE_VERSION_CODE >= SLE_VERSION(15,1,0)))
+/* The return values of strscpy() and strlcpy() differ, which could be
+ * hazardous in the future. To avoid this, force a void result so that the
+ * function cannot be used for its return value. (A usable return value is
+ * only required on kernels 4.3 through 4.15.)
+ */
+#define strscpy(...) (void)(strlcpy(__VA_ARGS__))
+#endif /* !RHEL >= 7.7 && !SLES12sp4+ && !SLES15sp1+ */
+
+#define pci_printk(level, pdev, fmt, arg...) \
+	dev_printk(level, &(pdev)->dev, fmt, ##arg)
+#define pci_emerg(pdev, fmt, arg...)	dev_emerg(&(pdev)->dev, fmt, ##arg)
+#define pci_alert(pdev, fmt, arg...)	dev_alert(&(pdev)->dev, fmt, ##arg)
+#define pci_crit(pdev, fmt, arg...)	dev_crit(&(pdev)->dev, fmt, ##arg)
+#define pci_err(pdev, fmt, arg...)	dev_err(&(pdev)->dev, fmt, ##arg)
+#define pci_warn(pdev, fmt, arg...)	dev_warn(&(pdev)->dev, fmt, ##arg)
+#define pci_notice(pdev, fmt, arg...)	dev_notice(&(pdev)->dev, fmt, ##arg)
+#define pci_info(pdev, fmt, arg...)	dev_info(&(pdev)->dev, fmt, ##arg)
+#define pci_dbg(pdev, fmt, arg...)	dev_dbg(&(pdev)->dev, fmt, ##arg)
+
+#ifndef array_index_nospec
+static inline unsigned long _kc_array_index_mask_nospec(unsigned long index,
+							unsigned long size)
+{
+	/*
+	 * Always calculate and emit the mask even if the compiler
+	 * thinks the mask is not needed. The compiler does not take
+	 * into account the value of @index under speculation.
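+	 *
+	 * Sketch of why this works, assuming index and size both fit in a
+	 * long: when index < size, neither index nor (size - 1UL - index)
+	 * has the sign bit set, so the arithmetic shift below produces an
+	 * all-ones mask; when index >= size, the subtraction underflows and
+	 * sets the sign bit, so the mask becomes 0 and the masked index
+	 * collapses to 0 rather than a speculated out-of-bounds value.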
+	 */
+	OPTIMIZER_HIDE_VAR(index);
+	return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
+}
+
+#define array_index_nospec(index, size)					\
+({									\
+	typeof(index) _i = (index);					\
+	typeof(size) _s = (size);					\
+	unsigned long _mask = _kc_array_index_mask_nospec(_i, _s);	\
+									\
+	BUILD_BUG_ON(sizeof(_i) > sizeof(long));			\
+	BUILD_BUG_ON(sizeof(_s) > sizeof(long));			\
+									\
+	(typeof(_i)) (_i & _mask);					\
+})
+#endif /* array_index_nospec */
+#ifndef sizeof_field
+#define sizeof_field(TYPE, MEMBER) (sizeof((((TYPE *)0)->MEMBER)))
+#endif /* sizeof_field */
+#if !(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0)) && \
+    !(SLE_VERSION_CODE >= SLE_VERSION(12,5,0) && \
+      SLE_VERSION_CODE < SLE_VERSION(15,0,0) || \
+      SLE_VERSION_CODE >= SLE_VERSION(15,1,0))
+/*
+ * Copy bitmap and clear tail bits in last word.
+ */
+static inline void
+bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits)
+{
+	bitmap_copy(dst, src, nbits);
+	if (nbits % BITS_PER_LONG)
+		dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits);
+}
+
+/*
+ * On 32-bit systems bitmaps are represented as u32 arrays internally, and
+ * therefore conversion is not needed when copying data from/to arrays of u32.
+ */
+#if BITS_PER_LONG == 64
+void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits);
+#else
+#define bitmap_from_arr32(bitmap, buf, nbits)			\
+	bitmap_copy_clear_tail((unsigned long *) (bitmap),	\
+			       (const unsigned long *) (buf), (nbits))
+#endif /* BITS_PER_LONG == 64 */
+#endif /* !(RHEL >= 8.0) && !(SLES >= 12.5 && SLES < 15.0 || SLES >= 15.1) */
+#else /* >= 4.16 */
+#include <linux/nospec.h>
+#define HAVE_XDP_BUFF_RXQ
+#define HAVE_TC_FLOWER_OFFLOAD_COMMON_EXTACK
+#define HAVE_TCF_MIRRED_DEV
+#define HAVE_VF_STATS_DROPPED
+#endif /* 4.16.0 */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,17,0))
+#include <linux/pci_regs.h>
+#include <linux/pci.h>
+#define PCIE_SPEED_16_0GT 0x17
+#define PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */
+#define PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */
+#define PCI_EXP_LNKCAP2_SLS_16_0GB 0x00000010 /* Supported Speed 16GT/s */
+void _kc_pcie_print_link_status(struct pci_dev *dev);
+#define pcie_print_link_status _kc_pcie_print_link_status
+#else /* >= 4.17.0 */
+#define HAVE_XDP_BUFF_IN_XDP_H
+#endif /* 4.17.0 */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0))
+#include "kcompat_overflow.h"
+
+#if (SLE_VERSION_CODE < SLE_VERSION(15,1,0))
+#define firmware_request_nowarn	request_firmware_direct
+#endif /* SLES < 15.1 */
+
+#else
+#include <linux/overflow.h>
+#include <net/xdp_sock.h>
+#define HAVE_XDP_FRAME_STRUCT
+#define HAVE_XDP_SOCK
+#define HAVE_NDO_XDP_XMIT_BULK_AND_FLAGS
+#define NO_NDO_XDP_FLUSH
+#define HAVE_AF_XDP_SUPPORT
+#endif /* 4.18.0 */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0))
+#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0)) && \
+     (RHEL_RELEASE_CODE <= RHEL_RELEASE_VERSION(8,2)))
+#define HAVE_DEVLINK_REGIONS
+#endif /* RHEL >= 8.0 && RHEL <= 8.2 */
+#define bitmap_alloc(nbits, flags) \
+	kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags)
+#define bitmap_zalloc(nbits, flags) bitmap_alloc(nbits, ((flags) | __GFP_ZERO))
+#define bitmap_free(bitmap) kfree(bitmap)
+#ifdef ETHTOOL_GLINKSETTINGS
+#define ethtool_ks_clear(ptr, name) \
+	
ethtool_link_ksettings_zero_link_mode(ptr, name) +#define ethtool_ks_add_mode(ptr, name, mode) \ + ethtool_link_ksettings_add_link_mode(ptr, name, mode) +#define ethtool_ks_del_mode(ptr, name, mode) \ + ethtool_link_ksettings_del_link_mode(ptr, name, mode) +#define ethtool_ks_test(ptr, name, mode) \ + ethtool_link_ksettings_test_link_mode(ptr, name, mode) +#endif /* ETHTOOL_GLINKSETTINGS */ +#define HAVE_NETPOLL_CONTROLLER +#define REQUIRE_PCI_CLEANUP_AER_ERROR_STATUS +#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,1,0))) +#define HAVE_TCF_MIRRED_DEV +#define HAVE_NDO_SELECT_QUEUE_SB_DEV +#define HAVE_TCF_BLOCK_CB_REGISTER_EXTACK +#endif + +static inline void __kc_metadata_dst_free(void *md_dst) +{ + kfree(md_dst); +} + +#define metadata_dst_free(md_dst) __kc_metadata_dst_free(md_dst) +#else /* >= 4.19.0 */ +#define HAVE_TCF_BLOCK_CB_REGISTER_EXTACK +#define NO_NETDEV_BPF_PROG_ATTACHED +#define HAVE_NDO_SELECT_QUEUE_SB_DEV +#define HAVE_NETDEV_SB_DEV +#define HAVE_TCF_VLAN_TPID +#define HAVE_RHASHTABLE_TYPES +#define HAVE_DEVLINK_REGIONS +#define HAVE_DEVLINK_PARAMS +#endif /* 4.19.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)) +#define HAVE_XDP_UMEM_PROPS +#ifdef HAVE_AF_XDP_SUPPORT +#ifndef napi_if_scheduled_mark_missed +static inline bool __kc_napi_if_scheduled_mark_missed(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (val & NAPIF_STATE_DISABLE) + return true; + + if (!(val & NAPIF_STATE_SCHED)) + return false; + + new = val | NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return true; +} + +#define napi_if_scheduled_mark_missed __kc_napi_if_scheduled_mark_missed +#endif /* !napi_if_scheduled_mark_missed */ +#endif /* HAVE_AF_XDP_SUPPORT */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,0))) +#define HAVE_DEVLINK_ESWITCH_OPS_EXTACK +#endif /* RHEL >= 8.0 */ +#if ((SLE_VERSION_CODE >= SLE_VERSION(12,5,0) && \ + SLE_VERSION_CODE < SLE_VERSION(15,0,0)) || \ + (SLE_VERSION_CODE >= SLE_VERSION(15,1,0))) +#define HAVE_DEVLINK_ESWITCH_OPS_EXTACK +#endif /* SLE == 12sp5 || SLE >= 15sp1 */ +#else /* >= 4.20.0 */ +#define HAVE_DEVLINK_ESWITCH_OPS_EXTACK +#define HAVE_AF_XDP_ZC_SUPPORT +#define HAVE_VXLAN_TYPE +#define HAVE_ETF_SUPPORT /* Earliest TxTime First */ +#endif /* 4.20.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,0,0)) +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(8,0))) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,12,0)) +#define NETLINK_MAX_COOKIE_LEN 20 +struct netlink_ext_ack { + const char *_msg; + const struct nlattr *bad_attr; + u8 cookie[NETLINK_MAX_COOKIE_LEN]; + u8 cookie_len; +}; + +#endif /* < 4.12 */ +static inline int _kc_dev_open(struct net_device *netdev, + struct netlink_ext_ack __always_unused *extack) +{ + return dev_open(netdev); +} + +#define dev_open _kc_dev_open + +static inline int +_kc_dev_change_flags(struct net_device *netdev, unsigned int flags, + struct netlink_ext_ack __always_unused *extack) +{ + return dev_change_flags(netdev, flags); +} + +#define dev_change_flags _kc_dev_change_flags +#endif /* !(RHEL_RELEASE_CODE && RHEL > RHEL(8,0)) */ +#if (RHEL_RELEASE_CODE && \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,7) && \ + RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)) || \ + (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,1))) +#define 
HAVE_PTP_SYS_OFFSET_EXTENDED_IOCTL
+#define HAVE_PTP_CLOCK_INFO_GETTIMEX64
+#else /* !((RHEL >= 7.7 && RHEL < 8.0) || RHEL >= 8.1) */
+struct ptp_system_timestamp {
+	struct timespec64 pre_ts;
+	struct timespec64 post_ts;
+};
+
+static inline void
+ptp_read_system_prets(struct ptp_system_timestamp __always_unused *sts)
+{
+	;
+}
+
+static inline void
+ptp_read_system_postts(struct ptp_system_timestamp __always_unused *sts)
+{
+	;
+}
+#endif /* (RHEL >= 7.7 && RHEL < 8.0) || RHEL >= 8.1 */
+#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,1)))
+#define HAVE_NDO_BRIDGE_SETLINK_EXTACK
+#endif /* RHEL 8.1 */
+#if (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2))
+#define HAVE_TC_INDIR_BLOCK
+#endif /* RHEL 8.2 */
+#define INDIRECT_CALLABLE_DECLARE(x) x
+#else /* >= 5.0.0 */
+#define HAVE_PTP_SYS_OFFSET_EXTENDED_IOCTL
+#define HAVE_PTP_CLOCK_INFO_GETTIMEX64
+#define HAVE_NDO_BRIDGE_SETLINK_EXTACK
+#define HAVE_DMA_ALLOC_COHERENT_ZEROES_MEM
+#define HAVE_GENEVE_TYPE
+#define HAVE_TC_INDIR_BLOCK
+#define HAVE_INDIRECT_CALL_WRAPPER_HEADER
+#endif /* 5.0.0 */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,1,0))
+#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,1)))
+#define HAVE_TC_FLOW_RULE_INFRASTRUCTURE
+#define HAVE_NDO_FDB_ADD_EXTACK
+#define HAVE_DEVLINK_INFO_GET
+#define HAVE_DEVLINK_FLASH_UPDATE
+#else /* RHEL < 8.1 */
+#ifdef HAVE_TC_SETUP_CLSFLOWER
+#include <net/flow_dissector.h>
+
+struct flow_match {
+	struct flow_dissector	*dissector;
+	void			*mask;
+	void			*key;
+};
+
+struct flow_match_basic {
+	struct flow_dissector_key_basic *key, *mask;
+};
+
+struct flow_match_control {
+	struct flow_dissector_key_control *key, *mask;
+};
+
+struct flow_match_eth_addrs {
+	struct flow_dissector_key_eth_addrs *key, *mask;
+};
+
+#ifdef HAVE_TC_FLOWER_ENC
+struct flow_match_enc_keyid {
+	struct flow_dissector_key_keyid *key, *mask;
+};
+#endif
+
+#ifndef HAVE_TC_FLOWER_VLAN_IN_TAGS
+struct flow_match_vlan {
+	struct flow_dissector_key_vlan *key, *mask;
+};
+#endif
+
+struct flow_match_ipv4_addrs {
+	struct flow_dissector_key_ipv4_addrs *key, *mask;
+};
+
+struct flow_match_ipv6_addrs {
+	struct flow_dissector_key_ipv6_addrs *key, *mask;
+};
+
+struct flow_match_ports {
+	struct flow_dissector_key_ports *key, *mask;
+};
+
+struct flow_rule {
+	struct flow_match	match;
+};
+
+void flow_rule_match_basic(const struct flow_rule *rule,
+			   struct flow_match_basic *out);
+void flow_rule_match_control(const struct flow_rule *rule,
+			     struct flow_match_control *out);
+void flow_rule_match_eth_addrs(const struct flow_rule *rule,
+			       struct flow_match_eth_addrs *out);
+#ifndef HAVE_TC_FLOWER_VLAN_IN_TAGS
+void flow_rule_match_vlan(const struct flow_rule *rule,
+			  struct flow_match_vlan *out);
+#endif
+void flow_rule_match_ipv4_addrs(const struct flow_rule *rule,
+				struct flow_match_ipv4_addrs *out);
+void flow_rule_match_ipv6_addrs(const struct flow_rule *rule,
+				struct flow_match_ipv6_addrs *out);
+void flow_rule_match_ports(const struct flow_rule *rule,
+			   struct flow_match_ports *out);
+#ifdef HAVE_TC_FLOWER_ENC
+void flow_rule_match_enc_ports(const struct flow_rule *rule,
+			       struct flow_match_ports *out);
+void flow_rule_match_enc_control(const struct flow_rule *rule,
+				 struct flow_match_control *out);
+void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule,
+				    struct flow_match_ipv4_addrs *out);
+void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule,
+				    struct flow_match_ipv6_addrs *out);
+void 
flow_rule_match_enc_keyid(const struct flow_rule *rule, + struct flow_match_enc_keyid *out); +#endif + +static inline struct flow_rule * +tc_cls_flower_offload_flow_rule(struct tc_cls_flower_offload *tc_flow_cmd) +{ + return (struct flow_rule *)&tc_flow_cmd->dissector; +} + +static inline bool flow_rule_match_key(const struct flow_rule *rule, + enum flow_dissector_key_id key) +{ + return dissector_uses_key(rule->match.dissector, key); +} +#endif /* HAVE_TC_SETUP_CLSFLOWER */ + +#endif /* RHEL < 8.1 */ + +#if (!(RHEL_RELEASE_CODE && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,1))) +#define devlink_params_publish(devlink) do { } while (0) +#define devlink_params_unpublish(devlink) do { } while (0) +#endif + +#else /* >= 5.1.0 */ +#define HAVE_NDO_FDB_ADD_EXTACK +#define NO_XDP_QUERY_XSK_UMEM +#define HAVE_AF_XDP_NETDEV_UMEM +#define HAVE_TC_FLOW_RULE_INFRASTRUCTURE +#define HAVE_TC_FLOWER_ENC_IP +#define HAVE_DEVLINK_INFO_GET +#define HAVE_DEVLINK_FLASH_UPDATE +#define HAVE_DEVLINK_PORT_PARAMS +#endif /* 5.1.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,2,0)) +#if (defined HAVE_SKB_XMIT_MORE) && \ +(!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)))) +#define netdev_xmit_more() (skb->xmit_more) +#else +#define netdev_xmit_more() (0) +#endif + +#if (!(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2)))) +#ifndef eth_get_headlen +static inline u32 +__kc_eth_get_headlen(const struct net_device __always_unused *dev, void *data, + unsigned int len) +{ + return eth_get_headlen(data, len); +} + +#define eth_get_headlen(dev, data, len) __kc_eth_get_headlen(dev, data, len) +#endif /* !eth_get_headlen */ +#endif /* !RHEL >= 8.2 */ + +#ifndef mmiowb +#ifdef CONFIG_IA64 +#define mmiowb() asm volatile ("mf.a" ::: "memory") +#else +#define mmiowb() +#endif +#endif /* mmiowb */ + +#if (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(8,1)) +#define HAVE_NDO_GET_DEVLINK_PORT +#endif /* RHEL > 8.1 */ + +#else /* >= 5.2.0 */ +#define HAVE_NDO_SELECT_QUEUE_FALLBACK_REMOVED +#define SPIN_UNLOCK_IMPLIES_MMIOWB +#define HAVE_NDO_GET_DEVLINK_PORT +#endif /* 5.2.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)) +#if (!(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,2))) +#define flow_block_offload tc_block_offload +#define flow_block_command tc_block_command +#define flow_cls_offload tc_cls_flower_offload +#define flow_block_binder_type tcf_block_binder_type +#define flow_cls_common_offload tc_cls_common_offload +#define flow_cls_offload_flow_rule tc_cls_flower_offload_flow_rule +#define FLOW_CLS_REPLACE TC_CLSFLOWER_REPLACE +#define FLOW_CLS_DESTROY TC_CLSFLOWER_DESTROY +#define FLOW_CLS_STATS TC_CLSFLOWER_STATS +#define FLOW_CLS_TMPLT_CREATE TC_CLSFLOWER_TMPLT_CREATE +#define FLOW_CLS_TMPLT_DESTROY TC_CLSFLOWER_TMPLT_DESTROY +#define FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS \ + TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS +#define FLOW_BLOCK_BIND TC_BLOCK_BIND +#define FLOW_BLOCK_UNBIND TC_BLOCK_UNBIND + +#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +#include + +int _kc_flow_block_cb_setup_simple(struct flow_block_offload *f, + struct list_head *driver_list, + tc_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + bool ingress_only); + +#define flow_block_cb_setup_simple(f, driver_list, cb, cb_ident, cb_priv, \ + ingress_only) \ + _kc_flow_block_cb_setup_simple(f, driver_list, cb, cb_ident, cb_priv, \ + ingress_only) +#endif /* 
HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */ +#else /* RHEL >= 8.2 */ +#define HAVE_FLOW_BLOCK_API +#define HAVE_DEVLINK_PORT_ATTR_PCI_VF +#endif /* RHEL >= 8.2 */ + +#ifndef ETH_P_LLDP +#define ETH_P_LLDP 0x88CC +#endif /* !ETH_P_LLDP */ + +#else /* >= 5.3.0 */ +#define XSK_UMEM_RETURNS_XDP_DESC +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0)) +#if !(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0)) +#define HAVE_XSK_UMEM_HAS_ADDRS +#endif /* SLE < 15.3 */ +#endif /* < 5.8.0*/ +#define HAVE_FLOW_BLOCK_API +#define HAVE_DEVLINK_PORT_ATTR_PCI_VF +#if IS_ENABLED(CONFIG_DIMLIB) +#define HAVE_CONFIG_DIMLIB +#endif +#endif /* 5.3.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0)) +#if (SLE_VERSION_CODE >= SLE_VERSION(15,2,0)) +#define HAVE_NDO_XSK_WAKEUP +#endif /* SLES15sp2 */ +#else /* >= 5.4.0 */ +#define HAVE_NDO_XSK_WAKEUP +#endif /* 5.4.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,5,0)) +static inline unsigned long _kc_bitmap_get_value8(const unsigned long *map, + unsigned long start) +{ + const size_t index = BIT_WORD(start); + const unsigned long offset = start % BITS_PER_LONG; + + return (map[index] >> offset) & 0xFF; +} +#define bitmap_get_value8 _kc_bitmap_get_value8 + +static inline void _kc_bitmap_set_value8(unsigned long *map, + unsigned long value, + unsigned long start) +{ + const size_t index = BIT_WORD(start); + const unsigned long offset = start % BITS_PER_LONG; + + map[index] &= ~(0xFFUL << offset); + map[index] |= value << offset; +} +#define bitmap_set_value8 _kc_bitmap_set_value8 + +#endif /* 5.5.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,6,0)) +#ifdef HAVE_AF_XDP_SUPPORT +#define xsk_umem_release_addr xsk_umem_discard_addr +#define xsk_umem_release_addr_rq xsk_umem_discard_addr_rq +#endif /* HAVE_AF_XDP_SUPPORT */ +#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,3)) || \ + (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0))) +#define HAVE_TX_TIMEOUT_TXQUEUE +#endif +#else /* >= 5.6.0 */ +#define HAVE_TX_TIMEOUT_TXQUEUE +#endif /* 5.6.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,7,0)) +u64 _kc_pci_get_dsn(struct pci_dev *dev); +#define pci_get_dsn(dev) _kc_pci_get_dsn(dev) +#if !(SLE_VERSION_CODE > SLE_VERSION(15,2,0)) && \ + !((LINUX_VERSION_CODE == KERNEL_VERSION(5,3,18)) && \ + (SLE_LOCALVERSION_CODE >= KERNEL_VERSION(14,0,0))) && \ + !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,3))) +#define pci_aer_clear_nonfatal_status pci_cleanup_aer_uncorrect_error_status +#endif + +#define cpu_latency_qos_update_request pm_qos_update_request +#define cpu_latency_qos_add_request(arg1, arg2) pm_qos_add_request(arg1, PM_QOS_CPU_DMA_LATENCY, arg2) +#define cpu_latency_qos_remove_request pm_qos_remove_request + +#ifndef DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID +#define DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID "fw.bundle_id" +#endif +#else /* >= 5.7.0 */ +#define HAVE_DEVLINK_REGION_OPS_SNAPSHOT +#define HAVE_ETHTOOL_COALESCE_PARAMS_SUPPORT +#endif /* 5.7.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0)) +#if !(RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,4))) && \ + 
!(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0))
+#define xdp_convert_buff_to_frame convert_to_xdp_frame
+#endif /* !(RHEL >= 8.4) && !(SLE >= 15.3) */
+#define flex_array_size(p, member, count) \
+	array_size(count, sizeof(*(p)->member) + __must_be_array((p)->member))
+#if (!(SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0)))
+#ifdef HAVE_AF_XDP_ZC_SUPPORT
+#ifndef xsk_umem_get_rx_frame_size
+static inline u32 _xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+	return umem->chunk_size_nohr - XDP_PACKET_HEADROOM;
+}
+
+#define xsk_umem_get_rx_frame_size _xsk_umem_get_rx_frame_size
+#endif /* xsk_umem_get_rx_frame_size */
+#endif /* HAVE_AF_XDP_ZC_SUPPORT */
+#else /* SLE >= 15.3 */
+#define HAVE_XDP_BUFF_FRAME_SZ
+#define HAVE_MEM_TYPE_XSK_BUFF_POOL
+#endif /* SLE >= 15.3 */
+#else /* >= 5.8.0 */
+#define HAVE_TC_FLOW_INDIR_DEV
+#define HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP
+#define HAVE_XDP_BUFF_FRAME_SZ
+#define HAVE_MEM_TYPE_XSK_BUFF_POOL
+#endif /* 5.8.0 */
+#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,3)))
+#define HAVE_TC_FLOW_INDIR_DEV
+#endif
+#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,3,0)))
+#define HAVE_TC_FLOW_INDIR_DEV
+#endif /* SLE_VERSION_CODE && SLE_VERSION_CODE >= SLES15SP3 */
+
+/*****************************************************************************/
+#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8,4)))
+#define HAVE_TC_FLOW_INDIR_BLOCK_CLEANUP
+#endif /* (RHEL >= 8.4) */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,9,0))
+#else /* >= 5.9.0 */
+#define HAVE_FLOW_INDIR_BLOCK_QDISC
+#define HAVE_UDP_TUNNEL_NIC_INFO
+#endif /* 5.9.0 */
+#if (RHEL_RELEASE_CODE && (RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(8,3)))
+#define HAVE_FLOW_INDIR_BLOCK_QDISC
+#endif
+#if (SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,3,0)))
+#define HAVE_FLOW_INDIR_BLOCK_QDISC
+#endif /* SLE_VERSION_CODE && SLE_VERSION_CODE >= SLES15SP3 */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0))
+#if (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(15,3,0))
+#define HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS
+#define HAVE_DEVLINK_FLASH_UPDATE_PARAMS
+#else /* !(SLE >= 15.3) */
+struct devlink_flash_update_params {
+	const char *file_name;
+	const char *component;
+	u32 overwrite_mask;
+};
+
+#ifndef DEVLINK_FLASH_OVERWRITE_SETTINGS
+#define DEVLINK_FLASH_OVERWRITE_SETTINGS BIT(0)
+#endif
+
+#ifndef DEVLINK_FLASH_OVERWRITE_IDENTIFIERS
+#define DEVLINK_FLASH_OVERWRITE_IDENTIFIERS BIT(1)
+#endif
+#endif /* !(SLE >= 15.3) */
+
+#if (!(SLE_VERSION_CODE && (SLE_VERSION_CODE >= SLE_VERSION(15,3,0))))
+#define XDP_SETUP_XSK_POOL XDP_SETUP_XSK_UMEM
+#define xsk_get_pool_from_qid xdp_get_umem_from_qid
+#define xsk_pool_get_rx_frame_size xsk_umem_get_rx_frame_size
+#define xsk_pool_set_rxq_info xsk_buff_set_rxq_info
+#define xsk_pool_dma_unmap xsk_buff_dma_unmap
+#define xsk_pool_dma_map xsk_buff_dma_map
+#define xsk_tx_peek_desc xsk_umem_consume_tx
+#define xsk_tx_release xsk_umem_consume_tx_done
+#define xsk_tx_completed xsk_umem_complete_tx
+#define xsk_uses_need_wakeup xsk_umem_uses_need_wakeup
+#ifdef HAVE_MEM_TYPE_XSK_BUFF_POOL
+#include <net/xdp_sock_drv.h>
+static inline void
+_kc_xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp,
+			      void __always_unused *pool)
+{
+	xsk_buff_dma_sync_for_cpu(xdp);
+}
+
+#define xsk_buff_dma_sync_for_cpu(xdp, pool) \
+	_kc_xsk_buff_dma_sync_for_cpu(xdp, pool)
+#endif /* HAVE_MEM_TYPE_XSK_BUFF_POOL */
+#else /* SLE >= 15.3 */
+#define HAVE_NETDEV_BPF_XSK_POOL
+#endif /* SLE >= 15.3 */
+#else /* >= 5.10.0 */
+#define HAVE_DEVLINK_REGION_OPS_SNAPSHOT_OPS
+#define HAVE_DEVLINK_FLASH_UPDATE_PARAMS
+#define HAVE_NETDEV_BPF_XSK_POOL
+#endif /* 5.10.0 */
+
+/*****************************************************************************/
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,11,0))
+#ifdef HAVE_XDP_BUFF_RXQ
+#include <net/xdp.h>
+static inline int
+_kc_xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev,
+		     u32 queue_index, unsigned int __always_unused napi_id)
+{
+	return xdp_rxq_info_reg(xdp_rxq, dev, queue_index);
+}
+
+#define xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id) \
+	_kc_xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id)
+#endif /* HAVE_XDP_BUFF_RXQ */
+#ifdef HAVE_NAPI_BUSY_LOOP
+#ifdef CONFIG_NET_RX_BUSY_POLL
+#include <net/busy_poll.h>
+static inline void
+_kc_napi_busy_loop(unsigned int napi_id,
+		   bool (*loop_end)(void *, unsigned long), void *loop_end_arg,
+		   bool __always_unused prefer_busy_poll,
+		   u16 __always_unused budget)
+{
+	napi_busy_loop(napi_id, loop_end, loop_end_arg);
+}
+
+#define napi_busy_loop(napi_id, loop_end, loop_end_arg, prefer_busy_poll, budget) \
+	_kc_napi_busy_loop(napi_id, loop_end, loop_end_arg, prefer_busy_poll, budget)
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+#endif /* HAVE_NAPI_BUSY_LOOP */
+#define HAVE_DEVLINK_FLASH_UPDATE_BEGIN_END_NOTIFY
+#else /* >= 5.11.0 */
+#define HAVE_DEVLINK_FLASH_UPDATE_PARAMS_FW
+#endif /* 5.11.0 */
+
+/*
+ * Load the implementations file which actually defines kcompat backports.
+ * Legacy backports still exist in this file, but all new backports must be
+ * implemented using kcompat_*defs.h and kcompat_impl.h
+ */
+#include "kcompat_impl.h"
+
+#endif /* _KCOMPAT_H_ */
diff --git a/drivers/net/ethernet/intel/ice/kcompat_dim.c b/drivers/net/ethernet/intel/ice/kcompat_dim.c
new file mode 100644
index 0000000000000000000000000000000000000000..f5228e9718e4cf35b135e2bca76f2d169ba9a261
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/kcompat_dim.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include "kcompat.h"
+#include "kcompat_dim.h"
+
+bool dim_on_top(struct dim *dim)
+{
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		return true;
+	case DIM_GOING_RIGHT:
+		return (dim->steps_left > 1) && (dim->steps_right == 1);
+	default: /* DIM_GOING_LEFT */
+		return (dim->steps_right > 1) && (dim->steps_left == 1);
+	}
+}
+
+void dim_turn(struct dim *dim)
+{
+	switch (dim->tune_state) {
+	case DIM_PARKING_ON_TOP:
+	case DIM_PARKING_TIRED:
+		break;
+	case DIM_GOING_RIGHT:
+		dim->tune_state = DIM_GOING_LEFT;
+		dim->steps_left = 0;
+		break;
+	case DIM_GOING_LEFT:
+		dim->tune_state = DIM_GOING_RIGHT;
+		dim->steps_right = 0;
+		break;
+	}
+}
+
+void dim_park_on_top(struct dim *dim)
+{
+	dim->steps_right = 0;
+	dim->steps_left = 0;
+	dim->tired = 0;
+	dim->tune_state = DIM_PARKING_ON_TOP;
+}
+
+void dim_park_tired(struct dim *dim)
+{
+	dim->steps_right = 0;
+	dim->steps_left = 0;
+	dim->tune_state = DIM_PARKING_TIRED;
+}
+
+void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+		    struct dim_stats *curr_stats)
+{
+	/* u32 holds up to 71 minutes, should be enough */
+	u32 delta_us = ktime_us_delta(end->time, start->time);
+	u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
+	u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
+			     start->byte_ctr);
+	u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr,
+			     start->comp_ctr);
+
+	if (!delta_us)
+		return;
+
+	curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
+	curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
+	curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC,
+					delta_us);
+	curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us);
+	if (curr_stats->epms != 0)
+		curr_stats->cpe_ratio = DIV_ROUND_DOWN_ULL(
+			curr_stats->cpms * 100, curr_stats->epms);
+	else
+		curr_stats->cpe_ratio = 0;
+
+}
diff --git a/drivers/net/ethernet/intel/ice/kcompat_dim.h b/drivers/net/ethernet/intel/ice/kcompat_dim.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab8da64c8a8369b4b681fa3e96a96ca095b70da2
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/kcompat_dim.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef _KCOMPAT_DIM_H_
+#define _KCOMPAT_DIM_H_
+
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/*
+ * Number of events between DIM iterations.
+ * Causes a moderation of the algorithm run.
+ */
+#define DIM_NEVENTS 64
+
+/*
+ * Check if the difference between two values justifies taking an action.
+ * We consider a 10% difference as significant.
+ */
+#define IS_SIGNIFICANT_DIFF(val, ref) \
+	(((100UL * abs((val) - (ref))) / (ref)) > 10)
+
+/*
+ * Calculate the gap between two values.
+ * Take wrap-around and variable size into consideration.
+ */
+#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) \
+		& (BIT_ULL(bits) - 1))
+
+/**
+ * struct dim_cq_moder - Structure for CQ moderation values.
+ * Used for communications between DIM and its consumer.
+ *
+ * @usec: CQ timer suggestion (by DIM)
+ * @pkts: CQ packet counter suggestion (by DIM)
+ * @comps: Completion counter
+ * @cq_period_mode: CQ period count mode (from CQE/EQE)
+ */
+struct dim_cq_moder {
+	u16 usec;
+	u16 pkts;
+	u16 comps;
+	u8 cq_period_mode;
+};
+
+/**
+ * struct dim_sample - Structure for DIM sample data.
+ * Used for communications between DIM and its consumer. + * + * @time: Sample timestamp + * @pkt_ctr: Number of packets + * @byte_ctr: Number of bytes + * @event_ctr: Number of events + * @comp_ctr: Current completion counter + */ +struct dim_sample { + ktime_t time; + u32 pkt_ctr; + u32 byte_ctr; + u16 event_ctr; + u32 comp_ctr; +}; + +/** + * struct dim_stats - Structure for DIM stats. + * Used for holding current measured rates. + * + * @ppms: Packets per msec + * @bpms: Bytes per msec + * @epms: Events per msec + * @cpms: Completions per msec + * @cpe_ratio: Ratio of completions to events + */ +struct dim_stats { + int ppms; /* packets per msec */ + int bpms; /* bytes per msec */ + int epms; /* events per msec */ + int cpms; /* completions per msec */ + int cpe_ratio; /* ratio of completions to events */ +}; + +/** + * struct dim - Main structure for dynamic interrupt moderation (DIM). + * Used for holding all information about a specific DIM instance. + * + * @state: Algorithm state (see below) + * @prev_stats: Measured rates from previous iteration (for comparison) + * @start_sample: Sampled data at start of current iteration + * @measuring_sample: A &dim_sample that is used to update the current events + * @work: Work to perform on action required + * @priv: A pointer to the struct that points to dim + * @profile_ix: Current moderation profile + * @mode: CQ period count mode + * @tune_state: Algorithm tuning state (see below) + * @steps_right: Number of steps taken towards higher moderation + * @steps_left: Number of steps taken towards lower moderation + * @tired: Parking depth counter + */ +struct dim { + u8 state; + struct dim_stats prev_stats; + struct dim_sample start_sample; + struct dim_sample measuring_sample; + struct work_struct work; + void *priv; + u8 profile_ix; + u8 mode; + u8 tune_state; + u8 steps_right; + u8 steps_left; + u8 tired; +}; + +/** + * enum dim_cq_period_mode - Modes for CQ period count + * + * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE + * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset) + * @DIM_CQ_PERIOD_NUM_MODES: Number of modes + */ +enum dim_cq_period_mode { + DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, + DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, + DIM_CQ_PERIOD_NUM_MODES +}; + +/** + * enum dim_state - DIM algorithm states + * + * These will determine if the algorithm is in a valid state to start an iteration. + * + * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile) + * @DIM_MEASURE_IN_PROGRESS: Algorithm is already in progress - check if + * need to perform an action + * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure + */ +enum dim_state { + DIM_START_MEASURE, + DIM_MEASURE_IN_PROGRESS, + DIM_APPLY_NEW_PROFILE, +}; + +/** + * enum dim_tune_state - DIM algorithm tune states + * + * These will determine which action the algorithm should perform. + * + * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference + * @DIM_PARKING_TIRED: Algorithm found a deep top point - don't exit if tired > 0 + * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels + * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels + */ +enum dim_tune_state { + DIM_PARKING_ON_TOP, + DIM_PARKING_TIRED, + DIM_GOING_RIGHT, + DIM_GOING_LEFT, +}; + +/** + * enum dim_stats_state - DIM algorithm statistics states + * + * These will determine the verdict of current iteration. 
+ *
+ * @DIM_STATS_WORSE: Current iteration shows worse performance than before
+ * @DIM_STATS_SAME: Current iteration shows the same performance as before
+ * @DIM_STATS_BETTER: Current iteration shows better performance than before
+ */
+enum dim_stats_state {
+	DIM_STATS_WORSE,
+	DIM_STATS_SAME,
+	DIM_STATS_BETTER,
+};
+
+/**
+ * enum dim_step_result - DIM algorithm step results
+ *
+ * These describe the result of a step.
+ *
+ * @DIM_STEPPED: Performed a regular step
+ * @DIM_TOO_TIRED: Same kind of step was done multiple times - should go to
+ * tired parking
+ * @DIM_ON_EDGE: Stepped to the leftmost/rightmost profile
+ */
+enum dim_step_result {
+	DIM_STEPPED,
+	DIM_TOO_TIRED,
+	DIM_ON_EDGE,
+};
+
+/**
+ * dim_on_top - check if current state is a good place to stop (top location)
+ * @dim: DIM context
+ *
+ * Check if current profile is a good place to park at.
+ * This will result in reducing the DIM checks frequency, as we assume the
+ * profile probably should not change unless the traffic pattern changes.
+ */
+bool dim_on_top(struct dim *dim);
+
+/**
+ * dim_turn - change profile altering direction
+ * @dim: DIM context
+ *
+ * Go left if we were going right and vice-versa.
+ * Do nothing if currently parking.
+ */
+void dim_turn(struct dim *dim);
+
+/**
+ * dim_park_on_top - enter a parking state on a top location
+ * @dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history.
+ */
+void dim_park_on_top(struct dim *dim);
+
+/**
+ * dim_park_tired - enter a tired parking state
+ * @dim: DIM context
+ *
+ * Enter parking state.
+ * Clear all movement history and cause DIM checks frequency to reduce.
+ */
+void dim_park_tired(struct dim *dim);
+
+/**
+ * dim_calc_stats - calculate the difference between two samples
+ * @start: start sample
+ * @end: end sample
+ * @curr_stats: delta between samples
+ *
+ * Calculate the delta between two samples (in data rates).
+ * Takes into consideration counter wrap-around.
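+ *
+ * For example, with u32 packet counters an end sample of 5 and a start
+ * sample of 0xFFFFFFFE (the counter wrapped in between) give
+ * BIT_GAP(32, 5, 0xFFFFFFFE) == 7 rather than a bogus huge delta.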
+ */
+void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
+		    struct dim_stats *curr_stats);
+
+/**
+ * dim_update_sample - set a sample's fields with given values
+ * @event_ctr: number of events to set
+ * @packets: number of packets to set
+ * @bytes: number of bytes to set
+ * @s: DIM sample
+ */
+static inline void
+dim_update_sample(u16 event_ctr, u64 packets, u64 bytes, struct dim_sample *s)
+{
+	s->time = ktime_get();
+	s->pkt_ctr = packets;
+	s->byte_ctr = bytes;
+	s->event_ctr = event_ctr;
+}
+
+/**
+ * dim_update_sample_with_comps - set a sample's fields with given
+ * values including the completion parameter
+ * @event_ctr: number of events to set
+ * @packets: number of packets to set
+ * @bytes: number of bytes to set
+ * @comps: number of completions to set
+ * @s: DIM sample
+ */
+static inline void
+dim_update_sample_with_comps(u16 event_ctr, u64 packets, u64 bytes, u64 comps,
+			     struct dim_sample *s)
+{
+	dim_update_sample(event_ctr, packets, bytes, s);
+	s->comp_ctr = comps;
+}
+
+/* Net DIM */
+
+/**
+ * net_dim_get_rx_moderation - provide a CQ moderation object for the given RX profile
+ * @cq_period_mode: CQ period mode
+ * @ix: Profile index
+ */
+struct dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix);
+
+/**
+ * net_dim_get_def_rx_moderation - provide the default RX moderation
+ * @cq_period_mode: CQ period mode
+ */
+struct dim_cq_moder net_dim_get_def_rx_moderation(u8 cq_period_mode);
+
+/**
+ * net_dim_get_tx_moderation - provide a CQ moderation object for the given TX profile
+ * @cq_period_mode: CQ period mode
+ * @ix: Profile index
+ */
+struct dim_cq_moder net_dim_get_tx_moderation(u8 cq_period_mode, int ix);
+
+/**
+ * net_dim_get_def_tx_moderation - provide the default TX moderation
+ * @cq_period_mode: CQ period mode
+ */
+struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
+
+/**
+ * net_dim - main DIM algorithm entry point
+ * @dim: DIM instance information
+ * @end_sample: Current data measurement
+ *
+ * Called by the consumer.
+ * This is the main logic of the algorithm, where data is processed in order
+ * to decide on next required action.
+ */
+void net_dim(struct dim *dim, struct dim_sample end_sample);
+
+/* RDMA DIM */
+
+/*
+ * RDMA DIM profile:
+ * profile size must be RDMA_DIM_PARAMS_NUM_PROFILES.
+ */
+#define RDMA_DIM_PARAMS_NUM_PROFILES 9
+#define RDMA_DIM_START_PROFILE 0
+
+/**
+ * rdma_dim - Runs the adaptive moderation.
+ * @dim: The moderation struct.
+ * @completions: The number of completions collected in this round.
+ *
+ * Each call to rdma_dim takes the latest amount of completions that
+ * have been collected and counts them as a new event.
+ * Once enough events have been collected the algorithm decides a new
+ * moderation level.
+ */
+void rdma_dim(struct dim *dim, u64 completions);
+
+#endif /* _KCOMPAT_DIM_H_ */
diff --git a/drivers/net/ethernet/intel/ice/kcompat_impl.h b/drivers/net/ethernet/intel/ice/kcompat_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..266415ca7a967b24247ce033d0f498164458de92
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/kcompat_impl.h
@@ -0,0 +1,498 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _KCOMPAT_IMPL_H_
+#define _KCOMPAT_IMPL_H_
+
+/* This file contains implementations of backports from various kernels. It
+ * must rely only on NEED_ and HAVE_ checks. It must not make any
+ * checks to determine the kernel version when deciding whether to include an
+ * implementation.
+ *
+ * All new implementations must go in this file, and legacy implementations
+ * should be migrated to the new format over time.
+ */
+
+/*
+ * generic network stack functions
+ */
+
+/* NEED_NET_PREFETCH
+ *
+ * net_prefetch was introduced by commit f468f21b7af0 ("net: Take common
+ * prefetch code structure into a function")
+ *
+ * This function is trivial to re-implement in full.
+ */
+#ifdef NEED_NET_PREFETCH
+static inline void net_prefetch(void *p)
+{
+	prefetch(p);
+#if L1_CACHE_BYTES < 128
+	prefetch((u8 *)p + L1_CACHE_BYTES);
+#endif
+}
+#endif /* NEED_NET_PREFETCH */
+
+/* NEED_SKB_FRAG_OFF and NEED_SKB_FRAG_OFF_ADD
+ *
+ * skb_frag_off and skb_frag_off_add were added in upstream commit
+ * 7240b60c98d6 ("linux: Add skb_frag_t page_offset accessors")
+ *
+ * Implementing the wrappers directly for older kernels which still have the
+ * old implementation of skb_frag_t is trivial.
+ *
+ * LTS 4.19 backported the define for skb_frag_off in 4.19.201.
+ * d94d95ae0dd0 ("gro: ensure frag0 meets IP header alignment")
+ * Need to exclude defining skb_frag_off for 4.19.X where X > 200
+ */
+#ifdef NEED_SKB_FRAG_OFF
+static inline unsigned int skb_frag_off(const skb_frag_t *frag)
+{
+	return frag->page_offset;
+}
+#endif /* NEED_SKB_FRAG_OFF */
+#ifdef NEED_SKB_FRAG_OFF_ADD
+static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
+{
+	frag->page_offset += delta;
+}
+#endif /* NEED_SKB_FRAG_OFF_ADD */
+
+/*
+ * NETIF_F_HW_L2FW_DOFFLOAD related functions
+ *
+ * Support for NETIF_F_HW_L2FW_DOFFLOAD was first introduced upstream by
+ * commit a6cc0cfa72e0 ("net: Add layer 2 hardware acceleration operations for
+ * macvlan devices")
+ */
+#ifdef NETIF_F_HW_L2FW_DOFFLOAD
+
+#include <linux/if_macvlan.h>
+
+/* NEED_MACVLAN_ACCEL_PRIV
+ *
+ * macvlan_accel_priv is an accessor function that replaced direct access to
+ * the macvlan->fwd_priv variable. It was introduced in commit 7d775f63470c
+ * ("macvlan: Rename fwd_priv to accel_priv and add accessor function")
+ *
+ * Implement the new wrapper name by simply accessing the older
+ * macvlan->fwd_priv name.
+ */
+#ifdef NEED_MACVLAN_ACCEL_PRIV
+static inline void *macvlan_accel_priv(struct net_device *dev)
+{
+	struct macvlan_dev *macvlan = netdev_priv(dev);
+
+	return macvlan->fwd_priv;
+}
+#endif /* NEED_MACVLAN_ACCEL_PRIV */
+
+/* NEED_MACVLAN_RELEASE_L2FW_OFFLOAD
+ *
+ * macvlan_release_l2fw_offload was introduced upstream by commit 53cd4d8e4dfb
+ * ("macvlan: Provide function for interfaces to release HW offload")
+ *
+ * Implementing this is straightforward, but we must be careful to use
+ * fwd_priv instead of accel_priv. Note that both the change to accel_priv and
+ * introduction of this function happened in the same release.
+ */
+#ifdef NEED_MACVLAN_RELEASE_L2FW_OFFLOAD
+static inline int macvlan_release_l2fw_offload(struct net_device *dev)
+{
+	struct macvlan_dev *macvlan = netdev_priv(dev);
+
+	macvlan->fwd_priv = NULL;
+	return dev_uc_add(macvlan->lowerdev, dev->dev_addr);
+}
+#endif /* NEED_MACVLAN_RELEASE_L2FW_OFFLOAD */
+
+/* NEED_MACVLAN_SUPPORTS_DEST_FILTER
+ *
+ * macvlan_supports_dest_filter was introduced upstream by commit 6cb1937d4eff
+ * ("macvlan: Add function to test for destination filtering support")
+ *
+ * The implementation doesn't rely on anything new and is trivial to backport
+ * for kernels that have NETIF_F_HW_L2FW_DOFFLOAD support.
+ */
+#ifdef NEED_MACVLAN_SUPPORTS_DEST_FILTER
+static inline bool macvlan_supports_dest_filter(struct net_device *dev)
+{
+	struct macvlan_dev *macvlan = netdev_priv(dev);
+
+	return macvlan->mode == MACVLAN_MODE_PRIVATE ||
+	       macvlan->mode == MACVLAN_MODE_VEPA ||
+	       macvlan->mode == MACVLAN_MODE_BRIDGE;
+}
+#endif /* NEED_MACVLAN_SUPPORTS_DEST_FILTER */
+
+#endif /* NETIF_F_HW_L2FW_DOFFLOAD */
+
+/*
+ * tc functions
+ */
+
+/* NEED_FLOW_INDR_BLOCK_CB_REGISTER
+ *
+ * __flow_indr_block_cb_register and __flow_indr_block_cb_unregister were
+ * added in upstream commit 4e481908c51b ("flow_offload: move tc indirect
+ * block to flow offload")
+ *
+ * This was a simple rename so we can just translate from the old
+ * naming scheme with a macro.
+ */
+#ifdef NEED_FLOW_INDR_BLOCK_CB_REGISTER
+#define __flow_indr_block_cb_register __tc_indr_block_cb_register
+#define __flow_indr_block_cb_unregister __tc_indr_block_cb_unregister
+#endif
+
+/*
+ * devlink support
+ */
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
+#include <net/devlink.h>
+
+#ifdef HAVE_DEVLINK_REGIONS
+/* NEED_DEVLINK_REGION_CREATE_OPS
+ *
+ * The ops parameter to devlink_region_create was added by commit e8937681797c
+ * ("devlink: prepare to support region operations")
+ *
+ * For older kernels, define _kc_devlink_region_create that takes an ops
+ * parameter, and calls the old implementation function by extracting the name
+ * from the structure.
+ */
+#ifdef NEED_DEVLINK_REGION_CREATE_OPS
+struct devlink_region_ops {
+	const char *name;
+	void (*destructor)(const void *data);
+};
+
+static inline struct devlink_region *
+_kc_devlink_region_create(struct devlink *devlink,
+			  const struct devlink_region_ops *ops,
+			  u32 region_max_snapshots, u64 region_size)
+{
+	return devlink_region_create(devlink, ops->name, region_max_snapshots,
+				     region_size);
+}
+
+#define devlink_region_create _kc_devlink_region_create
+#endif /* NEED_DEVLINK_REGION_CREATE_OPS */
+#endif /* HAVE_DEVLINK_REGIONS */
+
+/* NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY
+ *
+ * devlink_flash_update_status_notify, _begin_notify, and _end_notify were
+ * added by upstream commit 191ed2024de9 ("devlink: allow driver to update
+ * progress of flash update")
+ *
+ * For older kernels that lack the netlink messages, convert the functions
+ * into no-ops.
+ */
+#ifdef NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY
+static inline void
+devlink_flash_update_begin_notify(struct devlink __always_unused *devlink)
+{
+}
+
+static inline void
+devlink_flash_update_end_notify(struct devlink __always_unused *devlink)
+{
+}
+
+static inline void
+devlink_flash_update_status_notify(struct devlink __always_unused *devlink,
+				   const char __always_unused *status_msg,
+				   const char __always_unused *component,
+				   unsigned long __always_unused done,
+				   unsigned long __always_unused total)
+{
+}
+#endif /* NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY */
+
+/* NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY
+ *
+ * devlink_flash_update_timeout_notify was added by upstream commit
+ * f92970c694b3 ("devlink: add timeout information to status_notify").
+ *
+ * For older kernels, just convert timeout notifications into regular status
+ * notification messages without timeout information.
+ */
+#ifdef NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY
+static inline void
+devlink_flash_update_timeout_notify(struct devlink *devlink,
+				    const char *status_msg,
+				    const char *component,
+				    unsigned long __always_unused timeout)
+{
+	devlink_flash_update_status_notify(devlink, status_msg, component, 0, 0);
+}
+#endif /* NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY */
+
+/*
+ * NEED_DEVLINK_PORT_ATTRS_SET_STRUCT
+ *
+ * HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR
+ * HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID
+ *
+ * devlink_port_attrs_set was introduced by commit b9ffcbaf56d3 ("devlink:
+ * introduce devlink_port_attrs_set")
+ *
+ * Its function signature has changed multiple times over several kernel
+ * releases:
+ *
+ * commit 5ec1380a21bb ("devlink: extend attrs_set for setting port
+ * flavours") added the ability to set port flavour. (Note that there is no
+ * official kernel release with devlink_port_attrs_set without the flavour
+ * argument, as they were introduced in the same series.)
+ *
+ * commit bec5267cded2 ("net: devlink: extend port attrs for switch ID") added
+ * the ability to set the switch ID (HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID)
+ *
+ * Finally commit 71ad8d55f8e5 ("devlink: Replace devlink_port_attrs_set
+ * parameters with a struct") refactored to pass devlink_port_attrs struct
+ * instead of individual parameters. (!NEED_DEVLINK_PORT_ATTRS_SET_STRUCT)
+ *
+ * We want core drivers to just use the latest form that takes
+ * a devlink_port_attrs structure. Note that this structure did exist as part
+ * of <net/devlink.h> but was never used directly by driver code prior to the
+ * function parameter change. For this reason, the implementation always
+ * relies on _kc_devlink_port_attrs instead of what was defined in the kernel.
+ */
+#ifdef NEED_DEVLINK_PORT_ATTRS_SET_STRUCT
+
+#ifndef HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR
+enum devlink_port_flavour {
+	DEVLINK_PORT_FLAVOUR_PHYSICAL,
+	DEVLINK_PORT_FLAVOUR_CPU,
+	DEVLINK_PORT_FLAVOUR_DSA,
+	DEVLINK_PORT_FLAVOUR_PCI_PF,
+	DEVLINK_PORT_FLAVOUR_PCI_VF,
+};
+#endif
+
+struct _kc_devlink_port_phys_attrs {
+	u32 port_number;
+	u32 split_subport_number;
+};
+
+struct _kc_devlink_port_pci_pf_attrs {
+	u16 pf;
+};
+
+struct _kc_devlink_port_pci_vf_attrs {
+	u16 pf;
+	u16 vf;
+};
+
+struct _kc_devlink_port_attrs {
+	u8 split:1,
+	   splittable:1;
+	u32 lanes;
+	enum devlink_port_flavour flavour;
+	struct netdev_phys_item_id switch_id;
+	union {
+		struct _kc_devlink_port_phys_attrs phys;
+		struct _kc_devlink_port_pci_pf_attrs pci_pf;
+		struct _kc_devlink_port_pci_vf_attrs pci_vf;
+	};
+};
+
+#define devlink_port_attrs _kc_devlink_port_attrs
+
+static inline void
+_kc_devlink_port_attrs_set(struct devlink_port *devlink_port,
+			   struct _kc_devlink_port_attrs *attrs)
+{
+#if defined(HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID)
+	devlink_port_attrs_set(devlink_port, attrs->flavour, attrs->phys.port_number,
+			       attrs->split, attrs->phys.split_subport_number,
+			       attrs->switch_id.id, attrs->switch_id.id_len);
+#elif defined(HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR)
+	devlink_port_attrs_set(devlink_port, attrs->flavour, attrs->phys.port_number,
+			       attrs->split, attrs->phys.split_subport_number);
+#else
+	if (attrs->split)
+		devlink_port_split_set(devlink_port, attrs->phys.port_number);
+#endif
+}
+
+#define devlink_port_attrs_set _kc_devlink_port_attrs_set
+
+#endif /* NEED_DEVLINK_PORT_ATTRS_SET_STRUCT */
+
+#endif /* CONFIG_NET_DEVLINK */
+
+#ifdef NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE
+/* ida_alloc(), ida_alloc_min(), ida_alloc_max(), ida_alloc_range(), and
+ * ida_free() were added in commit 5ade60dda43c ("ida: add new API").
+ *
+ * Also, using "0" as the "end" argument (3rd argument) to ida_simple_get() is
+ * considered the max value, which is why it's used in ida_alloc() and
+ * ida_alloc_min().
+ */
+static inline int ida_alloc(struct ida *ida, gfp_t gfp)
+{
+	return ida_simple_get(ida, 0, 0, gfp);
+}
+
+static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp)
+{
+	return ida_simple_get(ida, min, 0, gfp);
+}
+
+static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp)
+{
+	return ida_simple_get(ida, 0, max, gfp);
+}
+
+static inline int
+ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, gfp_t gfp)
+{
+	return ida_simple_get(ida, min, max, gfp);
+}
+
+static inline void ida_free(struct ida *ida, unsigned int id)
+{
+	ida_simple_remove(ida, id);
+}
+#endif /* NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE */
+
+/*
+ * dev_printk implementations
+ */
+
+/* NEED_DEV_PRINTK_ONCE
+ *
+ * The dev_*_once family of printk functions was introduced by commit
+ * e135303bd5be ("device: Add dev_<level>_once variants")
+ *
+ * The implementation is very straightforward so we will just implement them
+ * as-is here.
+ */
+#ifdef NEED_DEV_PRINTK_ONCE
+#ifdef CONFIG_PRINTK
+#define dev_level_once(dev_level, dev, fmt, ...)			\
+do {									\
+	static bool __print_once __read_mostly;				\
+									\
+	if (!__print_once) {						\
+		__print_once = true;					\
+		dev_level(dev, fmt, ##__VA_ARGS__);			\
+	}								\
+} while (0)
+#else
+#define dev_level_once(dev_level, dev, fmt, ...)			\
+do {									\
+	if (0)								\
+		dev_level(dev, fmt, ##__VA_ARGS__);			\
+} while (0)
+#endif
+
+#define dev_emerg_once(dev, fmt, ...)					\
+	dev_level_once(dev_emerg, dev, fmt, ##__VA_ARGS__)
+#define dev_alert_once(dev, fmt, ...)					\
+	dev_level_once(dev_alert, dev, fmt, ##__VA_ARGS__)
+#define dev_crit_once(dev, fmt, ...)					\
+	dev_level_once(dev_crit, dev, fmt, ##__VA_ARGS__)
+#define dev_err_once(dev, fmt, ...)					\
+	dev_level_once(dev_err, dev, fmt, ##__VA_ARGS__)
+#define dev_warn_once(dev, fmt, ...)					\
+	dev_level_once(dev_warn, dev, fmt, ##__VA_ARGS__)
+#define dev_notice_once(dev, fmt, ...)					\
+	dev_level_once(dev_notice, dev, fmt, ##__VA_ARGS__)
+#define dev_info_once(dev, fmt, ...)					\
+	dev_level_once(dev_info, dev, fmt, ##__VA_ARGS__)
+#define dev_dbg_once(dev, fmt, ...)					\
+	dev_level_once(dev_dbg, dev, fmt, ##__VA_ARGS__)
+#endif /* NEED_DEV_PRINTK_ONCE */
+
+#ifdef HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO
+
+/* NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0
+ *
+ * tc_cls_can_offload_and_chain0 was added by upstream commit
+ * 878db9f0f26d ("pkt_cls: add new tc cls helper to check offload flag and
+ * chain index").
+ *
+ * This patch backports this function for older kernels by calling
+ * tc_can_offload() directly.
+ */
+#ifdef NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0
+#include <net/pkt_cls.h>
+static inline bool
+tc_cls_can_offload_and_chain0(const struct net_device *dev,
+			      struct tc_cls_common_offload *common)
+{
+	if (!tc_can_offload(dev))
+		return false;
+	if (common->chain_index)
+		return false;
+
+	return true;
+}
+#endif /* NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 */
+#endif /* HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO */
+
+/* NEED_TC_SETUP_QDISC_MQPRIO
+ *
+ * TC_SETUP_QDISC_MQPRIO was added by upstream commit
+ * 575ed7d39e2f ("net_sch: mqprio: Change TC_SETUP_MQPRIO to
+ * TC_SETUP_QDISC_MQPRIO").
+ *
+ * For older kernels which are still using TC_SETUP_MQPRIO, map the new name
+ * onto the old one.
+ */
+#ifdef NEED_TC_SETUP_QDISC_MQPRIO
+#define TC_SETUP_QDISC_MQPRIO TC_SETUP_MQPRIO
+#endif /* NEED_TC_SETUP_QDISC_MQPRIO */
+
+/*
+ * ART/TSC functions
+ */
+#ifdef HAVE_PTP_CROSSTIMESTAMP
+/* NEED_CONVERT_ART_NS_TO_TSC
+ *
+ * convert_art_ns_to_tsc was added by upstream commit fc804f65d462 ("x86/tsc:
+ * Convert ART in nanoseconds to TSC").
+ *
+ * This function is similar to convert_art_to_tsc, but expects the input in
+ * terms of nanoseconds, rather than ART cycles. We implement this by
+ * accessing the tsc_khz value and performing the proper calculation. In order
+ * to access the correct clock object on returning, we use the function
+ * convert_art_to_tsc, because the art_related_clocksource is inaccessible.
+ */
+#ifdef NEED_CONVERT_ART_NS_TO_TSC
+#ifdef CONFIG_X86
+#include <asm/tsc.h>
+
+static inline struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
+{
+	struct system_counterval_t system;
+	u64 tmp, res, rem;
+
+	rem = do_div(art_ns, USEC_PER_SEC);
+
+	res = art_ns * tsc_khz;
+	tmp = rem * tsc_khz;
+
+	do_div(tmp, USEC_PER_SEC);
+	res += tmp;
+
+	system = convert_art_to_tsc(art_ns);
+	system.cycles = res;
+
+	return system;
+}
+#else /* CONFIG_X86 */
+static inline struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
+{
+	WARN_ONCE(1, "%s is only supported on X86", __func__);
+	return (struct system_counterval_t){};
+}
+#endif /* !CONFIG_X86 */
+#endif /* NEED_CONVERT_ART_NS_TO_TSC */
+#endif /* HAVE_PTP_CROSSTIMESTAMP */
+
+#endif /* _KCOMPAT_IMPL_H_ */
diff --git a/drivers/net/ethernet/intel/ice/kcompat_net_dim.c b/drivers/net/ethernet/intel/ice/kcompat_net_dim.c
new file mode 100644
index 0000000000000000000000000000000000000000..244c87634abafd1b78d4005970ec8217aa19e7dd
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/kcompat_net_dim.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include "kcompat.h"
+#include "kcompat_dim.h"
+
+/*
+ * Net DIM profiles:
+ * There is a different set of profiles for each CQ period mode.
+ * There are different sets of profiles for RX/TX CQs.
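+ * Each entry below is a { usec, pkts } initializer for struct dim_cq_moder;
+ * the cq_period_mode field is filled in by the lookup helpers further down.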
+ * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + +struct dim_cq_moder +net_dim_get_rx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_rx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? +#ifdef __CHECKER__ + /* cppcheck-suppress duplicateValueTernary */ +#endif /* __CHECKER__ */ + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_rx_moderation(cq_period_mode, profile_ix); +} + +struct dim_cq_moder +net_dim_get_tx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +struct dim_cq_moder +net_dim_get_def_tx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? +#ifdef __CHECKER__ + /* cppcheck-suppress duplicateValueTernary */ +#endif /* __CHECKER__ */ + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_tx_moderation(cq_period_mode, profile_ix); +} + +static int net_dim_step(struct dim *dim) +{ + if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) + return DIM_TOO_TIRED; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + break; + case DIM_GOING_LEFT: + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + break; + } + + dim->tired++; + return DIM_STEPPED; +} + +static void net_dim_exit_parking(struct dim *dim) +{ + dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT; + net_dim_step(dim); +} + +static int net_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + if (!prev->bpms) + return curr->bpms ? 
DIM_STATS_BETTER : DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) + return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->ppms) + return curr->ppms ? DIM_STATS_BETTER : + DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) + return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->epms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) + return (curr->epms < prev->epms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_state = dim->tune_state; + int prev_ix = dim->profile_ix; + int stats_res; + int step_res; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_SAME) + net_dim_exit_parking(dim); + break; + + case DIM_PARKING_TIRED: + dim->tired--; + if (!dim->tired) + net_dim_exit_parking(dim); + break; + + case DIM_GOING_RIGHT: + case DIM_GOING_LEFT: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_BETTER) + dim_turn(dim); + + if (dim_on_top(dim)) { + dim_park_on_top(dim); + break; + } + + step_res = net_dim_step(dim); + switch (step_res) { + case DIM_ON_EDGE: + dim_park_on_top(dim); + break; + case DIM_TOO_TIRED: + dim_park_tired(dim); + break; + } + + break; + } + + if (prev_state != DIM_PARKING_ON_TOP || + dim->tune_state != DIM_PARKING_ON_TOP) + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void net_dim(struct dim *dim, struct dim_sample end_sample) +{ + struct dim_stats curr_stats; + u16 nevents; + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = BIT_GAP(BITS_PER_TYPE(u16), + end_sample.event_ctr, + dim->start_sample.event_ctr); + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (net_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + /* fall through */ + case DIM_START_MEASURE: + dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr, + end_sample.byte_ctr, &dim->start_sample); + dim->state = DIM_MEASURE_IN_PROGRESS; + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} diff --git a/drivers/net/ethernet/intel/ice/kcompat_overflow.h b/drivers/net/ethernet/intel/ice/kcompat_overflow.h new file mode 100644 index 0000000000000000000000000000000000000000..5e80d6e01d4b9073e9b6f719ab5ea85800e6b51c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_overflow.h @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#include + +/* + * In the fallback code below, we need to compute the minimum and + * maximum values representable in a given type. These macros may also + * be useful elsewhere, so we provide them outside the + * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. + * + * It would seem more obvious to do something like + * + * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) + * #define type_max(T) (T)(is_signed_type(T) ? 
((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) + * + * Unfortunately, the middle expressions, strictly speaking, have + * undefined behaviour, and at least some versions of gcc warn about + * the type_max expression (but not if -fsanitize=undefined is in + * effect; in that case, the warning is deferred to runtime...). + * + * The slightly excessive casting in type_min is to make sure the + * macros also produce sensible values for the exotic type _Bool. [The + * overflow checkers only almost work for _Bool, but that's + * a-feature-not-a-bug, since people shouldn't be doing arithmetic on + * _Bools. Besides, the gcc builtins don't allow _Bool* as third + * argument.] + * + * Idea stolen from + * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - + * credit to Christian Biere. + */ +/* The is_signed_type macro is redefined in a few places in various kernel + * headers. If this header is included at the same time as one of those, we + * will generate compilation warnings. Since we can't fix every old kernel, + * rename is_signed_type for this file to _kc_is_signed_type. This prevents + * the macro name collision, and should be safe since our drivers do not + * directly call the macro. + */ +#define _kc_is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - _kc_is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +/* + * For simplicity and code hygiene, the fallback code below insists on + * a, b and *d having the same type (similar to the min() and max() + * macros), whereas gcc's type-generic overflow checkers accept + * different types. Hence we don't just make check_add_overflow an + * alias for __builtin_add_overflow, but add type checks similar to + * below. + */ +#define check_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_add_overflow(__a, __b, __d); \ +}) + +#define check_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_sub_overflow(__a, __b, __d); \ +}) + +#define check_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +}) + +#else + + +/* Checking for unsigned overflow is relatively easy without causing UB. */ +#define __unsigned_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a + __b; \ + *__d < __a; \ +}) +#define __unsigned_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a - __b; \ + __a < __b; \ +}) +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? 
\ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * For signed types, detecting overflow is much harder, especially if + * we want to avoid UB. But the interface of these macros is such that + * we must provide a result in *d, and in fact we must produce the + * result promised by gcc's builtins, which is simply the possibly + * wrapped-around value. Fortunately, we can just formally do the + * operations in the widest relevant unsigned type (u64) and then + * truncate the result - gcc is smart enough to generate the same code + * with and without the (u64) casts. + */ + +/* + * Adding two signed integers can overflow only if they have the same + * sign, and overflow has happened iff the result has the opposite + * sign. + */ +#define __signed_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a + (u64)__b; \ + (((~(__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Subtraction is similar, except that overflow can now happen only + * when the signs are opposite. In this case, overflow has happened if + * the result has the opposite sign of a. + */ +#define __signed_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a - (u64)__b; \ + ((((__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. + */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a * (u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + + +#define check_add_overflow(a, b, d) \ + __builtin_choose_expr(_kc_is_signed_type(typeof(a)), \ + __signed_add_overflow(a, b, d), \ + __unsigned_add_overflow(a, b, d)) + +#define check_sub_overflow(a, b, d) \ + __builtin_choose_expr(_kc_is_signed_type(typeof(a)), \ + __signed_sub_overflow(a, b, d), \ + __unsigned_sub_overflow(a, b, d)) + +#define check_mul_overflow(a, b, d) \ + __builtin_choose_expr(_kc_is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d)) + + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +/** check_shl_overflow() - Calculate a left-shifted value and check overflow + * + * @a: Value to be shifted + * @s: How many bits left to shift + * @d: Pointer to where to store the result + * + * Computes *@d = (@a << @s) + * + * Returns true if '*d' cannot hold the result or when 'a << s' doesn't + * make sense. 
Example conditions: + * - 'a << s' causes bits to be lost when stored in *d. + * - 's' is garbage (e.g. negative) or so large that the result of + * 'a << s' is guaranteed to be 0. + * - 'a' is negative. + * - 'a << s' sets the sign bit, if any, in '*d'. + * + * '*d' will hold the results of the attempted shift, but is not + * considered "safe for use" if false is returned. + */ +#define check_shl_overflow(a, s, d) ({ \ + typeof(a) _a = a; \ + typeof(s) _s = s; \ + typeof(d) _d = d; \ + u64 _a_full = _a; \ + unsigned int _to_shift = \ + _s >= 0 && _s < 8 * sizeof(*d) ? _s : 0; \ + *_d = (_a_full << _to_shift); \ + (_to_shift != _s || *_d < 0 || _a < 0 || \ + (*_d >> _to_shift) != _a); \ +}) + +/** + * array_size() - Calculate size of 2-dimensional array. + * + * @a: dimension one + * @b: dimension two + * + * Calculates size of 2-dimensional array: @a * @b. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +static inline __must_check size_t array_size(size_t a, size_t b) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * array3_size() - Calculate size of 3-dimensional array. + * + * @a: dimension one + * @b: dimension two + * @c: dimension three + * + * Calculates size of 3-dimensional array: @a * @b * @c. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + if (check_mul_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +static inline __must_check size_t __ab_c_size(size_t n, size_t size, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(n, size, &bytes)) + return SIZE_MAX; + if (check_add_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * struct_size() - Calculate size of structure with trailing array. + * @p: Pointer to the structure. + * @member: Name of the array member. + * @n: Number of elements in the array. + * + * Calculates size of memory needed for structure @p followed by an + * array of @n @member elements. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define struct_size(p, member, n) \ + __ab_c_size(n, \ + sizeof(*(p)->member) + __must_be_array((p)->member),\ + sizeof(*(p))) + +#endif /* __LINUX_OVERFLOW_H */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_pldmfw.c b/drivers/net/ethernet/intel/ice/kcompat_pldmfw.c new file mode 100644 index 0000000000000000000000000000000000000000..181700796b4ee311ace82f824d82c9889c7451ed --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_pldmfw.c @@ -0,0 +1,1117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2021, Intel Corporation. */ + +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2020, Intel Corporation. */ + +/* This is taken from upstream "lib/pldmfw/pldmfw.c" */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "kcompat_pldmfw.h" + +#if IS_ENABLED(CONFIG_PLDMFW) +#error "CONFIG_PLDMFW is enabled, use builtin lib/pldmfw instead." +#endif + +/* This section is taken from upstream "lib/pldmfw/pldmfw_private.h" */ + +/* The following data structures define the layout of a firmware binary + * following the "PLDM For Firmware Update Specification", DMTF standard + * #DSP0267. 
+ * + * pldmfw.c uses these structures to implement a simple engine that will parse + * a fw binary file in this format and perform a firmware update for a given + * device. + * + * Due to the variable sized data layout, alignment of fields within these + * structures is not guaranteed when reading. For this reason, all multi-byte + * field accesses should be done using the unaligned access macros. + * Additionally, the standard specifies that multi-byte fields are in + * LittleEndian format. + * + * The structure definitions are not made public, in order to keep direct + * accesses within code that is prepared to deal with the limitation of + * unaligned access. + */ + +/* UUID for PLDM firmware packages: f018878c-cb7d-4943-9800-a02f059aca02 */ +static const uuid_t pldm_firmware_header_id = + UUID_INIT(0xf018878c, 0xcb7d, 0x4943, + 0x98, 0x00, 0xa0, 0x2f, 0x05, 0x9a, 0xca, 0x02); + +/* Revision number of the PLDM header format this code supports */ +#define PACKAGE_HEADER_FORMAT_REVISION 0x01 + +/* timestamp104 structure defined in PLDM Base specification */ +#define PLDM_TIMESTAMP_SIZE 13 +struct __pldm_timestamp { + u8 b[PLDM_TIMESTAMP_SIZE]; +}; + +/* Package Header Information */ +struct __pldm_header { + uuid_t id; /* PackageHeaderIdentifier */ + u8 revision; /* PackageHeaderFormatRevision */ + __le16 size; /* PackageHeaderSize */ + struct __pldm_timestamp release_date; /* PackageReleaseDateTime */ + __le16 component_bitmap_len; /* ComponentBitmapBitLength */ + u8 version_type; /* PackageVersionStringType */ + u8 version_len; /* PackageVersionStringLength */ + + /* + * DSP0267 also includes the following variable length fields at the + * end of this structure: + * + * PackageVersionString, length is version_len. + * + * The total size of this section is + * sizeof(pldm_header) + version_len; + */ + u8 version_string[]; /* PackageVersionString */ +} __packed __aligned(1); + +/* Firmware Device ID Record */ +struct __pldmfw_record_info { + __le16 record_len; /* RecordLength */ + u8 descriptor_count; /* DescriptorCount */ + __le32 device_update_flags; /* DeviceUpdateOptionFlags */ + u8 version_type; /* ComponentImageSetVersionType */ + u8 version_len; /* ComponentImageSetVersionLength */ + __le16 package_data_len; /* FirmwareDevicePackageDataLength */ + + /* + * DSP0267 also includes the following variable length fields at the + * end of this structure: + * + * ApplicableComponents, length is component_bitmap_len from header + * ComponentImageSetVersionString, length is version_len + * RecordDescriptors, a series of TLVs with 16bit type and length + * FirmwareDevicePackageData, length is package_data_len + * + * The total size of each record is + * sizeof(pldmfw_record_info) + + * component_bitmap_len (converted to bytes!) 
+ + * version_len + + * + + * package_data_len + */ + u8 variable_record_data[]; +} __packed __aligned(1); + +/* Firmware Descriptor Definition */ +struct __pldmfw_desc_tlv { + __le16 type; /* DescriptorType */ + __le16 size; /* DescriptorSize */ + u8 data[]; /* DescriptorData */ +} __aligned(1); + +/* Firmware Device Identification Area */ +struct __pldmfw_record_area { + u8 record_count; /* DeviceIDRecordCount */ + /* This is not a struct type because the size of each record varies */ + u8 records[]; +} __aligned(1); + +/* Individual Component Image Information */ +struct __pldmfw_component_info { + __le16 classification; /* ComponentClassfication */ + __le16 identifier; /* ComponentIdentifier */ + __le32 comparison_stamp; /* ComponentComparisonStamp */ + __le16 options; /* componentOptions */ + __le16 activation_method; /* RequestedComponentActivationMethod */ + __le32 location_offset; /* ComponentLocationOffset */ + __le32 size; /* ComponentSize */ + u8 version_type; /* ComponentVersionStringType */ + u8 version_len; /* ComponentVersionStringLength */ + + /* + * DSP0267 also includes the following variable length fields at the + * end of this structure: + * + * ComponentVersionString, length is version_len + * + * The total size of this section is + * sizeof(pldmfw_component_info) + version_len; + */ + u8 version_string[]; /* ComponentVersionString */ +} __packed __aligned(1); + +/* Component Image Information Area */ +struct __pldmfw_component_area { + __le16 component_image_count; + /* This is not a struct type because the component size varies */ + u8 components[]; +} __aligned(1); + +/** + * pldm_first_desc_tlv + * @start: byte offset of the start of the descriptor TLVs + * + * Converts the starting offset of the descriptor TLVs into a pointer to the + * first descriptor. + */ +#define pldm_first_desc_tlv(start) \ + ((const struct __pldmfw_desc_tlv *)(start)) + +/** + * pldm_next_desc_tlv + * @desc: pointer to a descriptor TLV + * + * Finds the pointer to the next descriptor following a given descriptor + */ +#define pldm_next_desc_tlv(desc) \ + ((const struct __pldmfw_desc_tlv *)((desc)->data + \ + get_unaligned_le16(&(desc)->size))) + +/** + * pldm_for_each_desc_tlv + * @i: variable to store descriptor index + * @desc: variable to store descriptor pointer + * @start: byte offset of the start of the descriptors + * @count: the number of descriptors + * + * for loop macro to iterate over all of the descriptors of a given PLDM + * record. + */ +#define pldm_for_each_desc_tlv(i, desc, start, count) \ + for ((i) = 0, (desc) = pldm_first_desc_tlv(start); \ + (i) < (count); \ + (i)++, (desc) = pldm_next_desc_tlv(desc)) + +/** + * pldm_first_record + * @start: byte offset of the start of the PLDM records + * + * Converts a starting offset of the PLDM records into a pointer to the first + * record. + */ +#define pldm_first_record(start) \ + ((const struct __pldmfw_record_info *)(start)) + +/** + * pldm_next_record + * @record: pointer to a PLDM record + * + * Finds a pointer to the next record following a given record + */ +#define pldm_next_record(record) \ + ((const struct __pldmfw_record_info *) \ + ((const u8 *)(record) + get_unaligned_le16(&(record)->record_len))) + +/** + * pldm_for_each_record + * @i: variable to store record index + * @record: variable to store record pointer + * @start: byte offset of the start of the records + * @count: the number of records + * + * for loop macro to iterate over all of the records of a PLDM file. 
+ */ +#define pldm_for_each_record(i, record, start, count) \ + for ((i) = 0, (record) = pldm_first_record(start); \ + (i) < (count); \ + (i)++, (record) = pldm_next_record(record)) + +/** + * pldm_first_component + * @start: byte offset of the start of the PLDM components + * + * Convert a starting offset of the PLDM components into a pointer to the + * first component + */ +#define pldm_first_component(start) \ + ((const struct __pldmfw_component_info *)(start)) + +/** + * pldm_next_component + * @component: pointer to a PLDM component + * + * Finds a pointer to the next component following a given component + */ +#define pldm_next_component(component) \ + ((const struct __pldmfw_component_info *)((component)->version_string + \ + (component)->version_len)) + +/** + * pldm_for_each_component + * @i: variable to store component index + * @component: variable to store component pointer + * @start: byte offset to the start of the first component + * @count: the number of components + * + * for loop macro to iterate over all of the components of a PLDM file. + */ +#define pldm_for_each_component(i, component, start, count) \ + for ((i) = 0, (component) = pldm_first_component(start); \ + (i) < (count); \ + (i)++, (component) = pldm_next_component(component)) + +/* End of lib/pldmfw/pldmfw_private.h */ + +/* Internal structure used to store details about the PLDM image file as it is + * being validated and processed. + */ +struct pldmfw_priv { + struct pldmfw *context; + const struct firmware *fw; + + /* current offset of firmware image */ + size_t offset; + + struct list_head records; + struct list_head components; + + /* PLDM Firmware Package Header */ + const struct __pldm_header *header; + u16 total_header_size; + + /* length of the component bitmap */ + u16 component_bitmap_len; + u16 bitmap_size; + + /* Start of the component image information */ + u16 component_count; + const u8 *component_start; + + /* Start pf the firmware device id records */ + const u8 *record_start; + u8 record_count; + + /* The CRC at the end of the package header */ + u32 header_crc; + + struct pldmfw_record *matching_record; +}; + +/** + * pldm_check_fw_space - Verify that the firmware image has space left + * @data: pointer to private data + * @offset: offset to start from + * @length: length to check for + * + * Verify that the firmware data can hold a chunk of bytes with the specified + * offset and length. + * + * Returns: zero on success, or -EFAULT if the image does not have enough + * space left to fit the expected length. + */ +static int +pldm_check_fw_space(struct pldmfw_priv *data, size_t offset, size_t length) +{ + size_t expected_size = offset + length; + struct device *dev = data->context->dev; + + if (data->fw->size < expected_size) { + dev_dbg(dev, "Firmware file size smaller than expected. Got %zu bytes, needed %zu bytes\n", + data->fw->size, expected_size); + return -EFAULT; + } + + return 0; +} + +/** + * pldm_move_fw_offset - Move the current firmware offset forward + * @data: pointer to private data + * @bytes_to_move: number of bytes to move the offset forward by + * + * Check that there is enough space past the current offset, and then move the + * offset forward by this ammount. + * + * Returns: zero on success, or -EFAULT if the image is too small to fit the + * expected length. 
+ */ +static int +pldm_move_fw_offset(struct pldmfw_priv *data, size_t bytes_to_move) +{ + int err; + + err = pldm_check_fw_space(data, data->offset, bytes_to_move); + if (err) + return err; + + data->offset += bytes_to_move; + + return 0; +} + +/** + * pldm_parse_header - Validate and extract details about the PLDM header + * @data: pointer to private data + * + * Performs initial basic verification of the PLDM image, up to the first + * firmware record. + * + * This includes the following checks and extractions + * + * * Verify that the UUID at the start of the header matches the expected + * value as defined in the DSP0267 PLDM specification + * * Check that the revision is 0x01 + * * Extract the total header_size and verify that the image is large enough + * to contain at least the length of this header + * * Extract the size of the component bitmap length + * * Extract a pointer to the start of the record area + * + * Returns: zero on success, or a negative error code on failure. + */ +static int pldm_parse_header(struct pldmfw_priv *data) +{ + const struct __pldmfw_record_area *record_area; + struct device *dev = data->context->dev; + const struct __pldm_header *header; + size_t header_size; + int err; + + err = pldm_move_fw_offset(data, sizeof(*header)); + if (err) + return err; + + header = (const struct __pldm_header *)data->fw->data; + data->header = header; + + if (!uuid_equal(&header->id, &pldm_firmware_header_id)) { + dev_dbg(dev, "Invalid package header identifier. Expected UUID %pUB, but got %pUB\n", + &pldm_firmware_header_id, &header->id); + return -EINVAL; + } + + if (header->revision != PACKAGE_HEADER_FORMAT_REVISION) { + dev_dbg(dev, "Invalid package header revision. Expected revision %u but got %u\n", + PACKAGE_HEADER_FORMAT_REVISION, header->revision); + return -EOPNOTSUPP; + } + + data->total_header_size = get_unaligned_le16(&header->size); + header_size = data->total_header_size - sizeof(*header); + + err = pldm_check_fw_space(data, data->offset, header_size); + if (err) + return err; + + data->component_bitmap_len = + get_unaligned_le16(&header->component_bitmap_len); + + if (data->component_bitmap_len % 8 != 0) { + dev_dbg(dev, "Invalid component bitmap length. The length is %u, which is not a multiple of 8\n", + data->component_bitmap_len); + return -EINVAL; + } + + data->bitmap_size = data->component_bitmap_len / 8; + + err = pldm_move_fw_offset(data, header->version_len); + if (err) + return err; + + /* extract a pointer to the record area, which just follows the main + * PLDM header data. + */ + record_area = (const struct __pldmfw_record_area *)(data->fw->data + + data->offset); + + err = pldm_move_fw_offset(data, sizeof(*record_area)); + if (err) + return err; + + data->record_count = record_area->record_count; + data->record_start = record_area->records; + + return 0; +} + +/** + * pldm_check_desc_tlv_len - Check that the length matches expectation + * @data: pointer to image details + * @type: the descriptor type + * @size: the length from the descriptor header + * + * If the descriptor type is one of the documented descriptor types according + * to the standard, verify that the provided length matches. + * + * If the type is not recognized or is VENDOR_DEFINED, return zero. + * + * Returns: zero on success, or -EINVAL if the specified size of a standard + * TLV does not match the expected value defined for that TLV. 
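For reference, a worked layout example for the fixed header that pldm_parse_header() walks, derived only from the packed __pldm_header definition above and not from any particular image:

	bytes  0-15  PackageHeaderIdentifier (UUID)
	byte   16    PackageHeaderFormatRevision
	bytes 17-18  PackageHeaderSize (LE16)
	bytes 19-31  PackageReleaseDateTime (timestamp104, 13 bytes)
	bytes 32-33  ComponentBitmapBitLength (LE16)
	byte   34    PackageVersionStringType
	byte   35    PackageVersionStringLength
	bytes 36+    PackageVersionString

So the fixed portion is 36 bytes, and for a package whose PackageVersionString is 12 bytes long the record area (the DeviceIDRecordCount byte) begins at offset 36 + 12 = 48, which is exactly where the offset bookkeeping above lands before the record area pointer is taken.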
+ */ +static int +pldm_check_desc_tlv_len(struct pldmfw_priv *data, u16 type, u16 size) +{ + struct device *dev = data->context->dev; + u16 expected_size; + + switch (type) { + case PLDM_DESC_ID_PCI_VENDOR_ID: + case PLDM_DESC_ID_PCI_DEVICE_ID: + case PLDM_DESC_ID_PCI_SUBVENDOR_ID: + case PLDM_DESC_ID_PCI_SUBDEV_ID: + expected_size = 2; + break; + case PLDM_DESC_ID_PCI_REVISION_ID: + expected_size = 1; + break; + case PLDM_DESC_ID_PNP_VENDOR_ID: + expected_size = 3; + break; + case PLDM_DESC_ID_IANA_ENTERPRISE_ID: + case PLDM_DESC_ID_ACPI_VENDOR_ID: + case PLDM_DESC_ID_PNP_PRODUCT_ID: + case PLDM_DESC_ID_ACPI_PRODUCT_ID: + expected_size = 4; + break; + case PLDM_DESC_ID_UUID: + expected_size = 16; + break; + case PLDM_DESC_ID_VENDOR_DEFINED: + return 0; + default: + /* Do not report an error on an unexpected TLV */ + dev_dbg(dev, "Found unrecognized TLV type 0x%04x\n", type); + return 0; + } + + if (size != expected_size) { + dev_dbg(dev, "Found TLV type 0x%04x with unexpected length. Got %u bytes, but expected %u bytes\n", + type, size, expected_size); + return -EINVAL; + } + + return 0; +} + +/** + * pldm_parse_desc_tlvs - Check and skip past a number of TLVs + * @data: pointer to private data + * @record: pointer to the record this TLV belongs too + * @desc_count: descriptor count + * + * From the current offset, read and extract the descriptor TLVs, updating the + * current offset each time. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int +pldm_parse_desc_tlvs(struct pldmfw_priv *data, struct pldmfw_record *record, u8 desc_count) +{ + const struct __pldmfw_desc_tlv *__desc; + const u8 *desc_start; + u8 i; + + desc_start = data->fw->data + data->offset; + + pldm_for_each_desc_tlv(i, __desc, desc_start, desc_count) { + struct pldmfw_desc_tlv *desc; + int err; + u16 type, size; + + err = pldm_move_fw_offset(data, sizeof(*__desc)); + if (err) + return err; + + type = get_unaligned_le16(&__desc->type); + + /* According to DSP0267, this only includes the data field */ + size = get_unaligned_le16(&__desc->size); + + err = pldm_check_desc_tlv_len(data, type, size); + if (err) + return err; + + /* check that we have space and move the offset forward */ + err = pldm_move_fw_offset(data, size); + if (err) + return err; + + desc = (struct pldmfw_desc_tlv *)kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return -ENOMEM; + + desc->type = type; + desc->size = size; + desc->data = __desc->data; + + list_add_tail(&desc->entry, &record->descs); + } + + return 0; +} + +/** + * pldm_parse_one_record - Verify size of one PLDM record + * @data: pointer to image details + * @__record: pointer to the record to check + * + * This function checks that the record size does not exceed either the size + * of the firmware file or the total length specified in the header section. + * + * It also verifies that the recorded length of the start of the record + * matches the size calculated by adding the static structure length, the + * component bitmap length, the version string length, the length of all + * descriptor TLVs, and the length of the package data. + * + * Returns: zero on success, or a negative error code on failure. 
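To make the descriptor TLV sizes above concrete, a worked example (the vendor id is chosen arbitrarily for illustration): a PCI Vendor ID descriptor for vendor 0x8086 occupies six bytes on the wire, the type 0x0000 as the little-endian pair 00 00, a data length of 2 as 02 00, and the vendor id itself as 86 80. pldm_check_desc_tlv_len() validates only the 2-byte data length; the 4-byte type/size header is accounted for separately when pldm_parse_desc_tlvs() advances the offset past sizeof(*__desc).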
+ */ +static int +pldm_parse_one_record(struct pldmfw_priv *data, + const struct __pldmfw_record_info *__record) +{ + struct pldmfw_record *record; + size_t measured_length; + int err; + const u8 *bitmap_ptr; + u16 record_len; + int i; + + /* Make a copy and insert it into the record list */ + record = (struct pldmfw_record *)kzalloc(sizeof(*record), GFP_KERNEL); + if (!record) + return -ENOMEM; + + INIT_LIST_HEAD(&record->descs); + list_add_tail(&record->entry, &data->records); + + /* Then check that we have space and move the offset */ + err = pldm_move_fw_offset(data, sizeof(*__record)); + if (err) + return err; + + record_len = get_unaligned_le16(&__record->record_len); + record->package_data_len = get_unaligned_le16(&__record->package_data_len); + record->version_len = __record->version_len; + record->version_type = __record->version_type; + + bitmap_ptr = data->fw->data + data->offset; + + /* check that we have space for the component bitmap length */ + err = pldm_move_fw_offset(data, data->bitmap_size); + if (err) + return err; + + record->component_bitmap_len = data->component_bitmap_len; + record->component_bitmap = bitmap_zalloc(record->component_bitmap_len, + GFP_KERNEL); + if (!record->component_bitmap) + return -ENOMEM; + + for (i = 0; i < data->bitmap_size; i++) + bitmap_set_value8(record->component_bitmap, bitmap_ptr[i], i * 8); + + record->version_string = data->fw->data + data->offset; + + err = pldm_move_fw_offset(data, record->version_len); + if (err) + return err; + + /* Scan through the descriptor TLVs and find the end */ + err = pldm_parse_desc_tlvs(data, record, __record->descriptor_count); + if (err) + return err; + + record->package_data = data->fw->data + data->offset; + + err = pldm_move_fw_offset(data, record->package_data_len); + if (err) + return err; + + measured_length = data->offset - ((const u8 *)__record - data->fw->data); + if (measured_length != record_len) { + dev_dbg(data->context->dev, "Unexpected record length. Measured record length is %zu bytes, expected length is %u bytes\n", + measured_length, record_len); + return -EFAULT; + } + + return 0; +} + +/** + * pldm_parse_records - Locate the start of the component area + * @data: pointer to private data + * + * Extract the record count, and loop through each record, searching for the + * component area. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int pldm_parse_records(struct pldmfw_priv *data) +{ + const struct __pldmfw_component_area *component_area; + const struct __pldmfw_record_info *record; + int err; + u8 i; + + pldm_for_each_record(i, record, data->record_start, data->record_count) { + err = pldm_parse_one_record(data, record); + if (err) + return err; + } + + /* Extract a pointer to the component area, which just follows the + * PLDM device record data. + */ + component_area = (const struct __pldmfw_component_area *)(data->fw->data + data->offset); + + err = pldm_move_fw_offset(data, sizeof(*component_area)); + if (err) + return err; + + data->component_count = + get_unaligned_le16(&component_area->component_image_count); + data->component_start = component_area->components; + + return 0; +} + +/** + * pldm_parse_components - Locate the CRC header checksum + * @data: pointer to private data + * + * Extract the component count, and find the pointer to the component area. + * Scan through each component searching for the end, which should point to + * the package header checksum. + * + * Extract the package header CRC and save it for verification. 
+ * + * Returns: zero on success, or a negative error code on failure. + */ +static int pldm_parse_components(struct pldmfw_priv *data) +{ + const struct __pldmfw_component_info *__component; + struct device *dev = data->context->dev; + const u8 *header_crc_ptr; + int err; + u8 i; + + pldm_for_each_component(i, __component, data->component_start, data->component_count) { + struct pldmfw_component *component; + u32 offset, size; + + err = pldm_move_fw_offset(data, sizeof(*__component)); + if (err) + return err; + + err = pldm_move_fw_offset(data, __component->version_len); + if (err) + return err; + + offset = get_unaligned_le32(&__component->location_offset); + size = get_unaligned_le32(&__component->size); + + err = pldm_check_fw_space(data, offset, size); + if (err) + return err; + + component = (struct pldmfw_component *)kzalloc(sizeof(*component), GFP_KERNEL); + if (!component) + return -ENOMEM; + + component->index = i; + component->classification = get_unaligned_le16(&__component->classification); + component->identifier = get_unaligned_le16(&__component->identifier); + component->comparison_stamp = get_unaligned_le32(&__component->comparison_stamp); + component->options = get_unaligned_le16(&__component->options); + component->activation_method = get_unaligned_le16(&__component->activation_method); + component->version_type = __component->version_type; + component->version_len = __component->version_len; + component->version_string = __component->version_string; + component->component_data = data->fw->data + offset; + component->component_size = size; + + list_add_tail(&component->entry, &data->components); + } + + header_crc_ptr = data->fw->data + data->offset; + + err = pldm_move_fw_offset(data, sizeof(data->header_crc)); + if (err) + return err; + + /* Make sure that we reached the expected offset */ + if (data->offset != data->total_header_size) { + dev_dbg(dev, "Invalid firmware header size. Expected %u but got %zu\n", + data->total_header_size, data->offset); + return -EFAULT; + } + + data->header_crc = get_unaligned_le32(header_crc_ptr); + + return 0; +} + +/** + * pldm_verify_header_crc - Verify that the CRC in the header matches + * @data: pointer to private data + * + * Calculates the 32-bit CRC using the standard IEEE 802.3 CRC polynomial and + * compares it to the value stored in the header. + * + * Returns: zero on success if the CRC matches, or -EBADMSG on an invalid CRC. + */ +static int pldm_verify_header_crc(struct pldmfw_priv *data) +{ + struct device *dev = data->context->dev; + u32 calculated_crc; + size_t length; + + /* Calculate the 32-bit CRC of the header header contents up to but + * not including the checksum. Note that the Linux crc32_le function + * does not perform an expected final XOR. + */ + length = data->offset - sizeof(data->header_crc); + calculated_crc = crc32_le(~0, data->fw->data, length) ^ ~0; + + if (calculated_crc != data->header_crc) { + dev_dbg(dev, "Invalid CRC in firmware header. Got 0x%08x but expected 0x%08x\n", + calculated_crc, data->header_crc); + return -EBADMSG; + } + + return 0; +} + +/** + * pldmfw_free_priv - Free memory allocated while parsing the PLDM image + * @data: pointer to the PLDM data structure + * + * Loops through and clears all allocated memory associated with each + * allocated descriptor, record, and component. 
+ */ +static void pldmfw_free_priv(struct pldmfw_priv *data) +{ + struct pldmfw_component *component, *c_safe; + struct pldmfw_record *record, *r_safe; + struct pldmfw_desc_tlv *desc, *d_safe; + + list_for_each_entry_safe(component, c_safe, &data->components, entry) { + list_del(&component->entry); + kfree(component); + } + + list_for_each_entry_safe(record, r_safe, &data->records, entry) { + list_for_each_entry_safe(desc, d_safe, &record->descs, entry) { + list_del(&desc->entry); + kfree(desc); + } + + if (record->component_bitmap) { + bitmap_free(record->component_bitmap); + record->component_bitmap = NULL; + } + + list_del(&record->entry); + kfree(record); + } +} + +/** + * pldm_parse_image - parse and extract details from PLDM image + * @data: pointer to private data + * + * Verify that the firmware file contains valid data for a PLDM firmware + * file. Extract useful pointers and data from the firmware file and store + * them in the data structure. + * + * The PLDM firmware file format is defined in DMTF DSP0267 1.0.0. Care + * should be taken to use get_unaligned_le* when accessing data from the + * pointers in data. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int pldm_parse_image(struct pldmfw_priv *data) +{ + int err; + + if (WARN_ON(!(data->context->dev && data->fw->data && data->fw->size))) + return -EINVAL; + + err = pldm_parse_header(data); + if (err) + return err; + + err = pldm_parse_records(data); + if (err) + return err; + + err = pldm_parse_components(data); + if (err) + return err; + + return pldm_verify_header_crc(data); +} + +/* these are u32 so that we can store PCI_ANY_ID */ +struct pldm_pci_record_id { + int vendor; + int device; + int subsystem_vendor; + int subsystem_device; +}; + +/** + * pldmfw_op_pci_match_record - Check if a PCI device matches the record + * @context: PLDM fw update structure + * @record: list of records extracted from the PLDM image + * + * Determine of the PCI device associated with this device matches the record + * data provided. + * + * Searches the descriptor TLVs and extracts the relevant descriptor data into + * a pldm_pci_record_id. This is then compared against the PCI device ID + * information. + * + * Returns: true if the device matches the record, false otherwise. + */ +bool pldmfw_op_pci_match_record(struct pldmfw *context, struct pldmfw_record *record) +{ + struct pci_dev *pdev = to_pci_dev(context->dev); + struct pldm_pci_record_id id = { + .vendor = PCI_ANY_ID, + .device = PCI_ANY_ID, + .subsystem_vendor = PCI_ANY_ID, + .subsystem_device = PCI_ANY_ID, + }; + struct pldmfw_desc_tlv *desc; + + list_for_each_entry(desc, &record->descs, entry) { + u16 value; + int *ptr; + + switch (desc->type) { + case PLDM_DESC_ID_PCI_VENDOR_ID: + ptr = &id.vendor; + break; + case PLDM_DESC_ID_PCI_DEVICE_ID: + ptr = &id.device; + break; + case PLDM_DESC_ID_PCI_SUBVENDOR_ID: + ptr = &id.subsystem_vendor; + break; + case PLDM_DESC_ID_PCI_SUBDEV_ID: + ptr = &id.subsystem_device; + break; + default: + /* Skip unrelated TLVs */ + continue; + } + + value = get_unaligned_le16(desc->data); + /* A value of zero for one of the descriptors is sometimes + * used when the record should ignore this field when matching + * device. For example if the record applies to any subsystem + * device or vendor. 
+ */ + if (value) + *ptr = (int)value; + else + *ptr = PCI_ANY_ID; + } + + if ((id.vendor == PCI_ANY_ID || id.vendor == pdev->vendor) && + (id.device == PCI_ANY_ID || id.device == pdev->device) && + (id.subsystem_vendor == PCI_ANY_ID || id.subsystem_vendor == pdev->subsystem_vendor) && + (id.subsystem_device == PCI_ANY_ID || id.subsystem_device == pdev->subsystem_device)) + return true; + else + return false; +} + +/** + * pldm_find_matching_record - Find the first matching PLDM record + * @data: pointer to private data + * + * Search through PLDM records and find the first matching entry. It is + * expected that only one entry matches. + * + * Store a pointer to the matching record, if found. + * + * Returns: zero on success, or -ENOENT if no matching record is found. + */ +static int pldm_find_matching_record(struct pldmfw_priv *data) +{ + struct pldmfw_record *record; + + list_for_each_entry(record, &data->records, entry) { + if (data->context->ops->match_record(data->context, record)) { + data->matching_record = record; + return 0; + } + } + + return -ENOENT; +} + +/** + * pldm_send_package_data - Send firmware the package data for the record + * @data: pointer to private data + * + * Send the package data associated with the matching record to the firmware, + * using the send_pkg_data operation. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int +pldm_send_package_data(struct pldmfw_priv *data) +{ + struct pldmfw_record *record = data->matching_record; + const struct pldmfw_ops *ops = data->context->ops; + + return ops->send_package_data(data->context, record->package_data, + record->package_data_len); +} + +/** + * pldm_send_component_tables - Send component table information to firmware + * @data: pointer to private data + * + * Loop over each component, sending the applicable components to the firmware + * via the send_component_table operation. + * + * Returns: zero on success, or a negative error code on failure. + */ +static int +pldm_send_component_tables(struct pldmfw_priv *data) +{ + unsigned long *bitmap = data->matching_record->component_bitmap; + struct pldmfw_component *component; + int err; + + list_for_each_entry(component, &data->components, entry) { + u8 index = component->index, transfer_flag = 0; + + /* Skip components which are not intended for this device */ + if (!test_bit(index, bitmap)) + continue; + + /* determine whether this is the start, middle, end, or both + * the start and end of the component tables + */ + if (index == find_first_bit(bitmap, data->component_bitmap_len)) + transfer_flag |= PLDM_TRANSFER_FLAG_START; + if (index == find_last_bit(bitmap, data->component_bitmap_len)) + transfer_flag |= PLDM_TRANSFER_FLAG_END; + if (!transfer_flag) + transfer_flag = PLDM_TRANSFER_FLAG_MIDDLE; + + err = data->context->ops->send_component_table(data->context, + component, + transfer_flag); + if (err) + return err; + } + + return 0; +} + +/** + * pldm_flash_components - Program each component to device flash + * @data: pointer to private data + * + * Loop through each component that is active for the matching device record, + * and send it to the device driver for flashing. + * + * Returns: zero on success, or a negative error code on failure. 
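A worked example of the transfer flag selection above (the bit positions are hypothetical): if the matching record's component bitmap has bits 2, 5 and 7 set, the component table for index 2 is sent with PLDM_TRANSFER_FLAG_START, index 5 with PLDM_TRANSFER_FLAG_MIDDLE, and index 7 with PLDM_TRANSFER_FLAG_END; if only a single bit were set, that one component would carry both the START and END flags in a single call.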
+ */ +static int pldm_flash_components(struct pldmfw_priv *data) +{ + unsigned long *bitmap = data->matching_record->component_bitmap; + struct pldmfw_component *component; + int err; + + list_for_each_entry(component, &data->components, entry) { + u8 index = component->index; + + /* Skip components which are not intended for this device */ + if (!test_bit(index, bitmap)) + continue; + + err = data->context->ops->flash_component(data->context, component); + if (err) + return err; + } + + return 0; +} + +/** + * pldm_finalize_update - Finalize the device flash update + * @data: pointer to private data + * + * Tell the device driver to perform any remaining logic to complete the + * device update. + * + * Returns: zero on success, or a PLFM_FWU error indicating the reason for + * failure. + */ +static int pldm_finalize_update(struct pldmfw_priv *data) +{ + if (data->context->ops->finalize_update) + return data->context->ops->finalize_update(data->context); + + return 0; +} + +/** + * pldmfw_flash_image - Write a PLDM-formatted firmware image to the device + * @context: ops and data for firmware update + * @fw: firmware object pointing to the relevant firmware file to program + * + * Parse the data for a given firmware file, verifying that it is a valid PLDM + * formatted image that matches this device. + * + * Extract the device record Package Data and Component Tables and send them + * to the device firmware. Extract and write the flash data for each of the + * components indicated in the firmware file. + * + * Returns: zero on success, or a negative error code on failure. + */ +int pldmfw_flash_image(struct pldmfw *context, const struct firmware *fw) +{ + struct pldmfw_priv *data; + int err; + + data = (struct pldmfw_priv *)kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + INIT_LIST_HEAD(&data->records); + INIT_LIST_HEAD(&data->components); + + data->fw = fw; + data->context = context; + + err = pldm_parse_image(data); + if (err) + goto out_release_data; + + err = pldm_find_matching_record(data); + if (err) + goto out_release_data; + + err = pldm_send_package_data(data); + if (err) + goto out_release_data; + + err = pldm_send_component_tables(data); + if (err) + goto out_release_data; + + err = pldm_flash_components(data); + if (err) + goto out_release_data; + + err = pldm_finalize_update(data); + +out_release_data: + pldmfw_free_priv(data); + kfree(data); + + return err; +} diff --git a/drivers/net/ethernet/intel/ice/kcompat_pldmfw.h b/drivers/net/ethernet/intel/ice/kcompat_pldmfw.h new file mode 100644 index 0000000000000000000000000000000000000000..fa425cd7ec70dd36b0107925f53efca449b13c8c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_pldmfw.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020, Intel Corporation. 
*/ + +/* This is taken from upstream */ + +#ifndef _KCOMPAT_PLDMFW_H_ +#define _KCOMPAT_PLDMFW_H_ + +#ifdef _PLDMFW_H_ +#error "Do not include both kcompat_pldmfw.h and " +#endif + +#if IS_ENABLED(CONFIG_PLDMFW) +#error "CONFIG_PLDMFW is enabled, use " +#endif + +#include +#include + +#define PLDM_DEVICE_UPDATE_CONTINUE_AFTER_FAIL BIT(0) + +#define PLDM_STRING_TYPE_UNKNOWN 0 +#define PLDM_STRING_TYPE_ASCII 1 +#define PLDM_STRING_TYPE_UTF8 2 +#define PLDM_STRING_TYPE_UTF16 3 +#define PLDM_STRING_TYPE_UTF16LE 4 +#define PLDM_STRING_TYPE_UTF16BE 5 + +struct pldmfw_record { + struct list_head entry; + + /* List of descriptor TLVs */ + struct list_head descs; + + /* Component Set version string*/ + const u8 *version_string; + u8 version_type; + u8 version_len; + + /* Package Data length */ + u16 package_data_len; + + /* Bitfield of Device Update Flags */ + u32 device_update_flags; + + /* Package Data block */ + const u8 *package_data; + + /* Bitmap of components applicable to this record */ + unsigned long *component_bitmap; + u16 component_bitmap_len; +}; + +/* Standard descriptor TLV identifiers */ +#define PLDM_DESC_ID_PCI_VENDOR_ID 0x0000 +#define PLDM_DESC_ID_IANA_ENTERPRISE_ID 0x0001 +#define PLDM_DESC_ID_UUID 0x0002 +#define PLDM_DESC_ID_PNP_VENDOR_ID 0x0003 +#define PLDM_DESC_ID_ACPI_VENDOR_ID 0x0004 +#define PLDM_DESC_ID_PCI_DEVICE_ID 0x0100 +#define PLDM_DESC_ID_PCI_SUBVENDOR_ID 0x0101 +#define PLDM_DESC_ID_PCI_SUBDEV_ID 0x0102 +#define PLDM_DESC_ID_PCI_REVISION_ID 0x0103 +#define PLDM_DESC_ID_PNP_PRODUCT_ID 0x0104 +#define PLDM_DESC_ID_ACPI_PRODUCT_ID 0x0105 +#define PLDM_DESC_ID_VENDOR_DEFINED 0xFFFF + +struct pldmfw_desc_tlv { + struct list_head entry; + + const u8 *data; + u16 type; + u16 size; +}; + +#define PLDM_CLASSIFICATION_UNKNOWN 0x0000 +#define PLDM_CLASSIFICATION_OTHER 0x0001 +#define PLDM_CLASSIFICATION_DRIVER 0x0002 +#define PLDM_CLASSIFICATION_CONFIG_SW 0x0003 +#define PLDM_CLASSIFICATION_APP_SW 0x0004 +#define PLDM_CLASSIFICATION_INSTRUMENTATION 0x0005 +#define PLDM_CLASSIFICATION_BIOS 0x0006 +#define PLDM_CLASSIFICATION_DIAGNOSTIC_SW 0x0007 +#define PLDM_CLASSIFICATION_OS 0x0008 +#define PLDM_CLASSIFICATION_MIDDLEWARE 0x0009 +#define PLDM_CLASSIFICATION_FIRMWARE 0x000A +#define PLDM_CLASSIFICATION_CODE 0x000B +#define PLDM_CLASSIFICATION_SERVICE_PACK 0x000C +#define PLDM_CLASSIFICATION_SOFTWARE_BUNDLE 0x000D + +#define PLDM_ACTIVATION_METHOD_AUTO BIT(0) +#define PLDM_ACTIVATION_METHOD_SELF_CONTAINED BIT(1) +#define PLDM_ACTIVATION_METHOD_MEDIUM_SPECIFIC BIT(2) +#define PLDM_ACTIVATION_METHOD_REBOOT BIT(3) +#define PLDM_ACTIVATION_METHOD_DC_CYCLE BIT(4) +#define PLDM_ACTIVATION_METHOD_AC_CYCLE BIT(5) + +#define PLDMFW_COMPONENT_OPTION_FORCE_UPDATE BIT(0) +#define PLDMFW_COMPONENT_OPTION_USE_COMPARISON_STAMP BIT(1) + +struct pldmfw_component { + struct list_head entry; + + /* component identifier */ + u16 classification; + u16 identifier; + + u16 options; + u16 activation_method; + + u32 comparison_stamp; + + u32 component_size; + const u8 *component_data; + + /* Component version string */ + const u8 *version_string; + u8 version_type; + u8 version_len; + + /* component index */ + u8 index; + +}; + +/* Transfer flag used for sending components to the firmware */ +#define PLDM_TRANSFER_FLAG_START BIT(0) +#define PLDM_TRANSFER_FLAG_MIDDLE BIT(1) +#define PLDM_TRANSFER_FLAG_END BIT(2) + +struct pldmfw_ops; + +/* Main entry point to the PLDM firmware update engine. 
Device drivers + * should embed this in a private structure and use container_of to obtain + * a pointer to their own data, used to implement the device specific + * operations. + */ +struct pldmfw { + const struct pldmfw_ops *ops; + struct device *dev; +}; + +bool pldmfw_op_pci_match_record(struct pldmfw *context, struct pldmfw_record *record); + +/* Operations invoked by the generic PLDM firmware update engine. Used to + * implement device specific logic. + * + * @match_record: check if the device matches the given record. For + * convenience, a standard implementation is provided for PCI devices. + * + * @send_package_data: send the package data associated with the matching + * record to firmware. + * + * @send_component_table: send the component data associated with a given + * component to firmware. Called once for each applicable component. + * + * @flash_component: Flash the data for a given component to the device. + * Called once for each applicable component, after all component tables have + * been sent. + * + * @finalize_update: (optional) Finish the update. Called after all components + * have been flashed. + */ +struct pldmfw_ops { + bool (*match_record)(struct pldmfw *context, struct pldmfw_record *record); + int (*send_package_data)(struct pldmfw *context, const u8 *data, u16 length); + int (*send_component_table)(struct pldmfw *context, struct pldmfw_component *component, + u8 transfer_flag); + int (*flash_component)(struct pldmfw *context, struct pldmfw_component *component); + int (*finalize_update)(struct pldmfw *context); +}; + +int pldmfw_flash_image(struct pldmfw *context, const struct firmware *fw); + +#endif diff --git a/drivers/net/ethernet/intel/ice/kcompat_rhel_defs.h b/drivers/net/ethernet/intel/ice/kcompat_rhel_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..e5ddbe16e6a0f88de863f8a3254e616bc6a0f819 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_rhel_defs.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_RHEL_DEFS_H_ +#define _KCOMPAT_RHEL_DEFS_H_ + +/* This is the RedHat Enterprise Linux distribution specific definitions file. + * It defines what features need backports for a given version of the RHEL + * kernel. + * + * It checks the RHEL_RELEASE_CODE and RHEL_RELEASE_VERSION macros to decide + * what support the target kernel has. + * + * It assumes that kcompat_std_defs.h has already been processed, and will + * #define or #undef any flags that have changed based on backports done by + * RHEL. 
+ */ + +#if !RHEL_RELEASE_CODE +#error "RHEL_RELEASE_CODE is 0 or undefined" +#endif + +#ifndef RHEL_RELEASE_VERSION +#error "RHEL_RELEASE_VERSION is undefined" +#endif + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,3)) +#else /* >= 7.3 */ +#undef NEED_DEV_PRINTK_ONCE +#endif /* 7.3 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,5)) +#else /* >= 7.5 */ +#define HAVE_TCF_EXTS_TO_LIST +#endif /* 7.5 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,6)) +#else /* >= 7.6 */ +#undef NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 +#undef NEED_TC_SETUP_QDISC_MQPRIO +#endif /* 7.6 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,7)) +#else /* >= 7.7 */ +#define HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR +#endif /* 7.7 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)) +#else /* >= 8.0 */ +#undef HAVE_TCF_EXTS_TO_LIST +#define HAVE_TCF_EXTS_FOR_EACH_ACTION +#endif /* 7.5 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,1)) +#define NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#else /* >= 8.1 */ +#undef NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#endif /* 8.1 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,2)) +#else /* >= 8.2 */ +#undef NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY +#undef NEED_SKB_FRAG_OFF +#undef NEED_SKB_FRAG_OFF_ADD +#undef NEED_FLOW_INDR_BLOCK_CB_REGISTER +#define HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID +#endif /* 8.2 */ + +/*****************************************************************************/ +#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,4)) +#else /* >= 8.4 */ +#undef NEED_DEVLINK_PORT_ATTRS_SET_STRUCT +#undef NEED_NET_PREFETCH +#undef NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY +#undef HAVE_XDP_QUERY_PROG +#endif /* 8.4 */ + +#endif /* _KCOMPAT_RHEL_DEFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_sles_defs.h b/drivers/net/ethernet/intel/ice/kcompat_sles_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..5ee6563993d5f251d32b8036e50626686077b01a --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_sles_defs.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_SLES_DEFS_H_ +#define _KCOMPAT_SLES_DEFS_H_ + +/* This is the SUSE Linux Enterprise distribution specific definitions file. + * It defines what features need backports for a given version of the SUSE + * Linux Enterprise kernel. + * + * It checks a combination of the LINUX_VERSION code and the + * SLE_LOCALVERSION_CODE to determine what support the kernel has. + * + * It assumes that kcompat_std_defs.h has already been processed, and will + * #define or #undef any flags that have changed based on backports done by + * SUSE. 
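Illustrative only (the flag name and release number are made up): a new backport following the convention of the RHEL definitions file above would add one more ordered block that flips the corresponding NEED_/HAVE_ flag set by kcompat_std_defs.h:

/*****************************************************************************/
#if (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,5))
#define NEED_EXAMPLE_NEW_HELPER
#else /* >= 8.5 */
#undef NEED_EXAMPLE_NEW_HELPER
#endif /* 8.5 */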
+ */ + +#ifndef LINUX_VERSION_CODE +#error "LINUX_VERSION_CODE is undefined" +#endif + +#ifndef KERNEL_VERSION +#error "KERNEL_VERSION is undefined" +#endif + +#if !SLE_KERNEL_REVISION +#error "SLE_KERNEL_REVISION is 0 or undefined" +#endif + +#if SLE_KERNEL_REVISION > 65535 +#error "SLE_KERNEL_REVISION is unexpectedly large" +#endif + +/* SLE kernel versions are a combination of the LINUX_VERSION_CODE along with + * an extra digit that indicates the SUSE specific revision of that kernel. + * This value is found in the CONFIG_LOCALVERSION of the SUSE kernel, which is + * extracted by common.mk and placed into SLE_KERNEL_REVISION_CODE. + * + * We combine the value of SLE_KERNEL_REVISION along with the LINUX_VERSION code + * to generate the useful value that determines what specific kernel we're + * dealing with. + * + * Just in case the SLE_KERNEL_REVISION ever goes above 255, we reserve 16 bits + * instead of 8 for this value. + */ +#define SLE_KERNEL_CODE ((LINUX_VERSION_CODE << 16) + SLE_KERNEL_REVISION) +#define SLE_KERNEL_VERSION(a,b,c,d) ((KERNEL_VERSION(a,b,c) << 16) + (d)) + +/* Unlike RHEL, SUSE kernels are not always tied to a single service pack. For + * example, 4.12.14 was used as the base for SLE 15 SP1, SLE 12 SP4, and SLE 12 + * SP5. + * + * You can find the patches that SUSE applied to the kernel tree at + * https://github.com/SUSE/kernel-source. + * + * You can find the correct kernel version for a check by using steps similar + * to the following + * + * 1) download the kernel-source repo + * 2) checkout the relevant branch, i.e SLE15-SP3 + * 3) find the relevant backport you're interested in the patches.suse + * directory + * 4) git log to locate the commit that introduced the backport + * 5) git describe --contains to find the relevant tag that includes that + * commit, i.e. rpm-5.3.18-37 + * 6) those digits represent the SLE kernel that introduced that backport. + * + * Try to keep the checks in SLE_KERNEL_CODE order and condense where + * possible. + */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE > SLE_KERNEL_VERSION(4,12,14,23) && \ + SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4,12,14,94)) +/* + * 4.12.14 is used as the base for SLE 12 SP4, SLE 12 SP5, SLE 15, and SLE 15 + * SP1. Unfortunately the revision codes do not line up cleanly. SLE 15 + * launched with 4.12.14-23. It appears that SLE 12 SP4 and SLE 15 SP1 both + * diverged from this point, with SLE 12 SP4 kernels starting around + * 4.12.14-94. A few backports for SLE 15 SP1 landed in some alpha and beta + * kernels tagged between 4.12.14-25 up to 4.12.14-32. These changes did not + * make it into SLE 12 SP4. This was cleaned up with SLE 12 SP5 by an apparent + * merge in 4.12.14-111. The official launch of SLE 15 SP1 ended up with + * version 4.12.14-195. + * + * Because of this inconsistency and because all of these kernels appear to be + * alpha or beta kernel releases for SLE 15 SP1, we do not rely on version + * checks between this range. Issue a warning to indicate that we do not + * support these. 
+ */ +#warning "SLE kernel versions between 4.12.14-23 and 4.12.14-94 are not supported" +#endif + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4,12,14,100)) +#else /* >= 4.12.14-100 */ +#undef HAVE_TCF_EXTS_TO_LIST +#define HAVE_TCF_EXTS_FOR_EACH_ACTION +#endif /* 4.12.14-100 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4,12,14,111)) +#define NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#else /* >= 4.12.14-111 */ +#define HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR +#undef NEED_MACVLAN_ACCEL_PRIV +#undef NEED_MACVLAN_RELEASE_L2FW_OFFLOAD +#undef NEED_MACVLAN_SUPPORTS_DEST_FILTER +#undef NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#endif /* 4.12.14-111 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(4,12,14,120)) +#else /* >= 4.12.14-120 */ +#define HAVE_NDO_SELECT_QUEUE_SB_DEV +#define HAVE_TCF_MIRRED_DEV +#define HAVE_TCF_BLOCK +#define HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +#define HAVE_TCF_BLOCK_CB_REGISTER_EXTACK +#undef NEED_TC_SETUP_QDISC_MQPRIO +#undef NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 +#endif /* 4.12.14-120 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(5,3,8,2)) +#else /* >= 5.3.8-2 */ +#undef NEED_FLOW_INDR_BLOCK_CB_REGISTER +#undef NEED_SKB_FRAG_OFF +#undef NEED_SKB_FRAG_OFF_ADD +#endif /* 5.3.8-2 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(5,3,18,34)) +#else /* >= 5.3.18-34 */ +#undef NEED_DEVLINK_REGION_CREATE_OPS +#undef NEED_DEVLINK_PORT_ATTRS_SET_STRUCT +#endif /* 5.3.18-34 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(5,3,18,37)) +#else /* >= 5.3.18-37 */ +#undef NEED_NET_PREFETCH +#endif /* 5.3.18-37 */ + +/*****************************************************************************/ +#if (SLE_KERNEL_CODE < SLE_KERNEL_VERSION(5,3,18,38)) +#else /* >= 5.3.18-38 */ +#undef NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY +#endif /* 5.3.18-38 */ + +#endif /* _KCOMPAT_SLES_DEFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_std_defs.h b/drivers/net/ethernet/intel/ice/kcompat_std_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..1d657809ff9e2e65a014534c7c43b471bb958362 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_std_defs.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_STD_DEFS_H_ +#define _KCOMPAT_STD_DEFS_H_ + +/* This file contains the definitions for what kernel features need backports + * for a given kernel. It targets only the standard stable kernel releases. + * It must check only LINUX_VERSION_CODE and assume the kernel is a standard + * release, and not a custom distribution. + * + * It must define HAVE_ and NEED_ for features. It must not + * implement any backports, instead leaving the implementation to the + * kcompat_impl.h header. + * + * If a feature can be easily implemented as a replacement macro or fully + * backported, use a NEED_ to indicate that the feature needs + * a backport. (If NEED_ is undefined, then no backport for that feature + * is needed). 
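+ *
+ * As a hedged sketch of how a NEED_ flag is later consumed (the body shown
+ * is only an illustration of the idea, not necessarily the exact code in
+ * kcompat_impl.h):
+ *
+ *	#ifdef NEED_NET_PREFETCH
+ *	static inline void net_prefetch(void *p)
+ *	{
+ *		prefetch(p);
+ *	}
+ *	#endif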
+ * + * If a feature cannot be easily implemented in kcompat directly, but + * requires drivers to make specific changes such as stripping out an entire + * feature or modifying a function pointer prototype, use a HAVE_. + */ + +#ifndef LINUX_VERSION_CODE +#error "LINUX_VERSION_CODE is undefined" +#endif + +#ifndef KERNEL_VERSION +#error "KERNEL_VERSION is undefined" +#endif + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0)) +#define NEED_DEV_PRINTK_ONCE +#else /* >= 3,19,0 */ +#endif /* 3,19,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0)) +#else /* >= 4,8,0 */ +#define HAVE_TCF_EXTS_TO_LIST +#endif /* 4,8,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0)) +#define NEED_TC_SETUP_QDISC_MQPRIO +#else /* >= 4,15,0 */ +#define HAVE_TC_CB_AND_SETUP_QDISC_MQPRIO +#endif /* 4,15,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0)) +#define NEED_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 +#else /* >= 4,16,0 */ +#endif /* 4,16,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,17,0)) +#define NEED_CONVERT_ART_NS_TO_TSC +#else /* >= 4,17,0 */ +#endif /* 4,17,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0)) +#define NEED_MACVLAN_ACCEL_PRIV +#define NEED_MACVLAN_RELEASE_L2FW_OFFLOAD +#define NEED_MACVLAN_SUPPORTS_DEST_FILTER +#else /* >= 4,18,0 */ +#define HAVE_DEVLINK_PORT_ATTRS_SET_PORT_FLAVOUR +#endif /* 4,18,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)) +#define NEED_IDA_ALLOC_MIN_MAX_RANGE_FREE +#else /* >= 4,19,0 */ +#undef HAVE_TCF_EXTS_TO_LIST +#define HAVE_TCF_EXTS_FOR_EACH_ACTION +#endif /* 4,19,0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,2,0)) +#else /* >= 5.2.0 */ +#define HAVE_DEVLINK_PORT_ATTRS_SET_SWITCH_ID +#endif /* 5.2.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)) +#define NEED_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY +#else /* >= 5.3.0 */ +#endif /* 5.3.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,7,0)) +#define NEED_DEVLINK_REGION_CREATE_OPS +#else /* >= 5.7.0 */ +#endif /* 5.7.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0)) +#define NEED_SKB_FRAG_OFF_ADD +#define NEED_SKB_FRAG_OFF +#if (LINUX_VERSION_CODE > KERNEL_VERSION(4,19,200) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)) +#undef NEED_SKB_FRAG_OFF +#endif /* 4.19.X for X > 201 */ + +#define NEED_FLOW_INDR_BLOCK_CB_REGISTER +#else /* >= 5.4.0 */ +#endif /* 5.4.0 */ + +/*****************************************************************************/ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,9,0)) +#define NEED_DEVLINK_PORT_ATTRS_SET_STRUCT +#define HAVE_XDP_QUERY_PROG +#else /* >= 5.9.0 */ +#endif /* 5.9.0 */ + +/*****************************************************************************/ 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0)) +#define NEED_NET_PREFETCH +#define NEED_DEVLINK_FLASH_UPDATE_TIMEOUT_NOTIFY +#else /* >= 5.10.0 */ +#endif /* 5.10.0 */ + +#endif /* _KCOMPAT_STD_DEFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/kcompat_ubuntu_defs.h b/drivers/net/ethernet/intel/ice/kcompat_ubuntu_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..9da611f64f2189f1d7e566737958fe72927bda23 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/kcompat_ubuntu_defs.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + +#ifndef _KCOMPAT_UBUNTU_DEFS_H_ +#define _KCOMPAT_UBUNTU_DEFS_H_ + +/* This file contains the definitions for the Ubuntu specific distribution of + * the Linux kernel. + * + * It checks the UBUNTU_VERSION_CODE to decide which features are available in + * the target kernel. It assumes that kcompat_std_defs.h has already been + * processed, and will #define or #undef the relevant flags based on what + * features were backported by Ubuntu. + */ + +#if !UTS_UBUNTU_RELEASE_ABI +#error "UTS_UBUNTU_RELEASE_ABI is 0 or undefined" +#endif + +#if !UBUNTU_VERSION_CODE +#error "UBUNTU_VERSION_CODE is 0 or undefined" +#endif + +#ifndef UBUNTU_VERSION +#error "UBUNTU_VERSION is undefined" +#endif + +#endif /* _KCOMPAT_UBUNTU_DEFS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/virtchnl.h b/drivers/net/ethernet/intel/ice/virtchnl.h new file mode 100644 index 0000000000000000000000000000000000000000..dd7fade36bd3e5eded2048fb59214486fe5b181c --- /dev/null +++ b/drivers/net/ethernet/intel/ice/virtchnl.h @@ -0,0 +1,2232 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. */ + + +#ifndef _VIRTCHNL_H_ +#define _VIRTCHNL_H_ + +/* Description: + * This header file describes the Virtual Function (VF) - Physical Function + * (PF) communication protocol used by the drivers for all devices starting + * from our 40G product line + * + * Admin queue buffer usage: + * desc->opcode is always aqc_opc_send_msg_to_pf + * flags, retval, datalen, and data addr are all used normally. + * The Firmware copies the cookie fields when sending messages between the + * PF and VF, but uses all other fields internally. Due to this limitation, + * we must send all messages as "indirect", i.e. using an external buffer. + * + * All the VSI indexes are relative to the VF. Each VF can have maximum of + * three VSIs. All the queue indexes are relative to the VSI. Each VF can + * have a maximum of sixteen queues for all of its VSIs. + * + * The PF is required to return a status code in v_retval for all messages + * except RESET_VF, which does not require any response. The returned value + * is of virtchnl_status_code type, defined in the shared type.h. + * + * In general, VF driver initialization should roughly follow the order of + * these opcodes. The VF driver must first validate the API version of the + * PF driver, then request a reset, then get resources, then configure + * queues and interrupts. After these operations are complete, the VF + * driver may start its queues, optionally add MAC and VLAN filters, and + * process traffic. 
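+ *
+ * As an illustration only (the exact set of messages depends on negotiated
+ * capabilities), a minimal bring-up exchanges roughly these opcodes in
+ * order:
+ *
+ *	VIRTCHNL_OP_VERSION
+ *	VIRTCHNL_OP_RESET_VF
+ *	VIRTCHNL_OP_GET_VF_RESOURCES
+ *	VIRTCHNL_OP_CONFIG_VSI_QUEUES
+ *	VIRTCHNL_OP_CONFIG_IRQ_MAP
+ *	VIRTCHNL_OP_ENABLE_QUEUES
+ *	VIRTCHNL_OP_ADD_ETH_ADDR and/or VIRTCHNL_OP_ADD_VLAN (optional filters)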
+ */ + +/* START GENERIC DEFINES + * Need to ensure the following enums and defines hold the same meaning and + * value in current and future projects + */ + + +/* Error Codes */ +enum virtchnl_status_code { + VIRTCHNL_STATUS_SUCCESS = 0, + VIRTCHNL_STATUS_ERR_PARAM = -5, + VIRTCHNL_STATUS_ERR_NO_MEMORY = -18, + VIRTCHNL_STATUS_ERR_OPCODE_MISMATCH = -38, + VIRTCHNL_STATUS_ERR_CQP_COMPL_ERROR = -39, + VIRTCHNL_STATUS_ERR_INVALID_VF_ID = -40, + VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR = -53, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED = -64, +}; + +/* Backward compatibility */ +#define VIRTCHNL_ERR_PARAM VIRTCHNL_STATUS_ERR_PARAM +#define VIRTCHNL_STATUS_NOT_SUPPORTED VIRTCHNL_STATUS_ERR_NOT_SUPPORTED + +#define VIRTCHNL_LINK_SPEED_2_5GB_SHIFT 0x0 +#define VIRTCHNL_LINK_SPEED_100MB_SHIFT 0x1 +#define VIRTCHNL_LINK_SPEED_1000MB_SHIFT 0x2 +#define VIRTCHNL_LINK_SPEED_10GB_SHIFT 0x3 +#define VIRTCHNL_LINK_SPEED_40GB_SHIFT 0x4 +#define VIRTCHNL_LINK_SPEED_20GB_SHIFT 0x5 +#define VIRTCHNL_LINK_SPEED_25GB_SHIFT 0x6 +#define VIRTCHNL_LINK_SPEED_5GB_SHIFT 0x7 + +enum virtchnl_link_speed { + VIRTCHNL_LINK_SPEED_UNKNOWN = 0, + VIRTCHNL_LINK_SPEED_100MB = BIT(VIRTCHNL_LINK_SPEED_100MB_SHIFT), + VIRTCHNL_LINK_SPEED_1GB = BIT(VIRTCHNL_LINK_SPEED_1000MB_SHIFT), + VIRTCHNL_LINK_SPEED_10GB = BIT(VIRTCHNL_LINK_SPEED_10GB_SHIFT), + VIRTCHNL_LINK_SPEED_40GB = BIT(VIRTCHNL_LINK_SPEED_40GB_SHIFT), + VIRTCHNL_LINK_SPEED_20GB = BIT(VIRTCHNL_LINK_SPEED_20GB_SHIFT), + VIRTCHNL_LINK_SPEED_25GB = BIT(VIRTCHNL_LINK_SPEED_25GB_SHIFT), + VIRTCHNL_LINK_SPEED_2_5GB = BIT(VIRTCHNL_LINK_SPEED_2_5GB_SHIFT), + VIRTCHNL_LINK_SPEED_5GB = BIT(VIRTCHNL_LINK_SPEED_5GB_SHIFT), +}; + +/* for hsplit_0 field of Rx HMC context */ +/* deprecated with AVF 1.0 */ +enum virtchnl_rx_hsplit { + VIRTCHNL_RX_HSPLIT_NO_SPLIT = 0, + VIRTCHNL_RX_HSPLIT_SPLIT_L2 = 1, + VIRTCHNL_RX_HSPLIT_SPLIT_IP = 2, + VIRTCHNL_RX_HSPLIT_SPLIT_TCP_UDP = 4, + VIRTCHNL_RX_HSPLIT_SPLIT_SCTP = 8, +}; + +/* END GENERIC DEFINES */ + +/* Opcodes for VF-PF communication. These are placed in the v_opcode field + * of the virtchnl_msg structure. + */ +enum virtchnl_ops { +/* The PF sends status change events to VFs using + * the VIRTCHNL_OP_EVENT opcode. + * VFs send requests to the PF using the other ops. + * Use of "advanced opcode" features must be negotiated as part of capabilities + * exchange and are not considered part of base mode feature set. 
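+ *
+ * For example, an advanced opcode such as VIRTCHNL_OP_IWARP is only
+ * meaningful after the VF has negotiated the matching capability
+ * (VIRTCHNL_VF_OFFLOAD_IWARP); a PF would typically answer an un-negotiated
+ * advanced opcode with VIRTCHNL_STATUS_ERR_NOT_SUPPORTED.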
+ */ + VIRTCHNL_OP_UNKNOWN = 0, + VIRTCHNL_OP_VERSION = 1, /* must ALWAYS be 1 */ + VIRTCHNL_OP_RESET_VF = 2, + VIRTCHNL_OP_GET_VF_RESOURCES = 3, + VIRTCHNL_OP_CONFIG_TX_QUEUE = 4, + VIRTCHNL_OP_CONFIG_RX_QUEUE = 5, + VIRTCHNL_OP_CONFIG_VSI_QUEUES = 6, + VIRTCHNL_OP_CONFIG_IRQ_MAP = 7, + VIRTCHNL_OP_ENABLE_QUEUES = 8, + VIRTCHNL_OP_DISABLE_QUEUES = 9, + VIRTCHNL_OP_ADD_ETH_ADDR = 10, + VIRTCHNL_OP_DEL_ETH_ADDR = 11, + VIRTCHNL_OP_ADD_VLAN = 12, + VIRTCHNL_OP_DEL_VLAN = 13, + VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE = 14, + VIRTCHNL_OP_GET_STATS = 15, + VIRTCHNL_OP_RSVD = 16, + VIRTCHNL_OP_EVENT = 17, /* must ALWAYS be 17 */ + /* opcode 19 is reserved */ + VIRTCHNL_OP_IWARP = 20, /* advanced opcode */ + VIRTCHNL_OP_RDMA = VIRTCHNL_OP_IWARP, + VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP = 21, /* advanced opcode */ + VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP = VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP, + VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP = 22, /* advanced opcode */ + VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP = VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP, + VIRTCHNL_OP_CONFIG_RSS_KEY = 23, + VIRTCHNL_OP_CONFIG_RSS_LUT = 24, + VIRTCHNL_OP_GET_RSS_HENA_CAPS = 25, + VIRTCHNL_OP_SET_RSS_HENA = 26, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28, + VIRTCHNL_OP_REQUEST_QUEUES = 29, + VIRTCHNL_OP_ENABLE_CHANNELS = 30, + VIRTCHNL_OP_DISABLE_CHANNELS = 31, + VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, + VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, + /* opcode 34 is reserved */ + VIRTCHNL_OP_DCF_VLAN_OFFLOAD = 38, + VIRTCHNL_OP_DCF_CMD_DESC = 39, + VIRTCHNL_OP_DCF_CMD_BUFF = 40, + VIRTCHNL_OP_DCF_DISABLE = 41, + VIRTCHNL_OP_DCF_GET_VSI_MAP = 42, + VIRTCHNL_OP_DCF_GET_PKG_INFO = 43, + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS = 44, + VIRTCHNL_OP_ADD_RSS_CFG = 45, + VIRTCHNL_OP_DEL_RSS_CFG = 46, + VIRTCHNL_OP_ADD_FDIR_FILTER = 47, + VIRTCHNL_OP_DEL_FDIR_FILTER = 48, + VIRTCHNL_OP_GET_MAX_RSS_QREGION = 50, + VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS = 51, + VIRTCHNL_OP_ADD_VLAN_V2 = 52, + VIRTCHNL_OP_DEL_VLAN_V2 = 53, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 = 54, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55, + VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 = 56, + VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57, + VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2 = 58, + VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2 = 59, + /* opcodes 60 through 69 are reserved */ + VIRTCHNL_OP_ENABLE_QUEUES_V2 = 107, + VIRTCHNL_OP_DISABLE_QUEUES_V2 = 108, + VIRTCHNL_OP_MAP_QUEUE_VECTOR = 111, + VIRTCHNL_OP_DCF_RULE_FLUSH = 6000, + VIRTCHNL_OP_MAX, +}; + +static inline const char *virtchnl_op_str(enum virtchnl_ops v_opcode) +{ + switch (v_opcode) { + case VIRTCHNL_OP_UNKNOWN: + return "VIRTCHNL_OP_UNKNOWN"; + case VIRTCHNL_OP_VERSION: + return "VIRTCHNL_OP_VERSION"; + case VIRTCHNL_OP_RESET_VF: + return "VIRTCHNL_OP_RESET_VF"; + case VIRTCHNL_OP_GET_VF_RESOURCES: + return "VIRTCHNL_OP_GET_VF_RESOURCES"; + case VIRTCHNL_OP_CONFIG_TX_QUEUE: + return "VIRTCHNL_OP_CONFIG_TX_QUEUE"; + case VIRTCHNL_OP_CONFIG_RX_QUEUE: + return "VIRTCHNL_OP_CONFIG_RX_QUEUE"; + case VIRTCHNL_OP_CONFIG_VSI_QUEUES: + return "VIRTCHNL_OP_CONFIG_VSI_QUEUES"; + case VIRTCHNL_OP_CONFIG_IRQ_MAP: + return "VIRTCHNL_OP_CONFIG_IRQ_MAP"; + case VIRTCHNL_OP_ENABLE_QUEUES: + return "VIRTCHNL_OP_ENABLE_QUEUES"; + case VIRTCHNL_OP_DISABLE_QUEUES: + return "VIRTCHNL_OP_DISABLE_QUEUES"; + case VIRTCHNL_OP_ADD_ETH_ADDR: + return "VIRTCHNL_OP_ADD_ETH_ADDR"; + case VIRTCHNL_OP_DEL_ETH_ADDR: + return "VIRTCHNL_OP_DEL_ETH_ADDR"; + case VIRTCHNL_OP_ADD_VLAN: + return "VIRTCHNL_OP_ADD_VLAN"; + case VIRTCHNL_OP_DEL_VLAN: + return "VIRTCHNL_OP_DEL_VLAN"; + 
case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + return "VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE"; + case VIRTCHNL_OP_GET_STATS: + return "VIRTCHNL_OP_GET_STATS"; + case VIRTCHNL_OP_RSVD: + return "VIRTCHNL_OP_RSVD"; + case VIRTCHNL_OP_EVENT: + return "VIRTCHNL_OP_EVENT"; + case VIRTCHNL_OP_RDMA: + return "VIRTCHNL_OP_RDMA"; + case VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP: + return "VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP:"; + case VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP: + return "VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP"; + case VIRTCHNL_OP_CONFIG_RSS_KEY: + return "VIRTCHNL_OP_CONFIG_RSS_KEY"; + case VIRTCHNL_OP_CONFIG_RSS_LUT: + return "VIRTCHNL_OP_CONFIG_RSS_LUT"; + case VIRTCHNL_OP_GET_RSS_HENA_CAPS: + return "VIRTCHNL_OP_GET_RSS_HENA_CAPS"; + case VIRTCHNL_OP_SET_RSS_HENA: + return "VIRTCHNL_OP_SET_RSS_HENA"; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + return "VIRTCHNL_OP_ENABLE_VLAN_STRIPPING"; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + return "VIRTCHNL_OP_DISABLE_VLAN_STRIPPING"; + case VIRTCHNL_OP_REQUEST_QUEUES: + return "VIRTCHNL_OP_REQUEST_QUEUES"; + case VIRTCHNL_OP_ENABLE_CHANNELS: + return "VIRTCHNL_OP_ENABLE_CHANNELS"; + case VIRTCHNL_OP_DISABLE_CHANNELS: + return "VIRTCHNL_OP_DISABLE_CHANNELS"; + case VIRTCHNL_OP_ADD_CLOUD_FILTER: + return "VIRTCHNL_OP_ADD_CLOUD_FILTER"; + case VIRTCHNL_OP_DEL_CLOUD_FILTER: + return "VIRTCHNL_OP_DEL_CLOUD_FILTER"; + case VIRTCHNL_OP_DCF_CMD_DESC: + return "VIRTCHNL_OP_DCF_CMD_DESC"; + case VIRTCHNL_OP_DCF_CMD_BUFF: + return "VIRTCHHNL_OP_DCF_CMD_BUFF"; + case VIRTCHNL_OP_DCF_DISABLE: + return "VIRTCHNL_OP_DCF_DISABLE"; + case VIRTCHNL_OP_DCF_GET_VSI_MAP: + return "VIRTCHNL_OP_DCF_GET_VSI_MAP"; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + return "VIRTCHNL_OP_GET_SUPPORTED_RXDIDS"; + case VIRTCHNL_OP_ADD_RSS_CFG: + return "VIRTCHNL_OP_ADD_RSS_CFG"; + case VIRTCHNL_OP_DEL_RSS_CFG: + return "VIRTCHNL_OP_DEL_RSS_CFG"; + case VIRTCHNL_OP_ADD_FDIR_FILTER: + return "VIRTCHNL_OP_ADD_FDIR_FILTER"; + case VIRTCHNL_OP_DEL_FDIR_FILTER: + return "VIRTCHNL_OP_DEL_FDIR_FILTER"; + case VIRTCHNL_OP_GET_MAX_RSS_QREGION: + return "VIRTCHNL_OP_GET_MAX_RSS_QREGION"; + case VIRTCHNL_OP_ENABLE_QUEUES_V2: + return "VIRTCHNL_OP_ENABLE_QUEUES_V2"; + case VIRTCHNL_OP_DISABLE_QUEUES_V2: + return "VIRTCHNL_OP_DISABLE_QUEUES_V2"; + case VIRTCHNL_OP_MAP_QUEUE_VECTOR: + return "VIRTCHNL_OP_MAP_QUEUE_VECTOR"; + case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS: + return "VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS"; + case VIRTCHNL_OP_ADD_VLAN_V2: + return "VIRTCHNL_OP_ADD_VLAN_V2"; + case VIRTCHNL_OP_DEL_VLAN_V2: + return "VIRTCHNL_OP_DEL_VLAN_V2"; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2: + return "VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2"; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2: + return "VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2"; + case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2: + return "VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2"; + case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: + return "VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2"; + case VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2: + return "VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2"; + case VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2: + return "VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2"; + case VIRTCHNL_OP_MAX: + return "VIRTCHNL_OP_MAX"; + default: + return "Unsupported (update virtchnl.h)"; + } +} + +/* These macros are used to generate compilation errors if a structure/union + * is not exactly the correct length. It gives a divide by zero error if the + * structure/union is not of the correct size, otherwise it creates an enum + * that is never used. 
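+ *
+ * For example, VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_version_info) expands
+ * to an enum whose single member is initialized to
+ * 8 / ((sizeof(struct virtchnl_version_info) == 8) ? 1 : 0), so the build
+ * breaks with a divide-by-zero as soon as the structure is no longer
+ * exactly 8 bytes.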
+ */ +#define VIRTCHNL_CHECK_STRUCT_LEN(n, X) enum virtchnl_static_assert_enum_##X \ + { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } +#define VIRTCHNL_CHECK_UNION_LEN(n, X) enum virtchnl_static_asset_enum_##X \ + { virtchnl_static_assert_##X = (n)/((sizeof(union X) == (n)) ? 1 : 0) } + +/* Message descriptions and data structures. */ + +/* VIRTCHNL_OP_VERSION + * VF posts its version number to the PF. PF responds with its version number + * in the same format, along with a return code. + * Reply from PF has its major/minor versions also in param0 and param1. + * If there is a major version mismatch, then the VF cannot operate. + * If there is a minor version mismatch, then the VF can operate but should + * add a warning to the system log. + * + * This enum element MUST always be specified as == 1, regardless of other + * changes in the API. The PF must always respond to this message without + * error regardless of version mismatch. + */ +#define VIRTCHNL_VERSION_MAJOR 1 +#define VIRTCHNL_VERSION_MINOR 1 +#define VIRTCHNL_VERSION_MAJOR_2 2 +#define VIRTCHNL_VERSION_MINOR_0 0 +#define VIRTCHNL_VERSION_MINOR_NO_VF_CAPS 0 + +struct virtchnl_version_info { + u32 major; + u32 minor; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_version_info); + +#define VF_IS_V10(_ver) (((_ver)->major == 1) && ((_ver)->minor == 0)) +#define VF_IS_V11(_ver) (((_ver)->major == 1) && ((_ver)->minor == 1)) +#define VF_IS_V20(_ver) (((_ver)->major == 2) && ((_ver)->minor == 0)) + +/* VIRTCHNL_OP_RESET_VF + * VF sends this request to PF with no parameters + * PF does NOT respond! VF driver must delay then poll VFGEN_RSTAT register + * until reset completion is indicated. The admin queue must be reinitialized + * after this operation. + * + * When reset is complete, PF must ensure that all queues in all VSIs associated + * with the VF are stopped, all queue configurations in the HMC are set to 0, + * and all MAC and VLAN filters (except the default MAC address) on all VSIs + * are cleared. + */ + +/* VSI types that use VIRTCHNL interface for VF-PF communication. VSI_SRIOV + * vsi_type should always be 6 for backward compatibility. Add other fields + * as needed. + */ +enum virtchnl_vsi_type { + VIRTCHNL_VSI_TYPE_INVALID = 0, + VIRTCHNL_VSI_SRIOV = 6, +}; + +/* VIRTCHNL_OP_GET_VF_RESOURCES + * Version 1.0 VF sends this request to PF with no parameters + * Version 1.1 VF sends this request to PF with u32 bitmap of its capabilities + * PF responds with an indirect message containing + * virtchnl_vf_resource and one or more + * virtchnl_vsi_resource structures. + */ + +struct virtchnl_vsi_resource { + u16 vsi_id; + u16 num_queue_pairs; + + /* see enum virtchnl_vsi_type */ + s32 vsi_type; + u16 qset_handle; + u8 default_mac_addr[ETH_ALEN]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); + +/* VF capability flags + * VIRTCHNL_VF_OFFLOAD_L2 flag is inclusive of base mode L2 offloads including + * TX/RX Checksum offloading and TSO for non-tunnelled packets. 
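+ *
+ * For example, a version 1.1 VF that only needs the base feature set could
+ * pass VF_BASE_MODE_OFFLOADS (defined below), i.e.
+ * VIRTCHNL_VF_OFFLOAD_L2 | VIRTCHNL_VF_OFFLOAD_VLAN |
+ * VIRTCHNL_VF_OFFLOAD_RSS_PF, as the capability bitmap in its
+ * VIRTCHNL_OP_GET_VF_RESOURCES request.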
+ */ +#define VIRTCHNL_VF_OFFLOAD_L2 BIT(0) +#define VIRTCHNL_VF_OFFLOAD_IWARP BIT(1) +#define VIRTCHNL_VF_CAP_RDMA VIRTCHNL_VF_OFFLOAD_IWARP +#define VIRTCHNL_VF_OFFLOAD_RSS_AQ BIT(3) +#define VIRTCHNL_VF_OFFLOAD_RSS_REG BIT(4) +#define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR BIT(5) +#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES BIT(6) +/* used to negotiate communicating link speeds in Mbps */ +#define VIRTCHNL_VF_CAP_ADV_LINK_SPEED BIT(7) + /* BIT(8) is reserved */ +#define VIRTCHNL_VF_LARGE_NUM_QPAIRS BIT(9) +#define VIRTCHNL_VF_OFFLOAD_CRC BIT(10) +#define VIRTCHNL_VF_OFFLOAD_VLAN_V2 BIT(15) +#define VIRTCHNL_VF_OFFLOAD_VLAN BIT(16) +#define VIRTCHNL_VF_OFFLOAD_RX_POLLING BIT(17) +#define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 BIT(18) +#define VIRTCHNL_VF_OFFLOAD_RSS_PF BIT(19) +#define VIRTCHNL_VF_OFFLOAD_ENCAP BIT(20) +#define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM BIT(21) +#define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM BIT(22) +#define VIRTCHNL_VF_OFFLOAD_ADQ BIT(23) +#define VIRTCHNL_VF_OFFLOAD_ADQ_V2 BIT(24) +#define VIRTCHNL_VF_OFFLOAD_USO BIT(25) +#define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC BIT(26) +#define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) +#define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) +#define VIRTCHNL_VF_CAP_DCF BIT(30) + /* BIT(31) is reserved */ + +#define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \ + VIRTCHNL_VF_OFFLOAD_VLAN | \ + VIRTCHNL_VF_OFFLOAD_RSS_PF) + +struct virtchnl_vf_resource { + u16 num_vsis; + u16 num_queue_pairs; + u16 max_vectors; + u16 max_mtu; + + u32 vf_cap_flags; + u32 rss_key_size; + u32 rss_lut_size; + + struct virtchnl_vsi_resource vsi_res[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(36, virtchnl_vf_resource); + +/* VIRTCHNL_OP_CONFIG_TX_QUEUE + * VF sends this message to set up parameters for one TX queue. + * External data buffer contains one instance of virtchnl_txq_info. + * PF configures requested queue and returns a status code. + */ + +/* Tx queue config info */ +struct virtchnl_txq_info { + u16 vsi_id; + u16 queue_id; + u16 ring_len; /* number of descriptors, multiple of 8 */ + u16 headwb_enabled; /* deprecated with AVF 1.0 */ + u64 dma_ring_addr; + u64 dma_headwb_addr; /* deprecated with AVF 1.0 */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_txq_info); + +/* RX descriptor IDs (range from 0 to 63) */ +enum virtchnl_rx_desc_ids { + VIRTCHNL_RXDID_0_16B_BASE = 0, + /* 32B_BASE and FLEX_SPLITQ share desc ids as default descriptors + * because they can be differentiated based on queue model; e.g. single + * queue model can only use 32B_BASE and split queue model can only use + * FLEX_SPLITQ. Having these as 1 allows them to be used as default + * descriptors without negotiation. 
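+ *
+ * For example, a VF that has not negotiated VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC
+ * does not have to negotiate a descriptor id at all; its Rx queues simply
+ * default to the VIRTCHNL_RXDID_1_32B_BASE layout (see the rxdid field of
+ * struct virtchnl_rxq_info below).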
+ */ + VIRTCHNL_RXDID_1_32B_BASE = 1, + VIRTCHNL_RXDID_1_FLEX_SPLITQ = 1, + VIRTCHNL_RXDID_2_FLEX_SQ_NIC = 2, + VIRTCHNL_RXDID_3_FLEX_SQ_SW = 3, + VIRTCHNL_RXDID_4_FLEX_SQ_NIC_VEB = 4, + VIRTCHNL_RXDID_5_FLEX_SQ_NIC_ACL = 5, + VIRTCHNL_RXDID_6_FLEX_SQ_NIC_2 = 6, + VIRTCHNL_RXDID_7_HW_RSVD = 7, + /* 9 through 15 are reserved */ + VIRTCHNL_RXDID_16_COMMS_GENERIC = 16, + VIRTCHNL_RXDID_17_COMMS_AUX_VLAN = 17, + VIRTCHNL_RXDID_18_COMMS_AUX_IPV4 = 18, + VIRTCHNL_RXDID_19_COMMS_AUX_IPV6 = 19, + VIRTCHNL_RXDID_20_COMMS_AUX_FLOW = 20, + VIRTCHNL_RXDID_21_COMMS_AUX_TCP = 21, + /* 22 through 63 are reserved */ +}; + +/* RX descriptor ID bitmasks */ +enum virtchnl_rx_desc_id_bitmasks { + VIRTCHNL_RXDID_0_16B_BASE_M = BIT(VIRTCHNL_RXDID_0_16B_BASE), + VIRTCHNL_RXDID_1_32B_BASE_M = BIT(VIRTCHNL_RXDID_1_32B_BASE), + VIRTCHNL_RXDID_1_FLEX_SPLITQ_M = BIT(VIRTCHNL_RXDID_1_FLEX_SPLITQ), + VIRTCHNL_RXDID_2_FLEX_SQ_NIC_M = BIT(VIRTCHNL_RXDID_2_FLEX_SQ_NIC), + VIRTCHNL_RXDID_3_FLEX_SQ_SW_M = BIT(VIRTCHNL_RXDID_3_FLEX_SQ_SW), + VIRTCHNL_RXDID_4_FLEX_SQ_NIC_VEB_M = BIT(VIRTCHNL_RXDID_4_FLEX_SQ_NIC_VEB), + VIRTCHNL_RXDID_5_FLEX_SQ_NIC_ACL_M = BIT(VIRTCHNL_RXDID_5_FLEX_SQ_NIC_ACL), + VIRTCHNL_RXDID_6_FLEX_SQ_NIC_2_M = BIT(VIRTCHNL_RXDID_6_FLEX_SQ_NIC_2), + VIRTCHNL_RXDID_7_HW_RSVD_M = BIT(VIRTCHNL_RXDID_7_HW_RSVD), + /* 9 through 15 are reserved */ + VIRTCHNL_RXDID_16_COMMS_GENERIC_M = BIT(VIRTCHNL_RXDID_16_COMMS_GENERIC), + VIRTCHNL_RXDID_17_COMMS_AUX_VLAN_M = BIT(VIRTCHNL_RXDID_17_COMMS_AUX_VLAN), + VIRTCHNL_RXDID_18_COMMS_AUX_IPV4_M = BIT(VIRTCHNL_RXDID_18_COMMS_AUX_IPV4), + VIRTCHNL_RXDID_19_COMMS_AUX_IPV6_M = BIT(VIRTCHNL_RXDID_19_COMMS_AUX_IPV6), + VIRTCHNL_RXDID_20_COMMS_AUX_FLOW_M = BIT(VIRTCHNL_RXDID_20_COMMS_AUX_FLOW), + VIRTCHNL_RXDID_21_COMMS_AUX_TCP_M = BIT(VIRTCHNL_RXDID_21_COMMS_AUX_TCP), + /* 22 through 63 are reserved */ +}; + +/* VIRTCHNL_OP_CONFIG_RX_QUEUE + * VF sends this message to set up parameters for one RX queue. + * External data buffer contains one instance of virtchnl_rxq_info. + * PF configures requested queue and returns a status code. The + * crc_disable flag disables CRC stripping on the VF. Setting + * the crc_disable flag to 1 will disable CRC stripping for each + * queue in the VF where the flag is set. The VIRTCHNL_VF_OFFLOAD_CRC + * offload must have been set prior to sending this info or the PF + * will ignore the request. This flag should be set the same for + * all of the queues for a VF. + */ + +/* Rx queue config info */ +struct virtchnl_rxq_info { + u16 vsi_id; + u16 queue_id; + u32 ring_len; /* number of descriptors, multiple of 32 */ + u16 hdr_size; + u16 splithdr_enabled; /* deprecated with AVF 1.0 */ + u32 databuffer_size; + u32 max_pkt_size; + u8 crc_disable; + /* see enum virtchnl_rx_desc_ids; + * only used when VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC is supported. Note + * that when the offload is not supported, the descriptor format aligns + * with VIRTCHNL_RXDID_1_32B_BASE. + */ + u8 rxdid; + u8 pad1[2]; + u64 dma_ring_addr; + + /* see enum virtchnl_rx_hsplit; deprecated with AVF 1.0 */ + s32 rx_split_pos; + u32 pad2; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_rxq_info); + +/* VIRTCHNL_OP_CONFIG_VSI_QUEUES + * VF sends this message to set parameters for active TX and RX queues + * associated with the specified VSI. + * PF configures queues and returns status. + * If the number of queues specified is greater than the number of queues + * associated with the VSI, an error is returned and no queues are configured. 
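+ *
+ * As an illustration (the VSI id and queue count are arbitrary), a VF
+ * configuring four queue pairs on VSI 3 sends one buffer holding a
+ * virtchnl_vsi_queue_config_info with vsi_id = 3, num_queue_pairs = 4 and
+ * qpair[0..3] filled in, each entry carrying a virtchnl_txq_info and
+ * virtchnl_rxq_info for the same vsi_id and queue_id.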
+ * NOTE: The VF is not required to configure all queues in a single request. + * It may send multiple messages. PF drivers must correctly handle all VF + * requests. + */ +struct virtchnl_queue_pair_info { + /* NOTE: vsi_id and queue_id should be identical for both queues. */ + struct virtchnl_txq_info txq; + struct virtchnl_rxq_info rxq; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(64, virtchnl_queue_pair_info); + +struct virtchnl_vsi_queue_config_info { + u16 vsi_id; + u16 num_queue_pairs; + u32 pad; + struct virtchnl_queue_pair_info qpair[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_vsi_queue_config_info); + +/* VIRTCHNL_OP_REQUEST_QUEUES + * VF sends this message to request the PF to allocate additional queues to + * this VF. Each VF gets a guaranteed number of queues on init but asking for + * additional queues must be negotiated. This is a best effort request as it + * is possible the PF does not have enough queues left to support the request. + * If the PF cannot support the number requested it will respond with the + * maximum number it is able to support. If the request is successful, PF will + * then reset the VF to institute required changes. + */ + +/* VF resource request */ +struct virtchnl_vf_res_request { + u16 num_queue_pairs; +}; + +/* VIRTCHNL_OP_CONFIG_IRQ_MAP + * VF uses this message to map vectors to queues. + * The rxq_map and txq_map fields are bitmaps used to indicate which queues + * are to be associated with the specified vector. + * The "other" causes are always mapped to vector 0. The VF may not request + * that vector 0 be used for traffic. + * PF configures interrupt mapping and returns status. + * NOTE: due to hardware requirements, all active queues (both TX and RX) + * should be mapped to interrupts, even if the driver intends to operate + * only in polling mode. In this case the interrupt may be disabled, but + * the ITR timer will still run to trigger writebacks. + */ +struct virtchnl_vector_map { + u16 vsi_id; + u16 vector_id; + u16 rxq_map; + u16 txq_map; + u16 rxitr_idx; + u16 txitr_idx; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_vector_map); + +struct virtchnl_irq_map_info { + u16 num_vectors; + struct virtchnl_vector_map vecmap[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(14, virtchnl_irq_map_info); + +/* VIRTCHNL_OP_ENABLE_QUEUES + * VIRTCHNL_OP_DISABLE_QUEUES + * VF sends these message to enable or disable TX/RX queue pairs. + * The queues fields are bitmaps indicating which queues to act upon. + * (Currently, we only support 16 queues per VF, but we make the field + * u32 to allow for expansion.) + * PF performs requested action and returns status. + * NOTE: The VF is not required to enable/disable all queues in a single + * request. It may send multiple messages. + * PF drivers must correctly handle all VF requests. + */ +struct virtchnl_queue_select { + u16 vsi_id; + u16 pad; + u32 rx_queues; + u32 tx_queues; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_queue_select); + +/* VIRTCHNL_OP_GET_MAX_RSS_QREGION + * + * if VIRTCHNL_VF_LARGE_NUM_QPAIRS was negotiated in VIRTCHNL_OP_GET_VF_RESOURCES + * then this op must be supported. + * + * VF sends this message in order to query the max RSS queue region + * size supported by PF, when VIRTCHNL_VF_LARGE_NUM_QPAIRS is enabled. + * This information should be used when configuring the RSS LUT and/or + * configuring queue region based filters. + * + * The maximum RSS queue region is 2^qregion_width. So, a qregion_width + * of 6 would inform the VF that the PF supports a maximum RSS queue region + * of 64. 
+ * + * A queue region represents a range of queues that can be used to configure + * a RSS LUT. For example, if a VF is given 64 queues, but only a max queue + * region size of 16 (i.e. 2^qregion_width = 16) then it will only be able + * to configure the RSS LUT with queue indices from 0 to 15. However, other + * filters can be used to direct packets to queues >15 via specifying a queue + * base/offset and queue region width. + */ +struct virtchnl_max_rss_qregion { + u16 vport_id; + u16 qregion_width; + u8 pad[4]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_max_rss_qregion); + +/* VIRTCHNL_OP_ADD_ETH_ADDR + * VF sends this message in order to add one or more unicast or multicast + * address filters for the specified VSI. + * PF adds the filters and returns status. + */ + +/* VIRTCHNL_OP_DEL_ETH_ADDR + * VF sends this message in order to remove one or more unicast or multicast + * filters for the specified VSI. + * PF removes the filters and returns status. + */ + +/* VIRTCHNL_ETHER_ADDR_LEGACY + * Prior to adding the @type member to virtchnl_ether_addr, there were 2 pad + * bytes. Moving forward all VF drivers should not set type to + * VIRTCHNL_ETHER_ADDR_LEGACY. This is only here to not break previous/legacy + * behavior. The control plane function (i.e. PF) can use a best effort method + * of tracking the primary/device unicast in this case, but there is no + * guarantee and functionality depends on the implementation of the PF. + */ + +/* VIRTCHNL_ETHER_ADDR_PRIMARY + * All VF drivers should set @type to VIRTCHNL_ETHER_ADDR_PRIMARY for the + * primary/device unicast MAC address filter for VIRTCHNL_OP_ADD_ETH_ADDR and + * VIRTCHNL_OP_DEL_ETH_ADDR. This allows for the underlying control plane + * function (i.e. PF) to accurately track and use this MAC address for + * displaying on the host and for VM/function reset. + */ + +/* VIRTCHNL_ETHER_ADDR_EXTRA + * All VF drivers should set @type to VIRTCHNL_ETHER_ADDR_EXTRA for any extra + * unicast and/or multicast filters that are being added/deleted via + * VIRTCHNL_OP_DEL_ETH_ADDR/VIRTCHNL_OP_ADD_ETH_ADDR respectively. + */ +struct virtchnl_ether_addr { + u8 addr[ETH_ALEN]; + u8 type; +#define VIRTCHNL_ETHER_ADDR_LEGACY 0 +#define VIRTCHNL_ETHER_ADDR_PRIMARY 1 +#define VIRTCHNL_ETHER_ADDR_EXTRA 2 +#define VIRTCHNL_ETHER_ADDR_TYPE_MASK 3 /* first two bits of type are valid */ + u8 pad; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_ether_addr); + +struct virtchnl_ether_addr_list { + u16 vsi_id; + u16 num_elements; + struct virtchnl_ether_addr list[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_ether_addr_list); + +/* VIRTCHNL_OP_ADD_VLAN + * VF sends this message to add one or more VLAN tag filters for receives. + * PF adds the filters and returns status. + * If a port VLAN is configured by the PF, this operation will return an + * error to the VF. + */ + +/* VIRTCHNL_OP_DEL_VLAN + * VF sends this message to remove one or more VLAN tag filters for receives. + * PF removes the filters and returns status. + * If a port VLAN is configured by the PF, this operation will return an + * error to the VF. + */ + +struct virtchnl_vlan_filter_list { + u16 vsi_id; + u16 num_elements; + u16 vlan_id[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_vlan_filter_list); + +/* This enum is used for all of the VIRTCHNL_VF_OFFLOAD_VLAN_V2_CAPS related + * structures and opcodes. + * + * VIRTCHNL_VLAN_UNSUPPORTED - This field is not supported and if a VF driver + * populates it the PF should return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED. 
+ * + * VIRTCHNL_VLAN_ETHERTYPE_8100 - This field supports 0x8100 ethertype. + * VIRTCHNL_VLAN_ETHERTYPE_88A8 - This field supports 0x88A8 ethertype. + * VIRTCHNL_VLAN_ETHERTYPE_9100 - This field supports 0x9100 ethertype. + * + * VIRTCHNL_VLAN_ETHERTYPE_AND - Used when multiple ethertypes can be supported + * by the PF concurrently. For example, if the PF can support + * VIRTCHNL_VLAN_ETHERTYPE_8100 AND VIRTCHNL_VLAN_ETHERTYPE_88A8 filters it + * would OR the following bits: + * + * VIRTHCNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * The VF would interpret this as VLAN filtering can be supported on both 0x8100 + * and 0x88A8 VLAN ethertypes. + * + * VIRTCHNL_ETHERTYPE_XOR - Used when only a single ethertype can be supported + * by the PF concurrently. For example if the PF can support + * VIRTCHNL_VLAN_ETHERTYPE_8100 XOR VIRTCHNL_VLAN_ETHERTYPE_88A8 stripping + * offload it would OR the following bits: + * + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * The VF would interpret this as VLAN stripping can be supported on either + * 0x8100 or 0x88a8 VLAN ethertypes. So when requesting VLAN stripping via + * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 the specified ethertype will override + * the previously set value. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 - Used to tell the VF to insert and/or + * strip the VLAN tag using the L2TAG1 field of the Tx/Rx descriptors. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to insert hardware + * offloaded VLAN tags using the L2TAG2 field of the Tx descriptor. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to strip hardware + * offloaded VLAN tags using the L2TAG2_2 field of the Rx descriptor. + * + * VIRTCHNL_VLAN_PRIO - This field supports VLAN priority bits. This is used for + * VLAN filtering if the underlying PF supports it. + * + * VIRTCHNL_VLAN_TOGGLE_ALLOWED - This field is used to say whether a + * certain VLAN capability can be toggled. For example if the underlying PF/CP + * allows the VF to toggle VLAN filtering, stripping, and/or insertion it should + * set this bit along with the supported ethertypes. + */ +enum virtchnl_vlan_support { + VIRTCHNL_VLAN_UNSUPPORTED = 0, + VIRTCHNL_VLAN_ETHERTYPE_8100 = 0x00000001, + VIRTCHNL_VLAN_ETHERTYPE_88A8 = 0x00000002, + VIRTCHNL_VLAN_ETHERTYPE_9100 = 0x00000004, + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 = 0x00000100, + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 = 0x00000200, + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 = 0x00000400, + VIRTCHNL_VLAN_PRIO = 0x01000000, + VIRTCHNL_VLAN_FILTER_MASK = 0x10000000, + VIRTCHNL_VLAN_ETHERTYPE_AND = 0x20000000, + VIRTCHNL_VLAN_ETHERTYPE_XOR = 0x40000000, + VIRTCHNL_VLAN_TOGGLE = 0x80000000 +}; + +/* This structure is used as part of the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS + * for filtering, insertion, and stripping capabilities. + * + * If only outer capabilities are supported (for filtering, insertion, and/or + * stripping) then this refers to the outer most or single VLAN from the VF's + * perspective. + * + * If only inner capabilities are supported (for filtering, insertion, and/or + * stripping) then this refers to the outer most or single VLAN from the VF's + * perspective. Functionally this is the same as if only outer capabilities are + * supported. The VF driver is just forced to use the inner fields when + * adding/deleting filters and enabling/disabling offloads (if supported). 
+ * + * If both outer and inner capabilities are supported (for filtering, insertion, + * and/or stripping) then outer refers to the outer most or single VLAN and + * inner refers to the second VLAN, if it exists, in the packet. + * + * There is no support for tunneled VLAN offloads, so outer or inner are never + * referring to a tunneled packet from the VF's perspective. + */ +struct virtchnl_vlan_supported_caps { + u32 outer; + u32 inner; +}; + +/* The PF populates these fields based on the supported VLAN filtering. If a + * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will + * reject any VIRTCHNL_OP_ADD_VLAN_V2 or VIRTCHNL_OP_DEL_VLAN_V2 messages using + * the unsupported fields. + * + * Also, a VF is only allowed to toggle its VLAN filtering setting if the + * VIRTCHNL_VLAN_TOGGLE bit is set. + * + * The ethertype(s) specified in the ethertype_init field are the ethertypes + * enabled for VLAN filtering. VLAN filtering in this case refers to the outer + * most VLAN from the VF's perspective. If both inner and outer filtering are + * allowed then ethertype_init only refers to the outer most VLAN as only + * VLAN ethertype supported for inner VLAN filtering is + * VIRTCHNL_VLAN_ETHERTYPE_8100. By default, inner VLAN filtering is disabled + * when both inner and outer filtering are allowed. + * + * The max_filters field tells the VF how many VLAN filters it's allowed to have + * at any one time. If it exceeds this amount and tries to add another filter, + * then the request will be rejected by the PF. To prevent failures, the VF + * should keep track of how many VLAN filters it has added and not attempt to + * add more than max_filters. + */ +struct virtchnl_vlan_filtering_caps { + struct virtchnl_vlan_supported_caps filtering_support; + u32 ethertype_init; + u16 max_filters; + u8 pad[2]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_filtering_caps); + +/* This enum is used for the virtchnl_vlan_offload_caps structure to specify + * if the PF supports a different ethertype for stripping and insertion. + * + * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION - The ethertype(s) specified + * for stripping affect the ethertype(s) specified for insertion and visa versa + * as well. If the VF tries to configure VLAN stripping via + * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 with VIRTCHNL_VLAN_ETHERTYPE_8100 then + * that will be the ethertype for both stripping and insertion. + * + * VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED - The ethertype(s) specified for + * stripping do not affect the ethertype(s) specified for insertion and visa + * versa. + */ +enum virtchnl_vlan_ethertype_match { + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION = 0, + VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED = 1, +}; + +/* The PF populates these fields based on the supported VLAN offloads. If a + * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will + * reject any VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 or + * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 messages using the unsupported fields. + * + * Also, a VF is only allowed to toggle its VLAN offload setting if the + * VIRTCHNL_VLAN_TOGGLE_ALLOWED bit is set. + * + * The VF driver needs to be aware of how the tags are stripped by hardware and + * inserted by the VF driver based on the level of offload support. The PF will + * populate these fields based on where the VLAN tags are expected to be + * offloaded via the VIRTHCNL_VLAN_TAG_LOCATION_* bits. The VF will need to + * interpret these fields. 
See the definition of the + * VIRTCHNL_VLAN_TAG_LOCATION_* bits above the virtchnl_vlan_support + * enumeration. + */ +struct virtchnl_vlan_offload_caps { + struct virtchnl_vlan_supported_caps stripping_support; + struct virtchnl_vlan_supported_caps insertion_support; + u32 ethertype_init; + u8 ethertype_match; + u8 pad[3]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_vlan_offload_caps); + +/* VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS + * VF sends this message to determine its VLAN capabilities. + * + * PF will mark which capabilities it supports based on hardware support and + * current configuration. For example, if a port VLAN is configured the PF will + * not allow outer VLAN filtering, stripping, or insertion to be configured so + * it will block these features from the VF. + * + * The VF will need to cross reference its capabilities with the PFs + * capabilities in the response message from the PF to determine the VLAN + * support. + */ +struct virtchnl_vlan_caps { + struct virtchnl_vlan_filtering_caps filtering; + struct virtchnl_vlan_offload_caps offloads; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_caps); + +struct virtchnl_vlan { + u16 tci; /* tci[15:13] = PCP and tci[11:0] = VID */ + u16 tci_mask; /* only valid if VIRTCHNL_VLAN_FILTER_MASK set in + * filtering caps + */ + u16 tpid; /* 0x8100, 0x88a8, etc. and only type(s) set in + * filtering caps. Note that tpid here does not refer to + * VIRTCHNL_VLAN_ETHERTYPE_*, but it refers to the + * actual 2-byte VLAN TPID + */ + u8 pad[2]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_vlan); + +struct virtchnl_vlan_filter { + struct virtchnl_vlan inner; + struct virtchnl_vlan outer; + u8 pad[16]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(32, virtchnl_vlan_filter); + +/* VIRTCHNL_OP_ADD_VLAN_V2 + * VIRTCHNL_OP_DEL_VLAN_V2 + * + * VF sends these messages to add/del one or more VLAN tag filters for Rx + * traffic. + * + * The PF attempts to add the filters and returns status. + * + * The VF should only ever attempt to add/del virtchnl_vlan_filter(s) using the + * supported fields negotiated via VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS. + */ +struct virtchnl_vlan_filter_list_v2 { + u16 vport_id; + u16 num_elements; + u8 pad[4]; + struct virtchnl_vlan_filter filters[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_filter_list_v2); + +/* VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 + * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 + * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 + * VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 + * + * VF sends this message to enable or disable VLAN stripping or insertion. It + * also needs to specify an ethertype. The VF knows which VLAN ethertypes are + * allowed and whether or not it's allowed to enable/disable the specific + * offload via the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS message. The VF needs to + * parse the virtchnl_vlan_caps.offloads fields to determine which offload + * messages are allowed. + * + * For example, if the PF populates the virtchnl_vlan_caps.offloads in the + * following manner the VF will be allowed to enable and/or disable 0x8100 inner + * VLAN insertion and/or stripping via the opcodes listed above. Inner in this + * case means the outer most or single VLAN from the VF's perspective. This is + * because no outer offloads are supported. See the comments above the + * virtchnl_vlan_supported_caps structure for more details. 
+ * + * virtchnl_vlan_caps.offloads.stripping_support.inner = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * virtchnl_vlan_caps.offloads.insertion_support.inner = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * In order to enable inner (again note that in this case inner is the outer + * most or single VLAN from the VF's perspective) VLAN stripping for 0x8100 + * VLANs, the VF would populate the virtchnl_vlan_setting structure in the + * following manner and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message. + * + * virtchnl_vlan_setting.inner_ethertype_setting = + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * The reason that VLAN TPID(s) are not being used for the + * outer_ethertype_setting and inner_ethertype_setting fields is because it's + * possible a device could support VLAN insertion and/or stripping offload on + * multiple ethertypes concurrently, so this method allows a VF to request + * multiple ethertypes in one message using the virtchnl_vlan_support + * enumeration. + * + * For example, if the PF populates the virtchnl_vlan_caps.offloads in the + * following manner the VF will be allowed to enable 0x8100 and 0x88a8 outer + * VLAN insertion and stripping simultaneously. The + * virtchnl_vlan_caps.offloads.ethertype_match field will also have to be + * populated based on what the PF can support. + * + * virtchnl_vlan_caps.offloads.stripping_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * virtchnl_vlan_caps.offloads.insertion_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * In order to enable outer VLAN stripping for 0x8100 and 0x88a8 VLANs, the VF + * would populate the virthcnl_vlan_offload_structure in the following manner + * and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message. + * + * virtchnl_vlan_setting.outer_ethertype_setting = + * VIRTHCNL_VLAN_ETHERTYPE_8100 | + * VIRTHCNL_VLAN_ETHERTYPE_88A8; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * There is also the case where a PF and the underlying hardware can support + * VLAN offloads on multiple ethertypes, but not concurrently. For example, if + * the PF populates the virtchnl_vlan_caps.offloads in the following manner the + * VF will be allowed to enable and/or disable 0x8100 XOR 0x88a8 outer VLAN + * offloads. The ethertypes must match for stripping and insertion. + * + * virtchnl_vlan_caps.offloads.stripping_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * virtchnl_vlan_caps.offloads.insertion_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * virtchnl_vlan_caps.offloads.ethertype_match = + * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; + * + * In order to enable outer VLAN stripping for 0x88a8 VLANs, the VF would + * populate the virtchnl_vlan_setting structure in the following manner and send + * the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2. Also, this will change the + * ethertype for VLAN insertion if it's enabled. 
So, for completeness, a + * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 with the same ethertype should be sent. + * + * virtchnl_vlan_setting.outer_ethertype_setting = VIRTHCNL_VLAN_ETHERTYPE_88A8; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2 + * VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2 + * + * VF sends this message to enable or disable VLAN filtering. It also needs to + * specify an ethertype. The VF knows which VLAN ethertypes are allowed and + * whether or not it's allowed to enable/disable filtering via the + * VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS message. The VF needs to + * parse the virtchnl_vlan_caps.filtering fields to determine which, if any, + * filtering messages are allowed. + * + * For example, if the PF populates the virtchnl_vlan_caps.filtering in the + * following manner the VF will be allowed to enable/disable 0x8100 and 0x88a8 + * outer VLAN filtering together. Note, that the VIRTCHNL_VLAN_ETHERTYPE_AND + * means that all filtering ethertypes will to be enabled and disabled together + * regardless of the request from the VF. This means that the underlying + * hardware only supports VLAN filtering for all VLAN the specified ethertypes + * or none of them. + * + * virtchnl_vlan_caps.filtering.filtering_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTHCNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_9100 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * In order to enable outer VLAN filtering for 0x88a8 and 0x8100 VLANs (0x9100 + * VLANs aren't supported by the VF driver), the VF would populate the + * virtchnl_vlan_setting structure in the following manner and send the + * VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2. The same message format would be used + * to disable outer VLAN filtering for 0x88a8 and 0x8100 VLANs, but the + * VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2 opcode is used. + * + * virtchnl_vlan_setting.outer_ethertype_setting = + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8; + * + */ +struct virtchnl_vlan_setting { + u32 outer_ethertype_setting; + u32 inner_ethertype_setting; + u16 vport_id; + u8 pad[6]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_setting); + +/* VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE + * VF sends VSI id and flags. + * PF returns status code in retval. + * Note: we assume that broadcast accept mode is always enabled. + */ +struct virtchnl_promisc_info { + u16 vsi_id; + u16 flags; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(4, virtchnl_promisc_info); + +#define FLAG_VF_UNICAST_PROMISC 0x00000001 +#define FLAG_VF_MULTICAST_PROMISC 0x00000002 + +/* VIRTCHNL_OP_GET_STATS + * VF sends this message to request stats for the selected VSI. VF uses + * the virtchnl_queue_select struct to specify the VSI. The queue_id + * field is ignored by the PF. + * + * PF replies with struct virtchnl_eth_stats in an external buffer. + */ + +struct virtchnl_eth_stats { + u64 rx_bytes; /* received bytes */ + u64 rx_unicast; /* received unicast pkts */ + u64 rx_multicast; /* received multicast pkts */ + u64 rx_broadcast; /* received broadcast pkts */ + u64 rx_discards; + u64 rx_unknown_protocol; + u64 tx_bytes; /* transmitted bytes */ + u64 tx_unicast; /* transmitted unicast pkts */ + u64 tx_multicast; /* transmitted multicast pkts */ + u64 tx_broadcast; /* transmitted broadcast pkts */ + u64 tx_discards; + u64 tx_errors; +}; + +/* VIRTCHNL_OP_CONFIG_RSS_KEY + * VIRTCHNL_OP_CONFIG_RSS_LUT + * VF sends these messages to configure RSS. 
Only supported if both PF + * and VF drivers set the VIRTCHNL_VF_OFFLOAD_RSS_PF bit during + * configuration negotiation. If this is the case, then the RSS fields in + * the VF resource struct are valid. + * Both the key and LUT are initialized to 0 by the PF, meaning that + * RSS is effectively disabled until set up by the VF. + */ +struct virtchnl_rss_key { + u16 vsi_id; + u16 key_len; + u8 key[1]; /* RSS hash key, packed bytes */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_key); + +struct virtchnl_rss_lut { + u16 vsi_id; + u16 lut_entries; + u8 lut[1]; /* RSS lookup table */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_lut); + +/* VIRTCHNL_OP_GET_RSS_HENA_CAPS + * VIRTCHNL_OP_SET_RSS_HENA + * VF sends these messages to get and set the hash filter enable bits for RSS. + * By default, the PF sets these to all possible traffic types that the + * hardware supports. The VF can query this value if it wants to change the + * traffic types that are hashed by the hardware. + */ +struct virtchnl_rss_hena { + u64 hena; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena); + +/* Type of RSS algorithm */ +enum virtchnl_rss_algorithm { + VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, + VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, + VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, + VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, +}; + +/* This is used by PF driver to enforce how many channels can be supported. + * When ADQ_V2 capability is negotiated, it will allow 16 channels otherwise + * PF driver will allow only max 4 channels + */ +#define VIRTCHNL_MAX_ADQ_CHANNELS 4 +#define VIRTCHNL_MAX_ADQ_V2_CHANNELS 16 + +/* VIRTCHNL_OP_ENABLE_CHANNELS + * VIRTCHNL_OP_DISABLE_CHANNELS + * VF sends these messages to enable or disable channels based on + * the user specified queue count and queue offset for each traffic class. + * This struct encompasses all the information that the PF needs from + * VF to create a channel. + */ +struct virtchnl_channel_info { + u16 count; /* number of queues in a channel */ + u16 offset; /* queues in a channel start from 'offset' */ + u32 pad; + u64 max_tx_rate; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_channel_info); + +struct virtchnl_tc_info { + u32 num_tc; + u32 pad; + struct virtchnl_channel_info list[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_tc_info); + +/* VIRTCHNL_ADD_CLOUD_FILTER + * VIRTCHNL_DEL_CLOUD_FILTER + * VF sends these messages to add or delete a cloud filter based on the + * user specified match and action filters. These structures encompass + * all the information that the PF needs from the VF to add/delete a + * cloud filter. 
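+ *
+ * As a hedged illustration (the port and traffic class values are
+ * arbitrary), steering TCP/IPv4 flows with destination port 5001 to traffic
+ * class 1 would roughly mean filling a virtchnl_filter with
+ * flow_type = VIRTCHNL_TCP_V4_FLOW, action = VIRTCHNL_ACTION_TC_REDIRECT,
+ * action_meta = 1, data.tcp_spec.dst_port set to the port in network byte
+ * order, and the matching bits set in mask.tcp_spec.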
+ */ + +struct virtchnl_l4_spec { + u8 src_mac[ETH_ALEN]; + u8 dst_mac[ETH_ALEN]; + /* vlan_prio is part of this 16 bit field even from OS perspective + * vlan_id:12 is actual vlan_id, then vlanid:bit14..12 is vlan_prio + * in future, when decided to offload vlan_prio, pass that information + * as part of the "vlan_id" field, Bit14..12 + */ + __be16 vlan_id; + __be16 pad; /* reserved for future use */ + __be32 src_ip[4]; + __be32 dst_ip[4]; + __be16 src_port; + __be16 dst_port; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(52, virtchnl_l4_spec); + +union virtchnl_flow_spec { + struct virtchnl_l4_spec tcp_spec; + u8 buffer[128]; /* reserved for future use */ +}; + +VIRTCHNL_CHECK_UNION_LEN(128, virtchnl_flow_spec); + +enum virtchnl_action { + /* action types */ + VIRTCHNL_ACTION_DROP = 0, + VIRTCHNL_ACTION_TC_REDIRECT, + VIRTCHNL_ACTION_PASSTHRU, + VIRTCHNL_ACTION_QUEUE, + VIRTCHNL_ACTION_Q_REGION, + VIRTCHNL_ACTION_MARK, + VIRTCHNL_ACTION_COUNT, +}; + +enum virtchnl_flow_type { + /* flow types */ + VIRTCHNL_TCP_V4_FLOW = 0, + VIRTCHNL_TCP_V6_FLOW, + VIRTCHNL_UDP_V4_FLOW, + VIRTCHNL_UDP_V6_FLOW, +}; + +struct virtchnl_filter { + union virtchnl_flow_spec data; + union virtchnl_flow_spec mask; + + /* see enum virtchnl_flow_type */ + s32 flow_type; + + /* see enum virtchnl_action */ + s32 action; + u32 action_meta; + u8 field_flags; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); + +/* VIRTCHNL_OP_DCF_GET_VSI_MAP + * VF sends this message to get VSI mapping table. + * PF responds with an indirect message containing VF's + * HW VSI IDs. + * The index of vf_vsi array is the logical VF ID, the + * value of vf_vsi array is the VF's HW VSI ID with its + * valid configuration. + */ +struct virtchnl_dcf_vsi_map { + u16 pf_vsi; /* PF's HW VSI ID */ + u16 num_vfs; /* The actual number of VFs allocated */ +#define VIRTCHNL_DCF_VF_VSI_ID_S 0 +#define VIRTCHNL_DCF_VF_VSI_ID_M (0xFFF << VIRTCHNL_DCF_VF_VSI_ID_S) +#define VIRTCHNL_DCF_VF_VSI_VALID BIT(15) + u16 vf_vsi[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_dcf_vsi_map); + +#define PKG_NAME_SIZE 32 +#define DSN_SIZE 8 + +struct pkg_version { + u8 major; + u8 minor; + u8 update; + u8 draft; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(4, pkg_version); + +struct virtchnl_pkg_info { + struct pkg_version pkg_ver; + u32 track_id; + char pkg_name[PKG_NAME_SIZE]; + u8 dsn[DSN_SIZE]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(48, virtchnl_pkg_info); + +/* VIRTCHNL_OP_DCF_VLAN_OFFLOAD + * DCF negotiates the VIRTCHNL_VF_OFFLOAD_VLAN_V2 capability firstly to get + * the double VLAN configuration, then DCF sends this message to configure the + * outer or inner VLAN offloads (insertion and strip) for the target VF. 
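+ *
+ * As a minimal sketch (the VF id, TPID and the dcf_vlan variable are
+ * hypothetical), enabling outer VLAN stripping into the Rx descriptor
+ * for VF 2 with TPID 0x88a8 could be encoded with the fields and flag
+ * macros defined below:
+ *
+ * dcf_vlan.vf_id = 2;
+ * dcf_vlan.tpid = 0x88a8;
+ * dcf_vlan.vlan_flags =
+ *	(VIRTCHNL_DCF_VLAN_TYPE_OUTER << VIRTCHNL_DCF_VLAN_TYPE_S) |
+ *	(VIRTCHNL_DCF_VLAN_STRIP_INTO_RX_DESC << VIRTCHNL_DCF_VLAN_STRIP_MODE_S);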
+ */ +struct virtchnl_dcf_vlan_offload { + u16 vf_id; + u16 tpid; + u16 vlan_flags; +#define VIRTCHNL_DCF_VLAN_TYPE_S 0 +#define VIRTCHNL_DCF_VLAN_TYPE_M \ + (0x1 << VIRTCHNL_DCF_VLAN_TYPE_S) +#define VIRTCHNL_DCF_VLAN_TYPE_INNER 0x0 +#define VIRTCHNL_DCF_VLAN_TYPE_OUTER 0x1 +#define VIRTCHNL_DCF_VLAN_INSERT_MODE_S 1 +#define VIRTCHNL_DCF_VLAN_INSERT_MODE_M \ + (0x7 << VIRTCHNL_DCF_VLAN_INSERT_MODE_S) +#define VIRTCHNL_DCF_VLAN_INSERT_DISABLE 0x1 +#define VIRTCHNL_DCF_VLAN_INSERT_PORT_BASED 0x2 +#define VIRTCHNL_DCF_VLAN_INSERT_VIA_TX_DESC 0x3 +#define VIRTCHNL_DCF_VLAN_STRIP_MODE_S 4 +#define VIRTCHNL_DCF_VLAN_STRIP_MODE_M \ + (0x7 << VIRTCHNL_DCF_VLAN_STRIP_MODE_S) +#define VIRTCHNL_DCF_VLAN_STRIP_DISABLE 0x1 +#define VIRTCHNL_DCF_VLAN_STRIP_ONLY 0x2 +#define VIRTCHNL_DCF_VLAN_STRIP_INTO_RX_DESC 0x3 + u16 vlan_id; + u16 pad[4]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_dcf_vlan_offload); + +struct virtchnl_supported_rxdids { + /* see enum virtchnl_rx_desc_id_bitmasks */ + u64 supported_rxdids; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_supported_rxdids); + +/* VIRTCHNL_OP_EVENT + * PF sends this message to inform the VF driver of events that may affect it. + * No direct response is expected from the VF, though it may generate other + * messages in response to this one. + */ +enum virtchnl_event_codes { + VIRTCHNL_EVENT_UNKNOWN = 0, + VIRTCHNL_EVENT_LINK_CHANGE, + VIRTCHNL_EVENT_RESET_IMPENDING, + VIRTCHNL_EVENT_PF_DRIVER_CLOSE, + VIRTCHNL_EVENT_DCF_VSI_MAP_UPDATE, + VIRTCHNL_EVENT_DCF_VSI_INFO = 1000, +}; + +#define PF_EVENT_SEVERITY_INFO 0 +#define PF_EVENT_SEVERITY_CERTAIN_DOOM 255 + +struct virtchnl_pf_event { + /* see enum virtchnl_event_codes */ + s32 event; + union { + /* If the PF driver does not support the new speed reporting + * capabilities then use link_event else use link_event_adv to + * get the speed and link information. The ability to understand + * new speeds is indicated by setting the capability flag + * VIRTCHNL_VF_CAP_ADV_LINK_SPEED in vf_cap_flags parameter + * in virtchnl_vf_resource struct and can be used to determine + * which link event struct to use below. + */ + struct { + enum virtchnl_link_speed link_speed; + bool link_status; + u8 pad[3]; + } link_event; + struct { + /* link_speed provided in Mbps */ + u32 link_speed; + u8 link_status; + u8 pad[3]; + } link_event_adv; + struct { + /* link_speed provided in Mbps */ + u32 link_speed; + u16 vport_id; + u8 link_status; + u8 pad; + } link_event_adv_vport; + struct { + u16 vf_id; + u16 vsi_id; + u32 pad; + } vf_vsi_map; + } event_data; + + s32 severity; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_pf_event); + +/* used to specify if a ceq_idx or aeq_idx is invalid */ +#define VIRTCHNL_RDMA_INVALID_QUEUE_IDX 0xFFFF +/* VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP + * VF uses this message to request PF to map RDMA vectors to RDMA queues. + * The request for this originates from the VF RDMA driver through + * a client interface between VF LAN and VF RDMA driver. + * A vector could have an AEQ and CEQ attached to it although + * there is a single AEQ per VF RDMA instance in which case + * most vectors will have an VIRTCHNL_RDMA_INVALID_QUEUE_IDX for aeq and valid + * idx for ceqs There will never be a case where there will be multiple CEQs + * attached to a single vector. + * PF configures interrupt mapping and returns status. 
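+ *
+ * For instance (the queue and vector indices are hypothetical), a VF
+ * with two vectors might attach the single AEQ plus CEQ 0 to vector 0
+ * and only CEQ 1 to vector 1, using a virtchnl_rdma_qvlist_info with
+ * num_vectors = 2 and qv_info entries as defined below:
+ *
+ * qv_info[0].v_idx = 0; qv_info[0].aeq_idx = 0; qv_info[0].ceq_idx = 0;
+ * qv_info[1].v_idx = 1; qv_info[1].ceq_idx = 1;
+ * qv_info[1].aeq_idx = VIRTCHNL_RDMA_INVALID_QUEUE_IDX;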
+ */ +#define virtchnl_iwarp_qv_info virtchnl_rdma_qv_info +struct virtchnl_rdma_qv_info { + u32 v_idx; /* msix_vector */ + u16 ceq_idx; /* set to VIRTCHNL_RDMA_INVALID_QUEUE_IDX if invalid */ + u16 aeq_idx; /* set to VIRTCHNL_RDMA_INVALID_QUEUE_IDX if invalid */ + u8 itr_idx; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_rdma_qv_info); + +#define virtchnl_iwarp_qvlist_info virtchnl_rdma_qvlist_info +struct virtchnl_rdma_qvlist_info { + u32 num_vectors; + struct virtchnl_rdma_qv_info qv_info[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_rdma_qvlist_info); + + +/* VF reset states - these are written into the RSTAT register: + * VFGEN_RSTAT on the VF + * When the PF initiates a reset, it writes 0 + * When the reset is complete, it writes 1 + * When the PF detects that the VF has recovered, it writes 2 + * VF checks this register periodically to determine if a reset has occurred, + * then polls it to know when the reset is complete. + * If either the PF or VF reads the register while the hardware + * is in a reset state, it will return DEADBEEF, which, when masked + * will result in 3. + */ +enum virtchnl_vfr_states { + VIRTCHNL_VFR_INPROGRESS = 0, + VIRTCHNL_VFR_COMPLETED, + VIRTCHNL_VFR_VFACTIVE, +}; + +#define VIRTCHNL_MAX_NUM_PROTO_HDRS 32 +#define PROTO_HDR_SHIFT 5 +#define PROTO_HDR_FIELD_START(proto_hdr_type) \ + (proto_hdr_type << PROTO_HDR_SHIFT) +#define PROTO_HDR_FIELD_MASK ((1UL << PROTO_HDR_SHIFT) - 1) + +/* VF use these macros to configure each protocol header. + * Specify which protocol headers and protocol header fields base on + * virtchnl_proto_hdr_type and virtchnl_proto_hdr_field. + * @param hdr: a struct of virtchnl_proto_hdr + * @param hdr_type: ETH/IPV4/TCP, etc + * @param field: SRC/DST/TEID/SPI, etc + */ +#define VIRTCHNL_ADD_PROTO_HDR_FIELD(hdr, field) \ + ((hdr)->field_selector |= BIT((field) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_DEL_PROTO_HDR_FIELD(hdr, field) \ + ((hdr)->field_selector &= ~BIT((field) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_TEST_PROTO_HDR_FIELD(hdr, val) \ + ((hdr)->field_selector & BIT((val) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_GET_PROTO_HDR_FIELD(hdr) ((hdr)->field_selector) + +#define VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(hdr, hdr_type, field) \ + (VIRTCHNL_ADD_PROTO_HDR_FIELD(hdr, \ + VIRTCHNL_PROTO_HDR_ ## hdr_type ## _ ## field)) +#define VIRTCHNL_DEL_PROTO_HDR_FIELD_BIT(hdr, hdr_type, field) \ + (VIRTCHNL_DEL_PROTO_HDR_FIELD(hdr, \ + VIRTCHNL_PROTO_HDR_ ## hdr_type ## _ ## field)) + +#define VIRTCHNL_SET_PROTO_HDR_TYPE(hdr, hdr_type) \ + ((hdr)->type = VIRTCHNL_PROTO_HDR_ ## hdr_type) +#define VIRTCHNL_GET_PROTO_HDR_TYPE(hdr) \ + (((hdr)->type) >> PROTO_HDR_SHIFT) +#define VIRTCHNL_TEST_PROTO_HDR_TYPE(hdr, val) \ + ((hdr)->type == ((s32)((val) >> PROTO_HDR_SHIFT))) +#define VIRTCHNL_TEST_PROTO_HDR(hdr, val) \ + (VIRTCHNL_TEST_PROTO_HDR_TYPE(hdr, val) && \ + VIRTCHNL_TEST_PROTO_HDR_FIELD(hdr, val)) + +/* Protocol header type within a packet segment. A segment consists of one or + * more protocol headers that make up a logical group of protocol headers. Each + * logical group of protocol headers encapsulates or is encapsulated using/by + * tunneling or encapsulation protocols for network virtualization. 
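+ *
+ * As a sketch of how the helper macros above combine with these header
+ * types (the proto_hdrs variable is hypothetical; struct
+ * virtchnl_proto_hdrs is defined further below), hashing on the IPv4
+ * source and destination addresses of a plain IPv4/UDP packet could be
+ * described as:
+ *
+ * proto_hdrs.tunnel_level = 0; (start from the outer layer)
+ * proto_hdrs.count = 2;
+ * VIRTCHNL_SET_PROTO_HDR_TYPE(&proto_hdrs.proto_hdr[0], IPV4);
+ * VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(&proto_hdrs.proto_hdr[0], IPV4, SRC);
+ * VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(&proto_hdrs.proto_hdr[0], IPV4, DST);
+ * VIRTCHNL_SET_PROTO_HDR_TYPE(&proto_hdrs.proto_hdr[1], UDP);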
+ */ +enum virtchnl_proto_hdr_type { + VIRTCHNL_PROTO_HDR_NONE, + VIRTCHNL_PROTO_HDR_ETH, + VIRTCHNL_PROTO_HDR_S_VLAN, + VIRTCHNL_PROTO_HDR_C_VLAN, + VIRTCHNL_PROTO_HDR_IPV4, + VIRTCHNL_PROTO_HDR_IPV6, + VIRTCHNL_PROTO_HDR_TCP, + VIRTCHNL_PROTO_HDR_UDP, + VIRTCHNL_PROTO_HDR_SCTP, + VIRTCHNL_PROTO_HDR_GTPU_IP, + VIRTCHNL_PROTO_HDR_GTPU_EH, + VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN, + VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP, + VIRTCHNL_PROTO_HDR_PPPOE, + VIRTCHNL_PROTO_HDR_L2TPV3, + VIRTCHNL_PROTO_HDR_ESP, + VIRTCHNL_PROTO_HDR_AH, + VIRTCHNL_PROTO_HDR_PFCP, + VIRTCHNL_PROTO_HDR_GTPC, + VIRTCHNL_PROTO_HDR_ECPRI, + VIRTCHNL_PROTO_HDR_L2TPV2, + VIRTCHNL_PROTO_HDR_PPP, + /* IPv4 and IPv6 Fragment header types are only associated to + * VIRTCHNL_PROTO_HDR_IPV4 and VIRTCHNL_PROTO_HDR_IPV6 respectively, + * cannot be used independently. + */ + VIRTCHNL_PROTO_HDR_IPV4_FRAG, + VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG, +}; + +/* Protocol header field within a protocol header. */ +enum virtchnl_proto_hdr_field { + /* ETHER */ + VIRTCHNL_PROTO_HDR_ETH_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ETH), + VIRTCHNL_PROTO_HDR_ETH_DST, + VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE, + /* S-VLAN */ + VIRTCHNL_PROTO_HDR_S_VLAN_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_S_VLAN), + /* C-VLAN */ + VIRTCHNL_PROTO_HDR_C_VLAN_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_C_VLAN), + /* IPV4 */ + VIRTCHNL_PROTO_HDR_IPV4_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4), + VIRTCHNL_PROTO_HDR_IPV4_DST, + VIRTCHNL_PROTO_HDR_IPV4_DSCP, + VIRTCHNL_PROTO_HDR_IPV4_TTL, + VIRTCHNL_PROTO_HDR_IPV4_PROT, + /* IPV6 */ + VIRTCHNL_PROTO_HDR_IPV6_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6), + VIRTCHNL_PROTO_HDR_IPV6_DST, + VIRTCHNL_PROTO_HDR_IPV6_TC, + VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, + VIRTCHNL_PROTO_HDR_IPV6_PROT, + /* IPV6 Prefix */ + VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_DST, + /* TCP */ + VIRTCHNL_PROTO_HDR_TCP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_TCP), + VIRTCHNL_PROTO_HDR_TCP_DST_PORT, + /* UDP */ + VIRTCHNL_PROTO_HDR_UDP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_UDP), + VIRTCHNL_PROTO_HDR_UDP_DST_PORT, + /* SCTP */ + VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_SCTP), + VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, + /* GTPU_IP */ + VIRTCHNL_PROTO_HDR_GTPU_IP_TEID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_IP), + /* GTPU_EH */ + VIRTCHNL_PROTO_HDR_GTPU_EH_PDU = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH), + VIRTCHNL_PROTO_HDR_GTPU_EH_QFI, + /* PPPOE */ + VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PPPOE), + /* L2TPV3 */ + VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_L2TPV3), + /* ESP */ + VIRTCHNL_PROTO_HDR_ESP_SPI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ESP), + /* AH */ + VIRTCHNL_PROTO_HDR_AH_SPI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_AH), + /* PFCP */ + VIRTCHNL_PROTO_HDR_PFCP_S_FIELD = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PFCP), + VIRTCHNL_PROTO_HDR_PFCP_SEID, + /* GTPC */ + VIRTCHNL_PROTO_HDR_GTPC_TEID = + 
PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPC), + /* ECPRI */ + VIRTCHNL_PROTO_HDR_ECPRI_MSG_TYPE = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ECPRI), + VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID, + /* IPv4 Dummy Fragment */ + VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4_FRAG), + /* IPv6 Extension Fragment */ + VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG_PKID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG), +}; + +struct virtchnl_proto_hdr { + /* see enum virtchnl_proto_hdr_type */ + s32 type; + u32 field_selector; /* a bit mask to select field for header type */ + u8 buffer[64]; + /** + * binary buffer in network order for specific header type. + * For example, if type = VIRTCHNL_PROTO_HDR_IPV4, a IPv4 + * header is expected to be copied into the buffer. + */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_proto_hdr); + +struct virtchnl_proto_hdrs { + u8 tunnel_level; + /** + * specify where protocol header start from. + * 0 - from the outer layer + * 1 - from the first inner layer + * 2 - from the second inner layer + * .... + **/ + int count; /* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */ + struct virtchnl_proto_hdr proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs); + +struct virtchnl_rss_cfg { + struct virtchnl_proto_hdrs proto_hdrs; /* protocol headers */ + + /* see enum virtchnl_rss_algorithm; rss algorithm type */ + s32 rss_algorithm; + u8 reserved[128]; /* reserve for future */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2444, virtchnl_rss_cfg); + +/* action configuration for FDIR */ +struct virtchnl_filter_action { + /* see enum virtchnl_action type */ + s32 type; + union { + /* used for queue and qgroup action */ + struct { + u16 index; + u8 region; + } queue; + /* used for count action */ + struct { + /* share counter ID with other flow rules */ + u8 shared; + u32 id; /* counter ID */ + } count; + /* used for mark action */ + u32 mark_id; + u8 reserve[32]; + } act_conf; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(36, virtchnl_filter_action); + +#define VIRTCHNL_MAX_NUM_ACTIONS 8 + +struct virtchnl_filter_action_set { + /* action number must be less then VIRTCHNL_MAX_NUM_ACTIONS */ + int count; + struct virtchnl_filter_action actions[VIRTCHNL_MAX_NUM_ACTIONS]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(292, virtchnl_filter_action_set); + +/* pattern and action for FDIR rule */ +struct virtchnl_fdir_rule { + struct virtchnl_proto_hdrs proto_hdrs; + struct virtchnl_filter_action_set action_set; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2604, virtchnl_fdir_rule); + +/* Status returned to VF after VF requests FDIR commands + * VIRTCHNL_FDIR_SUCCESS + * VF FDIR related request is successfully done by PF + * The request can be OP_ADD/DEL/QUERY_FDIR_FILTER. + * + * VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE + * OP_ADD_FDIR_FILTER request is failed due to no Hardware resource. + * + * VIRTCHNL_FDIR_FAILURE_RULE_EXIST + * OP_ADD_FDIR_FILTER request is failed due to the rule is already existed. + * + * VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT + * OP_ADD_FDIR_FILTER request is failed due to conflict with existing rule. + * + * VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST + * OP_DEL_FDIR_FILTER request is failed due to this rule doesn't exist. + * + * VIRTCHNL_FDIR_FAILURE_RULE_INVALID + * OP_ADD_FDIR_FILTER request is failed due to parameters validation + * or HW doesn't support. + * + * VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT + * OP_ADD/DEL_FDIR_FILTER request is failed due to timing out + * for programming. 
+ * + * VIRTCHNL_FDIR_FAILURE_QUERY_INVALID + * OP_QUERY_FDIR_FILTER request is failed due to parameters validation, + * for example, VF query counter of a rule who has no counter action. + */ +enum virtchnl_fdir_prgm_status { + VIRTCHNL_FDIR_SUCCESS = 0, + VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE, + VIRTCHNL_FDIR_FAILURE_RULE_EXIST, + VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT, + VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST, + VIRTCHNL_FDIR_FAILURE_RULE_INVALID, + VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT, + VIRTCHNL_FDIR_FAILURE_QUERY_INVALID, +}; + +/* VIRTCHNL_OP_ADD_FDIR_FILTER + * VF sends this request to PF by filling out vsi_id, + * validate_only and rule_cfg. PF will return flow_id + * if the request is successfully done and return add_status to VF. + */ +struct virtchnl_fdir_add { + u16 vsi_id; /* INPUT */ + /* + * 1 for validating a fdir rule, 0 for creating a fdir rule. + * Validate and create share one ops: VIRTCHNL_OP_ADD_FDIR_FILTER. + */ + u16 validate_only; /* INPUT */ + u32 flow_id; /* OUTPUT */ + struct virtchnl_fdir_rule rule_cfg; /* INPUT */ + + /* see enum virtchnl_fdir_prgm_status; OUTPUT */ + s32 status; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2616, virtchnl_fdir_add); + +/* VIRTCHNL_OP_DEL_FDIR_FILTER + * VF sends this request to PF by filling out vsi_id + * and flow_id. PF will return del_status to VF. + */ +struct virtchnl_fdir_del { + u16 vsi_id; /* INPUT */ + u16 pad; + u32 flow_id; /* INPUT */ + + /* see enum virtchnl_fdir_prgm_status; OUTPUT */ + s32 status; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del); + +/* TX and RX queue types are valid in legacy as well as split queue models. + * With Split Queue model, 2 additional types are introduced - TX_COMPLETION + * and RX_BUFFER. In split queue model, RX corresponds to the queue where HW + * posts completions. + */ +enum virtchnl_queue_type { + VIRTCHNL_QUEUE_TYPE_TX = 0, + VIRTCHNL_QUEUE_TYPE_RX = 1, + VIRTCHNL_QUEUE_TYPE_TX_COMPLETION = 2, + VIRTCHNL_QUEUE_TYPE_RX_BUFFER = 3, + VIRTCHNL_QUEUE_TYPE_CONFIG_TX = 4, + VIRTCHNL_QUEUE_TYPE_CONFIG_RX = 5 +}; + + +/* structure to specify a chunk of contiguous queues */ +struct virtchnl_queue_chunk { + /* see enum virtchnl_queue_type */ + s32 type; + u16 start_queue_id; + u16 num_queues; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_queue_chunk); + +/* structure to specify several chunks of contiguous queues */ +struct virtchnl_queue_chunks { + u16 num_chunks; + u16 rsvd; + struct virtchnl_queue_chunk chunks[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_queue_chunks); + + +/* VIRTCHNL_OP_ENABLE_QUEUES_V2 + * VIRTCHNL_OP_DISABLE_QUEUES_V2 + * VIRTCHNL_OP_DEL_QUEUES + * + * If VIRTCHNL version was negotiated in VIRTCHNL_OP_VERSION as 2.0 + * then all of these ops are available. + * + * If VIRTCHNL_VF_LARGE_NUM_QPAIRS was negotiated in VIRTCHNL_OP_GET_VF_RESOURCES + * then VIRTCHNL_OP_ENABLE_QUEUES_V2 and VIRTCHNL_OP_DISABLE_QUEUES_V2 are + * available. + * + * PF sends these messages to enable, disable or delete queues specified in + * chunks. PF sends virtchnl_del_ena_dis_queues struct to specify the queues + * to be enabled/disabled/deleted. Also applicable to single queue RX or + * TX. CP performs requested action and returns status. 
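+ *
+ * A minimal sketch (the vport id, queue ids and the qs variable are
+ * hypothetical, and the message buffer must be allocated large enough
+ * for the extra chunk): to enable Tx queues 0-3 and Rx queues 0-3 in
+ * one message, the virtchnl_del_ena_dis_queues defined below would
+ * carry two chunks:
+ *
+ * qs.vport_id = vport_id;
+ * qs.chunks.num_chunks = 2;
+ * qs.chunks.chunks[0].type = VIRTCHNL_QUEUE_TYPE_TX;
+ * qs.chunks.chunks[0].start_queue_id = 0;
+ * qs.chunks.chunks[0].num_queues = 4;
+ * qs.chunks.chunks[1].type = VIRTCHNL_QUEUE_TYPE_RX;
+ * qs.chunks.chunks[1].start_queue_id = 0;
+ * qs.chunks.chunks[1].num_queues = 4;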
+ */ +struct virtchnl_del_ena_dis_queues { + u16 vport_id; + u16 pad; + struct virtchnl_queue_chunks chunks; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_del_ena_dis_queues); + +/* Virtchannel interrupt throttling rate index */ +enum virtchnl_itr_idx { + VIRTCHNL_ITR_IDX_0 = 0, + VIRTCHNL_ITR_IDX_1 = 1, + VIRTCHNL_ITR_IDX_NO_ITR = 3, +}; + +/* Queue to vector mapping */ +struct virtchnl_queue_vector { + u16 queue_id; + u16 vector_id; + u8 pad[4]; + + /* see enum virtchnl_itr_idx */ + s32 itr_idx; + + /* see enum virtchnl_queue_type */ + s32 queue_type; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_queue_vector); + +/* VIRTCHNL_OP_MAP_QUEUE_VECTOR + * + * If VIRTCHNL_VF_LARGE_NUM_QPAIRS was negotiated in VIRTCHNL_OP_GET_VF_RESOURCES + * then only VIRTCHNL_OP_MAP_QUEUE_VECTOR is available. + * + * PF sends this message to map or unmap queues to vectors and ITR index + * registers. External data buffer contains virtchnl_queue_vector_maps structure + * that contains num_qv_maps of virtchnl_queue_vector structures. + * CP maps the requested queue vector maps after validating the queue and vector + * ids and returns a status code. + */ +struct virtchnl_queue_vector_maps { + u16 vport_id; + u16 num_qv_maps; + u8 pad[4]; + struct virtchnl_queue_vector qv_maps[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_queue_vector_maps); + + + +/* Since VF messages are limited by u16 size, precalculate the maximum possible + * values of nested elements in virtchnl structures that virtual channel can + * possibly handle in a single message. + */ +enum virtchnl_vector_limits { + VIRTCHNL_OP_CONFIG_VSI_QUEUES_MAX = + ((u16)(~0) - sizeof(struct virtchnl_vsi_queue_config_info)) / + sizeof(struct virtchnl_queue_pair_info), + + VIRTCHNL_OP_CONFIG_IRQ_MAP_MAX = + ((u16)(~0) - sizeof(struct virtchnl_irq_map_info)) / + sizeof(struct virtchnl_vector_map), + + VIRTCHNL_OP_ADD_DEL_ETH_ADDR_MAX = + ((u16)(~0) - sizeof(struct virtchnl_ether_addr_list)) / + sizeof(struct virtchnl_ether_addr), + + VIRTCHNL_OP_ADD_DEL_VLAN_MAX = + ((u16)(~0) - sizeof(struct virtchnl_vlan_filter_list)) / + sizeof(u16), + + VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP_MAX = + ((u16)(~0) - sizeof(struct virtchnl_rdma_qvlist_info)) / + sizeof(struct virtchnl_rdma_qv_info), + + VIRTCHNL_OP_ENABLE_CHANNELS_MAX = + ((u16)(~0) - sizeof(struct virtchnl_tc_info)) / + sizeof(struct virtchnl_channel_info), + + VIRTCHNL_OP_ENABLE_DISABLE_DEL_QUEUES_V2_MAX = + ((u16)(~0) - sizeof(struct virtchnl_del_ena_dis_queues)) / + sizeof(struct virtchnl_queue_chunk), + + VIRTCHNL_OP_MAP_UNMAP_QUEUE_VECTOR_MAX = + ((u16)(~0) - sizeof(struct virtchnl_queue_vector_maps)) / + sizeof(struct virtchnl_queue_vector), + + VIRTCHNL_OP_ADD_DEL_VLAN_V2_MAX = + ((u16)(~0) - sizeof(struct virtchnl_vlan_filter_list_v2)) / + sizeof(struct virtchnl_vlan_filter), +}; + +/** + * virtchnl_vc_validate_vf_msg + * @ver: Virtchnl version info + * @v_opcode: Opcode for the message + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * validate msg format against struct for each opcode + */ +static inline int +virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, + u8 *msg, u16 msglen) +{ + bool err_msg_format = false; + u32 valid_len = 0; + + /* Validate message length. 
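+	 * For variable-length messages the expected length is the fixed
+	 * struct size plus one element per entry beyond the first, because
+	 * the structs declare a one-element trailing array. As an
+	 * illustrative example, a VIRTCHNL_OP_CONFIG_RSS_LUT message with
+	 * lut_entries = 64 must have msglen equal to
+	 * sizeof(struct virtchnl_rss_lut) + 64 - 1, i.e. 6 + 63 = 69 bytes.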
*/ + switch (v_opcode) { + case VIRTCHNL_OP_VERSION: + valid_len = sizeof(struct virtchnl_version_info); + break; + case VIRTCHNL_OP_RESET_VF: + break; + case VIRTCHNL_OP_GET_VF_RESOURCES: + if (VF_IS_V11(ver)) + valid_len = sizeof(u32); + break; + case VIRTCHNL_OP_CONFIG_TX_QUEUE: + valid_len = sizeof(struct virtchnl_txq_info); + break; + case VIRTCHNL_OP_CONFIG_RX_QUEUE: + valid_len = sizeof(struct virtchnl_rxq_info); + break; + case VIRTCHNL_OP_CONFIG_VSI_QUEUES: + valid_len = sizeof(struct virtchnl_vsi_queue_config_info); + if (msglen >= valid_len) { + struct virtchnl_vsi_queue_config_info *vqc = + (struct virtchnl_vsi_queue_config_info *)msg; + + if (vqc->num_queue_pairs == 0 || vqc->num_queue_pairs > + VIRTCHNL_OP_CONFIG_VSI_QUEUES_MAX) { + err_msg_format = true; + break; + } + + valid_len += (vqc->num_queue_pairs * + sizeof(struct + virtchnl_queue_pair_info)); + } + break; + case VIRTCHNL_OP_CONFIG_IRQ_MAP: + valid_len = sizeof(struct virtchnl_irq_map_info); + if (msglen >= valid_len) { + struct virtchnl_irq_map_info *vimi = + (struct virtchnl_irq_map_info *)msg; + + if (vimi->num_vectors == 0 || vimi->num_vectors > + VIRTCHNL_OP_CONFIG_IRQ_MAP_MAX) { + err_msg_format = true; + break; + } + + valid_len += (vimi->num_vectors * + sizeof(struct virtchnl_vector_map)); + } + break; + case VIRTCHNL_OP_ENABLE_QUEUES: + case VIRTCHNL_OP_DISABLE_QUEUES: + valid_len = sizeof(struct virtchnl_queue_select); + break; + case VIRTCHNL_OP_GET_MAX_RSS_QREGION: + break; + case VIRTCHNL_OP_ADD_ETH_ADDR: + case VIRTCHNL_OP_DEL_ETH_ADDR: + valid_len = sizeof(struct virtchnl_ether_addr_list); + if (msglen >= valid_len) { + struct virtchnl_ether_addr_list *veal = + (struct virtchnl_ether_addr_list *)msg; + + if (veal->num_elements == 0 || veal->num_elements > + VIRTCHNL_OP_ADD_DEL_ETH_ADDR_MAX) { + err_msg_format = true; + break; + } + + valid_len += veal->num_elements * + sizeof(struct virtchnl_ether_addr); + } + break; + case VIRTCHNL_OP_ADD_VLAN: + case VIRTCHNL_OP_DEL_VLAN: + valid_len = sizeof(struct virtchnl_vlan_filter_list); + if (msglen >= valid_len) { + struct virtchnl_vlan_filter_list *vfl = + (struct virtchnl_vlan_filter_list *)msg; + + if (vfl->num_elements == 0 || vfl->num_elements > + VIRTCHNL_OP_ADD_DEL_VLAN_MAX) { + err_msg_format = true; + break; + } + + valid_len += vfl->num_elements * sizeof(u16); + } + break; + case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + valid_len = sizeof(struct virtchnl_promisc_info); + break; + case VIRTCHNL_OP_GET_STATS: + valid_len = sizeof(struct virtchnl_queue_select); + break; + case VIRTCHNL_OP_RDMA: + /* These messages are opaque to us and will be validated in + * the RDMA client code. We just need to check for nonzero + * length. The firmware will enforce max length restrictions. 
+ */ + if (msglen) + valid_len = msglen; + else + err_msg_format = true; + break; + case VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP: + break; + case VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP: + valid_len = sizeof(struct virtchnl_rdma_qvlist_info); + if (msglen >= valid_len) { + struct virtchnl_rdma_qvlist_info *qv = + (struct virtchnl_rdma_qvlist_info *)msg; + + if (qv->num_vectors == 0 || qv->num_vectors > + VIRTCHNL_OP_CONFIG_RDMA_IRQ_MAP_MAX) { + err_msg_format = true; + break; + } + + valid_len += ((qv->num_vectors - 1) * + sizeof(struct virtchnl_rdma_qv_info)); + } + break; + case VIRTCHNL_OP_CONFIG_RSS_KEY: + valid_len = sizeof(struct virtchnl_rss_key); + if (msglen >= valid_len) { + struct virtchnl_rss_key *vrk = + (struct virtchnl_rss_key *)msg; + + if (vrk->key_len == 0) { + /* zero length is allowed as input */ + break; + } + + valid_len += vrk->key_len - 1; + } + break; + case VIRTCHNL_OP_CONFIG_RSS_LUT: + valid_len = sizeof(struct virtchnl_rss_lut); + if (msglen >= valid_len) { + struct virtchnl_rss_lut *vrl = + (struct virtchnl_rss_lut *)msg; + + if (vrl->lut_entries == 0) { + /* zero entries is allowed as input */ + break; + } + + valid_len += vrl->lut_entries - 1; + } + break; + case VIRTCHNL_OP_GET_RSS_HENA_CAPS: + break; + case VIRTCHNL_OP_SET_RSS_HENA: + valid_len = sizeof(struct virtchnl_rss_hena); + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + break; + case VIRTCHNL_OP_REQUEST_QUEUES: + valid_len = sizeof(struct virtchnl_vf_res_request); + break; + case VIRTCHNL_OP_ENABLE_CHANNELS: + valid_len = sizeof(struct virtchnl_tc_info); + if (msglen >= valid_len) { + struct virtchnl_tc_info *vti = + (struct virtchnl_tc_info *)msg; + + if (vti->num_tc == 0 || vti->num_tc > + VIRTCHNL_OP_ENABLE_CHANNELS_MAX) { + err_msg_format = true; + break; + } + + valid_len += (vti->num_tc - 1) * + sizeof(struct virtchnl_channel_info); + } + break; + case VIRTCHNL_OP_DISABLE_CHANNELS: + break; + case VIRTCHNL_OP_ADD_CLOUD_FILTER: + case VIRTCHNL_OP_DEL_CLOUD_FILTER: + valid_len = sizeof(struct virtchnl_filter); + break; + case VIRTCHNL_OP_DCF_VLAN_OFFLOAD: + valid_len = sizeof(struct virtchnl_dcf_vlan_offload); + break; + case VIRTCHNL_OP_DCF_CMD_DESC: + case VIRTCHNL_OP_DCF_CMD_BUFF: + /* These two opcodes are specific to handle the AdminQ command, + * so the validation needs to be done in PF's context. 
+ */ + valid_len = msglen; + break; + case VIRTCHNL_OP_DCF_RULE_FLUSH: + case VIRTCHNL_OP_DCF_DISABLE: + case VIRTCHNL_OP_DCF_GET_VSI_MAP: + case VIRTCHNL_OP_DCF_GET_PKG_INFO: + break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + break; + case VIRTCHNL_OP_ADD_RSS_CFG: + case VIRTCHNL_OP_DEL_RSS_CFG: + valid_len = sizeof(struct virtchnl_rss_cfg); + break; + case VIRTCHNL_OP_ADD_FDIR_FILTER: + valid_len = sizeof(struct virtchnl_fdir_add); + break; + case VIRTCHNL_OP_DEL_FDIR_FILTER: + valid_len = sizeof(struct virtchnl_fdir_del); + break; + case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS: + break; + case VIRTCHNL_OP_ADD_VLAN_V2: + case VIRTCHNL_OP_DEL_VLAN_V2: + valid_len = sizeof(struct virtchnl_vlan_filter_list_v2); + if (msglen >= valid_len) { + struct virtchnl_vlan_filter_list_v2 *vfl = + (struct virtchnl_vlan_filter_list_v2 *)msg; + + if (vfl->num_elements == 0 || vfl->num_elements > + VIRTCHNL_OP_ADD_DEL_VLAN_V2_MAX) { + err_msg_format = true; + break; + } + + valid_len += (vfl->num_elements - 1) * + sizeof(struct virtchnl_vlan_filter); + } + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2: + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2: + case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2: + case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: + case VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2: + case VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2: + valid_len = sizeof(struct virtchnl_vlan_setting); + break; + case VIRTCHNL_OP_ENABLE_QUEUES_V2: + case VIRTCHNL_OP_DISABLE_QUEUES_V2: + valid_len = sizeof(struct virtchnl_del_ena_dis_queues); + if (msglen >= valid_len) { + struct virtchnl_del_ena_dis_queues *qs = + (struct virtchnl_del_ena_dis_queues *)msg; + if (qs->chunks.num_chunks == 0 || + qs->chunks.num_chunks > VIRTCHNL_OP_ENABLE_DISABLE_DEL_QUEUES_V2_MAX) { + err_msg_format = true; + break; + } + valid_len += (qs->chunks.num_chunks - 1) * + sizeof(struct virtchnl_queue_chunk); + } + break; + case VIRTCHNL_OP_MAP_QUEUE_VECTOR: + valid_len = sizeof(struct virtchnl_queue_vector_maps); + if (msglen >= valid_len) { + struct virtchnl_queue_vector_maps *v_qp = + (struct virtchnl_queue_vector_maps *)msg; + if (v_qp->num_qv_maps == 0 || + v_qp->num_qv_maps > VIRTCHNL_OP_MAP_UNMAP_QUEUE_VECTOR_MAX) { + err_msg_format = true; + break; + } + valid_len += (v_qp->num_qv_maps - 1) * + sizeof(struct virtchnl_queue_vector); + } + break; + /* These are always errors coming from the VF. */ + case VIRTCHNL_OP_EVENT: + case VIRTCHNL_OP_UNKNOWN: + default: + return VIRTCHNL_STATUS_ERR_PARAM; + } + /* few more checks */ + if (err_msg_format || valid_len != msglen) + return VIRTCHNL_STATUS_ERR_OPCODE_MISMATCH; + + return 0; +} +#endif /* _VIRTCHNL_H_ */ diff --git a/drivers/net/ethernet/intel/ice/virtchnl_inline_ipsec.h b/drivers/net/ethernet/intel/ice/virtchnl_inline_ipsec.h new file mode 100644 index 0000000000000000000000000000000000000000..eec608dde607d76d361fa27d4de5d5857aba6676 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/virtchnl_inline_ipsec.h @@ -0,0 +1,548 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2021, Intel Corporation. 
*/ + + +#ifndef _VIRTCHNL_INLINE_IPSEC_H_ +#define _VIRTCHNL_INLINE_IPSEC_H_ + +#define VIRTCHNL_IPSEC_MAX_CRYPTO_CAP_NUM 3 +#define VIRTCHNL_IPSEC_MAX_ALGO_CAP_NUM 16 +#define VIRTCHNL_IPSEC_MAX_TX_DESC_NUM 128 +#define VIRTCHNL_IPSEC_MAX_CRYPTO_ITEM_NUMBER 2 +#define VIRTCHNL_IPSEC_MAX_KEY_LEN 128 +#define VIRTCHNL_IPSEC_MAX_SA_DESTROY_NUM 8 +#define VIRTCHNL_IPSEC_SA_DESTROY 0 +#define VIRTCHNL_IPSEC_BROADCAST_VFID 0xFFFFFFFF +#define VIRTCHNL_IPSEC_INVALID_REQ_ID 0xFFFF +#define VIRTCHNL_IPSEC_INVALID_SA_CFG_RESP 0xFFFFFFFF +#define VIRTCHNL_IPSEC_INVALID_SP_CFG_RESP 0xFFFFFFFF + +/* crypto type */ +#define VIRTCHNL_AUTH 1 +#define VIRTCHNL_CIPHER 2 +#define VIRTCHNL_AEAD 3 + +/* caps enabled */ +#define VIRTCHNL_IPSEC_ESN_ENA BIT(0) +#define VIRTCHNL_IPSEC_UDP_ENCAP_ENA BIT(1) +#define VIRTCHNL_IPSEC_SA_INDEX_SW_ENA BIT(2) +#define VIRTCHNL_IPSEC_AUDIT_ENA BIT(3) +#define VIRTCHNL_IPSEC_BYTE_LIMIT_ENA BIT(4) +#define VIRTCHNL_IPSEC_DROP_ON_AUTH_FAIL_ENA BIT(5) +#define VIRTCHNL_IPSEC_ARW_CHECK_ENA BIT(6) +#define VIRTCHNL_IPSEC_24BIT_SPI_ENA BIT(7) + +/* algorithm type */ +/* Hash Algorithm */ +#define VIRTCHNL_HASH_NO_ALG 0 /* NULL algorithm */ +#define VIRTCHNL_AES_CBC_MAC 1 /* AES-CBC-MAC algorithm */ +#define VIRTCHNL_AES_CMAC 2 /* AES CMAC algorithm */ +#define VIRTCHNL_AES_GMAC 3 /* AES GMAC algorithm */ +#define VIRTCHNL_AES_XCBC_MAC 4 /* AES XCBC algorithm */ +#define VIRTCHNL_MD5_HMAC 5 /* HMAC using MD5 algorithm */ +#define VIRTCHNL_SHA1_HMAC 6 /* HMAC using 128 bit SHA algorithm */ +#define VIRTCHNL_SHA224_HMAC 7 /* HMAC using 224 bit SHA algorithm */ +#define VIRTCHNL_SHA256_HMAC 8 /* HMAC using 256 bit SHA algorithm */ +#define VIRTCHNL_SHA384_HMAC 9 /* HMAC using 384 bit SHA algorithm */ +#define VIRTCHNL_SHA512_HMAC 10 /* HMAC using 512 bit SHA algorithm */ +#define VIRTCHNL_SHA3_224_HMAC 11 /* HMAC using 224 bit SHA3 algorithm */ +#define VIRTCHNL_SHA3_256_HMAC 12 /* HMAC using 256 bit SHA3 algorithm */ +#define VIRTCHNL_SHA3_384_HMAC 13 /* HMAC using 384 bit SHA3 algorithm */ +#define VIRTCHNL_SHA3_512_HMAC 14 /* HMAC using 512 bit SHA3 algorithm */ +/* Cipher Algorithm */ +#define VIRTCHNL_CIPHER_NO_ALG 15 /* NULL algorithm */ +#define VIRTCHNL_3DES_CBC 16 /* Triple DES algorithm in CBC mode */ +#define VIRTCHNL_AES_CBC 17 /* AES algorithm in CBC mode */ +#define VIRTCHNL_AES_CTR 18 /* AES algorithm in Counter mode */ +/* AEAD Algorithm */ +#define VIRTCHNL_AES_CCM 19 /* AES algorithm in CCM mode */ +#define VIRTCHNL_AES_GCM 20 /* AES algorithm in GCM mode */ +#define VIRTCHNL_CHACHA20_POLY1305 21 /* algorithm of ChaCha20-Poly1305 */ + +/* protocol type */ +#define VIRTCHNL_PROTO_ESP 1 +#define VIRTCHNL_PROTO_AH 2 +#define VIRTCHNL_PROTO_RSVD1 3 + +/* sa mode */ +#define VIRTCHNL_SA_MODE_TRANSPORT 1 +#define VIRTCHNL_SA_MODE_TUNNEL 2 +#define VIRTCHNL_SA_MODE_TRAN_TUN 3 +#define VIRTCHNL_SA_MODE_UNKNOWN 4 + +/* sa direction */ +#define VIRTCHNL_DIR_INGRESS 1 +#define VIRTCHNL_DIR_EGRESS 2 +#define VIRTCHNL_DIR_INGRESS_EGRESS 3 + +/* sa termination */ +#define VIRTCHNL_TERM_SOFTWARE 1 +#define VIRTCHNL_TERM_HARDWARE 2 + +/* sa ip type */ +#define VIRTCHNL_IPV4 1 +#define VIRTCHNL_IPV6 2 + +/* for virtchnl_ipsec_resp */ +enum inline_ipsec_resp { + INLINE_IPSEC_SUCCESS = 0, + INLINE_IPSEC_FAIL = -1, + INLINE_IPSEC_ERR_FIFO_FULL = -2, + INLINE_IPSEC_ERR_NOT_READY = -3, + INLINE_IPSEC_ERR_VF_DOWN = -4, + INLINE_IPSEC_ERR_INVALID_PARAMS = -5, + INLINE_IPSEC_ERR_NO_MEM = -6, +}; + +/* Detailed opcodes for DPDK and IPsec use */ +enum inline_ipsec_ops { + 
INLINE_IPSEC_OP_GET_CAP = 0, + INLINE_IPSEC_OP_GET_STATUS = 1, + INLINE_IPSEC_OP_SA_CREATE = 2, + INLINE_IPSEC_OP_SA_UPDATE = 3, + INLINE_IPSEC_OP_SA_DESTROY = 4, + INLINE_IPSEC_OP_SP_CREATE = 5, + INLINE_IPSEC_OP_SP_DESTROY = 6, + INLINE_IPSEC_OP_SA_READ = 7, + INLINE_IPSEC_OP_EVENT = 8, + INLINE_IPSEC_OP_RESP = 9, +}; + +/* Not all valid, if certain field is invalid, set 1 for all bits */ +struct virtchnl_algo_cap { + u32 algo_type; + + u16 block_size; + + u16 min_key_size; + u16 max_key_size; + u16 inc_key_size; + + u16 min_iv_size; + u16 max_iv_size; + u16 inc_iv_size; + + u16 min_digest_size; + u16 max_digest_size; + u16 inc_digest_size; + + u16 min_aad_size; + u16 max_aad_size; + u16 inc_aad_size; +} __packed; + +/* vf record the capability of crypto from the virtchnl */ +struct virtchnl_sym_crypto_cap { + u8 crypto_type; + u8 algo_cap_num; + struct virtchnl_algo_cap algo_cap_list[VIRTCHNL_IPSEC_MAX_ALGO_CAP_NUM]; +}; + +/* VIRTCHNL_OP_GET_IPSEC_CAP + * VF pass virtchnl_ipsec_cap to PF + * and PF return capability of ipsec from virtchnl. + */ +struct virtchnl_ipsec_cap { + /* max number of SA per VF */ + u16 max_sa_num; + + /* IPsec SA Protocol - value ref VIRTCHNL_PROTO_XXX */ + u8 virtchnl_protocol_type; + + /* IPsec SA Mode - value ref VIRTCHNL_SA_MODE_XXX */ + u8 virtchnl_sa_mode; + + /* IPSec SA Direction - value ref VIRTCHNL_DIR_XXX */ + u8 virtchnl_direction; + + /* termination mode - value ref VIRTCHNL_TERM_XXX */ + u8 termination_mode; + + /* number of supported crypto capability */ + u8 crypto_cap_num; + + /* descriptor ID */ + u16 desc_id; + + /* capabilities enabled - value ref VIRTCHNL_IPSEC_XXX_ENA */ + u32 caps_enabled; + + /* crypto capabilities */ + struct virtchnl_sym_crypto_cap cap[VIRTCHNL_IPSEC_MAX_CRYPTO_CAP_NUM]; +} __packed; + +/* configuration of crypto function */ +struct virtchnl_ipsec_crypto_cfg_item { + u8 crypto_type; + + u32 algo_type; + + /* Length of valid IV data. */ + u16 iv_len; + + /* Length of digest */ + u16 digest_len; + + /* SA salt */ + u32 salt; + + /* The length of the symmetric key */ + u16 key_len; + + /* key data buffer */ + u8 key_data[VIRTCHNL_IPSEC_MAX_KEY_LEN]; +} __packed; + +struct virtchnl_ipsec_sym_crypto_cfg { + struct virtchnl_ipsec_crypto_cfg_item + items[VIRTCHNL_IPSEC_MAX_CRYPTO_ITEM_NUMBER]; +}; + +/* VIRTCHNL_OP_IPSEC_SA_CREATE + * VF send this SA configuration to PF using virtchnl; + * PF create SA as configuration and PF driver will return + * an unique index (sa_idx) for the created SA. + */ +struct virtchnl_ipsec_sa_cfg { + /* IPsec SA Protocol - AH/ESP */ + u8 virtchnl_protocol_type; + + /* termination mode - value ref VIRTCHNL_TERM_XXX */ + u8 virtchnl_termination; + + /* type of outer IP - IPv4/IPv6 */ + u8 virtchnl_ip_type; + + /* type of esn - !0:enable/0:disable */ + u8 esn_enabled; + + /* udp encap - !0:enable/0:disable */ + u8 udp_encap_enabled; + + /* IPSec SA Direction - value ref VIRTCHNL_DIR_XXX */ + u8 virtchnl_direction; + + /* reserved */ + u8 reserved1; + + /* SA security parameter index */ + u32 spi; + + /* outer src ip address */ + u8 src_addr[16]; + + /* outer dst ip address */ + u8 dst_addr[16]; + + /* SPD reference. Used to link an SA with its policy. + * PF drivers may ignore this field. 
+ */ + u16 spd_ref; + + /* high 32 bits of esn */ + u32 esn_hi; + + /* low 32 bits of esn */ + u32 esn_low; + + /* When enabled, sa_index must be valid */ + u8 sa_index_en; + + /* SA index when sa_index_en is true */ + u32 sa_index; + + /* auditing mode - enable/disable */ + u8 audit_en; + + /* lifetime byte limit - enable/disable + * When enabled, byte_limit_hard and byte_limit_soft + * must be valid. + */ + u8 byte_limit_en; + + /* hard byte limit count */ + u64 byte_limit_hard; + + /* soft byte limit count */ + u64 byte_limit_soft; + + /* drop on authentication failure - enable/disable */ + u8 drop_on_auth_fail_en; + + /* anti-reply window check - enable/disable + * When enabled, arw_size must be valid. + */ + u8 arw_check_en; + + /* size of arw window, offset by 1. Setting to 0 + * represents ARW window size of 1. Setting to 127 + * represents ARW window size of 128 + */ + u8 arw_size; + + /* no ip offload mode - enable/disable + * When enabled, ip type and address must not be valid. + */ + u8 no_ip_offload_en; + + /* SA Domain. Used to logical separate an SADB into groups. + * PF drivers supporting a single group ignore this field. + */ + u16 sa_domain; + + /* crypto configuration */ + struct virtchnl_ipsec_sym_crypto_cfg crypto_cfg; +} __packed; + +/* VIRTCHNL_OP_IPSEC_SA_UPDATE + * VF send configuration of index of SA to PF + * PF will update SA according to configuration + */ +struct virtchnl_ipsec_sa_update { + u32 sa_index; /* SA to update */ + u32 esn_hi; /* high 32 bits of esn */ + u32 esn_low; /* low 32 bits of esn */ +}; + +/* VIRTCHNL_OP_IPSEC_SA_DESTROY + * VF send configuration of index of SA to PF + * PF will destroy SA according to configuration + * flag bitmap indicate all SA or just selected SA will + * be destroyed + */ +struct virtchnl_ipsec_sa_destroy { + /* All zero bitmap indicates all SA will be destroyed. + * Non-zero bitmap indicates the selected SA in + * array sa_index will be destroyed. + */ + u8 flag; + + /* selected SA index */ + u32 sa_index[VIRTCHNL_IPSEC_MAX_SA_DESTROY_NUM]; +} __packed; + +/* VIRTCHNL_OP_IPSEC_SA_READ + * VF send this SA configuration to PF using virtchnl; + * PF read SA and will return configuration for the created SA. + */ +struct virtchnl_ipsec_sa_read { + /* SA valid - invalid/valid */ + u8 valid; + + /* SA active - inactive/active */ + u8 active; + + /* SA SN rollover - not_rollover/rollover */ + u8 sn_rollover; + + /* IPsec SA Protocol - AH/ESP */ + u8 virtchnl_protocol_type; + + /* termination mode - value ref VIRTCHNL_TERM_XXX */ + u8 virtchnl_termination; + + /* auditing mode - enable/disable */ + u8 audit_en; + + /* lifetime byte limit - enable/disable + * When set to limit, byte_limit_hard and byte_limit_soft + * must be valid. + */ + u8 byte_limit_en; + + /* hard byte limit count */ + u64 byte_limit_hard; + + /* soft byte limit count */ + u64 byte_limit_soft; + + /* drop on authentication failure - enable/disable */ + u8 drop_on_auth_fail_en; + + /* anti-replay window check - enable/disable + * When set to check, arw_size, arw_top, and arw must be valid + */ + u8 arw_check_en; + + /* size of arw window, offset by 1. Setting to 0 + * represents ARW window size of 1. 
Setting to 127 + * represents ARW window size of 128 + */ + u8 arw_size; + + /* reserved */ + u8 reserved1; + + /* top of anti-replay-window */ + u64 arw_top; + + /* anti-replay-window */ + u8 arw[16]; + + /* packets processed */ + u64 packets_processed; + + /* bytes processed */ + u64 bytes_processed; + + /* packets dropped */ + u32 packets_dropped; + + /* authentication failures */ + u32 auth_fails; + + /* ARW check failures */ + u32 arw_fails; + + /* type of esn - enable/disable */ + u8 esn; + + /* IPSec SA Direction - value ref VIRTCHNL_DIR_XXX */ + u8 virtchnl_direction; + + /* SA security parameter index */ + u32 spi; + + /* SA salt */ + u32 salt; + + /* high 32 bits of esn */ + u32 esn_hi; + + /* low 32 bits of esn */ + u32 esn_low; + + /* SA Domain. Used to logical separate an SADB into groups. + * PF drivers supporting a single group ignore this field. + */ + u16 sa_domain; + + /* SPD reference. Used to link an SA with its policy. + * PF drivers may ignore this field. + */ + u16 spd_ref; + + /* crypto configuration. Salt and keys are set to 0 */ + struct virtchnl_ipsec_sym_crypto_cfg crypto_cfg; +} __packed; + +/* Add allowlist entry in IES */ +struct virtchnl_ipsec_sp_cfg { + u32 spi; + u32 dip[4]; + + /* Drop frame if true or redirect to QAT if false. */ + u8 drop; + + /* Congestion domain. For future use. */ + u8 cgd; + + /* 0 for IPv4 table, 1 for IPv6 table. */ + u8 table_id; + + /* Set TC (congestion domain) if true. For future use. */ + u8 set_tc; +}; + +/* Delete allowlist entry in IES */ +struct virtchnl_ipsec_sp_destroy { + /* 0 for IPv4 table, 1 for IPv6 table. */ + u8 table_id; + u32 rule_id; +} __packed; + +/* Response from IES to allowlist operations */ +struct virtchnl_ipsec_sp_cfg_resp { + u32 rule_id; +}; + +struct virtchnl_ipsec_sa_cfg_resp { + u32 sa_handle; +}; + +#define INLINE_IPSEC_EVENT_RESET 0x1 +#define INLINE_IPSEC_EVENT_CRYPTO_ON 0x2 +#define INLINE_IPSEC_EVENT_CRYPTO_OFF 0x4 + +struct virtchnl_ipsec_event { + u32 ipsec_event_data; +}; + +#define INLINE_IPSEC_STATUS_AVAILABLE 0x1 +#define INLINE_IPSEC_STATUS_UNAVAILABLE 0x2 + +struct virtchnl_ipsec_status { + u32 status; +}; + +struct virtchnl_ipsec_resp { + u32 resp; +}; + +/* Internal message descriptor for VF <-> IPsec communication */ +struct inline_ipsec_msg { + u16 ipsec_opcode; + u16 req_id; + + union { + /* IPsec request */ + struct virtchnl_ipsec_sa_cfg sa_cfg[0]; + struct virtchnl_ipsec_sp_cfg sp_cfg[0]; + struct virtchnl_ipsec_sa_update sa_update[0]; + struct virtchnl_ipsec_sa_destroy sa_destroy[0]; + struct virtchnl_ipsec_sp_destroy sp_destroy[0]; + + /* IPsec response */ + struct virtchnl_ipsec_sa_cfg_resp sa_cfg_resp[0]; + struct virtchnl_ipsec_sp_cfg_resp sp_cfg_resp[0]; + struct virtchnl_ipsec_cap ipsec_cap[0]; + struct virtchnl_ipsec_status ipsec_status[0]; + /* response to del_sa, del_sp, update_sa */ + struct virtchnl_ipsec_resp ipsec_resp[0]; + + /* IPsec event (no req_id is required) */ + struct virtchnl_ipsec_event event[0]; + + /* Reserved */ + struct virtchnl_ipsec_sa_read sa_read[0]; + } ipsec_data; +}; + +static inline u16 virtchnl_inline_ipsec_val_msg_len(u16 opcode) +{ + u16 valid_len = sizeof(struct inline_ipsec_msg); + + switch (opcode) { + case INLINE_IPSEC_OP_GET_CAP: + case INLINE_IPSEC_OP_GET_STATUS: + break; + case INLINE_IPSEC_OP_SA_CREATE: + valid_len += sizeof(struct virtchnl_ipsec_sa_cfg); + break; + case INLINE_IPSEC_OP_SP_CREATE: + valid_len += sizeof(struct virtchnl_ipsec_sp_cfg); + break; + case INLINE_IPSEC_OP_SA_UPDATE: + valid_len += sizeof(struct 
virtchnl_ipsec_sa_update);
+		break;
+	case INLINE_IPSEC_OP_SA_DESTROY:
+		valid_len += sizeof(struct virtchnl_ipsec_sa_destroy);
+		break;
+	case INLINE_IPSEC_OP_SP_DESTROY:
+		valid_len += sizeof(struct virtchnl_ipsec_sp_destroy);
+		break;
+	/* Only for msg length calculation of response to VF in case of
+	 * inline ipsec failure.
+	 */
+	case INLINE_IPSEC_OP_RESP:
+		valid_len += sizeof(struct virtchnl_ipsec_resp);
+		break;
+	default:
+		valid_len = 0;
+		break;
+	}
+
+	return valid_len;
+}
+
+#endif /* _VIRTCHNL_INLINE_IPSEC_H_ */
diff --git a/drivers/net/ethernet/intel/ice/virtchnl_lan_desc.h b/drivers/net/ethernet/intel/ice/virtchnl_lan_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ac587008da7a1dbbed2ee478a2a296e24b62a1f
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/virtchnl_lan_desc.h
@@ -0,0 +1,528 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+/*
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * For licensing information, see the file 'LICENSE' in the root folder
+ */
+#ifndef _VIRTCHNL_LAN_DESC_H_
+#define _VIRTCHNL_LAN_DESC_H_
+
+/* Rx */
+/* For virtchnl_splitq_base_rx_flex desc members */
+#define VIRTCHNL_RXD_FLEX_RXDID_S	0
+#define VIRTCHNL_RXD_FLEX_RXDID_M	\
+	ICE_M(0xFUL, VIRTCHNL_RXD_FLEX_RXDID_S)
+#define VIRTCHNL_RXD_FLEX_PTYPE_S	0
+#define VIRTCHNL_RXD_FLEX_PTYPE_M	\
+	ICE_M(0x3FFUL, VIRTCHNL_RXD_FLEX_PTYPE_S)
+#define VIRTCHNL_RXD_FLEX_UMBCAST_S	10
+#define VIRTCHNL_RXD_FLEX_UMBCAST_M	\
+	ICE_M(0x3UL, VIRTCHNL_RXD_FLEX_UMBCAST_S)
+#define VIRTCHNL_RXD_FLEX_FF0_S	12
+#define VIRTCHNL_RXD_FLEX_FF0_M	ICE_M(0xFUL, VIRTCHNL_RXD_FLEX_FF0_S)
+#define VIRTCHNL_RXD_FLEX_LEN_PBUF_S	0
+#define VIRTCHNL_RXD_FLEX_LEN_PBUF_M	\
+	ICE_M(0x3FFFUL, VIRTCHNL_RXD_FLEX_LEN_PBUF_S)
+#define VIRTCHNL_RXD_FLEX_GEN_S	14
+#define VIRTCHNL_RXD_FLEX_GEN_M	BIT_ULL(VIRTCHNL_RXD_FLEX_GEN_S)
+#define VIRTCHNL_RXD_FLEX_BUFQ_ID_S	15
+#define VIRTCHNL_RXD_FLEX_BUFQ_ID_M	\
+	BIT_ULL(VIRTCHNL_RXD_FLEX_BUFQ_ID_S)
+#define VIRTCHNL_RXD_FLEX_LEN_HDR_S	0
+#define VIRTCHNL_RXD_FLEX_LEN_HDR_M	\
+	ICE_M(0x3FFUL, VIRTCHNL_RXD_FLEX_LEN_HDR_S)
+#define VIRTCHNL_RXD_FLEX_RSC_S	10
+#define VIRTCHNL_RXD_FLEX_RSC_M	BIT_ULL(VIRTCHNL_RXD_FLEX_RSC_S)
+#define VIRTCHNL_RXD_FLEX_SPH_S	11
+#define VIRTCHNL_RXD_FLEX_SPH_M	BIT_ULL(VIRTCHNL_RXD_FLEX_SPH_S)
+#define VIRTCHNL_RXD_FLEX_MISS_S	12
+#define VIRTCHNL_RXD_FLEX_MISS_M	\
+	BIT_ULL(VIRTCHNL_RXD_FLEX_MISS_S)
+#define VIRTCHNL_RXD_FLEX_FF1_S	13
+#define VIRTCHNL_RXD_FLEX_FF1_M	ICE_M(0x7UL, VIRTCHNL_RXD_FLEX_FF1_S)
+
+/* For virtchnl_singleq_base_rx_legacy desc members */
+#define VIRTCHNL_RXD_QW1_LEN_SPH_S	63
+#define VIRTCHNL_RXD_QW1_LEN_SPH_M	BIT_ULL(VIRTCHNL_RXD_QW1_LEN_SPH_S)
+#define VIRTCHNL_RXD_QW1_LEN_HBUF_S	52
+#define VIRTCHNL_RXD_QW1_LEN_HBUF_M	\
+	ICE_M(0x7FFULL, VIRTCHNL_RXD_QW1_LEN_HBUF_S)
+#define VIRTCHNL_RXD_QW1_LEN_PBUF_S	38
+#define VIRTCHNL_RXD_QW1_LEN_PBUF_M	\
+	ICE_M(0x3FFFULL, VIRTCHNL_RXD_QW1_LEN_PBUF_S)
+#define VIRTCHNL_RXD_QW1_PTYPE_S	30
+#define VIRTCHNL_RXD_QW1_PTYPE_M	\
+	ICE_M(0xFFULL, VIRTCHNL_RXD_QW1_PTYPE_S)
+#define VIRTCHNL_RXD_QW1_ERROR_S	19
+#define VIRTCHNL_RXD_QW1_ERROR_M	\
+	ICE_M(0xFFUL, VIRTCHNL_RXD_QW1_ERROR_S)
+#define VIRTCHNL_RXD_QW1_STATUS_S	0
+#define VIRTCHNL_RXD_QW1_STATUS_M	\
+	ICE_M(0x7FFFFUL, VIRTCHNL_RXD_QW1_STATUS_S)
+
+enum virtchnl_rx_flex_desc_status_error_0_qw1_bits {
+	/* Note: These are predefined bit offsets */
+	VIRTCHNL_RX_FLEX_DESC_STATUS0_DD_S = 0,
+	VIRTCHNL_RX_FLEX_DESC_STATUS0_EOF_S,
+	VIRTCHNL_RX_FLEX_DESC_STATUS0_HBO_S,
VIRTCHNL_RX_FLEX_DESC_STATUS0_L3L4P_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XSUM_IPE_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XSUM_L4E_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S, +}; + +enum virtchnl_rx_flex_desc_status_error_0_qw0_bits { + VIRTCHNL_RX_FLEX_DESC_STATUS0_LPBK_S = 0, + VIRTCHNL_RX_FLEX_DESC_STATUS0_IPV6EXADD_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_RXE_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_CRCP_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_RSS_VALID_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_L2TAG1P_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XTRMD0_VALID_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_XTRMD1_VALID_S, + VIRTCHNL_RX_FLEX_DESC_STATUS0_LAST /* this entry must be last!!! */ +}; + +enum virtchnl_rx_flex_desc_status_error_1_bits { + /* Note: These are predefined bit offsets */ + VIRTCHNL_RX_FLEX_DESC_STATUS1_RSVD_S = 0, /* 2 bits */ + VIRTCHNL_RX_FLEX_DESC_STATUS1_ATRAEFAIL_S = 2, + VIRTCHNL_RX_FLEX_DESC_STATUS1_L2TAG2P_S = 3, + VIRTCHNL_RX_FLEX_DESC_STATUS1_XTRMD2_VALID_S = 4, + VIRTCHNL_RX_FLEX_DESC_STATUS1_XTRMD3_VALID_S = 5, + VIRTCHNL_RX_FLEX_DESC_STATUS1_XTRMD4_VALID_S = 6, + VIRTCHNL_RX_FLEX_DESC_STATUS1_XTRMD5_VALID_S = 7, + VIRTCHNL_RX_FLEX_DESC_STATUS1_LAST /* this entry must be last!!! */ +}; + +enum virtchnl_rx_base_desc_status_bits { + /* Note: These are predefined bit offsets */ + VIRTCHNL_RX_BASE_DESC_STATUS_DD_S = 0, + VIRTCHNL_RX_BASE_DESC_STATUS_EOF_S = 1, + VIRTCHNL_RX_BASE_DESC_STATUS_L2TAG1P_S = 2, + VIRTCHNL_RX_BASE_DESC_STATUS_L3L4P_S = 3, + VIRTCHNL_RX_BASE_DESC_STATUS_CRCP_S = 4, + VIRTCHNL_RX_BASE_DESC_STATUS_RSVD_S = 5, /* 3 BITS */ + VIRTCHNL_RX_BASE_DESC_STATUS_EXT_UDP_0_S = 8, + VIRTCHNL_RX_BASE_DESC_STATUS_UMBCAST_S = 9, /* 2 BITS */ + VIRTCHNL_RX_BASE_DESC_STATUS_FLM_S = 11, + VIRTCHNL_RX_BASE_DESC_STATUS_FLTSTAT_S = 12, /* 2 BITS */ + VIRTCHNL_RX_BASE_DESC_STATUS_LPBK_S = 14, + VIRTCHNL_RX_BASE_DESC_STATUS_IPV6EXADD_S = 15, + VIRTCHNL_RX_BASE_DESC_STATUS_RSVD1_S = 16, /* 2 BITS */ + VIRTCHNL_RX_BASE_DESC_STATUS_INT_UDP_0_S = 18, + VIRTCHNL_RX_BASE_DESC_STATUS_LAST /* this entry must be last!!! */ +}; + +enum virtchnl_rx_desc_fltstat_values { + VIRTCHNL_RX_DESC_FLTSTAT_NO_DATA = 0, + VIRTCHNL_RX_DESC_FLTSTAT_RSV_FD_ID = 1, /* 16byte desc? 
FD_ID : RSV */ + VIRTCHNL_RX_DESC_FLTSTAT_RSV = 2, + VIRTCHNL_RX_DESC_FLTSTAT_RSS_HASH = 3, +}; + +enum virtchnl_rx_base_desc_error_bits { + /* Note: These are predefined bit offsets */ + VIRTCHNL_RX_BASE_DESC_ERROR_RXE_S = 0, + VIRTCHNL_RX_BASE_DESC_ERROR_ATRAEFAIL_S = 1, + VIRTCHNL_RX_BASE_DESC_ERROR_HBO_S = 2, + VIRTCHNL_RX_BASE_DESC_ERROR_L3L4E_S = 3, /* 3 BITS */ + VIRTCHNL_RX_BASE_DESC_ERROR_IPE_S = 3, + VIRTCHNL_RX_BASE_DESC_ERROR_L4E_S = 4, + VIRTCHNL_RX_BASE_DESC_ERROR_EIPE_S = 5, + VIRTCHNL_RX_BASE_DESC_ERROR_OVERSIZE_S = 6, + VIRTCHNL_RX_BASE_DESC_ERROR_RSVD_S = 7 +}; + +/* Receive Descriptors */ +/* splitq buf + | 16| 0| + ---------------------------------------------------------------- + | RSV | Buffer ID | + ---------------------------------------------------------------- + | Rx packet buffer adresss | + ---------------------------------------------------------------- + | Rx header buffer adresss | + ---------------------------------------------------------------- + | RSV | + ---------------------------------------------------------------- + | 0| + */ +struct virtchnl_splitq_rx_buf_desc { + struct { + __le16 buf_id; /* Buffer Identifier */ + __le16 rsvd0; + __le32 rsvd1; + } qword0; + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + __le64 rsvd2; +}; /* read used with buffer queues*/ + +/* singleq buf + | 0| + ---------------------------------------------------------------- + | Rx packet buffer adresss | + ---------------------------------------------------------------- + | Rx header buffer adresss | + ---------------------------------------------------------------- + | RSV | + ---------------------------------------------------------------- + | RSV | + ---------------------------------------------------------------- + | 0| + */ +struct virtchnl_singleq_rx_buf_desc { + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + __le64 rsvd1; + __le64 rsvd2; +}; /* read used with buffer queues*/ + +union virtchnl_rx_buf_desc { + struct virtchnl_singleq_rx_buf_desc read; + struct virtchnl_splitq_rx_buf_desc split_rd; +}; + +/* (0x00) singleq wb(compl) */ +struct virtchnl_singleq_base_rx_desc { + struct { + struct { + __le16 mirroring_status; + __le16 l2tag1; + } lo_dword; + union { + __le32 rss; /* RSS Hash */ + __le32 fd_id; /* Flow Director filter id */ + } hi_dword; + } qword0; + struct { + /* status/error/PTYPE/length */ + __le64 status_error_ptype_len; + } qword1; + struct { + __le16 ext_status; /* extended status */ + __le16 rsvd; + __le16 l2tag2_1; + __le16 l2tag2_2; + } qword2; + struct { + __le32 reserved; + __le32 fd_id; + } qword3; +}; /* writeback */ + +/* (0x01) singleq flex compl */ +struct virtchnl_rx_flex_desc { + /* Qword 0 */ + u8 rxdid; /* descriptor builder profile id */ + u8 mir_id_umb_cast; /* mirror=[5:0], umb=[7:6] */ + __le16 ptype_flex_flags0; /* ptype=[9:0], ff0=[15:10] */ + __le16 pkt_len; /* [15:14] are reserved */ + __le16 hdr_len_sph_flex_flags1; /* header=[10:0] */ + /* sph=[11:11] */ + /* ff1/ext=[15:12] */ + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 flex_meta0; + __le16 flex_meta1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 time_stamp_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 flex_meta2; + __le16 flex_meta3; + union { + struct { + __le16 flex_meta4; + __le16 flex_meta5; + } flex; + __le32 ts_high; + } flex_ts; +}; + +/* (0x02) */ +struct virtchnl_rx_flex_desc_nic { + /* Qword 0 */ + u8 
rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le32 rss_hash; + + /* Qword 2 */ + __le16 status_error1; + u8 flexi_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 flow_id; + union { + struct { + __le16 rsvd; + __le16 flow_id_ipv6; + } flex; + __le32 ts_high; + } flex_ts; +}; + +/* Rx Flex Descriptor Switch Profile + * RxDID Profile Id 3 + * Flex-field 0: Source Vsi + */ +struct virtchnl_rx_flex_desc_sw { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 src_vsi; /* [10:15] are reserved */ + __le16 flex_md1_rsvd; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC VEB Profile + * RxDID Profile Id 4 + * Flex-field 0: Destination Vsi + */ +struct virtchnl_rx_flex_desc_nic_veb_dbg { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 dst_vsi; /* [0:12]: destination vsi */ + /* 13: vsi valid bit */ + /* [14:15] are reserved */ + __le16 flex_field_1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC ACL Profile + * RxDID Profile Id 5 + * Flex-field 0: ACL Counter 0 + * Flex-field 1: ACL Counter 1 + * Flex-field 2: ACL Counter 2 + */ +struct virtchnl_rx_flex_desc_nic_acl_dbg { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le16 acl_ctr0; + __le16 acl_ctr1; + + /* Qword 2 */ + __le16 status_error1; + u8 flex_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 acl_ctr2; + __le16 rsvd; /* flex words 2-3 are reserved */ + __le32 ts_high; +}; + +/* Rx Flex Descriptor NIC Profile + * RxDID Profile Id 6 + * Flex-field 0: RSS hash lower 16-bits + * Flex-field 1: RSS hash upper 16-bits + * Flex-field 2: Flow Id lower 16-bits + * Flex-field 3: Source Vsi + * Flex-field 4: reserved, Vlan id taken from L2Tag + */ +struct virtchnl_rx_flex_desc_nic_2 { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le32 rss_hash; + + /* Qword 2 */ + __le16 status_error1; + u8 flexi_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le16 flow_id; + __le16 src_vsi; + union { + struct { + __le16 rsvd; + __le16 flow_id_ipv6; + } flex; + __le32 ts_high; + } flex_ts; +}; + +/* Rx Flex Descriptor Advanced (Split Queue Model) + * RxDID Profile Id 7 + */ +struct virtchnl_rx_flex_desc_adv { + /* Qword 0 */ + u8 rxdid_ucast; /* profile_id=[3:0] */ + /* rsvd=[5:4] */ + /* ucast=[7:6] */ + u8 status_err0_qw0; + __le16 ptype_err_fflags0; /* ptype=[9:0] */ + /* ip_hdr_err=[10:10] */ + /* udp_len_err=[11:11] */ + /* ff0=[15:12] */ + __le16 pktlen_gen_bufq_id; /* 
plen=[13:0] */ + /* gen=[14:14] only in splitq */ + /* bufq_id=[15:15] only in splitq */ + __le16 hdrlen_flags; /* header=[9:0] */ + /* rsc=[10:10] only in splitq */ + /* sph=[11:11] only in splitq */ + /* ext_udp_0=[12:12] */ + /* int_udp_0=[13:13] */ + /* trunc_mirr=[14:14] */ + /* miss_prepend=[15:15] */ + /* Qword 1 */ + u8 status_err0_qw1; + u8 status_err1; + u8 fflags1; + u8 ts_low; + __le16 fmd0; + __le16 fmd1; + /* Qword 2 */ + __le16 fmd2; + u8 fflags2; + u8 hash3; + __le16 fmd3; + __le16 fmd4; + /* Qword 3 */ + __le16 fmd5; + __le16 fmd6; + __le16 fmd7_0; + __le16 fmd7_1; +}; /* writeback */ + +/* Rx Flex Descriptor Advanced (Split Queue Model) NIC Profile + * RxDID Profile Id 8 + * Flex-field 0: BufferID + * Flex-field 1: Raw checksum/L2TAG1/RSC Seg Len (determined by HW) + * Flex-field 2: Hash[15:0] + * Flex-flags 2: Hash[23:16] + * Flex-field 3: L2TAG2 + * Flex-field 5: L2TAG1 + * Flex-field 7: Timestamp (upper 32 bits) + */ +struct virtchnl_rx_flex_desc_adv_nic_3 { + /* Qword 0 */ + u8 rxdid_ucast; /* profile_id=[3:0] */ + /* rsvd=[5:4] */ + /* ucast=[7:6] */ + u8 status_err0_qw0; + __le16 ptype_err_fflags0; /* ptype=[9:0] */ + /* ip_hdr_err=[10:10] */ + /* udp_len_err=[11:11] */ + /* ff0=[15:12] */ + __le16 pktlen_gen_bufq_id; /* plen=[13:0] */ + /* gen=[14:14] only in splitq */ + /* bufq_id=[15:15] only in splitq */ + __le16 hdrlen_flags; /* header=[9:0] */ + /* rsc=[10:10] only in splitq */ + /* sph=[11:11] only in splitq */ + /* ext_udp_0=[12:12] */ + /* int_udp_0=[13:13] */ + /* trunc_mirr=[14:14] */ + /* miss_prepend=[15:15] */ + /* Qword 1 */ + u8 status_err0_qw1; + u8 status_err1; + u8 fflags1; + u8 ts_low; + __le16 buf_id; /* only in splitq */ + union { + __le16 raw_cs; + __le16 l2tag1; + __le16 rscseglen; + } misc; + /* Qword 2 */ + __le16 hash1; + union { + u8 fflags2; + u8 mirrorid; + u8 hash2; + } ff2_mirrid_hash2; + u8 hash3; + __le16 l2tag2; + __le16 fmd4; + /* Qword 3 */ + __le16 l2tag1; + __le16 fmd6; + __le32 ts_high; +}; /* writeback */ + +union virtchnl_rx_desc { + struct virtchnl_singleq_rx_buf_desc read; + struct virtchnl_singleq_base_rx_desc base_wb; + struct virtchnl_rx_flex_desc flex_wb; + struct virtchnl_rx_flex_desc_adv flex_wb_adv; +}; + +#endif /* _VIRTCHNL_LAN_DESC_H_ */ diff --git a/drivers/net/ethernet/intel/ixgb/ixgb.h b/drivers/net/ethernet/intel/ixgb/ixgb.h index e85271b68410843bb7aebd5ef20462458d826ea3..681d44cc978485a7fe386cc2701f1e37ba6dc7e3 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb.h +++ b/drivers/net/ethernet/intel/ixgb/ixgb.h @@ -42,7 +42,6 @@ #define BAR_0 0 #define BAR_1 1 -#define BAR_5 5 struct ixgb_adapter; #include "ixgb_hw.h" diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index 0940a0da16f2863dee68502865d25e21f8b8c50d..3d8c051dd3277547db18db5650515f9badfb9469 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -412,7 +412,7 @@ ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_ioremap; } - for (i = BAR_1; i <= BAR_5; i++) { + for (i = BAR_1; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; if (pci_resource_flags(pdev, i) & IORESOURCE_IO) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 292045f4581f78996036c8f243ecc90237bbf393..8237dbc3e9911ac2c79fba1193353f5cf0b68a6f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ 
-489,7 +489,7 @@ static int stmmac_pci_probe(struct pci_dev *pdev, } /* Get the base address of device */ - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; ret = pcim_iomap_regions(pdev, BIT(i), pci_name(pdev)); @@ -532,7 +532,7 @@ static void stmmac_pci_remove(struct pci_dev *pdev) if (priv->plat->stmmac_clk) clk_unregister_fixed_rate(priv->plat->stmmac_clk); - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; pcim_iounmap_regions(pdev, BIT(i)); diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-pci.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-pci.c index 386bafe74c3f6ffa3910e223101a5b52eacc8dfb..fa8604d7b797556b52c9703ec003ba54c1509f82 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac-pci.c +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-pci.c @@ -34,7 +34,7 @@ static int xlgmac_probe(struct pci_dev *pcidev, const struct pci_device_id *id) return ret; } - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pcidev, i) == 0) continue; ret = pcim_iomap_regions(pcidev, BIT(i), XLGMAC_DRV_NAME); diff --git a/drivers/net/tap.c b/drivers/net/tap.c index f285422a8071734e01711fc40ff5997e52df07f5..1fdea03273d4a6e7aca6bbfe993c76e399182684 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1122,14 +1122,6 @@ static long tap_ioctl(struct file *file, unsigned int cmd, } } -#ifdef CONFIG_COMPAT -static long tap_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return tap_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations tap_fops = { .owner = THIS_MODULE, .open = tap_open, @@ -1139,9 +1131,7 @@ static const struct file_operations tap_fops = { .poll = tap_poll, .llseek = no_llseek, .unlocked_ioctl = tap_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = tap_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, }; static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) diff --git a/drivers/ntb/hw/idt/ntb_hw_idt.c b/drivers/ntb/hw/idt/ntb_hw_idt.c index dcf23468053507ba642c610fa8e4392caee4a0db..edae52384b8ab2e7e536a52e87b37525c3f4a7c4 100644 --- a/drivers/ntb/hw/idt/ntb_hw_idt.c +++ b/drivers/ntb/hw/idt/ntb_hw_idt.c @@ -2674,8 +2674,8 @@ static int idt_init_pci(struct idt_ntb_dev *ndev) ret = pci_enable_pcie_error_reporting(pdev); if (ret != 0) dev_warn(&pdev->dev, "PCIe AER capability disabled\n"); - else /* Cleanup uncorrectable error status before getting to init */ - pci_cleanup_aer_uncorrect_error_status(pdev); + else /* Cleanup nonfatal error status before getting to init */ + pci_aer_clear_nonfatal_status(pdev); /* First enable the PCI device */ ret = pcim_enable_device(pdev); diff --git a/drivers/ntb/hw/intel/Makefile b/drivers/ntb/hw/intel/Makefile index 60ec8a773eea19f5cb64864d26c4580c37986534..f80da0ba15b236603a6c8fdce08ef7e42e814aab 100644 --- a/drivers/ntb/hw/intel/Makefile +++ b/drivers/ntb/hw/intel/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_NTB_INTEL) += ntb_hw_intel.o -ntb_hw_intel-y := ntb_hw_gen1.o ntb_hw_gen3.o +ntb_hw_intel-y := ntb_hw_gen1.o ntb_hw_gen3.o ntb_hw_gen4.o diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.c b/drivers/ntb/hw/intel/ntb_hw_gen1.c index bb57ec2390299580754a84bdf6602439f9885239..cae70e7fe5bd16707139f3fd20053e1ded5397d9 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.c +++ 
b/drivers/ntb/hw/intel/ntb_hw_gen1.c @@ -60,6 +60,7 @@ #include "ntb_hw_intel.h" #include "ntb_hw_gen1.h" #include "ntb_hw_gen3.h" +#include "ntb_hw_gen4.h" #define NTB_NAME "ntb_hw_intel" #define NTB_DESC "Intel(R) PCI-E Non-Transparent Bridge Driver" @@ -762,6 +763,8 @@ static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf, return ndev_ntb_debugfs_read(filp, ubuf, count, offp); else if (pdev_is_gen3(ndev->ntb.pdev)) return ndev_ntb3_debugfs_read(filp, ubuf, count, offp); + else if (pdev_is_gen4(ndev->ntb.pdev)) + return ndev_ntb4_debugfs_read(filp, ubuf, count, offp); return -ENXIO; } @@ -1858,16 +1861,15 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, int rc, node; node = dev_to_node(&pdev->dev); + ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node); + if (!ndev) { + rc = -ENOMEM; + goto err_ndev; + } - if (pdev_is_gen1(pdev)) { - ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node); - if (!ndev) { - rc = -ENOMEM; - goto err_ndev; - } - - ndev_init_struct(ndev, pdev); + ndev_init_struct(ndev, pdev); + if (pdev_is_gen1(pdev)) { rc = intel_ntb_init_pci(ndev, pdev); if (rc) goto err_init_pci; @@ -1875,17 +1877,8 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, rc = xeon_init_dev(ndev); if (rc) goto err_init_dev; - } else if (pdev_is_gen3(pdev)) { - ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node); - if (!ndev) { - rc = -ENOMEM; - goto err_ndev; - } - - ndev_init_struct(ndev, pdev); ndev->ntb.ops = &intel_ntb3_ops; - rc = intel_ntb_init_pci(ndev, pdev); if (rc) goto err_init_pci; @@ -1893,7 +1886,15 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, rc = gen3_init_dev(ndev); if (rc) goto err_init_dev; + } else if (pdev_is_gen4(pdev)) { + ndev->ntb.ops = &intel_ntb4_ops; + rc = intel_ntb_init_pci(ndev, pdev); + if (rc) + goto err_init_pci; + rc = gen4_init_dev(ndev); + if (rc) + goto err_init_dev; } else { rc = -EINVAL; goto err_ndev; @@ -1915,7 +1916,7 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, err_register: ndev_deinit_debugfs(ndev); - if (pdev_is_gen1(pdev) || pdev_is_gen3(pdev)) + if (pdev_is_gen1(pdev) || pdev_is_gen3(pdev) || pdev_is_gen4(pdev)) xeon_deinit_dev(ndev); err_init_dev: intel_ntb_deinit_pci(ndev); @@ -1931,7 +1932,7 @@ static void intel_ntb_pci_remove(struct pci_dev *pdev) ntb_unregister_device(&ndev->ntb); ndev_deinit_debugfs(ndev); - if (pdev_is_gen1(pdev) || pdev_is_gen3(pdev)) + if (pdev_is_gen1(pdev) || pdev_is_gen3(pdev) || pdev_is_gen4(pdev)) xeon_deinit_dev(ndev); intel_ntb_deinit_pci(ndev); kfree(ndev); @@ -2036,6 +2037,7 @@ static const struct file_operations intel_ntb_debugfs_info = { }; static const struct pci_device_id intel_ntb_pci_tbl[] = { + /* GEN1 */ {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)}, @@ -2051,7 +2053,12 @@ static const struct pci_device_id intel_ntb_pci_tbl[] = { {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_BDX)}, + + /* GEN3 */ {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SKX)}, + + /* GEN4 */ + {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_ICX)}, {0} }; MODULE_DEVICE_TABLE(pci, intel_ntb_pci_tbl); diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.h b/drivers/ntb/hw/intel/ntb_hw_gen1.h index 544cf5c06f4dc2ef1412b54ab5551ad25fc37727..344249fc18d1f3562e7e5e662cddbcb0abcdd361 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.h @@ 
-140,6 +140,8 @@ #define NTB_HWERR_SB01BASE_LOCKUP BIT_ULL(1) #define NTB_HWERR_B2BDOORBELL_BIT14 BIT_ULL(2) #define NTB_HWERR_MSIX_VECTOR32_BAD BIT_ULL(3) +#define NTB_HWERR_BAR_ALIGN BIT_ULL(4) +#define NTB_HWERR_LTR_BAD BIT_ULL(5) extern struct intel_b2b_addr xeon_b2b_usd_addr; extern struct intel_b2b_addr xeon_b2b_dsd_addr; diff --git a/drivers/ntb/hw/intel/ntb_hw_gen3.c b/drivers/ntb/hw/intel/ntb_hw_gen3.c index c3397160db7f7c8bf411e2da30f472a0ce75b270..ffcfc3e02c3532ab74bb95af497f99bd81edb076 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen3.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen3.c @@ -415,9 +415,8 @@ ssize_t ndev_ntb3_debugfs_read(struct file *filp, char __user *ubuf, return ret; } -static int intel_ntb3_link_enable(struct ntb_dev *ntb, - enum ntb_speed max_speed, - enum ntb_width max_width) +int intel_ntb3_link_enable(struct ntb_dev *ntb, enum ntb_speed max_speed, + enum ntb_width max_width) { struct intel_ntb_dev *ndev; u32 ntb_ctl; @@ -532,7 +531,7 @@ static int intel_ntb3_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, return 0; } -static int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, +int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, resource_size_t *db_size, u64 *db_data, int db_bit) { @@ -563,7 +562,7 @@ static int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, return 0; } -static int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits) +int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits) { struct intel_ntb_dev *ndev = ntb_ndev(ntb); int bit; @@ -581,7 +580,7 @@ static int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits) return 0; } -static u64 intel_ntb3_db_read(struct ntb_dev *ntb) +u64 intel_ntb3_db_read(struct ntb_dev *ntb) { struct intel_ntb_dev *ndev = ntb_ndev(ntb); @@ -590,7 +589,7 @@ static u64 intel_ntb3_db_read(struct ntb_dev *ntb) ndev->self_reg->db_clear); } -static int intel_ntb3_db_clear(struct ntb_dev *ntb, u64 db_bits) +int intel_ntb3_db_clear(struct ntb_dev *ntb, u64 db_bits) { struct intel_ntb_dev *ndev = ntb_ndev(ntb); diff --git a/drivers/ntb/hw/intel/ntb_hw_gen3.h b/drivers/ntb/hw/intel/ntb_hw_gen3.h index 75fb86ca27bb12dee577c00ae52a3f27e65b208b..2bc5d83560455e33d3884f92ae31e543bc66c63f 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen3.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen3.h @@ -104,6 +104,14 @@ static inline void gen3_db_iowrite(u64 bits, void __iomem *mmio) ssize_t ndev_ntb3_debugfs_read(struct file *filp, char __user *ubuf, size_t count, loff_t *offp); int gen3_init_dev(struct intel_ntb_dev *ndev); +int intel_ntb3_link_enable(struct ntb_dev *ntb, enum ntb_speed max_speed, + enum ntb_width max_width); +u64 intel_ntb3_db_read(struct ntb_dev *ntb); +int intel_ntb3_db_clear(struct ntb_dev *ntb, u64 db_bits); +int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits); +int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, + resource_size_t *db_size, + u64 *db_data, int db_bit); extern const struct ntb_dev_ops intel_ntb3_ops; diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.c b/drivers/ntb/hw/intel/ntb_hw_gen4.c new file mode 100644 index 0000000000000000000000000000000000000000..fede05151f6986db790f2566f79fcbebcf19eaaf --- /dev/null +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.c @@ -0,0 +1,577 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* Copyright(c) 2020 Intel Corporation. All rights reserved. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ntb_hw_intel.h" +#include "ntb_hw_gen1.h" +#include "ntb_hw_gen3.h" +#include "ntb_hw_gen4.h" + +static int gen4_poll_link(struct intel_ntb_dev *ndev); +static int gen4_link_is_up(struct intel_ntb_dev *ndev); + +static const struct intel_ntb_reg gen4_reg = { + .poll_link = gen4_poll_link, + .link_is_up = gen4_link_is_up, + .db_ioread = gen3_db_ioread, + .db_iowrite = gen3_db_iowrite, + .db_size = sizeof(u32), + .ntb_ctl = GEN4_NTBCNTL_OFFSET, + .mw_bar = {2, 4}, +}; + +static const struct intel_ntb_alt_reg gen4_pri_reg = { + .db_clear = GEN4_IM_INT_STATUS_OFFSET, + .db_mask = GEN4_IM_INT_DISABLE_OFFSET, + .spad = GEN4_IM_SPAD_OFFSET, +}; + +static const struct intel_ntb_xlat_reg gen4_sec_xlat = { + .bar2_limit = GEN4_IM23XLMT_OFFSET, + .bar2_xlat = GEN4_IM23XBASE_OFFSET, + .bar2_idx = GEN4_IM23XBASEIDX_OFFSET, +}; + +static const struct intel_ntb_alt_reg gen4_b2b_reg = { + .db_bell = GEN4_IM_DOORBELL_OFFSET, + .spad = GEN4_EM_SPAD_OFFSET, +}; + +static int gen4_poll_link(struct intel_ntb_dev *ndev) +{ + u16 reg_val; + + /* + * We need to write to the DLLSCS bit in the SLOTSTS register before + * we can clear the hardware link interrupt on ICX NTB. + */ + iowrite16(GEN4_SLOTSTS_DLLSCS, ndev->self_mmio + GEN4_SLOTSTS); + ndev->reg->db_iowrite(ndev->db_link_mask, + ndev->self_mmio + + ndev->self_reg->db_clear); + + reg_val = ioread16(ndev->self_mmio + GEN4_LINK_STATUS_OFFSET); + if (reg_val == ndev->lnk_sta) + return 0; + + ndev->lnk_sta = reg_val; + + return 1; +} + +static int gen4_link_is_up(struct intel_ntb_dev *ndev) +{ + return NTB_LNK_STA_ACTIVE(ndev->lnk_sta); +} + +static int gen4_init_isr(struct intel_ntb_dev *ndev) +{ + int i; + + /* + * The MSIX vectors and the interrupt status bits are not lined up + * on Gen3 (Skylake) and Gen4. The link status bit is bit 32, but by + * default it maps to MSIX vector 0. We need a fixup to line them up. + * The vectors at reset are 1-32,0; we need to reprogram + * them to 0-32.
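+ * + * Concretely, after the loop below interrupt status bit i fires MSIX vector i for i = 0..32, so the link status bit (32) lands on vector 32 instead of vector 0.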
+ */ + for (i = 0; i < GEN4_DB_MSIX_VECTOR_COUNT; i++) + iowrite8(i, ndev->self_mmio + GEN4_INTVEC_OFFSET + i); + + return ndev_init_isr(ndev, GEN4_DB_MSIX_VECTOR_COUNT, + GEN4_DB_MSIX_VECTOR_COUNT, + GEN4_DB_MSIX_VECTOR_SHIFT, + GEN4_DB_TOTAL_SHIFT); +} + +static int gen4_setup_b2b_mw(struct intel_ntb_dev *ndev, + const struct intel_b2b_addr *addr, + const struct intel_b2b_addr *peer_addr) +{ + struct pci_dev *pdev; + void __iomem *mmio; + phys_addr_t bar_addr; + + pdev = ndev->ntb.pdev; + mmio = ndev->self_mmio; + + /* setup incoming bar limits == base addrs (zero length windows) */ + bar_addr = addr->bar2_addr64; + iowrite64(bar_addr, mmio + GEN4_IM23XLMT_OFFSET); + bar_addr = ioread64(mmio + GEN4_IM23XLMT_OFFSET); + dev_dbg(&pdev->dev, "IM23XLMT %#018llx\n", bar_addr); + + bar_addr = addr->bar4_addr64; + iowrite64(bar_addr, mmio + GEN4_IM45XLMT_OFFSET); + bar_addr = ioread64(mmio + GEN4_IM45XLMT_OFFSET); + dev_dbg(&pdev->dev, "IM45XLMT %#018llx\n", bar_addr); + + /* zero incoming translation addrs */ + iowrite64(0, mmio + GEN4_IM23XBASE_OFFSET); + iowrite64(0, mmio + GEN4_IM45XBASE_OFFSET); + + ndev->peer_mmio = ndev->self_mmio; + + return 0; +} + +static int gen4_init_ntb(struct intel_ntb_dev *ndev) +{ + int rc; + + + ndev->mw_count = XEON_MW_COUNT; + ndev->spad_count = GEN4_SPAD_COUNT; + ndev->db_count = GEN4_DB_COUNT; + ndev->db_link_mask = GEN4_DB_LINK_BIT; + + ndev->self_reg = &gen4_pri_reg; + ndev->xlat_reg = &gen4_sec_xlat; + ndev->peer_reg = &gen4_b2b_reg; + + if (ndev->ntb.topo == NTB_TOPO_B2B_USD) + rc = gen4_setup_b2b_mw(ndev, &xeon_b2b_dsd_addr, + &xeon_b2b_usd_addr); + else + rc = gen4_setup_b2b_mw(ndev, &xeon_b2b_usd_addr, + &xeon_b2b_dsd_addr); + if (rc) + return rc; + + ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1; + + ndev->reg->db_iowrite(ndev->db_valid_mask, + ndev->self_mmio + + ndev->self_reg->db_mask); + + return 0; +} + +static enum ntb_topo gen4_ppd_topo(struct intel_ntb_dev *ndev, u32 ppd) +{ + switch (ppd & GEN4_PPD_TOPO_MASK) { + case GEN4_PPD_TOPO_B2B_USD: + return NTB_TOPO_B2B_USD; + case GEN4_PPD_TOPO_B2B_DSD: + return NTB_TOPO_B2B_DSD; + } + + return NTB_TOPO_NONE; +} + +int gen4_init_dev(struct intel_ntb_dev *ndev) +{ + struct pci_dev *pdev = ndev->ntb.pdev; + u32 ppd1/*, ppd0*/; + u16 lnkctl; + int rc; + + ndev->reg = &gen4_reg; + + if (pdev_is_ICX(pdev)) { + ndev->hwerr_flags |= NTB_HWERR_BAR_ALIGN; + ndev->hwerr_flags |= NTB_HWERR_LTR_BAD; + } + + ppd1 = ioread32(ndev->self_mmio + GEN4_PPD1_OFFSET); + ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); + dev_dbg(&pdev->dev, "ppd %#x topo %s\n", ppd1, + ntb_topo_string(ndev->ntb.topo)); + if (ndev->ntb.topo == NTB_TOPO_NONE) + return -EINVAL; + + rc = gen4_init_ntb(ndev); + if (rc) + return rc; + + /* init link setup */ + lnkctl = ioread16(ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + lnkctl |= GEN4_LINK_CTRL_LINK_DISABLE; + iowrite16(lnkctl, ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + + return gen4_init_isr(ndev); +} + +ssize_t ndev_ntb4_debugfs_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp) +{ + struct intel_ntb_dev *ndev; + void __iomem *mmio; + char *buf; + size_t buf_size; + ssize_t ret, off; + union { u64 v64; u32 v32; u16 v16; } u; + + ndev = filp->private_data; + mmio = ndev->self_mmio; + + buf_size = min(count, 0x800ul); + + buf = kmalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + off = 0; + + off += scnprintf(buf + off, buf_size - off, + "NTB Device Information:\n"); + + off += scnprintf(buf + off, buf_size - off, + "Connection Topology -\t%s\n", + 
ntb_topo_string(ndev->ntb.topo)); + + off += scnprintf(buf + off, buf_size - off, + "NTB CTL -\t\t%#06x\n", ndev->ntb_ctl); + off += scnprintf(buf + off, buf_size - off, + "LNK STA (cached) -\t\t%#06x\n", ndev->lnk_sta); + + if (!ndev->reg->link_is_up(ndev)) + off += scnprintf(buf + off, buf_size - off, + "Link Status -\t\tDown\n"); + else { + off += scnprintf(buf + off, buf_size - off, + "Link Status -\t\tUp\n"); + off += scnprintf(buf + off, buf_size - off, + "Link Speed -\t\tPCI-E Gen %u\n", + NTB_LNK_STA_SPEED(ndev->lnk_sta)); + off += scnprintf(buf + off, buf_size - off, + "Link Width -\t\tx%u\n", + NTB_LNK_STA_WIDTH(ndev->lnk_sta)); + } + + off += scnprintf(buf + off, buf_size - off, + "Memory Window Count -\t%u\n", ndev->mw_count); + off += scnprintf(buf + off, buf_size - off, + "Scratchpad Count -\t%u\n", ndev->spad_count); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Count -\t%u\n", ndev->db_count); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Vector Count -\t%u\n", ndev->db_vec_count); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Vector Shift -\t%u\n", ndev->db_vec_shift); + + off += scnprintf(buf + off, buf_size - off, + "Doorbell Valid Mask -\t%#llx\n", ndev->db_valid_mask); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Link Mask -\t%#llx\n", ndev->db_link_mask); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Mask Cached -\t%#llx\n", ndev->db_mask); + + u.v64 = ndev_db_read(ndev, mmio + ndev->self_reg->db_mask); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Mask -\t\t%#llx\n", u.v64); + + off += scnprintf(buf + off, buf_size - off, + "\nNTB Incoming XLAT:\n"); + + u.v64 = ioread64(mmio + GEN4_IM23XBASE_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "IM23XBASE -\t\t%#018llx\n", u.v64); + + u.v64 = ioread64(mmio + GEN4_IM45XBASE_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "IM45XBASE -\t\t%#018llx\n", u.v64); + + u.v64 = ioread64(mmio + GEN4_IM23XLMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "IM23XLMT -\t\t\t%#018llx\n", u.v64); + + u.v64 = ioread64(mmio + GEN4_IM45XLMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "IM45XLMT -\t\t\t%#018llx\n", u.v64); + + off += scnprintf(buf + off, buf_size - off, + "\nNTB Statistics:\n"); + + off += scnprintf(buf + off, buf_size - off, + "\nNTB Hardware Errors:\n"); + + if (!pci_read_config_word(ndev->ntb.pdev, + GEN4_DEVSTS_OFFSET, &u.v16)) + off += scnprintf(buf + off, buf_size - off, + "DEVSTS -\t\t%#06x\n", u.v16); + + u.v16 = ioread16(mmio + GEN4_LINK_STATUS_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "LNKSTS -\t\t%#06x\n", u.v16); + + if (!pci_read_config_dword(ndev->ntb.pdev, + GEN4_UNCERRSTS_OFFSET, &u.v32)) + off += scnprintf(buf + off, buf_size - off, + "UNCERRSTS -\t\t%#06x\n", u.v32); + + if (!pci_read_config_dword(ndev->ntb.pdev, + GEN4_CORERRSTS_OFFSET, &u.v32)) + off += scnprintf(buf + off, buf_size - off, + "CORERRSTS -\t\t%#06x\n", u.v32); + + ret = simple_read_from_buffer(ubuf, count, offp, buf, off); + kfree(buf); + return ret; +} + +static int intel_ntb4_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, + dma_addr_t addr, resource_size_t size) +{ + struct intel_ntb_dev *ndev = ntb_ndev(ntb); + unsigned long xlat_reg, limit_reg, idx_reg; + unsigned short base_idx, reg_val16; + resource_size_t bar_size, mw_size; + void __iomem *mmio; + u64 base, limit, reg_val; + int bar; + + if (pidx != NTB_DEF_PEER_IDX) + return -EINVAL; + + if (idx >= ndev->b2b_idx && !ndev->b2b_off) + idx += 1; + + bar 
= ndev_mw_to_bar(ndev, idx); + if (bar < 0) + return bar; + + bar_size = pci_resource_len(ndev->ntb.pdev, bar); + + if (idx == ndev->b2b_idx) + mw_size = bar_size - ndev->b2b_off; + else + mw_size = bar_size; + + if (ndev->hwerr_flags & NTB_HWERR_BAR_ALIGN) { + /* the hardware requires that addr be aligned to the BAR size */ + if (addr & (bar_size - 1)) + return -EINVAL; + } else { + if (addr & (PAGE_SIZE - 1)) + return -EINVAL; + } + + /* make sure the range fits in the usable mw size */ + if (size > mw_size) + return -EINVAL; + + mmio = ndev->self_mmio; + xlat_reg = ndev->xlat_reg->bar2_xlat + (idx * 0x10); + limit_reg = ndev->xlat_reg->bar2_limit + (idx * 0x10); + base = pci_resource_start(ndev->ntb.pdev, bar); + + /* Set the limit if supported and size is not mw_size */ + if (limit_reg && size != mw_size) { + limit = base + size; + base_idx = __ilog2_u64(size); + } else { + limit = base + mw_size; + base_idx = __ilog2_u64(mw_size); + } + + /* set and verify setting the translation address */ + iowrite64(addr, mmio + xlat_reg); + reg_val = ioread64(mmio + xlat_reg); + if (reg_val != addr) { + iowrite64(0, mmio + xlat_reg); + return -EIO; + } + + dev_dbg(&ntb->pdev->dev, "BAR %d IMXBASE: %#Lx\n", bar, reg_val); + + /* set and verify setting the limit */ + iowrite64(limit, mmio + limit_reg); + reg_val = ioread64(mmio + limit_reg); + if (reg_val != limit) { + iowrite64(base, mmio + limit_reg); + iowrite64(0, mmio + xlat_reg); + return -EIO; + } + + dev_dbg(&ntb->pdev->dev, "BAR %d IMXLMT: %#Lx\n", bar, reg_val); + + if (ndev->hwerr_flags & NTB_HWERR_BAR_ALIGN) { + idx_reg = ndev->xlat_reg->bar2_idx + (idx * 0x2); + iowrite16(base_idx, mmio + idx_reg); + reg_val16 = ioread16(mmio + idx_reg); + if (reg_val16 != base_idx) { + iowrite64(base, mmio + limit_reg); + iowrite64(0, mmio + xlat_reg); + iowrite16(0, mmio + idx_reg); + return -EIO; + } + dev_dbg(&ntb->pdev->dev, "BAR %d IMBASEIDX: %#x\n", bar, reg_val16); + } + + return 0; +} + +static int intel_ntb4_link_enable(struct ntb_dev *ntb, + enum ntb_speed max_speed, enum ntb_width max_width) +{ + struct intel_ntb_dev *ndev; + u32 ntb_ctl, ppd0; + u16 lnkctl; + + ndev = container_of(ntb, struct intel_ntb_dev, ntb); + + dev_dbg(&ntb->pdev->dev, + "Enabling link with max_speed %d max_width %d\n", + max_speed, max_width); + + if (max_speed != NTB_SPEED_AUTO) + dev_dbg(&ntb->pdev->dev, + "ignoring max_speed %d\n", max_speed); + if (max_width != NTB_WIDTH_AUTO) + dev_dbg(&ntb->pdev->dev, + "ignoring max_width %d\n", max_width); + + if (!(ndev->hwerr_flags & NTB_HWERR_LTR_BAD)) { + u32 ltr; + + /* Setup active snoop LTR values */ + ltr = NTB_LTR_ACTIVE_REQMNT | NTB_LTR_ACTIVE_VAL | NTB_LTR_ACTIVE_LATSCALE; + /* Setup active non-snoop values */ + ltr = (ltr << NTB_LTR_NS_SHIFT) | ltr; + iowrite32(ltr, ndev->self_mmio + GEN4_LTR_ACTIVE_OFFSET); + + /* Setup idle snoop LTR values */ + ltr = NTB_LTR_IDLE_VAL | NTB_LTR_IDLE_LATSCALE | NTB_LTR_IDLE_REQMNT; + /* Setup idle non-snoop values */ + ltr = (ltr << NTB_LTR_NS_SHIFT) | ltr; + iowrite32(ltr, ndev->self_mmio + GEN4_LTR_IDLE_OFFSET); + + /* setup PCIe LTR to active */ + iowrite8(NTB_LTR_SWSEL_ACTIVE, ndev->self_mmio + GEN4_LTR_SWSEL_OFFSET); + } + + ntb_ctl = NTB_CTL_E2I_BAR23_SNOOP | NTB_CTL_I2E_BAR23_SNOOP; + ntb_ctl |= NTB_CTL_E2I_BAR45_SNOOP | NTB_CTL_I2E_BAR45_SNOOP; + iowrite32(ntb_ctl, ndev->self_mmio + ndev->reg->ntb_ctl); + + lnkctl = ioread16(ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + lnkctl &= ~GEN4_LINK_CTRL_LINK_DISABLE; + iowrite16(lnkctl, ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + + /*
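+ * The GEN4_PPD_LINKTRN write below is verified by an immediate read-back; if the bit did not stick, the function warns and returns -ENXIO. + *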
start link training in PPD0 */ + ppd0 = ioread32(ndev->self_mmio + GEN4_PPD0_OFFSET); + ppd0 |= GEN4_PPD_LINKTRN; + iowrite32(ppd0, ndev->self_mmio + GEN4_PPD0_OFFSET); + + /* make sure link training has started */ + ppd0 = ioread32(ndev->self_mmio + GEN4_PPD0_OFFSET); + if (!(ppd0 & GEN4_PPD_LINKTRN)) { + dev_warn(&ntb->pdev->dev, "Link is not training\n"); + return -ENXIO; + } + + ndev->dev_up = 1; + + return 0; +} + +static int intel_ntb4_link_disable(struct ntb_dev *ntb) +{ + struct intel_ntb_dev *ndev; + u32 ntb_cntl; + u16 lnkctl; + + ndev = container_of(ntb, struct intel_ntb_dev, ntb); + + dev_dbg(&ntb->pdev->dev, "Disabling link\n"); + + /* clear the snoop bits */ + ntb_cntl = ioread32(ndev->self_mmio + ndev->reg->ntb_ctl); + ntb_cntl &= ~(NTB_CTL_E2I_BAR23_SNOOP | NTB_CTL_I2E_BAR23_SNOOP); + ntb_cntl &= ~(NTB_CTL_E2I_BAR45_SNOOP | NTB_CTL_I2E_BAR45_SNOOP); + iowrite32(ntb_cntl, ndev->self_mmio + ndev->reg->ntb_ctl); + + lnkctl = ioread16(ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + lnkctl |= GEN4_LINK_CTRL_LINK_DISABLE; + iowrite16(lnkctl, ndev->self_mmio + GEN4_LINK_CTRL_OFFSET); + + /* set LTR to idle */ + if (!(ndev->hwerr_flags & NTB_HWERR_LTR_BAD)) + iowrite8(NTB_LTR_SWSEL_IDLE, ndev->self_mmio + GEN4_LTR_SWSEL_OFFSET); + + ndev->dev_up = 0; + + return 0; +} + +static int intel_ntb4_mw_get_align(struct ntb_dev *ntb, int pidx, int idx, + resource_size_t *addr_align, + resource_size_t *size_align, + resource_size_t *size_max) +{ + struct intel_ntb_dev *ndev = ntb_ndev(ntb); + resource_size_t bar_size, mw_size; + int bar; + + if (pidx != NTB_DEF_PEER_IDX) + return -EINVAL; + + if (idx >= ndev->b2b_idx && !ndev->b2b_off) + idx += 1; + + bar = ndev_mw_to_bar(ndev, idx); + if (bar < 0) + return bar; + + bar_size = pci_resource_len(ndev->ntb.pdev, bar); + + if (idx == ndev->b2b_idx) + mw_size = bar_size - ndev->b2b_off; + else + mw_size = bar_size; + + if (addr_align) { + if (ndev->hwerr_flags & NTB_HWERR_BAR_ALIGN) + *addr_align = pci_resource_len(ndev->ntb.pdev, bar); + else + *addr_align = PAGE_SIZE; + } + + if (size_align) + *size_align = 1; + + if (size_max) + *size_max = mw_size; + + return 0; +} + +const struct ntb_dev_ops intel_ntb4_ops = { + .mw_count = intel_ntb_mw_count, + .mw_get_align = intel_ntb4_mw_get_align, + .mw_set_trans = intel_ntb4_mw_set_trans, + .peer_mw_count = intel_ntb_peer_mw_count, + .peer_mw_get_addr = intel_ntb_peer_mw_get_addr, + .link_is_up = intel_ntb_link_is_up, + .link_enable = intel_ntb4_link_enable, + .link_disable = intel_ntb4_link_disable, + .db_valid_mask = intel_ntb_db_valid_mask, + .db_vector_count = intel_ntb_db_vector_count, + .db_vector_mask = intel_ntb_db_vector_mask, + .db_read = intel_ntb3_db_read, + .db_clear = intel_ntb3_db_clear, + .db_set_mask = intel_ntb_db_set_mask, + .db_clear_mask = intel_ntb_db_clear_mask, + .peer_db_addr = intel_ntb3_peer_db_addr, + .peer_db_set = intel_ntb3_peer_db_set, + .spad_is_unsafe = intel_ntb_spad_is_unsafe, + .spad_count = intel_ntb_spad_count, + .spad_read = intel_ntb_spad_read, + .spad_write = intel_ntb_spad_write, + .peer_spad_addr = intel_ntb_peer_spad_addr, + .peer_spad_read = intel_ntb_peer_spad_read, + .peer_spad_write = intel_ntb_peer_spad_write, +}; + diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.h b/drivers/ntb/hw/intel/ntb_hw_gen4.h new file mode 100644 index 0000000000000000000000000000000000000000..3fcd3fdce9edfbc6efe6b80d52cad7ed811c0d1d --- /dev/null +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* Copyright(c) 2020 
Intel Corporation. All rights reserved. */ +#ifndef _NTB_INTEL_GEN4_H_ +#define _NTB_INTEL_GEN4_H_ + +#include "ntb_hw_intel.h" + +/* Supported PCI device revision range for ICX */ +#define PCI_DEVICE_REVISION_ICX_MIN 0x2 +#define PCI_DEVICE_REVISION_ICX_MAX 0xF + +/* Intel Gen4 NTB hardware */ +/* PCIe config space */ +#define GEN4_IMBAR23SZ_OFFSET 0x00c4 +#define GEN4_IMBAR45SZ_OFFSET 0x00c5 +#define GEN4_EMBAR23SZ_OFFSET 0x00c6 +#define GEN4_EMBAR45SZ_OFFSET 0x00c7 +#define GEN4_DEVCTRL_OFFSET 0x0048 +#define GEN4_DEVSTS_OFFSET 0x004a +#define GEN4_UNCERRSTS_OFFSET 0x0104 +#define GEN4_CORERRSTS_OFFSET 0x0110 + +/* BAR0 MMIO */ +#define GEN4_NTBCNTL_OFFSET 0x0000 +#define GEN4_IM23XBASE_OFFSET 0x0010 /* IMBAR1XBASE */ +#define GEN4_IM23XLMT_OFFSET 0x0018 /* IMBAR1XLMT */ +#define GEN4_IM45XBASE_OFFSET 0x0020 /* IMBAR2XBASE */ +#define GEN4_IM45XLMT_OFFSET 0x0028 /* IMBAR2XLMT */ +#define GEN4_IM_INT_STATUS_OFFSET 0x0040 +#define GEN4_IM_INT_DISABLE_OFFSET 0x0048 +#define GEN4_INTVEC_OFFSET 0x0050 /* 0-32 vecs */ +#define GEN4_IM23XBASEIDX_OFFSET 0x0074 +#define GEN4_IM45XBASEIDX_OFFSET 0x0076 +#define GEN4_IM_SPAD_OFFSET 0x0080 /* 0-15 SPADs */ +#define GEN4_IM_SPAD_SEM_OFFSET 0x00c0 /* SPAD hw semaphore */ +#define GEN4_IM_SPAD_STICKY_OFFSET 0x00c4 /* sticky SPAD */ +#define GEN4_IM_DOORBELL_OFFSET 0x0100 /* 0-31 doorbells */ +#define GEN4_LTR_SWSEL_OFFSET 0x30ec +#define GEN4_LTR_ACTIVE_OFFSET 0x30f0 +#define GEN4_LTR_IDLE_OFFSET 0x30f4 +#define GEN4_EM_SPAD_OFFSET 0x8080 +/* note, link status is now in MMIO and not config space for NTB */ +#define GEN4_LINK_CTRL_OFFSET 0xb050 +#define GEN4_LINK_STATUS_OFFSET 0xb052 +#define GEN4_PPD0_OFFSET 0xb0d4 +#define GEN4_PPD1_OFFSET 0xb4c0 +#define GEN4_LTSSMSTATEJMP 0xf040 + +#define GEN4_PPD_CLEAR_TRN 0x0001 +#define GEN4_PPD_LINKTRN 0x0008 +#define GEN4_PPD_CONN_MASK 0x0300 +#define GEN4_PPD_CONN_B2B 0x0200 +#define GEN4_PPD_DEV_MASK 0x1000 +#define GEN4_PPD_DEV_DSD 0x1000 +#define GEN4_PPD_DEV_USD 0x0000 +#define GEN4_LINK_CTRL_LINK_DISABLE 0x0010 + +#define GEN4_SLOTSTS 0xb05a +#define GEN4_SLOTSTS_DLLSCS 0x100 + +#define GEN4_PPD_TOPO_MASK (GEN4_PPD_CONN_MASK | GEN4_PPD_DEV_MASK) +#define GEN4_PPD_TOPO_B2B_USD (GEN4_PPD_CONN_B2B | GEN4_PPD_DEV_USD) +#define GEN4_PPD_TOPO_B2B_DSD (GEN4_PPD_CONN_B2B | GEN4_PPD_DEV_DSD) + +#define GEN4_DB_COUNT 32 +#define GEN4_DB_LINK 32 +#define GEN4_DB_LINK_BIT BIT_ULL(GEN4_DB_LINK) +#define GEN4_DB_MSIX_VECTOR_COUNT 33 +#define GEN4_DB_MSIX_VECTOR_SHIFT 1 +#define GEN4_DB_TOTAL_SHIFT 33 +#define GEN4_SPAD_COUNT 16 + +#define NTB_CTL_E2I_BAR23_SNOOP 0x000004 +#define NTB_CTL_E2I_BAR23_NOSNOOP 0x000008 +#define NTB_CTL_I2E_BAR23_SNOOP 0x000010 +#define NTB_CTL_I2E_BAR23_NOSNOOP 0x000020 +#define NTB_CTL_E2I_BAR45_SNOOP 0x000040 +#define NTB_CTL_E2I_BAR45_NOSNOOP 0x000080 +#define NTB_CTL_I2E_BAR45_SNOOP 0x000100 +#define NTB_CTL_I2E_BAR45_NOSNOOP 0x000200 +#define NTB_CTL_BUSNO_DIS_INC 0x000400 +#define NTB_CTL_LINK_DOWN 0x010000 + +#define NTB_SJC_FORCEDETECT 0x000004 + +#define NTB_LTR_SWSEL_ACTIVE 0x0 +#define NTB_LTR_SWSEL_IDLE 0x1 + +#define NTB_LTR_NS_SHIFT 16 +#define NTB_LTR_ACTIVE_VAL 0x0000 /* 0 us */ +#define NTB_LTR_ACTIVE_LATSCALE 0x0800 /* 1us scale */ +#define NTB_LTR_ACTIVE_REQMNT 0x8000 /* snoop req enable */ + +#define NTB_LTR_IDLE_VAL 0x0258 /* 600 us */ +#define NTB_LTR_IDLE_LATSCALE 0x0800 /* 1us scale */ +#define NTB_LTR_IDLE_REQMNT 0x8000 /* snoop req enable */ + +ssize_t ndev_ntb4_debugfs_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp); +int
gen4_init_dev(struct intel_ntb_dev *ndev); + +extern const struct ntb_dev_ops intel_ntb4_ops; + +static inline int pdev_is_ICX(struct pci_dev *pdev) +{ + if (pdev_is_gen4(pdev) && + pdev->revision >= PCI_DEVICE_REVISION_ICX_MIN && + pdev->revision <= PCI_DEVICE_REVISION_ICX_MAX) + return 1; + return 0; +} + +#endif diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h index e071e28bca3f59cc7f9c9b18443ab757d9dfe3f3..d61fcd91714bbc0dd28a284275f8dde2d746c1cf 100644 --- a/drivers/ntb/hw/intel/ntb_hw_intel.h +++ b/drivers/ntb/hw/intel/ntb_hw_intel.h @@ -72,6 +72,7 @@ #define PCI_DEVICE_ID_INTEL_NTB_PS_BDX 0x6F0E #define PCI_DEVICE_ID_INTEL_NTB_SS_BDX 0x6F0F #define PCI_DEVICE_ID_INTEL_NTB_B2B_SKX 0x201C +#define PCI_DEVICE_ID_INTEL_NTB_B2B_ICX 0x347e /* Ntb control and link status */ #define NTB_CTL_CFG_LOCK BIT(0) @@ -120,6 +121,7 @@ struct intel_ntb_xlat_reg { unsigned long bar0_base; unsigned long bar2_xlat; unsigned long bar2_limit; + unsigned short bar2_idx; }; struct intel_b2b_addr { @@ -182,6 +184,9 @@ struct intel_ntb_dev { struct dentry *debugfs_dir; struct dentry *debugfs_info; + + /* gen4 entries */ + int dev_up; }; #define ntb_ndev(__ntb) container_of(__ntb, struct intel_ntb_dev, ntb) @@ -219,4 +224,11 @@ static inline int pdev_is_gen3(struct pci_dev *pdev) return 0; } +static inline int pdev_is_gen4(struct pci_dev *pdev) +{ + if (pdev->device == PCI_DEVICE_ID_INTEL_NTB_B2B_ICX) + return 1; + + return 0; +} #endif diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 36af7af6b7cfee351e6e8f9e39d895726bd736dd..b7d1eb38b27d464f15b610908ae1a7d8ac39b6a3 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -4,6 +4,7 @@ menuconfig LIBNVDIMM depends on PHYS_ADDR_T_64BIT depends on HAS_IOMEM depends on BLK_DEV + select MEMREGION help Generic support for non-volatile memory devices including ACPI-6-NFIT defined resources. On platforms that define an diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 9204f1e9fd1414d8650bb873969d0c3175b46f24..e592c49646748ca7a38c98eded31a902a18e1cd7 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -455,7 +455,6 @@ static __exit void libnvdimm_exit(void) nd_region_exit(); nvdimm_exit(); nvdimm_bus_exit(); - nd_region_devs_exit(); nvdimm_devs_exit(); } diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 25fa121104d04363e35dcad09e7de9a4d57e7956..aa059439fca08181fe4550dc0be097ea2b582ac3 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -114,7 +114,6 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); void nvdimm_devs_exit(void); -void nd_region_devs_exit(void); struct nd_region; void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev); void nd_region_create_ns_seed(struct nd_region *nd_region); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index b8236a9e8750d02adb3c8b7a55c629e7aaf619f1..c28974d1417009b7f5763f64eca89d99a9df8cb3 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -3,6 +3,7 @@ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
*/ #include +#include #include #include #include @@ -19,7 +20,6 @@ */ #include -static DEFINE_IDA(region_ida); static DEFINE_PER_CPU(int, flush_idx); static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm, @@ -133,7 +133,7 @@ static void nd_region_release(struct device *dev) put_device(&nvdimm->dev); } free_percpu(nd_region->lane); - ida_simple_remove(®ion_ida, nd_region->id); + memregion_free(nd_region->id); if (is_nd_blk(dev)) kfree(to_nd_blk_region(dev)); else @@ -985,7 +985,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, if (!region_buf) return NULL; - nd_region->id = ida_simple_get(®ion_ida, 0, 0, GFP_KERNEL); + nd_region->id = memregion_alloc(GFP_KERNEL); if (nd_region->id < 0) goto err_id; @@ -1044,7 +1044,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, return nd_region; err_percpu: - ida_simple_remove(®ion_ida, nd_region->id); + memregion_free(nd_region->id); err_id: kfree(region_buf); return NULL; @@ -1221,8 +1221,3 @@ int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, return device_for_each_child(&nvdimm_bus->dev, &ctx, region_conflict); } - -void __exit nd_region_devs_exit(void) -{ - ida_destroy(®ion_ida); -} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1ab962f4db33fb1cc2d64b006cdb1b6e706f0fc8..78998c6b84e379948225709a8260d462b7037fc8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -28,6 +28,14 @@ extern unsigned int admin_timeout; #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 +/* + * Default to a 4K page size, with the intention to update this + * path in the future to accommodate architectures with differing + * kernel and IO page sizes. + */ +#define NVME_CTRL_PAGE_SHIFT 12 +#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT) + extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6d25aa4e3622df10d276f9c3e3d413c41e690e2a..4bc498fae1f685cb1e7da5ac8403b05cfca30266 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2696,6 +2696,7 @@ static void nvme_reset_work(struct work_struct *work) * Don't limit the IOMMU merged segment size. */ dma_set_max_seg_size(dev->dev, 0xffffffff); + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); mutex_unlock(&dev->shutdown_lock); diff --git a/drivers/of/device.c b/drivers/of/device.c index da81583920103f288e691d36f5c92f4709afae58..e9127db7b06761b1da4cafe68be72b2719324293 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -93,7 +93,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) bool coherent; unsigned long offset; const struct iommu_ops *iommu; - u64 mask; + u64 mask, end; ret = of_dma_get_range(np, &dma_addr, &paddr, &size); if (ret < 0) { @@ -148,12 +148,13 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) * Limit coherent and dma mask based on size and default mask * set by the driver. 
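 * For example: dma_addr 0x40000000 with size 0x80000000 gives end 0xbfffffff, so ilog2(end) is 31 and the mask below is DMA_BIT_MASK(32); the bus_dma_limit, by contrast, records the exact end address.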
*/ - mask = DMA_BIT_MASK(ilog2(dma_addr + size - 1) + 1); + end = dma_addr + size - 1; + mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; - /* ...but only set bus mask if we found valid dma-ranges earlier */ + /* ...but only set bus limit if we found valid dma-ranges earlier */ if (!ret) - dev->bus_dma_mask = mask; + dev->bus_dma_limit = end; coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", diff --git a/drivers/of/irq.c b/drivers/of/irq.c index a296eaf52a5b23c377349b59688ffa62ec66db4d..48a40326984f1dc02a0a769276e72eeb4fb46135 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -613,18 +613,20 @@ u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in) * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain * @dev: device for which the mapping is to be done. - * @rid: Requester ID for the device. + * @id: Device ID (such as a Requester ID) for the device. + * @bus_token: Bus token * * Walk up the device hierarchy looking for devices with a "msi-map" * property. * * Returns: the MSI domain for this device (or NULL on failure) */ -struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid) +struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, + u32 bus_token) { struct device_node *np = NULL; - __of_msi_map_rid(dev, &np, rid); - return irq_find_matching_host(np, DOMAIN_BUS_PCI_MSI); + __of_msi_map_rid(dev, &np, id); + return irq_find_matching_host(np, bus_token); } /** diff --git a/drivers/of/property.c b/drivers/of/property.c index d7fa75e31f22415c447f05fa99437e1591cb0a1a..e8202f61a5d930996ec2af0e43f3017bf22a13bb 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -872,6 +872,20 @@ of_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, of_property_count_strings(node, propname); } +static const char *of_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + return kbasename(to_of_node(fwnode)->full_name); +} + +static const char *of_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) +{ + /* Root needs no prefix here (its name is "/").
*/ + if (!to_of_node(fwnode)->parent) + return ""; + + return "/"; +} + static struct fwnode_handle * of_fwnode_get_parent(const struct fwnode_handle *fwnode) { @@ -993,6 +1007,8 @@ const struct fwnode_operations of_fwnode_ops = { .property_present = of_fwnode_property_present, .property_read_int_array = of_fwnode_property_read_int_array, .property_read_string_array = of_fwnode_property_read_string_array, + .get_name = of_fwnode_get_name, + .get_name_prefix = of_fwnode_get_name_prefix, .get_parent = of_fwnode_get_parent, .get_next_child_node = of_fwnode_get_next_child_node, .get_named_child_node = of_fwnode_get_named_child_node, diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index ad290f79983b940fa4a49dfb479eb71eed681f87..d1eaa0dc89a31cb3428fb5f31ded72f1cd22e6b1 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1025,6 +1025,8 @@ static const struct dma_map_ops ccio_ops = { .map_sg = ccio_map_sg, .unmap_sg = ccio_unmap_sg, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; #ifdef CONFIG_PROC_FS diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index e410033b6df0cc38edba7e53011f37f06265e602..9a7c0b81f998e2c246da1d8f3e57d3ef07dbc55b 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1077,6 +1077,8 @@ static const struct dma_map_ops sba_ops = { .map_sg = sba_map_sg, .unmap_sg = sba_unmap_sg, .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index a304f5ea11b90b2cffe1b2eec9cc605dc14b1d34..52a3d54f74c371c5a41d5926b099f3ee2374bafe 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -56,6 +56,9 @@ config PCI_MSI_IRQ_DOMAIN depends on PCI_MSI select GENERIC_MSI_IRQ_DOMAIN +config PCI_MSI_ARCH_FALLBACKS + bool + config PCI_QUIRKS default y bool "Enable PCI quirk workarounds" if EXPERT diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 28cdd8c0213ac347f834a4b819c6c4ab0a2cf08d..b14e23004690cb2f39bf1cc5960aa2916b792d13 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_PCI) += access.o bus.o probe.o host-bridge.o \ remove.o pci.o pci-driver.o search.o \ pci-sysfs.o rom.o setup-res.o irq.o vpd.o \ - setup-bus.o vc.o mmap.o setup-irq.o + setup-bus.o vc.o mmap.o setup-irq.o msi.o ifdef CONFIG_PCI obj-$(CONFIG_PROC_FS) += proc.o @@ -17,7 +17,6 @@ obj-$(CONFIG_OF) += of.o obj-$(CONFIG_PCI_QUIRKS) += quirks.o obj-$(CONFIG_PCIEPORTBUS) += pcie/ obj-$(CONFIG_HOTPLUG_PCI) += hotplug/ -obj-$(CONFIG_PCI_MSI) += msi.o obj-$(CONFIG_PCI_ATS) += ats.o obj-$(CONFIG_PCI_IOV) += iov.o obj-$(CONFIG_PCI_BRIDGE_EMUL) += pci-bridge-emul.o diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index e18499243f84ac674ac827a1fa31a29ee8f524d3..b5955577fe829f40bf1702ff7e57b4ba050c77db 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -30,6 +30,22 @@ void pci_ats_init(struct pci_dev *dev) dev->ats_cap = pos; } +/** + * pci_ats_supported - check if the device can use ATS + * @dev: the PCI device + * + * Returns true if the device supports ATS and is allowed to use it, false + * otherwise. 
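+ * + * A typical caller pattern (illustrative; the flag name is ours): + * + * if (pci_ats_supported(pdev) && !pci_enable_ats(pdev, PAGE_SHIFT)) + * ats_enabled = true;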
+ */ +bool pci_ats_supported(struct pci_dev *dev) +{ + if (!dev->ats_cap) + return false; + + return (dev->untrusted == 0); +} +EXPORT_SYMBOL_GPL(pci_ats_supported); + /** * pci_enable_ats - enable the ATS capability * @dev: the PCI device @@ -42,7 +58,7 @@ int pci_enable_ats(struct pci_dev *dev, int ps) u16 ctrl; struct pci_dev *pdev; - if (!dev->ats_cap) + if (!pci_ats_supported(dev)) return -EINVAL; if (WARN_ON(dev->ats_enabled)) @@ -60,8 +76,6 @@ int pci_enable_ats(struct pci_dev *dev, int ps) pdev = pci_physfn(dev); if (pdev->ats_stu != ps) return -EINVAL; - - atomic_inc(&pdev->ats_ref_cnt); /* count enabled VFs */ } else { dev->ats_stu = ps; ctrl |= PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU); @@ -79,20 +93,11 @@ EXPORT_SYMBOL_GPL(pci_enable_ats); */ void pci_disable_ats(struct pci_dev *dev) { - struct pci_dev *pdev; u16 ctrl; if (WARN_ON(!dev->ats_enabled)) return; - if (atomic_read(&dev->ats_ref_cnt)) - return; /* VFs still enabled */ - - if (dev->is_virtfn) { - pdev = pci_physfn(dev); - atomic_dec(&pdev->ats_ref_cnt); - } - pci_read_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, &ctrl); ctrl &= ~PCI_ATS_CTRL_ENABLE; pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl); @@ -170,6 +175,20 @@ int pci_ats_page_aligned(struct pci_dev *pdev) EXPORT_SYMBOL_GPL(pci_ats_page_aligned); #ifdef CONFIG_PCI_PRI +void pci_pri_init(struct pci_dev *pdev) +{ + u16 status; + + pdev->pri_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + + if (!pdev->pri_cap) + return; + + pci_read_config_word(pdev, pdev->pri_cap + PCI_PRI_STATUS, &status); + if (status & PCI_PRI_STATUS_PASID) + pdev->pasid_required = 1; +} + /** * pci_enable_pri - Enable PRI capability * @pdev: PCI device structure @@ -180,26 +199,36 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs) { u16 control, status; u32 max_requests; - int pos; + int pri = pdev->pri_cap; + + /* + * VFs must not implement the PRI Capability. If their PF + * implements PRI, it is shared by the VFs, so if the PF PRI is + * enabled, it is also enabled for the VF.
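+ * + * Consequently, pci_enable_pri() on a VF returns 0 without touching config space when the PF already has PRI enabled, and -EINVAL otherwise.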
+ */ + if (pdev->is_virtfn) { + if (pci_physfn(pdev)->pri_enabled) + return 0; + return -EINVAL; + } if (WARN_ON(pdev->pri_enabled)) return -EBUSY; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + if (!pri) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + pci_read_config_word(pdev, pri + PCI_PRI_STATUS, &status); if (!(status & PCI_PRI_STATUS_STOPPED)) return -EBUSY; - pci_read_config_dword(pdev, pos + PCI_PRI_MAX_REQ, &max_requests); + pci_read_config_dword(pdev, pri + PCI_PRI_MAX_REQ, &max_requests); reqs = min(max_requests, reqs); pdev->pri_reqs_alloc = reqs; - pci_write_config_dword(pdev, pos + PCI_PRI_ALLOC_REQ, reqs); + pci_write_config_dword(pdev, pri + PCI_PRI_ALLOC_REQ, reqs); control = PCI_PRI_CTRL_ENABLE; - pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + pci_write_config_word(pdev, pri + PCI_PRI_CTRL, control); pdev->pri_enabled = 1; @@ -216,18 +245,21 @@ EXPORT_SYMBOL_GPL(pci_enable_pri); void pci_disable_pri(struct pci_dev *pdev) { u16 control; - int pos; + int pri = pdev->pri_cap; + + /* VFs share the PF PRI */ + if (pdev->is_virtfn) + return; if (WARN_ON(!pdev->pri_enabled)) return; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + if (!pri) return; - pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + pci_read_config_word(pdev, pri + PCI_PRI_CTRL, &control); control &= ~PCI_PRI_CTRL_ENABLE; - pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + pci_write_config_word(pdev, pri + PCI_PRI_CTRL, control); pdev->pri_enabled = 0; } @@ -241,17 +273,19 @@ void pci_restore_pri_state(struct pci_dev *pdev) { u16 control = PCI_PRI_CTRL_ENABLE; u32 reqs = pdev->pri_reqs_alloc; - int pos; + int pri = pdev->pri_cap; + + if (pdev->is_virtfn) + return; if (!pdev->pri_enabled) return; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + if (!pri) return; - pci_write_config_dword(pdev, pos + PCI_PRI_ALLOC_REQ, reqs); - pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + pci_write_config_dword(pdev, pri + PCI_PRI_ALLOC_REQ, reqs); + pci_write_config_word(pdev, pri + PCI_PRI_CTRL, control); } EXPORT_SYMBOL_GPL(pci_restore_pri_state); @@ -265,24 +299,63 @@ EXPORT_SYMBOL_GPL(pci_restore_pri_state); int pci_reset_pri(struct pci_dev *pdev) { u16 control; - int pos; + int pri = pdev->pri_cap; + + if (pdev->is_virtfn) + return 0; if (WARN_ON(pdev->pri_enabled)) return -EBUSY; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) + if (!pri) return -EINVAL; control = PCI_PRI_CTRL_RESET; - pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + pci_write_config_word(pdev, pri + PCI_PRI_CTRL, control); return 0; } EXPORT_SYMBOL_GPL(pci_reset_pri); + +/** + * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit + * status. + * @pdev: PCI device structure + * + * Returns 1 if PASID is required in PRG Response Message, 0 otherwise. + */ +int pci_prg_resp_pasid_required(struct pci_dev *pdev) +{ + + if (pdev->is_virtfn) + pdev = pci_physfn(pdev); + + return pdev->pasid_required; +} +EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required); + +/** + * pci_pri_supported - Check if PRI is supported. + * @pdev: PCI device structure + * + * Returns true if PRI capability is present, false otherwise. 
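+ * + * For a VF this reports whether its PF has the PRI capability, via the pci_physfn() indirection below, since VFs share the PF PRI.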
+ */ +bool pci_pri_supported(struct pci_dev *pdev) +{ + /* VFs share the PF PRI */ + if (pci_physfn(pdev)->pri_cap) + return true; + return false; +} +EXPORT_SYMBOL_GPL(pci_pri_supported); #endif /* CONFIG_PCI_PRI */ #ifdef CONFIG_PCI_PASID +void pci_pasid_init(struct pci_dev *pdev) +{ + pdev->pasid_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); +} + /** * pci_enable_pasid - Enable the PASID capability * @pdev: PCI device structure @@ -295,7 +368,17 @@ EXPORT_SYMBOL_GPL(pci_reset_pri); int pci_enable_pasid(struct pci_dev *pdev, int features) { u16 control, supported; - int pos; + int pasid = pdev->pasid_cap; + + /* + * VFs must not implement the PASID Capability, but if a PF + * supports PASID, its VFs share the PF PASID configuration. + */ + if (pdev->is_virtfn) { + if (pci_physfn(pdev)->pasid_enabled) + return 0; + return -EINVAL; + } if (WARN_ON(pdev->pasid_enabled)) return -EBUSY; @@ -303,11 +386,10 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) if (!pdev->eetlp_prefix_path) return -EINVAL; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (!pasid) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); + pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); supported &= PCI_PASID_CAP_EXEC | PCI_PASID_CAP_PRIV; /* User wants to enable anything unsupported? */ @@ -317,7 +399,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) control = PCI_PASID_CTRL_ENABLE | features; pdev->pasid_features = features; - pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control); + pci_write_config_word(pdev, pasid + PCI_PASID_CTRL, control); pdev->pasid_enabled = 1; @@ -332,16 +414,19 @@ EXPORT_SYMBOL_GPL(pci_enable_pasid); void pci_disable_pasid(struct pci_dev *pdev) { u16 control = 0; - int pos; + int pasid = pdev->pasid_cap; + + /* VFs share the PF PASID configuration */ + if (pdev->is_virtfn) + return; if (WARN_ON(!pdev->pasid_enabled)) return; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (!pasid) return; - pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control); + pci_write_config_word(pdev, pasid + PCI_PASID_CTRL, control); pdev->pasid_enabled = 0; } @@ -354,17 +439,19 @@ EXPORT_SYMBOL_GPL(pci_disable_pasid); void pci_restore_pasid_state(struct pci_dev *pdev) { u16 control; - int pos; + int pasid = pdev->pasid_cap; + + if (pdev->is_virtfn) + return; if (!pdev->pasid_enabled) return; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (!pasid) return; control = PCI_PASID_CTRL_ENABLE | pdev->pasid_features; - pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control); + pci_write_config_word(pdev, pasid + PCI_PASID_CTRL, control); } EXPORT_SYMBOL_GPL(pci_restore_pasid_state); @@ -381,13 +468,15 @@ EXPORT_SYMBOL_GPL(pci_restore_pasid_state); int pci_pasid_features(struct pci_dev *pdev) { u16 supported; - int pos; + int pasid = pdev->pasid_cap; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (pdev->is_virtfn) + pdev = pci_physfn(pdev); + + if (!pasid) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); + pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); supported &= PCI_PASID_CAP_EXEC | PCI_PASID_CAP_PRIV; @@ -395,36 +484,6 @@ int pci_pasid_features(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_pasid_features); -/** - * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit - * status. 
- * @pdev: PCI device structure - * - * Returns 1 if PASID is required in PRG Response Message, 0 otherwise. - * - * Even though the PRG response PASID status is read from PRI Status - * Register, since this API will mainly be used by PASID users, this - * function is defined within #ifdef CONFIG_PCI_PASID instead of - * CONFIG_PCI_PRI. - */ -int pci_prg_resp_pasid_required(struct pci_dev *pdev) -{ - u16 status; - int pos; - - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); - if (!pos) - return 0; - - pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); - - if (status & PCI_PRI_STATUS_PASID) - return 1; - - return 0; -} -EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required); - #define PASID_NUMBER_SHIFT 8 #define PASID_NUMBER_MASK (0x1f << PASID_NUMBER_SHIFT) /** @@ -437,13 +496,15 @@ EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required); int pci_max_pasids(struct pci_dev *pdev) { u16 supported; - int pos; + int pasid = pdev->pasid_cap; - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); - if (!pos) + if (pdev->is_virtfn) + pdev = pci_physfn(pdev); + + if (!pasid) return -EINVAL; - pci_read_config_word(pdev, pos + PCI_PASID_CAP, &supported); + pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); supported = (supported & PASID_NUMBER_MASK) >> PASID_NUMBER_SHIFT; diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 70e078238899f206ab5885a103468453e8ac9dc4..fe377ee56d04541ea7b6263e5a49275c2644d9e0 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -69,6 +69,7 @@ config PCI_TEGRA bool "NVIDIA Tegra PCIe controller" depends on ARCH_TEGRA || COMPILE_TEST depends on PCI_MSI_IRQ_DOMAIN + select PCI_MSI_ARCH_FALLBACKS help Say Y here if you want support for the PCIe host controller found on NVIDIA Tegra SoCs. @@ -105,6 +106,7 @@ config PCI_HOST_GENERIC config PCIE_XILINX bool "Xilinx AXI PCIe host bridge support" depends on OF || COMPILE_TEST + select PCI_MSI_ARCH_FALLBACKS help Say 'Y' here if you want kernel to support the Xilinx AXI PCIe Host Bridge driver. @@ -267,7 +269,6 @@ config PCIE_TANGO_SMP8759 config VMD depends on PCI_MSI && X86_64 && SRCU - select X86_DEV_DMA_OPS tristate "Intel Volume Management Device Driver" ---help--- Adds support for the Intel Volume Management Device (VMD). 
VMD is a diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index 4234ddb4722f7a23a7fc293a84cbfe255677d6de..b20651cea09f84095fdbd4c1a304a3d8cd5ce3b5 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -353,7 +353,7 @@ static void dra7xx_pcie_ep_init(struct dw_pcie_ep *ep) struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pci); enum pci_barno bar; - for (bar = BAR_0; bar <= BAR_5; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) dw_pcie_ep_reset_bar(pci, bar); dra7xx_pcie_enable_wrapper_interrupts(dra7xx); diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index ca9aa4501e7e98aa9f7a3c9a2e997b4c96aafedb..0d151cead1b72ee2c03305f43a4f90b7cb8fe00f 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -58,7 +58,7 @@ static void ls_pcie_ep_init(struct dw_pcie_ep *ep) struct dw_pcie *pci = to_dw_pcie_from_ep(ep); enum pci_barno bar; - for (bar = BAR_0; bar <= BAR_5; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) dw_pcie_ep_reset_bar(pci, bar); } diff --git a/drivers/pci/controller/dwc/pcie-artpec6.c b/drivers/pci/controller/dwc/pcie-artpec6.c index d00252bd8faeeb5f6bbda8472a66aa0a67dcb5ab..9e2482bd7b6def3067f9f40980ca67b51b39e3a9 100644 --- a/drivers/pci/controller/dwc/pcie-artpec6.c +++ b/drivers/pci/controller/dwc/pcie-artpec6.c @@ -422,7 +422,7 @@ static void artpec6_pcie_ep_init(struct dw_pcie_ep *ep) artpec6_pcie_wait_for_phy(artpec6_pcie); artpec6_pcie_set_nfts(artpec6_pcie); - for (bar = BAR_0; bar <= BAR_5; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) dw_pcie_ep_reset_bar(pci, bar); } diff --git a/drivers/pci/controller/dwc/pcie-designware-plat.c b/drivers/pci/controller/dwc/pcie-designware-plat.c index b58fdcbc664b85b97b06ee14a4d2c8f0850a15ab..73646b677affa943f3f78e0752e844df6abab9c6 100644 --- a/drivers/pci/controller/dwc/pcie-designware-plat.c +++ b/drivers/pci/controller/dwc/pcie-designware-plat.c @@ -70,7 +70,7 @@ static void dw_plat_pcie_ep_init(struct dw_pcie_ep *ep) struct dw_pcie *pci = to_dw_pcie_from_ep(ep); enum pci_barno bar; - for (bar = BAR_0; bar <= BAR_5; bar++) + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) dw_pcie_ep_reset_bar(pci, bar); } diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 5a18e94e52c801df68cda03b159a8ac34442bc41..5accdd6bc388166d910cf0a32a4b93e1573f5a94 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -214,7 +214,7 @@ struct dw_pcie_ep { phys_addr_t phys_base; size_t addr_size; size_t page_size; - u8 bar_to_atu[6]; + u8 bar_to_atu[PCI_STD_NUM_BARS]; phys_addr_t *outbound_addr; unsigned long *ib_window_map; unsigned long *ob_window_map; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index f1f300218fab8659629ba466b044ceeb06c96bce..1688d2ae00cdbfff9895c3fa09ffee9dca026467 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -307,7 +307,7 @@ struct pci_bus_relations { struct pci_q_res_req_response { struct vmpacket_descriptor hdr; s32 status; /* negative values are failures */ - u32 probed_bar[6]; + u32 probed_bar[PCI_STD_NUM_BARS]; } __packed; struct pci_set_power { @@ -539,7 +539,7 @@ struct hv_pci_dev { * What would be observed if one wrote 0xFFFFFFFF to a BAR and then * read it back, for each of the BAR offsets within config 
space. */ - u32 probed_bar[6]; + u32 probed_bar[PCI_STD_NUM_BARS]; }; struct hv_pci_compl { @@ -1218,7 +1218,7 @@ static void hv_irq_unmask(struct irq_data *data) params->int_target.vector = cfg->vector; /* - * Honoring apic->irq_delivery_mode set to dest_Fixed by + * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a * spurious interrupt storm. Not doing so does not seem to have a * negative effect (yet?). @@ -1302,7 +1302,7 @@ static u32 hv_compose_msi_req_v1( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; - int_pkt->int_desc.delivery_mode = dest_Fixed; + int_pkt->int_desc.delivery_mode = APIC_DELIVERY_MODE_FIXED; /* * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in @@ -1323,7 +1323,7 @@ static u32 hv_compose_msi_req_v2( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; - int_pkt->int_desc.delivery_mode = dest_Fixed; + int_pkt->int_desc.delivery_mode = APIC_DELIVERY_MODE_FIXED; /* * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten @@ -1510,16 +1510,8 @@ static struct irq_chip hv_msi_irq_chip = { .irq_unmask = hv_irq_unmask, }; -static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->msi_hwirq; -} - static struct msi_domain_ops hv_msi_ops = { - .get_hwirq = hv_msi_domain_ops_get_hwirq, - .msi_prepare = pci_msi_prepare, - .set_desc = pci_msi_set_desc, + .msi_prepare = arch_msi_prepare, .msi_free = hv_msi_free, }; @@ -1610,7 +1602,7 @@ static void survey_child_resources(struct hv_pcibus_device *hbus) * so it's sufficient to just add them up without tracking alignment. */ list_for_each_entry(hpdev, &hbus->children, list_entry) { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) dev_err(&hbus->hdev->device, "There's an I/O BAR in this list!\n"); @@ -1684,7 +1676,7 @@ static void prepopulate_bars(struct hv_pcibus_device *hbus) /* Pick addresses for the BARs. 
*/ do { list_for_each_entry(hpdev, &hbus->children, list_entry) { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar_val = hpdev->probed_bar[i]; if (bar_val == 0) continue; @@ -1841,7 +1833,7 @@ static void q_resource_requirements(void *context, struct pci_response *resp, "query resource requirements failed: %x\n", resp->status); } else { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { completion->hpdev->probed_bar[i] = q_res_req->probed_bar[i]; } diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 9966dcf1d112d9bb33df0b34e86ae4f7f1a7e3d6..74f2c7f323512f884e4e0952608d62dbdd400bb3 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -18,7 +18,6 @@ #include #include #include -#include #define VMD_CFGBAR 0 #define VMD_MEMBAR1 2 @@ -28,6 +27,7 @@ #define BUS_RESTRICT_CAP(vmcap) (vmcap & 0x1) #define PCI_REG_VMCONFIG 0x44 #define BUS_RESTRICT_CFG(vmcfg) ((vmcfg >> 8) & 0x3) +#define VMCONFIG_MSI_REMAP 0x2 #define PCI_REG_VMLOCK 0x70 #define MB2_SHADOW_EN(vmlock) (vmlock & 0x2) @@ -40,13 +40,32 @@ enum vmd_features { * membars, in order to allow proper address translation during * resource assignment to enable guest virtualization */ - VMD_FEAT_HAS_MEMBAR_SHADOW = (1 << 0), + VMD_FEAT_HAS_MEMBAR_SHADOW = (1 << 0), /* * Device may provide root port configuration information which limits * bus numbering */ - VMD_FEAT_HAS_BUS_RESTRICTIONS = (1 << 1), + VMD_FEAT_HAS_BUS_RESTRICTIONS = (1 << 1), + + /* + * Device contains physical location shadow registers in + * vendor-specific capability space + */ + VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP = (1 << 2), + + /* + * Device may use MSI-X vector 0 for software triggering and will not + * be used for MSI remapping + */ + VMD_FEAT_OFFSET_FIRST_VECTOR = (1 << 3), + + /* + * Device can bypass remapping MSI-X transactions into its MSI-X table, + * avoiding the requirement of a VMD MSI domain for child device + * interrupt handling. + */ + VMD_FEAT_CAN_BYPASS_MSI_REMAP = (1 << 4), }; /* @@ -98,9 +117,7 @@ struct vmd_dev { struct irq_domain *irq_domain; struct pci_bus *bus; u8 busn_start; - - struct dma_map_ops dma_ops; - struct dma_domain dma_domain; + u8 first_vec; }; static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) @@ -128,10 +145,10 @@ static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct vmd_irq_list *irq = vmdirq->irq; struct vmd_dev *vmd = irq_data_get_irq_handler_data(data); - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = MSI_ADDR_BASE_LO | - MSI_ADDR_DEST_ID(index_from_irqs(vmd, irq)); - msg->data = 0; + memset(msg, 0, sizeof(*msg)); + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.destid_0_7 = index_from_irqs(vmd, irq); } /* @@ -196,11 +213,11 @@ static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info, */ static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc) { - int i, best = 1; unsigned long flags; + int i, best; - if (vmd->msix_count == 1) - return &vmd->irqs[0]; + if (vmd->msix_count == 1 + vmd->first_vec) + return &vmd->irqs[vmd->first_vec]; /* * White list for fast-interrupt handlers. 
All others will share the @@ -210,11 +227,12 @@ static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *d case PCI_CLASS_STORAGE_EXPRESS: break; default: - return &vmd->irqs[0]; + return &vmd->irqs[vmd->first_vec]; } raw_spin_lock_irqsave(&list_lock, flags); - for (i = 1; i < vmd->msix_count; i++) + best = vmd->first_vec + 1; + for (i = best; i < vmd->msix_count; i++) if (vmd->irqs[i].count < vmd->irqs[best].count) best = i; vmd->irqs[best].count++; @@ -295,150 +313,50 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = &vmd_msi_controller, }; -/* - * VMD replaces the requester ID with its own. DMA mappings for devices in a - * VMD domain need to be mapped for the VMD, not the device requiring - * the mapping. - */ -static struct device *to_vmd_dev(struct device *dev) +static void vmd_set_msi_remapping(struct vmd_dev *vmd, bool enable) { - struct pci_dev *pdev = to_pci_dev(dev); - struct vmd_dev *vmd = vmd_from_bus(pdev->bus); + u16 reg; - return &vmd->dev->dev; + pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, ®); + reg = enable ? (reg & ~VMCONFIG_MSI_REMAP) : + (reg | VMCONFIG_MSI_REMAP); + pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg); } -static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr, - gfp_t flag, unsigned long attrs) +static int vmd_create_irq_domain(struct vmd_dev *vmd) { - return dma_alloc_attrs(to_vmd_dev(dev), size, addr, flag, attrs); -} - -static void vmd_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t addr, unsigned long attrs) -{ - return dma_free_attrs(to_vmd_dev(dev), size, vaddr, addr, attrs); -} - -static int vmd_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t addr, size_t size, - unsigned long attrs) -{ - return dma_mmap_attrs(to_vmd_dev(dev), vma, cpu_addr, addr, size, - attrs); -} - -static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t addr, size_t size, - unsigned long attrs) -{ - return dma_get_sgtable_attrs(to_vmd_dev(dev), sgt, cpu_addr, addr, size, - attrs); -} - -static dma_addr_t vmd_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return dma_map_page_attrs(to_vmd_dev(dev), page, offset, size, dir, - attrs); -} - -static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_unmap_page_attrs(to_vmd_dev(dev), addr, size, dir, attrs); -} - -static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) -{ - return dma_map_sg_attrs(to_vmd_dev(dev), sg, nents, dir, attrs); -} - -static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_unmap_sg_attrs(to_vmd_dev(dev), sg, nents, dir, attrs); -} - -static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - dma_sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir); -} - -static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - dma_sync_single_for_device(to_vmd_dev(dev), addr, size, dir); -} - -static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ - dma_sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir); -} + struct fwnode_handle *fn; -static void 
vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ - dma_sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir); -} + fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain); + if (!fn) + return -ENODEV; -static int vmd_dma_supported(struct device *dev, u64 mask) -{ - return dma_supported(to_vmd_dev(dev), mask); -} + vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, + x86_vector_domain); + if (!vmd->irq_domain) { + irq_domain_free_fwnode(fn); + return -ENODEV; + } -static u64 vmd_get_required_mask(struct device *dev) -{ - return dma_get_required_mask(to_vmd_dev(dev)); + return 0; } -static void vmd_teardown_dma_ops(struct vmd_dev *vmd) +static void vmd_remove_irq_domain(struct vmd_dev *vmd) { - struct dma_domain *domain = &vmd->dma_domain; - - if (get_dma_ops(&vmd->dev->dev)) - del_dma_domain(domain); -} + /* + * Some production BIOS won't enable remapping between soft reboots. + * Ensure remapping is restored before unloading the driver. + */ + if (!vmd->msix_count) + vmd_set_msi_remapping(vmd, true); -#define ASSIGN_VMD_DMA_OPS(source, dest, fn) \ - do { \ - if (source->fn) \ - dest->fn = vmd_##fn; \ - } while (0) + if (vmd->irq_domain) { + struct fwnode_handle *fn = vmd->irq_domain->fwnode; -static void vmd_setup_dma_ops(struct vmd_dev *vmd) -{ - const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev); - struct dma_map_ops *dest = &vmd->dma_ops; - struct dma_domain *domain = &vmd->dma_domain; - - domain->domain_nr = vmd->sysdata.domain; - domain->dma_ops = dest; - - if (!source) - return; - ASSIGN_VMD_DMA_OPS(source, dest, alloc); - ASSIGN_VMD_DMA_OPS(source, dest, free); - ASSIGN_VMD_DMA_OPS(source, dest, mmap); - ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable); - ASSIGN_VMD_DMA_OPS(source, dest, map_page); - ASSIGN_VMD_DMA_OPS(source, dest, unmap_page); - ASSIGN_VMD_DMA_OPS(source, dest, map_sg); - ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg); - ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu); - ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device); - ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu); - ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device); - ASSIGN_VMD_DMA_OPS(source, dest, dma_supported); - ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask); - add_dma_domain(domain); + irq_domain_remove(vmd->irq_domain); + irq_domain_free_fwnode(fn); + } } -#undef ASSIGN_VMD_DMA_OPS static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) @@ -559,10 +477,141 @@ static int vmd_find_free_domain(void) return domain + 1; } +static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint, + resource_size_t *offset1, + resource_size_t *offset2) +{ + struct pci_dev *dev = vmd->dev; + u64 phys1, phys2; + + if (native_hint) { + u32 vmlock; + int ret; + + ret = pci_read_config_dword(dev, PCI_REG_VMLOCK, &vmlock); + if (ret || vmlock == ~0) + return -ENODEV; + + if (MB2_SHADOW_EN(vmlock)) { + void __iomem *membar2; + + membar2 = pci_iomap(dev, VMD_MEMBAR2, 0); + if (!membar2) + return -ENOMEM; + phys1 = readq(membar2 + MB2_SHADOW_OFFSET); + phys2 = readq(membar2 + MB2_SHADOW_OFFSET + 8); + pci_iounmap(dev, membar2); + } else + return 0; + } else { + /* Hypervisor-Emulated Vendor-Specific Capability */ + int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); + u32 reg, regu; + + pci_read_config_dword(dev, pos + 4, ®); + + /* "SHDW" */ + if (pos && reg == 0x53484457) { + pci_read_config_dword(dev, pos + 8, ®); + 
pci_read_config_dword(dev, pos + 12, ®u); + phys1 = (u64) regu << 32 | reg; + + pci_read_config_dword(dev, pos + 16, ®); + pci_read_config_dword(dev, pos + 20, ®u); + phys2 = (u64) regu << 32 | reg; + } else + return 0; + } + + *offset1 = dev->resource[VMD_MEMBAR1].start - + (phys1 & PCI_BASE_ADDRESS_MEM_MASK); + *offset2 = dev->resource[VMD_MEMBAR2].start - + (phys2 & PCI_BASE_ADDRESS_MEM_MASK); + + return 0; +} + +static int vmd_get_bus_number_start(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + u16 reg; + + pci_read_config_word(dev, PCI_REG_VMCAP, ®); + if (BUS_RESTRICT_CAP(reg)) { + pci_read_config_word(dev, PCI_REG_VMCONFIG, ®); + + switch (BUS_RESTRICT_CFG(reg)) { + case 0: + vmd->busn_start = 0; + break; + case 1: + vmd->busn_start = 128; + break; + case 2: + vmd->busn_start = 224; + break; + default: + pci_err(dev, "Unknown Bus Offset Setting (%d)\n", + BUS_RESTRICT_CFG(reg)); + return -ENODEV; + } + } + + return 0; +} + +static irqreturn_t vmd_irq(int irq, void *data) +{ + struct vmd_irq_list *irqs = data; + struct vmd_irq *vmdirq; + int idx; + + idx = srcu_read_lock(&irqs->srcu); + list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) + generic_handle_irq(vmdirq->virq); + srcu_read_unlock(&irqs->srcu, idx); + + return IRQ_HANDLED; +} + +static int vmd_alloc_irqs(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + int i, err; + + vmd->msix_count = pci_msix_vec_count(dev); + if (vmd->msix_count < 0) + return -ENODEV; + + vmd->msix_count = pci_alloc_irq_vectors(dev, vmd->first_vec + 1, + vmd->msix_count, PCI_IRQ_MSIX); + if (vmd->msix_count < 0) + return vmd->msix_count; + + vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs), + GFP_KERNEL); + if (!vmd->irqs) + return -ENOMEM; + + for (i = 0; i < vmd->msix_count; i++) { + err = init_srcu_struct(&vmd->irqs[i].srcu); + if (err) + return err; + + INIT_LIST_HEAD(&vmd->irqs[i].irq_list); + err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i), + vmd_irq, IRQF_NO_THREAD, + "vmd", &vmd->irqs[i]); + if (err) + return err; + } + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = &vmd->sysdata; - struct fwnode_handle *fn; struct resource *res; u32 upper_bits; unsigned long flags; @@ -570,6 +619,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) resource_size_t offset[2] = {0}; resource_size_t membar2_offset = 0x2000; struct pci_bus *child; + int ret; /* * Shadow registers may exist in certain VMD device ids which allow @@ -578,42 +628,24 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * or 0, depending on an enable bit in the VMD device. 
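As an aside on what the shadow values read above feed into: each shadow register holds the host-physical base of a VMD membar, and the difference between that and the BAR start visible through VMD becomes the offset applied to the child devices' memory windows. A minimal sketch of the calculation, with an illustrative helper name that is not part of the patch and assuming <linux/pci.h>:

static resource_size_t vmd_membar_offset(struct pci_dev *vmd_dev, int bar,
					 u64 shadow_phys)
{
	/* BAR start as this kernel sees it, minus the host-physical base. */
	return pci_resource_start(vmd_dev, bar) -
	       (shadow_phys & PCI_BASE_ADDRESS_MEM_MASK);
}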
*/ if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) { - u32 vmlock; - int ret; - membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; - ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, &vmlock); - if (ret || vmlock == ~0) - return -ENODEV; - - if (MB2_SHADOW_EN(vmlock)) { - void __iomem *membar2; - - membar2 = pci_iomap(vmd->dev, VMD_MEMBAR2, 0); - if (!membar2) - return -ENOMEM; - offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - (readq(membar2 + MB2_SHADOW_OFFSET) & - PCI_BASE_ADDRESS_MEM_MASK); - offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - (readq(membar2 + MB2_SHADOW_OFFSET + 8) & - PCI_BASE_ADDRESS_MEM_MASK); - pci_iounmap(vmd->dev, membar2); - } + ret = vmd_get_phys_offsets(vmd, true, &offset[0], &offset[1]); + if (ret) + return ret; + } else if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) { + ret = vmd_get_phys_offsets(vmd, false, &offset[0], &offset[1]); + if (ret) + return ret; } /* * Certain VMD devices may have a root port configuration option which - * limits the bus range to between 0-127 or 128-255 + * limits the bus range to between 0-127, 128-255, or 224-255 */ if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) { - u32 vmcap, vmconfig; - - pci_read_config_dword(vmd->dev, PCI_REG_VMCAP, &vmcap); - pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, &vmconfig); - if (BUS_RESTRICT_CAP(vmcap) && - (BUS_RESTRICT_CFG(vmconfig) == 0x1)) - vmd->busn_start = 128; + ret = vmd_get_bus_number_start(vmd); + if (ret) + return ret; } res = &vmd->dev->resource[VMD_CFGBAR]; @@ -667,22 +699,38 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) .parent = res, }; - sd->vmd_domain = true; + sd->vmd_dev = vmd->dev; sd->domain = vmd_find_free_domain(); if (sd->domain < 0) return sd->domain; sd->node = pcibus_to_node(vmd->dev->bus); - fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain); - if (!fn) - return -ENODEV; - - vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, - x86_vector_domain); - if (!vmd->irq_domain) { - irq_domain_free_fwnode(fn); - return -ENODEV; + /* + * Currently MSI remapping must be enabled in guest passthrough mode + * due to some missing interrupt remapping plumbing. This is probably + * acceptable because the guest is usually CPU-limited and MSI + * remapping doesn't become a performance bottleneck. + */ + if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || + offset[0] || offset[1]) { + ret = vmd_alloc_irqs(vmd); + if (ret) + return ret; + + vmd_set_msi_remapping(vmd, true); + + ret = vmd_create_irq_domain(vmd); + if (ret) + return ret; + + /* + * Override the IRQ domain bus token so the domain can be + * distinguished from a regular PCI/MSI domain. 
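The condition above can be read as a single predicate; the helper below is only an illustrative restatement, not code from the patch: remapping stays enabled whenever the part lacks the bypass capability or when non-zero shadow offsets show the VMD is passed through to a guest.

static bool vmd_needs_msi_remapping(unsigned long features,
				    const resource_size_t offset[2])
{
	/* Guest passthrough (non-zero shadow offsets) still requires remapping. */
	if (offset[0] || offset[1])
		return true;

	/* Parts without the bypass feature bit always remap. */
	return !(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP);
}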
+ */ + irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI); + } else { + vmd_set_msi_remapping(vmd, false); } pci_add_resource(&resources, &vmd->resources[0]); @@ -693,14 +741,16 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) &vmd_ops, sd, &resources); if (!vmd->bus) { pci_free_resource_list(&resources); - irq_domain_remove(vmd->irq_domain); - irq_domain_free_fwnode(fn); + vmd_remove_irq_domain(vmd); return -ENODEV; } vmd_attach_resources(vmd); - vmd_setup_dma_ops(vmd); - dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); + if (vmd->irq_domain) + dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); + else + dev_set_msi_domain(&vmd->bus->dev, + dev_get_msi_domain(&vmd->dev->dev)); pci_scan_child_bus(vmd->bus); pci_assign_unassigned_bus_resources(vmd->bus); @@ -720,24 +770,11 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return 0; } -static irqreturn_t vmd_irq(int irq, void *data) -{ - struct vmd_irq_list *irqs = data; - struct vmd_irq *vmdirq; - int idx; - - idx = srcu_read_lock(&irqs->srcu); - list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) - generic_handle_irq(vmdirq->virq); - srcu_read_unlock(&irqs->srcu, idx); - - return IRQ_HANDLED; -} - static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) { + unsigned long features = (unsigned long) id->driver_data; struct vmd_dev *vmd; - int i, err; + int err; if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20)) return -ENOMEM; @@ -760,36 +797,12 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) return -ENODEV; - vmd->msix_count = pci_msix_vec_count(dev); - if (vmd->msix_count < 0) - return -ENODEV; - - vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count, - PCI_IRQ_MSIX); - if (vmd->msix_count < 0) - return vmd->msix_count; - - vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs), - GFP_KERNEL); - if (!vmd->irqs) - return -ENOMEM; - - for (i = 0; i < vmd->msix_count; i++) { - err = init_srcu_struct(&vmd->irqs[i].srcu); - if (err) - return err; - - INIT_LIST_HEAD(&vmd->irqs[i].irq_list); - err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i), - vmd_irq, IRQF_NO_THREAD, - "vmd", &vmd->irqs[i]); - if (err) - return err; - } + if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) + vmd->first_vec = 1; spin_lock_init(&vmd->cfg_lock); pci_set_drvdata(dev, vmd); - err = vmd_enable_domain(vmd, (unsigned long) id->driver_data); + err = vmd_enable_domain(vmd, features); if (err) return err; @@ -809,16 +822,13 @@ static void vmd_cleanup_srcu(struct vmd_dev *vmd) static void vmd_remove(struct pci_dev *dev) { struct vmd_dev *vmd = pci_get_drvdata(dev); - struct fwnode_handle *fn = vmd->irq_domain->fwnode; sysfs_remove_link(&vmd->dev->dev.kobj, "domain"); pci_stop_root_bus(vmd->bus); pci_remove_root_bus(vmd->bus); vmd_cleanup_srcu(vmd); - vmd_teardown_dma_ops(vmd); vmd_detach_resources(vmd); - irq_domain_remove(vmd->irq_domain); - irq_domain_free_fwnode(fn); + vmd_remove_irq_domain(vmd); } #ifdef CONFIG_PM_SLEEP @@ -831,7 +841,6 @@ static int vmd_suspend(struct device *dev) for (i = 0; i < vmd->msix_count; i++) devm_free_irq(dev, pci_irq_vector(pdev, i), &vmd->irqs[i]); - pci_save_state(pdev); return 0; } @@ -849,19 +858,30 @@ static int vmd_resume(struct device *dev) return err; } - pci_restore_state(pdev); return 0; } #endif static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume); static const struct pci_device_id vmd_ids[] = { - 
{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_201D),}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_201D), + .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_28C0), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW | - VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_CAN_BYPASS_MSI_REMAP,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B), - .driver_data = VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_OFFSET_FIRST_VECTOR,}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x467f), + .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_OFFSET_FIRST_VECTOR,}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x4c3d), + .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_OFFSET_FIRST_VECTOR,}, {0,} }; MODULE_DEVICE_TABLE(pci, vmd_ids); diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index 1cfe3687a21191c799acf1686df0372317c56e7b..5d74f81ddfe4d53d2d522774e120281e8431a944 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -44,7 +44,7 @@ static struct workqueue_struct *kpcitest_workqueue; struct pci_epf_test { - void *reg[6]; + void *reg[PCI_STD_NUM_BARS]; struct pci_epf *epf; enum pci_barno test_reg_bar; struct delayed_work cmd_handler; @@ -377,7 +377,7 @@ static void pci_epf_test_unbind(struct pci_epf *epf) cancel_delayed_work(&epf_test->cmd_handler); pci_epc_stop(epc); - for (bar = BAR_0; bar <= BAR_5; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { epf_bar = &epf->bar[bar]; if (epf_test->reg[bar]) { @@ -400,7 +400,7 @@ static int pci_epf_test_set_bar(struct pci_epf *epf) epc_features = epf_test->epc_features; - for (bar = BAR_0; bar <= BAR_5; bar += add) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar += add) { epf_bar = &epf->bar[bar]; /* * pci_epc_set_bar() sets PCI_BASE_ADDRESS_MEM_TYPE_64 @@ -450,7 +450,7 @@ static int pci_epf_test_alloc_space(struct pci_epf *epf) } epf_test->reg[test_reg_bar] = base; - for (bar = BAR_0; bar <= BAR_5; bar += add) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar += add) { epf_bar = &epf->bar[bar]; add = (epf_bar->flags & PCI_BASE_ADDRESS_MEM_TYPE_64) ? 
2 : 1; @@ -478,7 +478,7 @@ static void pci_epf_configure_bar(struct pci_epf *epf, bool bar_fixed_64bit; int i; - for (i = BAR_0; i <= BAR_5; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { epf_bar = &epf->bar[i]; bar_fixed_64bit = !!(epc_features->bar_fixed_64bit & (1 << i)); if (bar_fixed_64bit) diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c index 5ac31f683b85362c1bca32cbc01a9c77a51a4202..058d5937d8a9531b00dbbf576f290210352dfe34 100644 --- a/drivers/pci/hotplug/pci_hotplug_core.c +++ b/drivers/pci/hotplug/pci_hotplug_core.c @@ -73,7 +73,7 @@ static ssize_t power_read_file(struct pci_slot *pci_slot, char *buf) if (retval) return retval; - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } static ssize_t power_write_file(struct pci_slot *pci_slot, const char *buf, @@ -130,7 +130,7 @@ static ssize_t attention_read_file(struct pci_slot *pci_slot, char *buf) if (retval) return retval; - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } static ssize_t attention_write_file(struct pci_slot *pci_slot, const char *buf, @@ -175,7 +175,7 @@ static ssize_t latch_read_file(struct pci_slot *pci_slot, char *buf) if (retval) return retval; - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } static struct pci_slot_attribute hotplug_slot_attr_latch = { @@ -192,7 +192,7 @@ static ssize_t presence_read_file(struct pci_slot *pci_slot, char *buf) if (retval) return retval; - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } static struct pci_slot_attribute hotplug_slot_attr_presence = { diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 88b996764ff95ab4abeb72c332ec20154a7130a0..6ebb25e312b5a59d0cbc66be500e3483b223057e 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -524,6 +524,32 @@ void pciehp_power_off_slot(struct controller *ctrl) PCI_EXP_SLTCTL_PWR_OFF); } +static void pciehp_ignore_dpc_link_change(struct controller *ctrl, + struct pci_dev *pdev, int irq) +{ + /* + * Ignore link changes which occurred while waiting for DPC recovery. + * Could be several if DPC triggered multiple times consecutively. + */ + synchronize_hardirq(irq); + atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events); + if (pciehp_poll_mode) + pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, + PCI_EXP_SLTSTA_DLLSC); + ctrl_info(ctrl, "Slot(%s): Link Down/Up ignored (recovered by DPC)\n", + slot_name(ctrl)); + + /* + * If the link is unexpectedly down after successful recovery, + * the corresponding link change may have been ignored above. + * Synthesize it to ensure that it is acted on. + */ + down_read(&ctrl->reset_lock); + if (!pciehp_check_link_active(ctrl)) + pciehp_request(ctrl, PCI_EXP_SLTSTA_DLLSC); + up_read(&ctrl->reset_lock); +} + static irqreturn_t pciehp_isr(int irq, void *dev_id) { struct controller *ctrl = (struct controller *)dev_id; @@ -667,6 +693,16 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) PCI_EXP_SLTCTL_ATTN_IND_ON); } + /* + * Ignore Link Down/Up events caused by Downstream Port Containment + * if recovery from the error succeeded. + */ + if ((events & PCI_EXP_SLTSTA_DLLSC) && pci_dpc_recovered(pdev) && + ctrl->state == ON_STATE) { + events &= ~PCI_EXP_SLTSTA_DLLSC; + pciehp_ignore_dpc_link_change(ctrl, pdev, irq); + } + /* * Disable requests have higher priority than Presence Detect Changed * or Data Link Layer State Changed events. 
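The sprintf()-to-sysfs_emit() conversions in this series (pci_hotplug_core above, rpadlpar/shpchp/iov below) all follow one shape: sysfs_emit() assumes the page-sized sysfs buffer and clamps the output, so a show() callback cannot overrun it. A minimal sketch of the pattern, with a made-up attribute and value:

static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	/* Single-shot emit; sysfs_emit() returns the number of bytes written. */
	return sysfs_emit(buf, "%u\n", pdev->devfn);
}

sysfs_emit_at() is the incremental variant for callbacks that append in several steps, as the shpchp show_ctrl() conversion below does.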
diff --git a/drivers/pci/hotplug/rpadlpar_sysfs.c b/drivers/pci/hotplug/rpadlpar_sysfs.c index dbfa0b55d31a5126f35785653857117dac463d35..068b7810a5746ba06d00d7e2c532dc590ac24c72 100644 --- a/drivers/pci/hotplug/rpadlpar_sysfs.c +++ b/drivers/pci/hotplug/rpadlpar_sysfs.c @@ -50,7 +50,7 @@ static ssize_t add_slot_store(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t add_slot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static ssize_t remove_slot_store(struct kobject *kobj, @@ -80,7 +80,7 @@ static ssize_t remove_slot_store(struct kobject *kobj, static ssize_t remove_slot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static struct kobj_attribute add_slot_attr = diff --git a/drivers/pci/hotplug/shpchp_sysfs.c b/drivers/pci/hotplug/shpchp_sysfs.c index 45658bb5c554f2ade9ea0e2b4b20ab1565abb493..64beed7a26bed140bd974337856a16a50aac25b0 100644 --- a/drivers/pci/hotplug/shpchp_sysfs.c +++ b/drivers/pci/hotplug/shpchp_sysfs.c @@ -24,50 +24,54 @@ static ssize_t show_ctrl(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pdev; - char *out = buf; int index, busnr; struct resource *res; struct pci_bus *bus; + size_t len = 0; pdev = to_pci_dev(dev); bus = pdev->subordinate; - out += sprintf(buf, "Free resources: memory\n"); + len += sysfs_emit_at(buf, len, "Free resources: memory\n"); pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_MEM) && !(res->flags & IORESOURCE_PREFETCH)) { - out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)resource_size(res)); + len += sysfs_emit_at(buf, len, + "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } - out += sprintf(out, "Free resources: prefetchable memory\n"); + len += sysfs_emit_at(buf, len, "Free resources: prefetchable memory\n"); pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_MEM) && (res->flags & IORESOURCE_PREFETCH)) { - out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)resource_size(res)); + len += sysfs_emit_at(buf, len, + "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } - out += sprintf(out, "Free resources: IO\n"); + len += sysfs_emit_at(buf, len, "Free resources: IO\n"); pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_IO)) { - out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)resource_size(res)); + len += sysfs_emit_at(buf, len, + "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } - out += sprintf(out, "Free resources: bus numbers\n"); + len += sysfs_emit_at(buf, len, "Free resources: bus numbers\n"); for (busnr = bus->busn_res.start; busnr <= bus->busn_res.end; busnr++) { if (!pci_find_bus(pci_domain_nr(bus), busnr)) break; } if (busnr < bus->busn_res.end) - out += sprintf(out, "start = %8.8x, length = %8.8x\n", - busnr, (int)(bus->busn_res.end - busnr)); + len += sysfs_emit_at(buf, len, + "start = %8.8x, length = %8.8x\n", + busnr, (int)(bus->busn_res.end - busnr)); - return out - buf; + return len; } static DEVICE_ATTR(ctrl, S_IRUGO, 
show_ctrl, NULL); diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index b3b802bcefa05b46504f2baee051249fd5360da7..e3436cccb6eb02147fbe5491b94ec6cab5ce3858 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -246,7 +246,7 @@ static ssize_t sriov_totalvfs_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); + return sysfs_emit(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); } static ssize_t sriov_numvfs_show(struct device *dev, @@ -261,7 +261,7 @@ static ssize_t sriov_numvfs_show(struct device *dev, num_vfs = pdev->sriov->num_VFs; device_unlock(&pdev->dev); - return sprintf(buf, "%u\n", num_vfs); + return sysfs_emit(buf, "%u\n", num_vfs); } /* @@ -340,7 +340,7 @@ static ssize_t sriov_offset_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pdev->sriov->offset); + return sysfs_emit(buf, "%u\n", pdev->sriov->offset); } static ssize_t sriov_stride_show(struct device *dev, @@ -349,7 +349,7 @@ static ssize_t sriov_stride_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pdev->sriov->stride); + return sysfs_emit(buf, "%u\n", pdev->sriov->stride); } static ssize_t sriov_vf_device_show(struct device *dev, @@ -358,7 +358,7 @@ static ssize_t sriov_vf_device_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%x\n", pdev->sriov->vf_device); + return sysfs_emit(buf, "%x\n", pdev->sriov->vf_device); } static ssize_t sriov_drivers_autoprobe_show(struct device *dev, @@ -367,7 +367,7 @@ static ssize_t sriov_drivers_autoprobe_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe); + return sysfs_emit(buf, "%u\n", pdev->sriov->drivers_autoprobe); } static ssize_t sriov_drivers_autoprobe_store(struct device *dev, diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 771041784e645c21e5daa1d2d9ef78d174778fb5..f33f7469d7ba02b371c2a178964cc42ab1593bf0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -26,6 +26,8 @@ #include "pci.h" +#ifdef CONFIG_PCI_MSI + static int pci_msi_enable = 1; int pci_msi_ignore_mask; @@ -58,8 +60,8 @@ static void pci_msi_teardown_msi_irqs(struct pci_dev *dev) #define pci_msi_teardown_msi_irqs arch_teardown_msi_irqs #endif +#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS /* Arch hooks */ - int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { struct msi_controller *chip = dev->bus->msi; @@ -102,7 +104,7 @@ int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - for_each_pci_msi_entry(entry, dev) { + for_each_new_pci_msi_entry(entry, dev) { ret = arch_setup_msi_irq(dev, entry); if (ret < 0) return ret; @@ -132,6 +134,7 @@ void __weak arch_teardown_msi_irqs(struct pci_dev *dev) { return default_teardown_msi_irqs(dev); } +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ static void default_restore_msi_irq(struct pci_dev *dev, int irq) { @@ -170,33 +173,30 @@ static inline __attribute_const__ u32 msi_mask(unsigned x) * reliably as devices without an INTx disable bit will then generate a * level IRQ which will never be cleared. 
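Context for the masking change that follows: every vector of a multi-MSI descriptor shares one mask register, so the read-modify-write of desc->masked must be serialized against concurrent mask/unmask calls. A simplified sketch of the locked update, assuming the msi_lock that this series adds to struct device:

static void msi_update_mask_locked(struct msi_desc *desc, u32 clear, u32 set)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&desc->dev->msi_lock, flags);
	desc->masked &= ~clear;
	desc->masked |= set;
	pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos,
			       desc->masked);
	raw_spin_unlock_irqrestore(&desc->dev->msi_lock, flags);
}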
*/ -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - u32 mask_bits = desc->masked; + raw_spinlock_t *lock = &desc->dev->msi_lock; + unsigned long flags; if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit) - return 0; + return; - mask_bits &= ~mask; - mask_bits |= flag; + raw_spin_lock_irqsave(lock, flags); + desc->masked &= ~mask; + desc->masked |= flag; pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos, - mask_bits); - - return mask_bits; + desc->masked); + raw_spin_unlock_irqrestore(lock, flags); } static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag); + __pci_msi_desc_mask_irq(desc, mask, flag); } static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) { - if (desc->msi_attrib.is_virtual) - return NULL; - - return desc->mask_base + - desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + return desc->mask_base + desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; } /* @@ -208,14 +208,10 @@ static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) */ u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag) { + void __iomem *desc_addr = pci_msix_desc_addr(desc); u32 mask_bits = desc->masked; - void __iomem *desc_addr; - - if (pci_msi_ignore_mask) - return 0; - desc_addr = pci_msix_desc_addr(desc); - if (!desc_addr) + if (pci_msi_ignore_mask || desc->msi_attrib.is_virtual) return 0; mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; @@ -282,10 +278,8 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); - if (!base) { - WARN_ON(1); + if (WARN_ON_ONCE(entry->msi_attrib.is_virtual)) return; - } msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR); @@ -317,7 +311,7 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) } else if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); - if (!base) + if (entry->msi_attrib.is_virtual) goto skip; writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); @@ -365,9 +359,7 @@ static void free_msi_irqs(struct pci_dev *dev) { struct list_head *msi_list = dev_to_msi_list(&dev->dev); struct msi_desc *entry, *tmp; - struct attribute **msi_attrs; - struct device_attribute *dev_attr; - int i, count = 0; + int i; for_each_pci_msi_entry(entry, dev) if (entry->irq) @@ -376,6 +368,11 @@ static void free_msi_irqs(struct pci_dev *dev) pci_msi_teardown_msi_irqs(dev); + if (dev->msi_irq_groups) { + msi_destroy_sysfs(&dev->dev, dev->msi_irq_groups); + dev->msi_irq_groups = NULL; + } + list_for_each_entry_safe(entry, tmp, msi_list, list) { if (entry->msi_attrib.is_msix) { if (list_is_last(&entry->list, msi_list)) @@ -385,21 +382,49 @@ static void free_msi_irqs(struct pci_dev *dev) list_del(&entry->list); free_msi_entry(entry); } +} - if (dev->msi_irq_groups) { - sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups); - msi_attrs = dev->msi_irq_groups[0]->attrs; - while (msi_attrs[count]) { - dev_attr = container_of(msi_attrs[count], - struct device_attribute, attr); - kfree(dev_attr->attr.name); - kfree(dev_attr); - ++count; +static void pci_msix_teardown_irq(struct pci_dev *dev, unsigned int irq) +{ + struct irq_domain *domain; + + domain = dev_get_msi_domain(&dev->dev); + if (domain && irq_domain_is_hierarchy(domain)) + msi_domain_free_irq(domain, &dev->dev, irq); +} + 
+static void free_msix_irq(struct pci_dev *dev, unsigned int irq) +{ + struct list_head *msi_list = dev_to_msi_list(&dev->dev); + struct device_attribute *dev_attr; + struct msi_desc *entry, *tmp; + struct attribute **msi_attrs; + int count; + + list_for_each_entry_safe(entry, tmp, msi_list, list) { + if (entry->irq == irq) { + if (entry->irq) { + BUG_ON(irq_has_action(entry->irq)); + pci_msix_teardown_irq(dev, irq); + } + if (dev->msi_irq_groups) { + msi_attrs = dev->msi_irq_groups[0]->attrs; + count = entry->msi_attrib.entry_nr; + if (msi_attrs[count]) { + dev_attr = container_of(msi_attrs[count], + struct device_attribute, attr); + sysfs_remove_file_from_group(&dev->dev.kobj, + &dev_attr->attr, + "msi_irqs"); + kfree(dev_attr->attr.name); + kfree(dev_attr); + msi_attrs[count] = NULL; + } + } + __clear_bit(entry->msi_attrib.entry_nr, dev->msix_map); + list_del(&entry->list); + free_msi_entry(entry); } - kfree(msi_attrs); - kfree(dev->msi_irq_groups[0]); - kfree(dev->msi_irq_groups); - dev->msi_irq_groups = NULL; } } @@ -458,102 +483,6 @@ void pci_restore_msi_state(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_restore_msi_state); -static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct msi_desc *entry; - unsigned long irq; - int retval; - - retval = kstrtoul(attr->attr.name, 10, &irq); - if (retval) - return retval; - - entry = irq_get_msi_desc(irq); - if (entry) - return sprintf(buf, "%s\n", - entry->msi_attrib.is_msix ? "msix" : "msi"); - - return -ENODEV; -} - -static int populate_msi_sysfs(struct pci_dev *pdev) -{ - struct attribute **msi_attrs; - struct attribute *msi_attr; - struct device_attribute *msi_dev_attr; - struct attribute_group *msi_irq_group; - const struct attribute_group **msi_irq_groups; - struct msi_desc *entry; - int ret = -ENOMEM; - int num_msi = 0; - int count = 0; - int i; - - /* Determine how many msi entries we have */ - for_each_pci_msi_entry(entry, pdev) - num_msi += entry->nvec_used; - if (!num_msi) - return 0; - - /* Dynamically create the MSI attributes for the PCI device */ - msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); - if (!msi_attrs) - return -ENOMEM; - for_each_pci_msi_entry(entry, pdev) { - for (i = 0; i < entry->nvec_used; i++) { - msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); - if (!msi_dev_attr) - goto error_attrs; - msi_attrs[count] = &msi_dev_attr->attr; - - sysfs_attr_init(&msi_dev_attr->attr); - msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", - entry->irq + i); - if (!msi_dev_attr->attr.name) - goto error_attrs; - msi_dev_attr->attr.mode = S_IRUGO; - msi_dev_attr->show = msi_mode_show; - ++count; - } - } - - msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); - if (!msi_irq_group) - goto error_attrs; - msi_irq_group->name = "msi_irqs"; - msi_irq_group->attrs = msi_attrs; - - msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); - if (!msi_irq_groups) - goto error_irq_group; - msi_irq_groups[0] = msi_irq_group; - - ret = sysfs_create_groups(&pdev->dev.kobj, msi_irq_groups); - if (ret) - goto error_irq_groups; - pdev->msi_irq_groups = msi_irq_groups; - - return 0; - -error_irq_groups: - kfree(msi_irq_groups); -error_irq_group: - kfree(msi_irq_group); -error_attrs: - count = 0; - msi_attr = msi_attrs[count]; - while (msi_attr) { - msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); - kfree(msi_attr->name); - kfree(msi_dev_attr); - ++count; - msi_attr = msi_attrs[count]; - } - kfree(msi_attrs); - return ret; -} - static struct msi_desc 
* msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd) { @@ -598,12 +527,15 @@ static int msi_verify_entries(struct pci_dev *dev) { struct msi_desc *entry; - for_each_pci_msi_entry(entry, dev) { - if (!dev->no_64bit_msi || !entry->msg.address_hi) - continue; - pci_err(dev, "Device has broken 64-bit MSI but arch" - " tried to assign one above 4G\n"); - return -EIO; + if (!dev->no_64bit_msi) + return 0; + + for_each_new_pci_msi_entry(entry, dev) { + if (entry->msg.address_hi) { + pci_err(dev, "arch assigned 64-bit MSI address %#x%08x but device only supports 32 bits\n", + entry->msg.address_hi, entry->msg.address_lo); + return -EIO; + } } return 0; } @@ -623,6 +555,7 @@ static int msi_verify_entries(struct pci_dev *dev) static int msi_capability_init(struct pci_dev *dev, int nvec, struct irq_affinity *affd) { + const struct attribute_group **groups; struct msi_desc *entry; int ret; unsigned mask; @@ -641,26 +574,21 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, /* Configure MSI capability structure */ ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); - if (ret) { - msi_mask_irq(entry, mask, ~mask); - free_msi_irqs(dev); - return ret; - } + if (ret) + goto err; ret = msi_verify_entries(dev); - if (ret) { - msi_mask_irq(entry, mask, ~mask); - free_msi_irqs(dev); - return ret; - } + if (ret) + goto err; - ret = populate_msi_sysfs(dev); - if (ret) { - msi_mask_irq(entry, mask, ~mask); - free_msi_irqs(dev); - return ret; + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); + goto err; } + dev->msi_irq_groups = groups; + /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); pci_msi_set_enable(dev, 1); @@ -669,6 +597,11 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, pcibios_free_irq(dev); dev->irq = entry->irq; return 0; + +err: + msi_mask_irq(entry, mask, 0); + free_msi_irqs(dev); + return ret; } static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) @@ -691,25 +624,28 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) return ioremap_nocache(phys_addr, nr_entries * PCI_MSIX_ENTRY_SIZE); } -static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, +static int msix_setup_entries(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd) { struct irq_affinity_desc *curmsk, *masks = NULL; - struct msi_desc *entry; - int ret, i; + struct msi_desc *entry, *tmp; + void __iomem *addr; + int ret, i, idx; int vec_count = pci_msix_vec_count(dev); if (affd) masks = irq_create_affinity_masks(nvec, affd); + /* Store pointer to the last msi_desc entry's list before adding new + * entries to the device msi_list or to the device msi_list itself + * if msi_list is empty. + */ + dev->dev.msi_last_list = dev->dev.msi_list.prev; + for (i = 0, curmsk = masks; i < nvec; i++) { entry = alloc_msi_entry(&dev->dev, 1, curmsk); if (!entry) { - if (!i) - iounmap(base); - else - free_msi_irqs(dev); /* No enough memory. 
Don't try again */ ret = -ENOMEM; goto out; @@ -717,79 +653,172 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; - if (entries) + + if (entries) { + /* Check for invalid/duplicate entries */ + if (test_bit(entries[i].entry, dev->msix_map) || + entries[i].entry >= vec_count) { + ret = -EINVAL; + goto out; + } + __set_bit(entries[i].entry, dev->msix_map); entry->msi_attrib.entry_nr = entries[i].entry; - else - entry->msi_attrib.entry_nr = i; + } else { + idx = find_first_zero_bit(dev->msix_map, vec_count); + if (idx >= vec_count) { + ret = -ENOSPC; + goto out; + } + __set_bit(idx, dev->msix_map); + entry->msi_attrib.entry_nr = idx; + } entry->msi_attrib.is_virtual = entry->msi_attrib.entry_nr >= vec_count; entry->msi_attrib.default_irq = dev->irq; - entry->mask_base = base; + entry->mask_base = dev->msix_table_base; + + if (!entry->msi_attrib.is_virtual) { + addr = pci_msix_desc_addr(entry); + entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); + } list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); if (masks) curmsk++; } - ret = 0; + return 0; out: + if (!dev->msix_enabled) { + if (!i) + iounmap(dev->msix_table_base); + else + free_msi_irqs(dev); + } else { + while (i-- > 0) { + list_for_each_entry_safe_reverse(entry, tmp, dev_to_msi_list(&dev->dev), + list) { + __clear_bit(entry->msi_attrib.entry_nr, dev->msix_map); + list_del(&entry->list); + free_msi_entry(entry); + } + } + } + kfree(masks); return ret; } -static void msix_program_entries(struct pci_dev *dev, - struct msix_entry *entries) +static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries) { struct msi_desc *entry; - int i = 0; - void __iomem *desc_addr; - for_each_pci_msi_entry(entry, dev) { - if (entries) - entries[i++].vector = entry->irq; + for_each_new_pci_msi_entry(entry, dev) { + if (entries) { + entries->vector = entry->irq; + entries++; + } + } +} - desc_addr = pci_msix_desc_addr(entry); - if (desc_addr) - entry->masked = readl(desc_addr + - PCI_MSIX_ENTRY_VECTOR_CTRL); - else - entry->masked = 0; +static void msix_mask_all(void __iomem *base, int tsize) +{ + u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; + int i; - msix_mask_irq(entry, 1); - } + for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE) + writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL); } /** * msix_capability_init - configure device's MSI-X capability * @dev: pointer to the pci_dev data structure of MSI-X device function - * @entries: pointer to an array of struct msix_entry entries - * @nvec: number of @entries - * @affd: Optional pointer to enable automatic affinity assignment - * - * Setup the MSI-X capability structure of device function with a - * single MSI-X IRQ. A return of zero indicates the successful setup of - * requested MSI-X entries with allocated IRQs or non-zero for otherwise. + * Setup the MSI-X capability structure of device function **/ -static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, - int nvec, struct irq_affinity *affd) +static int msix_capability_init(struct pci_dev *dev) { - int ret; u16 control; - void __iomem *base; + int tsize; - /* Ensure MSI-X is disabled while it is set up */ - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + /* + * Some devices require MSI-X to be enabled before the MSI-X + * registers can be accessed. Mask all the vectors to prevent + * interrupts coming in before they're fully set up. 
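The msix_map bitmap introduced above reduces table-entry bookkeeping to a small bitmap allocator. A stripped-down sketch of the allocation step (an illustrative helper, not code from the patch):

static int msix_alloc_slot(unsigned long *map, unsigned int nr_entries)
{
	unsigned int idx = find_first_zero_bit(map, nr_entries);

	if (idx >= nr_entries)
		return -ENOSPC;		/* every table entry already in use */

	__set_bit(idx, map);
	return idx;
}

Freeing is the mirror image: __clear_bit() on the entry number, which is what free_msix_irq() does on teardown.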
+ */ + pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL | + PCI_MSIX_FLAGS_ENABLE); pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); /* Request & Map MSI-X table region */ - base = msix_map_region(dev, msix_table_size(control)); - if (!base) + tsize = msix_table_size(control); + dev->msix_table_base = msix_map_region(dev, tsize); + if (!dev->msix_table_base) { + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); return -ENOMEM; + } + + /* Ensure that all table entries are masked. */ + msix_mask_all(dev->msix_table_base, tsize); + return 0; +} + +static ssize_t msix_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%s\n", "msix"); +} - ret = msix_setup_entries(dev, base, entries, nvec, affd); +static int msix_add_sysfs(struct pci_dev *pdev) +{ + struct device_attribute *msi_dev_attr; + struct attribute *msi_attr; + struct msi_desc *entry; + + for_each_new_pci_msi_entry(entry, pdev) { + msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); + if (!msi_dev_attr) + goto error_attrs; + + pdev->msi_irq_groups[0]->attrs[entry->msi_attrib.entry_nr] = &msi_dev_attr->attr; + sysfs_attr_init(&msi_dev_attr->attr); + msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", entry->irq); + if (!msi_dev_attr->attr.name) + goto error_attrs; + msi_dev_attr->attr.mode = 0444; + msi_dev_attr->show = msix_mode_show; + } + + return 0; + +error_attrs: + for_each_new_pci_msi_entry(entry, pdev) { + msi_attr = pdev->msi_irq_groups[0]->attrs[entry->msi_attrib.entry_nr]; + msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); + kfree(msi_attr->name); + kfree(msi_dev_attr); + } + return -ENOMEM; +} + +/** + * msix_setup_irqs - setup requested number of MSI-X entries + * @dev: pointer to the pci_dev data structure of MSI-X device function + * @entries: pointer to an array of struct msix_entry entries + * @nvec: number of @entries + * @affd: Optional pointer to enable automatic affinity assignment + * + * A return of zero indicates the successful setup of the requested IRQs + * or non-zero for otherwise. + **/ +static int msix_setup_irqs(struct pci_dev *dev, struct msix_entry *entries, + int nvec, struct irq_affinity *affd) +{ + int ret = 0; + + ret = msix_setup_entries(dev, entries, nvec, affd); if (ret) - return ret; + goto out_disable; ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX); if (ret) @@ -800,26 +829,33 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, if (ret) goto out_free; - /* - * Some devices require MSI-X to be enabled before we can touch the - * MSI-X registers. We need to mask all the vectors to prevent - * interrupts coming in before they're fully set up. 
- */ - pci_msix_clear_and_set_ctrl(dev, 0, - PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE); + msix_update_entries(dev, entries); - msix_program_entries(dev, entries); + if (!dev->msix_enabled) { + const struct attribute_group **groups; - ret = populate_msi_sysfs(dev); - if (ret) - goto out_free; + groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(groups)) { + ret = PTR_ERR(groups); + goto out_free; + } - /* Set MSI-X enabled bits and unmask the function */ - pci_intx_for_msi(dev, 0); - dev->msix_enabled = 1; - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); + dev->msi_irq_groups = groups; + + /* Set MSI-X enabled bits and unmask the function */ + pci_intx_for_msi(dev, 0); + dev->msix_enabled = 1; + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); + + pcibios_free_irq(dev); + } else { + ret = msix_add_sysfs(dev); + if (ret) + goto out_free; + } + + dev->msix_alloc_count += nvec; - pcibios_free_irq(dev); return 0; out_avail: @@ -831,7 +867,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, struct msi_desc *entry; int avail = 0; - for_each_pci_msi_entry(entry, dev) { + for_each_new_pci_msi_entry(entry, dev) { if (entry->irq != 0) avail++; } @@ -840,8 +876,27 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, } out_free: - free_msi_irqs(dev); + if (ret) { + struct msi_desc *desc, *tmp; + + for_each_new_pci_msi_entry_safe(desc, tmp, dev) { + if (desc->irq) { + BUG_ON(irq_has_action(desc->irq)); + pci_msix_teardown_irq(dev, desc->irq); + } + + __clear_bit(desc->msi_attrib.entry_nr, dev->msix_map); + list_del(&desc->list); + free_msi_entry(desc); + } + } +out_disable: + if (!dev->msix_enabled) { + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0); + kfree(dev->msix_map); + iounmap(dev->msix_table_base); + } return ret; } @@ -862,7 +917,7 @@ static int pci_msi_supported(struct pci_dev *dev, int nvec) if (!pci_msi_enable) return 0; - if (!dev || dev->no_msi || dev->current_state != PCI_D0) + if (!dev || dev->no_msi) return 0; /* @@ -930,7 +985,7 @@ static void pci_msi_shutdown(struct pci_dev *dev) /* Return the device with MSI unmasked as initial states */ mask = msi_mask(desc->msi_attrib.multi_cap); /* Keep cached state to be restored */ - __pci_msi_desc_mask_irq(desc, mask, ~mask); + __pci_msi_desc_mask_irq(desc, mask, 0); /* Restore dev->irq to its default pin-assertion IRQ */ dev->irq = desc->msi_attrib.default_irq; @@ -971,35 +1026,33 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd, int flags) { int nr_entries; - int i, j; - if (!pci_msi_supported(dev, nvec)) + if (!pci_msi_supported(dev, nvec) || dev->current_state != PCI_D0) return -EINVAL; nr_entries = pci_msix_vec_count(dev); if (nr_entries < 0) return nr_entries; - if (nvec > nr_entries && !(flags & PCI_IRQ_VIRTUAL)) - return nr_entries; - - if (entries) { - /* Check for any invalid entries */ - for (i = 0; i < nvec; i++) { - if (entries[i].entry >= nr_entries) - return -EINVAL; /* invalid entry */ - for (j = i + 1; j < nvec; j++) { - if (entries[i].entry == entries[j].entry) - return -EINVAL; /* duplicate entry */ - } - } - } + if (nr_entries - dev->msix_alloc_count == 0) + return -ENOSPC; + if ((nvec > nr_entries - dev->msix_alloc_count) && !(flags & PCI_IRQ_VIRTUAL)) + return nr_entries - dev->msix_alloc_count; /* Check whether driver already requested for MSI IRQ */ if (dev->msi_enabled) { pci_info(dev, "can't enable MSI-X (MSI IRQ 
already assigned)\n"); return -EINVAL; } - return msix_capability_init(dev, entries, nvec, affd); + + if (!dev->msix_enabled) { + dev->msix_map = kcalloc(BITS_TO_LONGS(nr_entries), sizeof(long), GFP_KERNEL); + if (!dev->msix_map) + return -ENOMEM; + if (msix_capability_init(dev)) + return -ENOMEM; + } + + return msix_setup_irqs(dev, entries, nvec, affd); } static void pci_msix_shutdown(struct pci_dev *dev) @@ -1022,6 +1075,8 @@ static void pci_msix_shutdown(struct pci_dev *dev) pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); pci_intx_for_msi(dev, 1); + kfree(dev->msix_map); + dev->msix_alloc_count = 0; dev->msix_enabled = 0; pcibios_alloc_irq(dev); } @@ -1031,11 +1086,30 @@ void pci_disable_msix(struct pci_dev *dev) if (!pci_msi_enable || !dev || !dev->msix_enabled) return; + mutex_lock(&dev->msix_mutex); pci_msix_shutdown(dev); free_msi_irqs(dev); + mutex_unlock(&dev->msix_mutex); } EXPORT_SYMBOL(pci_disable_msix); +/** + * pci_free_msix_irq_vector - free previously allocated IRQ for a device + * @dev: PCI device to operate on + * @irq: Linux IRQ number + **/ +void pci_free_msix_irq_vector(struct pci_dev *dev, unsigned int irq) +{ + if (!pci_msi_enable || !dev || !dev->msix_enabled) + return; + + mutex_lock(&dev->msix_mutex); + free_msix_irq(dev, irq); + dev->msix_alloc_count -= 1; + mutex_unlock(&dev->msix_mutex); +} +EXPORT_SYMBOL(pci_free_msix_irq_vector); + void pci_no_msi(void) { pci_msi_enable = 0; @@ -1059,7 +1133,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, int nvec; int rc; - if (!pci_msi_supported(dev, minvec)) + if (!pci_msi_supported(dev, minvec) || dev->current_state != PCI_D0) return -EINVAL; /* Check whether driver already requested MSI-X IRQs */ @@ -1133,7 +1207,9 @@ static int __pci_enable_msix_range(struct pci_dev *dev, return -ENOSPC; } + mutex_lock(&dev->msix_mutex); rc = __pci_enable_msix(dev, entries, nvec, affd, flags); + mutex_unlock(&dev->msix_mutex); if (rc == 0) return nvec; @@ -1168,6 +1244,38 @@ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, } EXPORT_SYMBOL(pci_enable_msix_range); +/** + * pci_add_msix_irq_vector - Add an additional MSI-X interrupt to a device + * @dev: pointer to the pci_dev data structure of MSI-X device function + * + * Add 1 MSI-X vector to the device function, after some vectors have already + * been allocated using pci_alloc_irq_vectors(). It returns a negative errno + * if an error occurs. If it succeeds, it returns the device-relative interrupt + * vector index (0-based) which can be passed to pci_irq_vector to retrieve the + * Linux IRQ number of that device vector. 
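To make the calling convention of the two new exports concrete, here is a hedged driver-side sketch; the driver name, handler and data are made up, and it assumes MSI-X has already been enabled via pci_alloc_irq_vectors():

static int mydrv_add_queue_irq(struct pci_dev *pdev, irq_handler_t handler,
			       void *data)
{
	int nr, ret;

	/* Returns the device-relative MSI-X vector index on success. */
	nr = pci_add_msix_irq_vector(pdev);
	if (nr < 0)
		return nr;

	ret = request_irq(pci_irq_vector(pdev, nr), handler, 0, "mydrv", data);
	if (ret) {
		/* pci_free_msix_irq_vector() takes the Linux IRQ number. */
		pci_free_msix_irq_vector(pdev, pci_irq_vector(pdev, nr));
		return ret;
	}

	return nr;
}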
+ **/ +int pci_add_msix_irq_vector(struct pci_dev *dev) +{ + struct msi_desc *entry; + int ret; + + if (WARN_ON_ONCE(!dev->msix_enabled)) + return -EINVAL; + + mutex_lock(&dev->msix_mutex); + ret = __pci_enable_msix(dev, NULL, 1, NULL, 0); + + if (ret == 0) { + entry = list_last_entry(dev_to_msi_list(&dev->dev), struct msi_desc, list); + mutex_unlock(&dev->msix_mutex); + return entry->msi_attrib.entry_nr; + } + + mutex_unlock(&dev->msix_mutex); + return ret; +} +EXPORT_SYMBOL(pci_add_msix_irq_vector); + /** * pci_alloc_irq_vectors_affinity - allocate multiple IRQs for a device * @dev: PCI device to operate on @@ -1366,14 +1474,14 @@ void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg) /** * pci_msi_domain_calc_hwirq - Generate a unique ID for an MSI source - * @dev: Pointer to the PCI device * @desc: Pointer to the MSI descriptor * * The ID number is only used within the irqdomain. */ -irq_hw_number_t pci_msi_domain_calc_hwirq(struct pci_dev *dev, - struct msi_desc *desc) +static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc) { + struct pci_dev *dev = msi_desc_to_pci_dev(desc); + return (irq_hw_number_t)desc->msi_attrib.entry_nr | pci_dev_id(dev) << 11 | (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 27; @@ -1421,17 +1529,12 @@ static int pci_msi_domain_handle_error(struct irq_domain *domain, return error; } -#ifdef GENERIC_MSI_DOMAIN_OPS static void pci_msi_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { arg->desc = desc; - arg->hwirq = pci_msi_domain_calc_hwirq(msi_desc_to_pci_dev(desc), - desc); + arg->hwirq = pci_msi_domain_calc_hwirq(desc); } -#else -#define pci_msi_domain_set_desc NULL -#endif static struct msi_domain_ops pci_msi_domain_ops_default = { .set_desc = pci_msi_domain_set_desc, @@ -1493,12 +1596,7 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) pci_msi_domain_update_chip_ops(info); - info->flags |= MSI_FLAG_ACTIVATE_EARLY; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) - info->flags |= MSI_FLAG_MUST_REACTIVATE; - - /* PCI-MSI is oneshot-safe */ - info->chip->flags |= IRQCHIP_ONESHOT_SAFE; + msi_domain_set_default_info_flags(info); domain = msi_create_irq_domain(fwnode, info, parent); if (!domain) @@ -1572,9 +1670,69 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) u32 rid = pci_dev_id(pdev); pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); - dom = of_msi_map_get_device_domain(&pdev->dev, rid); + dom = of_msi_map_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); if (!dom) - dom = iort_get_device_domain(&pdev->dev, rid); + dom = iort_get_device_domain(&pdev->dev, rid, + DOMAIN_BUS_PCI_MSI); return dom; } + +/** + * pci_dev_has_special_msi_domain - Check whether the device is handled by + * a non-standard PCI-MSI domain + * @pdev: The PCI device to check. + * + * Returns: True if the device irqdomain or the bus irqdomain is + * non-standard PCI/MSI. + */ +bool pci_dev_has_special_msi_domain(struct pci_dev *pdev) +{ + struct irq_domain *dom = dev_get_msi_domain(&pdev->dev); + + if (!dom) + dom = dev_get_msi_domain(&pdev->bus->dev); + + if (!dom) + return true; + + return dom->bus_token != DOMAIN_BUS_PCI_MSI; +} + #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ +#endif /* CONFIG_PCI_MSI */ + +void pci_msi_init(struct pci_dev *dev) +{ + u16 ctrl; + + /* + * Disable the MSI hardware to avoid screaming interrupts + * during boot. This is the power on reset default so + * usually this should be a noop. 
+ */ + dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!dev->msi_cap) + return; + + pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &ctrl); + if (ctrl & PCI_MSI_FLAGS_ENABLE) + pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, + ctrl & ~PCI_MSI_FLAGS_ENABLE); + + if (!(ctrl & PCI_MSI_FLAGS_64BIT)) + dev->no_64bit_msi = 1; +} + +void pci_msix_init(struct pci_dev *dev) +{ + u16 ctrl; + + dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!dev->msix_cap) + return; + + pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl); + if (ctrl & PCI_MSIX_FLAGS_ENABLE) + pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, + ctrl & ~PCI_MSIX_FLAGS_ENABLE); +} diff --git a/drivers/pci/of.c b/drivers/pci/of.c index 36891e7deee34da58c79083d99b75ed5113d236c..f74b082b8e4a8297731a67439e5e7fd54c972afe 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -42,7 +42,7 @@ void pci_set_bus_of_node(struct pci_bus *bus) } else { node = of_node_get(bus->self->dev.of_node); if (node && of_property_read_bool(node, "external-facing")) - bus->self->untrusted = true; + bus->self->external_facing = true; } bus->dev.of_node = node; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 0608aae72ccccfd67e5b4187e87a02593db19485..338a4638206967425a30c593abb79a7a9ba18e75 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -53,7 +53,7 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr, if (pdev->p2pdma->pool) size = gen_pool_size(pdev->p2pdma->pool); - return snprintf(buf, PAGE_SIZE, "%zd\n", size); + return sysfs_emit(buf, "%zd\n", size); } static DEVICE_ATTR_RO(size); @@ -66,7 +66,7 @@ static ssize_t available_show(struct device *dev, struct device_attribute *attr, if (pdev->p2pdma->pool) avail = gen_pool_avail(pdev->p2pdma->pool); - return snprintf(buf, PAGE_SIZE, "%zd\n", avail); + return sysfs_emit(buf, "%zd\n", avail); } static DEVICE_ATTR_RO(available); @@ -75,8 +75,7 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr, { struct pci_dev *pdev = to_pci_dev(dev); - return snprintf(buf, PAGE_SIZE, "%d\n", - pdev->p2pdma->p2pmem_published); + return sysfs_emit(buf, "%d\n", pdev->p2pdma->p2pmem_published); } static DEVICE_ATTR_RO(published); diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index e30c2a78a88f58a769267f0b9103c8c4a4fe10e3..088ca501e3193ed5ea8627cbf6cca5dec4313f68 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1138,7 +1138,7 @@ void acpi_pci_add_bus(struct pci_bus *bus) return; obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 3, - RESET_DELAY_DSM, NULL); + DSM_PCI_POWER_ON_RESET_DELAY, NULL); if (!obj) return; @@ -1203,7 +1203,7 @@ static void pci_acpi_optimize_delay(struct pci_dev *pdev, pdev->d3cold_delay = 0; obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 3, - FUNCTION_DELAY_DSM, NULL); + DSM_PCI_DEVICE_READINESS_DURATIONS, NULL); if (!obj) return; @@ -1223,7 +1223,7 @@ static void pci_acpi_optimize_delay(struct pci_dev *pdev, ACPI_FREE(obj); } -static void pci_acpi_set_untrusted(struct pci_dev *dev) +static void pci_acpi_set_external_facing(struct pci_dev *dev) { u8 val; @@ -1234,11 +1234,10 @@ static void pci_acpi_set_untrusted(struct pci_dev *dev) /* * These root ports expose PCIe (including DMA) outside of the - * system so make sure we treat them and everything behind as - * untrusted. + * system. Everything downstream from them is external. 
*/ if (val) - dev->untrusted = 1; + dev->external_facing = 1; } static void pci_acpi_setup(struct device *dev) @@ -1250,7 +1249,8 @@ static void pci_acpi_setup(struct device *dev) return; pci_acpi_optimize_delay(pci_dev, adev->handle); - pci_acpi_set_untrusted(pci_dev); + pci_acpi_set_external_facing(pci_dev); + pci_acpi_add_edr_notifier(pci_dev); pci_acpi_add_pm_notifier(adev, pci_dev); if (!adev->wakeup.flags.valid) @@ -1278,6 +1278,7 @@ static void pci_acpi_cleanup(struct device *dev) if (!adev) return; + pci_acpi_remove_edr_notifier(pci_dev); pci_acpi_remove_pm_notifier(adev); if (adev->wakeup.flags.valid) { acpi_device_power_remove_dependent(adev, dev); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 5ea612a15550eef050801cfdf1f52826022e3781..5f90a68e61fc441057ccaa1601bbc2e8459bc41a 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -87,6 +87,92 @@ static void pci_free_dynids(struct pci_driver *drv) spin_unlock(&drv->dynids.lock); } +/** + * pci_match_id - See if a pci device matches a given pci_id table + * @ids: array of PCI device id structures to search in + * @dev: the PCI device structure to match against. + * + * Used by a driver to check whether a PCI device present in the + * system is in its list of supported devices. Returns the matching + * pci_device_id structure or %NULL if there is no match. + * + * Deprecated, don't use this as it will not catch any dynamic ids + * that a driver might want to check for. + */ +const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, + struct pci_dev *dev) +{ + if (ids) { + while (ids->vendor || ids->subvendor || ids->class_mask) { + if (pci_match_one_device(ids, dev)) + return ids; + ids++; + } + } + return NULL; +} +EXPORT_SYMBOL(pci_match_id); + +static const struct pci_device_id pci_device_id_any = { + .vendor = PCI_ANY_ID, + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, +}; + +/** + * pci_match_device - Tell if a PCI device structure has a matching PCI device id structure + * @drv: the PCI driver to match against + * @dev: the PCI device structure to match against + * + * Used by a driver to check whether a PCI device present in the + * system is in its list of supported devices. Returns the matching + * pci_device_id structure or %NULL if there is no match. + */ +static const struct pci_device_id *pci_match_device(struct pci_driver *drv, + struct pci_dev *dev) +{ + struct pci_dynid *dynid; + const struct pci_device_id *found_id = NULL, *ids; + + /* When driver_override is set, only bind to the matching driver */ + if (dev->driver_override && strcmp(dev->driver_override, drv->name)) + return NULL; + + /* Look at the dynamic ids first, before the static ones */ + spin_lock(&drv->dynids.lock); + list_for_each_entry(dynid, &drv->dynids.list, node) { + if (pci_match_one_device(&dynid->id, dev)) { + found_id = &dynid->id; + break; + } + } + spin_unlock(&drv->dynids.lock); + + if (found_id) + return found_id; + + for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); + ids = found_id + 1) { + /* + * The match table is split based on driver_override. + * In case override_only was set, enforce driver_override + * matching. 
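+ *
+ * (Editorial example: a meta driver such as vfio-pci can mark its
+ * match entries override_only so that it binds a device only when an
+ * administrator has explicitly written the driver's name to that
+ * device's driver_override sysfs attribute.)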
+ */ + if (found_id->override_only) { + if (dev->driver_override) + return found_id; + } else { + return found_id; + } + } + + /* driver_override will always match, send a dummy id */ + if (dev->driver_override) + return &pci_device_id_any; + return NULL; +} + /** * store_new_id - sysfs frontend to pci_add_dynid() * @driver: target device driver @@ -123,7 +209,7 @@ static ssize_t new_id_store(struct device_driver *driver, const char *buf, pdev->subsystem_device = subdevice; pdev->class = class; - if (pci_match_id(pdrv->id_table, pdev)) + if (pci_match_device(pdrv, pdev)) retval = -EEXIST; kfree(pdev); @@ -206,78 +292,6 @@ static struct attribute *pci_drv_attrs[] = { }; ATTRIBUTE_GROUPS(pci_drv); -/** - * pci_match_id - See if a pci device matches a given pci_id table - * @ids: array of PCI device id structures to search in - * @dev: the PCI device structure to match against. - * - * Used by a driver to check whether a PCI device present in the - * system is in its list of supported devices. Returns the matching - * pci_device_id structure or %NULL if there is no match. - * - * Deprecated, don't use this as it will not catch any dynamic ids - * that a driver might want to check for. - */ -const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, - struct pci_dev *dev) -{ - if (ids) { - while (ids->vendor || ids->subvendor || ids->class_mask) { - if (pci_match_one_device(ids, dev)) - return ids; - ids++; - } - } - return NULL; -} -EXPORT_SYMBOL(pci_match_id); - -static const struct pci_device_id pci_device_id_any = { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, -}; - -/** - * pci_match_device - Tell if a PCI device structure has a matching PCI device id structure - * @drv: the PCI driver to match against - * @dev: the PCI device structure to match against - * - * Used by a driver to check whether a PCI device present in the - * system is in its list of supported devices. Returns the matching - * pci_device_id structure or %NULL if there is no match. 
- */ -static const struct pci_device_id *pci_match_device(struct pci_driver *drv, - struct pci_dev *dev) -{ - struct pci_dynid *dynid; - const struct pci_device_id *found_id = NULL; - - /* When driver_override is set, only bind to the matching driver */ - if (dev->driver_override && strcmp(dev->driver_override, drv->name)) - return NULL; - - /* Look at the dynamic ids first, before the static ones */ - spin_lock(&drv->dynids.lock); - list_for_each_entry(dynid, &drv->dynids.list, node) { - if (pci_match_one_device(&dynid->id, dev)) { - found_id = &dynid->id; - break; - } - } - spin_unlock(&drv->dynids.lock); - - if (!found_id) - found_id = pci_match_id(drv->id_table, dev); - - /* driver_override will always match, send a dummy id */ - if (!found_id && dev->driver_override) - found_id = &pci_device_id_any; - - return found_id; -} - struct drv_dev_and_id { struct pci_driver *drv; struct pci_dev *dev; diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index a5910f9428576672d0127c836e9d08019c86153e..8bc827d7d70e9cc8033c47536aa20eaf4850a371 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -33,6 +33,21 @@ #include #include "pci.h" +static bool device_has_acpi_name(struct device *dev) +{ +#ifdef CONFIG_ACPI + acpi_handle handle = ACPI_HANDLE(dev); + + if (!handle) + return false; + + return acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, + 1 << DSM_PCI_DEVICE_NAME); +#else + return false; +#endif +} + #ifdef CONFIG_DMI enum smbios_attr_enum { SMBIOS_ATTR_NONE = 0, @@ -45,13 +60,9 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf, { const struct dmi_device *dmi; struct dmi_dev_onboard *donboard; - int domain_nr; - int bus; - int devfn; - - domain_nr = pci_domain_nr(pdev->bus); - bus = pdev->bus->number; - devfn = pdev->devfn; + int domain_nr = pci_domain_nr(pdev->bus); + int bus = pdev->bus->number; + int devfn = pdev->devfn; dmi = NULL; while ((dmi = dmi_find_device(DMI_DEV_TYPE_DEV_ONBOARD, @@ -156,29 +167,31 @@ enum acpi_attr_enum { ACPI_ATTR_INDEX_SHOW, }; -static void dsm_label_utf16s_to_utf8s(union acpi_object *obj, char *buf) +static int dsm_label_utf16s_to_utf8s(union acpi_object *obj, char *buf) { int len; + len = utf16s_to_utf8s((const wchar_t *)obj->buffer.pointer, obj->buffer.length, UTF16_LITTLE_ENDIAN, - buf, PAGE_SIZE); - buf[len] = '\n'; + buf, PAGE_SIZE - 1); + buf[len++] = '\n'; + + return len; } static int dsm_get_label(struct device *dev, char *buf, enum acpi_attr_enum attr) { - acpi_handle handle; + acpi_handle handle = ACPI_HANDLE(dev); union acpi_object *obj, *tmp; - int len = -1; + int len = 0; - handle = ACPI_HANDLE(dev); if (!handle) return -1; obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 0x2, - DEVICE_LABEL_DSM, NULL); + DSM_PCI_DEVICE_NAME, NULL); if (!obj) return -1; @@ -193,32 +206,19 @@ static int dsm_get_label(struct device *dev, char *buf, * this entry must return a null string. */ if (attr == ACPI_ATTR_INDEX_SHOW) { - scnprintf(buf, PAGE_SIZE, "%llu\n", tmp->integer.value); + len = sysfs_emit(buf, "%llu\n", tmp->integer.value); } else if (attr == ACPI_ATTR_LABEL_SHOW) { if (tmp[1].type == ACPI_TYPE_STRING) - scnprintf(buf, PAGE_SIZE, "%s\n", - tmp[1].string.pointer); + len = sysfs_emit(buf, "%s\n", + tmp[1].string.pointer); else if (tmp[1].type == ACPI_TYPE_BUFFER) - dsm_label_utf16s_to_utf8s(tmp + 1, buf); + len = dsm_label_utf16s_to_utf8s(tmp + 1, buf); } - len = strlen(buf) > 0 ? 
strlen(buf) : -1; } ACPI_FREE(obj); - return len; -} - -static bool device_has_dsm(struct device *dev) -{ - acpi_handle handle; - - handle = ACPI_HANDLE(dev); - if (!handle) - return false; - - return !!acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, - 1 << DEVICE_LABEL_DSM); + return len > 0 ? len : -1; } static umode_t acpi_index_string_exist(struct kobject *kobj, @@ -228,7 +228,7 @@ static umode_t acpi_index_string_exist(struct kobject *kobj, dev = kobj_to_dev(kobj); - if (device_has_dsm(dev)) + if (device_has_acpi_name(dev)) return S_IRUGO; return 0; @@ -287,16 +287,11 @@ static inline int pci_remove_acpi_index_label_files(struct pci_dev *pdev) { return -1; } - -static inline bool device_has_dsm(struct device *dev) -{ - return false; -} #endif void pci_create_firmware_label_files(struct pci_dev *pdev) { - if (device_has_dsm(&pdev->dev)) + if (device_has_acpi_name(&pdev->dev)) pci_create_acpi_index_label_files(pdev); else pci_create_smbiosname_file(pdev); @@ -304,7 +299,7 @@ void pci_create_firmware_label_files(struct pci_dev *pdev) void pci_remove_firmware_label_files(struct pci_dev *pdev) { - if (device_has_dsm(&pdev->dev)) + if (device_has_acpi_name(&pdev->dev)) pci_remove_acpi_index_label_files(pdev); else pci_remove_smbiosname_file(pdev); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index e401f040f15716e48890e0df40294f92c02e230f..95097ce2537807e415d0dbef2e09374f9fd2725e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1124,7 +1124,7 @@ static void pci_remove_resource_files(struct pci_dev *pdev) { int i; - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct bin_attribute *res_attr; res_attr = pdev->res_attr[i]; @@ -1195,7 +1195,7 @@ static int pci_create_resource_files(struct pci_dev *pdev) int retval; /* Expose the PCI resources from this device as files */ - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { /* skip empty resources */ if (!pci_resource_len(pdev, i)) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 436bedbaf40b990b83258e712b5112058f5e396e..bcf2dad2facebafc58f8cb192e4836dcca65ad4c 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -674,7 +674,7 @@ struct resource *pci_find_resource(struct pci_dev *dev, struct resource *res) { int i; - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct resource *r = &dev->resource[i]; if (r->start && resource_contains(r, res)) @@ -1472,7 +1472,7 @@ void pci_restore_state(struct pci_dev *dev) pci_restore_rebar_state(dev); pci_restore_dpc_state(dev); - pci_cleanup_aer_error_status_regs(dev); + pci_aer_clear_status(dev); pci_restore_aer_state(dev); pci_restore_config_space(dev); @@ -1970,6 +1970,14 @@ int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) } EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state); +void pcie_clear_device_status(struct pci_dev *dev) +{ + u16 sta; + + pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta); + pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); +} + /** * pcie_clear_root_pme_status - Clear root port PME interrupt status. * @dev: PCIe root port or event collector. 
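/*
 * [Editorial aside, not part of the patch] pcie_clear_device_status()
 * above works because the error bits of the PCIe Device Status register
 * are RW1C ("write 1 to clear"): writing back the value just read
 * clears exactly the bits that were set. A hypothetical helper that
 * clears a single bit looks like this (the function name is
 * illustrative only):
 */
static void clear_ur_detected(struct pci_dev *dev)
{
	/* Writing a 1 to an RW1C bit clears it; 0 bits are untouched */
	pcie_capability_write_word(dev, PCI_EXP_DEVSTA, PCI_EXP_DEVSTA_URD);
}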
@@ -3765,7 +3773,7 @@ void pci_release_selected_regions(struct pci_dev *pdev, int bars) { int i; - for (i = 0; i < 6; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) if (bars & (1 << i)) pci_release_region(pdev, i); } @@ -3776,7 +3784,7 @@ static int __pci_request_selected_regions(struct pci_dev *pdev, int bars, { int i; - for (i = 0; i < 6; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) if (bars & (1 << i)) if (__pci_request_region(pdev, i, res_name, excl)) goto err_out; @@ -3824,7 +3832,7 @@ EXPORT_SYMBOL(pci_request_selected_regions_exclusive); void pci_release_regions(struct pci_dev *pdev) { - pci_release_selected_regions(pdev, (1 << 6) - 1); + pci_release_selected_regions(pdev, (1 << PCI_STD_NUM_BARS) - 1); } EXPORT_SYMBOL(pci_release_regions); @@ -3843,7 +3851,8 @@ EXPORT_SYMBOL(pci_release_regions); */ int pci_request_regions(struct pci_dev *pdev, const char *res_name) { - return pci_request_selected_regions(pdev, ((1 << 6) - 1), res_name); + return pci_request_selected_regions(pdev, + ((1 << PCI_STD_NUM_BARS) - 1), res_name); } EXPORT_SYMBOL(pci_request_regions); @@ -3865,7 +3874,7 @@ EXPORT_SYMBOL(pci_request_regions); int pci_request_regions_exclusive(struct pci_dev *pdev, const char *res_name) { return pci_request_selected_regions_exclusive(pdev, - ((1 << 6) - 1), res_name); + ((1 << PCI_STD_NUM_BARS) - 1), res_name); } EXPORT_SYMBOL(pci_request_regions_exclusive); @@ -4896,7 +4905,7 @@ static void pci_dev_lock(struct pci_dev *dev) } /* Return 1 on successful lock, 0 on contention */ -static int pci_dev_trylock(struct pci_dev *dev) +int pci_dev_trylock(struct pci_dev *dev) { if (pci_cfg_access_trylock(dev)) { if (device_trylock(&dev->dev)) @@ -4906,12 +4915,14 @@ static int pci_dev_trylock(struct pci_dev *dev) return 0; } +EXPORT_SYMBOL_GPL(pci_dev_trylock); -static void pci_dev_unlock(struct pci_dev *dev) +void pci_dev_unlock(struct pci_dev *dev) { device_unlock(&dev->dev); pci_cfg_access_unlock(dev); } +EXPORT_SYMBOL_GPL(pci_dev_unlock); static void pci_dev_save_and_disable(struct pci_dev *dev) { @@ -6059,7 +6070,9 @@ bool pci_devs_are_dma_aliases(struct pci_dev *dev1, struct pci_dev *dev2) return (dev1->dma_alias_mask && test_bit(dev2->devfn, dev1->dma_alias_mask)) || (dev2->dma_alias_mask && - test_bit(dev1->devfn, dev2->dma_alias_mask)); + test_bit(dev1->devfn, dev2->dma_alias_mask)) || + pci_real_dma_dev(dev1) == dev2 || + pci_real_dma_dev(dev2) == dev1; } bool pci_device_is_present(struct pci_dev *pdev) @@ -6083,6 +6096,21 @@ void pci_ignore_hotplug(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_ignore_hotplug); +/** + * pci_real_dma_dev - Get PCI DMA device for PCI device + * @dev: the PCI device that may have a PCI DMA alias + * + * Permits the platform to provide architecture-specific functionality to + * devices needing to alias DMA to another PCI device on another PCI bus. If + * the PCI device is on the same bus, it is recommended to use + * pci_add_dma_alias(). This is the default implementation. Architecture + * implementations can override this. 
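+ *
+ * (Editorial note: one user of this hook is Intel VMD, where the
+ * device that actually emits DMA upstream is a pci_dev in another
+ * domain; an architecture override returns that device so checks such
+ * as pci_devs_are_dma_aliases() above resolve correctly.)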
+ */ +struct pci_dev __weak *pci_real_dma_dev(struct pci_dev *dev) +{ + return dev; +} + resource_size_t __weak pcibios_default_alignment(void) { return 0; @@ -6293,7 +6321,7 @@ static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) spin_lock(&resource_alignment_lock); if (resource_alignment_param) - count = snprintf(buf, PAGE_SIZE, "%s", resource_alignment_param); + count = sysfs_emit(buf, "%s", resource_alignment_param); spin_unlock(&resource_alignment_lock); /* diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b66ffea134dc650a70ffab95c7d140df8b56ac59..74a9f61bf0577145d74b9f844a9c2e263d0cd603 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -92,6 +92,7 @@ void pci_refresh_power_state(struct pci_dev *dev); void pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); +void pcie_clear_device_status(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); @@ -104,6 +105,8 @@ void pci_config_pm_runtime_get(struct pci_dev *dev); void pci_config_pm_runtime_put(struct pci_dev *dev); void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); +void pci_msi_init(struct pci_dev *dev); +void pci_msix_init(struct pci_dev *dev); void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); @@ -410,6 +413,8 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 +#define PCI_DPC_RECOVERED 1 +#define PCI_DPC_RECOVERING 2 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) { @@ -449,12 +454,43 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); #endif /* CONFIG_PCIEAER */ +#ifdef CONFIG_PCIEPORTBUS +/* Cached RCEC Endpoint Association */ +struct rcec_ea { + u8 nextbusn; + u8 lastbusn; + u32 bitmap; +}; +#endif + #ifdef CONFIG_PCIE_DPC void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); +void pci_dpc_init(struct pci_dev *pdev); +void dpc_process_error(struct pci_dev *pdev); +pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); +bool pci_dpc_recovered(struct pci_dev *pdev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) {} static inline void pci_restore_dpc_state(struct pci_dev *dev) {} +static inline void pci_dpc_init(struct pci_dev *pdev) {} +static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } +#endif + +#ifdef CONFIG_PCIEPORTBUS +void pci_rcec_init(struct pci_dev *dev); +void pci_rcec_exit(struct pci_dev *dev); +void pcie_link_rcec(struct pci_dev *rcec); +void pcie_walk_rcec(struct pci_dev *rcec, + int (*cb)(struct pci_dev *, void *), + void *userdata); +#else +static inline void pci_rcec_init(struct pci_dev *dev) {} +static inline void pci_rcec_exit(struct pci_dev *dev) {} +static inline void pcie_link_rcec(struct pci_dev *rcec) {} +static inline void pcie_walk_rcec(struct pci_dev *rcec, + int (*cb)(struct pci_dev *, void *), + void *userdata) {} #endif #ifdef CONFIG_PCI_ATS @@ -466,6 +502,18 @@ static inline void pci_ats_init(struct pci_dev *d) { } static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ +#ifdef CONFIG_PCI_PRI +void pci_pri_init(struct 
pci_dev *dev); +#else +static inline void pci_pri_init(struct pci_dev *dev) { } +#endif + +#ifdef CONFIG_PCI_PASID +void pci_pasid_init(struct pci_dev *dev); +#else +static inline void pci_pasid_init(struct pci_dev *dev) { } +#endif + #ifdef CONFIG_PCI_IOV int pci_iov_init(struct pci_dev *dev); void pci_iov_release(struct pci_dev *dev); @@ -535,8 +583,9 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) #endif /* PCI error reporting and recovery */ -void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, - u32 service); +pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, + pci_channel_state_t state, + pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); #ifdef CONFIG_PCIEASPM @@ -659,13 +708,15 @@ void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); -void pci_aer_clear_device_status(struct pci_dev *dev); +int pci_aer_clear_status(struct pci_dev *dev); +int pci_aer_raw_clear_status(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } -static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } +static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } +static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; } #endif #ifdef CONFIG_ACPI diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index a5a174d70e056ff753388118f0a0215c61d7b60f..2dc4ef1141d7333cb14228f4d0fc529b1bfeee5f 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -151,3 +151,13 @@ config PCIE_BW This enables PCI Express Bandwidth Change Notification. If you know link width or rate changes occur only to correct unreliable links, you may answer Y. + +config PCIE_EDR + bool "PCI Express Error Disconnect Recover support" + depends on PCIE_DPC && ACPI + help + This option adds Error Disconnect Recover support as specified + in the Downstream Port Containment Related Enhancements ECN to + the PCI Firmware Specification r3.2. Enable this if you want to + support hybrid DPC model which uses both firmware and OS to + implement DPC. 
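/*
 * [Editorial aside, not part of the patch] pcie_walk_rcec(), declared
 * in drivers/pci/pci.h above, visits the RCiEPs associated with a Root
 * Complex Event Collector using the same callback shape as
 * pci_walk_bus(). A hypothetical caller (report_one() is an
 * illustrative name):
 */
static int report_one(struct pci_dev *dev, void *data)
{
	pci_info(dev, "RCiEP associated with RCEC\n");
	return 0;	/* a non-zero return would stop the walk */
}

static void report_rcieps(struct pci_dev *rcec)
{
	pcie_walk_rcec(rcec, report_one, NULL);
}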
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index efb9d2e71e9eecad1bc0298692d166114de5be6a..d9697892fa3e9380b0d925b6255abce22c6a6414 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -2,7 +2,7 @@ # # Makefile for PCI Express features and port driver -pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o +pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o rcec.o obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o @@ -13,3 +13,4 @@ obj-$(CONFIG_PCIE_PME) += pme.o obj-$(CONFIG_PCIE_DPC) += dpc.o obj-$(CONFIG_PCIE_PTM) += ptm.o obj-$(CONFIG_PCIE_BW) += bw_notification.o +obj-$(CONFIG_PCIE_EDR) += edr.o diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 4a818b07a1afb72ba197094a939f26ebed991873..0bc2f1ec893efc5acfedf59dfa831c1581c16a1e 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -102,6 +102,7 @@ struct aer_stats { #define ERR_UNCOR_ID(d) (d >> 16) static int pcie_aer_disable; +static pci_ers_result_t aer_root_reset(struct pci_dev *dev); void pci_no_aer(void) { @@ -135,22 +136,18 @@ static const char * const ecrc_policy_str[] = { */ static int enable_ecrc_checking(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 reg32; - if (!pci_is_pcie(dev)) + if (!aer) return -ENODEV; - pos = dev->aer_cap; - if (!pos) - return -ENODEV; - - pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, ®32); if (reg32 & PCI_ERR_CAP_ECRC_GENC) reg32 |= PCI_ERR_CAP_ECRC_GENE; if (reg32 & PCI_ERR_CAP_ECRC_CHKC) reg32 |= PCI_ERR_CAP_ECRC_CHKE; - pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, reg32); return 0; } @@ -163,19 +160,15 @@ static int enable_ecrc_checking(struct pci_dev *dev) */ static int disable_ecrc_checking(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 reg32; - if (!pci_is_pcie(dev)) - return -ENODEV; - - pos = dev->aer_cap; - if (!pos) + if (!aer) return -ENODEV; - pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, ®32); reg32 &= ~(PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE); - pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, reg32); return 0; } @@ -216,142 +209,22 @@ void pcie_ecrc_get_policy(char *str) } #endif /* CONFIG_PCIE_ECRC */ -#ifdef CONFIG_ACPI_APEI -static inline int hest_match_pci(struct acpi_hest_aer_common *p, - struct pci_dev *pci) -{ - return ACPI_HEST_SEGMENT(p->bus) == pci_domain_nr(pci->bus) && - ACPI_HEST_BUS(p->bus) == pci->bus->number && - p->device == PCI_SLOT(pci->devfn) && - p->function == PCI_FUNC(pci->devfn); -} - -static inline bool hest_match_type(struct acpi_hest_header *hest_hdr, - struct pci_dev *dev) -{ - u16 hest_type = hest_hdr->type; - u8 pcie_type = pci_pcie_type(dev); - - if ((hest_type == ACPI_HEST_TYPE_AER_ROOT_PORT && - pcie_type == PCI_EXP_TYPE_ROOT_PORT) || - (hest_type == ACPI_HEST_TYPE_AER_ENDPOINT && - pcie_type == PCI_EXP_TYPE_ENDPOINT) || - (hest_type == ACPI_HEST_TYPE_AER_BRIDGE && - (dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)) - return true; - return false; -} - -struct aer_hest_parse_info { - struct pci_dev *pci_dev; - int firmware_first; -}; - -static int hest_source_is_pcie_aer(struct acpi_hest_header *hest_hdr) -{ - if (hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT || - hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT || - hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE) - return 1; - return 0; -} - -static int aer_hest_parse(struct 
acpi_hest_header *hest_hdr, void *data) -{ - struct aer_hest_parse_info *info = data; - struct acpi_hest_aer_common *p; - int ff; - - if (!hest_source_is_pcie_aer(hest_hdr)) - return 0; - - p = (struct acpi_hest_aer_common *)(hest_hdr + 1); - ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST); - - /* - * If no specific device is supplied, determine whether - * FIRMWARE_FIRST is set for *any* PCIe device. - */ - if (!info->pci_dev) { - info->firmware_first |= ff; - return 0; - } - - /* Otherwise, check the specific device */ - if (p->flags & ACPI_HEST_GLOBAL) { - if (hest_match_type(hest_hdr, info->pci_dev)) - info->firmware_first = ff; - } else - if (hest_match_pci(p, info->pci_dev)) - info->firmware_first = ff; - - return 0; -} +#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ + PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE) -static void aer_set_firmware_first(struct pci_dev *pci_dev) +int pcie_aer_is_native(struct pci_dev *dev) { - int rc; - struct aer_hest_parse_info info = { - .pci_dev = pci_dev, - .firmware_first = 0, - }; - - rc = apei_hest_parse(aer_hest_parse, &info); + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); - if (rc) - pci_dev->__aer_firmware_first = 0; - else - pci_dev->__aer_firmware_first = info.firmware_first; - pci_dev->__aer_firmware_first_valid = 1; -} - -int pcie_aer_get_firmware_first(struct pci_dev *dev) -{ - if (!pci_is_pcie(dev)) - return 0; - - if (pcie_ports_native) + if (!dev->aer_cap) return 0; - if (!dev->__aer_firmware_first_valid) - aer_set_firmware_first(dev); - return dev->__aer_firmware_first; + return pcie_ports_native || host->native_aer; } -static bool aer_firmware_first; - -/** - * aer_acpi_firmware_first - Check if APEI should control AER. - */ -bool aer_acpi_firmware_first(void) -{ - static bool parsed = false; - struct aer_hest_parse_info info = { - .pci_dev = NULL, /* Check all PCIe devices */ - .firmware_first = 0, - }; - - if (pcie_ports_native) - return false; - - if (!parsed) { - apei_hest_parse(aer_hest_parse, &info); - aer_firmware_first = info.firmware_first; - parsed = true; - } - return aer_firmware_first; -} -#endif - -#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ - PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE) - int pci_enable_pcie_error_reporting(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) - return -EIO; - - if (!dev->aer_cap) + if (!pcie_aer_is_native(dev)) return -EIO; return pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS); @@ -360,7 +233,7 @@ EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting); int pci_disable_pcie_error_reporting(struct pci_dev *dev) { - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; return pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, @@ -368,96 +241,90 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting); -void pci_aer_clear_device_status(struct pci_dev *dev) +int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { - u16 sta; - - pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta); - pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); -} - -int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) -{ - int pos; + int aer = dev->aer_cap; u32 status, sev; - pos = dev->aer_cap; - if (!pos) - return -EIO; - - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return -EIO; /* Clear status bits for ERR_NONFATAL errors only */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - 
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, &sev); status &= ~sev; if (status) - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); return 0; } -EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status); +EXPORT_SYMBOL_GPL(pci_aer_clear_nonfatal_status); void pci_aer_clear_fatal_status(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 status, sev; - pos = dev->aer_cap; - if (!pos) - return; - - if (pcie_aer_get_firmware_first(dev)) + if (!pcie_aer_is_native(dev)) return; /* Clear status bits for ERR_FATAL errors only */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, &sev); status &= sev; if (status) - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); } -int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) +/** + * pci_aer_raw_clear_status - Clear AER error registers. + * @dev: the PCI device + * + * Clearing AER error status registers unconditionally, regardless of + * whether they're owned by firmware or the OS. + * + * Returns 0 on success, or negative on failure. + */ +int pci_aer_raw_clear_status(struct pci_dev *dev) { - int pos; + int aer = dev->aer_cap; u32 status; int port_type; - if (!pci_is_pcie(dev)) - return -ENODEV; - - pos = dev->aer_cap; - if (!pos) - return -EIO; - - if (pcie_aer_get_firmware_first(dev)) + if (!aer) return -EIO; port_type = pci_pcie_type(dev); - if (port_type == PCI_EXP_TYPE_ROOT_PORT) { - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status); - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, status); + if (port_type == PCI_EXP_TYPE_ROOT_PORT || + port_type == PCI_EXP_TYPE_RC_EC) { + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_STATUS, status); } - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, &status); - pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, status); + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS, status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status); return 0; } +int pci_aer_clear_status(struct pci_dev *dev) +{ + if (!pcie_aer_is_native(dev)) + return -EIO; + + return pci_aer_raw_clear_status(dev); +} + void pci_save_aer_state(struct pci_dev *dev) { + int aer = dev->aer_cap; struct pci_cap_saved_state *save_state; u32 *cap; - int pos; - pos = dev->aer_cap; - if (!pos) + if (!aer) return; save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_ERR); @@ -465,22 +332,21 @@ void pci_save_aer_state(struct pci_dev *dev) return; cap = &save_state->cap.data[0]; - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, cap++); - pci_read_config_dword(dev, pos + PCI_ERR_CAP, cap++); + pci_read_config_dword(dev, aer + 
PCI_ERR_UNCOR_MASK, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, cap++); if (pcie_cap_has_rtctl(dev)) - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, cap++); + pci_read_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, cap++); } void pci_restore_aer_state(struct pci_dev *dev) { + int aer = dev->aer_cap; struct pci_cap_saved_state *save_state; u32 *cap; - int pos; - pos = dev->aer_cap; - if (!pos) + if (!aer) return; save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_ERR); @@ -488,12 +354,12 @@ void pci_restore_aer_state(struct pci_dev *dev) return; cap = &save_state->cap.data[0]; - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, *cap++); - pci_write_config_dword(dev, pos + PCI_ERR_CAP, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_CAP, *cap++); if (pcie_cap_has_rtctl(dev)) - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, *cap++); + pci_write_config_dword(dev, aer + PCI_ERR_ROOT_COMMAND, *cap++); } void pci_aer_init(struct pci_dev *dev) @@ -515,7 +381,7 @@ void pci_aer_init(struct pci_dev *dev) n = pcie_cap_has_rtctl(dev) ? 5 : 4; pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_ERR, sizeof(u32) * n); - pci_cleanup_aer_error_status_regs(dev); + pci_aer_clear_status(dev); } void pci_aer_exit(struct pci_dev *dev) @@ -637,21 +503,23 @@ static const char *aer_agent_string[] = { char *buf) \ { \ unsigned int i; \ - char *str = buf; \ struct pci_dev *pdev = to_pci_dev(dev); \ u64 *stats = pdev->aer_stats->stats_array; \ + size_t len = 0; \ \ for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \ if (strings_array[i]) \ - str += sprintf(str, "%s %llu\n", \ - strings_array[i], stats[i]); \ + len += sysfs_emit_at(buf, len, "%s %llu\n", \ + strings_array[i], \ + stats[i]); \ else if (stats[i]) \ - str += sprintf(str, #stats_array "_bit[%d] %llu\n",\ - i, stats[i]); \ + len += sysfs_emit_at(buf, len, \ + #stats_array "_bit[%d] %llu\n",\ + i, stats[i]); \ } \ - str += sprintf(str, "TOTAL_%s %llu\n", total_string, \ - pdev->aer_stats->total_field); \ - return str-buf; \ + len += sysfs_emit_at(buf, len, "TOTAL_%s %llu\n", total_string, \ + pdev->aer_stats->total_field); \ + return len; \ } \ static DEVICE_ATTR_RO(name) @@ -671,7 +539,7 @@ aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs, char *buf) \ { \ struct pci_dev *pdev = to_pci_dev(dev); \ - return sprintf(buf, "%llu\n", pdev->aer_stats->field); \ + return sysfs_emit(buf, "%llu\n", pdev->aer_stats->field); \ } \ static DEVICE_ATTR_RO(name) @@ -704,7 +572,8 @@ static umode_t aer_stats_attrs_are_visible(struct kobject *kobj, if ((a == &dev_attr_aer_rootport_total_err_cor.attr || a == &dev_attr_aer_rootport_total_err_fatal.attr || a == &dev_attr_aer_rootport_total_err_nonfatal.attr) && - pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT) + ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT) && + (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))) return 0; return a->mode; @@ -924,7 +793,7 @@ static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev) */ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info) { - int pos; + int aer = 
dev->aer_cap; u32 status, mask; u16 reg16; @@ -959,17 +828,16 @@ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info) if (!(reg16 & PCI_EXP_AER_FLAGS)) return false; - pos = dev->aer_cap; - if (!pos) + if (!aer) return false; /* Check if error is recorded */ if (e_info->severity == AER_CORRECTABLE) { - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &mask); + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); } else { - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &mask); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); } if (status & ~mask) return true; @@ -1022,7 +890,10 @@ static bool find_source_device(struct pci_dev *parent, if (result) return true; - pci_walk_bus(parent->subordinate, find_device_iter, e_info); + if (pci_pcie_type(parent) == PCI_EXP_TYPE_RC_EC) + pcie_walk_rcec(parent, find_device_iter, e_info); + else + pci_walk_bus(parent->subordinate, find_device_iter, e_info); if (!e_info->error_dev_num) { pci_info(parent, "can't find device of ID%04x\n", e_info->id); @@ -1040,24 +911,22 @@ static bool find_source_device(struct pci_dev *parent, */ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) { - int pos; + int aer = dev->aer_cap; if (info->severity == AER_CORRECTABLE) { /* * Correctable error does not need software intervention. * No need to go through error recovery process. */ - pos = dev->aer_cap; - if (pos) - pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, + if (aer) + pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS, info->status); - pci_aer_clear_device_status(dev); + if (pcie_aer_is_native(dev)) + pcie_clear_device_status(dev); } else if (info->severity == AER_NONFATAL) - pcie_do_recovery(dev, pci_channel_io_normal, - PCIE_PORT_SERVICE_AER); + pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset); else if (info->severity == AER_FATAL) - pcie_do_recovery(dev, pci_channel_io_frozen, - PCIE_PORT_SERVICE_AER); + pcie_do_recovery(dev, pci_channel_io_frozen, aer_root_reset); pci_dev_put(dev); } @@ -1094,10 +963,10 @@ static void aer_recover_work_func(struct work_struct *work) cper_print_aer(pdev, entry.severity, entry.regs); if (entry.severity == AER_NONFATAL) pcie_do_recovery(pdev, pci_channel_io_normal, - PCIE_PORT_SERVICE_AER); + aer_root_reset); else if (entry.severity == AER_FATAL) pcie_do_recovery(pdev, pci_channel_io_frozen, - PCIE_PORT_SERVICE_AER); + aer_root_reset); pci_dev_put(pdev); } } @@ -1142,51 +1011,52 @@ EXPORT_SYMBOL_GPL(aer_recover_queue); */ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) { - int pos, temp; + int aer = dev->aer_cap; + int temp; + int type = pci_pcie_type(dev); /* Must reset in this function */ info->status = 0; info->tlp_header_valid = 0; - pos = dev->aer_cap; - /* The device might not support AER */ - if (!pos) + if (!aer) return 0; if (info->severity == AER_CORRECTABLE) { - pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, + pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &info->status); - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &info->mask); if (!(info->status & ~info->mask)) return 0; - } else if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || - pci_pcie_type(dev) == 
PCI_EXP_TYPE_DOWNSTREAM || + } else if (type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_RC_EC || + type == PCI_EXP_TYPE_DOWNSTREAM || info->severity == AER_NONFATAL) { /* Link is still healthy for IO reads */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &info->status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &info->mask); if (!(info->status & ~info->mask)) return 0; /* Get First Error Pointer */ - pci_read_config_dword(dev, pos + PCI_ERR_CAP, &temp); + pci_read_config_dword(dev, aer + PCI_ERR_CAP, &temp); info->first_error = PCI_ERR_CAP_FEP(temp); if (info->status & AER_LOG_TLP_MASKS) { info->tlp_header_valid = 1; pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG, &info->tlp.dw0); + aer + PCI_ERR_HEADER_LOG, &info->tlp.dw0); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 4, &info->tlp.dw1); + aer + PCI_ERR_HEADER_LOG + 4, &info->tlp.dw1); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 8, &info->tlp.dw2); + aer + PCI_ERR_HEADER_LOG + 8, &info->tlp.dw2); pci_read_config_dword(dev, - pos + PCI_ERR_HEADER_LOG + 12, &info->tlp.dw3); + aer + PCI_ERR_HEADER_LOG + 12, &info->tlp.dw3); } } @@ -1292,15 +1162,15 @@ static irqreturn_t aer_irq(int irq, void *context) struct pcie_device *pdev = (struct pcie_device *)context; struct aer_rpc *rpc = get_service_data(pdev); struct pci_dev *rp = rpc->rpd; + int aer = rp->aer_cap; struct aer_err_source e_src = {}; - int pos = rp->aer_cap; - pci_read_config_dword(rp, pos + PCI_ERR_ROOT_STATUS, &e_src.status); + pci_read_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, &e_src.status); if (!(e_src.status & (PCI_ERR_ROOT_UNCOR_RCV|PCI_ERR_ROOT_COR_RCV))) return IRQ_NONE; - pci_read_config_dword(rp, pos + PCI_ERR_ROOT_ERR_SRC, &e_src.id); - pci_write_config_dword(rp, pos + PCI_ERR_ROOT_STATUS, e_src.status); + pci_read_config_dword(rp, aer + PCI_ERR_ROOT_ERR_SRC, &e_src.id); + pci_write_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, e_src.status); if (!kfifo_put(&rpc->aer_fifo, e_src)) return IRQ_HANDLED; @@ -1314,6 +1184,7 @@ static int set_device_error_reporting(struct pci_dev *dev, void *data) int type = pci_pcie_type(dev); if ((type == PCI_EXP_TYPE_ROOT_PORT) || + (type == PCI_EXP_TYPE_RC_EC) || (type == PCI_EXP_TYPE_UPSTREAM) || (type == PCI_EXP_TYPE_DOWNSTREAM)) { if (enable) @@ -1338,9 +1209,12 @@ static void set_downstream_devices_error_reporting(struct pci_dev *dev, { set_device_error_reporting(dev, &enable); - if (!dev->subordinate) - return; - pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable); + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC) + pcie_walk_rcec(dev, set_device_error_reporting, &enable); + else if (dev->subordinate) + pci_walk_bus(dev->subordinate, set_device_error_reporting, + &enable); + } /** @@ -1352,7 +1226,7 @@ static void set_downstream_devices_error_reporting(struct pci_dev *dev, static void aer_enable_rootport(struct aer_rpc *rpc) { struct pci_dev *pdev = rpc->rpd; - int aer_pos; + int aer = pdev->aer_cap; u16 reg16; u32 reg32; @@ -1364,14 +1238,13 @@ static void aer_enable_rootport(struct aer_rpc *rpc) pcie_capability_clear_word(pdev, PCI_EXP_RTCTL, SYSTEM_ERROR_INTR_ON_MESG_MASK); - aer_pos = pdev->aer_cap; /* Clear error status */ - pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, &reg32); - pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, reg32); - pci_read_config_dword(pdev, aer_pos + PCI_ERR_COR_STATUS, &reg32); -
pci_write_config_dword(pdev, aer_pos + PCI_ERR_COR_STATUS, reg32); - pci_read_config_dword(pdev, aer_pos + PCI_ERR_UNCOR_STATUS, &reg32); - pci_write_config_dword(pdev, aer_pos + PCI_ERR_UNCOR_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, &reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_COR_STATUS, &reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_COR_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, &reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, reg32); /* * Enable error reporting for the root port device and downstream port * @@ -1380,9 +1253,9 @@ static void aer_enable_rootport(struct aer_rpc *rpc) set_downstream_devices_error_reporting(pdev, true); /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_COMMAND, &reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, &reg32); reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); } /** @@ -1394,8 +1267,8 @@ static void aer_enable_rootport(struct aer_rpc *rpc) static void aer_disable_rootport(struct aer_rpc *rpc) { struct pci_dev *pdev = rpc->rpd; + int aer = pdev->aer_cap; u32 reg32; - int pos; /* * Disable error reporting for the root port device and downstream port * @@ -1403,15 +1276,14 @@ static void aer_disable_rootport(struct aer_rpc *rpc) */ set_downstream_devices_error_reporting(pdev, false); - pos = pdev->aer_cap; /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(pdev, pos + PCI_ERR_ROOT_COMMAND, &reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, &reg32); reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(pdev, pos + PCI_ERR_ROOT_COMMAND, reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); /* Clear Root's error status reg */ - pci_read_config_dword(pdev, pos + PCI_ERR_ROOT_STATUS, &reg32); - pci_write_config_dword(pdev, pos + PCI_ERR_ROOT_STATUS, reg32); + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, &reg32); + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, reg32); } /** @@ -1440,6 +1312,11 @@ static int aer_probe(struct pcie_device *dev) struct device *device = &dev->device; struct pci_dev *port = dev->port; + /* Limit to Root Ports or Root Complex Event Collectors */ + if ((pci_pcie_type(port) != PCI_EXP_TYPE_RC_EC) && + (pci_pcie_type(port) != PCI_EXP_TYPE_ROOT_PORT)) + return -ENODEV; + rpc = devm_kzalloc(device, sizeof(struct aer_rpc), GFP_KERNEL); if (!rpc) return -ENOMEM; @@ -1461,47 +1338,78 @@ static int aer_probe(struct pcie_device *dev) } /** - * aer_root_reset - reset link on Root Port - * @dev: pointer to Root Port's pci_dev data structure + * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP + * @dev: pointer to Root Port, RCEC, or RCiEP * - * Invoked by Port Bus driver when performing link reset at Root Port. + * Invoked by Port Bus driver when performing reset. */ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) { + int type = pci_pcie_type(dev); + struct pci_dev *root; + int aer; + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); u32 reg32; - int pos; int rc; - pos = dev->aer_cap; + /* + * Only Root Ports and RCECs have AER Root Command and Root Status + * registers. If "dev" is an RCiEP, the relevant registers are in + * the RCEC.
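+ *
+ * (Editorial note: an RCEC or RCiEP also has no Downstream Port above
+ * it whose link could be reset, which is why the code below falls back
+ * to FLR on the device itself instead of pci_bus_error_reset().)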
+ */ + if (type == PCI_EXP_TYPE_RC_END) + root = dev->rcec; + else + root = dev; + + /* + * If the platform retained control of AER, an RCiEP may not have + * an RCEC visible to us, so dev->rcec ("root") may be NULL. In + * that case, firmware is responsible for these registers. + */ + aer = root ? root->aer_cap : 0; - /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, &reg32); - reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32); + if ((host->native_aer || pcie_ports_native) && aer) { + /* Disable Root's interrupt in response to error messages */ + pci_read_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, &reg32); + reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; + pci_write_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, reg32); + } - rc = pci_bus_error_reset(dev); - pci_info(dev, "Root Port link has been reset\n"); + if (type == PCI_EXP_TYPE_RC_EC || type == PCI_EXP_TYPE_RC_END) { + if (pcie_has_flr(dev)) { + rc = pcie_flr(dev); + pci_info(dev, "has been reset (%d)\n", rc); + } else { + pci_info(dev, "not reset (no FLR support)\n"); + rc = -ENOTTY; + } + } else { + rc = pci_bus_error_reset(dev); + pci_info(dev, "Root Port link has been reset (%d)\n", rc); + } - /* Clear Root Error Status */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &reg32); - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, reg32); + if ((host->native_aer || pcie_ports_native) && aer) { + /* Clear Root Error Status */ + pci_read_config_dword(root, aer + PCI_ERR_ROOT_STATUS, &reg32); + pci_write_config_dword(root, aer + PCI_ERR_ROOT_STATUS, reg32); - /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, &reg32); - reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32); + /* Enable Root Port's interrupt in response to error messages */ + pci_read_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, &reg32); + reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; + pci_write_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, reg32); + } return rc ?
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } static struct pcie_port_service_driver aerdriver = { .name = "aer", - .port_type = PCI_EXP_TYPE_ROOT_PORT, + .port_type = PCIE_ANY_PORT, .service = PCIE_PORT_SERVICE_AER, .probe = aer_probe, .remove = aer_remove, - .reset_link = aer_root_reset, }; /** @@ -1511,7 +1419,7 @@ static struct pcie_port_service_driver aerdriver = { */ int __init pcie_aer_init(void) { - if (!pci_aer_available() || aer_acpi_firmware_first()) + if (!pci_aer_available()) return -ENXIO; return pcie_port_service_register(&aerdriver); } diff --git a/drivers/pci/pcie/aer_inject.c b/drivers/pci/pcie/aer_inject.c index 21cc3d3387f7919e2edfeac583e263bfe6be0715..f7e452728d1d9a9bc9b775b49b8d5e50e059d8f5 100644 --- a/drivers/pci/pcie/aer_inject.c +++ b/drivers/pci/pcie/aer_inject.c @@ -333,8 +333,11 @@ static int aer_inject(struct aer_error_inj *einj) if (!dev) return -ENODEV; rpdev = pcie_find_root_port(dev); + /* If Root Port not found, try to find an RCEC */ + if (!rpdev) + rpdev = dev->rcec; if (!rpdev) { - pci_err(dev, "Root port not found\n"); + pci_err(dev, "Neither Root Port nor RCEC found\n"); ret = -ENODEV; goto out_put; } diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 7624c71011c6eacb3bf824433e81e1ac0bf70f4d..5e47b98669dfbf0fc4873652cef8e3018da8faa0 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1192,7 +1192,7 @@ static ssize_t link_state_show(struct device *dev, struct pci_dev *pci_device = to_pci_dev(dev); struct pcie_link_state *link_state = pci_device->link_state; - return sprintf(buf, "%d\n", link_state->aspm_enabled); + return sysfs_emit(buf, "%d\n", link_state->aspm_enabled); } static ssize_t link_state_store(struct device *dev, @@ -1231,7 +1231,7 @@ static ssize_t clk_ctl_show(struct device *dev, struct pci_dev *pci_device = to_pci_dev(dev); struct pcie_link_state *link_state = pci_device->link_state; - return sprintf(buf, "%d\n", link_state->clkpm_enabled); + return sysfs_emit(buf, "%d\n", link_state->clkpm_enabled); } static ssize_t clk_ctl_store(struct device *dev, diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index e06f42f58d3d4d87e71d2310e04beaeb3e4a68f3..8ce228265218a5c148cec759bbb96724e3474722 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -17,13 +17,6 @@ #include "portdrv.h" #include "../pci.h" -struct dpc_dev { - struct pcie_device *dev; - u16 cap_pos; - bool rp_extensions; - u8 rp_log_size; -}; - static const char * const rp_pio_error_string[] = { "Configuration Request received UR Completion", /* Bit Position 0 */ "Configuration Request received CA Completion", /* Bit Position 1 */ @@ -46,63 +39,94 @@ static const char * const rp_pio_error_string[] = { "Memory Request Completion Timeout", /* Bit Position 18 */ }; -static struct dpc_dev *to_dpc_dev(struct pci_dev *dev) -{ - struct device *device; - - device = pcie_port_find_device(dev, PCIE_PORT_SERVICE_DPC); - if (!device) - return NULL; - return get_service_data(to_pcie_device(device)); -} - void pci_save_dpc_state(struct pci_dev *dev) { - struct dpc_dev *dpc; struct pci_cap_saved_state *save_state; u16 *cap; if (!pci_is_pcie(dev)) return; - dpc = to_dpc_dev(dev); - if (!dpc) - return; - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC); if (!save_state) return; cap = (u16 *)&save_state->cap.data[0]; - pci_read_config_word(dev, dpc->cap_pos + PCI_EXP_DPC_CTL, cap); + pci_read_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, cap); } void pci_restore_dpc_state(struct pci_dev *dev) { - struct dpc_dev 
*dpc; struct pci_cap_saved_state *save_state; u16 *cap; if (!pci_is_pcie(dev)) return; - dpc = to_dpc_dev(dev); - if (!dpc) - return; - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC); if (!save_state) return; cap = (u16 *)&save_state->cap.data[0]; - pci_write_config_word(dev, dpc->cap_pos + PCI_EXP_DPC_CTL, *cap); + pci_write_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, *cap); } -static int dpc_wait_rp_inactive(struct dpc_dev *dpc) +static DECLARE_WAIT_QUEUE_HEAD(dpc_completed_waitqueue); + +#ifdef CONFIG_HOTPLUG_PCI_PCIE +static bool dpc_completed(struct pci_dev *pdev) +{ + u16 status; + + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_STATUS, &status); + if ((status != 0xffff) && (status & PCI_EXP_DPC_STATUS_TRIGGER)) + return false; + + if (test_bit(PCI_DPC_RECOVERING, &pdev->priv_flags)) + return false; + + return true; +} + +/** + * pci_dpc_recovered - whether DPC triggered and has recovered successfully + * @pdev: PCI device + * + * Return true if DPC was triggered for @pdev and has recovered successfully. + * Wait for recovery if it hasn't completed yet. Called from the PCIe hotplug + * driver to recognize and ignore Link Down/Up events caused by DPC. + */ +bool pci_dpc_recovered(struct pci_dev *pdev) +{ + struct pci_host_bridge *host; + + if (!pdev->dpc_cap) + return false; + + /* + * Synchronization between hotplug and DPC is not supported + * if DPC is owned by firmware and EDR is not enabled. + */ + host = pci_find_host_bridge(pdev->bus); + if (!host->native_dpc && !IS_ENABLED(CONFIG_PCIE_EDR)) + return false; + + /* + * Need a timeout in case DPC never completes due to failure of + * dpc_wait_rp_inactive(). The spec doesn't mandate a time limit, + * but reports indicate that DPC completes within 4 seconds. + */ + wait_event_timeout(dpc_completed_waitqueue, dpc_completed(pdev), + msecs_to_jiffies(4000)); + + return test_and_clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); +} +#endif /* CONFIG_HOTPLUG_PCI_PCIE */ + +static int dpc_wait_rp_inactive(struct pci_dev *pdev) { unsigned long timeout = jiffies + HZ; - struct pci_dev *pdev = dpc->dev->port; - u16 cap = dpc->cap_pos, status; + u16 cap = pdev->dpc_cap, status; pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status); while (status & PCI_EXP_DPC_RP_BUSY && @@ -117,17 +141,18 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc) return 0; } -static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) +pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) { - struct dpc_dev *dpc; + pci_ers_result_t ret; u16 cap; + set_bit(PCI_DPC_RECOVERING, &pdev->priv_flags); + /* * DPC disables the Link automatically in hardware, so it has * already been reset by the time we get here. 
*/ - dpc = to_dpc_dev(pdev); - cap = dpc->cap_pos; + cap = pdev->dpc_cap; /* * Wait until the Link is inactive, then clear DPC Trigger Status @@ -135,22 +160,31 @@ static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) */ pcie_wait_for_link(pdev, false); - if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc)) - return PCI_ERS_RESULT_DISCONNECT; + if (pdev->dpc_rp_extensions && dpc_wait_rp_inactive(pdev)) { + clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_DISCONNECT; + goto out; + } pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS, PCI_EXP_DPC_STATUS_TRIGGER); - if (!pcie_wait_for_link(pdev, true)) - return PCI_ERS_RESULT_DISCONNECT; - - return PCI_ERS_RESULT_RECOVERED; + if (!pcie_wait_for_link(pdev, true)) { + clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_DISCONNECT; + } else { + set_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_RECOVERED; + } +out: + clear_bit(PCI_DPC_RECOVERING, &pdev->priv_flags); + wake_up_all(&dpc_completed_waitqueue); + return ret; } -static void dpc_process_rp_pio_error(struct dpc_dev *dpc) +static void dpc_process_rp_pio_error(struct pci_dev *pdev) { - struct pci_dev *pdev = dpc->dev->port; - u16 cap = dpc->cap_pos, dpc_status, first_error; + u16 cap = pdev->dpc_cap, dpc_status, first_error; u32 status, mask, sev, syserr, exc, dw0, dw1, dw2, dw3, log, prefix; int i; @@ -175,7 +209,7 @@ static void dpc_process_rp_pio_error(struct dpc_dev *dpc) first_error == i ? " (First)" : ""); } - if (dpc->rp_log_size < 4) + if (pdev->dpc_rp_log_size < 4) goto clear_status; pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG, &dw0); @@ -188,12 +222,12 @@ static void dpc_process_rp_pio_error(struct dpc_dev *dpc) pci_err(pdev, "TLP Header: %#010x %#010x %#010x %#010x\n", dw0, dw1, dw2, dw3); - if (dpc->rp_log_size < 5) + if (pdev->dpc_rp_log_size < 5) goto clear_status; pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG, &log); pci_err(pdev, "RP PIO ImpSpec Log %#010x\n", log); - for (i = 0; i < dpc->rp_log_size - 5; i++) { + for (i = 0; i < pdev->dpc_rp_log_size - 5; i++) { pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG, &prefix); pci_err(pdev, "TLP Prefix Header: dw%d, %#010x\n", i, prefix); @@ -224,12 +258,10 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev, return 1; } -static irqreturn_t dpc_handler(int irq, void *context) +void dpc_process_error(struct pci_dev *pdev) { + u16 cap = pdev->dpc_cap, status, source, reason, ext_reason; struct aer_err_info info; - struct dpc_dev *dpc = context; - struct pci_dev *pdev = dpc->dev->port; - u16 cap = dpc->cap_pos, status, source, reason, ext_reason; pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status); pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source); @@ -248,27 +280,33 @@ static irqreturn_t dpc_handler(int irq, void *context) "reserved error"); /* show RP PIO error detail information */ - if (dpc->rp_extensions && reason == 3 && ext_reason == 0) - dpc_process_rp_pio_error(dpc); + if (pdev->dpc_rp_extensions && reason == 3 && ext_reason == 0) + dpc_process_rp_pio_error(pdev); else if (reason == 0 && dpc_get_aer_uncorrect_severity(pdev, &info) && aer_get_device_error_info(pdev, &info)) { aer_print_error(pdev, &info); - pci_cleanup_aer_uncorrect_error_status(pdev); + pci_aer_clear_nonfatal_status(pdev); pci_aer_clear_fatal_status(pdev); } +} + +static irqreturn_t dpc_handler(int irq, void *context) +{ + struct pci_dev *pdev = context; + + dpc_process_error(pdev); /* We 
configure DPC so it only triggers on ERR_FATAL */ - pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC); + pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link); return IRQ_HANDLED; } static irqreturn_t dpc_irq(int irq, void *context) { - struct dpc_dev *dpc = (struct dpc_dev *)context; - struct pci_dev *pdev = dpc->dev->port; - u16 cap = dpc->cap_pos, status; + struct pci_dev *pdev = context; + u16 cap = pdev->dpc_cap, status; pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status); @@ -282,55 +320,57 @@ static irqreturn_t dpc_irq(int irq, void *context) return IRQ_HANDLED; } +void pci_dpc_init(struct pci_dev *pdev) +{ + u16 cap; + + pdev->dpc_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC); + if (!pdev->dpc_cap) + return; + + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap); + if (!(cap & PCI_EXP_DPC_CAP_RP_EXT)) + return; + + pdev->dpc_rp_extensions = true; + pdev->dpc_rp_log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; + if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) { + pci_err(pdev, "RP PIO log size %u is invalid\n", + pdev->dpc_rp_log_size); + pdev->dpc_rp_log_size = 0; + } +} + #define FLAG(x, y) (((x) & (y)) ? '+' : '-') static int dpc_probe(struct pcie_device *dev) { - struct dpc_dev *dpc; struct pci_dev *pdev = dev->port; struct device *device = &dev->device; int status; u16 ctl, cap; - if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native) + if (!pcie_aer_is_native(pdev) && !pcie_ports_dpc_native) return -ENOTSUPP; - dpc = devm_kzalloc(device, sizeof(*dpc), GFP_KERNEL); - if (!dpc) - return -ENOMEM; - - dpc->cap_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC); - dpc->dev = dev; - set_service_data(dev, dpc); - status = devm_request_threaded_irq(device, dev->irq, dpc_irq, dpc_handler, IRQF_SHARED, - "pcie-dpc", dpc); + "pcie-dpc", pdev); if (status) { pci_warn(pdev, "request IRQ%d failed: %d\n", dev->irq, status); return status; } - pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap); - pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl); - - dpc->rp_extensions = (cap & PCI_EXP_DPC_CAP_RP_EXT); - if (dpc->rp_extensions) { - dpc->rp_log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; - if (dpc->rp_log_size < 4 || dpc->rp_log_size > 9) { - pci_err(pdev, "RP PIO log size %u is invalid\n", - dpc->rp_log_size); - dpc->rp_log_size = 0; - } - } + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap); + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl); ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN; - pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl); + pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl); pci_info(pdev, "error containment capabilities: Int Msg #%d, RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n", cap & PCI_EXP_DPC_IRQ, FLAG(cap, PCI_EXP_DPC_CAP_RP_EXT), FLAG(cap, PCI_EXP_DPC_CAP_POISONED_TLP), - FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), dpc->rp_log_size, + FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), pdev->dpc_rp_log_size, FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE)); pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_DPC, sizeof(u16)); @@ -339,13 +379,12 @@ static int dpc_probe(struct pcie_device *dev) static void dpc_remove(struct pcie_device *dev) { - struct dpc_dev *dpc = get_service_data(dev); struct pci_dev *pdev = dev->port; u16 ctl; - pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl); + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, 
&ctl); ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN); - pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl); + pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl); } static struct pcie_port_service_driver dpcdriver = { @@ -354,7 +393,6 @@ static struct pcie_port_service_driver dpcdriver = { .service = PCIE_PORT_SERVICE_DPC, .probe = dpc_probe, .remove = dpc_remove, - .reset_link = dpc_reset_link, }; int __init pcie_dpc_init(void) diff --git a/drivers/pci/pcie/edr.c b/drivers/pci/pcie/edr.c new file mode 100644 index 0000000000000000000000000000000000000000..a6b9b479b97ad0c778820e366f814378d50e3177 --- /dev/null +++ b/drivers/pci/pcie/edr.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCI Error Disconnect Recover support + * Author: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> + * + * Copyright (C) 2020 Intel Corp. + */ + +#define dev_fmt(fmt) "EDR: " fmt + +#include <linux/pci.h> +#include <linux/pci-acpi.h> + +#include "portdrv.h" +#include "../pci.h" + +#define EDR_PORT_DPC_ENABLE_DSM 0x0C +#define EDR_PORT_LOCATE_DSM 0x0D +#define EDR_OST_SUCCESS 0x80 +#define EDR_OST_FAILED 0x81 + +/* + * _DSM wrapper function to enable/disable DPC + * @pdev : PCI device structure + * + * Returns 0 on success or errno on failure. + */ +static int acpi_enable_dpc(struct pci_dev *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + union acpi_object *obj, argv4, req; + int status = 0; + + /* + * Behavior when calling unsupported _DSM functions is undefined, + * so check whether EDR_PORT_DPC_ENABLE_DSM is supported. + */ + if (!acpi_check_dsm(adev->handle, &pci_acpi_dsm_guid, 5, + 1ULL << EDR_PORT_DPC_ENABLE_DSM)) + return 0; + + req.type = ACPI_TYPE_INTEGER; + req.integer.value = 1; + + argv4.type = ACPI_TYPE_PACKAGE; + argv4.package.count = 1; + argv4.package.elements = &req; + + /* + * Per Downstream Port Containment Related Enhancements ECN to PCI + * Firmware Specification r3.2, sec 4.6.12, EDR_PORT_DPC_ENABLE_DSM is + * optional. Return success if it's not implemented. + */ + obj = acpi_evaluate_dsm(adev->handle, &pci_acpi_dsm_guid, 5, + EDR_PORT_DPC_ENABLE_DSM, &argv4); + if (!obj) + return 0; + + if (obj->type != ACPI_TYPE_INTEGER) { + pci_err(pdev, FW_BUG "Enable DPC _DSM returned non integer\n"); + status = -EIO; + } + + if (obj->integer.value != 1) { + pci_err(pdev, "Enable DPC _DSM failed to enable DPC\n"); + status = -EIO; + } + + ACPI_FREE(obj); + + return status; +} + +/* + * _DSM wrapper function to locate DPC port + * @pdev : Device which received EDR event + * + * Returns pci_dev or NULL. Caller is responsible for dropping a reference + * on the returned pci_dev with pci_dev_put(). + */ +static struct pci_dev *acpi_dpc_port_get(struct pci_dev *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + union acpi_object *obj; + u16 port; + + /* + * Behavior when calling unsupported _DSM functions is undefined, + * so check whether EDR_PORT_LOCATE_DSM is supported.
+ */ + if (!acpi_check_dsm(adev->handle, &pci_acpi_dsm_guid, 5, + 1ULL << EDR_PORT_LOCATE_DSM)) + return pci_dev_get(pdev); + + obj = acpi_evaluate_dsm(adev->handle, &pci_acpi_dsm_guid, 5, + EDR_PORT_LOCATE_DSM, NULL); + if (!obj) + return pci_dev_get(pdev); + + if (obj->type != ACPI_TYPE_INTEGER) { + ACPI_FREE(obj); + pci_err(pdev, FW_BUG "Locate Port _DSM returned non integer\n"); + return NULL; + } + + /* + * Firmware returns DPC port BDF details in following format: + * 15:8 = bus + * 7:3 = device + * 2:0 = function + */ + port = obj->integer.value; + + ACPI_FREE(obj); + + return pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus), + PCI_BUS_NUM(port), port & 0xff); +} + +/* + * _OST wrapper function to let firmware know the status of EDR event + * @pdev : Device used to send _OST + * @edev : Device which experienced EDR event + * @status : Status of EDR event + */ +static int acpi_send_edr_status(struct pci_dev *pdev, struct pci_dev *edev, + u16 status) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + u32 ost_status; + + pci_dbg(pdev, "Status for %s: %#x\n", pci_name(edev), status); + + ost_status = PCI_DEVID(edev->bus->number, edev->devfn) << 16; + ost_status |= status; + + status = acpi_evaluate_ost(adev->handle, ACPI_NOTIFY_DISCONNECT_RECOVER, + ost_status, NULL); + if (ACPI_FAILURE(status)) + return -EINVAL; + + return 0; +} + +static void edr_handle_event(acpi_handle handle, u32 event, void *data) +{ + struct pci_dev *pdev = data, *edev; + pci_ers_result_t estate = PCI_ERS_RESULT_DISCONNECT; + u16 status; + + if (event != ACPI_NOTIFY_DISCONNECT_RECOVER) + return; + + pci_info(pdev, "EDR event received\n"); + + /* Locate the port which issued EDR event */ + edev = acpi_dpc_port_get(pdev); + if (!edev) { + pci_err(pdev, "Firmware failed to locate DPC port\n"); + return; + } + + pci_dbg(pdev, "Reported EDR dev: %s\n", pci_name(edev)); + + /* If port does not support DPC, just send the OST */ + if (!edev->dpc_cap) { + pci_err(edev, FW_BUG "This device doesn't support DPC\n"); + goto send_ost; + } + + /* Check if there is a valid DPC trigger */ + pci_read_config_word(edev, edev->dpc_cap + PCI_EXP_DPC_STATUS, &status); + if (!(status & PCI_EXP_DPC_STATUS_TRIGGER)) { + pci_err(edev, "Invalid DPC trigger %#010x\n", status); + goto send_ost; + } + + dpc_process_error(edev); + pci_aer_raw_clear_status(edev); + + /* + * Irrespective of whether the DPC event is triggered by ERR_FATAL + * or ERR_NONFATAL, since the link is already down, use the FATAL + * error recovery path for both cases. + */ + estate = pcie_do_recovery(edev, pci_channel_io_frozen, dpc_reset_link); + +send_ost: + + /* + * If recovery is successful, send _OST(0xF, BDF << 16 | 0x80) + * to firmware. If not successful, send _OST(0xF, BDF << 16 | 0x81). 
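As a worked example of the _OST payload layout that acpi_send_edr_status() builds (the BDF value here is illustrative): for a recovered DPC port at bus 0x3a, devfn 0x00, the firmware sees:

    u32 ost_status;

    /* PCI_DEVID(0x3a, 0x00) == (0x3a << 8) | 0x00 == 0x3a00 */
    ost_status = PCI_DEVID(0x3a, 0x00) << 16;   /* 0x3a000000 */
    ost_status |= EDR_OST_SUCCESS;              /* 0x3a000080 */
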
+ */ + if (estate == PCI_ERS_RESULT_RECOVERED) { + pci_dbg(edev, "DPC port successfully recovered\n"); + acpi_send_edr_status(pdev, edev, EDR_OST_SUCCESS); + } else { + pci_dbg(edev, "DPC port recovery failed\n"); + acpi_send_edr_status(pdev, edev, EDR_OST_FAILED); + } + + pci_dev_put(edev); +} + +void pci_acpi_add_edr_notifier(struct pci_dev *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + acpi_status status; + + if (!adev) { + pci_dbg(pdev, "No valid ACPI node, skipping EDR init\n"); + return; + } + + status = acpi_install_notify_handler(adev->handle, ACPI_SYSTEM_NOTIFY, + edr_handle_event, pdev); + if (ACPI_FAILURE(status)) { + pci_err(pdev, "Failed to install notify handler\n"); + return; + } + + if (acpi_enable_dpc(pdev)) + acpi_remove_notify_handler(adev->handle, ACPI_SYSTEM_NOTIFY, + edr_handle_event); + else + pci_dbg(pdev, "Notify handler installed\n"); +} + +void pci_acpi_remove_edr_notifier(struct pci_dev *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + + if (!adev) + return; + + acpi_remove_notify_handler(adev->handle, ACPI_SYSTEM_NOTIFY, + edr_handle_event); + pci_dbg(pdev, "Notify handler removed\n"); +} diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index c37c8daa83c6fb683a4acee81b6f7c80cf6aa23d..168674bfbec5a96e050641e45d5c454727e0490f 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -10,6 +10,8 @@ * Zhang Yanmin (yanmin.zhang@intel.com) */ +#define dev_fmt(fmt) "AER: " fmt + #include #include #include @@ -61,10 +63,12 @@ static int report_error_detected(struct pci_dev *dev, * error callbacks of "any" device in the subtree, and will * exit in the disconnected error state. */ - if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) + if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { vote = PCI_ERS_RESULT_NO_AER_DRIVER; - else + pci_info(dev, "can't recover (no error_detected callback)\n"); + } else { vote = PCI_ERS_RESULT_NONE; + } } else { err_handler = dev->driver->err_handler; vote = err_handler->error_detected(dev, state); @@ -143,78 +147,69 @@ static int report_resume(struct pci_dev *dev, void *data) } /** - * default_reset_link - default reset function - * @dev: pointer to pci_dev data structure + * pci_walk_bridge - walk bridges potentially AER affected + * @bridge: bridge which may be a Port, an RCEC, or an RCiEP + * @cb: callback to be called for each device found + * @userdata: arbitrary pointer to be passed to callback + * + * If the device provided is a bridge, walk the subordinate bus, including + * any bridged devices on buses under this bus. Call the provided callback + * on each device found. * - * Invoked when performing link reset on a Downstream Port or a - * Root Port with no aer driver. + * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP, + * call the callback on the device itself. */ -static pci_ers_result_t default_reset_link(struct pci_dev *dev) -{ - int rc; - - rc = pci_bus_error_reset(dev); - pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n"); - return rc ? 
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; -} - -static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service) +static void pci_walk_bridge(struct pci_dev *bridge, + int (*cb)(struct pci_dev *, void *), + void *userdata) { - pci_ers_result_t status; - struct pcie_port_service_driver *driver = NULL; - - driver = pcie_port_find_service(dev, service); - if (driver && driver->reset_link) { - status = driver->reset_link(dev); - } else if (pcie_downstream_port(dev)) { - status = default_reset_link(dev); - } else { - pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n", - pci_name(dev)); - return PCI_ERS_RESULT_DISCONNECT; - } - - if (status != PCI_ERS_RESULT_RECOVERED) { - pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n", - pci_name(dev)); - return PCI_ERS_RESULT_DISCONNECT; - } - - return status; + if (bridge->subordinate) + pci_walk_bus(bridge->subordinate, cb, userdata); + else + cb(bridge, userdata); } -void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, - u32 service) +pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, + pci_channel_state_t state, + pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)) { + int type = pci_pcie_type(dev); + struct pci_dev *bridge; pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; - struct pci_bus *bus; + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); /* - * Error recovery runs on all subordinates of the first downstream port. - * If the downstream port detected the error, it is cleared at the end. + * If the error was detected by a Root Port, Downstream Port, RCEC, + * or RCiEP, recovery runs on the device itself. For Ports, that + * also includes any subordinate devices. + * + * If it was detected by another device (Endpoint, etc), recovery + * runs on the device and anything else under the same Port, i.e., + * everything under "bridge". */ - if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || - pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)) - dev = dev->bus->self; - bus = dev->subordinate; - - pci_dbg(dev, "broadcast error_detected message\n"); + if (type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_DOWNSTREAM || + type == PCI_EXP_TYPE_RC_EC || + type == PCI_EXP_TYPE_RC_END) + bridge = dev; + else + bridge = pci_upstream_bridge(dev); + + pci_dbg(bridge, "broadcast error_detected message\n"); if (state == pci_channel_io_frozen) { - pci_walk_bus(bus, report_frozen_detected, &status); - status = reset_link(dev, service); - if (status != PCI_ERS_RESULT_RECOVERED) + pci_walk_bridge(bridge, report_frozen_detected, &status); + if (reset_subordinates(dev) != PCI_ERS_RESULT_RECOVERED) { + pci_warn(bridge, "subordinate device reset failed\n"); goto failed; + } } else { - pci_walk_bus(bus, report_normal_detected, &status); + pci_walk_bridge(bridge, report_normal_detected, &status); } - if (state == pci_channel_io_frozen && - reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED) - goto failed; if (status == PCI_ERS_RESULT_CAN_RECOVER) { status = PCI_ERS_RESULT_RECOVERED; - pci_dbg(dev, "broadcast mmio_enabled message\n"); - pci_walk_bus(bus, report_mmio_enabled, &status); + pci_dbg(bridge, "broadcast mmio_enabled message\n"); + pci_walk_bridge(bridge, report_mmio_enabled, &status); } if (status == PCI_ERS_RESULT_NEED_RESET) { @@ -224,24 +219,35 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, * drivers' slot_reset callbacks? 
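The error_detected/mmio_enabled/slot_reset/resume messages broadcast by pci_walk_bridge() are the standard struct pci_error_handlers callbacks. A minimal sketch of an endpoint driver that participates in this recovery walk (driver name and return-value choices are illustrative, not prescribed by this patch):

    static pci_ers_result_t demo_error_detected(struct pci_dev *pdev,
                                                pci_channel_state_t state)
    {
            /* Stop I/O; on a frozen channel MMIO/config access is unsafe */
            return (state == pci_channel_io_frozen) ?
                    PCI_ERS_RESULT_NEED_RESET : PCI_ERS_RESULT_CAN_RECOVER;
    }

    static pci_ers_result_t demo_slot_reset(struct pci_dev *pdev)
    {
            /* Re-initialize the device after the link/bus reset */
            return PCI_ERS_RESULT_RECOVERED;
    }

    static void demo_resume(struct pci_dev *pdev)
    {
            /* Recovery completed; restart I/O */
    }

    static const struct pci_error_handlers demo_err_handler = {
            .error_detected = demo_error_detected,
            .slot_reset     = demo_slot_reset,
            .resume         = demo_resume,
    };
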
*/ status = PCI_ERS_RESULT_RECOVERED; - pci_dbg(dev, "broadcast slot_reset message\n"); - pci_walk_bus(bus, report_slot_reset, &status); + pci_dbg(bridge, "broadcast slot_reset message\n"); + pci_walk_bridge(bridge, report_slot_reset, &status); } if (status != PCI_ERS_RESULT_RECOVERED) goto failed; - pci_dbg(dev, "broadcast resume message\n"); - pci_walk_bus(bus, report_resume, &status); + pci_dbg(bridge, "broadcast resume message\n"); + pci_walk_bridge(bridge, report_resume, &status); - pci_aer_clear_device_status(dev); - pci_cleanup_aer_uncorrect_error_status(dev); - pci_info(dev, "AER: Device recovery successful\n"); - return; + /* + * If we have native control of AER, clear error status in the Root + * Port or Downstream Port that signaled the error. If the + * platform retained control of AER, it is responsible for clearing + * this status. In that case, the signaling device may not even be + * visible to the OS. + */ + if (host->native_aer || pcie_ports_native) { + pcie_clear_device_status(bridge); + pci_aer_clear_nonfatal_status(bridge); + } + pci_info(bridge, "device recovery successful\n"); + return status; failed: - pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); + pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT); /* TODO: Should kernel panic here? */ - pci_info(dev, "AER: Device recovery failed\n"); + pci_info(bridge, "device recovery failed\n"); + + return status; } diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index f38e6c19dd501e135d4da31cc52f2e682fb985ff..2d25c047e1c4fee28ba18b70f126f74e7409e310 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -310,7 +310,10 @@ static int pcie_pme_can_wakeup(struct pci_dev *dev, void *ign) static void pcie_pme_mark_devices(struct pci_dev *port) { pcie_pme_can_wakeup(port, NULL); - if (port->subordinate) + + if (pci_pcie_type(port) == PCI_EXP_TYPE_RC_EC) + pcie_walk_rcec(port, pcie_pme_can_wakeup, NULL); + else if (port->subordinate) pci_walk_bus(port->subordinate, pcie_pme_can_wakeup, NULL); } @@ -320,10 +323,16 @@ static void pcie_pme_mark_devices(struct pci_dev *port) */ static int pcie_pme_probe(struct pcie_device *srv) { - struct pci_dev *port; + struct pci_dev *port = srv->port; struct pcie_pme_service_data *data; + int type = pci_pcie_type(port); int ret; + /* Limit to Root Ports or Root Complex Event Collectors */ + if (type != PCI_EXP_TYPE_RC_EC && + type != PCI_EXP_TYPE_ROOT_PORT) + return -ENODEV; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -333,7 +342,6 @@ static int pcie_pme_probe(struct pcie_device *srv) data->srv = srv; set_service_data(srv, data); - port = srv->port; pcie_pme_interrupt_enable(port, false); pcie_clear_root_pme_status(port); @@ -445,7 +453,7 @@ static void pcie_pme_remove(struct pcie_device *srv) static struct pcie_port_service_driver pcie_pme_driver = { .name = "pcie_pme", - .port_type = PCI_EXP_TYPE_ROOT_PORT, + .port_type = PCIE_ANY_PORT, .service = PCIE_PORT_SERVICE_PME, .probe = pcie_pme_probe, diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 1e673619b101d0bb576aa501c6b2964462c09afe..af7cf237432aca71c45bb4ce412e311872551817 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -29,8 +29,10 @@ extern bool pcie_ports_dpc_native; #ifdef CONFIG_PCIEAER int pcie_aer_init(void); +int pcie_aer_is_native(struct pci_dev *dev); #else static inline int pcie_aer_init(void) { return 0; } +static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } #endif #ifdef CONFIG_HOTPLUG_PCI_PCIE @@ -92,9 
+94,6 @@ struct pcie_port_service_driver { /* Device driver may resume normal operations */ void (*error_resume)(struct pci_dev *dev); - /* Link Reset Capability - AER service driver specific */ - pci_ers_result_t (*reset_link)(struct pci_dev *dev); - int port_type; /* Type of the port this driver can handle */ u32 service; /* Port service this device represents */ @@ -150,18 +149,5 @@ static inline bool pcie_pme_no_msi(void) { return false; } static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {} #endif /* !CONFIG_PCIE_PME */ -#ifdef CONFIG_ACPI_APEI -int pcie_aer_get_firmware_first(struct pci_dev *pci_dev); -#else -static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) -{ - if (pci_dev->__aer_firmware_first_valid) - return pci_dev->__aer_firmware_first; - return 0; -} -#endif - -struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev, - u32 service); struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); #endif /* _PORTDRV_H_ */ diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 5075cb9e850c56b6bbf9b2832af644aa484be652..e1fed6649c41f73509b4bba28b8034988f675433 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -233,12 +233,9 @@ static int get_port_device_capability(struct pci_dev *dev) } #endif - /* - * Root ports are capable of generating PME too. Root Complex - * Event Collectors can also generate PMEs, but we don't handle - * those yet. - */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT && + /* Root Ports and Root Complex Event Collectors may generate PMEs */ + if ((pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || + pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC) && (pcie_ports_native || host->native_pme)) { services |= PCIE_PORT_SERVICE_PME; @@ -458,27 +455,6 @@ static int find_service_iter(struct device *device, void *data) return 0; } -/** - * pcie_port_find_service - find the service driver - * @dev: PCI Express port the service is associated with - * @service: Service to find - * - * Find PCI Express port service driver associated with given service - */ -struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev, - u32 service) -{ - struct pcie_port_service_driver *drv; - struct portdrv_service_data pdrvs; - - pdrvs.drv = NULL; - pdrvs.service = service; - device_for_each_child(&dev->dev, &pdrvs, find_service_iter); - - drv = pdrvs.drv; - return drv; -} - /** * pcie_port_find_device - find the struct device * @dev: PCI Express port the service is associated with diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 160d67c593105a420740b5ffb818acc23f475898..d622024a41bccc202ca0f461a67e99d8d197698c 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -101,14 +101,19 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = { static int pcie_portdrv_probe(struct pci_dev *dev, const struct pci_device_id *id) { + int type = pci_pcie_type(dev); int status; if (!pci_is_pcie(dev) || - ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) && - (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) && - (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))) + ((type != PCI_EXP_TYPE_ROOT_PORT) && + (type != PCI_EXP_TYPE_UPSTREAM) && + (type != PCI_EXP_TYPE_DOWNSTREAM) && + (type != PCI_EXP_TYPE_RC_EC))) return -ENODEV; + if (type == PCI_EXP_TYPE_RC_EC) + pcie_link_rcec(dev); + status = pcie_port_device_register(dev); if (status) return status; @@ -195,6 +200,8 @@ static const struct 
pci_device_id port_pci_ids[] = { { PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x00), ~0) }, /* subtractive decode PCI-to-PCI bridge, class type is 060401h */ { PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x01), ~0) }, + /* handle any Root Complex Event Collector */ + { PCI_DEVICE_CLASS(((PCI_CLASS_SYSTEM_RCEC << 8) | 0x00), ~0) }, { }, }; diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c new file mode 100644 index 0000000000000000000000000000000000000000..d0bcd141ac9c657431f11ab54414df186ea6f388 --- /dev/null +++ b/drivers/pci/pcie/rcec.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Root Complex Event Collector Support + * + * Authors: + * Sean V Kelley <sean.v.kelley@intel.com> + * Qiuxu Zhuo <qiuxu.zhuo@intel.com> + * + * Copyright (C) 2020 Intel Corp. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/errno.h> + +#include "../pci.h" + +struct walk_rcec_data { + struct pci_dev *rcec; + int (*user_callback)(struct pci_dev *dev, void *data); + void *user_data; +}; + +static bool rcec_assoc_rciep(struct pci_dev *rcec, struct pci_dev *rciep) +{ + unsigned long bitmap = rcec->rcec_ea->bitmap; + unsigned int devn; + + /* An RCiEP found on a different bus in range */ + if (rcec->bus->number != rciep->bus->number) + return true; + + /* Same bus, so check bitmap */ + for_each_set_bit(devn, &bitmap, 32) + if (devn == PCI_SLOT(rciep->devfn)) + return true; + + return false; +} + +static int link_rcec_helper(struct pci_dev *dev, void *data) +{ + struct walk_rcec_data *rcec_data = data; + struct pci_dev *rcec = rcec_data->rcec; + + if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) && + rcec_assoc_rciep(rcec, dev)) { + dev->rcec = rcec; + pci_dbg(dev, "PME & error events signaled via %s\n", + pci_name(rcec)); + } + + return 0; +} + +static int walk_rcec_helper(struct pci_dev *dev, void *data) +{ + struct walk_rcec_data *rcec_data = data; + struct pci_dev *rcec = rcec_data->rcec; + + if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) && + rcec_assoc_rciep(rcec, dev)) + rcec_data->user_callback(dev, rcec_data->user_data); + + return 0; +} + +static void walk_rcec(int (*cb)(struct pci_dev *dev, void *data), + void *userdata) +{ + struct walk_rcec_data *rcec_data = userdata; + struct pci_dev *rcec = rcec_data->rcec; + u8 nextbusn, lastbusn; + struct pci_bus *bus; + unsigned int bnr; + + if (!rcec->rcec_ea) + return; + + /* Walk own bus for bitmap based association */ + pci_walk_bus(rcec->bus, cb, rcec_data); + + nextbusn = rcec->rcec_ea->nextbusn; + lastbusn = rcec->rcec_ea->lastbusn; + + /* All RCiEP devices are on the same bus as the RCEC */ + if (nextbusn == 0xff && lastbusn == 0x00) + return; + + for (bnr = nextbusn; bnr <= lastbusn; bnr++) { + /* No association indicated (PCIe 5.0-1, 7.9.10.3) */ + if (bnr == rcec->bus->number) + continue; + + bus = pci_find_bus(pci_domain_nr(rcec->bus), bnr); + if (!bus) + continue; + + /* Find RCiEP devices on the given bus ranges */ + pci_walk_bus(bus, cb, rcec_data); + } +} + +/** + * pcie_link_rcec - Link RCiEP devices associated with RCEC. + * @rcec: RCEC whose RCiEP devices should be linked. + * + * Link the given RCEC to each RCiEP device found. + */ +void pcie_link_rcec(struct pci_dev *rcec) +{ + struct walk_rcec_data rcec_data; + + if (!rcec->rcec_ea) + return; + + rcec_data.rcec = rcec; + rcec_data.user_callback = NULL; + rcec_data.user_data = NULL; + + walk_rcec(link_rcec_helper, &rcec_data); +} + +/** + * pcie_walk_rcec - Walk RCiEP devices associated with the RCEC and call the callback.
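A quick worked example of the same-bus association rule in rcec_assoc_rciep() above (values are illustrative): an RCiEP at devfn 0x19 is device 3, so it is associated only when bit 3 of the RCEC's Endpoint Association bitmap is set:

    unsigned long bitmap = 0x00000008;      /* only device 3 is associated */
    u8 devfn = 0x19;                        /* device 3, function 1 */

    /* PCI_SLOT(0x19) == 3; bit 3 is set, so the RCiEP is associated */
    bool assoc = test_bit(PCI_SLOT(devfn), &bitmap);
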
+ * @rcec: RCEC whose RCiEP devices should be walked + * @cb: Callback to be called for each RCiEP device found + * @userdata: Arbitrary pointer to be passed to callback + * + * Walk the given RCEC. Call the callback on each RCiEP found. + * + * If @cb returns anything other than 0, break out. + */ +void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), + void *userdata) +{ + struct walk_rcec_data rcec_data; + + if (!rcec->rcec_ea) + return; + + rcec_data.rcec = rcec; + rcec_data.user_callback = cb; + rcec_data.user_data = userdata; + + walk_rcec(walk_rcec_helper, &rcec_data); +} + +void pci_rcec_init(struct pci_dev *dev) +{ + struct rcec_ea *rcec_ea; + u32 rcec, hdr, busn; + u8 ver; + + /* Only for Root Complex Event Collectors */ + if (pci_pcie_type(dev) != PCI_EXP_TYPE_RC_EC) + return; + + rcec = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_RCEC); + if (!rcec) + return; + + rcec_ea = kzalloc(sizeof(*rcec_ea), GFP_KERNEL); + if (!rcec_ea) + return; + + pci_read_config_dword(dev, rcec + PCI_RCEC_RCIEP_BITMAP, + &rcec_ea->bitmap); + + /* Check whether RCEC BUSN register is present */ + pci_read_config_dword(dev, rcec, &hdr); + ver = PCI_EXT_CAP_VER(hdr); + if (ver >= PCI_RCEC_BUSN_REG_VER) { + pci_read_config_dword(dev, rcec + PCI_RCEC_BUSN, &busn); + rcec_ea->nextbusn = PCI_RCEC_BUSN_NEXT(busn); + rcec_ea->lastbusn = PCI_RCEC_BUSN_LAST(busn); + } else { + /* Avoid later ver check by setting nextbusn */ + rcec_ea->nextbusn = 0xff; + rcec_ea->lastbusn = 0x00; + } + + dev->rcec_ea = rcec_ea; +} + +void pci_rcec_exit(struct pci_dev *dev) +{ + kfree(dev->rcec_ea); + dev->rcec_ea = NULL; +} diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 89c261c159f14ed5383ee089bca25845209ec270..f6ab3847c051368eeb024d2b8f78c3bf00deec30 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -568,7 +568,7 @@ static struct pci_bus *pci_alloc_bus(struct pci_bus *parent) return b; } -static void devm_pci_release_host_bridge_dev(struct device *dev) +static void pci_release_host_bridge_dev(struct device *dev) { struct pci_host_bridge *bridge = to_pci_host_bridge(dev); @@ -577,12 +577,7 @@ static void devm_pci_release_host_bridge_dev(struct device *dev) pci_free_resource_list(&bridge->windows); pci_free_resource_list(&bridge->dma_ranges); -} - -static void pci_release_host_bridge_dev(struct device *dev) -{ - devm_pci_release_host_bridge_dev(dev); - kfree(to_pci_host_bridge(dev)); + kfree(bridge); } static void pci_init_host_bridge(struct pci_host_bridge *bridge) @@ -601,6 +596,9 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_shpc_hotplug = 1; bridge->native_pme = 1; bridge->native_ltr = 1; + bridge->native_dpc = 1; + + device_initialize(&bridge->dev); } struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) @@ -618,17 +616,25 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) } EXPORT_SYMBOL(pci_alloc_host_bridge); +static void devm_pci_alloc_host_bridge_release(void *data) +{ + pci_free_host_bridge(data); +} + struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev, size_t priv) { + int ret; struct pci_host_bridge *bridge; - bridge = devm_kzalloc(dev, sizeof(*bridge) + priv, GFP_KERNEL); + bridge = pci_alloc_host_bridge(priv); if (!bridge) return NULL; - pci_init_host_bridge(bridge); - bridge->dev.release = devm_pci_release_host_bridge_dev; + ret = devm_add_action_or_reset(dev, devm_pci_alloc_host_bridge_release, + bridge); + if (ret) + return NULL; return bridge; } @@ -636,10 +642,7 @@ 
EXPORT_SYMBOL(devm_pci_alloc_host_bridge); void pci_free_host_bridge(struct pci_host_bridge *bridge) { - pci_free_resource_list(&bridge->windows); - pci_free_resource_list(&bridge->dma_ranges); - - kfree(bridge); + put_device(&bridge->dev); } EXPORT_SYMBOL(pci_free_host_bridge); @@ -870,7 +873,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) if (err) goto free; - err = device_register(&bridge->dev); + err = device_add(&bridge->dev); if (err) { put_device(&bridge->dev); goto free; @@ -937,7 +940,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) unregister: put_device(&bridge->dev); - device_unregister(&bridge->dev); + device_del(&bridge->dev); free: kfree(bus); @@ -1509,7 +1512,7 @@ static void set_pcie_untrusted(struct pci_dev *dev) * untrusted as well. */ parent = pci_upstream_bridge(dev); - if (parent && parent->untrusted) + if (parent && (parent->untrusted || parent->external_facing)) dev->untrusted = true; } @@ -1655,22 +1658,6 @@ static u8 pci_hdr_type(struct pci_dev *dev) #define LEGACY_IO_RESOURCE (IORESOURCE_IO | IORESOURCE_PCI_FIXED) -static void pci_msi_setup_pci_dev(struct pci_dev *dev) -{ - /* - * Disable the MSI hardware to avoid screaming interrupts - * during boot. This is the power on reset default so - * usually this should be a noop. - */ - dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (dev->msi_cap) - pci_msi_set_enable(dev, 0); - - dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (dev->msix_cap) - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); -} - /** * pci_intx_mask_broken - Test PCI_COMMAND_INTX_DISABLE writability * @dev: PCI device @@ -2152,6 +2139,7 @@ static void pci_configure_device(struct pci_dev *dev) static void pci_release_capabilities(struct pci_dev *dev) { pci_aer_exit(dev); + pci_rcec_exit(dev); pci_vpd_release(dev); pci_iov_release(dev); pci_free_cap_save_buffers(dev); @@ -2188,6 +2176,9 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus) return NULL; INIT_LIST_HEAD(&dev->bus_list); +#ifdef CONFIG_PCI_MSI + mutex_init(&dev->msix_mutex); +#endif dev->dev.type = &pci_dev_type; dev->bus = pci_bus_get(bus); @@ -2347,38 +2338,25 @@ void pcie_report_downtraining(struct pci_dev *dev) static void pci_init_capabilities(struct pci_dev *dev) { - /* Enhanced Allocation */ - pci_ea_init(dev); - - /* Setup MSI caps & disable MSI/MSI-X interrupts */ - pci_msi_setup_pci_dev(dev); + pci_ea_init(dev); /* Enhanced Allocation */ + pci_msi_init(dev); /* Disable MSI */ + pci_msix_init(dev); /* Disable MSI-X */ /* Buffers for saving PCIe and PCI-X capabilities */ pci_allocate_cap_save_buffers(dev); - /* Power Management */ - pci_pm_init(dev); - - /* Vital Product Data */ - pci_vpd_init(dev); - - /* Alternative Routing-ID Forwarding */ - pci_configure_ari(dev); - - /* Single Root I/O Virtualization */ - pci_iov_init(dev); - - /* Address Translation Services */ - pci_ats_init(dev); - - /* Enable ACS P2P upstream forwarding */ - pci_enable_acs(dev); - - /* Precision Time Measurement */ - pci_ptm_init(dev); - - /* Advanced Error Reporting */ - pci_aer_init(dev); + pci_pm_init(dev); /* Power Management */ + pci_vpd_init(dev); /* Vital Product Data */ + pci_configure_ari(dev); /* Alternative Routing-ID Forwarding */ + pci_iov_init(dev); /* Single Root I/O Virtualization */ + pci_ats_init(dev); /* Address Translation Services */ + pci_pri_init(dev); /* Page Request Interface */ + pci_pasid_init(dev); /* Process Address Space ID */ + pci_enable_acs(dev); /* Access Control Services */ + 
pci_ptm_init(dev); /* Precision Time Measurement */ + pci_aer_init(dev); /* Advanced Error Reporting */ + pci_dpc_init(dev); /* Downstream Port Containment */ + pci_rcec_init(dev); /* Root Complex Event Collector */ pcie_report_downtraining(dev); @@ -2450,13 +2428,10 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) /* Fix up broken headers */ pci_fixup_device(pci_fixup_header, dev); - /* Moved out from quirk header fixup code */ pci_reassigndev_resource_alignment(dev); - /* Clear the state_saved flag */ dev->state_saved = false; - /* Initialize various capabilities */ pci_init_capabilities(dev); /* @@ -2963,7 +2938,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, return bridge->bus; err_out: - kfree(bridge); + put_device(&bridge->dev); return NULL; } EXPORT_SYMBOL_GPL(pci_create_root_bus); diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 5495537c60c2441a2b9922526833f4d81e393753..6ef74bf5013f16ef02cc83305a92f7329cc6352f 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -258,13 +258,13 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) } /* Make sure the caller is mapping a real resource for this device */ - for (i = 0; i < PCI_ROM_RESOURCE; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (dev->resource[i].flags & res_bit && pci_mmap_fits(dev, i, vma, PCI_MMAP_PROCFS)) break; } - if (i >= PCI_ROM_RESOURCE) + if (i >= PCI_STD_NUM_BARS) return -ENODEV; if (fpriv->mmap_state == pci_mmap_mem && diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 5632bab94c246c18b3ea33797314b99f336f009a..60274a61bb486c4580eb104719fda2c3e85244e7 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -474,7 +474,7 @@ static void quirk_extend_bar_to_page(struct pci_dev *dev) { int i; - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct resource *r = &dev->resource[i]; if (r->flags & IORESOURCE_MEM && resource_size(r) < PAGE_SIZE) { @@ -1809,7 +1809,7 @@ static void quirk_alder_ioapic(struct pci_dev *pdev) * The next five BARs all seem to be rubbish, so just clean * them out. */ - for (i = 1; i < 6; i++) + for (i = 1; i < PCI_STD_NUM_BARS; i++) memset(&pdev->resource[i], 0, sizeof(pdev->resource[i])); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_EESSC, quirk_alder_ioapic); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index e9c6b120cf451331dc294f50a3ac1315cd37c2c3..95dec03d9f2a990db01998de7e06f6257e26517b 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -160,6 +160,6 @@ void pci_remove_root_bus(struct pci_bus *bus) host_bridge->bus = NULL; /* remove the host bridge */ - device_unregister(&host_bridge->dev); + device_del(&host_bridge->dev); } EXPORT_SYMBOL_GPL(pci_remove_root_bus); diff --git a/drivers/pci/search.c b/drivers/pci/search.c index e4dbdef5aef05018b585375e57a0e2db0bd431d7..2061672954ee3cc9dac6a588f8056f775de3997d 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -32,6 +32,12 @@ int pci_for_each_dma_alias(struct pci_dev *pdev, struct pci_bus *bus; int ret; + /* + * The device may have an explicit alias requester ID for DMA where the + * requester is on another PCI bus. 
+ */ + pdev = pci_real_dma_dev(pdev); + ret = fn(pdev, pci_dev_id(pdev), data); if (ret) return ret; diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 1e3ed6ec0a4af068808d432e29a962e3384dcdbd..794504e854ce6bd4ef49b68a2b9b8ae436f29bd2 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -39,14 +39,14 @@ static const struct sysfs_ops pci_slot_sysfs_ops = { static ssize_t address_read_file(struct pci_slot *slot, char *buf) { if (slot->number == 0xff) - return sprintf(buf, "%04x:%02x\n", - pci_domain_nr(slot->bus), - slot->bus->number); - else - return sprintf(buf, "%04x:%02x:%02x\n", - pci_domain_nr(slot->bus), - slot->bus->number, - slot->number); + return sysfs_emit(buf, "%04x:%02x\n", + pci_domain_nr(slot->bus), + slot->bus->number); + + return sysfs_emit(buf, "%04x:%02x:%02x\n", + pci_domain_nr(slot->bus), + slot->bus->number, + slot->number); } /* these strings match up with the values in pci_bus_speed */ @@ -87,7 +87,7 @@ static ssize_t bus_speed_read(enum pci_bus_speed speed, char *buf) else speed_string = "Unknown"; - return sprintf(buf, "%s\n", speed_string); + return sysfs_emit(buf, "%s\n", speed_string); } static ssize_t max_speed_read_file(struct pci_slot *slot, char *buf) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 2c9c3061894b213829a7274a2aa9dde5fe01547f..39768a4dde23050007184e5605e02a0b54e04cf9 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -278,7 +278,7 @@ static ssize_t device_version_show(struct device *dev, ver = ioread32(&stdev->mmio_sys_info->device_version); - return sprintf(buf, "%x\n", ver); + return sysfs_emit(buf, "%x\n", ver); } static DEVICE_ATTR_RO(device_version); @@ -290,7 +290,7 @@ static ssize_t fw_version_show(struct device *dev, ver = ioread32(&stdev->mmio_sys_info->firmware_version); - return sprintf(buf, "%08x\n", ver); + return sysfs_emit(buf, "%08x\n", ver); } static DEVICE_ATTR_RO(fw_version); @@ -334,7 +334,7 @@ static ssize_t component_id_show(struct device *dev, struct switchtec_dev *stdev = to_stdev(dev); int id = ioread16(&stdev->mmio_sys_info->component_id); - return sprintf(buf, "PM%04X\n", id); + return sysfs_emit(buf, "PM%04X\n", id); } static DEVICE_ATTR_RO(component_id); @@ -344,7 +344,7 @@ static ssize_t component_revision_show(struct device *dev, struct switchtec_dev *stdev = to_stdev(dev); int rev = ioread8(&stdev->mmio_sys_info->component_revision); - return sprintf(buf, "%d\n", rev); + return sysfs_emit(buf, "%d\n", rev); } static DEVICE_ATTR_RO(component_revision); @@ -353,7 +353,7 @@ static ssize_t partition_show(struct device *dev, { struct switchtec_dev *stdev = to_stdev(dev); - return sprintf(buf, "%d\n", stdev->partition); + return sysfs_emit(buf, "%d\n", stdev->partition); } static DEVICE_ATTR_RO(partition); @@ -362,7 +362,7 @@ static ssize_t partition_count_show(struct device *dev, { struct switchtec_dev *stdev = to_stdev(dev); - return sprintf(buf, "%d\n", stdev->partition_count); + return sysfs_emit(buf, "%d\n", stdev->partition_count); } static DEVICE_ATTR_RO(partition_count); diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 000d5693fae74fd6208789aca743f1badcbd961e..55aba46d24d95cb343745baa846eb64bf61367d2 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1210,6 +1210,42 @@ config SURFACE_3_BUTTON ---help--- This driver handles the power/home/volume buttons on the Microsoft Surface 3 tablet. 
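On the sprintf() → sysfs_emit() conversions in slot.c and switchtec.c above: sysfs show() callbacks receive a PAGE_SIZE buffer, and sysfs_emit() enforces that bound (and the page alignment of the buffer) instead of trusting the format string. The pattern, as a minimal sketch with an illustrative attribute name:

    static ssize_t demo_show(struct device *dev, struct device_attribute *attr,
                             char *buf)
    {
            /* sysfs_emit() never writes past PAGE_SIZE, unlike bare sprintf() */
            return sysfs_emit(buf, "%d\n", 42);
    }
    static DEVICE_ATTR_RO(demo);
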
+config INTEL_PMT_CLASS + tristate + help + The Intel Platform Monitoring Technology (PMT) class driver provides + the basic sysfs interface and file hierarchy used by PMT devices. + + For more information, see: + <file:Documentation/ABI/testing/sysfs-class-intel_pmt> + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_class. + +config INTEL_PMT_TELEMETRY + tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver" + depends on MFD_INTEL_PMT + select INTEL_PMT_CLASS + help + The Intel Platform Monitoring Technology (PMT) Telemetry driver provides + access to hardware telemetry metrics on devices that support the + feature. + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_telemetry. + +config INTEL_PMT_CRASHLOG + tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver" + depends on MFD_INTEL_PMT + select INTEL_PMT_CLASS + help + The Intel Platform Monitoring Technology (PMT) crashlog driver provides + access to hardware crashlog capabilities on devices that support the + feature. + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_crashlog. + config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" ---help--- diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 4151040330601052eb3f69054996261a998dc954..da88c2bb3fff0c5179baef2b654bc105945d2e6b 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -91,6 +91,9 @@ obj-$(CONFIG_INTEL_TELEMETRY) += intel_telemetry_core.o \ intel_telemetry_pltdrv.o \ intel_telemetry_debugfs.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o +obj-$(CONFIG_INTEL_PMT_CLASS) += intel_pmt_class.o +obj-$(CONFIG_INTEL_PMT_TELEMETRY) += intel_pmt_telemetry.o +obj-$(CONFIG_INTEL_PMT_CRASHLOG) += intel_pmt_crashlog.o obj-$(CONFIG_PMC_ATOM) += pmc_atom.o obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o obj-$(CONFIG_INTEL_TURBO_MAX_3) += intel_turbo_max_3.o diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c new file mode 100644 index 0000000000000000000000000000000000000000..c86ff15b1ed522be06f2c670a2f62c3bef70aa7d --- /dev/null +++ b/drivers/platform/x86/intel_pmt_class.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology (PMT) class driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "Alexander Duyck" <alexander.h.duyck@linux.intel.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/pci.h> + +#include "intel_pmt_class.h" + +#define PMT_XA_START 0 +#define PMT_XA_MAX INT_MAX +#define PMT_XA_LIMIT XA_LIMIT(PMT_XA_START, PMT_XA_MAX) + +/* + * Early implementations of PMT on client platforms have some + * differences from the server platforms (which use the Out Of Band + * Management Services Module, OOBMSM). This list tracks those + * platforms as needed to handle those differences. Newer client + * platforms are expected to be fully compatible with server.
+ */ +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */ + { PCI_VDEVICE(INTEL, 0x490e) }, /* DG1 */ + { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */ + { } +}; + +bool intel_pmt_is_early_client_hw(struct device *dev) +{ + struct pci_dev *parent = to_pci_dev(dev->parent); + + return !!pci_match_id(pmt_telem_early_client_pci_ids, parent); +} +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw); + +/* + * sysfs + */ +static ssize_t +intel_pmt_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, + size_t count) +{ + struct intel_pmt_entry *entry = container_of(attr, + struct intel_pmt_entry, + pmt_bin_attr); + + if (off < 0) + return -EINVAL; + + if (off >= entry->size) + return 0; + + if (count > entry->size - off) + count = entry->size - off; + + memcpy_fromio(buf, entry->base + off, count); + + return count; +} + +static int +intel_pmt_mmap(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) +{ + struct intel_pmt_entry *entry = container_of(attr, + struct intel_pmt_entry, + pmt_bin_attr); + unsigned long vsize = vma->vm_end - vma->vm_start; + struct device *dev = kobj_to_dev(kobj); + unsigned long phys = entry->base_addr; + unsigned long pfn = PFN_DOWN(phys); + unsigned long psize; + + if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) + return -EROFS; + + psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE; + if (vsize > psize) { + dev_err(dev, "Requested mmap size is too large\n"); + return -EINVAL; + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + vsize, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static ssize_t +guid_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "0x%x\n", entry->guid); +} +static DEVICE_ATTR_RO(guid); + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "%zu\n", entry->size); +} +static DEVICE_ATTR_RO(size); + +static ssize_t +offset_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "%lu\n", offset_in_page(entry->base_addr)); +} +static DEVICE_ATTR_RO(offset); + +static struct attribute *intel_pmt_attrs[] = { + &dev_attr_guid.attr, + &dev_attr_size.attr, + &dev_attr_offset.attr, + NULL +}; +ATTRIBUTE_GROUPS(intel_pmt); + +static struct class intel_pmt_class = { + .name = "intel_pmt", + .owner = THIS_MODULE, + .dev_groups = intel_pmt_groups, +}; + +static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev, + struct resource *disc_res) +{ + struct pci_dev *pci_dev = to_pci_dev(dev->parent); + u8 bir; + + /* + * The base offset should always be 8 byte aligned. + * + * For non-local access types the lower 3 bits of base offset + * contains the index of the base address register where the + * telemetry can be found. 
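To make the base_offset decode described above (and implemented just below in intel_pmt_populate_entry()) concrete, here is a worked example with an illustrative register value: GET_BIR() keeps bits 2:0 and GET_ADDRESS() masks them off, so 0x00020002 selects BAR 2 at offset 0x00020000:

    u32 base_offset = 0x00020002;           /* illustrative header value */

    u8  bir  = GET_BIR(base_offset);        /* 0x2: BAR index 2 */
    u32 addr = GET_ADDRESS(base_offset);    /* 0x00020000: offset in BAR 2 */
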
+ */ + bir = GET_BIR(header->base_offset); + + /* Local access and BARID only for now */ + switch (header->access_type) { + case ACCESS_LOCAL: + if (bir) { + dev_err(dev, + "Unsupported BAR index %d for access type %d\n", + bir, header->access_type); + return -EINVAL; + } + /* + * For access_type LOCAL, the base address is as follows: + * base address = end of discovery region + base offset + */ + entry->base_addr = disc_res->end + 1 + header->base_offset; + + /* + * Some hardware uses a different calculation for the base address + * when access_type == ACCESS_LOCAL. On these systems, + * ACCESS_LOCAL refers to an address in the same BAR as the + * header but at a fixed offset. Since the header address was + * supplied to the driver, we don't know which BAR it was in, + * so search for the BAR whose range includes the header address. + */ + if (intel_pmt_is_early_client_hw(dev)) { + int i; + + entry->base_addr = 0; + for (i = 0; i < 6; i++) + if (disc_res->start >= pci_resource_start(pci_dev, i) && + (disc_res->start <= pci_resource_end(pci_dev, i))) { + entry->base_addr = pci_resource_start(pci_dev, i) + + header->base_offset; + break; + } + if (!entry->base_addr) + return -EINVAL; + } + + break; + case ACCESS_BARID: + /* + * If another BAR was specified, then the base offset + * represents the offset within that BAR. So retrieve the + * address from the parent PCI device and add the offset. + */ + entry->base_addr = pci_resource_start(pci_dev, bir) + + GET_ADDRESS(header->base_offset); + break; + default: + dev_err(dev, "Unsupported access type %d\n", + header->access_type); + return -EINVAL; + } + + entry->guid = header->guid; + entry->size = header->size; + + return 0; +} + +static int intel_pmt_dev_register(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct device *parent) +{ + struct resource res = {0}; + struct device *dev; + int ret; + + ret = xa_alloc(ns->xa, &entry->devid, entry, PMT_XA_LIMIT, GFP_KERNEL); + if (ret) + return ret; + + dev = device_create(&intel_pmt_class, parent, MKDEV(0, 0), entry, + "%s%d", ns->name, entry->devid); + + if (IS_ERR(dev)) { + dev_err(parent, "Could not create %s%d device node\n", + ns->name, entry->devid); + ret = PTR_ERR(dev); + goto fail_dev_create; + } + + entry->kobj = &dev->kobj; + + if (ns->attr_grp) { + ret = sysfs_create_group(entry->kobj, ns->attr_grp); + if (ret) + goto fail_sysfs; + } + + /* if size is 0 assume no data buffer, so no file needed */ + if (!entry->size) + return 0; + + res.start = entry->base_addr; + res.end = res.start + entry->size - 1; + res.flags = IORESOURCE_MEM; + + entry->base = devm_ioremap_resource(dev, &res); + if (IS_ERR(entry->base)) { + ret = PTR_ERR(entry->base); + goto fail_ioremap; + } + + sysfs_bin_attr_init(&entry->pmt_bin_attr); + entry->pmt_bin_attr.attr.name = ns->name; + entry->pmt_bin_attr.attr.mode = 0440; + entry->pmt_bin_attr.mmap = intel_pmt_mmap; + entry->pmt_bin_attr.read = intel_pmt_read; + entry->pmt_bin_attr.size = entry->size; + + ret = sysfs_create_bin_file(&dev->kobj, &entry->pmt_bin_attr); + if (!ret) + return 0; + +fail_ioremap: + if (ns->attr_grp) + sysfs_remove_group(entry->kobj, ns->attr_grp); +fail_sysfs: + device_unregister(dev); +fail_dev_create: + xa_erase(ns->xa, entry->devid); + + return ret; +} + +int intel_pmt_dev_create(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct platform_device *pdev, int idx) +{ + struct intel_pmt_header header; + struct resource *disc_res; + int ret = -ENODEV; + + disc_res = platform_get_resource(pdev,
IORESOURCE_MEM, idx); + if (!disc_res) + return ret; + + entry->disc_table = devm_platform_ioremap_resource(pdev, idx); + if (IS_ERR(entry->disc_table)) + return PTR_ERR(entry->disc_table); + + ret = ns->pmt_header_decode(entry, &header, &pdev->dev); + if (ret) + return ret; + + ret = intel_pmt_populate_entry(entry, &header, &pdev->dev, disc_res); + if (ret) + return ret; + + return intel_pmt_dev_register(entry, ns, &pdev->dev); + +} +EXPORT_SYMBOL_GPL(intel_pmt_dev_create); + +void intel_pmt_dev_destroy(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns) +{ + struct device *dev = kobj_to_dev(entry->kobj); + + if (entry->size) + sysfs_remove_bin_file(entry->kobj, &entry->pmt_bin_attr); + + if (ns->attr_grp) + sysfs_remove_group(entry->kobj, ns->attr_grp); + + device_unregister(dev); + xa_erase(ns->xa, entry->devid); +} +EXPORT_SYMBOL_GPL(intel_pmt_dev_destroy); + +static int __init pmt_class_init(void) +{ + return class_register(&intel_pmt_class); +} + +static void __exit pmt_class_exit(void) +{ + class_unregister(&intel_pmt_class); +} + +module_init(pmt_class_init); +module_exit(pmt_class_exit); + +MODULE_AUTHOR("Alexander Duyck "); +MODULE_DESCRIPTION("Intel PMT Class driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h new file mode 100644 index 0000000000000000000000000000000000000000..1337019c2873eb37b3abda073700228437bb0b36 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_class.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _INTEL_PMT_CLASS_H +#define _INTEL_PMT_CLASS_H + +#include +#include +#include +#include +#include +#include + +/* PMT access types */ +#define ACCESS_BARID 2 +#define ACCESS_LOCAL 3 + +/* PMT discovery base address/offset register layout */ +#define GET_BIR(v) ((v) & GENMASK(2, 0)) +#define GET_ADDRESS(v) ((v) & GENMASK(31, 3)) + +struct intel_pmt_entry { + struct bin_attribute pmt_bin_attr; + struct kobject *kobj; + void __iomem *disc_table; + void __iomem *base; + unsigned long base_addr; + size_t size; + u32 guid; + int devid; +}; + +struct intel_pmt_header { + u32 base_offset; + u32 size; + u32 guid; + u8 access_type; +}; + +struct intel_pmt_namespace { + const char *name; + struct xarray *xa; + const struct attribute_group *attr_grp; + int (*pmt_header_decode)(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev); +}; + +bool intel_pmt_is_early_client_hw(struct device *dev); +int intel_pmt_dev_create(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct platform_device *pdev, int idx); +void intel_pmt_dev_destroy(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns); +#endif diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c new file mode 100644 index 0000000000000000000000000000000000000000..92d315a16cfd3cd5985fe663d3a155c1e1107d3e --- /dev/null +++ b/drivers/platform/x86/intel_pmt_crashlog.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology Crashlog driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "Alexander Duyck" + */ + +#include +#include +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define DRV_NAME "pmt_crashlog" + +/* Crashlog discovery header types */ +#define CRASH_TYPE_OOBMSM 1 + +/* Control Flags */ +#define CRASHLOG_FLAG_DISABLE BIT(28) + +/* + * Bits 29 and 30 control the state of bit 31. 
+ * + * Bit 29 will clear bit 31, if set, allowing a new crashlog to be captured. + * Bit 30 will immediately trigger a crashlog to be generated, setting bit 31. + * Bit 31 is the read-only status with a 1 indicating log is complete. + */ +#define CRASHLOG_FLAG_TRIGGER_CLEAR BIT(29) +#define CRASHLOG_FLAG_TRIGGER_EXECUTE BIT(30) +#define CRASHLOG_FLAG_TRIGGER_COMPLETE BIT(31) +#define CRASHLOG_FLAG_TRIGGER_MASK GENMASK(31, 28) + +/* Crashlog Discovery Header */ +#define CONTROL_OFFSET 0x0 +#define GUID_OFFSET 0x4 +#define BASE_OFFSET 0x8 +#define SIZE_OFFSET 0xC +#define GET_ACCESS(v) ((v) & GENMASK(3, 0)) +#define GET_TYPE(v) (((v) & GENMASK(7, 4)) >> 4) +#define GET_VERSION(v) (((v) & GENMASK(19, 16)) >> 16) +/* size is in bytes */ +#define GET_SIZE(v) ((v) * sizeof(u32)) + +struct crashlog_entry { + /* entry must be first member of struct */ + struct intel_pmt_entry entry; + struct mutex control_mutex; +}; + +struct pmt_crashlog_priv { + int num_entries; + struct crashlog_entry entry[]; +}; + +/* + * I/O + */ +static bool pmt_crashlog_complete(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* return current value of the crashlog complete flag */ + return !!(control & CRASHLOG_FLAG_TRIGGER_COMPLETE); +} + +static bool pmt_crashlog_disabled(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* return current value of the crashlog disabled flag */ + return !!(control & CRASHLOG_FLAG_DISABLE); +} + +static bool pmt_crashlog_supported(struct intel_pmt_entry *entry) +{ + u32 discovery_header = readl(entry->disc_table + CONTROL_OFFSET); + u32 crash_type, version; + + crash_type = GET_TYPE(discovery_header); + version = GET_VERSION(discovery_header); + + /* + * Currently we only recognize OOBMSM version 0 devices. + * We can ignore all other crashlog devices in the system. 
+ */ + return crash_type == CRASH_TYPE_OOBMSM && version == 0; +} + +static void pmt_crashlog_set_disable(struct intel_pmt_entry *entry, + bool disable) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* clear trigger bits so we are only modifying disable flag */ + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + + if (disable) + control |= CRASHLOG_FLAG_DISABLE; + else + control &= ~CRASHLOG_FLAG_DISABLE; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +static void pmt_crashlog_set_clear(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + control |= CRASHLOG_FLAG_TRIGGER_CLEAR; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +static void pmt_crashlog_set_execute(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + control |= CRASHLOG_FLAG_TRIGGER_EXECUTE; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +/* + * sysfs + */ +static ssize_t +enable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + int enabled = !pmt_crashlog_disabled(entry); + + return sprintf(buf, "%d\n", enabled); +} + +static ssize_t +enable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct crashlog_entry *entry; + bool enabled; + int result; + + entry = dev_get_drvdata(dev); + + result = kstrtobool(buf, &enabled); + if (result) + return result; + + mutex_lock(&entry->control_mutex); + pmt_crashlog_set_disable(&entry->entry, !enabled); + mutex_unlock(&entry->control_mutex); + + return count; +} +static DEVICE_ATTR_RW(enable); + +static ssize_t +trigger_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry; + int trigger; + + entry = dev_get_drvdata(dev); + trigger = pmt_crashlog_complete(entry); + + return sprintf(buf, "%d\n", trigger); +} + +static ssize_t +trigger_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct crashlog_entry *entry; + bool trigger; + int result; + + entry = dev_get_drvdata(dev); + + result = kstrtobool(buf, &trigger); + if (result) + return result; + + mutex_lock(&entry->control_mutex); + + if (!trigger) { + pmt_crashlog_set_clear(&entry->entry); + } else if (pmt_crashlog_complete(&entry->entry)) { + /* we cannot trigger a new crash if one is still pending */ + result = -EEXIST; + goto err; + } else if (pmt_crashlog_disabled(&entry->entry)) { + /* if device is currently disabled, return busy */ + result = -EBUSY; + goto err; + } else { + pmt_crashlog_set_execute(&entry->entry); + } + + result = count; +err: + mutex_unlock(&entry->control_mutex); + return result; +} +static DEVICE_ATTR_RW(trigger); + +static struct attribute *pmt_crashlog_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_trigger.attr, + NULL +}; + +static struct attribute_group pmt_crashlog_group = { + .attrs = pmt_crashlog_attrs, +}; + +static int pmt_crashlog_header_decode(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev) +{ + void __iomem *disc_table = entry->disc_table; + struct crashlog_entry *crashlog; + + if (!pmt_crashlog_supported(entry)) + return 1; + + /* initialize control mutex */ + crashlog = container_of(entry, struct crashlog_entry, entry); + mutex_init(&crashlog->control_mutex); + + header->access_type = GET_ACCESS(readl(disc_table)); 
+ header->guid = readl(disc_table + GUID_OFFSET); + header->base_offset = readl(disc_table + BASE_OFFSET); + + /* Size is measured in DWORDS, but accessor returns bytes */ + header->size = GET_SIZE(readl(disc_table + SIZE_OFFSET)); + + return 0; +} + +static DEFINE_XARRAY_ALLOC(crashlog_array); +static struct intel_pmt_namespace pmt_crashlog_ns = { + .name = "crashlog", + .xa = &crashlog_array, + .attr_grp = &pmt_crashlog_group, + .pmt_header_decode = pmt_crashlog_header_decode, +}; + +/* + * initialization + */ +static int pmt_crashlog_remove(struct platform_device *pdev) +{ + struct pmt_crashlog_priv *priv = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < priv->num_entries; i++) + intel_pmt_dev_destroy(&priv->entry[i].entry, &pmt_crashlog_ns); + + return 0; +} + +static int pmt_crashlog_probe(struct platform_device *pdev) +{ + struct pmt_crashlog_priv *priv; + size_t size; + int i, ret; + + size = struct_size(priv, entry, pdev->num_resources); + priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!priv) + return -ENOMEM; + + platform_set_drvdata(pdev, priv); + + for (i = 0; i < pdev->num_resources; i++) { + struct intel_pmt_entry *entry = &priv->entry[i].entry; + + ret = intel_pmt_dev_create(entry, &pmt_crashlog_ns, pdev, i); + if (ret < 0) + goto abort_probe; + if (ret) + continue; + + priv->num_entries++; + } + + return 0; +abort_probe: + pmt_crashlog_remove(pdev); + return ret; +} + +static struct platform_driver pmt_crashlog_driver = { + .driver = { + .name = DRV_NAME, + }, + .remove = pmt_crashlog_remove, + .probe = pmt_crashlog_probe, +}; + +static int __init pmt_crashlog_init(void) +{ + return platform_driver_register(&pmt_crashlog_driver); +} + +static void __exit pmt_crashlog_exit(void) +{ + platform_driver_unregister(&pmt_crashlog_driver); + xa_destroy(&crashlog_array); +} + +module_init(pmt_crashlog_init); +module_exit(pmt_crashlog_exit); + +MODULE_AUTHOR("Alexander Duyck "); +MODULE_DESCRIPTION("Intel PMT Crashlog driver"); +MODULE_ALIAS("platform:" DRV_NAME); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c new file mode 100644 index 0000000000000000000000000000000000000000..9b95ef0504576fcd998b2c567d819961daf37f07 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_telemetry.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitory Technology Telemetry driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "David E. 
Box" + */ + +#include +#include +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define TELEM_DEV_NAME "pmt_telemetry" + +#define TELEM_SIZE_OFFSET 0x0 +#define TELEM_GUID_OFFSET 0x4 +#define TELEM_BASE_OFFSET 0x8 +#define TELEM_ACCESS(v) ((v) & GENMASK(3, 0)) +/* size is in bytes */ +#define TELEM_SIZE(v) (((v) & GENMASK(27, 12)) >> 10) + +/* Used by client hardware to identify a fixed telemetry entry*/ +#define TELEM_CLIENT_FIXED_BLOCK_GUID 0x10000000 + +struct pmt_telem_priv { + int num_entries; + struct intel_pmt_entry entry[]; +}; + +static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry, + struct device *dev) +{ + u32 guid = readl(entry->disc_table + TELEM_GUID_OFFSET); + + if (guid != TELEM_CLIENT_FIXED_BLOCK_GUID) + return false; + + return intel_pmt_is_early_client_hw(dev); +} + +static int pmt_telem_header_decode(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev) +{ + void __iomem *disc_table = entry->disc_table; + + if (pmt_telem_region_overlaps(entry, dev)) + return 1; + + header->access_type = TELEM_ACCESS(readl(disc_table)); + header->guid = readl(disc_table + TELEM_GUID_OFFSET); + header->base_offset = readl(disc_table + TELEM_BASE_OFFSET); + + /* Size is measured in DWORDS, but accessor returns bytes */ + header->size = TELEM_SIZE(readl(disc_table)); + + return 0; +} + +static DEFINE_XARRAY_ALLOC(telem_array); +static struct intel_pmt_namespace pmt_telem_ns = { + .name = "telem", + .xa = &telem_array, + .pmt_header_decode = pmt_telem_header_decode, +}; + +static int pmt_telem_remove(struct platform_device *pdev) +{ + struct pmt_telem_priv *priv = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < priv->num_entries; i++) + intel_pmt_dev_destroy(&priv->entry[i], &pmt_telem_ns); + + return 0; +} + +static int pmt_telem_probe(struct platform_device *pdev) +{ + struct pmt_telem_priv *priv; + size_t size; + int i, ret; + + size = struct_size(priv, entry, pdev->num_resources); + priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!priv) + return -ENOMEM; + + platform_set_drvdata(pdev, priv); + + for (i = 0; i < pdev->num_resources; i++) { + struct intel_pmt_entry *entry = &priv->entry[i]; + + ret = intel_pmt_dev_create(entry, &pmt_telem_ns, pdev, i); + if (ret < 0) + goto abort_probe; + if (ret) + continue; + + priv->num_entries++; + } + + return 0; +abort_probe: + pmt_telem_remove(pdev); + return ret; +} + +static struct platform_driver pmt_telem_driver = { + .driver = { + .name = TELEM_DEV_NAME, + }, + .remove = pmt_telem_remove, + .probe = pmt_telem_probe, +}; + +static int __init pmt_telem_init(void) +{ + return platform_driver_register(&pmt_telem_driver); +} +module_init(pmt_telem_init); + +static void __exit pmt_telem_exit(void) +{ + platform_driver_unregister(&pmt_telem_driver); + xa_destroy(&telem_array); +} +module_exit(pmt_telem_exit); + +MODULE_AUTHOR("David E. 
Box "); +MODULE_DESCRIPTION("Intel PMT Telemetry driver"); +MODULE_ALIAS("platform:" TELEM_DEV_NAME); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c index 0c2aa22c7a12eaf36d9670dc450868fbc64b4543..61ddf9f769afd4a53ef1512921e8b45ebc356df1 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -265,9 +265,9 @@ static int isst_if_get_platform_info(void __user *argp) { struct isst_if_platform_info info; - info.api_version = ISST_IF_API_VERSION, - info.driver_version = ISST_IF_DRIVER_VERSION, - info.max_cmds_per_ioctl = ISST_IF_CMD_LIMIT, + info.api_version = ISST_IF_API_VERSION; + info.driver_version = ISST_IF_DRIVER_VERSION; + info.max_cmds_per_ioctl = ISST_IF_CMD_LIMIT; info.mbox_supported = punit_callbacks[ISST_IF_DEV_MBOX].registered; info.mmio_supported = punit_callbacks[ISST_IF_DEV_MMIO].registered; @@ -281,10 +281,69 @@ static int isst_if_get_platform_info(void __user *argp) struct isst_if_cpu_info { /* For BUS 0 and BUS 1 only, which we need for PUNIT interface */ int bus_info[2]; + struct pci_dev *pci_dev[2]; int punit_cpu_id; + int numa_node; }; static struct isst_if_cpu_info *isst_cpu_info; +#define ISST_MAX_PCI_DOMAINS 8 + +static struct pci_dev *_isst_if_get_pci_dev(int cpu, int bus_no, int dev, int fn) +{ + struct pci_dev *matched_pci_dev = NULL; + struct pci_dev *pci_dev = NULL; + int no_matches = 0; + int i, bus_number; + + if (bus_no < 0 || bus_no > 1 || cpu < 0 || cpu >= nr_cpu_ids || + cpu >= num_possible_cpus()) + return NULL; + + bus_number = isst_cpu_info[cpu].bus_info[bus_no]; + if (bus_number < 0) + return NULL; + + for (i = 0; i < ISST_MAX_PCI_DOMAINS; ++i) { + struct pci_dev *_pci_dev; + int node; + + _pci_dev = pci_get_domain_bus_and_slot(i, bus_number, PCI_DEVFN(dev, fn)); + if (!_pci_dev) + continue; + + ++no_matches; + if (!matched_pci_dev) + matched_pci_dev = _pci_dev; + + node = dev_to_node(&_pci_dev->dev); + if (node == NUMA_NO_NODE) { + pr_info("Fail to get numa node for CPU:%d bus:%d dev:%d fn:%d\n", + cpu, bus_no, dev, fn); + continue; + } + + if (node == isst_cpu_info[cpu].numa_node) { + pci_dev = _pci_dev; + break; + } + } + + /* + * If there is no numa matched pci_dev, then there can be following cases: + * 1. CONFIG_NUMA is not defined: In this case if there is only single device + * match, then we don't need numa information. Simply return last match. + * Othewise return NULL. + * 2. NUMA information is not exposed via _SEG method. In this case it is similar + * to case 1. + * 3. Numa information doesn't match with CPU numa node and more than one match + * return NULL. 
+ */ + if (!pci_dev && no_matches == 1) + pci_dev = matched_pci_dev; + + return pci_dev; +} /** * isst_if_get_pci_dev() - Get the PCI device instance for a CPU @@ -300,17 +359,18 @@ static struct isst_if_cpu_info *isst_cpu_info; */ struct pci_dev *isst_if_get_pci_dev(int cpu, int bus_no, int dev, int fn) { - int bus_number; + struct pci_dev *pci_dev; if (bus_no < 0 || bus_no > 1 || cpu < 0 || cpu >= nr_cpu_ids || cpu >= num_possible_cpus()) return NULL; - bus_number = isst_cpu_info[cpu].bus_info[bus_no]; - if (bus_number < 0) - return NULL; + pci_dev = isst_cpu_info[cpu].pci_dev[bus_no]; + + if (pci_dev && pci_dev->devfn == PCI_DEVFN(dev, fn)) + return pci_dev; - return pci_get_domain_bus_and_slot(0, bus_number, PCI_DEVFN(dev, fn)); + return _isst_if_get_pci_dev(cpu, bus_no, dev, fn); } EXPORT_SYMBOL_GPL(isst_if_get_pci_dev); @@ -327,6 +387,8 @@ static int isst_if_cpu_online(unsigned int cpu) } else { isst_cpu_info[cpu].bus_info[0] = data & 0xff; isst_cpu_info[cpu].bus_info[1] = (data >> 8) & 0xff; + isst_cpu_info[cpu].pci_dev[0] = _isst_if_get_pci_dev(cpu, 0, 0, 1); + isst_cpu_info[cpu].pci_dev[1] = _isst_if_get_pci_dev(cpu, 1, 30, 1); } ret = rdmsrl_safe(MSR_THREAD_ID_INFO, &data); @@ -335,6 +397,7 @@ static int isst_if_cpu_online(unsigned int cpu) return ret; } isst_cpu_info[cpu].punit_cpu_id = data; + isst_cpu_info[cpu].numa_node = cpu_to_node(cpu); isst_restore_msr_local(cpu); diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h index 4f6f7f0761fc1c940b3e165c08abd7257d3555db..fdecdae248d7787200c3e09077b4df74d3f70a67 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h @@ -10,11 +10,11 @@ #ifndef __ISST_IF_COMMON_H #define __ISST_IF_COMMON_H -#define INTEL_RAPL_PRIO_DEVID_0 0x3451 -#define INTEL_CFG_MBOX_DEVID_0 0x3459 +#define PCI_DEVICE_ID_INTEL_RAPL_PRIO_DEVID_0 0x3451 +#define PCI_DEVICE_ID_INTEL_CFG_MBOX_DEVID_0 0x3459 -#define INTEL_RAPL_PRIO_DEVID_1 0x3251 -#define INTEL_CFG_MBOX_DEVID_1 0x3259 +#define PCI_DEVICE_ID_INTEL_RAPL_PRIO_DEVID_1 0x3251 +#define PCI_DEVICE_ID_INTEL_CFG_MBOX_DEVID_1 0x3259 /* * Validate maximum commands in a single request. 
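The fast path added to isst_if_get_pci_dev() above only reuses the pci_dev cached at CPU-online time when the caller asks for the exact device/function that was cached; anything else falls back to the per-domain, NUMA-aware scan in _isst_if_get_pci_dev(). A minimal sketch of that decision follows; the function name cached_or_scan() is made up for illustration and assumes the isst_cpu_info[] array and helpers introduced by this patch, it is not itself part of the patch.

	/* PCI_DEVFN() packs slot and function: PCI_DEVFN(30, 1) == (30 << 3) | 1 == 0xf1 */
	static struct pci_dev *cached_or_scan(int cpu, int bus_no, int dev, int fn)
	{
		struct pci_dev *cached = isst_cpu_info[cpu].pci_dev[bus_no];

		if (cached && cached->devfn == PCI_DEVFN(dev, fn))
			return cached;				/* hit: device cached at CPU online */

		return _isst_if_get_pci_dev(cpu, bus_no, dev, fn);	/* miss: NUMA-aware domain scan */
	}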
diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c index 95f01e7a87d573a5bcd1f2cc12550d5d0e972f6d..a2a2d923e60cbf1347cee9b5785feb3cabb5d524 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c @@ -146,8 +146,8 @@ static long isst_if_mbox_proc_cmd(u8 *cmd_ptr, int *write_only, int resume) } static const struct pci_device_id isst_if_mbox_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_CFG_MBOX_DEVID_0)}, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_CFG_MBOX_DEVID_1)}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_CFG_MBOX_DEVID_0)}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_CFG_MBOX_DEVID_1)}, { 0 }, }; MODULE_DEVICE_TABLE(pci, isst_if_mbox_ids); diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c index aa17fd7817f8fdd4e075a15c7486d1d1bafbfc2f..ff49025ec0856ab587709a352442159bc97144c7 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c @@ -20,15 +20,21 @@ struct isst_mmio_range { int end; }; -struct isst_mmio_range mmio_range[] = { +static struct isst_mmio_range mmio_range_devid_0[] = { {0x04, 0x14}, {0x20, 0xD0}, }; +static struct isst_mmio_range mmio_range_devid_1[] = { + {0x04, 0x14}, + {0x20, 0x11C}, +}; + struct isst_if_device { void __iomem *punit_mmio; u32 range_0[5]; - u32 range_1[45]; + u32 range_1[64]; + struct isst_mmio_range *mmio_range; struct mutex mutex; }; @@ -39,7 +45,8 @@ static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) struct pci_dev *pdev; io_reg = (struct isst_if_io_reg *)cmd_ptr; - if (io_reg->reg < 0x04 || io_reg->reg > 0xD0) + + if (io_reg->reg % 4) return -EINVAL; if (io_reg->read_write && !capable(CAP_SYS_ADMIN)) @@ -53,6 +60,10 @@ static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) if (!punit_dev) return -EINVAL; + if (io_reg->reg < punit_dev->mmio_range[0].beg || + io_reg->reg > punit_dev->mmio_range[1].end) + return -EINVAL; + /* * Ensure that operation is complete on a PCI device to avoid read * write race by using per PCI device mutex. 
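For the MMIO read/write path just above, the two new checks replace the old fixed 0x04..0xD0 window: the register offset must be dword aligned and must fall between mmio_range[0].beg and mmio_range[1].end of the range table attached to the matched PCI device ID. A small illustration for the 0x3251 device (ranges 0x04-0x14 and 0x20-0x11C); the helper name is invented for the example and is not part of the patch. Note that, as written, offsets in the gap between the two ranges (for example 0x18) are not rejected by this particular check.

	/* Mirrors the validation in isst_if_mmio_rd_wr() for mmio_range_devid_1. */
	static bool reg_offset_ok(u32 reg)
	{
		if (reg % 4)			/* must be 4-byte aligned */
			return false;
		return reg >= 0x04 && reg <= 0x11C;
	}

	/* reg_offset_ok(0x118) -> true   (inside the second range)          */
	/* reg_offset_ok(0x11E) -> false  (not 4-byte aligned)               */
	/* reg_offset_ok(0x120) -> false  (beyond mmio_range[1].end = 0x11C) */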
@@ -71,8 +82,8 @@ static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) } static const struct pci_device_id isst_if_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_0)}, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_1)}, + { PCI_DEVICE_DATA(INTEL, RAPL_PRIO_DEVID_0, &mmio_range_devid_0)}, + { PCI_DEVICE_DATA(INTEL, RAPL_PRIO_DEVID_1, &mmio_range_devid_1)}, { 0 }, }; MODULE_DEVICE_TABLE(pci, isst_if_ids); @@ -109,6 +120,7 @@ static int isst_if_probe(struct pci_dev *pdev, const struct pci_device_id *ent) mutex_init(&punit_dev->mutex); pci_set_drvdata(pdev, punit_dev); + punit_dev->mmio_range = (struct isst_mmio_range *) ent->driver_data; memset(&cb, 0, sizeof(cb)); cb.cmd_size = sizeof(struct isst_if_io_reg); @@ -138,10 +150,15 @@ static int __maybe_unused isst_if_suspend(struct device *device) for (i = 0; i < ARRAY_SIZE(punit_dev->range_0); ++i) punit_dev->range_0[i] = readl(punit_dev->punit_mmio + - mmio_range[0].beg + 4 * i); - for (i = 0; i < ARRAY_SIZE(punit_dev->range_1); ++i) - punit_dev->range_1[i] = readl(punit_dev->punit_mmio + - mmio_range[1].beg + 4 * i); + punit_dev->mmio_range[0].beg + 4 * i); + for (i = 0; i < ARRAY_SIZE(punit_dev->range_1); ++i) { + u32 addr; + + addr = punit_dev->mmio_range[1].beg + 4 * i; + if (addr > punit_dev->mmio_range[1].end) + break; + punit_dev->range_1[i] = readl(punit_dev->punit_mmio + addr); + } return 0; } @@ -153,10 +170,16 @@ static int __maybe_unused isst_if_resume(struct device *device) for (i = 0; i < ARRAY_SIZE(punit_dev->range_0); ++i) writel(punit_dev->range_0[i], punit_dev->punit_mmio + - mmio_range[0].beg + 4 * i); - for (i = 0; i < ARRAY_SIZE(punit_dev->range_1); ++i) - writel(punit_dev->range_1[i], punit_dev->punit_mmio + - mmio_range[1].beg + 4 * i); + punit_dev->mmio_range[0].beg + 4 * i); + for (i = 0; i < ARRAY_SIZE(punit_dev->range_1); ++i) { + u32 addr; + + addr = punit_dev->mmio_range[1].beg + 4 * i; + if (addr > punit_dev->mmio_range[1].end) + break; + + writel(punit_dev->range_1[i], punit_dev->punit_mmio + addr); + } return 0; } diff --git a/drivers/power/avs/Kconfig b/drivers/power/avs/Kconfig index b5a217b828dcec78c9e39adf56d91c003babc99d..089b6244b716b889ef096a05a8803a4328fd4c78 100644 --- a/drivers/power/avs/Kconfig +++ b/drivers/power/avs/Kconfig @@ -13,9 +13,9 @@ menuconfig POWER_AVS Say Y here to enable Adaptive Voltage Scaling class support. config ROCKCHIP_IODOMAIN - tristate "Rockchip IO domain support" - depends on POWER_AVS && ARCH_ROCKCHIP && OF - help - Say y here to enable support io domains on Rockchip SoCs. It is - necessary for the io domain setting of the SoC to match the - voltage supplied by the regulators. + tristate "Rockchip IO domain support" + depends on POWER_AVS && ARCH_ROCKCHIP && OF + help + Say y here to enable support io domains on Rockchip SoCs. It is + necessary for the io domain setting of the SoC to match the + voltage supplied by the regulators. diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 925b0004a0eda1a57f680c4f050ba8d3b35d837c..8566bf72f3fbe56e1b09a63aa6c0fbc885d078a4 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -62,6 +62,20 @@ #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff #define PP_POLICY_MASK 0x1F +/* + * SPR has different layout for Psys Domain PowerLimit registers. + * There are 17 bits of PL1 and PL2 instead of 15 bits. + * The Enable bits and TimeWindow bits are also shifted as a result. 
+ */ +#define PSYS_POWER_LIMIT1_MASK 0x1FFFF +#define PSYS_POWER_LIMIT1_ENABLE BIT(17) + +#define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32) +#define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) + +#define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19) +#define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51) + /* Non HW constants */ #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ #define RAPL_PRIMITIVE_DUMMY BIT(2) @@ -96,6 +110,8 @@ struct rapl_defaults { u64 (*compute_time_window)(struct rapl_package *rp, u64 val, bool to_raw); unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; + bool spr_psys_bits; }; static struct rapl_defaults *rapl_defaults; @@ -536,12 +552,23 @@ static void rapl_init_domains(struct rapl_package *rp) for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++) rd->regs[j] = rp->priv->regs[i][j]; - if (i == RAPL_DOMAIN_DRAM) { + switch (i) { + case RAPL_DOMAIN_DRAM: rd->domain_energy_unit = rapl_defaults->dram_domain_energy_unit; if (rd->domain_energy_unit) pr_info("DRAM domain energy unit %dpj\n", rd->domain_energy_unit); + break; + case RAPL_DOMAIN_PLATFORM: + rd->domain_energy_unit = + rapl_defaults->psys_domain_energy_unit; + if (rd->domain_energy_unit) + pr_info("Platform domain energy unit %dpj\n", + rd->domain_energy_unit); + break; + default: + break; } rd++; } @@ -616,12 +643,51 @@ static struct rapl_primitive_info rpi[] = { RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), /* non-hardware */ PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, RAPL_PRIMITIVE_DERIVED), {NULL, 0, 0, 0}, }; +static enum rapl_primitives +prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) +{ + if (!rapl_defaults->spr_psys_bits) + return prim; + + if (rd->id != RAPL_DOMAIN_PLATFORM) + return prim; + + switch (prim) { + case POWER_LIMIT1: + return PSYS_POWER_LIMIT1; + case POWER_LIMIT2: + return PSYS_POWER_LIMIT2; + case PL1_ENABLE: + return PSYS_PL1_ENABLE; + case PL2_ENABLE: + return PSYS_PL2_ENABLE; + case TIME_WINDOW1: + return PSYS_TIME_WINDOW1; + case TIME_WINDOW2: + return PSYS_TIME_WINDOW2; + default: + return prim; + } +} + /* Read primitive data based on its related struct rapl_primitive_info. * if xlate flag is set, return translated data based on data units, i.e. * time, energy, and power. 
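To make the new Psys layout concrete: with the masks defined above, the 64-bit power limit register on SPR decodes as PL1 power in bits 16:0 with its enable in bit 17 and time window in bits 25:19, and PL2 power in bits 48:32 with its enable in bit 49 and time window in bits 57:51. prim_fixups() simply redirects the generic POWER_LIMIT*, PL*_ENABLE and TIME_WINDOW* primitives to these PSYS_* variants for the platform domain when spr_psys_bits is set, and the core then applies the mask and shift as usual. A sketch of the decode using the patch's masks; the helper names are illustrative and not part of the patch.

	static inline u64  psys_pl1(u64 reg)    { return reg & PSYS_POWER_LIMIT1_MASK; }          /* bits 16:0  */
	static inline bool psys_pl1_en(u64 reg) { return reg & PSYS_POWER_LIMIT1_ENABLE; }        /* bit 17     */
	static inline u64  psys_tw1(u64 reg)    { return (reg & PSYS_TIME_WINDOW1_MASK) >> 19; }  /* bits 25:19 */
	static inline u64  psys_pl2(u64 reg)    { return (reg & PSYS_POWER_LIMIT2_MASK) >> 32; }  /* bits 48:32 */
	static inline bool psys_pl2_en(u64 reg) { return reg & PSYS_POWER_LIMIT2_ENABLE; }        /* bit 49     */
	static inline u64  psys_tw2(u64 reg)    { return (reg & PSYS_TIME_WINDOW2_MASK) >> 51; }  /* bits 57:51 */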
@@ -639,7 +705,8 @@ static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, bool xlate, u64 *data) { u64 value; - struct rapl_primitive_info *rp = &rpi[prim]; + enum rapl_primitives prim_fixed = prim_fixups(rd, prim); + struct rapl_primitive_info *rp = &rpi[prim_fixed]; struct reg_action ra; int cpu; @@ -685,7 +752,8 @@ static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value) { - struct rapl_primitive_info *rp = &rpi[prim]; + enum rapl_primitives prim_fixed = prim_fixups(rd, prim); + struct rapl_primitive_info *rp = &rpi[prim_fixed]; int cpu; u64 bits; struct reg_action ra; @@ -922,6 +990,15 @@ static const struct rapl_defaults rapl_defaults_hsw_server = { .dram_domain_energy_unit = 15300, }; +static const struct rapl_defaults rapl_defaults_spr_server = { + .check_unit = rapl_check_unit_core, + .set_floor_freq = set_floor_freq_default, + .compute_time_window = rapl_compute_time_window_core, + .dram_domain_energy_unit = 15300, + .psys_domain_energy_unit = 1000000000, + .spr_psys_bits = true, +}; + static const struct rapl_defaults rapl_defaults_byt = { .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT, .check_unit = rapl_check_unit_atom, @@ -978,6 +1055,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { INTEL_CPU_FAM6(ICELAKE_NNPI, rapl_defaults_core), INTEL_CPU_FAM6(ICELAKE_X, rapl_defaults_hsw_server), INTEL_CPU_FAM6(ICELAKE_D, rapl_defaults_hsw_server), + INTEL_CPU_FAM6(SAPPHIRERAPIDS_X, rapl_defaults_spr_server), INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt), INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht), diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 125a173bed458bebfb334370b6a7556c2858e8fe..4dd31dd9feeabb0bb55a62bd478a017afa92852f 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -2755,7 +2755,7 @@ static int tsi721_probe(struct pci_dev *pdev, { int i; - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { tsi_debug(INIT, &pdev->dev, "res%d %pR", i, &pdev->resource[i]); } diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 94def7d5be78959179f37ab511731eb6217cf849..6aeb3f6dfd8ffa6dfb839d61cfa0a19ecf5968b6 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -5028,7 +5028,7 @@ static DEVICE_ATTR(lpfc_aer_support, S_IRUGO | S_IWUSR, * Description: * If the @buf contains 1 and the device currently has the AER support * enabled, then invokes the kernel AER helper routine - * pci_aer_clear_nonfatal_status to clean up the uncorrectable error + * pci_aer_clear_nonfatal_status() to clean up the uncorrectable * status register. 
* * Notes: @@ -5054,11 +5054,7 @@ lpfc_aer_cleanup_state(struct device *dev, struct device_attribute *attr, return -EINVAL; if (phba->hba_flag & HBA_AER_ENABLED) -#if defined(BUILD_SLES15SP2) || defined(BUILD_RHEL83) rc = pci_aer_clear_nonfatal_status(phba->pcidev); -#else - rc = pci_cleanup_aer_uncorrect_error_status(phba->pcidev); -#endif if (rc == 0) return strlen(buf); diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c old mode 100755 new mode 100644 index d38617f7f6f731aea997d2a96bd60a5f37fbeb97..4bada4334c9f094cfbeac99d8667cba9e61897ef --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -16187,7 +16187,7 @@ scsih_pci_resume(struct pci_dev *pdev) && (CONFIG_SUSE_PATCHLEVEL >= 2)))) pci_aer_clear_nonfatal_status(pdev); #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19) - pci_cleanup_aer_uncorrect_error_status(pdev); + pci_aer_clear_nonfatal_status(pdev); #endif mpt3sas_base_start_watchdog(ioc); mpt3sas_base_start_hba_unplug_watchdog(ioc); diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index 68a8217032d0fb17cc6be74c18490d2b34520f27..1a3661d6be06b30666c65684f1f99481321516f8 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -1186,7 +1186,7 @@ static void pm8001_hw_chip_rst(struct pm8001_hba_info *pm8001_ha) void pm8001_chip_iounmap(struct pm8001_hba_info *pm8001_ha) { s8 bar, logical = 0; - for (bar = 0; bar < 6; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { /* ** logical BARs for SPC: ** bar 0 and 1 - logical BAR0 diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 8882ba33ca87cbe08ba3a491f12992cd90ce317d..50b6b0d24839eaf5d1e5d0d41c1977f059bc1233 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -401,7 +401,7 @@ static int pm8001_ioremap(struct pm8001_hba_info *pm8001_ha) pdev = pm8001_ha->pdev; /* map pci mem (PMC pci base 0-3)*/ - for (bar = 0; bar < 6; bar++) { + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { /* ** logical BARs for SPC: ** bar 0 and 1 - logical BAR0 diff --git a/drivers/staging/gasket/gasket_constants.h b/drivers/staging/gasket/gasket_constants.h index 50d87c7b178c25dffd626a9ebf8319d06c3994a2..9ea9c8833f27a066b4d2a7a5eb188a153ee4f7b4 100644 --- a/drivers/staging/gasket/gasket_constants.h +++ b/drivers/staging/gasket/gasket_constants.h @@ -13,9 +13,6 @@ /* The maximum devices per each type. */ #define GASKET_DEV_MAX 256 -/* The number of supported (and possible) PCI BARs. */ -#define GASKET_NUM_BARS 6 - /* The number of supported Gasket page tables per device. 
*/ #define GASKET_MAX_NUM_PAGE_TABLES 1 diff --git a/drivers/staging/gasket/gasket_core.c b/drivers/staging/gasket/gasket_core.c index 6f9c0d18d9ce45aa090abbe41e476721a68a0a3f..26e516997ea2630f5222dbe53061560c24353ecf 100644 --- a/drivers/staging/gasket/gasket_core.c +++ b/drivers/staging/gasket/gasket_core.c @@ -371,7 +371,7 @@ static int gasket_setup_pci(struct pci_dev *pci_dev, { int i, mapped_bars, ret; - for (i = 0; i < GASKET_NUM_BARS; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { ret = gasket_map_pci_bar(gasket_dev, i); if (ret) { mapped_bars = i; @@ -393,7 +393,7 @@ static void gasket_cleanup_pci(struct gasket_dev *gasket_dev) { int i; - for (i = 0; i < GASKET_NUM_BARS; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) gasket_unmap_pci_bar(gasket_dev, i); } @@ -493,7 +493,7 @@ static ssize_t gasket_sysfs_data_show(struct device *device, (enum gasket_sysfs_attribute_type)gasket_attr->data.attr_type; switch (sysfs_type) { case ATTR_BAR_OFFSETS: - for (i = 0; i < GASKET_NUM_BARS; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar_desc = &driver_desc->bar_descriptions[i]; if (bar_desc->size == 0) continue; @@ -505,7 +505,7 @@ static ssize_t gasket_sysfs_data_show(struct device *device, } break; case ATTR_BAR_SIZES: - for (i = 0; i < GASKET_NUM_BARS; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar_desc = &driver_desc->bar_descriptions[i]; if (bar_desc->size == 0) continue; @@ -556,7 +556,7 @@ static ssize_t gasket_sysfs_data_show(struct device *device, ret = snprintf(buf, PAGE_SIZE, "%d\n", gasket_dev->reset_count); break; case ATTR_USER_MEM_RANGES: - for (i = 0; i < GASKET_NUM_BARS; ++i) { + for (i = 0; i < PCI_STD_NUM_BARS; ++i) { current_written = gasket_write_mappable_regions(buf, driver_desc, i); @@ -736,7 +736,7 @@ static int gasket_get_bar_index(const struct gasket_dev *gasket_dev, const struct gasket_driver_desc *driver_desc; driver_desc = gasket_dev->internal_desc->driver_desc; - for (i = 0; i < GASKET_NUM_BARS; ++i) { + for (i = 0; i < PCI_STD_NUM_BARS; ++i) { struct gasket_bar_desc bar_desc = driver_desc->bar_descriptions[i]; diff --git a/drivers/staging/gasket/gasket_core.h b/drivers/staging/gasket/gasket_core.h index be44ac1e3118626795912b661a5ce060aa8f6974..c417acadb0d51ac2d9f2b235da6fb3e9f1d8966b 100644 --- a/drivers/staging/gasket/gasket_core.h +++ b/drivers/staging/gasket/gasket_core.h @@ -268,7 +268,7 @@ struct gasket_dev { char kobj_name[GASKET_NAME_MAX]; /* Virtual address of mapped BAR memory range. */ - struct gasket_bar_data bar_data[GASKET_NUM_BARS]; + struct gasket_bar_data bar_data[PCI_STD_NUM_BARS]; /* Coherent buffer. */ struct gasket_coherent_buffer coherent_buffer; @@ -369,7 +369,7 @@ struct gasket_driver_desc { /* Set of 6 bar descriptions that describe all PCIe bars. * Note that BUS/AXI devices (i.e. non PCI devices) use those. */ - struct gasket_bar_desc bar_descriptions[GASKET_NUM_BARS]; + struct gasket_bar_desc bar_descriptions[PCI_STD_NUM_BARS]; /* * Coherent buffer description. 
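The gasket changes above, like the tsi721, pm8001, 8250_pci and USB hunks elsewhere in this series, are the same mechanical conversion: iterate the six standard BARs via the PCI core's PCI_STD_NUM_BARS rather than a driver-local constant such as GASKET_NUM_BARS. A minimal sketch of the pattern; the function name is hypothetical and not taken from the patch.

	static void __iomem *map_first_mem_bar(struct pci_dev *pdev)
	{
		int bar;

		for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
			if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
				continue;
			if (!pci_resource_len(pdev, bar))
				continue;
			return pci_iomap(pdev, bar, 0);	/* 0 = map the whole BAR */
		}

		return NULL;
	}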
diff --git a/drivers/staging/pi433/pi433_if.c b/drivers/staging/pi433/pi433_if.c index 40c6f4e7632f951c363a6833030232b8cef20743..313d22f6210f3c66d3fd0514e9cdf9ecf9876e46 100644 --- a/drivers/staging/pi433/pi433_if.c +++ b/drivers/staging/pi433/pi433_if.c @@ -928,16 +928,6 @@ pi433_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return 0; } -#ifdef CONFIG_COMPAT -static long -pi433_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - return pi433_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); -} -#else -#define pi433_compat_ioctl NULL -#endif /* CONFIG_COMPAT */ - /*-------------------------------------------------------------------------*/ static int pi433_open(struct inode *inode, struct file *filp) @@ -1094,7 +1084,7 @@ static const struct file_operations pi433_fops = { .write = pi433_write, .read = pi433_read, .unlocked_ioctl = pi433_ioctl, - .compat_ioctl = pi433_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, .open = pi433_open, .release = pi433_release, .llseek = no_llseek, diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 8814ff38aa67bd02e29892dac30ed63764c87b69..ee0f469d4d5f81c198b2c281165be16e44bcb251 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -48,8 +48,6 @@ struct f815xxa_data { int idx; }; -#define PCI_NUM_BAR_RESOURCES 6 - struct serial_private { struct pci_dev *dev; unsigned int nr; @@ -89,7 +87,7 @@ setup_port(struct serial_private *priv, struct uart_8250_port *port, { struct pci_dev *dev = priv->dev; - if (bar >= PCI_NUM_BAR_RESOURCES) + if (bar >= PCI_STD_NUM_BARS) return -EINVAL; if (pci_resource_flags(dev, bar) & IORESOURCE_MEM) { @@ -3797,7 +3795,7 @@ serial_pci_guess_board(struct pci_dev *dev, struct pciserial_board *board) return -ENODEV; num_iomem = num_port = 0; - for (i = 0; i < PCI_NUM_BAR_RESOURCES; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_flags(dev, i) & IORESOURCE_IO) { num_port++; if (first_port == -1) @@ -3825,7 +3823,7 @@ serial_pci_guess_board(struct pci_dev *dev, struct pciserial_board *board) */ first_port = -1; num_port = 0; - for (i = 0; i < PCI_NUM_BAR_RESOURCES; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_flags(dev, i) & IORESOURCE_IO && pci_resource_len(dev, i) == 8 && (first_port == -1 || (first_port + num_port) == i)) { diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 35e89460b9ca856493226491ea6688af86e9b8c5..41ce8ffa5d1bf4be50837d2b3def5d481eb15158 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -2707,18 +2707,6 @@ static long usbdev_ioctl(struct file *file, unsigned int cmd, return ret; } -#ifdef CONFIG_COMPAT -static long usbdev_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret; - - ret = usbdev_do_ioctl(file, cmd, compat_ptr(arg)); - - return ret; -} -#endif - /* No kernel lock - fine */ static __poll_t usbdev_poll(struct file *file, struct poll_table_struct *wait) @@ -2742,9 +2730,7 @@ const struct file_operations usbdev_file_operations = { .read = usbdev_read, .poll = usbdev_poll, .unlocked_ioctl = usbdev_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = usbdev_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .mmap = usbdev_mmap, .open = usbdev_open, .release = usbdev_release, diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index 9e26b0143a59a5065fa3126afcfdecddb759fa83..9ae2a7a93df20168720413db81e2c882d6e54dda 100644 --- a/drivers/usb/core/hcd-pci.c +++ 
b/drivers/usb/core/hcd-pci.c @@ -234,7 +234,7 @@ int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) /* UHCI */ int region; - for (region = 0; region < PCI_ROM_RESOURCE; region++) { + for (region = 0; region < PCI_STD_NUM_BARS; region++) { if (!(pci_resource_flags(dev, region) & IORESOURCE_IO)) continue; diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index f6d04491df608464b02894cb175acafbdcc0b423..6c7f0a876b96ba3c4901d2a2934741fff398fe2b 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -728,7 +728,7 @@ static void quirk_usb_handoff_uhci(struct pci_dev *pdev) if (!pio_enabled(pdev)) return; - for (i = 0; i < PCI_ROM_RESOURCE; i++) + for (i = 0; i < PCI_STD_NUM_BARS; i++) if ((pci_resource_flags(pdev, i) & IORESOURCE_IO)) { base = pci_resource_start(pdev, i); break; diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 503ed2f3fbb5eb94c5cc9cfd1080ae1c79c9ea00..2ad5fe473227b4678ea39c2bac854285239fc099 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -1,12 +1,22 @@ # SPDX-License-Identifier: GPL-2.0-only +menuconfig VFIO + tristate "VFIO Non-Privileged userspace driver framework" + select IOMMU_API + select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + help + VFIO provides a framework for secure userspace device drivers. + See Documentation/driver-api/vfio.rst for more details. + + If you don't know what to do here, say N. + +if VFIO config VFIO_IOMMU_TYPE1 tristate - depends on VFIO default n config VFIO_IOMMU_SPAPR_TCE tristate - depends on VFIO && SPAPR_TCE_IOMMU + depends on SPAPR_TCE_IOMMU default VFIO config VFIO_SPAPR_EEH @@ -16,22 +26,11 @@ config VFIO_SPAPR_EEH config VFIO_VIRQFD tristate - depends on VFIO && EVENTFD + select EVENTFD default n -menuconfig VFIO - tristate "VFIO Non-Privileged userspace driver framework" - select IOMMU_API - select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) - help - VFIO provides a framework for secure userspace device drivers. - See Documentation/driver-api/vfio.rst for more details. - - If you don't know what to do here, say N. - -menuconfig VFIO_NOIOMMU +config VFIO_NOIOMMU bool "VFIO No-IOMMU support" - depends on VFIO help VFIO is built on the ability to isolate devices using the IOMMU. Only with an IOMMU can userspace access to DMA capable devices be @@ -47,4 +46,6 @@ menuconfig VFIO_NOIOMMU source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "drivers/vfio/mdev/Kconfig" +endif + source "virt/lib/Kconfig" diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index 5da27f2100f9bd9a986453a1802b533bf056b34d..6b2fa8a194ef6c1a49031f640e09d561e7fe6bf6 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -2,7 +2,6 @@ config VFIO_MDEV tristate "Mediated device driver framework" - depends on VFIO default n help Provides a framework to virtualize devices. @@ -10,9 +9,11 @@ config VFIO_MDEV If you don't know what do here, say N. -config VFIO_MDEV_DEVICE - tristate "VFIO driver for Mediated devices" - depends on VFIO && VFIO_MDEV +config VFIO_MDEV_IDXD + tristate "VFIO Mediated device driver for Intel IDXD" + depends on VFIO && VFIO_MDEV && X86_64 + select IMS_MSI_ARRAY default n help - VFIO based driver for Mediated devices. + VFIO based mediated device driver for + Intel Accelerator Devices driver. 
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile index 101516fdf3753e76cbb24377bedb64d52ae8cc61..a2660c3edf5b1d0aa97032cc1e4e292a7e023d21 100644 --- a/drivers/vfio/mdev/Makefile +++ b/drivers/vfio/mdev/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o vfio_mdev.o obj-$(CONFIG_VFIO_MDEV) += mdev.o -obj-$(CONFIG_VFIO_MDEV_DEVICE) += vfio_mdev.o +obj-$(CONFIG_VFIO_MDEV_IDXD) += idxd/ diff --git a/drivers/vfio/mdev/idxd/Makefile b/drivers/vfio/mdev/idxd/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ab55032b54865720cca33d324d261dfd7c201202 --- /dev/null +++ b/drivers/vfio/mdev/idxd/Makefile @@ -0,0 +1,4 @@ +ccflags-y += -I$(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE=IDXD + +obj-$(CONFIG_VFIO_MDEV_IDXD) += idxd_mdev.o +idxd_mdev-y := mdev.o vdev.o mdev_host.o diff --git a/drivers/vfio/mdev/idxd/mdev.c b/drivers/vfio/mdev/idxd/mdev.c new file mode 100644 index 0000000000000000000000000000000000000000..5ff6de0cae75cbd03e983d585a11ee08ab8894cc --- /dev/null +++ b/drivers/vfio/mdev/idxd/mdev.c @@ -0,0 +1,2489 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" +#include "../mdev_private.h" +#include "mdev.h" + +static u64 idxd_pci_config[] = { + 0x0010000000008086ULL, + 0x0080000008800000ULL, + 0x000000000000000cULL, + 0x000000000000000cULL, + 0x0000000000000000ULL, + 0x2010808600000000ULL, + 0x0000004000000000ULL, + 0x000000ff00000000ULL, + 0x0000060000015011ULL, /* MSI-X capability, hardcoded 2 entries, Encoded as N-1 */ + 0x0000070000000000ULL, + 0x0000000000920010ULL, /* PCIe capability */ + 0x0000000000000000ULL, + 0x0000000000000000ULL, + 0x0000000000000000ULL, + 0x0070001000000000ULL, + 0x0000000000000000ULL, + 0x0000000000000000ULL, + 0x0000000000000000ULL, +}; + +static u64 idxd_pci_ext_cap[] = { + 0x000000611101000fULL, /* ATS capability */ + 0x0000000000000000ULL, + 0x8100000012010013ULL, /* Page Request capability */ + 0x0000000000000001ULL, + 0x000014040001001bULL, /* PASID capability */ + 0x0000000000000000ULL, + 0x0181808600010023ULL, /* Scalable IOV capability */ + 0x0000000100000005ULL, + 0x0000000000000001ULL, + 0x0000000000000000ULL, +}; + +static int idxd_vdcm_set_irqs(struct vdcm_idxd *vidxd, uint32_t flags, + unsigned int index, unsigned int start, + unsigned int count, void *data); +static int vidxd_register_ioasid_notifier(struct vdcm_idxd *vidxd); + +struct idxd_ioasid_work { + struct work_struct work; + struct idxd_wq *wq; + u32 guest_pasid; + u32 host_pasid; +}; + +static const char idxd_dsa_1dwq_name[] = "dsa-1dwq-v1"; +static const char idxd_iax_1dwq_name[] = "iax-1dwq-v1"; +static const char idxd_dsa_1swq_name[] = "dsa-1swq-v1"; +static const char idxd_iax_1swq_name[] = "iax-1swq-v1"; + +static int idxd_vdcm_get_irq_count(struct mdev_device *mdev, int type) +{ + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev); + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + + /* + * Even though the number of MSIX vectors supported are not tied to number of + * wqs being exported, the current design is to allow 1 vector per WQ for guest. 
+ * So here we end up with num of wqs plus 1 that handles the misc interrupts. + */ + if (type == VFIO_PCI_MSI_IRQ_INDEX || type == VFIO_PCI_MSIX_IRQ_INDEX) + return VIDXD_MAX_MSIX_VECS; + else if (type == VFIO_PCI_REQ_IRQ_INDEX) + return 1; + else if (type >= VFIO_PCI_NUM_IRQS && + type < VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs) + return 1; + + return 0; +} + +static void idxd_wq_ioasid_work(struct work_struct *work) +{ + struct idxd_ioasid_work *iwork = container_of(work, struct idxd_ioasid_work, work); + struct idxd_wq *wq = iwork->wq; + + if (wq->state != IDXD_WQ_ENABLED) + return; + + idxd_device_drain_pasid(wq->idxd, iwork->guest_pasid); + ioasid_put(NULL, iwork->host_pasid); + kfree(iwork); +} + +static int idxd_mdev_ioasid_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct idxd_vdev *vdev = container_of(nb, struct idxd_vdev, pasid_nb); + struct mdev_device *mdev = vdev->mdev; + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev); + struct idxd_wq *wq = vidxd->wq; + struct ioasid_nb_args *args = (struct ioasid_nb_args *)data; + struct idxd_ioasid_work *iwork; + + if (event == IOASID_NOTIFY_FREE) { + dev_dbg(mdev_dev(mdev), "ioasid free event\n"); + + if (wq_dedicated(wq)) + return NOTIFY_DONE; + + if (wq->state != IDXD_WQ_ENABLED) + return NOTIFY_DONE; + + iwork = kmalloc(sizeof(*iwork), GFP_ATOMIC); + if (!iwork) + return notifier_from_errno(-ENOMEM); + iwork->wq = wq; + iwork->guest_pasid = args->spid; + iwork->host_pasid = args->id; + INIT_WORK(&iwork->work, idxd_wq_ioasid_work); + ioasid_queue_work(&iwork->work); + return NOTIFY_OK; + } + + return NOTIFY_OK; +} + +int idxd_mdev_get_pasid(struct mdev_device *mdev, u32 *pasid) +{ + struct vfio_group *vfio_group; + struct iommu_domain *iommu_domain; + struct device *dev = mdev_dev(mdev); + struct device *iommu_device = mdev_get_iommu_device(mdev); + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev); + int mdev_pasid; + + if (!vidxd->ivdev.vfio_group) { + dev_warn(dev, "Missing vfio_group.\n"); + return -EINVAL; + } + + vfio_group = vidxd->ivdev.vfio_group; + + iommu_domain = vfio_group_iommu_domain(vfio_group); + if (IS_ERR_OR_NULL(iommu_domain)) + goto err; + + mdev_pasid = iommu_aux_get_pasid(iommu_domain, iommu_device); + if (mdev_pasid < 0) + goto err; + + *pasid = (u32)mdev_pasid; + return 0; + + err: + vfio_group_put_external_user(vfio_group); + vidxd->ivdev.vfio_group = NULL; + return -EFAULT; +} + +int idxd_mdev_get_host_pasid(struct mdev_device *mdev, u32 gpasid, u32 *pasid) +{ + struct ioasid_set *ioasid_set; + struct mm_struct *mm; + + mm = get_task_mm(current); + if (!mm) { + dev_warn(mdev_dev(mdev), "%s no mm!\n", __func__); + return -ENXIO; + } + + ioasid_set = ioasid_find_mm_set(mm); + if (!ioasid_set) { + mmput(mm); + dev_warn(mdev_dev(mdev), "%s no ioasid_set!\n", __func__); + return -ENXIO; + } + + *pasid = ioasid_find_by_spid(ioasid_set, gpasid, true); + mmput(mm); + if (*pasid == INVALID_IOASID) { + dev_warn(mdev_dev(mdev), "%s invalid ioasid by spid!\n", __func__); + return -ENXIO; + } + + return 0; +} + +static inline void reset_vconfig(struct vdcm_idxd *vidxd) +{ + u16 *devid = (u16 *)(vidxd->cfg + PCI_DEVICE_ID); + struct idxd_device *idxd = vidxd->idxd; + + memset(vidxd->cfg, 0, VIDXD_MAX_CFG_SPACE_SZ); + memcpy(vidxd->cfg, idxd_pci_config, sizeof(idxd_pci_config)); + + if (idxd->data->type == IDXD_TYPE_DSA) + *devid = PCI_DEVICE_ID_INTEL_DSA_SPR0; + else if (idxd->data->type == IDXD_TYPE_IAX) + *devid = PCI_DEVICE_ID_INTEL_IAX_SPR0; + + memcpy(vidxd->cfg + 0x100, 
idxd_pci_ext_cap, sizeof(idxd_pci_ext_cap)); +} + +static inline void reset_vmmio(struct vdcm_idxd *vidxd) +{ + memset(&vidxd->bar0, 0, VIDXD_MAX_MMIO_SPACE_SZ); +} + +static inline void vidxd_vwq_init(struct vdcm_idxd *vidxd) +{ + INIT_LIST_HEAD(&vidxd->vwq.head); + vidxd->vwq.ndescs = 0; + + memset(vidxd->vwq.portals, 0, + VIDXD_MAX_PORTALS * sizeof(struct idxd_wq_portal)); +} + +static void idxd_vdcm_init(struct vdcm_idxd *vidxd) +{ + struct idxd_wq *wq = vidxd->wq; + + reset_vconfig(vidxd); + reset_vmmio(vidxd); + + vidxd->bar_size[0] = VIDXD_BAR0_SIZE; + vidxd->bar_size[1] = VIDXD_BAR2_SIZE; + + vidxd_mmio_init(vidxd); + + vidxd_vwq_init(vidxd); + if (wq_dedicated(wq) && wq->state == IDXD_WQ_ENABLED) { + idxd_wq_disable(wq, false, NULL); + wq->state = IDXD_WQ_LOCKED; + } +} + +static void vidxd_unregister_ioasid_notifier(struct vdcm_idxd *vidxd) +{ + struct idxd_vdev *vdev = &vidxd->ivdev; + struct ioasid_mm_entry *mm_entry, *n; + struct mm_struct *mm; + + mm = get_task_mm(current); + if (!mm) + return; + + mutex_lock(&vdev->ioasid_lock); + + list_for_each_entry_safe(mm_entry, n, &vdev->mm_list, node) { + if (mm_entry->mm == mm) { + list_del(&mm_entry->node); + kfree(mm_entry); + ioasid_unregister_notifier_mm(mm, &vidxd->ivdev.pasid_nb); + break; + } + } + + mutex_unlock(&vdev->ioasid_lock); + mmput(mm); +} + +static int vidxd_source_pause_device(struct vdcm_idxd *vidxd) +{ + int i; + int rc; + u32 status; + + if (vidxd->paused) + return 0; + + mutex_lock(&vidxd->mig_submit_lock); + /* The VMM is expected to have unmap the portals. So once we drain + * there shouldn't be any work directly submited from the VM */ + vidxd->paused = true; + mutex_unlock(&vidxd->mig_submit_lock); + + /* For DWQs, pausing the vDSA can always be done by Drain WQ command. + * For SWQs, pausing the vDSA may mean Drain PASID if the SWQ is shared + * with other VMs. We will need to do Drain PASID for each PASID + * allocated to the VM which may take a long time. As an optimization, + * we may do Drain PASID if no of PASIDs for the VM is below certain + * number and do Drain WQ otherwise. 
+ */ + /* Drain WQ(s) to make sure no more outstanding work in the dev */ + /* TODO: Currently support for only 1 WQ per VDev */ + for (i = 0; i < vidxd->num_wqs; i++) { + rc = idxd_wq_drain(vidxd->wq, &status); + + if (rc < 0) { + pr_info("%s: failed rc %d\n", __func__, rc); + return rc; + } + } + return 0; +} + +static void vidxd_free_resources (struct vdcm_idxd *vidxd) +{ + int i; + + /* Free the queued descriptors */ + for (i = 0; i < vidxd->num_wqs; i++) { + struct idxd_wq_desc_elem *el, *tmp; + struct idxd_virtual_wq *vwq = &vidxd->vwq; + + list_for_each_entry_safe(el, tmp, &vwq->head, link) { + list_del(&el->link); + vwq->ndescs--; + kfree(el); + } + } + +} + +static void vidxd_source_prepare_for_migration(struct vdcm_idxd *vidxd) +{ + int i; + struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; + struct vfio_device_migration_info *mig_info = + (struct vfio_device_migration_info *)vdev->mig_pages; + u8 *data_ptr = (u8 *)vdev->mig_pages; + unsigned int offset = mig_info->data_offset; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_virtual_wq *vwq; + + memcpy(data_ptr + offset, vidxd->cfg, sizeof(vidxd->cfg)); + offset += sizeof(vidxd->cfg); + memcpy(data_ptr + offset, (u8 *)vidxd->bar_val, sizeof(vidxd->bar_val)); + offset += sizeof(vidxd->bar_val); + memcpy(data_ptr + offset, (u8 *)vidxd->bar_size, + sizeof(vidxd->bar_size)); + offset += sizeof(vidxd->bar_size); + memcpy(data_ptr + offset, (u8 *)&vidxd->bar0, sizeof(vidxd->bar0)); + offset += sizeof(vidxd->bar0); + + /* Save int handle info */ + for (i = 1; i < VIDXD_MAX_MSIX_VECS; i++) { + u32 ims_idx = dev_msi_hwirq(dev, i - 1); + + /* Save active ims index, -1 means no entry is in use. */ + pr_info("%s: saving handle %d at offset 0x%x\n", __func__, ims_idx, offset); + memcpy(data_ptr + offset, (u8 *)&ims_idx, sizeof(ims_idx)); + offset += sizeof(ims_idx); + } + + /* Save the queued descriptors */ + for (i = 0; i < vidxd->num_wqs; i++) { + struct idxd_wq_desc_elem *el; + vwq = &vidxd->vwq; + + memcpy(data_ptr + offset, (u8 *)&vwq->ndescs, sizeof(vwq->ndescs)); + offset += sizeof(vwq->ndescs); + list_for_each_entry(el, &vwq->head, link) { + printk("Saving descriptor at offset %x\n", offset); + memcpy(data_ptr + offset, (u8 *)el, sizeof(*el)); + offset += sizeof(*el); + } + } + + mig_info->data_size = offset - mig_info->data_offset; + mig_info->pending_bytes = offset - mig_info->data_offset; + + dev_dbg(dev, "%s, mig_info->pending_bytes: 0x%llx, data_size: 0x%llx\n", + __func__, mig_info->pending_bytes, mig_info->data_size); +} + +static void vidxd_dest_prepare_for_migration(struct vdcm_idxd *vidxd) +{ + +} + +static int vidxd_resume_wq_state(struct vdcm_idxd *vidxd) +{ + struct idxd_wq *wq; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_device *idxd = vidxd->idxd; + union wqcfg *vwqcfg, *wqcfg; + bool priv; + int wq_id; + int rc = 0; + u8 *bar0 = vidxd->bar0; + + dev_dbg(dev, "%s:%d numwqs %d\n", __func__, __LINE__, vidxd->num_wqs); + /* TODO: Currently support for only 1 WQ per VDev */ + for (wq_id = 0; wq_id < vidxd->num_wqs; wq_id++) { + wq = vidxd->wq; + dev_dbg(dev, "%s:%d wq %px\n", __func__, __LINE__, wq); + vwqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + wqcfg = wq->wqcfg; + + if (vidxd_state(vidxd) != 1 || vwqcfg->wq_state != 1) { + /* either VDEV or vWQ is disabled */ + if (wq_dedicated(wq) && wq->state == IDXD_WQ_ENABLED) + idxd_wq_disable(wq, false, NULL); + continue; + } else { + unsigned long flags; + 
printk("vidxd re-enable wq %u:%u\n", wq_id, wq->id); + + /* If dedicated WQ and PASID is not enabled, program + * the default PASID in the WQ PASID register */ + if (wq_dedicated(wq) && vwqcfg->mode_support) { + int wq_pasid = -1, gpasid = -1; + + if (vwqcfg->pasid_en) { + gpasid = vwqcfg->pasid; + priv = vwqcfg->priv; + rc = idxd_mdev_get_host_pasid(mdev, + gpasid, &wq_pasid); + } else { + rc = idxd_mdev_get_pasid(mdev, + &wq_pasid); + priv = true; + } + + if (wq_pasid >= 0) { + wqcfg->bits[WQCFG_PASID_IDX] &= + ~GENMASK(29, 8); + wqcfg->priv = priv; + wqcfg->pasid_en = 1; + wqcfg->pasid = wq_pasid; + dev_dbg(dev, "pasid %d:%d in wq %d\n", + gpasid, wq_pasid, wq->id); + spin_lock_irqsave(&idxd->dev_lock, + flags); + idxd_wq_setup_pasid(wq, wq_pasid); + idxd_wq_setup_priv(wq, priv); + spin_unlock_irqrestore(&idxd->dev_lock, + flags); + rc = idxd_wq_enable(wq, NULL); + if (rc) { + dev_err(dev, "resume wq failed\n"); + break;; + } + } + } else if (!wq_dedicated(wq) && vwqcfg->mode_support) { + wqcfg->bits[WQCFG_PASID_IDX] &= ~GENMASK(29, 8); + wqcfg->pasid_en = 1; + wqcfg->mode = 0; + spin_lock_irqsave(&idxd->dev_lock, flags); + idxd_wq_setup_pasid(wq, 0); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + rc = idxd_wq_enable(wq, NULL); + if (rc) { + dev_err(dev, "resume wq %d failed\n", + wq->id); + break; + } + } + } + } + return rc; +} + +static unsigned int vidxd_dest_load_state(struct vdcm_idxd *vidxd) +{ + struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; + struct vfio_device_migration_info *mig_info = + (struct vfio_device_migration_info *)vdev->mig_pages; + u8 *data_ptr = (u8 *)vdev->mig_pages; + unsigned int offset = mig_info->data_offset; + + pr_info("%s, data_size: %llx, data_offset: 0x%llx\n", __func__, + mig_info->data_size, mig_info->data_offset); + + /* restore the state data to device */ + memcpy(vidxd->cfg, data_ptr + offset, sizeof(vidxd->cfg)); + offset += sizeof(vidxd->cfg); + memcpy((u8 *)vidxd->bar_val, data_ptr + offset, sizeof(vidxd->bar_val)); + offset += sizeof(vidxd->bar_val); + memcpy((u8 *)vidxd->bar_size, data_ptr + offset, + sizeof(vidxd->bar_size)); + offset += sizeof(vidxd->bar_size); + memcpy((u8 *)&vidxd->bar0, data_ptr + offset, sizeof(vidxd->bar0)); + offset += sizeof(vidxd->bar0); + //memcpy((u8 *)ims, data_ptr + offset, sizeof(vidxd->ims)); + //offset += sizeof(vidxd->ims); + + return offset; +} + +static int vidxd_resume_ims_state (struct vdcm_idxd *vidxd, + unsigned int *offset, bool *int_handle_revoked) +{ + struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; + u8 *data_ptr = (u8 *)vdev->mig_pages; + u8 *bar0 = vidxd->bar0; + int i; + int rc = 0; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + /* Restore int handle info */ + for (i = 1; i < VIDXD_MAX_MSIX_VECS; i++) { + u32 perm_val, auxval; + u32 gpasid, pasid; + bool paside; + int ims_idx = dev_msi_hwirq(dev, i - 1); + int irq = dev_msi_irq_vector(dev, i - 1); + u32 revoked_handle; + + memcpy((u8 *)&revoked_handle, data_ptr + *offset, + sizeof(revoked_handle)); + + /* ims_idx == -1 means no handle is active. 
*/ + pr_info("%s: [%d] offset = 0x%x, new handle: %d, old handle: %d\n", + __func__, i, *offset, ims_idx, revoked_handle); + + *offset += sizeof(revoked_handle); + + if (revoked_handle != ims_idx) { + /* Int Handle Revoked */ + *int_handle_revoked = true; + } + + if (ims_idx < 0 || irq < 0) + return 0; + + perm_val = *(u32 *)(bar0 + VIDXD_MSIX_PERM_OFFSET + i * 8); + + paside = (perm_val >> 3) & 1; + gpasid = (perm_val >> 12) & 0xfffff; + + if (paside) + rc = idxd_mdev_get_host_pasid(vidxd->ivdev.mdev, gpasid, &pasid); + else + rc = idxd_mdev_get_pasid(vidxd->ivdev.mdev, &pasid); + if (rc < 0) + return rc; + + auxval = ims_ctrl_pasid_aux(pasid, true); + + rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + pr_info("%s: auxval 0x%x rc %d\n", __func__, auxval, rc); + if (rc < 0) { + pr_info("set ims pasid failed rc %d\n", rc); + break; + } + } + + return rc; +} + +static int vidxd_resubmit_pending_descs (struct vdcm_idxd *vidxd, + unsigned int *offset) +{ + struct vfio_pci_core_device *vdev = &vidxd->vfio_pdev; + struct mdev_device *mdev = vidxd->ivdev.mdev; + u8 *data_ptr = (u8 *)vdev->mig_pages; + struct idxd_virtual_wq *vwq; + struct idxd_wq *wq; + int i; + + /* Submit the queued descriptors. The WQ state + * has been resumed by this point + */ + for (i = 0; i < vidxd->num_wqs; i++) { + void __iomem *portal; + struct idxd_wq_desc_elem el; + vwq = &vidxd->vwq; + wq = vidxd->wq; + + memcpy((u8 *)&vwq->ndescs, data_ptr + *offset, sizeof(vwq->ndescs)); + *offset += sizeof(vwq->ndescs); + + for (; vwq->ndescs > 0; vwq->ndescs--) { + printk("Descriptor at offset %x\n", *offset); + + memcpy((u8 *)&el, data_ptr + *offset, sizeof(el)); + *offset += sizeof(el); + + portal = vidxd->idxd->portal_base + + idxd_get_wq_portal_full_offset(wq->id, + el.portal_prot, IDXD_IRQ_IMS); + portal += (el.portal_id << 6); + + pr_info("submitting a desc to WQ %d:%d ded %d\n", + i, wq->id, wq_dedicated(wq)); + if (wq_dedicated(wq)) { + iosubmit_cmds512(portal, el.work_desc, 1); + } else { + int rc; + struct dsa_hw_desc *hw = + (struct dsa_hw_desc *)el.work_desc; + int hpasid, gpasid = hw->pasid; + + /* Translate the gpasid in the descriptor */ + rc = idxd_mdev_get_host_pasid(mdev, + gpasid, &hpasid); + if (rc < 0) { + pr_info("gpasid->hpasid trans failed\n"); + continue; + } + hw->pasid = hpasid; + /* FIXME: Allow enqcmds to retry a few times + * before failing */ + rc = enqcmds(portal, el.work_desc); + if (rc < 0) { + pr_info("%s: enqcmds failed\n", __func__); + continue; + } + } + } + } + + return 0; +} + +static int vidxd_dest_complete_migration(struct vdcm_idxd *vidxd) +{ + int rc = 0; + unsigned int offset; + bool int_handle_revoked = false; + + offset = vidxd_dest_load_state(vidxd); + + rc = vidxd_resume_wq_state(vidxd); + + if (rc) { + pr_info("vidxd resume wq state failed %d\n", rc); + return rc; + } + + rc = vidxd_resume_ims_state(vidxd, &offset, &int_handle_revoked); + + if (rc) { + pr_info("vidxd int handle revocation handling failed %d\n", rc); + return rc; + } + + rc = vidxd_resubmit_pending_descs(vidxd, &offset); + + if (rc) { + pr_info("vidxd pending descs handling failed %d\n", rc); + return rc; + } + + if (int_handle_revoked) + vidxd_notify_revoked_handles(vidxd); + + return rc; +} + +static int vidxd_migration_state_change(struct vfio_pci_core_device *vfio_vdev, + u32 new_state) +{ + struct vdcm_idxd *vidxd = container_of(vfio_vdev, struct vdcm_idxd, vfio_pdev); + struct vfio_device_migration_info *mig_info = + (struct vfio_device_migration_info *) vfio_vdev->mig_pages; + int ret = 0; + + 
pr_info("%s, VFIO_DEVICE_STATE_MASK: 0x%x, new_state: 0x%x\n", + __func__, VFIO_DEVICE_STATE_MASK, new_state); + if (new_state & (~(VFIO_DEVICE_STATE_MASK))) { + pr_info("%s, invalid new device state, 0x%x!!\n", __func__, new_state); + return -EINVAL; + } + + switch (new_state) { + case 0: + pr_info("%s, __STOPPED !!\n", __func__); + vidxd_free_resources(vidxd); + break; + case VFIO_DEVICE_STATE_RUNNING: + pr_info("%s, VFIO_DEVICE_STATE_RUNNING!! old state %x\n", + __func__, mig_info->device_state); + if (mig_info->device_state & VFIO_DEVICE_STATE_RESUMING) + vidxd_dest_complete_migration(vidxd); + + mutex_lock(&vidxd->mig_submit_lock); + /* The VMM may continue the VM after pausing it. So get ready + * for normal operation */ + vidxd->paused = false; + mutex_unlock(&vidxd->mig_submit_lock); + + break; + case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: + pr_info("%s, VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING!!\n", __func__); + + break; + case VFIO_DEVICE_STATE_SAVING: + pr_info("%s, VFIO_DEVICE_STATE_SAVING!!\n", __func__); + /* Prepared the state data for migration */ + if (!(mig_info->device_state & VFIO_DEVICE_STATE_RUNNING)) + vidxd_source_prepare_for_migration(vidxd); + + /* Pause the virtual device. The vCPUs are still running. + * This happens just before the VM is paused. The vDEV + * is already in slow path */ + if (mig_info->device_state & VFIO_DEVICE_STATE_RUNNING) + vidxd_source_pause_device(vidxd); + break; + case VFIO_DEVICE_STATE_RESUMING: + /* Prepared the state restore for migration */ + vidxd_dest_prepare_for_migration(vidxd); + pr_info("%s, VFIO_DEVICE_STATE_RESUMING!!\n", __func__); + break; + default: + pr_info("%s, not handled new device state: 0x%x\n", __func__, new_state); + ret = -EINVAL; + } + return ret; +} + +static struct vfio_pci_migops vidxd_migops = { + .state_change = vidxd_migration_state_change, +}; + +static struct idxd_wq *find_any_dwq(struct idxd_device *idxd, struct vdcm_idxd_type *type) +{ + int i; + struct idxd_wq *wq; + unsigned long flags; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_DWQ: + if (idxd->data->type != IDXD_TYPE_DSA) + return NULL; + break; + case IDXD_MDEV_TYPE_IAX_1_DWQ: + if (idxd->data->type != IDXD_TYPE_IAX) + return NULL; + break; + default: + return NULL; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + + if (wq->state != IDXD_WQ_ENABLED && wq->state != IDXD_WQ_LOCKED) + continue; + + if (!is_idxd_wq_mdev(wq)) + continue; + + if (!wq_dedicated(wq)) + continue; + + if (idxd_wq_refcount(wq) != 0) + continue; + + spin_unlock_irqrestore(&idxd->dev_lock, flags); + mutex_lock(&wq->wq_lock); + idxd_wq_get(wq); + mutex_unlock(&wq->wq_lock); + return wq; + } + + spin_unlock_irqrestore(&idxd->dev_lock, flags); + return NULL; +} + +static int swq_lowest_client_count(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + int i, count = -ENODEV; + + lockdep_assert_held(&idxd->dev_lock); + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + + if (wq->state != IDXD_WQ_ENABLED) + continue; + + if (!is_idxd_wq_mdev(wq)) + continue; + + if (wq_dedicated(wq)) + continue; + + if (count == -ENODEV) + count = idxd_wq_refcount(wq); + else if (count > idxd_wq_refcount(wq)) + count = idxd_wq_refcount(wq); + } + + return count; +} + +static struct idxd_wq *find_any_swq(struct idxd_device *idxd, struct vdcm_idxd_type *type) +{ + int i, count; + struct idxd_wq *wq; + unsigned long flags; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_SWQ: + if 
(idxd->data->type != IDXD_TYPE_DSA) + return NULL; + break; + case IDXD_MDEV_TYPE_IAX_1_SWQ: + if (idxd->data->type != IDXD_TYPE_IAX) + return NULL; + break; + default: + return NULL; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + count = swq_lowest_client_count(idxd); + if (count < 0) + goto out; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + + if (wq->state != IDXD_WQ_ENABLED) + continue; + + if (!is_idxd_wq_mdev(wq)) + continue; + + if (wq_dedicated(wq)) + continue; + + /* + * Attempt to load balance the shared wq by round robin until on the lowest + * ref count for the wq. + */ + if (idxd_wq_refcount(wq) != count) + continue; + + spin_unlock_irqrestore(&idxd->dev_lock, flags); + mutex_lock(&wq->wq_lock); + idxd_wq_get(wq); + mutex_unlock(&wq->wq_lock); + return wq; + } + + out: + spin_unlock_irqrestore(&idxd->dev_lock, flags); + return NULL; +} + +extern const struct vfio_pci_regops vfio_pci_dma_fault_regops; + +static struct vdcm_idxd *vdcm_vidxd_create(struct idxd_device *idxd, struct mdev_device *mdev, + struct vdcm_idxd_type *type) +{ + struct vdcm_idxd *vidxd; + struct device *dev = mdev_dev(mdev); + struct idxd_wq *wq = NULL; + int rc; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_DWQ: + case IDXD_MDEV_TYPE_IAX_1_DWQ: + wq = find_any_dwq(idxd, type); + break; + case IDXD_MDEV_TYPE_DSA_1_SWQ: + case IDXD_MDEV_TYPE_IAX_1_SWQ: + wq = find_any_swq(idxd, type); + break; + default: + return ERR_PTR(-ENODEV); + } + + if (!wq) + return ERR_PTR(-ENODEV); + + vidxd = kzalloc(sizeof(*vidxd), GFP_KERNEL); + if (!vidxd) { + rc = -ENOMEM; + goto err; + } + + mutex_init(&vidxd->dev_lock); + vidxd->idxd = idxd; + vidxd->ivdev.mdev = mdev; + vidxd->wq = wq; + mdev_set_drvdata(mdev, vidxd); + vidxd->type = type; + vidxd->num_wqs = VIDXD_MAX_WQS; + dev_set_msi_domain(dev, idxd->ims_domain); + mutex_init(&vidxd->ivdev.ioasid_lock); + INIT_LIST_HEAD(&vidxd->ivdev.mm_list); + + idxd_vdcm_init(vidxd); + + mutex_init(&vidxd->vfio_pdev.igate); + vidxd->vfio_pdev.pdev = idxd->pdev; + rc = vfio_pci_dma_fault_init(&vidxd->vfio_pdev, false); + if (rc < 0) { + dev_err(dev, "dma fault region init failed\n"); + kfree(vidxd); + goto err; + } + + mdev_set_iommu_fault_data(mdev, &vidxd->vfio_pdev); + + mutex_init(&vidxd->mig_submit_lock); + vidxd->vfio_pdev.migops = &vidxd_migops; + rc = vfio_pci_migration_init(&vidxd->vfio_pdev, VIDXD_STATE_BUFFER_SIZE); + if (rc) + pr_err("%s, idxd migration region init failed!!!\n", __func__); + else + pr_info("%s, idxd migration region init successfully!!!\n", __func__); + + return vidxd; + + err: + mutex_lock(&wq->wq_lock); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); + return ERR_PTR(rc); +} + +static struct vdcm_idxd_type idxd_mdev_types[IDXD_MDEV_TYPES] = { + { + .name = idxd_dsa_1dwq_name, + .type = IDXD_MDEV_TYPE_DSA_1_DWQ, + }, + { + .name = idxd_iax_1dwq_name, + .type = IDXD_MDEV_TYPE_IAX_1_DWQ, + }, + { + .name = idxd_dsa_1swq_name, + .type = IDXD_MDEV_TYPE_DSA_1_SWQ, + }, + { + .name = idxd_iax_1swq_name, + .type = IDXD_MDEV_TYPE_IAX_1_SWQ, + }, +}; + +static struct vdcm_idxd_type *idxd_vdcm_get_type(struct mdev_device *mdev) +{ + return &idxd_mdev_types[mdev_get_type_group_id(mdev)]; +} + +static const struct vfio_device_ops idxd_mdev_ops; + +static int idxd_vdcm_probe(struct mdev_device *mdev) +{ + struct vdcm_idxd *vidxd; + struct vdcm_idxd_type *type; + struct device *dev, *parent; + struct idxd_device *idxd; + struct idxd_wq *wq; + int rc; + + parent = mdev_parent_dev(mdev); + idxd = dev_get_drvdata(parent); + dev = 
mdev_dev(mdev); + mdev_set_iommu_device(mdev, parent); + type = idxd_vdcm_get_type(mdev); + + vidxd = vdcm_vidxd_create(idxd, mdev, type); + if (IS_ERR(vidxd)) { + dev_err(dev, "failed to create vidxd: %ld\n", PTR_ERR(vidxd)); + return PTR_ERR(vidxd); + } + + vfio_init_group_dev(&vidxd->vdev, &mdev->dev, &idxd_mdev_ops); + wq = vidxd->wq; + dev_set_drvdata(dev, vidxd); + rc = vfio_register_group_dev(&vidxd->vdev); + if (rc < 0) { + mutex_lock(&wq->wq_lock); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); + kfree(vidxd); + return rc; + } + + mutex_lock(&wq->wq_lock); + list_add(&vidxd->list, &wq->vdcm_list); + mutex_unlock(&wq->wq_lock); + dev_dbg(dev, "mdev creation success: %s\n", dev_name(mdev_dev(mdev))); + + return 0; +} + +static void idxd_vdcm_remove(struct mdev_device *mdev) +{ + struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev); + struct idxd_device *idxd = vidxd->idxd; + struct device *dev = &idxd->pdev->dev; + struct idxd_wq *wq = vidxd->wq; + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int i; + + dev_dbg(dev, "%s: removing for wq %d\n", __func__, vidxd->wq->id); + + for (i = 0; i < vfio_pdev->num_regions; i++) + vfio_pdev->region[i].ops->release(vfio_pdev, &vfio_pdev->region[i]); + vfio_pdev->num_regions = 0; + kfree(vfio_pdev->region); + vfio_pdev->region = NULL; + + for (i = 0; i < vfio_pdev->num_ext_irqs; i++) + vfio_pci_set_ext_irq_trigger(vfio_pdev, + VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + VFIO_PCI_NUM_IRQS + i, 0, 0, NULL); + vfio_pdev->num_ext_irqs = 0; + kfree(vfio_pdev->ext_irqs); + vfio_pdev->ext_irqs = NULL; + + mutex_lock(&wq->wq_lock); + list_del(&vidxd->list); + idxd_wq_put(wq); + mutex_unlock(&wq->wq_lock); + + vfio_unregister_group_dev(&vidxd->vdev); + + kfree(vidxd); +} + +static int idxd_vdcm_open(struct vfio_device *vdev) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + int rc = -EINVAL; + struct vdcm_idxd_type *type = vidxd->type; + struct device *dev = vdev->dev; + struct vfio_group *vfio_group; + + dev_dbg(dev, "%s: type: %d\n", __func__, type->type); + + vfio_group = vfio_group_get_external_user_from_dev(dev); + if (IS_ERR_OR_NULL(vfio_group)) { + rc = -EFAULT; + goto out; + } + + rc = vidxd_register_ioasid_notifier(vidxd); + if (rc < 0) + goto ioasid_err; + + mutex_lock(&vidxd->dev_lock); + if (vidxd->refcount) + goto ioasid_err; + + vidxd->ivdev.vfio_group = vfio_group; + vidxd->refcount++; + + mutex_unlock(&vidxd->dev_lock); + return 0; + + ioasid_err: + vfio_group_put_external_user(vfio_group); + out: + mutex_unlock(&vidxd->dev_lock); + return rc; +} + +static void idxd_vdcm_close(struct vfio_device *vdev) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + + mutex_lock(&vidxd->dev_lock); + if (!vidxd->refcount) + goto out; + + vidxd_unregister_ioasid_notifier(vidxd); + idxd_vdcm_set_irqs(vidxd, VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + VFIO_PCI_MSIX_IRQ_INDEX, 0, 0, NULL); + + if (vidxd->ivdev.vfio_group) { + vfio_group_put_external_user(vidxd->ivdev.vfio_group); + vidxd->ivdev.vfio_group = NULL; + } + + /* Re-initialize the VIDXD to a pristine state for re-use */ + idxd_vdcm_init(vidxd); + vidxd->refcount--; + vidxd->paused = false; + out: + mutex_unlock(&vidxd->dev_lock); +} + +static int vidxd_register_ioasid_notifier(struct vdcm_idxd *vidxd) +{ + struct idxd_vdev *vdev = &vidxd->ivdev; + struct ioasid_mm_entry *mm_entry; + struct mm_struct *mm; + int rc; + + mm = get_task_mm(current); + if (!mm) + return -ENODEV; + + mutex_lock(&vdev->ioasid_lock); + list_for_each_entry(mm_entry, 
&vdev->mm_list, node) { + if (mm_entry->mm == mm) { + mutex_unlock(&vdev->ioasid_lock); + mmput(mm); + return 0; + } + } + + mm_entry = kzalloc(sizeof(*mm_entry), GFP_KERNEL); + if (!mm_entry) { + rc = -ENOMEM; + goto err_alloc; + } + + mm_entry->mm = mm; + + vidxd->ivdev.pasid_nb.priority = IOASID_PRIO_DEVICE; + vidxd->ivdev.pasid_nb.notifier_call = idxd_mdev_ioasid_event; + rc = ioasid_register_notifier_mm(mm, &vidxd->ivdev.pasid_nb); + if (rc < 0) + goto err_ioasid; + + list_add(&mm_entry->node, &vdev->mm_list); + mutex_unlock(&vdev->ioasid_lock); + mmput(mm); + + return 0; + + err_ioasid: + kfree(mm_entry); + err_alloc: + mutex_unlock(&vdev->ioasid_lock); + mmput(mm); + return rc; +} + +static ssize_t idxd_vdcm_rw(struct vfio_device *vdev, char *buf, size_t count, loff_t *ppos, + enum idxd_vdcm_rw mode) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct device *dev = vdev->dev; + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int rc = -EINVAL; + + if (index >= VFIO_PCI_NUM_REGIONS + vfio_pdev->num_regions) { + dev_err(dev, "invalid index: %u\n", index); + return -EINVAL; + } + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + if (mode == IDXD_VDCM_WRITE) + rc = vidxd_cfg_write(vidxd, pos, buf, count); + else + rc = vidxd_cfg_read(vidxd, pos, buf, count); + break; + case VFIO_PCI_BAR0_REGION_INDEX: + case VFIO_PCI_BAR1_REGION_INDEX: + if (mode == IDXD_VDCM_WRITE) + rc = vidxd_mmio_write(vidxd, vidxd->bar_val[0] + pos, buf, count); + else + rc = vidxd_mmio_read(vidxd, vidxd->bar_val[0] + pos, buf, count); + break; + case VFIO_PCI_BAR2_REGION_INDEX: + case VFIO_PCI_BAR3_REGION_INDEX: + if (mode == IDXD_VDCM_WRITE) { + rc = vidxd_portal_mmio_write(vidxd, + vidxd->bar_val[1] + pos, buf, count); + } else { + rc = vidxd_portal_mmio_read(vidxd, + vidxd->bar_val[1] + pos, buf, count); + } + break; + + case VFIO_PCI_BAR4_REGION_INDEX: + case VFIO_PCI_BAR5_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + case VFIO_PCI_ROM_REGION_INDEX: + dev_err(dev, "unsupported region: %u\n", index); + break; + + default: + dev_dbg(dev, "vendor specific region: %u\n", index); + index -= VFIO_PCI_NUM_REGIONS; + return vfio_pdev->region[index].ops->rw(vfio_pdev, buf, count, ppos, mode); + } + + return rc == 0 ? 
count : rc; +} + +static ssize_t idxd_vdcm_read(struct vfio_device *vdev, char __user *buf, size_t count, + loff_t *ppos) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + unsigned int done = 0; + int rc; + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + case VFIO_PCI_BAR0_REGION_INDEX: + case VFIO_PCI_BAR1_REGION_INDEX: + case VFIO_PCI_BAR2_REGION_INDEX: + case VFIO_PCI_BAR3_REGION_INDEX: + case VFIO_PCI_BAR4_REGION_INDEX: + case VFIO_PCI_BAR5_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + case VFIO_PCI_ROM_REGION_INDEX: + break; + default: { + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct device *dev = vdev->dev; + + dev_dbg(dev, "vendor specific region: %u\n", index); + index -= VFIO_PCI_NUM_REGIONS; + return vfio_pdev->region[index].ops->rw(vfio_pdev, buf, count, ppos, false); + } /* end default */ + } /* end switch(index) */ + + mutex_lock(&vidxd->dev_lock); + while (count) { + size_t filled; + + if (count >= 8 && !(*ppos % 8)) { + u64 val; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_READ); + if (rc <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 8; + } else if (count >= 4 && !(*ppos % 4)) { + u32 val; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_READ); + if (rc <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_READ); + if (rc <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + rc = idxd_vdcm_rw(vdev, &val, sizeof(val), ppos, + IDXD_VDCM_READ); + if (rc <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + mutex_unlock(&vidxd->dev_lock); + return done; + + read_err: + mutex_unlock(&vidxd->dev_lock); + return -EFAULT; +} + +static ssize_t idxd_vdcm_write(struct vfio_device *vdev, const char __user *buf, size_t count, + loff_t *ppos) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + unsigned int done = 0; + int rc; + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + case VFIO_PCI_BAR0_REGION_INDEX: + case VFIO_PCI_BAR1_REGION_INDEX: + case VFIO_PCI_BAR2_REGION_INDEX: + case VFIO_PCI_BAR3_REGION_INDEX: + case VFIO_PCI_BAR4_REGION_INDEX: + case VFIO_PCI_BAR5_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + case VFIO_PCI_ROM_REGION_INDEX: + break; + default: { + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct device *dev = vdev->dev; + + dev_dbg(dev, "vendor specific region: %u\n", index); + index -= VFIO_PCI_NUM_REGIONS; + return vfio_pdev->region[index].ops->rw(vfio_pdev, buf, count, ppos, true); + } /* end default */ + } /* end switch(index) */ + + mutex_lock(&vidxd->dev_lock); + while (count) { + size_t filled; + + if (count >= 8 && !(*ppos % 8)) { + u64 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + rc = idxd_vdcm_rw(vdev, (char *)&val, sizeof(val), + ppos, IDXD_VDCM_WRITE); + if (rc <= 0) + goto write_err; + + filled = 8; + } else if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + rc = idxd_vdcm_rw(vdev, (char *)&val, 
sizeof(val), + ppos, IDXD_VDCM_WRITE); + if (rc <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + rc = idxd_vdcm_rw(vdev, (char *)&val, + sizeof(val), ppos, IDXD_VDCM_WRITE); + if (rc <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + rc = idxd_vdcm_rw(vdev, &val, sizeof(val), + ppos, IDXD_VDCM_WRITE); + if (rc <= 0) + goto write_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + mutex_unlock(&vidxd->dev_lock); + return done; + +write_err: + mutex_unlock(&vidxd->dev_lock); + return -EFAULT; +} + +static int idxd_vdcm_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) +{ + unsigned int wq_idx, index; + unsigned long req_size, pgoff = 0, offset; + pgprot_t pg_prot; + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + struct idxd_device *idxd = vidxd->idxd; + struct idxd_wq *wq = vidxd->wq; + enum idxd_portal_prot virt_portal, phys_portal; + phys_addr_t base = pci_resource_start(idxd->pdev, IDXD_WQ_BAR); + struct device *dev = vdev->dev; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + if (index >= VFIO_PCI_NUM_REGIONS) { + int regnum = index - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_region *region = vidxd->vfio_pdev.region + regnum; + + if (region && region->ops && region->ops->mmap && + (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return region->ops->mmap(&vidxd->vfio_pdev, region, vma); + + return -EINVAL; + } + + pg_prot = vma->vm_page_prot; + req_size = vma->vm_end - vma->vm_start; + if (req_size > PAGE_SIZE) + return -EINVAL; + + vma->vm_flags |= VM_DONTCOPY; + + offset = (vma->vm_pgoff << PAGE_SHIFT) & + ((1ULL << VFIO_PCI_OFFSET_SHIFT) - 1); + + wq_idx = offset >> (PAGE_SHIFT + 2); + if (wq_idx >= 1) { + dev_err(dev, "mapping invalid wq %d off %lx\n", + wq_idx, offset); + return -EINVAL; + } + + /* + * Check and see if the guest wants to map to the limited or unlimited portal. + * The driver will allow mapping to unlimited portal only if the wq is a + * dedicated wq. Otherwise, it goes to limited. 
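+	 * Within the WQ's four-page BAR2 region, guest page offset 0 selects
+	 * the unlimited portal and page offset 1 the limited portal (the
+	 * (offset >> PAGE_SHIFT) & 0x3 check below); a shared wq is therefore
+	 * always backed by the limited physical portal.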
+ */ + virt_portal = ((offset >> PAGE_SHIFT) & 0x3) == 1; + phys_portal = IDXD_PORTAL_LIMITED; + if (virt_portal == IDXD_PORTAL_UNLIMITED && wq_dedicated(wq)) + phys_portal = IDXD_PORTAL_UNLIMITED; + + /* We always map IMS portals to the guest */ + pgoff = (base + idxd_get_wq_portal_full_offset(wq->id, phys_portal, + IDXD_IRQ_IMS)) >> PAGE_SHIFT; + + dev_dbg(dev, "mmap %lx %lx %lx %lx\n", vma->vm_start, pgoff, req_size, + pgprot_val(pg_prot)); + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_pgoff = pgoff; + + return remap_pfn_range(vma, vma->vm_start, pgoff, req_size, pg_prot); +} + +static void vidxd_vdcm_reset(struct vdcm_idxd *vidxd) +{ + idxd_vdcm_init(vidxd); +} + +static irqreturn_t idxd_vdcm_msix_handler(int irq, void *arg) +{ + struct vfio_pci_irq_ctx *ctx = (struct vfio_pci_irq_ctx *)arg; + + eventfd_signal(ctx->trigger, 1); + return IRQ_HANDLED; +} + +static void idxd_vdcm_free_irq (struct vfio_pci_core_device *vfio_pdev, int vector, int irq) +{ + u32 auxval; + if (irq) { + irq_bypass_unregister_producer(&vfio_pdev->ctx[vector].producer); + free_irq(irq, &vfio_pdev->ctx[vector]); + auxval = ims_ctrl_pasid_aux(0, false); + irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + } + kfree(vfio_pdev->ctx[vector].name); + vfio_pdev->ctx[vector].name = NULL; +} + +static int idxd_vdcm_msix_set_vector_signal(struct vdcm_idxd *vidxd, int vector, int fd) +{ + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct eventfd_ctx *trigger; + char *name; + u32 pasid, auxval; + int irq, rc; + u8 *bar0 = vidxd->bar0; + u32 msix_perm; + + dev_dbg(dev, "%s: set signal %d fd: %d\n", __func__, vector, fd); + + if (vector < 0 || vector >= vfio_pdev->num_ctx) { + dev_warn(dev, "%s out of boundary\n", __func__); + return -EINVAL; + } + + irq = vector ? 
dev_msi_irq_vector(dev, vector - 1) : 0; + + dev_dbg(dev, "%s: irq: %d\n", __func__, irq); + + if (vfio_pdev->ctx[vector].trigger) { + if (irq) + irq_bypass_unregister_producer(&vfio_pdev->ctx[vector].producer); + + eventfd_ctx_put(vfio_pdev->ctx[vector].trigger); + + if (fd < 0) { + dev_dbg(dev, "%s: trigger already set, freeing\n", __func__); + idxd_vdcm_free_irq(vfio_pdev, vector, irq); + return 0; + } + dev_dbg(dev, "%s: trigger already set, changing\n", __func__); + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + dev_dbg(dev, "%s: trigger change failed, freeing\n", __func__); + idxd_vdcm_free_irq(vfio_pdev, vector, irq); + vfio_pdev->ctx[vector].trigger = NULL; + return PTR_ERR(trigger); + } + vfio_pdev->ctx[vector].trigger = trigger; + + if (irq) { + /* Update IRQ Bypass Setting */ + vfio_pdev->ctx[vector].producer.token = trigger; + vfio_pdev->ctx[vector].producer.irq = irq; + rc = irq_bypass_register_producer(&vfio_pdev->ctx[vector].producer); + if (unlikely(rc)) { + dev_warn(dev, "irq bypass producer (token %p) registration fails: %d\n", + vfio_pdev->ctx[vector].producer.token, rc); + vfio_pdev->ctx[vector].producer.token = NULL; + } + dev_dbg(dev, "%s: updated irq %d\n", __func__, irq); + } + return 0; + } + + if (fd < 0) + return 0; + + name = kasprintf(GFP_KERNEL, "vfio-dev-ims[%d](%s)", vector, dev_name(dev)); + if (!name) + return -ENOMEM; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + kfree(name); + return PTR_ERR(trigger); + } + + vfio_pdev->ctx[vector].name = name; + vfio_pdev->ctx[vector].trigger = trigger; + + dev_dbg(dev, "%s: trigger: %px\n", __func__, trigger); + + if (!irq) { + dev_dbg(dev, "Mediated vector 0 set\n"); + return 0; + } + + /* + * This only points to MSIX entry 1, which is fine for now. 
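+	 * Each MSIX_PERM entry is 8 bytes (hence the 8 * vector offset); bit 3
+	 * indicates that a guest PASID is programmed in bits 31:12. When no
+	 * guest PASID is present, the mdev's default host PASID is used for
+	 * the IMS entry instead.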
+ */ + msix_perm = *(u32 *)(bar0 + VIDXD_MSIX_PERM_OFFSET + 8 * vector); + dev_dbg(dev, "MSIX PERM: %#x\n", msix_perm); + if (!(msix_perm & BIT(3))) { + rc = idxd_mdev_get_pasid(mdev, &pasid); + if (rc < 0) { + dev_warn(dev, "%s unable to get pasid, failing\n", __func__); + goto err; + } + + dev_dbg(dev, "%s: pasid: %d\n", __func__, pasid); + } else { + u32 gpasid; + + gpasid = (msix_perm & GENMASK(31, 12)) >> 12; + rc = idxd_mdev_get_host_pasid(vidxd->ivdev.mdev, gpasid, &pasid); + if (rc < 0) { + dev_warn(dev, "%s guest pasid %u translate failure\n", __func__, gpasid); + goto err; + } + dev_dbg(dev, "%s: guest pasid: %u host pasid: %u\n", + __func__, gpasid, pasid); + } + + auxval = ims_ctrl_pasid_aux(pasid, true); + rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + if (rc < 0) { + dev_warn(dev, "%s: set IMS aux data failed: %d\n", __func__, rc); + goto err; + } + + rc = request_irq(irq, idxd_vdcm_msix_handler, 0, name, &vfio_pdev->ctx[vector]); + if (rc < 0) { + dev_warn(dev, "%s request_irq() failed\n", __func__); + goto irq_err; + } + + vfio_pdev->ctx[vector].producer.token = trigger; + vfio_pdev->ctx[vector].producer.irq = irq; + rc = irq_bypass_register_producer(&vfio_pdev->ctx[vector].producer); + if (unlikely(rc)) { + dev_warn(dev, "irq bypass producer (token %p) registration fails: %d\n", + vfio_pdev->ctx[vector].producer.token, rc); + vfio_pdev->ctx[vector].producer.token = NULL; + } + + dev_dbg(dev, "%s: irq %d set\n", __func__, irq); + + return 0; + + irq_err: + auxval = ims_ctrl_pasid_aux(0, false); + irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + err: + kfree(name); + vfio_pdev->ctx[vector].name = NULL; + eventfd_ctx_put(trigger); + vfio_pdev->ctx[vector].trigger = NULL; + return rc; +} + +static int idxd_vdcm_msix_set_vector_signals(struct vdcm_idxd *vidxd, u32 start, + u32 count, int *fds) +{ + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int i, j, rc = 0; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + if (start >= vfio_pdev->num_ctx || start + count > vfio_pdev->num_ctx) { + dev_warn(dev, "%s out of boundary\n", __func__); + return -EINVAL; + } + + for (i = 0, j = start; i < count && !rc; i++, j++) { + int fd = fds ? fds[i] : -1; + + dev_dbg(dev, "%s: %s signal %d, fd: %d\n", + __func__, (fd == -1) ? 
"unset" : "set", j, fd); + rc = idxd_vdcm_msix_set_vector_signal(vidxd, j, fd); + } + + if (rc) { + dev_warn(dev, "%s: set signal failed, unwind\n", __func__); + for (--j; j >= (int)start; j--) + idxd_vdcm_msix_set_vector_signal(vidxd, j, -1); + } + + return rc; +} + +static int idxd_vdcm_msix_enable(struct vdcm_idxd *vidxd, int nvec) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int rc; + + dev_dbg(dev, "%s: nvec: %d\n", __func__, nvec); + + /* There should be at least 1 vectors for idxd */ + if (nvec < 1) + return -EINVAL; + + dev_dbg(dev, "%s: allocating\n", __func__); + vfio_pdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); + if (!vfio_pdev->ctx) { + dev_warn(dev, "%s: failed to alloc VFIO irq context\n", __func__); + return -ENOMEM; + } + + if (nvec > 1) { + dev_dbg(dev, "%s: allocate %d IMS\n", __func__, nvec - 1); + rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, nvec - 1); + if (rc < 0) { + dev_warn(dev, "%s failed to allocate irq on IMS domain: %d\n", + __func__, rc); + kfree(vfio_pdev->ctx); + return rc; + } + } + + vfio_pdev->num_ctx = nvec; + vfio_pdev->irq_type = VFIO_PCI_MSIX_IRQ_INDEX; + return 0; +} + +static int idxd_vdcm_msix_disable(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct irq_domain *irq_domain; + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + + /* Check if somebody already disabled it */ + if (vfio_pdev->num_ctx == 0) + return 0; + + idxd_vdcm_msix_set_vector_signals(vidxd, 0, vfio_pdev->num_ctx, NULL); + irq_domain = dev_get_msi_domain(dev); + if (irq_domain) + msi_domain_free_irqs(irq_domain, dev); + kfree(vfio_pdev->ctx); + vfio_pdev->num_ctx = 0; + vfio_pdev->irq_type = VFIO_PCI_NUM_IRQS; + return 0; +} + +static int idxd_vdcm_set_msix_trigger(struct vdcm_idxd *vidxd, u32 index, u32 start, + u32 count, u32 flags, void *data) +{ + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + int rc, i; + + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + dev_dbg(dev, "%s(index: %d start: %d count: %d flags: %d data: %px\n", + __func__, index, start, count, flags, data); + + if (count > VIDXD_MAX_MSIX_VECS) + count = VIDXD_MAX_MSIX_VECS; + + if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) { + dev_dbg(dev, "%s disabling\n", __func__); + idxd_vdcm_msix_disable(vidxd); + return 0; + } + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int *fds = data; + + if (vfio_pdev->irq_type == index) { + dev_dbg(dev, "%s straight set signal\n", __func__); + return idxd_vdcm_msix_set_vector_signals(vidxd, start, count, fds); + } + + rc = idxd_vdcm_msix_enable(vidxd, start + count); + if (rc < 0) + return rc; + + rc = idxd_vdcm_msix_set_vector_signals(vidxd, start, count, fds); + if (rc < 0) + idxd_vdcm_msix_disable(vidxd); + + return rc; + } + + if (start + count > VIDXD_MAX_MSIX_VECS) + return -EINVAL; + + for (i = start; i < start + count; i++) { + if (!vfio_pdev->ctx[i].trigger) + continue; + if (flags & VFIO_IRQ_SET_DATA_NONE) { + eventfd_signal(vfio_pdev->ctx[i].trigger, 1); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + u8 *bools = data; + + if (bools[i - start]) + eventfd_signal(vfio_pdev->ctx[i].trigger, 1); + } + } + return 0; +} + + +static int idxd_vdcm_set_ctx_trigger_single(struct eventfd_ctx **ctx, + unsigned int count, u32 flags, void *data) +{ + /* DATA_NONE/DATA_BOOL enables loopback testing */ + 
if (flags & VFIO_IRQ_SET_DATA_NONE) { + if (*ctx) { + if (count) { + eventfd_signal(*ctx, 1); + } else { + eventfd_ctx_put(*ctx); + *ctx = NULL; + } + return 0; + } + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + u8 trigger; + + if (!count) + return -EINVAL; + + if (!data) + return -EINVAL; + + trigger = *(u8 *)data; + if (trigger && *ctx) + eventfd_signal(*ctx, 1); + + return 0; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + s32 fd; + + if (!count) + return -EINVAL; + + fd = *(s32 *)data; + if (fd == -1) { + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = NULL; + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + + if (*ctx) + eventfd_ctx_put(*ctx); + + *ctx = efdctx; + } + return 0; + } + + return -EINVAL; +} + +static int idxd_vdcm_set_req_trigger(struct mdev_device *mdev, unsigned int index, + unsigned int start, unsigned int count, + u32 flags, void *data) +{ + if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1) + return -EINVAL; + + return idxd_vdcm_set_ctx_trigger_single(&mdev->req_trigger, count, flags, data); +} + +static int idxd_vdcm_set_irqs(struct vdcm_idxd *vidxd, uint32_t flags, + unsigned int index, unsigned int start, + unsigned int count, void *data) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + + dev_dbg(dev, "%s: flags: %#x index: %#x, start: %#x, count: %#x, data: %px\n", + __func__, flags, index, start, count, data); + + switch (index) { + case VFIO_PCI_MSIX_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + case VFIO_IRQ_SET_ACTION_UNMASK: + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + return idxd_vdcm_set_msix_trigger(vidxd, index, start, count, flags, data); + } + break; + case VFIO_PCI_REQ_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + return idxd_vdcm_set_req_trigger(mdev, index, start, count, flags, data); + } + break; + default: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + return vfio_pci_set_ext_irq_trigger(vfio_pdev, index, start, + count, flags, data); + } + break; + } + + return -ENOTTY; +} + +static long idxd_vdcm_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + unsigned long minsz; + int rc = -EINVAL; + struct device *dev = vdev->dev; + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct mdev_device *mdev = vidxd->ivdev.mdev; + + dev_dbg(dev, "vidxd %p ioctl, cmd: %d\n", vidxd, cmd); + + mutex_lock(&vidxd->dev_lock); + if (cmd == VFIO_DEVICE_GET_INFO) { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) { + rc = -EFAULT; + goto out; + } + + if (info.argsz < minsz) { + rc = -EINVAL; + goto out; + } + + info.flags = VFIO_DEVICE_FLAGS_PCI; + info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.num_regions = VFIO_PCI_NUM_REGIONS + vfio_pdev->num_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs; + + if (copy_to_user((void __user *)arg, &info, minsz)) + rc = -EFAULT; + else + rc = 0; + goto out; + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + size_t size; + int 
nr_areas = 1; + int cap_type_id = 0; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) { + rc = -EFAULT; + goto out; + } + + if (info.argsz < minsz) { + rc = -EINVAL; + goto out; + } + + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = VIDXD_MAX_CFG_SPACE_SZ; + info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vidxd->bar_size[info.index]; + if (!info.size) { + info.flags = 0; + break; + } + + info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + break; + case VFIO_PCI_BAR2_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = VFIO_REGION_INFO_FLAG_CAPS | VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_DYNAMIC_TRAP; + info.size = vidxd->bar_size[1]; + + /* + * Every WQ has two areas for unlimited and limited + * MSI-X portals. IMS portals are not reported. For shared + * WQ, we will only allow limited portal. + */ + nr_areas = wq_dedicated(vidxd->wq) ? 2 : 1; + + size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) { + rc = -ENOMEM; + goto out; + } + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + + /* Unlimited portal */ + if (wq_dedicated(vidxd->wq)) { + sparse->areas[0].offset = 0; + sparse->areas[0].size = PAGE_SIZE; + sparse->areas[1].offset = PAGE_SIZE; + sparse->areas[1].size = PAGE_SIZE; + } else { + /* Limited portal */ + sparse->areas[0].offset = PAGE_SIZE; + sparse->areas[0].size = PAGE_SIZE; + } + + break; + + case VFIO_PCI_BAR3_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + dev_dbg(dev, "get region info bar:%d\n", info.index); + break; + + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + dev_dbg(dev, "get region info index:%d\n", info.index); + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1, + }; + int i; + + if (info.index >= VFIO_PCI_NUM_REGIONS + vfio_pdev->num_regions) { + rc = -EINVAL; + goto out; + } + + info.index = array_index_nospec(info.index, + VFIO_PCI_NUM_REGIONS + + vfio_pdev->num_regions); + i = info.index - VFIO_PCI_NUM_REGIONS; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vfio_pdev->region[i].size; + info.flags = vfio_pdev->region[i].flags; + + cap_type.type = vfio_pdev->region[i].type; + cap_type.subtype = vfio_pdev->region[i].subtype; + + rc = vfio_info_add_capability(&caps, &cap_type.header, sizeof(cap_type)); + if (rc) + goto out; + + if (vfio_pdev->region[i].ops->add_capability) { + rc = vfio_pdev->region[i].ops->add_capability(vfio_pdev, + &vfio_pdev->region[i], + &caps); + if (rc) + goto out; + } + } /* default */ + } /* info.index switch */ + + if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) { + rc = vfio_info_add_capability(&caps, &sparse->header, + sizeof(*sparse) + (sparse->nr_areas * + sizeof(*sparse->areas))); + kfree(sparse); + if (rc) + goto out; + } + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + rc = -EFAULT; + goto out; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + if (copy_to_user((void __user *)arg, &info, minsz)) + rc = -EFAULT; + else + rc = 0; + goto out; + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { + struct vfio_irq_info info; + struct vfio_info_cap caps = { + .buf = NULL, + .size = 0 + }; + unsigned long capsz; + + minsz = offsetofend(struct vfio_irq_info, count); + capsz = offsetofend(struct vfio_irq_info, cap_offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) { + rc = -EFAULT; + goto out; + } + + if (info.argsz < minsz || + info.index >= VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs) { + rc = -EINVAL; + goto out; + } + + if (info.argsz >= capsz) + minsz = capsz; + + info.flags = VFIO_IRQ_INFO_EVENTFD; + + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX: + case VFIO_PCI_MSI_IRQ_INDEX: + case VFIO_PCI_ERR_IRQ_INDEX: + rc = -EINVAL; + goto out; + case VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + info.flags |= VFIO_IRQ_INFO_NORESIZE; + break; + default: { + struct vfio_irq_info_cap_type cap_type = { + .header.id = VFIO_IRQ_INFO_CAP_TYPE, + .header.version = 1 + }; + int i; + + if (info.index >= VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs) + return -EINVAL; + info.index = array_index_nospec(info.index, + VFIO_PCI_NUM_IRQS + vfio_pdev->num_ext_irqs); + i = info.index - VFIO_PCI_NUM_IRQS; + + info.flags = vfio_pdev->ext_irqs[i].flags; + cap_type.type = vfio_pdev->ext_irqs[i].type; + cap_type.subtype = vfio_pdev->ext_irqs[i].subtype; + + rc = vfio_info_add_capability(&caps, &cap_type.header, sizeof(cap_type)); + if (rc) + goto out; + break; + } + } /* switch(info.index) */ + + 
info.count = idxd_vdcm_get_irq_count(mdev, info.index); + if (caps.size) { + info.flags |= VFIO_IRQ_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + kfree(caps.buf); + } + + rc = copy_to_user((void __user *)arg, &info, minsz); + rc = rc ? -EFAULT : 0; + goto out; + } else if (cmd == VFIO_DEVICE_SET_IRQS) { + struct vfio_irq_set hdr; + u8 *data = NULL; + size_t data_size = 0; + int max; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) { + rc = -EFAULT; + goto out; + } + + max = idxd_vdcm_get_irq_count(mdev, hdr.index); + rc = vfio_set_irqs_validate_and_prepare(&hdr, max, + VFIO_PCI_NUM_IRQS + + vfio_pdev->num_ext_irqs, + &data_size); + if (rc) { + dev_err(dev, "intel:vfio_set_irqs_validate_and_prepare failed\n"); + goto out; + } + + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), data_size); + if (IS_ERR(data)) { + rc = PTR_ERR(data); + goto out; + } + } + mutex_lock(&vidxd->vfio_pdev.igate); + rc = idxd_vdcm_set_irqs(vidxd, hdr.flags, hdr.index, hdr.start, hdr.count, data); + mutex_unlock(&vidxd->vfio_pdev.igate); + kfree(data); + goto out; + } else if (cmd == VFIO_DEVICE_RESET) { + vidxd_vdcm_reset(vidxd); + } + + out: + mutex_unlock(&vidxd->dev_lock); + return rc; +} + +static void idxd_vdcm_mdev_request(struct vfio_device *vdev, unsigned int count) +{ + struct vdcm_idxd *vidxd = vdev_to_vidxd(vdev); + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + struct mdev_device *mdev = vidxd->ivdev.mdev; + + mutex_lock(&vfio_pdev->igate); + if (mdev->req_trigger) { + if (!(count % 10)) + dev_info_ratelimited(mdev_dev(mdev), + "Relaying device request to user (#%u)\n", + count); + eventfd_signal(mdev->req_trigger, 1); + } else if (count == 0) { + dev_warn(mdev_dev(mdev), + "No device request channel registered, blocked until released by user\n"); + } + + mutex_unlock(&vfio_pdev->igate); +} + +static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", idxd_mdev_types[mtype_get_type_group_id(mtype)].name); +} +static MDEV_TYPE_ATTR_RO(name); + +static int find_available_mdev_instances(struct idxd_device *idxd, struct vdcm_idxd_type *type) +{ + int count = 0, i; + unsigned long flags; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_DWQ: + case IDXD_MDEV_TYPE_DSA_1_SWQ: + if (idxd->data->type != IDXD_TYPE_DSA) + return 0; + break; + case IDXD_MDEV_TYPE_IAX_1_DWQ: + case IDXD_MDEV_TYPE_IAX_1_SWQ: + if (idxd->data->type != IDXD_TYPE_IAX) + return 0; + break; + default: + return 0; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq; + + wq = idxd->wqs[i]; + + if (wq->state != IDXD_WQ_ENABLED) + continue; + + if (!is_idxd_wq_mdev(wq)) + continue; + + switch (type->type) { + case IDXD_MDEV_TYPE_DSA_1_DWQ: + case IDXD_MDEV_TYPE_IAX_1_DWQ: + if (wq_dedicated(wq) && !idxd_wq_refcount(wq)) + count++; + break; + case IDXD_MDEV_TYPE_DSA_1_SWQ: + case IDXD_MDEV_TYPE_IAX_1_SWQ: + if (!wq_dedicated(wq)) + count++; + break; + default: + return 0; + } + } + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + return count; +} + +static ssize_t available_instances_show(struct mdev_type *mtype, + struct 
mdev_type_attribute *attr, + char *buf) +{ + struct device *dev = mtype_get_parent_dev(mtype); + struct idxd_device *idxd = dev_get_drvdata(dev); + int count; + struct vdcm_idxd_type *type; + + type = &idxd_mdev_types[mtype_get_type_group_id(mtype)]; + count = find_available_mdev_instances(idxd, type); + + return sprintf(buf, "%d\n", count); +} +static MDEV_TYPE_ATTR_RO(available_instances); + +static ssize_t device_api_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); +} +static MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *idxd_mdev_types_attrs[] = { + &mdev_type_attr_name.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_available_instances.attr, + NULL, +}; + +static struct attribute_group idxd_mdev_type_dsa_group0 = { + .name = idxd_dsa_1dwq_name, + .attrs = idxd_mdev_types_attrs, +}; + +static struct attribute_group idxd_mdev_type_iax_group0 = { + .name = idxd_iax_1dwq_name, + .attrs = idxd_mdev_types_attrs, +}; + +static struct attribute_group idxd_mdev_type_dsa_group1 = { + .name = idxd_dsa_1swq_name, + .attrs = idxd_mdev_types_attrs, +}; + +static struct attribute_group idxd_mdev_type_iax_group1 = { + .name = idxd_iax_1swq_name, + .attrs = idxd_mdev_types_attrs, +}; + +static struct attribute_group *idxd_mdev_type_groups[] = { + &idxd_mdev_type_dsa_group0, + &idxd_mdev_type_iax_group0, + &idxd_mdev_type_dsa_group1, + &idxd_mdev_type_iax_group1, + NULL, +}; + +static const struct vfio_device_ops idxd_mdev_ops = { + .name = "vfio-mdev", + .open_device = idxd_vdcm_open, + .close_device = idxd_vdcm_close, + .read = idxd_vdcm_read, + .write = idxd_vdcm_write, + .mmap = idxd_vdcm_mmap, + .ioctl = idxd_vdcm_ioctl, + .request = idxd_vdcm_mdev_request, +}; + +static struct mdev_driver idxd_vdcm_driver = { + .driver = { + .name = "idxd-mdev", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, + .probe = idxd_vdcm_probe, + .remove = idxd_vdcm_remove, +}; + +static const struct mdev_parent_ops idxd_parent_ops = { + .owner = THIS_MODULE, + .device_driver = &idxd_vdcm_driver, + .supported_type_groups = idxd_mdev_type_groups, +}; + +static int idxd_mdev_drv_probe(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + int rc; + + if (idxd->state != IDXD_DEV_ENABLED) + return -ENXIO; + + mutex_lock(&wq->wq_lock); + wq->type = IDXD_WQT_MDEV; + + rc = __drv_enable_wq(wq); + mutex_unlock(&wq->wq_lock); + if (rc < 0) + return rc; + + mutex_lock(&idxd->kref_lock); + /* + * If kref == 1, that means there are no mdev clients and mdev has + * not been registered. 
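+	 * The first mdev wq probed on this device therefore performs the
+	 * one-time host setup via idxd_mdev_host_init(); subsequent wq probes
+	 * only take an extra reference on mdev_kref.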
+ */ + if (!idxd->mdev_host_init) { + kref_init(&idxd->mdev_kref); + rc = idxd_mdev_host_init(idxd, &idxd_parent_ops); + if (rc < 0) { + mutex_unlock(&idxd->kref_lock); + drv_disable_wq(wq); + dev_warn(dev, "mdev device init failed!\n"); + return -ENXIO; + } + } else { + kref_get(&idxd->mdev_kref); + } + mutex_unlock(&idxd->kref_lock); + + get_device(dev); + dev_info(dev, "wq %s enabled\n", dev_name(dev)); + return 0; +} + +static void idxd_mdev_drv_remove(struct idxd_dev *idxd_dev) +{ + struct device *dev = &idxd_dev->conf_dev; + struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev); + struct idxd_device *idxd = wq->idxd; + + mutex_lock(&wq->wq_lock); + __drv_disable_wq(wq); + + if (wq->state == IDXD_WQ_LOCKED) + wq->state = IDXD_WQ_DISABLED; + mutex_unlock(&wq->wq_lock); + + mutex_lock(&idxd->kref_lock); + if (idxd->mdev_host_init) { + kref_put(&idxd->mdev_kref, idxd_mdev_host_release); + + /* kref init at 1, when it hits 1, no more devices and we can release */ + if (kref_read(&idxd->mdev_kref) == 1) + kref_put(&idxd->mdev_kref, idxd_mdev_host_release); + } + mutex_unlock(&idxd->kref_lock); + put_device(dev); + dev_info(dev, "wq %s disabled\n", dev_name(dev)); +} + +static struct idxd_device_ops mdev_wq_ops = { + .notify_error = idxd_wq_vidxd_send_errors, +}; + +static enum idxd_dev_type dev_types[] = { + IDXD_DEV_WQ, + IDXD_DEV_NONE, +}; + +static struct idxd_device_driver idxd_mdev_driver = { + .probe = idxd_mdev_drv_probe, + .remove = idxd_mdev_drv_remove, + .name = "mdev", + .type = dev_types, + .ops = &mdev_wq_ops, +}; + +static int __init idxd_mdev_init(void) +{ + int rc; + + rc = idxd_driver_register(&idxd_mdev_driver); + if (rc < 0) + return rc; + + rc = mdev_register_driver(&idxd_vdcm_driver); + if (rc < 0) { + idxd_driver_unregister(&idxd_mdev_driver); + return rc; + } + + return 0; +} + +static void __exit idxd_mdev_exit(void) +{ + mdev_unregister_driver(&idxd_vdcm_driver); + idxd_driver_unregister(&idxd_mdev_driver); +} + +module_init(idxd_mdev_init); +module_exit(idxd_mdev_exit); + +MODULE_IMPORT_NS(IDXD); +MODULE_SOFTDEP("pre: idxd"); +MODULE_SOFTDEP("pre: mdev"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_ALIAS_IDXD_DEVICE(0); diff --git a/drivers/vfio/mdev/idxd/mdev.h b/drivers/vfio/mdev/idxd/mdev.h new file mode 100644 index 0000000000000000000000000000000000000000..8af1bb8ee21abf616454d2dffcc84e3bee22ca87 --- /dev/null +++ b/drivers/vfio/mdev/idxd/mdev.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. All rights rsvd. 
*/ + +#ifndef _IDXD_MDEV_H_ +#define _IDXD_MDEV_H_ + +#include +#include + +/* two 64-bit BARs implemented */ +#define VIDXD_MAX_BARS 2 +#define VIDXD_MAX_CFG_SPACE_SZ 4096 +#define VIDXD_MAX_MMIO_SPACE_SZ 8192 +#define VIDXD_MSIX_TBL_SZ_OFFSET 0x42 +#define VIDXD_CAP_CTRL_SZ 0x100 +#define VIDXD_GRP_CTRL_SZ 0x100 +#define VIDXD_WQ_CTRL_SZ 0x100 +#define VIDXD_WQ_OCPY_INT_SZ 0x20 +#define VIDXD_MSIX_TBL_SZ 0x90 +#define VIDXD_MSIX_PERM_TBL_SZ 0x48 + +#define VIDXD_MSIX_TABLE_OFFSET 0x600 +#define VIDXD_MSIX_PERM_OFFSET 0x300 +#define VIDXD_GRPCFG_OFFSET 0x400 +#define VIDXD_WQCFG_OFFSET 0x500 +#define VIDXD_IMS_OFFSET 0x1000 + +#define VIDXD_BAR0_SIZE 0x2000 +#define VIDXD_BAR2_SIZE 0x2000 +#define VIDXD_MAX_MSIX_ENTRIES (VIDXD_MSIX_TBL_SZ / 0x10) +#define VIDXD_MAX_WQS 1 +#define VIDXD_MAX_MSIX_VECS 2 + +#define VIDXD_ATS_OFFSET 0x100 +#define VIDXD_PRS_OFFSET 0x110 +#define VIDXD_PASID_OFFSET 0x120 +#define VIDXD_MSIX_PBA_OFFSET 0x700 + +#define VIDXD_STATE_BUFFER_SIZE (4 * PAGE_SIZE) +#define VIDXD_MAX_INTS 65536 + +struct ioasid_mm_entry { + struct mm_struct *mm; + struct list_head node; +}; + +#define IDXD_DESC_SIZE sizeof(struct dsa_hw_desc) + +#define VIDXD_MAX_PORTALS 64 + +struct idxd_wq_desc_elem { + enum idxd_portal_prot portal_prot; + u8 portal_id; + u8 work_desc[IDXD_DESC_SIZE]; + struct list_head link; +}; + +struct idxd_wq_portal { + u8 data[IDXD_DESC_SIZE]; + unsigned int count; +}; + +struct idxd_virtual_wq { + unsigned int ndescs; + struct list_head head; + struct idxd_wq_portal portals[VIDXD_MAX_PORTALS]; +}; + +struct idxd_vdev { + struct mdev_device *mdev; + struct vfio_group *vfio_group; + struct notifier_block pasid_nb; + struct mutex ioasid_lock; + struct list_head mm_list; +}; + +struct vdcm_idxd { + struct vfio_device vdev; + struct idxd_device *idxd; + struct idxd_wq *wq; + struct idxd_virtual_wq vwq; + struct idxd_vdev ivdev; + struct vdcm_idxd_type *type; + int num_wqs; + + /* For VM use case */ + u64 bar_val[VIDXD_MAX_BARS]; + u64 bar_size[VIDXD_MAX_BARS]; + u8 cfg[VIDXD_MAX_CFG_SPACE_SZ]; + u8 bar0[VIDXD_MAX_MMIO_SPACE_SZ]; + struct list_head list; + struct mutex dev_lock; /* lock for vidxd resources */ + struct mutex mig_submit_lock; + bool paused; + + int refcount; + struct vfio_pci_core_device vfio_pdev; +}; + +#define vdev_to_vidxd(vdev) container_of(vdev, struct vdcm_idxd, vdev) + +static inline struct vdcm_idxd *to_vidxd(struct idxd_vdev *vdev) +{ + return container_of(vdev, struct vdcm_idxd, vdev); +} + +#define IDXD_MDEV_NAME_LEN 64 + +enum idxd_mdev_type { + IDXD_MDEV_TYPE_NONE = -1, + IDXD_MDEV_TYPE_DSA_1_DWQ = 0, + IDXD_MDEV_TYPE_IAX_1_DWQ, + IDXD_MDEV_TYPE_DSA_1_SWQ, + IDXD_MDEV_TYPE_IAX_1_SWQ, +}; + +#define IDXD_MDEV_WQ_TYPES 2 +#define IDXD_MDEV_TYPES (IDXD_TYPE_MAX * IDXD_MDEV_WQ_TYPES) + +struct vdcm_idxd_type { + const char *name; + enum idxd_mdev_type type; + unsigned int avail_instance; +}; + +enum idxd_vdcm_rw { + IDXD_VDCM_READ = 0, + IDXD_VDCM_WRITE, +}; + +static inline u64 get_reg_val(void *buf, int size) +{ + u64 val = 0; + + switch (size) { + case 8: + val = *(u64 *)buf; + break; + case 4: + val = *(u32 *)buf; + break; + case 2: + val = *(u16 *)buf; + break; + case 1: + val = *(u8 *)buf; + break; + } + + return val; +} + +static inline u8 vidxd_state(struct vdcm_idxd *vidxd) +{ + union gensts_reg *gensts = (union gensts_reg *)(vidxd->bar0 + IDXD_GENSTATS_OFFSET); + + return gensts->state; +} + +int idxd_mdev_host_init(struct idxd_device *idxd, const struct mdev_parent_ops *ops); +void idxd_mdev_host_release(struct kref *kref); +int 
idxd_mdev_get_pasid(struct mdev_device *mdev, u32 *pasid); +int idxd_mdev_get_host_pasid(struct mdev_device *mdev, u32 gpasid, u32 *pasid); +int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size); +int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size); +int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count); +int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size); +void vidxd_mmio_init(struct vdcm_idxd *vidxd); +void vidxd_reset(struct vdcm_idxd *vidxd); +void vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx); +void idxd_wq_vidxd_send_errors(struct idxd_wq *wq); + +int vidxd_portal_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, + unsigned int size); +int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, + unsigned int size); + +void vidxd_notify_revoked_handles (struct vdcm_idxd *vidxd); + +#endif diff --git a/drivers/vfio/mdev/idxd/mdev_host.c b/drivers/vfio/mdev/idxd/mdev_host.c new file mode 100644 index 0000000000000000000000000000000000000000..ef94f59940232fff674457b4d06d5c2e8fb2ee65 --- /dev/null +++ b/drivers/vfio/mdev/idxd/mdev_host.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "idxd.h" +#include "mdev.h" + +extern const struct vfio_pci_regops vfio_pci_dma_fault_regops; + +int idxd_mdev_host_init(struct idxd_device *idxd, const struct mdev_parent_ops *ops) +{ + struct device *dev = &idxd->pdev->dev; + struct ims_array_info ims_info; + int rc; + + if (!test_bit(IDXD_FLAG_IMS_SUPPORTED, &idxd->flags)) + return -EOPNOTSUPP; + + rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX); + if (rc < 0) { + dev_warn(dev, "Failed to enable aux-domain: %d\n", rc); + return rc; + } + + ims_info.max_slots = idxd->ims_size; + ims_info.slots = idxd->reg_base + idxd->ims_offset; + idxd->ims_domain = pci_ims_array_create_msi_irq_domain(idxd->pdev, &ims_info); + if (!idxd->ims_domain) { + dev_warn(dev, "Fail to acquire IMS domain\n"); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX); + return -ENODEV; + } + + rc = mdev_register_device(dev, ops); + if (rc < 0) { + dev_warn(dev, "mdev register failed\n"); + irq_domain_remove(idxd->ims_domain); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX); + return rc; + } + + mutex_init(&idxd->vfio_pdev.igate); + idxd->vfio_pdev.pdev = idxd->pdev; + rc = vfio_pci_dma_fault_init(&idxd->vfio_pdev, true); + if (rc < 0) { + dev_err(dev, "dma fault region init failed\n"); + irq_domain_remove(idxd->ims_domain); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX); + mdev_unregister_device(dev); + return rc; + } + + idxd->mdev_host_init = true; + return 0; +} + +void idxd_mdev_host_release(struct kref *kref) +{ + struct idxd_device *idxd = container_of(kref, struct idxd_device, mdev_kref); + struct device *dev = &idxd->pdev->dev; + struct vfio_pci_core_device *vfio_pdev = &idxd->vfio_pdev; + int i, rc; + + if (!idxd->mdev_host_init) + return; + + WARN_ON(iommu_unregister_device_fault_handler(dev)); + + for (i = 0; i < vfio_pdev->num_regions; i++) + vfio_pdev->region[i].ops->release(vfio_pdev, &vfio_pdev->region[i]); + vfio_pdev->num_regions = 0; + kfree(vfio_pdev->region); + vfio_pdev->region = NULL; + rc = iommu_unregister_device_fault_handler(&idxd->pdev->dev); + if (rc) + dev_warn(dev, "iommu_unregister_device_fault_handler() 
failed\n"); + + for (i = 0; i < vfio_pdev->num_ext_irqs; i++) + vfio_pci_set_ext_irq_trigger(vfio_pdev, + VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + VFIO_PCI_NUM_IRQS + i, 0, 0, NULL); + vfio_pdev->num_ext_irqs = 0; + kfree(vfio_pdev->ext_irqs); + vfio_pdev->ext_irqs = NULL; + + irq_domain_remove(idxd->ims_domain); + mdev_unregister_device(dev); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX); + idxd->mdev_host_init = false; +} diff --git a/drivers/vfio/mdev/idxd/vdev.c b/drivers/vfio/mdev/idxd/vdev.c new file mode 100644 index 0000000000000000000000000000000000000000..ed5901e45f352abca479e24781eacf7f39bb3142 --- /dev/null +++ b/drivers/vfio/mdev/idxd/vdev.c @@ -0,0 +1,1422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" +#include "idxd.h" +#include "../mdev_private.h" +#include "mdev.h" + +static void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val); + +void vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx) +{ + struct vfio_pci_core_device *vfio_pdev = &vidxd->vfio_pdev; + + eventfd_signal(vfio_pdev->ctx[msix_idx].trigger, 1); +} + +static void vidxd_report_error(struct vdcm_idxd *vidxd, unsigned int error) +{ + u8 *bar0 = vidxd->bar0; + union sw_err_reg *swerr = (union sw_err_reg *)(bar0 + IDXD_SWERR_OFFSET); + union genctrl_reg *genctrl; + bool send = false; + + if (!swerr->valid) { + memset(swerr, 0, sizeof(*swerr)); + swerr->valid = 1; + swerr->error = error; + send = true; + } else if (swerr->valid && !swerr->overflow) { + swerr->overflow = 1; + } + + genctrl = (union genctrl_reg *)(bar0 + IDXD_GENCTRL_OFFSET); + if (send && genctrl->softerr_int_en) { + u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET); + + *intcause |= IDXD_INTC_ERR; + vidxd_send_interrupt(vidxd, 0); + } +} + +void vidxd_notify_revoked_handles (struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET); + + *intcause |= IDXD_INTC_INT_HANDLE_REVOKED; + pr_info("Signaling guest about revoked handles\n"); + vidxd_send_interrupt(vidxd, 0); +} + +static int vidxd_set_ims_pasid(struct vdcm_idxd *vidxd, int index, bool pasid_en, u32 gpasid) +{ + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + u64 auxval; + u32 pasid; + int irq; + int rc; + + irq = dev_msi_irq_vector(dev, index); + + if (pasid_en) + rc = idxd_mdev_get_host_pasid(vidxd->ivdev.mdev, gpasid, &pasid); + else + rc = idxd_mdev_get_pasid(vidxd->ivdev.mdev, &pasid); + if (rc < 0) + return rc; + dev_dbg(dev, "IMS entry: %d pasid_en: %u guest pasid %u host pasid: %u\n", + index, pasid_en, gpasid, pasid); + auxval = ims_ctrl_pasid_aux(pasid, 1); + return irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval); + +} + +int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size) +{ + u32 offset = pos & (vidxd->bar_size[0] - 1); + u8 *bar0 = vidxd->bar0; + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + + dev_dbg(dev, "vidxd mmio W %d %x %x: %llx\n", vidxd->wq->id, size, + offset, get_reg_val(buf, size)); + + if (((size & (size - 1)) != 0) || (offset & (size - 1)) != 0) { + dev_warn(dev, "XXX %s out of bounds\n", __func__); + return -EINVAL; + } + + /* If we don't limit this, we potentially can write out of bound */ + if (size > sizeof(u32)) { + dev_warn(dev, "XXX %s size greater than 
u32\n", __func__); + return -EINVAL; + } + + switch (offset) { + case IDXD_GENCFG_OFFSET ... IDXD_GENCFG_OFFSET + 3: + /* Write only when device is disabled. */ + if (vidxd_state(vidxd) == IDXD_DEVICE_STATE_DISABLED) + memcpy(bar0 + offset, buf, size); + break; + + case IDXD_GENCTRL_OFFSET: + memcpy(bar0 + offset, buf, size); + break; + + case IDXD_INTCAUSE_OFFSET: + *(u32 *)&bar0[offset] &= ~(get_reg_val(buf, 4)); + break; + + case IDXD_CMD_OFFSET: { + u32 *cmdsts = (u32 *)(bar0 + IDXD_CMDSTS_OFFSET); + u32 val = get_reg_val(buf, size); + + if (size != sizeof(u32)) + return -EINVAL; + + /* Check and set command in progress */ + if (test_and_set_bit(IDXD_CMDS_ACTIVE_BIT, (unsigned long *)cmdsts) == 0) + vidxd_do_command(vidxd, val); + else + vidxd_report_error(vidxd, DSA_ERR_CMD_REG); + break; + } + + case IDXD_SWERR_OFFSET: + /* W1C */ + bar0[offset] &= ~(get_reg_val(buf, 1) & GENMASK(1, 0)); + break; + + case VIDXD_WQCFG_OFFSET ... VIDXD_WQCFG_OFFSET + VIDXD_WQ_CTRL_SZ - 1: { + union wqcfg *wqcfg; + int wq_id = (offset - VIDXD_WQCFG_OFFSET) / 0x20; + int subreg = offset & 0x1c; + u32 new_val; + + if (wq_id >= VIDXD_MAX_WQS) + break; + + /* FIXME: Need to sanitize for RO Config WQ mode 1 */ + wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET + wq_id * 0x20); + if (size >= 4) { + new_val = get_reg_val(buf, 4); + } else { + u32 tmp1, tmp2, shift, mask; + + switch (subreg) { + case 4: + tmp1 = wqcfg->bits[1]; + break; + case 8: + tmp1 = wqcfg->bits[2]; + break; + case 12: + tmp1 = wqcfg->bits[3]; + break; + case 16: + tmp1 = wqcfg->bits[4]; + break; + case 20: + tmp1 = wqcfg->bits[5]; + break; + default: + tmp1 = 0; + } + + tmp2 = get_reg_val(buf, size); + shift = (offset & 0x03U) * 8; + mask = ((1U << size * 8) - 1u) << shift; + new_val = (tmp1 & ~mask) | (tmp2 << shift); + } + + if (subreg == 8) { + if (wqcfg->wq_state == 0) { + wqcfg->bits[2] &= 0xfe; + wqcfg->bits[2] |= new_val & 0xffffff01; + } + } + + break; + } /* WQCFG */ + + case VIDXD_GRPCFG_OFFSET ... VIDXD_GRPCFG_OFFSET + VIDXD_GRP_CTRL_SZ - 1: + /* Nothing is written. Should be all RO */ + break; + + case VIDXD_MSIX_TABLE_OFFSET ... VIDXD_MSIX_TABLE_OFFSET + VIDXD_MSIX_TBL_SZ - 1: { + int index = (offset - VIDXD_MSIX_TABLE_OFFSET) / 0x10; + u8 *msix_entry = &bar0[VIDXD_MSIX_TABLE_OFFSET + index * 0x10]; + u64 *pba = (u64 *)(bar0 + VIDXD_MSIX_PBA_OFFSET); + u8 ctrl; + + ctrl = msix_entry[MSIX_ENTRY_CTRL_BYTE]; + memcpy(bar0 + offset, buf, size); + /* Handle clearing of UNMASK bit */ + if (!(msix_entry[MSIX_ENTRY_CTRL_BYTE] & MSIX_ENTRY_MASK_INT) && + ctrl & MSIX_ENTRY_MASK_INT) + if (test_and_clear_bit(index, (unsigned long *)pba)) + vidxd_send_interrupt(vidxd, index); + break; + } + + case VIDXD_MSIX_PERM_OFFSET ... 
VIDXD_MSIX_PERM_OFFSET + VIDXD_MSIX_PERM_TBL_SZ - 1: { + int index; + u32 msix_perm; + + if (size != sizeof(u32) || !IS_ALIGNED(offset, sizeof(u64))) { + dev_warn(dev, "XXX unaligned MSIX PERM access\n"); + break; + } + + index = (offset - VIDXD_MSIX_PERM_OFFSET) / 8; + msix_perm = get_reg_val(buf, sizeof(u32)) & 0xfffff00d; + memcpy(bar0 + offset, buf, size); + dev_dbg(dev, "%s writing to MSIX_PERM: %#x offset %#x index: %u\n", + __func__, msix_perm, offset, index); + break; + } + } /* offset */ + + return 0; +} + +int vidxd_portal_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, + unsigned int size) +{ + u32 offset = pos & (vidxd->bar_size[1] - 1); + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + + BUG_ON((size & (size - 1)) != 0); + BUG_ON(size > 8); + BUG_ON((offset & (size - 1)) != 0); + + memset(buf, 0xff, size); + + dev_dbg(dev, "vidxd portal mmio R %d %x %x: %llx\n", + vidxd->wq->id, size, offset, get_reg_val(buf, size)); + return 0; +} + +int vidxd_portal_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, + unsigned int size) +{ + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + u32 offset = pos & (vidxd->bar_size[1] - 1); + uint16_t wq_id = offset >> 14; + uint16_t portal_id, portal_offset; + struct idxd_virtual_wq *vwq; + struct idxd_wq *wq; + struct idxd_wq_portal *portal; + enum idxd_portal_prot portal_prot = IDXD_PORTAL_UNLIMITED; + int rc = 0; + + BUG_ON((size & (size - 1)) != 0); + BUG_ON(size > 64); + BUG_ON((offset & (size - 1)) != 0); + + dev_dbg(dev, "vidxd portal mmio W %d %x %x: %llx\n", vidxd->wq->id, size, + offset, get_reg_val(buf, size)); + + if (wq_id >= vidxd->num_wqs) { + printk("DSA portal write: Invalid wq %d\n", wq_id); + } + + vwq = &vidxd->vwq; + wq = vidxd->wq; + + if (!wq_dedicated(wq) || (((offset >> PAGE_SHIFT) & 0x3) == 1)) + portal_prot = IDXD_PORTAL_LIMITED; + + portal_id = (offset & 0xFFF) >> 6; + portal_offset = offset & 0x3F; + + portal = &vwq->portals[portal_id]; + + portal->count += size; + memcpy(&portal->data[portal_offset], buf, size); + + if (portal->count == IDXD_DESC_SIZE) { + struct idxd_wq_desc_elem *elem; + u64 *p = (u64 *)portal->data; + printk("desc: %016llx %016llx %016llx %016llx %016llx %016llx %016llx %016llx\n", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]); + + mutex_lock(&vidxd->mig_submit_lock); + if (vidxd->paused) { +#if 0 + if (wq_dedicated(wq)) { +#endif + /* Queue the descriptor if submitted to DWQ */ + if (vwq->ndescs == wq->size) { + printk("can't submit more descriptors than WQ size. 
Dropping.\n"); + goto out_unlock; + } + + elem = kmalloc(sizeof(struct idxd_wq_desc_elem), + GFP_KERNEL); + + if (elem == NULL) { + printk("kmalloc failed\n"); + rc = -ENOMEM; + goto out_unlock; + } + printk("queuing the desc\n"); + memcpy(elem->work_desc, portal->data, IDXD_DESC_SIZE); + elem->portal_prot = portal_prot; + elem->portal_id = portal_id; + + list_add_tail(&elem->link, &vwq->head); + vwq->ndescs++; +#if 0 + } else { + /* Return retry if submitted to SWQ */ + rc = -EAGAIN; + goto out_unlock; + } +#endif + } else { + void __iomem *wq_portal; + wq_portal = vidxd->idxd->portal_base + + idxd_get_wq_portal_full_offset(wq->id, + portal_prot, IDXD_IRQ_IMS); + wq_portal += (portal_id << 6); + printk("submitting a desc to WQ %d ded %d\n", wq->id, + wq_dedicated(wq)); + if (wq_dedicated(wq)) { + iosubmit_cmds512(wq_portal, (struct dsa_hw_desc *)p, 1); + } else { + int rc; + struct dsa_hw_desc *hw = + (struct dsa_hw_desc *)portal->data; + int hpasid, gpasid = hw->pasid; + + /* Translate the gpasid in the descriptor */ + rc = idxd_mdev_get_host_pasid(vidxd->ivdev.mdev, + gpasid, &hpasid); + if (rc < 0) { + pr_info("gpasid->hpasid trans failed\n"); + rc = -EINVAL; + goto out_unlock; + } + hw->pasid = hpasid; + + /* FIXME: Allow enqcmds to retry a few times + * before failing */ + rc = enqcmds(wq_portal, hw); + if (rc < 0) { + pr_info("%s: enqcmds failed\n", __func__); + goto out_unlock; + } + } + } +out_unlock: + mutex_unlock(&vidxd->mig_submit_lock); + memset(&portal->data, 0, IDXD_DESC_SIZE); + portal->count = 0; + } + + return rc; +} + +int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size) +{ + u32 offset = pos & (vidxd->bar_size[0] - 1); + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + + memcpy(buf, vidxd->bar0 + offset, size); + + dev_dbg(dev, "vidxd mmio R %d %x %x: %llx\n", + vidxd->wq->id, size, offset, get_reg_val(buf, size)); + return 0; +} + +int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count) +{ + u32 offset = pos & 0xfff; + struct device *dev = mdev_dev(vidxd->ivdev.mdev); + + memcpy(buf, &vidxd->cfg[offset], count); + + dev_dbg(dev, "vidxd pci R %d %x %x: %llx\n", + vidxd->wq->id, count, offset, get_reg_val(buf, count)); + + return 0; +} + +/* + * Much of the emulation code has been borrowed from Intel i915 cfg space + * emulation code. + * drivers/gpu/drm/i915/gvt/cfg_space.c: + */ + +/* + * Bitmap for writable bits (RW or RW1C bits, but cannot co-exist in one + * byte) byte by byte in standard pci configuration space. (not the full + * 256 bytes.) + */ +static const u8 pci_cfg_space_rw_bmp[PCI_INTERRUPT_LINE + 4] = { + [PCI_COMMAND] = 0xff, 0x07, + [PCI_STATUS] = 0x00, 0xf9, /* the only one RW1C byte */ + [PCI_CACHE_LINE_SIZE] = 0xff, + [PCI_BASE_ADDRESS_0 ... PCI_CARDBUS_CIS - 1] = 0xff, + [PCI_ROM_ADDRESS] = 0x01, 0xf8, 0xff, 0xff, + [PCI_INTERRUPT_LINE] = 0xff, +}; + +static void _pci_cfg_mem_write(struct vdcm_idxd *vidxd, unsigned int off, u8 *src, + unsigned int bytes) +{ + u8 *cfg_base = vidxd->cfg; + u8 mask, new, old; + int i = 0; + + for (; i < bytes && (off + i < sizeof(pci_cfg_space_rw_bmp)); i++) { + mask = pci_cfg_space_rw_bmp[off + i]; + old = cfg_base[off + i]; + new = src[i] & mask; + + /** + * The PCI_STATUS high byte has RW1C bits, here + * emulates clear by writing 1 for these bits. + * Writing a 0b to RW1C bits has no effect. 
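	 * For example, with mask 0xf9 and an old value of 0x80
	 * (Detected Parity Error set), a guest write of 0x80 gives
	 * (~0x80 & 0x80) & 0xf9 == 0 and clears the bit, while a
	 * write of 0x00 gives (~0x00 & 0x80) & 0xf9 == 0x80 and
	 * leaves it set.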
+	 */
+		if (off + i == PCI_STATUS + 1)
+			new = (~new & old) & mask;
+
+		cfg_base[off + i] = (old & ~mask) | new;
+	}
+
+	/* The rest of configuration space is copied through as-is. */
+	if (i < bytes)
+		memcpy(cfg_base + off + i, src + i, bytes - i);
+}
+
+static inline void _write_pci_bar(struct vdcm_idxd *vidxd, u32 offset, u32 val, bool low)
+{
+	u32 *pval;
+
+	/* BAR offset should be 32-bit aligned */
+	offset = rounddown(offset, 4);
+	pval = (u32 *)(vidxd->cfg + offset);
+
+	if (low) {
+		/*
+		 * Only update bits 31:4 and leave bits 3:0 (the
+		 * read-only BAR attribute bits) unchanged.
+		 */
+		*pval = (val & GENMASK(31, 4)) | (*pval & GENMASK(3, 0));
+	} else {
+		*pval = val;
+	}
+}
+
+static int _pci_cfg_bar_write(struct vdcm_idxd *vidxd, unsigned int offset, void *p_data,
+			      unsigned int bytes)
+{
+	u32 new = *(u32 *)(p_data);
+	bool lo = IS_ALIGNED(offset, 8);
+	u64 size;
+	unsigned int bar_id;
+
+	/*
+	 * Power-up software can determine how much address
+	 * space the device requires by writing a value of
+	 * all 1's to the register and then reading the value
+	 * back. The device will return 0's in all don't-care
+	 * address bits.
+	 */
+	if (new == 0xffffffff) {
+		switch (offset) {
+		case PCI_BASE_ADDRESS_0:
+		case PCI_BASE_ADDRESS_1:
+		case PCI_BASE_ADDRESS_2:
+		case PCI_BASE_ADDRESS_3:
+			bar_id = (offset - PCI_BASE_ADDRESS_0) / 8;
+			size = vidxd->bar_size[bar_id];
+			_write_pci_bar(vidxd, offset, size >> (lo ? 0 : 32), lo);
+			break;
+		default:
+			/* Unimplemented BARs */
+			_write_pci_bar(vidxd, offset, 0x0, false);
+		}
+	} else {
+		switch (offset) {
+		case PCI_BASE_ADDRESS_0:
+		case PCI_BASE_ADDRESS_1:
+		case PCI_BASE_ADDRESS_2:
+		case PCI_BASE_ADDRESS_3:
+			_write_pci_bar(vidxd, offset, new, lo);
+			break;
+		default:
+			break;
+		}
+	}
+	return 0;
+}
+
+int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size)
+{
+	struct device *dev = &vidxd->idxd->pdev->dev;
+	u8 *cfg = vidxd->cfg;
+	u32 offset = pos & 0xfff;
+	u64 val;
+
+	if (size > 4)
+		return -EINVAL;
+
+	if (pos + size > VIDXD_MAX_CFG_SPACE_SZ)
+		return -EINVAL;
+
+	dev_dbg(dev, "vidxd pci W %d %x %x: %llx\n", vidxd->wq->id, size, pos,
+		get_reg_val(buf, size));
+
+	/* First check if it's PCI_COMMAND */
+	if (IS_ALIGNED(pos, 2) && pos == PCI_COMMAND) {
+		bool new_bme;
+		bool bme;
+
+		if (size > 2)
+			return -EINVAL;
+
+		new_bme = !!(get_reg_val(buf, 2) & PCI_COMMAND_MASTER);
+		bme = !!(vidxd->cfg[pos] & PCI_COMMAND_MASTER);
+		_pci_cfg_mem_write(vidxd, pos, buf, size);
+
+		/* Flag error if turning off BME while device is enabled */
+		if ((bme && !new_bme) && vidxd_state(vidxd) == IDXD_DEVICE_STATE_ENABLED)
+			vidxd_report_error(vidxd, DSA_ERR_PCI_CFG);
+		return 0;
+	}
+
+	switch (pos) {
+	case PCI_BASE_ADDRESS_0 ... PCI_BASE_ADDRESS_5:
+		if (!IS_ALIGNED(pos, 4))
+			return -EINVAL;
+		return _pci_cfg_bar_write(vidxd, pos, buf, size);
+
+	case VIDXD_ATS_OFFSET + 4:
+		if (size < 4)
+			break;
+		offset += 2;
+		buf = buf + 2;
+		size -= 2;
+		fallthrough;
+
+	case VIDXD_ATS_OFFSET + 6:
+		memcpy(&cfg[offset], buf, size);
+		break;
+
+	case VIDXD_PRS_OFFSET + 4: {
+		u8 old_val, new_val;
+
+		val = get_reg_val(buf, 1);
+		old_val = cfg[VIDXD_PRS_OFFSET + 4];
+		new_val = val & 1;
+
+		cfg[offset] = new_val;
+		if (old_val == 0 && new_val == 1) {
+			/*
+			 * Clear Stopped, Response Failure,
+			 * and Unexpected Response.
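			 * These are the RW1C bits of the PRI status
			 * register: mask 0x0103 below covers Response
			 * Failure (bit 0), Unexpected Page Request
			 * Group Index (bit 1) and Stopped (bit 8).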
+ */ + *(u16 *)&cfg[VIDXD_PRS_OFFSET + 6] &= ~(u16)(0x0103); + } + + if (size < 4) + break; + + offset += 2; + buf = (u8 *)buf + 2; + size -= 2; + fallthrough; + } + + case VIDXD_PRS_OFFSET + 6: + cfg[offset] &= ~(get_reg_val(buf, 1) & 3); + break; + + case VIDXD_PRS_OFFSET + 12 ... VIDXD_PRS_OFFSET + 15: + memcpy(&cfg[offset], buf, size); + break; + + case VIDXD_PASID_OFFSET + 4: + if (size < 4) + break; + offset += 2; + buf = buf + 2; + size -= 2; + fallthrough; + + case VIDXD_PASID_OFFSET + 6: + cfg[offset] = get_reg_val(buf, 1) & 5; + break; + + default: + _pci_cfg_mem_write(vidxd, pos, buf, size); + } + return 0; +} + +static void vidxd_mmio_init_grpcap(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + union group_cap_reg *grp_cap = (union group_cap_reg *)(bar0 + IDXD_GRPCAP_OFFSET); + + /* single group for current implementation */ + grp_cap->rdbuf_ctrl = 0; + grp_cap->rdbuf_limit = 0; + grp_cap->total_rdbufs = 0; + grp_cap->num_groups = 1; +} + +static void vidxd_mmio_init_grpcfg(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + struct grpcfg *grpcfg = (struct grpcfg *)(bar0 + VIDXD_GRPCFG_OFFSET); + struct idxd_wq *wq = vidxd->wq; + struct idxd_group *group = wq->group; + int i; + + /* + * At this point, we are only exporting a single workqueue for + * each mdev. So we need to just fake it as first workqueue + * and also mark the available engines in this group. + */ + + /* Set single workqueue and the first one */ + grpcfg->wqs[0] = BIT(0); + grpcfg->engines = 0; + for (i = 0; i < group->num_engines; i++) + grpcfg->engines |= BIT(i); + grpcfg->flags.bits = group->grpcfg.flags.bits; +} + +static void vidxd_mmio_init_wqcap(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + struct idxd_wq *wq = vidxd->wq; + union wq_cap_reg *wq_cap = (union wq_cap_reg *)(bar0 + IDXD_WQCAP_OFFSET); + + wq_cap->occupancy_int = 0; + wq_cap->occupancy = 0; + wq_cap->priority = 0; + wq_cap->total_wq_size = wq->size; + wq_cap->num_wqs = VIDXD_MAX_WQS; + wq_cap->wq_ats_support = 0; + if (wq_dedicated(wq)) + wq_cap->dedicated_mode = 1; + else + wq_cap->shared_mode = 1; +} + +static void vidxd_mmio_init_wqcfg(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + struct idxd_wq *wq = vidxd->wq; + u8 *bar0 = vidxd->bar0; + union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + + wqcfg->wq_size = wq->size; + wqcfg->wq_thresh = wq->threshold; + + if (wq_dedicated(wq)) + wqcfg->mode = WQCFG_MODE_DEDICATED; + else if (device_user_pasid_enabled(idxd)) + wqcfg->pasid_en = 1; + + wqcfg->bof = wq->wqcfg->bof; + + wqcfg->priority = wq->priority; + wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); + wqcfg->max_batch_shift = ilog2(wq->max_batch_size); + wqcfg->mode_support = 1; +} + +static void vidxd_mmio_init_engcap(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + union engine_cap_reg *engcap = (union engine_cap_reg *)(bar0 + IDXD_ENGCAP_OFFSET); + struct idxd_wq *wq = vidxd->wq; + struct idxd_group *group = wq->group; + + engcap->num_engines = group->num_engines; +} + +static void vidxd_mmio_init_gencap(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + u8 *bar0 = vidxd->bar0; + union gen_cap_reg *gencap = (union gen_cap_reg *)(bar0 + IDXD_GENCAP_OFFSET); + + gencap->bits = idxd->hw.gen_cap.bits; + gencap->config_en = 0; + gencap->max_ims_mult = 0; + gencap->cmd_cap = 1; + if (device_user_pasid_enabled(idxd)) + gencap->block_on_fault = 1; +} + +static void vidxd_mmio_init_cmdcap(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = 
vidxd->idxd;
+	u8 *bar0 = vidxd->bar0;
+	u32 *cmdcap = (u32 *)(bar0 + IDXD_CMDCAP_OFFSET);
+
+	if (idxd->hw.cmd_cap)
+		*cmdcap = idxd->hw.cmd_cap;
+	else
+		*cmdcap = 0x1ffe;
+
+	*cmdcap |= BIT(IDXD_CMD_REQUEST_INT_HANDLE) | BIT(IDXD_CMD_RELEASE_INT_HANDLE) |
+		   BIT(IDXD_CMD_REVOKED_HANDLES_PROCESSED);
+}
+
+static void vidxd_mmio_init_opcap(struct vdcm_idxd *vidxd)
+{
+	struct idxd_device *idxd = vidxd->idxd;
+	u64 opcode;
+	u8 *bar0 = vidxd->bar0;
+	u64 *opcap = (u64 *)(bar0 + IDXD_OPCAP_OFFSET);
+
+	if (idxd->data->type == IDXD_TYPE_DSA) {
+		opcode = BIT_ULL(DSA_OPCODE_NOOP) | BIT_ULL(DSA_OPCODE_BATCH) |
+			 BIT_ULL(DSA_OPCODE_DRAIN) | BIT_ULL(DSA_OPCODE_MEMMOVE) |
+			 BIT_ULL(DSA_OPCODE_MEMFILL) | BIT_ULL(DSA_OPCODE_COMPARE) |
+			 BIT_ULL(DSA_OPCODE_COMPVAL) | BIT_ULL(DSA_OPCODE_CR_DELTA) |
+			 BIT_ULL(DSA_OPCODE_AP_DELTA) | BIT_ULL(DSA_OPCODE_DUALCAST) |
+			 BIT_ULL(DSA_OPCODE_CRCGEN) | BIT_ULL(DSA_OPCODE_COPY_CRC) |
+			 BIT_ULL(DSA_OPCODE_DIF_CHECK) | BIT_ULL(DSA_OPCODE_DIF_INS) |
+			 BIT_ULL(DSA_OPCODE_DIF_STRP) | BIT_ULL(DSA_OPCODE_DIF_UPDT) |
+			 BIT_ULL(DSA_OPCODE_CFLUSH);
+		*opcap = opcode;
+	} else if (idxd->data->type == IDXD_TYPE_IAX) {
+		opcode = BIT_ULL(IAX_OPCODE_NOOP) | BIT_ULL(IAX_OPCODE_DRAIN) |
+			 BIT_ULL(IAX_OPCODE_MEMMOVE);
+		*opcap = opcode;
+		opcap++;
+		opcode = OPCAP_BIT(IAX_OPCODE_DECOMPRESS) | OPCAP_BIT(IAX_OPCODE_COMPRESS) |
+			 OPCAP_BIT(IAX_OPCODE_CRC64) | OPCAP_BIT(IAX_OPCODE_ZERO_DECOMP_32) |
+			 OPCAP_BIT(IAX_OPCODE_ZERO_DECOMP_16) | OPCAP_BIT(IAX_OPCODE_DECOMP_32) |
+			 OPCAP_BIT(IAX_OPCODE_DECOMP_16) | OPCAP_BIT(IAX_OPCODE_SCAN) |
+			 OPCAP_BIT(IAX_OPCODE_SET_MEMBER) | OPCAP_BIT(IAX_OPCODE_EXTRACT) |
+			 OPCAP_BIT(IAX_OPCODE_SELECT) | OPCAP_BIT(IAX_OPCODE_RLE_BURST) |
+			 OPCAP_BIT(IAX_OPCODE_FIND_UNIQUE) | OPCAP_BIT(IAX_OPCODE_EXPAND);
+		*opcap = opcode;
+	}
+}
+
+static void vidxd_mmio_init_version(struct vdcm_idxd *vidxd)
+{
+	struct idxd_device *idxd = vidxd->idxd;
+	u32 *version;
+
+	version = (u32 *)vidxd->bar0;
+	*version = idxd->hw.version;
+}
+
+static void vidxd_mmio_reset(struct vdcm_idxd *vidxd)
+{
+	u8 *bar0 = vidxd->bar0;
+
+	memset(bar0 + IDXD_GENCFG_OFFSET, 0, 4);
+	memset(bar0 + IDXD_GENCTRL_OFFSET, 0, 4);
+	memset(bar0 + IDXD_GENSTATS_OFFSET, 0, 4);
+	memset(bar0 + IDXD_INTCAUSE_OFFSET, 0, 4);
+	memset(bar0 + VIDXD_MSIX_PBA_OFFSET, 0, 1);
+	memset(bar0 + VIDXD_MSIX_PERM_OFFSET, 0, VIDXD_MSIX_PERM_TBL_SZ);
+
+	vidxd_mmio_init_grpcfg(vidxd);
+	vidxd_mmio_init_wqcfg(vidxd);
+}
+
+void vidxd_mmio_init(struct vdcm_idxd *vidxd)
+{
+	u8 *bar0 = vidxd->bar0;
+	union offsets_reg *offsets;
+
+	memset(vidxd->bar0, 0, VIDXD_BAR0_SIZE);
+
+	vidxd_mmio_init_version(vidxd);
+	vidxd_mmio_init_gencap(vidxd);
+	vidxd_mmio_init_wqcap(vidxd);
+	vidxd_mmio_init_grpcap(vidxd);
+	vidxd_mmio_init_engcap(vidxd);
+	vidxd_mmio_init_opcap(vidxd);
+
+	offsets = (union offsets_reg *)(bar0 + IDXD_TABLE_OFFSET);
+	offsets->grpcfg = VIDXD_GRPCFG_OFFSET / 0x100;
+	offsets->wqcfg = VIDXD_WQCFG_OFFSET / 0x100;
+	offsets->msix_perm = VIDXD_MSIX_PERM_OFFSET / 0x100;
+
+	vidxd_mmio_init_cmdcap(vidxd);
+	memset(bar0 + VIDXD_MSIX_PERM_OFFSET, 0, VIDXD_MSIX_PERM_TBL_SZ);
+	vidxd_mmio_init_grpcfg(vidxd);
+	vidxd_mmio_init_wqcfg(vidxd);
+}
+
+static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val)
+{
+	u8 *bar0 = vidxd->bar0;
+	u32 *cmd = (u32 *)(bar0 + IDXD_CMD_OFFSET);
+	u32 *cmdsts = (u32 *)(bar0 + IDXD_CMDSTS_OFFSET);
+	u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET);
+	struct mdev_device *mdev = vidxd->ivdev.mdev;
+	struct
device *dev = mdev_dev(mdev); + + *cmdsts = val; + dev_dbg(dev, "%s: cmd: %#x status: %#x\n", __func__, *cmd, val); + + if (*cmd & IDXD_CMD_INT_MASK) { + *intcause |= IDXD_INTC_CMD; + vidxd_send_interrupt(vidxd, 0); + } +} + +static void vidxd_enable(struct vdcm_idxd *vidxd) +{ + u8 *bar0 = vidxd->bar0; + union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + bool ats = (*(u16 *)&vidxd->cfg[VIDXD_ATS_OFFSET + 6]) & (1U << 15); + bool prs = (*(u16 *)&vidxd->cfg[VIDXD_PRS_OFFSET + 4]) & 1U; + bool pasid = (*(u16 *)&vidxd->cfg[VIDXD_PASID_OFFSET + 6]) & 1U; + + dev_dbg(dev, "%s\n", __func__); + if (gensts->state == IDXD_DEVICE_STATE_ENABLED) + return idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DEV_ENABLED); + + /* Check PCI configuration */ + if (!(vidxd->cfg[PCI_COMMAND] & PCI_COMMAND_MASTER)) + return idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_BUSMASTER_EN); + + if (pasid != prs || (pasid && !ats)) + return idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_BUSMASTER_EN); + + gensts->state = IDXD_DEVICE_STATE_ENABLED; + + return idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_disable(struct vdcm_idxd *vidxd) +{ + struct idxd_wq *wq; + union wqcfg *wqcfg; + u8 *bar0 = vidxd->bar0; + union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u32 status; + + dev_dbg(dev, "%s\n", __func__); + if (gensts->state == IDXD_DEVICE_STATE_DISABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DIS_DEV_EN); + return; + } + + wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + wq = vidxd->wq; + + /* If it is a DWQ, need to disable the DWQ as well */ + if (wq_dedicated(wq)) { + idxd_wq_disable(wq, false, &status); + if (status) { + dev_warn(dev, "vidxd disable (wq disable) failed: %#x\n", status); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DIS_DEV_EN); + return; + } + } else { + idxd_wq_drain(wq, &status); + if (status) + dev_warn(dev, "vidxd disable (wq drain) failed: %#x\n", status); + } + + wqcfg->wq_state = 0; + gensts->state = IDXD_DEVICE_STATE_DISABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_drain_all(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_wq *wq = vidxd->wq; + + dev_dbg(dev, "%s\n", __func__); + + idxd_wq_drain(wq, NULL); + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_drain(struct vdcm_idxd *vidxd, int val) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u8 *bar0 = vidxd->bar0; + union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + struct idxd_wq *wq = vidxd->wq; + u32 status; + + dev_dbg(dev, "%s\n", __func__); + if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN); + return; + } + + idxd_wq_drain(wq, &status); + if (status) { + dev_dbg(dev, "wq drain failed: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_abort_all(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + struct idxd_wq *wq = vidxd->wq; + + dev_dbg(dev, "%s\n", __func__); + if (wq_dedicated(wq)) + idxd_wq_abort(wq, NULL); + else + idxd_wq_drain(wq, NULL); + 
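	/*
	 * No status is collected above (NULL is passed), so success
	 * is reported to the guest unconditionally.
	 */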
idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_abort(struct vdcm_idxd *vidxd, int val) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u8 *bar0 = vidxd->bar0; + union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + struct idxd_wq *wq = vidxd->wq; + u32 status = 0; + + dev_dbg(dev, "%s\n", __func__); + if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN); + return; + } + + if (wq_dedicated(wq)) + idxd_wq_abort(wq, &status); + else + idxd_wq_drain(wq, &status); + if (status) { + dev_dbg(dev, "wq abort failed: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +void vidxd_reset(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u8 *bar0 = vidxd->bar0; + union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET); + struct idxd_wq *wq; + + dev_dbg(dev, "%s\n", __func__); + gensts->state = IDXD_DEVICE_STATE_DRAIN; + wq = vidxd->wq; + + if (wq->state == IDXD_WQ_ENABLED) { + if (wq_dedicated(wq)) { + idxd_wq_abort(wq, NULL); + idxd_wq_disable(wq, false, NULL); + } else { + idxd_wq_drain(wq, NULL); + } + } + + vidxd_mmio_reset(vidxd); + gensts->state = IDXD_DEVICE_STATE_DISABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_reset(struct vdcm_idxd *vidxd, int wq_id_mask) +{ + struct idxd_wq *wq; + u8 *bar0 = vidxd->bar0; + union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u32 status = 0; + + wq = vidxd->wq; + dev_dbg(dev, "vidxd reset wq %u:%u\n", 0, wq->id); + + if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN); + return; + } + + if (wq_dedicated(wq)) { + idxd_wq_abort(wq, &status); + if (status) { + dev_dbg(dev, "vidxd reset wq failed to abort: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + + idxd_wq_disable(wq, false, &status); + if (status) { + dev_dbg(dev, "vidxd reset wq failed to disable: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + } else { + idxd_wq_drain(wq, &status); + if (status) { + dev_dbg(dev, "vidxd reset wq failed to drain: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + } + + wqcfg->wq_state = IDXD_WQ_DEV_DISABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_alloc_int_handle(struct vdcm_idxd *vidxd, int operand) +{ + bool ims = !!(operand & CMD_INT_HANDLE_IMS); + u32 cmdsts; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + int ims_idx, vidx; + + vidx = operand & GENMASK(15, 0); + + dev_dbg(dev, "allocating int handle for %d\n", vidx); + + /* vidx cannot be 0 since that's emulated and does not require IMS handle */ + if (vidx <= 0 || vidx >= VIDXD_MAX_MSIX_VECS) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_INVAL_INT_IDX); + return; + } + + if (ims) { + dev_warn(dev, "IMS allocation is not implemented yet\n"); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_NO_HANDLE); + return; + } + + ims_idx = dev_msi_hwirq(dev, vidx - 1); + cmdsts = ims_idx << IDXD_CMDSTS_RES_SHIFT; + dev_dbg(dev, "requested index %d handle %d\n", vidx, ims_idx); + idxd_complete_command(vidxd, cmdsts); +} + +static void vidxd_revoked_handles_processed (struct vdcm_idxd 
*vidxd,
+						   int operand)
+{
+	struct mdev_device *mdev = vidxd->ivdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	struct idxd_virtual_wq *vwq = &vidxd->vwq;
+	int idx;
+	u32 status;
+
+	dev_dbg(dev, "guest reports revoked int handles processed\n");
+
+	idxd_complete_command(vidxd, 0);
+
+	BUG_ON(!list_empty(&vwq->head));
+
+	/* Step 1. Drain all the WQs associated with this VM. Currently only 1 */
+	idxd_wq_drain(vidxd->wq, &status);
+
+	if (status)
+		dev_dbg(dev, "wq drain failed: %#x\n", status);
+
+	/* Step 2. Generate a completion interrupt for all int handles */
+	for (idx = 1; idx < VIDXD_MAX_MSIX_VECS; idx++) {
+		dev_dbg(dev, "revoked int handle processed idx %d\n", idx);
+		vidxd_send_interrupt(vidxd, idx);
+	}
+}
+
+static void vidxd_release_int_handle(struct vdcm_idxd *vidxd, int operand)
+{
+	struct mdev_device *mdev = vidxd->ivdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	bool ims = !!(operand & CMD_INT_HANDLE_IMS);
+	int handle, i;
+	bool found = false;
+
+	handle = operand & GENMASK(15, 0);
+	dev_dbg(dev, "releasing int handle %d\n", handle);
+
+	if (ims) {
+		dev_warn(dev, "IMS allocation is not implemented yet\n");
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_INVAL_INT_IDX_RELEASE);
+		return;
+	}
+
+	/* IMS backed entries start at 1, 0 is the emulated vector */
+	for (i = 0; i < VIDXD_MAX_MSIX_VECS - 1; i++) {
+		if (dev_msi_hwirq(dev, i) == handle) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		dev_warn(dev, "Freeing unallocated int handle.\n");
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_INVAL_INT_IDX_RELEASE);
+		return;
+	}
+
+	dev_dbg(dev, "int handle %d released.\n", handle);
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_wq_enable(struct vdcm_idxd *vidxd, int wq_id)
+{
+	struct idxd_wq *wq;
+	u8 *bar0 = vidxd->bar0;
+	union wq_cap_reg *wqcap;
+	struct mdev_device *mdev = vidxd->ivdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	struct idxd_device *idxd;
+	union wqcfg *vwqcfg, *wqcfg;
+	int rc;
+	bool wq_pasid_enable;
+	bool pasid_enabled = (*(u16 *)&vidxd->cfg[VIDXD_PASID_OFFSET + 6]) & 1U;
+
+	if (wq_id >= VIDXD_MAX_WQS) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_INVAL_WQIDX);
+		return;
+	}
+
+	idxd = vidxd->idxd;
+	wq = vidxd->wq;
+
+	dev_dbg(dev, "%s: wq %u:%u\n", __func__, wq_id, wq->id);
+
+	vwqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET + wq_id * 32);
+	wqcap = (union wq_cap_reg *)(bar0 + IDXD_WQCAP_OFFSET);
+	wqcfg = wq->wqcfg;
+
+	if (vidxd_state(vidxd) != IDXD_DEVICE_STATE_ENABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DEV_NOTEN);
+		return;
+	}
+
+	if (vwqcfg->wq_state != IDXD_WQ_DEV_DISABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_ENABLED);
+		return;
+	}
+
+	if ((!wq_dedicated(wq) && wqcap->shared_mode == 0) ||
+	    (wq_dedicated(wq) && wqcap->dedicated_mode == 0)) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_MODE);
+		return;
+	}
+
+	if ((!wq_dedicated(wq) && vwqcfg->pasid_en == 0) ||
+	    (vwqcfg->pasid_en && pasid_enabled == 0)) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_PASID_EN);
+		return;
+	}
+
+	wq_pasid_enable = vwqcfg->pasid_en;
+
+	if (wq_dedicated(wq)) {
+		int wq_pasid = -1;
+		bool priv;
+
+		if (wq_pasid_enable) {
+			u32 gpasid;
+
+			priv = vwqcfg->priv;
+			gpasid = vwqcfg->pasid;
+
+			if (gpasid == 0) {
+				rc = idxd_mdev_get_pasid(mdev, &wq_pasid);
+				dev_dbg(dev, "shared wq, pasid 0, use default host: %u\n",
+					wq_pasid);
+			} else {
+				rc = idxd_mdev_get_host_pasid(mdev, gpasid, &wq_pasid);
+				dev_dbg(dev, "guest pasid enabled, translate gpasid: %d\n", gpasid);
+			}
+		} else {
+			priv = 1;
+			rc =
idxd_mdev_get_pasid(mdev, &wq_pasid); + dev_dbg(dev, "guest pasid disabled, using default host pasid: %u\n", + wq_pasid); + } + if (rc < 0) { + dev_err(dev, "idxd pasid setup failed wq %d: %d\n", wq->id, rc); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_PASID_EN); + return; + } + + if (wq_pasid >= 0) { + u32 status; + unsigned long flags; + + wqcfg->bits[WQCFG_PASID_IDX] &= ~GENMASK(29, 8); + wqcfg->priv = priv; + wqcfg->pasid_en = 1; + wqcfg->pasid = wq_pasid; + dev_dbg(dev, "program pasid %d in wq %d\n", wq_pasid, wq->id); + spin_lock_irqsave(&idxd->dev_lock, flags); + idxd_wq_setup_pasid(wq, wq_pasid); + idxd_wq_setup_priv(wq, priv); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + rc = idxd_wq_enable(wq, &status); + if (rc < 0 || status) { + dev_err(dev, "vidxd enable wq %d failed\n", wq->id); + idxd_complete_command(vidxd, status); + return; + } + } else { + dev_err(dev, "idxd pasid setup failed wq %d wq_pasid %d\n", + wq->id, wq_pasid); + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_PASID_EN); + return; + } + } + + vwqcfg->wq_state = IDXD_WQ_DEV_ENABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +static void vidxd_wq_disable(struct vdcm_idxd *vidxd, int wq_id_mask) +{ + struct idxd_wq *wq; + union wqcfg *wqcfg, *vwqcfg; + u8 *bar0 = vidxd->bar0; + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + u32 status; + + wq = vidxd->wq; + + dev_dbg(dev, "vidxd disable wq %u:%u\n", 0, wq->id); + + wqcfg = wq->wqcfg; + vwqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET); + if (vwqcfg->wq_state != IDXD_WQ_DEV_ENABLED) { + idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN); + return; + } + + /* If it is a DWQ, need to disable the DWQ as well */ + if (wq_dedicated(wq)) { + struct ioasid_set *ioasid_set; + struct mm_struct *mm; + + idxd_wq_disable(wq, false, &status); + if (status) { + dev_warn(dev, "vidxd disable wq failed: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + + if (vwqcfg->pasid_en) { + mm = get_task_mm(current); + if (!mm) { + dev_dbg(dev, "Can't retrieve task mm\n"); + return; + } + + ioasid_set = ioasid_find_mm_set(mm); + if (!ioasid_set) { + dev_dbg(dev, "Unable to find ioasid_set\n"); + mmput(mm); + return; + } + mmput(mm); + if (!ioasid_put(ioasid_set, wqcfg->pasid)) + dev_warn(dev, "Unable to put ioasid\n"); + } + } else { + idxd_wq_drain(wq, &status); + if (status) { + dev_warn(dev, "vidxd disable drain wq failed: %#x\n", status); + idxd_complete_command(vidxd, status); + return; + } + } + + vwqcfg->wq_state = IDXD_WQ_DEV_DISABLED; + idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS); +} + +void vidxd_free_ims_entries(struct vdcm_idxd *vidxd) +{ + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + msi_domain_free_irqs(dev_get_msi_domain(dev), dev); +} + +static bool command_supported(struct vdcm_idxd *vidxd, u32 cmd) +{ + u8 *bar0 = vidxd->bar0; + u32 *cmd_cap = (u32 *)(bar0 + IDXD_CMDCAP_OFFSET); + + return !!(*cmd_cap & BIT(cmd)); +} + +static void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val) +{ + union idxd_command_reg *reg = (union idxd_command_reg *)(vidxd->bar0 + IDXD_CMD_OFFSET); + struct mdev_device *mdev = vidxd->ivdev.mdev; + struct device *dev = mdev_dev(mdev); + + reg->bits = val; + + dev_dbg(dev, "%s: cmd code: %u reg: %x\n", __func__, reg->cmd, reg->bits); + + if (!command_supported(vidxd, reg->cmd)) { + idxd_complete_command(vidxd, IDXD_CMDSTS_INVAL_CMD); + return; + } + + switch (reg->cmd) { + case IDXD_CMD_ENABLE_DEVICE: + 
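		/* Device-scope commands below take no operand */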
vidxd_enable(vidxd); + break; + case IDXD_CMD_DISABLE_DEVICE: + vidxd_disable(vidxd); + break; + case IDXD_CMD_DRAIN_ALL: + vidxd_drain_all(vidxd); + break; + case IDXD_CMD_ABORT_ALL: + vidxd_abort_all(vidxd); + break; + case IDXD_CMD_RESET_DEVICE: + vidxd_reset(vidxd); + break; + case IDXD_CMD_ENABLE_WQ: + vidxd_wq_enable(vidxd, reg->operand); + break; + case IDXD_CMD_DISABLE_WQ: + vidxd_wq_disable(vidxd, reg->operand); + break; + case IDXD_CMD_DRAIN_WQ: + vidxd_wq_drain(vidxd, reg->operand); + break; + case IDXD_CMD_ABORT_WQ: + vidxd_wq_abort(vidxd, reg->operand); + break; + case IDXD_CMD_RESET_WQ: + vidxd_wq_reset(vidxd, reg->operand); + break; + case IDXD_CMD_REQUEST_INT_HANDLE: + vidxd_alloc_int_handle(vidxd, reg->operand); + break; + case IDXD_CMD_RELEASE_INT_HANDLE: + vidxd_release_int_handle(vidxd, reg->operand); + break; + case IDXD_CMD_REVOKED_HANDLES_PROCESSED: + vidxd_revoked_handles_processed(vidxd, reg->operand); + break; + default: + idxd_complete_command(vidxd, IDXD_CMDSTS_INVAL_CMD); + break; + } +} + +static void vidxd_send_errors(struct vdcm_idxd *vidxd) +{ + struct idxd_device *idxd = vidxd->idxd; + u8 *bar0 = vidxd->bar0; + union sw_err_reg *swerr = (union sw_err_reg *)(bar0 + IDXD_SWERR_OFFSET); + union genctrl_reg *genctrl = (union genctrl_reg *)(bar0 + IDXD_GENCTRL_OFFSET); + u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET); + int i; + + lockdep_assert_held(&idxd->dev_lock); + + if (swerr->valid) { + if (!swerr->overflow) + swerr->overflow = 1; + return; + } + + for (i = 0; i < 4; i++) + swerr->bits[i] = idxd->sw_err.bits[i]; + + *intcause |= IDXD_INTC_ERR; + if (genctrl->softerr_int_en) + vidxd_send_interrupt(vidxd, 0); +} + +void idxd_wq_vidxd_send_errors(struct idxd_wq *wq) +{ + struct vdcm_idxd *vidxd; + + list_for_each_entry(vidxd, &wq->vdcm_list, list) + vidxd_send_errors(vidxd); +} diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index b558d4cfd082ad44a6263b52faae9bdc4f84c774..b314101237fe224de97632f94b0637db5d32c5be 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -29,39 +29,39 @@ static DEFINE_MUTEX(mdev_list_lock); struct device *mdev_parent_dev(struct mdev_device *mdev) { - return mdev->parent->dev; + return mdev->type->parent->dev; } EXPORT_SYMBOL(mdev_parent_dev); -void *mdev_get_drvdata(struct mdev_device *mdev) -{ - return mdev->driver_data; -} -EXPORT_SYMBOL(mdev_get_drvdata); - -void mdev_set_drvdata(struct mdev_device *mdev, void *data) -{ - mdev->driver_data = data; -} -EXPORT_SYMBOL(mdev_set_drvdata); - -struct device *mdev_dev(struct mdev_device *mdev) +/* + * Return the index in supported_type_groups that this mdev_device was created + * from. + */ +unsigned int mdev_get_type_group_id(struct mdev_device *mdev) { - return &mdev->dev; + return mdev->type->type_group_id; } -EXPORT_SYMBOL(mdev_dev); +EXPORT_SYMBOL(mdev_get_type_group_id); -struct mdev_device *mdev_from_dev(struct device *dev) +/* + * Used in mdev_type_attribute sysfs functions to return the index in the + * supported_type_groups that the sysfs is called from. + */ +unsigned int mtype_get_type_group_id(struct mdev_type *mtype) { - return dev_is_mdev(dev) ? 
to_mdev_device(dev) : NULL; + return mtype->type_group_id; } -EXPORT_SYMBOL(mdev_from_dev); +EXPORT_SYMBOL(mtype_get_type_group_id); -const guid_t *mdev_uuid(struct mdev_device *mdev) +/* + * Used in mdev_type_attribute sysfs functions to return the parent struct + * device + */ +struct device *mtype_get_parent_dev(struct mdev_type *mtype) { - return &mdev->uuid; + return mtype->parent->dev; } -EXPORT_SYMBOL(mdev_uuid); +EXPORT_SYMBOL(mtype_get_parent_dev); /* Should be called holding parent_list_lock */ static struct mdev_parent *__find_parent_device(struct device *dev) @@ -75,7 +75,7 @@ static struct mdev_parent *__find_parent_device(struct device *dev) return NULL; } -static void mdev_release_parent(struct kref *kref) +void mdev_release_parent(struct kref *kref) { struct mdev_parent *parent = container_of(kref, struct mdev_parent, ref); @@ -85,49 +85,31 @@ static void mdev_release_parent(struct kref *kref) put_device(dev); } -static struct mdev_parent *mdev_get_parent(struct mdev_parent *parent) -{ - if (parent) - kref_get(&parent->ref); - - return parent; -} - -static void mdev_put_parent(struct mdev_parent *parent) -{ - if (parent) - kref_put(&parent->ref, mdev_release_parent); -} - /* Caller must hold parent unreg_sem read or write lock */ static void mdev_device_remove_common(struct mdev_device *mdev) { - struct mdev_parent *parent; - struct mdev_type *type; + struct mdev_parent *parent = mdev->type->parent; int ret; - type = to_mdev_type(mdev->type_kobj); - mdev_remove_sysfs_files(&mdev->dev, type); + mdev_remove_sysfs_files(mdev); device_del(&mdev->dev); - parent = mdev->parent; lockdep_assert_held(&parent->unreg_sem); - ret = parent->ops->remove(mdev); - if (ret) - dev_err(&mdev->dev, "Remove failed: err=%d\n", ret); + if (parent->ops->remove) { + ret = parent->ops->remove(mdev); + if (ret) + dev_err(&mdev->dev, "Remove failed: err=%d\n", ret); + } /* Balances with device_initialize() */ put_device(&mdev->dev); - mdev_put_parent(parent); } static int mdev_device_remove_cb(struct device *dev, void *data) { - if (dev_is_mdev(dev)) { - struct mdev_device *mdev; + struct mdev_device *mdev = mdev_from_dev(dev); - mdev = to_mdev_device(dev); + if (mdev) mdev_device_remove_common(mdev); - } return 0; } @@ -147,7 +129,9 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops) char *envp[] = { env_string, NULL }; /* check for mandatory ops */ - if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups) + if (!ops || !ops->supported_type_groups) + return -EINVAL; + if (!ops->device_driver && (!ops->create || !ops->remove)) return -EINVAL; dev = get_device(dev); @@ -252,8 +236,13 @@ void mdev_unregister_device(struct device *dev) } EXPORT_SYMBOL(mdev_unregister_device); -static void mdev_device_free(struct mdev_device *mdev) +static void mdev_device_release(struct device *dev) { + struct mdev_device *mdev = to_mdev_device(dev); + + /* Pairs with the get in mdev_device_create() */ + kobject_put(&mdev->type->kobj); + mutex_lock(&mdev_list_lock); list_del(&mdev->next); mutex_unlock(&mdev_list_lock); @@ -262,24 +251,12 @@ static void mdev_device_free(struct mdev_device *mdev) kfree(mdev); } -static void mdev_device_release(struct device *dev) -{ - struct mdev_device *mdev = to_mdev_device(dev); - - mdev_device_free(mdev); -} - -int mdev_device_create(struct kobject *kobj, - struct device *dev, const guid_t *uuid) +int mdev_device_create(struct mdev_type *type, const guid_t *uuid) { int ret; struct mdev_device *mdev, *tmp; - struct mdev_parent *parent; - 
struct mdev_type *type = to_mdev_type(kobj); - - parent = mdev_get_parent(type->parent); - if (!parent) - return -EINVAL; + struct mdev_parent *parent = type->parent; + struct mdev_driver *drv = parent->ops->device_driver; mutex_lock(&mdev_list_lock); @@ -287,50 +264,58 @@ int mdev_device_create(struct kobject *kobj, list_for_each_entry(tmp, &mdev_list, next) { if (guid_equal(&tmp->uuid, uuid)) { mutex_unlock(&mdev_list_lock); - ret = -EEXIST; - goto mdev_fail; + return -EEXIST; } } mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) { mutex_unlock(&mdev_list_lock); - ret = -ENOMEM; - goto mdev_fail; + return -ENOMEM; } + device_initialize(&mdev->dev); + mdev->dev.parent = parent->dev; + mdev->dev.bus = &mdev_bus_type; + mdev->dev.release = mdev_device_release; + mdev->dev.groups = parent->ops->mdev_attr_groups; + mdev->type = type; + /* Pairs with the put in mdev_device_release() */ + kobject_get(&type->kobj); + guid_copy(&mdev->uuid, uuid); list_add(&mdev->next, &mdev_list); mutex_unlock(&mdev_list_lock); - mdev->parent = parent; + ret = dev_set_name(&mdev->dev, "%pUl", uuid); + if (ret) + goto out_put_device; /* Check if parent unregistration has started */ if (!down_read_trylock(&parent->unreg_sem)) { - mdev_device_free(mdev); ret = -ENODEV; - goto mdev_fail; + goto out_put_device; } - device_initialize(&mdev->dev); - mdev->dev.parent = dev; - mdev->dev.bus = &mdev_bus_type; - mdev->dev.release = mdev_device_release; - dev_set_name(&mdev->dev, "%pUl", uuid); - mdev->dev.groups = parent->ops->mdev_attr_groups; - mdev->type_kobj = kobj; + if (parent->ops->create) { + ret = parent->ops->create(mdev); + if (ret) + goto out_unlock; + } - ret = parent->ops->create(kobj, mdev); + ret = device_add(&mdev->dev); if (ret) - goto ops_create_fail; + goto out_remove; - ret = device_add(&mdev->dev); + if (!drv) + drv = &vfio_mdev_driver; + ret = device_driver_attach(&drv->driver, &mdev->dev); if (ret) - goto add_fail; + goto out_del; - ret = mdev_create_sysfs_files(&mdev->dev, type); + ret = mdev_create_sysfs_files(mdev); if (ret) - goto sysfs_fail; + goto out_del; mdev->active = true; dev_dbg(&mdev->dev, "MDEV: created\n"); @@ -338,24 +323,22 @@ int mdev_device_create(struct kobject *kobj, return 0; -sysfs_fail: +out_del: device_del(&mdev->dev); -add_fail: - parent->ops->remove(mdev); -ops_create_fail: +out_remove: + if (parent->ops->remove) + parent->ops->remove(mdev); +out_unlock: up_read(&parent->unreg_sem); +out_put_device: put_device(&mdev->dev); -mdev_fail: - mdev_put_parent(parent); return ret; } -int mdev_device_remove(struct device *dev) +int mdev_device_remove(struct mdev_device *mdev) { - struct mdev_device *mdev, *tmp; - struct mdev_parent *parent; - - mdev = to_mdev_device(dev); + struct mdev_device *tmp; + struct mdev_parent *parent = mdev->type->parent; mutex_lock(&mdev_list_lock); list_for_each_entry(tmp, &mdev_list, next) { @@ -376,7 +359,6 @@ int mdev_device_remove(struct device *dev) mdev->active = false; mutex_unlock(&mdev_list_lock); - parent = mdev->parent; /* Check if parent unregistration has started */ if (!down_read_trylock(&parent->unreg_sem)) return -ENODEV; @@ -386,42 +368,36 @@ int mdev_device_remove(struct device *dev) return 0; } -int mdev_set_iommu_device(struct device *dev, struct device *iommu_device) -{ - struct mdev_device *mdev = to_mdev_device(dev); - - mdev->iommu_device = iommu_device; - - return 0; -} -EXPORT_SYMBOL(mdev_set_iommu_device); - -struct device *mdev_get_iommu_device(struct device *dev) -{ - struct mdev_device *mdev = to_mdev_device(dev); - - 
return mdev->iommu_device; -} -EXPORT_SYMBOL(mdev_get_iommu_device); - static int __init mdev_init(void) { - return mdev_bus_register(); + int rc; + + rc = mdev_bus_register(); + if (rc) + return rc; + rc = mdev_register_driver(&vfio_mdev_driver); + if (rc) + goto err_bus; + return 0; +err_bus: + mdev_bus_unregister(); + return rc; } static void __exit mdev_exit(void) { + mdev_unregister_driver(&vfio_mdev_driver); + if (mdev_bus_compat_class) class_compat_unregister(mdev_bus_compat_class); mdev_bus_unregister(); } -module_init(mdev_init) +subsys_initcall(mdev_init) module_exit(mdev_exit) MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_SOFTDEP("post: vfio_mdev"); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 0d3223aee20b83cab87c5ee568edcb4fd800dd48..c368ec824e2b5c071448c57da4c74def7fcfdee5 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -39,7 +39,8 @@ static void mdev_detach_iommu(struct mdev_device *mdev) static int mdev_probe(struct device *dev) { - struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_driver *drv = + container_of(dev->driver, struct mdev_driver, driver); struct mdev_device *mdev = to_mdev_device(dev); int ret; @@ -47,8 +48,8 @@ static int mdev_probe(struct device *dev) if (ret) return ret; - if (drv && drv->probe) { - ret = drv->probe(dev); + if (drv->probe) { + ret = drv->probe(mdev); if (ret) mdev_detach_iommu(mdev); } @@ -58,37 +59,45 @@ static int mdev_probe(struct device *dev) static int mdev_remove(struct device *dev) { - struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_driver *drv = + container_of(dev->driver, struct mdev_driver, driver); struct mdev_device *mdev = to_mdev_device(dev); - if (drv && drv->remove) - drv->remove(dev); + if (drv->remove) + drv->remove(mdev); mdev_detach_iommu(mdev); return 0; } +static int mdev_match(struct device *dev, struct device_driver *drv) +{ + /* + * No drivers automatically match. Drivers are only bound by explicit + * device_driver_attach() + */ + return 0; +} + struct bus_type mdev_bus_type = { .name = "mdev", .probe = mdev_probe, .remove = mdev_remove, + .match = mdev_match, }; EXPORT_SYMBOL_GPL(mdev_bus_type); /** * mdev_register_driver - register a new MDEV driver * @drv: the driver to register - * @owner: module owner of driver to be registered * * Returns a negative value on error, otherwise 0. **/ -int mdev_register_driver(struct mdev_driver *drv, struct module *owner) +int mdev_register_driver(struct mdev_driver *drv) { /* initialize common driver fields */ - drv->driver.name = drv->name; drv->driver.bus = &mdev_bus_type; - drv->driver.owner = owner; /* register with core */ return driver_register(&drv->driver); diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 7d922950caaf3c1f5269556a8a913d42d6144710..afbad7b0a14a17d1b7a69e1e4e55222b91960266 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Mediated device interal definitions + * Mediated device internal definitions * * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
* Author: Neo Jia @@ -24,26 +24,12 @@ struct mdev_parent { struct rw_semaphore unreg_sem; }; -struct mdev_device { - struct device dev; - struct mdev_parent *parent; - guid_t uuid; - void *driver_data; - struct list_head next; - struct kobject *type_kobj; - struct device *iommu_device; - bool active; -}; - -#define to_mdev_device(dev) container_of(dev, struct mdev_device, dev) -#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) - struct mdev_type { struct kobject kobj; struct kobject *devices_kobj; struct mdev_parent *parent; struct list_head next; - struct attribute_group *group; + unsigned int type_group_id; }; #define to_mdev_type_attr(_attr) \ @@ -51,14 +37,27 @@ struct mdev_type { #define to_mdev_type(_kobj) \ container_of(_kobj, struct mdev_type, kobj) +extern struct mdev_driver vfio_mdev_driver; + int parent_create_sysfs_files(struct mdev_parent *parent); void parent_remove_sysfs_files(struct mdev_parent *parent); -int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); -void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type); +int mdev_create_sysfs_files(struct mdev_device *mdev); +void mdev_remove_sysfs_files(struct mdev_device *mdev); + +int mdev_device_create(struct mdev_type *kobj, const guid_t *uuid); +int mdev_device_remove(struct mdev_device *dev); + +void mdev_release_parent(struct kref *kref); + +static inline void mdev_get_parent(struct mdev_parent *parent) +{ + kref_get(&parent->ref); +} -int mdev_device_create(struct kobject *kobj, - struct device *dev, const guid_t *uuid); -int mdev_device_remove(struct device *dev); +static inline void mdev_put_parent(struct mdev_parent *parent) +{ + kref_put(&parent->ref, mdev_release_parent); +} #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index ca9476883f15027a682c2cbada674cad96078e8f..f5cf1931c54e48bc2036168a6121d705de2294b1 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -26,7 +26,7 @@ static ssize_t mdev_type_attr_show(struct kobject *kobj, ssize_t ret = -EIO; if (attr->show) - ret = attr->show(kobj, type->parent->dev, buf); + ret = attr->show(type, attr, buf); return ret; } @@ -39,7 +39,7 @@ static ssize_t mdev_type_attr_store(struct kobject *kobj, ssize_t ret = -EIO; if (attr->store) - ret = attr->store(&type->kobj, type->parent->dev, buf, count); + ret = attr->store(type, attr, buf, count); return ret; } @@ -48,8 +48,9 @@ static const struct sysfs_ops mdev_type_sysfs_ops = { .store = mdev_type_attr_store, }; -static ssize_t create_store(struct kobject *kobj, struct device *dev, - const char *buf, size_t count) +static ssize_t create_store(struct mdev_type *mtype, + struct mdev_type_attribute *attr, const char *buf, + size_t count) { char *str; guid_t uuid; @@ -67,20 +68,22 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev, if (ret) return ret; - ret = mdev_device_create(kobj, dev, &uuid); + ret = mdev_device_create(mtype, &uuid); if (ret) return ret; return count; } -MDEV_TYPE_ATTR_WO(create); +static MDEV_TYPE_ATTR_WO(create); static void mdev_type_release(struct kobject *kobj) { struct mdev_type *type = to_mdev_type(kobj); pr_debug("Releasing group %s\n", kobj->name); + /* Pairs with the get in add_mdev_supported_type() */ + mdev_put_parent(type->parent); kfree(type); } @@ -90,9 +93,11 @@ static struct kobj_type mdev_type_ktype = { }; static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, - struct attribute_group *group) + unsigned int type_group_id) { 
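+	/* Build the sysfs kobject for one entry of supported_type_groups */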
struct mdev_type *type; + struct attribute_group *group = + parent->ops->supported_type_groups[type_group_id]; int ret; if (!group->name) { @@ -106,6 +111,9 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, type->kobj.kset = parent->mdev_types_kset; type->parent = parent; + /* Pairs with the put in mdev_type_release() */ + mdev_get_parent(parent); + type->type_group_id = type_group_id; ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, "%s-%s", dev_driver_string(parent->dev), @@ -131,8 +139,6 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, ret = -ENOMEM; goto attrs_failed; } - - type->group = group; return type; attrs_failed: @@ -147,8 +153,11 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, static void remove_mdev_supported_type(struct mdev_type *type) { + struct attribute_group *group = + type->parent->ops->supported_type_groups[type->type_group_id]; + sysfs_remove_files(&type->kobj, - (const struct attribute **)type->group->attrs); + (const struct attribute **)group->attrs); kobject_put(type->devices_kobj); sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); kobject_del(&type->kobj); @@ -162,8 +171,7 @@ static int add_mdev_supported_type_groups(struct mdev_parent *parent) for (i = 0; parent->ops->supported_type_groups[i]; i++) { struct mdev_type *type; - type = add_mdev_supported_type(parent, - parent->ops->supported_type_groups[i]); + type = add_mdev_supported_type(parent, i); if (IS_ERR(type)) { struct mdev_type *ltype, *tmp; @@ -225,6 +233,7 @@ int parent_create_sysfs_files(struct mdev_parent *parent) static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + struct mdev_device *mdev = to_mdev_device(dev); unsigned long val; if (kstrtoul(buf, 0, &val) < 0) @@ -233,7 +242,7 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, if (val && device_remove_file_self(dev, attr)) { int ret; - ret = mdev_device_remove(dev); + ret = mdev_device_remove(mdev); if (ret) return ret; } @@ -248,34 +257,38 @@ static const struct attribute *mdev_device_attrs[] = { NULL, }; -int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type) +int mdev_create_sysfs_files(struct mdev_device *mdev) { + struct mdev_type *type = mdev->type; + struct kobject *kobj = &mdev->dev.kobj; int ret; - ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev)); + ret = sysfs_create_link(type->devices_kobj, kobj, dev_name(&mdev->dev)); if (ret) return ret; - ret = sysfs_create_link(&dev->kobj, &type->kobj, "mdev_type"); + ret = sysfs_create_link(kobj, &type->kobj, "mdev_type"); if (ret) goto type_link_failed; - ret = sysfs_create_files(&dev->kobj, mdev_device_attrs); + ret = sysfs_create_files(kobj, mdev_device_attrs); if (ret) goto create_files_failed; return ret; create_files_failed: - sysfs_remove_link(&dev->kobj, "mdev_type"); + sysfs_remove_link(kobj, "mdev_type"); type_link_failed: - sysfs_remove_link(type->devices_kobj, dev_name(dev)); + sysfs_remove_link(mdev->type->devices_kobj, dev_name(&mdev->dev)); return ret; } -void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type) +void mdev_remove_sysfs_files(struct mdev_device *mdev) { - sysfs_remove_files(&dev->kobj, mdev_device_attrs); - sysfs_remove_link(&dev->kobj, "mdev_type"); - sysfs_remove_link(type->devices_kobj, dev_name(dev)); + struct kobject *kobj = &mdev->dev.kobj; + + sysfs_remove_files(kobj, mdev_device_attrs); + 
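+	/* Tear down in the reverse order of mdev_create_sysfs_files() */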
sysfs_remove_link(kobj, "mdev_type"); + sysfs_remove_link(mdev->type->devices_kobj, dev_name(&mdev->dev)); } diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 30964a4e0a28a86aafc380f5813b4e8f0dbabb18..7a9883048216e78504ea7168a9cbe37d24fe6bdf 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -17,57 +17,43 @@ #include "mdev_private.h" -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "NVIDIA Corporation" -#define DRIVER_DESC "VFIO based driver for Mediated device" - -static int vfio_mdev_open(void *device_data) +static int vfio_mdev_open_device(struct vfio_device *core_vdev) { - struct mdev_device *mdev = device_data; - struct mdev_parent *parent = mdev->parent; - int ret; - - if (unlikely(!parent->ops->open)) - return -EINVAL; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; - if (!try_module_get(THIS_MODULE)) - return -ENODEV; + if (unlikely(!parent->ops->open_device)) + return 0; - ret = parent->ops->open(mdev); - if (ret) - module_put(THIS_MODULE); - - return ret; + return parent->ops->open_device(mdev); } -static void vfio_mdev_release(void *device_data) +static void vfio_mdev_close_device(struct vfio_device *core_vdev) { - struct mdev_device *mdev = device_data; - struct mdev_parent *parent = mdev->parent; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; - if (likely(parent->ops->release)) - parent->ops->release(mdev); - - module_put(THIS_MODULE); + if (likely(parent->ops->close_device)) + parent->ops->close_device(mdev); } -static long vfio_mdev_unlocked_ioctl(void *device_data, +static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct mdev_device *mdev = device_data; - struct mdev_parent *parent = mdev->parent; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->ioctl)) - return -EINVAL; + return 0; return parent->ops->ioctl(mdev, cmd, arg); } -static ssize_t vfio_mdev_read(void *device_data, char __user *buf, +static ssize_t vfio_mdev_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { - struct mdev_device *mdev = device_data; - struct mdev_parent *parent = mdev->parent; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->read)) return -EINVAL; @@ -75,11 +61,12 @@ static ssize_t vfio_mdev_read(void *device_data, char __user *buf, return parent->ops->read(mdev, buf, count, ppos); } -static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_mdev_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos) { - struct mdev_device *mdev = device_data; - struct mdev_parent *parent = mdev->parent; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->write)) return -EINVAL; @@ -87,10 +74,11 @@ static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, return parent->ops->write(mdev, buf, count, ppos); } -static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_mdev_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma) { - struct mdev_device *mdev = device_data; - struct mdev_parent *parent = 
mdev->parent; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; if (unlikely(!parent->ops->mmap)) return -EINVAL; @@ -98,48 +86,67 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) return parent->ops->mmap(mdev, vma); } +static void vfio_mdev_request(struct vfio_device *core_vdev, unsigned int count) +{ + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); + struct mdev_parent *parent = mdev->type->parent; + + if (parent->ops->request) + parent->ops->request(mdev, count); + else if (count == 0) + dev_notice(mdev_dev(mdev), + "No mdev vendor driver request callback support, blocked until released by user\n"); +} + static const struct vfio_device_ops vfio_mdev_dev_ops = { .name = "vfio-mdev", - .open = vfio_mdev_open, - .release = vfio_mdev_release, + .open_device = vfio_mdev_open_device, + .close_device = vfio_mdev_close_device, .ioctl = vfio_mdev_unlocked_ioctl, .read = vfio_mdev_read, .write = vfio_mdev_write, .mmap = vfio_mdev_mmap, + .request = vfio_mdev_request, }; -static int vfio_mdev_probe(struct device *dev) +static int vfio_mdev_probe(struct mdev_device *mdev) { - struct mdev_device *mdev = to_mdev_device(dev); + struct vfio_device *vdev; + int ret; - return vfio_add_group_dev(dev, &vfio_mdev_dev_ops, mdev); -} + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; -static void vfio_mdev_remove(struct device *dev) -{ - vfio_del_group_dev(dev); -} + vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops); + ret = vfio_register_group_dev(vdev); + if (ret) + goto out_uninit; -static struct mdev_driver vfio_mdev_driver = { - .name = "vfio_mdev", - .probe = vfio_mdev_probe, - .remove = vfio_mdev_remove, -}; + dev_set_drvdata(&mdev->dev, vdev); + return 0; -static int __init vfio_mdev_init(void) -{ - return mdev_register_driver(&vfio_mdev_driver, THIS_MODULE); +out_uninit: + vfio_uninit_group_dev(vdev); + kfree(vdev); + return ret; } -static void __exit vfio_mdev_exit(void) +static void vfio_mdev_remove(struct mdev_device *mdev) { - mdev_unregister_driver(&vfio_mdev_driver); -} + struct vfio_device *vdev = dev_get_drvdata(&mdev->dev); -module_init(vfio_mdev_init) -module_exit(vfio_mdev_exit) + vfio_unregister_group_dev(vdev); + vfio_uninit_group_dev(vdev); + kfree(vdev); +} -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); +struct mdev_driver vfio_mdev_driver = { + .driver = { + .name = "vfio_mdev", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, + .probe = vfio_mdev_probe, + .remove = vfio_mdev_remove, +}; diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 4abddbebd4b2364e86624d5c51b7bde775e1ee7a..860424ccda1bf11f1662a4629b068c353db42964 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -1,18 +1,29 @@ # SPDX-License-Identifier: GPL-2.0-only -config VFIO_PCI - tristate "VFIO support for PCI devices" - depends on VFIO && PCI && EVENTFD +if PCI && MMU +config VFIO_PCI_CORE + tristate select VFIO_VIRQFD select IRQ_BYPASS_MANAGER + +config VFIO_PCI_MMAP + def_bool y if !S390 + +config VFIO_PCI_INTX + def_bool y if !S390 + +config VFIO_PCI + tristate "Generic VFIO support for any PCI device" + select VFIO_PCI_CORE help - Support for the PCI VFIO bus driver. This is required to make - use of PCI drivers using the VFIO framework. + Support for the generic PCI VFIO bus driver which can connect any + PCI device to the VFIO framework. 
If you don't know what to do here, say N. +if VFIO_PCI config VFIO_PCI_VGA - bool "VFIO PCI support for VGA devices" - depends on VFIO_PCI && X86 && VGA_ARB + bool "Generic VFIO PCI support for VGA devices" + depends on X86 && VGA_ARB help Support for VGA extension to VFIO PCI. This exposes an additional region on VGA devices for accessing legacy VGA addresses used by @@ -20,17 +31,9 @@ config VFIO_PCI_VGA If you don't know what to do here, say N. -config VFIO_PCI_MMAP - depends on VFIO_PCI - def_bool y if !S390 - -config VFIO_PCI_INTX - depends on VFIO_PCI - def_bool y if !S390 - config VFIO_PCI_IGD - bool "VFIO PCI extensions for Intel graphics (GVT-d)" - depends on VFIO_PCI && X86 + bool "Generic VFIO PCI extensions for Intel graphics (GVT-d)" + depends on X86 default y help Support for Intel IGD specific extensions to enable direct @@ -39,9 +42,5 @@ config VFIO_PCI_IGD and LPC bridge config space. To enable Intel IGD assignment through vfio-pci, say Y. - -config VFIO_PCI_NVLINK2 - def_bool y - depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU - help - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs +endif +endif diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index f027f8a0e89c02e41c2867a3751db21d23f407a0..349d68d242b4253c5dae31efc9a8c6343c4ca434 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only -vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o -vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o -vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o +vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o +obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o +vfio-pci-y := vfio_pci.o +vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index a603f363835c48c3516afa80a8b03bcfc0e02f97..a5ce92beb6557d45f03851b42779a684f03c0580 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved + * * Copyright (C) 2012 Red Hat, Inc. All rights reserved. * Author: Alex Williamson * @@ -9,7 +11,6 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define dev_fmt pr_fmt #include #include @@ -19,19 +20,13 @@ #include #include #include -#include #include #include #include #include -#include -#include -#include -#include -#include "vfio_pci_private.h" +#include -#define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "VFIO PCI - User Level meta-driver" @@ -55,149 +50,27 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(disable_idle_d3, "Disable using the PCI D3 low power state for idle, unused devices"); -static inline bool vfio_vga_disabled(void) -{ -#ifdef CONFIG_VFIO_PCI_VGA - return disable_vga; -#else - return true; +static bool enable_sriov; +#ifdef CONFIG_PCI_IOV +module_param(enable_sriov, bool, 0644); +MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF."); #endif -} - -/* - * Our VGA arbiter participation is limited since we don't know anything - * about the device itself. 
However, if the device is the only VGA device - * downstream of a bridge and VFIO VGA support is disabled, then we can - * safely return legacy VGA IO and memory as not decoded since the user - * has no way to get to it and routing can be disabled externally at the - * bridge. - */ -static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) -{ - struct vfio_pci_device *vdev = opaque; - struct pci_dev *tmp = NULL, *pdev = vdev->pdev; - unsigned char max_busnr; - unsigned int decodes; - - if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) - return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | - VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; - - max_busnr = pci_bus_max_busnr(pdev->bus); - decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; - - while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { - if (tmp == pdev || - pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || - pci_is_root_bus(tmp->bus)) - continue; - - if (tmp->bus->number >= pdev->bus->number && - tmp->bus->number <= max_busnr) { - pci_dev_put(tmp); - decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; - break; - } - } - - return decodes; -} - -static inline bool vfio_pci_is_vga(struct pci_dev *pdev) -{ - return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; -} - -static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) -{ - struct resource *res; - int bar; - struct vfio_pci_dummy_resource *dummy_res; - - for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { - res = vdev->pdev->resource + bar; - - if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) - goto no_mmap; - - if (!(res->flags & IORESOURCE_MEM)) - goto no_mmap; - - /* - * The PCI core shouldn't set up a resource with a - * type but zero size. But there may be bugs that - * cause us to do that. - */ - if (!resource_size(res)) - goto no_mmap; - - if (resource_size(res) >= PAGE_SIZE) { - vdev->bar_mmap_supported[bar] = true; - continue; - } - - if (!(res->start & ~PAGE_MASK)) { - /* - * Add a dummy resource to reserve the remainder - * of the exclusive page in case that hot-add - * device's bar is assigned into it. - */ - dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); - if (dummy_res == NULL) - goto no_mmap; - - dummy_res->resource.name = "vfio sub-page reserved"; - dummy_res->resource.start = res->end + 1; - dummy_res->resource.end = res->start + PAGE_SIZE - 1; - dummy_res->resource.flags = res->flags; - if (request_resource(res->parent, - &dummy_res->resource)) { - kfree(dummy_res); - goto no_mmap; - } - dummy_res->index = bar; - list_add(&dummy_res->res_next, - &vdev->dummy_resources_list); - vdev->bar_mmap_supported[bar] = true; - continue; - } - /* - * Here we don't handle the case when the BAR is not page - * aligned because we can't expect the BAR will be - * assigned into the same location in a page in guest - * when we passthrough the BAR. And it's hard to access - * this BAR in userspace because we have no way to get - * the BAR's location in a page. - */ -no_mmap: - vdev->bar_mmap_supported[bar] = false; - } -} -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); -static void vfio_pci_disable(struct vfio_pci_device *vdev); -static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data); +static bool disable_denylist; +module_param(disable_denylist, bool, 0444); +MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. 
Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); -/* - * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND - * _and_ the ability detect when the device is asserting INTx via PCI_STATUS. - * If a device implements the former but not the latter we would typically - * expect broken_intx_masking be set and require an exclusive interrupt. - * However since we do have control of the device's ability to assert INTx, - * we can instead pretend that the device does not implement INTx, virtualizing - * the pin register to report zero and maintaining DisINTx set on the host. - */ -static bool vfio_pci_nointx(struct pci_dev *pdev) +static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) { switch (pdev->vendor) { case PCI_VENDOR_ID_INTEL: switch (pdev->device) { - /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */ - case 0x1572: - case 0x1574: - case 0x1580 ... 0x1581: - case 0x1583 ... 0x158b: - case 0x37d0 ... 0x37d2: + case PCI_DEVICE_ID_INTEL_QAT_C3XXX: + case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF: + case PCI_DEVICE_ID_INTEL_QAT_C62X: + case PCI_DEVICE_ID_INTEL_QAT_C62X_VF: + case PCI_DEVICE_ID_INTEL_QAT_DH895XCC: + case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF: return true; default: return false; @@ -207,1811 +80,193 @@ static bool vfio_pci_nointx(struct pci_dev *pdev) return false; } -static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev) -{ - struct pci_dev *pdev = vdev->pdev; - u16 pmcsr; - - if (!pdev->pm_cap) - return; - - pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr); - - vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); -} - -/* - * pci_set_power_state() wrapper handling devices which perform a soft reset on - * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev, - * restore when returned to D0. Saved separately from pci_saved_state for use - * by PM capability emulation and separately from pci_dev internal saved state - * to avoid it being overwritten and consumed around other resets. 
- */ -int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state) +static bool vfio_pci_is_denylisted(struct pci_dev *pdev) { - struct pci_dev *pdev = vdev->pdev; - bool needs_restore = false, needs_save = false; - int ret; - - if (vdev->needs_pm_restore) { - if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { - pci_save_state(pdev); - needs_save = true; - } + if (!vfio_pci_dev_in_denylist(pdev)) + return false; - if (pdev->current_state >= PCI_D3hot && state <= PCI_D0) - needs_restore = true; + if (disable_denylist) { + pci_warn(pdev, + "device denylist disabled - allowing device %04x:%04x.\n", + pdev->vendor, pdev->device); + return false; } - ret = pci_set_power_state(pdev, state); - - if (!ret) { - /* D3 might be unsupported via quirk, skip unless in D3 */ - if (needs_save && pdev->current_state >= PCI_D3hot) { - vdev->pm_save = pci_store_saved_state(pdev); - } else if (needs_restore) { - pci_load_and_free_saved_state(pdev, &vdev->pm_save); - pci_restore_state(pdev); - } - } + pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n", + pdev->vendor, pdev->device); - return ret; + return true; } -static int vfio_pci_enable(struct vfio_pci_device *vdev) +static int vfio_pci_open_device(struct vfio_device *core_vdev) { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; int ret; - u16 cmd; - u8 msix_pos; - - vfio_pci_set_power_state(vdev, PCI_D0); - - /* Don't allow our initial saved state to include busmaster */ - pci_clear_master(pdev); - ret = pci_enable_device(pdev); + ret = vfio_pci_core_enable(vdev); if (ret) return ret; - /* If reset fails because of the device lock, fail this path entirely */ - ret = pci_try_reset_function(pdev); - if (ret == -EAGAIN) { - pci_disable_device(pdev); - return ret; - } - - vdev->reset_works = !ret; - pci_save_state(pdev); - vdev->pci_saved_state = pci_store_saved_state(pdev); - if (!vdev->pci_saved_state) - pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__); - - if (likely(!nointxmask)) { - if (vfio_pci_nointx(pdev)) { - pci_info(pdev, "Masking broken INTx support\n"); - vdev->nointx = true; - pci_intx(pdev, 0); - } else - vdev->pci_2_3 = pci_intx_mask_supported(pdev); - } - - pci_read_config_word(pdev, PCI_COMMAND, &cmd); - if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { - cmd &= ~PCI_COMMAND_INTX_DISABLE; - pci_write_config_word(pdev, PCI_COMMAND, cmd); - } - - ret = vfio_config_init(vdev); - if (ret) { - kfree(vdev->pci_saved_state); - vdev->pci_saved_state = NULL; - pci_disable_device(pdev); - return ret; - } - - msix_pos = pdev->msix_cap; - if (msix_pos) { - u16 flags; - u32 table; - - pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); - pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); - - vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; - vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; - vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; - } else - vdev->msix_bar = 0xFF; - - if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) - vdev->has_vga = true; - - if (vfio_pci_is_vga(pdev) && pdev->vendor == PCI_VENDOR_ID_INTEL && IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { ret = vfio_pci_igd_init(vdev); if (ret && ret != -ENODEV) { pci_warn(pdev, "Failed to setup Intel IGD regions\n"); - goto disable_exit; - } - } - - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_nvdia_v100_nvlink2_init(vdev); - if (ret && ret != 
-ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); - goto disable_exit; - } - } - - if (pdev->vendor == PCI_VENDOR_ID_IBM && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_ibm_npu2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); - goto disable_exit; + vfio_pci_core_disable(vdev); + return ret; } } - vfio_pci_probe_mmaps(vdev); + vfio_pci_core_finish_enable(vdev); return 0; - -disable_exit: - vfio_pci_disable(vdev); - return ret; -} - -static void vfio_pci_disable(struct vfio_pci_device *vdev) -{ - struct pci_dev *pdev = vdev->pdev; - struct vfio_pci_dummy_resource *dummy_res, *tmp; - struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; - int i, bar; - - /* Stop the device from further DMA */ - pci_clear_master(pdev); - - vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | - VFIO_IRQ_SET_ACTION_TRIGGER, - vdev->irq_type, 0, 0, NULL); - - /* Device closed, don't need mutex here */ - list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, - &vdev->ioeventfds_list, next) { - vfio_virqfd_disable(&ioeventfd->virqfd); - list_del(&ioeventfd->next); - kfree(ioeventfd); - } - vdev->ioeventfds_nr = 0; - - vdev->virq_disabled = false; - - for (i = 0; i < vdev->num_regions; i++) - vdev->region[i].ops->release(vdev, &vdev->region[i]); - - vdev->num_regions = 0; - kfree(vdev->region); - vdev->region = NULL; /* don't krealloc a freed pointer */ - - vfio_config_free(vdev); - - for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { - if (!vdev->barmap[bar]) - continue; - pci_iounmap(pdev, vdev->barmap[bar]); - pci_release_selected_regions(pdev, 1 << bar); - vdev->barmap[bar] = NULL; - } - - list_for_each_entry_safe(dummy_res, tmp, - &vdev->dummy_resources_list, res_next) { - list_del(&dummy_res->res_next); - release_resource(&dummy_res->resource); - kfree(dummy_res); - } - - vdev->needs_reset = true; - - /* - * If we have saved state, restore it. If we can reset the device, - * even better. Resetting with current state seems better than - * nothing, but saving and restoring current state without reset - * is just busy work. - */ - if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { - pci_info(pdev, "%s: Couldn't reload saved state\n", __func__); - - if (!vdev->reset_works) - goto out; - - pci_save_state(pdev); - } - - /* - * Disable INTx and MSI, presumably to avoid spurious interrupts - * during reset. Stolen from pci_reset_function() - */ - pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); - - /* - * Try to get the locks ourselves to prevent a deadlock. The - * success of this is dependent on being able to lock the device, - * which is not always possible. - * We can not use the "try" reset interface here, which will - * overwrite the previously restored configuration information. 
- */ - if (vdev->reset_works && pci_cfg_access_trylock(pdev)) { - if (device_trylock(&pdev->dev)) { - if (!__pci_reset_function_locked(pdev)) - vdev->needs_reset = false; - device_unlock(&pdev->dev); - } - pci_cfg_access_unlock(pdev); - } - - pci_restore_state(pdev); -out: - pci_disable_device(pdev); - - vfio_pci_try_bus_reset(vdev); - - if (!disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D3hot); } -static void vfio_pci_release(void *device_data) -{ - struct vfio_pci_device *vdev = device_data; - - mutex_lock(&vdev->reflck->lock); - - if (!(--vdev->refcnt)) { - vfio_spapr_pci_eeh_release(vdev->pdev); - vfio_pci_disable(vdev); - mutex_lock(&vdev->igate); - if (vdev->err_trigger) { - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = NULL; - } - mutex_unlock(&vdev->igate); - - mutex_lock(&vdev->igate); - if (vdev->req_trigger) { - eventfd_ctx_put(vdev->req_trigger); - vdev->req_trigger = NULL; - } - mutex_unlock(&vdev->igate); - } - - mutex_unlock(&vdev->reflck->lock); - - module_put(THIS_MODULE); -} +static const struct vfio_device_ops vfio_pci_ops = { + .name = "vfio-pci", + .open_device = vfio_pci_open_device, + .close_device = vfio_pci_core_close_device, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, +}; -static int vfio_pci_open(void *device_data) +static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - struct vfio_pci_device *vdev = device_data; - int ret = 0; - - if (!try_module_get(THIS_MODULE)) - return -ENODEV; + struct vfio_pci_core_device *vdev; + int ret; - mutex_lock(&vdev->reflck->lock); + if (vfio_pci_is_denylisted(pdev)) + return -EINVAL; - if (!vdev->refcnt) { - ret = vfio_pci_enable(vdev); - if (ret) - goto error; + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + vfio_pci_core_init_device(vdev, pdev, &vfio_pci_ops); - vfio_spapr_pci_eeh_open(vdev->pdev); - } - vdev->refcnt++; -error: - mutex_unlock(&vdev->reflck->lock); + ret = vfio_pci_core_register_device(vdev); if (ret) - module_put(THIS_MODULE); + goto out_free; + dev_set_drvdata(&pdev->dev, vdev); + return 0; + +out_free: + vfio_pci_core_uninit_device(vdev); + kfree(vdev); return ret; } -static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) +static void vfio_pci_remove(struct pci_dev *pdev) { - if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { - u8 pin; - - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || - vdev->nointx || vdev->pdev->is_virtfn) - return 0; - - pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); - - return pin ? 
1 : 0; - } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { - u8 pos; - u16 flags; + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); - pos = vdev->pdev->msi_cap; - if (pos) { - pci_read_config_word(vdev->pdev, - pos + PCI_MSI_FLAGS, &flags); - return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); - } - } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { - u8 pos; - u16 flags; - - pos = vdev->pdev->msix_cap; - if (pos) { - pci_read_config_word(vdev->pdev, - pos + PCI_MSIX_FLAGS, &flags); - - return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; - } - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { - if (pci_is_pcie(vdev->pdev)) - return 1; - } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { - return 1; - } - - return 0; -} - -static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) -{ - (*(int *)data)++; - return 0; + vfio_pci_core_unregister_device(vdev); + vfio_pci_core_uninit_device(vdev); + kfree(vdev); } -struct vfio_pci_fill_info { - int max; - int cur; - struct vfio_pci_dependent_device *devices; -}; - -static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) +static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) { - struct vfio_pci_fill_info *fill = data; - struct iommu_group *iommu_group; - - if (fill->cur == fill->max) - return -EAGAIN; /* Something changed, try again */ - - iommu_group = iommu_group_get(&pdev->dev); - if (!iommu_group) - return -EPERM; /* Cannot reset non-isolated devices */ + if (!enable_sriov) + return -ENOENT; - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); - fill->devices[fill->cur].bus = pdev->bus->number; - fill->devices[fill->cur].devfn = pdev->devfn; - fill->cur++; - iommu_group_put(iommu_group); - return 0; + return vfio_pci_core_sriov_configure(pdev, nr_virtfn); } -struct vfio_pci_group_entry { - struct vfio_group *group; - int id; -}; - -struct vfio_pci_group_info { - int count; - struct vfio_pci_group_entry *groups; +static const struct pci_device_id vfio_pci_table[] = { + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_ANY_ID, PCI_ANY_ID) }, /* match all by default */ + {} }; -static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) -{ - struct vfio_pci_group_info *info = data; - struct iommu_group *group; - int id, i; - - group = iommu_group_get(&pdev->dev); - if (!group) - return -EPERM; - - id = iommu_group_id(group); - - for (i = 0; i < info->count; i++) - if (info->groups[i].id == id) - break; - - iommu_group_put(group); +MODULE_DEVICE_TABLE(pci, vfio_pci_table); - return (i == info->count) ? 
-EINVAL : 0; -} - -static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) -{ - for (; pdev; pdev = pdev->bus->self) - if (pdev->bus == slot->bus) - return (pdev->slot == slot); - return false; -} - -struct vfio_pci_walk_info { - int (*fn)(struct pci_dev *, void *data); - void *data; - struct pci_dev *pdev; - bool slot; - int ret; +static struct pci_driver vfio_pci_driver = { + .name = "vfio-pci", + .id_table = vfio_pci_table, + .probe = vfio_pci_probe, + .remove = vfio_pci_remove, + .sriov_configure = vfio_pci_sriov_configure, + .err_handler = &vfio_pci_core_err_handlers, }; -static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) +static void __init vfio_pci_fill_ids(void) { - struct vfio_pci_walk_info *walk = data; + char *p, *id; + int rc; - if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) - walk->ret = walk->fn(pdev, walk->data); + /* no ids passed actually */ + if (ids[0] == '\0') + return; - return walk->ret; -} + /* add ids specified in the module parameter */ + p = ids; + while ((id = strsep(&p, ","))) { + unsigned int vendor, device, subvendor = PCI_ANY_ID, + subdevice = PCI_ANY_ID, class = 0, class_mask = 0; + int fields; -static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, - int (*fn)(struct pci_dev *, - void *data), void *data, - bool slot) -{ - struct vfio_pci_walk_info walk = { - .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, - }; + if (!strlen(id)) + continue; - pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); + fields = sscanf(id, "%x:%x:%x:%x:%x:%x", + &vendor, &device, &subvendor, &subdevice, + &class, &class_mask); + + if (fields < 2) { + pr_warn("invalid id string \"%s\"\n", id); + continue; + } - return walk.ret; + rc = pci_add_dynid(&vfio_pci_driver, vendor, device, + subvendor, subdevice, class, class_mask, 0); + if (rc) + pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n", + vendor, device, subvendor, subdevice, + class, class_mask, rc); + else + pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n", + vendor, device, subvendor, subdevice, + class, class_mask); + } } -static int msix_mmappable_cap(struct vfio_pci_device *vdev, - struct vfio_info_cap *caps) +static int __init vfio_pci_init(void) { - struct vfio_info_cap_header header = { - .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, - .version = 1 - }; + int ret; + bool is_disable_vga = true; - return vfio_info_add_capability(caps, &header, sizeof(header)); -} +#ifdef CONFIG_VFIO_PCI_VGA + is_disable_vga = disable_vga; +#endif -int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, - unsigned int type, unsigned int subtype, - const struct vfio_pci_regops *ops, - size_t size, u32 flags, void *data) -{ - struct vfio_pci_region *region; + vfio_pci_core_set_params(nointxmask, is_disable_vga, disable_idle_d3); - region = krealloc(vdev->region, - (vdev->num_regions + 1) * sizeof(*region), - GFP_KERNEL); - if (!region) - return -ENOMEM; + /* Register and scan for devices */ + ret = pci_register_driver(&vfio_pci_driver); + if (ret) + return ret; - vdev->region = region; - vdev->region[vdev->num_regions].type = type; - vdev->region[vdev->num_regions].subtype = subtype; - vdev->region[vdev->num_regions].ops = ops; - vdev->region[vdev->num_regions].size = size; - vdev->region[vdev->num_regions].flags = flags; - vdev->region[vdev->num_regions].data = data; + vfio_pci_fill_ids(); - vdev->num_regions++; + if (disable_denylist) + pr_warn("device denylist disabled.\n"); return 0; } 
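A note on the "ids" parameter consumed by vfio_pci_fill_ids() above (illustrative, not part of this patch): each comma-separated entry is parsed with sscanf("%x:%x:%x:%x:%x:%x"), so the accepted form is a hexadecimal vendor:device[:subvendor[:subdevice[:class[:class_mask]]]] tuple, where only vendor and device are required and the remaining fields default to PCI_ANY_ID and an empty class filter. A hypothetical invocation, using placeholder IDs rather than anything taken from this series, might look like:

	modprobe vfio-pci ids=10de:1db4,1234:5678:0:0:30000:ff0000

Entries that fail to parse are skipped with the "invalid id string" warning shown above rather than aborting module load, and each accepted tuple is handed to pci_add_dynid() so the driver core can match it like any other dynamic ID.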
+module_init(vfio_pci_init); -struct vfio_devices { - struct vfio_device **devices; - int cur_index; - int max_index; -}; - -static long vfio_pci_ioctl(void *device_data, - unsigned int cmd, unsigned long arg) -{ - struct vfio_pci_device *vdev = device_data; - unsigned long minsz; - - if (cmd == VFIO_DEVICE_GET_INFO) { - struct vfio_device_info info; - - minsz = offsetofend(struct vfio_device_info, num_irqs); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - info.flags = VFIO_DEVICE_FLAGS_PCI; - - if (vdev->reset_works) - info.flags |= VFIO_DEVICE_FLAGS_RESET; - - info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS; - - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - int i, ret; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; - break; - } - - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); - if (ret) - return ret; - } - } - - break; - case VFIO_PCI_ROM_REGION_INDEX: - { - void __iomem *io; - size_t size; - u16 cmd; - - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - - /* Report the BAR size, not the ROM size */ - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - /* Shadow ROMs appear as PCI option ROMs */ - if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) - info.size = 0x20000; - else - break; - } - - /* - * Is it really there? Enable memory decode for - * implicit access in pci_map_rom(). 
- */ - cmd = vfio_pci_memory_lock_and_enable(vdev); - io = pci_map_rom(pdev, &size); - if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; - pci_unmap_rom(pdev, io); - } else { - info.size = 0; - } - vfio_pci_memory_unlock_and_restore(vdev, cmd); - - break; - } - case VFIO_PCI_VGA_REGION_INDEX: - if (!vdev->has_vga) - return -EINVAL; - - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - - break; - default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; - - if (info.index >= - VFIO_PCI_NUM_REGIONS + vdev->num_regions) - return -EINVAL; - info.index = array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vdev->num_regions); - - i = info.index - VFIO_PCI_NUM_REGIONS; - - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; - - cap_type.type = vdev->region[i].type; - cap_type.subtype = vdev->region[i].subtype; - - ret = vfio_info_add_capability(&caps, &cap_type.header, - sizeof(cap_type)); - if (ret) - return ret; - - if (vdev->region[i].ops->add_capability) { - ret = vdev->region[i].ops->add_capability(vdev, - &vdev->region[i], &caps); - if (ret) - return ret; - } - } - } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - - kfree(caps.buf); - } - - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { - struct vfio_irq_info info; - - minsz = offsetofend(struct vfio_irq_info, count); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: - case VFIO_PCI_REQ_IRQ_INDEX: - break; - case VFIO_PCI_ERR_IRQ_INDEX: - if (pci_is_pcie(vdev->pdev)) - break; - /* fall through */ - default: - return -EINVAL; - } - - info.flags = VFIO_IRQ_INFO_EVENTFD; - - info.count = vfio_pci_get_irq_count(vdev, info.index); - - if (info.index == VFIO_PCI_INTX_IRQ_INDEX) - info.flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); - else - info.flags |= VFIO_IRQ_INFO_NORESIZE; - - return copy_to_user((void __user *)arg, &info, minsz) ? 
- -EFAULT : 0; - - } else if (cmd == VFIO_DEVICE_SET_IRQS) { - struct vfio_irq_set hdr; - u8 *data = NULL; - int max, ret = 0; - size_t data_size = 0; - - minsz = offsetofend(struct vfio_irq_set, count); - - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; - - max = vfio_pci_get_irq_count(vdev, hdr.index); - - ret = vfio_set_irqs_validate_and_prepare(&hdr, max, - VFIO_PCI_NUM_IRQS, &data_size); - if (ret) - return ret; - - if (data_size) { - data = memdup_user((void __user *)(arg + minsz), - data_size); - if (IS_ERR(data)) - return PTR_ERR(data); - } - - mutex_lock(&vdev->igate); - - ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, - hdr.start, hdr.count, data); - - mutex_unlock(&vdev->igate); - kfree(data); - - return ret; - - } else if (cmd == VFIO_DEVICE_RESET) { - int ret; - - if (!vdev->reset_works) - return -EINVAL; - - vfio_pci_zap_and_down_write_memory_lock(vdev); - ret = pci_try_reset_function(vdev->pdev); - up_write(&vdev->memory_lock); - - return ret; - - } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { - struct vfio_pci_hot_reset_info hdr; - struct vfio_pci_fill_info fill = { 0 }; - struct vfio_pci_dependent_device *devices = NULL; - bool slot = false; - int ret = 0; - - minsz = offsetofend(struct vfio_pci_hot_reset_info, count); - - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; - - if (hdr.argsz < minsz) - return -EINVAL; - - hdr.flags = 0; - - /* Can we do a slot or bus reset or neither? */ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; - - /* How many devices are affected? */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_count_devs, - &fill.max, slot); - if (ret) - return ret; - - WARN_ON(!fill.max); /* Should always be at least one */ - - /* - * If there's enough space, fill it now, otherwise return - * -ENOSPC and the number of devices affected. - */ - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { - ret = -ENOSPC; - hdr.count = fill.max; - goto reset_info_exit; - } - - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - fill.devices = devices; - - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_fill_devs, - &fill, slot); - - /* - * If a device was removed between counting and filling, - * we may come up short of fill.max. If a device was - * added, we'll have a return of -EAGAIN above. - */ - if (!ret) - hdr.count = fill.cur; - -reset_info_exit: - if (copy_to_user((void __user *)arg, &hdr, minsz)) - ret = -EFAULT; - - if (!ret) { - if (copy_to_user((void __user *)(arg + minsz), devices, - hdr.count * sizeof(*devices))) - ret = -EFAULT; - } - - kfree(devices); - return ret; - - } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { - struct vfio_pci_hot_reset hdr; - int32_t *group_fds; - struct vfio_pci_group_entry *groups; - struct vfio_pci_group_info info; - struct vfio_devices devs = { .cur_index = 0 }; - bool slot = false; - int i, group_idx, mem_idx = 0, count = 0, ret = 0; - - minsz = offsetofend(struct vfio_pci_hot_reset, count); - - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; - - if (hdr.argsz < minsz || hdr.flags) - return -EINVAL; - - /* Can we do a slot or bus reset or neither? 
*/ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; - - /* - * We can't let userspace give us an arbitrarily large - * buffer to copy, so verify how many we think there - * could be. Note groups can have multiple devices so - * one group per device is the max. - */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_count_devs, - &count, slot); - if (ret) - return ret; - - /* Somewhere between 1 and count is OK */ - if (!hdr.count || hdr.count > count) - return -EINVAL; - - group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); - if (!group_fds || !groups) { - kfree(group_fds); - kfree(groups); - return -ENOMEM; - } - - if (copy_from_user(group_fds, (void __user *)(arg + minsz), - hdr.count * sizeof(*group_fds))) { - kfree(group_fds); - kfree(groups); - return -EFAULT; - } - - /* - * For each group_fd, get the group through the vfio external - * user interface and store the group and iommu ID. This - * ensures the group is held across the reset. - */ - for (group_idx = 0; group_idx < hdr.count; group_idx++) { - struct vfio_group *group; - struct fd f = fdget(group_fds[group_idx]); - if (!f.file) { - ret = -EBADF; - break; - } - - group = vfio_group_get_external_user(f.file); - fdput(f); - if (IS_ERR(group)) { - ret = PTR_ERR(group); - break; - } - - groups[group_idx].group = group; - groups[group_idx].id = - vfio_external_user_iommu_id(group); - } - - kfree(group_fds); - - /* release reference to groups on error */ - if (ret) - goto hot_reset_release; - - info.count = hdr.count; - info.groups = groups; - - /* - * Test whether all the affected devices are contained - * by the set of groups provided by the user. - */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_validate_devs, - &info, slot); - if (ret) - goto hot_reset_release; - - devs.max_index = count; - devs.devices = kcalloc(count, sizeof(struct vfio_device *), - GFP_KERNEL); - if (!devs.devices) { - ret = -ENOMEM; - goto hot_reset_release; - } - - /* - * We need to get memory_lock for each device, but devices - * can share mmap_sem, therefore we need to zap and hold - * the vma_lock for each device, and only then get each - * memory_lock. 
- */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_try_zap_and_vma_lock_cb, - &devs, slot); - if (ret) - goto hot_reset_release; - - for (; mem_idx < devs.cur_index; mem_idx++) { - struct vfio_pci_device *tmp; - - tmp = vfio_device_data(devs.devices[mem_idx]); - - ret = down_write_trylock(&tmp->memory_lock); - if (!ret) { - ret = -EBUSY; - goto hot_reset_release; - } - mutex_unlock(&tmp->vma_lock); - } - - /* User has access, do the reset */ - ret = pci_reset_bus(vdev->pdev); - -hot_reset_release: - for (i = 0; i < devs.cur_index; i++) { - struct vfio_device *device; - struct vfio_pci_device *tmp; - - device = devs.devices[i]; - tmp = vfio_device_data(device); - - if (i < mem_idx) - up_write(&tmp->memory_lock); - else - mutex_unlock(&tmp->vma_lock); - vfio_device_put(device); - } - kfree(devs.devices); - - for (group_idx--; group_idx >= 0; group_idx--) - vfio_group_put_external_user(groups[group_idx].group); - - kfree(groups); - return ret; - } else if (cmd == VFIO_DEVICE_IOEVENTFD) { - struct vfio_device_ioeventfd ioeventfd; - int count; - - minsz = offsetofend(struct vfio_device_ioeventfd, fd); - - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) - return -EFAULT; - - if (ioeventfd.argsz < minsz) - return -EINVAL; - - if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) - return -EINVAL; - - count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; - - if (hweight8(count) != 1 || ioeventfd.fd < -1) - return -EINVAL; - - return vfio_pci_ioeventfd(vdev, ioeventfd.offset, - ioeventfd.data, count, ioeventfd.fd); - } - - return -ENOTTY; -} - -static ssize_t vfio_pci_rw(void *device_data, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) -{ - unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - struct vfio_pci_device *vdev = device_data; - - if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) - return -EINVAL; - - switch (index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); - - case VFIO_PCI_ROM_REGION_INDEX: - if (iswrite) - return -EINVAL; - return vfio_pci_bar_rw(vdev, buf, count, ppos, false); - - case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); - - case VFIO_PCI_VGA_REGION_INDEX: - return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); - default: - index -= VFIO_PCI_NUM_REGIONS; - return vdev->region[index].ops->rw(vdev, buf, - count, ppos, iswrite); - } - - return -EINVAL; -} - -static ssize_t vfio_pci_read(void *device_data, char __user *buf, - size_t count, loff_t *ppos) -{ - if (!count) - return 0; - - return vfio_pci_rw(device_data, buf, count, ppos, false); -} - -static ssize_t vfio_pci_write(void *device_data, const char __user *buf, - size_t count, loff_t *ppos) -{ - if (!count) - return 0; - - return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); -} - -/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ -static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) -{ - struct vfio_pci_mmap_vma *mmap_vma, *tmp; - - /* - * Lock ordering: - * vma_lock is nested under mmap_sem for vm_ops callback paths. - * The memory_lock semaphore is used by both code paths calling - * into this function to zap vmas and the vm_ops.fault callback - * to protect the memory enable state of the device. 
- * - * When zapping vmas we need to maintain the mmap_sem => vma_lock - * ordering, which requires using vma_lock to walk vma_list to - * acquire an mm, then dropping vma_lock to get the mmap_sem and - * reacquiring vma_lock. This logic is derived from similar - * requirements in uverbs_user_mmap_disassociate(). - * - * mmap_sem must always be the top-level lock when it is taken. - * Therefore we can only hold the memory_lock write lock when - * vma_list is empty, as we'd need to take mmap_sem to clear - * entries. vma_list can only be guaranteed empty when holding - * vma_lock, thus memory_lock is nested under vma_lock. - * - * This enables the vm_ops.fault callback to acquire vma_lock, - * followed by memory_lock read lock, while already holding - * mmap_sem without risk of deadlock. - */ - while (1) { - struct mm_struct *mm = NULL; - - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) - return 0; - } else { - mutex_lock(&vdev->vma_lock); - } - while (!list_empty(&vdev->vma_list)) { - mmap_vma = list_first_entry(&vdev->vma_list, - struct vfio_pci_mmap_vma, - vma_next); - mm = mmap_vma->vma->vm_mm; - if (mmget_not_zero(mm)) - break; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - mm = NULL; - } - if (!mm) - return 1; - mutex_unlock(&vdev->vma_lock); - - if (try) { - if (!down_read_trylock(&mm->mmap_sem)) { - mmput(mm); - return 0; - } - } else { - down_read(&mm->mmap_sem); - } - if (mmget_still_valid(mm)) { - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - up_read(&mm->mmap_sem); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); - } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; - - if (vma->vm_mm != mm) - continue; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); - } - up_read(&mm->mmap_sem); - mmput(mm); - } -} - -void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev) -{ - vfio_pci_zap_and_vma_lock(vdev, false); - down_write(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); -} - -u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev) -{ - u16 cmd; - - down_write(&vdev->memory_lock); - pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); - if (!(cmd & PCI_COMMAND_MEMORY)) - pci_write_config_word(vdev->pdev, PCI_COMMAND, - cmd | PCI_COMMAND_MEMORY); - - return cmd; -} - -void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd) -{ - pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); - up_write(&vdev->memory_lock); -} - -/* Caller holds vma_lock */ -static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, - struct vm_area_struct *vma) -{ - struct vfio_pci_mmap_vma *mmap_vma; - - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); - if (!mmap_vma) - return -ENOMEM; - - mmap_vma->vma = vma; - list_add(&mmap_vma->vma_next, &vdev->vma_list); - - return 0; -} - -/* - * Zap mmaps on open so that we can fault them in on access and therefore - * our vma_list only tracks mappings accessed since last zap. 
- */ -static void vfio_pci_mmap_open(struct vm_area_struct *vma) -{ - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); -} - -static void vfio_pci_mmap_close(struct vm_area_struct *vma) -{ - struct vfio_pci_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; - - mutex_lock(&vdev->vma_lock); - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) { - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - break; - } - } - mutex_unlock(&vdev->vma_lock); -} - -static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_device *vdev = vma->vm_private_data; - vm_fault_t ret = VM_FAULT_NOPAGE; - - mutex_lock(&vdev->vma_lock); - down_read(&vdev->memory_lock); - - if (!__vfio_pci_memory_enabled(vdev)) { - ret = VM_FAULT_SIGBUS; - mutex_unlock(&vdev->vma_lock); - goto up_out; - } - - if (__vfio_pci_add_vma(vdev, vma)) { - ret = VM_FAULT_OOM; - mutex_unlock(&vdev->vma_lock); - goto up_out; - } - - mutex_unlock(&vdev->vma_lock); - - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, vma->vm_page_prot)) - ret = VM_FAULT_SIGBUS; - -up_out: - up_read(&vdev->memory_lock); - return ret; -} - -static const struct vm_operations_struct vfio_pci_mmap_ops = { - .open = vfio_pci_mmap_open, - .close = vfio_pci_mmap_close, - .fault = vfio_pci_mmap_fault, -}; - -static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) -{ - struct vfio_pci_device *vdev = device_data; - struct pci_dev *pdev = vdev->pdev; - unsigned int index; - u64 phys_len, req_len, pgoff, req_start; - int ret; - - index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); - - if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) - return -EINVAL; - if (vma->vm_end < vma->vm_start) - return -EINVAL; - if ((vma->vm_flags & VM_SHARED) == 0) - return -EINVAL; - if (index >= VFIO_PCI_NUM_REGIONS) { - int regnum = index - VFIO_PCI_NUM_REGIONS; - struct vfio_pci_region *region = vdev->region + regnum; - - if (region->ops && region->ops->mmap && - (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) - return region->ops->mmap(vdev, region, vma); - return -EINVAL; - } - if (index >= VFIO_PCI_ROM_REGION_INDEX) - return -EINVAL; - if (!vdev->bar_mmap_supported[index]) - return -EINVAL; - - phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); - req_len = vma->vm_end - vma->vm_start; - pgoff = vma->vm_pgoff & - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); - req_start = pgoff << PAGE_SHIFT; - - if (req_start + req_len > phys_len) - return -EINVAL; - - /* - * Even though we don't make use of the barmap for the mmap, - * we need to request the region and the barmap tracks that. - */ - if (!vdev->barmap[index]) { - ret = pci_request_selected_regions(pdev, - 1 << index, "vfio-pci"); - if (ret) - return ret; - - vdev->barmap[index] = pci_iomap(pdev, index, 0); - if (!vdev->barmap[index]) { - pci_release_selected_regions(pdev, 1 << index); - return -ENOMEM; - } - } - - vma->vm_private_data = vdev; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; - - /* - * See remap_pfn_range(), called from vfio_pci_fault() but we can't - * change vm_flags within the fault handler. Set them now. 
- */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_ops = &vfio_pci_mmap_ops; - - return 0; -} - -static void vfio_pci_request(void *device_data, unsigned int count) -{ - struct vfio_pci_device *vdev = device_data; - struct pci_dev *pdev = vdev->pdev; - - mutex_lock(&vdev->igate); - - if (vdev->req_trigger) { - if (!(count % 10)) - pci_notice_ratelimited(pdev, - "Relaying device request to user (#%u)\n", - count); - eventfd_signal(vdev->req_trigger, 1); - } else if (count == 0) { - pci_warn(pdev, - "No device request channel registered, blocked until released by user\n"); - } - - mutex_unlock(&vdev->igate); -} - -static const struct vfio_device_ops vfio_pci_ops = { - .name = "vfio-pci", - .open = vfio_pci_open, - .release = vfio_pci_release, - .ioctl = vfio_pci_ioctl, - .read = vfio_pci_read, - .write = vfio_pci_write, - .mmap = vfio_pci_mmap, - .request = vfio_pci_request, -}; - -static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); -static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); - -static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - struct vfio_pci_device *vdev; - struct iommu_group *group; - int ret; - - if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) - return -EINVAL; - - /* - * Prevent binding to PFs with VFs enabled, this too easily allows - * userspace instance with VFs and PFs from the same device, which - * cannot work. Disabling SR-IOV here would initiate removing the - * VFs, which would unbind the driver, which is prone to blocking - * if that VF is also in use by vfio-pci. Just reject these PFs - * and let the user sort it out. - */ - if (pci_num_vf(pdev)) { - pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); - return -EBUSY; - } - - group = vfio_iommu_group_get(&pdev->dev); - if (!group) - return -EINVAL; - - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) { - vfio_iommu_group_put(group, &pdev->dev); - return -ENOMEM; - } - - vdev->pdev = pdev; - vdev->irq_type = VFIO_PCI_NUM_IRQS; - mutex_init(&vdev->igate); - spin_lock_init(&vdev->irqlock); - mutex_init(&vdev->ioeventfds_lock); - INIT_LIST_HEAD(&vdev->dummy_resources_list); - INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); - init_rwsem(&vdev->memory_lock); - - ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); - if (ret) { - vfio_iommu_group_put(group, &pdev->dev); - kfree(vdev); - return ret; - } - - ret = vfio_pci_reflck_attach(vdev); - if (ret) { - vfio_del_group_dev(&pdev->dev); - vfio_iommu_group_put(group, &pdev->dev); - kfree(vdev); - return ret; - } - - if (vfio_pci_is_vga(pdev)) { - vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); - vga_set_legacy_decoding(pdev, - vfio_pci_set_vga_decode(vdev, false)); - } - - vfio_pci_probe_power_state(vdev); - - if (!disable_idle_d3) { - /* - * pci-core sets the device power state to an unknown value at - * bootup and after being removed from a driver. The only - * transition it allows from this unknown state is to D0, which - * typically happens when a driver calls pci_enable_device(). - * We're not ready to enable the device yet, but we do want to - * be able to get to D3. Therefore first do a D0 transition - * before going to D3. 
- */ - vfio_pci_set_power_state(vdev, PCI_D0); - vfio_pci_set_power_state(vdev, PCI_D3hot); - } - - return ret; -} - -static void vfio_pci_remove(struct pci_dev *pdev) -{ - struct vfio_pci_device *vdev; - - vdev = vfio_del_group_dev(&pdev->dev); - if (!vdev) - return; - - vfio_pci_reflck_put(vdev->reflck); - - vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); - kfree(vdev->region); - mutex_destroy(&vdev->ioeventfds_lock); - - if (!disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D0); - - kfree(vdev->pm_save); - kfree(vdev); - - if (vfio_pci_is_vga(pdev)) { - vga_client_register(pdev, NULL, NULL, NULL); - vga_set_legacy_decoding(pdev, - VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | - VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM); - } -} - -static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, - pci_channel_state_t state) -{ - struct vfio_pci_device *vdev; - struct vfio_device *device; - - device = vfio_device_get_from_dev(&pdev->dev); - if (device == NULL) - return PCI_ERS_RESULT_DISCONNECT; - - vdev = vfio_device_data(device); - if (vdev == NULL) { - vfio_device_put(device); - return PCI_ERS_RESULT_DISCONNECT; - } - - mutex_lock(&vdev->igate); - - if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger, 1); - - mutex_unlock(&vdev->igate); - - vfio_device_put(device); - - return PCI_ERS_RESULT_CAN_RECOVER; -} - -static const struct pci_error_handlers vfio_err_handlers = { - .error_detected = vfio_pci_aer_err_detected, -}; - -static struct pci_driver vfio_pci_driver = { - .name = "vfio-pci", - .id_table = NULL, /* only dynamic ids */ - .probe = vfio_pci_probe, - .remove = vfio_pci_remove, - .err_handler = &vfio_err_handlers, -}; - -static DEFINE_MUTEX(reflck_lock); - -static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void) -{ - struct vfio_pci_reflck *reflck; - - reflck = kzalloc(sizeof(*reflck), GFP_KERNEL); - if (!reflck) - return ERR_PTR(-ENOMEM); - - kref_init(&reflck->kref); - mutex_init(&reflck->lock); - - return reflck; -} - -static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck) -{ - kref_get(&reflck->kref); -} - -static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data) -{ - struct vfio_pci_reflck **preflck = data; - struct vfio_device *device; - struct vfio_pci_device *vdev; - - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return 0; - - if (pci_dev_driver(pdev) != &vfio_pci_driver) { - vfio_device_put(device); - return 0; - } - - vdev = vfio_device_data(device); - - if (vdev->reflck) { - vfio_pci_reflck_get(vdev->reflck); - *preflck = vdev->reflck; - vfio_device_put(device); - return 1; - } - - vfio_device_put(device); - return 0; -} - -static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev) -{ - bool slot = !pci_probe_reset_slot(vdev->pdev->slot); - - mutex_lock(&reflck_lock); - - if (pci_is_root_bus(vdev->pdev->bus) || - vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find, - &vdev->reflck, slot) <= 0) - vdev->reflck = vfio_pci_reflck_alloc(); - - mutex_unlock(&reflck_lock); - - return PTR_ERR_OR_ZERO(vdev->reflck); -} - -static void vfio_pci_reflck_release(struct kref *kref) -{ - struct vfio_pci_reflck *reflck = container_of(kref, - struct vfio_pci_reflck, - kref); - - kfree(reflck); - mutex_unlock(&reflck_lock); -} - -static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck) -{ - kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock); -} - -static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) -{ - struct vfio_devices *devs = data; - struct vfio_device *device; 
- struct vfio_pci_device *vdev; - - if (devs->cur_index == devs->max_index) - return -ENOSPC; - - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return -EINVAL; - - if (pci_dev_driver(pdev) != &vfio_pci_driver) { - vfio_device_put(device); - return -EBUSY; - } - - vdev = vfio_device_data(device); - - /* Fault if the device is not unused */ - if (vdev->refcnt) { - vfio_device_put(device); - return -EBUSY; - } - - devs->devices[devs->cur_index++] = device; - return 0; -} - -static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) -{ - struct vfio_devices *devs = data; - struct vfio_device *device; - struct vfio_pci_device *vdev; - - if (devs->cur_index == devs->max_index) - return -ENOSPC; - - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return -EINVAL; - - if (pci_dev_driver(pdev) != &vfio_pci_driver) { - vfio_device_put(device); - return -EBUSY; - } - - vdev = vfio_device_data(device); - - /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. - */ - if (!vfio_pci_zap_and_vma_lock(vdev, true)) { - vfio_device_put(device); - return -EBUSY; - } - - devs->devices[devs->cur_index++] = device; - return 0; -} - -/* - * If a bus or slot reset is available for the provided device and: - * - All of the devices affected by that bus or slot reset are unused - * (!refcnt) - * - At least one of the affected devices is marked dirty via - * needs_reset (such as by lack of FLR support) - * Then attempt to perform that bus or slot reset. Callers are required - * to hold vdev->reflck->lock, protecting the bus/slot reset group from - * concurrent opens. A vfio_device reference is acquired for each device - * to prevent unbinds during the reset operation. - * - * NB: vfio-core considers a group to be viable even if some devices are - * bound to drivers like pci-stub or pcieport. Here we require all devices - * to be bound to vfio_pci since that's the only way we can be sure they - * stay put. - */ -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) -{ - struct vfio_devices devs = { .cur_index = 0 }; - int i = 0, ret = -EINVAL; - bool slot = false; - struct vfio_pci_device *tmp; - - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return; - - if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, - &i, slot) || !i) - return; - - devs.max_index = i; - devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL); - if (!devs.devices) - return; - - if (vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_get_unused_devs, - &devs, slot)) - goto put_devs; - - /* Does at least one need a reset? */ - for (i = 0; i < devs.cur_index; i++) { - tmp = vfio_device_data(devs.devices[i]); - if (tmp->needs_reset) { - ret = pci_reset_bus(vdev->pdev); - break; - } - } - -put_devs: - for (i = 0; i < devs.cur_index; i++) { - tmp = vfio_device_data(devs.devices[i]); - - /* - * If reset was successful, affected devices no longer need - * a reset and we should return all the collateral devices - * to low power. If not successful, we either didn't reset - * the bus or timed out waiting for it, so let's not touch - * the power state. 
- */ - if (!ret) { - tmp->needs_reset = false; - - if (tmp != vdev && !disable_idle_d3) - vfio_pci_set_power_state(tmp, PCI_D3hot); - } - - vfio_device_put(devs.devices[i]); - } - - kfree(devs.devices); -} - -static void __exit vfio_pci_cleanup(void) +static void __exit vfio_pci_cleanup(void) { pci_unregister_driver(&vfio_pci_driver); - vfio_pci_uninit_perm_bits(); } - -static void __init vfio_pci_fill_ids(void) -{ - char *p, *id; - int rc; - - /* no ids passed actually */ - if (ids[0] == '\0') - return; - - /* add ids specified in the module parameter */ - p = ids; - while ((id = strsep(&p, ","))) { - unsigned int vendor, device, subvendor = PCI_ANY_ID, - subdevice = PCI_ANY_ID, class = 0, class_mask = 0; - int fields; - - if (!strlen(id)) - continue; - - fields = sscanf(id, "%x:%x:%x:%x:%x:%x", - &vendor, &device, &subvendor, &subdevice, - &class, &class_mask); - - if (fields < 2) { - pr_warn("invalid id string \"%s\"\n", id); - continue; - } - - rc = pci_add_dynid(&vfio_pci_driver, vendor, device, - subvendor, subdevice, class, class_mask, 0); - if (rc) - pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n", - vendor, device, subvendor, subdevice, - class, class_mask, rc); - else - pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n", - vendor, device, subvendor, subdevice, - class, class_mask); - } -} - -static int __init vfio_pci_init(void) -{ - int ret; - - /* Allocate shared config space permision data used by all devices */ - ret = vfio_pci_init_perm_bits(); - if (ret) - return ret; - - /* Register and scan for devices */ - ret = pci_register_driver(&vfio_pci_driver); - if (ret) - goto out_driver; - - vfio_pci_fill_ids(); - - return 0; - -out_driver: - vfio_pci_uninit_perm_bits(); - return ret; -} - -module_init(vfio_pci_init); module_exit(vfio_pci_cleanup); -MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index bf32997c557ff08112d89b4681bf1a1cfccacae2..6751606c0de6ca51a1847bac86668aff49c0b400 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -26,7 +26,7 @@ #include #include -#include "vfio_pci_private.h" +#include /* Fake capability ID for standard config space */ #define PCI_CAP_ID_BASIC 0 @@ -95,22 +95,22 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ - [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ + [PCI_EXT_CAP_ID_PASID] = PCI_EXT_CAP_PASID_SIZEOF, }; /* * Read/Write Permission Bits - one bit for each bit in capability * Any field can be read if it exists, but what is read depends on - * whether the field is 'virtualized', or just pass thru to the + * whether the field is 'virtualized', or just pass through to the * hardware. Any virtualized field is also virtualized for writes. * Writes are only permitted if they have a 1 bit here. 
*/ struct perm_bits { u8 *virt; /* read/write virtual data, not hw */ u8 *write; /* writeable bits */ - int (*readfn)(struct vfio_pci_device *vdev, int pos, int count, + int (*readfn)(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val); - int (*writefn)(struct vfio_pci_device *vdev, int pos, int count, + int (*writefn)(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val); }; @@ -171,7 +171,7 @@ static int vfio_user_config_write(struct pci_dev *pdev, int offset, return ret; } -static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_default_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -197,7 +197,7 @@ static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_default_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -244,7 +244,7 @@ static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, } /* Allow direct read from hardware, except for capability next pointer */ -static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -269,7 +269,7 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, } /* Raw access skips any kind of virtualization */ -static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -282,7 +282,7 @@ static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -296,7 +296,7 @@ static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, } /* Virt access uses only virtualization */ -static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_virt_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -304,7 +304,7 @@ static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, return count; } -static int vfio_virt_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -396,7 +396,7 @@ static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write) } /* Caller should hold memory_lock semaphore */ -bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev) +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); @@ -413,7 +413,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev) * Restore the *real* BARs after we detect a FLR or backdoor reset. 
* (backdoor = some device specific technique that we didn't catch) */ -static void vfio_bar_restore(struct vfio_pci_device *vdev) +static void vfio_bar_restore(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u32 *rbar = vdev->rbar; @@ -460,34 +460,39 @@ static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar) * Pretend we're hardware and tweak the values of the *virtual* PCI BARs * to reflect the hardware capabilities. This implements BAR sizing. */ -static void vfio_bar_fixup(struct vfio_pci_device *vdev) +static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; int i; - __le32 *bar; + __le32 *vbar; u64 mask; - bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; + if (!vdev->bardirty) + return; + + vbar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; + + for (i = 0; i < PCI_STD_NUM_BARS; i++, vbar++) { + int bar = i + PCI_STD_RESOURCES; - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) { - if (!pci_resource_start(pdev, i)) { - *bar = 0; /* Unmapped by host = unimplemented to user */ + if (!pci_resource_start(pdev, bar)) { + *vbar = 0; /* Unmapped by host = unimplemented to user */ continue; } - mask = ~(pci_resource_len(pdev, i) - 1); + mask = ~(pci_resource_len(pdev, bar) - 1); - *bar &= cpu_to_le32((u32)mask); - *bar |= vfio_generate_bar_flags(pdev, i); + *vbar &= cpu_to_le32((u32)mask); + *vbar |= vfio_generate_bar_flags(pdev, bar); - if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) { - bar++; - *bar &= cpu_to_le32((u32)(mask >> 32)); + if (*vbar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) { + vbar++; + *vbar &= cpu_to_le32((u32)(mask >> 32)); i++; } } - bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; + vbar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; /* * NB. 
REGION_INFO will have reported zero size if we weren't able @@ -497,19 +502,19 @@ static void vfio_bar_fixup(struct vfio_pci_device *vdev) if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; - *bar &= cpu_to_le32((u32)mask); + *vbar &= cpu_to_le32((u32)mask); } else if (pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW) { mask = ~(0x20000 - 1); mask |= PCI_ROM_ADDRESS_ENABLE; - *bar &= cpu_to_le32((u32)mask); + *vbar &= cpu_to_le32((u32)mask); } else - *bar = 0; + *vbar = 0; vdev->bardirty = false; } -static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_basic_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -531,7 +536,7 @@ static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, } /* Test whether BARs match the value we think they should contain */ -static bool vfio_need_bar_restore(struct vfio_pci_device *vdev) +static bool vfio_need_bar_restore(struct vfio_pci_core_device *vdev) { int i = 0, pos = PCI_BASE_ADDRESS_0, ret; u32 bar; @@ -547,7 +552,7 @@ static bool vfio_need_bar_restore(struct vfio_pci_device *vdev) return false; } -static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -687,7 +692,7 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) return 0; } -static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -742,7 +747,7 @@ static int __init init_pci_cap_pm_perm(struct perm_bits *perm) return 0; } -static int vfio_vpd_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_vpd_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -824,7 +829,7 @@ static int __init init_pci_cap_pcix_perm(struct perm_bits *perm) return 0; } -static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -908,7 +913,7 @@ static int __init init_pci_cap_exp_perm(struct perm_bits *perm) return 0; } -static int vfio_af_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -1067,7 +1072,7 @@ int __init vfio_pci_init_perm_bits(void) return ret; } -static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) +static int vfio_find_cap_start(struct vfio_pci_core_device *vdev, int pos) { u8 cap; int base = (pos >= PCI_CFG_SPACE_SIZE) ? 
PCI_CFG_SPACE_SIZE : @@ -1084,7 +1089,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) return pos; } -static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, +static int vfio_msi_config_read(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 *val) { @@ -1104,7 +1109,7 @@ static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, return vfio_default_config_read(vdev, pos, count, perm, offset, val); } -static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, +static int vfio_msi_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) { @@ -1184,7 +1189,7 @@ static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags) } /* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */ -static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) +static int vfio_msi_cap_len(struct vfio_pci_core_device *vdev, u8 pos) { struct pci_dev *pdev = vdev->pdev; int len, ret; @@ -1217,7 +1222,7 @@ static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) } /* Determine extended capability length for VC (2 & 9) and MFVC */ -static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) +static int vfio_vc_cap_len(struct vfio_pci_core_device *vdev, u16 pos) { struct pci_dev *pdev = vdev->pdev; u32 tmp; @@ -1258,7 +1263,7 @@ static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) return len; } -static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) +static int vfio_cap_len(struct vfio_pci_core_device *vdev, u8 cap, u8 pos) { struct pci_dev *pdev = vdev->pdev; u32 dword; @@ -1333,7 +1338,7 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) return 0; } -static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) +static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epos) { struct pci_dev *pdev = vdev->pdev; u8 byte; @@ -1407,7 +1412,7 @@ static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) return 0; } -static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, +static int vfio_fill_vconfig_bytes(struct vfio_pci_core_device *vdev, int offset, int size) { struct pci_dev *pdev = vdev->pdev; @@ -1454,7 +1459,7 @@ static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, return ret; } -static int vfio_cap_init(struct vfio_pci_device *vdev) +static int vfio_cap_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map = vdev->pci_config_map; @@ -1544,11 +1549,302 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) return 0; } -static int vfio_ecap_init(struct vfio_pci_device *vdev) +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_core_device *vdev, + int offset, uint8_t *data, int size) +{ + int ret = 0, data_offset = 0; + + while (size) { + int filled; + + if (size >= 4 && !(offset % 4)) { + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; + u32 dword; + + memcpy(&dword, data + data_offset, 4); + *dwordp = cpu_to_le32(dword); + filled = 4; + } else if (size >= 2 && !(offset % 2)) { + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; + u16 word; + + memcpy(&word, data + data_offset, 2); + *wordp = cpu_to_le16(word); + filled = 2; + } else { + u8 *byte = &vdev->vconfig[offset]; + + memcpy(byte, data + data_offset, 1); + filled = 1; + } + + offset += filled; + data_offset += filled; + size -= filled; + } + + return ret; +} + 
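[Editor's note, not part of the patch: the helpers added below (vfio_pci_get_ecap_content(), vfio_pci_add_pasid_cap(), vfio_pci_add_pri_cap()) synthesize PASID/PRI extended capabilities for VFs and splice them into the guest-visible chain by rewriting the "next capability offset" field, bits 31:20, of the previous header already written into vconfig via vfio_fill_custom_vconfig_bytes(). The standalone user-space sketch that follows only illustrates that header arithmetic, using the same 0xffc mask the patch uses; the macro bodies are copies of the PCI_EXT_CAP_ID/VER/NEXT accessors from include/uapi/linux/pci_regs.h, and the capability ID and offsets are invented for the example.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * PCIe extended capability header layout (same field positions as the
 * PCI_EXT_CAP_ID/VER/NEXT macros in include/uapi/linux/pci_regs.h).
 */
#define EXT_CAP_ID(header)   ((header) & 0x0000ffff)
#define EXT_CAP_VER(header)  (((header) >> 16) & 0xf)
#define EXT_CAP_NEXT(header) (((header) >> 20) & 0xffc)

int main(void)
{
	/* A previous header that currently terminates the list: ID 0x10
	 * (SR-IOV), version 1, next = 0.  Offset of the emulated cap we
	 * want to graft in; extended config offsets are dword aligned. */
	uint32_t prev_hdr = 0x0010 | (1u << 16);
	uint32_t new_off = 0x200;

	/*
	 * Same splice the vfio helpers perform on **prevp (minus the
	 * cpu_to_le32() conversion needed because vconfig is stored
	 * little-endian): clear the stale next-offset bits (the 0xffc
	 * mask skips the two always-zero low bits), then write new_off.
	 */
	prev_hdr &= ~(0xffcU << 20);
	prev_hdr |= new_off << 20;

	assert(EXT_CAP_NEXT(prev_hdr) == new_off);
	assert(EXT_CAP_ID(prev_hdr) == 0x0010 && EXT_CAP_VER(prev_hdr) == 1);
	printf("patched header: 0x%08x, next -> 0x%x\n",
	       (unsigned)prev_hdr, (unsigned)EXT_CAP_NEXT(prev_hdr));
	return 0;
}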
+static int vfio_pci_get_ecap_content(struct pci_dev *pdev,
+				      int cap, int cap_len, u8 *content)
+{
+	int pos, offset, len = cap_len, ret = 0;
+
+	pos = pci_find_ext_capability(pdev, cap);
+	if (!pos)
+		return -EINVAL;
+
+	offset = 0;
+	while (len) {
+		int fetched;
+
+		if (len >= 4 && !(pos % 4)) {
+			u32 *dwordp = (u32 *) (content + offset);
+			u32 dword;
+			__le32 *dwptr = (__le32 *) &dword;
+
+			ret = pci_read_config_dword(pdev, pos, &dword);
+			if (ret)
+				return ret;
+			*dwordp = le32_to_cpu(*dwptr);
+			fetched = 4;
+		} else if (len >= 2 && !(pos % 2)) {
+			u16 *wordp = (u16 *) (content + offset);
+			u16 word;
+			__le16 *wptr = (__le16 *) &word;
+
+			ret = pci_read_config_word(pdev, pos, &word);
+			if (ret)
+				return ret;
+			*wordp = le16_to_cpu(*wptr);
+			fetched = 2;
+		} else {
+			u8 *byte = (u8 *) (content + offset);
+
+			ret = pci_read_config_byte(pdev, pos, byte);
+			if (ret)
+				return ret;
+			fetched = 1;
+		}
+
+		pos += fetched;
+		offset += fetched;
+		len -= fetched;
+	}
+
+	return ret;
+}
+
+struct vfio_pci_pasid_cap_data {
+	u32 id:16;
+	u32 version:4;
+	u32 next:12;
+	union {
+		u16 cap_reg_val;
+		struct {
+			u16 rsv1:1;
+			u16 execs:1;
+			u16 prvs:1;
+			u16 rsv2:5;
+			u16 pasid_bits:5;
+			u16 rsv3:3;
+		};
+	} cap_reg;
+	union {
+		u16 control_reg_val;
+		struct {
+			u16 paside:1;
+			u16 exece:1;
+			u16 prve:1;
+			u16 rsv4:13;
+		};
+	} control_reg;
+};
+
+static int vfio_pci_add_pasid_cap(struct vfio_pci_core_device *vdev,
+				  struct pci_dev *pdev,
+				  u16 epos, u16 *next, __le32 **prevp)
+{
+	u8 *map = vdev->pci_config_map;
+	int ecap = PCI_EXT_CAP_ID_PASID;
+	int len = pci_ext_cap_length[ecap];
+	struct vfio_pci_pasid_cap_data pasid_cap;
+	struct vfio_pci_pasid_cap_data vpasid_cap;
+	int ret;
+
+	/*
+	 * If no cap filled in this function, should make sure the next
+	 * pointer points to current epos.
+	 */
+	*next = epos;
+
+	if (!len) {
+		pr_info("%s: VF %s hiding PASID capability\n",
+			__func__, dev_name(&vdev->pdev->dev));
+		ret = 0;
+		goto out;
+	}
+
+	/* Add PASID capability */
+	ret = vfio_pci_get_ecap_content(pdev, ecap,
+					len, (u8 *)&pasid_cap);
+	if (ret)
+		goto out;
+
+	if (!pasid_cap.control_reg.paside) {
+		pr_debug("%s: its PF's PASID capability is not enabled\n",
+			 dev_name(&vdev->pdev->dev));
+		ret = 0;
+		goto out;
+	}
+
+	memcpy(&vpasid_cap, &pasid_cap, len);
+
+	vpasid_cap.next = 0;
+	/* clear the control reg for guest */
+	memset(&vpasid_cap.control_reg, 0x0,
+	       sizeof(vpasid_cap.control_reg));
+
+	memset(map + epos, vpasid_cap.id, len);
+	ret = vfio_fill_custom_vconfig_bytes(vdev, epos,
+					     (u8 *)&vpasid_cap, len);
+	if (!ret) {
+		/*
+		 * Successfully filled in PASID cap, update
+		 * the next offset in previous cap header,
+		 * and also update caller about the offset
+		 * of next cap if any.
+		 */
+		u32 val = epos;
+		**prevp &= cpu_to_le32(~(0xffcU << 20));
+		**prevp |= cpu_to_le32(val << 20);
+		*prevp = (__le32 *)&vdev->vconfig[epos];
+		*next = epos + len;
+	}
+
+out:
+	return ret;
+}
+
+struct vfio_pci_pri_cap_data {
+	u32 id:16;
+	u32 version:4;
+	u32 next:12;
+	union {
+		u16 control_reg_val;
+		struct {
+			u16 enable:1;
+			u16 reset:1;
+			u16 rsv1:14;
+		};
+	} control_reg;
+	union {
+		u16 status_reg_val;
+		struct {
+			u16 rf:1;
+			u16 uprgi:1;
+			u16 rsv2:6;
+			u16 stop:1;
+			u16 rsv3:6;
+			u16 pasid_required:1;
+		};
+	} status_reg;
+	u32 prq_capacity;
+	u32 prq_quota;
+};
+
+static int vfio_pci_add_pri_cap(struct vfio_pci_core_device *vdev,
+				struct pci_dev *pdev,
+				u16 epos, u16 *next, __le32 **prevp)
+{
+	u8 *map = vdev->pci_config_map;
+	int ecap = PCI_EXT_CAP_ID_PRI;
+	int len = pci_ext_cap_length[ecap];
+	struct vfio_pci_pri_cap_data pri_cap;
+	struct vfio_pci_pri_cap_data vpri_cap;
+	int ret;
+
+	/*
+	 * If no cap filled in this function, should make sure the next
+	 * pointer points to current epos.
+	 */
+	*next = epos;
+
+	if (!len) {
+		pr_info("%s: VF %s hiding PRI capability\n",
+			__func__, dev_name(&vdev->pdev->dev));
+		ret = 0;
+		goto out;
+	}
+
+	/* Add PRI capability */
+	ret = vfio_pci_get_ecap_content(pdev, ecap,
+					len, (u8 *)&pri_cap);
+	if (ret)
+		goto out;
+
+	if (!pri_cap.control_reg.enable) {
+		pr_debug("%s: its PF's PRI capability is not enabled\n",
+			 dev_name(&vdev->pdev->dev));
+		ret = 0;
+		goto out;
+	}
+
+	memcpy(&vpri_cap, &pri_cap, len);
+
+	vpri_cap.next = 0;
+	/* clear the control reg for guest */
+	memset(&vpri_cap.control_reg, 0x0,
+	       sizeof(vpri_cap.control_reg));
+
+	memset(map + epos, vpri_cap.id, len);
+	ret = vfio_fill_custom_vconfig_bytes(vdev, epos,
+					     (u8 *)&vpri_cap, len);
+	if (!ret) {
+		/*
+		 * Successfully filled in PRI cap, update
+		 * the next offset in previous cap header,
+		 * and also update caller about the offset
+		 * of next cap if any.
+ */ + u32 val = epos; + **prevp &= cpu_to_le32(~(0xffcU << 20)); + **prevp |= cpu_to_le32(val << 20); + *prevp = (__le32 *)&vdev->vconfig[epos]; + *next = epos + len; + } + +out: + return ret; +} + +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, u16 start_epos, __le32 *prev) +{ + __le32 *__prev = prev; + u16 epos = start_epos, epos_next = start_epos; + int ret = 0; + + /* Add PASID capability*/ + ret = vfio_pci_add_pasid_cap(vdev, pdev, epos, + &epos_next, &__prev); + if (ret) + return ret; + + /* Add PRI capability */ + epos = epos_next; + ret = vfio_pci_add_pri_cap(vdev, pdev, epos, + &epos_next, &__prev); + + return ret; +} + +static int vfio_ecap_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map = vdev->pci_config_map; - u16 epos; + u16 epos, epos_max; __le32 *prev = NULL; int loops, ret, ecaps = 0; @@ -1556,6 +1852,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) return 0; epos = PCI_CFG_SPACE_SIZE; + epos_max = PCI_CFG_SPACE_SIZE; loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; @@ -1580,6 +1877,9 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) } } + if (epos_max <= (epos + len)) + epos_max = epos + len; + if (!len) { pci_info(pdev, "%s: hiding ecap %#x@%#x\n", __func__, ecap, epos); @@ -1639,6 +1939,18 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) if (!ecaps) *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; +#ifdef CONFIG_PCI_ATS + if (pdev->is_virtfn) { + struct pci_dev *physfn = pdev->physfn; + + ret = vfio_pci_add_emulated_cap_for_vf(vdev, + physfn, epos_max, prev); + if (ret) + pr_info("%s, failed to add special caps for VF %s\n", + __func__, dev_name(&vdev->pdev->dev)); + } +#endif + return 0; } @@ -1664,7 +1976,7 @@ static const struct pci_device_id known_bogus_vf_intx_pin[] = { * for each area requiring emulated bits, but the array of pointers * would be comparable in size (at least for standard config space). */ -int vfio_config_init(struct vfio_pci_device *vdev) +int vfio_config_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; u8 *map, *vconfig; @@ -1768,7 +2080,7 @@ int vfio_config_init(struct vfio_pci_device *vdev) return pcibios_err_to_errno(ret); } -void vfio_config_free(struct vfio_pci_device *vdev) +void vfio_config_free(struct vfio_pci_core_device *vdev) { kfree(vdev->vconfig); vdev->vconfig = NULL; @@ -1785,7 +2097,7 @@ void vfio_config_free(struct vfio_pci_device *vdev) * Find the remaining number of bytes in a dword that match the given * position. Stop at either the end of the capability or the dword boundary. 
*/ -static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, +static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev, loff_t pos) { u8 cap = vdev->pci_config_map[pos]; @@ -1797,7 +2109,18 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, return i; } -static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) +{ +#ifdef CONFIG_PCI_ATS + return (pdev->is_virtfn && + (cap_id == PCI_EXT_CAP_ID_PASID || + cap_id == PCI_EXT_CAP_ID_PRI)); +#else + return false; +#endif +} + +static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { struct pci_dev *pdev = vdev->pdev; @@ -1830,7 +2153,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, if (cap_id == PCI_CAP_ID_INVALID) { perm = &unassigned_perms; cap_start = *ppos; - } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT || + vfio_pci_need_virt_perm(pdev, cap_id)) { perm = &virt_perms; cap_start = *ppos; } else { @@ -1880,7 +2204,7 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, return ret; } -ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, +ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { size_t done = 0; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c new file mode 100644 index 0000000000000000000000000000000000000000..3cdcff7b2aa776b4a97aa2603bba5625a1452cb2 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -0,0 +1,2534 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DRIVER_AUTHOR "Alex Williamson " +#define DRIVER_DESC "core driver for VFIO based PCI devices" + +static bool nointxmask; +static bool disable_vga; +static bool disable_idle_d3; + +static inline bool vfio_vga_disabled(void) +{ +#ifdef CONFIG_VFIO_PCI_VGA + return disable_vga; +#else + return true; +#endif +} + +/* + * Our VGA arbiter participation is limited since we don't know anything + * about the device itself. However, if the device is the only VGA device + * downstream of a bridge and VFIO VGA support is disabled, then we can + * safely return legacy VGA IO and memory as not decoded since the user + * has no way to get to it and routing can be disabled externally at the + * bridge. 
+ */ +static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) +{ + struct vfio_pci_core_device *vdev = opaque; + struct pci_dev *tmp = NULL, *pdev = vdev->pdev; + unsigned char max_busnr; + unsigned int decodes; + + if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) + return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | + VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; + + max_busnr = pci_bus_max_busnr(pdev->bus); + decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; + + while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { + if (tmp == pdev || + pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || + pci_is_root_bus(tmp->bus)) + continue; + + if (tmp->bus->number >= pdev->bus->number && + tmp->bus->number <= max_busnr) { + pci_dev_put(tmp); + decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; + break; + } + } + + return decodes; +} + +static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) +{ + struct resource *res; + int i; + struct vfio_pci_dummy_resource *dummy_res; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + int bar = i + PCI_STD_RESOURCES; + + res = &vdev->pdev->resource[bar]; + + if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) + goto no_mmap; + + if (!(res->flags & IORESOURCE_MEM)) + goto no_mmap; + + /* + * The PCI core shouldn't set up a resource with a + * type but zero size. But there may be bugs that + * cause us to do that. + */ + if (!resource_size(res)) + goto no_mmap; + + if (resource_size(res) >= PAGE_SIZE) { + vdev->bar_mmap_supported[bar] = true; + continue; + } + + if (!(res->start & ~PAGE_MASK)) { + /* + * Add a dummy resource to reserve the remainder + * of the exclusive page in case that hot-add + * device's bar is assigned into it. + */ + dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); + if (dummy_res == NULL) + goto no_mmap; + + dummy_res->resource.name = "vfio sub-page reserved"; + dummy_res->resource.start = res->end + 1; + dummy_res->resource.end = res->start + PAGE_SIZE - 1; + dummy_res->resource.flags = res->flags; + if (request_resource(res->parent, + &dummy_res->resource)) { + kfree(dummy_res); + goto no_mmap; + } + dummy_res->index = bar; + list_add(&dummy_res->res_next, + &vdev->dummy_resources_list); + vdev->bar_mmap_supported[bar] = true; + continue; + } + /* + * Here we don't handle the case when the BAR is not page + * aligned because we can't expect the BAR will be + * assigned into the same location in a page in guest + * when we passthrough the BAR. And it's hard to access + * this BAR in userspace because we have no way to get + * the BAR's location in a page. + */ +no_mmap: + vdev->bar_mmap_supported[bar] = false; + } +} + +struct vfio_pci_group_info; +static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); +static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, + struct vfio_pci_group_info *groups); + +/* + * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND + * _and_ the ability detect when the device is asserting INTx via PCI_STATUS. + * If a device implements the former but not the latter we would typically + * expect broken_intx_masking be set and require an exclusive interrupt. + * However since we do have control of the device's ability to assert INTx, + * we can instead pretend that the device does not implement INTx, virtualizing + * the pin register to report zero and maintaining DisINTx set on the host. 
+ */ +static bool vfio_pci_nointx(struct pci_dev *pdev) +{ + switch (pdev->vendor) { + case PCI_VENDOR_ID_INTEL: + switch (pdev->device) { + /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */ + case 0x1572: + case 0x1574: + case 0x1580 ... 0x1581: + case 0x1583 ... 0x158b: + case 0x37d0 ... 0x37d2: + /* X550 */ + case 0x1563: + return true; + default: + return false; + } + } + + return false; +} + +static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + u16 pmcsr; + + if (!pdev->pm_cap) + return; + + pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr); + + vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); +} + +/* + * pci_set_power_state() wrapper handling devices which perform a soft reset on + * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev, + * restore when returned to D0. Saved separately from pci_saved_state for use + * by PM capability emulation and separately from pci_dev internal saved state + * to avoid it being overwritten and consumed around other resets. + */ +int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) +{ + struct pci_dev *pdev = vdev->pdev; + bool needs_restore = false, needs_save = false; + int ret; + + if (vdev->needs_pm_restore) { + if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { + pci_save_state(pdev); + needs_save = true; + } + + if (pdev->current_state >= PCI_D3hot && state <= PCI_D0) + needs_restore = true; + } + + ret = pci_set_power_state(pdev, state); + + if (!ret) { + /* D3 might be unsupported via quirk, skip unless in D3 */ + if (needs_save && pdev->current_state >= PCI_D3hot) { + vdev->pm_save = pci_store_saved_state(pdev); + } else if (needs_restore) { + pci_load_and_free_saved_state(pdev, &vdev->pm_save); + pci_restore_state(pdev); + } + } + + return ret; +} + +static void vfio_pci_dma_fault_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + kfree(vdev->fault_pages); +} + +static int vfio_pci_dma_fault_mmap(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma) +{ + u64 phys_len, req_len, pgoff, req_start; + unsigned long long addr; + unsigned int ret; + + phys_len = region->size; + + req_len = vma->vm_end - vma->vm_start; + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + req_start = pgoff << PAGE_SHIFT; + + /* only the second page of the producer fault region is mmappable */ + if (req_start < PAGE_SIZE) + return -EINVAL; + + if (req_start + req_len > phys_len) + return -EINVAL; + + addr = virt_to_phys(vdev->fault_pages); + vma->vm_private_data = vdev; + vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff; + + ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + req_len, vma->vm_page_prot); + return ret; +} + +static int vfio_pci_dma_fault_add_capability(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps) +{ + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + struct vfio_region_info_cap_fault cap = { + .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT, + .header.version = 1, + .version = 1, + }; + size_t size = sizeof(*sparse) + sizeof(*sparse->areas); + int ret; + + ret = vfio_info_add_capability(caps, &cap.header, sizeof(cap)); + if (ret) + return ret; + + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = 1; + 
sparse->areas[0].offset = PAGE_SIZE; + sparse->areas[0].size = region->size - PAGE_SIZE; + + ret = vfio_info_add_capability(caps, &sparse->header, size); + + /* Regardless of add status, needs to free sparse to avoid memleak */ + kfree(sparse); + + return ret; +} + +static const struct vfio_pci_regops vfio_pci_dma_fault_regops = { + .rw = vfio_pci_dma_fault_rw, + .release = vfio_pci_dma_fault_release, + .mmap = vfio_pci_dma_fault_mmap, + .add_capability = vfio_pci_dma_fault_add_capability, +}; + +static void vfio_pci_mregion_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ +} + +static int vfio_pci_mregion_mmap(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma) +{ + u64 phys_len, req_len, pgoff, req_start; + unsigned long long addr; + unsigned int ret; + + phys_len = region->size; + + req_len = vma->vm_end - vma->vm_start; + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + req_start = pgoff << PAGE_SHIFT; + + /* only the second page of the producer fault region is mmappable */ + if (req_start < PAGE_SIZE) + return -EINVAL; + + if (req_start + req_len > phys_len) + return -EINVAL; + + addr = virt_to_phys(vdev->mig_pages); + vma->vm_private_data = vdev; + vma->vm_pgoff = (addr >> PAGE_SHIFT) + pgoff; + + printk("%s, mmap addr %llx\n", __func__, addr); + ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + req_len, vma->vm_page_prot); + return ret; +} + +static int vfio_pci_mregion_add_capability(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps) +{ + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + size_t size = sizeof(*sparse) + sizeof(*sparse->areas); + int ret; + + /* TODO: Look into re-enabling it */ + return 0; + + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = 1; + /* first page of the region is for vfio_device_migration_info */ + sparse->areas[0].offset = PAGE_SIZE; + sparse->areas[0].size = region->size - PAGE_SIZE; + + ret = vfio_info_add_capability(caps, &sparse->header, size); + if (ret) + kfree(sparse); + + return ret; +} + +static const struct vfio_pci_regops vfio_pci_mig_regops = { + .rw = vfio_pci_mregion_rw, + .release = vfio_pci_mregion_release, + .mmap = vfio_pci_mregion_mmap, + .add_capability = vfio_pci_mregion_add_capability, +}; + +int vfio_pci_migration_init(struct vfio_pci_core_device *vdev, uint32_t state_size) +{ + struct vfio_device_migration_info *mig_info; + size_t size; + int ret; + + mutex_init(&vdev->mig_lock); + + size = ALIGN(sizeof(*mig_info) + state_size, PAGE_SIZE); + + vdev->mig_pages = kzalloc(size, GFP_KERNEL); + if (!vdev->mig_pages) + return -ENOMEM; + + mig_info = (struct vfio_device_migration_info *) vdev->mig_pages; + ret = vfio_pci_register_dev_region(vdev, + VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, + &vfio_pci_mig_regops, size, + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, + //VFIO_REGION_INFO_FLAG_MMAP, + vdev->mig_pages); + if (ret) + goto out; + + mig_info->data_offset = sizeof(*mig_info); + pr_info("%s, mig_info->data_offset: 0x%lx\n", __func__, (unsigned long) mig_info->data_offset); + return 0; +out: + kfree(vdev->mig_pages); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_pci_migration_init); + + +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault *fault, void *data) +{ + struct vfio_pci_core_device *vdev = 
(struct vfio_pci_core_device *)data; + struct vfio_region_dma_fault *reg = + (struct vfio_region_dma_fault *)vdev->fault_pages; + int head, tail, size, ext_irq_index; + struct iommu_fault *new; + int ret = 0; + + if (WARN_ON(!reg)) + return -ENOENT; + + new = (struct iommu_fault *)(vdev->fault_pages + reg->offset + + reg->head * reg->entry_size); + + /* We need to send page request and relavent unrecoverable fault to userspace */ + if (fault->type != IOMMU_FAULT_DMA_UNRECOV && + fault->type != IOMMU_FAULT_PAGE_REQ) + return -ENOENT; + + mutex_lock(&vdev->fault_queue_lock); + + dev_dbg(&vdev->pdev->dev, "%s, enque fault event\n", __func__); + head = reg->head; + tail = reg->tail; + size = reg->nb_entries; + + if (CIRC_SPACE(head, tail, size) < 1) { + ret = -ENOSPC; + goto unlock; + } + + *new = *fault; + reg->head = (head + 1) % size; +unlock: + mutex_unlock(&vdev->fault_queue_lock); + if (ret) + return ret; + + ext_irq_index = vfio_pci_get_ext_irq_index(vdev, VFIO_IRQ_TYPE_NESTED, + VFIO_IRQ_SUBTYPE_DMA_FAULT); + if (ext_irq_index < 0) + return -EINVAL; + + mutex_lock(&vdev->igate); + dev_dbg(&vdev->pdev->dev, "%s, signal userspace!\n", __func__); + if (vdev->ext_irqs[ext_irq_index].trigger) + eventfd_signal(vdev->ext_irqs[ext_irq_index].trigger, 1); + mutex_unlock(&vdev->igate); + return 0; +} + +#define DMA_FAULT_RING_LENGTH 512 + +int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev, bool register_fault) +{ + struct vfio_region_dma_fault *header; + size_t size; + int ret; + + mutex_init(&vdev->fault_queue_lock); + + /* + * We provision 1 page for the header and space for + * DMA_FAULT_RING_LENGTH fault records in the ring buffer. + */ + size = ALIGN(sizeof(struct iommu_fault) * + DMA_FAULT_RING_LENGTH, PAGE_SIZE) + PAGE_SIZE; + + vdev->fault_pages = kzalloc(size, GFP_KERNEL); + if (!vdev->fault_pages) + return -ENOMEM; + + ret = vfio_pci_register_dev_region(vdev, + VFIO_REGION_TYPE_NESTED, + VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, + &vfio_pci_dma_fault_regops, size, + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP, + vdev->fault_pages); + if (ret) + goto out; + + header = (struct vfio_region_dma_fault *)vdev->fault_pages; + header->entry_size = sizeof(struct iommu_fault); + header->nb_entries = DMA_FAULT_RING_LENGTH; + header->offset = PAGE_SIZE; + + if (register_fault) { + ret = iommu_register_device_fault_handler(&vdev->pdev->dev, + vfio_pci_iommu_dev_fault_handler, + vdev); + if (ret) /* the dma fault region is freed in vfio_pci_disable() */ + goto out; + } + + ret = vfio_pci_register_irq(vdev, VFIO_IRQ_TYPE_NESTED, + VFIO_IRQ_SUBTYPE_DMA_FAULT, + VFIO_IRQ_INFO_EVENTFD); + if (ret) /* the fault handler is also freed in vfio_pci_disable() */ + goto out; + + return 0; +out: + kfree(vdev->fault_pages); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_pci_dma_fault_init); + +int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + u16 cmd; + u8 msix_pos; + + vfio_pci_set_power_state(vdev, PCI_D0); + + /* Don't allow our initial saved state to include busmaster */ + pci_clear_master(pdev); + + ret = pci_enable_device(pdev); + if (ret) + return ret; + + /* If reset fails because of the device lock, fail this path entirely */ + ret = pci_try_reset_function(pdev); + if (ret == -EAGAIN) { + pci_disable_device(pdev); + return ret; + } + + vdev->reset_works = !ret; + pci_save_state(pdev); + vdev->pci_saved_state = pci_store_saved_state(pdev); + if (!vdev->pci_saved_state) + pci_dbg(pdev, "%s: 
Couldn't store saved state\n", __func__); + + if (likely(!nointxmask)) { + if (vfio_pci_nointx(pdev)) { + pci_info(pdev, "Masking broken INTx support\n"); + vdev->nointx = true; + pci_intx(pdev, 0); + } else + vdev->pci_2_3 = pci_intx_mask_supported(pdev); + } + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { + cmd &= ~PCI_COMMAND_INTX_DISABLE; + pci_write_config_word(pdev, PCI_COMMAND, cmd); + } + + ret = vfio_config_init(vdev); + if (ret) { + kfree(vdev->pci_saved_state); + vdev->pci_saved_state = NULL; + pci_disable_device(pdev); + return ret; + } + + msix_pos = pdev->msix_cap; + if (msix_pos) { + u16 flags; + u32 table; + + pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); + pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); + + vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; + vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; + vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; + } else + vdev->msix_bar = 0xFF; + + if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) + vdev->has_vga = true; + + ret = vfio_pci_dma_fault_init(vdev, true); + if (ret) { + pci_disable_device(pdev); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_enable); + +void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_dummy_resource *dummy_res, *tmp; + struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; + int i, bar; + + /* For needs_reset */ + lockdep_assert_held(&vdev->vdev.dev_set->lock); + + /* Stop the device from further DMA */ + pci_clear_master(pdev); + + if (vdev->irq_type < VFIO_PCI_NUM_IRQS) + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); + + WARN_ON(iommu_unregister_device_fault_handler(&vdev->pdev->dev)); + + if (vdev->ext_irqs) { + for (i = 0; i < vdev->num_ext_irqs; i++) + if (vdev->ext_irqs[i].trigger) + vfio_pci_set_irqs_ioctl( + vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + VFIO_PCI_NUM_IRQS + i, 0, 0, NULL); + vdev->num_ext_irqs = 0; + kfree(vdev->ext_irqs); + vdev->ext_irqs = NULL; + } + + /* Device closed, don't need mutex here */ + list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, + &vdev->ioeventfds_list, next) { + vfio_virqfd_disable(&ioeventfd->virqfd); + list_del(&ioeventfd->next); + kfree(ioeventfd); + } + vdev->ioeventfds_nr = 0; + + vdev->virq_disabled = false; + + for (i = 0; i < vdev->num_regions; i++) + vdev->region[i].ops->release(vdev, &vdev->region[i]); + + vdev->num_regions = 0; + kfree(vdev->region); + vdev->region = NULL; /* don't krealloc a freed pointer */ + + vfio_config_free(vdev); + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + bar = i + PCI_STD_RESOURCES; + if (!vdev->barmap[bar]) + continue; + pci_iounmap(pdev, vdev->barmap[bar]); + pci_release_selected_regions(pdev, 1 << bar); + vdev->barmap[bar] = NULL; + } + + list_for_each_entry_safe(dummy_res, tmp, + &vdev->dummy_resources_list, res_next) { + list_del(&dummy_res->res_next); + release_resource(&dummy_res->resource); + kfree(dummy_res); + } + + vdev->needs_reset = true; + + /* + * If we have saved state, restore it. If we can reset the device, + * even better. Resetting with current state seems better than + * nothing, but saving and restoring current state without reset + * is just busy work. 
+ */ + if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { + pci_info(pdev, "%s: Couldn't reload saved state\n", __func__); + + if (!vdev->reset_works) + goto out; + + pci_save_state(pdev); + } + + /* + * Disable INTx and MSI, presumably to avoid spurious interrupts + * during reset. Stolen from pci_reset_function() + */ + pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); + + /* + * Try to get the locks ourselves to prevent a deadlock. The + * success of this is dependent on being able to lock the device, + * which is not always possible. + * We can not use the "try" reset interface here, which will + * overwrite the previously restored configuration information. + */ + if (vdev->reset_works && pci_dev_trylock(pdev)) { + if (!__pci_reset_function_locked(pdev)) + vdev->needs_reset = false; + pci_dev_unlock(pdev); + } + + pci_restore_state(pdev); +out: + pci_disable_device(pdev); + + if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) + vfio_pci_set_power_state(vdev, PCI_D3hot); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_disable); + +static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *physfn = pci_physfn(vdev->pdev); + struct vfio_device *pf_dev; + + if (!vdev->pdev->is_virtfn) + return NULL; + + pf_dev = vfio_device_get_from_dev(&physfn->dev); + if (!pf_dev) + return NULL; + + if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) { + vfio_device_put(pf_dev); + return NULL; + } + + return container_of(pf_dev, struct vfio_pci_core_device, vdev); +} + +static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int val) +{ + struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); + + if (!pf_vdev) + return; + + mutex_lock(&pf_vdev->vf_token->lock); + pf_vdev->vf_token->users += val; + WARN_ON(pf_vdev->vf_token->users < 0); + mutex_unlock(&pf_vdev->vf_token->lock); + + vfio_device_put(&pf_vdev->vdev); +} + +void vfio_pci_core_close_device(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + + vfio_pci_vf_token_user_add(vdev, -1); + vfio_spapr_pci_eeh_release(vdev->pdev); + vfio_pci_core_disable(vdev); + + mutex_lock(&vdev->igate); + if (vdev->err_trigger) { + eventfd_ctx_put(vdev->err_trigger); + vdev->err_trigger = NULL; + } + if (vdev->req_trigger) { + eventfd_ctx_put(vdev->req_trigger); + vdev->req_trigger = NULL; + } + mutex_unlock(&vdev->igate); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); + +void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) +{ + vfio_pci_probe_mmaps(vdev); + vfio_spapr_pci_eeh_open(vdev->pdev); + vfio_pci_vf_token_user_add(vdev, 1); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); + +static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) +{ + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { + u8 pin; + + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || + vdev->nointx || vdev->pdev->is_virtfn) + return 0; + + pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); + + return pin ? 
1 : 0; + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { + u8 pos; + u16 flags; + + pos = vdev->pdev->msi_cap; + if (pos) { + pci_read_config_word(vdev->pdev, + pos + PCI_MSI_FLAGS, &flags); + return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); + } + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { + u8 pos; + u16 flags; + + pos = vdev->pdev->msix_cap; + if (pos) { + pci_read_config_word(vdev->pdev, + pos + PCI_MSIX_FLAGS, &flags); + + return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; + } + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { + if (pci_is_pcie(vdev->pdev)) + return 1; + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { + return 1; + } else if (irq_type >= VFIO_PCI_NUM_IRQS && + irq_type < VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs) { + return 1; + } + + return 0; +} + +static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) +{ + (*(int *)data)++; + return 0; +} + +struct vfio_pci_fill_info { + int max; + int cur; + struct vfio_pci_dependent_device *devices; +}; + +static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_fill_info *fill = data; + struct iommu_group *iommu_group; + + if (fill->cur == fill->max) + return -EAGAIN; /* Something changed, try again */ + + iommu_group = iommu_group_get(&pdev->dev); + if (!iommu_group) + return -EPERM; /* Cannot reset non-isolated devices */ + + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); + fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); + fill->devices[fill->cur].bus = pdev->bus->number; + fill->devices[fill->cur].devfn = pdev->devfn; + fill->cur++; + iommu_group_put(iommu_group); + return 0; +} + +struct vfio_pci_group_info { + int count; + struct vfio_group **groups; +}; + +static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) +{ + for (; pdev; pdev = pdev->bus->self) + if (pdev->bus == slot->bus) + return (pdev->slot == slot); + return false; +} + +struct vfio_pci_walk_info { + int (*fn)(struct pci_dev *, void *data); + void *data; + struct pci_dev *pdev; + bool slot; + int ret; +}; + +static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_walk_info *walk = data; + + if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) + walk->ret = walk->fn(pdev, walk->data); + + return walk->ret; +} + +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, + int (*fn)(struct pci_dev *, + void *data), void *data, + bool slot) +{ + struct vfio_pci_walk_info walk = { + .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, + }; + + pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); + + return walk.ret; +} + +static int msix_mmappable_cap(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_info_cap_header header = { + .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, + .version = 1 + }; + + return vfio_info_add_capability(caps, &header, sizeof(header)); +} + +int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data) +{ + struct vfio_pci_region *region; + + region = krealloc(vdev->region, + (vdev->num_regions + 1) * sizeof(*region), + GFP_KERNEL); + if (!region) + return -ENOMEM; + + vdev->region = region; + vdev->region[vdev->num_regions].type = type; + vdev->region[vdev->num_regions].subtype = subtype; + vdev->region[vdev->num_regions].ops = ops; + vdev->region[vdev->num_regions].size = size; + vdev->region[vdev->num_regions].flags = flags; + 
vdev->region[vdev->num_regions].data = data; + + vdev->num_regions++; + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region); + +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + unsigned long minsz; + + if (cmd == VFIO_DEVICE_GET_INFO) { + struct vfio_device_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; + int ret; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.argsz >= capsz) { + minsz = capsz; + info.cap_offset = 0; + } + + info.flags = VFIO_DEVICE_FLAGS_PCI; + + if (vdev->reset_works) + info.flags |= VFIO_DEVICE_FLAGS_RESET; + + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs; + + ret = vfio_pci_info_zdev_add_caps(vdev, &caps); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); + return ret; + } + + if (caps.size) { + info.flags |= VFIO_DEVICE_FLAGS_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; + + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { + struct pci_dev *pdev = vdev->pdev; + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + int i, ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pdev->cfg_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { + info.flags = 0; + break; + } + + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info.index]) { + info.flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info.index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, &caps); + if (ret) + return ret; + } + } + + break; + case VFIO_PCI_ROM_REGION_INDEX: + { + void __iomem *io; + size_t size; + u16 cmd; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = 0; + + /* Report the BAR size, not the ROM size */ + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { + /* Shadow ROMs appear as PCI option ROMs */ + if (pdev->resource[PCI_ROM_RESOURCE].flags & + IORESOURCE_ROM_SHADOW) + info.size = 0x20000; + else + break; + } + + /* + * Is it really there? Enable memory decode for + * implicit access in pci_map_rom(). 
+ */ + cmd = vfio_pci_memory_lock_and_enable(vdev); + io = pci_map_rom(pdev, &size); + if (io) { + info.flags = VFIO_REGION_INFO_FLAG_READ; + pci_unmap_rom(pdev, io); + } else { + info.size = 0; + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); + + break; + } + case VFIO_PCI_VGA_REGION_INDEX: + if (!vdev->has_vga) + return -EINVAL; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0xc0000; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + + break; + default: + { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 }; + + if (info.index >= + VFIO_PCI_NUM_REGIONS + vdev->num_regions) + return -EINVAL; + info.index = array_index_nospec(info.index, + VFIO_PCI_NUM_REGIONS + + vdev->num_regions); + + i = info.index - VFIO_PCI_NUM_REGIONS; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vdev->region[i].size; + info.flags = vdev->region[i].flags; + + cap_type.type = vdev->region[i].type; + cap_type.subtype = vdev->region[i].subtype; + + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + + if (vdev->region[i].ops->add_capability) { + ret = vdev->region[i].ops->add_capability(vdev, + &vdev->region[i], &caps); + if (ret) + return ret; + } + } + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; + + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { + struct vfio_irq_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; + + minsz = offsetofend(struct vfio_irq_info, count); + + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_irq_info, cap_offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || + info.index >= VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs) + return -EINVAL; + + if (info.argsz >= capsz) + minsz = capsz; + + info.flags = VFIO_IRQ_INFO_EVENTFD; + + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX: + info.flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + break; + case VFIO_PCI_MSI_IRQ_INDEX ... 
VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + info.flags |= VFIO_IRQ_INFO_NORESIZE; + break; + case VFIO_PCI_ERR_IRQ_INDEX: + info.flags |= VFIO_IRQ_INFO_NORESIZE; + if (!pci_is_pcie(vdev->pdev)) + return -EINVAL; + break; + fallthrough; + default: + { + struct vfio_irq_info_cap_type cap_type = { + .header.id = VFIO_IRQ_INFO_CAP_TYPE, + .header.version = 1 }; + int ret, i; + if (info.index >= VFIO_PCI_NUM_IRQS + + vdev->num_ext_irqs) + return -EINVAL; + info.index = array_index_nospec(info.index, + VFIO_PCI_NUM_IRQS + + vdev->num_ext_irqs); + i = info.index - VFIO_PCI_NUM_IRQS; + + info.flags = vdev->ext_irqs[i].flags; + cap_type.type = vdev->ext_irqs[i].type; + cap_type.subtype = vdev->ext_irqs[i].subtype; + + ret = vfio_info_add_capability(&caps, + &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + } + } + + info.flags = VFIO_IRQ_INFO_EVENTFD; + + info.count = vfio_pci_get_irq_count(vdev, info.index); + + if (caps.size) { + info.flags |= VFIO_IRQ_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; + + } else if (cmd == VFIO_DEVICE_SET_IRQS) { + struct vfio_irq_set hdr; + u8 *data = NULL; + int max, ret = 0; + size_t data_size = 0; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + max = vfio_pci_get_irq_count(vdev, hdr.index); + + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, + VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs, + &data_size); + if (ret) + return ret; + + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), + data_size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + mutex_lock(&vdev->igate); + + ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, + hdr.start, hdr.count, data); + + mutex_unlock(&vdev->igate); + kfree(data); + + return ret; + + } else if (cmd == VFIO_DEVICE_RESET) { + int ret; + + if (!vdev->reset_works) + return -EINVAL; + + vfio_pci_zap_and_down_write_memory_lock(vdev); + ret = pci_try_reset_function(vdev->pdev); + up_write(&vdev->memory_lock); + + return ret; + + } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { + struct vfio_pci_hot_reset_info hdr; + struct vfio_pci_fill_info fill = { 0 }; + struct vfio_pci_dependent_device *devices = NULL; + bool slot = false; + int ret = 0; + + minsz = offsetofend(struct vfio_pci_hot_reset_info, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz) + return -EINVAL; + + hdr.flags = 0; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + /* How many devices are affected? */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_count_devs, + &fill.max, slot); + if (ret) + return ret; + + WARN_ON(!fill.max); /* Should always be at least one */ + + /* + * If there's enough space, fill it now, otherwise return + * -ENOSPC and the number of devices affected. 
+ */ + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { + ret = -ENOSPC; + hdr.count = fill.max; + goto reset_info_exit; + } + + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; + + fill.devices = devices; + + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_fill_devs, + &fill, slot); + + /* + * If a device was removed between counting and filling, + * we may come up short of fill.max. If a device was + * added, we'll have a return of -EAGAIN above. + */ + if (!ret) + hdr.count = fill.cur; + +reset_info_exit: + if (copy_to_user((void __user *)arg, &hdr, minsz)) + ret = -EFAULT; + + if (!ret) { + if (copy_to_user((void __user *)(arg + minsz), devices, + hdr.count * sizeof(*devices))) + ret = -EFAULT; + } + + kfree(devices); + return ret; + + } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { + struct vfio_pci_hot_reset hdr; + int32_t *group_fds; + struct vfio_group **groups; + struct vfio_pci_group_info info; + bool slot = false; + int group_idx, count = 0, ret = 0; + + minsz = offsetofend(struct vfio_pci_hot_reset, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + /* + * We can't let userspace give us an arbitrarily large + * buffer to copy, so verify how many we think there + * could be. Note groups can have multiple devices so + * one group per device is the max. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; + + /* Somewhere between 1 and count is OK */ + if (!hdr.count || hdr.count > count) + return -EINVAL; + + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); + groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); + if (!group_fds || !groups) { + kfree(group_fds); + kfree(groups); + return -ENOMEM; + } + + if (copy_from_user(group_fds, (void __user *)(arg + minsz), + hdr.count * sizeof(*group_fds))) { + kfree(group_fds); + kfree(groups); + return -EFAULT; + } + + /* + * For each group_fd, get the group through the vfio external + * user interface and store the group and iommu ID. This + * ensures the group is held across the reset. 
+ */ + for (group_idx = 0; group_idx < hdr.count; group_idx++) { + struct vfio_group *group; + struct fd f = fdget(group_fds[group_idx]); + if (!f.file) { + ret = -EBADF; + break; + } + + group = vfio_group_get_external_user(f.file); + fdput(f); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + break; + } + + groups[group_idx] = group; + } + + kfree(group_fds); + + /* release reference to groups on error */ + if (ret) + goto hot_reset_release; + + info.count = hdr.count; + info.groups = groups; + + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); + +hot_reset_release: + for (group_idx--; group_idx >= 0; group_idx--) + vfio_group_put_external_user(groups[group_idx]); + + kfree(groups); + return ret; + } else if (cmd == VFIO_DEVICE_IOEVENTFD) { + struct vfio_device_ioeventfd ioeventfd; + int count; + + minsz = offsetofend(struct vfio_device_ioeventfd, fd); + + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) + return -EFAULT; + + if (ioeventfd.argsz < minsz) + return -EINVAL; + + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) + return -EINVAL; + + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; + + if (hweight8(count) != 1 || ioeventfd.fd < -1) + return -EINVAL; + + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, + ioeventfd.data, count, ioeventfd.fd); + } else if (cmd == VFIO_DEVICE_FEATURE) { + struct vfio_device_feature feature; + uuid_t uuid; + + minsz = offsetofend(struct vfio_device_feature, flags); + + if (copy_from_user(&feature, (void __user *)arg, minsz)) + return -EFAULT; + + if (feature.argsz < minsz) + return -EINVAL; + + /* Check unknown flags */ + if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK | + VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_GET | + VFIO_DEVICE_FEATURE_PROBE)) + return -EINVAL; + + /* GET & SET are mutually exclusive except with PROBE */ + if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && + (feature.flags & VFIO_DEVICE_FEATURE_SET) && + (feature.flags & VFIO_DEVICE_FEATURE_GET)) + return -EINVAL; + + switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: + if (!vdev->vf_token) + return -ENOTTY; + + /* + * We do not support GET of the VF Token UUID as this + * could expose the token of the previous device user. + */ + if (feature.flags & VFIO_DEVICE_FEATURE_GET) + return -EINVAL; + + if (feature.flags & VFIO_DEVICE_FEATURE_PROBE) + return 0; + + /* Don't SET unless told to do so */ + if (!(feature.flags & VFIO_DEVICE_FEATURE_SET)) + return -EINVAL; + + if (feature.argsz < minsz + sizeof(uuid)) + return -EINVAL; + + if (copy_from_user(&uuid, (void __user *)(arg + minsz), + sizeof(uuid))) + return -EFAULT; + + mutex_lock(&vdev->vf_token->lock); + uuid_copy(&vdev->vf_token->uuid, &uuid); + mutex_unlock(&vdev->vf_token->lock); + + return 0; + default: + return -ENOTTY; + } + } + + return -ENOTTY; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); + +static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + + if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + return -EINVAL; + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + + case VFIO_PCI_ROM_REGION_INDEX: + if (iswrite) + return -EINVAL; + return vfio_pci_bar_rw(vdev, buf, count, ppos, false); + + case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: + return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + + case VFIO_PCI_VGA_REGION_INDEX: + return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + default: + index -= VFIO_PCI_NUM_REGIONS; + return vdev->region[index].ops->rw(vdev, buf, + count, ppos, iswrite); + } + + return -EINVAL; +} + +ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + + if (!count) + return 0; + + return vfio_pci_rw(vdev, buf, count, ppos, false); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_read); + +ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + + if (!count) + return 0; + + return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_write); + +/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ +static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) +{ + struct vfio_pci_mmap_vma *mmap_vma, *tmp; + + /* + * Lock ordering: + * vma_lock is nested under mmap_sem for vm_ops callback paths. + * The memory_lock semaphore is used by both code paths calling + * into this function to zap vmas and the vm_ops.fault callback + * to protect the memory enable state of the device. + * + * When zapping vmas we need to maintain the mmap_sem => vma_lock + * ordering, which requires using vma_lock to walk vma_list to + * acquire an mm, then dropping vma_lock to get the mmap_sem and + * reacquiring vma_lock. This logic is derived from similar + * requirements in uverbs_user_mmap_disassociate(). + * + * mmap_sem must always be the top-level lock when it is taken. + * Therefore we can only hold the memory_lock write lock when + * vma_list is empty, as we'd need to take mmap_sem to clear + * entries. vma_list can only be guaranteed empty when holding + * vma_lock, thus memory_lock is nested under vma_lock. + * + * This enables the vm_ops.fault callback to acquire vma_lock, + * followed by memory_lock read lock, while already holding + * mmap_sem without risk of deadlock. 
+ */ + while (1) { + struct mm_struct *mm = NULL; + + if (try) { + if (!mutex_trylock(&vdev->vma_lock)) + return 0; + } else { + mutex_lock(&vdev->vma_lock); + } + while (!list_empty(&vdev->vma_list)) { + mmap_vma = list_first_entry(&vdev->vma_list, + struct vfio_pci_mmap_vma, + vma_next); + mm = mmap_vma->vma->vm_mm; + if (mmget_not_zero(mm)) + break; + + list_del(&mmap_vma->vma_next); + kfree(mmap_vma); + mm = NULL; + } + if (!mm) + return 1; + mutex_unlock(&vdev->vma_lock); + + if (try) { + if (!down_read_trylock(&mm->mmap_sem)) { + mmput(mm); + return 0; + } + } else { + down_read(&mm->mmap_sem); + } + if (mmget_still_valid(mm)) { + if (try) { + if (!mutex_trylock(&vdev->vma_lock)) { + up_read(&mm->mmap_sem); + mmput(mm); + return 0; + } + } else { + mutex_lock(&vdev->vma_lock); + } + list_for_each_entry_safe(mmap_vma, tmp, + &vdev->vma_list, vma_next) { + struct vm_area_struct *vma = mmap_vma->vma; + + if (vma->vm_mm != mm) + continue; + + list_del(&mmap_vma->vma_next); + kfree(mmap_vma); + + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + } + mutex_unlock(&vdev->vma_lock); + } + up_read(&mm->mmap_sem); + mmput(mm); + } +} + +void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) +{ + vfio_pci_zap_and_vma_lock(vdev, false); + down_write(&vdev->memory_lock); + mutex_unlock(&vdev->vma_lock); +} + +u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) +{ + u16 cmd; + + down_write(&vdev->memory_lock); + pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(vdev->pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); + + return cmd; +} + +void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd) +{ + pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); + up_write(&vdev->memory_lock); +} + +/* Caller holds vma_lock */ +static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, + struct vm_area_struct *vma) +{ + struct vfio_pci_mmap_vma *mmap_vma; + + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); + if (!mmap_vma) + return -ENOMEM; + + mmap_vma->vma = vma; + list_add(&mmap_vma->vma_next, &vdev->vma_list); + + return 0; +} + +/* + * Zap mmaps on open so that we can fault them in on access and therefore + * our vma_list only tracks mappings accessed since last zap. + */ +static void vfio_pci_mmap_open(struct vm_area_struct *vma) +{ + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); +} + +static void vfio_pci_mmap_close(struct vm_area_struct *vma) +{ + struct vfio_pci_core_device *vdev = vma->vm_private_data; + struct vfio_pci_mmap_vma *mmap_vma; + + mutex_lock(&vdev->vma_lock); + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + if (mmap_vma->vma == vma) { + list_del(&mmap_vma->vma_next); + kfree(mmap_vma); + break; + } + } + mutex_unlock(&vdev->vma_lock); +} + +static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_core_device *vdev = vma->vm_private_data; + struct vfio_pci_mmap_vma *mmap_vma; + vm_fault_t ret = VM_FAULT_NOPAGE; + + mutex_lock(&vdev->vma_lock); + down_read(&vdev->memory_lock); + + if (!__vfio_pci_memory_enabled(vdev)) { + ret = VM_FAULT_SIGBUS; + goto up_out; + } + + /* + * We populate the whole vma on fault, so we need to test whether + * the vma has already been mapped, such as for concurrent faults + * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if + * we ask it to fill the same range again. 
+ */ + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + if (mmap_vma->vma == vma) + goto up_out; + } + + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot)) { + ret = VM_FAULT_SIGBUS; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + goto up_out; + } + + if (__vfio_pci_add_vma(vdev, vma)) { + ret = VM_FAULT_OOM; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + } + +up_out: + up_read(&vdev->memory_lock); + mutex_unlock(&vdev->vma_lock); + return ret; +} + +static const struct vm_operations_struct vfio_pci_mmap_ops = { + .open = vfio_pci_mmap_open, + .close = vfio_pci_mmap_close, + .fault = vfio_pci_mmap_fault, +}; + +int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct pci_dev *pdev = vdev->pdev; + unsigned int index; + u64 phys_len, req_len, pgoff, req_start; + int ret; + + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + + if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + return -EINVAL; + if (vma->vm_end < vma->vm_start) + return -EINVAL; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + if (index >= VFIO_PCI_NUM_REGIONS) { + int regnum = index - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_region *region = vdev->region + regnum; + + if (region->ops && region->ops->mmap && + (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return region->ops->mmap(vdev, region, vma); + return -EINVAL; + } + if (index >= VFIO_PCI_ROM_REGION_INDEX) + return -EINVAL; + if (!vdev->bar_mmap_supported[index]) + return -EINVAL; + + phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); + req_len = vma->vm_end - vma->vm_start; + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + req_start = pgoff << PAGE_SHIFT; + + if (req_start + req_len > phys_len) + return -EINVAL; + + /* + * Even though we don't make use of the barmap for the mmap, + * we need to request the region and the barmap tracks that. + */ + if (!vdev->barmap[index]) { + ret = pci_request_selected_regions(pdev, + 1 << index, "vfio-pci"); + if (ret) + return ret; + + vdev->barmap[index] = pci_iomap(pdev, index, 0); + if (!vdev->barmap[index]) { + pci_release_selected_regions(pdev, 1 << index); + return -ENOMEM; + } + } + + vma->vm_private_data = vdev; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + + /* + * See remap_pfn_range(), called from vfio_pci_fault() but we can't + * change vm_flags within the fault handler. Set them now. 
+ */ + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_ops = &vfio_pci_mmap_ops; + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_mmap); + +void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct pci_dev *pdev = vdev->pdev; + + mutex_lock(&vdev->igate); + + if (vdev->req_trigger) { + if (!(count % 10)) + pci_notice_ratelimited(pdev, + "Relaying device request to user (#%u)\n", + count); + eventfd_signal(vdev->req_trigger, 1); + } else if (count == 0) { + pci_warn(pdev, + "No device request channel registered, blocked until released by user\n"); + } + + mutex_unlock(&vdev->igate); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_request); + +static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, + bool vf_token, uuid_t *uuid) +{ + /* + * There's always some degree of trust or collaboration between SR-IOV + * PF and VFs, even if just that the PF hosts the SR-IOV capability and + * can disrupt VFs with a reset, but often the PF has more explicit + * access to deny service to the VF or access data passed through the + * VF. We therefore require an opt-in via a shared VF token (UUID) to + * represent this trust. This both prevents that a VF driver might + * assume the PF driver is a trusted, in-kernel driver, and also that + * a PF driver might be replaced with a rogue driver, unknown to in-use + * VF drivers. + * + * Therefore when presented with a VF, if the PF is a vfio device and + * it is bound to the vfio-pci driver, the user needs to provide a VF + * token to access the device, in the form of appending a vf_token to + * the device name, for example: + * + * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" + * + * When presented with a PF which has VFs in use, the user must also + * provide the current VF token to prove collaboration with existing + * VF users. If VFs are not in use, the VF token provided for the PF + * device will act to set the VF token. + * + * If the VF token is provided but unused, an error is generated. 
+ */ + if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) + return 0; /* No VF token provided or required */ + + if (vdev->pdev->is_virtfn) { + struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); + bool match; + + if (!pf_vdev) { + if (!vf_token) + return 0; /* PF is not vfio-pci, no VF token */ + + pci_info_ratelimited(vdev->pdev, + "VF token incorrectly provided, PF not bound to vfio-pci\n"); + return -EINVAL; + } + + if (!vf_token) { + vfio_device_put(&pf_vdev->vdev); + pci_info_ratelimited(vdev->pdev, + "VF token required to access device\n"); + return -EACCES; + } + + mutex_lock(&pf_vdev->vf_token->lock); + match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); + mutex_unlock(&pf_vdev->vf_token->lock); + + vfio_device_put(&pf_vdev->vdev); + + if (!match) { + pci_info_ratelimited(vdev->pdev, + "Incorrect VF token provided for device\n"); + return -EACCES; + } + } else if (vdev->vf_token) { + mutex_lock(&vdev->vf_token->lock); + if (vdev->vf_token->users) { + if (!vf_token) { + mutex_unlock(&vdev->vf_token->lock); + pci_info_ratelimited(vdev->pdev, + "VF token required to access device\n"); + return -EACCES; + } + + if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { + mutex_unlock(&vdev->vf_token->lock); + pci_info_ratelimited(vdev->pdev, + "Incorrect VF token provided for device\n"); + return -EACCES; + } + } else if (vf_token) { + uuid_copy(&vdev->vf_token->uuid, uuid); + } + + mutex_unlock(&vdev->vf_token->lock); + } else if (vf_token) { + pci_info_ratelimited(vdev->pdev, + "VF token incorrectly provided, not a PF or VF\n"); + return -EINVAL; + } + + return 0; +} + +#define VF_TOKEN_ARG "vf_token=" + +int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + bool vf_token = false; + uuid_t uuid; + int ret; + + if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) + return 0; /* No match */ + + if (strlen(buf) > strlen(pci_name(vdev->pdev))) { + buf += strlen(pci_name(vdev->pdev)); + + if (*buf != ' ') + return 0; /* No match: non-whitespace after name */ + + while (*buf) { + if (*buf == ' ') { + buf++; + continue; + } + + if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, + strlen(VF_TOKEN_ARG))) { + buf += strlen(VF_TOKEN_ARG); + + if (strlen(buf) < UUID_STRING_LEN) + return -EINVAL; + + ret = uuid_parse(buf, &uuid); + if (ret) + return ret; + + vf_token = true; + buf += UUID_STRING_LEN; + } else { + /* Unknown/duplicate option */ + return -EINVAL; + } + } + } + + ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); + if (ret) + return ret; + + return 1; /* Match */ +} +EXPORT_SYMBOL_GPL(vfio_pci_core_match); + +static int vfio_pci_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct vfio_pci_core_device *vdev = container_of(nb, + struct vfio_pci_core_device, nb); + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_dev *physfn = pci_physfn(pdev); + + if (action == BUS_NOTIFY_ADD_DEVICE && + pdev->is_virtfn && physfn == vdev->pdev) { + pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", + pci_name(pdev)); + pdev->driver_override = kasprintf(GFP_KERNEL, "%s", + vdev->vdev.ops->name); + } else if (action == BUS_NOTIFY_BOUND_DRIVER && + pdev->is_virtfn && physfn == vdev->pdev) { + struct pci_driver *drv = pci_dev_driver(pdev); + + if (drv && drv != pci_dev_driver(vdev->pdev)) + pci_warn(vdev->pdev, + "VF %s bound to driver %s while PF bound to driver %s\n", + pci_name(pdev), 
drv->name, + pci_dev_driver(vdev->pdev)->name); + } + + return 0; +} + +static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!pdev->is_physfn) + return 0; + + vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); + if (!vdev->vf_token) + return -ENOMEM; + + mutex_init(&vdev->vf_token->lock); + uuid_gen(&vdev->vf_token->uuid); + + vdev->nb.notifier_call = vfio_pci_bus_notifier; + ret = bus_register_notifier(&pci_bus_type, &vdev->nb); + if (ret) { + kfree(vdev->vf_token); + return ret; + } + return 0; +} + +static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev) +{ + if (!vdev->vf_token) + return; + + bus_unregister_notifier(&pci_bus_type, &vdev->nb); + WARN_ON(vdev->vf_token->users); + mutex_destroy(&vdev->vf_token->lock); + kfree(vdev->vf_token); +} + +static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!vfio_pci_is_vga(pdev)) + return 0; + + ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); + if (ret) + return ret; + vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false)); + return 0; +} + +static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + + if (!vfio_pci_is_vga(pdev)) + return; + vga_client_register(pdev, NULL, NULL, NULL); + vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | + VGA_RSRC_LEGACY_IO | + VGA_RSRC_LEGACY_MEM); +} + +void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + const struct vfio_device_ops *vfio_pci_ops) +{ + vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops); + vdev->pdev = pdev; + vdev->irq_type = VFIO_PCI_NUM_IRQS; + mutex_init(&vdev->igate); + spin_lock_init(&vdev->irqlock); + mutex_init(&vdev->ioeventfds_lock); + INIT_LIST_HEAD(&vdev->dummy_resources_list); + INIT_LIST_HEAD(&vdev->ioeventfds_list); + mutex_init(&vdev->vma_lock); + INIT_LIST_HEAD(&vdev->vma_list); + init_rwsem(&vdev->memory_lock); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); + +void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) +{ + mutex_destroy(&vdev->igate); + mutex_destroy(&vdev->ioeventfds_lock); + mutex_destroy(&vdev->vma_lock); + vfio_uninit_group_dev(&vdev->vdev); + kfree(vdev->region); + kfree(vdev->pm_save); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); + +int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + struct iommu_group *group; + int ret; + + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return -EINVAL; + + /* + * Prevent binding to PFs with VFs enabled, the VFs might be in use + * by the host or other users. We cannot capture the VFs if they + * already exist, nor can we track VF users. Disabling SR-IOV here + * would initiate removing the VFs, which would unbind the driver, + * which is prone to blocking if that VF is also in use by vfio-pci. + * Just reject these PFs and let the user sort it out. 
+ */ + if (pci_num_vf(pdev)) { + pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); + return -EBUSY; + } + + group = vfio_iommu_group_get(&pdev->dev); + if (!group) + return -EINVAL; + + if (pci_is_root_bus(pdev->bus)) { + ret = vfio_assign_device_set(&vdev->vdev, vdev); + } else if (!pci_probe_reset_slot(pdev->slot)) { + ret = vfio_assign_device_set(&vdev->vdev, pdev->slot); + } else { + /* + * If there is no slot reset support for this device, the whole + * bus needs to be grouped together to support bus-wide resets. + */ + ret = vfio_assign_device_set(&vdev->vdev, pdev->bus); + } + + if (ret) + goto out_group_put; + ret = vfio_pci_vf_init(vdev); + if (ret) + goto out_group_put; + ret = vfio_pci_vga_init(vdev); + if (ret) + goto out_vf; + + vfio_pci_probe_power_state(vdev); + + if (!disable_idle_d3) { + /* + * pci-core sets the device power state to an unknown value at + * bootup and after being removed from a driver. The only + * transition it allows from this unknown state is to D0, which + * typically happens when a driver calls pci_enable_device(). + * We're not ready to enable the device yet, but we do want to + * be able to get to D3. Therefore first do a D0 transition + * before going to D3. + */ + vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_set_power_state(vdev, PCI_D3hot); + } + + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_power; + return 0; + +out_power: + if (!disable_idle_d3) + vfio_pci_set_power_state(vdev, PCI_D0); +out_vf: + vfio_pci_vf_uninit(vdev); +out_group_put: + vfio_iommu_group_put(group, &pdev->dev); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); + +void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + + pci_disable_sriov(pdev); + + vfio_unregister_group_dev(&vdev->vdev); + + vfio_pci_vf_uninit(vdev); + vfio_pci_vga_uninit(vdev); + + vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); + + if (!disable_idle_d3) + vfio_pci_set_power_state(vdev, PCI_D0); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); + +static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct vfio_pci_core_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return PCI_ERS_RESULT_DISCONNECT; + + vdev = container_of(device, struct vfio_pci_core_device, vdev); + + mutex_lock(&vdev->igate); + + if (vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + + mutex_unlock(&vdev->igate); + + vfio_device_put(device); + + return PCI_ERS_RESULT_CAN_RECOVER; +} + +int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) +{ + struct vfio_device *device; + int ret = 0; + + device = vfio_device_get_from_dev(&pdev->dev); + if (!device) + return -ENODEV; + + if (nr_virtfn == 0) + pci_disable_sriov(pdev); + else + ret = pci_enable_sriov(pdev, nr_virtfn); + + vfio_device_put(device); + + return ret < 0 ? 
ret : nr_virtfn; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); + +const struct pci_error_handlers vfio_pci_core_err_handlers = { + .error_detected = vfio_pci_aer_err_detected, +}; +EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); + +static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, + struct vfio_pci_group_info *groups) +{ + unsigned int i; + + for (i = 0; i < groups->count; i++) + if (groups->groups[i] == vdev->vdev.group) + return true; + return false; +} + +static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) +{ + struct vfio_device_set *dev_set = data; + struct vfio_device *cur; + + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) + if (cur->dev == &pdev->dev) + return 0; + return -EBUSY; +} + +/* + * vfio-core considers a group to be viable and will create a vfio_device even + * if some devices are bound to drivers like pci-stub or pcieport. Here we + * require all PCI devices to be inside our dev_set since that ensures they stay + * put and that every driver controlling the device can co-ordinate with the + * device reset. + * + * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be + * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise. + */ +static struct pci_dev * +vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) +{ + struct pci_dev *pdev; + + lockdep_assert_held(&dev_set->lock); + + /* + * By definition all PCI devices in the dev_set share the same PCI + * reset, so any pci_dev will have the same outcomes for + * pci_probe_reset_*() and pci_reset_bus(). + */ + pdev = list_first_entry(&dev_set->device_list, + struct vfio_pci_core_device, + vdev.dev_set_list)->pdev; + + /* pci_reset_bus() is supported */ + if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus)) + return NULL; + + if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set, + dev_set, + !pci_probe_reset_slot(pdev->slot))) + return NULL; + return pdev; +} + +/* + * We need to get memory_lock for each device, but devices can share mmap_sem, + * therefore we need to zap and hold the vma_lock for each device, and only then + * get each memory_lock. + */ +static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, + struct vfio_pci_group_info *groups) +{ + struct vfio_pci_core_device *cur_mem; + struct vfio_pci_core_device *cur_vma; + struct vfio_pci_core_device *cur; + struct pci_dev *pdev; + bool is_mem = true; + int ret; + + mutex_lock(&dev_set->lock); + cur_mem = list_first_entry(&dev_set->device_list, + struct vfio_pci_core_device, + vdev.dev_set_list); + + pdev = vfio_pci_dev_set_resettable(dev_set); + if (!pdev) { + ret = -EINVAL; + goto err_unlock; + } + + list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + /* + * Test whether all the affected devices are contained by the + * set of groups provided by the user. + */ + if (!vfio_dev_in_groups(cur_vma, groups)) { + ret = -EINVAL; + goto err_undo; + } + + /* + * Locking multiple devices is prone to deadlock, runaway and + * unwind if we hit contention. 
+ */ + if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { + ret = -EBUSY; + goto err_undo; + } + } + cur_vma = NULL; + + list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { + if (!down_write_trylock(&cur_mem->memory_lock)) { + ret = -EBUSY; + goto err_undo; + } + mutex_unlock(&cur_mem->vma_lock); + } + cur_mem = NULL; + + ret = pci_reset_bus(pdev); + +err_undo: + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + if (cur == cur_mem) + is_mem = false; + if (cur == cur_vma) + break; + if (is_mem) + up_write(&cur->memory_lock); + else + mutex_unlock(&cur->vma_lock); + } +err_unlock: + mutex_unlock(&dev_set->lock); + return ret; +} + +static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) +{ + struct vfio_pci_core_device *cur; + bool needs_reset = false; + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + /* No VFIO device in the set can have an open device FD */ + if (cur->vdev.open_count) + return false; + needs_reset |= cur->needs_reset; + } + return needs_reset; +} + +/* + * If a bus or slot reset is available for the provided dev_set and: + * - All of the devices affected by that bus or slot reset are unused + * - At least one of the affected devices is marked dirty via + * needs_reset (such as by lack of FLR support) + * Then attempt to perform that bus or slot reset. + * Returns true if the dev_set was reset. + */ +static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) +{ + struct vfio_pci_core_device *cur; + struct pci_dev *pdev; + int ret; + + if (!vfio_pci_dev_set_needs_reset(dev_set)) + return false; + + pdev = vfio_pci_dev_set_resettable(dev_set); + if (!pdev) + return false; + + ret = pci_reset_bus(pdev); + if (ret) + return false; + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + cur->needs_reset = false; + if (!disable_idle_d3) + vfio_pci_set_power_state(cur, PCI_D3hot); + } + return true; +} + +void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga, + bool is_disable_idle_d3) +{ + nointxmask = is_nointxmask; + disable_vga = is_disable_vga; + disable_idle_d3 = is_disable_idle_d3; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_set_params); + +static void vfio_pci_core_cleanup(void) +{ + vfio_pci_uninit_perm_bits(); +} + +static int __init vfio_pci_core_init(void) +{ + /* Allocate shared config space permission data used by all devices */ + return vfio_pci_init_perm_bits(); +} + +module_init(vfio_pci_core_init); +module_exit(vfio_pci_core_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 53d97f459252f8e7738532818c649e8db2e8bbca..7ca4109bba48281f5fcc850b01300e91c64eb792 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -15,14 +15,19 @@ #include #include -#include "vfio_pci_private.h" +#include #define OPREGION_SIGNATURE "IntelGraphicsMem" #define OPREGION_SIZE (8 * 1024) #define OPREGION_PCI_ADDR 0xfc -static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) +#define OPREGION_RVDA 0x3ba +#define OPREGION_RVDS 0x3c2 +#define OPREGION_VERSION 0x16 + +static ssize_t vfio_pci_igd_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) { unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; void *base = vdev->region[i].data; @@ -41,7 +46,7 @@ static size_t 
vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, return count; } -static void vfio_pci_igd_release(struct vfio_pci_device *vdev, +static void vfio_pci_igd_release(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region) { memunmap(region->data); @@ -52,12 +57,13 @@ static const struct vfio_pci_regops vfio_pci_igd_regops = { .release = vfio_pci_igd_release, }; -static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) +static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) { __le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR); u32 addr, size; void *base; int ret; + u16 version; ret = pci_read_config_dword(vdev->pdev, OPREGION_PCI_ADDR, &addr); if (ret) @@ -83,6 +89,54 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) size *= 1024; /* In KB */ + /* + * Support opregion v2.1+ + * When VBT data exceeds 6KB size and cannot be within mailbox #4, then + * the Extended VBT region next to opregion is used to hold the VBT data. + * RVDA (Relative Address of VBT Data from Opregion Base) and RVDS + * (Raw VBT Data Size) from opregion structure member are used to hold the + * address from region base and size of VBT data. RVDA/RVDS are not + * defined before opregion 2.0. + * + * opregion 2.1+: RVDA is unsigned, relative offset from + * opregion base, and should point to the end of opregion. + * otherwise, exposing to userspace to allow read access to everything between + * the OpRegion and VBT is not safe. + * RVDS is defined as size in bytes. + * + * opregion 2.0: rvda is the physical VBT address. + * Since rvda is HPA it cannot be directly used in guest. + * And it should not be practically available for end user,so it is not supported. + */ + version = le16_to_cpu(*(__le16 *)(base + OPREGION_VERSION)); + if (version >= 0x0200) { + u64 rvda; + u32 rvds; + + rvda = le64_to_cpu(*(__le64 *)(base + OPREGION_RVDA)); + rvds = le32_to_cpu(*(__le32 *)(base + OPREGION_RVDS)); + if (rvda && rvds) { + /* no support for opregion v2.0 with physical VBT address */ + if (version == 0x0200) { + memunmap(base); + pci_err(vdev->pdev, + "IGD assignment does not support opregion v2.0 with an extended VBT region\n"); + return -EINVAL; + } + + if (rvda != size) { + memunmap(base); + pci_err(vdev->pdev, + "Extended VBT does not follow opregion on version 0x%04x\n", + version); + return -EINVAL; + } + + /* region size for opregion v2.0+: opregion and VBT size. 
*/ + size += rvds; + } + } + if (size != OPREGION_SIZE) { memunmap(base); base = memremap(addr, size, MEMREMAP_WB); @@ -107,9 +161,9 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) return ret; } -static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, - char __user *buf, size_t count, loff_t *ppos, - bool iswrite) +static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) { unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; struct pci_dev *pdev = vdev->region[i].data; @@ -127,7 +181,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_byte(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; if (copy_to_user(buf + count - size, &val, 1)) return -EFAULT; @@ -141,7 +195,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_word(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; val = cpu_to_le16(val); if (copy_to_user(buf + count - size, &val, 2)) @@ -156,7 +210,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_dword(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; val = cpu_to_le32(val); if (copy_to_user(buf + count - size, &val, 4)) @@ -171,7 +225,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_word(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; val = cpu_to_le16(val); if (copy_to_user(buf + count - size, &val, 2)) @@ -186,7 +240,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, ret = pci_user_read_config_byte(pdev, pos, &val); if (ret) - return pcibios_err_to_errno(ret); + return ret; if (copy_to_user(buf + count - size, &val, 1)) return -EFAULT; @@ -200,7 +254,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, return count; } -static void vfio_pci_igd_cfg_release(struct vfio_pci_device *vdev, +static void vfio_pci_igd_cfg_release(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region) { struct pci_dev *pdev = region->data; @@ -213,7 +267,7 @@ static const struct vfio_pci_regops vfio_pci_igd_cfg_regops = { .release = vfio_pci_igd_cfg_release, }; -static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev) +static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) { struct pci_dev *host_bridge, *lpc_bridge; int ret; @@ -261,7 +315,7 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev) return 0; } -int vfio_pci_igd_init(struct vfio_pci_device *vdev) +int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) { int ret; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 869dce5f134dd5d4a594ac1a706b706161583f77..b7bb1ae21b72b7ad8f3276b733510a69dd552a96 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -19,21 +19,22 @@ #include #include #include +#include -#include "vfio_pci_private.h" +#include /* * INTx */ static void vfio_send_intx_eventfd(void *opaque, void *unused) { - struct vfio_pci_device *vdev = opaque; + struct vfio_pci_core_device *vdev = opaque; if (likely(is_intx(vdev) && !vdev->virq_disabled)) eventfd_signal(vdev->ctx[0].trigger, 1); } -void vfio_pci_intx_mask(struct vfio_pci_device *vdev) +void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; unsigned long flags; @@ -73,7 +74,7 @@ 
void vfio_pci_intx_mask(struct vfio_pci_device *vdev) */ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) { - struct vfio_pci_device *vdev = opaque; + struct vfio_pci_core_device *vdev = opaque; struct pci_dev *pdev = vdev->pdev; unsigned long flags; int ret = 0; @@ -107,7 +108,7 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) return ret; } -void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) +void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) { if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) vfio_send_intx_eventfd(vdev, NULL); @@ -115,7 +116,7 @@ void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) static irqreturn_t vfio_intx_handler(int irq, void *dev_id) { - struct vfio_pci_device *vdev = dev_id; + struct vfio_pci_core_device *vdev = dev_id; unsigned long flags; int ret = IRQ_NONE; @@ -139,7 +140,7 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id) return ret; } -static int vfio_intx_enable(struct vfio_pci_device *vdev) +static int vfio_intx_enable(struct vfio_pci_core_device *vdev) { if (!is_irq_none(vdev)) return -EINVAL; @@ -168,7 +169,7 @@ static int vfio_intx_enable(struct vfio_pci_device *vdev) return 0; } -static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) +static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) { struct pci_dev *pdev = vdev->pdev; unsigned long irqflags = IRQF_SHARED; @@ -223,7 +224,7 @@ static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) return 0; } -static void vfio_intx_disable(struct vfio_pci_device *vdev) +static void vfio_intx_disable(struct vfio_pci_core_device *vdev) { vfio_virqfd_disable(&vdev->ctx[0].unmask); vfio_virqfd_disable(&vdev->ctx[0].mask); @@ -244,7 +245,7 @@ static irqreturn_t vfio_msihandler(int irq, void *arg) return IRQ_HANDLED; } -static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) +static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msix) { struct pci_dev *pdev = vdev->pdev; unsigned int flag = msix ? 
PCI_IRQ_MSIX : PCI_IRQ_MSI; @@ -285,7 +286,7 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) return 0; } -static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, +static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, int vector, int fd, bool msix) { struct pci_dev *pdev = vdev->pdev; @@ -364,7 +365,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, return 0; } -static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, +static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, unsigned count, int32_t *fds, bool msix) { int i, j, ret = 0; @@ -385,7 +386,7 @@ static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, return ret; } -static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) +static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) { struct pci_dev *pdev = vdev->pdev; int i; @@ -417,7 +418,7 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) /* * IOCTL support */ -static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, +static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -444,7 +445,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, return 0; } -static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, +static int vfio_pci_set_intx_mask(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -464,7 +465,7 @@ static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, return 0; } -static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, +static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -507,7 +508,7 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, return 0; } -static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, +static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -613,7 +614,7 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, return -EINVAL; } -static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, +static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -624,7 +625,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, count, flags, data); } -static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, +static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) { @@ -635,11 +636,30 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, count, flags, data); } -int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, +int vfio_pci_set_ext_irq_trigger(struct vfio_pci_core_device *vdev, + unsigned int index, unsigned int start, + unsigned int count, uint32_t flags, + void *data) +{ + int i; + + if (start != 0 || count > 1) + return -EINVAL; + + index = array_index_nospec(index, + VFIO_PCI_NUM_IRQS + vdev->num_ext_irqs); + i = index - VFIO_PCI_NUM_IRQS; + + return vfio_pci_set_ctx_trigger_single(&vdev->ext_irqs[i].trigger, + count, flags, data); +} 
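The vfio_pci_set_ext_irq_trigger() handler above only accepts start == 0 and count <= 1, i.e. a single trigger eventfd per extended IRQ index. The following userspace sketch is illustrative only and not part of the patch: it shows how such an index could be wired up through the existing VFIO_DEVICE_SET_IRQS UAPI. Here `device_fd` and `ext_index` are hypothetical names; `ext_index` (>= VFIO_PCI_NUM_IRQS) is assumed to have been discovered via VFIO_DEVICE_GET_IRQ_INFO and the VFIO_IRQ_INFO_CAP_TYPE capability introduced earlier in this series.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Returns the trigger eventfd on success, -1 on failure (illustrative only). */
static int vfio_ext_irq_set_trigger(int device_fd, unsigned int ext_index)
{
	size_t sz = sizeof(struct vfio_irq_set) + sizeof(int32_t);
	struct vfio_irq_set *set;
	int32_t efd;

	efd = eventfd(0, EFD_CLOEXEC);
	if (efd < 0)
		return -1;

	set = calloc(1, sz);
	if (!set) {
		close(efd);
		return -1;
	}

	set->argsz = sz;
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = ext_index;	/* VFIO_PCI_NUM_IRQS + i, per the type capability */
	set->start = 0;		/* extended IRQs: single vector only */
	set->count = 1;
	memcpy(set->data, &efd, sizeof(efd));

	if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set) < 0) {
		close(efd);
		efd = -1;
	}
	free(set);
	return efd;
}

Signalling then arrives on the returned eventfd, which the caller can poll alongside its other VFIO eventfds.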
+EXPORT_SYMBOL_GPL(vfio_pci_set_ext_irq_trigger); + +int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) { - int (*func)(struct vfio_pci_device *vdev, unsigned index, + int (*func)(struct vfio_pci_core_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) = NULL; @@ -684,6 +704,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; + default: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_pci_set_ext_irq_trigger; + break; + } + break; } if (!func) @@ -691,3 +718,39 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, return func(vdev, index, start, count, flags, data); } + +int vfio_pci_get_ext_irq_index(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype) +{ + int i; + + for (i = 0; i < vdev->num_ext_irqs; i++) { + if (vdev->ext_irqs[i].type == type && + vdev->ext_irqs[i].subtype == subtype) { + return i; + } + } + return -EINVAL; +} + +int vfio_pci_register_irq(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + u32 flags) +{ + struct vfio_ext_irq *ext_irqs; + + ext_irqs = krealloc(vdev->ext_irqs, + (vdev->num_ext_irqs + 1) * sizeof(*ext_irqs), + GFP_KERNEL); + if (!ext_irqs) + return -ENOMEM; + + vdev->ext_irqs = ext_irqs; + + vdev->ext_irqs[vdev->num_ext_irqs].type = type; + vdev->ext_irqs[vdev->num_ext_irqs].subtype = subtype; + vdev->ext_irqs[vdev->num_ext_irqs].flags = flags; + vdev->ext_irqs[vdev->num_ext_irqs].trigger = NULL; + vdev->num_ext_irqs++; + return 0; +} diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c index 08f17839c2fe2c088a2667fe386b37e4c5dd3cdd..cd275a8e5155d4973fe630450df1dfb132421a4e 100644 --- a/drivers/vfio/pci/vfio_pci_nvlink2.c +++ b/drivers/vfio/pci/vfio_pci_nvlink2.c @@ -161,7 +161,7 @@ static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev, data->useraddr = vma->vm_start; data->mm = current->mm; - atomic_inc(&data->mm->mm_count); + mmgrab(data->mm); ret = (int) mm_iommu_newdev(data->mm, data->useraddr, vma_pages(vma), data->gpu_hpa, &data->mem); @@ -219,7 +219,7 @@ int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM; /* - * PCI config space does not tell us about NVLink presense but + * PCI config space does not tell us about NVLink presence but * platform does, use this. */ npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0); @@ -402,7 +402,7 @@ int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) u32 link_speed = 0xff; /* - * PCI config space does not tell us about NVLink presense but + * PCI config space does not tell us about NVLink presence but * platform does, use this. 
*/ if (!pnv_pci_get_gpu_dev(vdev->pdev)) @@ -425,8 +425,14 @@ int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index, &mmio_atsd)) { - dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); - mmio_atsd = 0; + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0, + &mmio_atsd)) { + dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); + mmio_atsd = 0; + } else { + dev_warn(&vdev->pdev->dev, + "Using fallback ibm,mmio-atsd[0] for ATSD.\n"); + } } if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 83f81d24df78eae51fc17221ff3bcd828a487694..aa0b71dbbe20e4d1fe4d65328a01d73f0e9ea723 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -17,7 +17,7 @@ #include #include -#include "vfio_pci_private.h" +#include #ifdef __LITTLE_ENDIAN #define vfio_ioread64 ioread64 @@ -37,17 +37,70 @@ #define vfio_ioread8 ioread8 #define vfio_iowrite8 iowrite8 +#define VFIO_IOWRITE(size) \ +static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \ + bool test_mem, u##size val, void __iomem *io) \ +{ \ + if (test_mem) { \ + down_read(&vdev->memory_lock); \ + if (!__vfio_pci_memory_enabled(vdev)) { \ + up_read(&vdev->memory_lock); \ + return -EIO; \ + } \ + } \ + \ + vfio_iowrite##size(val, io); \ + \ + if (test_mem) \ + up_read(&vdev->memory_lock); \ + \ + return 0; \ +} + +VFIO_IOWRITE(8) +VFIO_IOWRITE(16) +VFIO_IOWRITE(32) +#ifdef iowrite64 +VFIO_IOWRITE(64) +#endif + +#define VFIO_IOREAD(size) \ +static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \ + bool test_mem, u##size *val, void __iomem *io) \ +{ \ + if (test_mem) { \ + down_read(&vdev->memory_lock); \ + if (!__vfio_pci_memory_enabled(vdev)) { \ + up_read(&vdev->memory_lock); \ + return -EIO; \ + } \ + } \ + \ + *val = vfio_ioread##size(io); \ + \ + if (test_mem) \ + up_read(&vdev->memory_lock); \ + \ + return 0; \ +} + +VFIO_IOREAD(8) +VFIO_IOREAD(16) +VFIO_IOREAD(32) + /* * Read or write from an __iomem region (MMIO or I/O port) with an excluded * range which is inaccessible. The excluded range drops writes and fills * reads with -1. This is intended for handling MSI-X vector tables and * leftover space for ROM BARs. 
*/ -static ssize_t do_io_rw(void __iomem *io, char __user *buf, +static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, + void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite) { ssize_t done = 0; + int ret; while (count) { size_t fillable, filled; @@ -66,9 +119,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 4)) return -EFAULT; - vfio_iowrite32(val, io + off); + ret = vfio_pci_iowrite32(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread32(io + off); + ret = vfio_pci_ioread32(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 4)) return -EFAULT; @@ -82,9 +141,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 2)) return -EFAULT; - vfio_iowrite16(val, io + off); + ret = vfio_pci_iowrite16(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread16(io + off); + ret = vfio_pci_ioread16(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 2)) return -EFAULT; @@ -98,9 +163,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 1)) return -EFAULT; - vfio_iowrite8(val, io + off); + ret = vfio_pci_iowrite8(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread8(io + off); + ret = vfio_pci_ioread8(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 1)) return -EFAULT; @@ -129,7 +200,7 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, return done; } -static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) +static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { struct pci_dev *pdev = vdev->pdev; int ret; @@ -153,7 +224,7 @@ static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) return 0; } -ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, +ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { struct pci_dev *pdev = vdev->pdev; @@ -178,14 +249,6 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, count = min(count, (size_t)(end - pos)); - if (res->flags & IORESOURCE_MEM) { - down_read(&vdev->memory_lock); - if (!__vfio_pci_memory_enabled(vdev)) { - up_read(&vdev->memory_lock); - return -EIO; - } - } - if (bar == PCI_ROM_RESOURCE) { /* * The ROM can fill less space than the BAR, so we start the @@ -213,7 +276,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, x_end = vdev->msix_offset + vdev->msix_size; } - done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite); + done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, + count, x_start, x_end, iswrite); if (done >= 0) *ppos += done; @@ -221,13 +285,10 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, if (bar == PCI_ROM_RESOURCE) pci_unmap_rom(pdev, io); out: - if (res->flags & IORESOURCE_MEM) - up_read(&vdev->memory_lock); - return done; } -ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, +ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { int ret; @@ -246,7 +307,7 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, switch ((u32)pos) { case 0xa0000 ... 
0xbffff: count = min(count, (size_t)(0xc0000 - pos)); - iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1); + iomem = ioremap(0xa0000, 0xbffff - 0xa0000 + 1); off = pos - 0xa0000; rsrc = VGA_RSRC_LEGACY_MEM; is_ioport = false; @@ -278,7 +339,12 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return ret; } - done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite); + /* + * VGA MMIO is a legacy, non-BAR resource that hopefully allows + * probing, so we don't currently worry about access in relation + * to the memory enable bit in the command register. + */ + done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite); vga_put(vdev->pdev, rsrc); @@ -290,31 +356,160 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return done; } -static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) +static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, + bool test_mem) { - struct vfio_pci_ioeventfd *ioeventfd = opaque; - switch (ioeventfd->count) { case 1: - vfio_iowrite8(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite8(ioeventfd->vdev, test_mem, + ioeventfd->data, ioeventfd->addr); break; case 2: - vfio_iowrite16(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite16(ioeventfd->vdev, test_mem, + ioeventfd->data, ioeventfd->addr); break; case 4: - vfio_iowrite32(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite32(ioeventfd->vdev, test_mem, + ioeventfd->data, ioeventfd->addr); break; #ifdef iowrite64 case 8: - vfio_iowrite64(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite64(ioeventfd->vdev, test_mem, + ioeventfd->data, ioeventfd->addr); break; #endif } +} + +ssize_t vfio_pci_dma_fault_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + void *base = vdev->region[i].data; + int ret = -EFAULT; + + if (pos >= vdev->region[i].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + mutex_lock(&vdev->fault_queue_lock); + + if (iswrite) { + struct vfio_region_dma_fault *header = + (struct vfio_region_dma_fault *)base; + u32 new_tail; + + if (pos != 0 || count != 4) { + ret = -EINVAL; + goto unlock; + } + + if (copy_from_user((void *)&new_tail, buf, count)) + goto unlock; + + if (new_tail > header->nb_entries) { + ret = -EINVAL; + goto unlock; + } + header->tail = new_tail; + } else { + if (copy_to_user(buf, base + pos, count)) + goto unlock; + } + *ppos += count; + ret = count; +unlock: + mutex_unlock(&vdev->fault_queue_lock); + return ret; +} + +ssize_t vfio_pci_mregion_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + void *base = vdev->region[i].data; + struct vfio_device_migration_info *mig_info = (struct vfio_device_migration_info *) base; + int ret = -EFAULT; + + //pr_info("%s, pos: 0x%llx, count: %lu, base: %px, index: %d wr %d\n", + //__func__, pos, count, base, i, iswrite); + + if (pos >= vdev->region[i].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + mutex_lock(&vdev->mig_lock); + + if (iswrite) { + if ((pos == offsetof(struct vfio_device_migration_info, + device_state)) && vdev->migops) { + u32 new_state; + /* Call into the device specific code to handle + * the state change 
(before changing the state) + */ + if (count != sizeof(mig_info->device_state)) { + ret = -EINVAL; + goto unlock; + } + + if (copy_from_user((void *)&new_state, buf, count)) + goto unlock; + + ret = vdev->migops->state_change(vdev, new_state); + if (ret) + goto unlock; + mig_info->device_state = new_state; + } else { + if (copy_from_user(base + pos, buf, count)) + goto unlock; + } + } else { + if (copy_to_user(buf, base + pos, count)) + goto unlock; + if (pos >= mig_info->data_offset && + pos < mig_info->data_offset + mig_info->data_size) + mig_info->pending_bytes -= count; + } + ret = count; +unlock: + mutex_unlock(&vdev->mig_lock); + return ret; +} + +static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) +{ + struct vfio_pci_ioeventfd *ioeventfd = opaque; + struct vfio_pci_core_device *vdev = ioeventfd->vdev; + + if (ioeventfd->test_mem) { + if (!down_read_trylock(&vdev->memory_lock)) + return 1; /* Lock contended, use thread */ + if (!__vfio_pci_memory_enabled(vdev)) { + up_read(&vdev->memory_lock); + return 0; + } + } + + vfio_pci_ioeventfd_do_write(ioeventfd, false); + + if (ioeventfd->test_mem) + up_read(&vdev->memory_lock); return 0; } -long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, +static void vfio_pci_ioeventfd_thread(void *opaque, void *unused) +{ + struct vfio_pci_ioeventfd *ioeventfd = opaque; + + vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem); +} + +long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, uint64_t data, int count, int fd) { struct pci_dev *pdev = vdev->pdev; @@ -378,14 +573,17 @@ long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, goto out_unlock; } + ioeventfd->vdev = vdev; ioeventfd->addr = vdev->barmap[bar] + pos; ioeventfd->data = data; ioeventfd->pos = pos; ioeventfd->bar = bar; ioeventfd->count = count; + ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM; ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler, - NULL, NULL, &ioeventfd->virqfd, fd); + vfio_pci_ioeventfd_thread, NULL, + &ioeventfd->virqfd, fd); if (ret) { kfree(ioeventfd); goto out_unlock; diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c new file mode 100644 index 0000000000000000000000000000000000000000..ea4c0d2b0663cad5463d38509e54d1eb3a3babd1 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO ZPCI devices support + * + * Copyright (C) IBM Corp. 2020. All rights reserved. + * Author(s): Pierre Morel + * Matthew Rosato + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Add the Base PCI Function information to the device info region. + */ +static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_zpci_base cap = { + .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE, + .header.version = 1, + .start_dma = zdev->start_dma, + .end_dma = zdev->end_dma, + .pchid = zdev->pchid, + .vfn = zdev->vfn, + .fmb_length = zdev->fmb_length, + .pft = zdev->pft, + .gid = zdev->pfgid + }; + + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + +/* + * Add the Base PCI Function Group information to the device info region. 
+ */ +static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_zpci_group cap = { + .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP, + .header.version = 1, + .dasm = zdev->dma_mask, + .msi_addr = zdev->msi_addr, + .flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH, + .mui = zdev->fmb_update, + .noi = zdev->max_msi, + .maxstbl = ZPCI_MAX_WRITE_SIZE, + .version = zdev->version + }; + + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + +/* + * Add the device utility string to the device info region. + */ +static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_zpci_util *cap; + int cap_size = sizeof(*cap) + CLP_UTIL_STR_LEN; + int ret; + + cap = kmalloc(cap_size, GFP_KERNEL); + if (!cap) + return -ENOMEM; + + cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_UTIL; + cap->header.version = 1; + cap->size = CLP_UTIL_STR_LEN; + memcpy(cap->util_str, zdev->util_str, cap->size); + + ret = vfio_info_add_capability(caps, &cap->header, cap_size); + + kfree(cap); + + return ret; +} + +/* + * Add the function path string to the device info region. + */ +static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_zpci_pfip *cap; + int cap_size = sizeof(*cap) + CLP_PFIP_NR_SEGMENTS; + int ret; + + cap = kmalloc(cap_size, GFP_KERNEL); + if (!cap) + return -ENOMEM; + + cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_PFIP; + cap->header.version = 1; + cap->size = CLP_PFIP_NR_SEGMENTS; + memcpy(cap->pfip, zdev->pfip, cap->size); + + ret = vfio_info_add_capability(caps, &cap->header, cap_size); + + kfree(cap); + + return ret; +} + +/* + * Add all supported capabilities to the VFIO_DEVICE_GET_INFO capability chain. + */ +int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + struct zpci_dev *zdev = to_zpci(vdev->pdev); + int ret; + + if (!zdev) + return -ENODEV; + + ret = zpci_base_cap(zdev, caps); + if (ret) + return ret; + + ret = zpci_group_cap(zdev, caps); + if (ret) + return ret; + + if (zdev->util_str_avail) { + ret = zpci_util_cap(zdev, caps); + if (ret) + return ret; + } + + ret = zpci_pfip_cap(zdev, caps); + + return ret; +} diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig index dc1a3c44f2c62bba26e75df8feefa0892d4afba2..72de406281bf7b47fb4895b3b7b8583405b0d7fc 100644 --- a/drivers/vfio/platform/Kconfig +++ b/drivers/vfio/platform/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_PLATFORM tristate "VFIO support for platform devices" - depends on VFIO && EVENTFD && (ARM || ARM64) + depends on ARM || ARM64 || COMPILE_TEST select VFIO_VIRQFD help Support for platform devices with VFIO. This is required to make @@ -10,9 +10,10 @@ config VFIO_PLATFORM If you don't know what to do here, say N. +if VFIO_PLATFORM config VFIO_AMBA tristate "VFIO support for AMBA devices" - depends on VFIO_PLATFORM && ARM_AMBA + depends on ARM_AMBA help Support for ARM AMBA devices with VFIO. This is required to make use of ARM AMBA devices present on the system using the VFIO @@ -21,3 +22,4 @@ config VFIO_AMBA If you don't know what to do here, say N. 
source "drivers/vfio/platform/reset/Kconfig" +endif diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig index 1edbe9ee7356a8955225fbd5624ccc2cbd6da2a6..12f5f3d803876c44d06f528ede1da7f83e3a9208 100644 --- a/drivers/vfio/platform/reset/Kconfig +++ b/drivers/vfio/platform/reset/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_PLATFORM_CALXEDAXGMAC_RESET tristate "VFIO support for calxeda xgmac reset" - depends on VFIO_PLATFORM help Enables the VFIO platform driver to handle reset for Calxeda xgmac @@ -9,7 +8,6 @@ config VFIO_PLATFORM_CALXEDAXGMAC_RESET config VFIO_PLATFORM_AMDXGBE_RESET tristate "VFIO support for AMD XGBE reset" - depends on VFIO_PLATFORM help Enables the VFIO platform driver to handle reset for AMD XGBE @@ -17,7 +15,7 @@ config VFIO_PLATFORM_AMDXGBE_RESET config VFIO_PLATFORM_BCMFLEXRM_RESET tristate "VFIO support for Broadcom FlexRM reset" - depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST) + depends on ARCH_BCM_IPROC || COMPILE_TEST default ARCH_BCM_IPROC help Enables the VFIO platform driver to handle reset for Broadcom FlexRM diff --git a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c index 2d2babe21b2f2deca76e8582a9d40214d2d93042..abdca900802d04bce8deee4627526c3d26713609 100644 --- a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c +++ b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c @@ -24,7 +24,7 @@ #define MDIO_AN_INT 0x8002 #define MDIO_AN_INTMASK 0x8001 -static unsigned int xmdio_read(void *ioaddr, unsigned int mmd, +static unsigned int xmdio_read(void __iomem *ioaddr, unsigned int mmd, unsigned int reg) { unsigned int mmd_address, value; @@ -35,7 +35,7 @@ static unsigned int xmdio_read(void *ioaddr, unsigned int mmd, return value; } -static void xmdio_write(void *ioaddr, unsigned int mmd, +static void xmdio_write(void __iomem *ioaddr, unsigned int mmd, unsigned int reg, unsigned int value) { unsigned int mmd_address; @@ -54,13 +54,13 @@ static int vfio_platform_amdxgbe_reset(struct vfio_platform_device *vdev) if (!xgmac_regs->ioaddr) { xgmac_regs->ioaddr = - ioremap_nocache(xgmac_regs->addr, xgmac_regs->size); + ioremap(xgmac_regs->addr, xgmac_regs->size); if (!xgmac_regs->ioaddr) return -ENOMEM; } if (!xpcs_regs->ioaddr) { xpcs_regs->ioaddr = - ioremap_nocache(xpcs_regs->addr, xpcs_regs->size); + ioremap(xpcs_regs->addr, xpcs_regs->size); if (!xpcs_regs->ioaddr) return -ENOMEM; } diff --git a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c index 16165a62b86de01d7a11d6daa45b8ea6cb3afce5..1131ebe4837d427857bbec20fa05f5f79fbd37dd 100644 --- a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c +++ b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Broadcom - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. - * - * This program is distributed "as is" WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
*/ /* @@ -82,7 +74,7 @@ static int vfio_platform_bcmflexrm_reset(struct vfio_platform_device *vdev) /* Map FlexRM ring registers if not mapped */ if (!reg->ioaddr) { - reg->ioaddr = ioremap_nocache(reg->addr, reg->size); + reg->ioaddr = ioremap(reg->addr, reg->size); if (!reg->ioaddr) return -ENOMEM; } diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c index f67bab5475013223670a6376f30eb2113223f20d..63cc7f0b2e4a437a4d446b54e4937bcb6ed88876 100644 --- a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c +++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c @@ -26,7 +26,7 @@ #define XGMAC_DMA_CONTROL 0x00000f18 /* Ctrl (Operational Mode) */ #define XGMAC_DMA_INTR_ENA 0x00000f1c /* Interrupt Enable */ -/* DMA Control registe defines */ +/* DMA Control register defines */ #define DMA_CONTROL_ST 0x00002000 /* Start/Stop Transmission */ #define DMA_CONTROL_SR 0x00000002 /* Start/Stop Receive */ @@ -52,7 +52,7 @@ static int vfio_platform_calxedaxgmac_reset(struct vfio_platform_device *vdev) if (!reg->ioaddr) { reg->ioaddr = - ioremap_nocache(reg->addr, reg->size); + ioremap(reg->addr, reg->size); if (!reg->ioaddr) return -ENOMEM; } diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 9636a2afaecd1b4599473debb5ee7dc7908055b0..a309ec7a019b3b29afc1aadb9b7eadcc475ce999 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -59,7 +59,6 @@ static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) vdev->flags = VFIO_DEVICE_FLAGS_AMBA; vdev->get_resource = get_amba_resource; vdev->get_irq = get_amba_irq; - vdev->parent_module = THIS_MODULE; vdev->reset_required = false; ret = vfio_platform_probe_common(vdev, &adev->dev); diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 1e276901008939280bc1cebfbb1f6b77a49108aa..68a1c87066d7604e5cbf8cce44dbde228ecb7b5d 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -25,19 +25,8 @@ static struct resource *get_platform_resource(struct vfio_platform_device *vdev, int num) { struct platform_device *dev = (struct platform_device *) vdev->opaque; - int i; - for (i = 0; i < dev->num_resources; i++) { - struct resource *r = &dev->resource[i]; - - if (resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) { - if (!num) - return r; - - num--; - } - } - return NULL; + return platform_get_mem_or_io(dev, num); } static int get_platform_irq(struct vfio_platform_device *vdev, int i) @@ -61,27 +50,24 @@ static int vfio_platform_probe(struct platform_device *pdev) vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM; vdev->get_resource = get_platform_resource; vdev->get_irq = get_platform_irq; - vdev->parent_module = THIS_MODULE; vdev->reset_required = reset_required; ret = vfio_platform_probe_common(vdev, &pdev->dev); - if (ret) + if (ret) { kfree(vdev); - - return ret; + return ret; + } + dev_set_drvdata(&pdev->dev, vdev); + return 0; } static int vfio_platform_remove(struct platform_device *pdev) { - struct vfio_platform_device *vdev; - - vdev = vfio_platform_remove_common(&pdev->dev); - if (vdev) { - kfree(vdev); - return 0; - } + struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev); - return -EINVAL; + vfio_platform_remove_common(vdev); + kfree(vdev); + return 0; } static struct platform_driver vfio_platform_driver = { diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c index 152e5188183cecddcb6c9f395422613cf33ce402..6af7ce7d619c25cf7e525b47ac25b4c8f3d5aba1 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -218,68 +218,52 @@ static int vfio_platform_call_reset(struct vfio_platform_device *vdev, return -EINVAL; } -static void vfio_platform_release(void *device_data) +static void vfio_platform_close_device(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev = device_data; - - mutex_lock(&driver_lock); - - if (!(--vdev->refcnt)) { - const char *extra_dbg = NULL; - int ret; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + const char *extra_dbg = NULL; + int ret; - ret = vfio_platform_call_reset(vdev, &extra_dbg); - if (ret && vdev->reset_required) { - dev_warn(vdev->device, "reset driver is required and reset call failed in release (%d) %s\n", - ret, extra_dbg ? extra_dbg : ""); - WARN_ON(1); - } - pm_runtime_put(vdev->device); - vfio_platform_regions_cleanup(vdev); - vfio_platform_irq_cleanup(vdev); + ret = vfio_platform_call_reset(vdev, &extra_dbg); + if (WARN_ON(ret && vdev->reset_required)) { + dev_warn( + vdev->device, + "reset driver is required and reset call failed in release (%d) %s\n", + ret, extra_dbg ? extra_dbg : ""); } - - mutex_unlock(&driver_lock); - - module_put(vdev->parent_module); + pm_runtime_put(vdev->device); + vfio_platform_regions_cleanup(vdev); + vfio_platform_irq_cleanup(vdev); } -static int vfio_platform_open(void *device_data) +static int vfio_platform_open_device(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + const char *extra_dbg = NULL; int ret; - if (!try_module_get(vdev->parent_module)) - return -ENODEV; - - mutex_lock(&driver_lock); - - if (!vdev->refcnt) { - const char *extra_dbg = NULL; - - ret = vfio_platform_regions_init(vdev); - if (ret) - goto err_reg; + ret = vfio_platform_regions_init(vdev); + if (ret) + return ret; - ret = vfio_platform_irq_init(vdev); - if (ret) - goto err_irq; + ret = vfio_platform_irq_init(vdev); + if (ret) + goto err_irq; - ret = pm_runtime_get_sync(vdev->device); - if (ret < 0) - goto err_rst; + ret = pm_runtime_get_sync(vdev->device); + if (ret < 0) + goto err_rst; - ret = vfio_platform_call_reset(vdev, &extra_dbg); - if (ret && vdev->reset_required) { - dev_warn(vdev->device, "reset driver is required and reset call failed in open (%d) %s\n", - ret, extra_dbg ? extra_dbg : ""); - goto err_rst; - } + ret = vfio_platform_call_reset(vdev, &extra_dbg); + if (ret && vdev->reset_required) { + dev_warn( + vdev->device, + "reset driver is required and reset call failed in open (%d) %s\n", + ret, extra_dbg ? 
extra_dbg : ""); + goto err_rst; } - - vdev->refcnt++; - - mutex_unlock(&driver_lock); return 0; err_rst: @@ -287,16 +271,15 @@ static int vfio_platform_open(void *device_data) vfio_platform_irq_cleanup(vdev); err_irq: vfio_platform_regions_cleanup(vdev); -err_reg: - mutex_unlock(&driver_lock); - module_put(THIS_MODULE); return ret; } -static long vfio_platform_ioctl(void *device_data, +static long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + unsigned long minsz; if (cmd == VFIO_DEVICE_GET_INFO) { @@ -408,7 +391,7 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, if (!reg->ioaddr) { reg->ioaddr = - ioremap_nocache(reg->addr, reg->size); + ioremap(reg->addr, reg->size); if (!reg->ioaddr) return -ENOMEM; @@ -455,10 +438,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, return -EFAULT; } -static ssize_t vfio_platform_read(void *device_data, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_platform_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos); loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK; @@ -485,7 +469,7 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, if (!reg->ioaddr) { reg->ioaddr = - ioremap_nocache(reg->addr, reg->size); + ioremap(reg->addr, reg->size); if (!reg->ioaddr) return -ENOMEM; @@ -531,10 +515,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, return -EFAULT; } -static ssize_t vfio_platform_write(void *device_data, const char __user *buf, +static ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos); loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK; @@ -573,9 +558,10 @@ static int vfio_platform_mmap_mmio(struct vfio_platform_region region, req_len, vma->vm_page_prot); } -static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index; index = vma->vm_pgoff >> (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT); @@ -615,8 +601,8 @@ static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma) static const struct vfio_device_ops vfio_platform_ops = { .name = "vfio-platform", - .open = vfio_platform_open, - .release = vfio_platform_release, + .open_device = vfio_platform_open_device, + .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, .read = vfio_platform_read, .write = vfio_platform_write, @@ -659,15 +645,14 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct iommu_group *group; int ret; - if (!vdev) - return -EINVAL; + vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops); ret = vfio_platform_acpi_probe(vdev, dev); if 
(ret) ret = vfio_platform_of_probe(vdev, dev); if (ret) - return ret; + goto out_uninit; vdev->device = dev; @@ -675,7 +660,7 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, if (ret && vdev->reset_required) { dev_err(dev, "No reset function found for device %s\n", vdev->name); - return ret; + goto out_uninit; } group = vfio_iommu_group_get(dev); @@ -685,36 +670,33 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, goto put_reset; } - ret = vfio_add_group_dev(dev, &vfio_platform_ops, vdev); + ret = vfio_register_group_dev(&vdev->vdev); if (ret) goto put_iommu; mutex_init(&vdev->igate); - pm_runtime_enable(vdev->device); + pm_runtime_enable(dev); return 0; put_iommu: vfio_iommu_group_put(group, dev); put_reset: vfio_platform_put_reset(vdev); +out_uninit: + vfio_uninit_group_dev(&vdev->vdev); return ret; } EXPORT_SYMBOL_GPL(vfio_platform_probe_common); -struct vfio_platform_device *vfio_platform_remove_common(struct device *dev) +void vfio_platform_remove_common(struct vfio_platform_device *vdev) { - struct vfio_platform_device *vdev; + vfio_unregister_group_dev(&vdev->vdev); - vdev = vfio_del_group_dev(dev); - - if (vdev) { - pm_runtime_disable(vdev->device); - vfio_platform_put_reset(vdev); - vfio_iommu_group_put(dev->iommu_group, dev); - } - - return vdev; + pm_runtime_disable(vdev->device); + vfio_platform_put_reset(vdev); + vfio_uninit_group_dev(&vdev->vdev); + vfio_iommu_group_put(vdev->vdev.dev->iommu_group, vdev->vdev.dev); } EXPORT_SYMBOL_GPL(vfio_platform_remove_common); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 289089910643ac964216fc3e01dce389a9453168..520d2a8e8375b277655fa8d3eb3dfb96bdb2a084 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -9,6 +9,7 @@ #include #include +#include #define VFIO_PLATFORM_OFFSET_SHIFT 40 #define VFIO_PLATFORM_OFFSET_MASK (((u64)(1) << VFIO_PLATFORM_OFFSET_SHIFT) - 1) @@ -42,13 +43,12 @@ struct vfio_platform_region { }; struct vfio_platform_device { + struct vfio_device vdev; struct vfio_platform_region *regions; u32 num_regions; struct vfio_platform_irq *irqs; u32 num_irqs; - int refcnt; struct mutex igate; - struct module *parent_module; const char *compat; const char *acpihid; struct module *reset_module; @@ -80,8 +80,7 @@ struct vfio_platform_reset_node { extern int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct device *dev); -extern struct vfio_platform_device *vfio_platform_remove_common - (struct device *dev); +void vfio_platform_remove_common(struct vfio_platform_device *vdev); extern int vfio_platform_irq_init(struct vfio_platform_device *vdev); extern void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev); diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 388597930b648feafb04fb3a81a1b3ea43c0a749..3c034fe14ccb036b3f9127ce79d74470df5efde8 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -46,7 +46,6 @@ static struct vfio { struct mutex group_lock; struct cdev group_cdev; dev_t group_devt; - wait_queue_head_t release_q; } vfio; struct vfio_iommu_driver { @@ -85,19 +84,11 @@ struct vfio_group { atomic_t opened; wait_queue_head_t container_q; bool noiommu; + unsigned int dev_counter; struct kvm *kvm; struct blocking_notifier_head notifier; }; -struct vfio_device { - struct kref kref; - struct device *dev; - const struct vfio_device_ops *ops; - struct vfio_group *group; - struct list_head group_next; - void *device_data; 
-}; - #ifdef CONFIG_VFIO_NOIOMMU static bool noiommu __read_mostly; module_param_named(enable_unsafe_noiommu_mode, @@ -105,11 +96,84 @@ module_param_named(enable_unsafe_noiommu_mode, MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); #endif +static DEFINE_XARRAY(vfio_device_set_xa); + +int vfio_assign_device_set(struct vfio_device *device, void *set_id) +{ + unsigned long idx = (unsigned long)set_id; + struct vfio_device_set *new_dev_set; + struct vfio_device_set *dev_set; + + if (WARN_ON(!set_id)) + return -EINVAL; + + /* + * Atomically acquire a singleton object in the xarray for this set_id + */ + xa_lock(&vfio_device_set_xa); + dev_set = xa_load(&vfio_device_set_xa, idx); + if (dev_set) + goto found_get_ref; + xa_unlock(&vfio_device_set_xa); + + new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL); + if (!new_dev_set) + return -ENOMEM; + mutex_init(&new_dev_set->lock); + INIT_LIST_HEAD(&new_dev_set->device_list); + new_dev_set->set_id = set_id; + + xa_lock(&vfio_device_set_xa); + dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set, + GFP_KERNEL); + if (!dev_set) { + dev_set = new_dev_set; + goto found_get_ref; + } + + kfree(new_dev_set); + if (xa_is_err(dev_set)) { + xa_unlock(&vfio_device_set_xa); + return xa_err(dev_set); + } + +found_get_ref: + dev_set->device_count++; + xa_unlock(&vfio_device_set_xa); + mutex_lock(&dev_set->lock); + device->dev_set = dev_set; + list_add_tail(&device->dev_set_list, &dev_set->device_list); + mutex_unlock(&dev_set->lock); + return 0; +} +EXPORT_SYMBOL_GPL(vfio_assign_device_set); + +static void vfio_release_device_set(struct vfio_device *device) +{ + struct vfio_device_set *dev_set = device->dev_set; + + if (!dev_set) + return; + + mutex_lock(&dev_set->lock); + list_del(&device->dev_set_list); + mutex_unlock(&dev_set->lock); + + xa_lock(&vfio_device_set_xa); + if (!--dev_set->device_count) { + __xa_erase(&vfio_device_set_xa, + (unsigned long)dev_set->set_id); + mutex_destroy(&dev_set->lock); + kfree(dev_set); + } + xa_unlock(&vfio_device_set_xa); +} + /* * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe * and remove functions, any use cases other than acquiring the first - * reference for the purpose of calling vfio_add_group_dev() or removing - * that symmetric reference after vfio_del_group_dev() should use the raw + * reference for the purpose of calling vfio_register_group_dev() or removing + * that symmetric reference after vfio_unregister_group_dev() should use the raw * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() * removes the device from the dummy group and cannot be nested. 
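
For illustration, the vfio_assign_device_set() hunk above resolves a set_id to a single shared vfio_device_set by combining xa_load() with __xa_cmpxchg(), so two drivers racing on the same set_id end up sharing one object. A minimal, hedged sketch of that lookup-or-create pattern follows; struct my_set and my_set_get() are hypothetical names, not part of this patch.

/*
 * Sketch only: the xa_load()/__xa_cmpxchg() find-or-create pattern used by
 * vfio_assign_device_set().  "struct my_set" and my_set_get() are made up.
 */
#include <linux/xarray.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct my_set {
	struct mutex lock;
	unsigned int users;
};

static DEFINE_XARRAY(my_set_xa);

static struct my_set *my_set_get(unsigned long key)
{
	struct my_set *new_set, *set;

	/* Fast path: the singleton for this key may already exist. */
	xa_lock(&my_set_xa);
	set = xa_load(&my_set_xa, key);
	if (set)
		goto found;
	xa_unlock(&my_set_xa);

	new_set = kzalloc(sizeof(*new_set), GFP_KERNEL);
	if (!new_set)
		return NULL;
	mutex_init(&new_set->lock);

	/* Re-check under the lock; only one racing caller installs its copy. */
	xa_lock(&my_set_xa);
	set = __xa_cmpxchg(&my_set_xa, key, NULL, new_set, GFP_KERNEL);
	if (!set) {
		set = new_set;
		goto found;
	}
	kfree(new_set);
	if (xa_is_err(set)) {
		xa_unlock(&my_set_xa);
		return NULL;
	}
found:
	set->users++;
	xa_unlock(&my_set_xa);
	return set;
}
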
*/ @@ -531,65 +595,17 @@ static struct vfio_group *vfio_group_get_from_dev(struct device *dev) /** * Device objects - create, release, get, put, search */ -static -struct vfio_device *vfio_group_create_device(struct vfio_group *group, - struct device *dev, - const struct vfio_device_ops *ops, - void *device_data) -{ - struct vfio_device *device; - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return ERR_PTR(-ENOMEM); - - kref_init(&device->kref); - device->dev = dev; - device->group = group; - device->ops = ops; - device->device_data = device_data; - dev_set_drvdata(dev, device); - - /* No need to get group_lock, caller has group reference */ - vfio_group_get(group); - - mutex_lock(&group->device_lock); - list_add(&device->group_next, &group->device_list); - mutex_unlock(&group->device_lock); - - return device; -} - -static void vfio_device_release(struct kref *kref) -{ - struct vfio_device *device = container_of(kref, - struct vfio_device, kref); - struct vfio_group *group = device->group; - - list_del(&device->group_next); - mutex_unlock(&group->device_lock); - - dev_set_drvdata(device->dev, NULL); - - kfree(device); - - /* vfio_del_group_dev may be waiting for this device */ - wake_up(&vfio.release_q); -} - /* Device reference always implies a group reference */ void vfio_device_put(struct vfio_device *device) { - struct vfio_group *group = device->group; - kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); - vfio_group_put(group); + if (refcount_dec_and_test(&device->refcount)) + complete(&device->comp); } EXPORT_SYMBOL_GPL(vfio_device_put); -static void vfio_device_get(struct vfio_device *device) +static bool vfio_device_try_get(struct vfio_device *device) { - vfio_group_get(device->group); - kref_get(&device->kref); + return refcount_inc_not_zero(&device->refcount); } static struct vfio_device *vfio_group_get_device(struct vfio_group *group, @@ -599,8 +615,7 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, mutex_lock(&group->device_lock); list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev) { - vfio_device_get(device); + if (device->dev == dev && vfio_device_try_get(device)) { mutex_unlock(&group->device_lock); return device; } @@ -624,9 +639,10 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, * that error notification via MSI can be affected for platforms that handle * MSI within the same IOVA space as DMA. 
*/ -static const char * const vfio_driver_whitelist[] = { "pci-stub" }; +static const char * const vfio_driver_allowed[] = { "pci-stub" }; -static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv) +static bool vfio_dev_driver_allowed(struct device *dev, + struct device_driver *drv) { if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); @@ -635,8 +651,8 @@ static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv) return true; } - return match_string(vfio_driver_whitelist, - ARRAY_SIZE(vfio_driver_whitelist), + return match_string(vfio_driver_allowed, + ARRAY_SIZE(vfio_driver_allowed), drv->name) >= 0; } @@ -645,7 +661,7 @@ static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv) * one of the following states: * - driver-less * - bound to a vfio driver - * - bound to a whitelisted driver + * - bound to an otherwise allowed driver * - a PCI interconnect device * * We use two methods to determine whether a device is bound to a vfio @@ -671,7 +687,7 @@ static int vfio_dev_viable(struct device *dev, void *data) } mutex_unlock(&group->unbound_lock); - if (!ret || !drv || vfio_dev_whitelisted(dev, drv)) + if (!ret || !drv || vfio_dev_driver_allowed(dev, drv)) return 0; device = vfio_group_get_device(group, dev); @@ -797,14 +813,35 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, /** * VFIO driver API */ -int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, void *device_data) +void vfio_init_group_dev(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops) +{ + init_completion(&device->comp); + device->dev = dev; + device->ops = ops; +} +EXPORT_SYMBOL_GPL(vfio_init_group_dev); + +void vfio_uninit_group_dev(struct vfio_device *device) { + vfio_release_device_set(device); +} +EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); + +int vfio_register_group_dev(struct vfio_device *device) +{ + struct vfio_device *existing_device; struct iommu_group *iommu_group; struct vfio_group *group; - struct vfio_device *device; - iommu_group = iommu_group_get(dev); + /* + * If the driver doesn't specify a set then the device is added to a + * singleton set just for itself. + */ + if (!device->dev_set) + vfio_assign_device_set(device, device); + + iommu_group = iommu_group_get(device->dev); if (!iommu_group) return -EINVAL; @@ -823,31 +860,29 @@ int vfio_add_group_dev(struct device *dev, iommu_group_put(iommu_group); } - device = vfio_group_get_device(group, dev); - if (device) { - dev_WARN(dev, "Device already exists on group %d\n", + existing_device = vfio_group_get_device(group, device->dev); + if (existing_device) { + dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(iommu_group)); - vfio_device_put(device); + vfio_device_put(existing_device); vfio_group_put(group); return -EBUSY; } - device = vfio_group_create_device(group, dev, ops, device_data); - if (IS_ERR(device)) { - vfio_group_put(group); - return PTR_ERR(device); - } + /* Our reference on group is moved to the device */ + device->group = group; - /* - * Drop all but the vfio_device reference. The vfio_device holds - * a reference to the vfio_group, which holds a reference to the - * iommu_group. 
- */ - vfio_group_put(group); + /* Refcounting can't start until the driver calls register */ + refcount_set(&device->refcount, 1); + + mutex_lock(&group->device_lock); + list_add(&device->group_next, &group->device_list); + group->dev_counter++; + mutex_unlock(&group->device_lock); return 0; } -EXPORT_SYMBOL_GPL(vfio_add_group_dev); +EXPORT_SYMBOL_GPL(vfio_register_group_dev); /** * Get a reference to the vfio_device for a device. Even if the @@ -875,13 +910,24 @@ EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, char *buf) { - struct vfio_device *it, *device = NULL; + struct vfio_device *it, *device = ERR_PTR(-ENODEV); mutex_lock(&group->device_lock); list_for_each_entry(it, &group->device_list, group_next) { - if (!strcmp(dev_name(it->dev), buf)) { + int ret; + + if (it->ops->match) { + ret = it->ops->match(it, buf); + if (ret < 0) { + device = ERR_PTR(ret); + break; + } + } else { + ret = !strcmp(dev_name(it->dev), buf); + } + + if (ret && vfio_device_try_get(it)) { device = it; - vfio_device_get(device); break; } } @@ -890,33 +936,16 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, return device; } -/* - * Caller must hold a reference to the vfio_device - */ -void *vfio_device_data(struct vfio_device *device) -{ - return device->device_data; -} -EXPORT_SYMBOL_GPL(vfio_device_data); - /* * Decrement the device reference count and wait for the device to be * removed. Open file descriptors for the device... */ -void *vfio_del_group_dev(struct device *dev) +void vfio_unregister_group_dev(struct vfio_device *device) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct vfio_device *device = dev_get_drvdata(dev); struct vfio_group *group = device->group; - void *device_data = device->device_data; struct vfio_unbound_dev *unbound; unsigned int i = 0; bool interrupted = false; - - /* - * The group exists so long as we have a device reference. Get - * a group reference and use it to scan for the device going away. - */ - vfio_group_get(group); + long rc; /* * When the device is removed from the group, the group suddenly @@ -929,7 +958,7 @@ void *vfio_del_group_dev(struct device *dev) */ unbound = kzalloc(sizeof(*unbound), GFP_KERNEL); if (unbound) { - unbound->dev = dev; + unbound->dev = device->dev; mutex_lock(&group->unbound_lock); list_add(&unbound->unbound_next, &group->unbound_list); mutex_unlock(&group->unbound_lock); @@ -937,44 +966,33 @@ void *vfio_del_group_dev(struct device *dev) WARN_ON(!unbound); vfio_device_put(device); - - /* - * If the device is still present in the group after the above - * 'put', then it is in use and we need to request it from the - * bus driver. The driver may in turn need to request the - * device from the user. We send the request on an arbitrary - * interval with counter to allow the driver to take escalating - * measures to release the device if it has the ability to do so. 
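
For a bus driver, the registration flow introduced above amounts to embedding a struct vfio_device, initializing it, registering it, and tearing it down in reverse order. The hedged sketch below shows that sequence with placeholder names (my_device, my_ops, my_probe, my_remove); the vfio_platform conversion elsewhere in this series is the real in-tree example.

/*
 * Hedged sketch of the embed-and-register flow expected by
 * vfio_register_group_dev().  my_device and my_ops are hypothetical.
 */
#include <linux/vfio.h>
#include <linux/device.h>
#include <linux/slab.h>

struct my_device {
	struct vfio_device vdev;	/* must be embedded, not pointed to */
	/* driver-private state follows */
};

static const struct vfio_device_ops my_ops;	/* open_device/close_device/... */

static int my_probe(struct device *dev)
{
	struct my_device *mdev;
	int ret;

	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
	if (!mdev)
		return -ENOMEM;

	vfio_init_group_dev(&mdev->vdev, dev, &my_ops);
	ret = vfio_register_group_dev(&mdev->vdev);
	if (ret) {
		vfio_uninit_group_dev(&mdev->vdev);
		kfree(mdev);
		return ret;
	}
	dev_set_drvdata(dev, mdev);
	return 0;
}

static void my_remove(struct device *dev)
{
	struct my_device *mdev = dev_get_drvdata(dev);

	/* Blocks until all open file descriptors are released. */
	vfio_unregister_group_dev(&mdev->vdev);
	vfio_uninit_group_dev(&mdev->vdev);
	kfree(mdev);
}
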
- */ - add_wait_queue(&vfio.release_q, &wait); - - do { - device = vfio_group_get_device(group, dev); - if (!device) - break; - + rc = try_wait_for_completion(&device->comp); + while (rc <= 0) { if (device->ops->request) - device->ops->request(device_data, i++); - - vfio_device_put(device); + device->ops->request(device, i++); if (interrupted) { - wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10); + rc = wait_for_completion_timeout(&device->comp, + HZ * 10); } else { - wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10); - if (signal_pending(current)) { + rc = wait_for_completion_interruptible_timeout( + &device->comp, HZ * 10); + if (rc < 0) { interrupted = true; - dev_warn(dev, + dev_warn(device->dev, "Device is currently in use, task" " \"%s\" (%d) " "blocked until device is released", current->comm, task_pid_nr(current)); } } + } - } while (1); + mutex_lock(&group->device_lock); + list_del(&device->group_next); + group->dev_counter--; + mutex_unlock(&group->device_lock); - remove_wait_queue(&vfio.release_q, &wait); /* * In order to support multiple devices per group, devices can be * plucked from the group while other devices in the group are still @@ -992,11 +1010,10 @@ void *vfio_del_group_dev(struct device *dev) if (list_empty(&group->device_list)) wait_event(group->container_q, !group->container); + /* Matches the get in vfio_register_group_dev() */ vfio_group_put(group); - - return device_data; } -EXPORT_SYMBOL_GPL(vfio_del_group_dev); +EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); /** * VFIO base fd, /dev/vfio/vfio @@ -1184,15 +1201,6 @@ static long vfio_fops_unl_ioctl(struct file *filep, return ret; } -#ifdef CONFIG_COMPAT -static long vfio_fops_compat_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - arg = (unsigned long)compat_ptr(arg); - return vfio_fops_unl_ioctl(filep, cmd, arg); -} -#endif /* CONFIG_COMPAT */ - static int vfio_fops_open(struct inode *inode, struct file *filep) { struct vfio_container *container; @@ -1213,6 +1221,11 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) static int vfio_fops_release(struct inode *inode, struct file *filep) { struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver = container->iommu_driver; + + if (driver && driver->ops->notify) + driver->ops->notify(container->iommu_data, + VFIO_IOMMU_CONTAINER_CLOSE); filep->private_data = NULL; @@ -1275,9 +1288,7 @@ static const struct file_operations vfio_fops = { .read = vfio_fops_read, .write = vfio_fops_write, .unlocked_ioctl = vfio_fops_unl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vfio_fops_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .mmap = vfio_fops_mmap, }; @@ -1431,7 +1442,8 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) { struct vfio_device *device; struct file *filep; - int ret; + int fdno; + int ret = 0; if (0 == atomic_read(&group->container_users) || !group->container->iommu_driver || !vfio_group_viable(group)) @@ -1441,34 +1453,36 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) return -EPERM; device = vfio_device_get_from_name(group, buf); - if (!device) - return -ENODEV; + if (IS_ERR(device)) + return PTR_ERR(device); - ret = device->ops->open(device->device_data); - if (ret) { - vfio_device_put(device); - return ret; + if (!try_module_get(device->dev->driver->owner)) { + ret = -ENODEV; + goto err_device_put; } + mutex_lock(&device->dev_set->lock); + device->open_count++; + if (device->open_count == 1 && device->ops->open_device) { + 
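
The unregister path above replaces the old kref plus wait-queue dance with a refcount_t paired with a struct completion: the last vfio_device_put() completes device->comp, and vfio_unregister_group_dev() loops on that completion while periodically poking the driver's ->request() callback. A stripped-down sketch of the same teardown pattern, with hypothetical obj_* names, is shown below.

/*
 * Illustrative sketch of the refcount_t + completion teardown used by
 * vfio_device_put()/vfio_unregister_group_dev().  Names are hypothetical.
 */
#include <linux/refcount.h>
#include <linux/completion.h>
#include <linux/jiffies.h>

struct obj {
	refcount_t refcount;
	struct completion comp;
};

static void obj_init(struct obj *o)
{
	/* The registration path owns the initial reference. */
	refcount_set(&o->refcount, 1);
	init_completion(&o->comp);
}

static bool obj_try_get(struct obj *o)
{
	/* Fails once teardown has dropped the last reference. */
	return refcount_inc_not_zero(&o->refcount);
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->refcount))
		complete(&o->comp);
}

static void obj_unregister(struct obj *o)
{
	long rc;

	/* Drop the registration reference, then wait for the last user. */
	obj_put(o);
	rc = try_wait_for_completion(&o->comp);
	while (rc <= 0) {
		/* A real driver would nudge users here (->request()). */
		rc = wait_for_completion_timeout(&o->comp, HZ * 10);
	}
}
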
ret = device->ops->open_device(device); + if (ret) + goto err_undo_count; + } + mutex_unlock(&device->dev_set->lock); + /* * We can't use anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls */ - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) { - device->ops->release(device->device_data); - vfio_device_put(device); - return ret; - } + fdno = ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + goto err_close_device; filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, device, O_RDWR); if (IS_ERR(filep)) { - put_unused_fd(ret); ret = PTR_ERR(filep); - device->ops->release(device->device_data); - vfio_device_put(device); - return ret; + goto err_fd; } /* @@ -1480,12 +1494,25 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) atomic_inc(&group->container_users); - fd_install(ret, filep); + fd_install(fdno, filep); if (group->noiommu) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); - + return fdno; + +err_fd: + put_unused_fd(fdno); +err_close_device: + mutex_lock(&device->dev_set->lock); + if (device->open_count == 1 && device->ops->close_device) + device->ops->close_device(device); +err_undo_count: + device->open_count--; + mutex_unlock(&device->dev_set->lock); + module_put(device->dev->driver->owner); +err_device_put: + vfio_device_put(device); return ret; } @@ -1556,15 +1583,6 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, return ret; } -#ifdef CONFIG_COMPAT -static long vfio_group_fops_compat_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - arg = (unsigned long)compat_ptr(arg); - return vfio_group_fops_unl_ioctl(filep, cmd, arg); -} -#endif /* CONFIG_COMPAT */ - static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group; @@ -1620,9 +1638,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) static const struct file_operations vfio_group_fops = { .owner = THIS_MODULE, .unlocked_ioctl = vfio_group_fops_unl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vfio_group_fops_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = vfio_group_fops_open, .release = vfio_group_fops_release, }; @@ -1634,7 +1650,12 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - device->ops->release(device->device_data); + mutex_lock(&device->dev_set->lock); + if (!--device->open_count && device->ops->close_device) + device->ops->close_device(device); + mutex_unlock(&device->dev_set->lock); + + module_put(device->dev->driver->owner); vfio_group_try_dissolve_container(device->group); @@ -1651,7 +1672,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, if (unlikely(!device->ops->ioctl)) return -EINVAL; - return device->ops->ioctl(device->device_data, cmd, arg); + return device->ops->ioctl(device, cmd, arg); } static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, @@ -1662,7 +1683,7 @@ static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, if (unlikely(!device->ops->read)) return -EINVAL; - return device->ops->read(device->device_data, buf, count, ppos); + return device->ops->read(device, buf, count, ppos); } static ssize_t vfio_device_fops_write(struct file *filep, @@ -1674,7 +1695,7 @@ static ssize_t vfio_device_fops_write(struct file *filep, if (unlikely(!device->ops->write)) return -EINVAL; - return 
device->ops->write(device->device_data, buf, count, ppos); + return device->ops->write(device, buf, count, ppos); } static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) @@ -1684,17 +1705,8 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) if (unlikely(!device->ops->mmap)) return -EINVAL; - return device->ops->mmap(device->device_data, vma); -} - -#ifdef CONFIG_COMPAT -static long vfio_device_fops_compat_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - arg = (unsigned long)compat_ptr(arg); - return vfio_device_fops_unl_ioctl(filep, cmd, arg); + return device->ops->mmap(device, vma); } -#endif /* CONFIG_COMPAT */ static const struct file_operations vfio_device_fops = { .owner = THIS_MODULE, @@ -1702,9 +1714,7 @@ static const struct file_operations vfio_device_fops = { .read = vfio_device_fops_read, .write = vfio_device_fops_write, .unlocked_ioctl = vfio_device_fops_unl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vfio_device_fops_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .mmap = vfio_device_fops_mmap, }; @@ -1753,6 +1763,44 @@ struct vfio_group *vfio_group_get_external_user(struct file *filep) } EXPORT_SYMBOL_GPL(vfio_group_get_external_user); +/** + * External user API, exported by symbols to be linked dynamically. + * The external user passes in a device pointer + * to verify that: + * - A VFIO group is assiciated with the device; + * - IOMMU is set for the group. + * If both checks passed, vfio_group_get_external_user_from_dev() + * increments the container user counter to prevent the VFIO group + * from disposal before external user exits and returns the pointer + * to the VFIO group. + * + * When the external user finishes using the VFIO group, it calls + * vfio_group_put_external_user() to release the VFIO group and + * decrement the container user counter. + * + * @dev [in] : device + * Return error PTR or pointer to VFIO group. + */ + +struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev) +{ + struct vfio_group *group; + int ret; + + group = vfio_group_get_from_dev(dev); + if (!group) + return ERR_PTR(-ENODEV); + + ret = vfio_group_add_container_user(group); + if (ret) { + vfio_group_put(group); + return ERR_PTR(ret); + } + + return group; +} +EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev); + void vfio_group_put_external_user(struct vfio_group *group) { vfio_group_try_dissolve_container(group); @@ -1928,6 +1976,11 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, if (!group) return -ENODEV; + if (group->dev_counter > 1) { + ret = -EINVAL; + goto err_pin_pages; + } + ret = vfio_group_add_container_user(group); if (ret) goto err_pin_pages; @@ -1935,7 +1988,8 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, user_pfn, + ret = driver->ops->pin_pages(container->iommu_data, + group->iommu_group, user_pfn, npage, prot, phys_pfn); else ret = -ENOTTY; @@ -1994,6 +2048,149 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) } EXPORT_SYMBOL(vfio_unpin_pages); +/* + * Pin a set of guest IOVA PFNs and return their associated host PFNs for a + * VFIO group. 
+ * + * The caller needs to call vfio_group_get_external_user() or + * vfio_group_get_external_user_from_dev() prior to calling this interface, + * so as to prevent the VFIO group from disposal in the middle of the call. + * But it can keep the reference to the VFIO group for several calls into + * this interface. + * After finishing using of the VFIO group, the caller needs to release the + * VFIO group by calling vfio_group_put_external_user(). + * + * @group [in] : VFIO group + * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be pinned. + * @npage [in] : count of elements in user_iova_pfn array. + * This count should not be greater + * VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @phys_pfn [out] : array of host PFNs + * Return error or number of pages pinned. + */ +int vfio_group_pin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage, + int prot, unsigned long *phys_pfn) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + if (!group || !user_iova_pfn || !phys_pfn || !npage) + return -EINVAL; + + if (group->dev_counter > 1) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->pin_pages)) + ret = driver->ops->pin_pages(container->iommu_data, + group->iommu_group, user_iova_pfn, + npage, prot, phys_pfn); + else + ret = -ENOTTY; + + return ret; +} +EXPORT_SYMBOL(vfio_group_pin_pages); + +/* + * Unpin a set of guest IOVA PFNs for a VFIO group. + * + * The caller needs to call vfio_group_get_external_user() or + * vfio_group_get_external_user_from_dev() prior to calling this interface, + * so as to prevent the VFIO group from disposal in the middle of the call. + * But it can keep the reference to the VFIO group for several calls into + * this interface. + * After finishing using of the VFIO group, the caller needs to release the + * VFIO group by calling vfio_group_put_external_user(). + * + * @group [in] : vfio group + * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be unpinned. + * @npage [in] : count of elements in user_iova_pfn array. + * This count should not be greater than + * VFIO_PIN_PAGES_MAX_ENTRIES. + * Return error or number of pages unpinned. + */ +int vfio_group_unpin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + if (!group || !user_iova_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->unpin_pages)) + ret = driver->ops->unpin_pages(container->iommu_data, + user_iova_pfn, npage); + else + ret = -ENOTTY; + + return ret; +} +EXPORT_SYMBOL(vfio_group_unpin_pages); + + +/* + * This interface allows the CPUs to perform some sort of virtual DMA on + * behalf of the device. + * + * CPUs read/write from/into a range of IOVAs pointing to user space memory + * into/from a kernel buffer. + * + * As the read/write of user space memory is conducted via the CPUs and is + * not a real device DMA, it is not necessary to pin the user space memory. + * + * The caller needs to call vfio_group_get_external_user() or + * vfio_group_get_external_user_from_dev() prior to calling this interface, + * so as to prevent the VFIO group from disposal in the middle of the call. 
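
Taken together, the group-based external-user helpers defined above are meant to be called by a vendor driver that already holds a device pointer. A hedged usage sketch follows; demo_check_guest_page() and the guest IOVA are invented for illustration and are not part of the patch.

/*
 * Hypothetical consumer of the group-based external-user API added above.
 * The device pointer and guest IOVA are made up for illustration.
 */
#include <linux/vfio.h>
#include <linux/iommu.h>
#include <linux/mm.h>
#include <linux/err.h>

static int demo_check_guest_page(struct device *dev, dma_addr_t guest_iova)
{
	struct vfio_group *group;
	unsigned long user_pfn = guest_iova >> PAGE_SHIFT;
	unsigned long phys_pfn;
	int ret;

	/* Pin the group so it cannot be disposed of while we use it. */
	group = vfio_group_get_external_user_from_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	ret = vfio_group_pin_pages(group, &user_pfn, 1, IOMMU_READ, &phys_pfn);
	if (ret == 1) {
		/* phys_pfn now names the host page backing guest_iova. */
		vfio_group_unpin_pages(group, &user_pfn, 1);
		ret = 0;
	} else if (ret >= 0) {
		ret = -EFAULT;
	}

	vfio_group_put_external_user(group);
	return ret;
}
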
+ * But it can keep the reference to the VFIO group for several calls into + * this interface. + * After finishing using of the VFIO group, the caller needs to release the + * VFIO group by calling vfio_group_put_external_user(). + * + * @group [in] : VFIO group + * @user_iova [in] : base IOVA of a user space buffer + * @data [in] : pointer to kernel buffer + * @len [in] : kernel buffer length + * @write : indicate read or write + * Return error code on failure or 0 on success. + */ +int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, + void *data, size_t len, bool write) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret = 0; + + if (!group || !data || len <= 0) + return -EINVAL; + + container = group->container; + driver = container->iommu_driver; + + if (likely(driver && driver->ops->dma_rw)) + ret = driver->ops->dma_rw(container->iommu_data, + user_iova, data, len, write); + else + ret = -ENOTTY; + + return ret; +} +EXPORT_SYMBOL(vfio_dma_rw); + static int vfio_register_iommu_notifier(struct vfio_group *group, unsigned long *events, struct notifier_block *nb) @@ -2161,6 +2358,24 @@ int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, } EXPORT_SYMBOL(vfio_unregister_notifier); +struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + + if (!group) + return ERR_PTR(-EINVAL); + + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->group_iommu_domain)) + return driver->ops->group_iommu_domain(container->iommu_data, + group->iommu_group); + + return ERR_PTR(-ENOTTY); +} +EXPORT_SYMBOL_GPL(vfio_group_iommu_domain); + /** * Module/class support */ @@ -2186,7 +2401,6 @@ static int __init vfio_init(void) mutex_init(&vfio.iommu_drivers_lock); INIT_LIST_HEAD(&vfio.group_list); INIT_LIST_HEAD(&vfio.iommu_drivers_list); - init_waitqueue_head(&vfio.release_q); ret = misc_register(&vfio_dev); if (ret) { @@ -2242,6 +2456,7 @@ static void __exit vfio_cleanup(void) class_destroy(vfio.class); vfio.class = NULL; misc_deregister(&vfio_dev); + xa_destroy(&vfio_device_set_xa); } module_init(vfio_init); diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 26cef65b41e7a167cbc500d672e093e851c59504..fe888b5dcc006281133c2df808449da9834fbe5d 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -79,7 +79,7 @@ static long tce_iommu_mm_set(struct tce_container *container) } BUG_ON(!current->mm); container->mm = current->mm; - atomic_inc(&container->mm->mm_count); + mmgrab(container->mm); return 0; } @@ -383,7 +383,7 @@ static void tce_iommu_unuse_page(struct tce_container *container, struct page *page; page = pfn_to_page(hpa >> PAGE_SHIFT); - put_page(page); + unpin_user_page(page); } static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container, @@ -486,7 +486,7 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) struct page *page = NULL; enum dma_data_direction direction = iommu_tce_direction(tce); - if (get_user_pages_fast(tce & PAGE_MASK, 1, + if (pin_user_pages_fast(tce & PAGE_MASK, 1, direction != DMA_TO_DEVICE ? 
FOLL_WRITE : 0, &page) != 1) return -EFAULT; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6b1e8cba1798408ab02abebd66b66bb9fd9289ab..548e2327871628cfe4a47f7afe703caf348f007d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -16,7 +16,7 @@ * IOMMU to support the IOMMU API and have few to no restrictions around * the IOVA range that can be mapped. The Type1 IOMMU is currently * optimized for relatively static mappings of a userspace process with - * userpsace pages pinned into memory. We also assume devices and IOMMU + * userspace pages pinned into memory. We also assume devices and IOMMU * domains are PCI based as the IOMMU API is still centered around a * device/bus interface rather than a group interface. */ @@ -28,9 +28,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -69,8 +71,16 @@ struct vfio_iommu { struct rb_root dma_list; struct blocking_notifier_head notifier; unsigned int dma_avail; + unsigned int vaddr_invalid_count; + uint64_t pgsize_bitmap; + uint64_t num_non_pinned_groups; + wait_queue_head_t vaddr_wait; + struct iommu_nesting_info *nesting_info; bool v2; bool nesting; + bool dirty_page_tracking; + bool container_open; + uint64_t num_non_hwdbm_groups; }; struct vfio_domain { @@ -89,14 +99,26 @@ struct vfio_dma { int prot; /* IOMMU_READ/WRITE */ bool iommu_mapped; bool lock_cap; /* capable(CAP_IPC_LOCK) */ + bool vaddr_invalid; struct task_struct *task; struct rb_root pfn_list; /* Ex-user pinned pfn list */ + unsigned long *bitmap; }; -struct vfio_group { +struct vfio_batch { + struct page **pages; /* for pin_user_pages_remote */ + struct page *fallback_page; /* if pages alloc fails */ + int capacity; /* length of pages array */ + int size; /* of batch currently */ + int offset; /* of next entry in pages */ +}; + +struct vfio_iommu_group { struct iommu_group *iommu_group; struct list_head next; bool mdev_group; /* An mdev group */ + bool pinned_page_dirty_scope; + bool iommu_hwdbm; /* Valid for non-mdev group */ }; struct vfio_iova { @@ -112,7 +134,7 @@ struct vfio_pfn { struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long pfn; /* Host pfn */ - atomic_t ref_count; + unsigned int ref_count; }; struct vfio_regions { @@ -125,8 +147,59 @@ struct vfio_regions { #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ (!list_empty(&iommu->domain_list)) +#define CONTAINER_HAS_DOMAIN(iommu) (((iommu)->external_domain) || \ + (!list_empty(&(iommu)->domain_list))) + +#define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE) + +/* + * Input argument of number of bits to bitmap_set() is unsigned integer, which + * further casts to signed integer for unaligned multi-bit operation, + * __bitmap_set(). + * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte, + * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page + * system. 
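
As a quick check of the sizing arithmetic in the comment above: a bitmap tracking one bit per page needs size/pgsize bits, rounded up to a multiple of 64 bits, so a 1 GiB mapping at 4 KiB pages costs 262144 bits, i.e. 32 KiB, comfortably under the 256 MiB ceiling. The helper below reproduces the same formula as DIRTY_BITMAP_BYTES(); it is an illustrative sketch, not code from this patch.

/*
 * Worked example of the dirty-bitmap sizing above; numbers are illustrative.
 */
#include <linux/kernel.h>
#include <linux/bits.h>
#include <linux/types.h>

static inline size_t demo_dirty_bitmap_bytes(u64 mapping_size, size_t pgsize)
{
	u64 npages = mapping_size / pgsize;

	/* Same formula as DIRTY_BITMAP_BYTES(): round bits up to u64 words. */
	return ALIGN(npages, BITS_PER_TYPE(u64)) / BITS_PER_BYTE;
}
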
+ */ +#define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) +#define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) + +#define WAITED 1 + +struct domain_capsule { + struct vfio_iommu_group *group; + struct iommu_domain *domain; + void *data; + /* set if @data contains a user pointer*/ + bool user; + u64 flags; +}; + +/* iommu->lock must be held */ +static int vfio_prepare_nesting_domain_capsule(struct vfio_iommu *iommu, + struct domain_capsule *dc) +{ + struct vfio_domain *domain; + struct vfio_iommu_group *group; + + if (!iommu->nesting_info) + return -EINVAL; + + domain = list_first_entry(&iommu->domain_list, + struct vfio_domain, next); + group = list_first_entry(&domain->group_list, + struct vfio_iommu_group, next); + dc->group = group; + dc->domain = domain->domain; + dc->user = true; + return 0; +} + static int put_pfn(unsigned long pfn, int prot); +static struct vfio_iommu_group* +vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group); + /* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU @@ -151,6 +224,31 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, return NULL; } +static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu, + dma_addr_t start, u64 size) +{ + struct rb_node *res = NULL; + struct rb_node *node = iommu->dma_list.rb_node; + struct vfio_dma *dma_res = NULL; + + while (node) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); + + if (start < dma->iova + dma->size) { + res = node; + dma_res = dma; + if (start >= dma->iova) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + if (res && size && dma_res->iova >= start + size) + res = NULL; + return res; +} + static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new) { struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; @@ -175,6 +273,93 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) rb_erase(&old->node, &iommu->dma_list); } + +static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize) +{ + uint64_t npages = dma->size / pgsize; + + if (npages > DIRTY_BITMAP_PAGES_MAX) + return -EINVAL; + + /* + * Allocate extra 64 bits that are used to calculate shift required for + * bitmap_shift_left() to manipulate and club unaligned number of pages + * in adjacent vfio_dma ranges. 
+ */ + dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64), + GFP_KERNEL); + if (!dma->bitmap) + return -ENOMEM; + + return 0; +} + +static void vfio_dma_bitmap_free(struct vfio_dma *dma) +{ + kfree(dma->bitmap); + dma->bitmap = NULL; +} + +static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize) +{ + struct rb_node *p; + unsigned long pgshift = __ffs(pgsize); + + for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) { + struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node); + + bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1); + } +} + +static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu) +{ + struct rb_node *n; + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + bitmap_set(dma->bitmap, 0, dma->size >> pgshift); + } +} + +static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize) +{ + struct rb_node *n; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + int ret; + + ret = vfio_dma_bitmap_alloc(dma, pgsize); + if (ret) { + struct rb_node *p; + + for (p = rb_prev(n); p; p = rb_prev(p)) { + struct vfio_dma *dma = rb_entry(n, + struct vfio_dma, node); + + vfio_dma_bitmap_free(dma); + } + return ret; + } + vfio_dma_populate_bitmap(dma, pgsize); + } + return 0; +} + +static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu) +{ + struct rb_node *n; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + vfio_dma_bitmap_free(dma); + } +} + /* * Helper Functions for host iova-pfn list */ @@ -233,7 +418,7 @@ static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova, vpfn->iova = iova; vpfn->pfn = pfn; - atomic_set(&vpfn->ref_count, 1); + vpfn->ref_count = 1; vfio_link_pfn(dma, vpfn); return 0; } @@ -251,7 +436,7 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); if (vpfn) - atomic_inc(&vpfn->ref_count); + vpfn->ref_count++; return vpfn; } @@ -259,7 +444,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) { int ret = 0; - if (atomic_dec_and_test(&vpfn->ref_count)) { + vpfn->ref_count--; + if (!vpfn->ref_count) { ret = put_pfn(vpfn->pfn, dma->prot); vfio_remove_from_pfn_list(dma, vpfn); } @@ -295,31 +481,13 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) * Some mappings aren't backed by a struct page, for example an mmap'd * MMIO range for our own or another device. These use a different * pfn conversion and shouldn't be tracked as locked pages. + * For compound pages, any driver that sets the reserved bit in head + * page needs to set the reserved bit in all subpages to be safe. */ static bool is_invalid_reserved_pfn(unsigned long pfn) { - if (pfn_valid(pfn)) { - bool reserved; - struct page *tail = pfn_to_page(pfn); - struct page *head = compound_head(tail); - reserved = !!(PageReserved(head)); - if (head != tail) { - /* - * "head" is not a dangling pointer - * (compound_head takes care of that) - * but the hugepage may have been split - * from under us (and we may not hold a - * reference count on the head page so it can - * be reused before we run PageReferenced), so - * we've to check PageTail before returning - * what we just read. 
- */ - smp_rmb(); - if (PageTail(tail)) - return reserved; - } - return PageReserved(tail); - } + if (pfn_valid(pfn)) + return PageReserved(pfn_to_page(pfn)); return true; } @@ -328,9 +496,8 @@ static int put_pfn(unsigned long pfn, int prot) { if (!is_invalid_reserved_pfn(pfn)) { struct page *page = pfn_to_page(pfn); - if (prot & IOMMU_WRITE) - SetPageDirty(page); - put_page(page); + + put_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE); return 1; } return 0; @@ -372,12 +539,54 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, return ret; } -static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, - int prot, unsigned long *pfn) +#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *)) + +static void vfio_batch_init(struct vfio_batch *batch) +{ + batch->size = 0; + batch->offset = 0; + + if (unlikely(disable_hugepages)) + goto fallback; + + batch->pages = (struct page **) __get_free_page(GFP_KERNEL); + if (!batch->pages) + goto fallback; + + batch->capacity = VFIO_BATCH_MAX_CAPACITY; + return; + +fallback: + batch->pages = &batch->fallback_page; + batch->capacity = 1; +} + +static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) +{ + while (batch->size) { + unsigned long pfn = page_to_pfn(batch->pages[batch->offset]); + + put_pfn(pfn, dma->prot); + batch->offset++; + batch->size--; + } +} + +static void vfio_batch_fini(struct vfio_batch *batch) +{ + if (batch->capacity == VFIO_BATCH_MAX_CAPACITY) + free_page((unsigned long)batch->pages); +} + +/* + * Returns the positive number of pfns successfully obtained or a negative + * error code. + */ +static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, + long npages, int prot, unsigned long *pfn, + struct page **pages) { - struct page *page[1]; struct vm_area_struct *vma; - struct vm_area_struct *vmas[1]; unsigned int flags = 0; int ret; @@ -385,51 +594,90 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, flags |= FOLL_WRITE; down_read(&mm->mmap_sem); - if (mm == current->mm) { - ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, - vmas); - } else { - ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, - vmas, NULL); - /* - * The lifetime of a vaddr_get_pfn() page pin is - * userspace-controlled. In the fs-dax case this could - * lead to indefinite stalls in filesystem operations. - * Disallow attempts to pin fs-dax pages via this - * interface. 
- */ - if (ret > 0 && vma_is_fsdax(vmas[0])) { - ret = -EOPNOTSUPP; - put_page(page[0]); - } - } - up_read(&mm->mmap_sem); - - if (ret == 1) { - *pfn = page_to_pfn(page[0]); - return 0; + ret = get_user_pages_remote(NULL, mm, vaddr, npages, flags | FOLL_LONGTERM, + pages, NULL, NULL); + if (ret > 0) { + *pfn = page_to_pfn(pages[0]); + goto done; } - down_read(&mm->mmap_sem); - vaddr = untagged_addr(vaddr); retry: - vma = find_vma_intersection(mm, vaddr, vaddr + 1); + vma = vma_lookup(mm, vaddr); if (vma && vma->vm_flags & VM_PFNMAP) { ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE); if (ret == -EAGAIN) goto retry; - if (!ret && !is_invalid_reserved_pfn(*pfn)) - ret = -EFAULT; + if (!ret) { + if (is_invalid_reserved_pfn(*pfn)) + ret = 1; + else + ret = -EFAULT; + } } - +done: up_read(&mm->mmap_sem); return ret; } +static int vfio_wait(struct vfio_iommu *iommu) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); + mutex_unlock(&iommu->lock); + schedule(); + mutex_lock(&iommu->lock); + finish_wait(&iommu->vaddr_wait, &wait); + if (kthread_should_stop() || !iommu->container_open || + fatal_signal_pending(current)) { + return -EFAULT; + } + return WAITED; +} + +/* + * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped + * if the task waits, but is re-locked on return. Return result in *dma_p. + * Return 0 on success with no waiting, WAITED on success if waited, and -errno + * on error. + */ +static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start, + size_t size, struct vfio_dma **dma_p) +{ + int ret = 0; + + do { + *dma_p = vfio_find_dma(iommu, start, size); + if (!*dma_p) + return -EINVAL; + else if (!(*dma_p)->vaddr_invalid) + return ret; + else + ret = vfio_wait(iommu); + } while (ret == WAITED); + + return ret; +} + +/* + * Wait for all vaddr in the dma_list to become valid. iommu lock is dropped + * if the task waits, but is re-locked on return. Return 0 on success with no + * waiting, WAITED on success if waited, and -errno on error. + */ +static int vfio_wait_all_valid(struct vfio_iommu *iommu) +{ + int ret = 0; + + while (iommu->vaddr_invalid_count && ret >= 0) + ret = vfio_wait(iommu); + + return ret; +} + /* * Attempt to pin pages. We really don't want to track all the pfns and * the iommu can only map chunks of consecutive pfns anyway, so get the @@ -437,76 +685,108 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, */ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, long npage, unsigned long *pfn_base, - unsigned long limit) + unsigned long limit, struct vfio_batch *batch) { - unsigned long pfn = 0; + unsigned long pfn; + struct mm_struct *mm = current->mm; long ret, pinned = 0, lock_acct = 0; bool rsvd; dma_addr_t iova = vaddr - dma->vaddr + dma->iova; /* This code path is only user initiated */ - if (!current->mm) + if (!mm) return -ENODEV; - ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base); - if (ret) - return ret; - - pinned++; - rsvd = is_invalid_reserved_pfn(*pfn_base); - - /* - * Reserved pages aren't counted against the user, externally pinned - * pages are already counted against the user. - */ - if (!rsvd && !vfio_find_vpfn(dma, iova)) { - if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) { - put_pfn(*pfn_base, dma->prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, - limit << PAGE_SHIFT); - return -ENOMEM; - } - lock_acct++; + if (batch->size) { + /* Leftover pages in batch from an earlier call. 
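+ * A batch survives across calls: vfio_pin_pages_remote() stops early
+ * at a pfn discontinuity and the next call resumes from batch->offset.
+ * With 4 KiB pages and 8-byte pointers a full batch holds
+ * PAGE_SIZE / sizeof(struct page *) = 512 entries, i.e. up to 2 MiB
+ * of pinned memory per refill (typical values, architecture
+ * dependent).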
*/ + *pfn_base = page_to_pfn(batch->pages[batch->offset]); + pfn = *pfn_base; + rsvd = is_invalid_reserved_pfn(*pfn_base); + } else { + *pfn_base = 0; } - if (unlikely(disable_hugepages)) - goto out; + while (npage) { + if (!batch->size) { + /* Empty batch, so refill it. */ + long req_pages = min_t(long, npage, batch->capacity); - /* Lock all the consecutive pages from pfn_base */ - for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; - pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { - ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); - if (ret) - break; + ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot, + &pfn, batch->pages); + if (ret < 0) + goto unpin_out; - if (pfn != *pfn_base + pinned || - rsvd != is_invalid_reserved_pfn(pfn)) { - put_pfn(pfn, dma->prot); - break; + batch->size = ret; + batch->offset = 0; + + if (!*pfn_base) { + *pfn_base = pfn; + rsvd = is_invalid_reserved_pfn(*pfn_base); + } } - if (!rsvd && !vfio_find_vpfn(dma, iova)) { - if (!dma->lock_cap && - current->mm->locked_vm + lock_acct + 1 > limit) { - put_pfn(pfn, dma->prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, limit << PAGE_SHIFT); - ret = -ENOMEM; - goto unpin_out; + /* + * pfn is preset for the first iteration of this inner loop and + * updated at the end to handle a VM_PFNMAP pfn. In that case, + * batch->pages isn't valid (there's no struct page), so allow + * batch->pages to be touched only when there's more than one + * pfn to check, which guarantees the pfns are from a + * !VM_PFNMAP vma. + */ + while (true) { + if (pfn != *pfn_base + pinned || + rsvd != is_invalid_reserved_pfn(pfn)) + goto out; + + /* + * Reserved pages aren't counted against the user, + * externally pinned pages are already counted against + * the user. + */ + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!dma->lock_cap && + mm->locked_vm + lock_acct + 1 > limit) { + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); + ret = -ENOMEM; + goto unpin_out; + } + lock_acct++; } - lock_acct++; + + pinned++; + npage--; + vaddr += PAGE_SIZE; + iova += PAGE_SIZE; + batch->offset++; + batch->size--; + + if (!batch->size) + break; + + pfn = page_to_pfn(batch->pages[batch->offset]); } + + if (unlikely(disable_hugepages)) + break; } out: ret = vfio_lock_acct(dma, lock_acct, false); unpin_out: - if (ret) { - if (!rsvd) { + if (batch->size == 1 && !batch->offset) { + /* May be a VM_PFNMAP pfn, which the batch can't remember. 
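+ * (For a VM_PFNMAP vma, vaddr_get_pfns() fills *pfn and reports a
+ * size of 1 without touching batch->pages, so such an entry cannot
+ * be released later through vfio_batch_unpin().)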
*/ + put_pfn(pfn, dma->prot); + batch->size = 0; + } + + if (ret < 0) { + if (pinned && !rsvd) { for (pfn = *pfn_base ; pinned ; pfn++, pinned--) put_pfn(pfn, dma->prot); } + vfio_batch_unpin(batch, dma); return ret; } @@ -538,6 +818,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, unsigned long *pfn_base, bool do_accounting) { + struct page *pages[1]; struct mm_struct *mm; int ret; @@ -545,8 +826,13 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, if (!mm) return -ENODEV; - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); - if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { + ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); + if (ret != 1) + goto out; + + ret = 0; + + if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { ret = vfio_lock_acct(dma, 1, true); if (ret) { put_pfn(*pfn_base, dma->prot); @@ -558,6 +844,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, } } +out: mmput(mm); return ret; } @@ -580,15 +867,18 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, } static int vfio_iommu_type1_pin_pages(void *iommu_data, + struct iommu_group *iommu_group, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn) { struct vfio_iommu *iommu = iommu_data; + struct vfio_iommu_group *group; int i, j, ret; unsigned long remote_vaddr; struct vfio_dma *dma; bool do_accounting; + dma_addr_t iova; if (!iommu || !user_pfn || !phys_pfn) return -EINVAL; @@ -599,6 +889,22 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, mutex_lock(&iommu->lock); + /* + * Wait for all necessary vaddr's to be valid so they can be used in + * the main loop without dropping the lock, to avoid racing vs unmap. + */ +again: + if (iommu->vaddr_invalid_count) { + for (i = 0; i < npage; i++) { + iova = user_pfn[i] << PAGE_SHIFT; + ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma); + if (ret < 0) + goto pin_done; + if (ret == WAITED) + goto again; + } + } + /* Fail if notifier list is empty */ if (!iommu->notifier.head) { ret = -EINVAL; @@ -607,13 +913,12 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, /* * If iommu capable domain exist in the container then all pages are - * already pinned and accounted. Accouting should be done if there is no + * already pinned and accounted. Accounting should be done if there is no * iommu capable domain in the container. 
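 * For example, a container holding only an mdev (external) domain
 * charges each page pinned here to the task's locked_vm, while a
 * container with an IOMMU-backed domain already charged the pages
 * when the IOVA range was mapped.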
*/ do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); for (i = 0; i < npage; i++) { - dma_addr_t iova; struct vfio_pfn *vpfn; iova = user_pfn[i] << PAGE_SHIFT; @@ -646,9 +951,26 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, vfio_lock_acct(dma, -1, true); goto pin_unwind; } - } + if (iommu->dirty_page_tracking) { + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + + /* + * Bitmap populated with the smallest supported page + * size + */ + bitmap_set(dma->bitmap, + (iova - dma->iova) >> pgshift, 1); + } + } ret = i; + + group = vfio_iommu_find_iommu_group(iommu, iommu_group); + if (!group->pinned_page_dirty_scope) { + group->pinned_page_dirty_scope = true; + iommu->num_non_pinned_groups--; + } + goto pin_done; pin_unwind: @@ -674,7 +996,7 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data, bool do_accounting; int i; - if (!iommu || !user_pfn) + if (!iommu || !user_pfn || npage <= 0) return -EINVAL; /* Supported for v2 version only */ @@ -691,13 +1013,13 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data, iova = user_pfn[i] << PAGE_SHIFT; dma = vfio_find_dma(iommu, iova, PAGE_SIZE); if (!dma) - goto unpin_exit; + break; + vfio_unpin_page_external(dma, iova, do_accounting); } -unpin_exit: mutex_unlock(&iommu->lock); - return i > npage ? npage : (i > 0 ? i : -EINVAL); + return i > 0 ? i : -EINVAL; } static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, @@ -707,7 +1029,7 @@ static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, long unlocked = 0; struct vfio_regions *entry, *next; - iommu_tlb_sync(domain->domain, iotlb_gather); + iommu_iotlb_sync(domain->domain, iotlb_gather); list_for_each_entry_safe(entry, next, regions, list) { unlocked += vfio_unpin_pages_remote(dma, @@ -881,19 +1203,23 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) vfio_unmap_unpin(iommu, dma, true); vfio_unlink_dma(iommu, dma); put_task_struct(dma->task); + vfio_dma_bitmap_free(dma); + if (dma->vaddr_invalid) { + iommu->vaddr_invalid_count--; + wake_up_all(&iommu->vaddr_wait); + } kfree(dma); iommu->dma_avail++; } -static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) +static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) { struct vfio_domain *domain; - unsigned long bitmap = ULONG_MAX; - mutex_lock(&iommu->lock); + iommu->pgsize_bitmap = ULONG_MAX; + list_for_each_entry(domain, &iommu->domain_list, next) - bitmap &= domain->domain->pgsize_bitmap; - mutex_unlock(&iommu->lock); + iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap; /* * In case the IOMMU supports page sizes smaller than PAGE_SIZE @@ -903,50 +1229,234 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) * granularity while iommu driver can use the sub-PAGE_SIZE size * to map the buffer. 
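 * For example, merging domains that support 4 KiB, 2 MiB and 1 GiB
 * pages leaves pgsize_bitmap == 0x40201000 with a 4 KiB minimum,
 * while a (hypothetical) domain that also offered 1 KiB pages would
 * have those low bits replaced by PAGE_SIZE (illustrative values).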
*/ - if (bitmap & ~PAGE_MASK) { - bitmap &= PAGE_MASK; - bitmap |= PAGE_SIZE; + if (iommu->pgsize_bitmap & ~PAGE_MASK) { + iommu->pgsize_bitmap &= PAGE_MASK; + iommu->pgsize_bitmap |= PAGE_SIZE; } +} + +static struct device *vfio_get_iommu_device(struct vfio_iommu_group *group, + struct device *dev); + +static int vfio_dev_enable_feature(struct device *dev, void *data) +{ + struct domain_capsule *dc = data; + enum iommu_dev_features *feat = dc->data; + struct device *iommu_device; - return bitmap; + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + if (iommu_dev_feature_enabled(iommu_device, *feat)) + return 0; + + return iommu_dev_enable_feature(iommu_device, *feat); } -static int vfio_dma_do_unmap(struct vfio_iommu *iommu, - struct vfio_iommu_type1_dma_unmap *unmap) +static bool vfio_group_supports_hwdbm(struct vfio_iommu_group *group) { - uint64_t mask; - struct vfio_dma *dma, *dma_last = NULL; - size_t unmapped = 0; - int ret = 0, retries = 0; + enum iommu_dev_features feat = IOMMU_DEV_FEAT_HWDBM; + struct domain_capsule dc = { .group = group, .data = &feat, }; - mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; + if (iommu_group_for_each_dev(group->iommu_group, &dc, + vfio_dev_enable_feature)) + return false; - if (unmap->iova & mask) - return -EINVAL; - if (!unmap->size || unmap->size & mask) - return -EINVAL; - if (unmap->iova + unmap->size - 1 < unmap->iova || - unmap->size > SIZE_MAX) - return -EINVAL; + return true; +} - WARN_ON(mask & PAGE_MASK); -again: - mutex_lock(&iommu->lock); +static int vfio_iommu_dirty_log_clear(struct vfio_iommu *iommu, + dma_addr_t start_iova, size_t size, + unsigned long *bitmap_buffer, + dma_addr_t base_iova, size_t pgsize) +{ + struct vfio_domain *d; + unsigned long pgshift = __ffs(pgsize); + int ret; - /* - * vfio-iommu-type1 (v1) - User mappings were coalesced together to - * avoid tracking individual mappings. This means that the granularity - * of the original mapping was lost and the user was allowed to attempt - * to unmap any range. Depending on the contiguousness of physical - * memory and page sizes supported by the IOMMU, arbitrary unmaps may - * or may not have worked. We only guaranteed unmap granularity - * matching the original mapping; even though it was untracked here, - * the original mappings are reflected in IOMMU mappings. This - * resulted in a couple unusual behaviors. First, if a range is not - * able to be unmapped, ex. a set of 4k pages that was mapped as a - * 2M hugepage into the IOMMU, the unmap ioctl returns success but with - * a zero sized unmap. Also, if an unmap request overlaps the first - * address of a hugepage, the IOMMU will unmap the entire hugepage. 
+ list_for_each_entry(d, &iommu->domain_list, next) { + ret = iommu_clear_dirty_log(d->domain, start_iova, size, + bitmap_buffer, base_iova, pgshift); + if (ret) + return ret; + } + + return 0; +} + +static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, + struct vfio_dma *dma, dma_addr_t base_iova, + size_t pgsize) +{ + unsigned long pgshift = __ffs(pgsize); + unsigned long nbits = dma->size >> pgshift; + unsigned long bit_offset = (dma->iova - base_iova) >> pgshift; + unsigned long copy_offset = bit_offset / BITS_PER_LONG; + unsigned long shift = bit_offset % BITS_PER_LONG; + unsigned long leftover; + + if (!iommu->num_non_pinned_groups || !dma->iommu_mapped) + goto bitmap_done; + + /* try to get dirty log from IOMMU */ + if (!iommu->num_non_hwdbm_groups) { + struct vfio_domain *d; + + list_for_each_entry(d, &iommu->domain_list, next) { + if (iommu_sync_dirty_log(d->domain, dma->iova, dma->size, + dma->bitmap, dma->iova, pgshift)) + return -EFAULT; + } + goto bitmap_done; + } + + /* + * mark all pages dirty if any IOMMU capable device is not able + * to report dirty pages and all pages are pinned and mapped. + */ + bitmap_set(dma->bitmap, 0, nbits); + +bitmap_done: + if (shift) { + bitmap_shift_left(dma->bitmap, dma->bitmap, shift, + nbits + shift); + + if (copy_from_user(&leftover, + (void __user *)(bitmap + copy_offset), + sizeof(leftover))) + return -EFAULT; + + bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift); + } + + if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap, + DIRTY_BITMAP_BYTES(nbits + shift))) + return -EFAULT; + + return 0; +} + +static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, + dma_addr_t iova, size_t size, size_t pgsize) +{ + struct vfio_dma *dma; + struct rb_node *n; + unsigned long pgshift = __ffs(pgsize); + int ret; + + /* + * GET_BITMAP request must fully cover vfio_dma mappings. Multiple + * vfio_dma mappings may be clubbed by specifying large ranges, but + * there must not be any previous mappings bisected by the range. + * An error will be returned if these conditions are not met. 
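+ * For example, with mappings [0x0, 0x10000) and [0x10000, 0x30000),
+ * a request covering [0x0, 0x30000) is valid, while one covering
+ * [0x8000, 0x18000) bisects both mappings and fails with -EINVAL
+ * (illustrative addresses).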
+ */ + dma = vfio_find_dma(iommu, iova, 1); + if (dma && dma->iova != iova) + return -EINVAL; + + dma = vfio_find_dma(iommu, iova + size - 1, 0); + if (dma && dma->iova + dma->size != iova + size) + return -EINVAL; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + if (dma->iova < iova) + continue; + + if (dma->iova > iova + size - 1) + break; + + ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize); + if (ret) + return ret; + + /* Clear iommu dirty log to re-enable dirty log tracking */ + if (iommu->num_non_pinned_groups && + dma->iommu_mapped && !iommu->num_non_hwdbm_groups) { + ret = vfio_iommu_dirty_log_clear(iommu, dma->iova, + dma->size, dma->bitmap, dma->iova, + pgsize); + if (ret) { + pr_warn("dma dirty log clear failed!\n"); + return ret; + } + } + /* + * Re-populate bitmap to include all pinned pages which are + * considered as dirty but exclude pages which are unpinned and + * pages which are marked dirty by vfio_dma_rw() + */ + bitmap_clear(dma->bitmap, 0, dma->size >> pgshift); + vfio_dma_populate_bitmap(dma, pgsize); + + } + return 0; +} + +static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size) +{ + if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) || + (bitmap_size < DIRTY_BITMAP_BYTES(npages))) + return -EINVAL; + + return 0; +} + +static int vfio_dma_do_unmap(struct vfio_iommu *iommu, + struct vfio_iommu_type1_dma_unmap *unmap, + struct vfio_bitmap *bitmap) +{ + struct vfio_dma *dma, *dma_last = NULL; + size_t unmapped = 0, pgsize; + int ret = -EINVAL, retries = 0; + unsigned long pgshift; + dma_addr_t iova = unmap->iova; + u64 size = unmap->size; + bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL; + bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR; + struct rb_node *n, *first_n; + + mutex_lock(&iommu->lock); + + pgshift = __ffs(iommu->pgsize_bitmap); + pgsize = (size_t)1 << pgshift; + + if (iova & (pgsize - 1)) + goto unlock; + + if (unmap_all) { + if (iova || size) + goto unlock; + size = U64_MAX; + } else if (!size || size & (pgsize - 1) || + iova + size - 1 < iova || size > SIZE_MAX) { + goto unlock; + } + + /* When dirty tracking is enabled, allow only min supported pgsize */ + if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && + (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) { + goto unlock; + } + + WARN_ON((pgsize - 1) & PAGE_MASK); +again: + /* + * vfio-iommu-type1 (v1) - User mappings were coalesced together to + * avoid tracking individual mappings. This means that the granularity + * of the original mapping was lost and the user was allowed to attempt + * to unmap any range. Depending on the contiguousness of physical + * memory and page sizes supported by the IOMMU, arbitrary unmaps may + * or may not have worked. We only guaranteed unmap granularity + * matching the original mapping; even though it was untracked here, + * the original mappings are reflected in IOMMU mappings. This + * resulted in a couple unusual behaviors. First, if a range is not + * able to be unmapped, ex. a set of 4k pages that was mapped as a + * 2M hugepage into the IOMMU, the unmap ioctl returns success but with + * a zero sized unmap. Also, if an unmap request overlaps the first + * address of a hugepage, the IOMMU will unmap the entire hugepage. * This also returns success and the returned unmap size reflects the * actual size unmapped. 
* @@ -964,21 +1474,25 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, * will only return success and a size of zero if there were no * mappings within the range. */ - if (iommu->v2) { - dma = vfio_find_dma(iommu, unmap->iova, 1); - if (dma && dma->iova != unmap->iova) { - ret = -EINVAL; + if (iommu->v2 && !unmap_all) { + dma = vfio_find_dma(iommu, iova, 1); + if (dma && dma->iova != iova) goto unlock; - } - dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0); - if (dma && dma->iova + dma->size != unmap->iova + unmap->size) { - ret = -EINVAL; + + dma = vfio_find_dma(iommu, iova + size - 1, 0); + if (dma && dma->iova + dma->size != iova + size) goto unlock; - } } - while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { - if (!iommu->v2 && unmap->iova > dma->iova) + ret = 0; + n = first_n = vfio_find_dma_first_node(iommu, iova, size); + + while (n) { + dma = rb_entry(n, struct vfio_dma, node); + if (dma->iova >= iova + size) + break; + + if (!iommu->v2 && iova > dma->iova) break; /* * Task with same address space who mapped this iova range is @@ -987,6 +1501,27 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (dma->task->mm != current->mm) break; + if (invalidate_vaddr) { + if (dma->vaddr_invalid) { + struct rb_node *last_n = n; + + for (n = first_n; n != last_n; n = rb_next(n)) { + dma = rb_entry(n, + struct vfio_dma, node); + dma->vaddr_invalid = false; + iommu->vaddr_invalid_count--; + } + ret = -EINVAL; + unmapped = 0; + break; + } + dma->vaddr_invalid = true; + iommu->vaddr_invalid_count++; + unmapped += dma->size; + n = rb_next(n); + continue; + } + if (!RB_EMPTY_ROOT(&dma->pfn_list)) { struct vfio_iommu_type1_dma_unmap nb_unmap; @@ -1010,9 +1545,19 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, blocking_notifier_call_chain(&iommu->notifier, VFIO_IOMMU_NOTIFY_DMA_UNMAP, &nb_unmap); + mutex_lock(&iommu->lock); goto again; } + + if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + ret = update_user_bitmap(bitmap->data, iommu, dma, + iova, pgsize); + if (ret) + break; + } + unmapped += dma->size; + n = rb_next(n); vfio_remove_dma(iommu, dma); } @@ -1043,8 +1588,10 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, return 0; unwind: - list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) + list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) { iommu_unmap(d->domain, iova, npage << PAGE_SHIFT); + cond_resched(); + } return ret; } @@ -1054,15 +1601,19 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, { dma_addr_t iova = dma->iova; unsigned long vaddr = dma->vaddr; + struct vfio_batch batch; size_t size = map_size; long npage; unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; int ret = 0; + vfio_batch_init(&batch); + while (size) { /* Pin a contiguous chunk of memory */ npage = vfio_pin_pages_remote(dma, vaddr + dma->size, - size >> PAGE_SHIFT, &pfn, limit); + size >> PAGE_SHIFT, &pfn, limit, + &batch); if (npage <= 0) { WARN_ON(!npage); ret = (int)npage; @@ -1075,6 +1626,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, if (ret) { vfio_unpin_pages_remote(dma, iova + dma->size, pfn, npage, true); + vfio_batch_unpin(&batch, dma); break; } @@ -1082,6 +1634,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, dma->size += npage << PAGE_SHIFT; } + vfio_batch_fini(&batch); dma->iommu_mapped = true; if (ret) @@ -1114,37 +1667,59 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu 
*iommu, static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { + bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR; dma_addr_t iova = map->iova; unsigned long vaddr = map->vaddr; size_t size = map->size; int ret = 0, prot = 0; - uint64_t mask; + size_t pgsize; struct vfio_dma *dma; /* Verify that none of our __u64 fields overflow */ if (map->size != size || map->vaddr != vaddr || map->iova != iova) return -EINVAL; - mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; - - WARN_ON(mask & PAGE_MASK); - /* READ/WRITE from device perspective */ if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) prot |= IOMMU_WRITE; if (map->flags & VFIO_DMA_MAP_FLAG_READ) prot |= IOMMU_READ; - if (!prot || !size || (size | iova | vaddr) & mask) - return -EINVAL; - - /* Don't allow IOVA or virtual address wrap */ - if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) + if ((prot && set_vaddr) || (!prot && !set_vaddr)) return -EINVAL; mutex_lock(&iommu->lock); - if (vfio_find_dma(iommu, iova, size)) { + pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); + + WARN_ON((pgsize - 1) & PAGE_MASK); + + if (!size || (size | iova | vaddr) & (pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + + /* Don't allow IOVA or virtual address wrap */ + if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) { + ret = -EINVAL; + goto out_unlock; + } + + dma = vfio_find_dma(iommu, iova, size); + if (set_vaddr) { + if (!dma) { + ret = -ENOENT; + } else if (!dma->vaddr_invalid || dma->iova != iova || + dma->size != size) { + ret = -EINVAL; + } else { + dma->vaddr = vaddr; + dma->vaddr_invalid = false; + iommu->vaddr_invalid_count--; + wake_up_all(&iommu->vaddr_wait); + } + goto out_unlock; + } else if (dma) { ret = -EEXIST; goto out_unlock; } @@ -1210,6 +1785,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, else ret = vfio_pin_map_dma(iommu, dma, size); + if (!ret && iommu->dirty_page_tracking) { + ret = vfio_dma_bitmap_alloc(dma, pgsize); + if (ret) + vfio_remove_dma(iommu, dma); + } + out_unlock: mutex_unlock(&iommu->lock); return ret; @@ -1230,16 +1811,23 @@ static int vfio_bus_type(struct device *dev, void *data) static int vfio_iommu_replay(struct vfio_iommu *iommu, struct vfio_domain *domain) { + struct vfio_batch batch; struct vfio_domain *d = NULL; struct rb_node *n; unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; int ret; + ret = vfio_wait_all_valid(iommu); + if (ret < 0) + return ret; + /* Arbitrarily pick the first domain in the list for lookups */ if (!list_empty(&iommu->domain_list)) d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); + vfio_batch_init(&batch); + n = rb_first(&iommu->dma_list); for (; n; n = rb_next(n)) { @@ -1287,7 +1875,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, npage = vfio_pin_pages_remote(dma, vaddr, n >> PAGE_SHIFT, - &pfn, limit); + &pfn, limit, + &batch); if (npage <= 0) { WARN_ON(!npage); ret = (int)npage; @@ -1301,11 +1890,13 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, ret = iommu_map(domain->domain, iova, phys, size, dma->prot | domain->prot); if (ret) { - if (!dma->iommu_mapped) + if (!dma->iommu_mapped) { vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT, size >> PAGE_SHIFT, true); + vfio_batch_unpin(&batch, dma); + } goto unwind; } @@ -1320,6 +1911,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, dma->iommu_mapped = true; } + vfio_batch_fini(&batch); return 0; unwind: @@ -1360,6 +1952,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, } } + 
vfio_batch_fini(&batch); return ret; } @@ -1396,10 +1989,10 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain) __free_pages(pages, order); } -static struct vfio_group *find_iommu_group(struct vfio_domain *domain, - struct iommu_group *iommu_group) +static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain, + struct iommu_group *iommu_group) { - struct vfio_group *g; + struct vfio_iommu_group *g; list_for_each_entry(g, &domain->group_list, next) { if (g->iommu_group == iommu_group) @@ -1409,6 +2002,25 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain, return NULL; } +static struct vfio_iommu_group* +vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, + struct iommu_group *iommu_group) +{ + struct vfio_domain *domain; + struct vfio_iommu_group *group = NULL; + + list_for_each_entry(domain, &iommu->domain_list, next) { + group = find_iommu_group(domain, iommu_group); + if (group) + return group; + } + + if (iommu->external_domain) + group = find_iommu_group(iommu->external_domain, iommu_group); + + return group; +} + static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, phys_addr_t *base) { @@ -1435,44 +2047,40 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, return ret; } -static struct device *vfio_mdev_get_iommu_device(struct device *dev) -{ - struct device *(*fn)(struct device *dev); - struct device *iommu_device; - - fn = symbol_get(mdev_get_iommu_device); - if (fn) { - iommu_device = fn(dev); - symbol_put(mdev_get_iommu_device); - - return iommu_device; - } - - return NULL; -} - static int vfio_mdev_attach_domain(struct device *dev, void *data) { - struct iommu_domain *domain = data; + struct mdev_device *mdev = to_mdev_device(dev); + struct iommu_domain *domain; + struct device *iommu_device; + int ret = -ENODEV; + + /* Only a single domain is allowed to attach to an mdev.
*/ + domain = mdev_get_iommu_domain(mdev); + if (domain) + return -EINVAL; + domain = data; - iommu_device = vfio_mdev_get_iommu_device(dev); + iommu_device = mdev_get_iommu_device(mdev); if (iommu_device) { if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) - return iommu_aux_attach_device(domain, iommu_device); + ret = iommu_aux_attach_device(domain, iommu_device); else - return iommu_attach_device(domain, iommu_device); + ret = iommu_attach_device(domain, iommu_device); } - return -EINVAL; + if (!ret) + mdev_set_iommu_domain(mdev, domain); + + return ret; } static int vfio_mdev_detach_domain(struct device *dev, void *data) { + struct mdev_device *mdev = to_mdev_device(dev); struct iommu_domain *domain = data; struct device *iommu_device; - iommu_device = vfio_mdev_get_iommu_device(dev); + iommu_device = mdev_get_iommu_device(mdev); if (iommu_device) { if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) iommu_aux_detach_device(domain, iommu_device); @@ -1480,11 +2088,13 @@ static int vfio_mdev_detach_domain(struct device *dev, void *data) iommu_detach_device(domain, iommu_device); } + mdev_set_iommu_domain(mdev, NULL); + return 0; } static int vfio_iommu_attach_group(struct vfio_domain *domain, - struct vfio_group *group) + struct vfio_iommu_group *group) { if (group->mdev_group) return iommu_group_for_each_dev(group->iommu_group, @@ -1495,7 +2105,7 @@ static int vfio_iommu_attach_group(struct vfio_domain *domain, } static void vfio_iommu_detach_group(struct vfio_domain *domain, - struct vfio_group *group) + struct vfio_iommu_group *group) { if (group->mdev_group) iommu_group_for_each_dev(group->iommu_group, domain->domain, @@ -1520,9 +2130,10 @@ static bool vfio_bus_is_mdev(struct bus_type *bus) static int vfio_mdev_iommu_device(struct device *dev, void *data) { + struct mdev_device *mdev = to_mdev_device(dev); struct device **old = data, *new; - new = vfio_mdev_get_iommu_device(dev); + new = mdev_get_iommu_device(mdev); if (!new || (*old && *old != new)) return -EINVAL; @@ -1679,7 +2290,7 @@ static int vfio_iommu_resv_exclude(struct list_head *iova, continue; /* * Insert a new node if current node overlaps with the - * reserve region to exlude that from valid iova range. + * reserve region to exclude that from valid iova range. * Note that, new node is inserted before the current * node and finally the current node is deleted keeping * the list updated and sorted. 
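The exclusion walk described in the comment above splits a valid-IOVA node around each reserved region. A minimal standalone sketch of that case analysis, using a simplified singly-linked list rather than the driver's vfio_iova handling (illustrative only, with inclusive bounds):

#include <stdlib.h>

struct iova_range {
	unsigned long start, end;	/* inclusive bounds */
	struct iova_range *next;
};

/* Exclude [rstart, rend] from the sorted list at *head. */
static int iova_list_exclude(struct iova_range **head,
			     unsigned long rstart, unsigned long rend)
{
	struct iova_range **pp = head, *n;

	while ((n = *pp)) {
		if (rend < n->start || rstart > n->end) {
			pp = &n->next;		/* no overlap, keep node */
			continue;
		}
		if (rstart > n->start && rend < n->end) {
			/* reserved region in the middle: split in two */
			struct iova_range *left = malloc(sizeof(*left));

			if (!left)
				return -1;
			left->start = n->start;
			left->end = rstart - 1;
			left->next = n;
			*pp = left;
			n->start = rend + 1;
			pp = &n->next;
		} else if (rstart <= n->start && rend >= n->end) {
			*pp = n->next;		/* fully covered: drop */
			free(n);
		} else if (rstart <= n->start) {
			n->start = rend + 1;	/* trim the front */
			pp = &n->next;
		} else {
			n->end = rstart - 1;	/* trim the tail */
			pp = &n->next;
		}
	}
	return 0;
}

The driver's version works on a kernel list_head and allocates with GFP_KERNEL, but the four cases (keep, split, drop, trim) are the same ones the comment describes.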
@@ -1753,34 +2364,39 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu, list_splice_tail(iova_copy, iova); } + +static void vfio_iommu_release_nesting_info(struct vfio_iommu *iommu) +{ + kfree(iommu->nesting_info); + iommu->nesting_info = NULL; +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { struct vfio_iommu *iommu = iommu_data; - struct vfio_group *group; + struct vfio_iommu_group *group; struct vfio_domain *domain, *d; struct bus_type *bus = NULL; int ret; bool resv_msi, msi_remap; phys_addr_t resv_msi_base = 0; - struct iommu_domain_geometry geo; + struct iommu_domain_geometry *geo; LIST_HEAD(iova_copy); LIST_HEAD(group_resv_regions); mutex_lock(&iommu->lock); - list_for_each_entry(d, &iommu->domain_list, next) { - if (find_iommu_group(d, iommu_group)) { - mutex_unlock(&iommu->lock); - return -EINVAL; - } + /* Check for duplicates */ + if (vfio_iommu_find_iommu_group(iommu, iommu_group)) { + mutex_unlock(&iommu->lock); + return -EINVAL; } - if (iommu->external_domain) { - if (find_iommu_group(iommu->external_domain, iommu_group)) { - mutex_unlock(&iommu->lock); - return -EINVAL; - } + /* Nesting type container can include only one group */ + if (iommu->nesting && CONTAINER_HAS_DOMAIN(iommu)) { + mutex_unlock(&iommu->lock); + return -EINVAL; } group = kzalloc(sizeof(*group), GFP_KERNEL); @@ -1809,12 +2425,21 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (!iommu->external_domain) { INIT_LIST_HEAD(&domain->group_list); iommu->external_domain = domain; + vfio_update_pgsize_bitmap(iommu); } else { kfree(domain); } list_add(&group->next, &iommu->external_domain->group_list); + /* + * Non-iommu backed group cannot dirty memory directly, + * it can only use interfaces that provide dirty + * tracking. + * The iommu scope can only be promoted with the + * addition of a dirty tracking group. 
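+ * For example, a container whose only group is mdev-backed keeps
+ * num_non_pinned_groups at zero, so dirty reporting can rely on the
+ * per-page pin bitmap alone; attaching an IOMMU-backed group raises
+ * the count and forces full-dirty reporting until that group proves
+ * pinned-page scope through the pinning interface.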
+ */ + group->pinned_page_dirty_scope = true; mutex_unlock(&iommu->lock); return 0; @@ -1842,11 +2467,35 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_domain; - /* Get aperture info */ - iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo); + /* Nesting cap info is available only after attaching */ + if (iommu->nesting) { + int size = sizeof(struct iommu_nesting_info); + + iommu->nesting_info = kzalloc(size, GFP_KERNEL); + if (!iommu->nesting_info) { + ret = -ENOMEM; + goto out_detach; + } + + /* Now get the nesting info */ + iommu->nesting_info->argsz = size; + ret = iommu_domain_get_attr(domain->domain, + DOMAIN_ATTR_NESTING, + iommu->nesting_info); + if (ret) + goto out_detach; + + /* when @format of nesting_info is 0, fail the attach */ + if (iommu->nesting_info->format == 0) { + ret = -ENOENT; + goto out_detach; + } + } - if (vfio_iommu_aper_conflict(iommu, geo.aperture_start, - geo.aperture_end)) { + /* Get aperture info */ + geo = &domain->domain->geometry; + if (vfio_iommu_aper_conflict(iommu, geo->aperture_start, + geo->aperture_end)) { ret = -EINVAL; goto out_detach; } @@ -1869,8 +2518,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_detach; - ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start, - geo.aperture_end); + ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start, + geo->aperture_end); if (ret) goto out_detach; @@ -1929,20 +2578,35 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (resv_msi) { ret = iommu_get_msi_cookie(domain->domain, resv_msi_base); - if (ret) + if (ret && ret != -ENODEV) goto out_detach; } list_add(&domain->next, &iommu->domain_list); + vfio_update_pgsize_bitmap(iommu); done: /* Delete the old one and insert new iova list */ vfio_iommu_iova_insert_copy(iommu, &iova_copy); + + /* + * An iommu backed group can dirty memory directly and therefore + * demotes the iommu scope until it declares itself dirty tracking + * capable via the page pinning interface. + */ + iommu->num_non_pinned_groups++; + + /* Update the hwdbm status of group and iommu */ + group->iommu_hwdbm = vfio_group_supports_hwdbm(group); + if (!group->iommu_hwdbm) + iommu->num_non_hwdbm_groups++; + mutex_unlock(&iommu->lock); vfio_iommu_resv_free(&group_resv_regions); return 0; out_detach: + vfio_iommu_release_nesting_info(iommu); vfio_iommu_detach_group(domain, group); out_domain: iommu_domain_free(domain->domain); @@ -1995,7 +2659,6 @@ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu, struct list_head *iova_copy) { struct vfio_domain *domain; - struct iommu_domain_geometry geo; struct vfio_iova *node; dma_addr_t start = 0; dma_addr_t end = (dma_addr_t)~0; @@ -2004,12 +2667,12 @@ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu, return; list_for_each_entry(domain, &iommu->domain_list, next) { - iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, - &geo); - if (geo.aperture_start > start) - start = geo.aperture_start; - if (geo.aperture_end < end) - end = geo.aperture_end; + struct iommu_domain_geometry *geo = &domain->domain->geometry; + + if (geo->aperture_start > start) + start = geo->aperture_start; + if (geo->aperture_end < end) + end = geo->aperture_end; } /* Modify aperture limits. 
The new aper is either same or bigger */ @@ -2029,7 +2692,7 @@ static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, struct list_head *iova_copy) { struct vfio_domain *d; - struct vfio_group *g; + struct vfio_iommu_group *g; struct vfio_iova *node; dma_addr_t start, end; LIST_HEAD(resv_regions); @@ -2066,12 +2729,101 @@ static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, return ret; } +static struct device *vfio_get_iommu_device(struct vfio_iommu_group *group, + struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + + if (group->mdev_group) + return mdev_get_iommu_device(mdev); + else + return dev; +} + +static int vfio_dev_bind_gpasid_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *)dc->data; + struct mdev_device *mdev = to_mdev_device(dev); + struct device *iommu_device; + void *iommu_fault_data = NULL; + + pr_debug("%s: arg: 0x%lx\n", __func__, arg); + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + if (iommu_device != dev) + iommu_fault_data = mdev_get_iommu_fault_data(mdev); + + pr_debug("%s: iommu_fault_data: %p\n", __func__, iommu_fault_data); + + return iommu_uapi_sva_bind_gpasid(dc->domain, iommu_device, + (void __user *)arg, + iommu_fault_data); +} + +static int vfio_dev_unbind_gpasid_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + struct device *iommu_device; + + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + /* + * dc->user is a toggle for the unbind operation. When user is + * set, dc->data passes in a __user pointer and + * iommu_uapi_sva_unbind_gpasid() must be used, which copies + * the unbind data from the user buffer. When user is clear, + * dc->data passes in a pasid which is going to + * be unbound; there is no need to copy data from userspace.
+ */ + if (dc->user) { + unsigned long arg = *(unsigned long *)dc->data; + + pr_debug("%s: arg: 0x%lx\n", __func__, arg); + iommu_uapi_sva_unbind_gpasid(dc->domain, iommu_device, + (void __user *)arg); + } else { + ioasid_t pasid = *(ioasid_t *)dc->data; + + iommu_sva_unbind_gpasid(dc->domain, iommu_device, pasid, dc->flags); + } + return 0; +} + +static void vfio_group_unbind_gpasid_fn(ioasid_t pasid, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + + dc->user = false; + dc->data = &pasid; + + iommu_group_for_each_dev(dc->group->iommu_group, + dc, vfio_dev_unbind_gpasid_fn); +} + +static void vfio_group_unbind_default_gpasid(ioasid_t pasid, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + + dc->user = false; + dc->data = &pasid; + dc->flags = IOMMU_SVA_HPASID_DEF; + + iommu_group_for_each_dev(dc->group->iommu_group, + dc, vfio_dev_unbind_gpasid_fn); +} + static void vfio_iommu_type1_detach_group(void *iommu_data, struct iommu_group *iommu_group) { struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain; - struct vfio_group *group; + struct vfio_iommu_group *group; + bool update_dirty_scope = false; + bool update_iommu_hwdbm = false; LIST_HEAD(iova_copy); mutex_lock(&iommu->lock); @@ -2079,6 +2831,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (iommu->external_domain) { group = find_iommu_group(iommu->external_domain, iommu_group); if (group) { + update_dirty_scope = !group->pinned_page_dirty_scope; list_del(&group->next); kfree(group); @@ -2107,7 +2860,35 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (!group) continue; + if (iommu->nesting_info && + iommu->nesting_info->features & + IOMMU_NESTING_FEAT_BIND_PGTBL) { + struct domain_capsule dc = { .group = group, + .domain = domain->domain, + .data = NULL }; + struct ioasid_user *iuser; + + /* + * For devices attached to a nesting type iommu, + * VFIO should unbind the page tables bound to the + * devices in the iommu group before detaching. + */ + iuser = ioasid_user_get_from_task(current); + if (!IS_ERR_OR_NULL(iuser)) { + ioasid_user_for_each_id(iuser, &dc, + vfio_group_unbind_gpasid_fn); + ioasid_user_put(iuser); + } + /* + * Explicitly unbind the gIOVA page table bound to the + * default PASID here. + */ + vfio_group_unbind_default_gpasid(0, &dc); + }
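+ * For example, detaching the last IOMMU-backed group from a
+ * container that still holds an mdev group drops
+ * num_non_pinned_groups back to zero; if dirty tracking is active,
+ * the per-range bitmaps are first marked fully dirty so no page
+ * touched by the departing group is lost.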
+ */ + if (update_dirty_scope) { + iommu->num_non_pinned_groups--; + if (iommu->dirty_page_tracking) + vfio_iommu_populate_bitmap_full(iommu); + } + if (update_iommu_hwdbm) + iommu->num_non_hwdbm_groups--; mutex_unlock(&iommu->lock); } @@ -2156,7 +2951,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) break; case VFIO_TYPE1_NESTING_IOMMU: iommu->nesting = true; - /* fall through */ + fallthrough; case VFIO_TYPE1v2_IOMMU: iommu->v2 = true; break; @@ -2169,15 +2964,17 @@ static void *vfio_iommu_type1_open(unsigned long arg) INIT_LIST_HEAD(&iommu->iova_list); iommu->dma_list = RB_ROOT; iommu->dma_avail = dma_entry_limit; + iommu->container_open = true; mutex_init(&iommu->lock); BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); + init_waitqueue_head(&iommu->vaddr_wait); return iommu; } static void vfio_release_domain(struct vfio_domain *domain, bool external) { - struct vfio_group *group, *group_tmp; + struct vfio_iommu_group *group, *group_tmp; list_for_each_entry_safe(group, group_tmp, &domain->group_list, next) { @@ -2232,6 +3029,25 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) return ret; } +static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, + unsigned long arg) +{ + switch (arg) { + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_NESTING_IOMMU: + case VFIO_UNMAP_ALL: + case VFIO_UPDATE_VADDR: + return 1; + case VFIO_DMA_CC_IOMMU: + if (!iommu) + return 0; + return vfio_domains_have_iommu_cache(iommu); + default: + return 0; + } +} + static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps, struct vfio_iommu_type1_info_cap_iova_range *cap_iovas, size_t size) @@ -2261,8 +3077,6 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, size_t size; int iovas = 0, i = 0, ret; - mutex_lock(&iommu->lock); - list_for_each_entry(iova, &iommu->iova_list, list) iovas++; @@ -2271,17 +3085,14 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, * Return 0 as a container with a single mdev device * will have an empty list */ - ret = 0; - goto out_unlock; + return 0; } - size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges)); + size = struct_size(cap_iovas, iova_ranges, iovas); cap_iovas = kzalloc(size, GFP_KERNEL); - if (!cap_iovas) { - ret = -ENOMEM; - goto out_unlock; - } + if (!cap_iovas) + return -ENOMEM; cap_iovas->nr_iovas = iovas; @@ -2294,8 +3105,6 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size); kfree(cap_iovas); -out_unlock: - mutex_unlock(&iommu->lock); return ret; } @@ -2317,117 +3126,497 @@ static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, return ret; } -static long vfio_iommu_type1_ioctl(void *iommu_data, - unsigned int cmd, unsigned long arg) +static int vfio_iommu_nesting_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap *caps) { - struct vfio_iommu *iommu = iommu_data; + struct vfio_iommu_type1_info_cap_nesting nesting_cap; + size_t size; + + /* when nesting_info is null, no need to go further */ + if (!iommu->nesting_info) + return 0; + + size = offsetof(struct vfio_iommu_type1_info_cap_nesting, info) + + iommu->nesting_info->argsz; + + nesting_cap.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_NESTING; + nesting_cap.header.version = 1; + + memcpy(&nesting_cap.info, iommu->nesting_info, + iommu->nesting_info->argsz); + + return vfio_info_add_capability(caps, &nesting_cap.header, size); +} + +static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap 
*caps) +{ + struct vfio_iommu_type1_info_cap_migration cap_mig; + + cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; + cap_mig.header.version = 1; + + cap_mig.flags = 0; + /* support minimum pgsize */ + cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap); + cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX; + + return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); +} + +static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_info info; unsigned long minsz; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; + int ret; - if (cmd == VFIO_CHECK_EXTENSION) { - switch (arg) { - case VFIO_TYPE1_IOMMU: - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_NESTING_IOMMU: - return 1; - case VFIO_DMA_CC_IOMMU: - if (!iommu) - return 0; - return vfio_domains_have_iommu_cache(iommu); - default: - return 0; - } - } else if (cmd == VFIO_IOMMU_GET_INFO) { - struct vfio_iommu_type1_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; - int ret; + minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); - minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz >= capsz) { + minsz = capsz; + info.cap_offset = 0; /* output, no-recopy necessary */ + } - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; /* output, no-recopy necessary */ - } + mutex_lock(&iommu->lock); + info.flags = VFIO_IOMMU_INFO_PGSIZES; - info.flags = VFIO_IOMMU_INFO_PGSIZES; + info.iova_pgsizes = iommu->pgsize_bitmap; - info.iova_pgsizes = vfio_pgsize_bitmap(iommu); + ret = vfio_iommu_migration_build_caps(iommu, &caps); + if (!ret) ret = vfio_iommu_iova_build_caps(iommu, &caps); - if (!ret) - ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); + if (!ret) + ret = vfio_iommu_nesting_build_caps(iommu, &caps); - if (ret) - return ret; + mutex_unlock(&iommu->lock); - if (caps.size) { - info.flags |= VFIO_IOMMU_INFO_CAPS; + if (!ret) + ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } + if (ret) + return ret; - kfree(caps.buf); + if (caps.size) { + info.flags |= VFIO_IOMMU_INFO_CAPS; + + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); } - return copy_to_user((void __user *)arg, &info, minsz) ? + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? 
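/*
 * A rough userspace sketch of consuming the capability chain built
 * above; info points at a buffer of info->argsz bytes already filled
 * by VFIO_IOMMU_GET_INFO (illustrative, no error handling).
 */
struct vfio_info_cap_header *hdr;
__u32 off = info->cap_offset;

while (off) {
	hdr = (struct vfio_info_cap_header *)((char *)info + off);
	if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION) {
		struct vfio_iommu_type1_info_cap_migration *mig =
			(void *)hdr;
		/* mig->pgsize_bitmap, mig->max_dirty_bitmap_size ... */
	}
	off = hdr->next;	/* offset from the start of info; 0 ends */
}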
-EFAULT : 0; +} - } else if (cmd == VFIO_IOMMU_MAP_DMA) { - struct vfio_iommu_type1_dma_map map; - uint32_t mask = VFIO_DMA_MAP_FLAG_READ | - VFIO_DMA_MAP_FLAG_WRITE; +static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_dma_map map; + unsigned long minsz; + uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE | + VFIO_DMA_MAP_FLAG_VADDR; - minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); - if (copy_from_user(&map, (void __user *)arg, minsz)) + if (copy_from_user(&map, (void __user *)arg, minsz)) + return -EFAULT; + + if (map.argsz < minsz || map.flags & ~mask) + return -EINVAL; + + return vfio_dma_do_map(iommu, &map); +} + +static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_dma_unmap unmap; + struct vfio_bitmap bitmap = { 0 }; + uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP | + VFIO_DMA_UNMAP_FLAG_VADDR | + VFIO_DMA_UNMAP_FLAG_ALL; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); + + if (copy_from_user(&unmap, (void __user *)arg, minsz)) + return -EFAULT; + + if (unmap.argsz < minsz || unmap.flags & ~mask) + return -EINVAL; + + if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && + (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL | + VFIO_DMA_UNMAP_FLAG_VADDR))) + return -EINVAL; + + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + unsigned long pgshift; + + if (unmap.argsz < (minsz + sizeof(bitmap))) + return -EINVAL; + + if (copy_from_user(&bitmap, + (void __user *)(arg + minsz), + sizeof(bitmap))) return -EFAULT; - if (map.argsz < minsz || map.flags & ~mask) + if (!access_ok((void __user *)bitmap.data, bitmap.size)) return -EINVAL; - return vfio_dma_do_map(iommu, &map); + pgshift = __ffs(bitmap.pgsize); + ret = verify_bitmap_size(unmap.size >> pgshift, + bitmap.size); + if (ret) + return ret; + } + + ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &unmap, minsz) ? 
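/*
 * A rough userspace sketch of the FLAG_VADDR sequence handled above,
 * e.g. across a live update that remaps guest memory. The flags come
 * from this series (VFIO_UPDATE_VADDR); container_fd, iova, size and
 * new_vaddr are assumed to exist and error handling is omitted.
 */
struct vfio_iommu_type1_dma_unmap unmap = {
	.argsz = sizeof(unmap),
	.flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL,
};
ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);	/* invalidate vaddrs */

/* ... remap or reopen the memory backing the guest ... */

struct vfio_iommu_type1_dma_map map = {
	.argsz = sizeof(map),
	.flags = VFIO_DMA_MAP_FLAG_VADDR,	/* no READ/WRITE bits here */
	.iova = iova,				/* must match the original range */
	.size = size,
	.vaddr = (__u64)(uintptr_t)new_vaddr,
};
ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);		/* revalidate vaddr */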
+ -EFAULT : 0; +} + +static void vfio_dma_dirty_log_start(struct vfio_iommu *iommu, + struct vfio_dma *dma, + bool start) +{ + struct vfio_domain *d; + + list_for_each_entry(d, &iommu->domain_list, next) { + /* Go through all domain anyway even if we fail */ + iommu_domain_set_hwdbm(d->domain, start, dma->iova, dma->size); + } +} + +static void vfio_iommu_dirty_log_switch(struct vfio_iommu *iommu, bool start) +{ + struct rb_node *n; + + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); + + if (!dma->iommu_mapped) + continue; + + /* Go through all dma range anyway even if we fail */ + vfio_dma_dirty_log_start(iommu, dma, start); + } +} + +static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_dirty_bitmap dirty; + uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START | + VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + unsigned long minsz; + int ret = 0; + + if (!iommu->v2) + return -EACCES; + + minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags); + + if (copy_from_user(&dirty, (void __user *)arg, minsz)) + return -EFAULT; - } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { - struct vfio_iommu_type1_dma_unmap unmap; - long ret; + if (dirty.argsz < minsz || dirty.flags & ~mask) + return -EINVAL; + + /* only one flag should be set at a time */ + if (__ffs(dirty.flags) != __fls(dirty.flags)) + return -EINVAL; + + if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { + size_t pgsize; + + mutex_lock(&iommu->lock); + pgsize = 1 << __ffs(iommu->pgsize_bitmap); + if (!iommu->dirty_page_tracking) { + ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); + if (!ret) { + iommu->dirty_page_tracking = true; + vfio_iommu_dirty_log_switch(iommu, true); + } + } + mutex_unlock(&iommu->lock); + return ret; + } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { + mutex_lock(&iommu->lock); + if (iommu->dirty_page_tracking) { + iommu->dirty_page_tracking = false; + vfio_dma_bitmap_free_all(iommu); + vfio_iommu_dirty_log_switch(iommu, false); + } + mutex_unlock(&iommu->lock); + return 0; + } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { + struct vfio_iommu_type1_dirty_bitmap_get range; + unsigned long pgshift; + size_t data_size = dirty.argsz - minsz; + size_t iommu_pgsize; - minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); + if (!data_size || data_size < sizeof(range)) + return -EINVAL; - if (copy_from_user(&unmap, (void __user *)arg, minsz)) + if (copy_from_user(&range, (void __user *)(arg + minsz), + sizeof(range))) return -EFAULT; - if (unmap.argsz < minsz || unmap.flags) + if (range.iova + range.size < range.iova) + return -EINVAL; + if (!access_ok((void __user *)range.bitmap.data, + range.bitmap.size)) return -EINVAL; - ret = vfio_dma_do_unmap(iommu, &unmap); + pgshift = __ffs(range.bitmap.pgsize); + ret = verify_bitmap_size(range.size >> pgshift, + range.bitmap.size); if (ret) return ret; - return copy_to_user((void __user *)arg, &unmap, minsz) ? 
- -EFAULT : 0; + mutex_lock(&iommu->lock); + + iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); + + /* allow only smallest supported pgsize */ + if (range.bitmap.pgsize != iommu_pgsize) { + ret = -EINVAL; + goto out_unlock; + } + if (range.iova & (iommu_pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + if (!range.size || range.size & (iommu_pgsize - 1)) { + ret = -EINVAL; + goto out_unlock; + } + + if (iommu->dirty_page_tracking) + ret = vfio_iova_dirty_bitmap(range.bitmap.data, + iommu, range.iova, + range.size, + range.bitmap.pgsize); + else + ret = -EINVAL; +out_unlock: + mutex_unlock(&iommu->lock); + + return ret; } - return -ENOTTY; + return -EINVAL; +} + +static long vfio_iommu_handle_pgtbl_op(struct vfio_iommu *iommu, + bool is_bind, unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg, .user = true }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_BIND_PGTBL)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + if (is_bind) + ret = iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_bind_gpasid_fn); + if (ret || !is_bind) + iommu_group_for_each_dev(dc.group->iommu_group, + &dc, vfio_dev_unbind_gpasid_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + +static int vfio_dev_cache_invalidate_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *)dc->data; + struct device *iommu_device; + + pr_debug("%s, - arg: 0x%lx\n", __func__, arg); + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + iommu_uapi_cache_invalidate(dc->domain, iommu_device, + (void __user *)arg); + return 0; +} + +static long vfio_iommu_invalidate_cache(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_CACHE_INVLD)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_cache_invalidate_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + +static int vfio_dev_page_resp_fn(struct device *dev, void *data) +{ + struct domain_capsule *dc = (struct domain_capsule *)data; + unsigned long arg = *(unsigned long *) dc->data; + struct device *iommu_device; + + pr_debug("%s, - arg: 0x%lx\n", __func__, arg); + iommu_device = vfio_get_iommu_device(dc->group, dev); + if (!iommu_device) + return -EINVAL; + + return iommu_page_response(dc->domain, iommu_device, + (void __user *) arg); +} + +static long vfio_iommu_page_response(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct domain_capsule dc = { .data = &arg }; + struct iommu_nesting_info *info; + int ret; + + mutex_lock(&iommu->lock); + info = iommu->nesting_info; + if (!info || !(info->features & IOMMU_NESTING_FEAT_PAGE_RESP)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = vfio_prepare_nesting_domain_capsule(iommu, &dc); + if (ret) + goto out_unlock; + + ret = iommu_group_for_each_dev(dc.group->iommu_group, &dc, + vfio_dev_page_resp_fn); + +out_unlock: + mutex_unlock(&iommu->lock); + return ret; +} + +static long 
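/*
 * A rough userspace sketch of the dirty tracking sequence handled
 * above (container_fd, a mapped iova/size and a u64 bitmap_buf are
 * assumed; error handling omitted). The bitmap must use the minimum
 * IOMMU page size, one bit per page, rounded up to a multiple of
 * 64 bits, or verify_bitmap_size() rejects the request.
 */
struct vfio_iommu_type1_dirty_bitmap start = {
	.argsz = sizeof(start),
	.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
};
ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &start);

char buf[sizeof(struct vfio_iommu_type1_dirty_bitmap) +
	 sizeof(struct vfio_iommu_type1_dirty_bitmap_get)];
struct vfio_iommu_type1_dirty_bitmap *hdr = (void *)buf;
struct vfio_iommu_type1_dirty_bitmap_get *get = (void *)hdr->data;

hdr->argsz = sizeof(buf);
hdr->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
get->iova = iova;
get->size = size;
get->bitmap.pgsize = 4096;				/* min page size */
get->bitmap.size = ((size / 4096 + 63) / 64) * 8;	/* bytes, u64 aligned */
get->bitmap.data = bitmap_buf;
ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, hdr);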
vfio_iommu_type1_nesting_op(struct vfio_iommu *iommu, + unsigned long arg) +{ + struct vfio_iommu_type1_nesting_op hdr; + unsigned int minsz; + int ret; + + minsz = offsetofend(struct vfio_iommu_type1_nesting_op, flags); + + pr_debug("%s, - 1, arg: 0x%lx, minsz: %u\n", __func__, arg, minsz); + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags & ~VFIO_NESTING_OP_MASK) + return -EINVAL; + + pr_debug("%s, - 2\n", __func__); + switch (hdr.flags & VFIO_NESTING_OP_MASK) { + case VFIO_IOMMU_NESTING_OP_BIND_PGTBL: + pr_debug("%s, bind - 1\n", __func__); + ret = vfio_iommu_handle_pgtbl_op(iommu, true, arg + minsz); + pr_debug("%s, bind - 2, ret: %d\n", __func__, ret); + break; + case VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL: + pr_debug("%s, unbind - 1\n", __func__); + ret = vfio_iommu_handle_pgtbl_op(iommu, false, arg + minsz); + pr_debug("%s, unbind - 2, ret: %d\n", __func__, ret); + break; + case VFIO_IOMMU_NESTING_OP_CACHE_INVLD: + pr_debug("%s, cache_inv - 1\n", __func__); + ret = vfio_iommu_invalidate_cache(iommu, arg + minsz); + pr_debug("%s, cache_inv - 2, ret: %d\n", __func__, ret); + break; + case VFIO_IOMMU_NESTING_OP_PAGE_RESP: + ret = vfio_iommu_page_response(iommu, arg + minsz); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static long vfio_iommu_type1_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + struct vfio_iommu *iommu = iommu_data; + + switch (cmd) { + case VFIO_CHECK_EXTENSION: + return vfio_iommu_type1_check_extension(iommu, arg); + case VFIO_IOMMU_GET_INFO: + return vfio_iommu_type1_get_info(iommu, arg); + case VFIO_IOMMU_MAP_DMA: + return vfio_iommu_type1_map_dma(iommu, arg); + case VFIO_IOMMU_UNMAP_DMA: + return vfio_iommu_type1_unmap_dma(iommu, arg); + case VFIO_IOMMU_DIRTY_PAGES: + return vfio_iommu_type1_dirty_pages(iommu, arg); + case VFIO_IOMMU_NESTING_OP: + return vfio_iommu_type1_nesting_op(iommu, arg); + default: + return -ENOTTY; + } } static int vfio_iommu_type1_register_notifier(void *iommu_data, @@ -2454,6 +3643,127 @@ static int vfio_iommu_type1_unregister_notifier(void *iommu_data, return blocking_notifier_chain_unregister(&iommu->notifier, nb); } +static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, + dma_addr_t user_iova, void *data, + size_t count, bool write, + size_t *copied) +{ + struct mm_struct *mm; + unsigned long vaddr; + struct vfio_dma *dma; + bool kthread = current->mm == NULL; + size_t offset; + int ret; + + *copied = 0; + + ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma); + if (ret < 0) + return ret; + + if ((write && !(dma->prot & IOMMU_WRITE)) || + !(dma->prot & IOMMU_READ)) + return -EPERM; + + mm = get_task_mm(dma->task); + + if (!mm) + return -EPERM; + + if (kthread) + use_mm(mm); + else if (current->mm != mm) + goto out; + + offset = user_iova - dma->iova; + + if (count > dma->size - offset) + count = dma->size - offset; + + vaddr = dma->vaddr + offset; + + if (write) { + *copied = copy_to_user((void __user *)vaddr, data, + count) ? 0 : count; + if (*copied && iommu->dirty_page_tracking) { + unsigned long pgshift = __ffs(iommu->pgsize_bitmap); + /* + * Bitmap populated with the smallest supported page + * size + */ + bitmap_set(dma->bitmap, offset >> pgshift, + ((offset + *copied - 1) >> pgshift) - + (offset >> pgshift) + 1); + } + } else + *copied = copy_from_user(data, (void __user *)vaddr, + count) ? 0 : count; + if (kthread) + unuse_mm(mm); +out: + mmput(mm); + return *copied ? 
0 : -EFAULT; +} + +static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, + void *data, size_t count, bool write) +{ + struct vfio_iommu *iommu = iommu_data; + int ret = 0; + size_t done; + + mutex_lock(&iommu->lock); + while (count > 0) { + ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data, + count, write, &done); + if (ret) + break; + + count -= done; + data += done; + user_iova += done; + } + + mutex_unlock(&iommu->lock); + return ret; +} + +static struct iommu_domain * +vfio_iommu_type1_group_iommu_domain(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct iommu_domain *domain = ERR_PTR(-ENODEV); + struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *d; + + if (!iommu || !iommu_group) + return ERR_PTR(-EINVAL); + + mutex_lock(&iommu->lock); + list_for_each_entry(d, &iommu->domain_list, next) { + if (find_iommu_group(d, iommu_group)) { + domain = d->domain; + break; + } + } + mutex_unlock(&iommu->lock); + + return domain; +} + +static void vfio_iommu_type1_notify(void *iommu_data, + enum vfio_iommu_notify_type event) +{ + struct vfio_iommu *iommu = iommu_data; + + if (event != VFIO_IOMMU_CONTAINER_CLOSE) + return; + mutex_lock(&iommu->lock); + iommu->container_open = false; + mutex_unlock(&iommu->lock); + wake_up_all(&iommu->vaddr_wait); +} + static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .name = "vfio-iommu-type1", .owner = THIS_MODULE, @@ -2466,6 +3776,9 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .unpin_pages = vfio_iommu_type1_unpin_pages, .register_notifier = vfio_iommu_type1_register_notifier, .unregister_notifier = vfio_iommu_type1_unregister_notifier, + .dma_rw = vfio_iommu_type1_dma_rw, + .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, + .notify = vfio_iommu_type1_notify, }; static int __init vfio_iommu_type1_init(void) diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 997cb5d0a657cb1cb795ca609704f886c7ffc276..414e98d82b02e561d5423910d0cb572b7c4ea7d3 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -46,6 +46,9 @@ static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void __poll_t flags = key_to_poll(key); if (flags & EPOLLIN) { + u64 cnt; + eventfd_ctx_do_read(virqfd->eventfd, &cnt); + /* An event has been signaled, call function */ if ((!virqfd->handler || virqfd->handler(virqfd->opaque, virqfd->data)) && diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index d6f9f491e69f760ae81edf543dc8b67d4cff439d..1734b15ebbcd7255337085bec76a02614bc2ec93 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1744,14 +1744,6 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, } } -#ifdef CONFIG_COMPAT -static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; @@ -1787,9 +1779,7 @@ static const struct file_operations vhost_net_fops = { .write_iter = vhost_net_chr_write_iter, .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vhost_net_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = vhost_net_open, .llseek = noop_llseek, }; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 98c484149ac7f22b22ed78ec6f4d2e41b9268687..02fc1a5c0ab0b7c727ee3c230d487cd3011de62a 100644 --- 
a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1715,21 +1715,11 @@ vhost_scsi_ioctl(struct file *f, } } -#ifdef CONFIG_COMPAT -static long vhost_scsi_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vhost_scsi_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations vhost_scsi_fops = { .owner = THIS_MODULE, .release = vhost_scsi_release, .unlocked_ioctl = vhost_scsi_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vhost_scsi_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = vhost_scsi_open, .llseek = noop_llseek, }; diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 056308008288c80dc14efbf6c437cbc6940129dd..e37c92d4d7ada2ca4f06b8f91d3f6b2a6b303610 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -304,21 +304,11 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl, } } -#ifdef CONFIG_COMPAT -static long vhost_test_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vhost_test_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations vhost_test_fops = { .owner = THIS_MODULE, .release = vhost_test_release, .unlocked_ioctl = vhost_test_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vhost_test_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .open = vhost_test_open, .llseek = noop_llseek, }; diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f21f5bfbb78dccde89b20ce95fae8d2e119539b7..1f370e5be4489667106b2cdff9f0987022bb0eca 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -814,23 +814,13 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, } } -#ifdef CONFIG_COMPAT -static long vhost_vsock_dev_compat_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - return vhost_vsock_dev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); -} -#endif - static const struct file_operations vhost_vsock_fops = { .owner = THIS_MODULE, .open = vhost_vsock_dev_open, .release = vhost_vsock_dev_release, .llseek = noop_llseek, .unlocked_ioctl = vhost_vsock_dev_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = vhost_vsock_dev_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, }; static struct miscdevice vhost_vsock_misc = { diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index bf76dadbed87fda7d61ef01d4f98aefa8c767077..7e1d00de070bbcf39298946d3b3152976b39c05a 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -965,13 +965,11 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) fb_var_to_videomode(&mode2, &info->var); /* make sure we don't delete the videomode of current var */ ret = fb_mode_is_equal(&mode1, &mode2); - - if (!ret) - fbcon_mode_deleted(info, &mode1); - - if (!ret) - fb_delete_videomode(&mode1, &info->modelist); - + if (!ret) { + ret = fbcon_mode_deleted(info, &mode1); + if (!ret) + fb_delete_videomode(&mode1, &info->modelist); + } return ret ? 
-EINVAL : 0; } @@ -1774,7 +1772,7 @@ int remove_conflicting_pci_framebuffers(struct pci_dev *pdev, int res_id, const int err, idx, bar; bool res_id_found = false; - for (idx = 0, bar = 0; bar < PCI_ROM_RESOURCE; bar++) { + for (idx = 0, bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) continue; idx++; @@ -1784,7 +1782,7 @@ int remove_conflicting_pci_framebuffers(struct pci_dev *pdev, int res_id, const if (!ap) return -ENOMEM; - for (idx = 0, bar = 0; bar < PCI_ROM_RESOURCE; bar++) { + for (idx = 0, bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) continue; ap->ranges[idx].base = pci_resource_start(pdev, bar); diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index e0cbf5b3d217490a1bc425647c532fa288d195a9..1cad8379bc61eab49faddf721ea3f38765008dfe 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -653,7 +653,7 @@ static void efifb_fixup_resources(struct pci_dev *dev) if (!base) return; - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct resource *res = &dev->resource[i]; if (!(res->flags & IORESOURCE_MEM)) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 06346422f743210cd296ee003be4da6c8709d890..43dce90059e5d6efbe24bc1f5f30c2497cdc7280 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -381,7 +381,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && + if (dma_capable(dev, dev_addr, size, true) && !range_straddles_page_boundary(phys, size) && !xen_arch_need_swiotlb(dev, phys, dev_addr) && swiotlb_force != SWIOTLB_FORCE) @@ -392,8 +392,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, - size, size, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; @@ -403,7 +402,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, /* * Ensure that the address returned is DMA'ble */ - if (unlikely(!dma_capable(dev, dev_addr, size))) { + if (unlikely(!dma_capable(dev, dev_addr, size, true))) { swiotlb_tbl_unmap_single(dev, map, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return DMA_MAPPING_ERROR; @@ -411,7 +410,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, done: if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - xen_dma_sync_for_device(dev, dev_addr, phys, size, dir); + xen_dma_sync_for_device(dev_addr, phys, size, dir); return dev_addr; } @@ -431,7 +430,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - xen_dma_sync_for_cpu(hwdev, dev_addr, paddr, size, dir); + xen_dma_sync_for_cpu(dev_addr, paddr, size, dir); /* NOTE: We use dev_addr here, not paddr! 
*/ if (is_xen_swiotlb_buffer(dev_addr)) @@ -445,7 +444,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, phys_addr_t paddr = xen_bus_to_phys(dma_addr); if (!dev_is_dma_coherent(dev)) - xen_dma_sync_for_cpu(dev, dma_addr, paddr, size, dir); + xen_dma_sync_for_cpu(dma_addr, paddr, size, dir); if (is_xen_swiotlb_buffer(dma_addr)) swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); @@ -461,7 +460,7 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - xen_dma_sync_for_device(dev, dma_addr, paddr, size, dir); + xen_dma_sync_for_device(dma_addr, paddr, size, dir); } /* diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7ce3cfd965d259ee72190f4c6b448ecc805e9893..d3b2acbfedd5620a34ce2d88b88d304ef99d903f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1701,7 +1701,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; - unsigned int regset0_size = regset_size(t->task, &view->regsets[0]); + int regset0_size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1710,8 +1710,10 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size, - &t->prstatus.pr_reg, NULL); + regset0_size = regset_get(t->task, &view->regsets[0], + sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); + if (regset0_size < 0) + return 0; fill_note(&t->notes[0], "CORE", NT_PRSTATUS, PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus); @@ -1726,32 +1728,28 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, */ for (i = 1; i < view->n; ++i) { const struct user_regset *regset = &view->regsets[i]; + int note_type = regset->core_note_type; + bool is_fpreg = note_type == NT_PRFPREG; + void *data; + int ret; + do_thread_regset_writeback(t->task, regset); - if (regset->core_note_type && regset->get && - (!regset->active || regset->active(t->task, regset) > 0)) { - int ret; - size_t size = regset_size(t->task, regset); - void *data = kzalloc(size, GFP_KERNEL); - if (unlikely(!data)) - return 0; - ret = regset->get(t->task, regset, - 0, size, data, NULL); - if (unlikely(ret)) - kfree(data); - else { - if (regset->core_note_type != NT_PRFPREG) - fill_note(&t->notes[i], "LINUX", - regset->core_note_type, - size, data); - else { - SET_PR_FPVALID(&t->prstatus, - 1, regset0_size); - fill_note(&t->notes[i], "CORE", - NT_PRFPREG, size, data); - } - *total += notesize(&t->notes[i]); - } - } + if (!note_type) // not for coredumps + continue; + if (regset->active && regset->active(t->task, regset) <= 0) + continue; + + ret = regset_get_alloc(t->task, regset, ~0U, &data); + if (ret < 0) + continue; + + if (is_fpreg) + SET_PR_FPVALID(&t->prstatus, 1, regset0_size); + + fill_note(&t->notes[i], is_fpreg ? 
"CORE" : "LINUX", + note_type, ret, data); + + *total += notesize(&t->notes[i]); } return 1; diff --git a/fs/eventfd.c b/fs/eventfd.c index 78e41c7c3d05bbf0d673019279320bbcaa037f1f..26b3d821e916863440e466d3b2ec4f3d527f2f8a 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -181,11 +181,14 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait) return events; } -static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { + lockdep_assert_held(&ctx->wqh.lock); + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; ctx->count -= *cnt; } +EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); /** * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. diff --git a/fs/fat/file.c b/fs/fat/file.c index 4614c0ba5f1c49d9b02a442c7e32e5b594fea620..bdc4503c00a3866c1d41d3e3eda3a7173458bae9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -172,15 +172,6 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -#ifdef CONFIG_COMPAT -static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) - -{ - return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && @@ -215,9 +206,7 @@ const struct file_operations fat_file_operations = { .mmap = generic_file_mmap, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = fat_generic_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index adc8e6b5ebab9e2d062622d6a808752469085044..8f1f0548f4f1132f37e735ca62185d44f7129106 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2257,12 +2257,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } if (cmd == FUSE_DEV_IOC_RECOVERY) { struct fuse_dev *fud = fuse_get_dev(file); - struct fuse_iqueue *fiq = &fud->fc->iq; - struct fuse_pqueue *fpq = &fud->pq; + struct fuse_iqueue *fiq = NULL; + struct fuse_pqueue *fpq = NULL; struct fuse_req *req, *next; LIST_HEAD(recovery); unsigned int i; + if (fud && fud->fc) { + fiq = &fud->fc->iq; + fpq = &fud->pq; + } else + return -ENOMEM; + spin_lock(&fpq->lock); for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) list_splice_tail_init(&fpq->processing[i], @@ -2278,7 +2284,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, list_splice(&recovery, &fiq->pending); spin_unlock(&fiq->lock); err = 0; - } + } return err; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 80867a1a94f268e19d419b2ab0274aa92c7bb947..f678d9f7181c6e4bf1a4bf5242f6f05f44ab8faf 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -762,10 +762,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. 
*/ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -788,24 +784,22 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, iomap); if (unlikely(status < 0)) break; - copied = status; cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made iomap_write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; length -= copied; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index cb5629bd5fffe6e733b5f88935d2f48dd0d75e1f..ae96a339d24d0dacad3f0f7c4fe72969e16f838e 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -103,3 +103,7 @@ config PROC_CHILDREN config PROC_PID_ARCH_STATUS def_bool n depends on PROC_FS + +config PROC_CPU_RESCTRL + def_bool n + depends on PROC_FS diff --git a/fs/proc/base.c b/fs/proc/base.c index 519f2d89ec9f8dde51643891ef18927fda124c3a..54480df1c66eb1d5561d4388cfe244d126ae6790 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -3060,6 +3061,9 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), +#endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), @@ -3461,6 +3465,9 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), +#endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), diff --git a/include/acpi/acconfig.h b/include/acpi/acconfig.h index 42d573172e5396babc64bc0663e31562befa1c22..a42a21c45d338cb40635461afd767a1f134cf521 100644 --- a/include/acpi/acconfig.h +++ b/include/acpi/acconfig.h @@ -188,6 +188,8 @@ #define ACPI_MAX_GSBUS_DATA_SIZE 255 #define ACPI_MAX_GSBUS_BUFFER_SIZE ACPI_SERIAL_HEADER_SIZE + ACPI_MAX_GSBUS_DATA_SIZE +#define ACPI_PRM_INPUT_BUFFER_SIZE 26 + /* _sx_d and _sx_w control methods */ #define ACPI_NUM_sx_d_METHODS 4 diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 22c039ebc6c55551097bee4fe65d10ef81d9f34c..c7e102e2fa417c2bd38a182611dcc0947afff629 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -514,7 +514,8 @@ enum acpi_dmar_type { ACPI_DMAR_TYPE_ROOT_ATS = 2, ACPI_DMAR_TYPE_HARDWARE_AFFINITY = 3, ACPI_DMAR_TYPE_NAMESPACE = 4, - ACPI_DMAR_TYPE_RESERVED = 5 /* 5 and greater are reserved */ + ACPI_DMAR_TYPE_SATC = 5, + ACPI_DMAR_TYPE_RESERVED = 6 /* 6 and greater are reserved */ }; /* DMAR Device Scope structure */ @@ -607,6 +608,14 @@ struct acpi_dmar_andd { char device_name[1]; }; +/* 5: SOC Integrated Address Translation Cache Reporting Structure */ + +struct acpi_dmar_satc { + struct acpi_dmar_header header; 
+ u8 flags; + u8 reserved; + u16 segment; +}; /******************************************************************************* * * DRTM - Dynamic Root of Trust for Measurement table diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index d068272f6e56cace81cbf191de8f48da5057412f..0470387f9b724b479f280f45ee53f344d2bf7cea 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -39,6 +39,7 @@ #define ACPI_SIG_PDTT "PDTT" /* Platform Debug Trigger Table */ #define ACPI_SIG_PMTT "PMTT" /* Platform Memory Topology Table */ #define ACPI_SIG_PPTT "PPTT" /* Processor Properties Topology Table */ +#define ACPI_SIG_PRMT "PRMT" /* Platform Runtime Mechanism Table */ #define ACPI_SIG_RASF "RASF" /* RAS Feature table */ #define ACPI_SIG_SBST "SBST" /* Smart Battery Specification Table */ #define ACPI_SIG_SDEI "SDEI" /* Software Delegated Exception Interface Table */ @@ -1538,6 +1539,48 @@ struct acpi_pptt_id { u16 spin_rev; }; +/******************************************************************************* + * + * PRMT - Platform Runtime Mechanism Table + * Version 1 + * + ******************************************************************************/ + +struct acpi_table_prmt { + struct acpi_table_header header; /* Common ACPI table header */ +}; + +struct acpi_table_prmt_header { + u8 platform_guid[16]; + u32 module_info_offset; + u32 module_info_count; +}; + +struct acpi_prmt_module_header { + u16 revision; + u16 length; +}; + +struct acpi_prmt_module_info { + u16 revision; + u16 length; + u8 module_guid[16]; + u16 major_rev; + u16 minor_rev; + u16 handler_info_count; + u32 handler_info_offset; + u64 mmio_list_pointer; +}; + +struct acpi_prmt_handler_info { + u16 revision; + u16 length; + u8 handler_guid[16]; + u64 handler_address; + u64 static_data_buffer_address; + u64 acpi_param_buffer_address; +}; + /******************************************************************************* * * RASF - RAS Feature Table (ACPI 5.0) diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 9373662cdb44f04fdcd83aafc5819e8ac92106e9..2b4276f73da2abd7b435081b779717fb7413ec03 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -815,8 +815,9 @@ typedef u8 acpi_adr_space_type; #define ACPI_ADR_SPACE_GPIO (acpi_adr_space_type) 8 #define ACPI_ADR_SPACE_GSBUS (acpi_adr_space_type) 9 #define ACPI_ADR_SPACE_PLATFORM_COMM (acpi_adr_space_type) 10 +#define ACPI_ADR_SPACE_PLATFORM_RT (acpi_adr_space_type) 11 -#define ACPI_NUM_PREDEFINED_REGIONS 11 +#define ACPI_NUM_PREDEFINED_REGIONS 12 /* * Special Address Spaces diff --git a/include/asm-generic/msi.h b/include/asm-generic/msi.h index e6795f088bdddf4fa1bbd322a6a4a39280a6f3b6..25344de0e8f9d85e1deee0f75cc55e3beecb7f30 100644 --- a/include/asm-generic/msi.h +++ b/include/asm-generic/msi.h @@ -4,6 +4,8 @@ #include +#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN + #ifndef NUM_MSI_ALLOC_SCRATCHPAD_REGS # define NUM_MSI_ALLOC_SCRATCHPAD_REGS 2 #endif @@ -30,4 +32,6 @@ typedef struct msi_alloc_info { #define GENERIC_MSI_DOMAIN_OPS 1 +#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ + #endif diff --git a/include/linux/acpi.h b/include/linux/acpi.h index ed68f45c09f739f3c2f7c98f30b7cc66b6fd9bc8..dd0f2e4b79a7868fe2532b0b667db9c5f9e98dab 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -132,6 +132,7 @@ enum acpi_address_range_id { union acpi_subtable_headers { struct acpi_subtable_header common; struct acpi_hmat_structure hmat; + struct acpi_prmt_module_header prmt; }; typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); @@ 
-529,6 +530,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 +#define OSC_SB_PRM_SUPPORT 0x00200000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; @@ -539,8 +541,9 @@ extern bool osc_pc_lpi_support_confirmed; #define OSC_PCI_CLOCK_PM_SUPPORT 0x00000004 #define OSC_PCI_SEGMENT_GROUPS_SUPPORT 0x00000008 #define OSC_PCI_MSI_SUPPORT 0x00000010 +#define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 -#define OSC_PCI_SUPPORT_MASKS 0x0000011f +#define OSC_PCI_SUPPORT_MASKS 0x0000019f /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 @@ -549,7 +552,8 @@ extern bool osc_pc_lpi_support_confirmed; #define OSC_PCI_EXPRESS_AER_CONTROL 0x00000008 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 -#define OSC_PCI_CONTROL_MASKS 0x0000003f +#define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 +#define OSC_PCI_CONTROL_MASKS 0x000000bf #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 8e7e2ec37f1b295f2615b92ea253f6f9f1f09025..08ec6bd2297fef2c3a558f227345733aa63ae22b 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -29,7 +29,8 @@ struct fwnode_handle *iort_find_domain_token(int trans_id); #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); u32 iort_msi_map_rid(struct device *dev, u32 req_id); -struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); +struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, + enum irq_domain_bus_token bus_token); void acpi_configure_pmsi_domain(struct device *dev); int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ @@ -40,8 +41,8 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); static inline void acpi_iort_init(void) { } static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) { return req_id; } -static inline struct irq_domain *iort_get_device_domain(struct device *dev, - u32 req_id) +static inline struct irq_domain *iort_get_device_domain( + struct device *dev, u32 id, enum irq_domain_bus_token bus_token) { return NULL; } static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ diff --git a/include/linux/aer.h b/include/linux/aer.h index fa19e01f418a603e4dad4787e9d36028387391cd..4d8299f152d7d7dddbc508385660edebd9162f26 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -44,8 +44,7 @@ struct aer_capability_regs { /* PCIe port driver needs this function to enable AER */ int pci_enable_pcie_error_reporting(struct pci_dev *dev); int pci_disable_pcie_error_reporting(struct pci_dev *dev); -int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); -int pci_cleanup_aer_error_status_regs(struct pci_dev *dev); +int pci_aer_clear_nonfatal_status(struct pci_dev *dev); void pci_save_aer_state(struct pci_dev *dev); void pci_restore_aer_state(struct pci_dev *dev); #else @@ -57,11 +56,7 @@ static inline int pci_disable_pcie_error_reporting(struct pci_dev *dev) { return -EINVAL; } -static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) -{ - return -EINVAL; -} -static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) 
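[Editor's note] The aer.h hunk that follows renames pci_cleanup_aer_uncorrect_error_status() to pci_aer_clear_nonfatal_status() and keeps the old name as a trivial static-inline alias so existing callers still build. Below is a minimal, stand-alone sketch of that rename-plus-alias pattern; widget_reset()/widget_reset_soft() are made-up names used only for illustration.

    #include <stdio.h>

    /* New, preferred name. */
    static int widget_reset_soft(int id)
    {
            printf("soft reset of widget %d\n", id);
            return 0;
    }

    /* Old name kept as a trivial inline alias so existing callers still build. */
    static inline int widget_reset(int id)
    {
            return widget_reset_soft(id);
    }

    int main(void)
    {
            return widget_reset(7);     /* old call site, routed to the new helper */
    }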
+static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { return -EINVAL; } @@ -74,5 +69,17 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity, int cper_severity_to_aer(int cper_severity); void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, int severity, struct aer_capability_regs *aer_regs); + +/* + * TK4: For compact reason, an alias for pci_cleanup_aer_uncorrect_error_status + * to pci_aer_clear_nonfatal_status is added here. Since many modules really + * expects an 5.4 kernel to have pci_cleanup_aer_uncorrect_error_status, this + * is useful as a compact layer. + */ +static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) +{ + return pci_aer_clear_nonfatal_status(dev); +} + #endif //_AER_H_ diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 21e950e4ab623ee3ca9ecce468ee31da60dd6971..450717299928bbace723d236c248c69f3c94929b 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -76,7 +76,7 @@ extern void amd_iommu_free_device(struct pci_dev *pdev); * * The function returns 0 on success or a negative value on error. */ -extern int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, +extern int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, struct task_struct *task); /** @@ -88,7 +88,7 @@ extern int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, * When this function returns the device is no longer using the PASID * and the PASID is no longer bound to its task. */ -extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid); +extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid); /** * amd_iommu_set_invalid_ppr_cb() - Register a call-back for failed @@ -114,7 +114,7 @@ extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid); #define AMD_IOMMU_INV_PRI_RSP_FAIL 2 typedef int (*amd_iommu_invalid_ppr_cb)(struct pci_dev *pdev, - int pasid, + u32 pasid, unsigned long address, u16); @@ -166,7 +166,7 @@ extern int amd_iommu_device_info(struct pci_dev *pdev, * @cb: The call-back function */ -typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid); +typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, u32 pasid); extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, amd_iommu_invalidate_ctx cb); diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 29fc933df3bf0eaf5dae01b458854d9cb410eef9..d291e0736f1e211d810bb660e84a97dd2b7871db 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -51,6 +51,12 @@ * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above + * bitmap_next_clear_region(map, &start, &end, nbits) Find next clear region + * bitmap_next_set_region(map, &start, &end, nbits) Find next set region + * bitmap_for_each_clear_region(map, rs, re, start, end) + * Iterate over all clear regions + * bitmap_for_each_set_region(map, rs, re, start, end) + * Iterate over all set regions * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) @@ -438,6 +444,41 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); } +static inline void bitmap_next_clear_region(unsigned long *bitmap, + unsigned int *rs, unsigned int *re, + unsigned int end) +{ + *rs = 
find_next_zero_bit(bitmap, end, *rs); + *re = find_next_bit(bitmap, end, *rs + 1); +} + +static inline void bitmap_next_set_region(unsigned long *bitmap, + unsigned int *rs, unsigned int *re, + unsigned int end) +{ + *rs = find_next_bit(bitmap, end, *rs); + *re = find_next_zero_bit(bitmap, end, *rs + 1); +} + +/* + * Bitmap region iterators. Iterates over the bitmap between [@start, @end). + * @rs and @re should be integer variables and will be set to start and end + * index of the current clear or set region. + */ +#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), \ + bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, \ + bitmap_next_clear_region((bitmap), &(rs), &(re), (end))) + +#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), \ + bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, \ + bitmap_next_set_region((bitmap), &(rs), &(re), (end))) + /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 46b92cd61d0c8252a354eeb4b2f84965e1513aaf..4f72b47973c304e2d90ca3720438efa1c22ea5c9 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -3,6 +3,7 @@ #define _LINUX_CACHEINFO_H #include +#include #include #include @@ -119,4 +120,24 @@ int acpi_find_last_cache_level(unsigned int cpu); const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); +/* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. + */ +static inline int get_cpu_cacheinfo_id(int cpu, int level) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) { + if (ci->info_list[i].attributes & CACHE_ID) + return ci->info_list[i].id; + return -1; + } + } + + return -1; +} + #endif /* _LINUX_CACHEINFO_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index acb77dcff3b41dbb5d265ee358b1a6958d809c11..cda75ecdcdcb75af737680cdfcd5555cf2816acd 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -57,6 +57,10 @@ SUBSYS(hugetlb) SUBSYS(pids) #endif +#if IS_ENABLED(CONFIG_CGROUP_IOASIDS) +SUBSYS(ioasids) +#endif + #if IS_ENABLED(CONFIG_CGROUP_RDMA) SUBSYS(rdma) #endif diff --git a/include/linux/compat.h b/include/linux/compat.h index c4c389c7e1b4221db8ed4b987ba34e5023425f6d..13f845a56f499cf0fc5d7b379900ce6c03403fd3 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -490,12 +490,13 @@ extern void __user *compat_alloc_user_space(unsigned long len); int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); -#define compat_save_altstack_ex(uss, sp) do { \ +#define unsafe_compat_save_altstack(uss, sp, label) do { \ compat_stack_t __user *__uss = uss; \ struct task_struct *t = current; \ - put_user_ex(ptr_to_compat((void __user *)t->sas_ss_sp), &__uss->ss_sp); \ - put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ - put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + unsafe_put_user(ptr_to_compat((void __user *)t->sas_ss_sp), \ + &__uss->ss_sp, label); \ + unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ + unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ if (t->sas_ss_flags & SS_AUTODISARM) \ sas_ss_reset(t); \ } while (0); diff --git 
a/include/linux/cpu.h b/include/linux/cpu.h index 4e9822cb11f3866dbb5317c2e7f2561140b6dc98..94cda8c3b5d1d5ad02aab7ad10bd4f58048a2b74 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -190,7 +190,12 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); -void play_idle(unsigned long duration_us); +void play_idle_precise(u64 duration_ns, u64 latency_ns); + +static inline void play_idle(unsigned long duration_us) +{ + play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX); +} #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2d55cee638fc6071b3e523a75ed569f7acec0a3f..96bf167ec28b55da49792c5a01193d866e2c435f 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -56,7 +56,7 @@ enum cpuhp_state { CPUHP_PAGE_ALLOC_DEAD, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, - CPUHP_IOMMU_INTEL_DEAD, + CPUHP_IOMMU_IOVA_DEAD, CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, @@ -158,6 +158,7 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_RAPL_ONLINE, CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, + CPUHP_AP_PERF_X86_IDXD_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4447967e7a9f1bc0270b8fb55d65ff21ba7ff5d7..48a1840a7e3deed8a169a9ee0df037740bb19565 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -29,12 +29,13 @@ struct cpuidle_driver; * CPUIDLE DEVICE INTERFACE * ****************************/ -#define CPUIDLE_STATE_DISABLED_BY_USER BIT(0) +#define CPUIDLE_STATE_DISABLED_BY_USER BIT(0) +#define CPUIDLE_STATE_DISABLED_BY_DRIVER BIT(1) struct cpuidle_state_usage { unsigned long long disable; unsigned long long usage; - unsigned long long time; /* in US */ + u64 time_ns; unsigned long long above; /* Number of times it's been too deep */ unsigned long long below; /* Number of times it's been too shallow */ #ifdef CONFIG_SUSPEND @@ -47,11 +48,12 @@ struct cpuidle_state { char name[CPUIDLE_NAME_LEN]; char desc[CPUIDLE_DESC_LEN]; + u64 exit_latency_ns; + u64 target_residency_ns; unsigned int flags; unsigned int exit_latency; /* in US */ int power_usage; /* in mW */ unsigned int target_residency; /* in US */ - bool disabled; /* disabled on all CPUs */ int (*enter) (struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -70,11 +72,13 @@ struct cpuidle_state { }; /* Idle State Flags */ -#define CPUIDLE_FLAG_NONE (0x00) -#define CPUIDLE_FLAG_POLLING BIT(0) /* polling state */ -#define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ -#define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ -#define CPUIDLE_FLAG_OFF BIT(4) /* disable this state by default */ +#define CPUIDLE_FLAG_NONE (0x00) +#define CPUIDLE_FLAG_POLLING BIT(0) /* polling state */ +#define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ +#define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ +#define CPUIDLE_FLAG_UNUSABLE BIT(3) /* avoid using this state */ +#define CPUIDLE_FLAG_OFF BIT(4) /* disable this state by default */ +#define CPUIDLE_FLAG_TLB_FLUSHED BIT(5) /* idle-state flushes TLBs */ struct cpuidle_device_kobj; struct cpuidle_state_kobj; @@ -83,14 +87,14 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; - unsigned int 
use_deepest_state:1; unsigned int poll_time_limit:1; unsigned int cpu; ktime_t next_hrtimer; int last_state_idx; - int last_residency; + u64 last_residency_ns; u64 poll_limit_ns; + u64 forced_idle_latency_limit_ns; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; struct cpuidle_driver_kobj *kobj_driver; @@ -113,7 +117,6 @@ DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev); struct cpuidle_driver { const char *name; struct module *owner; - int refcnt; /* used by the cpuidle framework to setup the broadcast timer */ unsigned int bctimer:1; @@ -145,8 +148,8 @@ extern u64 cpuidle_poll_time(struct cpuidle_driver *drv, extern int cpuidle_register_driver(struct cpuidle_driver *drv); extern struct cpuidle_driver *cpuidle_get_driver(void); -extern struct cpuidle_driver *cpuidle_driver_ref(void); -extern void cpuidle_driver_unref(void); +extern void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, + bool disable); extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); extern int cpuidle_register_device(struct cpuidle_device *dev); extern void cpuidle_unregister_device(struct cpuidle_device *dev); @@ -182,8 +185,8 @@ static inline u64 cpuidle_poll_time(struct cpuidle_driver *drv, static inline int cpuidle_register_driver(struct cpuidle_driver *drv) {return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; } -static inline struct cpuidle_driver *cpuidle_driver_ref(void) {return NULL; } -static inline void cpuidle_driver_unref(void) {} +static inline void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, + int idx, bool disable) { } static inline void cpuidle_unregister_driver(struct cpuidle_driver *drv) { } static inline int cpuidle_register_device(struct cpuidle_device *dev) {return -ENODEV; } @@ -207,18 +210,20 @@ static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + u64 latency_limit_ns); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev); -extern void cpuidle_use_deepest_state(bool enable); +extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) {return -ENODEV; } static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } -static inline void cpuidle_use_deepest_state(bool enable) +static inline void cpuidle_use_deepest_state(u64 latency_limit_ns) { } #endif @@ -261,13 +266,8 @@ struct cpuidle_governor { void (*reflect) (struct cpuidle_device *dev, int index); }; -#ifdef CONFIG_CPU_IDLE extern int cpuidle_register_governor(struct cpuidle_governor *gov); -extern int cpuidle_governor_latency_req(unsigned int cpu); -#else -static inline int cpuidle_register_governor(struct cpuidle_governor *gov) -{return 0;} -#endif +extern s64 cpuidle_governor_latency_req(unsigned int cpu); #define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, \ idx, \ diff --git a/include/linux/device.h b/include/linux/device.h index 5add0c2deb8795a8d02a0e669d417cda54052eff..937c71dcff40a2b69ee8e4877fe75c083d51405a 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -41,9 +41,8 @@ struct device_node; struct fwnode_handle; struct iommu_ops; struct 
iommu_group; -struct iommu_fwspec; struct dev_pin_info; -struct iommu_param; +struct dev_iommu; struct bus_attribute { struct attribute attr; @@ -996,6 +995,7 @@ struct device_dma_parameters { * sg limitations. */ unsigned int max_segment_size; + unsigned int min_align_mask; unsigned long segment_boundary_mask; }; @@ -1181,7 +1181,9 @@ struct dev_links_info { * along with subsystem-level and driver-level callbacks. * @pins: For device pin management. * See Documentation/driver-api/pinctl.rst for details. + * @msi_lock: Lock to protect MSI mask cache and mask register * @msi_list: Hosts MSI descriptors + * @msi_last_list: Pointer to list of last msi_desc entry * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. @@ -1189,8 +1191,8 @@ struct dev_links_info { * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. - * @bus_dma_mask: Mask of an upstream bridge or bus which imposes a smaller DMA - * limit than the device itself supports. + * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller + * DMA limit than the device itself supports. * @dma_pfn_offset: offset of DMA memory range relatively of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. @@ -1211,8 +1213,7 @@ struct dev_links_info { * gone away. This should be set by the allocator of the * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. - * @iommu_fwspec: IOMMU-specific properties supplied by firmware. - * @iommu_param: Per device generic IOMMU runtime data + * @iommu: Per device generic IOMMU runtime data * * @offline_disabled: If set, the device is permanently online. * @offline: Set after successful invocation of bus type's .offline(). @@ -1220,6 +1221,11 @@ struct dev_links_info { * device. * @dma_coherent: this particular device is dma coherent, even if the * architecture supports non-coherent devices. + * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the + * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), + * and optionall (if the coherent mask is large enough) also + * for dma allocations. This flag is managed by the dma ops + * instance from ->dma_supported. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information @@ -1263,7 +1269,11 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ + struct list_head *msi_last_list; + raw_spinlock_t msi_lock; struct list_head msi_list; + struct list_head dev_msi_list; + struct list_head *dev_msi_last_list; #endif const struct dma_map_ops *dma_ops; @@ -1273,7 +1283,7 @@ struct device { not all hardware supports 64 bit addresses for consistent allocations such descriptors. 
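[Editor's note] Among the struct device changes in this hunk, device_dma_parameters grows a min_align_mask field. My reading (an assumption, the patch itself does not spell it out) is that it tells bounce-buffering code which low address bits of the original buffer must be preserved in any substitute address. The stand-alone sketch below models that constraint; preserve_low_bits() is a hypothetical helper, not a kernel function.

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Return a substitute buffer address that keeps the low bits of the
     * original address, as a min_align_mask style constraint would require.
     */
    static uint64_t preserve_low_bits(uint64_t pool_base, uint64_t orig,
                                      uint64_t min_align_mask)
    {
            /* Align the pool base up, then re-apply the original low bits. */
            uint64_t base = (pool_base + min_align_mask) & ~min_align_mask;

            return base | (orig & min_align_mask);
    }

    int main(void)
    {
            uint64_t mask = 0xfff;      /* device wants the low 12 bits preserved */
            uint64_t addr = preserve_low_bits(0x10010, 0x2345678, mask);

            printf("substitute address: 0x%llx\n", (unsigned long long)addr);
            return 0;
    }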
*/ - u64 bus_dma_mask; /* upstream dma_mask constraint */ + u64 bus_dma_limit; /* upstream dma constraint */ unsigned long dma_pfn_offset; struct device_dma_parameters *dma_parms; @@ -1308,8 +1318,8 @@ struct device { void (*release)(struct device *dev); struct iommu_group *iommu_group; - struct iommu_fwspec *iommu_fwspec; - struct iommu_param *iommu_param; + struct dev_iommu *iommu; + u32 pasid; /* For in-kernel DMA w/ PASID */ bool offline_disabled:1; bool offline:1; @@ -1319,6 +1329,9 @@ struct device { defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_coherent:1; #endif +#ifdef CONFIG_DMA_OPS_BYPASS + bool dma_ops_bypass : 1; +#endif }; static inline struct device *kobj_to_dev(struct kobject *kobj) @@ -1562,6 +1575,8 @@ static inline void *dev_get_platdata(const struct device *dev) * Manual binding of a device to driver. See drivers/base/bus.c * for information on use. */ +int __must_check device_driver_attach(struct device_driver *drv, + struct device *dev); extern int __must_check device_bind_driver(struct device *dev); extern void device_release_driver(struct device *dev); extern int __must_check device_attach(struct device *dev); diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 6a18a97b76a87d49ad3dfc315ecfcbec1e14eb91..8af66e5132124f8b15848f2203988d8e030fb532 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -1,42 +1,48 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Internals of the DMA direct mapping implementation. Only for use by the + * DMA mapping code and IOMMU drivers. + */ #ifndef _LINUX_DMA_DIRECT_H #define _LINUX_DMA_DIRECT_H 1 #include +#include #include /* for min_low_pfn */ #include - -static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); +#include #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA #include +#ifndef phys_to_dma_unencrypted +#define phys_to_dma_unencrypted phys_to_dma +#endif #else -static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) +static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, + phys_addr_t paddr) { dma_addr_t dev_addr = (dma_addr_t)paddr; return dev_addr - ((dma_addr_t)dev->dma_pfn_offset << PAGE_SHIFT); } -static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dev_addr) +/* + * If memory encryption is supported, phys_to_dma will set the memory encryption + * bit in the DMA address, and dma_to_phys will clear it. + * phys_to_dma_unencrypted is for use on special unencrypted memory like swiotlb + * buffers. 
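[Editor's note] The comment being added in this dma-direct.h hunk says that phys_to_dma() sets the memory-encryption bit while dma_to_phys() clears it. A tiny user-space model of that set/clear round trip is sketched below; the bit position in SME_MASK is an arbitrary assumption for the example (the real C-bit position is discovered at boot), and sme_set()/sme_clr() only mimic __sme_set()/__sme_clr().

    #include <stdint.h>
    #include <stdio.h>

    #define SME_MASK (1ULL << 47)       /* assumed bit position, for this sketch only */

    static uint64_t sme_set(uint64_t addr) { return addr | SME_MASK; }
    static uint64_t sme_clr(uint64_t addr) { return addr & ~SME_MASK; }

    int main(void)
    {
            uint64_t paddr = 0x12345000ULL;
            uint64_t dma   = sme_set(paddr);    /* address handed to the device */

            printf("phys 0x%llx -> dma 0x%llx -> phys 0x%llx\n",
                   (unsigned long long)paddr,
                   (unsigned long long)dma,
                   (unsigned long long)sme_clr(dma));
            return 0;
    }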
+ */ +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - phys_addr_t paddr = (phys_addr_t)dev_addr; - - return paddr + ((phys_addr_t)dev->dma_pfn_offset << PAGE_SHIFT); + return __sme_set(phys_to_dma_unencrypted(dev, paddr)); } -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr) { - dma_addr_t end = addr + size - 1; - - if (!dev->dma_mask) - return false; + phys_addr_t paddr = (phys_addr_t)dev_addr + + ((phys_addr_t)dev->dma_pfn_offset << PAGE_SHIFT); - if (!IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && - min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) - return false; - - return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask); + return __sme_clr(paddr); } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ @@ -49,20 +55,16 @@ static inline bool force_dma_unencrypted(struct device *dev) } #endif /* CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED */ -/* - * If memory encryption is supported, phys_to_dma will set the memory encryption - * bit in the DMA address, and dma_to_phys will clear it. The raw __phys_to_dma - * and __dma_to_phys versions should only be used on non-encrypted memory for - * special occasions like DMA coherent buffers. - */ -static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, + bool is_ram) { - return __sme_set(__phys_to_dma(dev, paddr)); -} + dma_addr_t end = addr + size - 1; -static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) -{ - return __sme_clr(__dma_to_phys(dev, daddr)); + if (is_ram && !IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && + min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) + return false; + + return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_limit); } u64 dma_direct_get_required_mask(struct device *dev); @@ -70,12 +72,117 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); -void *dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); -void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs); -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page); +struct page *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); +void dma_direct_free_pages(struct device *dev, size_t size, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dir); +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_can_mmap(struct device *dev); +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs); +dma_addr_t 
dma_direct_map_resource(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir, unsigned long attrs); +size_t dma_direct_max_mapping_size(struct device *dev); + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_unmap_sg(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +static inline void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_sync_single_for_device(dev, paddr, size, dir); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(paddr, size, dir); +} + +static inline void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); + } + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_sync_single_for_cpu(dev, paddr, size, dir); +} + +static inline dma_addr_t dma_direct_map_page(struct device *dev, + struct page *page, unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dma_addr = phys_to_dma(dev, phys); + + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; + } + + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, size, dir); + return dma_addr; +} + +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = dma_to_phys(dev, addr); + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); +} #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 2112f21f73d82bd7ced1bbfb4b58c91b1cc5ab44..298b31e3a007b35f7186ba6b7914f1f8ea57858d 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ 
-18,8 +18,16 @@ int iommu_get_dma_cookie(struct iommu_domain *domain); int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); void iommu_put_dma_cookie(struct iommu_domain *domain); +/* + * For devices that can do DMA request with PASID, setup a system PASID. + * Address modes (IOVA, PA) are selected by the platform code. + */ +ioasid_t iommu_enable_pasid_dma(struct device *dev); +int iommu_disable_pasid_dma(struct device *dev); + /* Setup call for arch DMA mapping code */ -void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size); +void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); +int iommu_dma_init_fq(struct iommu_domain *domain); /* The DMA API isn't _quite_ the whole story, though... */ /* @@ -37,6 +45,11 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc, void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); +void iommu_dma_free_cpu_cached_iovas(unsigned int cpu, + struct iommu_domain *domain); + +extern bool iommu_dma_forcedac; + #else /* CONFIG_IOMMU_DMA */ struct iommu_domain; @@ -45,8 +58,13 @@ struct msi_msg; struct device; static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, - u64 size) + u64 dma_limit) +{ +} + +static inline int iommu_dma_init_fq(struct iommu_domain *domain) { + return -EINVAL; } static inline int iommu_get_dma_cookie(struct iommu_domain *domain) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index d0d8651ee149c083065872fd854e08254229f13d..177a3efe4dff57e63d969636c5d6e259b59daf2c 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -31,11 +31,6 @@ * buffered to improve performance. */ #define DMA_ATTR_WRITE_COMBINE (1UL << 2) -/* - * DMA_ATTR_NON_CONSISTENT: Lets the platform to choose to return either - * consistent or non-consistent memory as it sees fit. - */ -#define DMA_ATTR_NON_CONSISTENT (1UL << 3) /* * DMA_ATTR_NO_KERNEL_MAPPING: Lets the platform to avoid creating a kernel * virtual mapping for the allocated buffer. @@ -84,6 +79,11 @@ struct dma_map_ops { void (*free)(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs); + struct page *(*alloc_pages)(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, + gfp_t gfp); + void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, + dma_addr_t dma_handle, enum dma_data_direction dir); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); @@ -99,8 +99,9 @@ struct dma_map_ops { size_t size, enum dma_data_direction dir, unsigned long attrs); /* - * map_sg returns 0 on error and a value > 0 on success. - * It should never return a value < 0. + * map_sg should return a negative error code on error. See + * dma_map_sgtable() for a list of appropriate error codes + * and their meanings. */ int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, @@ -197,73 +198,6 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_DECLARE_COHERENT */ -static inline bool dma_is_direct(const struct dma_map_ops *ops) -{ - return likely(!ops); -} - -/* - * All the dma_direct_* declarations are here just for the indirect call bypass, - * and must not be used directly drivers! 
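A hedged sketch of how a driver might consume the iommu_enable_pasid_dma()/iommu_disable_pasid_dma() interface declared above; the example_* structure is hypothetical and the INVALID_IOASID failure convention is an assumption rather than something this hunk guarantees:

#include <linux/dma-iommu.h>
#include <linux/ioasid.h>

struct example_dev {
	struct device *dev;
	ioasid_t pasid;		/* system PASID used for in-kernel DMA */
};

static int example_setup_pasid_dma(struct example_dev *edev)
{
	ioasid_t pasid = iommu_enable_pasid_dma(edev->dev);

	if (pasid == INVALID_IOASID)	/* assumed error convention */
		return -ENODEV;

	edev->pasid = pasid;
	/* The driver would now program @pasid into its hardware work queues. */
	return 0;
}

static void example_teardown_pasid_dma(struct example_dev *edev)
{
	iommu_disable_pasid_dma(edev->dev);
}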
- */ -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, unsigned long attrs); -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs); - -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ - defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); -#else -static inline void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} -static inline void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ -} -#endif - -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ - defined(CONFIG_SWIOTLB) -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); -#else -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ -} -static inline void dma_direct_unmap_sg(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} -static inline void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ -} -#endif - -size_t dma_direct_max_mapping_size(struct device *dev); - #ifdef CONFIG_HAS_DMA #include @@ -280,164 +214,6 @@ static inline void set_dma_ops(struct device *dev, dev->dma_ops = dma_ops; } -static inline dma_addr_t dma_map_page_attrs(struct device *dev, - struct page *page, size_t offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); - else - addr = ops->map_page(dev, page, offset, size, dir, attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr); - - return addr; -} - -static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); - else if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); -} - -/* - * dma_maps_sg_attrs returns 0 on error and > 0 on success. 
- * It should never return a value < 0. - */ -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - int ents; - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); - else - ents = ops->map_sg(dev, sg, nents, dir, attrs); - BUG_ON(ents < 0); - debug_dma_map_sg(dev, sg, nents, ents, dir); - - return ents; -} - -static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_is_direct(ops)) - dma_direct_unmap_sg(dev, sg, nents, dir, attrs); - else if (ops->unmap_sg) - ops->unmap_sg(dev, sg, nents, dir, attrs); -} - -static inline dma_addr_t dma_map_resource(struct device *dev, - phys_addr_t phys_addr, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - /* Don't allow RAM to be mapped */ - if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) - return DMA_MAPPING_ERROR; - - if (dma_is_direct(ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - debug_dma_map_resource(dev, phys_addr, size, dir, addr); - return addr; -} - -static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (!dma_is_direct(ops) && ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); -} - -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - else if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr, size, dir); - debug_dma_sync_single_for_cpu(dev, addr, size, dir); -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_single_for_device(dev, addr, size, dir); - else if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr, size, dir); - debug_dma_sync_single_for_device(dev, addr, size, dir); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); - else if (ops->sync_sg_for_cpu) - ops->sync_sg_for_cpu(dev, sg, nelems, dir); - debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = 
get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_sg_for_device(dev, sg, nelems, dir); - else if (ops->sync_sg_for_device) - ops->sync_sg_for_device(dev, sg, nelems, dir); - debug_dma_sync_sg_for_device(dev, sg, nelems, dir); - -} static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -448,6 +224,30 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return 0; } +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs); +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs); +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir); +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir); +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir); +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, @@ -470,6 +270,7 @@ int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, @@ -482,8 +283,9 @@ static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) +static inline unsigned int dma_map_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) { return 0; } @@ -492,6 +294,11 @@ static inline void dma_unmap_sg_attrs(struct device *dev, unsigned long attrs) { } +static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + return -EOPNOTSUPP; +} static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -580,12 +387,21 @@ static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return 
false; +} static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; } #endif /* CONFIG_HAS_DMA */ +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir); + static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { @@ -618,6 +434,58 @@ static inline void dma_sync_single_range_for_device(struct device *dev, return dma_sync_single_for_device(dev, addr + offset, size, dir); } +/** + * dma_unmap_sgtable - Unmap the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the unmap operation + * + * Unmaps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After this function + * the ownership of the buffer is transferred back to the CPU domain. + */ +static inline void dma_unmap_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); +} + +/** + * dma_sync_sgtable_for_cpu - Synchronize the given buffer for CPU access + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * + * Performs the needed cache synchronization and moves the ownership of the + * buffer back to the CPU domain, so it is safe to perform any access to it + * by the CPU. Before doing any further DMA operations, one has to transfer + * the ownership of the buffer back to the DMA domain by calling the + * dma_sync_sgtable_for_device(). + */ +static inline void dma_sync_sgtable_for_cpu(struct device *dev, + struct sg_table *sgt, enum dma_data_direction dir) +{ + dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->orig_nents, dir); +} + +/** + * dma_sync_sgtable_for_device - Synchronize the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * + * Performs the needed cache synchronization and moves the ownership of the + * buffer back to the DMA domain, so it is safe to perform the DMA operation. + * Once finished, one has to call dma_sync_sgtable_for_cpu() or + * dma_unmap_sgtable(). 
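dma_need_sync() (declared above) lets a driver with long-lived streaming mappings discover, typically once at setup time, whether cache maintenance is needed for a mapping at all; an illustrative sketch with hypothetical example_* helpers:

#include <linux/dma-mapping.h>

static void example_give_buffer_to_device(struct device *dev, dma_addr_t addr,
					  size_t len)
{
	/* Hand the buffer back to the device for the next receive. */
	if (dma_need_sync(dev, addr))
		dma_sync_single_for_device(dev, addr, len, DMA_FROM_DEVICE);
}

static void example_take_buffer_from_device(struct device *dev, dma_addr_t addr,
					    size_t len)
{
	/* Claim the buffer for the CPU before reading the payload. */
	if (dma_need_sync(dev, addr))
		dma_sync_single_for_cpu(dev, addr, len, DMA_FROM_DEVICE);
}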
+ */ +static inline void dma_sync_sgtable_for_device(struct device *dev, + struct sg_table *sgt, enum dma_data_direction dir) +{ + dma_sync_sg_for_device(dev, sgt->sgl, sgt->orig_nents, dir); +} + #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) @@ -630,7 +498,10 @@ static inline void dma_sync_single_range_for_device(struct device *dev, extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); - +struct page *dma_common_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); +void dma_common_free_pages(struct device *dev, size_t size, struct page *vaddr, + dma_addr_t dma_handle, enum dma_data_direction dir); struct page **dma_common_find_pages(void *cpu_addr); void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller); @@ -639,9 +510,10 @@ void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller); void dma_common_free_remap(void *cpu_addr, size_t size); -bool dma_in_atomic_pool(void *start, size_t size); -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags); -bool dma_free_from_pool(void *start, size_t size); +struct page *dma_alloc_from_pool(struct device *dev, size_t size, + void **cpu_addr, gfp_t flags, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)); +bool dma_free_from_pool(struct device *dev, void *start, size_t size); int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, @@ -703,7 +575,7 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) */ static inline bool dma_addressing_limited(struct device *dev) { - return min_not_zero(dma_get_mask(dev), dev->bus_dma_mask) < + return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < dma_get_required_mask(dev); } @@ -757,6 +629,22 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) return -EIO; } +static inline unsigned int dma_get_min_align_mask(struct device *dev) +{ + if (dev->dma_parms) + return dev->dma_parms->min_align_mask; + return 0; +} + +static inline int dma_set_min_align_mask(struct device *dev, + unsigned int min_align_mask) +{ + if (WARN_ON_ONCE(!dev->dma_parms)) + return -EIO; + dev->dma_parms->min_align_mask = min_align_mask; + return 0; +} + static inline int dma_get_cache_alignment(void) { #ifdef ARCH_DMA_MINALIGN diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index dd3de6d88fc0814670f0f3396ec02e4be78da409..73252358824d390ddd21f72d533ebb395519c7b8 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -21,28 +21,10 @@ static inline bool dev_is_dma_coherent(struct device *dev) } #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */ -/* - * Check if an allocation needs to be marked uncached to be coherent. 
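An illustrative sketch of the ownership transfers documented above for the sg_table helpers, assuming a pre-built sg_table and a bidirectional streaming mapping (example_do_dma() is hypothetical):

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int example_do_dma(struct device *dev, struct sg_table *sgt)
{
	int ret;

	ret = dma_map_sgtable(dev, sgt, DMA_BIDIRECTIONAL, 0);
	if (ret)
		return ret;	/* negative errno under the new map_sg convention */

	/* ... program the device and wait for the transfer to finish ... */

	/* Hand the buffer back to the CPU before inspecting the data. */
	dma_sync_sgtable_for_cpu(dev, sgt, DMA_BIDIRECTIONAL);

	/* ... dma_sync_sgtable_for_device() would re-arm it for another pass ... */

	dma_unmap_sgtable(dev, sgt, DMA_BIDIRECTIONAL, 0);
	return 0;
}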
- */ -static __always_inline bool dma_alloc_need_uncached(struct device *dev, - unsigned long attrs) -{ - if (dev_is_dma_coherent(dev)) - return false; - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) - return false; - if (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && - (attrs & DMA_ATTR_NON_CONSISTENT)) - return false; - return true; -} - void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr); #ifdef CONFIG_MMU /* @@ -75,29 +57,29 @@ static inline void arch_dma_cache_sync(struct device *dev, void *vaddr, #endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir); +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir); #else -static inline void arch_sync_dma_for_device(struct device *dev, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { } #endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir); +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir); #else -static inline void arch_sync_dma_for_cpu(struct device *dev, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { } #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL -void arch_sync_dma_for_cpu_all(struct device *dev); +void arch_sync_dma_for_cpu_all(void); #else -static inline void arch_sync_dma_for_cpu_all(struct device *dev) +static inline void arch_sync_dma_for_cpu_all(void) { } #endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */ @@ -110,7 +92,7 @@ static inline void arch_dma_prep_coherent(struct page *page, size_t size) } #endif /* CONFIG_ARCH_HAS_DMA_PREP_COHERENT */ -void *uncached_kernel_address(void *addr); -void *cached_kernel_address(void *addr); +void *arch_dma_set_uncached(void *addr, size_t size); +void arch_dma_clear_uncached(void *addr, size_t size); #endif /* _LINUX_DMA_NONCOHERENT_H */ diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 8013562751a50a1a3914670621afea11da7c7863..6529dd7361065f64dbcdbaae7ced603c5e02078e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -49,6 +49,7 @@ enum dma_status { */ enum dma_transaction_type { DMA_MEMCPY, + DMA_MEMCPY_SG, DMA_XOR, DMA_PQ, DMA_XOR_VAL, @@ -162,7 +163,7 @@ struct dma_interleaved_template { * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of * this transaction * @DMA_CTRL_ACK - if clear, the descriptor cannot be reused until the client - * acknowledges receipt, i.e. has has a chance to establish any dependency + * acknowledges receipt, i.e. 
has a chance to establish any dependency * chains * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P @@ -219,6 +220,62 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * @bytes_transferred: byte counter */ +/** + * enum dma_desc_metadata_mode - per descriptor metadata mode types supported + * @DESC_METADATA_CLIENT - the metadata buffer is allocated/provided by the + * client driver and it is attached (via the dmaengine_desc_attach_metadata() + * helper) to the descriptor. + * + * Client drivers interested in using this mode can follow: + * - DMA_MEM_TO_DEV / DMA_MEM_TO_MEM: + * 1. prepare the descriptor (dmaengine_prep_*) + * construct the metadata in the client's buffer + * 2. use dmaengine_desc_attach_metadata() to attach the buffer to the + * descriptor + * 3. submit the transfer + * - DMA_DEV_TO_MEM: + * 1. prepare the descriptor (dmaengine_prep_*) + * 2. use dmaengine_desc_attach_metadata() to attach the buffer to the + * descriptor + * 3. submit the transfer + * 4. when the transfer is completed, the metadata should be available in the + * attached buffer + * + * @DESC_METADATA_ENGINE - the metadata buffer is allocated/managed by the DMA + * driver. The client driver can ask for the pointer, maximum size and the + * currently used size of the metadata and can directly update or read it. + * dmaengine_desc_get_metadata_ptr() and dmaengine_desc_set_metadata_len() are + * provided as helper functions. + * + * Note: the metadata area for the descriptor is no longer valid after the + * transfer has been completed (valid up to the point when the completion + * callback returns if used). + * + * Client drivers interested in using this mode can follow: + * - DMA_MEM_TO_DEV / DMA_MEM_TO_MEM: + * 1. prepare the descriptor (dmaengine_prep_*) + * 2. use dmaengine_desc_get_metadata_ptr() to get the pointer to the engine's + * metadata area + * 3. update the metadata at the pointer + * 4. use dmaengine_desc_set_metadata_len() to tell the DMA engine the amount + * of data the client has placed into the metadata buffer + * 5. submit the transfer + * - DMA_DEV_TO_MEM: + * 1. prepare the descriptor (dmaengine_prep_*) + * 2. submit the transfer + * 3. on transfer completion, use dmaengine_desc_get_metadata_ptr() to get the + * pointer to the engine's metadata area + * 4. read out the metadata from the pointer + * + * Note: the two modes are not compatible and clients must use only one mode for a + * descriptor.
+ */ +enum dma_desc_metadata_mode { + DESC_METADATA_NONE = 0, + DESC_METADATA_CLIENT = BIT(0), + DESC_METADATA_ENGINE = BIT(1), +}; + struct dma_chan_percpu { /* stats */ unsigned long memcpy_count; @@ -238,10 +295,14 @@ struct dma_router { /** * struct dma_chan - devices supply DMA channels, clients use them * @device: ptr to the dma device who supplies this channel, always !%NULL + * @slave: ptr to the device using this channel * @cookie: last cookie value returned to client * @completed_cookie: last completed cookie for this channel * @chan_id: channel ID for sysfs * @dev: class device for sysfs + * @name: backlink name for sysfs + * @dbg_client_name: slave name for debugfs in format: + * dev_name(requester's dev):channel name, for example: "2b00000.mcasp:tx" * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client_count: how many clients are using this channel @@ -252,12 +313,17 @@ struct dma_router { */ struct dma_chan { struct dma_device *device; + struct device *slave; dma_cookie_t cookie; dma_cookie_t completed_cookie; /* sysfs */ int chan_id; struct dma_chan_dev *dev; + const char *name; +#ifdef CONFIG_DEBUG_FS + char *dbg_client_name; +#endif struct list_head device_node; struct dma_chan_percpu __percpu *local; @@ -276,13 +342,11 @@ struct dma_chan { * @chan: driver channel device * @device: sysfs device * @dev_id: parent dma_device dev_id - * @idr_ref: reference count to gate release of dma_device dev_id */ struct dma_chan_dev { struct dma_chan *chan; struct device device; int dev_id; - atomic_t *idr_ref; }; /** @@ -402,7 +466,11 @@ enum dma_residue_granularity { * Since the enum dma_transfer_direction is not defined as bit flag for * each type, the dma controller should set BIT() and same * should be checked by controller as well + * @min_burst: min burst capability per-transfer * @max_burst: max burst capability per-transfer + * @max_sg_burst: max number of SG list entries executed in a single burst + * DMA transaction with no software intervention for reinitialization. + * Zero value means unlimited number of entries. * @cmd_pause: true, if pause is supported (i.e. for reading residue or * for resume later) * @cmd_resume: true, if resume is supported @@ -415,7 +483,9 @@ struct dma_slave_caps { u32 src_addr_widths; u32 dst_addr_widths; u32 directions; + u32 min_burst; u32 max_burst; + u32 max_sg_burst; bool cmd_pause; bool cmd_resume; bool cmd_terminate; @@ -475,6 +545,18 @@ struct dmaengine_unmap_data { dma_addr_t addr[0]; }; +struct dma_async_tx_descriptor; + +struct dma_descriptor_metadata_ops { + int (*attach)(struct dma_async_tx_descriptor *desc, void *data, + size_t len); + + void *(*get_ptr)(struct dma_async_tx_descriptor *desc, + size_t *payload_len, size_t *max_len); + int (*set_len)(struct dma_async_tx_descriptor *desc, + size_t payload_len); +}; + /** * struct dma_async_tx_descriptor - async transaction descriptor * ---dma generic offload fields--- @@ -488,6 +570,11 @@ struct dmaengine_unmap_data { * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete * @callback_param: general parameter to pass to the callback routine + * @desc_metadata_mode: core managed metadata mode to protect mixed use of + * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE.
Otherwise + * DESC_METADATA_NONE + * @metadata_ops: DMA driver provided metadata mode ops, need to be set by the + * DMA driver if metadata mode is supported with the descriptor * ---async_tx api specific fields--- * @next: at completion submit this descriptor * @parent: pointer to the next level up in the dependency chain @@ -504,6 +591,8 @@ struct dma_async_tx_descriptor { dma_async_tx_callback_result callback_result; void *callback_param; struct dmaengine_unmap_data *unmap; + enum dma_desc_metadata_mode desc_metadata_mode; + struct dma_descriptor_metadata_ops *metadata_ops; #ifdef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH struct dma_async_tx_descriptor *next; struct dma_async_tx_descriptor *parent; @@ -539,10 +628,11 @@ static inline void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap) static inline void dma_descriptor_unmap(struct dma_async_tx_descriptor *tx) { - if (tx->unmap) { - dmaengine_unmap_put(tx->unmap); - tx->unmap = NULL; - } + if (!tx->unmap) + return; + + dmaengine_unmap_put(tx->unmap); + tx->unmap = NULL; } #ifndef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH @@ -611,11 +701,13 @@ static inline struct dma_async_tx_descriptor *txd_next(struct dma_async_tx_descr * @residue: the remaining number of bytes left to transmit * on the selected transfer for states DMA_IN_PROGRESS and * DMA_PAUSED if this is implemented in the driver, else 0 + * @in_flight_bytes: amount of data in bytes cached by the DMA. */ struct dma_tx_state { dma_cookie_t last; dma_cookie_t used; u32 residue; + u32 in_flight_bytes; }; /** @@ -666,6 +758,7 @@ struct dma_filter { * @global_node: list_head for global dma_device_list * @filter: information for device/slave to filter function/param mapping * @cap_mask: one or more dma_capability flags + * @desc_metadata_modes: supported metadata modes by the DMA device * @max_xor: maximum number of xor sources, 0 if no capability * @max_pq: maximum number of PQ sources and PQ-continue capability * @copy_align: alignment shift for memcpy operations @@ -683,13 +776,18 @@ struct dma_filter { * Since the enum dma_transfer_direction is not defined as bit flag for * each type, the dma controller should set BIT() and same * should be checked by controller as well + * @min_burst: min burst capability per-transfer * @max_burst: max burst capability per-transfer + * @max_sg_burst: max number of SG list entries executed in a single burst + * DMA transaction with no software intervention for reinitialization. + * Zero value means unlimited number of entries. * @residue_granularity: granularity of the transfer residue reported * by tx_status * @device_alloc_chan_resources: allocate resources and return the * number of allocated descriptors * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation + * @device_prep_dma_memcpy_sg: prepares a memcpy_sg operation * @device_prep_dma_xor: prepares a xor operation * @device_prep_dma_xor_val: prepares a xor validation operation * @device_prep_dma_pq: prepares a pq operation @@ -703,6 +801,8 @@ struct dma_filter { * be called after period_len bytes have been transferred. * @device_prep_interleaved_dma: Transfer expression in a generic way. * @device_prep_dma_imm_data: DMA's 8 byte immediate data to the dst address + * @device_caps: May be used to override the generic DMA slave capabilities + * with per-channel specific ones * @device_config: Pushes a new configuration to a channel, return 0 or an error * code * @device_pause: Pauses any transfer happening on a channel.
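For illustration only, a controller driver could use the new device_caps hook to advertise the per-channel burst limits documented above; the callback name and values below are made up:

#include <linux/dmaengine.h>

static void example_dma_device_caps(struct dma_chan *chan,
				    struct dma_slave_caps *caps)
{
	caps->min_burst = 1;
	caps->max_burst = 16;
	caps->max_sg_burst = 0;		/* 0 == unlimited SG entries per burst */
}

/* wired up at probe time together with the other dma_device callbacks:
 *	ddev->device_caps = example_dma_device_caps;
 */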
Returns @@ -719,15 +819,23 @@ struct dma_filter { * will just return a simple status code * @device_issue_pending: push pending transactions to hardware * @descriptor_reuse: a submitted transfer can be resubmitted after completion + * @device_release: called sometime after dma_async_device_unregister() is + * called and there are no further references to this structure. This + * must be implemented to free resources; however, many existing drivers + * do not and are therefore not safe to unbind while in use. + * @dbg_summary_show: optional routine to show contents in debugfs; default code + * will be used when this is omitted, but custom code can show extra, + * controller specific information. */ struct dma_device { - + struct kref ref; unsigned int chancnt; unsigned int privatecnt; struct list_head channels; struct list_head global_node; struct dma_filter filter; dma_cap_mask_t cap_mask; + enum dma_desc_metadata_mode desc_metadata_modes; unsigned short max_xor; unsigned short max_pq; enum dmaengine_alignment copy_align; @@ -739,11 +847,15 @@ struct dma_device { int dev_id; struct device *dev; struct module *owner; + struct ida chan_ida; + struct mutex chan_mutex; /* to protect chan_ida */ u32 src_addr_widths; u32 dst_addr_widths; u32 directions; + u32 min_burst; u32 max_burst; + u32 max_sg_burst; bool descriptor_reuse; enum dma_residue_granularity residue_granularity; @@ -753,6 +865,11 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( struct dma_chan *chan, dma_addr_t dst, dma_addr_t src, size_t len, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_memcpy_sg)( + struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_xor)( struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src, unsigned int src_cnt, size_t len, unsigned long flags); @@ -791,6 +908,8 @@ struct dma_device { struct dma_chan *chan, dma_addr_t dst, u64 data, unsigned long flags); + void (*device_caps)(struct dma_chan *chan, + struct dma_slave_caps *caps); int (*device_config)(struct dma_chan *chan, struct dma_slave_config *config); int (*device_pause)(struct dma_chan *chan); @@ -802,6 +921,12 @@ struct dma_device { dma_cookie_t cookie, struct dma_tx_state *txstate); void (*device_issue_pending)(struct dma_chan *chan); + void (*device_release)(struct dma_device *dev); + /* debugfs support */ +#ifdef CONFIG_DEBUG_FS + void (*dbg_summary_show)(struct seq_file *s, struct dma_device *dev); + struct dentry *dbg_dev_root; +#endif }; static inline int dmaengine_slave_config(struct dma_chan *chan, @@ -904,6 +1029,55 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy( len, flags); } +static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy_sg( + struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags) +{ + if (!chan || !chan->device || !chan->device->device_prep_dma_memcpy_sg) + return NULL; + + return chan->device->device_prep_dma_memcpy_sg(chan, dst_sg, dst_nents, + src_sg, src_nents, + flags); +} + +static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan, + enum dma_desc_metadata_mode mode) +{ + if (!chan) + return false; + + return !!(chan->device->desc_metadata_modes & mode); +} + +#ifdef CONFIG_DMA_ENGINE +int dmaengine_desc_attach_metadata(struct dma_async_tx_descriptor
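A sketch of the DESC_METADATA_CLIENT flow from the enum documentation above, using the metadata helpers declared here; the example function and its metadata buffer are hypothetical:

#include <linux/dmaengine.h>

static int example_memcpy_with_metadata(struct dma_chan *chan,
					dma_addr_t dst, dma_addr_t src,
					size_t len, void *md, size_t md_len)
{
	struct dma_async_tx_descriptor *desc;
	dma_cookie_t cookie;
	int ret;

	if (!dmaengine_is_metadata_mode_supported(chan, DESC_METADATA_CLIENT))
		return -EOPNOTSUPP;

	desc = dmaengine_prep_dma_memcpy(chan, dst, src, len, DMA_PREP_INTERRUPT);
	if (!desc)
		return -ENOMEM;

	/* Client mode: the metadata stays in the caller's buffer @md. */
	ret = dmaengine_desc_attach_metadata(desc, md, md_len);
	if (ret)
		return ret;

	cookie = dmaengine_submit(desc);
	if (dma_submit_error(cookie))
		return -EIO;

	dma_async_issue_pending(chan);
	return 0;
}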
*desc, + void *data, size_t len); +void *dmaengine_desc_get_metadata_ptr(struct dma_async_tx_descriptor *desc, + size_t *payload_len, size_t *max_len); +int dmaengine_desc_set_metadata_len(struct dma_async_tx_descriptor *desc, + size_t payload_len); +#else /* CONFIG_DMA_ENGINE */ +static inline int dmaengine_desc_attach_metadata( + struct dma_async_tx_descriptor *desc, void *data, size_t len) +{ + return -EINVAL; +} +static inline void *dmaengine_desc_get_metadata_ptr( + struct dma_async_tx_descriptor *desc, size_t *payload_len, + size_t *max_len) +{ + return NULL; +} +static inline int dmaengine_desc_set_metadata_len( + struct dma_async_tx_descriptor *desc, size_t payload_len) +{ + return -EINVAL; +} +#endif /* CONFIG_DMA_ENGINE */ + /** * dmaengine_terminate_all() - Terminate all active DMA transfers * @chan: The channel for which to terminate the transfers @@ -1291,11 +1465,12 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, static inline void dma_set_tx_state(struct dma_tx_state *st, dma_cookie_t last, dma_cookie_t used, u32 residue) { - if (st) { - st->last = last; - st->used = used; - st->residue = residue; - } + if (!st) + return; + + st->last = last; + st->used = used; + st->residue = residue; } #ifdef CONFIG_DMA_ENGINE @@ -1306,7 +1481,6 @@ void dma_issue_pending_all(void); struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param, struct device_node *np); -struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name); struct dma_chan *dma_request_chan(struct device *dev, const char *name); struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask); @@ -1336,11 +1510,6 @@ static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, { return NULL; } -static inline struct dma_chan *dma_request_slave_channel(struct device *dev, - const char *name) -{ - return NULL; -} static inline struct dma_chan *dma_request_chan(struct device *dev, const char *name) { @@ -1372,12 +1541,11 @@ static inline int dmaengine_desc_set_reuse(struct dma_async_tx_descriptor *tx) if (ret) return ret; - if (caps.descriptor_reuse) { - tx->flags |= DMA_CTRL_REUSE; - return 0; - } else { + if (!caps.descriptor_reuse) return -EPERM; - } + + tx->flags |= DMA_CTRL_REUSE; + return 0; } static inline void dmaengine_desc_clear_reuse(struct dma_async_tx_descriptor *tx) @@ -1393,10 +1561,10 @@ static inline bool dmaengine_desc_test_reuse(struct dma_async_tx_descriptor *tx) static inline int dmaengine_desc_free(struct dma_async_tx_descriptor *desc) { /* this is supported for reusable desc, so check that */ - if (dmaengine_desc_test_reuse(desc)) - return desc->desc_free(desc); - else + if (!dmaengine_desc_test_reuse(desc)) return -EPERM; + + return desc->desc_free(desc); } /* --- DMA device --- */ @@ -1404,14 +1572,25 @@ static inline int dmaengine_desc_free(struct dma_async_tx_descriptor *desc) int dma_async_device_register(struct dma_device *device); int dmaenginem_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); +int dma_async_device_channel_register(struct dma_device *device, + struct dma_chan *chan); +void dma_async_device_channel_unregister(struct dma_device *device, + struct dma_chan *chan); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); -struct dma_chan *dma_get_slave_channel(struct dma_chan *chan); -struct dma_chan *dma_get_any_slave_channel(struct dma_device *device); #define dma_request_channel(mask, x, y) \ 
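Since dma_request_slave_channel() is reduced below to a deprecated wrapper, new code is expected to call dma_request_chan() and handle the error pointer itself (for example to propagate -EPROBE_DEFER); a hypothetical sketch:

#include <linux/dmaengine.h>

static struct dma_chan *example_get_tx_chan(struct device *dev)
{
	struct dma_chan *chan = dma_request_chan(dev, "tx");

	if (IS_ERR(chan))
		return NULL;	/* or propagate PTR_ERR(chan), e.g. -EPROBE_DEFER */
	return chan;
}

/* ... and on teardown: dma_release_channel(chan); */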
__dma_request_channel(&(mask), x, y, NULL) #define dma_request_slave_channel_compat(mask, x, y, dev, name) \ __dma_request_slave_channel_compat(&(mask), x, y, dev, name) +/* Deprecated, please use dma_request_chan() directly */ +static inline struct dma_chan * __deprecated +dma_request_slave_channel(struct device *dev, const char *name) +{ + struct dma_chan *ch = dma_request_chan(dev, name); + + return IS_ERR(ch) ? NULL : ch; +} + static inline struct dma_chan *__dma_request_slave_channel_compat(const dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param, diff --git a/include/linux/dmar.h b/include/linux/dmar.h index f397e52c2d9dedc96c9864a9d5940013aa719e0b..18b57996da621efda911083753a04c067f8ea94b 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -48,6 +48,7 @@ struct dmar_drhd_unit { u16 segment; /* PCI domain */ u8 ignored:1; /* ignore drhd */ u8 include_all:1; + u8 gfx_dedicated:1; /* graphic dedicated */ struct intel_iommu *iommu; }; @@ -130,22 +131,34 @@ static inline int dmar_res_noop(struct acpi_dmar_header *hdr, void *arg) return 0; } +#ifdef CONFIG_DMAR_DEBUG +void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid); +#else +static inline void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid) {} +#endif + #ifdef CONFIG_INTEL_IOMMU extern int iommu_detected, no_iommu; extern int intel_iommu_init(void); +extern void intel_iommu_shutdown(void); extern int dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg); extern int dmar_parse_one_atsr(struct acpi_dmar_header *header, void *arg); extern int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg); +extern int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg); extern int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg); extern int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert); extern int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info); #else /* !CONFIG_INTEL_IOMMU: */ static inline int intel_iommu_init(void) { return -ENODEV; } +static inline void intel_iommu_shutdown(void) { } #define dmar_parse_one_rmrr dmar_res_noop #define dmar_parse_one_atsr dmar_res_noop #define dmar_check_one_atsr dmar_res_noop #define dmar_release_one_atsr dmar_res_noop +#define dmar_parse_one_satc dmar_res_noop static inline int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) { @@ -275,7 +288,7 @@ extern void dmar_msi_unmask(struct irq_data *data); extern void dmar_msi_mask(struct irq_data *data); extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); -extern int dmar_set_interrupt(struct intel_iommu *iommu); +extern int dmar_set_interrupt(struct intel_iommu *iommu, bool queue_fault); extern irqreturn_t dmar_fault(int irq, void *dev_id); extern int dmar_alloc_hwirq(int id, int node, void *arg); extern void dmar_free_hwirq(int irq); diff --git a/include/linux/edac.h b/include/linux/edac.h index c19483b900794cd8a478f265a6873bea054ac712..47c1f57b31d933567f3a264ae805fb1208c692e0 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -187,7 +187,9 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_RDDR4: Registered DDR4 RAM * This is a variant of the DDR4 memories. * @MEM_LRDDR4: Load-Reduced DDR4 memory. + * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM + * @MEM_HBM2: High bandwidth Memory Gen 2. 
*/ enum mem_type { MEM_EMPTY = 0, @@ -211,7 +213,9 @@ enum mem_type { MEM_DDR4, MEM_RDDR4, MEM_LRDDR4, + MEM_DDR5, MEM_NVDIMM, + MEM_HBM2, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -234,7 +238,9 @@ enum mem_type { #define MEM_FLAG_DDR4 BIT(MEM_DDR4) #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_HBM2 BIT(MEM_HBM2) /** * enum edac-type - Error Detection and Correction capabilities and mode diff --git a/include/linux/efi.h b/include/linux/efi.h index c82ef0eba4f84aa1f0c7bd65efdb954c03cc52b8..96090fb1df0c85287dcaf6b2508efe6853dc6b6e 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -114,6 +114,7 @@ typedef struct { #define EFI_MEMORY_MORE_RELIABLE \ ((u64)0x0000000000010000ULL) /* higher reliability */ #define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ +#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 @@ -146,6 +147,56 @@ struct efi_boot_memmap { unsigned long *buff_size; }; +#pragma pack(1) + +/* EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER */ +typedef struct { + u32 ver; + u16 emb_drv_cnt; + u16 payload_cnt; + /* + * Variable array indicated by number of + * (emb_drv_cnt + payload_cnt) + */ + u64 offset_list[]; +} efi_manage_capsule_header_t; + +/* EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER */ +typedef struct { + u32 ver; + guid_t image_type_id; + u8 image_index; + u8 reserved_bytes[3]; + u32 image_size; + u32 vendor_code_size; + /* ver = 2. */ + u64 hw_ins; + /* ver = v3. */ + u64 capsule_support; +} efi_manage_capsule_image_header_t; + +#pragma pack() + +/* WIN_CERTIFICATE */ +typedef struct { + u32 len; + u16 rev; + u16 cert_type; +} win_cert_t; + +/* WIN_CERTIFICATE_UEFI_GUID */ +typedef struct { + win_cert_t hdr; + guid_t cert_type; + u8 cert_data[]; +} win_cert_uefi_guid_t; + +/* EFI_FIRMWARE_IMAGE_AUTHENTICATIO */ +typedef struct { + u64 mon_count; + win_cert_uefi_guid_t auth_info; +} efi_image_auth_t; + /* * EFI capsule flags */ @@ -1050,7 +1101,6 @@ static inline void efi_enter_virtual_mode (void) {} extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, bool nonblocking); -extern void efi_find_mirror(void); #else static inline efi_status_t efi_query_variable_store(u32 attributes, @@ -1208,6 +1258,7 @@ extern int __init efi_setup_pcdp_console(char *); #define EFI_DBG 8 /* Print additional debug info at runtime */ #define EFI_NX_PE_DATA 9 /* Can runtime data regions be mapped non-executable? */ #define EFI_MEM_ATTR 10 /* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */ +#define EFI_MEM_NO_SOFT_RESERVE 11 /* Is the kernel configured to ignore soft reservations? 
*/ #ifdef CONFIG_EFI /* @@ -1218,6 +1269,14 @@ static inline bool efi_enabled(int feature) return test_bit(feature, &efi.flags) != 0; } extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); + +bool __pure __efi_soft_reserve_enabled(void); + +static inline bool __pure efi_soft_reserve_enabled(void) +{ + return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) + && __efi_soft_reserve_enabled(); +} #else static inline bool efi_enabled(int feature) { @@ -1231,6 +1290,11 @@ efi_capsule_pending(int *reset_type) { return false; } + +static inline bool efi_soft_reserve_enabled(void) +{ + return false; +} #endif extern int efi_status_to_err(efi_status_t status); diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index dc4fd8a6644dd62da4837ebc7e9d20a24716e59a..fa0a524baed0ad5ef0482cf7e70596aee217a2b9 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -41,6 +41,7 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); +void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); DECLARE_PER_CPU(int, eventfd_wake_count); @@ -82,6 +83,11 @@ static inline bool eventfd_signal_count(void) return false; } +static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + +} + #endif #endif /* _LINUX_EVENTFD_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 7c75019718e4b78e87be97866d70ac0b3f71376c..ee401930f39704390e99f2f807e4b9cebed39898 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -482,10 +482,11 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) #define BPF_CALL_x(x, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ - return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index ababd6bc82f33cc75e5733c92ca6005f30a9bc16..a5673c4674cfa03c81823dce6368d8096e48bb43 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -56,6 +56,8 @@ struct fwnode_reference_args { * otherwise. * @property_read_string_array: Read an array of string properties. Return zero * on success, a negative error code otherwise. + * @get_name: Return the name of an fwnode. + * @get_name_prefix: Get a prefix for a node (for printing purposes). * @get_parent: Return the parent of an fwnode. * @get_next_child_node: Return the next child node in an iteration. * @get_named_child_node: Return a child node with a given name. 
@@ -82,6 +84,8 @@ struct fwnode_operations { (*property_read_string_array)(const struct fwnode_handle *fwnode_handle, const char *propname, const char **val, size_t nval); + const char *(*get_name)(const struct fwnode_handle *fwnode); + const char *(*get_name_prefix)(const struct fwnode_handle *fwnode); struct fwnode_handle *(*get_parent)(const struct fwnode_handle *fwnode); struct fwnode_handle * (*get_next_child_node)(const struct fwnode_handle *fwnode, diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 4bd583bd6934ea87e0b178b07f69a5c480d1fa48..5b14a0f381241de51ac575d483847af8fb8b5b3b 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -206,7 +206,7 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid, const char *name); extern struct gen_pool *gen_pool_get(struct device *dev, const char *name); -bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, +extern bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start, size_t size); #ifdef CONFIG_OF diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 88ac8edf44e31fbc7994d36cd498f1c1a4a675f4..8af9abb73b6c80d8dcc17e2d301271f1bd3c4275 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -30,14 +32,24 @@ #define VTD_PAGE_SIZE (1UL << VTD_PAGE_SHIFT) #define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) #define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) +#define VTD_ATTR_MASK (0xfff) #define VTD_STRIDE_SHIFT (9) #define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) -#define DMA_PTE_READ (1) -#define DMA_PTE_WRITE (2) -#define DMA_PTE_LARGE_PAGE (1 << 7) -#define DMA_PTE_SNP (1 << 11) +#define DMA_PTE_READ BIT_ULL(0) +#define DMA_PTE_WRITE BIT_ULL(1) +#define DMA_PTE_LARGE_PAGE BIT_ULL(7) +#define DMA_PTE_SNP BIT_ULL(11) + +#define DMA_FL_PTE_PRESENT BIT_ULL(0) +#define DMA_FL_PTE_US BIT_ULL(2) +#define DMA_FL_PTE_ACCESS BIT_ULL(5) +#define DMA_FL_PTE_DIRTY BIT_ULL(6) +#define DMA_FL_PTE_XD BIT_ULL(63) + +#define ADDR_WIDTH_5LEVEL (57) +#define ADDR_WIDTH_4LEVEL (48) #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 @@ -70,6 +82,7 @@ #define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ #define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ +#define DMAR_IQER_REG 0xb0 /* Invalidation queue error record register */ #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ #define DMAR_PQH_REG 0xc0 /* Page request queue head register */ #define DMAR_PQT_REG 0xc8 /* Page request queue tail register */ @@ -116,6 +129,10 @@ #define DMAR_VCMD_REG 0xe10 /* Virtual command register */ #define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */ +#define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) +#define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) +#define DMAR_IQER_REG_ICESID(reg) FIELD_GET(GENMASK_ULL(63, 48), reg) + #define OFFSET_STRIDE (9) #define dmar_readq(a) readq(a) @@ -160,33 +177,40 @@ * Extended Capability Register */ +#define ecap_rps(e) (((e) >> 49) & 0x1) #define ecap_smpwc(e) (((e) >> 48) & 0x1) #define ecap_flts(e) (((e) >> 47) & 0x1) #define ecap_slts(e) (((e) >> 46) & 0x1) +#define ecap_slads(e) (((e) >> 45) & 0x1) +#define ecap_vcs(e) (((e) >> 44) & 0x1) #define ecap_smts(e) (((e) >> 43) & 0x1) -#define ecap_dit(e) ((e 
>> 41) & 0x1) -#define ecap_pasid(e) ((e >> 40) & 0x1) -#define ecap_pss(e) ((e >> 35) & 0x1f) -#define ecap_eafs(e) ((e >> 34) & 0x1) -#define ecap_nwfs(e) ((e >> 33) & 0x1) -#define ecap_srs(e) ((e >> 31) & 0x1) -#define ecap_ers(e) ((e >> 30) & 0x1) -#define ecap_prs(e) ((e >> 29) & 0x1) -#define ecap_broken_pasid(e) ((e >> 28) & 0x1) -#define ecap_dis(e) ((e >> 27) & 0x1) -#define ecap_nest(e) ((e >> 26) & 0x1) -#define ecap_mts(e) ((e >> 25) & 0x1) -#define ecap_ecs(e) ((e >> 24) & 0x1) +#define ecap_dit(e) (((e) >> 41) & 0x1) +#define ecap_pds(e) (((e) >> 42) & 0x1) +#define ecap_pasid(e) (((e) >> 40) & 0x1) +#define ecap_pss(e) (((e) >> 35) & 0x1f) +#define ecap_eafs(e) (((e) >> 34) & 0x1) +#define ecap_nwfs(e) (((e) >> 33) & 0x1) +#define ecap_srs(e) (((e) >> 31) & 0x1) +#define ecap_ers(e) (((e) >> 30) & 0x1) +#define ecap_prs(e) (((e) >> 29) & 0x1) +#define ecap_broken_pasid(e) (((e) >> 28) & 0x1) +#define ecap_dis(e) (((e) >> 27) & 0x1) +#define ecap_nest(e) (((e) >> 26) & 0x1) +#define ecap_mts(e) (((e) >> 25) & 0x1) +#define ecap_ecs(e) (((e) >> 24) & 0x1) #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) #define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16) #define ecap_coherent(e) ((e) & 0x1) #define ecap_qis(e) ((e) & 0x2) -#define ecap_pass_through(e) ((e >> 6) & 0x1) -#define ecap_eim_support(e) ((e >> 4) & 0x1) -#define ecap_ir_support(e) ((e >> 3) & 0x1) +#define ecap_pass_through(e) (((e) >> 6) & 0x1) +#define ecap_eim_support(e) (((e) >> 4) & 0x1) +#define ecap_ir_support(e) (((e) >> 3) & 0x1) #define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) -#define ecap_max_handle_mask(e) ((e >> 20) & 0xf) -#define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ +#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf) +#define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */ + +/* Virtual command interface capability */ +#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 @@ -281,6 +305,9 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_PRS_PRO ((u32)2) + +#define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ do { \ @@ -321,6 +348,8 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_FENCE (((u64)1) << 6) +#define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) #define QI_IOTLB_DR(dr) (((u64)dr) << 7) @@ -328,7 +357,7 @@ enum { #define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) #define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) #define QI_IOTLB_IH(ih) (((u64)ih) << 6) -#define QI_IOTLB_AM(am) (((u8)am)) +#define QI_IOTLB_AM(am) (((u8)am) & 0x3f) #define QI_CC_FM(fm) (((u64)fm) << 48) #define QI_CC_SID(sid) (((u64)sid) << 32) @@ -347,19 +376,24 @@ enum { #define QI_PC_DID(did) (((u64)did) << 16) #define QI_PC_GRAN(gran) (((u64)gran) << 4) -#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0)) -#define QI_PC_PASID_SEL (QI_PC_TYPE | QI_PC_GRAN(1)) +/* PASID cache invalidation granu */ +#define QI_PC_ALL_PASIDS 0 +#define QI_PC_PASID_SEL 1 +#define QI_PC_GLOBAL 3 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_IH(ih) (((u64)ih) << 6) -#define QI_EIOTLB_AM(am) (((u64)am)) +#define QI_EIOTLB_AM(am) (((u64)am) & 0x3f) #define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) #define QI_EIOTLB_DID(did) (((u64)did) << 16) #define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) +/* QI Dev-IOTLB inv granu */ 
+#define QI_DEV_IOTLB_GRAN_ALL 1 +#define QI_DEV_IOTLB_GRAN_PASID_SEL 0 + #define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) -#define QI_DEV_EIOTLB_GLOB(g) ((u64)(g) & 0x1) #define QI_DEV_EIOTLB_PASID(p) ((u64)((p) & 0xfffff) << 32) #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) @@ -404,6 +438,8 @@ struct q_inval { int free_cnt; }; +struct dmar_pci_notify_info; + #ifdef CONFIG_IRQ_REMAP /* 1MB - maximum possible interrupt remapping table size */ #define INTR_REMAP_PAGE_ORDER 8 @@ -418,6 +454,11 @@ struct ir_table { struct irte *base; unsigned long *bitmap; }; + +void intel_irq_remap_add_device(struct dmar_pci_notify_info *info); +#else +static inline void +intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { } #endif struct iommu_flush { @@ -437,8 +478,11 @@ enum { #define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0) #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) +#define VTD_FLAG_SVM_CAPABLE (1 << 2) +#define VTD_FLAG_PGTT_SL_ONLY (1 << 3) extern int intel_iommu_sm; +extern spinlock_t device_domain_lock; #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu) (sm_supported(iommu) && \ @@ -475,10 +519,24 @@ struct context_entry { u64 hi; }; +/* + * When VT-d works in the scalable mode, it allows DMA translation to + * happen through either first level or second level page table. This + * bit marks that the DMA translation for the domain goes through the + * first level page table, otherwise, it goes through the second level. + */ +#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) + +/* + * Domain represents a virtual machine which demands iommu nested + * translation mode support. + */ +#define DOMAIN_FLAG_NESTING_MODE BIT(2) + struct dmar_domain { int nid; /* node id */ - unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED]; + unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED]; /* Refcount of devices per iommu */ @@ -486,11 +544,13 @@ struct dmar_domain { /* Domain ids per IOMMU. Use u16 since * domain ids are 16 bit wide according * to VT-d spec, section 9.3 */ - unsigned int auxd_refcnt; /* Refcount of auxiliary attaching */ - bool has_iotlb_device; + u8 has_iotlb_device: 1; + u8 iommu_coherency: 1; /* indicate coherency of iommu access */ + u8 iommu_snooping: 1; /* indicate snooping control feature */ + struct list_head devices; /* all devices' list */ - struct list_head auxd; /* link to device's auxiliary list */ + struct list_head subdevices; /* all subdevices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ struct dma_pte *pgd; /* virtual address */ @@ -500,20 +560,17 @@ struct dmar_domain { int agaw; int flags; /* flags to find out type of domain */ - - int iommu_coherency;/* indicate coherency of iommu access */ - int iommu_snooping; /* indicate snooping control feature*/ - int iommu_count; /* reference count of iommu */ int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ u64 max_addr; /* maximum mapped address */ - int default_pasid; /* + u32 default_pasid; /* * The default pasid used for non-SVM * traffic on mediated devices. 
*/ - + u32 kernel_pasid; /* for in-kernel DMA w/ PASID */ + atomic_t kernel_pasid_user; /* count of kernel_pasid users */ struct iommu_domain domain; /* generic domain data structure for iommu core */ }; @@ -524,6 +581,7 @@ struct intel_iommu { u64 reg_size; /* size of hw register set */ u64 cap; u64 ecap; + u64 vccap; u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */ raw_spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ @@ -544,7 +602,15 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct completion prq_complete; + struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ + unsigned long num_prqs; /* Page Req */ + unsigned long num_prrs; /* Page Resp SUCCESS */ + unsigned long num_prri; /* Page Resp INVALID */ + #endif + struct iopf_queue *iopf_queue; + unsigned char iopfq_name[16]; struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ @@ -556,18 +622,27 @@ struct intel_iommu { struct iommu_device iommu; /* IOMMU core code handle */ int node; u32 flags; /* Software defined flags */ + struct workqueue_struct *fault_wq; /* Reporting IOMMU fault to device */ struct dmar_drhd_unit *drhd; + void *perf_statistic; +}; + +/* Per subdevice private data */ +struct subdev_domain_info { + struct list_head link_phys; /* link to phys device siblings */ + struct list_head link_domain; /* link to domain siblings */ + struct device *pdev; /* physical device derived from */ + struct dmar_domain *domain; /* aux-domain */ + int users; /* user count */ }; /* PCI domain-device relationship */ struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ - struct list_head table; /* link to pasid table */ - struct list_head auxiliary_domains; /* auxiliary domains - * attached to this device - */ + struct list_head subdevices; /* subdevices sibling */ + u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ u16 pfsid; /* SRIOV physical function source ID */ @@ -592,6 +667,12 @@ static inline void __iommu_flush_cache( clflush_cache_range(addr, size); } +/* Convert generic struct iommu_domain to private struct dmar_domain */ +static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct dmar_domain, domain); +} + /* * 0: readable * 1: writable @@ -613,10 +694,11 @@ static inline void dma_clear_pte(struct dma_pte *pte) static inline u64 dma_pte_addr(struct dma_pte *pte) { #ifdef CONFIG_64BIT - return pte->val & VTD_PAGE_MASK; + return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD); #else /* Must have a full atomic 64-bit read */ - return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK; + return __cmpxchg64(&pte->val, 0ULL, 0ULL) & + VTD_PAGE_MASK & (~DMA_FL_PTE_XD); #endif } @@ -630,13 +712,19 @@ static inline bool dma_pte_superpage(struct dma_pte *pte) return (pte->val & DMA_PTE_LARGE_PAGE); } -static inline int first_pte_in_page(struct dma_pte *pte) +static inline bool first_pte_in_page(struct dma_pte *pte) +{ + return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); +} + +static inline int nr_pte_to_next_page(struct dma_pte *pte) { - return !((unsigned long)pte & ~VTD_PAGE_MASK); + return first_pte_in_page(pte) ? 
BIT_ULL(VTD_STRIDE_SHIFT) : + (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; } extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); -extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); +extern int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu); extern int dmar_enable_qi(struct intel_iommu *iommu); extern void dmar_disable_qi(struct intel_iommu *iommu); @@ -649,30 +737,65 @@ extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, u16 qdep, u64 addr, unsigned mask); -extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); + +void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih); + +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order); +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, + u32 pasid); + +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options); +/* + * Options used in qi_submit_sync: + * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. + */ +#define QI_OPT_WAIT_DRAIN BIT(0) extern int dmar_ir_support(void); void *alloc_pgtable_page(int node); void free_pgtable_page(void *vaddr); struct intel_iommu *domain_get_iommu(struct dmar_domain *domain); -int for_each_device_domain(int (*fn)(struct device_domain_info *info, - void *data), void *data); void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); +struct dmar_domain *find_domain(struct device *dev); +struct device_domain_info *get_domain_info(struct device *dev); +struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); +int domain_get_pasid(struct iommu_domain *domain, struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM -int intel_svm_init(struct intel_iommu *iommu); +extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); - -struct svm_dev_ops; +inline bool is_aux_domain(struct device *dev, + struct iommu_domain *domain); +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data, + void *fault_data); +int intel_svm_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, u32 pasid, u64 user_flags); +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, + void *drvdata); +void intel_svm_unbind(struct iommu_sva *handle); +u32 intel_svm_get_pasid(struct iommu_sva *handle); +int intel_svm_page_response(struct iommu_domain *domain, struct device *dev, + struct iommu_fault_event *evt, + struct iommu_page_response *msg); +void intel_svm_add_pasid_notifier(void); struct intel_svm_dev { struct list_head list; struct rcu_head rcu; struct device *dev; - struct svm_dev_ops *ops; + struct intel_iommu *iommu; + struct dmar_domain *domain; + struct iommu_sva sva; + unsigned long prq_seq_number; + u32 pasid; int users; u16 did; u16 dev_iotlb:1; @@ -682,16 +805,20 @@ struct intel_svm_dev { struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; - struct intel_iommu *iommu; - int flags; - int pasid; + + unsigned int flags; + u32 pasid; + int gpasid; /* In case that guest PASID 
is different from host PASID */ struct list_head devs; struct list_head list; + struct work_struct work; /* For deferred clean up */ }; - -extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev); +#else +static inline void intel_svm_check(struct intel_iommu *iommu) {} #endif +extern int qi_done_no_cpu_relax; + #ifdef CONFIG_INTEL_IOMMU_DEBUGFS void intel_iommu_debugfs_init(void); #else @@ -722,4 +849,32 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) #define intel_iommu_enabled (0) #endif +static inline const char *decode_prq_descriptor(char *str, size_t size, + u64 dw0, u64 dw1, u64 dw2, u64 dw3) +{ + char *buf = str; + int bytes; + + bytes = snprintf(buf, size, + "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx", + FIELD_GET(GENMASK_ULL(31, 16), dw0), + FIELD_GET(GENMASK_ULL(63, 12), dw1), + dw1 & BIT_ULL(0) ? 'r' : '-', + dw1 & BIT_ULL(1) ? 'w' : '-', + dw0 & BIT_ULL(52) ? 'x' : '-', + dw0 & BIT_ULL(53) ? 'p' : '-', + dw1 & BIT_ULL(2) ? 'l' : '-', + FIELD_GET(GENMASK_ULL(51, 32), dw0), + FIELD_GET(GENMASK_ULL(11, 3), dw1)); + + /* Private Data */ + if (dw0 & BIT_ULL(9)) { + size -= bytes; + buf += bytes; + snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3); + } + + return str; +} + #endif diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index d7c403d0dd27d84ecdc8d8b094739e32b85968e2..7c0dfe29ff4ab4e02002dc50ddb3fe792aaece37 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -8,29 +8,14 @@ #ifndef __INTEL_SVM_H__ #define __INTEL_SVM_H__ -struct device; - -struct svm_dev_ops { - void (*fault_cb)(struct device *dev, int pasid, u64 address, - void *private, int rwxp, int response); -}; - /* Values for rxwp in fault_cb callback */ #define SVM_REQ_READ (1<<3) #define SVM_REQ_WRITE (1<<2) #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) - -/* - * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" - * PASID for the current process. Even if a PASID already exists, a new one - * will be allocated. And the PASID allocated with SVM_FLAG_PRIVATE_PASID - * will not be given to subsequent callers. This facility allows a driver to - * disambiguate between multiple device contexts which access the same MM, - * if there is no other way to do so. It should be used sparingly, if at all. - */ -#define SVM_FLAG_PRIVATE_PASID (1<<0) +/* Page Request Queue depth */ +#define PRQ_RING_MASK ((0x1000 << prq_size_page_order) - 0x20) /* * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only @@ -43,91 +28,18 @@ struct svm_dev_ops { * It is unlikely that we will ever hook into flush_tlb_kernel_range() to * do such IOTLB flushes automatically. */ -#define SVM_FLAG_SUPERVISOR_MODE (1<<1) - -#ifdef CONFIG_INTEL_IOMMU_SVM - -/** - * intel_svm_bind_mm() - Bind the current process to a PASID - * @dev: Device to be granted access - * @pasid: Address for allocated PASID - * @flags: Flags. Later for requesting supervisor mode, etc. - * @ops: Callbacks to device driver - * - * This function attempts to enable PASID support for the given device. - * If the @pasid argument is non-%NULL, a PASID is allocated for access - * to the MM of the current process. - * - * By using a %NULL value for the @pasid argument, this function can - * be used to simply validate that PASID support is available for the - * given device — i.e. that it is behind an IOMMU which has the - * requisite support, and is enabled. 
- * - * Page faults are handled transparently by the IOMMU code, and there - * should be no need for the device driver to be involved. If a page - * fault cannot be handled (i.e. is an invalid address rather than - * just needs paging in), then the page request will be completed by - * the core IOMMU code with appropriate status, and the device itself - * can then report the resulting fault to its driver via whatever - * mechanism is appropriate. - * - * Multiple calls from the same process may result in the same PASID - * being re-used. A reference count is kept. - */ -extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, - struct svm_dev_ops *ops); - -/** - * intel_svm_unbind_mm() - Unbind a specified PASID - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be unbound - * - * This function allows a PASID to be retired when the device no - * longer requires access to the address space of a given process. - * - * If the use count for the PASID in question reaches zero, the - * PASID is revoked and may no longer be used by hardware. - * - * Device drivers are required to ensure that no access (including - * page requests) is currently outstanding for the PASID in question, - * before calling this function. +#define SVM_FLAG_SUPERVISOR_MODE BIT(0) +/* + * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest + * processes. Compared to the host bind, the primary differences are: + * 1. mm life cycle management + * 2. fault reporting */ -extern int intel_svm_unbind_mm(struct device *dev, int pasid); - -/** - * intel_svm_is_pasid_valid() - check if pasid is valid - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be checked - * - * This function checks if the specified pasid is still valid. A - * valid pasid means the backing mm is still having a valid user. - * For kernel callers init_mm is always valid. for other mm, if mm->mm_users - * is non-zero, it is valid. - * - * returns -EINVAL if invalid pasid, 0 if pasid ref count is invalid - * 1 if pasid is valid. +#define SVM_FLAG_GUEST_MODE BIT(1) +/* + * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space, + * which requires guest and host PASID translation at both directions. 
*/ -extern int intel_svm_is_pasid_valid(struct device *dev, int pasid); - -#else /* CONFIG_INTEL_IOMMU_SVM */ - -static inline int intel_svm_bind_mm(struct device *dev, int *pasid, - int flags, struct svm_dev_ops *ops) -{ - return -ENOSYS; -} - -static inline int intel_svm_unbind_mm(struct device *dev, int pasid) -{ - BUG(); -} - -static inline int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - return -EINVAL; -} -#endif /* CONFIG_INTEL_IOMMU_SVM */ - -#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL)) +#define SVM_FLAG_GUEST_PASID BIT(2) #endif /* __INTEL_SVM_H__ */ diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index efb3ce892c20dc1f87d35abb43a5c32bbe6b0b17..a12fca5b401dc23a765f1ddf78ff2e73b4272462 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -55,6 +55,12 @@ enum rapl_primitives { THROTTLED_TIME, PRIORITY_LEVEL, + PSYS_POWER_LIMIT1, + PSYS_POWER_LIMIT2, + PSYS_PL1_ENABLE, + PSYS_PL2_ENABLE, + PSYS_TIME_WINDOW1, + PSYS_TIME_WINDOW2, /* below are not raw primitive data */ AVERAGE_POWER, NR_RAPL_PRIMITIVES, diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index cf34dbaf2c114fce63f3d30becdb3aef4ffcddf3..e0f44cc660e73c7678fd86513a46b76619b99523 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -301,39 +301,8 @@ struct irq_affinity_desc { extern cpumask_var_t irq_default_affinity; -/* Internal implementation. Use the helpers below */ -extern int __irq_set_affinity(unsigned int irq, const struct cpumask *cpumask, - bool force); - -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - * Fails if cpumask does not contain an online CPU - */ -static inline int -irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) -{ - return __irq_set_affinity(irq, cpumask, false); -} - -/** - * irq_force_affinity - Force the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - * Same as irq_set_affinity, but without checking the mask against - * online cpus. - * - * Solely for low level cpu hotplug code, where we need to make per - * cpu interrupts affine before the cpu becomes online. - */ -static inline int -irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) -{ - return __irq_set_affinity(irq, cpumask, true); -} +extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); +extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); @@ -474,6 +443,8 @@ extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool state); +int irq_set_auxdata(unsigned int irq, unsigned int which, u64 val); + #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads (true) diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ec7a13405f10bbece1606f51f7f6b27df1dcfe48..a2ebcbff7d1a7823c19e2059f245cc58572d7906 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -31,7 +31,7 @@ enum io_pgtable_fmt { * single page. IOMMUs that cannot batch TLB invalidation * operations efficiently will typically issue them here, but * others may decide to update the iommu_iotlb_gather structure - * and defer the invalidation until iommu_tlb_sync() instead. 
+ * and defer the invalidation until iommu_iotlb_sync() instead. * * Note that these can all be called in atomic context and must therefore * not block. diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h new file mode 100644 index 0000000000000000000000000000000000000000..02be9292a74a4ac4c58040da1858a454f246e723 --- /dev/null +++ b/include/linux/ioasid.h @@ -0,0 +1,337 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_IOASID_H +#define __LINUX_IOASID_H + +#include +#include +#include +#include +#include + +#define INVALID_IOASID ((ioasid_t)-1) +#define IOASID_DMA_NO_PASID 0 /* For DMA request w/o PASID */ +#define IOASID_ALLOC_BASE 1 /* Start of the allocation */ + +typedef unsigned int ioasid_t; +typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data); +typedef void (*ioasid_free_fn_t)(ioasid_t ioasid, void *data); + +/* IOASID set types */ +enum ioasid_set_type { + IOASID_SET_TYPE_NULL = 1, /* Set token is NULL */ + IOASID_SET_TYPE_MM, /* Set token is a mm_struct pointer + * i.e. associated with a process + */ + IOASID_SET_TYPE_NR, +}; + +/** + * struct ioasid_set - Meta data about ioasid_set + * @nh: List of notifiers private to that set + * @xa: XArray to store ioasid_set private IDs, can be used for + * guest-host IOASID mapping, or just a private IOASID namespace. + * @token: Unique to identify an IOASID set + * @type: Token types + * @quota: Max number of IOASIDs can be allocated within the set + * @nr_ioasids: Number of IOASIDs currently allocated in the set + * @id: ID of the set + */ +struct ioasid_set { + struct atomic_notifier_head nh; + struct xarray xa; + void *token; + int type; + int quota; + atomic_t nr_ioasids; + int id; + bool free_pending; + struct rcu_head rcu; +}; + +/** + * struct ioasid_allocator_ops - IOASID allocator helper functions and data + * + * @alloc: helper function to allocate IOASID + * @free: helper function to free IOASID + * @list: for tracking ops that share helper functions but not data + * @pdata: data belong to the allocator, provided when calling alloc() + */ +struct ioasid_allocator_ops { + ioasid_alloc_fn_t alloc; + ioasid_free_fn_t free; + struct list_head list; + void *pdata; +}; + +/* Notification data when IOASID status changed */ +enum ioasid_notify_val { + IOASID_NOTIFY_ALLOC = 1, + IOASID_NOTIFY_FREE, + IOASID_NOTIFY_BIND, + IOASID_NOTIFY_UNBIND, +}; + +#define IOASID_NOTIFY_FLAG_ALL BIT(0) +#define IOASID_NOTIFY_FLAG_SET BIT(1) +/** + * enum ioasid_notifier_prios - IOASID event notification order + * + * When status of an IOASID changes, users might need to take actions to + * reflect the new state. For example, when an IOASID is freed due to + * exception, the hardware context in virtual CPU, DMA device, and IOMMU + * shall be cleared and drained. Order is required to prevent life cycle + * problems. + */ +enum ioasid_notifier_prios { + IOASID_PRIO_LAST, + IOASID_PRIO_DEVICE, + IOASID_PRIO_IOMMU, + IOASID_PRIO_CPU, +}; + +/** + * struct ioasid_nb_args - Argument provided by IOASID core when notifier + * is called. 
+ * @id: The IOASID being notified + * @spid: The set private ID associated with the IOASID + * @set: The IOASID set of @id + * @pdata: The private data attached to the IOASID + */ +struct ioasid_nb_args { + ioasid_t id; + ioasid_t spid; + struct ioasid_set *set; + void *pdata; +}; + +#if IS_ENABLED(CONFIG_IOASID) +void ioasid_install_capacity(ioasid_t total); +int ioasid_reserve_capacity(ioasid_t nr_ioasid); +int ioasid_cancel_capacity(ioasid_t nr_ioasid); +struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type); +int ioasid_set_free(struct ioasid_set *set); +struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token); + +ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, + void *private); +int ioasid_get(struct ioasid_set *set, ioasid_t ioasid); +int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid); +int ioasid_get_if_owned(ioasid_t ioasid); +bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid); +bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid); +void ioasid_free(struct ioasid_set *set, ioasid_t ioasid); +void ioasid_free_all_in_set(struct ioasid_set *set); +void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, + bool (*getter)(void *)); +struct ioasid_set *ioasid_find_set(ioasid_t ioasid); +int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); +void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator); +int ioasid_attach_data(ioasid_t ioasid, void *data); +void ioasid_detach_data(ioasid_t ioasid); +int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid); +void ioasid_detach_spid(ioasid_t ioasid); +ioasid_t ioasid_find_by_spid(struct ioasid_set *set, ioasid_t spid, bool get); +int ioasid_register_notifier(struct ioasid_set *set, + struct notifier_block *nb); +void ioasid_unregister_notifier(struct ioasid_set *set, + struct notifier_block *nb); +void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, + void (*fn)(ioasid_t id, void *data), + void *data); +int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb); +void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block *nb); +#ifdef CONFIG_CGROUP_IOASIDS +int ioasid_cg_charge(struct ioasid_set *set); +void ioasid_cg_uncharge(struct ioasid_set *set); +#else +/* No cgroup control, allocation will proceed until run out total pool */ +static inline int ioasid_cg_charge(struct ioasid_set *set) +{ + return 0; +} + +static inline int ioasid_cg_uncharge(struct ioasid_set *set) +{ + return 0; +} +#endif /* CGROUP_IOASIDS */ +bool ioasid_queue_work(struct work_struct *work); + +/* IOASID userspace support */ +struct ioasid_user; +#if IS_ENABLED(CONFIG_IOASID_USER) +extern struct ioasid_user *ioasid_user_get_from_task(struct task_struct *task); +extern void ioasid_user_put(struct ioasid_user *iuser); +extern void ioasid_user_for_each_id(struct ioasid_user *iuser, void *data, + void (*fn)(ioasid_t id, void *data)); +extern struct ioasid_set *host_pasid_set; + +#else /* CONFIG_IOASID_USER */ +static inline struct ioasid_user * +ioasid_user_get_from_task(struct task_struct *task) +{ + return ERR_PTR(-ENOTTY); +} + +static inline void ioasid_user_put(struct ioasid_user *iuser) +{ +} + +static inline void ioasid_user_for_each_id(struct ioasid_user *iuser, void *data, + void (*fn)(ioasid_t id, void *data)) +{ +} +#endif /* CONFIG_IOASID_USER */ + +static inline bool pasid_valid(ioasid_t ioasid) +{ + return ioasid != INVALID_IOASID; +} + +#else /* !CONFIG_IOASID */ + +static inline void 
ioasid_install_capacity(ioasid_t total) +{ +} + +static inline int ioasid_reserve_capacity(ioasid_t nr_ioasid) +{ + return -ENOSPC; +} + +static inline int ioasid_cancel_capacity(ioasid_t nr_ioasid) +{ + return -EINVAL; +} + +static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, + ioasid_t max, void *private) +{ + return INVALID_IOASID; +} + +static inline struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, + ioasid_set_type type) +{ + return ERR_PTR(-ENOTSUPP); +} + +static inline void ioasid_free(struct ioasid_set *set, ioasid_t ioasid) +{ +} + +static inline struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token) +{ + return NULL; +} + +static inline int ioasid_get(struct ioasid_set *set, ioasid_t ioasid) +{ + return -ENOTSUPP; +} + +static inline int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid) +{ + return -ENOTSUPP; +} + +static inline int ioasid_get_if_owned(ioasid_t ioasid) +{ + return -ENOTSUPP; +} + +static inline bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid) +{ + return false; +} + +static inline bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid) +{ + return false; +} + +static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, + bool (*getter)(void *)) +{ + return NULL; +} + +static inline int ioasid_register_notifier(struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline void ioasid_unregister_notifier(struct notifier_block *nb) +{ +} + +static inline int ioasid_register_allocator(struct ioasid_allocator_ops *allocator) +{ + return -ENOTSUPP; +} + +static inline void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator) +{ +} + +static inline int ioasid_attach_data(ioasid_t ioasid, void *data) +{ + return -ENOTSUPP; +} + +static inline bool pasid_valid(ioasid_t ioasid) +{ + return false; +} + +static inline void ioasid_detach_data(ioasid_t ioasid) +{ +} + +static inline void ioasid_free_all_in_set(struct ioasid_set *set) +{ +} + +static inline struct ioasid_set *ioasid_find_set(ioasid_t ioasid) +{ + return ERR_PTR(-ENOTSUPP); +} + +static inline int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid) +{ + return -ENOTSUPP; +} + +static inline void ioasid_detach_spid(ioasid_t ioasid) +{ +} + +static inline ioasid_t ioasid_find_by_spid(struct ioasid_set *set, + ioasid_t spid, bool get) +{ + return INVALID_IOASID; +} + +static inline void ioasid_set_for_each_ioasid(struct ioasid_set *sdata, + void (*fn)(ioasid_t id, void *data), + void *data) +{ +} + +static inline int ioasid_register_notifier_mm(struct mm_struct *mm, + struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline void ioasid_unregister_notifier_mm(struct mm_struct *mm, + struct notifier_block *nb) +{ +} + +static inline bool ioasid_queue_work(struct work_struct *work) +{ + return false; +} +#endif /* CONFIG_IOASID */ +#endif /* __LINUX_IOASID_H */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 87ea30c5b39dfa98e3421fda5c19447833350622..51e0ebfa9639c9e4248fe5ba74b5baaf46db4975 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define IOMMU_READ (1 << 0) @@ -45,15 +46,16 @@ struct iommu_domain; struct notifier_block; struct iommu_sva; struct iommu_fault_event; +struct iommu_dma_cookie; /* iommu fault flags */ -#define IOMMU_FAULT_READ 0x0 -#define IOMMU_FAULT_WRITE 0x1 +#define IOMMU_FAULT_READ (1 << 0) +#define IOMMU_FAULT_WRITE (1 << 1) +#define IOMMU_FAULT_EXEC (1 << 2) +#define 
IOMMU_FAULT_PRIV (1 << 3) typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *, - void *); typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { @@ -67,6 +69,7 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API implementation */ #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ +#define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ /* * This are the possible domain-types @@ -79,12 +82,17 @@ struct iommu_domain_geometry { * IOMMU_DOMAIN_DMA - Internally used for DMA-API implementations. * This flag allows IOMMU drivers to implement * certain optimizations for these domains + * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB + * invalidation. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) #define IOMMU_DOMAIN_UNMANAGED (__IOMMU_DOMAIN_PAGING) #define IOMMU_DOMAIN_DMA (__IOMMU_DOMAIN_PAGING | \ __IOMMU_DOMAIN_DMA_API) +#define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \ + __IOMMU_DOMAIN_DMA_API | \ + __IOMMU_DOMAIN_DMA_FQ) struct iommu_domain { unsigned type; @@ -93,14 +101,21 @@ struct iommu_domain { iommu_fault_handler_t handler; void *handler_token; struct iommu_domain_geometry geometry; - void *iova_cookie; + struct iommu_dma_cookie *iova_cookie; }; +static inline bool iommu_is_dma_domain(struct iommu_domain *domain) +{ + return domain->type & __IOMMU_DOMAIN_DMA_API; +} + enum iommu_cap { IOMMU_CAP_CACHE_COHERENCY, /* IOMMU can enforce cache coherent DMA transactions */ IOMMU_CAP_INTR_REMAP, /* IOMMU supports interrupt isolation */ IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ + IOMMU_CAP_VIOMMU_HINT, /* IOMMU can detect a hit for running in + VM */ }; /* @@ -123,8 +138,8 @@ enum iommu_attr { DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_FSL_PAMUV1, - DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, + DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_MAX, }; @@ -166,46 +181,35 @@ struct iommu_resv_region { enum iommu_dev_features { IOMMU_DEV_FEAT_AUX, /* Aux-domain feature */ IOMMU_DEV_FEAT_SVA, /* Shared Virtual Addresses */ + IOMMU_DEV_FEAT_HWDBM, /* Hardware Dirty Bit Management */ }; #define IOMMU_PASID_INVALID (-1U) -/** - * struct iommu_sva_ops - device driver callbacks for an SVA context - * - * @mm_exit: called when the mm is about to be torn down by exit_mmap. After - * @mm_exit returns, the device must not issue any more transaction - * with the PASID given as argument. - * - * The @mm_exit handler is allowed to sleep. Be careful about the - * locks taken in @mm_exit, because they might lead to deadlocks if - * they are also held when dropping references to the mm. Consider the - * following call chain: - * mutex_lock(A); mmput(mm) -> exit_mm() -> @mm_exit() -> mutex_lock(A) - * Using mmput_async() prevents this scenario. 
- * - */ -struct iommu_sva_ops { - iommu_mm_exit_handler_t mm_exit; -}; - #ifdef CONFIG_IOMMU_API /** * struct iommu_iotlb_gather - Range information for a pending IOTLB flush * * @start: IOVA representing the start of the range to be flushed - * @end: IOVA representing the end of the range to be flushed (exclusive) + * @end: IOVA representing the end of the range to be flushed (inclusive) * @pgsize: The interval at which to perform the flush + * @freelist: Removed pages to free after sync + * @queued: Indicates that the flush will be queued * * This structure is intended to be updated by multiple calls to the * ->unmap() function in struct iommu_ops before eventually being passed - * into ->iotlb_sync(). + * into ->iotlb_sync(). Drivers can add pages to @freelist to be freed after + * ->iotlb_sync() or ->iotlb_flush_all() have cleared all cached references to + * them. @queued is set to indicate when ->iotlb_flush_all() will be called + * later instead of ->iotlb_sync(), so drivers may optimise accordingly. */ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; + struct page *freelist; + bool queued; }; /** @@ -222,16 +226,17 @@ struct iommu_iotlb_gather { * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue * @iova_to_phys: translate iova to physical address - * @add_device: add device to iommu grouping - * @remove_device: remove device from iommu grouping + * @probe_device: Add device to iommu driver handling + * @release_device: Remove device from iommu driver handling + * @probe_finalize: Do final setup work after the device is added to an IOMMU + * group and attached to the groups domain * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes - * @domain_set_attr: Change domain attributes + * @domain_set_attr: Change domain attributes * @get_resv_regions: Request list of reserved regions for a device * @put_resv_regions: Free list of reserved regions for a device * @apply_resv_region: Temporary helper call-back for iova reserved ranges * @domain_window_enable: Configure and enable a particular window for a domain - * @domain_window_disable: Disable a particular window for a domain * @of_xlate: add OF master IDs to iommu grouping * @is_attach_deferred: Check if domain attach should be deferred from iommu * driver init to device driver init (default no) @@ -244,7 +249,19 @@ struct iommu_iotlb_gather { * @sva_unbind: Unbind process address space from device * @sva_get_pasid: Get PASID associated to a SVA handle * @page_response: handle page request response + * @cache_invalidate: invalidate translation caches + * @sva_bind_gpasid: bind guest pasid and mm + * @sva_unbind_gpasid: unbind guest pasid and mm + * @def_domain_type: device default domain type, return value: + * - IOMMU_DOMAIN_IDENTITY: must use an identity domain + * - IOMMU_DOMAIN_DMA: must use a dma domain + * - 0: use the default setting + * @enable_pasid_dma: Set up PASID for in-kernel DMA + * @disable_pasid_dma: Disable in-kernel DMA with PASID on the device * @pgsize_bitmap: bitmap of all possible supported page sizes + * @owner: Driver module providing these ops + * @sva_suspend_pasid: stop activities related to a pasid but maintain the bond + * @sva_resume_pasid: start activities related to a pasid */ struct iommu_ops { bool (*capable)(enum iommu_cap); @@ -256,16 +273,18 @@ struct iommu_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, 
struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot); + phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); - void (*iotlb_sync_map)(struct iommu_domain *domain); + void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, + size_t size); void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); - int (*add_device)(struct device *dev); - void (*remove_device)(struct device *dev); + struct iommu_device *(*probe_device)(struct device *dev); + void (*release_device)(struct device *dev); + void (*probe_finalize)(struct device *dev); struct iommu_group *(*device_group)(struct device *dev); int (*domain_get_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); @@ -282,7 +301,6 @@ struct iommu_ops { /* Window handling functions */ int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot); - void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr); int (*of_xlate)(struct device *dev, struct of_phandle_args *args); bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev); @@ -301,13 +319,48 @@ struct iommu_ops { struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm, void *drvdata); void (*sva_unbind)(struct iommu_sva *handle); - int (*sva_get_pasid)(struct iommu_sva *handle); + u32 (*sva_get_pasid)(struct iommu_sva *handle); - int (*page_response)(struct device *dev, + int (*page_response)(struct iommu_domain *domain, + struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); - + int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev, + struct iommu_cache_invalidate_info *inv_info); + int (*sva_bind_gpasid)(struct iommu_domain *domain, + struct device *dev, + struct iommu_gpasid_bind_data *data, + void *fault_data); + + int (*sva_unbind_gpasid)(struct iommu_domain *domain, + struct device *dev, u32 pasid, u64 flags); + + void (*sva_suspend_pasid)(struct device *dev, u32 pasid); + + void (*sva_resume_pasid)(struct device *dev, u32 pasid); + + int (*def_domain_type)(struct device *dev); + + int (*merge_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t phys, size_t size); + int (*split_block)(struct iommu_domain *domain, unsigned long iova, + size_t size); + int (*set_hwdbm)(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size); + + int (*sync_dirty_log)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); + int (*clear_dirty_log)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, unsigned long base_iova, + unsigned long bitmap_pgshift); + + int (*enable_pasid_dma)(struct device *dev, u32 pasid); + int (*disable_pasid_dma)(struct device *dev); unsigned long pgsize_bitmap; + struct module *owner; #ifdef CONFIG_SMMU_BYPASS_DEV #ifndef __GENKSYMS__ @@ -338,38 +391,56 @@ struct iommu_device { * * @fault: fault descriptor * @list: pending fault event list, used for tracking responses + * @expire: time limit in jiffies will wait for page response */ struct iommu_fault_event { struct iommu_fault fault; struct 
list_head list; + u64 expire; + u64 vector; +}; + +struct iommu_fault_handler_data { + u32 vector; + void *data; + struct list_head list; }; /** * struct iommu_fault_param - per-device IOMMU fault data * @handler: Callback function to handle IOMMU faults at device level - * @data: handler private data - * @faults: holds the pending faults which needs response + * @data: handler private data list + * @faults: holds the pending faults which needs response, e.g. page response. * @lock: protect pending faults list + * @timer: track page request pending time limit */ struct iommu_fault_param { iommu_dev_fault_handler_t handler; - void *data; + struct list_head data; struct list_head faults; + struct timer_list timer; struct mutex lock; }; /** - * struct iommu_param - collection of per-device IOMMU data + * struct dev_iommu - Collection of per-device IOMMU data * * @fault_param: IOMMU detected device fault reporting data + * @iopf_param: I/O Page Fault queue and data + * @fwspec: IOMMU fwspec data + * @iommu_dev: IOMMU device this device is linked to + * @priv: IOMMU Driver private data * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. * struct iommu_group *iommu_group; - * struct iommu_fwspec *iommu_fwspec; */ -struct iommu_param { +struct dev_iommu { struct mutex lock; - struct iommu_fault_param *fault_param; + struct iommu_fault_param *fault_param; + struct iopf_device_param *iopf_param; + struct iommu_fwspec *fwspec; + struct iommu_device *iommu_dev; + void *priv; }; int iommu_device_register(struct iommu_device *iommu); @@ -381,6 +452,7 @@ int iommu_device_sysfs_add(struct iommu_device *iommu, void iommu_device_sysfs_remove(struct iommu_device *iommu); int iommu_device_link(struct iommu_device *iommu, struct device *link); void iommu_device_unlink(struct iommu_device *iommu, struct device *link); +int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain); static inline void iommu_device_set_ops(struct iommu_device *iommu, const struct iommu_ops *ops) @@ -414,6 +486,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) #define IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER 6 /* Post Driver unbind */ extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); +extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); @@ -423,25 +496,43 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); +extern int iommu_uapi_cache_invalidate(struct iommu_domain *domain, + struct device *dev, + void __user *uinfo); + +extern int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, + struct device *dev, + void __user *udata, + void *fault_data); +extern int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, void __user *udata); +extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + u64 flags); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); +extern int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot); 
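For context only, and not part of the patch itself: a minimal sketch of how a caller might drive the unmanaged-domain prototypes declared in this hunk (iommu_domain_alloc, iommu_attach_device, iommu_map, iommu_unmap). The function name, IOVA, and buffer address below are hypothetical placeholders; the _atomic and _sg variants added by this patch follow the same pattern for atomic context and scatterlists.

/*
 * Illustrative sketch only; assumes the caller already owns a suitable
 * struct device and a DMA-able physical buffer (buf_phys is a placeholder).
 */
static int example_direct_map(struct device *dev, phys_addr_t buf_phys)
{
	struct iommu_domain *domain;
	const unsigned long iova = 0x10000;	/* arbitrary example IOVA */
	const size_t size = 4096;		/* one page */
	int ret;

	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* Map one page read/write at the chosen IOVA. */
	ret = iommu_map(domain, iova, buf_phys, size, IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* ... device issues DMA against @iova here ... */

	iommu_unmap(domain, iova, size);
out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}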
extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size); extern size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); -extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg,unsigned int nents, int prot); +extern ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot); +extern ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); -extern int iommu_request_dm_for_dev(struct device *dev); -extern int iommu_request_dma_domain_for_dev(struct device *dev); +extern void generic_iommu_put_resv_regions(struct device *dev, + struct list_head *list); extern void iommu_set_default_passthrough(bool cmd_line); extern void iommu_set_default_translated(bool cmd_line); extern bool iommu_default_passthrough(void); @@ -481,34 +572,48 @@ extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); -extern int iommu_page_response(struct device *dev, - struct iommu_page_response *msg); +extern int iommu_add_device_fault_data(struct device *dev, + int vector, void *data); +extern void iommu_delete_device_fault_data(struct device *dev, int vector); +extern int iommu_page_response(struct iommu_domain *domain, + struct device *dev, + void __user *uinfo); extern int iommu_group_id(struct iommu_group *group); -extern struct iommu_group *iommu_group_get_for_dev(struct device *dev); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); +extern int iommu_domain_set_hwdbm(struct iommu_domain *domain, bool enable, + unsigned long iova, size_t size); + extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, void *data); extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, void *data); +extern int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova, + size_t size, unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift); +extern int iommu_clear_dirty_log(struct iommu_domain *domain, unsigned long iova, + size_t dma_size, unsigned long *bitmap, + unsigned long base_iova, + unsigned long bitmap_pgshift); + /* Window handling function prototypes */ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); -extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -static inline void iommu_flush_tlb_all(struct iommu_domain *domain) +static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) { if (domain->ops->flush_iotlb_all) domain->ops->flush_iotlb_all(domain); } -static inline void iommu_tlb_sync(struct iommu_domain *domain, +static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { if (domain->ops->iotlb_sync) @@ 
-521,7 +626,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size) { - unsigned long start = iova, end = start + size; + unsigned long start = iova, end = start + size - 1; /* * If the new page is disjoint from the current range or is mapped at @@ -531,7 +636,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, if (gather->pgsize != size || end < gather->start || start > gather->end) { if (gather->pgsize) - iommu_tlb_sync(domain, gather); + iommu_iotlb_sync(domain, gather); gather->pgsize = size; } @@ -553,17 +658,16 @@ struct iommu_group *fsl_mc_device_group(struct device *dev); * struct iommu_fwspec - per-device IOMMU instance data * @ops: ops for this device's IOMMU * @iommu_fwnode: firmware handle for this device's IOMMU - * @iommu_priv: IOMMU driver private data for this device + * @flags: IOMMU_FWSPEC_* flags * @num_ids: number of associated device IDs * @ids: IDs which this device may present to the IOMMU */ struct iommu_fwspec { const struct iommu_ops *ops; struct fwnode_handle *iommu_fwnode; - void *iommu_priv; u32 flags; unsigned int num_ids; - u32 ids[1]; + u32 ids[]; }; /* ATS is supported */ @@ -574,7 +678,6 @@ struct iommu_fwspec { */ struct iommu_sva { struct device *dev; - const struct iommu_sva_ops *ops; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, @@ -585,19 +688,31 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { - return dev->iommu_fwspec; + if (dev->iommu) + return dev->iommu->fwspec; + else + return NULL; } static inline void dev_iommu_fwspec_set(struct device *dev, struct iommu_fwspec *fwspec) { - dev->iommu_fwspec = fwspec; + dev->iommu->fwspec = fwspec; +} + +static inline void *dev_iommu_priv_get(struct device *dev) +{ + return dev->iommu->priv; +} + +static inline void dev_iommu_priv_set(struct device *dev, void *priv) +{ + dev->iommu->priv = priv; } int iommu_probe_device(struct device *dev); void iommu_release_device(struct device *dev); -bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features f); @@ -609,9 +724,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata); void iommu_sva_unbind_device(struct iommu_sva *handle); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops); -int iommu_sva_get_pasid(struct iommu_sva *handle); +u32 iommu_sva_get_pasid(struct iommu_sva *handle); #else /* CONFIG_IOMMU_API */ @@ -668,6 +781,13 @@ static inline int iommu_map(struct iommu_domain *domain, unsigned long iova, return -ENODEV; } +static inline int iommu_map_atomic(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot) +{ + return -ENODEV; +} + static inline size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { @@ -681,18 +801,25 @@ static inline size_t iommu_unmap_fast(struct iommu_domain *domain, return 0; } -static inline size_t iommu_map_sg(struct iommu_domain *domain, - unsigned long iova, struct scatterlist *sg, - unsigned int nents, int prot) +static inline ssize_t iommu_map_sg(struct iommu_domain *domain, + 
unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot) { - return 0; + return -ENODEV; } -static inline void iommu_flush_tlb_all(struct iommu_domain *domain) +static inline ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot) { + return -ENODEV; } -static inline void iommu_tlb_sync(struct iommu_domain *domain, +static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) +{ +} + +static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { } @@ -704,11 +831,6 @@ static inline int iommu_domain_window_enable(struct iommu_domain *domain, return -ENODEV; } -static inline void iommu_domain_window_disable(struct iommu_domain *domain, - u32 wnd_nr) -{ -} - static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { return 0; @@ -735,16 +857,6 @@ static inline int iommu_get_group_resv_regions(struct iommu_group *group, return -ENODEV; } -static inline int iommu_request_dm_for_dev(struct device *dev) -{ - return -ENODEV; -} - -static inline int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return -ENODEV; -} - static inline void iommu_set_default_passthrough(bool cmd_line) { } @@ -848,8 +960,20 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) return -ENODEV; } -static inline int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) +static inline +int iommu_add_device_fault_data(struct device *dev, int vector, void *data) +{ + return -ENODEV; +} + +static inline +void iommu_delete_device_fault_data(struct device *dev, int vector) +{ +} + +static inline int iommu_page_response(struct iommu_domain *domain, + struct device *dev, + void __user *uinfo) { return -ENODEV; } @@ -871,6 +995,32 @@ static inline int iommu_domain_set_attr(struct iommu_domain *domain, return -EINVAL; } +static inline int iommu_sync_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long pgshift) +{ + return -EINVAL; +} + +static inline int iommu_clear_dirty_log(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long *bitmap, + unsigned long base_iova, + unsigned long pgshift) +{ + return -ENODEV; +} + +static inline int iommu_domain_set_hwdbm(struct iommu_domain *domain, + bool enable, + unsigned long iova, + size_t size) +{ + return -EINVAL; +} + static inline int iommu_device_register(struct iommu_device *iommu) { return -ENODEV; @@ -949,12 +1099,6 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) return NULL; } -static inline bool -iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat) -{ - return false; -} - static inline bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat) { @@ -1000,19 +1144,70 @@ static inline void iommu_sva_unbind_device(struct iommu_sva *handle) { } -static inline int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops) +static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) { - return -EINVAL; + return IOMMU_PASID_INVALID; } -static inline int iommu_sva_get_pasid(struct iommu_sva *handle) +static inline int +iommu_uapi_cache_invalidate(struct iommu_domain *domain, + struct device *dev, + struct iommu_cache_invalidate_info *inv_info) { - return IOMMU_PASID_INVALID; + return -ENODEV; } +static inline int 
iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, + struct device *dev, void __user *udata, + void *fault_data) +{ + return -ENODEV; +} + +static inline int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, void __user *udata) +{ + return -ENODEV; +} + +static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, + struct device *dev, + ioasid_t pasid, + u64 flags) +{ + return -ENODEV; +} + +static inline void sva_suspend_pasid(struct device *dev, u32 pasid) +{ +} + +static inline void (*sva_resume_pasid)(struct device *dev, u32 pasid) +{ +} + +static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) +{ + return NULL; +} #endif /* CONFIG_IOMMU_API */ +/** + * iommu_map_sgtable - Map the given buffer to the IOMMU domain + * @domain: The IOMMU domain to perform the mapping + * @iova: The start address to map the buffer + * @sgt: The sg_table object describing the buffer + * @prot: IOMMU protection bits + * + * Creates a mapping at @iova for the buffer described by a scatterlist + * stored in the given sg_table object in the provided IOMMU domain. + */ +static inline size_t iommu_map_sgtable(struct iommu_domain *domain, + unsigned long iova, struct sg_table *sgt, int prot) +{ + return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot); +} + #ifdef CONFIG_IOMMU_DEBUGFS extern struct dentry *iommu_debugfs_dir; void iommu_debugfs_setup(void); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 982f87a4f52f5cf1d74e24c414cf2477a3d628c6..06f5c7bb405becbad7212f2222ca3df299a21b45 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -141,6 +141,7 @@ enum { IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, IORES_DESC_RESERVED = 7, + IORES_DESC_SOFT_RESERVED = 8, }; /* diff --git a/include/linux/iova.h b/include/linux/iova.h index a0637abffee88b0f0b12f3c6520cd5d2d77e779d..aee26846dcf908c07e40451d9bbb5683dadb23f6 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -95,6 +95,7 @@ struct iova_domain { flush-queues */ atomic_t fq_timer_on; /* 1 when timer is active, 0 when not */ + struct hlist_node cpuhp_dead; }; static inline unsigned long iova_size(struct iova *iova) @@ -152,17 +153,14 @@ unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool flush_rcache); struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); -void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn); -bool has_iova_flush_queue(struct iova_domain *iovad); int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi); -void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); #else static inline int iova_cache_get(void) { @@ -225,22 +223,12 @@ static inline struct iova *reserve_iova(struct iova_domain *iovad, return NULL; } -static inline void copy_reserved_iova(struct iova_domain *from, - struct iova_domain *to) -{ -} - static inline void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn) { } -static inline bool has_iova_flush_queue(struct iova_domain 
*iovad) -{ - return false; -} - static inline int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor) @@ -266,10 +254,6 @@ static inline struct iova *split_and_remove_iova(struct iova_domain *iovad, return NULL; } -static inline void free_cpu_cached_iovas(unsigned int cpu, - struct iova_domain *iovad) -{ -} #endif #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index 3d7b04d5264ce6841636e2c5a5c17d62c2c7011d..dbb2d08e010b57755e56c62b76e1bbc3fff5b19f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -481,6 +481,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * irq_request_resources * @irq_compose_msi_msg: optional to compose message content for MSI * @irq_write_msi_msg: optional to write message content for MSI + * @irq_set_auxdata: Optional function to update auxiliary data e.g. in + * shared registers * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine @@ -528,6 +530,8 @@ struct irq_chip { void (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg); void (*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg); + int (*irq_set_auxdata)(struct irq_data *data, unsigned int which, u64 auxval); + int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); @@ -555,6 +559,7 @@ struct irq_chip { * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode * IRQCHIP_SUPPORTS_LEVEL_MSI Chip can provide two doorbells for Level MSIs * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips + * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -566,6 +571,7 @@ enum { IRQCHIP_EOI_THREADED = (1 << 6), IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), + IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), }; #include diff --git a/include/linux/irqchip/irq-ims-msi.h b/include/linux/irqchip/irq-ims-msi.h new file mode 100644 index 0000000000000000000000000000000000000000..9ba767fbbb3d5050bd92c7d0713236b9d89cbc8e --- /dev/null +++ b/include/linux/irqchip/irq-ims-msi.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* (C) Copyright 2021 Thomas Gleixner */ + +#ifndef _LINUX_IRQCHIP_IRQ_IMS_MSI_H +#define _LINUX_IRQCHIP_IRQ_IMS_MSI_H + +#include +#include + +/** + * ims_hw_slot - The hardware layout of an IMS based MSI message + * @address_lo: Lower 32bit address + * @address_hi: Upper 32bit address + * @data: Message data + * @ctrl: Control word + * + * This structure is used by both the device memory array and the queue + * memory variants of IMS. + */ +struct ims_slot { + u32 address_lo; + u32 address_hi; + u32 data; + u32 ctrl; +} __packed; + +/* + * The IMS control word utilizes bit 0-2 for interrupt control. The remaining + * bits can contain auxiliary data. 
+ */ +#define IMS_CONTROL_WORD_IRQMASK GENMASK(2, 0) +#define IMS_CONTROL_WORD_AUXMASK GENMASK(31, 3) + +/* Auxiliary control word data related defines */ +enum { + IMS_AUXDATA_CONTROL_WORD, +}; + +/* Bit to mask the interrupt in ims_slot::ctrl */ +#define IMS_CTRL_VECTOR_MASKBIT BIT(0) +#define IMS_CTRL_PASID_ENABLE BIT(3) +#define IMS_CTRL_PASID_SHIFT 12 + +/* Set pasid and enable bit for the IMS entry */ +static inline u32 ims_ctrl_pasid_aux(unsigned int pasid, bool enable) +{ + u32 auxval = pasid << IMS_CTRL_PASID_SHIFT; + + return enable ? auxval | IMS_CTRL_PASID_ENABLE : auxval; +} + +/** + * struct ims_array_info - Information to create an IMS array domain + * @slots: Pointer to the start of the array + * @max_slots: Maximum number of slots in the array + */ +struct ims_array_info { + struct ims_slot __iomem *slots; + unsigned int max_slots; +}; + +struct pci_dev; +struct irq_domain; + +struct irq_domain *pci_ims_array_create_msi_irq_domain(struct pci_dev *pdev, + struct ims_array_info *ims_info); + +#endif diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 6fc9f0ac5f2b8ecc3a654d6c7d7cfa433c5aacff..20fb384aef46999388f15511a9ac7b998d86191f 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -84,6 +84,8 @@ enum irq_domain_bus_token { DOMAIN_BUS_IPI, DOMAIN_BUS_FSL_MC_MSI, DOMAIN_BUS_TI_SCI_INTA_MSI, + DOMAIN_BUS_VMD_MSI, + DOMAIN_BUS_DEVICE_MSI, }; /** diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d83d403dac2eaeaad0be68b1f9a164324aa34f75..09f759228e3f92c74971971ef9281ccf62d070fc 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -328,13 +328,6 @@ extern int oops_may_print(void); void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; -#ifdef CONFIG_ARCH_HAS_REFCOUNT -void refcount_error_report(struct pt_regs *regs, const char *err); -#else -static inline void refcount_error_report(struct pt_regs *regs, const char *err) -{ } -#endif - /* Internal, do not use. 
*/ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtol(const char *s, unsigned int base, long *res); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0ec4ec428b9ba049afa0aac3dfd6581cf1498e62..f8a34fdf45d23820f3bdcf8a80ba3d8b06fb534f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -35,8 +35,8 @@ #include -#ifndef KVM_MAX_VCPU_ID -#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#ifndef KVM_MAX_VCPU_IDS +#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS #endif /* @@ -278,7 +278,6 @@ struct kvm_vcpu { struct mutex mutex; struct kvm_run *run; - int guest_xcr0_loaded; struct swait_queue_head wq; struct pid __rcu *pid; int sigset_active; @@ -290,6 +289,10 @@ struct kvm_vcpu { #ifdef CONFIG_HAS_IOMEM int mmio_needed; int mmio_read_completed; + int mmio_nonposted_write_completed; +#define MMIO_WRITE 1 +#define MMIO_NONPOSTED_WRITE 3 +#define MMIO_NONPOSTED_DEFERRED 4 int mmio_is_write; int mmio_cur_fragment; int mmio_nr_fragments; @@ -861,6 +864,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); @@ -1245,6 +1249,10 @@ extern unsigned int halt_poll_ns_grow; extern unsigned int halt_poll_ns_grow_start; extern unsigned int halt_poll_ns_shrink; +extern u32 kvm_mmu_limit_nr; +extern u32 kvm_mmu_reclaim_try_times; +extern u32 kvm_mmu_reclaim_times; + struct kvm_device { struct kvm_device_ops *ops; struct kvm *kvm; diff --git a/include/linux/list.h b/include/linux/list.h index 85c92555e31f85f019354e54d6efb8e79c2aee17..ce19c6b632a5905533c819bf34bc2acd4a8c1f58 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -567,6 +567,15 @@ static inline void list_splice_tail_init(struct list_head *list, pos != (head); \ pos = n, n = pos->prev) +/** + * list_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_entry_is_head(pos, head, member) \ + (&pos->member == (head)) + /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. 
@@ -575,7 +584,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -586,7 +595,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -611,7 +620,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -625,7 +634,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -637,7 +646,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -650,7 +659,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -663,7 +672,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -679,7 +688,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -694,7 +703,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -710,7 +719,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b8a835fd611b286993eb9801598065ab442ef3f4..a3e607abb040dd1a562686dffb4f8cacbf474497 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -647,12 +647,31 @@ do { \ "Not in 
hardirq as expected\n"); \ } while (0) +#define lockdep_assert_preemption_enabled() \ +do { \ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ + debug_locks && \ + (preempt_count() != 0 || \ + !current->hardirqs_enabled)); \ +} while (0) + +#define lockdep_assert_preemption_disabled() \ +do { \ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ + debug_locks && \ + (preempt_count() == 0 && \ + current->hardirqs_enabled)); \ +} while (0) + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) + +# define lockdep_assert_preemption_enabled() do { } while (0) +# define lockdep_assert_preemption_disabled() do { } while (0) #endif #ifdef CONFIG_LOCKDEP diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 0ce30ca78db0466e02cf8d3373d84ff5ab13a5fd..773d7e95ce1709501cbf96961a1fcdd01c0ecb3c 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -10,7 +10,25 @@ #ifndef MDEV_H #define MDEV_H -struct mdev_device; +struct mdev_type; + +struct mdev_device { + struct device dev; + guid_t uuid; + void *driver_data; + struct list_head next; + struct mdev_type *type; + struct device *iommu_device; + void *iommu_fault_data; + void *iommu_domain; + bool active; + struct eventfd_ctx *req_trigger; +}; + +static inline struct mdev_device *to_mdev_device(struct device *dev) +{ + return container_of(dev, struct mdev_device, dev); +} /* * Called by the parent device driver to set the device which represents @@ -19,18 +37,65 @@ struct mdev_device; * * @dev: the mediated device that iommu will isolate. * @iommu_device: a pci device which represents the iommu for @dev. + */ +static inline void mdev_set_iommu_device(struct mdev_device *mdev, + struct device *iommu_device) +{ + mdev->iommu_device = iommu_device; +} + +static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev) +{ + return mdev->iommu_device; +} + +unsigned int mdev_get_type_group_id(struct mdev_device *mdev); +unsigned int mtype_get_type_group_id(struct mdev_type *mtype); +struct device *mtype_get_parent_dev(struct mdev_type *mtype); + +/* + * Called by the parent device driver to set the iommu fault data which + * is used for iommu fault reporting on the mdev. The vfio iommu modules + * could call mdev_get_iommu_fault_data() to retrieve fault data and add + * it to physical device's fault data list. + * + * @dev: the mediated device that iommu will report fault. + * @fault_data: the iommu fault data for @dev. * * Return 0 for success, otherwise negative error value. */ -int mdev_set_iommu_device(struct device *dev, struct device *iommu_device); +static inline void mdev_set_iommu_fault_data(struct mdev_device *mdev, void *fault_data) +{ + mdev->iommu_fault_data = fault_data; +} -struct device *mdev_get_iommu_device(struct device *dev); +static inline void *mdev_get_iommu_fault_data(struct mdev_device *mdev) +{ + return mdev->iommu_fault_data; +} + +/* + * Called by vfio iommu modules to save the iommu domain after a domain being + * attached to the mediated device. The vDCM (virtual device control module) + * could call mdev_get_iommu_domain() to retrieve an auxiliary domain attached + * to an mdev. 
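+ *
+ * A sketch of the intended calling pattern (the caller names are
+ * illustrative):
+ *
+ *	vfio iommu module:	mdev_set_iommu_domain(mdev, domain);
+ *	vDCM:			domain = mdev_get_iommu_domain(mdev);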
+ */ +static inline void mdev_set_iommu_domain(struct mdev_device *mdev, void *domain) +{ + mdev->iommu_domain = domain; +} + +static inline void *mdev_get_iommu_domain(struct mdev_device *mdev) +{ + return mdev->iommu_domain; +} /** * struct mdev_parent_ops - Structure to be registered for each parent device to * register the device to mdev module. * * @owner: The module owner. + * @device_driver: Which device driver to probe() on newly created devices * @dev_attr_groups: Attributes of the parent device. * @mdev_attr_groups: Attributes of the mediated device. * @supported_type_groups: Attributes to define supported types. It is mandatory @@ -38,7 +103,6 @@ struct device *mdev_get_iommu_device(struct device *dev); * @create: Called to allocate basic resources in parent device's * driver for a particular mediated device. It is * mandatory to provide create ops. - * @kobj: kobject of type for which 'create' is called. * @mdev: mdev_device structure of the mediated device * that is being created * Returns integer: success (0) or error (< 0) @@ -48,11 +112,6 @@ struct device *mdev_get_iommu_device(struct device *dev); * @mdev: mdev_device structure which is being * destroyed * Returns integer: success (0) or error (< 0) - * @open: Open mediated device. - * @mdev: mediated device. - * Returns integer: success (0) or error (< 0) - * @release: release mediated device - * @mdev: mediated device. * @read: Read emulation callback * @mdev: mediated device structure * @buf: read buffer @@ -72,19 +131,23 @@ struct device *mdev_get_iommu_device(struct device *dev); * @mmap: mmap callback * @mdev: mediated device structure * @vma: vma structure + * @request: request callback to release device + * @mdev: mediated device structure + * @count: request sequence number * Parent devices that support mediated devices should be registered with the * mdev module using the mdev_parent_ops structure; a minimal example follows. 
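 *
 * A minimal registration sketch (the my_* names are illustrative):
 *
 *	static const struct mdev_parent_ops my_ops = {
 *		.owner			= THIS_MODULE,
 *		.supported_type_groups	= my_type_groups,
 *		.create			= my_create,
 *		.remove			= my_remove,
 *	};
 *
 *	ret = mdev_register_device(dev, &my_ops);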
**/ struct mdev_parent_ops { struct module *owner; + struct mdev_driver *device_driver; const struct attribute_group **dev_attr_groups; const struct attribute_group **mdev_attr_groups; struct attribute_group **supported_type_groups; - int (*create)(struct kobject *kobj, struct mdev_device *mdev); + int (*create)(struct mdev_device *mdev); int (*remove)(struct mdev_device *mdev); - int (*open)(struct mdev_device *mdev); - void (*release)(struct mdev_device *mdev); + int (*open_device)(struct mdev_device *mdev); + void (*close_device)(struct mdev_device *mdev); ssize_t (*read)(struct mdev_device *mdev, char __user *buf, size_t count, loff_t *ppos); ssize_t (*write)(struct mdev_device *mdev, const char __user *buf, @@ -92,14 +155,17 @@ struct mdev_parent_ops { long (*ioctl)(struct mdev_device *mdev, unsigned int cmd, unsigned long arg); int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma); + void (*request)(struct mdev_device *mdev, unsigned int count); }; /* interface for exporting mdev supported type attributes */ struct mdev_type_attribute { struct attribute attr; - ssize_t (*show)(struct kobject *kobj, struct device *dev, char *buf); - ssize_t (*store)(struct kobject *kobj, struct device *dev, - const char *buf, size_t count); + ssize_t (*show)(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf); + ssize_t (*store)(struct mdev_type *mtype, + struct mdev_type_attribute *attr, const char *buf, + size_t count); }; #define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \ @@ -114,35 +180,46 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ /** * struct mdev_driver - Mediated device driver - * @name: driver name * @probe: called when new device created * @remove: called when device removed * @driver: device driver structure * **/ struct mdev_driver { - const char *name; - int (*probe)(struct device *dev); - void (*remove)(struct device *dev); + int (*probe)(struct mdev_device *dev); + void (*remove)(struct mdev_device *dev); struct device_driver driver; }; -#define to_mdev_driver(drv) container_of(drv, struct mdev_driver, driver) - -void *mdev_get_drvdata(struct mdev_device *mdev); -void mdev_set_drvdata(struct mdev_device *mdev, void *data); -const guid_t *mdev_uuid(struct mdev_device *mdev); +static inline void *mdev_get_drvdata(struct mdev_device *mdev) +{ + return mdev->driver_data; +} +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) +{ + mdev->driver_data = data; +} +static inline const guid_t *mdev_uuid(struct mdev_device *mdev) +{ + return &mdev->uuid; +} extern struct bus_type mdev_bus_type; int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops); void mdev_unregister_device(struct device *dev); -int mdev_register_driver(struct mdev_driver *drv, struct module *owner); +int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); struct device *mdev_parent_dev(struct mdev_device *mdev); -struct device *mdev_dev(struct mdev_device *mdev); -struct mdev_device *mdev_from_dev(struct device *dev); +static inline struct device *mdev_dev(struct mdev_device *mdev) +{ + return &mdev->dev; +} +static inline struct mdev_device *mdev_from_dev(struct device *dev) +{ + return dev->bus == &mdev_bus_type ? 
to_mdev_device(dev) : NULL; +} #endif /* MDEV_H */ diff --git a/include/linux/memregion.h b/include/linux/memregion.h new file mode 100644 index 0000000000000000000000000000000000000000..e11595256cac006569bc4f0edbe74718667a2a00 --- /dev/null +++ b/include/linux/memregion.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MEMREGION_H_ +#define _MEMREGION_H_ +#include <linux/types.h> +#include <linux/errno.h> + +struct memregion_info { + int target_node; +}; + +#ifdef CONFIG_MEMREGION +int memregion_alloc(gfp_t gfp); +void memregion_free(int id); +#else +static inline int memregion_alloc(gfp_t gfp) +{ + return -ENOMEM; +} +static inline void memregion_free(int id) +{ +} +#endif +#endif /* _MEMREGION_H_ */ diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index b06b75776a32f85fc5359f9683570e924c7b4b9e..bb2012cd8e6dac2bbac85514d5ae919df8ab330e 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -21,6 +21,7 @@ #define APOLLO_MOUSE_MINOR 7 /* unused */ #define PC110PAD_MINOR 9 /* unused */ /*#define ADB_MOUSE_MINOR 10 FIXME OBSOLETE */ +#define IOASID_MINOR 129 /* /dev/ioasid */ #define WATCHDOG_MINOR 130 /* Watchdog timer */ #define TEMP_MINOR 131 /* Temperature Sensor */ #define APM_MINOR_DEV 134 diff --git a/include/linux/mm.h b/include/linux/mm.h index 60d9d7c5a10bcb95855425ad49c62b699d02e88d..1cc662ad866bce2f725a8be4503525d25a1d02ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2489,17 +2489,45 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +static inline +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) { - struct vm_area_struct * vma = find_vma(mm,start_addr); + struct vm_area_struct *vma = find_vma(mm, start_addr); if (vma && end_addr <= vma->vm_start) vma = NULL; return vma; } +/** + * vma_lookup() - Find a VMA at a specific address + * @mm: The process address space. + * @addr: The user address. + * + * Return: The vm_area_struct at the given address, %NULL otherwise. 
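+ *
+ * Unlike find_vma(), which may return the next VMA above @addr, vma_lookup()
+ * only returns a VMA that actually contains @addr. For example, with a single
+ * VMA spanning 0x2000-0x3000, find_vma(mm, 0x1000) returns that VMA, while
+ * vma_lookup(mm, 0x1000) returns %NULL.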
+ */ +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && addr < vma->vm_start) + vma = NULL; + + return vma; +} + static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long vm_start = vma->vm_start; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 34f390f45bd2dbaa8d5146542ba93cefda4f4f65..604e40941d8d2e5f0d8497e3f9cdc59b60208fa2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,6 +23,7 @@ #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) +#define INIT_PASID 0 struct address_space; struct mem_cgroup; @@ -531,6 +532,10 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; + +#ifdef CONFIG_IOMMU_SVA + u32 pasid; +#endif } __randomize_layout; KABI_RESERVE(1); diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index d9a543a9e1ccec60ea7ab8ce283e0f2bd9503566..2494b640d5c74d2de1fb8ddc1343987597fee5a7 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -3,6 +3,7 @@ #define _LINUX_MMU_CONTEXT_H #include +#include struct mm_struct; @@ -14,4 +15,8 @@ void unuse_mm(struct mm_struct *mm); # define switch_mm_irqs_off switch_mm #endif +#ifndef leave_mm +static inline void leave_mm(int cpu) { } +#endif + #endif diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c56404e53a76065416f885175bf62c9b7250be3..f655de60b72b66f6de4dd80ae534ed96991bd939 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -16,6 +16,10 @@ typedef unsigned long kernel_ulong_t; #define PCI_ANY_ID (~0) +enum { + PCI_ID_F_VFIO_DRIVER_OVERRIDE = 1, +}; + /** * struct pci_device_id - PCI device ID structure * @vendor: Vendor ID to match (or PCI_ANY_ID) @@ -34,12 +38,14 @@ typedef unsigned long kernel_ulong_t; * Best practice is to use driver_data as an index * into a static list of equivalent device types, * instead of using it as a pointer. + * @override_only: Match only when dev->driver_override is this driver. 
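+ *
+ * For example, userspace typically arms such a match through the generic
+ * driver_override mechanism before (re)probing the device (the device
+ * address and driver name are illustrative):
+ *
+ *	# echo my-driver > /sys/bus/pci/devices/0000:00:03.0/driver_override
+ *	# echo 0000:00:03.0 > /sys/bus/pci/drivers_probe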
*/ struct pci_device_id { __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ __u32 class, class_mask; /* (class,subclass,prog-if) triplet */ kernel_ulong_t driver_data; /* Data private to the driver */ + __u32 override_only; }; diff --git a/include/linux/msi.h b/include/linux/msi.h index d695e2eb2092dfc915f3badceca848627f614496..88d47a9396e0f3bb58cfba24d563ce8cd7bdcec5 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -4,11 +4,50 @@ #include <linux/kobject.h> #include <linux/list.h> +#include <asm/msi.h> + +/* Dummy shadow structures if an architecture does not define them */ +#ifndef arch_msi_msg_addr_lo +typedef struct arch_msi_msg_addr_lo { + u32 address_lo; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#endif + +#ifndef arch_msi_msg_addr_hi +typedef struct arch_msi_msg_addr_hi { + u32 address_hi; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#endif + +#ifndef arch_msi_msg_data +typedef struct arch_msi_msg_data { + u32 data; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#endif +/** + * struct msi_msg - Representation of a MSI message + * @address_lo: Low 32 bits of msi message address + * @arch_addr_lo: Architecture specific shadow of @address_lo + * @address_hi: High 32 bits of msi message address + * (only used when device supports it) + * @arch_addr_hi: Architecture specific shadow of @address_hi + * @data: MSI message data (usually 16 bits) + * @arch_data: Architecture specific shadow of @data + */ struct msi_msg { - u32 address_lo; /* low 32 bits of msi message address */ - u32 address_hi; /* high 32 bits of msi message address */ - u32 data; /* 16 bits of msi message data */ + union { + u32 address_lo; + arch_msi_msg_addr_lo_t arch_addr_lo; + }; + union { + u32 address_hi; + arch_msi_msg_addr_hi_t arch_addr_hi; + }; + union { + u32 data; + arch_msi_msg_data_t arch_data; + }; }; extern int pci_msi_ignore_mask; @@ -55,6 +94,17 @@ struct ti_sci_inta_msi_desc { u16 dev_index; }; +/** + * struct device_msi_desc - Device MSI specific MSI descriptor data + * @priv_iomem: Pointer to device specific private io memory + * @hwirq: The hardware irq number in the device domain + */ +struct device_msi_desc { + void __iomem *priv_iomem; + u16 hwirq; +}; + /** * struct msi_desc - Descriptor structure for MSI based interrupts * @list: List head for management @@ -127,24 +177,52 @@ struct msi_desc { struct platform_msi_desc platform; struct fsl_mc_msi_desc fsl_mc; struct ti_sci_inta_msi_desc inta; + struct device_msi_desc device_msi; }; }; /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) #define dev_to_msi_list(dev) (&(dev)->msi_list) +#define dev_to_dev_msi_list(dev) (&(dev)->dev_msi_list) #define first_msi_entry(dev) \ list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list) -#define for_each_msi_entry(desc, dev) \ - list_for_each_entry((desc), dev_to_msi_list((dev)), list) +#define __for_each_msi_entry(desc, msi_list) \ + list_for_each_entry((desc), (msi_list), list) +#define for_each_msi_entry(desc, dev) \ + __for_each_msi_entry((desc), dev_to_msi_list((dev))) #define for_each_msi_entry_safe(desc, tmp, dev) \ list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list) -#define for_each_msi_vector(desc, __irq, dev) \ - for_each_msi_entry((desc), (dev)) \ - if ((desc)->irq) \ - for (__irq = (desc)->irq; \ - __irq < ((desc)->irq + (desc)->nvec_used); \ __irq++) +#define __for_each_msi_vector(desc, __irq, msi_list) \ + 
__for_each_msi_entry((desc), (msi_list)) \ + if ((desc)->irq) \ + for (__irq = (desc)->irq; \ + __irq < ((desc)->irq + (desc)->nvec_used); \ __irq++) +#define for_each_msi_vector(desc, __irq, dev) \ + __for_each_msi_vector(desc, __irq, dev_to_msi_list((dev))) + +/* Iterate through all the msi_descs starting from a given desc */ +#define __for_each_new_msi_entry(desc, msi_last_list, msi_list) \ + (desc) = list_entry((msi_last_list)->next, struct msi_desc, list); \ + list_for_each_entry_from((desc), (msi_list), list) +#define for_each_new_msi_entry(desc, dev) \ + __for_each_new_msi_entry((desc), (dev)->msi_last_list, dev_to_msi_list((dev))) +#define __for_each_new_msi_vector(desc, __irq, msi_last_list, msi_list) \ + __for_each_new_msi_entry((desc), (msi_last_list), (msi_list)) \ + if ((desc)->irq) \ + for ((__irq) = (desc)->irq; \ + (__irq) < ((desc)->irq + (desc)->nvec_used); \ + (__irq)++) +#define for_each_new_msi_vector(desc, __irq, dev) \ + __for_each_new_msi_vector((desc), (__irq), (dev)->msi_last_list, dev_to_msi_list((dev))) +#define for_each_new_msi_entry_safe(desc, tmp, dev) \ + (desc) = list_entry((dev)->msi_last_list->next, struct msi_desc, list); \ + list_for_each_entry_safe_from((desc), (tmp), dev_to_msi_list((dev)), list) +#define for_each_dev_msi_entry(desc, dev) \ + list_for_each_entry((desc), dev_to_dev_msi_list((dev)), list) +#define for_each_new_dev_msi_entry(desc, dev) \ + __for_each_new_msi_entry((desc), (dev)->dev_msi_last_list, dev_to_dev_msi_list((dev))) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) @@ -173,6 +251,10 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, #define first_pci_msi_entry(pdev) first_msi_entry(&(pdev)->dev) #define for_each_pci_msi_entry(desc, pdev) \ for_each_msi_entry((desc), &(pdev)->dev) +#define for_each_new_pci_msi_entry(desc, pdev) \ + for_each_new_msi_entry((desc), &(pdev)->dev) +#define for_each_new_pci_msi_entry_safe(desc, tmp, pdev) \ + for_each_new_msi_entry_safe((desc), (tmp), &(pdev)->dev) struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); void *msi_desc_to_pci_sysdata(struct msi_desc *desc); @@ -194,22 +276,47 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag); -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); +const struct attribute_group **msi_populate_sysfs(struct device *dev); +void msi_destroy_sysfs(struct device *dev, + const struct attribute_group **msi_irq_groups); + /* - * The arch hooks to setup up msi irqs. Those functions are - * implemented as weak symbols so that they /can/ be overriden by - * architecture specific code if needed. + * The arch hooks to set up MSI irqs. Default functions are implemented + * as weak symbols so that they /can/ be overridden by architecture specific + * code if needed. These hooks must be enabled by the architecture or by + * drivers which depend on them via msi_controller based MSI handling. + * + * If CONFIG_PCI_MSI_ARCH_FALLBACKS is not selected they are replaced by + * stubs with warnings. 
*/ +#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); -void arch_restore_msi_irqs(struct pci_dev *dev); - void default_teardown_msi_irqs(struct pci_dev *dev); +#else +static inline int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + WARN_ON_ONCE(1); + return -ENODEV; +} + +static inline void arch_teardown_msi_irqs(struct pci_dev *dev) +{ + WARN_ON_ONCE(1); +} +#endif + +/* + * The restore hooks are still available as they are useful even + * for fully irq domain based setups. Courtesy to XEN/X86. + */ void arch_restore_msi_irqs(struct pci_dev *dev); void default_restore_msi_irqs(struct pci_dev *dev); struct msi_controller { @@ -228,7 +335,6 @@ struct msi_controller { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN #include -#include struct irq_domain; struct irq_domain_ops; @@ -247,6 +353,14 @@ struct msi_domain_info; * @msi_finish: Optional callback to finalize the allocation * @set_desc: Set the msi descriptor for an interrupt * @handle_error: Optional error handler if the allocation fails + * @domain_alloc_irqs: Optional function to override the default allocation + * function. + * @domain_free_irqs: Optional function to override the default free + * function. + * @msi_alloc_store: Optional callback to allocate storage in a device + * specific non-standard MSI store + * @msi_free_store: Optional callback to free storage in a device + * specific non-standard MSI store * * @get_hwirq, @msi_init and @msi_free are callbacks used by * msi_create_irq_domain() and related interfaces @@ -254,6 +368,22 @@ struct msi_domain_info; * @msi_check, @msi_prepare, @msi_finish, @set_desc and @handle_error * are callbacks used by msi_domain_alloc_irqs() and related * interfaces which are based on msi_desc. + * + * @domain_alloc_irqs, @domain_free_irqs can be used to override the + * default allocation/free functions (__msi_domain_alloc/free_irqs). This + * is initially for a wrapper around XEN's separate MSI universe which can't + * be wrapped into the regular irq domains concepts by mere mortals. This + * allows msi_domain_alloc/free_irqs to be used universally without having to + * special-case XEN all over the place. + * + * Contrary to other operations @domain_alloc_irqs and @domain_free_irqs + * are set to the default implementation if NULL and even when + * MSI_FLAG_USE_DEF_DOM_OPS is not set to avoid breaking existing users and + * because these callbacks are obviously mandatory. + * + * This is NOT meant to be abused, but it can be useful to build wrappers + * for specialized MSI irq domains which need extra work before and after + * calling __msi_domain_alloc_irqs()/__msi_domain_free_irqs(). 
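+ *
+ * A wrapper might look like this sketch (my_prepare()/my_cleanup() are
+ * illustrative):
+ *
+ *	static int my_domain_alloc_irqs(struct irq_domain *domain,
+ *					struct device *dev, int nvec)
+ *	{
+ *		int ret;
+ *
+ *		my_prepare(dev);
+ *		ret = __msi_domain_alloc_irqs(domain, dev, nvec);
+ *		if (ret)
+ *			my_cleanup(dev);
+ *		return ret;
+ *	}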
*/ struct msi_domain_ops { irq_hw_number_t (*get_hwirq)(struct msi_domain_info *info, @@ -276,6 +406,20 @@ struct msi_domain_ops { struct msi_desc *desc); int (*handle_error)(struct irq_domain *domain, struct msi_desc *desc, int error); + int (*domain_alloc_irqs)(struct irq_domain *domain, + struct device *dev, int nvec); + void (*domain_free_irqs)(struct irq_domain *domain, + struct device *dev); + void (*domain_free_irq)(struct irq_domain *domain, + struct device *dev, + unsigned int irq); + int (*msi_alloc_store)(struct irq_domain *domain, + struct device *dev, int nvec); + void (*msi_free_store)(struct irq_domain *domain, + struct device *dev); + void (*msi_free_irq)(struct irq_domain *domain, + struct device *dev, unsigned int irq); + }; /** @@ -333,9 +477,15 @@ int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); +int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec); int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); +void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); +int device_msi_add_irq(struct irq_domain *domain, struct device *dev); +void msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq); +void __msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, @@ -344,6 +494,8 @@ struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec, irq_write_msi_msg_t write_msi_msg); void platform_msi_domain_free_irqs(struct device *dev); +int dev_msi_irq_vector(struct device *dev, unsigned int nr); +int dev_msi_hwirq(struct device *dev, unsigned int nr); /* When an MSI domain is used as an intermediate domain */ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -368,19 +520,32 @@ int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, void platform_msi_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nvec); void *platform_msi_get_host_data(struct irq_domain *domain); +void msi_domain_set_default_info_flags(struct msi_domain_info *info); #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ +#ifdef CONFIG_DEVICE_MSI +struct irq_domain *device_msi_create_irq_domain(struct fwnode_handle *fn, + struct msi_domain_info *info, + struct irq_domain *parent); + +# ifdef CONFIG_PCI +struct irq_domain *pci_subdevice_msi_create_irq_domain(struct pci_dev *pdev, + struct msi_domain_info *info); +# endif +#endif /* CONFIG_DEVICE_MSI */ + +bool arch_support_pci_device_msi(struct pci_dev *pdev); + #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg); struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -irq_hw_number_t pci_msi_domain_calc_hwirq(struct pci_dev *dev, - struct msi_desc *desc); int pci_msi_domain_check_cap(struct irq_domain *domain, struct msi_domain_info *info, struct device *dev); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); struct irq_domain 
*pci_msi_get_device_domain(struct pci_dev *pdev); +bool pci_dev_has_special_msi_domain(struct pci_dev *pdev); #else static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { @@ -388,4 +553,8 @@ static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) } #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ +#ifndef arch_msi_prepare +# define arch_msi_prepare NULL +#endif + #endif /* LINUX_MSI_H */ diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h index f3d40dd7bb665b83e3f123655a8232cc972566b5..fc7fd92927ddc1580fcb2a240c89d37acf6e8ce1 100644 --- a/include/linux/of_iommu.h +++ b/include/linux/of_iommu.h @@ -2,28 +2,17 @@ #ifndef __OF_IOMMU_H #define __OF_IOMMU_H -#include -#include -#include +struct device; +struct device_node; +struct iommu_ops; #ifdef CONFIG_OF_IOMMU -extern int of_get_dma_window(struct device_node *dn, const char *prefix, - int index, unsigned long *busno, dma_addr_t *addr, - size_t *size); - extern const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np); #else -static inline int of_get_dma_window(struct device_node *dn, const char *prefix, - int index, unsigned long *busno, dma_addr_t *addr, - size_t *size) -{ - return -EINVAL; -} - static inline const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np) { diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1214cabb22479be282ea6482ecec1657d8d59c1a..7142a37227584768d54bb44c032996fb444ddbc8 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -52,7 +52,8 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token); extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, - u32 rid); + u32 id, + u32 bus_token); extern void of_msi_configure(struct device *dev, struct device_node *np); u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else @@ -85,7 +86,7 @@ static inline struct irq_domain *of_msi_get_domain(struct device *dev, return NULL; } static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev, - u32 rid) + u32 id, u32 bus_token) { return NULL; } diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 62b7fdcc661c30e03d1650baaea199c3f095bf81..769cb9c9fa2886f14e6fb38b0d66b10086fa2182 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -107,20 +107,24 @@ static inline void acpiphp_check_host_bridge(struct acpi_device *adev) { } #endif extern const guid_t pci_acpi_dsm_guid; -#define IGNORE_PCI_BOOT_CONFIG_DSM 0x05 -#define DEVICE_LABEL_DSM 0x07 -#define RESET_DELAY_DSM 0x08 -#define FUNCTION_DELAY_DSM 0x09 + +/* _DSM Definitions for PCI */ +#define DSM_PCI_PRESERVE_BOOT_CONFIG 0x05 +#define DSM_PCI_DEVICE_NAME 0x07 +#define DSM_PCI_POWER_ON_RESET_DELAY 0x08 +#define DSM_PCI_DEVICE_READINESS_DURATIONS 0x09 + +#ifdef CONFIG_PCIE_EDR +void pci_acpi_add_edr_notifier(struct pci_dev *pdev); +void pci_acpi_remove_edr_notifier(struct pci_dev *pdev); +#else +static inline void pci_acpi_add_edr_notifier(struct pci_dev *pdev) { } +static inline void pci_acpi_remove_edr_notifier(struct pci_dev *pdev) { } +#endif /* CONFIG_PCIE_EDR */ #else /* CONFIG_ACPI */ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } static inline void acpi_pci_remove_bus(struct pci_bus *bus) { } #endif /* CONFIG_ACPI */ -#ifdef CONFIG_ACPI_APEI -extern bool aer_acpi_firmware_first(void); -#else -static inline bool 
aer_acpi_firmware_first(void) { return false; } -#endif - #endif /* _PCI_ACPI_H_ */ diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 1ebb88e7c184c9ce4246b043dfaf08cd00ba9357..24e2a5c3741cca5883966cdbe4f2fee42fd2b1cd 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -10,9 +10,11 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs); void pci_disable_pri(struct pci_dev *pdev); void pci_restore_pri_state(struct pci_dev *pdev); int pci_reset_pri(struct pci_dev *pdev); - +int pci_prg_resp_pasid_required(struct pci_dev *pdev); +bool pci_pri_supported(struct pci_dev *pdev); #else /* CONFIG_PCI_PRI */ - +static inline bool pci_pri_supported(struct pci_dev *pdev) +{ return false; } static inline int pci_enable_pri(struct pci_dev *pdev, u32 reqs) { return -ENODEV; @@ -31,6 +33,10 @@ static inline int pci_reset_pri(struct pci_dev *pdev) return -ENODEV; } +static inline int pci_prg_resp_pasid_required(struct pci_dev *pdev) +{ + return 0; +} #endif /* CONFIG_PCI_PRI */ #ifdef CONFIG_PCI_PASID @@ -40,7 +46,6 @@ void pci_disable_pasid(struct pci_dev *pdev); void pci_restore_pasid_state(struct pci_dev *pdev); int pci_pasid_features(struct pci_dev *pdev); int pci_max_pasids(struct pci_dev *pdev); -int pci_prg_resp_pasid_required(struct pci_dev *pdev); #else /* CONFIG_PCI_PASID */ @@ -66,11 +71,6 @@ static inline int pci_max_pasids(struct pci_dev *pdev) { return -EINVAL; } - -static inline int pci_prg_resp_pasid_required(struct pci_dev *pdev) -{ - return 0; -} #endif /* CONFIG_PCI_PASID */ diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 0c12d69dde9299cd597d013336a21909af8a4ae4..c8e39607dbb734d34a94cdf4b743b9e22180f00e 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -120,7 +120,7 @@ struct pci_epc_features { unsigned int msix_capable : 1; u8 reserved_bar; u8 bar_fixed_64bit; - u64 bar_fixed_size[BAR_5 + 1]; + u64 bar_fixed_size[PCI_STD_NUM_BARS]; size_t align; }; diff --git a/include/linux/pci.h b/include/linux/pci.h index 2665076fc842fffe0d7ca745df6b0ed4276660e9..815d5f9c7957cf47b390b630905bc99bb7e2395f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -83,7 +83,7 @@ enum pci_mmap_state { enum { /* #0-5: standard PCI resources */ PCI_STD_RESOURCES, - PCI_STD_RESOURCE_END = 5, + PCI_STD_RESOURCE_END = PCI_STD_RESOURCES + PCI_STD_NUM_BARS - 1, /* #6: expansion ROM resource */ PCI_ROM_RESOURCE, @@ -287,6 +287,7 @@ struct pci_vpd; struct pci_sriov; struct pci_ats; struct pci_p2pdma; +struct rcec_ea; /* The pci_dev structure describes PCI devices */ struct pci_dev { @@ -309,6 +310,10 @@ struct pci_dev { #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ struct aer_stats *aer_stats; /* AER stats for this device */ +#endif +#ifdef CONFIG_PCIEPORTBUS + struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ + struct pci_dev *rcec; /* Associated RCEC device */ #endif u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ @@ -415,8 +420,12 @@ struct pci_dev { * mappings to make sure they cannot access arbitrary memory. */ unsigned int untrusted:1; - unsigned int __aer_firmware_first_valid:1; - unsigned int __aer_firmware_first:1; + /* + * Info from the platform, e.g., ACPI or device tree, may mark a + * device as "external-facing". An external-facing device is + * itself internal but devices downstream from it are external. 
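+ *
+ * A typical example is a Thunderbolt root port: the port itself is
+ * internal and trusted, while devices hot-plugged below it are external
+ * and untrusted.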
+ */ + unsigned int external_facing:1; unsigned int broken_intx_masking:1; /* INTx masking can't be used */ unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ unsigned int irq_managed:1; @@ -445,8 +454,17 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_MSI const struct attribute_group **msi_irq_groups; + int msix_alloc_count; /* No. of MSI-X vectors allocated to device */ + void __iomem *msix_table_base; /* Base address of device MSI-X table */ + struct mutex msix_mutex; /* Serialize MSI-X interrupt allocation */ + unsigned long *msix_map; /* Bitmap to track allocated MSI-X vectors */ #endif struct pci_vpd *vpd; +#ifdef CONFIG_PCIE_DPC + u16 dpc_cap; + unsigned int dpc_rp_extensions:1; + u8 dpc_rp_log_size; +#endif #ifdef CONFIG_PCI_ATS union { struct pci_sriov *sriov; /* PF: SR-IOV info */ @@ -454,12 +472,14 @@ struct pci_dev { }; u16 ats_cap; /* ATS Capability offset */ u8 ats_stu; /* ATS Smallest Translation Unit */ - atomic_t ats_ref_cnt; /* Number of VFs with ATS enabled */ #endif #ifdef CONFIG_PCI_PRI + u16 pri_cap; /* PRI Capability offset */ u32 pri_reqs_alloc; /* Number of PRI requests allocated */ + unsigned int pasid_required:1; /* PRG Response PASID Required */ #endif #ifdef CONFIG_PCI_PASID + u16 pasid_cap; /* PASID Capability offset */ u16 pasid_features; #endif #ifdef CONFIG_PCI_P2PDMA @@ -516,6 +536,7 @@ struct pci_host_bridge { unsigned int native_shpc_hotplug:1; /* OS may use SHPC hotplug */ unsigned int native_pme:1; /* OS may use PCIe PME */ unsigned int native_ltr:1; /* OS may use PCIe LTR */ + unsigned int native_dpc:1; /* OS may use PCIe DPC */ unsigned int preserve_config:1; /* Preserve FW resource setup */ /* Resource alignment requirements */ @@ -871,6 +892,35 @@ struct pci_driver { .vendor = (vend), .device = (dev), \ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID +/** + * PCI_DEVICE_DRIVER_OVERRIDE - macro used to describe a PCI device with + * override_only flags. + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * @driver_override: the 32 bit PCI Device override_only + * + * This macro is used to create a struct pci_device_id that matches only a + * driver_override device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID. + */ +#define PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \ + .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ + .subdevice = PCI_ANY_ID, .override_only = (driver_override) + +/** + * PCI_DRIVER_OVERRIDE_DEVICE_VFIO - macro used to describe a VFIO + * "driver_override" PCI device. + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * + * This macro is used to create a struct pci_device_id that matches a + * specific device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID and the driver_override will be set to + * PCI_ID_F_VFIO_DRIVER_OVERRIDE. 
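+ *
+ * A match table entry might look like (the vendor/device IDs are
+ * illustrative):
+ *
+ *	static const struct pci_device_id my_vfio_ids[] = {
+ *		{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(0x8086, 0x1234) },
+ *		{ 0, }
+ *	};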
+ */ +#define PCI_DRIVER_OVERRIDE_DEVICE_VFIO(vend, dev) \ + PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, PCI_ID_F_VFIO_DRIVER_OVERRIDE) + /** * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem * @vend: the 16 bit PCI Vendor ID @@ -1221,6 +1271,7 @@ int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size); int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); +struct pci_dev *pci_real_dma_dev(struct pci_dev *dev); int __printf(6, 7) pci_request_irq(struct pci_dev *dev, unsigned int nr, irq_handler_t handler, irq_handler_t thread_fn, void *dev_id, @@ -1465,8 +1516,10 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev, int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags, struct irq_affinity *affd); +int pci_add_msix_irq_vector(struct pci_dev *dev); void pci_free_irq_vectors(struct pci_dev *dev); +void pci_free_msix_irq_vector(struct pci_dev *dev, unsigned int irq); int pci_irq_vector(struct pci_dev *dev, unsigned int nr); const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec); int pci_irq_get_node(struct pci_dev *pdev, int vec); @@ -1497,10 +1550,17 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, return -ENOSPC; } +static inline int pci_add_msix_irq_vector(struct pci_dev *dev) +{ return -ENOSYS; } + static inline void pci_free_irq_vectors(struct pci_dev *dev) { } +static inline void pci_free_msix_irq_vector(struct pci_dev *dev, unsigned int irq) +{ +} + static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) { if (WARN_ON_ONCE(nr > 0)) @@ -1591,6 +1651,9 @@ void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); +int pci_dev_trylock(struct pci_dev *dev); +void pci_dev_unlock(struct pci_dev *dev); + /* * PCI domain support. 
Sometimes called PCI segment (eg by ACPI), * a PCI domain is defined to be a set of PCI buses which share @@ -1793,11 +1856,14 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, #ifdef CONFIG_PCI_ATS /* Address Translation Service */ +bool pci_ats_supported(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else +static inline bool pci_ats_supported(struct pci_dev *d) +{ return false; } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 0ad57693f3926e8804759017f63f52e33e937fc1..e49aa4f5b6205720db7a67436d4da47d495ce1e4 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -81,6 +81,7 @@ #define PCI_CLASS_SYSTEM_RTC 0x0803 #define PCI_CLASS_SYSTEM_PCI_HOTPLUG 0x0804 #define PCI_CLASS_SYSTEM_SDHCI 0x0805 +#define PCI_CLASS_SYSTEM_RCEC 0x0807 #define PCI_CLASS_SYSTEM_OTHER 0x0880 #define PCI_BASE_CLASS_INPUT 0x09 @@ -2659,6 +2660,8 @@ #define PCI_DEVICE_ID_INTEL_80332_1 0x0332 #define PCI_DEVICE_ID_INTEL_80333_0 0x0370 #define PCI_DEVICE_ID_INTEL_80333_1 0x0372 +#define PCI_DEVICE_ID_INTEL_QAT_DH895XCC 0x0435 +#define PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF 0x0443 #define PCI_DEVICE_ID_INTEL_82375 0x0482 #define PCI_DEVICE_ID_INTEL_82424 0x0483 #define PCI_DEVICE_ID_INTEL_82378 0x0484 @@ -2708,6 +2711,8 @@ #define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_4C_NHI 0x1577 #define PCI_DEVICE_ID_INTEL_ALPINE_RIDGE_4C_BRIDGE 0x1578 #define PCI_DEVICE_ID_INTEL_80960_RP 0x1960 +#define PCI_DEVICE_ID_INTEL_QAT_C3XXX 0x19e2 +#define PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF 0x19e3 #define PCI_DEVICE_ID_INTEL_82840_HB 0x1a21 #define PCI_DEVICE_ID_INTEL_82845_HB 0x1a30 #define PCI_DEVICE_ID_INTEL_IOAT 0x1a38 @@ -2924,6 +2929,8 @@ #define PCI_DEVICE_ID_INTEL_IOAT_JSF7 0x3717 #define PCI_DEVICE_ID_INTEL_IOAT_JSF8 0x3718 #define PCI_DEVICE_ID_INTEL_IOAT_JSF9 0x3719 +#define PCI_DEVICE_ID_INTEL_QAT_C62X 0x37c8 +#define PCI_DEVICE_ID_INTEL_QAT_C62X_VF 0x37c9 #define PCI_DEVICE_ID_INTEL_ICH10_0 0x3a14 #define PCI_DEVICE_ID_INTEL_ICH10_1 0x3a16 #define PCI_DEVICE_ID_INTEL_ICH10_2 0x3a18 diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5862ff3f031a66280a51437db887f95d5626c6df..892e178ab089b7d399c584ea0817a2b39afc54e7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -92,14 +92,26 @@ struct perf_raw_record { /* * branch stack layout: * nr: number of taken branches stored in entries[] + * hw_idx: The low level index of raw branch records + * for the most recent branch. + * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. + * The entries[] is an abstraction of raw branch records, + * which may not be stored in age order in HW, e.g. Intel LBR. + * The hw_idx is to expose the low level index of raw + * branch record for the most recent branch aka entries[0]. + * The hw_idx index is between -1 (unknown) and max depth, + * which can be retrieved in /sys/devices/cpu/caps/branches. + * For the architectures whose raw branch records are + * already stored in age order, the hw_idx should be 0. 
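+ *
+ * For example, hw_idx = 5 means entries[0] was built from raw branch
+ * record 5 in the hardware buffer, which lets consumers that care about
+ * the hardware order line entries[] back up with the raw records.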
*/ struct perf_branch_stack { __u64 nr; + __u64 hw_idx; struct perf_branch_entry entries[0]; }; @@ -413,11 +425,21 @@ struct pmu { */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); + /* - * PMU specific data size + * Kmem cache of PMU specific data */ - size_t task_ctx_size; + struct kmem_cache *task_ctx_cache; + /* + * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) + * can be synchronized using this function. See Intel LBR callstack support + * implementation and Perf core context switch handling callbacks for usage + * examples. + */ + void (*swap_task_ctx) (struct perf_event_context *prev, + struct perf_event_context *next); + /* optional */ /* * Set up pmu-private data structures for an AUX area @@ -951,7 +973,7 @@ struct perf_sample_data { struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 period; - u64 weight; + union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1002,7 +1024,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->raw = NULL; data->br_stack = NULL; data->period = period; - data->weight = 0; + data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 2955ba97604892ab14a7e6b0cb532136aef85943..6beb26b7151d21c18c7c3dd52c73290c0d8d48c5 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -44,10 +44,6 @@ static inline bool arch_pkeys_enabled(void) return false; } -static inline void copy_init_pkru_to_fpregs(void) -{ -} - #endif /* ! CONFIG_ARCH_HAS_PKEYS */ #endif /* _LINUX_PKEYS_H */ diff --git a/include/linux/prmt.h b/include/linux/prmt.h new file mode 100644 index 0000000000000000000000000000000000000000..24da8364b91977fcd8c9c3372d8cd26a3cb6d578 --- /dev/null +++ b/include/linux/prmt.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifdef CONFIG_ACPI_PRMT +void init_prmt(void); +#else +static inline void init_prmt(void) { } +#endif diff --git a/include/linux/property.h b/include/linux/property.h index 9b3d4ca3a73a9eca0266e16c64c933034d7fb3a3..0546611096613af4259ef578e71fbc4ee2154a0c 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -80,9 +80,14 @@ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *name, unsigned int index); +const char *fwnode_get_name(const struct fwnode_handle *fwnode); +const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_parent( struct fwnode_handle *fwnode); +unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); +struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, + unsigned int depth); struct fwnode_handle *fwnode_get_next_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_available_child_node( @@ -418,7 +423,8 @@ struct software_node { }; bool is_software_node(const struct fwnode_handle *fwnode); -const struct software_node *to_software_node(struct fwnode_handle *fwnode); +const struct software_node * +to_software_node(const struct fwnode_handle *fwnode); struct fwnode_handle *software_node_fwnode(const struct software_node *node); const struct software_node * diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 1fd61a9af45cd6f8b4578c20eaf2e27bc6b8ba47..2f0a976b2eeaf479aed5078700ce1c96c7411a05 100644 --- 
a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -158,4 +158,198 @@ static inline void rb_replace_node_cached(struct rb_node *victim, rb_replace_node(victim, new, &root->rb_root); } +/* + * The below helper functions use 2 operators with 3 different + * calling conventions. The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * rb_find(). + * + * The reason for this is to allow the find() interface without requiring an + * on-stack dummy object, which might not be feasible due to object size. + */ + +/** + * rb_add_cached() - insert @node into the leftmost cached tree @tree + * @node: node to insert + * @tree: leftmost cached tree to insert @node into + * @less: operator defining the (partial) node order + * + * Returns @node when it is the new leftmost, or NULL. + */ +static __always_inline struct rb_node * +rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); + + return leftmost ? node : NULL; +} + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (less(node, parent)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +/** + * rb_find_add() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + +/** + * rb_find() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Returns the rb_node matching @key or NULL. 
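+ *
+ * A usage sketch (struct item, item_cmp() and the needle key are
+ * illustrative, not part of this API):
+ *
+ *	struct item {
+ *		int key;
+ *		struct rb_node node;
+ *	};
+ *
+ *	static int item_cmp(const void *key, const struct rb_node *node)
+ *	{
+ *		return *(const int *)key - rb_entry(node, struct item, node)->key;
+ *	}
+ *
+ *	int needle = 42;
+ *	struct rb_node *found = rb_find(&needle, &my_tree, item_cmp);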
+ */ +static __always_inline struct rb_node * +rb_find(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = node->rb_left; + else if (c > 0) + node = node->rb_right; + else + return node; + } + + return NULL; +} + +/** + * rb_find_first() - find the first @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the leftmost node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_find_first(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +/** + * rb_next_match() - find the next @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the next node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_next_match(const void *key, struct rb_node *node, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +/** + * rb_for_each() - iterates a subtree matching @key + * @node: iterator + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + */ +#define rb_for_each(node, key, tree, cmp) \ + for ((node) = rb_find_first((key), (tree), (cmp)); \ + (node); (node) = rb_next_match((key), (node), (cmp))) + #endif /* _LINUX_RBTREE_H */ diff --git a/include/linux/refcount.h b/include/linux/refcount.h index e28cce21bad6cc1692b9624ccab74b93e448fd3e..0ac50cf62d062663b4d43df9eeafafb2f0256e7d 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -1,9 +1,88 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Variant of atomic_t specialized for reference counts. + * + * The interface matches the atomic_t interface (to aid in porting) but only + * provides the few functions one should use for reference counting. + * + * Saturation semantics + * ==================== + * + * refcount_t differs from atomic_t in that the counter saturates at + * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the + * counter and causing 'spurious' use-after-free issues. In order to avoid the + * cost associated with introducing cmpxchg() loops into all of the saturating + * operations, we temporarily allow the counter to take on an unchecked value + * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow + * or overflow has occurred. Although this is racy when multiple threads + * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly + * equidistant from 0 and INT_MAX we minimise the scope for error: + * + * INT_MAX REFCOUNT_SATURATED UINT_MAX + * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) + * +--------------------------------+----------------+----------------+ + * <---------- bad value! ----------> + * + * (in a signed view of the world, the "bad value" range corresponds to + * a negative counter value). 
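For the partial-order case the helper comment warns about (several nodes comparing equal), rb_find_first() and rb_for_each() walk every match. A short sketch reusing the hypothetical example_node/example_cmp from the previous example:

/*
 * Illustrative only: count how many nodes in @root share @key when
 * duplicates are allowed in the tree.
 */
static unsigned int example_count_matches(struct rb_root *root,
					  unsigned long key)
{
	struct rb_node *node;
	unsigned int n = 0;

	rb_for_each(node, &key, root, example_cmp)
		n++;

	return n;
}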
+ * + * As an example, consider a refcount_inc() operation that causes the counter + * to overflow: + * + * int old = atomic_fetch_add_relaxed(r); + * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) + * if (old < 0) + * atomic_set(r, REFCOUNT_SATURATED); + * + * If another thread also performs a refcount_inc() operation between the two + * atomic operations, then the count will continue to edge closer to 0. If it + * reaches a value of 1 before /any/ of the threads reset it to the saturated + * value, then a concurrent refcount_dec_and_test() may erroneously free the + * underlying object. Given the precise timing details involved with the + * round-robin scheduling of each thread manipulating the refcount and the need + * to hit the race multiple times in succession, there doesn't appear to be a + * practical avenue of attack even if using refcount_add() operations with + * larger increments. + * + * Memory ordering + * =============== + * + * Memory ordering rules are slightly relaxed wrt regular atomic_t functions + * and provide only what is strictly required for refcounts. + * + * The increments are fully relaxed; these will not provide ordering. The + * rationale is that whatever is used to obtain the object we're increasing the + * reference count on will provide the ordering. For locked data structures, + * its the lock acquire, for RCU/lockless data structures its the dependent + * load. + * + * Do note that inc_not_zero() provides a control dependency which will order + * future stores against the inc, this ensures we'll never modify the object + * if we did not in fact acquire a reference. + * + * The decrements will provide release order, such that all the prior loads and + * stores will be issued before, it also provides a control dependency, which + * will order us against the subsequent free(). + * + * The control dependency is against the load of the cmpxchg (ll/sc) that + * succeeded. This means the stores aren't fully ordered, but this is fine + * because the 1->0 transition indicates no concurrency. + * + * Note that the allocator is responsible for ordering things between free() + * and alloc(). + * + * The decrements dec_and_test() and sub_and_test() also provide acquire + * ordering on success. + * + */ + #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include +#include #include +#include #include struct mutex; @@ -12,7 +91,7 @@ struct mutex; * struct refcount_t - variant of atomic_t specialized for reference counts * @refs: atomic_t counter field * - * The counter saturates at UINT_MAX and will not move once + * The counter saturates at REFCOUNT_SATURATED and will not move once * there. This avoids wrapping the counter and causing 'spurious' * use-after-free bugs. 
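The ordering rules described above are easiest to see in the usual lookup/get/put pattern. A minimal sketch, assuming an invented example_obj that is looked up under RCU; only refcount_inc_not_zero(), refcount_dec_and_test() and kfree_rcu() are real interfaces:

/*
 * Illustrative only.  The relaxed increment relies on the RCU read-side
 * critical section keeping the object's memory stable; the release ordering
 * in refcount_dec_and_test() ensures all prior stores are visible before the
 * object is freed, and the acquire on the 1->0 transition orders the free.
 */
struct example_obj {
	refcount_t ref;
	struct rcu_head rcu;
};

static struct example_obj *example_get(struct example_obj *obj)
{
	/* Fails (returns false) once a concurrent put has dropped ref to 0. */
	if (obj && !refcount_inc_not_zero(&obj->ref))
		obj = NULL;
	return obj;
}

static void example_put(struct example_obj *obj)
{
	/* Only the thread that observes the 1->0 transition frees the object. */
	if (obj && refcount_dec_and_test(&obj->ref))
		kfree_rcu(obj, rcu);
}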
*/ @@ -21,13 +100,25 @@ typedef struct refcount_struct { } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +#define REFCOUNT_MAX INT_MAX +#define REFCOUNT_SATURATED (INT_MIN / 2) + +enum refcount_saturation_type { + REFCOUNT_ADD_NOT_ZERO_OVF, + REFCOUNT_ADD_OVF, + REFCOUNT_ADD_UAF, + REFCOUNT_SUB_UAF, + REFCOUNT_DEC_LEAK, +}; + +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ -static inline void refcount_set(refcount_t *r, unsigned int n) +static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } @@ -43,70 +134,168 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -extern __must_check bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r); -extern void refcount_add_checked(unsigned int i, refcount_t *r); - -extern __must_check bool refcount_inc_not_zero_checked(refcount_t *r); -extern void refcount_inc_checked(refcount_t *r); - -extern __must_check bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r); - -extern __must_check bool refcount_dec_and_test_checked(refcount_t *r); -extern void refcount_dec_checked(refcount_t *r); - -#ifdef CONFIG_REFCOUNT_FULL - -#define refcount_add_not_zero refcount_add_not_zero_checked -#define refcount_add refcount_add_checked - -#define refcount_inc_not_zero refcount_inc_not_zero_checked -#define refcount_inc refcount_inc_checked +/** + * refcount_add_not_zero - add a value to a refcount unless it is 0 + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +{ + int old = refcount_read(r); -#define refcount_sub_and_test refcount_sub_and_test_checked + do { + if (!old) + break; + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); -#define refcount_dec_and_test refcount_dec_and_test_checked -#define refcount_dec refcount_dec_checked + if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); -#else -# ifdef CONFIG_ARCH_HAS_REFCOUNT -# include -# else -static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r) -{ - return atomic_add_unless(&r->refs, i, 0); + return old; } -static inline void refcount_add(unsigned int i, refcount_t *r) +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. 
+ * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + */ +static inline void refcount_add(int i, refcount_t *r) { - atomic_add(i, &r->refs); + int old = atomic_fetch_add_relaxed(i, &r->refs); + + if (unlikely(!old)) + refcount_warn_saturate(r, REFCOUNT_ADD_UAF); + else if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } +/** + * refcount_inc_not_zero - increment a refcount unless it is 0 + * @r: the refcount to increment + * + * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED + * and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { - return atomic_add_unless(&r->refs, 1, 0); + return refcount_add_not_zero(1, r); } +/** + * refcount_inc - increment a refcount + * @r: the refcount to increment + * + * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller already has a + * reference on the object. + * + * Will WARN if the refcount is 0, as this represents a possible use-after-free + * condition. + */ static inline void refcount_inc(refcount_t *r) { - atomic_inc(&r->refs); + refcount_add(1, r); } -static inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r) +/** + * refcount_sub_and_test - subtract from a refcount and test if it is 0 + * @i: amount to subtract from the refcount + * @r: the refcount + * + * Similar to atomic_dec_and_test(), but it will WARN, return false and + * ultimately leak on underflow and will fail to decrement when saturated + * at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_dec(), or one of its variants, should instead be used to + * decrement a reference count. + * + * Return: true if the resulting refcount is 0, false otherwise + */ +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { - return atomic_sub_and_test(i, &r->refs); + int old = atomic_fetch_sub_release(i, &r->refs); + + if (old == i) { + smp_acquire__after_ctrl_dep(); + return true; + } + + if (unlikely(old < 0 || old - i < 0)) + refcount_warn_saturate(r, REFCOUNT_SUB_UAF); + + return false; } +/** + * refcount_dec_and_test - decrement a refcount and test if it is 0 + * @r: the refcount + * + * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to + * decrement when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. 
+ * + * Return: true if the resulting refcount is 0, false otherwise + */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { - return atomic_dec_and_test(&r->refs); + return refcount_sub_and_test(1, r); } +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * + * Similar to atomic_dec(), it will WARN on underflow and fail to decrement + * when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before. + */ static inline void refcount_dec(refcount_t *r) { - atomic_dec(&r->refs); + if (unlikely(atomic_fetch_sub_release(1, &r->refs) <= 1)) + refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } -# endif /* !CONFIG_ARCH_HAS_REFCOUNT */ -#endif /* CONFIG_REFCOUNT_FULL */ extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); diff --git a/include/linux/regset.h b/include/linux/regset.h index bf0243779738461f20ba0a80789ffc6b5ad6be83..8f1cde3faaba0a11c921970766dafb8596258cc4 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -17,6 +17,52 @@ struct task_struct; struct user_regset; +struct membuf { + void *p; + size_t left; +}; + +static inline int membuf_zero(struct membuf *s, size_t size) +{ + if (s->left) { + if (size > s->left) + size = s->left; + memset(s->p, 0, size); + s->p += size; + s->left -= size; + } + return s->left; +} + +static inline int membuf_write(struct membuf *s, const void *v, size_t size) +{ + if (s->left) { + if (size > s->left) + size = s->left; + memcpy(s->p, v, size); + s->p += size; + s->left -= size; + } + return s->left; +} + +/* current s->p must be aligned for v; v must be a scalar */ +#define membuf_store(s, v) \ +({ \ + struct membuf *__s = (s); \ + if (__s->left) { \ + typeof(v) __v = (v); \ + size_t __size = sizeof(__v); \ + if (unlikely(__size > __s->left)) { \ + __size = __s->left; \ + memcpy(__s->p, &__v, __size); \ + } else { \ + *(typeof(__v + 0) *)__s->p = __v; \ + } \ + __s->p += __size; \ + __s->left -= __size; \ + } \ + __s->left;}) /** * user_regset_active_fn - type of @active function in &struct user_regset @@ -57,6 +103,10 @@ typedef int user_regset_get_fn(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf); +typedef int user_regset_get2_fn(struct task_struct *target, + const struct user_regset *regset, + struct membuf to); + /** * user_regset_set_fn - type of @set function in &struct user_regset * @target: thread being examined @@ -186,6 +236,7 @@ typedef unsigned int user_regset_get_size_fn(struct task_struct *target, */ struct user_regset { user_regset_get_fn *get; + user_regset_get2_fn *regset_get; user_regset_set_fn *set; user_regset_active_fn *active; user_regset_writeback_fn *writeback; @@ -353,31 +404,19 @@ static inline int user_regset_copyin_ignore(unsigned int *pos, return 0; } -/** - * copy_regset_to_user - fetch a thread's user_regset data into user memory - * @target: thread to be examined - * @view: &struct user_regset_view describing user thread machine state - * @setno: index in @view->regsets - * @offset: offset into the regset data, in bytes - * @size: amount of data to copy, in bytes - * @data: user-mode pointer to copy into - */ -static inline int copy_regset_to_user(struct task_struct *target, - const struct user_regset_view *view, - unsigned int setno, - unsigned int offset, unsigned int size, - void __user *data) -{ - const struct user_regset *regset = &view->regsets[setno]; - - if (!regset->get) - 
return -EOPNOTSUPP; +extern int regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, void *data); - if (!access_ok(data, size)) - return -EFAULT; +extern int regset_get_alloc(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data); - return regset->get(target, regset, offset, size, NULL, data); -} +extern int copy_regset_to_user(struct task_struct *target, + const struct user_regset_view *view, + unsigned int setno, unsigned int offset, + unsigned int size, void __user *data); /** * copy_regset_from_user - store into thread's user_regset data from user memory diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h new file mode 100644 index 0000000000000000000000000000000000000000..21deb5212bbdd82e58dac7b4764762c66fffba9b --- /dev/null +++ b/include/linux/resctrl.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RESCTRL_H +#define _RESCTRL_H + +#include +#include +#include + +#ifdef CONFIG_PROC_CPU_RESCTRL + +int proc_resctrl_show(struct seq_file *m, + struct pid_namespace *ns, + struct pid *pid, + struct task_struct *tsk); + +#endif + +/** + * enum resctrl_conf_type - The type of configuration. + * @CDP_NONE: No prioritisation, both code and data are controlled or monitored. + * @CDP_CODE: Configuration applies to instruction fetches. + * @CDP_DATA: Configuration applies to reads and writes. + */ +enum resctrl_conf_type { + CDP_NONE, + CDP_CODE, + CDP_DATA, +}; + +#define CDP_NUM_TYPES (CDP_DATA + 1) + +/** + * struct resctrl_staged_config - parsed configuration to be applied + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: whether the user provided new_ctrl is valid + */ +struct resctrl_staged_config { + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct rdt_domain - group of CPUs sharing a resctrl resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which CPUs share this resource + * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: worker CPU for MBM h/w counters + * @cqm_work_cpu: worker CPU for CQM h/w counters + * @plr: pseudo-locked region (if any) associated with domain + * @staged_config: parsed configuration to be applied + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + struct pseudo_lock_region *plr; + struct resctrl_staged_config staged_config[CDP_NUM_TYPES]; +}; + +/** + * struct resctrl_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. + * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. + * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache + * level has CPU scope. 
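The membuf helpers and the new ->regset_get() hook replace the old kbuf/ubuf/pos/count plumbing. A hedged sketch of what an architecture-side getter could look like; the register layout and names are invented, and the return convention assumed here is the membuf helpers' "space left in the buffer" value:

/*
 * Illustrative only: copy a fixed-layout register block into the membuf and
 * zero-fill whatever space the caller asked for beyond it.  membuf_write()
 * and membuf_zero() both return the space still left in the buffer.
 */
struct example_user_regs {
	unsigned long gpr[16];
	unsigned long pc;
};

static int example_regset_get(struct task_struct *target,
			      const struct user_regset *regset,
			      struct membuf to)
{
	struct example_user_regs regs = { 0 };

	/* ... fill @regs from @target's saved thread state (not shown) ... */

	membuf_write(&to, &regs, sizeof(regs));
	return membuf_zero(&to, to.left);	/* pad the remainder with zeroes */
}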
+ */ +struct resctrl_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int shareable_bits; + bool arch_has_sparse_bitmaps; + bool arch_has_empty_bitmaps; + bool arch_has_per_cpu_cfg; +}; + +/** + * enum membw_throttle_mode - System's memory bandwidth throttling mode + * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system + * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core + * always using smallest bandwidth percentage + * assigned to threads, aka "max throttling" + * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread + */ +enum membw_throttle_mode { + THREAD_THROTTLE_UNDEFINED = 0, + THREAD_THROTTLE_MAX, + THREAD_THROTTLE_PER_THREAD, +}; + +/** + * struct resctrl_membw - Memory bandwidth allocation related data + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Bandwidth throttling mode when threads request + * different memory bandwidths + * @mba_sc: True if MBA software controller(mba_sc) is enabled + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct resctrl_membw { + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + bool arch_needs_linear; + enum membw_throttle_mode throttle_mode; + bool mba_sc; + u32 *mb_map; +}; + +struct rdt_parse_data; +struct resctrl_schema; + +/** + * struct rdt_resource - attributes of a resctrl resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @num_rmid: Number of RMIDs available + * @cache_level: Which cache level defines scope of this resource + * @cache: Cache allocation related data + * @membw: If the component has bandwidth controls, their properties. + * @domains: All domains for this resource + * @name: Name to use in "schemata" file. + * @data_width: Character width of data when displaying + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @fflags: flags to choose base and info files + * @cdp_capable: Is the CDP feature available on this resource + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + int num_rmid; + int cache_level; + struct resctrl_cache cache; + struct resctrl_membw membw; + struct list_head domains; + char *name; + int data_width; + u32 default_ctrl; + const char *format_str; + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct resctrl_schema *s, + struct rdt_domain *d); + struct list_head evt_list; + unsigned long fflags; + bool cdp_capable; +}; + +/** + * struct resctrl_schema - configuration abilities of a resource presented to + * user-space + * @list: Member of resctrl_schema_all. + * @name: The name to use in the "schemata" file. + * @conf_type: Whether this schema is specific to code/data. + * @res: The resource structure exported by the architecture to describe + * the hardware that is configured by this schema. + * @num_closid: The number of closid that can be used with this schema. 
When + * features like CDP are enabled, this will be lower than the + * hardware supports for the resource. + */ +struct resctrl_schema { + struct list_head list; + char name[8]; + enum resctrl_conf_type conf_type; + struct rdt_resource *res; + u32 num_closid; +}; + +/* The number of closid supported by this resource regardless of CDP */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *r); +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type type); + +#endif /* _RESCTRL_H */ diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6eec50fb36c80b61e68edf1a2214791679fe3285..4f922afb607ac01d4122dc3641faa367884faaa6 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -151,6 +151,20 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) +/* + * Loop over each sg element in the given sg_table object. + */ +#define for_each_sgtable_sg(sgt, sg, i) \ + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) + +/* + * Loop over each sg element in the given *DMA mapped* sg_table object. + * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses + * of the each element. + */ +#define for_each_sgtable_dma_sg(sgt, sg, i) \ + for_each_sg(sgt->sgl, sg, sgt->nents, i) + /** * sg_chain - Chain two sglists together * @prv: First scatterlist @@ -401,9 +415,10 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over - * @pgoffset: starting page offset + * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer. + * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ @@ -412,18 +427,47 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over - * @dma_iter: page iterator to hold current page + * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg - * @pgoffset: starting page offset + * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. + * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) +/** + * for_each_sgtable_page - iterate over all pages in the sg_table object + * @sgt: sg_table object to iterate over + * @piter: page iterator to hold current page + * @pgoffset: starting page offset (in pages) + * + * Iterates over the all memory pages in the buffer described by + * a scatterlist stored in the given sg_table object. + * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. 
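The new sg_table iteration macros hide the orig_nents vs. nents distinction from callers. A small sketch of both flavours, assuming the sg_table has already been built (and, for the DMA variant, successfully mapped) elsewhere:

/*
 * Illustrative only: walk the CPU-side entries of an existing sg_table, and
 * separately walk the (possibly fewer) entries that are valid after DMA
 * mapping.
 */
static size_t example_total_len(struct sg_table *sgt)
{
	struct scatterlist *sg;
	size_t total = 0;
	int i;

	/* orig_nents entries: the buffer as the CPU sees it */
	for_each_sgtable_sg(sgt, sg, i)
		total += sg->length;

	return total;
}

static size_t example_total_dma_len(struct sg_table *sgt)
{
	struct scatterlist *sg;
	size_t total = 0;
	int i;

	/* nents entries: only meaningful after a successful DMA mapping */
	for_each_sgtable_dma_sg(sgt, sg, i)
		total += sg_dma_len(sg);

	return total;
}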
+ */ +#define for_each_sgtable_page(sgt, piter, pgoffset) \ + for_each_sg_page(sgt->sgl, piter, sgt->orig_nents, pgoffset) + +/** + * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object + * @sgt: sg_table object to iterate over + * @dma_iter: DMA page iterator to hold current page + * @pgoffset: starting page offset (in pages) + * + * Iterates over the all DMA mapped pages in the buffer described by + * a scatterlist stored in the given sg_table object. + * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE + * unit. + */ +#define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ + for_each_sg_dma_page(sgt->sgl, dma_iter, sgt->nents, pgoffset) + + /* * Mapping sg iterator * diff --git a/include/linux/sched.h b/include/linux/sched.h index d7999c06015fadb40b9b25822fadd6e141917491..0ef728f0a3006860e5f149a34668ff71de4182ba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -792,9 +792,15 @@ struct task_struct { unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP - /* to be used once the psi infrastructure lands upstream. */ unsigned use_memdelay:1; #endif +#ifdef CONFIG_PSI + /* Stalled due to lack of memory */ + unsigned in_memstall:1; +#endif +#ifdef CONFIG_IOMMU_SVA + unsigned pasid_activated:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ @@ -1292,6 +1298,7 @@ struct task_struct { __mce_reserved : 62; struct callback_head mce_kill_me; + int mce_count; #endif /* @@ -1495,7 +1502,6 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ -#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3a1d899019af00bb2b99a15b43f2f4f5a5b11661..c9fef831ba21795dcf794ccc525439e0c37d3259 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -8,6 +8,7 @@ #include #include #include +#include /* * Routines for handling mm_structs @@ -393,4 +394,13 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) } #endif +#ifdef CONFIG_IOMMU_SVA +static inline void mm_pasid_init(struct mm_struct *mm) +{ + mm->pasid = INVALID_IOASID; +} +#else +static inline void mm_pasid_init(struct mm_struct *mm) {} +#endif + #endif /* _LINUX_SCHED_MM_H */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23351d6ace55359f6eed3035b90f8abce6adb1fe..f652db01272612bf813239b379d55020df4696ec 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -514,6 +514,17 @@ static inline int kill_cad_pid(int sig, int priv) #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) +static inline int __on_sig_stack(unsigned long sp) +{ +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif +} + /* * True if we are on the alternate signal stack. 
*/ @@ -531,13 +542,7 @@ static inline int on_sig_stack(unsigned long sp) if (current->sas_ss_flags & SS_AUTODISARM) return 0; -#ifdef CONFIG_STACK_GROWSUP - return sp >= current->sas_ss_sp && - sp - current->sas_ss_sp < current->sas_ss_size; -#else - return sp > current->sas_ss_sp && - sp - current->sas_ss_sp <= current->sas_ss_size; -#endif + return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) diff --git a/include/linux/signal.h b/include/linux/signal.h index 1a5f88316b081463a4e76bb8b48aa312f75162d5..b4973c9d1d7bcc5178b5c77ef51f51a76270b184 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -444,16 +444,22 @@ void signals_init(void); int restore_altstack(const stack_t __user *); int __save_altstack(stack_t __user *, unsigned long); -#define save_altstack_ex(uss, sp) do { \ +#define unsafe_save_altstack(uss, sp, label) do { \ stack_t __user *__uss = uss; \ struct task_struct *t = current; \ - put_user_ex((void __user *)t->sas_ss_sp, &__uss->ss_sp); \ - put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ - put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \ + unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ + unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ if (t->sas_ss_flags & SS_AUTODISARM) \ sas_ss_reset(t); \ } while (0); +#ifdef CONFIG_DYNAMIC_SIGFRAME +bool sigaltstack_size_valid(size_t ss_size); +#else +static inline bool sigaltstack_size_valid(size_t size) { return true; } +#endif /* !CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3208a520d0be3f2ad2ca5c53a55e69bd892290b5..8c595cea28113596289bc761f164f84b87864741 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -321,6 +321,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 0a8fced6aaec4834d9a2edda370f394f63c9808d..484e41f835f145cadf508a8072f092462ee049e8 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,6 +29,10 @@ enum swiotlb_force { * controllable. 
*/ #define IO_TLB_SHIFT 11 +#define IO_TLB_SIZE (1 << IO_TLB_SHIFT) + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); @@ -37,33 +41,22 @@ unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); extern void __init swiotlb_update_mem_attributes(void); -/* - * Enumeration for sync targets - */ -enum dma_sync_target { - SYNC_FOR_CPU = 0, - SYNC_FOR_DEVICE = 1, -}; - -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, - size_t alloc_size, enum dma_data_direction dir, unsigned long attrs); -extern void swiotlb_tbl_sync_single(struct device *hwdev, - phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target); +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir); +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir); +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs); #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; @@ -74,24 +67,17 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) return paddr >= io_tlb_start && paddr < io_tlb_end; } -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); bool is_swiotlb_active(void); +void __init swiotlb_adjust_size(unsigned long new_size); #else #define swiotlb_force SWIOTLB_NO_FORCE static inline bool is_swiotlb_buffer(phys_addr_t paddr) { return false; } -static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys, - dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - return false; -} static inline void swiotlb_exit(void) { } @@ -108,6 +94,10 @@ static inline bool is_swiotlb_active(void) { return false; } + +static inline void swiotlb_adjust_size(unsigned long new_size) +{ +} #endif /* CONFIG_SWIOTLB */ extern void swiotlb_print_info(void); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e42a711a2800c0f6cf02535468ba52d2c8f2d74a..b53a9557884adae600e07f0e4523aa70b030d651 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,43 +15,79 @@ #include #include +/* + * VFIO devices can be placed in a set, this allows all devices to share this + * structure and the VFIO core will provide a lock that is held around + * open_device()/close_device() for all devices in the set. 
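With the open_device()/close_device() conversion, a bus driver embeds struct vfio_device in its own per-device state instead of handing the core an opaque device_data pointer. A hedged sketch of the registration flow; the driver structure, callback bodies and probe path are illustrative assumptions, only the vfio_* interfaces are taken from this patch:

/*
 * Illustrative only: a minimal bus driver wrapping struct vfio_device.
 * container_of() recovers the driver structure in each callback.
 */
struct example_vfio_dev {
	struct vfio_device vdev;	/* embedded; recovered via container_of() */
	void *priv;
};

static int example_open_device(struct vfio_device *core_vdev)
{
	struct example_vfio_dev *edev =
		container_of(core_vdev, struct example_vfio_dev, vdev);

	/* the core holds dev_set->lock around this callback */
	(void)edev;
	return 0;
}

static void example_close_device(struct vfio_device *core_vdev)
{
	/* undo whatever example_open_device() set up */
}

static const struct vfio_device_ops example_ops = {
	.name		= "example-vfio",
	.open_device	= example_open_device,
	.close_device	= example_close_device,
};

static int example_probe(struct device *dev)
{
	struct example_vfio_dev *edev;
	int ret;

	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
	if (!edev)
		return -ENOMEM;

	vfio_init_group_dev(&edev->vdev, dev, &example_ops);
	ret = vfio_register_group_dev(&edev->vdev);
	if (ret) {
		vfio_uninit_group_dev(&edev->vdev);
		kfree(edev);
	}
	return ret;
}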
+ */ +struct vfio_device_set { + void *set_id; + struct mutex lock; + struct list_head device_list; + unsigned int device_count; +}; + +struct vfio_device { + struct device *dev; + const struct vfio_device_ops *ops; + struct vfio_group *group; + struct vfio_device_set *dev_set; + struct list_head dev_set_list; + + /* Members below here are private, not for driver use */ + refcount_t refcount; + unsigned int open_count; + struct completion comp; + struct list_head group_next; +}; + /** * struct vfio_device_ops - VFIO bus driver device callbacks * - * @open: Called when userspace creates new file descriptor for device - * @release: Called when userspace releases file descriptor for device + * @open_device: Called when the first file descriptor is opened for this device + * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor * @write: Perform write(2) on device file descriptor * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* * operations documented below * @mmap: Perform mmap(2) on a region of the device file descriptor * @request: Request for the bus driver to release the device + * @match: Optional device name match callback (return: 0 for no-match, >0 for + * match, -errno for abort (ex. match with insufficient or incorrect + * additional args) */ struct vfio_device_ops { char *name; - int (*open)(void *device_data); - void (*release)(void *device_data); - ssize_t (*read)(void *device_data, char __user *buf, + int (*open_device)(struct vfio_device *vdev); + void (*close_device)(struct vfio_device *vdev); + ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); - ssize_t (*write)(void *device_data, const char __user *buf, + ssize_t (*write)(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *size); - long (*ioctl)(void *device_data, unsigned int cmd, + long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*mmap)(void *device_data, struct vm_area_struct *vma); - void (*request)(void *device_data, unsigned int count); + int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); + void (*request)(struct vfio_device *vdev, unsigned int count); + int (*match)(struct vfio_device *vdev, char *buf); }; extern struct iommu_group *vfio_iommu_group_get(struct device *dev); extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); -extern int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, - void *device_data); - -extern void *vfio_del_group_dev(struct device *dev); +void vfio_init_group_dev(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); +void vfio_uninit_group_dev(struct vfio_device *device); +int vfio_register_group_dev(struct vfio_device *device); +void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); -extern void *vfio_device_data(struct vfio_device *device); + +int vfio_assign_device_set(struct vfio_device *device, void *set_id); + +/* events for the backend driver notify callback */ +enum vfio_iommu_notify_type { + VFIO_IOMMU_CONTAINER_CLOSE = 0, +}; /** * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks @@ -72,7 +108,9 @@ struct vfio_iommu_driver_ops { struct iommu_group *group); void (*detach_group)(void *iommu_data, struct iommu_group *group); - int (*pin_pages)(void *iommu_data, unsigned long *user_pfn, 
+ int (*pin_pages)(void *iommu_data, + struct iommu_group *group, + unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); int (*unpin_pages)(void *iommu_data, @@ -82,6 +120,12 @@ struct vfio_iommu_driver_ops { struct notifier_block *nb); int (*unregister_notifier)(void *iommu_data, struct notifier_block *nb); + int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, + void *data, size_t count, bool write); + struct iommu_domain *(*group_iommu_domain)(void *iommu_data, + struct iommu_group *group); + void (*notify)(void *iommu_data, + enum vfio_iommu_notify_type event); }; extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); @@ -94,6 +138,8 @@ extern void vfio_unregister_iommu_driver( */ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); +extern struct vfio_group *vfio_group_get_external_user_from_dev(struct device + *dev); extern bool vfio_external_group_match_file(struct vfio_group *group, struct file *filep); extern int vfio_external_user_iommu_id(struct vfio_group *group); @@ -107,6 +153,17 @@ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage); +extern int vfio_group_pin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage, + int prot, unsigned long *phys_pfn); +extern int vfio_group_unpin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage); + +extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, + void *data, size_t len, bool write); + +extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); + /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, diff --git a/drivers/vfio/pci/vfio_pci_private.h b/include/linux/vfio_pci_core.h similarity index 44% rename from drivers/vfio/pci/vfio_pci_private.h rename to include/linux/vfio_pci_core.h index 987b4d311fde9981c8f1a5a512aac4106e317eda..3f97de7f9daf0a9fba778a75d1f89743e916ed8c 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/include/linux/vfio_pci_core.h @@ -10,11 +10,14 @@ #include #include +#include #include #include +#include +#include -#ifndef VFIO_PCI_PRIVATE_H -#define VFIO_PCI_PRIVATE_H +#ifndef VFIO_PCI_CORE_H +#define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 @@ -31,12 +34,14 @@ struct vfio_pci_ioeventfd { struct list_head next; + struct vfio_pci_core_device *vdev; struct virqfd *virqfd; void __iomem *addr; uint64_t data; loff_t pos; int bar; int count; + bool test_mem; }; struct vfio_pci_irq_ctx { @@ -48,18 +53,22 @@ struct vfio_pci_irq_ctx { struct irq_bypass_producer producer; }; -struct vfio_pci_device; +struct vfio_pci_core_device; struct vfio_pci_region; +struct vfio_pci_migops { + int (*state_change)(struct vfio_pci_core_device *vdev, u32 new_state); +}; + struct vfio_pci_regops { - size_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, + ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); - void (*release)(struct vfio_pci_device *vdev, + void (*release)(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region); - int (*mmap)(struct vfio_pci_device *vdev, + int (*mmap)(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region, struct vm_area_struct *vma); - int (*add_capability)(struct vfio_pci_device *vdev, + int (*add_capability)(struct vfio_pci_core_device *vdev, struct 
vfio_pci_region *region, struct vfio_info_cap *caps); }; @@ -73,26 +82,35 @@ struct vfio_pci_region { u32 flags; }; +struct vfio_ext_irq { + u32 type; + u32 subtype; + u32 flags; + struct eventfd_ctx *trigger; +}; + struct vfio_pci_dummy_resource { struct resource resource; int index; struct list_head res_next; }; -struct vfio_pci_reflck { - struct kref kref; - struct mutex lock; -}; - struct vfio_pci_mmap_vma { struct vm_area_struct *vma; struct list_head vma_next; }; -struct vfio_pci_device { +struct vfio_pci_vf_token { + struct mutex lock; + uuid_t uuid; + int users; +}; + +struct vfio_pci_core_device { + struct vfio_device vdev; struct pci_dev *pdev; - void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; - bool bar_mmap_supported[PCI_STD_RESOURCE_END + 1]; + void __iomem *barmap[PCI_STD_NUM_BARS]; + bool bar_mmap_supported[PCI_STD_NUM_BARS]; u8 *pci_config_map; u8 *vconfig; struct perm_bits *msi_perm; @@ -101,6 +119,8 @@ struct vfio_pci_device { struct vfio_pci_irq_ctx *ctx; int num_ctx; int irq_type; + struct vfio_ext_irq *ext_irqs; + int num_ext_irqs; int num_regions; struct vfio_pci_region *region; u8 msi_qmax; @@ -119,17 +139,23 @@ struct vfio_pci_device { bool needs_pm_restore; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; - struct vfio_pci_reflck *reflck; - int refcnt; int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + u8 *fault_pages; + struct mutex fault_queue_lock; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; + struct vfio_pci_vf_token *vf_token; + struct notifier_block nb; struct mutex vma_lock; struct list_head vma_list; struct rw_semaphore memory_lock; + + struct vfio_pci_migops *migops; + u8 *mig_pages; + struct mutex mig_lock; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) @@ -138,67 +164,112 @@ struct vfio_pci_device { #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) #define irq_is(vdev, type) (vdev->irq_type == type) -extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); -extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev); +extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); +extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); +extern int vfio_pci_register_irq(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + u32 flags); +extern int vfio_pci_get_ext_irq_index(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype); -extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, +extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data); -extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, +extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); -extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, +extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); -extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, +extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); -extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, +extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, uint64_t data, int 
count, int fd); +extern ssize_t vfio_pci_dma_fault_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); +extern ssize_t vfio_pci_mregion_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); + extern int vfio_pci_init_perm_bits(void); extern void vfio_pci_uninit_perm_bits(void); -extern int vfio_config_init(struct vfio_pci_device *vdev); -extern void vfio_config_free(struct vfio_pci_device *vdev); +extern int vfio_config_init(struct vfio_pci_core_device *vdev); +extern void vfio_config_free(struct vfio_pci_core_device *vdev); -extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, +extern int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, const struct vfio_pci_regops *ops, size_t size, u32 flags, void *data); -extern int vfio_pci_set_power_state(struct vfio_pci_device *vdev, +extern int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state); -extern bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev); -extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device +extern bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); +extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); -extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev); -extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, +extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); +extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd); #ifdef CONFIG_VFIO_PCI_IGD -extern int vfio_pci_igd_init(struct vfio_pci_device *vdev); +extern int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); #else -static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) +static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) { return -ENODEV; } #endif -#ifdef CONFIG_VFIO_PCI_NVLINK2 -extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev); -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev); + +#ifdef CONFIG_S390 +extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); #else -static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) +static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) { return -ENODEV; } +#endif + +/* Will be exported for vfio pci drivers usage */ +void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, + bool is_disable_idle_d3); +void vfio_pci_core_close_device(struct vfio_device *core_vdev); +void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + const struct vfio_device_ops *vfio_pci_ops); +int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); +void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); +void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); +int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); +extern const struct pci_error_handlers vfio_pci_core_err_handlers; +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg); +ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos); +ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t 
count, loff_t *ppos); +int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); +void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); +int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); +int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); +void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); +void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) +static inline bool vfio_pci_is_vga(struct pci_dev *pdev) { - return -ENODEV; + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } -#endif -#endif /* VFIO_PCI_PRIVATE_H */ + +int vfio_pci_dma_fault_init(struct vfio_pci_core_device *vdev, bool register_fault); +int vfio_pci_set_ext_irq_trigger(struct vfio_pci_core_device *vdev, + unsigned int index, unsigned int start, + unsigned int count, uint32_t flags, void *data); +int vfio_pci_migration_init(struct vfio_pci_core_device *vdev, uint32_t size); +#endif /* VFIO_PCI_CORE_H */ diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h index 54e61d456cdfaa06b332b03cd0639a27a8dd7384..758b14d1a179db4425e07aaa09d175b916fe91b5 100644 --- a/include/trace/events/intel_iommu.h +++ b/include/trace/events/intel_iommu.h @@ -6,7 +6,6 @@ * * Author: Lu Baolu */ -#ifdef CONFIG_INTEL_IOMMU #undef TRACE_SYSTEM #define TRACE_SYSTEM intel_iommu @@ -16,6 +15,8 @@ #include #include +#define MSG_MAX 256 + DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, size_t size), @@ -49,12 +50,6 @@ DEFINE_EVENT(dma_map, map_single, TP_ARGS(dev, dev_addr, phys_addr, size) ); -DEFINE_EVENT(dma_map, map_sg, - TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, - size_t size), - TP_ARGS(dev, dev_addr, phys_addr, size) -); - DEFINE_EVENT(dma_map, bounce_map_single, TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr, size_t size), @@ -99,8 +94,121 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single, TP_ARGS(dev, dev_addr, size) ); +DECLARE_EVENT_CLASS(dma_map_sg, + TP_PROTO(struct device *dev, int index, int total, + struct scatterlist *sg), + + TP_ARGS(dev, index, total, sg), + + TP_STRUCT__entry( + __string(dev_name, dev_name(dev)) + __field(dma_addr_t, dev_addr) + __field(phys_addr_t, phys_addr) + __field(size_t, size) + __field(int, index) + __field(int, total) + ), + + TP_fast_assign( + __assign_str(dev_name, dev_name(dev)); + __entry->dev_addr = sg->dma_address; + __entry->phys_addr = sg_phys(sg); + __entry->size = sg->dma_length; + __entry->index = index; + __entry->total = total; + ), + + TP_printk("dev=%s [%d/%d] dev_addr=0x%llx phys_addr=0x%llx size=%zu", + __get_str(dev_name), __entry->index, __entry->total, + (unsigned long long)__entry->dev_addr, + (unsigned long long)__entry->phys_addr, + __entry->size) +); + +DEFINE_EVENT(dma_map_sg, map_sg, + TP_PROTO(struct device *dev, int index, int total, + struct scatterlist *sg), + TP_ARGS(dev, index, total, sg) +); + +DEFINE_EVENT(dma_map_sg, bounce_map_sg, + TP_PROTO(struct device *dev, int index, int total, + struct scatterlist *sg), + TP_ARGS(dev, index, total, sg) +); + +TRACE_EVENT(qi_submit, + TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3), + + TP_ARGS(iommu, qw0, qw1, qw2, qw3), + + TP_STRUCT__entry( + __field(u64, qw0) + __field(u64, qw1) + __field(u64, qw2) + __field(u64, qw3) + __string(iommu, iommu->name) + ), + + TP_fast_assign( + __assign_str(iommu, 
iommu->name); + __entry->qw0 = qw0; + __entry->qw1 = qw1; + __entry->qw2 = qw2; + __entry->qw3 = qw3; + ), + + TP_printk("%s %s: 0x%llx 0x%llx 0x%llx 0x%llx", + __print_symbolic(__entry->qw0 & 0xf, + { QI_CC_TYPE, "cc_inv" }, + { QI_IOTLB_TYPE, "iotlb_inv" }, + { QI_DIOTLB_TYPE, "dev_tlb_inv" }, + { QI_IEC_TYPE, "iec_inv" }, + { QI_IWD_TYPE, "inv_wait" }, + { QI_EIOTLB_TYPE, "p_iotlb_inv" }, + { QI_PC_TYPE, "pc_inv" }, + { QI_DEIOTLB_TYPE, "p_dev_tlb_inv" }, + { QI_PGRP_RESP_TYPE, "page_grp_resp" }), + __get_str(iommu), + __entry->qw0, __entry->qw1, __entry->qw2, __entry->qw3 + ) +); + +TRACE_EVENT(prq_report, + TP_PROTO(struct intel_iommu *iommu, struct device *dev, + u64 dw0, u64 dw1, u64 dw2, u64 dw3, + unsigned long seq), + + TP_ARGS(iommu, dev, dw0, dw1, dw2, dw3, seq), + + TP_STRUCT__entry( + __field(u64, dw0) + __field(u64, dw1) + __field(u64, dw2) + __field(u64, dw3) + __field(unsigned long, seq) + __string(iommu, iommu->name) + __string(dev, dev_name(dev)) + __dynamic_array(char, buff, MSG_MAX) + ), + + TP_fast_assign( + __entry->dw0 = dw0; + __entry->dw1 = dw1; + __entry->dw2 = dw2; + __entry->dw3 = dw3; + __entry->seq = seq; + __assign_str(iommu, iommu->name); + __assign_str(dev, dev_name(dev)); + ), + + TP_printk("%s/%s seq# %ld: %s", + __get_str(iommu), __get_str(dev), __entry->seq, + decode_prq_descriptor(__get_str(buff), MSG_MAX, __entry->dw0, + __entry->dw1, __entry->dw2, __entry->dw3) + ) +); #endif /* _TRACE_INTEL_IOMMU_H */ /* This part must be outside protection */ #include -#endif /* CONFIG_INTEL_IOMMU */ diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 72b4582322ff501d75a580b36ce3e2528ad06f32..7a7801bc0562addcd65b0c52944c7f8926f32177 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -12,6 +12,8 @@ #define _TRACE_IOMMU_H #include +#include +#include struct device; @@ -161,6 +163,88 @@ DEFINE_EVENT(iommu_error, io_page_fault, TP_ARGS(dev, iova, flags) ); + +TRACE_EVENT(dev_fault, + + TP_PROTO(struct device *dev, struct iommu_fault *evt), + + TP_ARGS(dev, evt), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(int, type) + __field(int, reason) + __field(u64, addr) + __field(u64, fetch_addr) + __field(u32, pasid) + __field(u32, grpid) + __field(u32, flags) + __field(u32, prot) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->type = evt->type; + if (evt->type == IOMMU_FAULT_DMA_UNRECOV) { + __entry->reason = evt->event.reason; + __entry->flags = evt->event.flags; + __entry->pasid = evt->event.pasid; + __entry->grpid = 0; + __entry->prot = evt->event.perm; + __entry->addr = evt->event.addr; + __entry->fetch_addr = evt->event.fetch_addr; + } else { + __entry->reason = 0; + __entry->flags = evt->prm.flags; + __entry->pasid = evt->prm.pasid; + __entry->grpid = evt->prm.grpid; + __entry->prot = evt->prm.perm; + __entry->addr = evt->prm.addr; + __entry->fetch_addr = 0; + } + ), + + TP_printk("IOMMU:%s type=%d reason=%d addr=0x%016llx fetch=0x%016llx pasid=%d group=%d flags=%x prot=%d", + __get_str(device), + __entry->type, + __entry->reason, + __entry->addr, + __entry->fetch_addr, + __entry->pasid, + __entry->grpid, + __entry->flags, + __entry->prot + ) +); + +TRACE_EVENT(dev_page_response, + + TP_PROTO(struct device *dev, struct iommu_page_response *msg), + + TP_ARGS(dev, msg), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(int, code) + __field(u32, pasid) + __field(u32, grpid) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->code = 
msg->code; + __entry->pasid = msg->pasid; + __entry->grpid = msg->grpid; + ), + + TP_printk("IOMMU:%s code=%d pasid=%d group=%d", + __get_str(device), + __entry->code, + __entry->pasid, + __entry->grpid + ) +); + #endif /* _TRACE_IOMMU_H */ /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h index abe5f2b6581be7f1b484fb655d33bf1e73e3cac4..c7e502bf5a6fadaed33107127ece5c0e4e5125f1 100644 --- a/include/uapi/linux/auxvec.h +++ b/include/uapi/linux/auxvec.h @@ -33,5 +33,8 @@ #define AT_EXECFN 31 /* filename of program */ +#ifndef AT_MINSIGSTKSZ +#define AT_MINSIGSTKSZ 51 /* minimal stack size for signal delivery */ +#endif #endif /* _UAPI_LINUX_AUXVEC_H */ diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h new file mode 100644 index 0000000000000000000000000000000000000000..55837fa9c7ce98915f7851fe888e97e31205c539 --- /dev/null +++ b/include/uapi/linux/idxd.h @@ -0,0 +1,359 @@ +/* SPDX-License-Identifier: LGPL-2.1 WITH Linux-syscall-note */ +/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */ +#ifndef _USR_IDXD_H_ +#define _USR_IDXD_H_ + +#ifdef __KERNEL__ +#include <linux/types.h> +#else +#include <stdint.h> +#endif + +/* Driver command error status */ +enum idxd_scmd_stat { + IDXD_SCMD_DEV_ENABLED = 0x80000010, + IDXD_SCMD_DEV_NOT_ENABLED = 0x80000020, + IDXD_SCMD_WQ_ENABLED = 0x80000021, + IDXD_SCMD_DEV_DMA_ERR = 0x80020000, + IDXD_SCMD_WQ_NO_GRP = 0x80030000, + IDXD_SCMD_WQ_NO_NAME = 0x80040000, + IDXD_SCMD_WQ_NO_SVM = 0x80050000, + IDXD_SCMD_WQ_NO_THRESH = 0x80060000, + IDXD_SCMD_WQ_PORTAL_ERR = 0x80070000, + IDXD_SCMD_WQ_RES_ALLOC_ERR = 0x80080000, + IDXD_SCMD_PERCPU_ERR = 0x80090000, + IDXD_SCMD_DMA_CHAN_ERR = 0x800a0000, + IDXD_SCMD_CDEV_ERR = 0x800b0000, + IDXD_SCMD_WQ_NO_SWQ_SUPPORT = 0x800c0000, + IDXD_SCMD_WQ_NONE_CONFIGURED = 0x800d0000, + IDXD_SCMD_WQ_NO_SIZE = 0x800e0000, + IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, + IDXD_SCMD_WQ_IRQ_ERR = 0x80100000, + IDXD_SCMD_WQ_PASID_ERR = 0x80200000, + IDXD_SCMD_WQ_NO_DRV_NAME = 0x80300000, +}; + +#define IDXD_SCMD_SOFTERR_MASK 0x80000000 +#define IDXD_SCMD_SOFTERR_SHIFT 16 + +/* Descriptor flags */ +#define IDXD_OP_FLAG_FENCE 0x0001 +#define IDXD_OP_FLAG_BOF 0x0002 +#define IDXD_OP_FLAG_CRAV 0x0004 +#define IDXD_OP_FLAG_RCR 0x0008 +#define IDXD_OP_FLAG_RCI 0x0010 +#define IDXD_OP_FLAG_CRSTS 0x0020 +#define IDXD_OP_FLAG_CR 0x0080 +#define IDXD_OP_FLAG_CC 0x0100 +#define IDXD_OP_FLAG_ADDR1_TCS 0x0200 +#define IDXD_OP_FLAG_ADDR2_TCS 0x0400 +#define IDXD_OP_FLAG_ADDR3_TCS 0x0800 +#define IDXD_OP_FLAG_CR_TCS 0x1000 +#define IDXD_OP_FLAG_STORD 0x2000 +#define IDXD_OP_FLAG_DRDBK 0x4000 +#define IDXD_OP_FLAG_DSTS 0x8000 + +/* IAX */ +#define IDXD_OP_FLAG_RD_SRC2_AECS 0x010000 +#define IDXD_OP_FLAG_RD_SRC2_2ND 0x020000 +#define IDXD_OP_FLAG_WR_SRC2_AECS_COMP 0x040000 +#define IDXD_OP_FLAG_WR_SRC2_AECS_OVFL 0x080000 +#define IDXD_OP_FLAG_SRC2_STS 0x100000 +#define IDXD_OP_FLAG_CRC_RFC3720 0x200000 + +/* Opcode */ +enum dsa_opcode { + DSA_OPCODE_NOOP = 0, + DSA_OPCODE_BATCH, + DSA_OPCODE_DRAIN, + DSA_OPCODE_MEMMOVE, + DSA_OPCODE_MEMFILL, + DSA_OPCODE_COMPARE, + DSA_OPCODE_COMPVAL, + DSA_OPCODE_CR_DELTA, + DSA_OPCODE_AP_DELTA, + DSA_OPCODE_DUALCAST, + DSA_OPCODE_CRCGEN = 0x10, + DSA_OPCODE_COPY_CRC, + DSA_OPCODE_DIF_CHECK, + DSA_OPCODE_DIF_INS, + DSA_OPCODE_DIF_STRP, + DSA_OPCODE_DIF_UPDT, + DSA_OPCODE_CFLUSH = 0x20, +}; + +enum iax_opcode { + IAX_OPCODE_NOOP = 0, + IAX_OPCODE_DRAIN = 2, + IAX_OPCODE_MEMMOVE, + IAX_OPCODE_DECOMPRESS = 0x42, + IAX_OPCODE_COMPRESS, + IAX_OPCODE_CRC64, +
IAX_OPCODE_ZERO_DECOMP_32 = 0x48, + IAX_OPCODE_ZERO_DECOMP_16, + IAX_OPCODE_DECOMP_32 = 0x4c, + IAX_OPCODE_DECOMP_16, + IAX_OPCODE_SCAN = 0x50, + IAX_OPCODE_SET_MEMBER, + IAX_OPCODE_EXTRACT, + IAX_OPCODE_SELECT, + IAX_OPCODE_RLE_BURST, + IAX_OPCDE_FIND_UNIQUE, + IAX_OPCODE_EXPAND, +}; + +/* Completion record status */ +enum dsa_completion_status { + DSA_COMP_NONE = 0, + DSA_COMP_SUCCESS, + DSA_COMP_SUCCESS_PRED, + DSA_COMP_PAGE_FAULT_NOBOF, + DSA_COMP_PAGE_FAULT_IR, + DSA_COMP_BATCH_FAIL, + DSA_COMP_BATCH_PAGE_FAULT, + DSA_COMP_DR_OFFSET_NOINC, + DSA_COMP_DR_OFFSET_ERANGE, + DSA_COMP_DIF_ERR, + DSA_COMP_BAD_OPCODE = 0x10, + DSA_COMP_INVALID_FLAGS, + DSA_COMP_NOZERO_RESERVE, + DSA_COMP_XFER_ERANGE, + DSA_COMP_DESC_CNT_ERANGE, + DSA_COMP_DR_ERANGE, + DSA_COMP_OVERLAP_BUFFERS, + DSA_COMP_DCAST_ERR, + DSA_COMP_DESCLIST_ALIGN, + DSA_COMP_INT_HANDLE_INVAL, + DSA_COMP_CRA_XLAT, + DSA_COMP_CRA_ALIGN, + DSA_COMP_ADDR_ALIGN, + DSA_COMP_PRIV_BAD, + DSA_COMP_TRAFFIC_CLASS_CONF, + DSA_COMP_PFAULT_RDBA, + DSA_COMP_HW_ERR1, + DSA_COMP_HW_ERR_DRB, + DSA_COMP_TRANSLATION_FAIL, + DSA_ERR_PCI_CFG = 0x51, + DSA_ERR_CMD_REG, +}; + +enum iax_completion_status { + IAX_COMP_NONE = 0, + IAX_COMP_SUCCESS, + IAX_COMP_PAGE_FAULT_IR = 0x04, + IAX_COMP_ANALYTICS_ERROR = 0x0a, + IAX_COMP_OUTBUF_OVERFLOW, + IAX_COMP_BAD_OPCODE = 0x10, + IAX_COMP_INVALID_FLAGS, + IAX_COMP_NOZERO_RESERVE, + IAX_COMP_INVALID_SIZE, + IAX_COMP_OVERLAP_BUFFERS = 0x16, + IAX_COMP_INT_HANDLE_INVAL = 0x19, + IAX_COMP_CRA_XLAT, + IAX_COMP_CRA_ALIGN, + IAX_COMP_ADDR_ALIGN, + IAX_COMP_PRIV_BAD, + IAX_COMP_TRAFFIC_CLASS_CONF, + IAX_COMP_PFAULT_RDBA, + IAX_COMP_HW_ERR1, + IAX_COMP_HW_ERR_DRB, + IAX_COMP_TRANSLATION_FAIL, + IAX_COMP_PRS_TIMEOUT, + IAX_COMP_WATCHDOG, + IAX_COMP_INVALID_COMP_FLAG = 0x30, + IAX_COMP_INVALID_FILTER_FLAG, + IAX_COMP_INVALID_INPUT_SIZE, + IAX_COMP_INVALID_NUM_ELEMS, + IAX_COMP_INVALID_SRC1_WIDTH, + IAX_COMP_INVALID_INVERT_OUT, +}; + +#define DSA_COMP_STATUS_MASK 0x7f +#define DSA_COMP_STATUS_WRITE 0x80 + +struct dsa_hw_desc { + uint32_t pasid:20; + uint32_t rsvd:11; + uint32_t priv:1; + uint32_t flags:24; + uint32_t opcode:8; + uint64_t completion_addr; + union { + uint64_t src_addr; + uint64_t rdback_addr; + uint64_t pattern; + uint64_t desc_list_addr; + }; + union { + uint64_t dst_addr; + uint64_t rdback_addr2; + uint64_t src2_addr; + uint64_t comp_pattern; + }; + union { + uint32_t xfer_size; + uint32_t desc_count; + }; + uint16_t int_handle; + uint16_t rsvd1; + union { + uint8_t expected_res; + /* create delta record */ + struct { + uint64_t delta_addr; + uint32_t max_delta_size; + uint32_t delt_rsvd; + uint8_t expected_res_mask; + }; + uint32_t delta_rec_size; + uint64_t dest2; + /* CRC */ + struct { + uint32_t crc_seed; + uint32_t crc_rsvd; + uint64_t seed_addr; + }; + /* DIF check or strip */ + struct { + uint8_t src_dif_flags; + uint8_t dif_chk_res; + uint8_t dif_chk_flags; + uint8_t dif_chk_res2[5]; + uint32_t chk_ref_tag_seed; + uint16_t chk_app_tag_mask; + uint16_t chk_app_tag_seed; + }; + /* DIF insert */ + struct { + uint8_t dif_ins_res; + uint8_t dest_dif_flag; + uint8_t dif_ins_flags; + uint8_t dif_ins_res2[13]; + uint32_t ins_ref_tag_seed; + uint16_t ins_app_tag_mask; + uint16_t ins_app_tag_seed; + }; + /* DIF update */ + struct { + uint8_t src_upd_flags; + uint8_t upd_dest_flags; + uint8_t dif_upd_flags; + uint8_t dif_upd_res[5]; + uint32_t src_ref_tag_seed; + uint16_t src_app_tag_mask; + uint16_t src_app_tag_seed; + uint32_t dest_ref_tag_seed; + uint16_t dest_app_tag_mask; + uint16_t dest_app_tag_seed; + }; 
+ + uint8_t op_specific[24]; + }; +} __attribute__((packed)); + +struct iax_hw_desc { + uint32_t pasid:20; + uint32_t rsvd:11; + uint32_t priv:1; + uint32_t flags:24; + uint32_t opcode:8; + uint64_t completion_addr; + uint64_t src1_addr; + uint64_t dst_addr; + uint32_t src1_size; + uint16_t int_handle; + union { + uint16_t compr_flags; + uint16_t decompr_flags; + }; + uint64_t src2_addr; + uint32_t max_dst_size; + uint32_t src2_size; + uint32_t filter_flags; + uint32_t num_inputs; +} __attribute__((packed)); + +struct dsa_raw_desc { + uint64_t field[8]; +} __attribute__((packed)); + +/* + * The status field will be modified by hardware, therefore it should be + * volatile, to prevent the compiler from optimizing away the read. + */ +struct dsa_completion_record { + volatile uint8_t status; + union { + uint8_t result; + uint8_t dif_status; + }; + uint16_t rsvd; + uint32_t bytes_completed; + uint64_t fault_addr; + union { + /* common record */ + struct { + uint32_t invalid_flags:24; + uint32_t rsvd2:8; + }; + + uint32_t delta_rec_size; + uint32_t crc_val; + + /* DIF check & strip */ + struct { + uint32_t dif_chk_ref_tag; + uint16_t dif_chk_app_tag_mask; + uint16_t dif_chk_app_tag; + }; + + /* DIF insert */ + struct { + uint64_t dif_ins_res; + uint32_t dif_ins_ref_tag; + uint16_t dif_ins_app_tag_mask; + uint16_t dif_ins_app_tag; + }; + + /* DIF update */ + struct { + uint32_t dif_upd_src_ref_tag; + uint16_t dif_upd_src_app_tag_mask; + uint16_t dif_upd_src_app_tag; + uint32_t dif_upd_dest_ref_tag; + uint16_t dif_upd_dest_app_tag_mask; + uint16_t dif_upd_dest_app_tag; + }; + + uint8_t op_specific[16]; + }; +} __attribute__((packed)); + +struct dsa_raw_completion_record { + uint64_t field[4]; +} __attribute__((packed)); + +struct iax_completion_record { + volatile uint8_t status; + uint8_t error_code; + uint16_t rsvd; + uint32_t bytes_completed; + uint64_t fault_addr; + uint32_t invalid_flags; + uint32_t rsvd2; + uint32_t output_size; + uint8_t output_bits; + uint8_t rsvd3; + uint16_t xor_csum; + uint32_t crc; + uint32_t min; + uint32_t max; + uint32_t sum; + uint64_t rsvd4[2]; +} __attribute__((packed)); + +struct iax_raw_completion_record { + uint64_t field[8]; +} __attribute__((packed)); + +#endif diff --git a/include/uapi/linux/ioasid.h b/include/uapi/linux/ioasid.h new file mode 100644 index 0000000000000000000000000000000000000000..1529070c031743dde0ac9532ccec2ff2dd96462e --- /dev/null +++ b/include/uapi/linux/ioasid.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * PASID (Processor Address Space ID) is a PCIe concept for tagging + * address spaces in DMA requests. When system-wide PASID allocation + * is required by the underlying iommu driver (e.g. Intel VT-d), this + * provides an interface for userspace to request ioasid alloc/free + * for its assigned devices. + * + * Copyright (C) 2021 Intel Corporation. All rights reserved. + * Author: Liu Yi L + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _UAPI_IOASID_H +#define _UAPI_IOASID_H + +#include +#include +#include + +#define IOASID_API_VERSION 0 + + +/* Kernel & User level defines for IOASID IOCTLs. */ +#define IOASID_TYPE ('i') +#define IOASID_BASE 100 + +/* -------- IOCTLs for IOASID file descriptor (/dev/ioasid) -------- */ + +/** + * IOASID_GET_API_VERSION - _IO(IOASID_TYPE, IOASID_BASE + 0) + * + * Report the version of the IOASID API.
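For orientation, a minimal user-space sketch of the calls documented here; /dev/ioasid is the node named in this header's comments, and error handling is mostly elided:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ioasid.h>	/* this proposed uapi header */

int main(void)
{
	int fd = open("/dev/ioasid", O_RDWR);
	struct ioasid_info info = { .argsz = sizeof(info) };

	if (fd < 0)
		return 1;
	/* Guard against incompatible API changes before any other call. */
	if (ioctl(fd, IOASID_GET_API_VERSION) != IOASID_API_VERSION)
		return 1;
	if (ioctl(fd, IOASID_GET_INFO, &info) == 0)
		printf("supported PASID bits: %u\n", info.ioasid_bits);
	return 0;
}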
Versioning allows us to bump the entire + * API should we later need to add or change features in incompatible + * ways. + * Return: IOASID_API_VERSION + * Availability: Always + */ +#define IOASID_GET_API_VERSION _IO(IOASID_TYPE, IOASID_BASE + 0) + +/** + * IOASID_GET_INFO - _IOR(IOASID_TYPE, IOASID_BASE + 1, struct ioasid_info) + * + * Retrieve information about the IOASID object. Fills in the provided + * struct ioasid_info. The caller sets argsz. + * + * @argsz: user filled size of this data. + * @flags: currently reserved for future extension; must be set to 0. + * @ioasid_bits: maximum supported PASID bits, 0 represents no PASID + * support. + * + * Availability: Always + */ +struct ioasid_info { + __u32 argsz; + __u32 flags; + __u32 ioasid_bits; +}; +#define IOASID_GET_INFO _IO(IOASID_TYPE, IOASID_BASE + 1) + +/** + * IOASID_REQUEST_ALLOC - _IOWR(IOASID_TYPE, IOASID_BASE + 2, + * struct ioasid_request) + * + * Allocate a PASID within @range. @range is [min, max], which means both + * @min and @max are inclusive. + * User space should provide @min and @max no larger than the ioasid_bits + * reported in ioasid_info via IOASID_GET_INFO. + * + * @argsz: user filled size of this data. + * @flags: currently reserved for future extension; must be set to 0. + * @range: the allocated ioasid is expected to be within this range. + * + * returns: allocated ID on success, -errno on failure + */ +struct ioasid_alloc_request { + __u32 argsz; + __u32 flags; + struct { + __u32 min; + __u32 max; + } range; +}; +#define IOASID_REQUEST_ALLOC _IO(IOASID_TYPE, IOASID_BASE + 2) + +/** + * IOASID_REQUEST_FREE - _IOWR(IOASID_TYPE, IOASID_BASE + 3, int) + * + * Free a PASID. + * + * returns: 0 on success, -errno on failure + */ +#define IOASID_REQUEST_FREE _IO(IOASID_TYPE, IOASID_BASE + 3) + +#endif /* _UAPI_IOASID_H */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index fc00c5d4741b8decf20dcefaac2798120cdcc6ca..e68a5ae67ebdbef5f0491eb16daf96f317abff84 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -81,7 +81,10 @@ struct iommu_fault_unrecoverable { /** * struct iommu_fault_page_request - Page Request data * @flags: encodes whether the corresponding fields are valid and whether this - * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values) + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). + * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response + * must have the same PASID value as the page request. When it is clear, + * the page response should not have a PASID.
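For illustration, a sketch of building a page response that honors the NEEDS_PASID rule just described; the incoming fault is assumed to arrive from whatever fault-reporting channel is in use:

#include <string.h>
#include <linux/iommu.h>

/* "fault" is a struct iommu_fault_page_request received from the kernel. */
static void build_response(const struct iommu_fault_page_request *fault,
			   struct iommu_page_response *resp)
{
	memset(resp, 0, sizeof(*resp));
	resp->argsz = sizeof(*resp);
	resp->version = IOMMU_PAGE_RESP_VERSION_1;
	resp->grpid = fault->grpid;
	resp->code = IOMMU_PAGE_RESP_SUCCESS;

	/* Echo the PASID back only when the request demands it. */
	if (fault->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID) {
		resp->flags = IOMMU_PAGE_RESP_PASID_VALID;
		resp->pasid = fault->pasid;
	}
}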
* @pasid: Process Address Space ID * @grpid: Page Request Group Index * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) @@ -92,6 +95,7 @@ struct iommu_fault_page_request { #define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) #define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) __u32 flags; __u32 pasid; __u32 grpid; @@ -130,11 +134,12 @@ struct iommu_fault { enum iommu_page_response_code { IOMMU_PAGE_RESP_SUCCESS = 0, IOMMU_PAGE_RESP_INVALID, - IOMMU_PAGE_RESP_FAILURE, + IOMMU_PAGE_RESP_FAILURE = 0xf, }; /** * struct iommu_page_response - Generic page response information + * @argsz: User filled size of this data * @version: API version of this structure * @flags: encodes whether the corresponding fields are valid * (IOMMU_FAULT_PAGE_RESPONSE_* values) @@ -143,6 +148,7 @@ enum iommu_page_response_code { * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { + __u32 argsz; #define IOMMU_PAGE_RESP_VERSION_1 1 __u32 version; #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) @@ -152,4 +158,260 @@ struct iommu_page_response { __u32 code; }; +/* defines the granularity of the invalidation */ +enum iommu_inv_granularity { + IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */ + IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */ + IOMMU_INV_GRANU_ADDR, /* page-selective invalidation */ + IOMMU_INV_GRANU_NR, /* number of invalidation granularities */ +}; + +/** + * struct iommu_inv_addr_info - Address Selective Invalidation Structure + * + * @flags: indicates the granularity of the address-selective invalidation + * - If the PASID bit is set, the @pasid field is populated and the invalidation + * relates to cache entries tagged with this PASID and matching the address + * range. + * - If the ARCHID bit is set, @archid is populated and the invalidation relates + * to cache entries tagged with this architecture specific ID and matching + * the address range. + * - Both PASID and ARCHID can be set as they may tag different caches. + * - If neither PASID nor ARCHID is set, global addr invalidation applies. + * - The LEAF flag indicates whether only the leaf PTE caching needs to be + * invalidated and other paging structure caches can be preserved. + * @pasid: process address space ID + * @archid: architecture-specific ID + * @addr: first stage/level input address + * @granule_size: page/block size of the mapping in bytes + * @nb_granules: number of contiguous granules to be invalidated + */ +struct iommu_inv_addr_info { +#define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0) +#define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1) +#define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2) + __u32 flags; + __u32 archid; + __u64 pasid; + __u64 addr; + __u64 granule_size; + __u64 nb_granules; +}; + +/** + * struct iommu_inv_pasid_info - PASID Selective Invalidation Structure + * + * @flags: indicates the granularity of the PASID-selective invalidation + * - If the PASID bit is set, the @pasid field is populated and the invalidation + * relates to cache entries tagged with this PASID and matching the address + * range. + * - If the ARCHID bit is set, @archid is populated and the invalidation + * relates to cache entries tagged with this architecture specific ID and + * matching the address range. + * - Both PASID and ARCHID can be set as they may tag different caches. + * - At least one of PASID or ARCHID must be set (see the sketch below).
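A sketch of a PASID-selective IOTLB invalidation using these flags; struct iommu_cache_invalidate_info is defined further below, and the pasid value is an assumed input:

#include <string.h>
#include <linux/iommu.h>

/* Invalidate IOTLB entries tagged with a single PASID. */
static void build_pasid_inv(struct iommu_cache_invalidate_info *inv,
			    __u64 pasid)
{
	memset(inv, 0, sizeof(*inv));
	inv->argsz = sizeof(*inv);
	inv->version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
	inv->cache = IOMMU_CACHE_INV_TYPE_IOTLB;
	inv->granularity = IOMMU_INV_GRANU_PASID;
	inv->granu.pasid_info.flags = IOMMU_INV_PASID_FLAGS_PASID;
	inv->granu.pasid_info.pasid = pasid;
}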
+ * @pasid: process address space ID + * @archid: architecture-specific ID + */ +struct iommu_inv_pasid_info { +#define IOMMU_INV_PASID_FLAGS_PASID (1 << 0) +#define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1) + __u32 flags; + __u32 archid; + __u64 pasid; +}; + +/** + * struct iommu_cache_invalidate_info - First level/stage invalidation + * information + * @argsz: User filled size of this data + * @version: API version of this structure + * @cache: bitfield that allows to select which caches to invalidate + * @granularity: defines the lowest granularity used for the invalidation: + * domain > PASID > addr + * @padding: reserved for future use (should be zero) + * @pasid_info: invalidation data when @granularity is %IOMMU_INV_GRANU_PASID + * @addr_info: invalidation data when @granularity is %IOMMU_INV_GRANU_ADDR + * + * Not all the combinations of cache/granularity are valid: + * + * +--------------+---------------+---------------+---------------+ + * | type / | DEV_IOTLB | IOTLB | PASID | + * | granularity | | | cache | + * +==============+===============+===============+===============+ + * | DOMAIN | N/A | Y | Y | + * +--------------+---------------+---------------+---------------+ + * | PASID | Y | Y | Y | + * +--------------+---------------+---------------+---------------+ + * | ADDR | Y | Y | N/A | + * +--------------+---------------+---------------+---------------+ + * + * Invalidations by %IOMMU_INV_GRANU_DOMAIN don't take any argument other than + * @version and @cache. + * + * If multiple cache types are invalidated simultaneously, they all + * must support the used granularity. + */ +struct iommu_cache_invalidate_info { + __u32 argsz; +#define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 + __u32 version; +/* IOMMU paging structure cache */ +#define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */ +#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */ +#define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */ +#define IOMMU_CACHE_INV_TYPE_NR (3) + __u8 cache; + __u8 granularity; + __u8 padding[6]; + union { + struct iommu_inv_pasid_info pasid_info; + struct iommu_inv_addr_info addr_info; + } granu; +}; + +/** + * struct iommu_gpasid_bind_data_vtd - Intel VT-d specific data on device and guest + * SVA binding. + * + * @flags: VT-d PASID table entry attributes + * @pat: Page attribute table data to compute effective memory type + * @emt: Extended memory type + * + * Only guest vIOMMU selectable and effective options are passed down to + * the host IOMMU. 
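Before the structures themselves, a hedged sketch of filling a guest PASID bind request in the VT-d format; gpgd, hpasid and gpasid are assumed inputs, and the 48-bit address width is only an example value:

#include <string.h>
#include <linux/iommu.h>

static void build_bind(struct iommu_gpasid_bind_data *bind,
		       __u64 gpgd, __u64 hpasid, __u64 gpasid)
{
	memset(bind, 0, sizeof(*bind));
	bind->argsz = sizeof(*bind);
	bind->version = IOMMU_GPASID_BIND_VERSION_1;
	bind->format = IOMMU_PASID_FORMAT_INTEL_VTD;
	bind->addr_width = 48;			/* guest VA width, platform-specific */
	bind->flags = IOMMU_SVA_GPASID_VAL;	/* gpasid field is valid */
	bind->gpgd = gpgd;
	bind->hpasid = hpasid;
	bind->gpasid = gpasid;
	bind->vendor.vtd.flags = 0;		/* no SRE/EAFE/... attributes */
}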
+ */ +struct iommu_gpasid_bind_data_vtd { +#define IOMMU_SVA_VTD_GPASID_SRE (1 << 0) /* supervisor request */ +#define IOMMU_SVA_VTD_GPASID_EAFE (1 << 1) /* extended access enable */ +#define IOMMU_SVA_VTD_GPASID_PCD (1 << 2) /* page-level cache disable */ +#define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ +#define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ +#define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ +#define IOMMU_SVA_VTD_GPASID_WPE (1 << 6) /* Write protect enable */ +#define IOMMU_SVA_VTD_GPASID_LAST (1 << 7) + __u64 flags; + __u32 pat; + __u32 emt; +}; + +#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ + IOMMU_SVA_VTD_GPASID_EMTE | \ + IOMMU_SVA_VTD_GPASID_PCD | \ + IOMMU_SVA_VTD_GPASID_PWT) + +/** + * struct iommu_gpasid_bind_data - Information about device and guest PASID binding + * @argsz: User filled size of this data + * @version: Version of this data structure + * @format: PASID table entry format + * @flags: Additional information on guest bind request + * @gpgd: Guest page directory base of the guest mm to bind + * @hpasid: Process address space ID used for the guest mm in host IOMMU + * @gpasid: Process address space ID used for the guest mm in guest IOMMU + * @addr_width: Guest virtual address width + * @padding: Reserved for future use (should be zero) + * @vtd: Intel VT-d specific data + * + * Guest to host PASID mapping can be an identity or non-identity, where guest + * has its own PASID space. For non-identity mapping, guest to host PASID lookup + * is needed when the VM programs a guest PASID into an assigned device. The VMM may + * trap such PASID programming and then request the host IOMMU driver to convert guest + * PASID to host PASID based on this bind data. + */ +struct iommu_gpasid_bind_data { + __u32 argsz; +#define IOMMU_GPASID_BIND_VERSION_1 1 + __u32 version; +#define IOMMU_PASID_FORMAT_INTEL_VTD 1 +#define IOMMU_PASID_FORMAT_LAST 2 + __u32 format; + __u32 addr_width; +#define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ +#define IOMMU_SVA_HPASID_DEF (1 << 1) /* use default host PASID */ +#define IOMMU_SVA_SL_ONLY (1 << 2) /* only setup SLT */ + __u64 flags; + __u64 gpgd; + __u64 hpasid; + __u64 gpasid; + __u8 padding[8]; + /* Vendor specific data */ + union { + struct iommu_gpasid_bind_data_vtd vtd; + } vendor; +}; + +/* + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info. + * + * @flags: VT-d specific flags. Currently reserved for future + * extension; must be set to 0. + * @cap_reg: Describe basic capabilities as defined in VT-d capability + * register. + * @ecap_reg: Describe the extended capabilities as defined in VT-d + * extended capability register. + */ +struct iommu_nesting_info_vtd { + __u32 flags; + __u8 padding[12]; + __u64 cap_reg; + __u64 ecap_reg; +}; + +/* + * struct iommu_nesting_info - Information for nesting-capable IOMMU. + * Userspace should check it before using the + * nesting capability. + * + * @argsz: size of the whole structure. + * @flags: currently reserved for future extension; must be set to 0. + * @format: PASID table entry format, the same definition as struct + * iommu_gpasid_bind_data @format. + * @features: supported nesting features. + * @addr_width: the output addr width of first level/stage translation + * @pasid_bits: maximum supported PASID bits, 0 represents no PASID + * support. + * @vendor: vendor specific data, structure type can be deduced from + * @format field.
+ * + * +===============+======================================================+ + * | feature | Notes | + * +===============+======================================================+ + * | BIND_PGTBL | IOMMU vendor driver sets it to mandate userspace to | + * | | bind the first level/stage page table to associated | + * | | PASID (either the one specified in bind request or | + * | | the default PASID of iommu domain), through IOMMU | + * | | UAPI. | + * +---------------+------------------------------------------------------+ + * | CACHE_INVLD | IOMMU vendor driver sets it to mandate userspace to | + * | | explicitly invalidate the IOMMU cache through IOMMU | + * | | UAPI according to vendor-specific requirement when | + * | | changing the 1st level/stage page table. | + * +---------------+------------------------------------------------------+ + * + * data struct types defined for @format: + * +================================+=====================================+ + * | @format | data struct | + * +================================+=====================================+ + * | IOMMU_PASID_FORMAT_INTEL_VTD | struct iommu_nesting_info_vtd | + * +--------------------------------+-------------------------------------+ + * + */ +struct iommu_nesting_info { + __u32 argsz; + __u32 flags; + __u32 format; +#define IOMMU_NESTING_FEAT_BIND_PGTBL (1 << 0) +#define IOMMU_NESTING_FEAT_CACHE_INVLD (1 << 1) +#define IOMMU_NESTING_FEAT_PAGE_RESP (1 << 2) + __u32 features; + __u16 addr_width; + __u16 pasid_bits; + __u8 padding[12]; + /* Vendor specific data */ + union { + struct iommu_nesting_info_vtd vtd; + } vendor; +}; + #endif /* _UAPI_IOMMU_H */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 1b6b8e05868dd82e87abec280d5fcc6c49a9f878..c49bf3a938f00d2052b7a4e3060df095ddb2b168 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -304,6 +304,7 @@ struct kvm_run { __u8 data[8]; __u32 len; __u8 is_write; + __u8 np_data[64]; } mmio; /* KVM_EXIT_HYPERCALL */ struct { @@ -1003,6 +1004,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 #ifdef KVM_CAP_IRQ_ROUTING @@ -1464,6 +1467,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_ARM_SVE */ #define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 29d6e93fd15e3616f5969d0dc0db3b98506cf490..725e671fe7f29cf63cab2f152c89c3e68553e143 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -34,6 +34,7 @@ * of which the first 64 bytes are standardized as follows: */ #define PCI_STD_HEADER_SIZEOF 64 +#define PCI_STD_NUM_BARS 6 /* Number of standard BARs */ #define PCI_VENDOR_ID 0x00 /* 16 bits */ #define PCI_DEVICE_ID 0x02 /* 16 bits */ #define PCI_COMMAND 0x04 /* 16 bits */ @@ -714,6 +715,7 @@ #define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ +#define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ 
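A kernel-side sketch of locating the newly added DVSEC capability and extracting the vendor ID from DVSEC header 1 (per the PCIe spec, the vendor sits in its low 16 bits); only the first DVSEC instance is found here:

#include <linux/pci.h>

static u16 dvsec_vendor(struct pci_dev *pdev)
{
	u32 hdr;
	int pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DVSEC);

	if (!pos)
		return 0;	/* no DVSEC capability present */
	pci_read_config_dword(pdev, pos + PCI_DVSEC_HEADER1, &hdr);
	return hdr & 0xffff;	/* DVSEC vendor ID */
}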
#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT @@ -822,6 +824,13 @@ #define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */ #define PCI_EXT_CAP_PWR_SIZEOF 16 +/* Root Complex Event Collector Endpoint Association */ +#define PCI_RCEC_RCIEP_BITMAP 4 /* Associated Bitmap for RCiEPs */ +#define PCI_RCEC_BUSN 8 /* RCEC Associated Bus Numbers */ +#define PCI_RCEC_BUSN_REG_VER 0x02 /* Least version with BUSN present */ +#define PCI_RCEC_BUSN_NEXT(x) (((x) >> 8) & 0xff) +#define PCI_RCEC_BUSN_LAST(x) (((x) >> 16) & 0xff) + /* Vendor-Specific (VSEC, PCI_EXT_CAP_ID_VNDR) */ #define PCI_VNDR_HEADER 4 /* Vendor-Specific Header */ #define PCI_VNDR_HEADER_ID(x) ((x) & 0xffff) @@ -1056,6 +1065,10 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +/* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ +#define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ +#define PCI_DVSEC_HEADER2 0x8 /* Designated Vendor-Specific Header2 */ + /* Data Link Feature */ #define PCI_DLF_CAP 0x04 /* Capabilities Register */ #define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ceccd980ffcfe44c5204920f4ad73216bd20ae3c..a49001fa86e22bb23649ddebe0096cc11d696842 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -141,12 +141,18 @@ enum perf_event_sample_format { PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, PERF_SAMPLE_PHYS_ADDR = 1U << 19, + PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, - PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) /* * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set * @@ -180,6 +186,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -207,6 +215,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -849,7 +859,9 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * * { u64 nr; - * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER @@ -858,7 +870,24 @@ enum perf_event_type { * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * - * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } 
+ * } * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi @@ -1058,14 +1087,16 @@ union perf_mem_data_src { mem_lvl_num:4, /* memory hierarchy level number */ mem_remote:1, /* remote */ mem_snoopx:2, /* snoop mode, ext */ - mem_rsvd:24; + mem_blk:3, /* access blocked */ + mem_rsvd:21; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:24, + __u64 mem_rsvd:21, + mem_blk:3, /* access blocked */ mem_snoopx:2, /* snoop mode, ext */ mem_remote:1, /* remote */ mem_lvl_num:4, /* memory hierarchy level number */ @@ -1148,6 +1179,12 @@ union perf_mem_data_src { #define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ #define PERF_MEM_TLB_SHIFT 26 +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) @@ -1179,4 +1216,23 @@ struct perf_branch_entry { reserved:40; }; +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/include/uapi/linux/pfru.h b/include/uapi/linux/pfru.h new file mode 100644 index 0000000000000000000000000000000000000000..b4d5c0078cfbc33c530364fbf3a8a48db034e8eb --- /dev/null +++ b/include/uapi/linux/pfru.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Platform Firmware Runtime Update header + * + * Copyright(c) 2021 Intel Corporation. All rights reserved. 
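A hypothetical user-space flow for the update ioctls defined below; the device node name is an assumption, not part of this header, and the capsule write is elided:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/pfru.h>

int main(void)
{
	/* Device node name is hypothetical here. */
	int fd = open("/dev/acpi_pfr_update0", O_RDWR);
	unsigned int rev = REVID_2;

	if (fd < 0)
		return 1;
	if (ioctl(fd, PFRU_IOC_SET_REV, &rev))	/* select _DSM rev id */
		return 1;
	/* write(fd, capsule_image, image_size); then stage + activate: */
	if (ioctl(fd, PFRU_IOC_STAGE_ACTIVATE, 0))
		return 1;
	close(fd);
	return 0;
}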
+ */ +#ifndef __PFRU_H__ +#define __PFRU_H__ + +#include +#include + +#define PFRU_UUID "ECF9533B-4A3C-4E89-939E-C77112601C6D" +#define PFRU_CODE_INJ_UUID "B2F84B79-7B6E-4E45-885F-3FB9BB185402" +#define PFRU_DRV_UPDATE_UUID "4569DD8C-75F1-429A-A3D6-24DE8097A0DF" + +#define FUNC_STANDARD_QUERY 0 +#define FUNC_QUERY_UPDATE_CAP 1 +#define FUNC_QUERY_BUF 2 +#define FUNC_START 3 + +#define CODE_INJECT_TYPE 1 +#define DRIVER_UPDATE_TYPE 2 + +#define REVID_1 1 +#define REVID_2 2 + +#define PFRU_MAGIC 0xEE + +#define PFRU_IOC_SET_REV _IOW(PFRU_MAGIC, 0x01, unsigned int) +#define PFRU_IOC_STAGE _IOW(PFRU_MAGIC, 0x02, unsigned int) +#define PFRU_IOC_ACTIVATE _IOW(PFRU_MAGIC, 0x03, unsigned int) +#define PFRU_IOC_STAGE_ACTIVATE _IOW(PFRU_MAGIC, 0x04, unsigned int) + +static inline int valid_revid(int id) +{ + return (id == REVID_1) || (id == REVID_2); +} + +/* Capsule file payload header */ +struct payload_hdr { + __u32 sig; + __u32 hdr_version; + __u32 hdr_size; + __u32 hw_ver; + __u32 rt_ver; + guid_t platform_id; +}; + +enum start_action { + START_STAGE, + START_ACTIVATE, + START_STAGE_ACTIVATE, +}; + +enum dsm_status { + DSM_SUCCEED, + DSM_FUNC_NOT_SUPPORT, + DSM_INVAL_INPUT, + DSM_HARDWARE_ERR, + DSM_RETRY_SUGGESTED, + DSM_UNKNOWN, + DSM_FUNC_SPEC_ERR, +}; + +struct update_cap_info { + enum dsm_status status; + int update_cap; + + guid_t code_type; + int fw_version; + int code_rt_version; + + guid_t drv_type; + int drv_rt_version; + int drv_svn; + + guid_t platform_id; + guid_t oem_id; + + char oem_info[]; +}; + +struct com_buf_info { + enum dsm_status status; + enum dsm_status ext_status; + unsigned long addr_lo; + unsigned long addr_hi; + int buf_size; +}; + +struct capsulate_buf_info { + unsigned long src; + int size; +}; + +struct updated_result { + enum dsm_status status; + enum dsm_status ext_status; + unsigned long low_auth_time; + unsigned long high_auth_time; + unsigned long low_exec_time; + unsigned long high_exec_time; +}; + +#define PFRU_TELEMETRY_UUID "75191659-8178-4D9D-B88F-AC5E5E93E8BF" + +/* Telemetry structures. */ +struct telem_data_info { + enum dsm_status status; + enum dsm_status ext_status; + /* Maximum supported size of data of + * all Data Chunks combined. 
+ */ + unsigned long chunk1_addr_lo; + unsigned long chunk1_addr_hi; + unsigned long chunk2_addr_lo; + unsigned long chunk2_addr_hi; + int max_data_size; + int chunk1_size; + int chunk2_size; + int rollover_cnt; + int reset_cnt; +}; + +struct telem_info { + int log_level; + int log_type; + int log_revid; +}; + +/* Two logs: history and execution log */ +#define LOG_EXEC_IDX 0 +#define LOG_HISTORY_IDX 1 +#define NR_LOG_TYPE 2 + +#define LOG_ERR 0 +#define LOG_WARN 1 +#define LOG_INFO 2 +#define LOG_VERB 4 + +#define FUNC_SET_LEV 1 +#define FUNC_GET_LEV 2 +#define FUNC_GET_DATA 3 + +#define LOG_NAME_SIZE 10 + +#define PFRU_LOG_IOC_SET_INFO _IOW(PFRU_MAGIC, 0x05, struct telem_info) +#define PFRU_LOG_IOC_GET_INFO _IOR(PFRU_MAGIC, 0x06, struct telem_info) +#define PFRU_LOG_IOC_GET_DATA_INFO _IOR(PFRU_MAGIC, 0x07, struct telem_data_info) + +#endif /* __PFRU_H__ */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index cabc93118f9c8faadb96c31caf292c8a57af7df3..933df045b8c0d12211fb9d5a62343f5cb8d3f391 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -14,6 +14,7 @@ #include #include +#include #define VFIO_API_VERSION 0 @@ -46,6 +47,12 @@ */ #define VFIO_NOIOMMU_IOMMU 8 +/* Supports VFIO_DMA_UNMAP_FLAG_ALL */ +#define VFIO_UNMAP_ALL 9 + +/* Supports the vaddr flag for DMA map and unmap */ +#define VFIO_UPDATE_VADDR 10 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -201,8 +208,10 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ +#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ + __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) @@ -218,6 +227,15 @@ struct vfio_device_info { #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw" #define VFIO_DEVICE_API_AP_STRING "vfio-ap" +/* + * The following capabilities are unique to s390 zPCI devices. Their contents + * are further-defined in vfio_zdev.h + */ +#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1 +#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2 +#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3 +#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4 + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) @@ -237,6 +255,7 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ +#define VFIO_REGION_INFO_FLAG_DYNAMIC_TRAP (1 << 4) /* Region supports dynamic trap/untrap */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ __u64 size; /* Region size (bytes) */ @@ -305,6 +324,8 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) #define VFIO_REGION_TYPE_GFX (1) #define VFIO_REGION_TYPE_CCW (2) +#define VFIO_REGION_TYPE_MIGRATION (3) +#define VFIO_REGION_TYPE_NESTED (4) /* sub-types for VFIO_REGION_TYPE_PCI_* */ @@ -314,17 +335,10 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) /* 10de vendor PCI sub-types */ -/* - * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space. 
- */ -#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1) +/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */ /* 1014 vendor PCI sub-types */ -/* - * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU - * to do TLB invalidation on a GPU. - */ -#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) +/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */ /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) @@ -379,6 +393,236 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) +/* sub-types for VFIO_REGION_TYPE_MIGRATION */ +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +/* + * The structure vfio_device_migration_info is placed at the 0th offset of + * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related + * migration information. Field accesses from this structure are only supported + * at their native width and alignment. Otherwise, the result is undefined and + * vendor drivers should return an error. + * + * device_state: (read/write) + * - The user application writes to this field to inform the vendor driver + * about the device state to be transitioned to. + * - The vendor driver should take the necessary actions to change the + * device state. After successful transition to a given state, the + * vendor driver should return success on write(device_state, state) + * system call. If the device state transition fails, the vendor driver + * should return an appropriate -errno for the fault condition. + * - On the user application side, if the device state transition fails, + * that is, if write(device_state, state) returns an error, read + * device_state again to determine the current state of the device from + * the vendor driver. + * - The vendor driver should return previous state of the device unless + * the vendor driver has encountered an internal error, in which case + * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR. + * - The user application must use the device reset ioctl to recover the + * device from VFIO_DEVICE_STATE_ERROR state. If the device is + * indicated to be in a valid device state by reading device_state, the + * user application may attempt to transition the device to any valid + * state reachable from the current state or terminate itself. + * + * device_state consists of 3 bits: + * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear, + * it indicates the _STOP state. When the device state is changed to + * _STOP, driver should stop the device before write() returns. + * - If bit 1 is set, it indicates the _SAVING state, which means that the + * driver should start gathering device state information that will be + * provided to the VFIO user application to save the device's state. + * - If bit 2 is set, it indicates the _RESUMING state, which means that + * the driver should prepare to resume the device. Data provided through + * the migration region should be used to resume the device. + * Bits 3 - 31 are reserved for future use. To preserve them, the user + * application should perform a read-modify-write operation on this + * field when modifying the specified bits. 
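A sketch of such a read-modify-write on device_state, entering pre-copy by setting _SAVING while leaving _RUNNING intact; region_off is assumed to come from VFIO_DEVICE_GET_REGION_INFO:

#include <stddef.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/vfio.h>

static int start_precopy(int device_fd, off_t region_off)
{
	__u32 state;
	off_t off = region_off +
		    offsetof(struct vfio_device_migration_info, device_state);

	if (pread(device_fd, &state, sizeof(state), off) != sizeof(state))
		return -1;
	state |= VFIO_DEVICE_STATE_SAVING;	/* keep _RUNNING: pre-copy */
	if (pwrite(device_fd, &state, sizeof(state), off) != sizeof(state))
		return -1;
	return 0;
}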
+ * + * +------- _RESUMING + * |+------ _SAVING + * ||+----- _RUNNING + * ||| + * 000b => Device Stopped, not saving or resuming + * 001b => Device running, which is the default state + * 010b => Stop the device & save the device state, stop-and-copy state + * 011b => Device running and save the device state, pre-copy state + * 100b => Device stopped and the device state is resuming + * 101b => Invalid state + * 110b => Error state + * 111b => Invalid state + * + * State transitions: + * + * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP + * (100b) (001b) (011b) (010b) (000b) + * 0. Running or default state + * | + * + * 1. Normal Shutdown (optional) + * |------------------------------------->| + * + * 2. Save the state or suspend + * |------------------------->|---------->| + * + * 3. Save the state during live migration + * |----------->|------------>|---------->| + * + * 4. Resuming + * |<---------| + * + * 5. Resumed + * |--------->| + * + * 0. Default state of VFIO device is _RUNNING when the user application starts. + * 1. During normal shutdown of the user application, the user application may + * optionally change the VFIO device state from _RUNNING to _STOP. This + * transition is optional. The vendor driver must support this transition but + * must not require it. + * 2. When the user application saves state or suspends the application, the + * device state transitions from _RUNNING to stop-and-copy and then to _STOP. + * On state transition from _RUNNING to stop-and-copy, driver must stop the + * device, save the device state and send it to the application through the + * migration region. The sequence to be followed for such transition is given + * below. + * 3. In live migration of user application, the state transitions from _RUNNING + * to pre-copy, to stop-and-copy, and to _STOP. + * On state transition from _RUNNING to pre-copy, the driver should start + * gathering the device state while the application is still running and send + * the device state data to application through the migration region. + * On state transition from pre-copy to stop-and-copy, the driver must stop + * the device, save the device state and send it to the user application + * through the migration region. + * Vendor drivers must support the pre-copy state even for implementations + * where no data is provided to the user before the stop-and-copy state. The + * user must not be required to consume all migration data before the device + * transitions to a new state, including the stop-and-copy state. + * The sequence to be followed for above two transitions is given below. + * 4. To start the resuming phase, the device state should be transitioned from + * the _RUNNING to the _RESUMING state. + * In the _RESUMING state, the driver should use the device state data + * received through the migration region to resume the device. + * 5. After providing saved device data to the driver, the application should + * change the state from _RESUMING to _RUNNING. + * + * reserved: + * Reads on this field return zero and writes are ignored. + * + * pending_bytes: (read only) + * The number of pending bytes still to be migrated from the vendor driver. + * + * data_offset: (read only) + * The user application should read data_offset field from the migration + * region. The user application should read the device data from this + * offset within the migration region during the _SAVING state or write + * the device data during the _RESUMING state. See below for details of + * sequence to be followed. 
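A condensed sketch of one _SAVING iteration over pending_bytes, data_offset and data_size as described here; error handling is elided and region_off is as above:

#include <stddef.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/vfio.h>

#define MIG_OFF(f) offsetof(struct vfio_device_migration_info, f)

/* Returns 1 if data was transferred; caller re-reads pending_bytes. */
static int save_one_iteration(int fd, off_t region_off)
{
	__u64 pending, data_offset, data_size;
	void *buf;

	pread(fd, &pending, sizeof(pending), region_off + MIG_OFF(pending_bytes));
	if (!pending)
		return 0;		/* no more device data this round */
	pread(fd, &data_offset, sizeof(data_offset), region_off + MIG_OFF(data_offset));
	pread(fd, &data_size, sizeof(data_size), region_off + MIG_OFF(data_size));

	buf = malloc(data_size);
	pread(fd, buf, data_size, region_off + data_offset);
	/* ... push buf/data_size into the migration stream ... */
	free(buf);
	return 1;
}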
+ * + * data_size: (read/write) + * The user application should read data_size to get the size in bytes of + * the data copied in the migration region during the _SAVING state and + * write the size in bytes of the data copied in the migration region + * during the _RESUMING state. + * + * The format of the migration region is as follows: + * ------------------------------------------------------------------ + * |vfio_device_migration_info| data section | + * | | /////////////////////////////// | + * ------------------------------------------------------------------ + * ^ ^ + * offset 0-trapped part data_offset + * + * The structure vfio_device_migration_info is always followed by the data + * section in the region, so data_offset will always be nonzero. The offset + * from where the data is copied is decided by the kernel driver. The data + * section can be trapped, mmapped, or partitioned, depending on how the kernel + * driver defines the data section. The data section partition can be defined + * as mapped by the sparse mmap capability. If mmapped, data_offset must be + * page aligned, whereas initial section which contains the + * vfio_device_migration_info structure, might not end at the offset, which is + * page aligned. The user is not required to access through mmap regardless + * of the capabilities of the region mmap. + * The vendor driver should determine whether and how to partition the data + * section. The vendor driver should return data_offset accordingly. + * + * The sequence to be followed while in pre-copy state and stop-and-copy state + * is as follows: + * a. Read pending_bytes, indicating the start of a new iteration to get device + * data. Repeated read on pending_bytes at this stage should have no side + * effects. + * If pending_bytes == 0, the user application should not iterate to get data + * for that device. + * If pending_bytes > 0, perform the following steps. + * b. Read data_offset, indicating that the vendor driver should make data + * available through the data section. The vendor driver should return this + * read operation only after data is available from (region + data_offset) + * to (region + data_offset + data_size). + * c. Read data_size, which is the amount of data in bytes available through + * the migration region. + * Read on data_offset and data_size should return the offset and size of + * the current buffer if the user application reads data_offset and + * data_size more than once here. + * d. Read data_size bytes of data from (region + data_offset) from the + * migration region. + * e. Process the data. + * f. Read pending_bytes, which indicates that the data from the previous + * iteration has been read. If pending_bytes > 0, go to step b. + * + * The user application can transition from the _SAVING|_RUNNING + * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the + * number of pending bytes. The user application should iterate in _SAVING + * (stop-and-copy) until pending_bytes is 0. + * + * The sequence to be followed while _RESUMING device state is as follows: + * While data for this device is available, repeat the following steps: + * a. Read data_offset from where the user application should write data. + * b. Write migration data starting at the migration region + data_offset for + * the length determined by data_size from the migration source. + * c. Write data_size, which indicates to the vendor driver that data is + * written in the migration region. 
The vendor driver must complete this write + * operation only after consuming the data. The vendor driver should apply the + * user-provided migration region data to the device resume state. + * + * If an error occurs during the above sequences, the vendor driver can return + * an error code for the next read() or write() operation, which will terminate the + * loop. The user application should then take the next necessary action, for + * example, failing migration or terminating the user application. + * + * For the user application, data is opaque. The user application should write + * data in the same order as the data is received and the data should be of the + * same transaction size as at the source. + */ + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? \ + (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) + +#define VFIO_DEVICE_STATE_IS_ERROR(state) \ + ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING)) + +#define VFIO_DEVICE_STATE_SET_ERROR(state) \ + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + + __u32 reserved; + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + +/* sub-types for VFIO_REGION_TYPE_NESTED */ +#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT (1) + /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped * which allows direct access to non-MSIX registers which happened to be within @@ -389,31 +633,39 @@ struct vfio_region_gfx_edid { */ #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 +/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */ + +/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */ + /* - * Capability with compressed real address (aka SSA - small system address) - * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing - * and by the userspace to associate a NVLink bridge with a GPU. + * Capability exposed by the DMA fault region + * @version: ABI version */ -#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT 4 +#define VFIO_REGION_INFO_CAP_DMA_FAULT 6 -struct vfio_region_info_cap_nvlink2_ssatgt { +struct vfio_region_info_cap_fault { struct vfio_info_cap_header header; - __u64 tgt; + __u32 version; }; /* - * Capability with an NVLink link speed. The value is read by - * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed" - * property in the device tree. The value is fixed in the hardware - * and failing to provide the correct value results in the link - * not working with no indication from the driver why.
+ * DMA Fault Region Layout + * @tail: index relative to the start of the ring buffer at which the + * consumer finds the next item in the buffer + * @entry_size: fault ring buffer entry size in bytes + * @nb_entries: max capacity of the fault ring buffer + * @offset: ring buffer offset relative to the start of the region + * @head: index relative to the start of the ring buffer at which the + * producer (kernel) inserts items into the buffers */ -#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5 - -struct vfio_region_info_cap_nvlink2_lnkspd { - struct vfio_info_cap_header header; - __u32 link_speed; - __u32 __pad; +struct vfio_region_dma_fault { + /* Write-Only */ + __u32 tail; + /* Read-Only */ + __u32 entry_size; + __u32 nb_entries; + __u32 offset; + __u32 head; }; /** @@ -455,11 +707,30 @@ struct vfio_irq_info { #define VFIO_IRQ_INFO_MASKABLE (1 << 1) #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) #define VFIO_IRQ_INFO_NORESIZE (1 << 3) +#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */ __u32 index; /* IRQ index */ __u32 count; /* Number of IRQs within this index */ + __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) +/* + * The irq type capability allows IRQs unique to a specific device or + * class of devices to be exposed. + * + * The structures below define version 1 of this capability. + */ +#define VFIO_IRQ_INFO_CAP_TYPE 3 + +struct vfio_irq_info_cap_type { + struct vfio_info_cap_header header; + __u32 type; /* global per bus driver */ + __u32 subtype; /* type specific */ +}; + +#define VFIO_IRQ_TYPE_NESTED (1) +#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1) + /** * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) * @@ -561,7 +832,8 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, - VFIO_PCI_NUM_IRQS + VFIO_PCI_NUM_IRQS = 5 /* Fixed user ABI, IRQ indexes >=5 use */ + /* device specific cap to define content */ }; /* @@ -707,6 +979,43 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_DEVICE_FEATURE - _IORW(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_device_feature) + * + * Get, set, or probe feature data of the device. The feature is selected + * using the FEATURE_MASK portion of the flags field. Support for a feature + * can be probed by setting both the FEATURE_MASK and PROBE bits. A probe + * may optionally include the GET and/or SET bits to determine read vs write + * access of the feature respectively. Probing a feature will return success + * if the feature is supported and all of the optionally indicated GET/SET + * methods are supported. The format of the data portion of the structure is + * specific to the given feature. The data portion is not required for + * probing. GET and SET are mutually exclusive, except for use with PROBE. + * + * Return 0 on success, -errno on failure. + */ +struct vfio_device_feature { + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_FEATURE_MASK (0xffff) /* 16-bit feature index */ +#define VFIO_DEVICE_FEATURE_GET (1 << 16) /* Get feature into data[] */ +#define VFIO_DEVICE_FEATURE_SET (1 << 17) /* Set feature from data[] */ +#define VFIO_DEVICE_FEATURE_PROBE (1 << 18) /* Probe feature support */ + __u8 data[]; +}; + +#define VFIO_DEVICE_FEATURE _IO(VFIO_TYPE, VFIO_BASE + 17) + +/* + * Provide support for setting a PCI VF Token, which is used as a shared + * secret between PF and VF drivers. 
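A sketch of setting the token from user space with the VFIO_DEVICE_FEATURE ioctl defined above; the 16-byte UUID is caller-provided:

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int set_vf_token(int device_fd, const __u8 uuid[16])
{
	struct vfio_device_feature *f;
	int ret;

	f = malloc(sizeof(*f) + 16);	/* header + 16-byte UUID payload */
	f->argsz = sizeof(*f) + 16;
	f->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_PCI_VF_TOKEN;
	memcpy(f->data, uuid, 16);
	ret = ioctl(device_fd, VFIO_DEVICE_FEATURE, f);
	free(f);
	return ret;
}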
This feature may only be set on a + * PCI SR-IOV PF when SR-IOV is enabled on the PF and there are no existing + * open VFs. Data provided when setting this feature is a 16-byte array + * (__u8 b[16]), representing a UUID. + */ +#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0) + /* -------- API for Type1 VFIO IOMMU -------- */ /** @@ -763,6 +1072,47 @@ struct vfio_iommu_type1_info_dma_avail { __u32 avail; }; +/* + * The migration capability allows to report supported features for migration. + * + * The structures below define version 1 of this capability. + * + * The existence of this capability indicates that IOMMU kernel driver supports + * dirty page logging. + * + * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty + * page logging. + * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap + * size in bytes that can be used by user applications when getting the dirty + * bitmap. + */ +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 2 + +struct vfio_iommu_type1_info_cap_migration { + struct vfio_info_cap_header header; + __u32 flags; + __u64 pgsize_bitmap; + __u64 max_dirty_bitmap_size; /* in bytes */ +}; + +/* + * The nesting capability allows to report the related capability + * and info for nesting iommu type. + * + * The structures below define version 1 of this capability. + * + * Nested capabilities should be checked by the userspace after + * setting VFIO_TYPE1_NESTING_IOMMU. + * + * @info: the nesting info provided by IOMMU driver. + */ +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING 4 + +struct vfio_iommu_type1_info_cap_nesting { + struct vfio_info_cap_header header; + struct iommu_nesting_info info; +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** @@ -770,12 +1120,22 @@ struct vfio_iommu_type1_info_dma_avail { * * Map process virtual addresses to IO virtual addresses using the * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required. + * + * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and + * unblock translation of host virtual addresses in the iova range. The vaddr + * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR. To + * maintain memory consistency within the user application, the updated vaddr + * must address the same memory object as originally mapped. Failure to do so + * will result in user memory corruption and/or device misbehavior. iova and + * size must match those in the original MAP_DMA call. Protection is not + * changed, and the READ & WRITE flags must be 0. */ struct vfio_iommu_type1_dma_map { __u32 argsz; __u32 flags; #define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */ #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */ +#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2) __u64 vaddr; /* Process virtual address */ __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ @@ -783,6 +1143,12 @@ struct vfio_iommu_type1_dma_map { #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) +struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ + __u64 __user *data; /* one bit per page */ +}; + /** * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, * struct vfio_dma_unmap) @@ -792,12 +1158,34 @@ struct vfio_iommu_type1_dma_map { * field. No guarantee is made to the user that arbitrary unmaps of iova * or size different from those used in the original mapping call will * succeed. 
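Using the flags documented below, a minimal sketch of dropping every mapping in a container via VFIO_DMA_UNMAP_FLAG_ALL:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int unmap_all(int container_fd)
{
	struct vfio_iommu_type1_dma_unmap unmap;

	memset(&unmap, 0, sizeof(unmap));
	unmap.argsz = sizeof(unmap);
	unmap.flags = VFIO_DMA_UNMAP_FLAG_ALL;	/* iova and size must stay 0 */
	return ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
}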
+ * + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap + * before unmapping IO virtual addresses. When this flag is set, the user must + * provide a struct vfio_bitmap in data[]. User must provide zero-allocated + * memory via vfio_bitmap.data and its size in the vfio_bitmap.size field. + * A bit in the bitmap represents one page, of user provided page size in + * vfio_bitmap.pgsize field, consecutively starting from iova offset. Bit set + * indicates that the page at that offset from iova is dirty. A Bitmap of the + * pages in the range of unmapped size is returned in the user-provided + * vfio_bitmap.data. + * + * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses. iova and size + * must be 0. This cannot be combined with the get-dirty-bitmap flag. + * + * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host + * virtual addresses in the iova range. Tasks that attempt to translate an + * iova's vaddr will block. DMA to already-mapped pages continues. This + * cannot be combined with the get-dirty-bitmap flag. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; __u32 flags; +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) +#define VFIO_DMA_UNMAP_FLAG_ALL (1 << 1) +#define VFIO_DMA_UNMAP_FLAG_VADDR (1 << 2) __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ + __u8 data[]; }; #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) @@ -809,6 +1197,98 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_type1_dirty_bitmap) + * IOCTL is used for dirty pages logging. + * Caller should set flag depending on which operation to perform, details as + * below: + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs + * the IOMMU driver to log pages that are dirtied or potentially dirtied by + * the device; designed to be used when a migration is in progress. Dirty pages + * are logged until logging is disabled by user application by calling the IOCTL + * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs + * the IOMMU driver to stop logging dirtied pages. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set + * returns the dirty pages bitmap for IOMMU container for a given IOVA range. + * The user must specify the IOVA range and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports getting a bitmap of the smallest supported pgsize only and can be + * modified in future to get a bitmap of any specified supported pgsize. The + * user must provide a zeroed memory area for the bitmap memory and specify its + * size in bitmap.size. One bit is used to represent one page consecutively + * starting from iova offset. The user should provide page size in bitmap.pgsize + * field. A bit set in the bitmap indicates that the page at that offset from + * iova is dirty. The caller must set argsz to a value including the size of + * structure vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the + * actual bitmap. If dirty pages logging is not enabled, an error will be + * returned. + * + * Only one of the flags _START, _STOP and _GET may be specified at a time. 
+ * + */ +struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) + __u8 data[]; +}; + +struct vfio_iommu_type1_dirty_bitmap_get { + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of iova range */ + struct vfio_bitmap bitmap; +}; + +#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) + +/** + * VFIO_IOMMU_NESTING_OP - _IOW(VFIO_TYPE, VFIO_BASE + 18, + * struct vfio_iommu_type1_nesting_op) + * + * This interface allows userspace to utilize the nesting IOMMU + * capabilities as reported in VFIO_IOMMU_TYPE1_INFO_CAP_NESTING + * cap through VFIO_IOMMU_GET_INFO. For platforms which require + * system wide PASID, PASID will be allocated by VFIO_IOMMU_PASID + * _REQUEST. + * + * @data[] types defined for each op: + * +=================+===============================================+ + * | NESTING OP | @data[] | + * +=================+===============================================+ + * | BIND_PGTBL | struct iommu_gpasid_bind_data | + * +-----------------+-----------------------------------------------+ + * | UNBIND_PGTBL | struct iommu_gpasid_bind_data | + * +-----------------+-----------------------------------------------+ + * | CACHE_INVLD | struct iommu_cache_invalidate_info | + * +-----------------+-----------------------------------------------+ + * | PAGE_RESP | struct iommu_page_response | + * +-----------------+-----------------------------------------------+ + * + * returns: 0 on success, -errno on failure. + */ +struct vfio_iommu_type1_nesting_op { + __u32 argsz; + __u32 flags; +#define VFIO_NESTING_OP_MASK (0xffff) /* lower 16-bits for op */ + __u8 data[]; +}; + +enum { + VFIO_IOMMU_NESTING_OP_BIND_PGTBL, + VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL, + VFIO_IOMMU_NESTING_OP_CACHE_INVLD, + VFIO_IOMMU_NESTING_OP_PAGE_RESP, +}; + +#define VFIO_IOMMU_NESTING_OP _IO(VFIO_TYPE, VFIO_BASE + 18) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /* diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h new file mode 100644 index 0000000000000000000000000000000000000000..b4309397b6b273bb66e80cc53da769625cec939a --- /dev/null +++ b/include/uapi/linux/vfio_zdev.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * VFIO Region definitions for ZPCI devices + * + * Copyright IBM Corp. 2020 + * + * Author(s): Pierre Morel + * Matthew Rosato + */ + +#ifndef _VFIO_ZDEV_H_ +#define _VFIO_ZDEV_H_ + +#include +#include + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_BASE - Base PCI Function information + * + * This capability provides a set of descriptive information about the + * associated PCI function. + */ +struct vfio_device_info_cap_zpci_base { + struct vfio_info_cap_header header; + __u64 start_dma; /* Start of available DMA addresses */ + __u64 end_dma; /* End of available DMA addresses */ + __u16 pchid; /* Physical Channel ID */ + __u16 vfn; /* Virtual function number */ + __u16 fmb_length; /* Measurement Block Length (in bytes) */ + __u8 pft; /* PCI Function Type */ + __u8 gid; /* PCI function group ID */ +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_GROUP - Base PCI Function Group information + * + * This capability provides a set of descriptive information about the group of + * PCI functions that the associated device belongs to. 
+ */
+struct vfio_device_info_cap_zpci_group {
+	struct vfio_info_cap_header header;
+	__u64 dasm;		/* DMA Address space mask */
+	__u64 msi_addr;		/* MSI address */
+	__u64 flags;
+#define VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH 1 /* Program-specified TLB refresh */
+	__u16 mui;		/* Measurement Block Update Interval */
+	__u16 noi;		/* Maximum number of MSIs */
+	__u16 maxstbl;		/* Maximum Store Block Length */
+	__u8 version;		/* Supported PCI Version */
+};
+
+/**
+ * VFIO_DEVICE_INFO_CAP_ZPCI_UTIL - Utility String
+ *
+ * This capability provides the utility string for the associated device, which
+ * is a device identifier string made up of EBCDIC characters. 'size' specifies
+ * the length of 'util_str'.
+ */
+struct vfio_device_info_cap_zpci_util {
+	struct vfio_info_cap_header header;
+	__u32 size;
+	__u8 util_str[];
+};
+
+/**
+ * VFIO_DEVICE_INFO_CAP_ZPCI_PFIP - PCI Function Path
+ *
+ * This capability provides the PCI function path string, which is an identifier
+ * that describes the internal hardware path of the device. 'size' specifies
+ * the length of 'pfip'.
+ */
+struct vfio_device_info_cap_zpci_pfip {
+	struct vfio_info_cap_header header;
+	__u32 size;
+	__u8 pfip[];
+};
+
+#endif
diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h
index d71380f6ed0b2c7570f37957baa24d453d825006..ffc0d3902b71735fb199bc69397f5514de1d088f 100644
--- a/include/xen/swiotlb-xen.h
+++ b/include/xen/swiotlb-xen.h
@@ -4,10 +4,10 @@
 #include <linux/swiotlb.h>
-void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle,
-		phys_addr_t paddr, size_t size, enum dma_data_direction dir);
-void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle,
-		phys_addr_t paddr, size_t size, enum dma_data_direction dir);
+void xen_dma_sync_for_cpu(dma_addr_t handle, phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir);
+void xen_dma_sync_for_device(dma_addr_t handle, phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir);
 extern int xen_swiotlb_init(int verbose, bool early);
 extern const struct dma_map_ops xen_swiotlb_dma_ops;
diff --git a/init/Kconfig b/init/Kconfig
index 0babb0a34952109dde9d90431fcea9a286fc942a..2b14dada83016730ba8f81c88c0a405011225028 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -936,6 +936,13 @@ config CGROUP_PIDS
	  since the PIDs limit only affects a process's ability to fork, not
	  to attach to a cgroup.
+config CGROUP_IOASIDS
+	bool "IOASIDs controller"
+	depends on IOASID
+	help
+	  Provides enforcement of IO Address Space ID limits in the scope of a
+	  cgroup.
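As a usage illustration for this controller (a hedged sketch, not part of the patch): on the unified hierarchy the new subsystem exposes ioasids.max, ioasids.current and ioasids.events per the cftype table added below; the mount path and helper name here are assumptions for illustration.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Cap a cgroup's IOASID/PASID usage by writing its ioasids.max file. */
static int set_ioasid_limit(const char *cgroup_dir, const char *limit)
{
	char path[256];
	int fd;
	ssize_t n;

	snprintf(path, sizeof(path), "%s/ioasids.max", cgroup_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	n = write(fd, limit, strlen(limit));	/* e.g. "8" or "max" */
	close(fd);
	return n < 0 ? -1 : 0;
}

/* e.g. set_ioasid_limit("/sys/fs/cgroup/vfio-jobs", "8"); */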
+ config CGROUP_RDMA bool "RDMA controller" help diff --git a/kernel/Makefile b/kernel/Makefile index cd6db723bc1ecf5e30ee6145215c90216140b950..1f2f8bda63bccb19395da4c80db9ba9380b010e7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o ucount.o + async.o range.o smpboot.o ucount.o regset.o obj-y += tkernel/ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index cc3765e92885e21e18b93cff4bc056b256d165bf..b3720c2e4bd1343f5a44601f4d4e65f628acb8c7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -3,6 +3,7 @@ obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o +obj-$(CONFIG_CGROUP_IOASIDS) += ioasids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/ioasids.c b/kernel/cgroup/ioasids.c new file mode 100644 index 0000000000000000000000000000000000000000..ac43813da6adb19a0b881903347c678f496696f3 --- /dev/null +++ b/kernel/cgroup/ioasids.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * IO Address Space ID limiting controller for cgroups. + * + */ +#define pr_fmt(fmt) "ioasids_cg: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IOASIDS_MAX_STR "max" +static DEFINE_MUTEX(ioasids_cg_lock); + +struct ioasids_cgroup { + struct cgroup_subsys_state css; + atomic64_t counter; + atomic64_t limit; + struct cgroup_file events_file; + /* Number of times allocations failed because limit was hit. */ + atomic64_t events_limit; +}; + +static struct ioasids_cgroup *css_ioasids(struct cgroup_subsys_state *css) +{ + return container_of(css, struct ioasids_cgroup, css); +} + +static struct ioasids_cgroup *parent_ioasids(struct ioasids_cgroup *ioasids) +{ + return css_ioasids(ioasids->css.parent); +} + +static struct cgroup_subsys_state * +ioasids_css_alloc(struct cgroup_subsys_state *parent) +{ + struct ioasids_cgroup *ioasids; + + ioasids = kzalloc(sizeof(struct ioasids_cgroup), GFP_KERNEL); + if (!ioasids) + return ERR_PTR(-ENOMEM); + + atomic64_set(&ioasids->counter, 0); + atomic64_set(&ioasids->limit, 0); + atomic64_set(&ioasids->events_limit, 0); + return &ioasids->css; +} + +static void ioasids_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_ioasids(css)); +} + +/** + * ioasids_cancel - uncharge the local IOASID count + * @ioasids: the ioasid cgroup state + * @num: the number of ioasids to cancel + * + */ +static void ioasids_cancel(struct ioasids_cgroup *ioasids, int num) +{ + WARN_ON_ONCE(atomic64_add_negative(-num, &ioasids->counter)); +} + +/** + * ioasids_uncharge - hierarchically uncharge the ioasid count + * @ioasids: the ioasid cgroup state + * @num: the number of ioasids to uncharge + */ +static void ioasids_uncharge(struct ioasids_cgroup *ioasids, int num) +{ + struct ioasids_cgroup *p; + + for (p = ioasids; parent_ioasids(p); p = parent_ioasids(p)) + ioasids_cancel(p, num); +} + +/** + * ioasids_charge - hierarchically charge the ioasid count + * @ioasids: the ioasid cgroup state + * @num: the number of ioasids to charge + */ +static void ioasids_charge(struct ioasids_cgroup *ioasids, int num) +{ + struct ioasids_cgroup *p; + + for (p = ioasids; parent_ioasids(p); p = parent_ioasids(p)) + atomic64_add(num, &p->counter); +} + +/** + * 
ioasids_try_charge - hierarchically try to charge the ioasid count
+ * @ioasids: the ioasid cgroup state
+ * @num: the number of ioasids to charge
+ */
+static int ioasids_try_charge(struct ioasids_cgroup *ioasids, int num)
+{
+	struct ioasids_cgroup *p, *q;
+
+	for (p = ioasids; parent_ioasids(p); p = parent_ioasids(p)) {
+		int64_t new = atomic64_add_return(num, &p->counter);
+		int64_t limit = atomic64_read(&p->limit);
+
+		if (new > limit)
+			goto revert;
+	}
+
+	return 0;
+
+revert:
+	for (q = ioasids; q != p; q = parent_ioasids(q))
+		ioasids_cancel(q, num);
+	ioasids_cancel(p, num);
+	cgroup_file_notify(&ioasids->events_file);
+
+	return -EAGAIN;
+}
+
+
+/**
+ * ioasid_cg_charge - Check and charge IOASIDs cgroup
+ *
+ * @set: IOASID set used for allocation
+ *
+ * The IOASID quota is managed per cgroup; all process-based allocations
+ * must be validated against the cgroup hierarchy.
+ * Return 0 if a single IOASID can be allocated, or an error if one of the
+ * checks fails.
+ */
+int ioasid_cg_charge(struct ioasid_set *set)
+{
+	struct cgroup_subsys_state *css;
+	struct ioasids_cgroup *ioasids;
+	struct mm_struct *mm;
+	int ret = 0;
+
+	/* We only charge user process allocated PASIDs */
+	if (set->type != IOASID_SET_TYPE_MM)
+		return ret;
+	/* Must be called with a valid mm, not during process exit */
+	mm = get_task_mm(current);
+	if (!mm)
+		return -EINVAL;
+	if (set->token != mm) {
+		pr_err("No permission to allocate IOASID\n");
+		ret = -EPERM;
+		goto exit_drop;
+	}
+	rcu_read_lock();
+	css = task_css(current, ioasids_cgrp_id);
+	ioasids = css_ioasids(css);
+	rcu_read_unlock();
+	ret = ioasids_try_charge(ioasids, 1);
+	if (ret)
+		pr_warn("%s: Unable to charge IOASID %d\n", __func__, ret);
+exit_drop:
+	mmput_async(mm);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_cg_charge);
+
+/* Uncharge IOASIDs cgroup after freeing an IOASID */
+void ioasid_cg_uncharge(struct ioasid_set *set)
+{
+	struct cgroup_subsys_state *css;
+	struct ioasids_cgroup *ioasids;
+	struct mm_struct *mm;
+
+	/* We only charge user process allocated PASIDs */
+	if (set->type != IOASID_SET_TYPE_MM)
+		return;
+	mm = set->token;
+	if (!mmget_not_zero(mm)) {
+		pr_err("MM defunct! Cannot uncharge IOASID\n");
+		return;
+	}
+	rcu_read_lock();
+	css = task_css(current, ioasids_cgrp_id);
+	ioasids = css_ioasids(css);
+	rcu_read_unlock();
+	ioasids_uncharge(ioasids, 1);
+	mmput_async(mm);
+}
+EXPORT_SYMBOL_GPL(ioasid_cg_uncharge);
+
+static int ioasids_can_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *dst_css;
+	struct ioasid_set *set;
+	struct task_struct *leader;
+
+	/*
+	 * IOASIDs are managed at the per-process level; we only support domain
+	 * mode in the task management model. Loop through all processes by each
+	 * thread leader, charge the leader's css.
+ */ + cgroup_taskset_for_each_leader(leader, dst_css, tset) { + struct ioasids_cgroup *ioasids = css_ioasids(dst_css); + struct cgroup_subsys_state *old_css; + struct ioasids_cgroup *old_ioasids; + struct mm_struct *mm = get_task_mm(leader); + + set = ioasid_find_mm_set(mm); + mmput(mm); + if (!set) + continue; + + old_css = task_css(leader, ioasids_cgrp_id); + old_ioasids = css_ioasids(old_css); + + ioasids_charge(ioasids, atomic_read(&set->nr_ioasids)); + ioasids_uncharge(old_ioasids, atomic_read(&set->nr_ioasids)); + } + + return 0; +} + +static void ioasids_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *dst_css; + struct task_struct *task; + + cgroup_taskset_for_each(task, dst_css, tset) { + struct ioasids_cgroup *ioasids = css_ioasids(dst_css); + struct cgroup_subsys_state *old_css; + struct ioasids_cgroup *old_ioasids; + + old_css = task_css(task, ioasids_cgrp_id); + old_ioasids = css_ioasids(old_css); + + ioasids_charge(old_ioasids, 1); + ioasids_uncharge(ioasids, 1); + } +} + +static ssize_t ioasids_max_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct ioasids_cgroup *ioasids = css_ioasids(css); + int64_t limit, limit_cur; + int err; + + mutex_lock(&ioasids_cg_lock); + /* Check whether we are growing or shrinking */ + limit_cur = atomic64_read(&ioasids->limit); + buf = strstrip(buf); + if (!strcmp(buf, IOASIDS_MAX_STR)) { + /* Returns how many IOASIDs was in the pool */ + limit = ioasid_reserve_capacity(0); + ioasid_reserve_capacity(limit - limit_cur); + goto set_limit; + } + err = kstrtoll(buf, 0, &limit); + if (err) + goto done_unlock; + + err = nbytes; + /* Check whether we are growing or shrinking */ + limit_cur = atomic64_read(&ioasids->limit); + if (limit < 0 || limit == limit_cur) { + err = -EINVAL; + goto done_unlock; + } + if (limit < limit_cur) + err = ioasid_cancel_capacity(limit_cur - limit); + else + err = ioasid_reserve_capacity(limit - limit_cur); + if (err < 0) + goto done_unlock; + +set_limit: + err = nbytes; + atomic64_set(&ioasids->limit, limit); +done_unlock: + mutex_unlock(&ioasids_cg_lock); + return err; +} + +static int ioasids_max_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct ioasids_cgroup *ioasids = css_ioasids(css); + int64_t limit = atomic64_read(&ioasids->limit); + + seq_printf(sf, "%lld\n", limit); + + return 0; +} + +static s64 ioasids_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct ioasids_cgroup *ioasids = css_ioasids(css); + + return atomic64_read(&ioasids->counter); +} + +static int ioasids_events_show(struct seq_file *sf, void *v) +{ + struct ioasids_cgroup *ioasids = css_ioasids(seq_css(sf)); + + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&ioasids->events_limit)); + return 0; +} + +static struct cftype ioasids_files[] = { + { + .name = "max", + .write = ioasids_max_write, + .seq_show = ioasids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = ioasids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "events", + .seq_show = ioasids_events_show, + .file_offset = offsetof(struct ioasids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +struct cgroup_subsys ioasids_cgrp_subsys = { + .css_alloc = ioasids_css_alloc, + .css_free = ioasids_css_free, + .can_attach = ioasids_can_attach, + .cancel_attach = ioasids_cancel_attach, + .legacy_cftypes = ioasids_files, + 
.dfl_cftypes = ioasids_files, + .threaded = false, +}; + diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 73c5c2b8e82459c043fecaec865d45e084db47ac..4cfd623129fc5b2f6a1def04ab2f2f1c5a33ec75 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -5,6 +5,14 @@ config HAS_DMA depends on !NO_DMA default y +# +# IOMMU drivers that can bypass the IOMMU code and optionally use the direct +# mapping fast path should select this option and set the dma_ops_bypass +# flag in struct device where applicable +# +config DMA_OPS_BYPASS + bool + config NEED_SG_DMA_LENGTH bool @@ -51,9 +59,6 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL config ARCH_HAS_DMA_PREP_COHERENT bool -config ARCH_HAS_DMA_COHERENT_TO_PFN - bool - config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool @@ -68,14 +73,28 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +# +# Should be selected if we can mmap non-coherent mappings to userspace. +# The only thing that is really required is a way to set an uncached bit +# in the pagetables +# +config DMA_NONCOHERENT_MMAP + default y if !MMU + bool + config DMA_REMAP depends on MMU - select GENERIC_ALLOCATOR + select DMA_NONCOHERENT_MMAP bool -config DMA_DIRECT_REMAP +config DMA_COHERENT_POOL bool select DMA_REMAP + select GENERIC_ALLOCATOR + +config DMA_DIRECT_REMAP + bool + select DMA_COHERENT_POOL config DMA_CMA bool "DMA Contiguous Memory Allocator" diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index d237cf3dc181295ad3a70516716907366ed49325..370f63344e9cd9de663eb51a7738777968b74c3d 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 69cfb4345388c3f55a9ebef8df15fe9794bc7269..5b2920d94f8338168e4b28e6ff6171c1469c0d69 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -221,8 +221,8 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, * @gfp: Allocation flags. * * This function allocates contiguous memory buffer for specified device. It - * first tries to use device specific contiguous memory area if available or - * the default global one, then tries a fallback allocation of normal pages. + * tries to use device specific contiguous memory area if available, or the + * default global one. 
* * Note that it byapss one-page size of allocations from the global area as * the addresses within one page are always contiguous, so there is no need diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index cb6425e52bf7a339277282483459ed9777dbab0e..b1a9a1eff8dea388fac11e3d531d734729f41adc 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1118,20 +1118,10 @@ static void check_for_stack(struct device *dev, } } -static inline bool overlap(void *addr, unsigned long len, void *start, void *end) -{ - unsigned long a1 = (unsigned long)addr; - unsigned long b1 = a1 + len; - unsigned long a2 = (unsigned long)start; - unsigned long b2 = (unsigned long)end; - - return !(b1 <= a2 || a1 >= b2); -} - static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) { - if (overlap(addr, len, _stext, _etext) || - overlap(addr, len, __start_rodata, __end_rodata)) + if (memory_intersects(_stext, _etext, addr, len) || + memory_intersects(__start_rodata, __end_rodata, addr, len)) err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0a093a675b632efab531a890d343886c392c899b..b75b40a80ef81990578a61cb91713949fde18c1a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2018 Christoph Hellwig. + * Copyright (C) 2018-2020 Christoph Hellwig. * * DMA operations that map physical memory directly without using an IOMMU. */ @@ -10,10 +10,9 @@ #include #include #include -#include #include +#include #include -#include /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but @@ -23,26 +22,20 @@ #define ARCH_ZONE_DMA_BITS 24 #endif -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus mask %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { if (force_dma_unencrypted(dev)) - return __phys_to_dma(dev, phys); + return phys_to_dma_unencrypted(dev, phys); return phys_to_dma(dev, phys); } +static inline struct page *dma_direct_to_page(struct device *dev, + dma_addr_t dma_addr) +{ + return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr))); +} + u64 dma_direct_get_required_mask(struct device *dev) { phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT; @@ -51,16 +44,10 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_mask) +static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_limit) { - if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) - dma_mask = dev->bus_dma_mask; - - if (force_dma_unencrypted(dev)) - *phys_mask = __dma_to_phys(dev, dma_mask); - else - *phys_mask = dma_to_phys(dev, dma_mask); + u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); /* * Optimistically try the zone that the physical address mask falls @@ -70,9 +57,10 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. 
*/ - if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + *phys_limit = dma_to_phys(dev, dma_limit); + if (*phys_limit <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) return GFP_DMA; - if (*phys_mask <= DMA_BIT_MASK(32)) + if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; return 0; } @@ -80,38 +68,34 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, + gfp_t gfp) { - size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; - u64 phys_mask; + u64 phys_limit; - if (attrs & DMA_ATTR_NO_WARN) - gfp |= __GFP_NOWARN; + WARN_ON_ONCE(!PAGE_ALIGNED(size)); - /* we always manually zero the memory once we are done: */ - gfp &= ~__GFP_ZERO; - gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); - page = dma_alloc_contiguous(dev, alloc_size, gfp); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, alloc_size); + dma_free_contiguous(dev, page, size); page = NULL; } again: if (!page) - page = alloc_pages_node(node, gfp, get_order(alloc_size)); + page = alloc_pages_node(node, gfp, get_order(size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - phys_mask < DMA_BIT_MASK(64) && + phys_limit < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; @@ -126,26 +110,88 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, return page; } -void *dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) { struct page *page; + u64 phys_mask; void *ret; - page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); if (!page) return NULL; + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return ret; +} + +void *dma_direct_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +{ + struct page *page; + void *ret; + int err; + + size = PAGE_ALIGN(size); + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma(dev, page_to_phys(page)); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ return page; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + 
!dev_is_dma_coherent(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + + /* + * Remapping or decrypting memory may block. If either is required and + * we can't block, allocate the memory from the atomic pools. + */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + !gfpflags_allow_blocking(gfp) && + (force_dma_unencrypted(dev) || + (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev)))) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + + /* we always manually zero the memory once we are done */ + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; + + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev)) || + (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + /* remove any dirty cache lines on the kernel alias */ + arch_dma_prep_coherent(page, size); + + /* create a coherent mapping */ + ret = dma_common_contiguous_remap(page, size, + dma_pgprot(dev, PAGE_KERNEL, attrs), + __builtin_return_address(0)); + if (!ret) + goto out_encrypt_pages; + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } + memset(ret, 0, size); + goto done; + } + if (PageHighMem(page)) { /* * Depending on the cma= arguments and per-arch setup @@ -154,88 +200,136 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. */ dev_info(dev, "Rejecting highmem page from CMA.\n"); - __dma_direct_free_pages(dev, size, page); - return NULL; + goto out_free_pages; } ret = page_address(page); if (force_dma_unencrypted(dev)) { - set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - } else { - *dma_handle = phys_to_dma(dev, page_to_phys(page)); + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; } + memset(ret, 0, size); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) { + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !dev_is_dma_coherent(dev)) { arch_dma_prep_coherent(page, size); - ret = uncached_kernel_address(ret); + ret = arch_dma_set_uncached(ret, size); + if (IS_ERR(ret)) + goto out_free_pages; } - +done: + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; -} -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) -{ +out_encrypt_pages: + if (force_dma_unencrypted(dev)) { + err = set_memory_encrypted((unsigned long)page_address(page), + 1 << get_order(size)); + /* If memory cannot be re-encrypted, it must be leaked */ + if (err) + return NULL; + } +out_free_pages: dma_free_contiguous(dev, page, size); + return NULL; } -void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) +void dma_direct_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int page_order = get_order(size); if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, cpu_addr); + dma_free_contiguous(dev, cpu_addr, size); return; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev)) { + arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); + return; + } + + /* If cpu_addr is not from an atomic pool, 
dma_free_from_pool() fails */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) - cpu_addr = cached_kernel_address(cpu_addr); - __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); -} + if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + vunmap(cpu_addr); + else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) + arch_dma_clear_uncached(cpu_addr, size); -void *dma_direct_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) -{ - if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); - return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } -void dma_direct_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) +struct page *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) - arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); - else - dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); + struct page *page; + void *ret; + + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + + page = __dma_direct_alloc_pages(dev, size, gfp); + if (!page) + return NULL; + if (PageHighMem(page)) { + /* + * Depending on the cma= arguments and per-arch setup + * dma_alloc_contiguous could return highmem pages. + * Without remapping there is no way to return them here, + * so log an error and fail. 
+ */ + dev_info(dev, "Rejecting highmem page from CMA.\n"); + goto out_free_pages; + } + + ret = page_address(page); + if (force_dma_unencrypted(dev)) { + if (set_memory_decrypted((unsigned long)ret, + 1 << get_order(size))) + goto out_free_pages; + } + memset(ret, 0, size); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return page; +out_free_pages: + dma_free_contiguous(dev, page, size); + return NULL; } -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ - defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) +void dma_direct_free_pages(struct device *dev, size_t size, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dir) { - phys_addr_t paddr = dma_to_phys(dev, addr); + unsigned int page_order = get_order(size); + void *vaddr = page_address(page); - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + dma_free_from_pool(dev, vaddr, size)) + return; - if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, size, dir); + if (force_dma_unencrypted(dev)) + set_memory_encrypted((unsigned long)vaddr, 1 << page_order); + + dma_free_contiguous(dev, page, size); } -EXPORT_SYMBOL(dma_direct_sync_single_for_device); +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -246,35 +340,19 @@ void dma_direct_sync_sg_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, - dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, sg->length, + dir); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, sg->length, + arch_sync_dma_for_device(paddr, sg->length, dir); } } -EXPORT_SYMBOL(dma_direct_sync_sg_for_device); #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(dev, paddr, size, dir); - arch_sync_dma_for_cpu_all(dev); - } - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); -} -EXPORT_SYMBOL(dma_direct_sync_single_for_cpu); - void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -285,30 +363,16 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu(dev, paddr, sg->length, dir); + arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, - SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, sg->length, + dir); } if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu_all(); } -EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); - -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, 
unsigned long attrs) -{ - phys_addr_t phys = dma_to_phys(dev, addr); - - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - - if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); -} -EXPORT_SYMBOL(dma_direct_unmap_page); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -320,35 +384,8 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size); -} - -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); - - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { - report_addr(dev, dma_addr, size); - return DMA_MAPPING_ERROR; - } - - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); - return dma_addr; -} -EXPORT_SYMBOL(dma_direct_map_page); - int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { @@ -367,47 +404,85 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return 0; + return -EIO; } -EXPORT_SYMBOL(dma_direct_map_sg); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_capable(dev, dma_addr, size))) { - report_addr(dev, dma_addr, size); + if (unlikely(!dma_capable(dev, dma_addr, size, false))) { + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_resource); -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. 
- */ -int dma_direct_supported(struct device *dev, u64 mask) +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) { - u64 min_mask; + struct page *page = dma_direct_to_page(dev, dma_addr); + int ret; - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); - else - min_mask = DMA_BIT_MASK(32); + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); +bool dma_direct_can_mmap(struct device *dev) +{ + return dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP); +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr)); + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +} + +int dma_direct_supported(struct device *dev, u64 mask) +{ + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; + + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* - * This check needs to be against the actual bit mask value, so - * use __phys_to_dma() here so that the SME encryption mask isn't + * This check needs to be against the actual bit mask value, so use + * phys_to_dma_unencrypted() here so that the SME encryption mask isn't * part of the check. 
*/ - return mask >= __phys_to_dma(dev, min_mask); + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)); + return mask >= phys_to_dma_unencrypted(dev, min_mask); } size_t dma_direct_max_mapping_size(struct device *dev) @@ -418,3 +493,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 05607642c888d917224e44211d0072fae7378f0b..01eb048892b38efe547a2d6beacc95fdefb281ba 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, unsigned long attrs) { - return 0; + return -EINVAL; } static int dma_dummy_supported(struct device *hwdev, u64 mask) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 8682a5305cb3682fa473a4be9a0ffacdf83b3438..c77fc5b8e26caf99984212535fcf1f32b08a62ee 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -8,6 +8,7 @@ #include /* for max_pfn */ #include #include +#include #include #include #include @@ -105,6 +106,271 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); +static bool dma_go_direct(struct device *dev, dma_addr_t mask, + const struct dma_map_ops *ops) +{ + if (likely(!ops)) + return true; +#ifdef CONFIG_DMA_OPS_BYPASS + if (dev->dma_ops_bypass) + return min_not_zero(mask, dev->bus_dma_limit) >= + dma_direct_get_required_mask(dev); +#endif + return false; +} + + +/* + * Check if the devices uses a direct mapping for streaming DMA operations. + * This allows IOMMU drivers to set a bypass mode if the DMA mask is large + * enough. 
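As an illustrative aside (not part of the patch): once CONFIG_DMA_OPS_BYPASS is selected, an IOMMU driver can opt a fully-capable device into this fast path by setting the new dma_ops_bypass flag in struct device. The helper below is a hedged sketch; its name, call site and the mask comparison are assumptions, not code from this series.

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/dma-direct.h>

/* Illustrative only: let streaming DMA skip the IOMMU ops when the device
 * mask covers everything the direct mapping may hand out. */
static void example_enable_dma_bypass(struct device *dev)
{
	if (dma_get_mask(dev) >= dma_direct_get_required_mask(dev))
		dev->dma_ops_bypass = true;
}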
+ */ +static inline bool dma_alloc_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, dev->coherent_dma_mask, ops); +} + +static inline bool dma_map_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, *dev->dma_mask, ops); +} + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + + if (dma_map_direct(dev, ops)) + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + else + addr = ops->map_page(dev, page, offset, size, dir, attrs); + debug_dma_map_page(dev, page, offset, size, dir, addr); + + return addr; +} +EXPORT_SYMBOL(dma_map_page_attrs); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_page_attrs); + +static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + int ents; + + BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return 0; + + if (dma_map_direct(dev, ops)) + ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); + else + ents = ops->map_sg(dev, sg, nents, dir, attrs); + + if (ents > 0) + debug_dma_map_sg(dev, sg, nents, ents, dir); + else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO)) + return -EIO; + + return ents; +} + +/** + * dma_map_sg_attrs - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sg: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist passed in the sg argument with + * nents segments for the @dir DMA operation by the @dev device. + * + * Returns the number of mapped entries (which can be less than nents) + * on success. Zero is returned for any error. + * + * dma_unmap_sg_attrs() should be used to unmap the buffer with the + * original sg and original nents (not the value returned by this funciton). + */ +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + int ret; + + ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs); + if (ret < 0) + return 0; + return ret; +} +EXPORT_SYMBOL(dma_map_sg_attrs); + +/** + * dma_map_sgtable - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After success, the + * ownership for the buffer is transferred to the DMA domain. 
One has to + * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the + * ownership of the buffer back to the CPU domain before touching the + * buffer by the CPU. + * + * Returns 0 on success or a negative error code on error. The following + * error codes are supported with the given meaning: + * + * -EINVAL - An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM - Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO - Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned DMA_MAPPING_ERROR. + */ +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + int nents; + + nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); + if (nents < 0) + return nents; + sgt->nents = nents; + return 0; +} +EXPORT_SYMBOL_GPL(dma_map_sgtable); + +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (dma_map_direct(dev, ops)) + dma_direct_unmap_sg(dev, sg, nents, dir, attrs); + else if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} +EXPORT_SYMBOL(dma_unmap_sg_attrs); + +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr = DMA_MAPPING_ERROR; + + BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + + /* Don't allow RAM to be mapped */ + if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) + return DMA_MAPPING_ERROR; + + if (dma_map_direct(dev, ops)) + addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + else if (ops->map_resource) + addr = ops->map_resource(dev, phys_addr, size, dir, attrs); + + debug_dma_map_resource(dev, phys_addr, size, dir, addr); + return addr; +} +EXPORT_SYMBOL(dma_map_resource); + +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (!dma_map_direct(dev, ops) && ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + debug_dma_unmap_resource(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_resource); + +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + else if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_single_for_device(dev, addr, size, dir); + else if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + 
debug_dma_sync_single_for_device(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); + else if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_sg_for_device(dev, sg, nelems, dir); + else if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_device); + /* * Create scatter-list for the already allocated DMA buffer. */ @@ -112,24 +378,9 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - struct page *page; + struct page *page = virt_to_page(cpu_addr); int ret; - if (!dev_is_dma_coherent(dev)) { - unsigned long pfn; - - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - page = pfn_to_page(pfn); - } else { - page = virt_to_page(cpu_addr); - } - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); @@ -153,8 +404,8 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, + if (dma_alloc_direct(dev, ops)) + return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) return -ENXIO; @@ -171,9 +422,7 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { if (force_dma_unencrypted(dev)) prot = pgprot_decrypted(prot); - if (dev_is_dma_coherent(dev) || - (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && - (attrs & DMA_ATTR_NON_CONSISTENT))) + if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE if (attrs & DMA_ATTR_WRITE_COMBINE) @@ -194,7 +443,6 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; - unsigned long pfn; int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); @@ -205,25 +453,48 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (off >= count || user_count > count - off) return -ENXIO; - if (!dev_is_dma_coherent(dev)) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - } else { - pfn = page_to_pfn(virt_to_page(cpu_addr)); - } - - return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + return remap_pfn_range(vma, vma->vm_start, + 
page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; #endif /* CONFIG_MMU */ } +struct page *dma_common_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size)); + if (!page) + return NULL; + + *dma_handle = ops->map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + if (*dma_handle == DMA_MAPPING_ERROR) { + dma_free_contiguous(dev, page, size); + return NULL; + } + + memset(page_address(page), 0, size); + return page; +} + +void dma_common_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->unmap_page) + ops->unmap_page(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + dma_free_contiguous(dev, page, size); +} + /** * dma_can_mmap - check if a given device supports dma_mmap_* * @dev: device to check @@ -235,12 +506,8 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) { - return IS_ENABLED(CONFIG_MMU) && - (dev_is_dma_coherent(dev) || - IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); - } - + if (dma_alloc_direct(dev, ops)) + return dma_direct_can_mmap(dev); return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); @@ -264,8 +531,8 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, + if (dma_alloc_direct(dev, ops)) + return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) return -ENXIO; @@ -277,7 +544,7 @@ u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); @@ -308,7 +575,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); @@ -340,18 +607,62 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); } EXPORT_SYMBOL(dma_free_attrs); +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + if (WARN_ON_ONCE(!dev->coherent_dma_mask)) + return NULL; + if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) + return NULL; + + size = PAGE_ALIGN(size); + if (dma_alloc_direct(dev, ops)) + page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + else if (ops->alloc_pages) + page = 
ops->alloc_pages(dev, size, dma_handle, dir, gfp); + else + return NULL; + + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); + + return page; +} +EXPORT_SYMBOL_GPL(dma_alloc_pages); + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + size = PAGE_ALIGN(size); + debug_dma_unmap_page(dev, dma_handle, size, dir); + + if (dma_alloc_direct(dev, ops)) + dma_direct_free_pages(dev, size, page, dma_handle, dir); + else if (ops->free_pages) + ops->free_pages(dev, size, page, dma_handle, dir); +} +EXPORT_SYMBOL_GPL(dma_free_pages); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + /* + * ->dma_supported sets the bypass flag, so we must always call + * into the method here unless the device is truly direct mapped. + */ + if (!ops) return dma_direct_supported(dev, mask); if (!ops->dma_supported) return 1; @@ -407,7 +718,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) arch_dma_cache_sync(dev, vaddr, size, dir); else if (ops->cache_sync) ops->cache_sync(dev, vaddr, size, dir); @@ -419,7 +730,7 @@ size_t dma_max_mapping_size(struct device *dev) const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); @@ -428,6 +739,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c new file mode 100644 index 0000000000000000000000000000000000000000..fe11643ff9cc7b9fb3b69be76ce117dc9ef4a0b8 --- /dev/null +++ b/kernel/dma/pool.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 ARM Ltd. 
+ * Copyright (C) 2020 Google LLC + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct gen_pool *atomic_pool_dma __ro_after_init; +static unsigned long pool_size_dma; +static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static unsigned long pool_size_dma32; +static struct gen_pool *atomic_pool_kernel __ro_after_init; +static unsigned long pool_size_kernel; + +/* Size can be defined by the coherent_pool command line */ +static size_t atomic_pool_size; + +/* Dynamic background expansion when the atomic pool is near capacity */ +static struct work_struct atomic_pool_work; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +static void __init dma_atomic_pool_debugfs_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("dma_pools", NULL); + if (IS_ERR_OR_NULL(root)) + return; + + debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma); + debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32); + debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel); +} + +static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) +{ + if (gfp & __GFP_DMA) + pool_size_dma += size; + else if (gfp & __GFP_DMA32) + pool_size_dma32 += size; + else + pool_size_kernel += size; +} + +static bool cma_in_zone(gfp_t gfp) +{ + unsigned long size; + phys_addr_t end; + struct cma *cma; + + cma = dev_get_cma_area(NULL); + if (!cma) + return false; + + size = cma_get_size(cma); + if (!size) + return false; + + /* CMA can't cross zone boundaries, see cma_activate_area() */ + end = cma_get_base(cma) + size - 1; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return end <= DMA_BIT_MASK(zone_dma_bits); + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return end <= DMA_BIT_MASK(32); + return true; +} + +static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, + gfp_t gfp) +{ + unsigned int order; + struct page *page = NULL; + void *addr; + int ret = -ENOMEM; + + /* Cannot allocate larger than MAX_ORDER-1 */ + order = min(get_order(pool_size), MAX_ORDER-1); + + do { + pool_size = 1 << (PAGE_SHIFT + order); + if (cma_in_zone(gfp)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + if (!page) + page = alloc_pages(gfp, order); + } while (!page && order-- > 0); + if (!page) + goto out; + + arch_dma_prep_coherent(page, pool_size); + +#ifdef CONFIG_DMA_DIRECT_REMAP + addr = dma_common_contiguous_remap(page, pool_size, + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); + if (!addr) + goto free_page; +#else + addr = page_to_virt(page); +#endif + /* + * Memory in the atomic DMA pools must be unencrypted, the pools do not + * shrink so no re-encryption occurs in dma_direct_free(). 
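+ * The only re-encryption happens on the error path below, where the
+ * freshly decrypted pages are encrypted again before being freed.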
+ */ + ret = set_memory_decrypted((unsigned long)page_to_virt(page), + 1 << order); + if (ret) + goto remove_mapping; + ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), + pool_size, NUMA_NO_NODE); + if (ret) + goto encrypt_mapping; + + dma_atomic_pool_size_add(gfp, pool_size); + return 0; + +encrypt_mapping: + ret = set_memory_encrypted((unsigned long)page_to_virt(page), + 1 << order); + if (WARN_ON_ONCE(ret)) { + /* Decrypt succeeded but encrypt failed, purposely leak */ + goto out; + } +remove_mapping: +#ifdef CONFIG_DMA_DIRECT_REMAP + dma_common_free_remap(addr, pool_size); +#endif +free_page: __maybe_unused + __free_pages(page, order); +out: + return ret; +} + +static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp) +{ + if (pool && gen_pool_avail(pool) < atomic_pool_size) + atomic_pool_expand(pool, gen_pool_size(pool), gfp); +} + +static void atomic_pool_work_fn(struct work_struct *work) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + atomic_pool_resize(atomic_pool_dma, + GFP_KERNEL | GFP_DMA); + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + atomic_pool_resize(atomic_pool_dma32, + GFP_KERNEL | GFP_DMA32); + atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL); +} + +static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, + gfp_t gfp) +{ + struct gen_pool *pool; + int ret; + + pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!pool) + return NULL; + + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + ret = atomic_pool_expand(pool, pool_size, gfp); + if (ret) { + gen_pool_destroy(pool); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); + return NULL; + } + + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + gen_pool_size(pool) >> 10, &gfp); + return pool; +} + +static int __init dma_atomic_pool_init(void) +{ + int ret = 0; + + /* + * If coherent_pool was not used on the command line, default the pool + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + */ + if (!atomic_pool_size) { + unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); + pages = min_t(unsigned long, pages, MAX_ORDER_NR_PAGES); + atomic_pool_size = max_t(size_t, pages << PAGE_SHIFT, SZ_128K); + } + INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); + + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL); + if (!atomic_pool_kernel) + ret = -ENOMEM; + if (IS_ENABLED(CONFIG_ZONE_DMA)) { + atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA); + if (!atomic_pool_dma) + ret = -ENOMEM; + } + if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA32); + if (!atomic_pool_dma32) + ret = -ENOMEM; + } + + dma_atomic_pool_debugfs_init(); + return ret; +} +postcore_initcall(dma_atomic_pool_init); + +static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) +{ + if (prev == NULL) { + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return atomic_pool_dma32; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return atomic_pool_dma; + return atomic_pool_kernel; + } + if (prev == atomic_pool_kernel) + return atomic_pool_dma32 ? 
atomic_pool_dma32 : atomic_pool_dma; + if (prev == atomic_pool_dma32) + return atomic_pool_dma; + return NULL; +} + +static struct page *__dma_alloc_from_pool(struct device *dev, size_t size, + struct gen_pool *pool, void **cpu_addr, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) +{ + unsigned long addr; + phys_addr_t phys; + + addr = gen_pool_alloc(pool, size); + if (!addr) + return NULL; + + phys = gen_pool_virt_to_phys(pool, addr); + if (phys_addr_ok && !phys_addr_ok(dev, phys, size)) { + gen_pool_free(pool, addr, size); + return NULL; + } + + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); + + *cpu_addr = (void *)addr; + memset(*cpu_addr, 0, size); + return pfn_to_page(__phys_to_pfn(phys)); +} + +struct page *dma_alloc_from_pool(struct device *dev, size_t size, + void **cpu_addr, gfp_t gfp, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) +{ + struct gen_pool *pool = NULL; + struct page *page; + + while ((pool = dma_guess_pool(pool, gfp))) { + page = __dma_alloc_from_pool(dev, size, pool, cpu_addr, + phys_addr_ok); + if (page) + return page; + } + + WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev)); + return NULL; +} + +bool dma_free_from_pool(struct device *dev, void *start, size_t size) +{ + struct gen_pool *pool = NULL; + + while ((pool = dma_guess_pool(pool, 0))) { + if (!gen_pool_has_addr(pool, (unsigned long)start, size)) + continue; + gen_pool_free(pool, (unsigned long)start, size); + return true; + } + + return false; +} diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index c00b9258fa6abaa77aecbd6a6f53a7e75dc00d93..f7b402849891e9afdb804a18a1c4a9c0607fce33 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -1,13 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2012 ARM Ltd. 
* Copyright (c) 2014 The Linux Foundation */ -#include -#include -#include -#include -#include +#include #include #include @@ -97,172 +92,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size) unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } - -#ifdef CONFIG_DMA_DIRECT_REMAP -static struct gen_pool *atomic_pool __ro_after_init; - -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; - -static int __init early_coherent_pool(char *p) -{ - atomic_pool_size = memparse(p, &p); - return 0; -} -early_param("coherent_pool", early_coherent_pool); - -static gfp_t dma_atomic_pool_gfp(void) -{ - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; - struct page *page; - void *addr; - int ret; - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); - else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); - if (!page) - goto out; - - arch_dma_prep_coherent(page, atomic_pool_size); - - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) - goto free_page; - - addr = dma_common_contiguous_remap(page, atomic_pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); - if (!addr) - goto destroy_genpool; - - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); - if (ret) - goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); - - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); - return 0; - -remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); -destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; -free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); -out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); - return -ENOMEM; -} -postcore_initcall(dma_atomic_pool_init); - -bool dma_in_atomic_pool(void *start, size_t size) -{ - if (unlikely(!atomic_pool)) - return false; - - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); -} - -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -{ - unsigned long val; - void *ptr = NULL; - - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); - return NULL; - } - - val = gen_pool_alloc(atomic_pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); - - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); - } - - return ptr; -} - -bool dma_free_from_pool(void *start, size_t size) -{ - if (!dma_in_atomic_pool(start, size)) - return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); - return true; -} - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flags, unsigned long attrs) -{ - struct page *page = NULL; - void *ret; - - size = PAGE_ALIGN(size); - - if (!gfpflags_allow_blocking(flags)) { - ret = dma_alloc_from_pool(size, &page, flags); - if (!ret) - return NULL; - goto done; - } - - page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); - if (!page) - return 
NULL; - - /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, size); - - /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, - dma_pgprot(dev, PAGE_KERNEL, attrs), - __builtin_return_address(0)); - if (!ret) { - __dma_direct_free_pages(dev, size, page); - return ret; - } - - memset(ret, 0, size); -done: - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return ret; -} - -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { - phys_addr_t phys = dma_to_phys(dev, dma_handle); - struct page *page = pfn_to_page(__phys_to_pfn(phys)); - - vunmap(vaddr); - __dma_direct_free_pages(dev, size, page); - } -} - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return __phys_to_pfn(dma_to_phys(dev, dma_addr)); -} -#endif /* CONFIG_DMA_DIRECT_REMAP */ diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f99b79d7e123539f1fd8e310a09d0013cfb45a6f..0927626ba06def20584477115f339c4fa0d0195e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -49,9 +50,6 @@ #define CREATE_TRACE_POINTS #include -#define OFFSET(val,align) ((unsigned long) \ - ( (val) & ( (align) - 1))) - #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* @@ -92,7 +90,7 @@ static unsigned int io_tlb_index; * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). */ -unsigned int max_segment; +static unsigned int max_segment; /* * We need to save away the original address corresponding to a mapped entry @@ -101,6 +99,11 @@ unsigned int max_segment; #define INVALID_PHYS_ADDR (~(phys_addr_t)0) static phys_addr_t *io_tlb_orig_addr; +/* + * The mapped buffer's size should be validated during a sync operation. + */ +static size_t *io_tlb_alloc_size; + /* * Protect the above data structures in the map and unmap calls */ @@ -151,8 +154,6 @@ void swiotlb_set_max_segment(unsigned int val) max_segment = rounddown(val, PAGE_SIZE); } -/* default to 64MB */ -#define IO_TLB_DEFAULT_SIZE (64UL<<20) unsigned long swiotlb_size_or_default(void) { unsigned long size; @@ -162,6 +163,24 @@ unsigned long swiotlb_size_or_default(void) return size ? size : (IO_TLB_DEFAULT_SIZE); } +void __init swiotlb_adjust_size(unsigned long new_size) +{ + unsigned long size; + + /* + * If swiotlb parameter has not been specified, give a chance to + * architectures such as those supporting memory encryption to + * adjust/expand SWIOTLB size for their use. 
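+ * The adjustment is skipped once io_tlb_nslabs is already non-zero,
+ * i.e. when the user specified a size on the kernel command line.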
+ */ + if (!io_tlb_nslabs) { + size = ALIGN(new_size, IO_TLB_SIZE); + io_tlb_nslabs = size >> IO_TLB_SHIFT; + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + + pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); + } +} + void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; @@ -171,12 +190,20 @@ void swiotlb_print_info(void) return; } - pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, bytes >> 20); } +static inline unsigned long io_tlb_offset(unsigned long val) +{ + return val & (IO_TLB_SEGSIZE - 1); +} + +static inline unsigned long nr_slots(u64 val) +{ + return DIV_ROUND_UP(val, IO_TLB_SIZE); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -225,9 +252,16 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); + io_tlb_alloc_size = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_alloc_size) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_alloc_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -348,7 +382,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) * between io_tlb_start and io_tlb_end. */ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); + get_order(io_tlb_nslabs * sizeof(int))); if (!io_tlb_list) goto cleanup3; @@ -359,9 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!io_tlb_orig_addr) goto cleanup4; + io_tlb_alloc_size = (size_t *) + __get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * + sizeof(size_t))); + if (!io_tlb_alloc_size) + goto cleanup5; + + for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_alloc_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -374,6 +417,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) return 0; +cleanup5: + free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * + sizeof(phys_addr_t))); + cleanup4: free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); @@ -389,6 +436,8 @@ void __init swiotlb_exit(void) return; if (late_alloc) { + free_pages((unsigned long)io_tlb_alloc_size, + get_order(io_tlb_nslabs * sizeof(size_t))); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * @@ -398,6 +447,8 @@ void __init swiotlb_exit(void) } else { memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_alloc_size), + PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); memblock_free_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); memblock_free_late(io_tlb_start, @@ -409,12 +460,25 @@ void __init swiotlb_exit(void) /* * Bounce: copy 
the swiotlb buffer from or back to the original dma location */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, + enum dma_data_direction dir) { + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + size_t alloc_size = io_tlb_alloc_size[index]; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); + if (orig_addr == INVALID_PHYS_ADDR) + return; + + if (size > alloc_size) { + dev_WARN_ONCE(dev, 1, + "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", + alloc_size, size); + size = alloc_size; + } + if (PageHighMem(pfn_to_page(pfn))) { /* The buffer does not have a mapping. Map it in and copy */ unsigned int offset = orig_addr & ~PAGE_MASK; @@ -446,82 +510,71 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) -{ - unsigned long flags; - phys_addr_t tlb_addr; - unsigned int nslots, stride, index, wrap; - int i; - unsigned long mask; - unsigned long offset_slots; - unsigned long max_slots; - unsigned long tmp_io_tlb_used; - - if (no_iotlb_memory) - panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); - - if (mem_encrypt_active()) - pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); +#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) - if (mapping_size > alloc_size) { - dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", - mapping_size, alloc_size); - return (phys_addr_t)DMA_MAPPING_ERROR; - } +/* + * Return the offset into a iotlb slot required to keep the device happy. + */ +static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +{ + return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); +} - mask = dma_get_seg_boundary(hwdev); +/* + * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. + */ +static inline unsigned long get_max_slots(unsigned long boundary_mask) +{ + if (boundary_mask == ~0UL) + return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + return nr_slots(boundary_mask + 1); +} - tbl_dma_addr &= mask; +static unsigned int wrap_index(unsigned int index) +{ + if (index >= io_tlb_nslabs) + return 0; + return index; +} - offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +/* + * Find a suitable number of IO TLB entries size that will fit this request and + * allocate a buffer from that IO TLB pool. + */ +static int find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) +{ + unsigned long boundary_mask = dma_get_seg_boundary(dev); + dma_addr_t tbl_dma_addr = + phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + unsigned long max_slots = get_max_slots(boundary_mask); + unsigned int iotlb_align_mask = + dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); + unsigned int nslots = nr_slots(alloc_size), stride; + unsigned int index, wrap, count = 0, i; + unsigned long flags; - /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ - max_slots = mask + 1 - ? 
ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT - : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + BUG_ON(!nslots); /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. + * For mappings with an alignment requirement don't bother looping to + * unaligned slots once we found an aligned one. For allocations of + * PAGE_SIZE or larger only look for page aligned allocations. */ - nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; if (alloc_size >= PAGE_SIZE) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = 1; + stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); - BUG_ON(!nslots); - - /* - * Find suitable number of IO TLB entries size that will fit this - * request and allocate a buffer from that IO TLB pool. - */ spin_lock_irqsave(&io_tlb_lock, flags); - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) goto not_found; - index = ALIGN(io_tlb_index, stride); - if (index >= io_tlb_nslabs) - index = 0; - wrap = index; - + index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); do { - while (iommu_is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; + if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { + index = wrap_index(index + 1); + continue; } /* @@ -529,52 +582,86 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, * contiguous buffers, we allocate the buffers from that slot * and mark the entries as '0' indicating unavailable. */ - if (io_tlb_list[index] >= nslots) { - int count = 0; - - for (i = index; i < (int) (index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next - * round. - */ - io_tlb_index = ((index + nslots) < io_tlb_nslabs - ? (index + nslots) : 0); - - goto found; + if (!iommu_is_span_boundary(index, nslots, + nr_slots(tbl_dma_addr), + max_slots)) { + if (io_tlb_list[index] >= nslots) + goto found; } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; + index = wrap_index(index + stride); } while (index != wrap); not_found: - tmp_io_tlb_used = io_tlb_used; - spin_unlock_irqrestore(&io_tlb_lock, flags); - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, tmp_io_tlb_used); - return (phys_addr_t)DMA_MAPPING_ERROR; + return -1; found: + for (i = index; i < index + nslots; i++) + io_tlb_list[i] = 0; + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + + /* + * Update the indices to avoid searching in the next round. 
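+ * The index wraps back to slot 0 once the end of the table is reached.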
+ */ + if (index + nslots < io_tlb_nslabs) + io_tlb_index = index + nslots; + else + io_tlb_index = 0; io_tlb_used += nslots; + spin_unlock_irqrestore(&io_tlb_lock, flags); + return index; +} + +phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) +{ + unsigned int offset = swiotlb_align_offset(dev, orig_addr); + unsigned int i; + int index; + phys_addr_t tlb_addr; + + if (no_iotlb_memory) + panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + + if (mem_encrypt_active()) + pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); + + if (mapping_size > alloc_size) { + dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", + mapping_size, alloc_size); + return (phys_addr_t)DMA_MAPPING_ERROR; + } + index = find_slots(dev, orig_addr, alloc_size + offset); + if (index == -1) { + if (!(attrs & DMA_ATTR_NO_WARN)) + dev_warn_ratelimited(dev, + "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + alloc_size, io_tlb_nslabs, io_tlb_used); + return (phys_addr_t)DMA_MAPPING_ERROR; + } /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); - + for (i = 0; i < nr_slots(alloc_size + offset); i++) { + io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); + io_tlb_alloc_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + } + tlb_addr = slot_addr(io_tlb_start, index) + offset; + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -582,21 +669,21 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, * tlb_addr is the physical address of the bounce buffer to unmap. 
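 * All slots backing the mapping are returned to the free list and merged
 * with any adjacent free slots.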
*/ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs) + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; + unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; + int nslots = nr_slots(io_tlb_alloc_size[index] + offset); + int count, i; /* * First, sync the memory before unmapping the entry */ - if (orig_addr != INVALID_PHYS_ADDR && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -605,95 +692,88 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * with slots below and above the pool being returned. */ spin_lock_irqsave(&io_tlb_lock, flags); - { - count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? - io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the - * slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - /* - * Step 2: merge the returned slots with the preceding slots, - * if available (non zero) - */ - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) + count = io_tlb_list[index + nslots]; + else + count = 0; - io_tlb_used -= nslots; + /* + * Step 1: return the slots to the free list, merging the slots with + * superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_alloc_size[i] = 0; } + + /* + * Step 2: merge the returned slots with the preceding slots, if + * available (non zero) + */ + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + io_tlb_used -= nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); } -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) { - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); +} - if (orig_addr == INVALID_PHYS_ADDR) - return; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); - - switch (target) { - case SYNC_FOR_CPU: - if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); - else - BUG_ON(dir != DMA_TO_DEVICE); - break; - case SYNC_FOR_DEVICE: - if (likely(dir == DMA_TO_DEVICE || dir == 
DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); - break; - default: - BUG(); - } +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) +{ + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE); + else + BUG_ON(dir != DMA_TO_DEVICE); } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + attrs); + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } bool is_swiotlb_active(void) diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c index ebe128833af7b55cc132a090e45c2609fb59e431..6986bf1fd6689c9bb68249b3d2691f930d445632 100644 --- a/kernel/dma/virt.c +++ b/kernel/dma/virt.c @@ -55,5 +55,7 @@ const struct dma_map_ops dma_virt_ops = { .free = dma_virt_free, .map_page = dma_virt_map_page, .map_sg = dma_virt_map_sg, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(dma_virt_ops); diff --git a/kernel/events/core.c b/kernel/events/core.c index 8e04df75cb3462dc4b76188ebe9128b60e10bb67..0f8cdcb69c4fd57e68a2e6a14080f6e3b0a020e8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1181,12 +1181,26 @@ static void get_ctx(struct perf_event_context *ctx) refcount_inc(&ctx->refcount); } +static void *alloc_task_ctx_data(struct pmu *pmu) +{ + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + + return NULL; +} + +static void free_task_ctx_data(struct 
pmu *pmu, void *task_ctx_data) +{ + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); +} + static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -1744,8 +1758,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); - if (sample_type & PERF_SAMPLE_WEIGHT) - size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -3240,10 +3254,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + struct pmu *pmu = ctx->pmu; + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * PMU specific parts of task perf context can require + * additional synchronization. As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (pmu->swap_task_ctx) + pmu->swap_task_ctx(ctx, next_ctx); + else + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); /* * RCU_INIT_POINTER here is safe because we've not @@ -4288,7 +4313,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { err = -ENOMEM; goto errout; @@ -4346,11 +4371,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, } } - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } @@ -6394,6 +6419,11 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } +static inline bool perf_sample_save_hw_index(struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6482,6 +6512,8 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (perf_sample_save_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* @@ -6515,8 +6547,8 @@ void perf_output_sample(struct perf_output_handle *handle, data->regs_user.regs); } - if (sample_type & PERF_SAMPLE_WEIGHT) - perf_output_put(handle, data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); @@ -6671,6 +6703,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { + if (perf_sample_save_hw_index(event)) + size += sizeof(u64); + size += data->br_stack->nr * sizeof(struct perf_branch_entry); } @@ -10767,6 +10802,15 @@ static int 
perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); + + if (attr->sample_type & (PERF_SAMPLE_AUX | PERF_SAMPLE_CGROUP + | PERF_SAMPLE_DATA_PAGE_SIZE | PERF_SAMPLE_CODE_PAGE_SIZE)) + return -EINVAL; + + if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && + (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) + return -EINVAL; + out: return ret; @@ -11854,8 +11898,7 @@ inherit_event(struct perf_event *parent_event, !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, - GFP_KERNEL); + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM); diff --git a/kernel/fork.c b/kernel/fork.c index 2511cd4718e7199da9d4a58f2e273731ffff9937..09cc35f385c5bdf6f42510f153a23862109bbc5f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include @@ -952,6 +953,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_IOMMU_SVA + tsk->pasid_activated = 0; +#endif + #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif @@ -1032,6 +1037,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8fc2539eb08e2e9fb1868c79f319afe917c9a3fa..f1acfe93c8cf01dbba120144e455282ddb58668c 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -97,6 +97,10 @@ config GENERIC_MSI_IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY select GENERIC_MSI_IRQ +config DEVICE_MSI + bool + select GENERIC_MSI_IRQ_DOMAIN + config IRQ_MSI_IOMMU bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 41d8818ee84fc287ca682c4810e1c8d8b7453e7c..bcb736017f57185b92a12913b4070a4f6e6bed2d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) } else { switch (__irq_startup_managed(desc, aff, force)) { case IRQ_STARTUP_NORMAL: + if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP) + irq_setup_affinity(desc); ret = __irq_startup(desc); - irq_setup_affinity(desc); + if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)) + irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: irq_do_set_affinity(d, aff, false); @@ -1509,18 +1512,17 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); */ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_data *pos = NULL; + struct irq_data *pos; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - for (; data; data = data->parent_data) -#endif + for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) { if (data->chip && data->chip->irq_compose_msi_msg) pos = data; + } + if (!pos) return -ENOSYS; pos->chip->irq_compose_msi_msg(pos, msg); - return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7db284b10ac9ca0360c91668e0e69da76ec5b03f..54363527feea4dcd302f9a8a36edc12df0d196cc 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -473,6 +473,15 @@ static inline void irq_domain_deactivate_irq(struct irq_data *data) } #endif +static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return 
irqd->parent_data; +#else + return NULL; +#endif +} + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5e03cbee70d6722013c001c9b9b08cb1ee2b21e2..f5b33728fceccc98982876b2898945ce19d9e297 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -42,7 +42,16 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif -const struct fwnode_operations irqchip_fwnode_ops; +static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + return fwid->name; +} + +const struct fwnode_operations irqchip_fwnode_ops = { + .get_name = irqchip_fwnode_get_name, +}; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 918fe0593386219c9e3449bf82f391086c2fd3b5..0aca7401a2337c7beffe48c01b8469f38405035a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -332,21 +332,52 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, return ret; } -int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) +static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; unsigned long flags; int ret; + desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_busunlock(desc, flags); return ret; } +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + * Fails if cpumask does not contain an online CPU + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, false); +} +EXPORT_SYMBOL_GPL(irq_set_affinity); + +/** + * irq_force_affinity - Force the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + * Same as irq_set_affinity, but without checking the mask against + * online cpus. + * + * Solely for low level cpu hotplug code, where we need to make per + * cpu interrupts affine before the cpu becomes online. + */ +int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) +{ + return __irq_set_affinity(irq, cpumask, true); +} +EXPORT_SYMBOL_GPL(irq_force_affinity); + int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { unsigned long flags; @@ -2751,3 +2782,35 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, return err; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); + +/** + * irq_set_auxdata - Set auxiliary data + * @irq: Interrupt to update + * @which: Selector which data to update + * @auxval: Auxiliary data value + * + * Function to update auxiliary data for an interrupt, e.g. to update data + * which is stored in a shared register or data storage (e.g. IMS). 
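+ *
+ * Return: -EINVAL if @irq cannot be found, -ENODEV if no irqchip in the
+ * hierarchy implements irq_set_auxdata(), otherwise the return value of
+ * the irqchip callback.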
+ */ +int irq_set_auxdata(unsigned int irq, unsigned int which, u64 val) +{ + struct irq_desc *desc; + struct irq_data *data; + unsigned long flags; + int res = -ENODEV; + + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return -EINVAL; + + for (data = &desc->irq_data; data; data = irqd_get_parent_data(data)) { + if (data->chip->irq_set_auxdata) { + res = data->chip->irq_set_auxdata(data, which, val); + break; + } + } + + irq_put_desc_busunlock(desc, flags); + return res; +} +EXPORT_SYMBOL_GPL(irq_set_auxdata); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5d3da0db092ff85de18c3b8fb59a9a72415531f7..0b7d91a7510022e2d611e2efaf5a65539568f327 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internals.h" @@ -69,7 +70,194 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) } EXPORT_SYMBOL_GPL(get_cached_msi_msg); +static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct msi_desc *entry; + bool is_msix = false; + unsigned long irq; + int retval; + + retval = kstrtoul(attr->attr.name, 10, &irq); + if (retval) + return retval; + + entry = irq_get_msi_desc(irq); + if (!entry) + return -ENODEV; + + if (dev_is_pci(dev)) + is_msix = entry->msi_attrib.is_msix; + + return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); +} + +/** + * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices + * @dev: The device(PCI, platform etc) who will get sysfs entries + * + * Return attribute_group ** so that specific bus MSI can save it to + * somewhere during initilizing msi irqs. If devices has no MSI irq, + * return NULL; if it fails to populate sysfs, return ERR_PTR + */ +const struct attribute_group **msi_populate_sysfs(struct device *dev) +{ + const struct attribute_group **msi_irq_groups; + struct attribute **msi_attrs, *msi_attr; + struct device_attribute *msi_dev_attr; + struct attribute_group *msi_irq_group; + struct msi_desc *entry; + int ret = -ENOMEM; + int num_msi = 0; + int count = 0; + int i; + + entry = first_msi_entry(dev); + if (entry->msi_attrib.is_msix) { + /* Since msi-x vectors can be allocated multiple times, + * allocate maximum no. 
of vectors supported by the device + */ +#ifdef CONFIG_PCI_MSI + num_msi = pci_msix_vec_count(to_pci_dev(dev)); +#endif + } + + if (!num_msi) { + /* Determine how many msi entries we have */ + for_each_msi_entry(entry, dev) + num_msi += entry->nvec_used; + } + + if (!num_msi) + return NULL; + + /* Dynamically create the MSI attributes for the device */ + msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); + if (!msi_attrs) + return ERR_PTR(-ENOMEM); + + for_each_msi_entry(entry, dev) { + for (i = 0; i < entry->nvec_used; i++) { + msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); + if (!msi_dev_attr) + goto error_attrs; + msi_attrs[count] = &msi_dev_attr->attr; + + sysfs_attr_init(&msi_dev_attr->attr); + msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", + entry->irq + i); + if (!msi_dev_attr->attr.name) + goto error_attrs; + msi_dev_attr->attr.mode = 0444; + msi_dev_attr->show = msi_mode_show; + ++count; + } + } + + msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); + if (!msi_irq_group) + goto error_attrs; + msi_irq_group->name = "msi_irqs"; + msi_irq_group->attrs = msi_attrs; + + msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); + if (!msi_irq_groups) + goto error_irq_group; + msi_irq_groups[0] = msi_irq_group; + + ret = sysfs_create_groups(&dev->kobj, msi_irq_groups); + if (ret) + goto error_irq_groups; + + return msi_irq_groups; + +error_irq_groups: + kfree(msi_irq_groups); +error_irq_group: + kfree(msi_irq_group); +error_attrs: + count = 0; + msi_attr = msi_attrs[count]; + while (msi_attr) { + msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); + kfree(msi_attr->name); + kfree(msi_dev_attr); + ++count; + msi_attr = msi_attrs[count]; + } + kfree(msi_attrs); + return ERR_PTR(ret); +} + +/** + * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices + * @dev: The device(PCI, platform etc) who will remove sysfs entries + * @msi_irq_groups: attribute_group for device msi_irqs entries + */ +void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) +{ + struct device_attribute *dev_attr; + struct attribute **msi_attrs; + struct msi_desc *entry; + int count = 0; + + if (!msi_irq_groups) + return; + + sysfs_remove_groups(&dev->kobj, msi_irq_groups); + msi_attrs = msi_irq_groups[0]->attrs; + + entry = first_msi_entry(dev); + if (entry->msi_attrib.is_msix) { + for_each_msi_entry(entry, dev) { + if (msi_attrs[entry->msi_attrib.entry_nr]) { + dev_attr = container_of(msi_attrs[entry->msi_attrib.entry_nr], + struct device_attribute, attr); + kfree(dev_attr->attr.name); + kfree(dev_attr); + msi_attrs[entry->msi_attrib.entry_nr] = NULL; + } + } + } else { + while (msi_attrs[count]) { + dev_attr = container_of(msi_attrs[count], + struct device_attribute, attr); + kfree(dev_attr->attr.name); + kfree(dev_attr); + ++count; + } + } + + kfree(msi_attrs); + kfree(msi_irq_groups[0]); + kfree(msi_irq_groups); +} + #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +void msi_domain_set_default_info_flags(struct msi_domain_info *info) +{ + /* Required so that a device latches a valid MSI message on startup */ + info->flags |= MSI_FLAG_ACTIVATE_EARLY; + + /* + * Interrupt reservation mode allows to stear the MSI message of an + * inactive device to a special (usually spurious interrupt) target. + * This allows to prevent interrupt vector exhaustion e.g. on x86. 
+ * But (PCI)MSI interrupts are activated early - see above - so the + * interrupt request/startup sequence would not try to allocate a + * usable vector which means that the device interrupts would end + * up on the special vector and issue spurious interrupt messages. + * Setting the reactivation flag ensures that when the interrupt + * is requested the activation is invoked again so that a real + * vector can be allocated. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) + info->flags |= MSI_FLAG_MUST_REACTIVATE; + + /* MSI is oneshot-safe at least in theory */ + info->chip->flags |= IRQCHIP_ONESHOT_SAFE; +} + static inline void irq_chip_write_msi_msg(struct irq_data *data, struct msi_msg *msg) { @@ -187,7 +375,6 @@ static const struct irq_domain_ops msi_domain_ops = { .deactivate = msi_domain_deactivate, }; -#ifdef GENERIC_MSI_DOMAIN_OPS static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { @@ -206,11 +393,6 @@ static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, { arg->desc = desc; } -#else -#define msi_domain_ops_get_hwirq NULL -#define msi_domain_ops_prepare NULL -#define msi_domain_ops_set_desc NULL -#endif /* !GENERIC_MSI_DOMAIN_OPS */ static int msi_domain_ops_init(struct irq_domain *domain, struct msi_domain_info *info, @@ -235,11 +417,14 @@ static int msi_domain_ops_check(struct irq_domain *domain, } static struct msi_domain_ops msi_domain_ops_default = { - .get_hwirq = msi_domain_ops_get_hwirq, - .msi_init = msi_domain_ops_init, - .msi_check = msi_domain_ops_check, - .msi_prepare = msi_domain_ops_prepare, - .set_desc = msi_domain_ops_set_desc, + .get_hwirq = msi_domain_ops_get_hwirq, + .msi_init = msi_domain_ops_init, + .msi_check = msi_domain_ops_check, + .msi_prepare = msi_domain_ops_prepare, + .set_desc = msi_domain_ops_set_desc, + .domain_alloc_irqs = __msi_domain_alloc_irqs, + .domain_free_irqs = __msi_domain_free_irqs, + .domain_free_irq = __msi_domain_free_irq, }; static void msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -251,6 +436,16 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) return; } + if (ops->domain_alloc_irqs == NULL) + ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs; + if (ops->domain_free_irqs == NULL) + ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs; + if (ops->domain_free_irq == NULL) + ops->domain_free_irq = msi_domain_ops_default.domain_free_irq; + + if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS)) + return; + if (ops->get_hwirq == NULL) ops->get_hwirq = msi_domain_ops_default.get_hwirq; if (ops->msi_init == NULL) @@ -284,8 +479,7 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, { struct irq_domain *domain; - if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) - msi_domain_update_dom_ops(info); + msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); @@ -370,8 +564,13 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, { struct msi_desc *desc; - if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + switch(domain->bus_token) { + case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_VMD_MSI: + break; + default: return false; + } if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; @@ -387,31 +586,38 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; } -/** - * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to 
allocate from - * @dev: Pointer to device struct of the device for which the interrupts - * are allocated - * @nvec: The number of interrupts to allocate - * - * Returns 0 on success or an error code. - */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; struct irq_data *irq_data; struct msi_desc *desc; - msi_alloc_info_t arg; + msi_alloc_info_t arg = { }; int i, ret, virq; bool can_reserve; + struct list_head *msi_last_list; + struct list_head *msi_list; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) return ret; - for_each_msi_entry(desc, dev) { + if (ops->msi_alloc_store) { + ret = ops->msi_alloc_store(domain, dev, nvec); + if (ret) + return ret; + } + + if (domain->bus_token == DOMAIN_BUS_DEVICE_MSI) { + msi_last_list = dev->dev_msi_last_list; + msi_list = dev_to_dev_msi_list(dev); + } else { + msi_last_list = dev->msi_last_list; + msi_list = dev_to_msi_list(dev); + } + + __for_each_new_msi_entry(desc, msi_last_list, msi_list) { ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, @@ -445,7 +651,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) goto skip_activate; - for_each_msi_vector(desc, i, dev) { + __for_each_new_msi_vector(desc, i, msi_last_list, msi_list) { if (desc->irq == i) { virq = desc->irq; dev_dbg(dev, "irq [%d-%d] for MSI\n", @@ -469,7 +675,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, * so request_irq() will assign the final vector. */ if (can_reserve) { - for_each_msi_vector(desc, i, dev) { + __for_each_new_msi_vector(desc, i, msi_last_list, msi_list) { irq_data = irq_domain_get_irq_data(domain, i); irqd_clr_activated(irq_data); } @@ -477,26 +683,48 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return 0; cleanup: - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - if (irqd_is_activated(irq_data)) - irq_domain_deactivate_irq(irq_data); - } msi_domain_free_irqs(domain, dev); return ret; } /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev - * @domain: The domain to managing the interrupts + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are allocated + * @nvec: The number of interrupts to allocate + * + * Returns 0 on success or an error code. */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_alloc_irqs(domain, dev, nvec); +} +EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs); + +void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { + struct irq_data *irq_data; + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; struct msi_desc *desc; + int i; + struct list_head *msi_list; - for_each_msi_entry(desc, dev) { + msi_list = (domain->bus_token == DOMAIN_BUS_DEVICE_MSI) + ? 
dev_to_dev_msi_list(dev) : dev_to_msi_list(dev); + + __for_each_msi_vector(desc, i, msi_list) { + irq_data = irq_domain_get_irq_data(domain, i); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); + } + + __for_each_msi_entry(desc, msi_list) { /* * We might have failed to allocate an MSI early * enough that there is no IRQ associated to this @@ -507,7 +735,55 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) desc->irq = 0; } } + + if (ops->msi_free_store) + ops->msi_free_store(domain, dev); +} + +/** + * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_free_irqs(domain, dev); } +EXPORT_SYMBOL_GPL(msi_domain_free_irqs); + +/** + * msi_domain_free_irq - Free interrupt from a MSI interrupt @domain associated to @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct for which the interrupt needs to be freed + * @irq: Interrupt to be freed + */ +void __msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq) +{ + struct msi_desc *desc = irq_get_msi_desc(irq); + struct irq_data *irq_data; + + if (irq) { + irq_data = irq_domain_get_irq_data(domain, irq); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); + + irq_domain_free_irqs(desc->irq, 1); + desc->irq = 0; + } +} + +void msi_domain_free_irq(struct irq_domain *domain, struct device *dev, unsigned int irq) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_free_irq(domain, dev, irq); +} +EXPORT_SYMBOL_GPL(msi_domain_free_irq); /** * msi_get_domain_info - Get the MSI interrupt domain info for @domain @@ -521,4 +797,61 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) return (struct msi_domain_info *)domain->host_data; } +/** + * get_dev_msi_entry - Get the nth device MSI entry + * @dev: device to operate on + * @nr: device-relative interrupt vector index (0-based). + * + * Return NULL or the nth dev_msi entry + */ +static struct msi_desc *get_dev_msi_entry(struct device *dev, unsigned int nr) +{ + struct msi_desc *entry; + int i = 0; + + if (list_empty(dev_to_dev_msi_list(dev))) + return NULL; + + for_each_dev_msi_entry(entry, dev) { + if (i == nr) + return entry; + i++; + } + + WARN_ON_ONCE(!entry); + return entry; +} + +/** + * dev_msi_irq_vector - Get the Linux IRQ number of a device vector + * @dev: device to operate on + * @nr: device-relative interrupt vector index (0-based). + * + * Returns the Linux IRQ number of a device vector, or -1 if the + * entry is not found. + */ +int dev_msi_irq_vector(struct device *dev, unsigned int nr) +{ + struct msi_desc *entry = get_dev_msi_entry(dev, nr); + + return entry ? entry->irq : -1; +} +EXPORT_SYMBOL_GPL(dev_msi_irq_vector); + +/** + * dev_msi_hwirq - Get the device MSI hw IRQ number of a device vector + * @dev: device to operate on + * @nr: device-relative interrupt vector index (0-based). + * + * Return the dev_msi hw IRQ number of a device vector, or -1 if the + * entry is not found. 
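/*
 * Illustrative sketch, not part of this patch: a driver that has allocated
 * device-specific MSI vectors could translate a vector index into its Linux
 * IRQ number with the dev_msi_irq_vector() helper added above and then
 * request it as usual.  example_setup_vector0() and example_handler are
 * hypothetical names; <linux/msi.h> is assumed to carry the helper's
 * declaration in this series.
 */
#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/msi.h>

static irqreturn_t example_handler(int irq, void *data)
{
	return IRQ_HANDLED;
}

static int example_setup_vector0(struct device *dev)
{
	int irq = dev_msi_irq_vector(dev, 0);	/* Linux IRQ of vector 0, or -1 */

	if (irq < 0)
		return -ENOENT;

	return request_irq(irq, example_handler, 0, dev_name(dev), dev);
}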
+ */ +int dev_msi_hwirq(struct device *dev, unsigned int nr) +{ + struct msi_desc *entry = get_dev_msi_entry(dev, nr); + + return entry ? entry->device_msi.hwirq : -1; +} +EXPORT_SYMBOL_GPL(dev_msi_hwirq); + #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ diff --git a/kernel/notifier.c b/kernel/notifier.c index f6d5ffe4e72ec12c69a2a11b46fe7e214f86939c..c6352021ca8448ddafb391841b1fc65bf8560f34 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -91,6 +91,7 @@ static int notifier_call_chain(struct notifier_block **nl, #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); + pr_alert("%s: nb call %llx", __func__, (u64)nb->notifier_call); nb = next_nb; continue; } diff --git a/kernel/panic.c b/kernel/panic.c index f470a038b05bd1111bc3e0dd2195d6b15e823d14..b69ee9e76cb2ae0c561ffec54a83e5ee162d1082 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -671,17 +671,6 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif -#ifdef CONFIG_ARCH_HAS_REFCOUNT -void refcount_error_report(struct pt_regs *regs, const char *err) -{ - WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", - err, (void *)instruction_pointer(regs), - current->comm, task_pid_nr(current), - from_kuid_munged(&init_user_ns, current_uid()), - from_kuid_munged(&init_user_ns, current_euid())); -} -#endif - core_param(panic, panic_timeout, int, 0644); core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index b2b0f526f249e7edbbe11616ffe828899ce4f983..660f9a6bf73a1404870085ab03959f59baccf2c6 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,9 +6,11 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 + +#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 extern raw_spinlock_t logbuf_lock; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 6cfc5a00c67d69bfa76e0e4d4da524b28c1601f5..8078205f5b103d4715251f48cf657226a5cf5078 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -303,12 +303,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) void notrace printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } void notrace printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } /* diff --git a/kernel/regset.c b/kernel/regset.c new file mode 100644 index 0000000000000000000000000000000000000000..eaeaefbbd39ec0ca0a49e9fdca63a7311bba5d1f --- /dev/null +++ b/kernel/regset.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +static int __regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + void *p = *data, *to_free = NULL; + int res; + + if (!regset->get && !regset->regset_get) + return -EOPNOTSUPP; + if (size > regset->n * regset->size) + size = regset->n * regset->size; + if (!p) { + to_free = p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + } + if (regset->regset_get) { + res = regset->regset_get(target, regset, + (struct membuf){.p 
= p, .left = size}); + if (res < 0) { + kfree(to_free); + return res; + } + *data = p; + return size - res; + } + res = regset->get(target, regset, 0, size, p, NULL); + if (unlikely(res < 0)) { + kfree(to_free); + return res; + } + *data = p; + if (regset->get_size) { // arm64-only kludge, will go away + unsigned max_size = regset->get_size(target, regset); + if (size > max_size) + size = max_size; + } + return size; +} + +int regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void *data) +{ + return __regset_get(target, regset, size, &data); +} +EXPORT_SYMBOL(regset_get); + +int regset_get_alloc(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + *data = NULL; + return __regset_get(target, regset, size, data); +} +EXPORT_SYMBOL(regset_get_alloc); + +/** + * copy_regset_to_user - fetch a thread's user_regset data into user memory + * @target: thread to be examined + * @view: &struct user_regset_view describing user thread machine state + * @setno: index in @view->regsets + * @offset: offset into the regset data, in bytes + * @size: amount of data to copy, in bytes + * @data: user-mode pointer to copy into + */ +int copy_regset_to_user(struct task_struct *target, + const struct user_regset_view *view, + unsigned int setno, + unsigned int offset, unsigned int size, + void __user *data) +{ + const struct user_regset *regset = &view->regsets[setno]; + void *buf; + int ret; + + ret = regset_get_alloc(target, regset, size, &buf); + if (ret > 0) + ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; + kfree(buf); + return ret; +} diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 3f8c7867c14c15ee863f466bcf2b09cb3c587d71..75a2dbbedcdc79c67997cc101dd3042d26320a92 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * update no idle residency and return. */ if (current_clr_polling_and_test()) { - dev->last_residency = 0; + dev->last_residency_ns = 0; local_irq_enable(); return -EBUSY; } @@ -165,7 +165,9 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. 
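/*
 * Minimal sketch, not part of this patch, of the new-style ->regset_get()
 * callback preferred by the kernel/regset.c helpers above: it streams the
 * register block into the caller-supplied membuf and returns how much of
 * the buffer is left unfilled.  "struct example_regs" and
 * example_fill_regs() are hypothetical stand-ins for an architecture's
 * real register block.
 */
#include <linux/regset.h>
#include <linux/sched.h>

struct example_regs {
	unsigned long r[16];
};

static void example_fill_regs(struct task_struct *target,
			      struct example_regs *regs)
{
	/* an architecture would copy the task's saved user registers here */
}

static int example_regset_get(struct task_struct *target,
			      const struct user_regset *regset,
			      struct membuf to)
{
	struct example_regs regs;

	example_fill_regs(target, &regs);
	return membuf_write(&to, &regs, sizeof(regs));
}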
*/ - if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { + u64 max_latency_ns; + if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -176,12 +178,16 @@ static void cpuidle_idle_call(void) } rcu_idle_exit(); + + max_latency_ns = U64_MAX; + } else { + max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); rcu_idle_enter(); - next_state = cpuidle_find_deepest_state(drv, dev); + next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); } else { bool stop_tick = true; @@ -312,7 +318,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -void play_idle(unsigned long duration_us) +void play_idle_precise(u64 duration_ns, u64 latency_ns) { struct idle_timer it; @@ -324,29 +330,29 @@ void play_idle(unsigned long duration_us) WARN_ON_ONCE(current->nr_cpus_allowed != 1); WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); - WARN_ON_ONCE(!duration_us); + WARN_ON_ONCE(!duration_ns); rcu_sleep_check(); preempt_disable(); current->flags |= PF_IDLE; - cpuidle_use_deepest_state(true); + cpuidle_use_deepest_state(latency_ns); it.done = 0; hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); it.timer.function = idle_inject_timer_fn; - hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC), + hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED); while (!READ_ONCE(it.done)) do_idle(); - cpuidle_use_deepest_state(false); + cpuidle_use_deepest_state(0); current->flags &= ~PF_IDLE; preempt_fold_need_resched(); preempt_enable(); } -EXPORT_SYMBOL_GPL(play_idle); +EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9154e745f09789dd3432af35d7db91f96068d1fe..10eeebf1808e71c1dc28d3f64d3407665fb80f81 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -818,17 +818,17 @@ void psi_memstall_enter(unsigned long *flags) if (static_branch_likely(&psi_disabled)) return; - *flags = current->flags & PF_MEMSTALL; + *flags = current->in_memstall; if (*flags) return; /* - * PF_MEMSTALL setting & accounting needs to be atomic wrt + * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags |= PF_MEMSTALL; + current->in_memstall = 1; psi_task_change(current, 0, TSK_MEMSTALL); rq_unlock_irq(rq, &rf); @@ -851,13 +851,13 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; /* - * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could * race with CPU migration. 
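/*
 * Usage sketch, not part of this patch: callers bracket a section that may
 * stall on memory exactly as before; only the bookkeeping behind the flag
 * moves from the PF_MEMSTALL task flag to the dedicated ->in_memstall bit.
 * example_do_reclaim_work() is a hypothetical placeholder.
 */
#include <linux/psi.h>

static void example_do_reclaim_work(void)
{
	/* work that may block on reclaim or refaults */
}

static void example_stalled_section(void)
{
	unsigned long pflags;

	psi_memstall_enter(&pflags);	/* mark the task as stalled on memory */
	example_do_reclaim_work();
	psi_memstall_leave(&pflags);	/* restore the previous state */
}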
*/ rq = this_rq_lock_irq(&rf); - current->flags &= ~PF_MEMSTALL; + current->in_memstall = 0; psi_task_change(current, TSK_MEMSTALL, 0); rq_unlock_irq(rq, &rf); @@ -921,7 +921,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) else if (task->in_iowait) task_flags = TSK_IOWAIT; - if (task->flags & PF_MEMSTALL) + if (task->in_memstall) task_flags |= TSK_MEMSTALL; if (task_flags) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4543af7b140c83857136ab57dd3453d9d38a5158..46213bb177c5c98789e83b8457ef5fe70501ae6a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -71,7 +71,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) return; if (!wakeup || p->sched_psi_wake_requeue) { - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) set |= TSK_MEMSTALL; if (p->sched_psi_wake_requeue) p->sched_psi_wake_requeue = 0; @@ -91,7 +91,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) return; if (!sleep) { - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; } else { if (p->in_iowait) @@ -110,14 +110,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) * deregister its sleep-persistent psi states from the old * queue, and let psi_enqueue() know it has to requeue. */ - if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + if (unlikely(p->in_iowait || p->in_memstall)) { struct rq_flags rf; struct rq *rq; int clear = 0; if (p->in_iowait) clear |= TSK_IOWAIT; - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; rq = __task_rq_lock(p, &rf); @@ -132,7 +132,7 @@ static inline void psi_task_tick(struct rq *rq) if (static_branch_likely(&psi_disabled)) return; - if (unlikely(rq->curr->flags & PF_MEMSTALL)) + if (unlikely(rq->curr->in_memstall)) psi_memstall_tick(rq->curr, cpu_of(rq)); } #else /* CONFIG_PSI */ diff --git a/kernel/signal.c b/kernel/signal.c index 8c97fc72d78bd41fd52cd6ca78018e783ba6c497..509e31ae24abe4377eb2de3fdcec8479b2ad3fc1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4001,11 +4001,29 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } +#ifdef CONFIG_DYNAMIC_SIGFRAME +static inline void sigaltstack_lock(void) + __acquires(¤t->sighand->siglock) +{ + spin_lock_irq(¤t->sighand->siglock); +} + +static inline void sigaltstack_unlock(void) + __releases(¤t->sighand->siglock) +{ + spin_unlock_irq(¤t->sighand->siglock); +} +#else +static inline void sigaltstack_lock(void) { } +static inline void sigaltstack_unlock(void) { } +#endif + static int do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, size_t min_ss_size) { struct task_struct *t = current; + int ret = 0; if (oss) { memset(oss, 0, sizeof(stack_t)); @@ -4029,19 +4047,33 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, ss_mode != 0)) return -EINVAL; + /* + * Return before taking any locks if no actual + * sigaltstack changes were requested. 
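/*
 * For context, not part of this patch: the ss_sp/ss_size/ss_flags values
 * that do_sigaltstack() validates above come from a userspace
 * sigaltstack(2) call along the lines of the sketch below (userspace C,
 * using the standard stack_t API; example_install_altstack() is a made-up
 * name).
 */
#include <signal.h>
#include <stdlib.h>

static int example_install_altstack(void)
{
	stack_t ss = {
		.ss_sp    = malloc(SIGSTKSZ),
		.ss_size  = SIGSTKSZ,
		.ss_flags = 0,
	};

	if (ss.ss_sp == NULL)
		return -1;

	return sigaltstack(&ss, NULL);	/* 0 on success, -1 and errno on error */
}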
+ */ + if (t->sas_ss_sp == (unsigned long)ss_sp && + t->sas_ss_size == ss_size && + t->sas_ss_flags == ss_flags) + return 0; + + sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { if (unlikely(ss_size < min_ss_size)) - return -ENOMEM; + ret = -ENOMEM; + if (!sigaltstack_size_valid(ss_size)) + ret = -ENOMEM; } - - t->sas_ss_sp = (unsigned long) ss_sp; - t->sas_ss_size = ss_size; - t->sas_ss_flags = ss_flags; + if (!ret) { + t->sas_ss_sp = (unsigned long) ss_sp; + t->sas_ss_size = ss_size; + t->sas_ss_flags = ss_flags; + } + sigaltstack_unlock(); } - return 0; + return ret; } SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index c9ea7eb2cb1a321e0896a9f49398c273968da52d..7b3c0e801d298d2d2f530feb83a30e51606e1c8a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -13,6 +13,7 @@ #include #include #include +#include /** * stack_trace_print - Print the entries in the stack trace diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 428beb69426a85577085ee6f644ab785ebc700e9..dc9fbf42ed34783446068358f262b87d29264077 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -124,6 +124,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating); #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +/* + * Maximum permissible delay between two readouts of the watchdog + * clocksource surrounding a read of the clocksource being validated. + * This delay could be due to SMIs, NMIs, or to VCPU preemptions. + */ +#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) + static void clocksource_watchdog_work(struct work_struct *work) { /* @@ -184,12 +191,74 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } +static ulong max_cswd_read_retries = 2; +module_param(max_cswd_read_retries, ulong, 0644); + +enum wd_read_status { + WD_READ_SUCCESS, + WD_READ_UNSTABLE, + WD_READ_SKIP +}; + +static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +{ + unsigned int nretries; + u64 wd_end, wd_end2, wd_delta; + int64_t wd_delay, wd_seq_delay; + + for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { + local_irq_disable(); + *wdnow = watchdog->read(watchdog); + *csnow = cs->read(cs); + wd_end = watchdog->read(watchdog); + wd_end2 = watchdog->read(watchdog); + local_irq_enable(); + + wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); + wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, + watchdog->shift); + if (wd_delay <= WATCHDOG_MAX_SKEW) { + if (nretries > 1 || nretries >= max_cswd_read_retries) { + pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", + smp_processor_id(), watchdog->name, nretries); + } + return WD_READ_SUCCESS; + } + + /* + * Now compute delay in consecutive watchdog read to see if + * there is too much external interferences that cause + * significant delay in reading both clocksource and watchdog. + * + * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2, + * report system busy, reinit the watchdog and skip the current + * watchdog test. 
+ */ + wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask); + wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift); + if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) + goto skip_test; + } + + pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", + smp_processor_id(), watchdog->name, wd_delay, nretries); + return WD_READ_UNSTABLE; + +skip_test: + pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", + smp_processor_id(), watchdog->name, wd_seq_delay); + pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", + cs->name, wd_delay); + return WD_READ_SKIP; +} + static void clocksource_watchdog(struct timer_list *unused) { - struct clocksource *cs; u64 csnow, wdnow, cslast, wdlast, delta; - int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; + int64_t wd_nsec, cs_nsec; + struct clocksource *cs; + enum wd_read_status read_ret; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -206,10 +275,14 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - local_irq_disable(); - csnow = cs->read(cs); - wdnow = watchdog->read(watchdog); - local_irq_enable(); + read_ret = cs_watchdog_read(cs, &csnow, &wdnow); + + if (read_ret != WD_READ_SUCCESS) { + if (read_ret == WD_READ_UNSTABLE) + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); + continue; + } /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || diff --git a/kernel/tkernel/ttools/ttools_module.c b/kernel/tkernel/ttools/ttools_module.c index a4a7b7c6bdeecc5cb9c0daf4612653b1b3fa9bcd..0ea5efd7e3501309eab0fa62569f53bf3c9bbab4 100644 --- a/kernel/tkernel/ttools/ttools_module.c +++ b/kernel/tkernel/ttools/ttools_module.c @@ -15,6 +15,7 @@ #include #include #include "ttools.h" +#include #define TTOOLS_MINOR 254 #define TTOOLS_VER "2.0" diff --git a/lib/Kconfig b/lib/Kconfig index 3321d04dfa5a5a703909d1f201ffecf75a7767fa..681b7e50490e1b600c1c118534e3a989cffa379a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -605,6 +605,9 @@ config ARCH_NO_SG_CHAIN config ARCH_HAS_PMEM_API bool +config MEMREGION + bool + # use memcpy to implement user copies for nommu architectures config UACCESS_MEMCPY bool diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ee00c6c8a373e437ee397f8f9551bbf0b39be819..af036ce4dcfe86f609bb30489a0963bd92c2bbea 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1069,6 +1069,7 @@ config PROVE_LOCKING select DEBUG_RWSEMS select DEBUG_WW_MUTEX_SLOWPATH select DEBUG_LOCK_ALLOC + select PREEMPT_COUNT if !ARCH_NO_PREEMPT select TRACE_IRQFLAGS default n help diff --git a/lib/Makefile b/lib/Makefile index 3b8977aed1b4cd6f722fa06c90896524466fac63..e4f42e71b32508550bcd5033e91e06a9e2497ec2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -212,6 +212,7 @@ obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o obj-$(CONFIG_SG_SPLIT) += sg_split.o obj-$(CONFIG_SG_POOL) += sg_pool.o +obj-$(CONFIG_MEMREGION) += memregion.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o obj-$(CONFIG_IRQ_POLL) += irq_poll.o diff --git a/lib/devres.c b/lib/devres.c index 77c80ca9e48563aa01c84c5826953ca049a42ff5..36a467ec7f4964d93cfd22eaea1acc56767934c6 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -290,7 +290,7 @@ EXPORT_SYMBOL(devm_ioport_unmap); /* * PCI iomap devres */ -#define PCIM_IOMAP_MAX PCI_ROM_RESOURCE +#define PCIM_IOMAP_MAX PCI_STD_NUM_BARS struct pcim_iomap_devres { void __iomem *table[PCIM_IOMAP_MAX]; diff --git a/lib/genalloc.c b/lib/genalloc.c index 
80d10d02cf388d5a0121e73a12a9ffb3782cf0f1..cc5f615bc9f24749bb786d7de40b71d220f363eb 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -541,7 +541,7 @@ void gen_pool_for_each_chunk(struct gen_pool *pool, EXPORT_SYMBOL(gen_pool_for_each_chunk); /** - * addr_in_gen_pool - checks if an address falls within the range of a pool + * gen_pool_has_addr - checks if an address falls within the range of a pool * @pool: the generic memory pool * @start: start address * @size: size of the region @@ -549,7 +549,7 @@ EXPORT_SYMBOL(gen_pool_for_each_chunk); * Check if the range of addresses falls within the specified pool. Returns * true if the entire range is contained in the pool and false otherwise. */ -bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, +bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start, size_t size) { bool found = false; @@ -568,6 +568,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, rcu_read_unlock(); return found; } +EXPORT_SYMBOL(gen_pool_has_addr); /** * gen_pool_avail - get available free space of the pool diff --git a/lib/hexdump.c b/lib/hexdump.c index 147133f8eb2fcc27994296020b7efb569ae8f928..9007d325e1428336b40ebb1f35dbb962bab60978 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -244,7 +244,7 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; - if (rowsize != 16 && rowsize != 32) + if (rowsize != 8 && rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { diff --git a/lib/memregion.c b/lib/memregion.c new file mode 100644 index 0000000000000000000000000000000000000000..77c85b5251da57a18576b3c0ba96eaf39c0dcc99 --- /dev/null +++ b/lib/memregion.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* identifiers for device / performance-differentiated memory regions */ +#include +#include + +static DEFINE_IDA(memregion_ids); + +int memregion_alloc(gfp_t gfp) +{ + return ida_alloc(&memregion_ids, gfp); +} +EXPORT_SYMBOL(memregion_alloc); + +void memregion_free(int id) +{ + ida_free(&memregion_ids, id); +} +EXPORT_SYMBOL(memregion_free); diff --git a/lib/refcount.c b/lib/refcount.c index 6e904af0fb3e10de360705e7a6c0474da7fb1322..ebac8b7d15a7c5ac297b23e07f2c6ec108b85418 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -1,41 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Variant of atomic_t specialized for reference counts. - * - * The interface matches the atomic_t interface (to aid in porting) but only - * provides the few functions one should use for reference counting. - * - * It differs in that the counter saturates at UINT_MAX and will not move once - * there. This avoids wrapping the counter and causing 'spurious' - * use-after-free issues. - * - * Memory ordering rules are slightly relaxed wrt regular atomic_t functions - * and provide only what is strictly required for refcounts. - * - * The increments are fully relaxed; these will not provide ordering. The - * rationale is that whatever is used to obtain the object we're increasing the - * reference count on will provide the ordering. For locked data structures, - * its the lock acquire, for RCU/lockless data structures its the dependent - * load. - * - * Do note that inc_not_zero() provides a control dependency which will order - * future stores against the inc, this ensures we'll never modify the object - * if we did not in fact acquire a reference. 
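/*
 * Usage sketch, not part of this patch, for the lib/memregion.c helpers
 * added above: a caller grabs a unique, system-wide region id at setup and
 * returns it on teardown.  The example_* names are hypothetical;
 * <linux/memregion.h> is assumed to declare the helpers in this series.
 */
#include <linux/gfp.h>
#include <linux/memregion.h>

static int example_region_id = -1;

static int example_claim_region(void)
{
	int id = memregion_alloc(GFP_KERNEL);	/* >= 0 on success, negative errno on failure */

	if (id < 0)
		return id;

	example_region_id = id;
	return 0;
}

static void example_release_region(void)
{
	if (example_region_id >= 0)
		memregion_free(example_region_id);
}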
- * - * The decrements will provide release order, such that all the prior loads and - * stores will be issued before, it also provides a control dependency, which - * will order us against the subsequent free(). - * - * The control dependency is against the load of the cmpxchg (ll/sc) that - * succeeded. This means the stores aren't fully ordered, but this is fine - * because the 1->0 transition indicates no concurrency. - * - * Note that the allocator is responsible for ordering things between free() - * and alloc(). - * - * The decrements dec_and_test() and sub_and_test() also provide acquire - * ordering on success. - * + * Out-of-line refcount functions. */ #include @@ -43,199 +8,33 @@ #include #include -/** - * refcount_add_not_zero_checked - add a value to a refcount unless it is 0 - * @i: the value to add to the refcount - * @r: the refcount - * - * Will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. - * - * Return: false if the passed refcount is 0, true otherwise - */ -bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - if (!val) - return false; - - if (unlikely(val == UINT_MAX)) - return true; - - new = val + i; - if (new < val) - new = UINT_MAX; - - } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); - - WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); - - return true; -} -EXPORT_SYMBOL(refcount_add_not_zero_checked); - -/** - * refcount_add_checked - add a value to a refcount - * @i: the value to add to the refcount - * @r: the refcount - * - * Similar to atomic_add(), but will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. - */ -void refcount_add_checked(unsigned int i, refcount_t *r) -{ - WARN_ONCE(!refcount_add_not_zero_checked(i, r), "refcount_t: addition on 0; use-after-free.\n"); -} -EXPORT_SYMBOL(refcount_add_checked); - -/** - * refcount_inc_not_zero_checked - increment a refcount unless it is 0 - * @r: the refcount to increment - * - * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. 
- * - * Return: true if the increment was successful, false otherwise - */ -bool refcount_inc_not_zero_checked(refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - new = val + 1; - - if (!val) - return false; - - if (unlikely(!new)) - return true; - - } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); +#define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") - WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); - - return true; -} -EXPORT_SYMBOL(refcount_inc_not_zero_checked); - -/** - * refcount_inc_checked - increment a refcount - * @r: the refcount to increment - * - * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller already has a - * reference on the object. - * - * Will WARN if the refcount is 0, as this represents a possible use-after-free - * condition. - */ -void refcount_inc_checked(refcount_t *r) +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { - WARN_ONCE(!refcount_inc_not_zero_checked(r), "refcount_t: increment on 0; use-after-free.\n"); -} -EXPORT_SYMBOL(refcount_inc_checked); - -/** - * refcount_sub_and_test_checked - subtract from a refcount and test if it is 0 - * @i: amount to subtract from the refcount - * @r: the refcount - * - * Similar to atomic_dec_and_test(), but it will WARN, return false and - * ultimately leak on underflow and will fail to decrement when saturated - * at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides an acquire ordering on success such that free() - * must come after. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_dec(), or one of its variants, should instead be used to - * decrement a reference count. - * - * Return: true if the resulting refcount is 0, false otherwise - */ -bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - if (unlikely(val == UINT_MAX)) - return false; - - new = val - i; - if (new > val) { - WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); - return false; - } - - } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); - - if (!new) { - smp_acquire__after_ctrl_dep(); - return true; + refcount_set(r, REFCOUNT_SATURATED); + + switch (t) { + case REFCOUNT_ADD_NOT_ZERO_OVF: + REFCOUNT_WARN("saturated; leaking memory"); + break; + case REFCOUNT_ADD_OVF: + REFCOUNT_WARN("saturated; leaking memory"); + break; + case REFCOUNT_ADD_UAF: + REFCOUNT_WARN("addition on 0; use-after-free"); + break; + case REFCOUNT_SUB_UAF: + REFCOUNT_WARN("underflow; use-after-free"); + break; + case REFCOUNT_DEC_LEAK: + REFCOUNT_WARN("decrement hit 0; leaking memory"); + break; + default: + REFCOUNT_WARN("unknown saturation event!?"); } - return false; - -} -EXPORT_SYMBOL(refcount_sub_and_test_checked); - -/** - * refcount_dec_and_test_checked - decrement a refcount and test if it is 0 - * @r: the refcount - * - * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to - * decrement when saturated at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides an acquire ordering on success such that free() - * must come after. 
- * - * Return: true if the resulting refcount is 0, false otherwise - */ -bool refcount_dec_and_test_checked(refcount_t *r) -{ - return refcount_sub_and_test_checked(1, r); -} -EXPORT_SYMBOL(refcount_dec_and_test_checked); - -/** - * refcount_dec_checked - decrement a refcount - * @r: the refcount - * - * Similar to atomic_dec(), it will WARN on underflow and fail to decrement - * when saturated at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before. - */ -void refcount_dec_checked(refcount_t *r) -{ - WARN_ONCE(refcount_dec_and_test_checked(r), "refcount_t: decrement hit 0; leaking memory.\n"); } -EXPORT_SYMBOL(refcount_dec_checked); +EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 @@ -277,7 +76,7 @@ bool refcount_dec_not_one(refcount_t *r) unsigned int new, val = atomic_read(&r->refs); do { - if (unlikely(val == UINT_MAX)) + if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) @@ -302,7 +101,7 @@ EXPORT_SYMBOL(refcount_dec_not_one); * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail - * to decrement when saturated at UINT_MAX. + * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. @@ -333,7 +132,7 @@ EXPORT_SYMBOL(refcount_dec_and_mutex_lock); * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to - * decrement when saturated at UINT_MAX. + * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. diff --git a/mm/filemap.c b/mm/filemap.c index 4a830bc940d1874a92d645def3ed636208b83fcf..006d1186ec50222cb02e35090a95635c7879c9c9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3297,10 +3297,6 @@ ssize_t generic_perform_write(struct file *file, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -3327,24 +3323,22 @@ ssize_t generic_perform_write(struct file *file, page, fsdata); if (unlikely(status < 0)) break; - copied = status; cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. 
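/*
 * Rough sketch, not taken from this patch, of how the inline fast paths in
 * <linux/refcount.h> are expected to use the out-of-line
 * refcount_warn_saturate() helper introduced by the lib/refcount.c rework
 * above: the hot path stays a plain relaxed atomic, and only the unlikely
 * zero/overflow cases branch out to the warning.  example_refcount_add()
 * is an illustrative name, not the header's actual implementation.
 */
#include <linux/refcount.h>

static inline void example_refcount_add(int i, refcount_t *r)
{
	int old = atomic_fetch_add_relaxed(i, &r->refs);

	if (unlikely(!old))
		refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
	else if (unlikely(old < 0 || old + i < 0))
		refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}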
*/ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; diff --git a/mm/init-mm.c b/mm/init-mm.c index 19603302a77ffa9745861183d1afeacd547e0730..2c31799d3514e1f0a8c46f3a0ab0378c55320ceb 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -37,5 +38,8 @@ struct mm_struct init_mm = { .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, +#ifdef CONFIG_IOMMU_SVA + .pasid = INVALID_IOASID, +#endif INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 63e188a7d40b2636760bffa1c36a39eb1d5fc09c..e715dd67ce05698cc1215560ed13307b4720b6ab 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -527,6 +528,150 @@ static void collect_procs(struct page *page, struct list_head *tokill, kfree(tk); } +struct hwp_walk { + struct to_kill tk; + unsigned long pfn; + int flags; +}; + +static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) +{ + tk->addr = addr; + tk->size_shift = shift; +} + +static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, + unsigned long poisoned_pfn, struct to_kill *tk) +{ + unsigned long pfn = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + } else { + swp_entry_t swp = pte_to_swp_entry(pte); + + if (is_hwpoison_entry(swp)) + pfn = hwpoison_entry_to_pfn(swp); + } + + if (!pfn || pfn != poisoned_pfn) + return 0; + + set_to_kill(tk, addr, shift); + return 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + pmd_t pmd = *pmdp; + unsigned long pfn; + unsigned long hwpoison_vaddr; + + if (!pmd_present(pmd)) + return 0; + pfn = pmd_pfn(pmd); + if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { + hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); + set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); + return 1; + } + return 0; +} +#else +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + return 0; +} +#endif + +static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + int ret = 0; + pte_t *ptep; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmdp, walk->vma); + if (ptl) { + ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmdp)) + goto out; + + ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + hwp->pfn, &hwp->tk); + if (ret == 1) + break; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(walk->vma); + + return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); +} +#else +#define hwpoison_hugetlb_range NULL +#endif + +static struct mm_walk_ops hwp_walk_ops = 
{ + .pmd_entry = hwpoison_pte_range, + .hugetlb_entry = hwpoison_hugetlb_range, +}; + +/* + * Sends SIGBUS to the current process with error info. + * + * This function is intended to handle "Action Required" MCEs on already + * hardware poisoned pages. They could happen, for example, when + * memory_failure() failed to unmap the error page at the first call, or + * when multiple local machine checks happened on different CPUs. + * + * MCE handler currently has no easy access to the error virtual address, + * so this function walks page table to find it. The returned virtual address + * is proper in most cases, but it could be wrong when the application + * process has multiple entries mapping the error page. + */ +static int kill_accessing_process(struct task_struct *p, unsigned long pfn, + int flags) +{ + int ret; + struct hwp_walk priv = { + .pfn = pfn, + }; + priv.tk.tsk = p; + + down_read(&(p->mm->mmap_sem)); + ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, + (void *)&priv); + if (ret == 1 && priv.tk.addr) + kill_proc(&priv.tk, pfn, flags); + else + ret = 0; + up_read(&(p->mm->mmap_sem)); + return ret > 0 ? -EHWPOISON : -EFAULT; +} + static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -630,6 +775,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, */ static int me_kernel(struct page *p, unsigned long pfn) { + unlock_page(p); return MF_IGNORED; } @@ -639,6 +785,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + unlock_page(p); return MF_FAILED; } @@ -647,6 +794,7 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { + int ret; struct address_space *mapping; delete_from_lru_cache(p); @@ -655,8 +803,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * For anonymous pages we're done the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) - return MF_RECOVERED; + if (PageAnon(p)) { + ret = MF_RECOVERED; + goto out; + } /* * Now truncate the page in the page cache. This is really @@ -670,7 +820,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return MF_FAILED; + ret = MF_FAILED; + goto out; } /* @@ -678,7 +829,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. */ - return truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, pfn, mapping); +out: + unlock_page(p); + return ret; } /* @@ -754,24 +908,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { + int ret; + ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!delete_from_lru_cache(p)) - return MF_DELAYED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + unlock_page(p); + return ret; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { + int ret; + delete_from_swap_cache(p); - if (!delete_from_lru_cache(p)) - return MF_RECOVERED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? 
MF_FAILED : MF_RECOVERED; + unlock_page(p); + return ret; } /* @@ -792,6 +948,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, pfn, mapping); + unlock_page(hpage); } else { unlock_page(hpage); /* @@ -803,7 +960,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) put_page(hpage); dissolve_free_huge_page(p); res = MF_RECOVERED; - lock_page(hpage); } return res; @@ -836,6 +992,8 @@ static struct page_state { unsigned long mask; unsigned long res; enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, @@ -900,6 +1058,7 @@ static int page_action(struct page_state *ps, struct page *p, int result; int count; + /* page p should be unlocked after returning from ps->action(). */ result = ps->action(p, pfn); count = page_count(p) - 1; @@ -1091,7 +1250,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return 0; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, page_to_pfn(head), flags); + return res; } num_poisoned_pages_inc(); @@ -1147,7 +1309,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - res = identify_page_state(pfn, p, page_flags); + return identify_page_state(pfn, p, page_flags); out: unlock_page(head); return res; @@ -1251,8 +1413,9 @@ int memory_failure(unsigned long pfn, int flags) struct page *hpage; struct page *orig_head; struct dev_pagemap *pgmap; - int res; + int res = 0; unsigned long page_flags; + static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1270,12 +1433,20 @@ int memory_failure(unsigned long pfn, int flags) return -ENXIO; } - if (PageHuge(p)) - return memory_failure_hugetlb(pfn, flags); + mutex_lock(&mf_mutex); + + if (PageHuge(p)) { + res = memory_failure_hugetlb(pfn, flags); + goto unlock_mutex; + } + if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return 0; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, pfn, flags); + goto unlock_mutex; } orig_head = hpage = compound_head(p); @@ -1298,8 +1469,9 @@ int memory_failure(unsigned long pfn, int flags) return 0; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); - return -EBUSY; + res = -EBUSY; } + goto unlock_mutex; } if (PageTransHuge(hpage)) { @@ -1315,7 +1487,8 @@ int memory_failure(unsigned long pfn, int flags) if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); put_hwpoison_page(p); - return -EBUSY; + res = -EBUSY; + goto unlock_mutex; } unlock_page(p); VM_BUG_ON_PAGE(!page_count(p), p); @@ -1337,7 +1510,8 @@ int memory_failure(unsigned long pfn, int flags) action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); else action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED); - return 0; + res = 0; + goto unlock_mutex; } lock_page(p); @@ -1349,7 +1523,7 @@ int memory_failure(unsigned long pfn, int flags) if (PageCompound(p) && compound_head(p) != orig_head) { action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } /* @@ -1372,14 +1546,14 @@ int memory_failure(unsigned long pfn, int flags) num_poisoned_pages_dec(); unlock_page(p); put_hwpoison_page(p); 
- return 0; + goto unlock_mutex; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); put_hwpoison_page(p); - return 0; + goto unlock_mutex; } if (!PageTransTail(p) && !PageLRU(p)) @@ -1401,7 +1575,7 @@ int memory_failure(unsigned long pfn, int flags) if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } /* @@ -1410,13 +1584,17 @@ int memory_failure(unsigned long pfn, int flags) if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } identify_page_state: res = identify_page_state(pfn, p, page_flags); -out: + mutex_unlock(&mf_mutex); + return res; +unlock_page: unlock_page(p); +unlock_mutex: + mutex_unlock(&mf_mutex); return res; } EXPORT_SYMBOL_GPL(memory_failure); diff --git a/mm/memory.c b/mm/memory.c index 465c6b6bbeb0d565777f194f7786240fac445e48..8eda1dc25d81d3c8f333abaddb73b4c8133a2962 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4325,6 +4325,73 @@ int follow_pte(struct mm_struct *mm, unsigned long address, } EXPORT_SYMBOL_GPL(follow_pte); +int follow_pte_pud_lockless(struct mm_struct *mm, unsigned long address, + pte_t *ptep, pmd_t *pmdp, pud_t *pudp) +{ + pgd_t *pgd, pgdval; + p4d_t *p4d, p4dval; + pud_t *pud, pudval; + pmd_t *pmd, pmdval; + pte_t *pte, pteval; + + pgd = pgd_offset(mm, address); + pgdval = *pgd; + if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval))) + goto out; + + p4d = p4d_offset(&pgdval, address); + p4dval = *p4d; + if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval))) + goto out; + + pud = pud_offset(&p4dval, address); + pudval = *pud; + if (!pud_present(pudval)) + goto out; + BUG_ON(pud_trans_huge(pudval)); + + if (pud_huge(pudval)) { + if (!pudp) + goto out; + + *pudp = pudval; + return 0; + } + + if (pud_none(pudval) || unlikely(pud_bad(pudval))) + goto out; + + pmd = pmd_offset(&pudval, address); + pmdval = *pmd; + if (!pmd_present(pmdval)) + goto out; + BUG_ON(pmd_trans_huge(pmdval)); + + if (pmd_huge(pmdval)) { + if (!pmdp) + goto out; + + *pmdp = pmdval; + return 0; + } + + if (pmd_none(pmdval) || unlikely(pmd_bad(pmdval))) + goto out; + + pte = pte_offset_map(&pmdval, address); + if (!pte) + goto out; + pteval = *pte; + pte_unmap(pte); + if (!pte_present(pteval)) + goto out; + *ptep = pteval; + return 0; +out: + return -EINVAL; +} +EXPORT_SYMBOL(follow_pte_pud_lockless); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping diff --git a/mm/page-writeback.c b/mm/page-writeback.c index daeae783ddf5ca3a6d7915a03f2e2161117a16ce..3feb9ac606d0571c2494fe9cd627c5ba762d2a7c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2817,7 +2817,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ void wait_on_page_writeback(struct page *page) { - if (PageWriteback(page)) { + while (PageWriteback(page)) { trace_wait_on_page_writeback(page, page_mapping(page)); wait_on_page_bit(page, PG_writeback); } diff --git a/mm/percpu.c b/mm/percpu.c index 806bc16f88eb82529615d529d3e8bfe657790972..63a3f6f0a42d712a704eb9d698e01c0803c98d40 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -270,33 +270,6 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, pcpu_unit_page_offset(cpu, page_idx); } -static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end) -{ - *rs = find_next_zero_bit(bitmap, end, *rs); - *re = find_next_bit(bitmap, end, *rs + 1); -} - -static void 
pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) -{ - *rs = find_next_bit(bitmap, end, *rs); - *re = find_next_zero_bit(bitmap, end, *rs + 1); -} - -/* - * Bitmap region iterators. Iterates over the bitmap between - * [@start, @end) in @chunk. @rs and @re should be integer variables - * and will be set to start and end index of the current free region. - */ -#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end))) - -#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) - /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. @@ -732,9 +705,8 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) } bits = 0; - pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); - } } /** @@ -749,7 +721,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - int rs, re, start; /* region start, region end */ + unsigned int rs, re, start; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { @@ -765,10 +737,9 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) block->right_free = 0; /* iterate over free areas and update the contig hints */ - pcpu_for_each_unpop_region(alloc_map, rs, re, start, - PCPU_BITMAP_BLOCK_BITS) { + bitmap_for_each_clear_region(alloc_map, rs, re, start, + PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, rs, re); - } } /** @@ -1041,13 +1012,13 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { - int page_start, page_end, rs, re; + unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); rs = page_start; - pcpu_next_unpop(chunk->populated, &rs, &re, page_end); + bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); if (rs >= page_end) return true; @@ -1702,13 +1673,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, /* populate if not all pages are already there */ if (!is_atomic) { - int page_start, page_end, rs, re; + unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - pcpu_for_each_unpop_region(chunk->populated, rs, re, - page_start, page_end) { + bitmap_for_each_clear_region(chunk->populated, rs, re, + page_start, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); @@ -1858,10 +1829,10 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &to_free, list) { - int rs, re; + unsigned int rs, re; - pcpu_for_each_pop_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + bitmap_for_each_set_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); 
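/*
 * Sketch, not part of this patch, of the generic bitmap region iterators
 * that replace the open-coded pcpu_for_each_*_region() helpers above:
 * rs/re receive the bounds of each clear (or set) run within [start, end).
 * count_unpopulated() is a hypothetical example caller.
 */
#include <linux/bitmap.h>

static unsigned int count_unpopulated(unsigned long *populated,
				      unsigned int nr_pages)
{
	unsigned int rs, re, nr = 0;

	bitmap_for_each_clear_region(populated, rs, re, 0, nr_pages)
		nr += re - rs;	/* pages in this unpopulated run */

	return nr;
}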
pcpu_chunk_depopulated(chunk, rs, re); @@ -1893,7 +1864,7 @@ static void pcpu_balance_workfn(struct work_struct *work) } for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { - int nr_unpop = 0, rs, re; + unsigned int nr_unpop = 0, rs, re; if (!nr_to_pop) break; @@ -1910,9 +1881,9 @@ static void pcpu_balance_workfn(struct work_struct *work) continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - pcpu_for_each_unpop_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { - int nr = min(re - rs, nr_to_pop); + bitmap_for_each_clear_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { + int nr = min_t(int, re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) { diff --git a/package/default/config.default_kasan b/package/default/config.default_kasan index 4590481565e257343db6473cea5639ab9f301f73..2ff9d4f5d4150fb5815e74ccce8b136df231d1e9 100644 --- a/package/default/config.default_kasan +++ b/package/default/config.default_kasan @@ -157,6 +157,7 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_IOASIDS=y # CONFIG_CGROUP_RDMA is not set CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y @@ -455,6 +456,7 @@ CONFIG_LEGACY_VSYSCALL_EMULATE=y # CONFIG_LEGACY_VSYSCALL_NONE is not set # CONFIG_CMDLINE_BOOL is not set CONFIG_MODIFY_LDT_SYSCALL=y +# CONFIG_STRICT_SIGALTSTACK_SIZE is not set CONFIG_HAVE_LIVEPATCH=y CONFIG_LIVEPATCH=y # end of Processor type and features @@ -536,11 +538,14 @@ CONFIG_ACPI_APEI_MEMORY_FAILURE=y CONFIG_ACPI_APEI_EINJ=m CONFIG_ACPI_APEI_ERST_DEBUG=m # CONFIG_DPTF_POWER is not set +CONFIG_ACPI_PFRU=y +CONFIG_ACPI_PFRU_TELEMETRY=y CONFIG_ACPI_EXTLOG=m CONFIG_ACPI_ADXL=y # CONFIG_PMIC_OPREGION is not set # CONFIG_ACPI_CONFIGFS is not set CONFIG_X86_PM_TIMER=y +CONFIG_ACPI_PRMT=y # CONFIG_SFI is not set # @@ -698,7 +703,8 @@ CONFIG_OPROFILE_EVENT_MULTIPLEX=y CONFIG_HAVE_OPROFILE=y CONFIG_OPROFILE_NMI_TIMER=y CONFIG_KPROBES=y -# CONFIG_JUMP_LABEL is not set +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set CONFIG_OPTPROBES=y CONFIG_KPROBES_ON_FTRACE=y CONFIG_UPROBES=y @@ -2346,6 +2352,7 @@ CONFIG_I40E=m # CONFIG_I40E_DCB is not set CONFIG_IAVF=m CONFIG_I40EVF=m +CONFIG_ICE=m # CONFIG_ICE is not set # CONFIG_FM10K is not set # CONFIG_IGC is not set @@ -3099,6 +3106,7 @@ CONFIG_BCMA_POSSIBLE=y # CONFIG_LPC_SCH is not set # CONFIG_MFD_INTEL_LPSS_ACPI is not set # CONFIG_MFD_INTEL_LPSS_PCI is not set +CONFIG_MFD_INTEL_PMT=y # CONFIG_MFD_JANZ_CMODIO is not set # CONFIG_MFD_KEMPLD is not set # CONFIG_MFD_88PM800 is not set @@ -3779,6 +3787,11 @@ CONFIG_DMA_VIRTUAL_CHANNELS=y CONFIG_DMA_ACPI=y # CONFIG_ALTERA_MSGDMA is not set CONFIG_INTEL_IDMA64=y +CONFIG_INTEL_IDXD_BUS=m +CONFIG_INTEL_IDXD=m +# CONFIG_INTEL_IDXD_COMPAT is not set +CONFIG_INTEL_IDXD_SVM=y +CONFIG_INTEL_IDXD_PERFMON=y CONFIG_INTEL_IOATDMA=y # CONFIG_QCOM_HIDMA_MGMT is not set # CONFIG_QCOM_HIDMA is not set @@ -3794,6 +3807,7 @@ CONFIG_HSU_DMA=y # # CONFIG_ASYNC_TX_DMA is not set # CONFIG_DMATEST is not set +CONFIG_DMATEST=m CONFIG_DMA_ENGINE_RAID=y # @@ -3827,6 +3841,7 @@ CONFIG_VFIO_PCI_MMAP=y CONFIG_VFIO_PCI_INTX=y CONFIG_VFIO_PCI_IGD=y CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_IDXD=m CONFIG_VFIO_MDEV_DEVICE=m CONFIG_IRQ_BYPASS_MANAGER=m CONFIG_VIRT_DRIVERS=y @@ -3929,7 +3944,9 @@ CONFIG_IOMMU_SUPPORT=y # end of Generic IOMMU Pagetable Support # CONFIG_IOMMU_DEBUGFS is not set -# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +# CONFIG_IOMMU_DEFAULT_DMA_STRICT is not set +# 
CONFIG_IOMMU_DEFAULT_DMA_LAZY is not set +CONFIG_IOMMU_DEFAULT_PASSTHROUGH=y CONFIG_AMD_IOMMU=y CONFIG_AMD_IOMMU_V2=m CONFIG_DMAR_TABLE=y @@ -3937,6 +3954,7 @@ CONFIG_INTEL_IOMMU=y CONFIG_INTEL_IOMMU_SVM=y # CONFIG_INTEL_IOMMU_DEFAULT_ON is not set CONFIG_INTEL_IOMMU_FLOPPY_WA=y +CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON=y CONFIG_IRQ_REMAP=y # CONFIG_SMMU_BYPASS_DEV is not set @@ -4602,7 +4620,7 @@ CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m # # Compression # -CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_DEFLATE=y CONFIG_CRYPTO_LZO=y # CONFIG_CRYPTO_842 is not set # CONFIG_CRYPTO_LZ4 is not set @@ -4641,6 +4659,8 @@ CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m CONFIG_CRYPTO_DEV_QAT_C62XVF=m # CONFIG_CRYPTO_DEV_NITROX_CNN55XX is not set +CONFIG_CRYPTO_DEV_IAX_CRYPTO=m +CONFIG_CRYPTO_DEV_IAX_CRYPTO_STATS=y # CONFIG_CRYPTO_DEV_CHELSIO is not set # CONFIG_CRYPTO_DEV_CHELSIO_TLS is not set CONFIG_CRYPTO_DEV_VIRTIO=m diff --git a/samples/Kconfig b/samples/Kconfig index c8dacb4dda80c0d44548e81391bd4a65f276048b..e6663b91dbae7bcf0ea2b0d1e7f3232b6758b094 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -118,14 +118,14 @@ config SAMPLE_SECCOMP config SAMPLE_VFIO_MDEV_MTTY tristate "Build VFIO mtty example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m help Build a virtual tty sample driver for use as a VFIO mediated device config SAMPLE_VFIO_MDEV_MDPY tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m help Build a virtual display sample driver for use as a VFIO mediated device. It is a simple framebuffer and supports @@ -142,7 +142,7 @@ config SAMPLE_VFIO_MDEV_MDPY_FB config SAMPLE_VFIO_MDEV_MBOCHS tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + depends on VFIO_MDEV && m select DMA_SHARED_BUFFER help Build a virtual display sample driver for use as a VFIO diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index ac5c8c17b1ff21e99e69372f82b6c0eac2a2318d..8fcb640375f573bd731c56245b83b0d7e37900ab 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -129,7 +129,8 @@ static dev_t mbochs_devt; static struct class *mbochs_class; static struct cdev mbochs_cdev; static struct device mbochs_dev; -static int mbochs_used_mbytes; +static atomic_t mbochs_avail_mbytes; +static const struct vfio_device_ops mbochs_dev_ops; struct vfio_region_info_ext { struct vfio_region_info base; @@ -160,6 +161,7 @@ struct mbochs_dmabuf { /* State of each mdev device */ struct mdev_state { + struct vfio_device vdev; u8 *vconfig; u64 bar_mask[3]; u32 memory_bar_mask; @@ -205,16 +207,6 @@ static struct page *__mbochs_get_page(struct mdev_state *mdev_state, static struct page *mbochs_get_page(struct mdev_state *mdev_state, pgoff_t pgoff); -static const struct mbochs_type *mbochs_find_type(struct kobject *kobj) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mbochs_types); i++) - if (strcmp(mbochs_types[i].name, kobj->name) == 0) - return mbochs_types + i; - return NULL; -} - static void mbochs_create_config_space(struct mdev_state *mdev_state) { STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], @@ -435,11 +427,9 @@ static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset, memcpy(buf, mdev_state->edid_blob + offset, count); } -static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, - loff_t pos, bool 
is_write) +static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf, + size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); struct page *pg; loff_t poff; char *map; @@ -488,7 +478,7 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, put_page(pg); } else { - dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n", + dev_dbg(mdev_state->vdev.dev, "%s: %s @0x%llx (unhandled)\n", __func__, is_write ? "WR" : "RD", pos); ret = -1; goto accessfailed; @@ -503,9 +493,8 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, return ret; } -static int mbochs_reset(struct mdev_device *mdev) +static int mbochs_reset(struct mdev_state *mdev_state) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); u32 size64k = mdev_state->memsize / (64 * 1024); int i; @@ -516,20 +505,25 @@ static int mbochs_reset(struct mdev_device *mdev) return 0; } -static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) +static int mbochs_probe(struct mdev_device *mdev) { - const struct mbochs_type *type = mbochs_find_type(kobj); + int avail_mbytes = atomic_read(&mbochs_avail_mbytes); + const struct mbochs_type *type = + &mbochs_types[mdev_get_type_group_id(mdev)]; struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; + int ret = -ENOMEM; - if (!type) - type = &mbochs_types[0]; - if (type->mbytes + mbochs_used_mbytes > max_mbytes) - return -ENOMEM; + do { + if (avail_mbytes < type->mbytes) + return -ENOSPC; + } while (!atomic_try_cmpxchg(&mbochs_avail_mbytes, &avail_mbytes, + avail_mbytes - type->mbytes)); mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); if (mdev_state == NULL) - return -ENOMEM; + goto err_avail; + vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mbochs_dev_ops); mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); if (mdev_state->vconfig == NULL) @@ -544,11 +538,10 @@ static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) goto err_mem; dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__, - kobj->name, type->mbytes, mdev_state->pagecount); + type->name, type->mbytes, mdev_state->pagecount); mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); INIT_LIST_HEAD(&mdev_state->dmabufs); mdev_state->next_id = 1; @@ -558,32 +551,40 @@ static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET; mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob); mbochs_create_config_space(mdev_state); - mbochs_reset(mdev); + mbochs_reset(mdev_state); - mbochs_used_mbytes += type->mbytes; + ret = vfio_register_group_dev(&mdev_state->vdev); + if (ret) + goto err_mem; + dev_set_drvdata(&mdev->dev, mdev_state); return 0; - err_mem: + vfio_uninit_group_dev(&mdev_state->vdev); + kfree(mdev_state->pages); kfree(mdev_state->vconfig); kfree(mdev_state); - return -ENOMEM; +err_avail: + atomic_add(type->mbytes, &mbochs_avail_mbytes); + return ret; } -static int mbochs_remove(struct mdev_device *mdev) +static void mbochs_remove(struct mdev_device *mdev) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); - mbochs_used_mbytes -= mdev_state->type->mbytes; - mdev_set_drvdata(mdev, NULL); + vfio_unregister_group_dev(&mdev_state->vdev); + vfio_uninit_group_dev(&mdev_state->vdev); + atomic_add(mdev_state->type->mbytes, 
&mbochs_avail_mbytes); kfree(mdev_state->pages); kfree(mdev_state->vconfig); kfree(mdev_state); - return 0; } -static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, +static ssize_t mbochs_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -593,7 +594,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -605,7 +606,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -617,7 +618,7 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -640,9 +641,11 @@ static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, return -EFAULT; } -static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mbochs_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -655,7 +658,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -667,7 +670,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -679,7 +682,7 @@ static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -765,9 +768,10 @@ static const struct vm_operations_struct mbochs_region_vm_ops = { .fault = mbochs_region_vm_fault, }; -static int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +static int mbochs_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT) return -EINVAL; @@ -989,7 +993,7 @@ mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id) static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) { struct mdev_state *mdev_state = dmabuf->mdev_state; - struct device *dev = mdev_dev(mdev_state->mdev); + struct device *dev = mdev_state->vdev.dev; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); struct dma_buf *buf; @@ -1017,15 +1021,10 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) return 0; } 
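The probe/remove pair above replaces the old driver-global mbochs_used_mbytes counter with a lock-free reservation against mbochs_avail_mbytes; the same pattern reappears in mtty_probe() further down in this patch. On a lost race, atomic_try_cmpxchg() reloads the current counter value into its "old" argument, so the capacity check is re-evaluated on every retry without taking a lock. A minimal self-contained sketch of the pattern (the pool size and helper names are hypothetical, not part of this patch):

#include <linux/atomic.h>
#include <linux/errno.h>

static atomic_t avail_units = ATOMIC_INIT(64);	/* pool capacity */

/* Atomically reserve @want units, or fail with no side effects. */
static int reserve_units(int want)
{
	int avail = atomic_read(&avail_units);

	do {
		if (avail < want)
			return -ENOSPC;	/* nothing was reserved */
		/* on failure, atomic_try_cmpxchg() refreshes 'avail' */
	} while (!atomic_try_cmpxchg(&avail_units, &avail, avail - want));

	return 0;			/* @want units now reserved */
}

/* Give units back on a later probe error or on device removal. */
static void release_units(int count)
{
	atomic_add(count, &avail_units);
}

This is why the error paths in mbochs_probe() and mtty_probe() end with atomic_add(): a successful reservation must be undone explicitly once any later setup step fails.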
-static int mbochs_get_region_info(struct mdev_device *mdev, +static int mbochs_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info_ext *ext) { struct vfio_region_info *region_info = &ext->base; - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; if (region_info->index >= MBOCHS_NUM_REGIONS) return -EINVAL; @@ -1073,15 +1072,13 @@ static int mbochs_get_region_info(struct mdev_device *mdev, return 0; } -static int mbochs_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mbochs_get_irq_info(struct vfio_irq_info *irq_info) { irq_info->count = 0; return 0; } -static int mbochs_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mbochs_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = MBOCHS_NUM_REGIONS; @@ -1089,11 +1086,9 @@ static int mbochs_get_device_info(struct mdev_device *mdev, return 0; } -static int mbochs_query_gfx_plane(struct mdev_device *mdev, +static int mbochs_query_gfx_plane(struct mdev_state *mdev_state, struct vfio_device_gfx_plane_info *plane) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); struct mbochs_dmabuf *dmabuf; struct mbochs_mode mode; int ret; @@ -1147,18 +1142,16 @@ static int mbochs_query_gfx_plane(struct mdev_device *mdev, done: if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY && mdev_state->active_id != plane->dmabuf_id) { - dev_dbg(dev, "%s: primary: %d => %d\n", __func__, - mdev_state->active_id, plane->dmabuf_id); + dev_dbg(mdev_state->vdev.dev, "%s: primary: %d => %d\n", + __func__, mdev_state->active_id, plane->dmabuf_id); mdev_state->active_id = plane->dmabuf_id; } mutex_unlock(&mdev_state->ops_lock); return 0; } -static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev, - u32 id) +static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); struct mbochs_dmabuf *dmabuf; mutex_lock(&mdev_state->ops_lock); @@ -1180,9 +1173,11 @@ static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev, return dma_buf_fd(dmabuf->buf, 0); } -static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, - unsigned long arg) +static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); int ret = 0; unsigned long minsz, outsz; @@ -1199,7 +1194,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mbochs_get_device_info(mdev, &info); + ret = mbochs_get_device_info(&info); if (ret) return ret; @@ -1223,7 +1218,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (outsz > sizeof(info)) return -EINVAL; - ret = mbochs_get_region_info(mdev, &info); + ret = mbochs_get_region_info(mdev_state, &info); if (ret) return ret; @@ -1246,7 +1241,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= VFIO_PCI_NUM_IRQS)) return -EINVAL; - ret = mbochs_get_irq_info(mdev, &info); + ret = mbochs_get_irq_info(&info); if (ret) return ret; @@ -1269,7 +1264,7 @@ static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (plane.argsz < minsz) return -EINVAL; - ret = mbochs_query_gfx_plane(mdev, &plane); + ret = mbochs_query_gfx_plane(mdev_state, &plane); if (ret) return ret; @@ -1286,29 +1281,22 @@ static long 
mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, if (get_user(dmabuf_id, (__u32 __user *)arg)) return -EFAULT; - return mbochs_get_gfx_dmabuf(mdev, dmabuf_id); + return mbochs_get_gfx_dmabuf(mdev_state, dmabuf_id); } case VFIO_DEVICE_SET_IRQS: return -EINVAL; case VFIO_DEVICE_RESET: - return mbochs_reset(mdev); + return mbochs_reset(mdev_state); } return -ENOTTY; } -static int mbochs_open(struct mdev_device *mdev) +static void mbochs_close_device(struct vfio_device *vdev) { - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - return 0; -} - -static void mbochs_close(struct mdev_device *mdev) -{ - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); struct mbochs_dmabuf *dmabuf, *tmp; mutex_lock(&mdev_state->ops_lock); @@ -1325,15 +1313,13 @@ static void mbochs_close(struct mdev_device *mdev) mbochs_put_pages(mdev_state); mutex_unlock(&mdev_state->ops_lock); - module_put(THIS_MODULE); } static ssize_t memory_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct mdev_device *mdev = mdev_from_dev(dev); - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(dev); return sprintf(buf, "%d MB\n", mdev_state->type->mbytes); } @@ -1349,44 +1335,50 @@ static const struct attribute_group mdev_dev_group = { .attrs = mdev_dev_attrs, }; -const struct attribute_group *mdev_dev_groups[] = { +static const struct attribute_group *mdev_dev_groups[] = { &mdev_dev_group, NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t name_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", kobj->name); + const struct mbochs_type *type = + &mbochs_types[mtype_get_type_group_id(mtype)]; + + return sprintf(buf, "%s\n", type->name); } -MDEV_TYPE_ATTR_RO(name); +static MDEV_TYPE_ATTR_RO(name); -static ssize_t -description_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t description_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { - const struct mbochs_type *type = mbochs_find_type(kobj); + const struct mbochs_type *type = + &mbochs_types[mtype_get_type_group_id(mtype)]; return sprintf(buf, "virtual display, %d MB video memory\n", type ? 
type->mbytes : 0); } -MDEV_TYPE_ATTR_RO(description); +static MDEV_TYPE_ATTR_RO(description); -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t available_instances_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) { - const struct mbochs_type *type = mbochs_find_type(kobj); - int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes; + const struct mbochs_type *type = + &mbochs_types[mtype_get_type_group_id(mtype)]; + int count = atomic_read(&mbochs_avail_mbytes) / type->mbytes; return sprintf(buf, "%d\n", count); } -MDEV_TYPE_ATTR_RO(available_instances); +static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) +static ssize_t device_api_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } -MDEV_TYPE_ATTR_RO(device_api); +static MDEV_TYPE_ATTR_RO(device_api); static struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, @@ -1418,18 +1410,29 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; +static const struct vfio_device_ops mbochs_dev_ops = { + .close_device = mbochs_close_device, + .read = mbochs_read, + .write = mbochs_write, + .ioctl = mbochs_ioctl, + .mmap = mbochs_mmap, +}; + +static struct mdev_driver mbochs_driver = { + .driver = { + .name = "mbochs", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mbochs_probe, + .remove = mbochs_remove, +}; + static const struct mdev_parent_ops mdev_fops = { .owner = THIS_MODULE, - .mdev_attr_groups = mdev_dev_groups, + .device_driver = &mbochs_driver, .supported_type_groups = mdev_type_groups, - .create = mbochs_create, - .remove = mbochs_remove, - .open = mbochs_open, - .release = mbochs_close, - .read = mbochs_read, - .write = mbochs_write, - .ioctl = mbochs_ioctl, - .mmap = mbochs_mmap, }; static const struct file_operations vd_fops = { @@ -1445,6 +1448,8 @@ static int __init mbochs_dev_init(void) { int ret = 0; + atomic_set(&mbochs_avail_mbytes, max_mbytes); + ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME); if (ret < 0) { pr_err("Error: failed to register mbochs_dev, err: %d\n", ret); @@ -1454,11 +1459,15 @@ static int __init mbochs_dev_init(void) cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK + 1); pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt)); + ret = mdev_register_driver(&mbochs_driver); + if (ret) + goto err_cdev; + mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME); if (IS_ERR(mbochs_class)) { pr_err("Error: failed to register mbochs_dev class\n"); ret = PTR_ERR(mbochs_class); - goto failed1; + goto err_driver; } mbochs_dev.class = mbochs_class; mbochs_dev.release = mbochs_device_release; @@ -1466,19 +1475,21 @@ static int __init mbochs_dev_init(void) ret = device_register(&mbochs_dev); if (ret) - goto failed2; + goto err_class; ret = mdev_register_device(&mbochs_dev, &mdev_fops); if (ret) - goto failed3; + goto err_device; return 0; -failed3: +err_device: device_unregister(&mbochs_dev); -failed2: +err_class: class_destroy(mbochs_class); -failed1: +err_driver: + mdev_unregister_driver(&mbochs_driver); +err_cdev: cdev_del(&mbochs_cdev); unregister_chrdev_region(mbochs_devt, MINORMASK + 1); return ret; @@ -1490,6 +1501,7 @@ static void __exit mbochs_dev_exit(void) mdev_unregister_device(&mbochs_dev); device_unregister(&mbochs_dev); + 
mdev_unregister_driver(&mbochs_driver); cdev_del(&mbochs_cdev); unregister_chrdev_region(mbochs_devt, MINORMASK + 1); class_destroy(mbochs_class); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 9894693f3be17817345afd206e277d4e9421f41f..8d1a80a0722aa9bc544e490e7b5949c863783ebb 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -85,9 +85,11 @@ static struct class *mdpy_class; static struct cdev mdpy_cdev; static struct device mdpy_dev; static u32 mdpy_count; +static const struct vfio_device_ops mdpy_dev_ops; /* State of each mdev device */ struct mdev_state { + struct vfio_device vdev; u8 *vconfig; u32 bar_mask; struct mutex ops_lock; @@ -99,16 +101,6 @@ struct mdev_state { void *memblk; }; -static const struct mdpy_type *mdpy_find_type(struct kobject *kobj) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mdpy_types); i++) - if (strcmp(mdpy_types[i].name, kobj->name) == 0) - return mdpy_types + i; - return NULL; -} - static void mdpy_create_config_space(struct mdev_state *mdev_state) { STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], @@ -172,11 +164,9 @@ static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, } } -static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, - loff_t pos, bool is_write) +static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf, + size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); int ret = 0; mutex_lock(&mdev_state->ops_lock); @@ -197,8 +187,9 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, memcpy(buf, mdev_state->memblk, count); } else { - dev_info(dev, "%s: %s @0x%llx (unhandled)\n", - __func__, is_write ? "WR" : "RD", pos); + dev_info(mdev_state->vdev.dev, + "%s: %s @0x%llx (unhandled)\n", __func__, + is_write ? 
"WR" : "RD", pos); ret = -1; goto accessfailed; } @@ -212,9 +203,8 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, return ret; } -static int mdpy_reset(struct mdev_device *mdev) +static int mdpy_reset(struct mdev_state *mdev_state) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); u32 stride, i; /* initialize with gray gradient */ @@ -226,12 +216,14 @@ static int mdpy_reset(struct mdev_device *mdev) return 0; } -static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) +static int mdpy_probe(struct mdev_device *mdev) { - const struct mdpy_type *type = mdpy_find_type(kobj); + const struct mdpy_type *type = + &mdpy_types[mdev_get_type_group_id(mdev)]; struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; u32 fbsize; + int ret; if (mdpy_count >= max_devices) return -ENOMEM; @@ -239,58 +231,68 @@ static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); if (mdev_state == NULL) return -ENOMEM; + vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mdpy_dev_ops); mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); if (mdev_state->vconfig == NULL) { - kfree(mdev_state); - return -ENOMEM; + ret = -ENOMEM; + goto err_state; } - if (!type) - type = &mdpy_types[0]; fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); mdev_state->memblk = vmalloc_user(fbsize); if (!mdev_state->memblk) { - kfree(mdev_state->vconfig); - kfree(mdev_state); - return -ENOMEM; + ret = -ENOMEM; + goto err_vconfig; } - dev_info(dev, "%s: %s (%dx%d)\n", - __func__, kobj->name, type->width, type->height); + dev_info(dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, + type->height); mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); - mdev_state->type = type; mdev_state->memsize = fbsize; mdpy_create_config_space(mdev_state); - mdpy_reset(mdev); + mdpy_reset(mdev_state); mdpy_count++; + + ret = vfio_register_group_dev(&mdev_state->vdev); + if (ret) + goto err_mem; + dev_set_drvdata(&mdev->dev, mdev_state); return 0; +err_mem: + vfree(mdev_state->memblk); +err_vconfig: + kfree(mdev_state->vconfig); +err_state: + vfio_uninit_group_dev(&mdev_state->vdev); + kfree(mdev_state); + return ret; } -static int mdpy_remove(struct mdev_device *mdev) +static void mdpy_remove(struct mdev_device *mdev) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - struct device *dev = mdev_dev(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); - dev_info(dev, "%s\n", __func__); + dev_info(&mdev->dev, "%s\n", __func__); - mdev_set_drvdata(mdev, NULL); + vfio_unregister_group_dev(&mdev_state->vdev); vfree(mdev_state->memblk); kfree(mdev_state->vconfig); + vfio_uninit_group_dev(&mdev_state->vdev); kfree(mdev_state); mdpy_count--; - return 0; } -static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, +static ssize_t mdpy_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -300,8 +302,8 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), - *ppos, false); + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); if (ret <= 0) goto read_err; @@ -312,7 +314,7 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char 
__user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -324,7 +326,7 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -347,9 +349,11 @@ static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, return -EFAULT; } -static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mdpy_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -362,7 +366,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -374,7 +378,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -386,7 +390,7 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -404,9 +408,10 @@ static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, return -EFAULT; } -static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT) return -EINVAL; @@ -417,21 +422,13 @@ static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; - return remap_vmalloc_range_partial(vma, vma->vm_start, - mdev_state->memblk, 0, - vma->vm_end - vma->vm_start); + return remap_vmalloc_range(vma, mdev_state->memblk, 0); } -static int mdpy_get_region_info(struct mdev_device *mdev, +static int mdpy_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info *region_info, u16 *cap_type_id, void **cap_type) { - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; - if (region_info->index >= VFIO_PCI_NUM_REGIONS && region_info->index != MDPY_DISPLAY_REGION) return -EINVAL; @@ -460,15 +457,13 @@ static int mdpy_get_region_info(struct mdev_device *mdev, return 0; } -static int mdpy_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mdpy_get_irq_info(struct vfio_irq_info *irq_info) { irq_info->count = 0; return 0; } -static int mdpy_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mdpy_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; 
dev_info->num_regions = VFIO_PCI_NUM_REGIONS; @@ -476,11 +471,9 @@ static int mdpy_get_device_info(struct mdev_device *mdev, return 0; } -static int mdpy_query_gfx_plane(struct mdev_device *mdev, +static int mdpy_query_gfx_plane(struct mdev_state *mdev_state, struct vfio_device_gfx_plane_info *plane) { - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | VFIO_GFX_PLANE_TYPE_REGION)) @@ -509,14 +502,13 @@ static int mdpy_query_gfx_plane(struct mdev_device *mdev, return 0; } -static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, +static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { int ret = 0; unsigned long minsz; - struct mdev_state *mdev_state; - - mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); switch (cmd) { case VFIO_DEVICE_GET_INFO: @@ -531,7 +523,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mdpy_get_device_info(mdev, &info); + ret = mdpy_get_device_info(&info); if (ret) return ret; @@ -556,7 +548,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mdpy_get_region_info(mdev, &info, &cap_type_id, + ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); if (ret) return ret; @@ -580,7 +572,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= mdev_state->dev_info.num_irqs)) return -EINVAL; - ret = mdpy_get_irq_info(mdev, &info); + ret = mdpy_get_irq_info(&info); if (ret) return ret; @@ -603,7 +595,7 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, if (plane.argsz < minsz) return -EINVAL; - ret = mdpy_query_gfx_plane(mdev, &plane); + ret = mdpy_query_gfx_plane(mdev_state, &plane); if (ret) return ret; @@ -617,30 +609,16 @@ static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, return -EINVAL; case VFIO_DEVICE_RESET: - return mdpy_reset(mdev); + return mdpy_reset(mdev_state); } return -ENOTTY; } -static int mdpy_open(struct mdev_device *mdev) -{ - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - return 0; -} - -static void mdpy_close(struct mdev_device *mdev) -{ - module_put(THIS_MODULE); -} - static ssize_t resolution_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct mdev_device *mdev = mdev_from_dev(dev); - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + struct mdev_state *mdev_state = dev_get_drvdata(dev); return sprintf(buf, "%dx%d\n", mdev_state->type->width, @@ -658,42 +636,46 @@ static const struct attribute_group mdev_dev_group = { .attrs = mdev_dev_attrs, }; -const struct attribute_group *mdev_dev_groups[] = { +static const struct attribute_group *mdev_dev_groups[] = { &mdev_dev_group, NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t name_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", kobj->name); + const struct mdpy_type *type = + &mdpy_types[mtype_get_type_group_id(mtype)]; + + return sprintf(buf, "%s\n", type->name); } -MDEV_TYPE_ATTR_RO(name); +static MDEV_TYPE_ATTR_RO(name); -static ssize_t -description_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t description_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { - const 
struct mdpy_type *type = mdpy_find_type(kobj); + const struct mdpy_type *type = + &mdpy_types[mtype_get_type_group_id(mtype)]; return sprintf(buf, "virtual display, %dx%d framebuffer\n", - type ? type->width : 0, - type ? type->height : 0); + type->width, type->height); } -MDEV_TYPE_ATTR_RO(description); +static MDEV_TYPE_ATTR_RO(description); -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t available_instances_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", max_devices - mdpy_count); } -MDEV_TYPE_ATTR_RO(available_instances); +static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) +static ssize_t device_api_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } -MDEV_TYPE_ATTR_RO(device_api); +static MDEV_TYPE_ATTR_RO(device_api); static struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, @@ -725,18 +707,28 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; +static const struct vfio_device_ops mdpy_dev_ops = { + .read = mdpy_read, + .write = mdpy_write, + .ioctl = mdpy_ioctl, + .mmap = mdpy_mmap, +}; + +static struct mdev_driver mdpy_driver = { + .driver = { + .name = "mdpy", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mdpy_probe, + .remove = mdpy_remove, +}; + static const struct mdev_parent_ops mdev_fops = { .owner = THIS_MODULE, - .mdev_attr_groups = mdev_dev_groups, + .device_driver = &mdpy_driver, .supported_type_groups = mdev_type_groups, - .create = mdpy_create, - .remove = mdpy_remove, - .open = mdpy_open, - .release = mdpy_close, - .read = mdpy_read, - .write = mdpy_write, - .ioctl = mdpy_ioctl, - .mmap = mdpy_mmap, }; static const struct file_operations vd_fops = { @@ -761,11 +753,15 @@ static int __init mdpy_dev_init(void) cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK + 1); pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt)); + ret = mdev_register_driver(&mdpy_driver); + if (ret) + goto err_cdev; + mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME); if (IS_ERR(mdpy_class)) { pr_err("Error: failed to register mdpy_dev class\n"); ret = PTR_ERR(mdpy_class); - goto failed1; + goto err_driver; } mdpy_dev.class = mdpy_class; mdpy_dev.release = mdpy_device_release; @@ -773,19 +769,21 @@ static int __init mdpy_dev_init(void) ret = device_register(&mdpy_dev); if (ret) - goto failed2; + goto err_class; ret = mdev_register_device(&mdpy_dev, &mdev_fops); if (ret) - goto failed3; + goto err_device; return 0; -failed3: +err_device: device_unregister(&mdpy_dev); -failed2: +err_class: class_destroy(mdpy_class); -failed1: +err_driver: + mdev_unregister_driver(&mdpy_driver); +err_cdev: cdev_del(&mdpy_cdev); unregister_chrdev_region(mdpy_devt, MINORMASK + 1); return ret; @@ -797,6 +795,7 @@ static void __exit mdpy_dev_exit(void) mdev_unregister_device(&mdpy_dev); device_unregister(&mdpy_dev); + mdev_unregister_driver(&mdpy_driver); cdev_del(&mdpy_cdev); unregister_chrdev_region(mdpy_devt, MINORMASK + 1); class_destroy(mdpy_class); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index ce84a300a4dafd2e6669ef55c5ca6c46d5d05e1e..5983cdb16e3d1d6cc388325868fd9c2fa8a292e1 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -127,6 +127,7 @@ struct serial_port { /* State of each mdev device */ 
struct mdev_state { + struct vfio_device vdev; int irq_fd; struct eventfd_ctx *intx_evtfd; struct eventfd_ctx *msi_evtfd; @@ -143,13 +144,14 @@ struct mdev_state { int nr_ports; }; -static struct mutex mdev_list_lock; -static struct list_head mdev_devices_list; +static atomic_t mdev_avail_ports = ATOMIC_INIT(MAX_MTTYS); static const struct file_operations vd_fops = { .owner = THIS_MODULE, }; +static const struct vfio_device_ops mtty_dev_ops; + /* function prototypes */ static int mtty_trigger_interrupt(struct mdev_state *mdev_state); @@ -631,23 +633,16 @@ static void mdev_read_base(struct mdev_state *mdev_state) } } -static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, +static ssize_t mdev_access(struct mdev_state *mdev_state, u8 *buf, size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state; unsigned int index; loff_t offset; int ret = 0; - if (!mdev || !buf) + if (!buf) return -EINVAL; - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) { - pr_err("%s mdev_state not found\n", __func__); - return -EINVAL; - } - mutex_lock(&mdev_state->ops_lock); index = MTTY_VFIO_PCI_OFFSET_TO_INDEX(pos); @@ -708,30 +703,26 @@ static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, return ret; } -static int mtty_create(struct kobject *kobj, struct mdev_device *mdev) +static int mtty_probe(struct mdev_device *mdev) { struct mdev_state *mdev_state; - char name[MTTY_STRING_LEN]; - int nr_ports = 0, i; + int nr_ports = mdev_get_type_group_id(mdev) + 1; + int avail_ports = atomic_read(&mdev_avail_ports); + int ret; - if (!mdev) - return -EINVAL; + do { + if (avail_ports < nr_ports) + return -ENOSPC; + } while (!atomic_try_cmpxchg(&mdev_avail_ports, + &avail_ports, avail_ports - nr_ports)); - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(mdev_parent_dev(mdev)), i + 1); - if (!strcmp(kobj->name, name)) { - nr_ports = i + 1; - break; - } + mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); + if (mdev_state == NULL) { + ret = -ENOMEM; + goto err_nr_ports; } - if (!nr_ports) - return -EINVAL; - - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) - return -ENOMEM; + vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mtty_dev_ops); mdev_state->nr_ports = nr_ports; mdev_state->irq_index = -1; @@ -741,64 +732,56 @@ static int mtty_create(struct kobject *kobj, struct mdev_device *mdev) mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); if (mdev_state->vconfig == NULL) { - kfree(mdev_state); - return -ENOMEM; + ret = -ENOMEM; + goto err_state; } mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); mtty_create_config_space(mdev_state); - mutex_lock(&mdev_list_lock); - list_add(&mdev_state->next, &mdev_devices_list); - mutex_unlock(&mdev_list_lock); - + ret = vfio_register_group_dev(&mdev_state->vdev); + if (ret) + goto err_vconfig; + dev_set_drvdata(&mdev->dev, mdev_state); return 0; -} - -static int mtty_remove(struct mdev_device *mdev) -{ - struct mdev_state *mds, *tmp_mds; - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - int ret = -EINVAL; - - mutex_lock(&mdev_list_lock); - list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) { - if (mdev_state == mds) { - list_del(&mdev_state->next); - mdev_set_drvdata(mdev, NULL); - kfree(mdev_state->vconfig); - kfree(mdev_state); - ret = 0; - break; - } - } - mutex_unlock(&mdev_list_lock); +err_vconfig: + kfree(mdev_state->vconfig); +err_state: 
+ vfio_uninit_group_dev(&mdev_state->vdev); + kfree(mdev_state); +err_nr_ports: + atomic_add(nr_ports, &mdev_avail_ports); return ret; } -static int mtty_reset(struct mdev_device *mdev) +static void mtty_remove(struct mdev_device *mdev) { - struct mdev_state *mdev_state; + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + int nr_ports = mdev_state->nr_ports; - if (!mdev) - return -EINVAL; + vfio_unregister_group_dev(&mdev_state->vdev); - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; + kfree(mdev_state->vconfig); + vfio_uninit_group_dev(&mdev_state->vdev); + kfree(mdev_state); + atomic_add(nr_ports, &mdev_avail_ports); +} +static int mtty_reset(struct mdev_state *mdev_state) +{ pr_info("%s: called\n", __func__); return 0; } -static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, +static ssize_t mtty_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -808,7 +791,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -820,7 +803,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -832,7 +815,7 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, } else { u8 val; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -855,9 +838,11 @@ static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, return -EFAULT; } -static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mtty_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -870,7 +855,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -882,7 +867,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -894,7 +879,7 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (u8 *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -912,19 +897,11 @@ static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, return -EFAULT; } -static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, +static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, unsigned int index, unsigned int start, unsigned int count, void *data) { int 
ret = 0; - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; mutex_lock(&mdev_state->ops_lock); switch (index) { @@ -1040,21 +1017,13 @@ static int mtty_trigger_interrupt(struct mdev_state *mdev_state) return ret; } -static int mtty_get_region_info(struct mdev_device *mdev, +static int mtty_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info *region_info, u16 *cap_type_id, void **cap_type) { unsigned int size = 0; - struct mdev_state *mdev_state; u32 bar_index; - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; - bar_index = region_info->index; if (bar_index >= VFIO_PCI_NUM_REGIONS) return -EINVAL; @@ -1089,8 +1058,7 @@ static int mtty_get_region_info(struct mdev_device *mdev, return 0; } -static int mtty_get_irq_info(struct mdev_device *mdev, - struct vfio_irq_info *irq_info) +static int mtty_get_irq_info(struct vfio_irq_info *irq_info) { switch (irq_info->index) { case VFIO_PCI_INTX_IRQ_INDEX: @@ -1114,8 +1082,7 @@ static int mtty_get_irq_info(struct mdev_device *mdev, return 0; } -static int mtty_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mtty_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = VFIO_PCI_NUM_REGIONS; @@ -1124,19 +1091,13 @@ static int mtty_get_device_info(struct mdev_device *mdev, return 0; } -static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, +static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); int ret = 0; unsigned long minsz; - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -ENODEV; switch (cmd) { case VFIO_DEVICE_GET_INFO: @@ -1151,7 +1112,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mtty_get_device_info(mdev, &info); + ret = mtty_get_device_info(&info); if (ret) return ret; @@ -1176,7 +1137,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mtty_get_region_info(mdev, &info, &cap_type_id, + ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); if (ret) return ret; @@ -1200,7 +1161,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= mdev_state->dev_info.num_irqs)) return -EINVAL; - ret = mtty_get_irq_info(mdev, &info); + ret = mtty_get_irq_info(&info); if (ret) return ret; @@ -1234,29 +1195,18 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, return PTR_ERR(data); } - ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start, + ret = mtty_set_irqs(mdev_state, hdr.flags, hdr.index, hdr.start, hdr.count, data); kfree(ptr); return ret; } case VFIO_DEVICE_RESET: - return mtty_reset(mdev); + return mtty_reset(mdev_state); } return -ENOTTY; } -static int mtty_open(struct mdev_device *mdev) -{ - pr_info("%s\n", __func__); - return 0; -} - -static void mtty_close(struct mdev_device *mdev) -{ - pr_info("%s\n", __func__); -} - static ssize_t sample_mtty_dev_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1308,56 +1258,31 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct 
device *dev, char *buf) +static ssize_t name_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { - char name[MTTY_STRING_LEN]; - int i; - const char *name_str[2] = {"Single port serial", "Dual port serial"}; + static const char *name_str[2] = { "Single port serial", + "Dual port serial" }; - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(dev), i + 1); - if (!strcmp(kobj->name, name)) - return sprintf(buf, "%s\n", name_str[i]); - } - - return -EINVAL; + return sysfs_emit(buf, "%s\n", + name_str[mtype_get_type_group_id(mtype)]); } static MDEV_TYPE_ATTR_RO(name); -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +static ssize_t available_instances_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) { - char name[MTTY_STRING_LEN]; - int i; - struct mdev_state *mds; - int ports = 0, used = 0; - - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(dev), i + 1); - if (!strcmp(kobj->name, name)) { - ports = i + 1; - break; - } - } - - if (!ports) - return -EINVAL; - - list_for_each_entry(mds, &mdev_devices_list, next) - used += mds->nr_ports; + unsigned int ports = mtype_get_type_group_id(mtype) + 1; - return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports); + return sprintf(buf, "%d\n", atomic_read(&mdev_avail_ports) / ports); } static MDEV_TYPE_ATTR_RO(available_instances); - -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) +static ssize_t device_api_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } @@ -1387,18 +1312,29 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; +static const struct vfio_device_ops mtty_dev_ops = { + .name = "vfio-mtty", + .read = mtty_read, + .write = mtty_write, + .ioctl = mtty_ioctl, +}; + +static struct mdev_driver mtty_driver = { + .driver = { + .name = "mtty", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mtty_probe, + .remove = mtty_remove, +}; + static const struct mdev_parent_ops mdev_fops = { .owner = THIS_MODULE, + .device_driver = &mtty_driver, .dev_attr_groups = mtty_dev_groups, - .mdev_attr_groups = mdev_dev_groups, .supported_type_groups = mdev_type_groups, - .create = mtty_create, - .remove = mtty_remove, - .open = mtty_open, - .release = mtty_close, - .read = mtty_read, - .write = mtty_write, - .ioctl = mtty_ioctl, }; static void mtty_device_release(struct device *dev) @@ -1429,12 +1365,16 @@ static int __init mtty_dev_init(void) pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt)); + ret = mdev_register_driver(&mtty_driver); + if (ret) + goto err_cdev; + mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME); if (IS_ERR(mtty_dev.vd_class)) { pr_err("Error: failed to register mtty_dev class\n"); ret = PTR_ERR(mtty_dev.vd_class); - goto failed1; + goto err_driver; } mtty_dev.dev.class = mtty_dev.vd_class; @@ -1443,28 +1383,22 @@ static int __init mtty_dev_init(void) ret = device_register(&mtty_dev.dev); if (ret) - goto failed2; + goto err_class; ret = mdev_register_device(&mtty_dev.dev, &mdev_fops); if (ret) - goto failed3; - - mutex_init(&mdev_list_lock); - INIT_LIST_HEAD(&mdev_devices_list); - - goto all_done; - -failed3: + goto err_device; + return 0; +err_device: device_unregister(&mtty_dev.dev); -failed2: +err_class: class_destroy(mtty_dev.vd_class); - -failed1: 
+err_driver: + mdev_unregister_driver(&mtty_driver); +err_cdev: cdev_del(&mtty_dev.vd_cdev); unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); - -all_done: return ret; } @@ -1475,6 +1409,7 @@ static void __exit mtty_dev_exit(void) device_unregister(&mtty_dev.dev); idr_destroy(&mtty_dev.vd_idr); + mdev_unregister_driver(&mtty_driver); cdev_del(&mtty_dev.vd_cdev); unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); class_destroy(mtty_dev.vd_class); diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index 054405b90ba42e61b5b5106be2d27133ca97ce30..58bcceabff7e323da519257a10af50df8af133ca 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -42,6 +42,7 @@ int main(void) DEVID_FIELD(pci_device_id, subdevice); DEVID_FIELD(pci_device_id, class); DEVID_FIELD(pci_device_id, class_mask); + DEVID_FIELD(pci_device_id, override_only); DEVID(ccw_device_id); DEVID_FIELD(ccw_device_id, match_flags); diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index c91eba751804bd64a3f2b1230c82f504ec7eb2f7..e8d4827f9ca69356aa26652b4ff57d350c07735c 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -426,7 +426,7 @@ static int do_ieee1394_entry(const char *filename, return 1; } -/* Looks like: pci:vNdNsvNsdNbcNscNiN. */ +/* Looks like: pci:vNdNsvNsdNbcNscNiN or <prefix>_pci:vNdNsvNsdNbcNscNiN. */ static int do_pci_entry(const char *filename, void *symval, char *alias) { @@ -440,8 +440,21 @@ static int do_pci_entry(const char *filename, DEF_FIELD(symval, pci_device_id, subdevice); DEF_FIELD(symval, pci_device_id, class); DEF_FIELD(symval, pci_device_id, class_mask); + DEF_FIELD(symval, pci_device_id, override_only); + + switch (override_only) { + case 0: + strcpy(alias, "pci:"); + break; + case PCI_ID_F_VFIO_DRIVER_OVERRIDE: + strcpy(alias, "vfio_pci:"); + break; + default: + warn("Unknown PCI driver_override alias %08X\n", + override_only); + return 0; + } - strcpy(alias, "pci:"); ADD(alias, "v", vendor != PCI_ANY_ID, vendor); ADD(alias, "d", device != PCI_ANY_ID, device); ADD(alias, "sv", subvendor != PCI_ANY_ID, subvendor); diff --git a/scripts/sortextable.c b/scripts/sortextable.c index 55768654e3c6a3ba4686bb0b81147a06b2741503..4ebe7941f352feda64420753d1c748ff926f5d4b 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -219,7 +219,7 @@ static void x86_sort_relative_table(char *extab_image, int image_size) w(r(loc) + i, loc); w(r(loc + 1) + i + 4, loc + 1); - w(r(loc + 2) + i + 8, loc + 2); + /* Don't touch the fixup type */ i += sizeof(uint32_t) * 3; } @@ -232,7 +232,7 @@ static void x86_sort_relative_table(char *extab_image, int image_size) w(r(loc) - i, loc); w(r(loc + 1) - (i + 4), loc + 1); - w(r(loc + 2) - (i + 8), loc + 2); + /* Don't touch the fixup type */ i += sizeof(uint32_t) * 3; } diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 503d3f42da1676791d2c4f4a70bfad35743daf4c..58204fa6dd75d44d164af6eb9b53e22f23caa30b 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -337,9 +337,23 @@ struct kvm_debugregs { __u64 reserved[9]; }; -/* for KVM_CAP_XSAVE */ +/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */ struct kvm_xsave { + /* + * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes + * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * respectively, when invoked on the vm file descriptor. 
+ * + * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * will always be at least 4096. Currently, it is only greater + * than 4096 if a dynamic feature has been enabled with + * ``arch_prctl()``, but this may change in the future. + * + * The offsets of the state save areas in struct kvm_xsave follow + * the contents of CPUID leaf 0xD on the host. + */ __u32 region[1024]; + __u32 extra[0]; }; #define KVM_MAX_XCRS 16 @@ -396,6 +410,9 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 52641d8ca9e83f25b983f3cc6be115c37bad2d98..1e33f90c3d9ac850e33c8ed0020a5c44ec7cab37 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1000,6 +1000,9 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_VM_GPA_BITS 207 +#define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 #ifdef KVM_CAP_IRQ_ROUTING @@ -1031,11 +1034,20 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; +struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; +}; + +#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1)) + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 +#define KVM_IRQ_ROUTING_XEN_EVTCHN 5 struct kvm_irq_routing_entry { __u32 gsi; @@ -1047,6 +1059,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_hv_sint hv_sint; + struct kvm_irq_routing_xen_evtchn xen_evtchn; __u32 pad[8]; } u; }; @@ -1073,6 +1086,8 @@ struct kvm_x86_mce { #endif #ifdef KVM_CAP_XEN_HVM +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) + struct kvm_xen_hvm_config { __u32 flags; __u32 msr; @@ -1461,6 +1476,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_ARM_SVE */ #define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index a62e032863a89dbd2941739317ea2778e752f294..0c538eb96cd794fd87a6a816176e5d28c3af00c1 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -73,9 +73,10 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, { struct insn insn; int x86_64, sign; - unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, - rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, - modrm_reg = 0, sib = 0; + unsigned char op1, op2, op3, + rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, + modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, + sib = 0; x86_64 = is_x86_64(elf); if (x86_64 == -1) @@ -97,6 +98,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, op1 = insn.opcode.bytes[0]; op2 = insn.opcode.bytes[1]; + op3 = insn.opcode.bytes[2]; if (insn.rex_prefix.nbytes) { rex = insn.rex_prefix.bytes[0]; @@ -384,6 +386,14 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, /* 
nopl/nopw */ *type = INSN_NOP; + } else if (op2 == 0x38 && op3 == 0xf8) { + if (insn.prefixes.nbytes == 1 && + insn.prefixes.bytes[0] == 0xf2) { + /* ENQCMD cannot be used in the kernel. */ + WARN("ENQCMD instruction at %s:%lx", sec->name, + offset); + } + } else if (op2 == 0xa0 || op2 == 0xa8) { /* push fs/gs */ diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 0d524ef3606d7350c10b7cdd6cb0dbd75550b2d5..ab65f563dc6550767e1b3670bfe2d7754dc26d1e 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -682,7 +682,7 @@ static int __cmd_inject(struct perf_inject *inject) inject->tool.ordered_events = true; inject->tool.ordering_requires_timestamps = true; /* Allow space in the header for new attributes */ - output_data_offset = 4096; + output_data_offset = roundup(8192 + session->header.data_offset, 4096); if (inject->strip) strip_init(inject); } diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 988326b67a9168d43f74543f1eea513025432fbf..b35baedd0b9006ac568f98648b7ddeca706ae9dc 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4627,10 +4627,19 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ICELAKE_L: case INTEL_FAM6_ICELAKE_NNPI: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: + case INTEL_FAM6_LAKEFIELD: + case INTEL_FAM6_ALDERLAKE: return INTEL_FAM6_CANNONLAKE_L; case INTEL_FAM6_ATOM_TREMONT_D: return INTEL_FAM6_ATOM_GOLDMONT_D; + + case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: + return INTEL_FAM6_SKYLAKE_X; } return model; } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 6ede894d5c50caa7d6a4c4093226016549fbaa12..23b2e36ab64540d5b572661083b63e12a2569864 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -43,6 +43,7 @@ TARGETS += seccomp TARGETS += sgx TARGETS += sigaltstack TARGETS += size +TARGETS += pfru TARGETS += sparc64 TARGETS += splice TARGETS += static_keys diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 0ac49d91a26023c3eb10804f4cf77fc004b4d080..2cd5eefed4d2ca6271780246bba013bb28c70b0e 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -6,6 +6,37 @@ * Copyright (c) 2014 Shuah Khan * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
* + * Using this API consists of first counting how many tests your code + * has to run, and then starting up the reporting: + * + * ksft_print_header(); + * ksft_set_plan(total_number_of_tests); + * + * For each test, report any progress, debugging, etc with: + * + * ksft_print_msg(fmt, ...); + * + * and finally report the pass/fail/skip/xfail state of the test with one of: + * + * ksft_test_result(condition, fmt, ...); + * ksft_test_result_pass(fmt, ...); + * ksft_test_result_fail(fmt, ...); + * ksft_test_result_skip(fmt, ...); + * ksft_test_result_xfail(fmt, ...); + * ksft_test_result_error(fmt, ...); + * + * When all tests are finished, clean up and exit the program with one of: + * + * ksft_exit(condition); + * ksft_exit_pass(); + * ksft_exit_fail(); + * + * If the program wants to report details on why the entire program has + * failed, it can instead exit with a message (this is usually done when + * the program is aborting before finishing all tests): + * + * ksft_exit_fail_msg(fmt, ...); + * */ #ifndef __KSELFTEST_H #define __KSELFTEST_H @@ -74,7 +105,7 @@ static inline void ksft_print_cnts(void) if (ksft_plan != ksft_test_num()) printf("# Planned tests != run tests (%u != %u)\n", ksft_plan, ksft_test_num()); - printf("# Pass %d Fail %d Xfail %d Xpass %d Skip %d Error %d\n", + printf("# Totals: pass:%d fail:%d xfail:%d xpass:%d skip:%d error:%d\n", ksft_cnt.ksft_pass, ksft_cnt.ksft_fail, ksft_cnt.ksft_xfail, ksft_cnt.ksft_xpass, ksft_cnt.ksft_xskip, ksft_cnt.ksft_error); @@ -120,6 +151,32 @@ static inline void ksft_test_result_fail(const char *msg, ...) va_end(args); } +/** + * ksft_test_result() - Report test success based on truth of condition + * + * @condition: if true, report test success, otherwise failure. + */ +#define ksft_test_result(condition, fmt, ...) do { \ + if (!!(condition)) \ + ksft_test_result_pass(fmt, ##__VA_ARGS__);\ + else \ + ksft_test_result_fail(fmt, ##__VA_ARGS__);\ + } while (0) + +static inline void ksft_test_result_xfail(const char *msg, ...) +{ + int saved_errno = errno; + va_list args; + + ksft_cnt.ksft_xfail++; + + va_start(args, msg); + printf("ok %d # XFAIL ", ksft_test_num()); + errno = saved_errno; + vprintf(msg, args); + va_end(args); +} + static inline void ksft_test_result_skip(const char *msg, ...) { int saved_errno = errno; @@ -134,6 +191,7 @@ static inline void ksft_test_result_skip(const char *msg, ...) va_end(args); } +/* TODO: how does "error" differ from "fail" or "skip"? */ static inline void ksft_test_result_error(const char *msg, ...) { int saved_errno = errno; @@ -156,11 +214,22 @@ static inline int ksft_exit_pass(void) static inline int ksft_exit_fail(void) { - printf("Bail out!\n"); ksft_print_cnts(); exit(KSFT_FAIL); } +/** + * ksft_exit() - Exit selftest based on truth of condition + * + * @condition: if true, exit self test with success, otherwise fail. + */ +#define ksft_exit(condition) do { \ + if (!!(condition)) \ + ksft_exit_pass(); \ + else \ + ksft_exit_fail(); \ + } while (0) + static inline int ksft_exit_fail_msg(const char *msg, ...) 
{ int saved_errno = errno; diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 409c1fa75e03575c2c04736b325b0484ae8fd65a..8a0ba90180775f13405c28125c40b1841aed8de3 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -3,6 +3,7 @@ /x86_64/cr4_cpuid_sync_test /x86_64/evmcs_test /x86_64/hyperv_cpuid +/x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test /x86_64/platform_info_test /x86_64/set_sregs_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index c5ec868fa1e523fc36419ef008722afb469349c8..9da33dd0432b18c3addc23da60e0b0b01ebd6167 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -25,6 +25,8 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/amx_test +TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 29cccaf96baf6322dc3b5f8d266096fdb2c06d7c..c34bc4e9e7312232fd016563a0a7294c0e3085fc 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -66,6 +66,7 @@ enum vm_mem_backing_src_type { }; int kvm_check_cap(long cap); +int vm_check_cap(struct kvm_vm *vm, long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index aead07c24afcf00e0fcff5c745b28764c2a8564a..555d96d6ffe975f1d437dbf32bdb9477ba7d0e43 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -10,6 +10,9 @@ #include #include +#include + +#include #define X86_EFLAGS_FIXED (1u << 1) @@ -68,6 +71,21 @@ struct desc_ptr { uint64_t address; } __attribute__((packed)); +struct kvm_x86_state { + struct kvm_xsave *xsave; + struct kvm_vcpu_events events; + struct kvm_mp_state mp_state; + struct kvm_regs regs; + struct kvm_xcrs xcrs; + struct kvm_sregs sregs; + struct kvm_debugregs debugregs; + union { + struct kvm_nested_state nested; + char nested_[16384]; + }; + struct kvm_msrs msrs; +}; + static inline uint64_t get_desc64_base(const struct desc64 *desc) { return ((uint64_t)desc->base3 << 32) | @@ -303,10 +321,10 @@ static inline unsigned long get_xmm(int n) bool is_intel_cpu(void); -struct kvm_x86_state; struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); +void kvm_x86_state_cleanup(struct kvm_x86_state *state); struct kvm_cpuid2 *kvm_get_supported_cpuid(void); void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, @@ -327,6 +345,7 @@ void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, uint32_t kvm_get_cpuid_max(void); void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); +void vm_xsave_req_perm(int bit); /* * Basic CPU control in CR0 @@ -1086,6 +1105,14 @@ void kvm_get_cpu_address_width(unsigned int 
*pa_bits, unsigned int *va_bits); /* VMX_EPT_VPID_CAP bits */ #define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) +#define XSTATE_XTILE_CFG_BIT 17 +#define XSTATE_XTILE_DATA_BIT 18 + +#define XSTATE_XTILE_CFG_MASK (1ULL << XSTATE_XTILE_CFG_BIT) +#define XSTATE_XTILE_DATA_MASK (1ULL << XSTATE_XTILE_DATA_BIT) +#define XFEATURE_XTILE_MASK (XSTATE_XTILE_CFG_MASK | \ + XSTATE_XTILE_DATA_MASK) + /* MSR_IA32_VMX_MISC bits */ #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 231d79e57774e6e3633e7bd2792158ce9f2eac19..7220e0cf95cfe484f2bd10684052b083e6b1a4b3 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -56,7 +56,7 @@ int main(int argc, char *argv[]) kvm_max_vcpu_id = kvm_max_vcpus; TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus, - "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).", + "KVM_MAX_VCPU_IDS (%d) must be at least as large as KVM_MAX_VCPUS (%d).", kvm_max_vcpu_id, kvm_max_vcpus); test_vcpu_creation(0, kvm_max_vcpus); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 41cf45416060fc6e9ce04fcb2d92d7511e62abb0..87b1971b8e2284385ba842b5546e8a099c69f84b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -62,6 +62,33 @@ int kvm_check_cap(long cap) return ret; } +/* VM Check Capability + * + * Input Args: + * vm - Virtual Machine + * cap - Capability + * + * Output Args: None + * + * Return: + * On success, the Value corresponding to the capability (KVM_CAP_*) + * specified by the value of cap. On failure a TEST_ASSERT failure + * is produced. + * + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. 
+ */ +int vm_check_cap(struct kvm_vm *vm, long cap) +{ + int ret; + + ret = ioctl(vm->fd, KVM_CHECK_EXTENSION, cap); + TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION VM IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + return ret; +} + /* VM Enable Capability * * Input Args: diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 7d8f7fc736467dc637c54ac8d35012b20f378f3b..88afa1998adb7319487620e90c99913578c8a077 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -641,6 +641,61 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m sregs.cr3 = vm->pgd; vcpu_sregs_set(vm, vcpuid, &sregs); } + +#define CPUID_XFD_BIT (1 << 4) +static bool is_xfd_supported(void) +{ + int eax, ebx, ecx, edx; + const int leaf = 0xd, subleaf = 0x1; + + __asm__ __volatile__( + "cpuid" + : /* output */ "=a"(eax), "=b"(ebx), + "=c"(ecx), "=d"(edx) + : /* input */ "0"(leaf), "2"(subleaf)); + + return !!(eax & CPUID_XFD_BIT); +} + +void vm_xsave_req_perm(int bit) +{ + int kvm_fd; + u64 bitmask; + long rc; + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, + .addr = (unsigned long) &bitmask + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + close(kvm_fd); + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) + exit(KSFT_SKIP); + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); + if (!(bitmask & (1ULL << bit))) + exit(KSFT_SKIP); + + if (!is_xfd_supported()) + exit(KSFT_SKIP); + + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); + + /* + * The older kernel version(<5.15) can't support + * ARCH_REQ_XCOMP_GUEST_PERM and directly return. 
+ */ + if (rc) + return; + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); + TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); + TEST_ASSERT(bitmask & (1ULL << bit), + "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx", + bitmask); +} + /* Adds a vCPU with reasonable defaults (i.e., a stack) * * Input Args: @@ -986,21 +1041,6 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) sregs_dump(stream, &sregs, indent + 4); } -struct kvm_x86_state { - struct kvm_vcpu_events events; - struct kvm_mp_state mp_state; - struct kvm_regs regs; - struct kvm_xsave xsave; - struct kvm_xcrs xcrs; - struct kvm_sregs sregs; - struct kvm_debugregs debugregs; - union { - struct kvm_nested_state nested; - char nested_[16384]; - }; - struct kvm_msrs msrs; -}; - static int kvm_get_num_msrs(struct kvm_vm *vm) { struct kvm_msr_list nmsrs; @@ -1014,6 +1054,22 @@ static int kvm_get_num_msrs(struct kvm_vm *vm) return nmsrs.nmsrs; } +static int vcpu_save_xsave_state(struct kvm_vm *vm, struct vcpu *vcpu, + struct kvm_x86_state *state) +{ + int size; + + size = vm_check_cap(vm, KVM_CAP_XSAVE2); + if (!size) + size = sizeof(struct kvm_xsave); + + state->xsave = malloc(size); + if (size == sizeof(struct kvm_xsave)) + return ioctl(vcpu->fd, KVM_GET_XSAVE, state->xsave); + else + return ioctl(vcpu->fd, KVM_GET_XSAVE2, state->xsave); +} + struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); @@ -1057,7 +1113,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", r); - r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave); + r = vcpu_save_xsave_state(vm, vcpu, state); TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", r); @@ -1102,24 +1158,25 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s struct vcpu *vcpu = vcpu_find(vm, vcpuid); int r; - r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", + r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", r); + r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); + TEST_ASSERT(r == state->msrs.nmsrs, + "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", + r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index); + if (kvm_check_cap(KVM_CAP_XCRS)) { r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", r); } - r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", + r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", r); - r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); - TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", - r, r == state->msrs.nmsrs ? 
-1 : state->msrs.entries[r].index); - r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", r); @@ -1143,6 +1200,12 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s } } +void kvm_x86_state_cleanup(struct kvm_x86_state *state) +{ + free(state->xsave); + free(state); +} + bool is_intel_cpu(void) { int eax, ebx, ecx, edx; diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c new file mode 100644 index 0000000000000000000000000000000000000000..52a3ef6629e80610c2d9776f659869fd757c1341 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * amx tests + * + * Copyright (C) 2021, Intel, Inc. + * + * Tests for amx #NM exception and save/restore. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +#define VCPU_ID 0 +#define X86_FEATURE_XSAVE (1 << 26) +#define X86_FEATURE_OSXSAVE (1 << 27) + +#define PAGE_SIZE (1 << 12) +#define NUM_TILES 8 +#define TILE_SIZE 1024 +#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE) + +/* Tile configuration associated: */ +#define MAX_TILES 16 +#define RESERVED_BYTES 14 + +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 +#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) +#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) + +#define TILE_CPUID 0x1d +#define XSTATE_CPUID 0xd +#define TILE_PALETTE_CPUID_SUBLEAVE 0x1 +#define XSTATE_USER_STATE_SUBLEAVE 0x0 + +#define XSAVE_HDR_OFFSET 512 + +struct xsave_data { + u8 area[XSAVE_SIZE]; +} __aligned(64); + +struct tile_config { + u8 palette_id; + u8 start_row; + u8 reserved[RESERVED_BYTES]; + u16 colsb[MAX_TILES]; + u8 rows[MAX_TILES]; +}; + +struct tile_data { + u8 data[NUM_TILES * TILE_SIZE]; +}; + +struct xtile_info { + u16 bytes_per_tile; + u16 bytes_per_row; + u16 max_names; + u16 max_rows; + u32 xsave_offset; + u32 xsave_size; +}; + +static struct xtile_info xtile; + +static inline u64 __xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile("xgetbv;" + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void __xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +static inline void __ldtilecfg(void *cfg) +{ + asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00" + : : "a"(cfg)); +} + +static inline void __tileloadd(void *tile) +{ + asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10" + : : "a"(tile), "d"(0)); +} + +static inline void __tilerelease(void) +{ + asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::); +} + +static inline void __xsavec(struct xsave_data *data, uint64_t rfbm) +{ + uint32_t rfbm_lo = rfbm; + uint32_t rfbm_hi = rfbm >> 32; + + asm volatile("xsavec (%%rdi)" + : : "D" (data), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static inline void check_cpuid_xsave(void) +{ + uint32_t eax, ebx, ecx, edx; + + eax = 1; + ecx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + if (!(ecx & X86_FEATURE_XSAVE)) + GUEST_ASSERT(!"cpuid: no CPU xsave support!"); + if (!(ecx & X86_FEATURE_OSXSAVE)) + 
GUEST_ASSERT(!"cpuid: no OS xsave support!"); +} + +static bool check_xsave_supports_xtile(void) +{ + return __xgetbv(0) & XFEATURE_MASK_XTILE; +} + +static bool enum_xtile_config(void) +{ + u32 eax, ebx, ecx, edx; + + eax = TILE_CPUID; + ecx = TILE_PALETTE_CPUID_SUBLEAVE; + + cpuid(&eax, &ebx, &ecx, &edx); + if (!eax || !ebx || !ecx) + return false; + + xtile.max_names = ebx >> 16; + if (xtile.max_names < NUM_TILES) + return false; + + xtile.bytes_per_tile = eax >> 16; + if (xtile.bytes_per_tile < TILE_SIZE) + return false; + + xtile.bytes_per_row = ebx; + xtile.max_rows = ecx; + + return true; +} + +static bool enum_xsave_tile(void) +{ + u32 eax, ebx, ecx, edx; + + eax = XSTATE_CPUID; + ecx = XFEATURE_XTILEDATA; + + cpuid(&eax, &ebx, &ecx, &edx); + if (!eax || !ebx) + return false; + + xtile.xsave_offset = ebx; + xtile.xsave_size = eax; + + return true; +} + +static bool check_xsave_size(void) +{ + u32 eax, ebx, ecx, edx; + bool valid = false; + + eax = XSTATE_CPUID; + ecx = XSTATE_USER_STATE_SUBLEAVE; + + cpuid(&eax, &ebx, &ecx, &edx); + if (ebx && ebx <= XSAVE_SIZE) + valid = true; + + return valid; +} + +static bool check_xtile_info(void) +{ + bool ret = false; + + if (!check_xsave_size()) + return ret; + + if (!enum_xsave_tile()) + return ret; + + if (!enum_xtile_config()) + return ret; + + if (sizeof(struct tile_data) >= xtile.xsave_size) + ret = true; + + return ret; +} + +static void set_tilecfg(struct tile_config *cfg) +{ + int i; + + /* Only palette id 1 */ + cfg->palette_id = 1; + for (i = 0; i < xtile.max_names; i++) { + cfg->colsb[i] = xtile.bytes_per_row; + cfg->rows[i] = xtile.max_rows; + } +} + +static void set_xstatebv(void *data, uint64_t bv) +{ + *(uint64_t *)(data + XSAVE_HDR_OFFSET) = bv; +} + +static u64 get_xstatebv(void *data) +{ + return *(u64 *)(data + XSAVE_HDR_OFFSET); +} + +static void init_regs(void) +{ + uint64_t cr4, xcr0; + + /* turn on CR4.OSXSAVE */ + cr4 = get_cr4(); + cr4 |= X86_CR4_OSXSAVE; + set_cr4(cr4); + + xcr0 = __xgetbv(0); + xcr0 |= XFEATURE_MASK_XTILE; + __xsetbv(0x0, xcr0); +} + +static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, + struct tile_data *tiledata, + struct xsave_data *xsave_data) +{ + init_regs(); + check_cpuid_xsave(); + GUEST_ASSERT(check_xsave_supports_xtile()); + GUEST_ASSERT(check_xtile_info()); + + /* check xtile configs */ + GUEST_ASSERT(xtile.xsave_offset == 2816); + GUEST_ASSERT(xtile.xsave_size == 8192); + GUEST_ASSERT(xtile.max_names == 8); + GUEST_ASSERT(xtile.bytes_per_tile == 1024); + GUEST_ASSERT(xtile.bytes_per_row == 64); + GUEST_ASSERT(xtile.max_rows == 16); + GUEST_SYNC(1); + + /* xfd=0, enable amx */ + wrmsr(MSR_IA32_XFD, 0); + GUEST_SYNC(2); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0); + set_tilecfg(amx_cfg); + __ldtilecfg(amx_cfg); + GUEST_SYNC(3); + /* Check save/restore when trap to userspace */ + __tileloadd(tiledata); + GUEST_SYNC(4); + __tilerelease(); + GUEST_SYNC(5); + /* bit 18 not in the XCOMP_BV after xsavec() */ + set_xstatebv(xsave_data, XFEATURE_MASK_XTILEDATA); + __xsavec(xsave_data, XFEATURE_MASK_XTILEDATA); + GUEST_ASSERT((get_xstatebv(xsave_data) & XFEATURE_MASK_XTILEDATA) == 0); + + /* xfd=0x40000, disable amx tiledata */ + wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILEDATA); + GUEST_SYNC(6); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILEDATA); + set_tilecfg(amx_cfg); + __ldtilecfg(amx_cfg); + /* Trigger #NM exception */ + __tileloadd(tiledata); + GUEST_SYNC(10); + + GUEST_DONE(); +} + +void guest_nm_handler(struct ex_regs *regs) +{ + /* Check if #NM is 
triggered by XFEATURE_MASK_XTILEDATA */ + GUEST_SYNC(7); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILEDATA); + GUEST_SYNC(8); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILEDATA); + /* Clear xfd_err */ + wrmsr(MSR_IA32_XFD_ERR, 0); + /* xfd=0, enable amx */ + wrmsr(MSR_IA32_XFD, 0); + GUEST_SYNC(9); +} + +int main(int argc, char *argv[]) +{ + struct kvm_cpuid_entry2 *entry; + struct kvm_regs regs1, regs2; + bool amx_supported = false; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + int xsave_restore_size = 0; + vm_vaddr_t amx_cfg, tiledata, xsavedata; + struct ucall uc; + u32 amx_offset; + int stage, ret; + + vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + entry = kvm_get_supported_cpuid_entry(1); + if (!(entry->ecx & X86_FEATURE_XSAVE)) { + print_skip("XSAVE feature not supported"); + exit(KSFT_SKIP); + } + + if (kvm_get_cpuid_max_basic() >= 0xd) { + entry = kvm_get_supported_cpuid_index(0xd, 0); + amx_supported = entry && !!(entry->eax & XFEATURE_MASK_XTILE); + if (!amx_supported) { + print_skip("AMX is not supported by the vCPU (eax=0x%x)", entry->eax); + exit(KSFT_SKIP); + } + /* Get xsave/restore max size */ + xsave_restore_size = entry->ecx; + } + + run = vcpu_state(vm, VCPU_ID); + vcpu_regs_get(vm, VCPU_ID, &regs1); + + /* Register #NM handler */ + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); + + /* amx cfg for guest_code */ + amx_cfg = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize()); + + /* amx tiledata for guest_code */ + tiledata = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, tiledata), rand() | 1, 2 * getpagesize()); + + /* xsave data for guest_code */ + xsavedata = vm_vaddr_alloc_pages(vm, 3); + memset(addr_gva2hva(vm, xsavedata), 0, 3 * getpagesize()); + vcpu_args_set(vm, VCPU_ID, 3, amx_cfg, tiledata, xsavedata); + + for (stage = 1; ; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + switch (uc.args[1]) { + case 1: + case 2: + case 3: + case 5: + case 6: + case 7: + case 8: + fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]); + break; + case 4: + case 10: + fprintf(stderr, + "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]); + + /* Compacted mode, get amx offset by xsave area + * size subtract 8K amx size.
+ */ + amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; + state = vcpu_save_state(vm, VCPU_ID); + void *amx_start = (void *)state->xsave + amx_offset; + void *tiles_data = (void *)addr_gva2hva(vm, tiledata); + /* Only check TMM0 register, 1 tile */ + ret = memcmp(amx_start, tiles_data, TILE_SIZE); + TEST_ASSERT(ret == 0, "memcmp failed, ret=%d\n", ret); + kvm_x86_state_cleanup(state); + break; + case 9: + fprintf(stderr, + "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]); + break; + } + break; + case UCALL_DONE: + fprintf(stderr, "UCALL_DONE\n"); + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + state = vcpu_save_state(vm, VCPU_ID); + memset(&regs1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, &regs1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + kvm_x86_state_cleanup(state); + + memset(&regs2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, &regs2); + TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 92915e6408e766ef8218f945545205b918286e42..92eeedbf0f4821d1439ab4cd8fe4b46c4bcba1d2 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) vcpu_enable_evmcs(vm, VCPU_ID); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); - free(state); + kvm_x86_state_cleanup(state); memset(&regs2, 0, sizeof(regs2)); vcpu_regs_get(vm, VCPU_ID, &regs2); diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c new file mode 100644 index 0000000000000000000000000000000000000000..3f6c1ad86cc637afe02cf7a45c0425f7e193757e --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * maximum APIC ID capability tests + * + * Copyright (C) 2022, Intel, Inc.
+ * + * Tests for getting/setting maximum APIC ID capability + */ + +#include "kvm_util.h" +#include "../lib/kvm_util_internal.h" + +#define MAX_VCPU_ID 2 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_enable_cap cap = { 0 }; + int ret; + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + + /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ + ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); + + /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ + cap.cap = KVM_CAP_MAX_VCPU_ID; + cap.args[0] = ret + 1; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret < 0, + "Unexpected success to enable KVM_CAP_MAX_VCPU_ID " + "beyond KVM cap!\n"); + + /* Set KVM_CAP_MAX_VCPU_ID */ + cap.cap = KVM_CAP_MAX_VCPU_ID; + cap.args[0] = MAX_VCPU_ID; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret == 0, + "Unexpected failure to enable KVM_CAP_MAX_VCPU_ID!\n"); + + /* Try to set KVM_CAP_MAX_VCPU_ID again */ + cap.args[0] = MAX_VCPU_ID + 1; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret < 0, + "Unexpected success to enable KVM_CAP_MAX_VCPU_ID again\n"); + + /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap */ + ret = ioctl(vm->fd, KVM_CREATE_VCPU, MAX_VCPU_ID); + TEST_ASSERT(ret < 0, + "Unexpected success in creating a vCPU with VCPU ID out of range\n"); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 8c063646f2a00dd293f8fb944615ca870fad7fcf..9cd269ab056deacb6dd89113556cd45e2516cfe2 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -148,7 +148,7 @@ int main(int argc, char *argv[]) vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); - free(state); + kvm_x86_state_cleanup(state); } done: diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 3ab5ec3da9f42eaadf6efd12b4bb63f4b69e2487..1f07b57b65913af5f00675c913f17213c7de3ad2 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -180,7 +180,7 @@ int main(int argc, char *argv[]) vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); - free(state); + kvm_x86_state_cleanup(state); memset(&regs2, 0, sizeof(regs2)); vcpu_regs_get(vm, VCPU_ID, &regs2); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index 9ef7fab39d4878b937a435dce9fa7859dc687f39..7d4d28009dc71700cb2570489250d0b3ff1427ba 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -217,7 +217,7 @@ void test_vmx_nested_state(struct kvm_vm *vm) TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); - free(state); + kvm_x86_state_cleanup(state); } int main(int argc, char *argv[]) diff --git a/tools/testing/selftests/pfru/Makefile b/tools/testing/selftests/pfru/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c61916ccf637b834314ecc355ab4f35bfbc25b51 --- /dev/null +++ b/tools/testing/selftests/pfru/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0+ + +CFLAGS += -Wall -O2 +LDLIBS := -luuid + +TEST_GEN_PROGS := pfru_test
+include ../lib.mk diff --git a/tools/testing/selftests/pfru/config b/tools/testing/selftests/pfru/config new file mode 100644 index 0000000000000000000000000000000000000000..37f53609acbda0e7078617de4a18dc9b669d4643 --- /dev/null +++ b/tools/testing/selftests/pfru/config @@ -0,0 +1,2 @@ +CONFIG_ACPI_PFRU=m +CONFIG_ACPI_PFRU_TELEMETRY=m diff --git a/tools/testing/selftests/pfru/pfru.h b/tools/testing/selftests/pfru/pfru.h new file mode 100644 index 0000000000000000000000000000000000000000..655fc0bd7f6c4ef193ad5909d4e203a3367ed55d --- /dev/null +++ b/tools/testing/selftests/pfru/pfru.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Platform Firmware Runtime Update header + * + * Copyright(c) 2021 Intel Corporation. All rights reserved. + */ +#ifndef __PFRU_H__ +#define __PFRU_H__ + +#include +#include + +#define PFRU_UUID "ECF9533B-4A3C-4E89-939E-C77112601C6D" +#define PFRU_CODE_INJ_UUID "B2F84B79-7B6E-4E45-885F-3FB9BB185402" +#define PFRU_DRV_UPDATE_UUID "4569DD8C-75F1-429A-A3D6-24DE8097A0DF" + +#define FUNC_STANDARD_QUERY 0 +#define FUNC_QUERY_UPDATE_CAP 1 +#define FUNC_QUERY_BUF 2 +#define FUNC_START 3 + +#define CODE_INJECT_TYPE 1 +#define DRIVER_UPDATE_TYPE 2 + +#define REVID_1 1 +#define REVID_2 2 + +#define PFRU_MAGIC 0xEE + +#define PFRU_IOC_SET_REV _IOW(PFRU_MAGIC, 0x01, unsigned int) +#define PFRU_IOC_STAGE _IOW(PFRU_MAGIC, 0x02, unsigned int) +#define PFRU_IOC_ACTIVATE _IOW(PFRU_MAGIC, 0x03, unsigned int) +#define PFRU_IOC_STAGE_ACTIVATE _IOW(PFRU_MAGIC, 0x04, unsigned int) + +static inline int valid_revid(int id) +{ + return (id == REVID_1) || (id == REVID_2); +} + +typedef unsigned int __u32; + +/* Capsule file payload header */ +struct payload_hdr { + __u32 sig; + __u32 hdr_version; + __u32 hdr_size; + __u32 hw_ver; + __u32 rt_ver; + uuid_t platform_id; +}; + +enum start_action { + START_STAGE, + START_ACTIVATE, + START_STAGE_ACTIVATE, +}; + +enum dsm_status { + DSM_SUCCEED, + DSM_FUNC_NOT_SUPPORT, + DSM_INVAL_INPUT, + DSM_HARDWARE_ERR, + DSM_RETRY_SUGGESTED, + DSM_UNKNOWN, + DSM_FUNC_SPEC_ERR, +}; + +struct update_cap_info { + enum dsm_status status; + int update_cap; + + uuid_t code_type; + int fw_version; + int code_rt_version; + + uuid_t drv_type; + int drv_rt_version; + int drv_svn; + + uuid_t platform_id; + uuid_t oem_id; + + char oem_info[]; +}; + +struct com_buf_info { + enum dsm_status status; + enum dsm_status ext_status; + unsigned long addr_lo; + unsigned long addr_hi; + int buf_size; +}; + +struct capsulate_buf_info { + unsigned long src; + int size; +}; + +struct updated_result { + enum dsm_status status; + enum dsm_status ext_status; + unsigned long low_auth_time; + unsigned long high_auth_time; + unsigned long low_exec_time; + unsigned long high_exec_time; +}; + +#define PFRU_TELEMETRY_UUID "75191659-8178-4D9D-B88F-AC5E5E93E8BF" + +/* Telemetry structures. */ +struct telem_data_info { + enum dsm_status status; + enum dsm_status ext_status; + /* Maximum supported size of data of + * all Data Chunks combined. 
+ */ + unsigned long chunk1_addr_lo; + unsigned long chunk1_addr_hi; + unsigned long chunk2_addr_lo; + unsigned long chunk2_addr_hi; + int max_data_size; + int chunk1_size; + int chunk2_size; + int rollover_cnt; + int reset_cnt; +}; + +struct telem_info { + int log_level; + int log_type; + int log_revid; +}; + +/* Two logs: history and execution log */ +#define LOG_EXEC_IDX 0 +#define LOG_HISTORY_IDX 1 +#define NR_LOG_TYPE 2 + +#define LOG_ERR 0 +#define LOG_WARN 1 +#define LOG_INFO 2 +#define LOG_VERB 4 + +#define FUNC_SET_LEV 1 +#define FUNC_GET_LEV 2 +#define FUNC_GET_DATA 3 + +#define LOG_NAME_SIZE 10 + +#define PFRU_LOG_IOC_SET_INFO _IOW(PFRU_MAGIC, 0x05, struct telem_info) +#define PFRU_LOG_IOC_GET_INFO _IOR(PFRU_MAGIC, 0x06, struct telem_info) +#define PFRU_LOG_IOC_GET_DATA_INFO _IOR(PFRU_MAGIC, 0x07, struct telem_data_info) + +#endif /* __PFRU_H__ */ diff --git a/tools/testing/selftests/pfru/pfru_test.c b/tools/testing/selftests/pfru/pfru_test.c new file mode 100644 index 0000000000000000000000000000000000000000..aae77e6bdad260a776e19d246ea0f0a44f0b0449 --- /dev/null +++ b/tools/testing/selftests/pfru/pfru_test.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Runtime Update/Telemetry (see Documentation/x86/pfru_update.rst) + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pfru.h" + +#define MAX_LOG_SIZE 65536 + +struct update_cap_info cap_info; +struct com_buf_info buf_info; +struct capsulate_buf_info image_info; +struct telem_data_info data_info; +char *capsule_name; +int action, query_cap, log_type, log_level, log_read, log_getinfo, + revid, log_revid; +int set_log_level, set_log_type, + set_revid, set_log_revid; + +char *progname; + +static int valid_log_level(int level) +{ + return (level == LOG_ERR) || (level == LOG_WARN) || + (level == LOG_INFO) || (level == LOG_VERB); +} + +static int valid_log_type(int type) +{ + return (type == LOG_EXEC_IDX) || (type == LOG_HISTORY_IDX); +} + +static void help(void) +{ + fprintf(stderr, + "usage: %s [OPTIONS]\n" + " code injection:\n" + " -l, --load\n" + " -s, --stage\n" + " -a, --activate\n" + " -u, --update [stage and activate]\n" + " -q, --query\n" + " -d, --revid update\n" + " telemetry:\n" + " -G, --getloginfo\n" + " -T, --type(0:execution, 1:history)\n" + " -L, --level(0, 1, 2, 4)\n" + " -R, --read\n" + " -D, --revid log\n", + progname); +} + +char *option_string = "l:sauqd:GT:L:RD:h"; +static struct option long_options[] = { + {"load", required_argument, 0, 'l'}, + {"stage", no_argument, 0, 's'}, + {"activate", no_argument, 0, 'a'}, + {"update", no_argument, 0, 'u'}, + {"query", no_argument, 0, 'q'}, + {"getloginfo", no_argument, 0, 'G'}, + {"type", required_argument, 0, 'T'}, + {"level", required_argument, 0, 'L'}, + {"read", no_argument, 0, 'R'}, + {"setrev", required_argument, 0, 'd'}, + {"setrevlog", required_argument, 0, 'D'}, + {"help", no_argument, 0, 'h'}, + {} +}; + +static void parse_options(int argc, char **argv) +{ + char *pathname; + int c; + + pathname = strdup(argv[0]); + progname = basename(pathname); + + while (1) { + int option_index = 0; + + c = getopt_long(argc, argv, option_string, + long_options, &option_index); + if (c == -1) + break; + switch (c) { + case 'l': + capsule_name = optarg; + break; + case 's': + action = 1; + break; + case 'a': + action = 2; + break; + case 'u': + action = 3; + break; + case 'q': + query_cap = 1; + break; + case 'G': + log_getinfo = 1; + break; + case 'T': + log_type = 
atoi(optarg); + set_log_type = 1; + break; + case 'L': + log_level = atoi(optarg); + set_log_level = 1; + break; + case 'R': + log_read = 1; + break; + case 'd': + revid = atoi(optarg); + set_revid = 1; + break; + case 'D': + log_revid = atoi(optarg); + set_log_revid = 1; + break; + case 'h': + help(); + break; + default: + break; + } + } +} + +void print_cap(struct update_cap_info *cap) +{ + char *uuid = malloc(37); + + if (!uuid) { + perror("Can not allocate uuid buffer\n"); + exit(1); + } + uuid_unparse(cap->code_type, uuid); + printf("code injection image type:%s\n", uuid); + printf("fw_version:%d\n", cap->fw_version); + printf("code_rt_version:%d\n", cap->code_rt_version); + + uuid_unparse(cap->drv_type, uuid); + printf("driver update image type:%s\n", uuid); + printf("drv_rt_version:%d\n", cap->drv_rt_version); + printf("drv_svn:%d\n", cap->drv_svn); + + uuid_unparse(cap->platform_id, uuid); + printf("platform id:%s\n", uuid); + uuid_unparse(cap->oem_id, uuid); + printf("oem id:%s\n", uuid); + + free(uuid); +} + +int main(int argc, char *argv[]) +{ + int fd_update, fd_log, fd_capsule; + struct telem_data_info data_info; + struct telem_info info; + struct update_cap_info cap; + void *addr_map_capsule; + struct stat st; + char *log_buf; + int ret = 0; + + parse_options(argc, argv); + + fd_log = open("/dev/pfru/telemetry", O_RDWR); + if (fd_log < 0) { + perror("Cannot open telemetry device..."); + return 1; + } + fd_update = open("/dev/pfru/update", O_RDWR); + if (fd_update < 0) { + perror("Cannot open code injection device..."); + return 1; + } + + if (query_cap) { + ret = read(fd_update, &cap, sizeof(cap)); + if (ret == -1) { + perror("Read error."); + return 1; + } + print_cap(&cap); + } + + if (log_getinfo) { + ret = ioctl(fd_log, PFRU_LOG_IOC_GET_DATA_INFO, &data_info); + if (ret) { + perror("Get log data info failed."); + return 1; + } + ret = ioctl(fd_log, PFRU_LOG_IOC_GET_INFO, &info); + if (ret) { + perror("Get log info failed."); + return 1; + } + printf("log_level:%d\n", info.log_level); + printf("log_type:%d\n", info.log_type); + printf("log_revid:%d\n", info.log_revid); + printf("max_data_size:%d\n", data_info.max_data_size); + printf("chunk1_size:%d\n", data_info.chunk1_size); + printf("chunk2_size:%d\n", data_info.chunk2_size); + printf("rollover_cnt:%d\n", data_info.rollover_cnt); + printf("reset_cnt:%d\n", data_info.reset_cnt); + + return 0; + } + + info.log_level = -1; + info.log_type = -1; + info.log_revid = -1; + + if (set_log_level) { + if (!valid_log_level(log_level)) { + printf("Invalid log level %d\n", + log_level); + } else { + info.log_level = log_level; + } + } + if (set_log_type) { + if (!valid_log_type(log_type)) { + printf("Invalid log type %d\n", + log_type); + } else { + info.log_type = log_type; + } + } + if (set_log_revid) { + if (!valid_revid(log_revid)) { + printf("Invalid log revid %d\n", + log_revid); + } else { + info.log_revid = log_revid; + } + } + + ret = ioctl(fd_log, PFRU_LOG_IOC_SET_INFO, &info); + if (ret) { + perror("Log information set failed.(log_level, log_type, log_revid)"); + return 1; + } + + if (set_revid) { + ret = ioctl(fd_update, PFRU_IOC_SET_REV, &revid); + if (ret) { + perror("mru update revid set failed"); + return 1; + } + printf("mru update revid set to %d\n", revid); + } + + if (capsule_name) { + fd_capsule = open(capsule_name, O_RDONLY); + if (fd_capsule < 0) { + perror("Can not open capsule file..."); + return 1; + } + if (fstat(fd_capsule, &st) < 0) { + perror("Can not fstat capsule file..."); + return 1; + } + 
addr_map_capsule = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, + fd_capsule, 0); + if (addr_map_capsule == MAP_FAILED) { + perror("Failed to mmap capsule file."); + return 1; + } + ret = write(fd_update, (char *)addr_map_capsule, st.st_size); + printf("Load %d bytes of capsule file into the system\n", + ret); + if (ret == -1) { + perror("Failed to load capsule file"); + return 1; + } + munmap(addr_map_capsule, st.st_size); + printf("Load done.\n"); + } + + if (action) { + if (action == 1) + ret = ioctl(fd_update, PFRU_IOC_STAGE, NULL); + else if (action == 2) + ret = ioctl(fd_update, PFRU_IOC_ACTIVATE, NULL); + else if (action == 3) + ret = ioctl(fd_update, PFRU_IOC_STAGE_ACTIVATE, NULL); + else + return 1; + printf("Update finished, return %d\n", ret); + } + + if (log_read) { + log_buf = malloc(MAX_LOG_SIZE + 1); + if (!log_buf) { + perror("log_buf allocate failed."); + return 1; + } + ret = read(fd_log, log_buf, MAX_LOG_SIZE); + if (ret == -1) { + perror("Read error."); + return 1; + } + log_buf[ret] = '\0'; + printf("%s\n", log_buf); + free(log_buf); + } + + return 0; +} diff --git a/drivers/acpi/hmat/Makefile b/tools/testing/selftests/resctrl/.gitignore similarity index 54% rename from drivers/acpi/hmat/Makefile rename to tools/testing/selftests/resctrl/.gitignore index 1c20ef36a3857666c1c6da00c3a8fa8ad80c0fe1..ab68442b6bc8da44f2a9431c7cac617bf27827f6 100644 --- a/drivers/acpi/hmat/Makefile +++ b/tools/testing/selftests/resctrl/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_ACPI_HMAT) := hmat.o +resctrl_tests diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6bcee2ec91a9c7d32b8cc0aff818a259fa19cbc1 --- /dev/null +++ b/tools/testing/selftests/resctrl/Makefile @@ -0,0 +1,17 @@ +CC = $(CROSS_COMPILE)gcc +CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2 +SRCS=$(wildcard *.c) +OBJS=$(SRCS:.c=.o) + +all: resctrl_tests + +$(OBJS): $(SRCS) + $(CC) $(CFLAGS) -c $(SRCS) + +resctrl_tests: $(OBJS) + $(CC) $(CFLAGS) -o $@ $^ + +.PHONY: clean + +clean: + $(RM) $(OBJS) resctrl_tests diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README new file mode 100644 index 0000000000000000000000000000000000000000..3d2bbd4fa3aa1370064fc05a77d52243c84d4926 --- /dev/null +++ b/tools/testing/selftests/resctrl/README @@ -0,0 +1,53 @@ +resctrl_tests - resctrl file system test suite + +Authors: + Fenghua Yu + Sai Praneeth Prakhya , + +resctrl_tests tests various resctrl functionalities and interfaces including +both software and hardware. + +Currently it supports Memory Bandwidth Monitoring test and Memory Bandwidth +Allocation test on Intel RDT hardware. More tests will be added in the future. +And the test suite can be extended to cover AMD QoS and ARM MPAM hardware +as well. + +BUILD +----- + +Run "make" to build executable file "resctrl_tests". + +RUN +--- + +To use resctrl_tests, root or sudoer privileges are required. This is because +the test needs to mount resctrl file system and change contents in the file +system. + +Executing the test without any parameter will run all supported tests: + + sudo ./resctrl_tests + +OVERVIEW OF EXECUTION +--------------------- + +A test case has four stages: + + - setup: mount resctrl file system, create group, setup schemata, move test + process pids to tasks, start benchmark. + - execute: let benchmark run + - verify: get resctrl data and verify the data with another source, e.g. + perf event.
+ - teardown: umount resctrl and clear temporary files. + +ARGUMENTS +--------- + +Parameter '-h' shows usage information. + +usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] + -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT default benchmark is builtin fill_buf + -t test list: run tests specified in the test list, e.g. -t mbm,mba,cmt,cat + -n no_of_bits: run cache tests using specified no of bits in cache bit mask + -p cpu_no: specify CPU number to run the test. 1 is default + -h: help diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c new file mode 100644 index 0000000000000000000000000000000000000000..68ff856d36f0bc52720f31dae88bea29d01470bb --- /dev/null +++ b/tools/testing/selftests/resctrl/cache.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "resctrl.h" + +struct read_format { + __u64 nr; /* The number of events */ + struct { + __u64 value; /* The value of the event */ + } values[2]; +}; + +static struct perf_event_attr pea_llc_miss; +static struct read_format rf_cqm; +static int fd_lm; +char llc_occup_path[1024]; + +static void initialize_perf_event_attr(void) +{ + pea_llc_miss.type = PERF_TYPE_HARDWARE; + pea_llc_miss.size = sizeof(struct perf_event_attr); + pea_llc_miss.read_format = PERF_FORMAT_GROUP; + pea_llc_miss.exclude_kernel = 1; + pea_llc_miss.exclude_hv = 1; + pea_llc_miss.exclude_idle = 1; + pea_llc_miss.exclude_callchain_kernel = 1; + pea_llc_miss.inherit = 1; + pea_llc_miss.exclude_guest = 1; + pea_llc_miss.disabled = 1; +} + +static void ioctl_perf_event_ioc_reset_enable(void) +{ + ioctl(fd_lm, PERF_EVENT_IOC_RESET, 0); + ioctl(fd_lm, PERF_EVENT_IOC_ENABLE, 0); +} + +static int perf_event_open_llc_miss(pid_t pid, int cpu_no) +{ + fd_lm = perf_event_open(&pea_llc_miss, pid, cpu_no, -1, + PERF_FLAG_FD_CLOEXEC); + if (fd_lm == -1) { + perror("Error opening leader"); + ctrlc_handler(0, NULL, NULL); + return -1; + } + + return 0; +} + +static int initialize_llc_perf(void) +{ + memset(&pea_llc_miss, 0, sizeof(struct perf_event_attr)); + memset(&rf_cqm, 0, sizeof(struct read_format)); + + /* Initialize perf_event_attr structures for HW_CACHE_MISSES */ + initialize_perf_event_attr(); + + pea_llc_miss.config = PERF_COUNT_HW_CACHE_MISSES; + + rf_cqm.nr = 1; + + return 0; +} + +static int reset_enable_llc_perf(pid_t pid, int cpu_no) +{ + int ret = 0; + + ret = perf_event_open_llc_miss(pid, cpu_no); + if (ret < 0) + return ret; + + /* Start counters to log values */ + ioctl_perf_event_ioc_reset_enable(); + + return 0; +} + +/* + * get_llc_perf: llc cache miss through perf events + * @cpu_no: CPU number that the benchmark PID is binded to + * + * Perf events like HW_CACHE_MISSES could be used to validate number of + * cache lines allocated. + * + * Return: =0 on success. <0 on failure. + */ +static int get_llc_perf(unsigned long *llc_perf_miss) +{ + __u64 total_misses; + + /* Stop counters after one span to get miss rate */ + + ioctl(fd_lm, PERF_EVENT_IOC_DISABLE, 0); + + if (read(fd_lm, &rf_cqm, sizeof(struct read_format)) == -1) { + perror("Could not get llc misses through perf"); + + return -1; + } + + total_misses = rf_cqm.values[0].value; + + close(fd_lm); + + *llc_perf_miss = total_misses; + + return 0; +} + +/* + * Get LLC Occupancy as reported by RESCTRL FS + * For CMT, + * 1. If con_mon grp and mon grp given, then read from mon grp in + * con_mon grp + * 2. If only con_mon grp given, then read from con_mon grp + * 3. 
If both not given, then read from root con_mon grp + * For CAT, + * 1. If con_mon grp given, then read from it + * 2. If con_mon grp not given, then read from root con_mon grp + * + * Return: =0 on success. <0 on failure. + */ +static int get_llc_occu_resctrl(unsigned long *llc_occupancy) +{ + FILE *fp; + + fp = fopen(llc_occup_path, "r"); + if (!fp) { + perror("Failed to open results file"); + + return errno; + } + if (fscanf(fp, "%lu", llc_occupancy) <= 0) { + perror("Could not get llc occupancy"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * print_results_cache: the cache results are stored in a file + * @filename: file that stores the results + * @bm_pid: child pid that runs benchmark + * @llc_value: perf miss value / + * llc occupancy value reported by resctrl FS + * + * Return: 0 on success. non-zero on failure. + */ +static int print_results_cache(char *filename, int bm_pid, + unsigned long llc_value) +{ + FILE *fp; + + if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { + printf("Pid: %d \t LLC_value: %lu\n", bm_pid, + llc_value); + } else { + fp = fopen(filename, "a"); + if (!fp) { + perror("Cannot open results file"); + + return errno; + } + fprintf(fp, "Pid: %d \t llc_value: %lu\n", bm_pid, llc_value); + fclose(fp); + } + + return 0; +} + +int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) +{ + unsigned long llc_perf_miss = 0, llc_occu_resc = 0, llc_value = 0; + int ret; + + /* + * Measure cache miss from perf. + */ + if (!strncmp(param->resctrl_val, CAT_STR, sizeof(CAT_STR))) { + ret = get_llc_perf(&llc_perf_miss); + if (ret < 0) + return ret; + llc_value = llc_perf_miss; + } + + /* + * Measure llc occupancy from resctrl. + */ + if (!strncmp(param->resctrl_val, CMT_STR, sizeof(CMT_STR))) { + ret = get_llc_occu_resctrl(&llc_occu_resc); + if (ret < 0) + return ret; + llc_value = llc_occu_resc; + } + ret = print_results_cache(param->filename, bm_pid, llc_value); + if (ret) + return ret; + + return 0; +} + +/* + * cache_val: execute benchmark and measure LLC occupancy resctrl + * and perf cache miss for the benchmark + * @param: parameters passed to cache_val() + * + * Return: 0 on success. non-zero on failure. + */ +int cat_val(struct resctrl_val_param *param) +{ + int malloc_and_init_memory = 1, memflush = 1, operation = 0, ret = 0; + char *resctrl_val = param->resctrl_val; + pid_t bm_pid; + + if (strcmp(param->filename, "") == 0) + sprintf(param->filename, "stdio"); + + bm_pid = getpid(); + + /* Taskset benchmark to specified cpu */ + ret = taskset_benchmark(bm_pid, param->cpu_no); + if (ret) + return ret; + + /* Write benchmark to specified con_mon grp, mon_grp in resctrl FS*/ + ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, + resctrl_val); + if (ret) + return ret; + + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { + ret = initialize_llc_perf(); + if (ret) + return ret; + } + + /* Test runs until the callback setup() tells the test to stop. 
*/ + while (1) { + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + ret = reset_enable_llc_perf(bm_pid, param->cpu_no); + if (ret) + break; + + if (run_fill_buf(param->span, malloc_and_init_memory, + memflush, operation, resctrl_val)) { + fprintf(stderr, "Error-running fill buffer\n"); + ret = -1; + break; + } + + sleep(1); + ret = measure_cache_vals(param, bm_pid); + if (ret) + break; + } else { + break; + } + } + + return ret; +} + +/* + * show_cache_info: show cache test result information + * @sum_llc_val: sum of LLC cache result data + * @no_of_bits: number of bits + * @cache_span: cache span in bytes for CMT or in lines for CAT + * @max_diff: max difference + * @max_diff_percent: max difference percentage + * @num_of_runs: number of runs + * @platform: show test information on this platform + * @cmt: CMT test or CAT test + * + * Return: 0 on success. non-zero on failure. + */ +int show_cache_info(unsigned long sum_llc_val, int no_of_bits, + unsigned long cache_span, unsigned long max_diff, + unsigned long max_diff_percent, unsigned long num_of_runs, + bool platform, bool cmt) +{ + unsigned long avg_llc_val = 0; + float diff_percent; + long avg_diff = 0; + int ret; + + avg_llc_val = sum_llc_val / (num_of_runs - 1); + avg_diff = (long)abs(cache_span - avg_llc_val); + diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100; + + ret = platform && abs((int)diff_percent) > max_diff_percent && + (cmt ? (abs(avg_diff) > max_diff) : true); + + ksft_print_msg("%s Check cache miss rate within %d%%\n", + ret ? "Fail:" : "Pass:", max_diff_percent); + + ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); + ksft_print_msg("Number of bits: %d\n", no_of_bits); + ksft_print_msg("Average LLC val: %lu\n", avg_llc_val); + ksft_print_msg("Cache span (%s): %lu\n", cmt ? "bytes" : "lines", + cache_span); + + return ret; +} diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c new file mode 100644 index 0000000000000000000000000000000000000000..cd4f68388e0f68f7a002ae24496d0f184673d97c --- /dev/null +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Allocation Technology (CAT) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" +#include + +#define RESULT_FILE_NAME1 "result_cat1" +#define RESULT_FILE_NAME2 "result_cat2" +#define NUM_OF_RUNS 5 +#define MAX_DIFF_PERCENT 4 +#define MAX_DIFF 1000000 + +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; + +/* + * Change schemata. Write schemata to specified + * con_mon grp, mon_grp in resctrl FS. + * Run 5 times in order to get average values. + */ +static int cat_setup(int num, ...) 
+{ + struct resctrl_val_param *p; + char schemata[64]; + va_list param; + int ret = 0; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + if (p->num_of_runs == 0) { + sprintf(schemata, "%lx", p->mask); + ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no, + p->resctrl_val); + } + p->num_of_runs++; + + return ret; +} + +static int check_results(struct resctrl_val_param *param) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_perf_miss = 0; + int runs = 0, no_of_bits = 0; + FILE *fp; + + ksft_print_msg("Checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Cannot open file"); + + return errno; + } + + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + /* + * Discard the first value which is inaccurate due to monitoring + * setup transition phase. + */ + if (runs > 0) + sum_llc_perf_miss += strtoul(token_array[3], NULL, 0); + runs++; + } + + fclose(fp); + no_of_bits = count_bits(param->mask); + + return show_cache_info(sum_llc_perf_miss, no_of_bits, param->span / 64, + MAX_DIFF, MAX_DIFF_PERCENT, NUM_OF_RUNS, + !is_amd, false); +} + +void cat_test_cleanup(void) +{ + remove(RESULT_FILE_NAME1); + remove(RESULT_FILE_NAME2); +} + +int cat_perf_miss_val(int cpu_no, int n, char *cache_type) +{ + unsigned long l_mask, l_mask_1; + int ret, pipefd[2], sibling_cpu_no; + char pipe_message; + pid_t bm_pid; + + cache_size = 0; + + ret = remount_resctrlfs(true); + if (ret) + return ret; + + /* Get default cbm mask for L3/L2 cache */ + ret = get_cbm_mask(cache_type, cbm_mask); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + /* Get L3/L2 cache size */ + ret = get_cache_size(cpu_no, cache_type, &cache_size); + if (ret) + return ret; + ksft_print_msg("Cache size :%lu\n", cache_size); + + /* Get max number of bits from default cbm mask */ + count_of_bits = count_bits(long_mask); + + if (!n) + n = count_of_bits / 2; + + if (n > count_of_bits - 1) { + ksft_print_msg("Invalid input value for no_of_bits n!\n"); + ksft_print_msg("Please enter value in range 1 to %d\n", + count_of_bits - 1); + return -1; + } + + /* Get core id from same socket for running another thread */ + sibling_cpu_no = get_core_sibling(cpu_no); + if (sibling_cpu_no < 0) + return -1; + + struct resctrl_val_param param = { + .resctrl_val = CAT_STR, + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .setup = cat_setup, + }; + + l_mask = long_mask >> n; + l_mask_1 = ~l_mask & long_mask; + + /* Set param values for parent thread which will be allocated bitmask + * with (max_bits - n) bits + */ + param.span = cache_size * (count_of_bits - n) / count_of_bits; + strcpy(param.ctrlgrp, "c2"); + strcpy(param.mongrp, "m2"); + strcpy(param.filename, RESULT_FILE_NAME2); + param.mask = l_mask; + param.num_of_runs = 0; + + if (pipe(pipefd)) { + perror("# Unable to create pipe"); + return errno; + } + + bm_pid = fork(); + + /* Set param values for child thread which will be allocated bitmask + * with n bits + */ + if (bm_pid == 0) { + param.mask = l_mask_1; + strcpy(param.ctrlgrp, "c1"); + strcpy(param.mongrp, "m1"); + param.span = cache_size * n / count_of_bits; + strcpy(param.filename, RESULT_FILE_NAME1); + param.num_of_runs = 0; + param.cpu_no = sibling_cpu_no; + } + + remove(param.filename); + + ret = cat_val(&param); + if (ret) + 
return ret; + + ret = check_results(&param); + if (ret) + return ret; + + if (bm_pid == 0) { + /* Tell parent that child is ready */ + close(pipefd[0]); + pipe_message = 1; + if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + close(pipefd[1]); + perror("# failed signaling parent process"); + return errno; + } + + close(pipefd[1]); + while (1) + ; + } else { + /* Parent waits for child to be ready. */ + close(pipefd[1]); + pipe_message = 0; + while (pipe_message != 1) { + if (read(pipefd[0], &pipe_message, + sizeof(pipe_message)) < sizeof(pipe_message)) { + perror("# failed reading from child process"); + break; + } + } + close(pipefd[0]); + kill(bm_pid, SIGKILL); + } + + cat_test_cleanup(); + if (bm_pid) + umount_resctrlfs(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c new file mode 100644 index 0000000000000000000000000000000000000000..8968e36db99d79cca16e99321e4f79e72bad4e24 --- /dev/null +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Monitoring Technology (CMT) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" +#include + +#define RESULT_FILE_NAME "result_cmt" +#define NUM_OF_RUNS 5 +#define MAX_DIFF 2000000 +#define MAX_DIFF_PERCENT 15 + +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; + +static int cmt_setup(int num, ...) +{ + struct resctrl_val_param *p; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + p->num_of_runs++; + + return 0; +} + +static int check_results(struct resctrl_val_param *param, int no_of_bits) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_occu_resc = 0; + int runs = 0; + FILE *fp; + + ksft_print_msg("Checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Error in opening file\n"); + + return errno; + } + + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is llc occ resc value */ + if (runs > 0) + sum_llc_occu_resc += strtoul(token_array[3], NULL, 0); + runs++; + } + fclose(fp); + + return show_cache_info(sum_llc_occu_resc, no_of_bits, param->span, + MAX_DIFF, MAX_DIFF_PERCENT, NUM_OF_RUNS, + true, true); +} + +void cmt_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) +{ + int ret, mum_resctrlfs; + + cache_size = 0; + mum_resctrlfs = 1; + + ret = remount_resctrlfs(mum_resctrlfs); + if (ret) + return ret; + + if (!validate_resctrl_feature_request(CMT_STR)) + return -1; + + ret = get_cbm_mask("L3", cbm_mask); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + ret = get_cache_size(cpu_no, "L3", &cache_size); + if (ret) + return ret; + ksft_print_msg("Cache size :%lu\n", cache_size); + + count_of_bits = count_bits(long_mask); + + if (n < 1 || n > count_of_bits) { + ksft_print_msg("Invalid input value for number_of_bits n!\n"); + ksft_print_msg("Please enter value in range 1 to %d\n", count_of_bits); + return -1; + } + + struct resctrl_val_param param = { + .resctrl_val = CMT_STR, + .ctrlgrp = "c1", + .mongrp =
"m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .filename = RESULT_FILE_NAME, + .mask = ~(long_mask << n) & long_mask, + .span = cache_size * n / count_of_bits, + .num_of_runs = 0, + .setup = cmt_setup, + }; + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) + sprintf(benchmark_cmd[1], "%lu", param.span); + + remove(RESULT_FILE_NAME); + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(¶m, n); + if (ret) + return ret; + + cmt_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/config b/tools/testing/selftests/resctrl/config new file mode 100644 index 0000000000000000000000000000000000000000..8d9f2deb56edb2081751600e4fe53bf413adfb5b --- /dev/null +++ b/tools/testing/selftests/resctrl/config @@ -0,0 +1,2 @@ +CONFIG_X86_CPU_RESCTRL=y +CONFIG_PROC_CPU_RESCTRL=y diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c new file mode 100644 index 0000000000000000000000000000000000000000..51e5cf22632f7aa2d9f559d07c7b75ed39d20e68 --- /dev/null +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fill_buf benchmark + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "resctrl.h" + +#define CL_SIZE (64) +#define PAGE_SIZE (4 * 1024) +#define MB (1024 * 1024) + +static unsigned char *startptr; + +static void sb(void) +{ +#if defined(__i386) || defined(__x86_64) + asm volatile("sfence\n\t" + : : : "memory"); +#endif +} + +static void ctrl_handler(int signo) +{ + free(startptr); + printf("\nEnding\n"); + sb(); + exit(EXIT_SUCCESS); +} + +static void cl_flush(void *p) +{ +#if defined(__i386) || defined(__x86_64) + asm volatile("clflush (%0)\n\t" + : : "r"(p) : "memory"); +#endif +} + +static void mem_flush(void *p, size_t s) +{ + char *cp = (char *)p; + size_t i = 0; + + s = s / CL_SIZE; /* mem size in cache llines */ + + for (i = 0; i < s; i++) + cl_flush(&cp[i * CL_SIZE]); + + sb(); +} + +static void *malloc_and_init_memory(size_t s) +{ + uint64_t *p64; + size_t s64; + + void *p = memalign(PAGE_SIZE, s); + + p64 = (uint64_t *)p; + s64 = s / sizeof(uint64_t); + + while (s64 > 0) { + *p64 = (uint64_t)rand(); + p64 += (CL_SIZE / sizeof(uint64_t)); + s64 -= (CL_SIZE / sizeof(uint64_t)); + } + + return p; +} + +static int fill_one_span_read(unsigned char *start_ptr, unsigned char *end_ptr) +{ + unsigned char sum, *p; + + sum = 0; + p = start_ptr; + while (p < end_ptr) { + sum += *p; + p += (CL_SIZE / 2); + } + + return sum; +} + +static +void fill_one_span_write(unsigned char *start_ptr, unsigned char *end_ptr) +{ + unsigned char *p; + + p = start_ptr; + while (p < end_ptr) { + *p = '1'; + p += (CL_SIZE / 2); + } +} + +static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, + char *resctrl_val) +{ + int ret = 0; + FILE *fp; + + while (1) { + ret = fill_one_span_read(start_ptr, end_ptr); + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) + break; + } + + /* Consume read result so that reading memory is not optimized out. 
*/ + fp = fopen("/dev/null", "w"); + if (!fp) + perror("Unable to write to /dev/null"); + fprintf(fp, "Sum: %d ", ret); + fclose(fp); + + return 0; +} + +static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr, + char *resctrl_val) +{ + while (1) { + fill_one_span_write(start_ptr, end_ptr); + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) + break; + } + + return 0; +} + +static int +fill_cache(unsigned long long buf_size, int malloc_and_init, int memflush, + int op, char *resctrl_val) +{ + unsigned char *start_ptr, *end_ptr; + unsigned long long i; + int ret; + + if (malloc_and_init) + start_ptr = malloc_and_init_memory(buf_size); + else + start_ptr = malloc(buf_size); + + if (!start_ptr) + return -1; + + startptr = start_ptr; + end_ptr = start_ptr + buf_size; + + /* + * It's better to touch the memory once to avoid any compiler + * optimizations + */ + if (!malloc_and_init) { + for (i = 0; i < buf_size; i++) + *start_ptr++ = (unsigned char)rand(); + } + + start_ptr = startptr; + + /* Flush the memory before using to avoid "cache hot pages" effect */ + if (memflush) + mem_flush(start_ptr, buf_size); + + if (op == 0) + ret = fill_cache_read(start_ptr, end_ptr, resctrl_val); + else + ret = fill_cache_write(start_ptr, end_ptr, resctrl_val); + + if (ret) { + printf("\n Error in fill cache read/write...\n"); + return -1; + } + + free(startptr); + + return 0; +} + +int run_fill_buf(unsigned long span, int malloc_and_init_memory, + int memflush, int op, char *resctrl_val) +{ + unsigned long long cache_size = span; + int ret; + + /* set up ctrl-c handler */ + if (signal(SIGINT, ctrl_handler) == SIG_ERR) + printf("Failed to catch SIGINT!\n"); + if (signal(SIGHUP, ctrl_handler) == SIG_ERR) + printf("Failed to catch SIGHUP!\n"); + + ret = fill_cache(cache_size, malloc_and_init_memory, memflush, op, + resctrl_val); + if (ret) { + printf("\n Error in fill cache\n"); + return -1; + } + + return 0; +} diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c new file mode 100644 index 0000000000000000000000000000000000000000..1a1bdb6180cf2ac32c2f30bf5ff7245f4dc5922f --- /dev/null +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Bandwidth Allocation (MBA) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +#define RESULT_FILE_NAME "result_mba" +#define NUM_OF_RUNS 5 +#define MAX_DIFF_PERCENT 5 +#define ALLOCATION_MAX 100 +#define ALLOCATION_MIN 10 +#define ALLOCATION_STEP 10 + +/* + * Change schemata percentage from 100 to 10%. Write schemata to specified + * con_mon grp, mon_grp in resctrl FS. + * For each allocation, run 5 times in order to get average values. + */ +static int mba_setup(int num, ...) 
+{ + static int runs_per_allocation, allocation = 100; + struct resctrl_val_param *p; + char allocation_str[64]; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + if (runs_per_allocation >= NUM_OF_RUNS) + runs_per_allocation = 0; + + /* Only set up schemata once every NUM_OF_RUNS of allocations */ + if (runs_per_allocation++ != 0) + return 0; + + if (allocation < ALLOCATION_MIN || allocation > ALLOCATION_MAX) + return -1; + + sprintf(allocation_str, "%d", allocation); + + write_schemata(p->ctrlgrp, allocation_str, p->cpu_no, p->resctrl_val); + allocation -= ALLOCATION_STEP; + + return 0; +} + +static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) +{ + int allocation, runs; + bool failed = false; + + ksft_print_msg("Results are displayed in (MB)\n"); + /* Memory bandwidth from 100% down to 10% */ + for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; + allocation++) { + unsigned long avg_bw_imc, avg_bw_resc; + unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + int avg_diff_per; + float avg_diff; + + /* + * The first run is discarded due to inaccurate value from + * phase transition. + */ + for (runs = NUM_OF_RUNS * allocation + 1; + runs < NUM_OF_RUNS * allocation + NUM_OF_RUNS ; runs++) { + sum_bw_imc += bw_imc[runs]; + sum_bw_resc += bw_resc[runs]; + } + + avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1); + avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); + avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; + avg_diff_per = (int)(avg_diff * 100); + + ksft_print_msg("%s Check MBA diff within %d%% for schemata %u\n", + avg_diff_per > MAX_DIFF_PERCENT ? + "Fail:" : "Pass:", + MAX_DIFF_PERCENT, + ALLOCATION_MAX - ALLOCATION_STEP * allocation); + + ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); + ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); + ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); + if (avg_diff_per > MAX_DIFF_PERCENT) + failed = true; + } + + ksft_print_msg("%s Check schemata change using MBA\n", + failed ? 
"Fail:" : "Pass:"); + if (failed) + ksft_print_msg("At least one test failed\n"); +} + +static int check_results(void) +{ + char *token_array[8], output[] = RESULT_FILE_NAME, temp[512]; + unsigned long bw_imc[1024], bw_resc[1024]; + int runs; + FILE *fp; + + fp = fopen(output, "r"); + if (!fp) { + perror(output); + + return errno; + } + + runs = 0; + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is perf imc value */ + bw_imc[runs] = strtoul(token_array[3], NULL, 0); + /* Field 5 is resctrl value */ + bw_resc[runs] = strtoul(token_array[5], NULL, 0); + runs++; + } + + fclose(fp); + + show_mba_info(bw_imc, bw_resc); + + return 0; +} + +void mba_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) +{ + struct resctrl_val_param param = { + .resctrl_val = MBA_STR, + .ctrlgrp = "c1", + .mongrp = "m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 1, + .filename = RESULT_FILE_NAME, + .bw_report = bw_report, + .setup = mba_setup + }; + int ret; + + remove(RESULT_FILE_NAME); + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(); + if (ret) + return ret; + + mba_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c new file mode 100644 index 0000000000000000000000000000000000000000..8392e5c55ed02599dd15446f27acce245faca90a --- /dev/null +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Bandwidth Monitoring (MBM) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +#define RESULT_FILE_NAME "result_mbm" +#define MAX_DIFF_PERCENT 5 +#define NUM_OF_RUNS 5 + +static int +show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) +{ + unsigned long avg_bw_imc = 0, avg_bw_resc = 0; + unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + int runs, ret, avg_diff_per; + float avg_diff = 0; + + /* + * Discard the first value which is inaccurate due to monitoring setup + * transition phase. + */ + for (runs = 1; runs < NUM_OF_RUNS ; runs++) { + sum_bw_imc += bw_imc[runs]; + sum_bw_resc += bw_resc[runs]; + } + + avg_bw_imc = sum_bw_imc / 4; + avg_bw_resc = sum_bw_resc / 4; + avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; + avg_diff_per = (int)(avg_diff * 100); + + ret = avg_diff_per > MAX_DIFF_PERCENT; + ksft_print_msg("%s Check MBM diff within %d%%\n", + ret ? 
"Fail:" : "Pass:", MAX_DIFF_PERCENT); + ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); + ksft_print_msg("Span (MB): %d\n", span); + ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); + ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); + + return ret; +} + +static int check_results(int span) +{ + unsigned long bw_imc[NUM_OF_RUNS], bw_resc[NUM_OF_RUNS]; + char temp[1024], *token_array[8]; + char output[] = RESULT_FILE_NAME; + int runs, ret; + FILE *fp; + + ksft_print_msg("Checking for pass/fail\n"); + + fp = fopen(output, "r"); + if (!fp) { + perror(output); + + return errno; + } + + runs = 0; + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int i = 0; + + while (token) { + token_array[i++] = token; + token = strtok(NULL, ":\t"); + } + + bw_resc[runs] = strtoul(token_array[5], NULL, 0); + bw_imc[runs] = strtoul(token_array[3], NULL, 0); + runs++; + } + + ret = show_bw_info(bw_imc, bw_resc, span); + + fclose(fp); + + return ret; +} + +static int mbm_setup(int num, ...) +{ + struct resctrl_val_param *p; + static int num_of_runs; + va_list param; + int ret = 0; + + /* Run NUM_OF_RUNS times */ + if (num_of_runs++ >= NUM_OF_RUNS) + return -1; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Set up shemata with 100% allocation on the first run. */ + if (num_of_runs == 0) + ret = write_schemata(p->ctrlgrp, "100", p->cpu_no, + p->resctrl_val); + + return ret; +} + +void mbm_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) +{ + struct resctrl_val_param param = { + .resctrl_val = MBM_STR, + .ctrlgrp = "c1", + .mongrp = "m1", + .span = span, + .cpu_no = cpu_no, + .mum_resctrlfs = 1, + .filename = RESULT_FILE_NAME, + .bw_report = bw_report, + .setup = mbm_setup + }; + int ret; + + remove(RESULT_FILE_NAME); + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(span); + if (ret) + return ret; + + mbm_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h new file mode 100644 index 0000000000000000000000000000000000000000..1ad10c47e31d1541b474007ac4e5d199b60e4b81 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define _GNU_SOURCE +#ifndef RESCTRL_H +#define RESCTRL_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" + +#define MB (1024 * 1024) +#define RESCTRL_PATH "/sys/fs/resctrl" +#define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" +#define CBM_MASK_PATH "/sys/fs/resctrl/info" +#define L3_PATH "/sys/fs/resctrl/info/L3" +#define MB_PATH "/sys/fs/resctrl/info/MB" +#define L3_MON_PATH "/sys/fs/resctrl/info/L3_MON" +#define L3_MON_FEATURES_PATH "/sys/fs/resctrl/info/L3_MON/mon_features" + +#define PARENT_EXIT(err_msg) \ + do { \ + perror(err_msg); \ + kill(ppid, SIGKILL); \ + exit(EXIT_FAILURE); \ + } while (0) + +/* + * resctrl_val_param: resctrl test parameters + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. 
etc) + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number to which the benchmark would be binded + * @span: Memory bytes accessed in each benchmark iteration + * @mum_resctrlfs: Should the resctrl FS be remounted? + * @filename: Name of file to which the o/p should be written + * @bw_report: Bandwidth report type (reads vs writes) + * @setup: Call back function to setup test environment + */ +struct resctrl_val_param { + char *resctrl_val; + char ctrlgrp[64]; + char mongrp[64]; + int cpu_no; + unsigned long span; + int mum_resctrlfs; + char filename[64]; + char *bw_report; + unsigned long mask; + int num_of_runs; + int (*setup)(int num, ...); +}; + +#define MBM_STR "mbm" +#define MBA_STR "mba" +#define CMT_STR "cmt" +#define CAT_STR "cat" + +extern pid_t bm_pid, ppid; + +extern char llc_occup_path[1024]; +extern bool is_amd; + +bool check_resctrlfs_support(void); +int filter_dmesg(void); +int remount_resctrlfs(bool mum_resctrlfs); +int get_resource_id(int cpu_no, int *resource_id); +int umount_resctrlfs(void); +int validate_bw_report_request(char *bw_report); +bool validate_resctrl_feature_request(const char *resctrl_val); +char *fgrep(FILE *inf, const char *str); +int taskset_benchmark(pid_t bm_pid, int cpu_no); +void run_benchmark(int signum, siginfo_t *info, void *ucontext); +int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, + char *resctrl_val); +int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, + char *resctrl_val); +int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, + int group_fd, unsigned long flags); +int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush, + int op, char *resctrl_va); +int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param); +int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd); +void tests_cleanup(void); +void mbm_test_cleanup(void); +int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd); +void mba_test_cleanup(void); +int get_cbm_mask(char *cache_type, char *cbm_mask); +int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); +void ctrlc_handler(int signum, siginfo_t *info, void *ptr); +int cat_val(struct resctrl_val_param *param); +void cat_test_cleanup(void); +int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type); +int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd); +unsigned int count_bits(unsigned long n); +void cmt_test_cleanup(void); +int get_core_sibling(int cpu_no); +int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); +int show_cache_info(unsigned long sum_llc_val, int no_of_bits, + unsigned long cache_span, unsigned long max_diff, + unsigned long max_diff_percent, unsigned long num_of_runs, + bool platform, bool cmt); + +#endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c new file mode 100644 index 0000000000000000000000000000000000000000..973f09a66e1eea9c9bcc952e68b2140e080deccf --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resctrl tests + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +#define BENCHMARK_ARGS 64 +#define BENCHMARK_ARG_SIZE 64 + +bool is_amd; + +void detect_amd(void) +{ + FILE *inf = fopen("/proc/cpuinfo", "r"); 
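+ /* main() skips the iMC-based MBM and MBA tests when is_amd is set; get_resource_id() also uses it to pick the L3 cache id path on AMD. */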
+ char *res; + + if (!inf) + return; + + res = fgrep(inf, "vendor_id"); + + if (res) { + char *s = strchr(res, ':'); + + is_amd = s && !strcmp(s, ": AuthenticAMD\n"); + free(res); + } + fclose(inf); +} + +static void cmd_help(void) +{ + printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT\n"); + printf("\t default benchmark is builtin fill_buf\n"); + printf("\t-t test list: run tests specified in the test list, "); + printf("e.g. -t mbm,mba,cmt,cat\n"); + printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); + printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n"); + printf("\t-h: help\n"); +} + +void tests_cleanup(void) +{ + mbm_test_cleanup(); + mba_test_cleanup(); + cmt_test_cleanup(); + cat_test_cleanup(); +} + +static void run_mbm_test(bool has_ben, char **benchmark_cmd, int span, + int cpu_no, char *bw_report) +{ + int res; + + ksft_print_msg("Starting MBM BW change ...\n"); + + if (!validate_resctrl_feature_request(MBM_STR)) { + ksft_test_result_skip("Hardware does not support MBM or MBM is disabled\n"); + return; + } + + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", MBA_STR); + res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); + ksft_test_result(!res, "MBM: bw change\n"); + mbm_test_cleanup(); +} + +static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, + int cpu_no, char *bw_report) +{ + int res; + + ksft_print_msg("Starting MBA Schemata change ...\n"); + + if (!validate_resctrl_feature_request(MBA_STR)) { + ksft_test_result_skip("Hardware does not support MBA or MBA is disabled\n"); + return; + } + + if (!has_ben) + sprintf(benchmark_cmd[1], "%d", span); + res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); + ksft_test_result(!res, "MBA: schemata change\n"); + mba_test_cleanup(); +} + +static void run_cmt_test(bool has_ben, char **benchmark_cmd, int cpu_no) +{ + int res; + + ksft_print_msg("Starting CMT test ...\n"); + if (!validate_resctrl_feature_request(CMT_STR)) { + ksft_test_result_skip("Hardware does not support CMT or CMT is disabled\n"); + return; + } + + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", CMT_STR); + res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); + ksft_test_result(!res, "CMT: test\n"); + cmt_test_cleanup(); +} + +static void run_cat_test(int cpu_no, int no_of_bits) +{ + int res; + + ksft_print_msg("Starting CAT test ...\n"); + + if (!validate_resctrl_feature_request(CAT_STR)) { + ksft_test_result_skip("Hardware does not support CAT or CAT is disabled\n"); + return; + } + + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); + ksft_test_result(!res, "CAT: test\n"); + cat_test_cleanup(); +} + +int main(int argc, char **argv) +{ + bool has_ben = false, mbm_test = true, mba_test = true, cmt_test = true; + int c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 0; + char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; + char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; + int ben_ind, ben_count, tests = 0; + bool cat_test = true; + + for (i = 0; i < argc; i++) { + if (strcmp(argv[i], "-b") == 0) { + ben_ind = i + 1; + ben_count = argc - ben_ind; + argc_new = ben_ind - 1; + has_ben = true; + break; + } + } + + while ((c = getopt(argc_new, argv, "ht:b:n:p:")) != -1) { + char *token; + + switch (c) { + case 't': + token = strtok(optarg, ","); + + mbm_test = false; + mba_test = false; + cmt_test = false; + 
cat_test = false; + while (token) { + if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) { + mbm_test = true; + tests++; + } else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) { + mba_test = true; + tests++; + } else if (!strncmp(token, CMT_STR, sizeof(CMT_STR))) { + cmt_test = true; + tests++; + } else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) { + cat_test = true; + tests++; + } else { + printf("invalid argument\n"); + + return -1; + } + token = strtok(NULL, ","); + } + break; + case 'p': + cpu_no = atoi(optarg); + break; + case 'n': + no_of_bits = atoi(optarg); + if (no_of_bits <= 0) { + printf("Bail out! invalid argument for no_of_bits\n"); + return -1; + } + break; + case 'h': + cmd_help(); + + return 0; + default: + printf("invalid argument\n"); + + return -1; + } + } + + ksft_print_header(); + + /* + * Typically we need root privileges, because: + * 1. We write to resctrl FS + * 2. We execute perf commands + */ + if (geteuid() != 0) + return ksft_exit_fail_msg("Not running as root, abort testing.\n"); + + /* Detect AMD vendor */ + detect_amd(); + + if (has_ben) { + /* Extract benchmark command from command line. */ + for (i = ben_ind; i < argc; i++) { + benchmark_cmd[i - ben_ind] = benchmark_cmd_area[i]; + sprintf(benchmark_cmd[i - ben_ind], "%s", argv[i]); + } + benchmark_cmd[ben_count] = NULL; + } else { + /* If no benchmark is given by "-b" argument, use fill_buf. */ + for (i = 0; i < 6; i++) + benchmark_cmd[i] = benchmark_cmd_area[i]; + + strcpy(benchmark_cmd[0], "fill_buf"); + sprintf(benchmark_cmd[1], "%d", span); + strcpy(benchmark_cmd[2], "1"); + strcpy(benchmark_cmd[3], "1"); + strcpy(benchmark_cmd[4], "0"); + strcpy(benchmark_cmd[5], ""); + benchmark_cmd[6] = NULL; + } + + sprintf(bw_report, "reads"); + sprintf(bm_type, "fill_buf"); + + if (!check_resctrlfs_support()) + return ksft_exit_fail_msg("resctrl FS does not exist\n"); + + filter_dmesg(); + + ksft_set_plan(tests ? 
: 4); + + if (!is_amd && mbm_test) + run_mbm_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); + + if (!is_amd && mba_test) + run_mba_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); + + if (cmt_test) + run_cmt_test(has_ben, benchmark_cmd, cpu_no); + + if (cat_test) + run_cat_test(cpu_no, no_of_bits); + + umount_resctrlfs(); + + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c new file mode 100644 index 0000000000000000000000000000000000000000..95224345c78e75c755f925314c03f4f19136df6d --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory bandwidth monitoring and allocation library + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +#define UNCORE_IMC "uncore_imc" +#define READ_FILE_NAME "events/cas_count_read" +#define WRITE_FILE_NAME "events/cas_count_write" +#define DYN_PMU_PATH "/sys/bus/event_source/devices" +#define SCALE 0.00006103515625 +#define MAX_IMCS 20 +#define MAX_TOKENS 5 +#define READ 0 +#define WRITE 1 +#define CON_MON_MBM_LOCAL_BYTES_PATH \ + "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define CON_MBM_LOCAL_BYTES_PATH \ + "%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define MON_MBM_LOCAL_BYTES_PATH \ + "%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define MBM_LOCAL_BYTES_PATH \ + "%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define CON_MON_LCC_OCCUP_PATH \ + "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define CON_LCC_OCCUP_PATH \ + "%s/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define MON_LCC_OCCUP_PATH \ + "%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define LCC_OCCUP_PATH \ + "%s/mon_data/mon_L3_%02d/llc_occupancy" + +struct membw_read_format { + __u64 value; /* The value of the event */ + __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ + __u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ + __u64 id; /* if PERF_FORMAT_ID */ +}; + +struct imc_counter_config { + __u32 type; + __u64 event; + __u64 umask; + struct perf_event_attr pe; + struct membw_read_format return_value; + int fd; +}; + +static char mbm_total_path[1024]; +static int imcs; +static struct imc_counter_config imc_counters_config[MAX_IMCS][2]; + +void membw_initialize_perf_event_attr(int i, int j) +{ + memset(&imc_counters_config[i][j].pe, 0, + sizeof(struct perf_event_attr)); + imc_counters_config[i][j].pe.type = imc_counters_config[i][j].type; + imc_counters_config[i][j].pe.size = sizeof(struct perf_event_attr); + imc_counters_config[i][j].pe.disabled = 1; + imc_counters_config[i][j].pe.inherit = 1; + imc_counters_config[i][j].pe.exclude_guest = 0; + imc_counters_config[i][j].pe.config = + imc_counters_config[i][j].umask << 8 | + imc_counters_config[i][j].event; + imc_counters_config[i][j].pe.sample_type = PERF_SAMPLE_IDENTIFIER; + imc_counters_config[i][j].pe.read_format = + PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; +} + +void membw_ioctl_perf_event_ioc_reset_enable(int i, int j) +{ + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0); + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0); +} + +void membw_ioctl_perf_event_ioc_disable(int i, int j) +{ + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0); +} + +/* + * get_event_and_umask: Parse config into event and umask + * @cas_count_cfg: Config + * @count: iMC 
number + * @op: Operation (read/write) + */ +void get_event_and_umask(char *cas_count_cfg, int count, bool op) +{ + char *token[MAX_TOKENS]; + int i = 0; + + strcat(cas_count_cfg, ","); + token[0] = strtok(cas_count_cfg, "=,"); + + for (i = 1; i < MAX_TOKENS; i++) + token[i] = strtok(NULL, "=,"); + + for (i = 0; i < MAX_TOKENS; i++) { + if (!token[i]) + break; + if (strcmp(token[i], "event") == 0) { + if (op == READ) + imc_counters_config[count][READ].event = + strtol(token[i + 1], NULL, 16); + else + imc_counters_config[count][WRITE].event = + strtol(token[i + 1], NULL, 16); + } + if (strcmp(token[i], "umask") == 0) { + if (op == READ) + imc_counters_config[count][READ].umask = + strtol(token[i + 1], NULL, 16); + else + imc_counters_config[count][WRITE].umask = + strtol(token[i + 1], NULL, 16); + } + } +} + +static int open_perf_event(int i, int cpu_no, int j) +{ + imc_counters_config[i][j].fd = + perf_event_open(&imc_counters_config[i][j].pe, -1, cpu_no, -1, + PERF_FLAG_FD_CLOEXEC); + + if (imc_counters_config[i][j].fd == -1) { + fprintf(stderr, "Error opening leader %llx\n", + imc_counters_config[i][j].pe.config); + + return -1; + } + + return 0; +} + +/* Get type and config (read and write) of an iMC counter */ +static int read_from_imc_dir(char *imc_dir, int count) +{ + char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024]; + FILE *fp; + + /* Get type of iMC counter */ + sprintf(imc_counter_type, "%s%s", imc_dir, "type"); + fp = fopen(imc_counter_type, "r"); + if (!fp) { + perror("Failed to open imc counter type file"); + + return -1; + } + if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) { + perror("Could not get imc type"); + fclose(fp); + + return -1; + } + fclose(fp); + + imc_counters_config[count][WRITE].type = + imc_counters_config[count][READ].type; + + /* Get read config */ + sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME); + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + perror("Failed to open imc config file"); + + return -1; + } + if (fscanf(fp, "%s", cas_count_cfg) <= 0) { + perror("Could not get imc cas count read"); + fclose(fp); + + return -1; + } + fclose(fp); + + get_event_and_umask(cas_count_cfg, count, READ); + + /* Get write config */ + sprintf(imc_counter_cfg, "%s%s", imc_dir, WRITE_FILE_NAME); + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + perror("Failed to open imc config file"); + + return -1; + } + if (fscanf(fp, "%s", cas_count_cfg) <= 0) { + perror("Could not get imc cas count write"); + fclose(fp); + + return -1; + } + fclose(fp); + + get_event_and_umask(cas_count_cfg, count, WRITE); + + return 0; +} + +/* + * A system can have 'n' number of iMC (Integrated Memory Controller) + * counters, get that 'n'. For each iMC counter get it's type and config. + * Also, each counter has two configs, one for read and the other for write. + * A config again has two parts, event and umask. + * Enumerate all these details into an array of structures. + * + * Return: >= 0 on success. < 0 on failure. + */ +static int num_of_imcs(void) +{ + char imc_dir[512], *temp; + unsigned int count = 0; + struct dirent *ep; + int ret; + DIR *dp; + + dp = opendir(DYN_PMU_PATH); + if (dp) { + while ((ep = readdir(dp))) { + temp = strstr(ep->d_name, UNCORE_IMC); + if (!temp) + continue; + + /* + * imc counters are named as "uncore_imc_", hence + * increment the pointer to point to . Note that + * sizeof(UNCORE_IMC) would count for null character as + * well and hence the last underscore character in + * uncore_imc'_' need not be counted. 
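+ * For example, "uncore_imc_3" leaves temp pointing at the "3".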
+ */ + temp = temp + sizeof(UNCORE_IMC); + + /* + * Some directories under "DYN_PMU_PATH" could have + * names like "uncore_imc_free_running", hence, check if + * first character is a numerical digit or not. + */ + if (temp[0] >= '0' && temp[0] <= '9') { + sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, + ep->d_name); + ret = read_from_imc_dir(imc_dir, count); + if (ret) { + closedir(dp); + + return ret; + } + count++; + } + } + closedir(dp); + if (count == 0) { + perror("Unable find iMC counters!\n"); + + return -1; + } + } else { + perror("Unable to open PMU directory!\n"); + + return -1; + } + + return count; +} + +static int initialize_mem_bw_imc(void) +{ + int imc, j; + + imcs = num_of_imcs(); + if (imcs <= 0) + return imcs; + + /* Initialize perf_event_attr structures for all iMC's */ + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) + membw_initialize_perf_event_attr(imc, j); + } + + return 0; +} + +/* + * get_mem_bw_imc: Memory band width as reported by iMC counters + * @cpu_no: CPU number that the benchmark PID is binded to + * @bw_report: Bandwidth report type (reads, writes) + * + * Memory B/W utilized by a process on a socket can be calculated using + * iMC counters. Perf events are used to read these counters. + * + * Return: = 0 on success. < 0 on failure. + */ +static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc) +{ + float reads, writes, of_mul_read, of_mul_write; + int imc, j, ret; + + /* Start all iMC counters to log values (both read and write) */ + reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1; + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) { + ret = open_perf_event(imc, cpu_no, j); + if (ret) + return -1; + } + for (j = 0; j < 2; j++) + membw_ioctl_perf_event_ioc_reset_enable(imc, j); + } + + sleep(1); + + /* Stop counters after a second to get results (both read and write) */ + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) + membw_ioctl_perf_event_ioc_disable(imc, j); + } + + /* + * Get results which are stored in struct type imc_counter_config + * Take over flow into consideration before calculating total b/w + */ + for (imc = 0; imc < imcs; imc++) { + struct imc_counter_config *r = + &imc_counters_config[imc][READ]; + struct imc_counter_config *w = + &imc_counters_config[imc][WRITE]; + + if (read(r->fd, &r->return_value, + sizeof(struct membw_read_format)) == -1) { + perror("Couldn't get read b/w through iMC"); + + return -1; + } + + if (read(w->fd, &w->return_value, + sizeof(struct membw_read_format)) == -1) { + perror("Couldn't get write bw through iMC"); + + return -1; + } + + __u64 r_time_enabled = r->return_value.time_enabled; + __u64 r_time_running = r->return_value.time_running; + + if (r_time_enabled != r_time_running) + of_mul_read = (float)r_time_enabled / + (float)r_time_running; + + __u64 w_time_enabled = w->return_value.time_enabled; + __u64 w_time_running = w->return_value.time_running; + + if (w_time_enabled != w_time_running) + of_mul_write = (float)w_time_enabled / + (float)w_time_running; + reads += r->return_value.value * of_mul_read * SCALE; + writes += w->return_value.value * of_mul_write * SCALE; + } + + for (imc = 0; imc < imcs; imc++) { + close(imc_counters_config[imc][READ].fd); + close(imc_counters_config[imc][WRITE].fd); + } + + if (strcmp(bw_report, "reads") == 0) { + *bw_imc = reads; + return 0; + } + + if (strcmp(bw_report, "writes") == 0) { + *bw_imc = writes; + return 0; + } + + *bw_imc = reads + writes; + return 0; +} + +void set_mbm_path(const char *ctrlgrp, const char 
*mongrp, int resource_id) +{ + if (ctrlgrp && mongrp) + sprintf(mbm_total_path, CON_MON_MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, ctrlgrp, mongrp, resource_id); + else if (!ctrlgrp && mongrp) + sprintf(mbm_total_path, MON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + mongrp, resource_id); + else if (ctrlgrp && !mongrp) + sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + ctrlgrp, resource_id); + else if (!ctrlgrp && !mongrp) + sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + resource_id); +} + +/* + * initialize_mem_bw_resctrl: Appropriately populate "mbm_total_path" + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + */ +static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, + int cpu_no, char *resctrl_val) +{ + int resource_id; + + if (get_resource_id(cpu_no, &resource_id) < 0) { + perror("Could not get resource_id"); + return; + } + + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) + set_mbm_path(ctrlgrp, mongrp, resource_id); + + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { + if (ctrlgrp) + sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, ctrlgrp, resource_id); + else + sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, resource_id); + } +} + +/* + * Get MBM Local bytes as reported by resctrl FS + * For MBM, + * 1. If con_mon grp and mon grp are given, then read from con_mon grp's mon grp + * 2. If only con_mon grp is given, then read from con_mon grp + * 3. If both are not given, then read from root con_mon grp + * For MBA, + * 1. If con_mon grp is given, then read from it + * 2. If con_mon grp is not given, then read from root con_mon grp + */ +static int get_mem_bw_resctrl(unsigned long *mbm_total) +{ + FILE *fp; + + fp = fopen(mbm_total_path, "r"); + if (!fp) { + perror("Failed to open total bw file"); + + return -1; + } + if (fscanf(fp, "%lu", mbm_total) <= 0) { + perror("Could not get mbm local bytes"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +pid_t bm_pid, ppid; + +void ctrlc_handler(int signum, siginfo_t *info, void *ptr) +{ + kill(bm_pid, SIGKILL); + umount_resctrlfs(); + tests_cleanup(); + ksft_print_msg("Ending\n\n"); + + exit(EXIT_SUCCESS); +} + +/* + * print_results_bw: the memory bandwidth results are stored in a file + * @filename: file that stores the results + * @bm_pid: child pid that runs benchmark + * @bw_imc: perf imc counter value + * @bw_resc: memory bandwidth value + * + * Return: 0 on success. non-zero on failure. 
+ */ +static int print_results_bw(char *filename, int bm_pid, float bw_imc, + unsigned long bw_resc) +{ + unsigned long diff = fabs(bw_imc - bw_resc); + FILE *fp; + + if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { + printf("Pid: %d \t Mem_BW_iMC: %f \t ", bm_pid, bw_imc); + printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff); + } else { + fp = fopen(filename, "a"); + if (!fp) { + perror("Cannot open results file"); + + return errno; + } + if (fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n", + bm_pid, bw_imc, bw_resc, diff) <= 0) { + fclose(fp); + perror("Could not log results."); + + return errno; + } + fclose(fp); + } + + return 0; +} + +static void set_cmt_path(const char *ctrlgrp, const char *mongrp, char sock_num) +{ + if (strlen(ctrlgrp) && strlen(mongrp)) + sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH, + ctrlgrp, mongrp, sock_num); + else if (!strlen(ctrlgrp) && strlen(mongrp)) + sprintf(llc_occup_path, MON_LCC_OCCUP_PATH, RESCTRL_PATH, + mongrp, sock_num); + else if (strlen(ctrlgrp) && !strlen(mongrp)) + sprintf(llc_occup_path, CON_LCC_OCCUP_PATH, RESCTRL_PATH, + ctrlgrp, sock_num); + else if (!strlen(ctrlgrp) && !strlen(mongrp)) + sprintf(llc_occup_path, LCC_OCCUP_PATH, RESCTRL_PATH, sock_num); +} + +/* + * initialize_llc_occu_resctrl: Appropriately populate "llc_occup_path" + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: cat, cmt.. etc) + */ +static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, + int cpu_no, char *resctrl_val) +{ + int resource_id; + + if (get_resource_id(cpu_no, &resource_id) < 0) { + perror("# Unable to resource_id"); + return; + } + + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) + set_cmt_path(ctrlgrp, mongrp, resource_id); +} + +static int +measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) +{ + unsigned long bw_resc, bw_resc_end; + float bw_imc; + int ret; + + /* + * Measure memory bandwidth from resctrl and from + * another source which is perf imc value or could + * be something else if perf imc event is not available. + * Compare the two values to validate resctrl value. + * It takes 1sec to measure the data. + */ + ret = get_mem_bw_imc(param->cpu_no, param->bw_report, &bw_imc); + if (ret < 0) + return ret; + + ret = get_mem_bw_resctrl(&bw_resc_end); + if (ret < 0) + return ret; + + bw_resc = (bw_resc_end - *bw_resc_start) / MB; + ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc); + if (ret) + return ret; + + *bw_resc_start = bw_resc_end; + + return 0; +} + +/* + * resctrl_val: execute benchmark and measure memory bandwidth on + * the benchmark + * @benchmark_cmd: benchmark command and its arguments + * @param: parameters passed to resctrl_val() + * + * Return: 0 on success. non-zero on failure. 
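+ *
+ * The parent forks the benchmark child, binds it to param->cpu_no, writes it into the requested resctrl groups and then loops over the setup() callback and the measurement helpers until setup() ends the test.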
+ */ +int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) +{ + char *resctrl_val = param->resctrl_val; + unsigned long bw_resc_start = 0; + struct sigaction sigact; + int ret = 0, pipefd[2]; + char pipe_message = 0; + union sigval value; + + if (strcmp(param->filename, "") == 0) + sprintf(param->filename, "stdio"); + + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { + ret = validate_bw_report_request(param->bw_report); + if (ret) + return ret; + } + + ret = remount_resctrlfs(param->mum_resctrlfs); + if (ret) + return ret; + + /* + * If benchmark wasn't successfully started by child, then child should + * kill parent, so save parent's pid + */ + ppid = getpid(); + + if (pipe(pipefd)) { + perror("# Unable to create pipe"); + + return -1; + } + + /* + * Fork to start benchmark, save child's pid so that it can be killed + * when needed + */ + bm_pid = fork(); + if (bm_pid == -1) { + perror("# Unable to fork"); + + return -1; + } + + if (bm_pid == 0) { + /* + * Mask all signals except SIGUSR1, parent uses SIGUSR1 to + * start benchmark + */ + sigfillset(&sigact.sa_mask); + sigdelset(&sigact.sa_mask, SIGUSR1); + + sigact.sa_sigaction = run_benchmark; + sigact.sa_flags = SA_SIGINFO; + + /* Register for "SIGUSR1" signal from parent */ + if (sigaction(SIGUSR1, &sigact, NULL)) + PARENT_EXIT("Can't register child for signal"); + + /* Tell parent that child is ready */ + close(pipefd[0]); + pipe_message = 1; + if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + perror("# failed signaling parent process"); + close(pipefd[1]); + return -1; + } + close(pipefd[1]); + + /* Suspend child until delivery of "SIGUSR1" from parent */ + sigsuspend(&sigact.sa_mask); + + PARENT_EXIT("Child is done"); + } + + ksft_print_msg("Benchmark PID: %d\n", bm_pid); + + /* + * Register CTRL-C handler for parent, as it has to kill benchmark + * before exiting + */ + sigact.sa_sigaction = ctrlc_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = SA_SIGINFO; + if (sigaction(SIGINT, &sigact, NULL) || + sigaction(SIGHUP, &sigact, NULL)) { + perror("# sigaction"); + ret = errno; + goto out; + } + + value.sival_ptr = benchmark_cmd; + + /* Taskset benchmark to specified cpu */ + ret = taskset_benchmark(bm_pid, param->cpu_no); + if (ret) + goto out; + + /* Write benchmark to specified control&monitoring grp in resctrl FS */ + ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, + resctrl_val); + if (ret) + goto out; + + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { + ret = initialize_mem_bw_imc(); + if (ret) + goto out; + + initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, + param->cpu_no, resctrl_val); + } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) + initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, + param->cpu_no, resctrl_val); + + /* Parent waits for child to be ready. 
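+ * The child signals readiness by writing one byte to the pipe after registering its SIGUSR1 handler.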
*/ + close(pipefd[1]); + while (pipe_message != 1) { + if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + perror("# failed reading message from child process"); + close(pipefd[0]); + goto out; + } + } + close(pipefd[0]); + + /* Signal child to start benchmark */ + if (sigqueue(bm_pid, SIGUSR1, value) == -1) { + perror("# sigqueue SIGUSR1 to child"); + ret = errno; + goto out; + } + + /* Give benchmark enough time to fully run */ + sleep(1); + + /* Test runs until the callback setup() tells the test to stop. */ + while (1) { + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + + ret = measure_vals(param, &bw_resc_start); + if (ret) + break; + } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + sleep(1); + ret = measure_cache_vals(param, bm_pid); + if (ret) + break; + } else { + break; + } + } + +out: + kill(bm_pid, SIGKILL); + umount_resctrlfs(); + + return ret; +} diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c new file mode 100644 index 0000000000000000000000000000000000000000..5f5a166ade60a268a93b4f34a5584342ddf5a788 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -0,0 +1,746 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Basic resctrl file system operations + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" + +static int find_resctrl_mount(char *buffer) +{ + FILE *mounts; + char line[256], *fs, *mntpoint; + + mounts = fopen("/proc/mounts", "r"); + if (!mounts) { + perror("/proc/mounts"); + return -ENXIO; + } + while (!feof(mounts)) { + if (!fgets(line, 256, mounts)) + break; + fs = strtok(line, " \t"); + if (!fs) + continue; + mntpoint = strtok(NULL, " \t"); + if (!mntpoint) + continue; + fs = strtok(NULL, " \t"); + if (!fs) + continue; + if (strcmp(fs, "resctrl")) + continue; + + fclose(mounts); + if (buffer) + strncpy(buffer, mntpoint, 256); + + return 0; + } + + fclose(mounts); + + return -ENOENT; +} + +/* + * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl + * @mum_resctrlfs: Should the resctrl FS be remounted? + * + * If not mounted, mount it. + * If mounted and mum_resctrlfs then remount resctrl FS. + * If mounted and !mum_resctrlfs then noop + * + * Return: 0 on success, non-zero on failure + */ +int remount_resctrlfs(bool mum_resctrlfs) +{ + char mountpoint[256]; + int ret; + + ret = find_resctrl_mount(mountpoint); + if (ret) + strcpy(mountpoint, RESCTRL_PATH); + + if (!ret && mum_resctrlfs && umount(mountpoint)) + ksft_print_msg("Fail: unmounting \"%s\"\n", mountpoint); + + if (!ret && !mum_resctrlfs) + return 0; + + ksft_print_msg("Mounting resctrl to \"%s\"\n", RESCTRL_PATH); + ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL); + if (ret) + perror("# mount"); + + return ret; +} + +int umount_resctrlfs(void) +{ + if (find_resctrl_mount(NULL)) + return 0; + + if (umount(RESCTRL_PATH)) { + perror("# Unable to umount resctrl"); + + return errno; + } + + return 0; +} + +/* + * get_resource_id - Get socket number/l3 id for a specified CPU + * @cpu_no: CPU number + * @resource_id: Socket number or l3_id + * + * Return: >= 0 on success, < 0 on failure. 
+ */ +int get_resource_id(int cpu_no, int *resource_id) +{ + char phys_pkg_path[1024]; + FILE *fp; + + if (is_amd) + sprintf(phys_pkg_path, "%s%d/cache/index3/id", + PHYS_ID_PATH, cpu_no); + else + sprintf(phys_pkg_path, "%s%d/topology/physical_package_id", + PHYS_ID_PATH, cpu_no); + + fp = fopen(phys_pkg_path, "r"); + if (!fp) { + perror("Failed to open physical_package_id"); + + return -1; + } + if (fscanf(fp, "%d", resource_id) <= 0) { + perror("Could not get socket number or l3 id"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * get_cache_size - Get cache size for a specified CPU + * @cpu_no: CPU number + * @cache_type: Cache level L2/L3 + * @cache_size: pointer to cache_size + * + * Return: = 0 on success, < 0 on failure. + */ +int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size) +{ + char cache_path[1024], cache_str[64]; + int length, i, cache_num; + FILE *fp; + + if (!strcmp(cache_type, "L3")) { + cache_num = 3; + } else if (!strcmp(cache_type, "L2")) { + cache_num = 2; + } else { + perror("Invalid cache level"); + return -1; + } + + sprintf(cache_path, "/sys/bus/cpu/devices/cpu%d/cache/index%d/size", + cpu_no, cache_num); + fp = fopen(cache_path, "r"); + if (!fp) { + perror("Failed to open cache size"); + + return -1; + } + if (fscanf(fp, "%s", cache_str) <= 0) { + perror("Could not get cache_size"); + fclose(fp); + + return -1; + } + fclose(fp); + + length = (int)strlen(cache_str); + + *cache_size = 0; + + for (i = 0; i < length; i++) { + if ((cache_str[i] >= '0') && (cache_str[i] <= '9')) + + *cache_size = *cache_size * 10 + (cache_str[i] - '0'); + + else if (cache_str[i] == 'K') + + *cache_size = *cache_size * 1024; + + else if (cache_str[i] == 'M') + + *cache_size = *cache_size * 1024 * 1024; + + else + break; + } + + return 0; +} + +#define CORE_SIBLINGS_PATH "/sys/bus/cpu/devices/cpu" + +/* + * get_cbm_mask - Get cbm mask for given cache + * @cache_type: Cache level L2/L3 + * @cbm_mask: cbm_mask returned as a string + * + * Return: = 0 on success, < 0 on failure. + */ +int get_cbm_mask(char *cache_type, char *cbm_mask) +{ + char cbm_mask_path[1024]; + FILE *fp; + + if (!cbm_mask) + return -1; + + sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type); + + fp = fopen(cbm_mask_path, "r"); + if (!fp) { + perror("Failed to open cache level"); + + return -1; + } + if (fscanf(fp, "%s", cbm_mask) <= 0) { + perror("Could not get max cbm_mask"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * get_core_sibling - Get sibling core id from the same socket for given CPU + * @cpu_no: CPU number + * + * Return: > 0 on success, < 0 on failure. + */ +int get_core_sibling(int cpu_no) +{ + char core_siblings_path[1024], cpu_list_str[64]; + int sibling_cpu_no = -1; + FILE *fp; + + sprintf(core_siblings_path, "%s%d/topology/core_siblings_list", + CORE_SIBLINGS_PATH, cpu_no); + + fp = fopen(core_siblings_path, "r"); + if (!fp) { + perror("Failed to open core siblings path"); + + return -1; + } + if (fscanf(fp, "%s", cpu_list_str) <= 0) { + perror("Could not get core_siblings list"); + fclose(fp); + + return -1; + } + fclose(fp); + + char *token = strtok(cpu_list_str, "-,"); + + while (token) { + sibling_cpu_no = atoi(token); + /* Skipping core 0 as we don't want to run test on core 0 */ + if (sibling_cpu_no != 0 && sibling_cpu_no != cpu_no) + break; + token = strtok(NULL, "-,"); + } + + return sibling_cpu_no; +} + +/* + * taskset_benchmark - Taskset PID (i.e. 
benchmark) to a specified cpu + * @bm_pid: PID that should be binded + * @cpu_no: CPU number at which the PID would be binded + * + * Return: 0 on success, non-zero on failure + */ +int taskset_benchmark(pid_t bm_pid, int cpu_no) +{ + cpu_set_t my_set; + + CPU_ZERO(&my_set); + CPU_SET(cpu_no, &my_set); + + if (sched_setaffinity(bm_pid, sizeof(cpu_set_t), &my_set)) { + perror("Unable to taskset benchmark"); + + return -1; + } + + return 0; +} + +/* + * run_benchmark - Run a specified benchmark or fill_buf (default benchmark) + * in specified signal. Direct benchmark stdio to /dev/null. + * @signum: signal number + * @info: signal info + * @ucontext: user context in signal handling + * + * Return: void + */ +void run_benchmark(int signum, siginfo_t *info, void *ucontext) +{ + int operation, ret, malloc_and_init_memory, memflush; + unsigned long span, buffer_span; + char **benchmark_cmd; + char resctrl_val[64]; + FILE *fp; + + benchmark_cmd = info->si_ptr; + + /* + * Direct stdio of child to /dev/null, so that only parent writes to + * stdio (console) + */ + fp = freopen("/dev/null", "w", stdout); + if (!fp) + PARENT_EXIT("Unable to direct benchmark status to /dev/null"); + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) { + /* Execute default fill_buf benchmark */ + span = strtoul(benchmark_cmd[1], NULL, 10); + malloc_and_init_memory = atoi(benchmark_cmd[2]); + memflush = atoi(benchmark_cmd[3]); + operation = atoi(benchmark_cmd[4]); + sprintf(resctrl_val, "%s", benchmark_cmd[5]); + + if (strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) + buffer_span = span * MB; + else + buffer_span = span; + + if (run_fill_buf(buffer_span, malloc_and_init_memory, memflush, + operation, resctrl_val)) + fprintf(stderr, "Error in running fill buffer\n"); + } else { + /* Execute specified benchmark */ + ret = execvp(benchmark_cmd[0], benchmark_cmd); + if (ret) + perror("wrong\n"); + } + + fclose(stdout); + PARENT_EXIT("Unable to run specified benchmark"); +} + +/* + * create_grp - Create a group only if one doesn't exist + * @grp_name: Name of the group + * @grp: Full path and name of the group + * @parent_grp: Full path and name of the parent group + * + * Return: 0 on success, non-zero on failure + */ +static int create_grp(const char *grp_name, char *grp, const char *parent_grp) +{ + int found_grp = 0; + struct dirent *ep; + DIR *dp; + + /* + * At this point, we are guaranteed to have resctrl FS mounted and if + * length of grp_name == 0, it means, user wants to use root con_mon + * grp, so do nothing + */ + if (strlen(grp_name) == 0) + return 0; + + /* Check if requested grp exists or not */ + dp = opendir(parent_grp); + if (dp) { + while ((ep = readdir(dp)) != NULL) { + if (strcmp(ep->d_name, grp_name) == 0) + found_grp = 1; + } + closedir(dp); + } else { + perror("Unable to open resctrl for group"); + + return -1; + } + + /* Requested grp doesn't exist, hence create it */ + if (found_grp == 0) { + if (mkdir(grp, 0) == -1) { + perror("Unable to create group"); + + return -1; + } + } + + return 0; +} + +static int write_pid_to_tasks(char *tasks, pid_t pid) +{ + FILE *fp; + + fp = fopen(tasks, "w"); + if (!fp) { + perror("Failed to open tasks file"); + + return -1; + } + if (fprintf(fp, "%d\n", pid) < 0) { + perror("Failed to wr pid to tasks file"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * write_bm_pid_to_resctrl - Write a PID (i.e. 
benchmark) to resctrl FS + * @bm_pid: PID that should be written + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + * + * If a con_mon grp is requested, create it and write pid to it, otherwise + * write pid to root con_mon grp. + * If a mon grp is requested, create it and write pid to it, otherwise + * pid is not written, this means that pid is in con_mon grp and hence + * should consult con_mon grp's mon_data directory for results. + * + * Return: 0 on success, non-zero on failure + */ +int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, + char *resctrl_val) +{ + char controlgroup[128], monitorgroup[512], monitorgroup_p[256]; + char tasks[1024]; + int ret = 0; + + if (strlen(ctrlgrp)) + sprintf(controlgroup, "%s/%s", RESCTRL_PATH, ctrlgrp); + else + sprintf(controlgroup, "%s", RESCTRL_PATH); + + /* Create control and monitoring group and write pid into it */ + ret = create_grp(ctrlgrp, controlgroup, RESCTRL_PATH); + if (ret) + goto out; + sprintf(tasks, "%s/tasks", controlgroup); + ret = write_pid_to_tasks(tasks, bm_pid); + if (ret) + goto out; + + /* Create mon grp and write pid into it for "mbm" and "cmt" test */ + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { + if (strlen(mongrp)) { + sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); + sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp); + ret = create_grp(mongrp, monitorgroup, monitorgroup_p); + if (ret) + goto out; + + sprintf(tasks, "%s/mon_groups/%s/tasks", + controlgroup, mongrp); + ret = write_pid_to_tasks(tasks, bm_pid); + if (ret) + goto out; + } + } + +out: + ksft_print_msg("Writing benchmark parameters to resctrl FS\n"); + if (ret) + perror("# writing to resctrlfs"); + + return ret; +} + +/* + * write_schemata - Update schemata of a con_mon grp + * @ctrlgrp: Name of the con_mon grp + * @schemata: Schemata that should be updated to + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. 
etc) + * + * Update schemata of a con_mon grp *only* if requested resctrl feature is + * allocation type + * + * Return: 0 on success, non-zero on failure + */ +int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) +{ + char controlgroup[1024], schema[1024], reason[64]; + int resource_id, ret = 0; + FILE *fp; + + if (strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) && + strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) && + strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) + return -ENOENT; + + if (!schemata) { + ksft_print_msg("Skipping empty schemata update\n"); + + return -1; + } + + if (get_resource_id(cpu_no, &resource_id) < 0) { + sprintf(reason, "Failed to get resource id"); + ret = -1; + + goto out; + } + + if (strlen(ctrlgrp) != 0) + sprintf(controlgroup, "%s/%s/schemata", RESCTRL_PATH, ctrlgrp); + else + sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); + + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) || + !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) + sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) + sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); + + fp = fopen(controlgroup, "w"); + if (!fp) { + sprintf(reason, "Failed to open control group"); + ret = -1; + + goto out; + } + + if (fprintf(fp, "%s\n", schema) < 0) { + sprintf(reason, "Failed to write schemata in control group"); + fclose(fp); + ret = -1; + + goto out; + } + fclose(fp); + +out: + ksft_print_msg("Write schema \"%s\" to resctrl FS%s%s\n", + schema, ret ? " # " : "", + ret ? reason : ""); + + return ret; +} + +bool check_resctrlfs_support(void) +{ + FILE *inf = fopen("/proc/filesystems", "r"); + DIR *dp; + char *res; + bool ret = false; + + if (!inf) + return false; + + res = fgrep(inf, "nodev\tresctrl\n"); + + if (res) { + ret = true; + free(res); + } + + fclose(inf); + + ksft_print_msg("%s Check kernel supports resctrl filesystem\n", + ret ? "Pass:" : "Fail:"); + + if (!ret) + return ret; + + dp = opendir(RESCTRL_PATH); + ksft_print_msg("%s Check resctrl mountpoint \"%s\" exists\n", + dp ? "Pass:" : "Fail:", RESCTRL_PATH); + if (dp) + closedir(dp); + + ksft_print_msg("resctrl filesystem %s mounted\n", + find_resctrl_mount(NULL) ? "not" : "is"); + + return ret; +} + +char *fgrep(FILE *inf, const char *str) +{ + char line[256]; + int slen = strlen(str); + + while (!feof(inf)) { + if (!fgets(line, 256, inf)) + break; + if (strncmp(line, str, slen)) + continue; + + return strdup(line); + } + + return NULL; +} + +/* + * validate_resctrl_feature_request - Check if requested feature is valid. 
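+ *
+ * The check looks for the matching resource directory under /sys/fs/resctrl/info and, for MBM and CMT, additionally greps mon_features for the events it needs.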
+ * @resctrl_val: Requested feature + * + * Return: True if the feature is supported, else false + */ +bool validate_resctrl_feature_request(const char *resctrl_val) +{ + struct stat statbuf; + bool found = false; + char *res; + FILE *inf; + + if (!resctrl_val) + return false; + + if (remount_resctrlfs(false)) + return false; + + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { + if (!stat(L3_PATH, &statbuf)) + return true; + } else if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { + if (!stat(MB_PATH, &statbuf)) + return true; + } else if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + if (!stat(L3_MON_PATH, &statbuf)) { + inf = fopen(L3_MON_FEATURES_PATH, "r"); + if (!inf) + return false; + + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + res = fgrep(inf, "llc_occupancy"); + if (res) { + found = true; + free(res); + } + } + + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { + res = fgrep(inf, "mbm_total_bytes"); + if (res) { + free(res); + res = fgrep(inf, "mbm_local_bytes"); + if (res) { + found = true; + free(res); + } + } + } + fclose(inf); + } + } + + return found; +} + +int filter_dmesg(void) +{ + char line[1024]; + FILE *fp; + int pipefds[2]; + pid_t pid; + int ret; + + ret = pipe(pipefds); + if (ret) { + perror("pipe"); + return ret; + } + pid = fork(); + if (pid == 0) { + close(pipefds[0]); + dup2(pipefds[1], STDOUT_FILENO); + execlp("dmesg", "dmesg", NULL); + perror("executing dmesg"); + exit(1); + } + close(pipefds[1]); + fp = fdopen(pipefds[0], "r"); + if (!fp) { + perror("fdopen(pipe)"); + kill(pid, SIGTERM); + + return -1; + } + + while (fgets(line, 1024, fp)) { + if (strstr(line, "intel_rdt:")) + ksft_print_msg("dmesg: %s", line); + if (strstr(line, "resctrl:")) + ksft_print_msg("dmesg: %s", line); + } + fclose(fp); + waitpid(pid, NULL, 0); + + return 0; +} + +int validate_bw_report_request(char *bw_report) +{ + if (strcmp(bw_report, "reads") == 0) + return 0; + if (strcmp(bw_report, "writes") == 0) + return 0; + if (strcmp(bw_report, "nt-writes") == 0) { + strcpy(bw_report, "writes"); + return 0; + } + if (strcmp(bw_report, "total") == 0) + return 0; + + fprintf(stderr, "Requested iMC B/W report type unavailable\n"); + + return -1; +} + +int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, + int group_fd, unsigned long flags) +{ + int ret; + + ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); + return ret; +} + +unsigned int count_bits(unsigned long n) +{ + unsigned int count = 0; + + while (n) { + count += n & 1; + n >>= 1; + } + + return count; +} diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c index ad0f8df2ca0af7bccd6a6bd46d36d22480503675..cf2f99218d9419dd209043f4f27006317db9b066 100644 --- a/tools/testing/selftests/sigaltstack/sas.c +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -24,6 +25,11 @@ #define SS_AUTODISARM (1U << 31) #endif +#ifndef AT_MINSIGSTKSZ +#define AT_MINSIGSTKSZ 51 +#endif + +static unsigned int stack_size; static void *sstack, *ustack; static ucontext_t uc, sc; static const char *msg = "[OK]\tStack preserved"; @@ -47,7 +53,7 @@ void my_usr1(int sig, siginfo_t *si, void *u) #endif if (sp < (unsigned long)sstack || - sp >= (unsigned long)sstack + SIGSTKSZ) { + sp >= (unsigned long)sstack + stack_size) { ksft_exit_fail_msg("SP is not on sigaltstack\n"); } /* put some data on stack. 
diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c
index ad0f8df2ca0af7bccd6a6bd46d36d22480503675..cf2f99218d9419dd209043f4f27006317db9b066 100644
--- a/tools/testing/selftests/sigaltstack/sas.c
+++ b/tools/testing/selftests/sigaltstack/sas.c
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <sys/auxv.h>
 #include "../kselftest.h"
 
@@ -24,6 +25,11 @@
 #define SS_AUTODISARM (1U << 31)
 #endif
 
+#ifndef AT_MINSIGSTKSZ
+#define AT_MINSIGSTKSZ 51
+#endif
+
+static unsigned int stack_size;
 static void *sstack, *ustack;
 static ucontext_t uc, sc;
 static const char *msg = "[OK]\tStack preserved";
@@ -47,7 +53,7 @@ void my_usr1(int sig, siginfo_t *si, void *u)
 #endif
 	if (sp < (unsigned long)sstack ||
-	    sp >= (unsigned long)sstack + SIGSTKSZ) {
+	    sp >= (unsigned long)sstack + stack_size) {
 		ksft_exit_fail_msg("SP is not on sigaltstack\n");
 	}
 	/* put some data on stack. other sighandler will try to overwrite it */
@@ -108,6 +114,10 @@ int main(void)
 	stack_t stk;
 	int err;
 
+	/* Make sure more than the required minimum. */
+	stack_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
+	ksft_print_msg("[NOTE]\tthe stack size is %u\n", stack_size);
+
 	ksft_print_header();
 	ksft_set_plan(3);
 
@@ -117,7 +127,7 @@ int main(void)
 	sigaction(SIGUSR1, &act, NULL);
 	act.sa_sigaction = my_usr2;
 	sigaction(SIGUSR2, &act, NULL);
-	sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+	sstack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
 		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 	if (sstack == MAP_FAILED) {
 		ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
@@ -139,7 +149,7 @@ int main(void)
 	}
 
 	stk.ss_sp = sstack;
-	stk.ss_size = SIGSTKSZ;
+	stk.ss_size = stack_size;
 	stk.ss_flags = SS_ONSTACK | SS_AUTODISARM;
 	err = sigaltstack(&stk, NULL);
 	if (err) {
@@ -161,7 +171,7 @@ int main(void)
 		}
 	}
 
-	ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+	ustack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
 		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 	if (ustack == MAP_FAILED) {
 		ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
@@ -170,7 +180,7 @@ int main(void)
 	getcontext(&uc);
 	uc.uc_link = NULL;
 	uc.uc_stack.ss_sp = ustack;
-	uc.uc_stack.ss_size = SIGSTKSZ;
+	uc.uc_stack.ss_size = stack_size;
 	makecontext(&uc, switch_fn, 0);
 	raise(SIGUSR1);
 
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index ac44a33b4c39bbc5468e1cd54757662ce0ff5a98..0e2e8f6ce2fb393b24bc8550c3d3c1c1695736d7 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -13,11 +13,12 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
 			check_initial_reg_state sigreturn iopl ioperm \
 			protection_keys test_vdso test_vsyscall mov_ss_trap \
-			syscall_arg_fault fsgsbase_restore
+			syscall_arg_fault sigaltstack fsgsbase_restore
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
-TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering
+TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \
+			amx
 
 # Some selftests require 32bit support enabled also on 64bit systems
 TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c
new file mode 100644
index 0000000000000000000000000000000000000000..2189f0322d8bf34eca506fc4a33af52875da2e02
--- /dev/null
+++ b/tools/testing/selftests/x86/amx.c
@@ -0,0 +1,863 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <ucontext.h>
+#include <unistd.h>
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+#define XSAVE_HDR_OFFSET 512
+#define XSAVE_HDR_SIZE 64
+
+struct xsave_buffer {
+	union {
+		struct {
+			char legacy[XSAVE_HDR_OFFSET];
+			char header[XSAVE_HDR_SIZE];
+			char extended[0];
+		};
+		char bytes[0];
+	};
+};
+
+static inline uint64_t xgetbv(uint32_t index)
+{
+	uint32_t eax, edx;
+
+	asm volatile("xgetbv;"
+		     : "=a" (eax), "=d" (edx)
+		     : "c" (index));
+	return eax + ((uint64_t)edx << 32);
+}
+
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+	asm volatile("cpuid;"
+		     : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
+		     : "0" (*eax), "2" (*ecx));
+}
+
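xgetbv() reads an extended control register; index 0 is XCR0, whose bits report which xstate components the OS has enabled, so a quick AMX probe can be built from it alone. A sketch using the same instruction (the bit positions 17 and 18 come from the XFEATURE_* definitions later in this file; on CPUs without OSXSAVE the instruction would fault, so a real probe should check CPUID leaf 1 first, as check_cpuid_xsave() below does):

#include <stdint.h>
#include <stdio.h>

static uint64_t xgetbv0(void)
{
	uint32_t eax, edx;

	/* xgetbv with ECX=0 returns XCR0 in EDX:EAX. */
	asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
	return eax | ((uint64_t)edx << 32);
}

int main(void)
{
	uint64_t xcr0 = xgetbv0();
	uint64_t xtile = 3ULL << 17;	/* XTILECFG | XTILEDATA */

	printf("XCR0 = %#llx, AMX tile state %s\n",
	       (unsigned long long)xcr0,
	       (xcr0 & xtile) == xtile ? "enabled" : "not enabled");
	return 0;
}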
+static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm)
+{
+	uint32_t rfbm_lo = rfbm;
+	uint32_t rfbm_hi = rfbm >> 32;
+
+	asm volatile("xsave (%%rdi)"
+		     : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)
+		     : "memory");
+}
+
+static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm)
+{
+	uint32_t rfbm_lo = rfbm;
+	uint32_t rfbm_hi = rfbm >> 32;
+
+	asm volatile("xrstor (%%rdi)"
+		     : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi));
+}
+
+/* err() exits and will not return */
+#define fatal_error(msg, ...) err(1, "[FAIL]\t" msg, ##__VA_ARGS__)
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		fatal_error("sigaction");
+}
+
+static void clearhandler(int sig)
+{
+	struct sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = SIG_DFL;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		fatal_error("sigaction");
+}
+
+#define XFEATURE_XTILECFG 17
+#define XFEATURE_XTILEDATA 18
+#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG)
+#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
+#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
+
+#define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26)
+#define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27)
+static inline void check_cpuid_xsave(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	/*
+	 * CPUID.1:ECX.XSAVE[bit 26] enumerates general
+	 * support for the XSAVE feature set, including
+	 * XGETBV.
+	 */
+	eax = 1;
+	ecx = 0;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (!(ecx & CPUID_LEAF1_ECX_XSAVE_MASK))
+		fatal_error("cpuid: no CPU xsave support");
+	if (!(ecx & CPUID_LEAF1_ECX_OSXSAVE_MASK))
+		fatal_error("cpuid: no OS xsave support");
+}
+
+static uint32_t xbuf_size;
+
+static struct {
+	uint32_t xbuf_offset;
+	uint32_t size;
+} xtiledata;
+
+#define CPUID_LEAF_XSTATE 0xd
+#define CPUID_SUBLEAF_XSTATE_USER 0x0
+#define TILE_CPUID 0x1d
+#define TILE_PALETTE_ID 0x1
+
+static void check_cpuid_xtiledata(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	eax = CPUID_LEAF_XSTATE;
+	ecx = CPUID_SUBLEAF_XSTATE_USER;
+	cpuid(&eax, &ebx, &ecx, &edx);
+
+	/*
+	 * EBX enumerates the size (in bytes) required by the XSAVE
+	 * instruction for an XSAVE area containing all the user state
+	 * components corresponding to bits currently set in XCR0.
+	 *
+	 * Stash that off so it can be used to allocate buffers later.
+	 */
+	xbuf_size = ebx;
+
+	eax = CPUID_LEAF_XSTATE;
+	ecx = XFEATURE_XTILEDATA;
+
+	cpuid(&eax, &ebx, &ecx, &edx);
+	/*
+	 * eax: XTILEDATA state component size
+	 * ebx: XTILEDATA state component offset in user buffer
+	 */
+	if (!eax || !ebx)
+		fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d",
+			    eax, ebx);
+
+	xtiledata.size = eax;
+	xtiledata.xbuf_offset = ebx;
+}
+
+/* The helpers for managing XSAVE buffer and tile states: */
+
+struct xsave_buffer *alloc_xbuf(void)
+{
+	struct xsave_buffer *xbuf;
+
+	/* XSAVE buffer should be 64B-aligned. */
+	xbuf = aligned_alloc(64, xbuf_size);
+	if (!xbuf)
+		fatal_error("aligned_alloc()");
+	return xbuf;
+}
+
+static inline void clear_xstate_header(struct xsave_buffer *buffer)
+{
+	memset(&buffer->header, 0, sizeof(buffer->header));
+}
+
+static inline uint64_t get_xstatebv(struct xsave_buffer *buffer)
+{
+	/* XSTATE_BV is at the beginning of the header: */
+	return *(uint64_t *)&buffer->header;
+}
+
+static inline void set_xstatebv(struct xsave_buffer *buffer, uint64_t bv)
+{
+	/* XSTATE_BV is at the beginning of the header: */
+	*(uint64_t *)(&buffer->header) = bv;
+}
+
+static void set_rand_tiledata(struct xsave_buffer *xbuf)
+{
+	int *ptr = (int *)&xbuf->bytes[xtiledata.xbuf_offset];
+	int data;
+	int i;
+
+	/*
+	 * Ensure that 'data' is never 0. This ensures that
+	 * the registers are never in their initial configuration
+	 * and thus never tracked as being in the init state.
+	 */
+	data = rand() | 1;
+
+	for (i = 0; i < xtiledata.size / sizeof(int); i++, ptr++)
+		*ptr = data;
+}
+
+struct xsave_buffer *stashed_xsave;
+
+static void init_stashed_xsave(void)
+{
+	stashed_xsave = alloc_xbuf();
+	if (!stashed_xsave)
+		fatal_error("failed to allocate stashed_xsave\n");
+	clear_xstate_header(stashed_xsave);
+}
+
+static void free_stashed_xsave(void)
+{
+	free(stashed_xsave);
+}
+
+/* See 'struct _fpx_sw_bytes' at sigcontext.h */
+#define SW_BYTES_OFFSET 464
+/* N.B. The struct's field name varies so read from the offset. */
+#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8)
+
+static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *buffer)
+{
+	return (struct _fpx_sw_bytes *)(buffer + SW_BYTES_OFFSET);
+}
+
+static inline uint64_t get_fpx_sw_bytes_features(void *buffer)
+{
+	return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET);
+}
+
+/* Work around printf() being unsafe in signals: */
+#define SIGNAL_BUF_LEN 1000
+char signal_message_buffer[SIGNAL_BUF_LEN];
+void sig_print(char *msg)
+{
+	int left = SIGNAL_BUF_LEN - strlen(signal_message_buffer) - 1;
+
+	strncat(signal_message_buffer, msg, left);
+}
+
+static volatile bool noperm_signaled;
+static int noperm_errs;
+/*
+ * Signal handler for when AMX is used but
+ * permission has not been obtained.
+ */
+static void handle_noperm(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
+	void *xbuf = ctx->uc_mcontext.fpregs;
+	struct _fpx_sw_bytes *sw_bytes;
+	uint64_t features;
+
+	/* Reset the signal message buffer: */
+	signal_message_buffer[0] = '\0';
+	sig_print("\tAt SIGILL handler,\n");
+
+	if (si->si_code != ILL_ILLOPC) {
+		noperm_errs++;
+		sig_print("[FAIL]\tInvalid signal code.\n");
+	} else {
+		sig_print("[OK]\tValid signal code (ILL_ILLOPC).\n");
+	}
+
+	sw_bytes = get_fpx_sw_bytes(xbuf);
+	/*
+	 * Without permission, the signal XSAVE buffer should not
+	 * have room for AMX register state (aka. xtiledata).
+	 * Check that the size does not overlap with where xtiledata
+	 * will reside.
+	 *
+	 * This also implies that no state components *PAST*
+	 * XTILEDATA (features >= 19) can be present in the buffer.
+	 */
+	if (sw_bytes->xstate_size <= xtiledata.xbuf_offset) {
+		sig_print("[OK]\tValid xstate size\n");
+	} else {
+		noperm_errs++;
+		sig_print("[FAIL]\tInvalid xstate size\n");
+	}
+
+	features = get_fpx_sw_bytes_features(xbuf);
+	/*
+	 * Without permission, the XTILEDATA feature
+	 * bit should not be set.
+	 */
+	if ((features & XFEATURE_MASK_XTILEDATA) == 0) {
+		sig_print("[OK]\tValid xstate mask\n");
+	} else {
+		noperm_errs++;
+		sig_print("[FAIL]\tInvalid xstate mask\n");
+	}
+
+	noperm_signaled = true;
+	ctx->uc_mcontext.gregs[REG_RIP] += 3; /* Skip the faulting XRSTOR */
+}
+
+/* Return true if XRSTOR is successful; otherwise, false. */
+static inline bool xrstor_safe(struct xsave_buffer *xbuf, uint64_t mask)
+{
+	noperm_signaled = false;
+	xrstor(xbuf, mask);
+
+	/* Print any messages produced by the signal code: */
+	printf("%s", signal_message_buffer);
+	/*
+	 * Reset the buffer to make sure any future printing
+	 * only outputs new messages:
+	 */
+	signal_message_buffer[0] = '\0';
+
+	if (noperm_errs)
+		fatal_error("saw %d errors in noperm signal handler\n", noperm_errs);
+
+	return !noperm_signaled;
+}
+
+/*
+ * Use XRSTOR to populate the XTILEDATA registers with
+ * random data.
+ *
+ * Return true if successful; otherwise, false.
+ */
+static inline bool load_rand_tiledata(struct xsave_buffer *xbuf)
+{
+	clear_xstate_header(xbuf);
+	set_xstatebv(xbuf, XFEATURE_MASK_XTILEDATA);
+	set_rand_tiledata(xbuf);
+	return xrstor_safe(xbuf, XFEATURE_MASK_XTILEDATA);
+}
+
+/* Return XTILEDATA to its initial configuration. */
+static inline void init_xtiledata(void)
+{
+	clear_xstate_header(stashed_xsave);
+	xrstor_safe(stashed_xsave, XFEATURE_MASK_XTILEDATA);
+}
+
+enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED };
+
+/* arch_prctl() and sigaltstack() test */
+
+#define ARCH_GET_XCOMP_PERM 0x1022
+#define ARCH_REQ_XCOMP_PERM 0x1023
+
+static void req_xtiledata_perm(void)
+{
+	syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
+}
+
+static void validate_req_xcomp_perm(enum expected_result exp)
+{
+	unsigned long bitmask, expected_bitmask;
+	long rc;
+
+	rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);
+	if (rc) {
+		fatal_error("prctl(ARCH_GET_XCOMP_PERM) error: %ld", rc);
+	} else if (!(bitmask & XFEATURE_MASK_XTILECFG)) {
+		fatal_error("ARCH_GET_XCOMP_PERM returns XFEATURE_XTILECFG off.");
+	}
+
+	rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
+	if (exp == FAIL_EXPECTED) {
+		if (rc) {
+			printf("[OK]\tARCH_REQ_XCOMP_PERM saw expected failure.\n");
+			return;
+		}
+
+		fatal_error("ARCH_REQ_XCOMP_PERM saw unexpected success.\n");
+	} else if (rc) {
+		fatal_error("ARCH_REQ_XCOMP_PERM saw unexpected failure.\n");
+	}
+
+	expected_bitmask = bitmask | XFEATURE_MASK_XTILEDATA;
+
+	rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);
+	if (rc) {
+		fatal_error("prctl(ARCH_GET_XCOMP_PERM) error: %ld", rc);
+	} else if (bitmask != expected_bitmask) {
+		fatal_error("ARCH_REQ_XCOMP_PERM set a wrong bitmask: %lx, expected: %lx.\n",
+			    bitmask, expected_bitmask);
+	} else {
+		printf("\tARCH_REQ_XCOMP_PERM is successful.\n");
+	}
+}
+
+static void validate_xcomp_perm(enum expected_result exp)
+{
+	bool load_success = load_rand_tiledata(stashed_xsave);
+
+	if (exp == FAIL_EXPECTED) {
+		if (load_success) {
+			noperm_errs++;
+			printf("[FAIL]\tLoad tiledata succeeded.\n");
+		} else {
+			printf("[OK]\tLoad tiledata failed.\n");
+		}
+	} else if (exp == SUCCESS_EXPECTED) {
+		if (load_success) {
+			printf("[OK]\tLoad tiledata succeeded.\n");
+		} else {
+			noperm_errs++;
+			printf("[FAIL]\tLoad tiledata failed.\n");
+		}
+	}
+}
+
+#ifndef AT_MINSIGSTKSZ
+# define AT_MINSIGSTKSZ 51
+#endif
+
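req_xtiledata_perm() is the whole dynamic-permission dance from an application's point of view: one arch_prctl() before first tile use. A minimal standalone sketch of requesting and then confirming the permission; the prctl numbers are the ones defined above, error handling is reduced to err(), and note the kernel may refuse the request if the process's signal stacks are too small for tile state:

#define _GNU_SOURCE
#include <err.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILEDATA 18

int main(void)
{
	unsigned long mask;

	/* Ask the kernel to allow AMX tile data for this process. */
	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA))
		err(1, "ARCH_REQ_XCOMP_PERM");

	/* Read back the permission bitmap and confirm bit 18 is now set. */
	if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &mask))
		err(1, "ARCH_GET_XCOMP_PERM");
	printf("xcomp permission mask: %#lx (XTILEDATA %s)\n", mask,
	       (mask & (1UL << XFEATURE_XTILEDATA)) ? "granted" : "missing");
	return 0;
}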
+static void *alloc_altstack(unsigned int size)
+{
+	void *altstack;
+
+	altstack = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+
+	if (altstack == MAP_FAILED)
+		fatal_error("mmap() for altstack");
+
+	return altstack;
+}
+
+static void setup_altstack(void *addr, unsigned long size, enum expected_result exp)
+{
+	stack_t ss;
+	int rc;
+
+	memset(&ss, 0, sizeof(ss));
+	ss.ss_size = size;
+	ss.ss_sp = addr;
+
+	rc = sigaltstack(&ss, NULL);
+
+	if (exp == FAIL_EXPECTED) {
+		if (rc) {
+			printf("[OK]\tsigaltstack() failed.\n");
+		} else {
+			fatal_error("sigaltstack() succeeded unexpectedly.\n");
+		}
+	} else if (rc) {
+		fatal_error("sigaltstack()");
+	}
+}
+
+static void test_dynamic_sigaltstack(void)
+{
+	unsigned int small_size, enough_size;
+	unsigned long minsigstksz;
+	void *altstack;
+
+	minsigstksz = getauxval(AT_MINSIGSTKSZ);
+	printf("\tAT_MINSIGSTKSZ = %lu\n", minsigstksz);
+	/*
+	 * getauxval() itself can return 0 for failure or
+	 * success. But, in this case, AT_MINSIGSTKSZ
+	 * will always return a >=0 value if implemented.
+	 * Just check for 0.
+	 */
+	if (minsigstksz == 0) {
+		printf("no support for AT_MINSIGSTKSZ, skipping sigaltstack tests\n");
+		return;
+	}
+
+	enough_size = minsigstksz * 2;
+
+	altstack = alloc_altstack(enough_size);
+	printf("\tAllocate memory for altstack (%u bytes).\n", enough_size);
+
+	/*
+	 * Try setup_altstack() with a size which can not fit
+	 * XTILEDATA. ARCH_REQ_XCOMP_PERM should fail.
+	 */
+	small_size = minsigstksz - xtiledata.size;
+	printf("\tAfter sigaltstack() with small size (%u bytes).\n", small_size);
+	setup_altstack(altstack, small_size, SUCCESS_EXPECTED);
+	validate_req_xcomp_perm(FAIL_EXPECTED);
+
+	/*
+	 * Try setup_altstack() with a size derived from
+	 * AT_MINSIGSTKSZ. It should be more than large enough
+	 * and thus ARCH_REQ_XCOMP_PERM should succeed.
+	 */
+	printf("\tAfter sigaltstack() with enough size (%u bytes).\n", enough_size);
+	setup_altstack(altstack, enough_size, SUCCESS_EXPECTED);
+	validate_req_xcomp_perm(SUCCESS_EXPECTED);
+
+	/*
+	 * Try to coerce setup_altstack() to again accept a
+	 * too-small altstack. This ensures that big-enough
+	 * sigaltstacks can not shrink to a too-small value
+	 * once XTILEDATA permission is established.
+	 */
+	printf("\tThen, sigaltstack() with small size (%u bytes).\n", small_size);
+	setup_altstack(altstack, small_size, FAIL_EXPECTED);
+}
+
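test_dynamic_sigaltstack() sizes the alternate stack from the AT_MINSIGSTKSZ aux vector rather than the legacy SIGSTKSZ constant. Reduced to its essentials, the probe-and-install sequence looks roughly like this (a sketch; the 2x factor mirrors the test's "enough" size):

#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <sys/auxv.h>
#include <sys/mman.h>

#ifndef AT_MINSIGSTKSZ
# define AT_MINSIGSTKSZ 51
#endif

int main(void)
{
	unsigned long minsigstksz = getauxval(AT_MINSIGSTKSZ);
	stack_t ss = {0};

	if (!minsigstksz) {
		puts("kernel does not provide AT_MINSIGSTKSZ");
		return 0;
	}
	ss.ss_size = 2 * minsigstksz;	/* comfortably above the minimum */
	ss.ss_sp = mmap(NULL, ss.ss_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
	if (ss.ss_sp == MAP_FAILED)
		err(1, "mmap");
	if (sigaltstack(&ss, NULL))
		err(1, "sigaltstack");
	printf("installed a %zu-byte altstack\n", ss.ss_size);
	return 0;
}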
+static void test_dynamic_state(void)
+{
+	pid_t parent, child, grandchild;
+
+	parent = fork();
+	if (parent < 0) {
+		/* fork() failed */
+		fatal_error("fork");
+	} else if (parent > 0) {
+		int status;
+		/* fork() succeeded. Now in the parent. */
+
+		wait(&status);
+		if (!WIFEXITED(status) || WEXITSTATUS(status))
+			fatal_error("arch_prctl test parent exit");
+		return;
+	}
+	/* fork() succeeded. Now in the child. */
+
+	printf("[RUN]\tCheck ARCH_REQ_XCOMP_PERM around process fork() and sigaltstack() test.\n");
+
+	printf("\tFork a child.\n");
+	child = fork();
+	if (child < 0) {
+		fatal_error("fork");
+	} else if (child > 0) {
+		int status;
+
+		wait(&status);
+		if (!WIFEXITED(status) || WEXITSTATUS(status))
+			fatal_error("arch_prctl test child exit");
+		_exit(0);
+	}
+
+	/*
+	 * The permission request should fail without an
+	 * XTILEDATA-compatible signal stack
+	 */
+	printf("\tTest XCOMP_PERM at child.\n");
+	validate_xcomp_perm(FAIL_EXPECTED);
+
+	/*
+	 * Set up an XTILEDATA-compatible signal stack and
+	 * also obtain permission to populate XTILEDATA.
+	 */
+	printf("\tTest dynamic sigaltstack at child:\n");
+	test_dynamic_sigaltstack();
+
+	/* Ensure that XTILEDATA can be populated. */
+	printf("\tTest XCOMP_PERM again at child.\n");
+	validate_xcomp_perm(SUCCESS_EXPECTED);
+
+	printf("\tFork a grandchild.\n");
+	grandchild = fork();
+	if (grandchild < 0) {
+		/* fork() failed */
+		fatal_error("fork");
+	} else if (!grandchild) {
+		/* fork() succeeded. Now in the (grand)child. */
+		printf("\tTest XCOMP_PERM at grandchild.\n");
+
+		/*
+		 * Ensure that the grandchild inherited
+		 * permission and a compatible sigaltstack:
+		 */
+		validate_xcomp_perm(SUCCESS_EXPECTED);
+	} else {
+		int status;
+		/* fork() succeeded. Now in the parent. */
+
+		wait(&status);
+		if (!WIFEXITED(status) || WEXITSTATUS(status))
+			fatal_error("fork test grandchild");
+	}
+
+	_exit(0);
+}
+
+/*
+ * Save current register state and compare it to @xbuf1.
+ *
+ * Returns false if @xbuf1 matches the registers.
+ * Returns true if @xbuf1 differs from the registers.
+ */
+static inline bool __validate_tiledata_regs(struct xsave_buffer *xbuf1)
+{
+	struct xsave_buffer *xbuf2;
+	int ret;
+
+	xbuf2 = alloc_xbuf();
+	if (!xbuf2)
+		fatal_error("failed to allocate XSAVE buffer\n");
+
+	xsave(xbuf2, XFEATURE_MASK_XTILEDATA);
+	ret = memcmp(&xbuf1->bytes[xtiledata.xbuf_offset],
+		     &xbuf2->bytes[xtiledata.xbuf_offset],
+		     xtiledata.size);
+
+	free(xbuf2);
+
+	if (ret == 0)
+		return false;
+	return true;
+}
+
+static inline void validate_tiledata_regs_same(struct xsave_buffer *xbuf)
+{
+	int ret = __validate_tiledata_regs(xbuf);
+
+	if (ret != 0)
+		fatal_error("TILEDATA registers changed");
+}
+
+static inline void validate_tiledata_regs_changed(struct xsave_buffer *xbuf)
+{
+	int ret = __validate_tiledata_regs(xbuf);
+
+	if (ret == 0)
+		fatal_error("TILEDATA registers did not change");
+}
+
+/* tiledata inheritance test */
+
+static void test_fork(void)
+{
+	pid_t child, grandchild;
+
+	child = fork();
+	if (child < 0) {
+		/* fork() failed */
+		fatal_error("fork");
+	} else if (child > 0) {
+		/* fork() succeeded. Now in the parent. */
+		int status;
+
+		wait(&status);
+		if (!WIFEXITED(status) || WEXITSTATUS(status))
+			fatal_error("fork test child");
+		return;
+	}
+	/* fork() succeeded. Now in the child. */
+	printf("[RUN]\tCheck tile data inheritance.\n\tBefore fork(), load tiledata\n");
+
+	load_rand_tiledata(stashed_xsave);
+
+	grandchild = fork();
+	if (grandchild < 0) {
+		/* fork() failed */
+		fatal_error("fork");
+	} else if (grandchild > 0) {
+		/* fork() succeeded. Still in the first child. */
+		int status;
+
+		wait(&status);
+		if (!WIFEXITED(status) || WEXITSTATUS(status))
+			fatal_error("fork test grandchild");
+		_exit(0);
+	}
+	/* fork() succeeded. Now in the (grand)child. */
+
+	/*
+	 * TILEDATA registers are not preserved across fork().
+	 * Ensure that their value has changed:
+	 */
+	validate_tiledata_regs_changed(stashed_xsave);
+
+	_exit(0);
+}
+
+/* Context switching test */
+
+static struct _ctxtswtest_cfg {
+	unsigned int iterations;
+	unsigned int num_threads;
+} ctxtswtest_config;
+
+struct futex_info {
+	pthread_t thread;
+	int nr;
+	pthread_mutex_t mutex;
+	struct futex_info *next;
+};
+
+static void *check_tiledata(void *info)
+{
+	struct futex_info *finfo = (struct futex_info *)info;
+	struct xsave_buffer *xbuf;
+	int i;
+
+	xbuf = alloc_xbuf();
+	if (!xbuf)
+		fatal_error("unable to allocate XSAVE buffer");
+
+	/*
+	 * Load random data into 'xbuf' and then restore
+	 * it to the tile registers themselves.
+	 */
+	load_rand_tiledata(xbuf);
+	for (i = 0; i < ctxtswtest_config.iterations; i++) {
+		pthread_mutex_lock(&finfo->mutex);
+
+		/*
+		 * Ensure the register values have not
+		 * diverged from those recorded in 'xbuf'.
+		 */
+		validate_tiledata_regs_same(xbuf);
+
+		/* Load new, random values into xbuf and registers */
+		load_rand_tiledata(xbuf);
+
+		/*
+		 * The last thread's last unlock will be for
+		 * thread 0's mutex. However, thread 0 will
+		 * have already exited the loop and the mutex
+		 * will already be unlocked.
+		 *
+		 * Because this is not an ERRORCHECK mutex,
+		 * that inconsistency will be silently ignored.
+		 */
+		pthread_mutex_unlock(&finfo->next->mutex);
+	}
+
+	free(xbuf);
+	/*
+	 * Return this thread's finfo, which is
+	 * a unique value for this thread.
+	 */
+	return finfo;
+}
+
+static int create_threads(int num, struct futex_info *finfo)
+{
+	int i;
+
+	for (i = 0; i < num; i++) {
+		int next_nr;
+
+		finfo[i].nr = i;
+		/*
+		 * Thread 'i' will wait on this mutex to
+		 * be unlocked. Lock it immediately after
+		 * initialization:
+		 */
+		pthread_mutex_init(&finfo[i].mutex, NULL);
+		pthread_mutex_lock(&finfo[i].mutex);
+
+		next_nr = (i + 1) % num;
+		finfo[i].next = &finfo[next_nr];
+
+		if (pthread_create(&finfo[i].thread, NULL, check_tiledata, &finfo[i]))
+			fatal_error("pthread_create()");
+	}
+	return 0;
+}
+
+static void affinitize_cpu0(void)
+{
+	cpu_set_t cpuset;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		fatal_error("sched_setaffinity to CPU 0");
+}
+
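create_threads() arranges the threads in a ring: every thread blocks on its own pre-locked mutex and, once it runs, unlocks its neighbour's, so exactly one thread is runnable at a time and each hand-off forces a context switch on the single pinned CPU. The same hand-off skeleton, stripped of the AMX checks (a sketch; three threads and five laps are arbitrary choices):

#include <err.h>
#include <pthread.h>
#include <stdio.h>

#define NTHREADS 3
#define LAPS 5

static struct ring_slot {
	pthread_t thread;
	pthread_mutex_t mutex;
	struct ring_slot *next;
} ring[NTHREADS];

static void *pass_token(void *arg)
{
	struct ring_slot *slot = arg;
	int i;

	for (i = 0; i < LAPS; i++) {
		pthread_mutex_lock(&slot->mutex);	/* wait for the token */
		printf("thread %ld got the token\n", (long)(slot - ring));
		/*
		 * Like the test, this relies on a default (non-ERRORCHECK)
		 * mutex silently tolerating an unlock from a non-owner.
		 */
		pthread_mutex_unlock(&slot->next->mutex);
	}
	return NULL;
}

int main(void)
{
	int i;

	for (i = 0; i < NTHREADS; i++) {
		pthread_mutex_init(&ring[i].mutex, NULL);
		pthread_mutex_lock(&ring[i].mutex);	/* everyone starts blocked */
		ring[i].next = &ring[(i + 1) % NTHREADS];
	}
	for (i = 0; i < NTHREADS; i++)
		if (pthread_create(&ring[i].thread, NULL, pass_token, &ring[i]))
			err(1, "pthread_create");

	pthread_mutex_unlock(&ring[0].mutex);	/* inject the token */
	for (i = 0; i < NTHREADS; i++)
		pthread_join(ring[i].thread, NULL);
	return 0;
}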
+static void test_context_switch(void)
+{
+	struct futex_info *finfo;
+	int i;
+
+	/* Affinitize to one CPU to force context switches */
+	affinitize_cpu0();
+
+	req_xtiledata_perm();
+
+	printf("[RUN]\tCheck tiledata context switches, %d iterations, %d threads.\n",
+	       ctxtswtest_config.iterations,
+	       ctxtswtest_config.num_threads);
+
+	finfo = malloc(sizeof(*finfo) * ctxtswtest_config.num_threads);
+	if (!finfo)
+		fatal_error("malloc()");
+
+	create_threads(ctxtswtest_config.num_threads, finfo);
+
+	/*
+	 * This thread wakes up thread 0
+	 * Thread 0 will wake up 1
+	 * Thread 1 will wake up 2
+	 * ...
+	 * the last thread will wake up 0
+	 *
+	 * ... this will repeat for the configured
+	 * number of iterations.
+	 */
+	pthread_mutex_unlock(&finfo[0].mutex);
+
+	/* Wait for all the threads to finish: */
+	for (i = 0; i < ctxtswtest_config.num_threads; i++) {
+		void *thread_retval;
+		int rc;
+
+		rc = pthread_join(finfo[i].thread, &thread_retval);
+
+		if (rc)
+			fatal_error("pthread_join() failed for thread %d err: %d\n",
+				    i, rc);
+
+		if (thread_retval != &finfo[i])
+			fatal_error("unexpected thread retval for thread %d: %p\n",
+				    i, thread_retval);
+	}
+
+	printf("[OK]\tNo incorrect case was found.\n");
+
+	free(finfo);
+}
+
+int main(void)
+{
+	/* Check hardware availability at first */
+	check_cpuid_xsave();
+	check_cpuid_xtiledata();
+
+	init_stashed_xsave();
+	sethandler(SIGILL, handle_noperm, 0);
+
+	test_dynamic_state();
+
+	/* Request permission for the following tests */
+	req_xtiledata_perm();
+
+	test_fork();
+
+	ctxtswtest_config.iterations = 10;
+	ctxtswtest_config.num_threads = 5;
+	test_context_switch();
+
+	clearhandler(SIGILL);
+	free_stashed_xsave();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/x86/sigaltstack.c b/tools/testing/selftests/x86/sigaltstack.c
new file mode 100644
index 0000000000000000000000000000000000000000..f689af75e979eaa4e86b3944ee2d25a3e496e8cf
--- /dev/null
+++ b/tools/testing/selftests/x86/sigaltstack.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+
+/* sigaltstack()-enforced minimum stack */
+#define ENFORCED_MINSIGSTKSZ 2048
+
+#ifndef AT_MINSIGSTKSZ
+# define AT_MINSIGSTKSZ 51
+#endif
+
+static int nerrs;
+
+static bool sigalrm_expected;
+
+static unsigned long at_minstack_size;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+	struct sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = SIG_DFL;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static int setup_altstack(void *start, unsigned long size)
+{
+	stack_t ss;
+
+	memset(&ss, 0, sizeof(ss));
+	ss.ss_size = size;
+	ss.ss_sp = start;
+
+	return sigaltstack(&ss, NULL);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+	if (sigalrm_expected) {
+		printf("[FAIL]\tWrong signal delivered: SIGSEGV (expected SIGALRM).\n");
+		nerrs++;
+	} else {
+		printf("[OK]\tSIGSEGV signal delivered.\n");
+	}
+
+	siglongjmp(jmpbuf, 1);
+}
+
+static void sigalrm(int sig, siginfo_t *info, void *ctx_void)
+{
+	if (!sigalrm_expected) {
+		printf("[FAIL]\tWrong signal delivered: SIGALRM (expected SIGSEGV).\n");
+		nerrs++;
+	} else {
+		printf("[OK]\tSIGALRM signal delivered.\n");
+	}
+}
+
+static void test_sigaltstack(void *altstack, unsigned long size)
+{
+	if (setup_altstack(altstack, size))
+		err(1, "sigaltstack()");
+
+	sigalrm_expected = (size > at_minstack_size) ? true : false;
+
+	sethandler(SIGSEGV, sigsegv, 0);
+	sethandler(SIGALRM, sigalrm, SA_ONSTACK);
+
+	if (!sigsetjmp(jmpbuf, 1)) {
+		printf("[RUN]\tTest an alternate signal stack of %ssufficient size.\n",
+		       sigalrm_expected ? "" : "in");
+		printf("\tRaise SIGALRM. %s is expected to be delivered.\n",
+		       sigalrm_expected ? "It" : "SIGSEGV");
+		raise(SIGALRM);
+	}
+
+	clearhandler(SIGALRM);
+	clearhandler(SIGSEGV);
+}
+
+int main(void)
+{
+	void *altstack;
+
+	at_minstack_size = getauxval(AT_MINSIGSTKSZ);
+
+	altstack = mmap(NULL, at_minstack_size + SIGSTKSZ, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+	if (altstack == MAP_FAILED)
+		err(1, "mmap()");
+
+	if ((ENFORCED_MINSIGSTKSZ + 1) < at_minstack_size)
+		test_sigaltstack(altstack, ENFORCED_MINSIGSTKSZ + 1);
+
+	test_sigaltstack(altstack, at_minstack_size + SIGSTKSZ);
+
+	return nerrs == 0 ? 0 : 1;
+}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 2e7d2b3f290791294d76485e022ffab117b0a612..4d2f141a804ac331d6fb8e1dd4f35b2ab8e2e108 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -253,21 +253,22 @@ void kvm_arch_free_vm(struct kvm *kvm)
 	vfree(kvm);
 }
 
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
+{
+	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
+		return -EBUSY;
+
+	if (id >= kvm->arch.max_vcpus)
+		return -EINVAL;
+
+	return 0;
+}
+
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	int err;
 	struct kvm_vcpu *vcpu;
 
-	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) {
-		err = -EBUSY;
-		goto out;
-	}
-
-	if (id >= kvm->arch.max_vcpus) {
-		err = -EINVAL;
-		goto out;
-	}
-
 	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
 	if (!vcpu) {
 		err = -ENOMEM;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f83fa0aeeb4518551adb076abbc0c0b300bc0b5f..6a85f7be61b8081129afc4361a6f93adac474d49 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include <linux/proc_fs.h>
 #include
 #include
@@ -1389,6 +1390,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
 {
 	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
 
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
@@ -2803,7 +2805,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	int r;
 	struct kvm_vcpu *vcpu;
 
-	if (id >= KVM_MAX_VCPU_ID)
+	if (id >= KVM_MAX_VCPU_IDS)
 		return -EINVAL;
 
 	mutex_lock(&kvm->lock);
@@ -2812,6 +2814,12 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		return -EINVAL;
 	}
 
+	r = kvm_arch_vcpu_precreate(kvm, id);
+	if (r) {
+		mutex_unlock(&kvm->lock);
+		return r;
+	}
+
 	kvm->created_vcpus++;
 	mutex_unlock(&kvm->lock);
 
@@ -4412,6 +4420,113 @@ static void check_processor_compat(void *rtn)
 	*(int *)rtn = kvm_arch_check_processor_compat();
 }
 
+static int kvmmmu_limit_nr_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", kvm_mmu_limit_nr);
+	return 0;
+}
+
+static int kvmmmu_limit_nr_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, kvmmmu_limit_nr_proc_show, NULL);
+}
+
+static ssize_t kvmmmu_limit_nr_proc_write(struct file *file, const char __user *buf,
+					  size_t count, loff_t *ppos)
+{
+	char str[16] = {0};
+
+	if (count > 1) {
+		int value;
+		int size = min((int)count - 1, 10);
+
+		if (copy_from_user(str, buf, size))
+			return -EFAULT;
+
+		if (!kstrtoint(str, 10, &value)) {
+			if (value < 0)
+				value = 0;
+			kvm_mmu_limit_nr = value;
+		}
+	}
+
+	return count;
+}
+
+static const struct file_operations kvmmmu_limit_nr_proc_fops = {
+	.open		= kvmmmu_limit_nr_proc_open,
+	.read		= seq_read,
+	.write		= kvmmmu_limit_nr_proc_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int kvmmmu_reclaim_try_times_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", kvm_mmu_reclaim_try_times);
+	return 0;
+}
+
+static int kvmmmu_reclaim_try_times_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, kvmmmu_reclaim_try_times_proc_show, NULL);
+}
+
+static const struct file_operations kvmmmu_reclaim_try_times_proc_fops = {
+	.open		= kvmmmu_reclaim_try_times_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int kvmmmu_reclaim_times_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", kvm_mmu_reclaim_times);
+	return 0;
+}
+
+static int kvmmmu_reclaim_times_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, kvmmmu_reclaim_times_proc_show, NULL);
+}
+
+static const struct file_operations kvmmmu_reclaim_times_proc_fops = {
+	.open		= kvmmmu_reclaim_times_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+void kvm_proc_interface_uninit(void)
+{
+	remove_proc_entry("kvm_control/kvmmmu_reclaim_times", NULL);
+	remove_proc_entry("kvm_control/kvmmmu_reclaim_try_times", NULL);
+	remove_proc_entry("kvm_control/kvmmmu_limit_nr", NULL);
+	remove_proc_entry("kvm_control", NULL);
+}
+
+int kvm_proc_interface_init(void)
+{
+	int ret = -ENOMEM;
+
+	if (!proc_mkdir("kvm_control", NULL))
+		goto err;
+
+	if (!proc_create("kvm_control/kvmmmu_limit_nr", 0, NULL, &kvmmmu_limit_nr_proc_fops))
+		goto err;
+
+	if (!proc_create("kvm_control/kvmmmu_reclaim_try_times", 0, NULL, &kvmmmu_reclaim_try_times_proc_fops))
+		goto err;
+
+	if (!proc_create("kvm_control/kvmmmu_reclaim_times", 0, NULL, &kvmmmu_reclaim_times_proc_fops))
+		goto err;
+
+	return 0;
+err:
+	kvm_proc_interface_uninit();
+	return ret;
+}
+
 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	     struct module *module)
 {
@@ -4492,6 +4607,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	r = kvm_vfio_ops_init();
 	WARN_ON(r);
 
+	if (kvm_proc_interface_init())
+		pr_err("kvm: create kvm proc control interface failed\n");
+
 	return 0;
 
 out_unreg:
@@ -4530,6 +4648,7 @@ void kvm_exit(void)
 	kvm_irqfd_exit();
 	free_cpumask_var(cpus_hardware_enabled);
 	kvm_vfio_ops_exit();
+	kvm_proc_interface_uninit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
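The three entries land under /proc/kvm_control and follow the usual seq_file conventions: reads return the current value, and kvmmmu_limit_nr additionally accepts a decimal write. A user-space sketch of driving the interface (the path comes from the patch; the value 1024 is purely illustrative, and the open may fail on kernels without this patch or without sufficient privilege):

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/kvm_control/kvmmmu_limit_nr";
	FILE *fp = fopen(path, "r+");
	int cur;

	if (!fp) {
		perror("fopen");
		return 1;
	}
	if (fscanf(fp, "%d", &cur) == 1)
		printf("current kvmmmu_limit_nr: %d\n", cur);

	rewind(fp);		/* reposition between the read and the write */
	fprintf(fp, "1024\n");	/* illustrative new limit */
	fclose(fp);
	return 0;
}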